def synthesize(self, text):
    cleaner_names = [x.strip() for x in hparams.cleaners.split(',')]
    seq = text_to_sequence(text, cleaner_names)
    feed_dict = {
        self.model.inputs: [np.asarray(seq, dtype=np.int32)],
        self.model.input_lengths: np.asarray([len(seq)], dtype=np.int32)
    }
    wav = self.session.run(self.wav_output, feed_dict=feed_dict)
    out = io.BytesIO()
    audio.save_wav(audio.inv_preemphasis(wav), out)
    return out.getvalue()

def synthesize(self, text, out):
    cleaner_names = [x.strip() for x in hparams.cleaners.split(',')]
    seq = text_to_sequence(text, cleaner_names)
    feed_dict = {
        self.model.inputs: [np.asarray(seq, dtype=np.int32)],
        self.model.input_lengths: np.asarray([len(seq)], dtype=np.int32)
    }
    wav = self.session.run(self.wav_output, feed_dict=feed_dict)
    wav = audio.inv_preemphasis(wav)
    wav = wav[:audio.find_endpoint(wav)]
    audio.save_wav(wav, out)

def on_get(self, req, res):
    if not req.params.get('text'):
        raise falcon.HTTPBadRequest()
    text = req.params.get('text')
    speaker = 5 if not req.params.get('speaker') else int(req.params.get('speaker'))
    wav = tts(model, text, 0, speaker)
    # wav = audio.inv_preemphasis(wav)
    # wav = wav[:audio.find_endpoint(wav)]
    out = io.BytesIO()
    audio.save_wav(wav, out)
    res.data = out.getvalue()
    res.content_type = 'audio/wav'

def synthesize(self, text):
    cleaner_names = [x.strip() for x in hparams.cleaners.split(',')]
    seq = text_to_sequence(text, cleaner_names)
    feed_dict = {
        self.model.inputs: [np.asarray(seq, dtype=np.int32)],
        self.model.input_lengths: np.asarray([len(seq)], dtype=np.int32)
    }
    wav = self.session.run(self.wav_output, feed_dict=feed_dict)
    wav = audio.inv_preemphasis(wav)
    wav = wav[:audio.find_endpoint(wav)]
    out = io.BytesIO()
    audio.save_wav(wav, out)
    return out.getvalue()

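# For context, a minimal usage sketch for the text-to-wav synthesize() variants
# above, assuming a Synthesizer class whose load() restores the checkpointed
# graph and session (the class name and checkpoint path are illustrative
# assumptions, not part of these snippets):
#
#     synth = Synthesizer()
#     synth.load('logs-tacotron/model.ckpt-100000')  # builds the graph and tf.Session
#     wav_bytes = synth.synthesize('Hello, world.')  # WAV file contents as bytes
#     with open('hello.wav', 'wb') as f:
#         f.write(wav_bytes)
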
def synthesize(self, text):
    # Convert Chinese text to pinyin with tone numbers
    text = Pinyin().get_pinyin(text, " ", tone_marks='numbers')
    cleaner_names = [x.strip() for x in hparams.cleaners.split(',')]
    # Convert the pinyin string to a symbol sequence
    seq = text_to_sequence(text, cleaner_names)
    feed_dict = {
        self.model.inputs: [np.asarray(seq, dtype=np.int32)],
        self.model.input_lengths: np.asarray([len(seq)], dtype=np.int32)
    }
    wav = self.session.run(self.wav_output, feed_dict=feed_dict)
    wav = audio.inv_preemphasis(wav)
    out = io.BytesIO()
    audio.save_wav(wav, out)
    return out.getvalue()

def synthesize(self, text):
    cleaner_names = [x.strip() for x in hparams.cleaners.split(',')]
    seq = text_to_sequence(text, cleaner_names)
    feed_dict = {
        self.model.inputs: [np.asarray(seq, dtype=np.int32)],
        self.model.input_lengths: np.asarray([len(seq)], dtype=np.int32)
    }
    linears = self.session.run(self.model.linear_outputs[0], feed_dict=feed_dict)
    wav = audio.inv_spectrogram(linears.T)
    wav = wav[:audio.find_endpoint(wav)]
    out = io.BytesIO()
    audio.save_wav(wav, out)
    return out.getvalue()

def synthesize(self, in_file):
    src_spectrogram = audio.spectrogram(
        in_file,
        num_src_freq=hparams.num_src_freq,
        frame_length_ms=hparams.src_frame_length_ms).astype(np.float32)
    feed_dict = {
        self.model.inputs: [np.asarray(src_spectrogram, dtype=np.float32)],
        self.model.input_lengths: np.asarray([len(src_spectrogram)], dtype=np.int32)
    }
    wav = self.session.run(self.wav_output, feed_dict=feed_dict)
    wav = audio.inv_preemphasis(wav)
    wav = wav[:audio.find_endpoint(wav)]
    out = io.BytesIO()
    audio.save_wav(wav, out)
    return out.getvalue()

def synthesize(self, text, reference_mel):
    cleaner_names = [x.strip() for x in hparams.cleaners.split(',')]
    seq = text_to_sequence(text, cleaner_names)
    feed_dict = {
        self.model.inputs: [np.asarray(seq, dtype=np.int32)],
        self.model.input_lengths: np.asarray([len(seq)], dtype=np.int32),
        self.model.mel_targets: [np.asarray(reference_mel, dtype=np.float32)]
    }
    wav = self.session.run(self.wav_output, feed_dict=feed_dict)
    wav = audio.inv_preemphasis(wav)
    wav = wav[:audio.find_endpoint(wav)]
    out = io.BytesIO()
    audio.save_wav(wav, out)
    return out.getvalue()

def synthesize(self, input_path):
    s, sr = sf.read(input_path)
    spec = audio.melspectrogram(s).astype(np.float32).T
    feed_dict = {
        self.model.inputs: [np.asarray(spec, dtype=np.float32)],
        self.model.input_lengths: np.asarray([spec.shape[0]], dtype=np.int32)
    }
    wav = self.session.run(self.wav_output, feed_dict=feed_dict)
    wav = audio.inv_preemphasis(wav)
    wav = wav[:audio.find_endpoint(wav)]
    out = io.BytesIO()
    audio.save_wav(wav, out)
    return out.getvalue()

def synthesize(self, text):
    seq = textinput.to_sequence(
        text,
        force_lowercase=hparams.force_lowercase,
        expand_abbreviations=hparams.expand_abbreviations)
    feed_dict = {
        self.model.inputs: [np.asarray(seq, dtype=np.int32)],
        self.model.input_lengths: np.asarray([len(seq)], dtype=np.int32)
    }
    spec, alignments = self.session.run(
        [self.model.linear_outputs[0], self.model.alignments[0]],
        feed_dict=feed_dict)
    out = io.BytesIO()
    audio.save_wav(audio.inv_spectrogram(spec.T), out)
    return out.getvalue(), alignments

def synthesize(self, text):  # for demo_server
    cleaner_names = [x.strip() for x in hparams.cleaners.split(',')]
    seq = text_to_sequence(text, cleaner_names)
    feed_dict = {
        self.model.inputs: [np.asarray(seq, dtype=np.int32)],
        self.model.input_lengths: np.asarray([len(seq)], dtype=np.int32)
    }
    wav = self.session.run(self.wav_output, feed_dict=feed_dict)
    wav = audio.inv_preemphasis(wav)
    wav = wav[:audio.find_endpoint(wav)]
    out = io.BytesIO()
    audio.save_wav(wav, out)
    # print(type(out))
    return out.getvalue()  # returns a bytes object

def synthesize(self, text):
    text = arpa.to_arpa(text)
    cleaner_names = [x.strip() for x in hparams.cleaners.split(',')]
    seq = text_to_sequence(text, cleaner_names)
    feed_dict = {
        self.model.inputs: [np.asarray(seq, dtype=np.int32)],
        self.model.input_lengths: np.asarray([len(seq)], dtype=np.int32)
    }
    wav = self.session.run(self.wav_output, feed_dict=feed_dict)
    wav = audio.inv_preemphasis(wav)
    wav = wav[:audio.find_endpoint(wav)]
    out = io.BytesIO()
    # Write to the in-memory buffer (so the return value is non-empty)
    # in addition to saving a copy to Drive.
    audio.save_wav(wav, out)
    audio.save_wav(wav, "/content/drive/MyDrive/voice_cloning/out_sample")
    print("Synthesis finished")
    return out.getvalue()

def synthesize(self, text, base_path, idx):
    seq = text_to_sequence(text)
    feed_dict = {
        self.model.inputs: [np.asarray(seq, dtype=np.int32)],
        self.model.input_lengths: np.asarray([len(seq)], dtype=np.int32)
    }
    input_seq, wav, alignment = self.session.run(
        [self.inputs, self.wav_output, self.alignments], feed_dict=feed_dict)
    wav = audio.inv_preemphasis(wav)
    wav = wav[:audio.find_endpoint(wav)]
    out = io.BytesIO()
    audio.save_wav(wav, out)
    input_seq = sequence_to_text(input_seq)
    plot.plot_alignment(alignment, '%s-%d-align.png' % (base_path, idx), input_seq)
    return out.getvalue()

def main(args):
    os.makedirs(args.model_dir, exist_ok=True)
    estimator = tf.estimator.Estimator(
        model_fn=model_fn,
        model_dir=args.model_dir,
        params=hparams,
        config=RunConfig(
            save_summary_steps=args.summary_interval,
            save_checkpoints_steps=args.checkpoint_interval,
            session_config=SESS_CFG,
            # log_step_count_steps=100,
            keep_checkpoint_max=2))
    if args.mode == 'train':
        os.makedirs(args.data_dir, exist_ok=True)
        estimator.train(input_fn=lambda: train_input_fn(args.data_dir))
    elif args.mode == 'predict':
        assert len(args.texts), "No text to predict"
        results = estimator.predict(input_fn=lambda: predict_input_fn(args.texts))
        for idx, wav in enumerate(results):
            wav = inv_preemphasis(wav)
            # wav = wav[:find_endpoint(wav)]
            # sp.save('wav_{}.npy'.format(idx), wav, allow_pickle=False)
            save_wav(wav, 'output_{}.wav'.format(idx))
            # break
    elif args.mode == 'export':
        os.makedirs(args.export_dir, exist_ok=True)
        estimator.export_saved_model(
            args.export_dir,
            tf.estimator.export.build_raw_serving_input_receiver_fn(
                {
                    'inputs': tf.placeholder(
                        dtype=tf.int32, shape=(None, None), name='inputs'),
                    'lengths': tf.placeholder(
                        dtype=tf.int32, shape=(None,), name='lengths'),
                },
                default_batch_size=None),
            # assets_extra=None,
            # as_text=False,
            # checkpoint_path=None,
            # experimental_mode=ModeKeys.PREDICT
        )
    else:
        raise KeyError('Unknown Mode <{}>'.format(args.mode))

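# A hedged sketch of loading the SavedModel written by the 'export' branch
# above with TF 1.x's contrib predictor. The timestamped export subdirectory
# is standard Estimator behavior, but the output tensor name depends on
# model_fn's export_outputs, so we avoid guessing a key.
#
#     import glob
#     import numpy as np
#     from tensorflow.contrib import predictor  # TF 1.x only
#
#     saved_model_dir = sorted(glob.glob('export/*'))[-1]
#     predict_fn = predictor.from_saved_model(saved_model_dir)
#     outputs = predict_fn({
#         'inputs': np.asarray([[12, 5, 33, 7]], dtype=np.int32),  # dummy symbol ids
#         'lengths': np.asarray([4], dtype=np.int32),
#     })
#     wav = list(outputs.values())[0]  # output key depends on export_outputs
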
def save_audio():
    # The model instance holds the spectrogram from the last processed batch.
    spectrogram = model.spectrogram  # TODO: change this specification
    waveform = audio.inv_spectrogram(spectrogram.T)
    audio.save_wav(
        waveform,
        os.path.join(
            log_dir,
            'iteration_{.updater.iteration}-audio.wav'.format(trainer)))
    plot.plot_alignment(
        alignment,
        os.path.join(
            log_dir,
            'iteration_{.updater.iteration}-align.png'.format(trainer)),
        # The original mixed %-placeholders with str.format(), which never
        # substituted them; use one consistent %-format instead.
        info='%s, %s, %s, iteration=%d, loss=%.5f' %
        (args.model, commit, time_string(), trainer.updater.iteration, loss))
    log('Input: %s' % textinput.to_string(input_seq))

def synthesize(self, path_in, path_re, mel_targets=None, reference_mel=None,
               alignment_path=None):
    wav_in = audio.load_wav(path_in)
    wav_re = audio.load_wav(path_re)
    mel_in = audio.melspectrogram(wav_in).astype(np.float32)
    mel_re = audio.melspectrogram(wav_re).astype(np.float32)
    feed_dict = {
        self.model.inputs: [mel_in.T],
        # melspectrogram returns (num_mels, frames); the input length is the
        # frame count, not len(mel_in), which is the number of mel bins.
        self.model.input_lengths: np.asarray([mel_in.shape[1]], dtype=np.int32),
        self.model.inputs_jp: [mel_re.T],
    }
    # if mel_targets is not None:
    #     mel_targets = np.expand_dims(mel_targets, 0)
    #     feed_dict.update({self.model.mel_targets: np.asarray(mel_targets, dtype=np.float32)})
    # if reference_mel is not None:
    #     reference_mel = np.expand_dims(reference_mel, 0)
    #     feed_dict.update({self.model.reference_mel: np.asarray(reference_mel, dtype=np.float32)})
    wav_out, alignments = self.session.run(
        [self.wav_output, self.alignments], feed_dict=feed_dict)
    wav = audio.inv_preemphasis(wav_out)
    end_point = audio.find_endpoint(wav)
    wav = wav[:end_point]
    now_time = datetime.datetime.now().strftime("%Y%m%d%H%M%S")  # current timestamp
    random_num = str(random.randint(0, 100)).zfill(2)  # zero-padded random suffix
    unique_num = now_time + random_num
    out_dir = "static\\out\\" + unique_num + ".wav"
    out_name = unique_num + ".wav"
    audio.save_wav(wav, out_dir)
    out = io.BytesIO()
    audio.save_wav(wav, out)
    # n_frame = int(end_point / (hparams.frame_shift_ms / 1000 * hparams.sample_rate)) + 1
    # plot.plot_alignment(alignments[:, :n_frame], alignment_path, info='%s' % (path))
    return out_dir, out_name

def synthesize(self, text):
    # print('synthesize:', text)
    cleaner_names = [x.strip() for x in hparams.cleaners.split(',')]
    # text = sentence_to_pinyin(text)
    # print('text:', text)
    # print('cleaner_names:', cleaner_names)
    seq = text_to_sequence_zh(text, cleaner_names)
    print(seq)
    feed_dict = {
        self.model.inputs: [np.asarray(seq, dtype=np.int32)],
        self.model.input_lengths: np.asarray([len(seq)], dtype=np.int32)
    }
    wav = self.session.run(self.wav_output, feed_dict=feed_dict)
    wav = audio.inv_preemphasis(wav)
    wav = wav[:audio.find_endpoint(wav)]
    out = io.BytesIO()
    audio.save_wav(wav, out)
    return out.getvalue()

def synthesize(self, text):
    cleaner_names = [x.strip() for x in hparams.cleaners.split(',')]
    # g2p = G2p()
    c_text = text.split('|')[0]
    p_text = text.split('|')[1]
    c_seq = text_to_sequence(c_text, cleaner_names)
    p_seq = text_to_sequence(p_text, cleaner_names)
    feed_dict = {
        self.model.c_inputs: [np.asarray(c_seq, dtype=np.int32)],
        self.model.p_inputs: [np.asarray(p_seq, dtype=np.int32)],
        self.model.c_input_lengths: np.asarray([len(c_seq)], dtype=np.int32),
        self.model.p_input_lengths: np.asarray([len(p_seq)], dtype=np.int32)
    }
    wav = self.session.run(self.wav_output, feed_dict=feed_dict)
    wav = audio.inv_preemphasis(wav)
    wav = wav[:audio.find_endpoint(wav)]
    out = io.BytesIO()
    audio.save_wav(wav, out)
    return out.getvalue()

def synthesize(self, text, title):
    cleaner_names = [x.strip() for x in hparams.cleaners.split(',')]
    seq = text_to_sequence(text, cleaner_names)
    feed_dict = {
        self.model.inputs: [np.asarray(seq, dtype=np.int32)],
        self.model.input_lengths: np.asarray([len(seq)], dtype=np.int32)
    }
    wav = self.session.run(self.wav_output, feed_dict=feed_dict)
    wav = audio.inv_preemphasis(wav)
    wav = wav[:audio.find_endpoint(wav)]
    out = io.BytesIO()
    audio.save_wav(wav, out)
    cwd = os.getcwd()
    audio_dir = cwd + "/narration/saved_audio/" + title + ".wav"
    print(audio_dir)
    with open(audio_dir, "wb") as f:
        f.write(out.getvalue())
    os.system("aplay " + audio_dir)
    return out.getvalue()

def synthesize(self, text):
    cleaner_names = [x.strip() for x in hparams.cleaners.split(',')]
    seq = text_to_sequence(text, cleaner_names)
    pprint('Text: ' + text)
    # pprint('Seq')
    # pprint(seq)
    feed_dict = {
        self.model.inputs: [np.asarray(seq, dtype=np.int32)],
        self.model.input_lengths: np.asarray([len(seq)], dtype=np.int32)
    }
    pprint(self.wav_output)
    pprint('>>> Getting wav')
    wav = self.session.run(self.wav_output, feed_dict=feed_dict)
    pprint('>>> Got wav')
    # wav = audio.inv_preemphasis(wav)
    # The audio is typically ~13 seconds unless truncated:
    # wav = wav[:audio.find_endpoint(wav)]
    out = io.BytesIO()
    audio.save_wav(wav, out)
    return out.getvalue()

def synthesize(self, text):
    with Synthesizer.mutex:
        if not Synthesizer.processing:
            Synthesizer.processing = True
            cleaner_names = [x.strip() for x in hparams.cleaners.split(',')]
            seq = text_to_sequence(text, cleaner_names)
            feed_dict = {
                self.model.inputs: [np.asarray(seq, dtype=np.int32)],
                self.model.input_lengths: np.asarray([len(seq)], dtype=np.int32)
            }
            wav = self.session.run(self.wav_output, feed_dict=feed_dict)
            wav = audio.inv_preemphasis(wav)
            wav = wav[:audio.find_endpoint(wav)]
            out = io.BytesIO()
            audio.save_wav(wav, out)
            Synthesizer.processing = False
            return out.getvalue()
        else:
            return None

def synthesize(self, text, mel_targets=None, reference_mel=None, alignment_path=None):
    cleaner_names = [x.strip() for x in hparams.cleaners.split(',')]
    seq = text_to_sequence(text, cleaner_names)
    feed_dict = {
        self.model.inputs: [np.asarray(seq, dtype=np.int32)],
        self.model.input_lengths: np.asarray([len(seq)], dtype=np.int32),
    }
    if mel_targets is not None:
        mel_targets = np.expand_dims(mel_targets, 0)
        feed_dict.update({
            self.model.mel_targets: np.asarray(mel_targets, dtype=np.float32)
        })
    if reference_mel is not None:
        reference_mel = np.expand_dims(reference_mel, 0)
        feed_dict.update({
            self.model.reference_mel: np.asarray(reference_mel, dtype=np.float32)
        })
    wav, alignments = self.session.run([self.wav_output, self.alignments],
                                       feed_dict=feed_dict)
    wav = audio.inv_preemphasis(wav)
    end_point = audio.find_endpoint(wav)
    wav = wav[:end_point]
    out = io.BytesIO()
    audio.save_wav(wav, out)
    n_frame = int(end_point / (hparams.frame_shift_ms / 1000 * hparams.sample_rate)) + 1
    text = '\n'.join(textwrap.wrap(text, 70, break_long_words=False))
    plot.plot_alignment(alignments[:, :n_frame], alignment_path, info='%s' % (text))
    return out.getvalue()

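# As a sanity check on the n_frame arithmetic above, which maps the trimmed
# waveform's endpoint (in samples) back to a spectrogram frame count. The
# 12.5 ms shift and 16 kHz rate below are illustrative values, not the
# project's actual hparams.
#
#     frame_shift_ms = 12.5
#     sample_rate = 16000
#     hop_samples = frame_shift_ms / 1000 * sample_rate  # 200 samples per frame
#     end_point = 48000                                  # endpoint found 3 s in
#     n_frame = int(end_point / hop_samples) + 1         # -> 241 frames
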
def synthesize(self, images_dir, output_wav_dir):
    for path, _, filenames in os.walk(images_dir):
        for i in trange(len(filenames)):
            test_file = filenames[i]
            if test_file.endswith('.png'):
                base_file_name, _ = os.path.splitext(test_file)
                raw_image = imread(os.path.join(path, test_file), mode='RGB')
                processed_image = imresize(raw_image, (224, 224, 3))
                feed_dict = {
                    self.model.inputs: [np.asarray(processed_image, dtype=np.float32)],
                }
                wav = self.session.run(self.wav_output, feed_dict=feed_dict)
                wav = audio.inv_preemphasis(wav)
                wav = wav[:audio.find_endpoint(wav)]
                audio_out_path = os.path.join(output_wav_dir,
                                              'eval-{}.wav'.format(base_file_name))
                audio.save_wav(wav, audio_out_path)
                print('Wav - {} generated successfully!'.format(audio_out_path))

def synthesize(self, lab_name):
    lab = np.load(lab_name)
    lab = np.expand_dims(lab, axis=0)
    feed_dict = {
        self.model.inputs: lab,
        self.model.input_lengths: np.asarray([lab.shape[1]], dtype=np.int32),
        # select the speaker by id (e.g. 0, 1, 2, ...)
        self.model.speaker_ids: np.asarray([2], dtype=np.int32)
    }
    wav, mel_outputs = self.session.run(
        [self.wav_output, self.model.mel_outputs[0]], feed_dict=feed_dict)
    wav = audio.inv_preemphasis(wav)
    end_point = audio.find_endpoint(wav)
    wav = wav[:end_point]
    # Trim the mel output to the frames covered by the trimmed waveform,
    # using the same sample-to-frame conversion as the other variants.
    n_frames = int(end_point / (hparams.frame_shift_ms / 1000 * hparams.sample_rate)) + 1
    mel_outputs = mel_outputs[:n_frames, :]
    out = io.BytesIO()
    audio.save_wav(wav, out)
    return out.getvalue(), mel_outputs

def synthesize(self, text, identity, path=None, path_align=None):
    cleaner_names = [x.strip() for x in hparams.cleaners.split(',')]
    seq = text_to_sequence2(text, cleaner_names)[:-1]
    print(seq)
    print(sequence_to_text2(seq))
    feed_dict = {
        self.model.inputs: [np.asarray(seq, dtype=np.int32)],
        self.model.input_lengths: np.asarray([len(seq)], dtype=np.int32),
        self.model.identities: np.asarray([identity], dtype=np.int32),
    }
    wav, alignment = self.session.run([self.wav_output, self.alignment],
                                      feed_dict=feed_dict)
    if path_align is not None:
        plot.plot_alignment(alignment, path_align)
    wav = audio.inv_preemphasis(wav)
    # wav = wav[:audio.find_endpoint(wav)]
    out = io.BytesIO()
    # Also write to the in-memory buffer so the returned bytes are non-empty.
    audio.save_wav(wav, out)
    if path is not None:
        audio.save_wav(wav, path)
    else:
        audio.save_wav(wav, './1.wav')
    return out.getvalue()

def synthesize(self, text, return_wav=False):
    cleaner_names = [x.strip() for x in hparams.cleaners.split(',')]
    seq = text_to_sequence(text, cleaner_names)
    feed_dict = {
        self.model.inputs: [np.asarray(seq, dtype=np.int32)],
        self.model.input_lengths: np.asarray([len(seq)], dtype=np.int32)
    }
    wav, alignment = self.session.run([self.wav_output, self.alignment],
                                      feed_dict=feed_dict)
    audio_endpoint = audio.find_endpoint(wav)
    alignment_endpoint = find_alignment_endpoint(alignment.shape,
                                                 audio_endpoint / len(wav))
    wav = wav[:audio_endpoint]
    alignment = alignment[:, :alignment_endpoint]
    if return_wav:
        return wav, alignment
    out = io.BytesIO()
    audio.save_wav(wav, out)
    return out.getvalue(), alignment

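# A hedged usage sketch for the return_wav flag above; `synth` and
# hparams.sample_rate come from the surrounding project and are assumptions
# here, not part of the original snippet.
#
#     wav, alignment = synth.synthesize('Hello, world.', return_wav=True)
#     print('samples: %d (%.2f s)' % (len(wav), len(wav) / hparams.sample_rate))
#     print('alignment shape: %s' % (alignment.shape,))  # (encoder_steps, decoder_frames)
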
def synthesize(self, text1, text2):
    seq1 = textinput_fr.to_sequence(
        text1,
        force_lowercase=hparams.force_lowercase,
        expand_abbreviations=hparams.expand_abbreviations)
    seq2 = textinput_fr.to_sequence(
        text2,
        force_lowercase=hparams.force_lowercase,
        expand_abbreviations=False)
    feed_dict = {
        self.model.inputs1: [np.asarray(seq1, dtype=np.int32)],
        self.model.input_lengths1: np.asarray([len(seq1)], dtype=np.int32),
        self.model.inputs2: [np.asarray(seq2, dtype=np.int32)],
        self.model.input_lengths2: np.asarray([len(seq2)], dtype=np.int32)
    }
    spec, alignments1, alignments2 = self.session.run(
        [self.model.linear_outputs[0], self.model.alignments1[0],
         self.model.alignments2[0]],
        feed_dict=feed_dict)
    out = io.BytesIO()
    audio.save_wav(audio.inv_spectrogram(spec.T), out)
    return out.getvalue(), alignments1, alignments2

def synthesize(self, text, reference_mel=None, gst_index=None, gst_scale=None):
    cleaner_names = [x.strip() for x in hparams.cleaners.split(',')]
    seq = text_to_sequence(text, cleaner_names)
    feed_dict = {
        self.model.inputs: [np.asarray(seq, dtype=np.int32)],
        self.model.input_lengths: np.asarray([len(seq)], dtype=np.int32)
    }
    if reference_mel is not None:
        reference_mel = np.expand_dims(reference_mel, 0)
        feed_dict.update({
            self.model.reference_mel: np.asarray(reference_mel, dtype=np.float32)
        })
    wav = self.session.run(self.wav_output, feed_dict=feed_dict)
    wav = audio.inv_preemphasis(wav)
    wav = wav[:audio.find_endpoint(wav)]
    out = io.BytesIO()
    audio.save_wav(wav, out)
    return out.getvalue()

def train(log_dir, input_path, checkpoint_path, is_restore):
    # Log the run info
    log('Checkpoint path: %s' % checkpoint_path)
    log('Loading training data from: %s' % input_path)
    log(hparams_debug_string())

    # Set up DataFeeder:
    coord = tf.train.Coordinator()
    with tf.variable_scope('datafeeder') as scope:
        feeder = DataFeeder(coord, input_path, hparams)

    # Set up model:
    global_step = tf.Variable(0, name='global_step', trainable=False)
    with tf.variable_scope('model') as scope:
        model = create_model('tacotron', hparams)
        model.initialize(feeder.inputs, feeder.input_lengths,
                         feeder.mel_targets, feeder.linear_targets)
        model.add_loss()
        model.add_optimizer(global_step)
        stats = add_stats(model)

    # Bookkeeping:
    step = 0
    saver = tf.train.Saver(max_to_keep=5, keep_checkpoint_every_n_hours=2)

    # Train!
    with tf.Session() as sess:
        try:
            summary_writer = tf.summary.FileWriter(log_dir, sess.graph)
            sess.run(tf.global_variables_initializer())

            if is_restore:
                # Restore from a checkpoint if the user requested it.
                saver.restore(sess, checkpoint_path)
                log('Resuming from checkpoint')
            else:
                log('Starting new training')

            feeder.start_in_session(sess)

            while not coord.should_stop():
                start_time = time.time()
                step, loss, opt = sess.run([global_step, model.loss, model.optimize])
                time_interval = time.time() - start_time
                # The original passed (step, loss, time_interval), which swapped
                # the seconds and loss fields; the argument order is fixed here.
                log('Step %d, %.03f sec, loss=%.05f' % (step, time_interval, loss))

                if loss > 100 or math.isnan(loss):
                    log('Loss exploded to %.05f at step %d!' % (loss, step), slack=True)
                    raise Exception('Loss Exploded')

                if step % hparams.summary_interval == 0:
                    log('Writing summary at step: %d' % step)
                    summary_writer.add_summary(sess.run(stats), step)

                if step % hparams.checkpoint_interval == 0:
                    log('Saving checkpoint to: %s-%d' % (checkpoint_path, step))
                    saver.save(sess, checkpoint_path, global_step=step)
                    log('Saving audio and alignment...')
                    input_seq, spectrogram, alignment = sess.run([
                        model.inputs[0], model.linear_outputs[0], model.alignments[0]
                    ])
                    waveform = audio.inv_spectrogram(spectrogram.T)
                    audio.save_wav(waveform,
                                   os.path.join(log_dir, 'step-%d-audio.wav' % step))
                    plot.plot_alignment(
                        alignment,
                        os.path.join(log_dir, 'step-%d-align.png' % step),
                        info='%s, %s, step=%d, loss=%.5f' %
                        ('tacotron', time_string(), step, loss))
                    log('Input: %s' % sequence_to_text(input_seq))

        except Exception as e:
            log('Exiting due to exception: %s' % e, slack=True)
            coord.request_stop(e)

def train(log_dir, args):
    run_name = args.name or args.model
    log_dir = os.path.join(args.base_dir, 'logs-%s' % run_name)
    os.makedirs(log_dir, exist_ok=True)
    infolog.init(os.path.join(log_dir, 'train.log'), run_name, args.slack_url)
    checkpoint_path = os.path.join(log_dir, 'model.ckpt')

    with open(args.input, encoding='utf-8') as f:
        metadata = [row.strip().split('|') for row in f]
    metadata = sorted(metadata, key=lambda x: x[2])
    data_element = get_dataset(metadata, args.data_dir, hparams)

    global_step = tf.Variable(0, name='global_step', trainable=False)
    with tf.variable_scope('model') as scope:
        model = create_model(args.model, hparams)
        model.initialize(data_element['input'], data_element['input_lengths'],
                         data_element['mel_targets'], data_element['linear_targets'])
        model.add_loss()
        model.add_optimizer(global_step)

    saver = tf.train.Saver(max_to_keep=5, keep_checkpoint_every_n_hours=2)
    sess = tf.Session()
    sess.run(tf.global_variables_initializer())

    for _ in range(int(args.max_iter)):
        start_time = time.time()
        step, mel_loss, lin_loss, loss, opt = sess.run([
            global_step, model.mel_loss, model.linear_loss, model.loss,
            model.optimize
        ])
        end_time = time.time()
        message = 'Step %7d [%.03f sec/step, loss = %.05f (mel : %.05f + lin : %.05f)]' % (
            step, end_time - start_time, loss, mel_loss, lin_loss)
        log(message)

        if loss > 100 or math.isnan(loss):
            log('Loss exploded to %.05f at step %d!' % (loss, step))
            raise Exception('Loss Exploded')

        if step % args.checkpoint_interval == 0:
            log('Saving checkpoint to: %s-%d' % (checkpoint_path, step))
            saver.save(sess, checkpoint_path, global_step=step)
            log('Saving audio and alignment...')
            input_seq, spectrogram, alignment = sess.run([
                model.inputs[0], model.linear_outputs[0], model.alignments[0]
            ])
            waveform = audio.inv_spectrogram(spectrogram.T)
            audio.save_wav(waveform, os.path.join(log_dir, 'step-%d-audio.wav' % step))
            plot.plot_alignment(alignment,
                                os.path.join(log_dir, 'step-%d-align.png' % step),
                                info='%s, %s, step=%d, loss=%.5f' %
                                (args.model, time_string(), step, loss))
            log('Input: %s' % sequence_to_text(input_seq))

def train(log_dir, args):
    commit = get_git_commit() if args.git else 'None'
    checkpoint_path = os.path.join(log_dir, 'model.ckpt')
    input_path = os.path.join(args.base_dir, args.input)
    log('Checkpoint path: %s' % checkpoint_path)
    log('Loading training data from: %s' % input_path)
    log('Using model: %s' % args.model)
    log(hparams_debug_string())

    # Set up DataFeeder:
    coord = tf.train.Coordinator()
    with tf.variable_scope('datafeeder') as scope:
        feeder = DataFeeder(coord, input_path, hparams)

    # Set up model:
    global_step = tf.Variable(0, name='global_step', trainable=False)
    with tf.variable_scope('model') as scope:
        model = create_model(args.model, hparams)
        model.initialize(feeder.inputs, feeder.input_lengths,
                         feeder.mel_targets, feeder.linear_targets)
        model.add_loss()
        model.add_optimizer(global_step)
        stats = add_stats(model)

    # Bookkeeping:
    step = 0
    time_window = ValueWindow(100)
    loss_window = ValueWindow(100)
    saver = tf.train.Saver(max_to_keep=5, keep_checkpoint_every_n_hours=2)

    # Train!
    with tf.Session() as sess:
        try:
            summary_writer = tf.summary.FileWriter(log_dir, sess.graph)
            sess.run(tf.global_variables_initializer())

            if args.restore_step:
                # Restore from a checkpoint if the user requested it.
                restore_path = '%s-%d' % (checkpoint_path, args.restore_step)
                saver.restore(sess, restore_path)
                log('Resuming from checkpoint: %s at commit: %s' %
                    (restore_path, commit), slack=True)
            else:
                log('Starting new training run at commit: %s' % commit, slack=True)

            feeder.start_in_session(sess)

            while not coord.should_stop():
                start_time = time.time()
                step, loss, opt = sess.run([global_step, model.loss, model.optimize])
                time_window.append(time.time() - start_time)
                loss_window.append(loss)
                message = 'Step %-7d [%.03f sec/step, loss=%.05f, avg_loss=%.05f]' % (
                    step, time_window.average, loss, loss_window.average)
                log(message, slack=(step % args.checkpoint_interval == 0))

                if loss > 100 or math.isnan(loss):
                    log('Loss exploded to %.05f at step %d!' % (loss, step), slack=True)
                    raise Exception('Loss Exploded')

                if step % args.summary_interval == 0:
                    log('Writing summary at step: %d' % step)
                    summary_writer.add_summary(sess.run(stats), step)

                if step % args.checkpoint_interval == 0:
                    log('Saving checkpoint to: %s-%d' % (checkpoint_path, step))
                    saver.save(sess, checkpoint_path, global_step=step)
                    log('Saving audio and alignment...')
                    input_seq, spectrogram, alignment = sess.run([
                        model.inputs[0], model.linear_outputs[0], model.alignments[0]])
                    waveform = audio.inv_spectrogram(spectrogram.T)
                    audio.save_wav(waveform,
                                   os.path.join(log_dir, 'step-%d-audio.wav' % step))
                    plot.plot_alignment(
                        alignment,
                        os.path.join(log_dir, 'step-%d-align.png' % step),
                        info='%s, %s, %s, step=%d, loss=%.5f' %
                        (args.model, commit, time_string(), step, loss))
                    log('Input: %s' % sequence_to_text(input_seq))

        except Exception as e:
            log('Exiting due to exception: %s' % e, slack=True)
            traceback.print_exc()
            coord.request_stop(e)

def train(log_dir, args, trans_ckpt_dir=None):
    commit = get_git_commit() if args.git else 'None'
    checkpoint_path = os.path.join(log_dir, 'model.ckpt')
    if trans_ckpt_dir is not None:
        trans_checkpoint_path = os.path.join(trans_ckpt_dir, 'model.ckpt')
    else:
        # Without a transfer directory, restore from this run's own checkpoints
        # (the original referenced trans_checkpoint_path unconditionally, which
        # raised a NameError when trans_ckpt_dir was None).
        trans_checkpoint_path = checkpoint_path
    input_path = os.path.join(args.base_dir, args.input)
    log('Checkpoint path: %s' % trans_checkpoint_path)
    log('Loading training data from: %s' % input_path)
    log('Using model: %s' % args.model)
    log(hparams_debug_string())

    # Set up DataFeeder:
    coord = tf.train.Coordinator()
    with tf.variable_scope('datafeeder') as scope:
        feeder = DataFeeder(coord, input_path, hparams)

    # Set up model:
    global_step = tf.Variable(0, name='global_step', trainable=False)
    with tf.variable_scope('model') as scope:
        model = create_model(args.model, hparams)
        model.initialize(feeder.inputs, feeder.input_lengths,
                         feeder.mel_targets, feeder.linear_targets)
        model.add_loss()
        model.add_optimizer(global_step)
        stats = add_stats(model)

    # Bookkeeping:
    step = 0
    time_window = ValueWindow(100)
    loss_window = ValueWindow(100)
    saver = tf.train.Saver(max_to_keep=5, keep_checkpoint_every_n_hours=2)

    # Train!
    with tf.Session() as sess:
        try:
            summary_writer = tf.summary.FileWriter(log_dir, sess.graph)
            sess.run(tf.global_variables_initializer())

            if args.restore_step:
                # Restore from a (possibly transferred) checkpoint if requested.
                restore_path = '%s-%d' % (trans_checkpoint_path, args.restore_step)
                saver.restore(sess, restore_path)
                log('Resuming from checkpoint: %s at commit: %s' %
                    (restore_path, commit), slack=True)
            else:
                log('Starting new training run at commit: %s' % commit, slack=True)

            feeder.start_in_session(sess)

            while not coord.should_stop():
                start_time = time.time()
                step, loss, opt = sess.run([global_step, model.loss, model.optimize])
                time_window.append(time.time() - start_time)
                loss_window.append(loss)
                message = 'Step %-7d [%.03f sec/step, loss=%.05f, avg_loss=%.05f]' % (
                    step, time_window.average, loss, loss_window.average)
                log(message, slack=(step % args.checkpoint_interval == 0))

                if loss > 100 or math.isnan(loss):
                    log('Loss exploded to %.05f at step %d!' % (loss, step), slack=True)
                    raise Exception('Loss Exploded')

                if step % args.summary_interval == 0:
                    log('Writing summary at step: %d' % step)
                    summary_writer.add_summary(sess.run(stats), step)

                if step % args.checkpoint_interval == 0:
                    log('Saving checkpoint to: %s-%d' % (checkpoint_path, step))
                    saver.save(sess, checkpoint_path, global_step=step)
                    log('Saving audio and alignment...')
                    input_seq, spectrogram, alignment = sess.run([
                        model.inputs[0], model.linear_outputs[0], model.alignments[0]
                    ])
                    waveform = audio.inv_spectrogram(spectrogram.T)
                    audio.save_wav(waveform,
                                   os.path.join(log_dir, 'step-%d-audio.wav' % step))
                    plot.plot_alignment(
                        alignment,
                        os.path.join(log_dir, 'step-%d-align.png' % step),
                        info='%s, %s, %s, step=%d, loss=%.5f' %
                        (args.model, commit, time_string(), step, loss))
                    log('Input: %s' % sequence_to_text(input_seq))

        except Exception as e:
            log('Exiting due to exception: %s' % e, slack=True)
            traceback.print_exc()
            coord.request_stop(e)