class Synthesizer:
    """Restores a trained Tacotron checkpoint and synthesizes speech from Chinese text.

    Call load() once to build the inference graph and restore weights, then
    synthesize() per utterance to get WAV-encoded bytes.
    """

    def load(self, checkpoint_path, model_name='tacotron'):
        """Build the inference graph and restore weights from checkpoint_path.

        Args:
            checkpoint_path: path passed to tf.train.Saver.restore.
            model_name: informational only; the graph always builds Tacotron(hparams).
        """
        inputs = tf.placeholder(tf.int32, [1, None], 'inputs')
        input_lengths = tf.placeholder(tf.int32, [1], 'input_lengths')
        with tf.variable_scope('model'):  # unused `as scope` binding removed
            self.model = Tacotron(hparams)
            self.model.initialize(inputs, input_lengths)
            # Invert the first (only) batch item's linear spectrogram to a waveform op.
            self.wav_output = audio.inv_spectrogram_tensorflow(
                self.model.linear_outputs[0])

        # Restore the previously trained model.
        self.session = tf.Session()
        self.session.run(tf.global_variables_initializer())
        saver = tf.train.Saver()
        saver.restore(self.session, checkpoint_path)

    def synthesize(self, text):
        """Synthesize `text` (Chinese) and return the audio as WAV-encoded bytes."""
        # Convert Chinese characters to pinyin with numeric tone marks.
        text = Pinyin().get_pinyin(text, " ", tone_marks='numbers')
        cleaner_names = [x.strip() for x in hparams.cleaners.split(',')]
        # Pinyin text -> integer id sequence.
        seq = text_to_sequence(text, cleaner_names)
        feed_dict = {
            self.model.inputs: [np.asarray(seq, dtype=np.int32)],
            self.model.input_lengths: np.asarray([len(seq)], dtype=np.int32),
        }
        wav = self.session.run(self.wav_output, feed_dict=feed_dict)
        wav = audio.inv_preemphasis(wav)
        out = io.BytesIO()
        audio.save_wav(wav, out)
        return out.getvalue()
def model_fn(features, labels, mode=tf.estimator.ModeKeys.TRAIN, params=None, config=None):
    """tf.estimator model function for Tacotron.

    Args:
        features: dict with 'inputs' (token ids) and 'lengths' tensors.
        labels: dict with 'mel_targets' and 'linear_targets'; only read in TRAIN mode.
        mode: a tf.estimator.ModeKeys value.
        params: hyperparameter object forwarded to Tacotron().
        config: unused (estimator API requirement).

    Returns:
        tf.estimator.EstimatorSpec with predicted waveforms as `predictions`;
        loss/train_op are attached only when TRAIN mode built them.
    """
    inputs = features['inputs']
    lengths = features['lengths']
    mel_targets = None
    linear_targets = None
    train_hooks = []
    global_step = tf.train.get_global_step()
    is_training = mode == tf.estimator.ModeKeys.TRAIN

    if is_training:
        mel_targets = labels['mel_targets']
        linear_targets = labels['linear_targets']

    with tf.variable_scope('model'):
        model = Tacotron(params)
        model.initialize(inputs, lengths, mel_targets, linear_targets)
        if is_training:
            model.add_loss()
            model.add_optimizer(global_step)

    # Invert each predicted linear spectrogram in the batch back to a waveform.
    outputs = tf.map_fn(inv_spectrogram_tensorflow, model.linear_outputs)

    if is_training:
        with tf.variable_scope('stats'):
            tf.summary.histogram('linear_outputs', model.linear_outputs)
            tf.summary.histogram('linear_targets', model.linear_targets)
            tf.summary.histogram('mel_outputs', model.mel_outputs)
            tf.summary.histogram('mel_targets', model.mel_targets)
            tf.summary.scalar('loss_mel', model.mel_loss)
            tf.summary.scalar('loss_linear', model.linear_loss)
            tf.summary.scalar('learning_rate', model.learning_rate)
            tf.summary.scalar('loss', model.loss)
            gradient_norms = [tf.norm(grad) for grad in model.gradients]
            tf.summary.histogram('gradient_norm', gradient_norms)
            tf.summary.scalar('max_gradient_norm', tf.reduce_max(gradient_norms))
            # NOTE(review): reads module-level `hparams`, not `params` — confirm
            # both carry the same sample_rate before unifying.
            tf.summary.audio('outputs', outputs, hparams.sample_rate, max_outputs=1)
        # Estimator merges summaries itself; this call is kept for parity with
        # the original graph but its return value is intentionally unused.
        tf.summary.merge_all()

    # getattr(...) yields None for loss/train_op outside TRAIN mode, where
    # add_loss/add_optimizer never ran.
    return tf.estimator.EstimatorSpec(
        mode,
        predictions=outputs,
        loss=getattr(model, 'loss', None),
        train_op=getattr(model, 'optimize', None),
        eval_metric_ops=None,
        export_outputs=None,
        training_chief_hooks=None,
        training_hooks=train_hooks,
        scaffold=None,
        evaluation_hooks=None,
        prediction_hooks=None)
class Synthesizer:
    # Inference wrapper for a step-decoded Tacotron: restores a checkpoint and
    # autoregressively generates mel frames in groups of 5 per session run
    # (NOTE(review): reduction factor of 5 inferred from the 5*i indexing in
    # synthesize() — confirm against the model definition).

    def load(self, step):
        """Build the inference graph and restore the checkpoint saved at training `step`."""
        enc_input = tf.placeholder(tf.int32, [1, None])
        sequence_length = tf.placeholder(tf.int32, [1])
        # Decoder input: (batch=1, frames, mel_dim) mel frames fed back each step.
        dec_input = tf.placeholder(tf.float32, [1, None, mel_dim])
        self.model = Tacotron()
        self.model.initialize(enc_input, sequence_length, dec_input)
        # Keep batch-item-0 views of the model tensors for fetching at synthesis time.
        self.enc_input = self.model.enc_input[0]
        self.sequence_length = self.model.sequence_length[0]
        self.mel_output = self.model.mel_output[0]
        self.alignment = self.model.alignment[0]
        self.session = tf.Session()
        self.session.run(tf.global_variables_initializer())
        saver = tf.train.Saver()
        saver.restore(
            self.session,
            os.path.join(checkpoint_dir, 'model.ckpt-{}'.format(step)))

    def synthesize(self, args, text, idx):
        """Decode `text` to a mel spectrogram, saving mel-{idx}.npy and align-{idx}.png
        under args.save_dir.
        """
        seq = text_to_sequence(text)
        # Up to 200 decoder input frames; frame 0 stays all-zero as the <GO> frame.
        dec_input = np.zeros((1, 200, mel_dim), dtype='float32')
        pred = []
        # Each iteration re-runs the full graph; iteration i produces frames
        # 5*(i-1) .. 5*i-1 of the output.
        for i in range(1, 201):
            mel_out, alignment = self.session.run(
                [self.mel_output, self.alignment],
                feed_dict={
                    self.model.enc_input: [np.asarray(seq, dtype=np.int32)],
                    self.model.sequence_length:
                        np.asarray([len(seq)], dtype=np.int32),
                    self.model.dec_input: dec_input
                })
            if i < 200:
                # Feed the last frame of this 5-frame group back as decoder input i.
                dec_input[:, i, :] = mel_out[5 * i - 1, :]
            # Accumulate the 5 newly generated frames.
            pred.extend(mel_out[5 * (i - 1):5 * i, :])
        np.save(os.path.join(args.save_dir, 'mel-{}'.format(idx)),
                pred, allow_pickle=False)
        input_seq = sequence_to_text(seq)
        alignment_dir = os.path.join(args.save_dir, 'align-{}.png'.format(idx))
        plot_alignment(alignment, alignment_dir, input_seq)
def train(args):
    """Run the Tacotron training loop.

    Checkpoints every `checkpoint_step` steps into ./checkpoint/1 and plots the
    current alignment alongside each checkpoint. Resumes from `args.step` when set.
    """
    save_dir = './checkpoint/1'
    checkpoint_path = os.path.join(save_dir, 'model.ckpt')

    coord = tf.train.Coordinator()
    feeder = DataFeeder(coord, mode=1)

    model = Tacotron()
    model.initialize(feeder.enc_input, feeder.sequence_length,
                     feeder.dec_input, feeder.mel_target)

    saver = tf.train.Saver(max_to_keep=5, keep_checkpoint_every_n_hours=2)

    with tf.Session() as sess:
        try:
            sess.run(tf.global_variables_initializer())

            # Optionally resume from an earlier checkpoint step.
            if args.step:
                saver.restore(sess, '{}-{}'.format(checkpoint_path, args.step))

            feeder.start_in_session(sess)

            while not coord.should_stop():
                # Fetching model.optimize runs the weight update; its value is unused.
                step, loss, _ = sess.run(
                    [model.global_step, model.loss, model.optimize])
                print('Step: {}, Loss: {:.5f}'.format(step, loss))

                if step % checkpoint_step == 0:
                    saver.save(sess, checkpoint_path, global_step=step)
                    # Only the input sequence and alignment are used for the plot;
                    # the original also fetched mel_output/mel_target but discarded them.
                    input_seq, alignment = sess.run(
                        [model.enc_input[0], model.alignment[0]])
                    input_seq = sequence_to_text(input_seq)
                    alignment_dir = os.path.join(
                        save_dir, 'step-{}-align.png'.format(step))
                    plot_alignment(alignment, alignment_dir, input_seq)
        except Exception as e:
            traceback.print_exc()
            coord.request_stop(e)
class Synthesizer:
    """Restores a trained Tacotron checkpoint and renders text to WAV bytes.

    load() builds the inference graph and restores weights; synthesize()
    returns one utterance as WAV-encoded bytes.
    """

    def load(self, checkpoint_path, model_name='tacotron'):
        """Build the inference graph and restore weights from checkpoint_path.

        Args:
            checkpoint_path: path passed to tf.train.Saver.restore.
            model_name: informational only; used in the progress message.
        """
        print('Constructing model: %s' % model_name)
        inputs = tf.placeholder(tf.int32, [1, None], 'inputs')
        input_lengths = tf.placeholder(tf.int32, [1], 'input_lengths')
        with tf.variable_scope('model'):
            self.model = Tacotron(hparams)
            self.model.initialize(inputs, input_lengths)
            # Debug tracing of the output tensor (typo 'Ouputs' fixed).
            pprint('>>> Model Linear Outputs:')
            pprint(self.model.linear_outputs[0])
            self.wav_output = audio.inv_spectrogram_tensorflow(
                self.model.linear_outputs[0])

        print('Loading checkpoint: %s' % checkpoint_path)
        self.session = tf.Session()
        self.session.run(tf.global_variables_initializer())
        saver = tf.train.Saver()
        saver.restore(self.session, checkpoint_path)

    def synthesize(self, text):
        """Synthesize `text` and return the audio as WAV-encoded bytes.

        NOTE: inv_preemphasis and endpoint trimming are intentionally disabled
        in this variant (they were commented out in the original).
        """
        cleaner_names = [x.strip() for x in hparams.cleaners.split(',')]
        seq = text_to_sequence(text, cleaner_names)
        pprint('Text: ' + text)
        feed_dict = {
            self.model.inputs: [np.asarray(seq, dtype=np.int32)],
            self.model.input_lengths: np.asarray([len(seq)], dtype=np.int32)
        }
        pprint(self.wav_output)
        pprint('>>> Getting wav')
        wav = self.session.run(self.wav_output, feed_dict=feed_dict)
        pprint('>>> Gotten wav')
        out = io.BytesIO()
        audio.save_wav(wav, out)
        return out.getvalue()
def train(log_dir, args):
    """Tacotron training loop driven by a queue-based DataFeeder.

    Logs step timing and a 100-step moving-average loss, writes summaries every
    args.summary_interval steps, and every args.checkpoint_interval steps saves
    a checkpoint plus a sample waveform and alignment plot under log_dir.
    Any exception stops the coordinator and is re-reported through it.
    """
    checkpoint_path = os.path.join(log_dir, 'model.ckpt')
    input_path = os.path.join(args.base_dir, args.input)
    # Log the checkpoint and training-data paths.
    log('Checkpoint path: %s' % checkpoint_path)
    log('Loading training data from: %s' % input_path)
    coord = tf.train.Coordinator()
    with tf.variable_scope('datafeeder') as scope:
        feeder = DataFeeder(coord, input_path, hparams)
    # Initialize the model.
    global_step = tf.Variable(0, name='global_step', trainable=False)
    with tf.variable_scope('model') as scope:
        model = Tacotron(hparams)
        model.initialize(feeder.inputs, feeder.input_lengths,
                         feeder.mel_targets, feeder.linear_targets,
                         feeder.stop_token_targets, global_step)
        model.add_loss()
        model.add_optimizer(global_step)
        stats = add_stats(model)
    step = 0
    # Moving averages over the last 100 steps for timing and loss.
    time_window = ValueWindow(100)
    loss_window = ValueWindow(100)
    saver = tf.train.Saver(max_to_keep=1)
    # Start training.
    with tf.Session() as sess:
        try:
            summary_writer = tf.summary.FileWriter(log_dir, sess.graph)
            sess.run(tf.global_variables_initializer())
            feeder.start_in_session(sess)
            while not coord.should_stop():
                start_time = time.time()
                # Fetching model.optimize performs one weight update.
                step, loss, opt = sess.run(
                    [global_step, model.loss, model.optimize])
                time_window.append(time.time() - start_time)
                loss_window.append(loss)
                message = 'Step %-7d [%.03f sec/step, loss=%.05f, avg_loss=%.05f]' % (
                    step, time_window.average, loss, loss_window.average)
                log(message, slack=(step % args.checkpoint_interval == 0))
                if step % args.summary_interval == 0:
                    summary_writer.add_summary(sess.run(stats), step)
                # Save a checkpoint every args.checkpoint_interval steps.
                if step % args.checkpoint_interval == 0:
                    log('Saving checkpoint to: %s-%d' % (checkpoint_path, step))
                    saver.save(sess, checkpoint_path, global_step=step)
                    log('Saving audio and alignment...')
                    input_seq, spectrogram, alignment = sess.run([
                        model.inputs[0], model.linear_outputs[0],
                        model.alignments[0]
                    ])
                    waveform = audio.inv_spectrogram(spectrogram.T)
                    # Synthesize a sample clip.
                    audio.save_wav(
                        waveform,
                        os.path.join(log_dir, 'step-%d-audio.wav' % step))
                    time_string = datetime.now().strftime('%Y-%m-%d %H:%M')
                    # Plot the encoder-decoder alignment.
                    infolog.plot_alignment(
                        alignment,
                        os.path.join(log_dir, 'step-%d-align.png' % step),
                        info='%s, %s, step=%d, loss=%.5f' % (
                            args.model, time_string, step, loss))
                    # Log the input text of the synthesized sample.
                    log('Input: %s' % sequence_to_text(input_seq))
        except Exception as e:
            log('Exiting due to exception: %s' % e, slack=True)
            traceback.print_exc()
            coord.request_stop(e)