def _eval_tgt(synth, args, checkpoint_path, output_dir, hparams, sentences,
              flag_to_wav, checkpoint_eal):
    """Synthesize ``sentences`` and write one output file per result.

    Outputs go to ``<output_dir>/eval/wav`` when ``flag_to_wav`` is true and
    to ``<output_dir>/eval/npy`` otherwise; the directory is created if
    needed.

    Args:
        synth: object exposing ``synthesize(sentences, to_wav=..., mean_norm=...,
            std_norm=..., spec_type=...)`` and returning one item per output.
        args: namespace providing ``base_dir``, ``training_dir`` and
            ``variant``.
        checkpoint_path: unused here; kept for interface compatibility.
        output_dir: root directory under which ``eval/`` is created.
        hparams: hyper-parameters providing ``spec_type`` and ``sample_rate``.
        sentences: inputs handed unchanged to ``synth.synthesize``.
        flag_to_wav: write waveforms when true, feature matrices otherwise.
        checkpoint_eal: unused here; kept for interface compatibility.

    Returns:
        None. The function is called for its side effects (files + prints).
    """
    synth_dir = os.path.join(output_dir, 'eval',
                             'wav' if flag_to_wav else 'npy')
    os.makedirs(synth_dir, exist_ok=True)

    # Set up denormalisation parameters for synthesis
    mean_path = os.path.abspath(
        os.path.join(args.base_dir, args.training_dir, 'pml_data/mean.dat'))
    std_path = os.path.abspath(
        os.path.join(args.base_dir, args.training_dir, 'pml_data/std.dat'))
    mean_norm = None
    std_norm = None

    if os.path.isfile(mean_path) and os.path.isfile(std_path):
        mean_norm = np.fromfile(mean_path, 'float32')
        std_norm = np.fromfile(std_path, 'float32')
    else:
        # Synthesis still runs, but without denormalisation statistics.
        warnings.warn(
            'No mean or standard deviation files found at locations {} and {}'.
            format(mean_path, std_path))

    print('Synthesizing to {}...'.format(synth_dir))

    if flag_to_wav:
        wavs = synth.synthesize(sentences, to_wav=True, mean_norm=mean_norm,
                                std_norm=std_norm,
                                spec_type=hparams.spec_type)

        for i, wav in enumerate(wavs):
            path = os.path.join(synth_dir, 'eval-%d.wav' % i)
            print('Writing {}...'.format(path))
            if args.variant not in ['tacotron_orig', 'tacotron_bk2orig']:
                sp.wavwrite(path, wav, hparams.sample_rate,
                            norm_max_ifneeded=True, verbose=0)
            else:
                # For these variants the synthesizer output is written to
                # disk verbatim as the file content.
                with open(path, 'wb') as f:
                    f.write(wav)
    else:
        # NOTE: a leftover ``pdb.set_trace()`` used to sit here; it blocked
        # every non-interactive run and has been removed.
        tgt_features_matrix = synth.synthesize(sentences, to_wav=False,
                                               mean_norm=mean_norm,
                                               std_norm=std_norm,
                                               spec_type=hparams.spec_type)

        # NOTE(review): hard-coded, site-specific file-id list and offset;
        # the first outputs are named after entries 13050..13099 of it.
        name_list = get_file_list(
            '/home/dawna/tts/qd212/data/lj/merlinData/file_id_list.scp'
        )[13050:13050 + 50]

        for i, f in enumerate(tgt_features_matrix):
            # The slice holds at most 50 names but may hold fewer; fall back
            # to the generic name instead of raising IndexError.
            if i < len(name_list):
                path = os.path.join(synth_dir, '%s.npy' % name_list[i])
            else:
                path = os.path.join(synth_dir, 'eval-%d.npy' % i)
            print('Writing {}...'.format(path))
            np.save(path, f, allow_pickle=False)
#!/usr/bin/python
'''
Copyright(C) 2016 Engineering Department, University of Cambridge, UK.

License
   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at
     http://www.apache.org/licenses/LICENSE-2.0
   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.

Author
    Gilles Degottex <*****@*****.**>
'''

import sys

# Imported explicitly so that the name ``lib`` is bound and the
# ``interfaces`` submodule is loaded; ``from lib import sigproc as sp`` alone
# leaves ``lib.sigproc.interfaces`` unresolved (NameError on ``lib``).
import lib.sigproc.interfaces
from lib import sigproc as sp

if __name__ == "__main__":
    # Normalise the level of the wav file given as first argument, in place.
    if len(sys.argv) < 2:
        sys.exit('Usage: {} <file.wav>'.format(sys.argv[0]))

    print('Normalise {}'.format(sys.argv[1]))

    wav, fs, enc = sp.wavread(sys.argv[1])
    # ``meta`` is returned by sv56demo but not needed here.
    wav, meta = lib.sigproc.interfaces.sv56demo(wav, fs)
    # Overwrite the input file, keeping its sampling rate and encoding.
    sp.wavwrite(sys.argv[1], wav, fs, enc)
def _pdd_to_noise_mask(PDD, thresh):
    '''
    Binarise a copy of PDD around thresh: values below become 0.0, values
    above become 1.0 (values exactly equal to thresh are left untouched).
    '''
    NM = PDD.copy()
    NM[PDD < thresh] = 0.0
    NM[PDD > thresh] = 1.0
    return NM


def synthesizef(fs, shift=0.005, dftlen=4096, ff0=None, flf0=None, fspec=None,
                flspec=None, ffwlspec=None, ffwcep=None, fmcep=None, fpdd=None,
                fmpdd=None, fnm=None, ffwnm=None, nm_cont=False, fsyn=None,
                verbose=1):
    '''
    Call the synthesis from python using file inputs and outputs

    All input files are read as raw float32 and, except for f0, reshaped to
    one row per f0 frame. When several alternatives of the same kind are
    given, the last one in the order below wins.

    fs       : sampling frequency, forwarded to the conversions and synthesis
    shift    : time step between frames, used to build the f0 time axis
    dftlen   : DFT length forwarded to the spectral conversions
    ff0      : file with f0 values
    flf0     : file with log f0 values (exp is applied to the positive ones)
    fspec    : file with the spectral envelope
    flspec   : file with the log spectral envelope
    ffwlspec : file with frequency-warped log spectral bands
    ffwcep   : file with frequency-warped cepstrum
    fmcep    : file with mel-cepstrum (needs SPTK)
    fpdd     : file with phase distortion deviation, thresholded to a mask
    fmpdd    : file with mel-cepstral PDD (needs SPTK), thresholded to a mask
    fnm      : file with the noise mask
    ffwnm    : file with frequency-warped noise mask bands
    nm_cont  : forwarded to synthesize()
    fsyn     : if given, the synthesized waveform is also written to this file
    verbose  : verbosity forwarded to synthesize() and wavwrite()

    Returns the synthesized waveform.
    Raises ValueError if no f0 input or no spectral envelope input is given.
    '''
    f0 = None
    if ff0:
        f0 = np.fromfile(ff0, dtype=np.float32)
    if flf0:
        f0 = np.fromfile(flf0, dtype=np.float32)
        # Non-positive entries are left as they are; only positive log-values
        # are converted back to linear scale.
        f0[f0 > 0] = np.exp(f0[f0 > 0])
    if f0 is None:
        # Previously this surfaced as an opaque UnboundLocalError.
        raise ValueError('An f0 input is required: provide either ff0 or flf0')
    ts = (shift) * np.arange(len(f0))
    f0s = np.vstack((ts, f0)).T

    nbframes = len(f0)

    SPEC = None
    if fspec:
        SPEC = np.fromfile(fspec, dtype=np.float32)
        SPEC = SPEC.reshape((nbframes, -1))
    if flspec:
        SPEC = np.fromfile(flspec, dtype=np.float32)
        SPEC = np.exp(SPEC.reshape((nbframes, -1)))
    if ffwlspec:
        FWLSPEC = np.fromfile(ffwlspec, dtype=np.float32)
        FWLSPEC = FWLSPEC.reshape((nbframes, -1))
        SPEC = np.exp(sp.fwbnd2linbnd(FWLSPEC, fs, dftlen, smooth=True))
    if ffwcep:
        FWCEP = np.fromfile(ffwcep, dtype=np.float32)
        FWCEP = FWCEP.reshape((nbframes, -1))
        SPEC = np.exp(sp.fwcep2loghspec(FWCEP, fs, dftlen))
    if fmcep:  # pragma: no cover
        # Cannot test this because it needs SPTK
        MCEP = np.fromfile(fmcep, dtype=np.float32)
        MCEP = MCEP.reshape((nbframes, -1))
        SPEC = sp.mcep2spec(MCEP, sp.bark_alpha(fs), dftlen)
    if SPEC is None:
        # Previously this surfaced as an opaque UnboundLocalError.
        raise ValueError('A spectral envelope input is required: provide one '
                         'of fspec, flspec, ffwlspec, ffwcep or fmcep')

    NM = None
    pdd_thresh = 0.75  # For this value, see:
    # G. Degottex and D. Erro, "A uniform phase representation for the
    # harmonic model in speech synthesis applications," EURASIP, Journal on
    # Audio, Speech, and Music Processing - Special Issue: Models of Speech -
    # In Search of Better Representations, vol. 2014, iss. 1, p. 38, 2014.
    if fpdd:
        PDD = np.fromfile(fpdd, dtype=np.float32)
        PDD = PDD.reshape((nbframes, -1))
        NM = _pdd_to_noise_mask(PDD, pdd_thresh)
    if fmpdd:  # pragma: no cover
        # Cannot test this because it needs SPTK
        MPDD = np.fromfile(fmpdd, dtype=np.float32)
        MPDD = MPDD.reshape((nbframes, -1))
        PDD = sp.mcep2spec(MPDD, sp.bark_alpha(fs), dftlen)
        NM = _pdd_to_noise_mask(PDD, pdd_thresh)

    if fnm:
        NM = np.fromfile(fnm, dtype=np.float32)
        NM = NM.reshape((nbframes, -1))
    if ffwnm:
        FWNM = np.fromfile(ffwnm, dtype=np.float32)
        FWNM = FWNM.reshape((nbframes, -1))
        NM = sp.fwbnd2linbnd(FWNM, fs, dftlen)

    syn = synthesize(fs, f0s, SPEC, NM=NM, nm_cont=nm_cont, verbose=verbose)

    if fsyn:
        sp.wavwrite(fsyn, syn, fs, norm_max_ifneeded=True, verbose=verbose)

    return syn
def train(log_dir, args, input):
    """Run the training loop, checkpointing and logging samples on the way.

    Builds the data feeder and the model, optionally restores weights, then
    steps the optimizer until the coordinator stops or ``args.num_steps`` is
    exceeded. Every ``args.summary_interval`` steps the statistics summary is
    written; every ``args.checkpoint_interval`` steps a checkpoint, audio
    samples and attention plots are written to ``log_dir`` and to the
    summary writer.

    Args:
        log_dir: directory receiving checkpoints, audio, plots and summaries.
        args: namespace of run options. Fields read here: ``git``,
            ``base_dir``, ``variant``, ``eal_dir``, ``eal_ckpt``, ``eal_ft``,
            ``eal_trainAlign``, ``eal_trainJoint``, ``eal_alignScale``,
            ``restore_step``, ``num_steps``, ``summary_interval`` and
            ``checkpoint_interval``.
        input: path of the training data, relative to ``args.base_dir``.
            (The name shadows the builtin but is part of the interface.)

    Returns:
        None. Any exception raised inside the session (including the
        deliberate 'Loss Exploded' one) is logged, its traceback printed and
        the coordinator asked to stop; it is not re-raised.
    """
    commit = get_git_commit() if args.git else 'None'
    checkpoint_path = os.path.join(log_dir, 'model.ckpt')
    input_path = os.path.join(args.base_dir, input)
    log('Checkpoint path: %s' % checkpoint_path)
    log('Loading training data from: %s' % input_path)
    log('Using model: %s' % args.variant)
    log(hparams_debug_string())

    # Set up DataFeeder:
    coord = tf.train.Coordinator()
    with tf.variable_scope('datafeeder'):
        if args.eal_dir:
            from tacotron.datafeeder import DataFeeder_EAL
            feeder = DataFeeder_EAL(coord, input_path, hparams, args.eal_dir)
        else:
            from tacotron.datafeeder import DataFeeder
            feeder = DataFeeder(coord, input_path, hparams)

    # Set up model:
    global_step = tf.Variable(0, name='global_step', trainable=False)
    with tf.variable_scope('model'):
        model = create_model(args.variant, hparams)
        if args.eal_dir:
            model.initialize(feeder.inputs, feeder.input_lengths,
                             feeder.mel_targets, feeder.linear_targets,
                             feeder.pml_targets, is_training=True, eal=True,
                             locked_alignments=feeder.locked_alignments,
                             flag_trainAlign=args.eal_trainAlign,
                             flag_trainJoint=args.eal_trainJoint,
                             alignScale=args.eal_alignScale)
        else:
            model.initialize(feeder.inputs, feeder.input_lengths,
                             feeder.mel_targets, feeder.linear_targets,
                             feeder.pml_targets, is_training=True, gta=True)
        model.add_loss()
        model.add_optimizer(global_step)
        stats = add_stats(model, eal_dir=args.eal_dir)

    # Bookkeeping:
    step = 0
    time_window = ValueWindow(100)
    loss_window = ValueWindow(100)
    saver = tf.train.Saver(max_to_keep=5, keep_checkpoint_every_n_hours=2)

    # Set up fixed alignment synthesizer
    alignment_synth = AlignmentSynthesizer()

    # Set up text for synthesis
    fixed_sentence = 'Scientists at the CERN laboratory say they have discovered a new particle.'

    # Set up denormalisation parameters for synthesis
    mean_path = os.path.abspath(os.path.join(args.base_dir, input, '..', 'pml_data/mean.dat'))
    std_path = os.path.abspath(os.path.join(args.base_dir, input, '..', 'pml_data/std.dat'))
    log('Loading normalisation mean from: {}'.format(mean_path))
    log('Loading normalisation standard deviation from: {}'.format(std_path))
    mean_norm = None
    std_norm = None

    if os.path.isfile(mean_path) and os.path.isfile(std_path):
        mean_norm = np.fromfile(mean_path, 'float32')
        std_norm = np.fromfile(std_path, 'float32')
    else:
        # Training continues without the statistics, but say so instead of
        # silently synthesizing samples without denormalisation.
        log('No mean or standard deviation files found at locations {} and {}'.format(
            mean_path, std_path))

    # Train!
    with tf.Session() as sess:
        try:
            summary_writer = tf.summary.FileWriter(log_dir, sess.graph)
            sess.run(tf.global_variables_initializer())

            if args.restore_step:
                # Restore from a checkpoint if the user requested it.
                restore_path = '%s-%d' % (checkpoint_path, args.restore_step)
                saver.restore(sess, restore_path)
                log('Resuming from checkpoint: %s at commit: %s' % (restore_path, commit), slack=True)
            elif args.eal_dir and args.eal_ckpt:
                if args.eal_trainAlign or args.eal_trainJoint:
                    # Restore only the trainable variables plus the variables
                    # whose name contains 'moving'.
                    list_var = tf.trainable_variables() + [
                        v for v in tf.global_variables() if 'moving' in v.name]
                    saver_eal = tf.train.Saver(list_var)
                    saver_eal.restore(sess, args.eal_ckpt)
                    log('Loaded weights and batchNorm cache of checkpoint: %s at commit: %s' % (
                        args.eal_ckpt, commit), slack=True)
                elif args.eal_ft:
                    # Full restore, using the main saver.
                    saver.restore(sess, args.eal_ckpt)
                    log('Refining the model from checkpoint: %s at commit: %s' % (
                        args.eal_ckpt, commit), slack=True)
                else:
                    # Restore everything except variables whose name contains
                    # 'optimizer'.
                    list_var = [var for var in tf.global_variables()
                                if 'optimizer' not in var.name]
                    saver_eal = tf.train.Saver(list_var)
                    saver_eal.restore(sess, args.eal_ckpt)
                    log('Initializing the weights from checkpoint: %s at commit: %s' % (
                        args.eal_ckpt, commit), slack=True)
            else:
                log('Starting new training run at commit: %s' % commit, slack=True)

            feeder.start_in_session(sess)
            step = 0  # initialise step variable so can use in while condition

            while not coord.should_stop() and step <= args.num_steps:
                start_time = time.time()

                # Each mode reports its own loss and tracks the running
                # average of the loss it is primarily optimising.
                if args.eal_trainAlign:
                    step, loss, loss_align, _ = sess.run(
                        [global_step, model.loss, model.loss_align, model.optimize])
                    time_window.append(time.time() - start_time)
                    loss_window.append(loss_align)
                    message = 'Step %-7d [%.03f sec/step, loss=%.05f, loss_align=%.05f, avg_loss_align=%.05f]' % (
                        step, time_window.average, loss, loss_align, loss_window.average)
                elif args.eal_trainJoint:
                    step, loss, loss_align, loss_joint, _ = sess.run(
                        [global_step, model.loss, model.loss_align, model.loss_joint,
                         model.optimize])
                    time_window.append(time.time() - start_time)
                    loss_window.append(loss_joint)
                    message = 'Step %-7d [%.03f sec/step, loss=%.05f, loss_align=%.05f, avg_loss_joint=%.05f]' % (
                        step, time_window.average, loss, loss_align, loss_window.average)
                else:
                    step, loss, _ = sess.run([global_step, model.loss, model.optimize])
                    time_window.append(time.time() - start_time)
                    loss_window.append(loss)
                    message = 'Step %-7d [%.03f sec/step, loss=%.05f, avg_loss=%.05f]' % (
                        step, time_window.average, loss, loss_window.average)

                log(message, slack=(step % args.checkpoint_interval == 0))

                if loss > 100 or math.isnan(loss):
                    log('Loss exploded to %.05f at step %d!' % (loss, step), slack=True)
                    # Caught by the handler below, which stops the coordinator.
                    raise Exception('Loss Exploded')

                if step % args.summary_interval == 0:
                    log('Writing summary at step: %d' % step)
                    summary_writer.add_summary(sess.run(stats), step)

                if step % args.checkpoint_interval == 0:
                    log('Saving checkpoint to: %s-%d' % (checkpoint_path, step))
                    saver.save(sess, checkpoint_path, global_step=step)
                    log('Saving audio and alignment...')

                    summary_elements = []

                    # if the model has linear spectrogram features, use them to synthesize audio
                    if hasattr(model, 'linear_targets'):
                        input_seq, alignment, target_spectrogram, spectrogram = sess.run([
                            model.inputs[0], model.alignments[0],
                            model.linear_targets[0], model.linear_outputs[0]])

                        output_waveform = audio.inv_spectrogram(spectrogram.T)
                        target_waveform = audio.inv_spectrogram(target_spectrogram.T)

                        audio.save_wav(output_waveform,
                                       os.path.join(log_dir, 'step-%d-audio.wav' % step))
                        audio.save_wav(target_waveform,
                                       os.path.join(log_dir, 'step-%d-target-audio.wav' % step))
                    # otherwise, synthesize audio from PML vocoder features
                    elif hasattr(model, 'pml_targets'):
                        input_seq, alignment, target_pml_features, pml_features = sess.run([
                            model.inputs[0], model.alignments[0],
                            model.pml_targets[0], model.pml_outputs[0]])

                        cfg = Configuration(hparams.sample_rate, hparams.pml_dimension)
                        synth = PMLSynthesizer(cfg)
                        output_waveform = synth.pml_to_wav(
                            pml_features, mean_norm=mean_norm, std_norm=std_norm,
                            spec_type=hparams.spec_type)
                        target_waveform = synth.pml_to_wav(
                            target_pml_features, mean_norm=mean_norm, std_norm=std_norm,
                            spec_type=hparams.spec_type)

                        sp.wavwrite(os.path.join(log_dir, 'step-%d-target-audio.wav' % step),
                                    target_waveform, hparams.sample_rate,
                                    norm_max_ifneeded=True)
                        sp.wavwrite(os.path.join(log_dir, 'step-%d-audio.wav' % step),
                                    output_waveform, hparams.sample_rate,
                                    norm_max_ifneeded=True)

                    # we need to adjust the output and target waveforms so the values lie in the interval [-1.0, 1.0]
                    # NOTE(review): applied after both branches above; the
                    # nesting was reconstructed from flattened source - confirm.
                    # A silent waveform has a zero peak; skip the division
                    # rather than filling the summary with NaN/inf.
                    output_peak = np.max(np.abs(output_waveform))
                    if output_peak > 0:
                        output_waveform /= 1.05 * output_peak
                    target_peak = np.max(np.abs(target_waveform))
                    if target_peak > 0:
                        target_waveform /= 1.05 * target_peak

                    summary_elements.append(
                        tf.summary.audio('ideal-%d' % step,
                                         np.expand_dims(target_waveform, 0),
                                         hparams.sample_rate))
                    summary_elements.append(
                        tf.summary.audio('sample-%d' % step,
                                         np.expand_dims(output_waveform, 0),
                                         hparams.sample_rate))

                    # get the alignment for the top sentence in the batch
                    random_attention_plot = plot.plot_alignment(
                        alignment,
                        os.path.join(log_dir, 'step-%d-random-align.png' % step),
                        info='%s, %s, %s, step=%d, loss=%.5f' % (
                            args.variant, commit, time_string(), step, loss))
                    summary_elements.append(
                        tf.summary.image('attention-%d' % step, random_attention_plot))

                    # also process the alignment for a fixed sentence for comparison
                    alignment_synth.load('%s-%d' % (checkpoint_path, step), hparams,
                                         model_name=args.variant)
                    fixed_alignment = alignment_synth.synthesize(fixed_sentence)
                    fixed_attention_plot = plot.plot_alignment(
                        fixed_alignment,
                        os.path.join(log_dir, 'step-%d-fixed-align.png' % step),
                        info='%s, %s, %s, step=%d, loss=%.5f' % (
                            args.variant, commit, time_string(), step, loss))
                    summary_elements.append(
                        tf.summary.image('fixed-attention-%d' % step, fixed_attention_plot))

                    # save the audio and alignment to tensorboard (audio sample rate is hyperparameter)
                    merged = sess.run(tf.summary.merge(summary_elements))
                    summary_writer.add_summary(merged, step)

                    log('Input: %s' % sequence_to_text(input_seq))
        except Exception as e:
            # Top-level boundary of the training run: report, then ask the
            # feeder threads to stop via the coordinator.
            log('Exiting due to exception: %s' % e, slack=True)
            traceback.print_exc()
            coord.request_stop(e)