def _run_eval(args, checkpoint_path, output_dir, hparams, sentences):
    mel_dir = os.path.join(output_dir, 'mel')
    wav_dir = os.path.join(output_dir, 'wav')
    plot_dir = os.path.join(output_dir, 'plot')

    # Create output paths if they don't exist
    os.makedirs(mel_dir, exist_ok=True)
    os.makedirs(wav_dir, exist_ok=True)
    os.makedirs(plot_dir, exist_ok=True)

    log(hparams_debug_string())
    synth = Synthesizer()
    synth.load(checkpoint_path, hparams)

    # Set inputs batch wise
    sentences = [
        sentences[i:i + hparams.tacotron_synthesis_batch_size]
        for i in range(0, len(sentences), hparams.tacotron_synthesis_batch_size)
    ]

    log('Starting Synthesis')
    for i, texts in enumerate(tqdm(sentences)):
        basenames = ['batch_{}_sentence_{}'.format(i, j) for j in range(len(texts))]
        synth.synthesize(texts, basenames, mel_dir, wav_dir, plot_dir, None)

    log('synthesized mel spectrograms at {}'.format(mel_dir))
    log('plotted mel spectrograms at {}'.format(plot_dir))  # was logging wav_dir; plots go to plot_dir
    log('synthesized wavs at {}'.format(wav_dir))
    return mel_dir, wav_dir
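# The list-slice idiom above splits a flat list into fixed-size batches.
# A minimal, self-contained sketch of the same idiom (names are illustrative):
def chunk(items, batch_size):
    """Split `items` into consecutive batches of at most `batch_size`."""
    return [items[i:i + batch_size] for i in range(0, len(items), batch_size)]

# The final batch simply holds whatever is left over:
assert chunk(list('abcdefg'), 3) == [['a', 'b', 'c'], ['d', 'e', 'f'], ['g']]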
def make_test_batches(self):
    start = time.time()

    # Read a group of examples
    n = self._hparams.tacotron_batch_size
    r = self._hparams.outputs_per_step

    # Test on the entire test set unless a quick test run was requested
    test_batches_per_group = n * 2 if self._args.test_max_len or self._args.TEST else len(self._test_meta)
    examples = [self._get_test_groups() for i in range(test_batches_per_group)]

    # Bucket examples based on similar output sequence length for efficiency
    examples.sort(key=lambda x: x[-1])
    batches = [examples[i:i + n] for i in range(0, len(examples), n)]
    if self._args.test_max_len:
        batches = batches[::-1]
    else:
        np.random.shuffle(batches)

    log('\nGenerated {} test batches of size {} in {:.3f} sec'.format(len(batches), n, time.time() - start))
    return batches, r
def preprocess(args, audio_dir, taco_dir, hparams):
    output_dir = os.path.join(args.base_dir, 'wavernn_data')
    quant_dir = os.path.join(output_dir, 'quant')
    mels_dir = os.path.join(output_dir, 'mels')

    os.makedirs(output_dir, exist_ok=True)
    os.makedirs(quant_dir, exist_ok=True)
    os.makedirs(mels_dir, exist_ok=True)

    audio_files = get_files(audio_dir)
    mels_files = get_files(taco_dir)

    # This will take a while depending on the size of the dataset
    dataset_ids = []
    for i, path in enumerate(zip(audio_files, mels_files)):
        # Basenames are expected to look like 'audio-<id>.npy' and 'mel-<id>.npy',
        # so the slices below strip the prefix and the 4-character extension
        audio_id = path[0].split('/')[-1][6:-4]
        mels_id = path[1].split('/')[-1][4:-4]
        assert mels_id == audio_id
        dataset_ids.append(audio_id)
        np.save(f'{quant_dir}/{audio_id}.npy', convert_gta_audio(path[0]))
        np.save(f'{mels_dir}/{mels_id}.npy', convert_gta_mels(path[1]))
        log('%i/%i : audio: %s mel: %s' % (i + 1, len(audio_files), audio_id, mels_id))

    dataset_ids_unique = list(set(dataset_ids))
    with open(f'{output_dir}/dataset_ids.pkl', 'wb') as file:
        pickle.dump(dataset_ids_unique, file)
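# The hardcoded slices above ([6:-4] and [4:-4]) break if the naming scheme or
# path separator changes. A small portable sketch of the same id extraction
# using os.path instead of manual '/' splitting (names are illustrative):
import os

def file_id(path, prefix):
    """Strip directory, the given prefix, and the extension from a path."""
    stem = os.path.splitext(os.path.basename(path))[0]
    assert stem.startswith(prefix), stem
    return stem[len(prefix):]

assert file_id('/data/gta/audio-0042.npy', 'audio-') == '0042'
assert file_id('/data/gta/mel-0042.npy', 'mel-') == '0042'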
def tacotron_synthesize(args, hparams, checkpoint, ppgs=None, speakers=None, Lf0s=None):
    output_dir = args.output_dir

    try:
        checkpoint_path = tf.train.get_checkpoint_state(checkpoint).model_checkpoint_path
        log('loaded model at {}'.format(checkpoint_path))
    except:
        raise RuntimeError('Failed to load checkpoint at {}'.format(checkpoint))

    if hparams.tacotron_synthesis_batch_size < hparams.tacotron_num_gpus:
        raise ValueError('Defined synthesis batch size {} is smaller than minimum required {} (num_gpus)! Please verify your synthesis batch size choice.'.format(
            hparams.tacotron_synthesis_batch_size, hparams.tacotron_num_gpus))

    if hparams.tacotron_synthesis_batch_size % hparams.tacotron_num_gpus != 0:
        raise ValueError('Defined synthesis batch size {} is not a multiple of {} (num_gpus)! Please verify your synthesis batch size choice!'.format(
            hparams.tacotron_synthesis_batch_size, hparams.tacotron_num_gpus))

    return run_eval(args, checkpoint_path, output_dir, hparams, ppgs, speakers, Lf0s)
def tacotron_synthesize(logId, sentences):
    # Set inputs batch wise
    sentences = [
        sentences[i:i + hparams.tacotron_synthesis_batch_size]
        for i in range(0, len(sentences), hparams.tacotron_synthesis_batch_size)
    ]
    log("logId={} , sentences={}".format(logId, sentences))

    t1 = time.time()
    wavPaths = []  # initialized so an empty input fails clearly below instead of raising NameError
    for i, texts in enumerate(tqdm(sentences)):
        basenames = [logId]
        wavPaths = synth.synthesizev1(texts, basenames, eval_dir, log_dir, None)
    t2 = time.time()
    log('logId={} , synthesized mel spectrograms at {} cost time={}'.format(logId, eval_dir, (t2 - t1)))
    return wavPaths[0]
def run_eval(args, checkpoint_path, output_dir, hparams, sentences):
    eval_dir = os.path.join(output_dir, 'eval')
    log_dir = os.path.join(output_dir, 'logs-eval')

    # Create output path if it doesn't exist
    os.makedirs(eval_dir, exist_ok=True)
    os.makedirs(log_dir, exist_ok=True)
    os.makedirs(os.path.join(log_dir, 'wavs'), exist_ok=True)
    os.makedirs(os.path.join(log_dir, 'plots'), exist_ok=True)

    log(hparams_debug_string())
    synth = Synthesizer()
    synth.load(checkpoint_path, hparams, reference_mel=args.reference_audio)
    if args.reference_audio is not None:
        ref_wav = audio.load_wav(args.reference_audio)
        reference_mel = audio.melspectrogram(ref_wav).astype(np.float32).T
    else:
        raise ValueError('Evaluation requires a reference audio. Please provide a path to the reference audio.')

    with open(os.path.join(eval_dir, 'map.txt'), 'w') as file:
        for i, text in enumerate(tqdm(sentences)):
            start = time.time()
            mel_filename = synth.synthesize(text, i + 1, eval_dir, log_dir, None, reference_mel=reference_mel)
            file.write('{}|{}\n'.format(text, mel_filename))

    log('synthesized mel spectrograms at {}'.format(eval_dir))
    return eval_dir
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--base_dir', default=os.path.expanduser('~/tacotron'))
    parser.add_argument('--output', default='training')
    parser.add_argument('--dataset', required=True, choices=['blizzard', 'ljspeech', 'nick'])
    parser.add_argument('--num_workers', type=int, default=cpu_count())
    parser.add_argument('--hparams', default='',
                        help='Hyperparameter overrides as a comma-separated list of name=value pairs')
    parser.add_argument('--validation_size', type=int, default=0)
    parser.add_argument('--test_size', type=int, default=0)
    args = parser.parse_args()

    hparams.parse(args.hparams)
    log(hparams_debug_string())

    if args.dataset == 'blizzard':
        preprocess_blizzard(args, hparams)
    elif args.dataset == 'ljspeech':
        preprocess_ljspeech(args, hparams)
    elif args.dataset == 'nick':
        preprocess_nick(args, hparams)
def run_eval(args, checkpoint_path, output_dir, hparams, sentences):
    eval_dir = os.path.join(output_dir, 'eval')
    log_dir = os.path.join(output_dir, 'logs-eval')

    if args.model in ('Both', 'Tacotron-2'):
        assert os.path.normpath(eval_dir) == os.path.normpath(args.mels_dir)  # mels_dir = wavenet_input_dir

    # Create output path if it doesn't exist
    os.makedirs(eval_dir, exist_ok=True)
    os.makedirs(log_dir, exist_ok=True)
    os.makedirs(os.path.join(log_dir, 'wavs'), exist_ok=True)
    os.makedirs(os.path.join(log_dir, 'plots'), exist_ok=True)

    log(hparams_debug_string())
    synth = Synthesizer()
    synth.load(checkpoint_path, hparams)

    with open(os.path.join(eval_dir, 'map.txt'), 'w') as file:
        for i, text in enumerate(tqdm(sentences)):
            start = time.time()
            mel_filename = synth.synthesize(text, i + 1, eval_dir, log_dir, None)
            file.write('{}|{}\n'.format(text, mel_filename))
            # Export the predicted mel as flat float32 for LPCNet
            npy_data = np.load(mel_filename)
            npy_data = npy_data.reshape((-1,))
            npy_data.tofile('f32_for_lpcnet.f32')

    log('synthesized mel spectrograms at {}'.format(eval_dir))
    return eval_dir
def run_synthesis_sytle_transfer(args, synth_metadata_filename, checkpoint_path, output_dir, hparams):
    synth_dir = os.path.join(output_dir, 'natural')

    # Create output path if it doesn't exist
    os.makedirs(synth_dir, exist_ok=True)

    log(hparams_debug_string())
    synth = Synthesizer()
    synth.load(args, checkpoint_path, hparams)

    texts, basenames, basenames_refs, mel_filenames, \
        mel_ref_filenames_emt, mel_ref_filenames_spk, \
        emt_labels, spk_labels = get_filenames_from_metadata(
            synth_metadata_filename, args.input_dir, args.flip_spk_emt)

    synth.synthesize(texts, basenames, synth_dir, synth_dir, mel_filenames,
                     mel_ref_filenames_emt=mel_ref_filenames_emt,
                     mel_ref_filenames_spk=mel_ref_filenames_spk,
                     emt_labels_synth=emt_labels,
                     spk_labels_synth=spk_labels)
def run_eval(args, checkpoint_path, output_dir, hparams, sentences):
    eval_dir = os.path.join(output_dir, 'eval')
    log_dir = os.path.join(output_dir, 'logs-eval')

    if args.model in ('Both', 'Tacotron-2'):
        assert os.path.normpath(eval_dir) == os.path.normpath(args.mels_dir)  # mels_dir = wavenet_input_dir

    # Create output path if it doesn't exist
    os.makedirs(eval_dir, exist_ok=True)
    os.makedirs(log_dir, exist_ok=True)
    os.makedirs(os.path.join(log_dir, 'wavs'), exist_ok=True)
    os.makedirs(os.path.join(log_dir, 'plots'), exist_ok=True)

    log(hparams_debug_string())
    synth = Synthesizer()
    synth.load(checkpoint_path, hparams)

    with open(os.path.join(eval_dir, 'map.txt'), 'w') as file:
        for i, text in enumerate(tqdm(sentences)):
            start = time.time()
            mel_filename = synth.synthesize(text, i + 1, eval_dir, log_dir, None)
            file.write('{}|{}\n'.format(text, mel_filename))

    log('synthesized mel spectrograms at {}'.format(eval_dir))
    return eval_dir
def wavenet_synthesize(args, hparams, checkpoint):
    output_dir = 'wavenet_' + args.output_dir

    try:
        checkpoint_path = tf.train.get_checkpoint_state(checkpoint).model_checkpoint_path
        log('loaded model at {}'.format(checkpoint_path))
    except AttributeError:
        # Swap logs dir name in case user used Tacotron-2 for train and Both for test (and vice versa)
        if 'Both' in checkpoint:
            checkpoint = checkpoint.replace('Both', 'Tacotron-2')
        elif 'Tacotron-2' in checkpoint:
            checkpoint = checkpoint.replace('Tacotron-2', 'Both')
        else:  # Synthesizing separately
            raise AssertionError('Cannot restore checkpoint: {}, did you train a model?'.format(checkpoint))

        try:  # Try loading again
            checkpoint_path = tf.train.get_checkpoint_state(checkpoint).model_checkpoint_path
            log('loaded model at {}'.format(checkpoint_path))
        except:
            raise RuntimeError('Failed to load checkpoint at {}'.format(checkpoint))

    run_synthesis(args, checkpoint_path, output_dir, hparams)
def load(self, checkpoint_path, hparams, gta=False, model_name='Tacotron'):
    log('Constructing model: %s' % model_name)
    inputs = tf.placeholder(tf.int32, [None, None], 'inputs')
    input_lengths = tf.placeholder(tf.int32, [None], 'input_lengths')
    with tf.variable_scope('model') as scope:
        self.model = create_model(model_name, hparams)
        self.model.initialize(inputs, input_lengths)
        self.final_outputs = self.model.final_outputs
        self.alignments = self.model.alignments
        self.stop_token_outputs = self.model.stop_token_outputs

    self.gta = gta
    self._hparams = hparams
    # Pad input sequences with the <pad_token> 0 ( _ )
    self._pad = 0

    log('Loading checkpoint: %s' % checkpoint_path)
    # Allocate GPU memory as needed rather than all at once
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    self.session = tf.Session(config=config)
    self.session.run(tf.global_variables_initializer())

    saver = tf.train.Saver()
    saver.restore(self.session, checkpoint_path)
def load_checkpoint(self, args, hparams, checkpoint):
    # cf. ./Tacotron-2/tacotron/synthesize.py:tacotron_synthesize
    output_dir = 'tacotron_' + args.output_dir
    try:
        checkpoint_path = tf.train.get_checkpoint_state(checkpoint).model_checkpoint_path
        log('loaded model at {}'.format(checkpoint_path))
    except:
        raise RuntimeError('Failed to load checkpoint at {}'.format(checkpoint))

    if hparams.tacotron_synthesis_batch_size < hparams.tacotron_num_gpus:
        raise ValueError('Defined synthesis batch size {} is smaller than minimum required {} (num_gpus)! Please verify your synthesis batch size choice.'.format(
            hparams.tacotron_synthesis_batch_size, hparams.tacotron_num_gpus))

    if hparams.tacotron_synthesis_batch_size % hparams.tacotron_num_gpus != 0:
        raise ValueError('Defined synthesis batch size {} is not a multiple of {} (num_gpus)! Please verify your synthesis batch size choice!'.format(
            hparams.tacotron_synthesis_batch_size, hparams.tacotron_num_gpus))

    # cf. ./Tacotron-2/tacotron/synthesize.py:run_live
    log(hparams_debug_string())
    synth = Synthesizer()
    synth.load(checkpoint_path, hparams)
    self.model = synth
def _enqueue_next_train_group(self):
    while not self._coord.should_stop():
        start = time.time()

        # Read a group of examples
        n = self._hparams.tacotron_batch_size
        r = self._hparams.outputs_per_step
        examples = [self._get_next_example() for i in range(n * _batches_per_group)]

        # Bucket examples based on similar output sequence length for efficiency
        examples.sort(key=lambda x: x[-1])
        batches = [examples[i:i + n] for i in range(0, len(examples), n)]
        np.random.shuffle(batches)

        log('\nGenerated {} train batches of size {} in {:.3f} sec'.format(len(batches), n, time.time() - start))
        for batch in batches:
            feed_dict = dict(zip(self._placeholders, self._prepare_batch(batch, r)))
            self._session.run(self._enqueue_op, feed_dict=feed_dict)
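# The loop above sorts a whole group of examples by target length, cuts the
# sorted list into batches, then shuffles the batches rather than the examples,
# so every batch holds similarly sized sequences and padding waste stays low.
# A minimal sketch with dummy lengths (names are illustrative):
import random

lengths = [30, 5, 12, 28, 7, 11]
examples = [('ex{}'.format(i), l) for i, l in enumerate(lengths)]
examples.sort(key=lambda x: x[-1])      # bucket by output length
n = 2
batches = [examples[i:i + n] for i in range(0, len(examples), n)]
random.shuffle(batches)                 # shuffle batches, keep buckets intact
# e.g. batches, in some order: [('ex1',5),('ex4',7)], [('ex5',11),('ex2',12)], [('ex3',28),('ex0',30)]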
def setup_log(log_path, checkpoint_path, input_path):
    infolog.init(log_path, 'emt4_disc', None)
    log('Checkpoint path: {}'.format(checkpoint_path))
    log('Loading training data from: {}'.format(input_path))
    log('Using model: {}'.format('emt4_disc'))
    log(hparams_debug_string())
def make_test_batches(self):
    start = time.time()

    # Read a group of examples
    n = self._hparams.vad_batch_size * _num_per_batch

    # Test on the entire test set
    examples = [self._get_test_groups() for i in range(len(self._test_meta))]

    # Bucket examples based on similar output sequence length for efficiency
    examples.sort(key=lambda x: len(x[-1]))
    examples = (np.vstack([ex[0] for ex in examples]),
                np.vstack([ex[1] for ex in examples]))
    batches = [(examples[0][i:i + n], examples[1][i:i + n])
               for i in range(0, len(examples[-1]) + 1 - n, n)]
    if len(examples[-1]) % n != 0:  # keep the leftover examples in a final, smaller batch
        batches.append((examples[0][-(len(examples[-1]) % n):],
                        examples[1][-(len(examples[-1]) % n):]))
    self.test_steps = len(batches)

    # Each batch is an (inputs, targets) pair, so count batches directly
    log('\nGenerated {} test batches of size {} in {:.3f} sec'.format(len(batches), n, time.time() - start))
    return batches
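# Sketch of the leftover-batch handling above: full batches of size n first,
# then one smaller batch for the remainder (pure numpy, illustrative names):
import numpy as np

data = np.arange(10)
n = 4
batches = [data[i:i + n] for i in range(0, len(data) + 1 - n, n)]
if len(data) % n != 0:
    batches.append(data[-(len(data) % n):])
# -> [array([0,1,2,3]), array([4,5,6,7]), array([8,9])]
assert [len(b) for b in batches] == [4, 4, 2]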
def run_eval(args, checkpoint_path, output_dir, hparams, sentences):
    eval_dir = os.path.join(output_dir, 'eval')
    log_dir = os.path.join(output_dir, 'logs-eval')

    if args.model == 'Tacotron-2':
        assert os.path.normpath(eval_dir) == os.path.normpath(args.mels_dir)  # mels_dir = wavenet_input_dir

    # Create output path if it doesn't exist
    os.makedirs(eval_dir, exist_ok=True)
    os.makedirs(log_dir, exist_ok=True)
    os.makedirs(os.path.join(log_dir, 'wavs'), exist_ok=True)
    os.makedirs(os.path.join(log_dir, 'plots'), exist_ok=True)

    log(hparams_debug_string())
    synth = Synthesizer()
    synth.load(checkpoint_path, hparams, speaker_id=args.speaker_id)

    with open(os.path.join(eval_dir, 'map.txt'), 'w') as file:
        for i, text in enumerate(tqdm(sentences)):
            start = time.time()
            if args.speaker_id is not None:
                mel_filename, speaker_id = synth.synthesize([text], [i + 1], eval_dir, log_dir, None,
                                                            speaker_id=[args.speaker_id[i]])
            else:
                mel_filename, speaker_id = synth.synthesize([text], [i + 1], eval_dir, log_dir, None,
                                                            speaker_id=None)
            file.write('{}|{}|{}\n'.format(text, mel_filename[0], speaker_id[0]))

    log('synthesized mel spectrograms at {}'.format(eval_dir))
    return eval_dir
def tacotron_synthesize(args, hparams, checkpoint, sentences=None):
    output_dir = 'tacotron_' + args.output_dir

    try:
        checkpoint_path = tf.train.get_checkpoint_state(checkpoint).model_checkpoint_path
        log('loaded model at {}'.format(checkpoint_path))
    except AttributeError:
        # Swap logs dir name in case user used Tacotron-2 for train and Both for test (and vice versa)
        if 'Both' in checkpoint:
            checkpoint = checkpoint.replace('Both', 'Tacotron-2')
        elif 'Tacotron-2' in checkpoint:
            checkpoint = checkpoint.replace('Tacotron-2', 'Both')
        else:
            raise AssertionError('Cannot restore checkpoint: {}, did you train a model?'.format(checkpoint))

        try:  # Try loading again
            checkpoint_path = tf.train.get_checkpoint_state(checkpoint).model_checkpoint_path
            log('loaded model at {}'.format(checkpoint_path))
        except:
            raise RuntimeError('Failed to load checkpoint at {}'.format(checkpoint))

    if args.mode == 'eval':
        return run_eval(args, checkpoint_path, output_dir, hparams, sentences)
    elif args.mode == 'synthesis':
        return run_synthesis(args, checkpoint_path, output_dir, hparams)
    else:
        run_live(args, checkpoint_path, hparams)
def load(self, checkpoint_path, hparams, gta=False, model_name='Tacotron'):
    log('Constructing model: %s' % model_name)
    inputs = tf.placeholder(tf.int32, [1, None], 'inputs')
    input_lengths = tf.placeholder(tf.int32, [1], 'input_lengths')
    targets = tf.placeholder(tf.float32, [1, None, hparams.num_mels], 'mel_targets')
    with tf.variable_scope('model') as scope:
        self.model = create_model(model_name, hparams)
        if gta:
            self.model.initialize(inputs, input_lengths, targets, gta=gta)
        else:
            self.model.initialize(inputs, input_lengths)
        self.mel_outputs = self.model.mel_outputs
        self.linear_outputs = self.model.linear_outputs if (hparams.predict_linear and not gta) else None
        self.alignment = self.model.alignments[0]

    self.gta = gta
    self._hparams = hparams

    log('Loading checkpoint: %s' % checkpoint_path)
    self.session = tf.Session()
    self.session.run(tf.global_variables_initializer())

    saver = tf.train.Saver()
    saver.restore(self.session, checkpoint_path)
def synthesize(args, input_dir, output_dir, checkpoint_path, hparams):
    # Device
    device = torch.device('cuda' if args.use_cuda else 'cpu')

    # Initialize the model
    model = Model(rnn_dims=hparams.rnn_dims, fc_dims=hparams.fc_dims,
                  bits=hparams.wavernn_bits, pad=hparams.wavernn_pad,
                  upsample_factors=hparams.upsample_scales, feat_dims=hparams.feat_dims,
                  compute_dims=hparams.compute_dims, res_out_dims=hparams.res_out_dims,
                  res_blocks=hparams.res_blocks, hop_length=hparams.hop_size,
                  sample_rate=hparams.sample_rate).to(device)

    # Load the checkpoint
    if args.use_cuda:
        checkpoint = torch.load(checkpoint_path)
    else:
        checkpoint = torch.load(checkpoint_path, map_location=lambda storage, loc: storage)
    log('Loading model from {}'.format(checkpoint_path))
    model.load_state_dict(checkpoint['state_dict'])

    # Synthesize from mels to waveforms
    filenames = [f for f in sorted(os.listdir(input_dir)) if f.endswith('.npy')]
    for i, filename in tqdm(enumerate(filenames)):
        mel = np.load(os.path.join(input_dir, filename)).T
        save_wavernn_wav(model.generate(mel),
                         '{}/{}_generated.wav'.format(output_dir, i),
                         hparams.sample_rate)
def load(self, checkpoint_path, hparams, model_name='WaveNet'):
    log('Constructing model: {}'.format(model_name))
    self._hparams = hparams
    local_cond, global_cond = self._check_conditions()

    self.local_conditions = tf.placeholder(
        tf.float32, shape=[1, None, hparams.num_mels],
        name='local_condition_features') if local_cond else None
    self.global_conditions = tf.placeholder(
        tf.int32, shape=(1, 1),
        name='global_condition_features') if global_cond else None
    self.synthesis_length = tf.placeholder(
        tf.int32, shape=(),
        name='synthesis_length') if not local_cond else None

    with tf.variable_scope('model') as scope:
        self.model = create_model(model_name, hparams)
        self.model.initialize(y=None, c=self.local_conditions, g=self.global_conditions,
                              input_lengths=None, synthesis_length=self.synthesis_length)

    sh_saver = create_shadow_saver(self.model)

    log('Loading checkpoint: {}'.format(checkpoint_path))
    self.session = tf.Session()
    self.session.run(tf.global_variables_initializer())
    load_averaged_model(self.session, sh_saver, checkpoint_path)
def synthesize(args, input_dir, output_dir, checkpoint_path, hparams):
    # Device
    device = torch.device('cuda' if args.use_cuda else 'cpu')

    # Initialize the model
    model = WaveRNN(hparams.wavernn_bits, hparams.hop_size, hparams.num_mels, device).to(device)

    # Load the checkpoint
    if args.use_cuda:
        checkpoint = torch.load(checkpoint_path)
    else:
        checkpoint = torch.load(checkpoint_path, map_location=lambda storage, loc: storage)
    log('Loading model from {}'.format(checkpoint_path))
    model.load_state_dict(checkpoint['state_dict'])

    # Synthesize from mels to waveforms
    filenames = [f for f in sorted(os.listdir(input_dir)) if f.endswith('.npy')]
    for i, filename in tqdm(enumerate(filenames)):
        mel = np.load(os.path.join(input_dir, filename)).T
        save_wavernn_wav(model.generate(mel), f'{output_dir}/{i}_generated.wav', hparams.sample_rate)
def tacotron_synthesize(args, hparams, checkpoint, sentences=None):
    output_dir = 'tacotron_' + args.output_dir

    try:
        checkpoint_path = tf.train.get_checkpoint_state(checkpoint).model_checkpoint_path
        log('loaded model at {}'.format(checkpoint_path))
    except AttributeError:
        # Swap logs dir name in case user used Tacotron-2 for train and Both for test (and vice versa)
        if 'Both' in checkpoint:
            checkpoint = checkpoint.replace('Both', 'Tacotron-2')
        elif 'Tacotron-2' in checkpoint:
            checkpoint = checkpoint.replace('Tacotron-2', 'Both')
        else:
            raise AssertionError('Cannot restore checkpoint: {}, did you train a model?'.format(checkpoint))

        try:  # Try loading again
            checkpoint_path = tf.train.get_checkpoint_state(checkpoint).model_checkpoint_path
            log('loaded model at {}'.format(checkpoint_path))
        except:
            raise RuntimeError('Failed to load checkpoint at {}'.format(checkpoint))

    wavenet_in_dir = None
    if args.mode == 'eval':
        wavenet_in_dir = run_eval(args, checkpoint_path, output_dir, hparams, sentences)
    elif args.mode == 'synthesis':
        run_synthesis(args, checkpoint_path, output_dir, hparams)
    else:
        run_live(args, checkpoint_path, hparams)
    return wavenet_in_dir
def init_tacotron2(args):
    # t2
    print('\n#####################################')
    if args.model == 'Tacotron':
        print('\nInitialising Tacotron Model...\n')
    t2_hparams = hparams.parse(args.hparams)
    try:
        checkpoint_path = tf.train.get_checkpoint_state(args.taco_checkpoint).model_checkpoint_path
        log('loaded model at {}'.format(checkpoint_path))
    except:
        raise RuntimeError('Failed to load checkpoint at {}'.format(args.taco_checkpoint))

    output_dir = 'tacotron_' + args.output_dir
    eval_dir = os.path.join(output_dir, 'eval')
    log_dir = os.path.join(output_dir, 'logs-eval')
    print('eval_dir:', eval_dir)
    print('args.mels_dir:', args.mels_dir)

    # Create output path if it doesn't exist
    os.makedirs(eval_dir, exist_ok=True)
    os.makedirs(log_dir, exist_ok=True)
    os.makedirs(os.path.join(log_dir, 'wavs'), exist_ok=True)
    os.makedirs(os.path.join(log_dir, 'plots'), exist_ok=True)

    log(hparams_debug_string())
    synth = Synthesizer()
    synth.load(checkpoint_path, t2_hparams)
    return synth, eval_dir, log_dir
def _enqueue_next_train_group(self):
    while not self._coord.should_stop():
        start = time.time()

        # Read a group of examples
        n = self._hparams.tacotron_batch_size
        r = self._hparams.outputs_per_step
        log('feeder.py:_enqueue_next_train_group(): reading examples')
        examples = [self._get_next_example() for i in range(n * _batches_per_group)]
        log('feeder.py:_enqueue_next_train_group(): examples read')

        # Bucket examples based on similar output sequence length for efficiency
        examples.sort(key=lambda x: x[-1])
        batches = [examples[i:i + n] for i in range(0, len(examples), n)]
        np.random.shuffle(batches)

        print(strftime('---%a, %d %b %Y %H:%M:%S +0000', localtime()))
        print('feeder.py:_enqueue_next_train_group(): Generated {} train batches of size {} in {:.3f} sec'.format(
            len(batches), n, time.time() - start))
        for batch in batches:
            feed_dict = dict(zip(self._placeholders, self._prepare_batch(batch, r)))
            self._session.run(self._enqueue_op, feed_dict=feed_dict)
def publish(args, hparams, checkpoint_path):
    log(hparams_debug_string())
    if not os.path.exists(args.book):
        raise ValueError('{}: {}'.format('No such file or directory', args.book))
    speaker_id = args.speaker_id

    synth = Synthesizer()
    synth.load(checkpoint_path, hparams)

    with open_file(args.book) as f:
        text = f.read()

    if args.lang == 'kr':
        kkma = Kkma()
        sents = kkma.sentences(text)
    else:
        sents = nltk.sent_tokenize(text)

    full_mels = None
    silence = np.full((100, hparams.num_mels), hparams.min_level_db, np.float32)
    for i, line in enumerate(tqdm(sents)):
        text = line.strip()
        if text:
            mels = generate_fast(synth, text, speaker_id, play=False)
            if i > 0:
                full_mels = np.concatenate((full_mels, silence), axis=0)  # pad silence between sentences
                full_mels = np.concatenate((full_mels, mels), axis=0)
            else:
                full_mels = mels

    save_path = change_file_ext(args.book, '.wav')
    log('saving to wav file...')
    wav = audio.inv_mel_spectrogram(full_mels.T, hparams)
    audio.save_wav(wav, save_path, sr=hparams.sample_rate)
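# Sketch of the silence-padding concatenation above: per-sentence mel chunks
# joined with a fixed block of "silence" frames at the spectrogram floor
# (pure numpy; num_mels and the -100 dB floor are illustrative values):
import numpy as np

num_mels = 80
silence = np.full((100, num_mels), -100.0, np.float32)  # e.g. min_level_db = -100
chunks = [np.zeros((50, num_mels), np.float32), np.zeros((70, num_mels), np.float32)]

full = chunks[0]
for mels in chunks[1:]:
    full = np.concatenate((full, silence, mels), axis=0)
assert full.shape == (50 + 100 + 70, num_mels)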
def run_eval(args, checkpoint_path, output_dir, hparams, sentences):
    eval_dir = os.path.join(output_dir, 'eval')
    log_dir = os.path.join(output_dir, 'logs-eval')

    if args.model == 'Tacotron-2':
        assert os.path.normpath(eval_dir) == os.path.normpath(args.mels_dir)  # mels_dir = wavenet_input_dir

    # Create output path if it doesn't exist
    os.makedirs(eval_dir, exist_ok=True)
    os.makedirs(log_dir, exist_ok=True)
    os.makedirs(os.path.join(log_dir, 'wavs'), exist_ok=True)
    os.makedirs(os.path.join(log_dir, 'plots'), exist_ok=True)

    log(hparams_debug_string())
    synth = Synthesizer()
    synth.load(checkpoint_path, hparams)

    # Batch size, capped by the number of sentences; used for both step and slice
    delta_size = min(hparams.tacotron_synthesis_batch_size, len(sentences))
    batch_sentences = [sentences[i:i + delta_size] for i in range(0, len(sentences), delta_size)]

    start = time.time()
    for i, batch in enumerate(tqdm(batch_sentences)):
        audio.save_wav(synth.eval(batch),
                       os.path.join(log_dir, 'wavs', 'eval_batch_{:03}.wav'.format(i)),
                       hparams)
    log('\nGenerated total batch of {} in {:.3f} sec'.format(delta_size, time.time() - start))

    return eval_dir
def run_eval(args, checkpoint_path, output_dir, hparams, sentences):
    eval_dir = os.path.join(output_dir, 'eval')
    log_dir = os.path.join(output_dir, 'logs-eval')

    if args.model in ('Both', 'Tacotron-2'):
        assert os.path.normpath(eval_dir) == os.path.normpath(args.mels_dir)  # mels_dir = wavenet_input_dir

    # Create output path if it doesn't exist
    os.makedirs(eval_dir, exist_ok=True)
    os.makedirs(log_dir, exist_ok=True)
    os.makedirs(os.path.join(log_dir, 'wavs'), exist_ok=True)
    os.makedirs(os.path.join(log_dir, 'plots'), exist_ok=True)

    log(hparams_debug_string())
    synth = Synthesizer()
    synth.load(checkpoint_path, hparams)

    with open(os.path.join(eval_dir, 'map.txt'), 'w') as file:
        for i, text in enumerate(tqdm(sentences)):
            if is_korean_text(text):
                text = normalize_number(text)
                # Split Hangul into jamo (consonant/vowel) units
                text = split_to_jamo(text, hparams.cleaners)
            mel_filename = synth.synthesize(text, i + 1, eval_dir, log_dir, None)
            file.write('{}|{}\n'.format(text, mel_filename))

    log('synthesized mel spectrograms at {}'.format(eval_dir))
    return eval_dir
def _enqueue_next_train_group(self):
    while not self._coord.should_stop():
        start = time.time()

        # Read a group of examples
        n = self._hparams.vad_batch_size
        examples = [self._get_next_example() for _ in range(_num_per_batch)]

        # Bucket examples based on similar output sequence length for efficiency
        examples.sort(key=lambda x: len(x[-1]))
        examples = (np.vstack([ex[0] for ex in examples]),
                    np.vstack([ex[1] for ex in examples]))
        batches = [(examples[0][i:i + n], examples[1][i:i + n])
                   for i in range(0, len(examples[-1]) + 1 - n, n)]
        if len(examples[-1]) % n != 0:  # keep the leftover examples in a final, smaller batch
            batches.append((examples[0][-(len(examples[-1]) % n):],
                            examples[1][-(len(examples[-1]) % n):]))

        log('\nGenerated {} train batches of size {} in {:.3f} sec'.format(len(batches), n, time.time() - start))
        for batch in batches:
            feed_dict = dict(zip(self._placeholders, self._prepare_batch(batch)))
            self._session.run(self._enqueue_op, feed_dict=feed_dict)
def tacotron_synthesize(args, hparams, checkpoint, sentences=None):
    output_dir = 'tacotron_' + args.output_dir

    try:
        checkpoint_path = tf.train.get_checkpoint_state(checkpoint).model_checkpoint_path
        log('loaded model at {}'.format(checkpoint_path))
    except:
        raise RuntimeError('Failed to load checkpoint at {}'.format(checkpoint))

    if hparams.tacotron_synthesis_batch_size < hparams.tacotron_num_gpus:
        raise ValueError('Defined synthesis batch size {} is smaller than minimum required {} (num_gpus)! Please verify your synthesis batch size choice.'.format(
            hparams.tacotron_synthesis_batch_size, hparams.tacotron_num_gpus))

    if hparams.tacotron_synthesis_batch_size % hparams.tacotron_num_gpus != 0:
        raise ValueError('Defined synthesis batch size {} is not a multiple of {} (num_gpus)! Please verify your synthesis batch size choice!'.format(
            hparams.tacotron_synthesis_batch_size, hparams.tacotron_num_gpus))

    if args.mode == 'eval':
        return run_eval(args, checkpoint_path, output_dir, hparams, sentences)
    elif args.mode == 'synthesis':
        return run_synthesis(args, checkpoint_path, output_dir, hparams)
    else:
        run_live(args, checkpoint_path, hparams)
def get_filenames_from_metadata(synth_metadata_filename, input_dir, flip_spk_emt=False):
    with open(synth_metadata_filename, encoding='utf-8') as f:
        metadata = [line.strip().split('|') for line in f if not line.startswith('#')]
        frame_shift_ms = hparams.hop_size / hparams.sample_rate
        hours = sum([int(x[6]) for x in metadata]) * frame_shift_ms / 3600
        log('Synthesis - Loaded metadata for {} examples ({:.2f} hours)'.format(len(metadata), hours))

    texts = [m[7] for m in metadata]
    mel_filenames = [os.path.join(input_dir, m[0], 'mels', m[2]) for m in metadata]
    basenames = [os.path.basename(m).replace('.npy', '').replace('mel-', '') for m in mel_filenames]
    basenames_refs = [m[11] + '_' + m[13] for m in metadata]

    mel_ref_filenames_emt = []
    mel_ref_filenames_spk = []
    emt_labels = []
    spk_labels = []
    for m in metadata:
        dataset = m[0]
        if m[12] == 'same':
            mel_ref_filenames_emt.append(os.path.join(input_dir, dataset, 'mels', m[2]))
        else:
            if 'accent' in synth_metadata_filename:
                dataset_emt = 'vctk'
            else:
                dataset_emt = 'emth' if m[12][0] == 'h' else 'emt4'
                if m[12][0] == 'h':
                    m[12] = m[12][1:]
            mel_ref_filenames_emt.append(os.path.join(input_dir, dataset_emt, 'mels', m[12]))

        if m[14] == 'same':
            mel_ref_filenames_spk.append(os.path.join(input_dir, dataset, 'mels', m[2]))
        else:
            mel_ref_filenames_spk.append(os.path.join(input_dir, 'jessa', 'mels', m[14]))

        emt_labels.append(m[8])
        spk_labels.append(m[9])

    if flip_spk_emt:
        # Swap the emotion and speaker reference lists
        mel_ref_filenames_emt, mel_ref_filenames_spk = mel_ref_filenames_spk, mel_ref_filenames_emt

    return (texts, basenames, basenames_refs, mel_filenames,
            mel_ref_filenames_emt, mel_ref_filenames_spk, emt_labels, spk_labels)
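# The metadata files read above are pipe-delimited text, one example per line,
# with a frame count in one column. A minimal sketch of the parse and the
# duration math (column index, hop size, and sample rate are illustrative):
hop_size, sample_rate = 275, 22050
lines = ['dataset_a|audio-1.npy|mel-1.npy|...|120|hello world',
         'dataset_b|audio-2.npy|mel-2.npy|...|80|goodbye']
metadata = [line.strip().split('|') for line in lines if not line.startswith('#')]

frame_shift_s = hop_size / sample_rate              # seconds per mel frame
hours = sum(int(m[4]) for m in metadata) * frame_shift_s / 3600
print('{} examples ({:.6f} hours)'.format(len(metadata), hours))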
def __init__(self, coordinator, metadata_filename, hparams):
    super(DataFeeder, self).__init__()
    self._coord = coordinator
    self._hparams = hparams
    self._cleaner_names = [x.strip() for x in hparams.cleaners.split(',')]
    self._offset = 0

    # Load metadata:
    self._datadir = os.path.dirname(metadata_filename)
    with open(metadata_filename, encoding='utf-8') as f:
        self._metadata = [line.strip().split('|') for line in f]
        hours = sum((int(x[2]) for x in self._metadata)) * hparams.frame_shift_ms / (3600 * 1000)
        log('Loaded metadata for %d examples (%.2f hours)' % (len(self._metadata), hours))

    # Create placeholders for inputs and targets. Don't specify batch size because we want to
    # be able to feed different sized batches at eval time.
    self._placeholders = [
        tf.placeholder(tf.int32, [None, None], 'inputs'),
        tf.placeholder(tf.int32, [None], 'input_lengths'),
        tf.placeholder(tf.float32, [None, None, hparams.num_mels], 'mel_targets'),
        tf.placeholder(tf.float32, [None, None, hparams.num_freq], 'linear_targets'),
        tf.placeholder(tf.float32, [None, None, hparams.pml_dimension], 'pml_targets'),
    ]

    # Create queue for buffering data:
    queue = tf.FIFOQueue(8, [tf.int32, tf.int32, tf.float32, tf.float32, tf.float32], name='input_queue')
    self._enqueue_op = queue.enqueue(self._placeholders)
    self.inputs, self.input_lengths, self.mel_targets, self.linear_targets, self.pml_targets = queue.dequeue()
    self.inputs.set_shape(self._placeholders[0].shape)
    self.input_lengths.set_shape(self._placeholders[1].shape)
    self.mel_targets.set_shape(self._placeholders[2].shape)
    self.linear_targets.set_shape(self._placeholders[3].shape)
    self.pml_targets.set_shape(self._placeholders[4].shape)

    # Load CMUDict: If enabled, this will randomly substitute some words in the training data with
    # their ARPAbet equivalents, which will allow you to also pass ARPAbet to the model for
    # synthesis (useful for proper nouns, etc.)
    if hparams.use_cmudict:
        cmudict_path = os.path.join(self._datadir, 'cmudict-0.7b')
        if not os.path.isfile(cmudict_path):
            raise Exception('If use_cmudict=True, you must download '
                            'http://svn.code.sf.net/p/cmusphinx/code/trunk/cmudict/cmudict-0.7b to %s' % cmudict_path)
        self._cmudict = cmudict.CMUDict(cmudict_path, keep_ambiguous=False)
        log('Loaded CMUDict with %d unambiguous entries' % len(self._cmudict))
    else:
        self._cmudict = None
def synthesize(args, hparams, taco_checkpoint, wave_checkpoint, sentences):
    log('Running End-to-End TTS Evaluation. Model: {}'.format(args.name or args.model))
    log('Synthesizing mel-spectrograms from text..')
    wavenet_in_dir = tacotron_synthesize(args, hparams, taco_checkpoint, sentences)
    log('Synthesizing audio from mel-spectrograms.. (This may take a while)')
    wavenet_synthesize(args, hparams, wave_checkpoint)
    log('Tacotron-2 TTS synthesis complete!')
def make_test_batches(self):
    start = time.time()

    # Read a group of examples
    n = self._hparams.tacotron_batch_size
    r = self._hparams.outputs_per_step

    # Test on the entire test set
    examples = [self._get_test_groups() for i in range(len(self._test_meta))]

    # Bucket examples based on similar output sequence length for efficiency
    examples.sort(key=lambda x: x[-1])
    batches = [examples[i:i + n] for i in range(0, len(examples), n)]
    np.random.shuffle(batches)

    log('\nGenerated {} test batches of size {} in {:.3f} sec'.format(len(batches), n, time.time() - start))
    return batches, r
def _enqueue_next_train_group(self):
    while not self._coord.should_stop():
        start = time.time()

        # Read a group of examples
        n = self._hparams.tacotron_batch_size
        r = self._hparams.outputs_per_step
        examples = [self._get_next_example() for i in range(n * _batches_per_group)]

        # Bucket examples based on similar output sequence length for efficiency
        examples.sort(key=lambda x: x[-1])
        batches = [examples[i:i + n] for i in range(0, len(examples), n)]
        np.random.shuffle(batches)

        log('\nGenerated {} train batches of size {} in {:.3f} sec'.format(len(batches), n, time.time() - start))
        for batch in batches:
            feed_dict = dict(zip(self._placeholders, self._prepare_batch(batch, r)))
            self._session.run(self._enqueue_op, feed_dict=feed_dict)
def load(self, checkpoint_path, hparams, gta=False, model_name='Tacotron'):
    log('Constructing model: %s' % model_name)
    inputs = tf.placeholder(tf.int32, [1, None], 'inputs')
    input_lengths = tf.placeholder(tf.int32, [1], 'input_lengths')
    targets = tf.placeholder(tf.float32, [1, None, hparams.num_mels], 'mel_targets')
    with tf.variable_scope('model') as scope:
        self.model = create_model(model_name, hparams)
        if gta:
            self.model.initialize(inputs, input_lengths, targets, gta=gta)
        else:
            self.model.initialize(inputs, input_lengths)
        self.mel_outputs = self.model.mel_outputs
        self.alignment = self.model.alignments[0]

    self.gta = gta
    self._hparams = hparams

    log('Loading checkpoint: %s' % checkpoint_path)
    self.session = tf.Session()
    self.session.run(tf.global_variables_initializer())

    saver = tf.train.Saver()
    saver.restore(self.session, checkpoint_path)
def run_synthesis(args, checkpoint_path, output_dir, hparams):
    GTA = (args.GTA == 'True')
    if GTA:
        synth_dir = os.path.join(output_dir, 'gta')
    else:
        synth_dir = os.path.join(output_dir, 'natural')
    # Create output path if it doesn't exist
    os.makedirs(synth_dir, exist_ok=True)

    metadata_filename = os.path.join(args.input_dir, 'train.txt')
    log(hparams_debug_string())
    synth = Synthesizer()
    synth.load(checkpoint_path, hparams, gta=GTA)
    with open(metadata_filename, encoding='utf-8') as f:
        metadata = [line.strip().split('|') for line in f]
        frame_shift_ms = hparams.hop_size / hparams.sample_rate
        hours = sum([int(x[4]) for x in metadata]) * frame_shift_ms / 3600
        log('Loaded metadata for {} examples ({:.2f} hours)'.format(len(metadata), hours))

    log('starting synthesis')
    mel_dir = os.path.join(args.input_dir, 'mels')
    wav_dir = os.path.join(args.input_dir, 'audio')
    with open(os.path.join(synth_dir, 'map.txt'), 'w') as file:
        for i, meta in enumerate(tqdm(metadata)):
            text = meta[5]
            mel_filename = os.path.join(mel_dir, meta[1])
            wav_filename = os.path.join(wav_dir, meta[0])
            mel_output_filename = synth.synthesize(text, i + 1, synth_dir, None, mel_filename)
            file.write('{}|{}|{}|{}\n'.format(wav_filename, mel_filename, mel_output_filename, text))

    log('synthesized mel spectrograms at {}'.format(synth_dir))
    return os.path.join(synth_dir, 'map.txt')
def run_live(args, checkpoint_path, hparams):
    # Log to terminal without keeping any records in files
    log(hparams_debug_string())
    synth = Synthesizer()
    synth.load(checkpoint_path, hparams)

    # Generate a fast greeting message
    greetings = 'Hello, Welcome to the Live testing tool. Please type a message and I will try to read it!'
    log(greetings)
    generate_fast(synth, greetings)

    # Interaction loop
    while True:
        try:
            text = input()
            generate_fast(synth, text)
        except KeyboardInterrupt:
            leave = 'Thank you for testing our features. See you soon.'
            log(leave)
            generate_fast(synth, leave)
            sleep(2)
            break
def initialize(self, inputs, input_lengths, mel_targets=None, stop_token_targets=None,
               linear_targets=None, targets_lengths=None, gta=False, global_step=None,
               is_training=False, is_evaluating=False):
    """
    Initializes the model for inference. Sets "mel_outputs" and "alignments" fields.

    Args:
        - inputs: int32 Tensor with shape [N, T_in] where N is batch size, T_in is number of
          steps in the input time series, and values are character IDs
        - input_lengths: int32 Tensor with shape [N] where N is batch size and values are the
          lengths of each sequence in inputs.
        - mel_targets: float32 Tensor with shape [N, T_out, M] where N is batch size, T_out is
          number of steps in the output time series, M is num_mels, and values are entries in
          the mel spectrogram. Only needed for training.
    """
    if mel_targets is None and stop_token_targets is not None:
        raise ValueError('no mel targets were provided but token_targets were given')
    if mel_targets is not None and stop_token_targets is None and not gta:
        raise ValueError('Mel targets are provided without corresponding token_targets')
    if not gta and self._hparams.predict_linear and linear_targets is None and is_training:
        raise ValueError('Model is set to use post processing to predict linear spectrograms in training but no linear targets given!')
    if gta and linear_targets is not None:
        raise ValueError('Linear spectrogram prediction is not supported in GTA mode!')
    if is_training and self._hparams.mask_decoder and targets_lengths is None:
        raise RuntimeError('Model set to mask paddings but no targets lengths provided for the mask!')
    if is_training and is_evaluating:
        raise RuntimeError('Model can not be in training and evaluation modes at the same time!')

    with tf.variable_scope('inference') as scope:
        batch_size = tf.shape(inputs)[0]
        hp = self._hparams
        assert hp.tacotron_teacher_forcing_mode in ('constant', 'scheduled')
        if hp.tacotron_teacher_forcing_mode == 'scheduled' and is_training:
            assert global_step is not None

        # GTA is only used for predicting mels to train the WaveNet vocoder,
        # so we omit post processing when doing GTA synthesis
        post_condition = hp.predict_linear and not gta

        # Embeddings ==> [batch_size, sequence_length, embedding_dim]
        embedding_table = tf.get_variable(
            'inputs_embedding', [len(symbols), hp.embedding_dim], dtype=tf.float32)
        embedded_inputs = tf.nn.embedding_lookup(embedding_table, inputs)

        # Encoder Cell ==> [batch_size, encoder_steps, encoder_lstm_units]
        encoder_cell = TacotronEncoderCell(
            EncoderConvolutions(is_training, hparams=hp, scope='encoder_convolutions'),
            EncoderRNN(is_training, size=hp.encoder_lstm_units,
                       zoneout=hp.tacotron_zoneout_rate, scope='encoder_LSTM'))
        encoder_outputs = encoder_cell(embedded_inputs, input_lengths)

        # For shape visualization purposes
        enc_conv_output_shape = encoder_cell.conv_output_shape

        # Decoder parts
        # Attention decoder prenet
        prenet = Prenet(is_training, layers_sizes=hp.prenet_layers,
                        drop_rate=hp.tacotron_dropout_rate, scope='decoder_prenet')
        # Attention mechanism
        attention_mechanism = LocationSensitiveAttention(
            hp.attention_dim, encoder_outputs, hparams=hp, mask_encoder=hp.mask_encoder,
            memory_sequence_length=input_lengths, smoothing=hp.smoothing,
            cumulate_weights=hp.cumulative_weights)
        # Decoder LSTM cells
        decoder_lstm = DecoderRNN(is_training, layers=hp.decoder_layers,
                                  size=hp.decoder_lstm_units,
                                  zoneout=hp.tacotron_zoneout_rate, scope='decoder_lstm')
        # Frames projection layer
        frame_projection = FrameProjection(hp.num_mels * hp.outputs_per_step, scope='linear_transform')
        # <stop_token> projection layer
        stop_projection = StopProjection(is_training or is_evaluating,
                                         shape=hp.outputs_per_step, scope='stop_token_projection')

        # Decoder Cell ==> [batch_size, decoder_steps, num_mels * r] (after decoding)
        decoder_cell = TacotronDecoderCell(prenet, attention_mechanism, decoder_lstm,
                                           frame_projection, stop_projection)

        # Define the helper for our decoder
        if is_training or is_evaluating or gta:
            self.helper = TacoTrainingHelper(batch_size, mel_targets, stop_token_targets,
                                             hp, gta, is_evaluating, global_step)
        else:
            self.helper = TacoTestHelper(batch_size, hp)

        # Initial decoder state
        decoder_init_state = decoder_cell.zero_state(batch_size=batch_size, dtype=tf.float32)

        # Only use max iterations at synthesis time
        max_iters = hp.max_iters if not (is_training or is_evaluating) else None

        # Decode
        (frames_prediction, stop_token_prediction, _), final_decoder_state, _ = dynamic_decode(
            CustomDecoder(decoder_cell, self.helper, decoder_init_state),
            impute_finished=False,
            maximum_iterations=max_iters,
            swap_memory=hp.tacotron_swap_with_cpu)

        # Reshape outputs to be one output per entry
        # ==> [batch_size, non_reduced_decoder_steps (decoder_steps * r), num_mels]
        decoder_output = tf.reshape(frames_prediction, [batch_size, -1, hp.num_mels])
        stop_token_prediction = tf.reshape(stop_token_prediction, [batch_size, -1])

        # Postnet
        postnet = Postnet(is_training, hparams=hp, scope='postnet_convolutions')

        # Compute residual using post-net ==> [batch_size, decoder_steps * r, postnet_channels]
        residual = postnet(decoder_output)

        # Project residual to same dimension as mel spectrogram
        # ==> [batch_size, decoder_steps * r, num_mels]
        residual_projection = FrameProjection(hp.num_mels, scope='postnet_projection')
        projected_residual = residual_projection(residual)

        # Compute the mel spectrogram
        mel_outputs = decoder_output + projected_residual

        if post_condition:
            # Based on https://github.com/keithito/tacotron/blob/tacotron2-work-in-progress/models/tacotron.py
            # Post-processing network to map mels to linear spectrograms using the same architecture as the encoder
            post_processing_cell = TacotronEncoderCell(
                EncoderConvolutions(is_training, hparams=hp, scope='post_processing_convolutions'),
                EncoderRNN(is_training, size=hp.encoder_lstm_units,
                           zoneout=hp.tacotron_zoneout_rate, scope='post_processing_LSTM'))
            expand_outputs = post_processing_cell(mel_outputs)
            linear_outputs = FrameProjection(hp.num_freq, scope='post_processing_projection')(expand_outputs)

        # Grab alignments from the final decoder state
        alignments = tf.transpose(final_decoder_state.alignment_history.stack(), [1, 2, 0])

        if is_training:
            self.ratio = self.helper._ratio
        self.inputs = inputs
        self.input_lengths = input_lengths
        self.decoder_output = decoder_output
        self.alignments = alignments
        self.stop_token_prediction = stop_token_prediction
        self.stop_token_targets = stop_token_targets
        self.mel_outputs = mel_outputs
        if post_condition:
            self.linear_outputs = linear_outputs
            self.linear_targets = linear_targets
        self.mel_targets = mel_targets
        self.targets_lengths = targets_lengths

        log('Initialized Tacotron model. Dimensions (? = dynamic shape): ')
        log('  Train mode:               {}'.format(is_training))
        log('  Eval mode:                {}'.format(is_evaluating))
        log('  GTA mode:                 {}'.format(gta))
        log('  Synthesis mode:           {}'.format(not (is_training or is_evaluating)))
        log('  embedding:                {}'.format(embedded_inputs.shape))
        log('  enc conv out:             {}'.format(enc_conv_output_shape))
        log('  encoder out:              {}'.format(encoder_outputs.shape))
        log('  decoder out:              {}'.format(decoder_output.shape))
        log('  residual out:             {}'.format(residual.shape))
        log('  projected residual out:   {}'.format(projected_residual.shape))
        log('  mel out:                  {}'.format(mel_outputs.shape))
        if post_condition:
            log('  linear out:               {}'.format(linear_outputs.shape))
        log('  <stop_token> out:         {}'.format(stop_token_prediction.shape))
def __init__(self, coordinator, metadata_filename, hparams):
    super(Feeder, self).__init__()
    self._coord = coordinator
    self._hparams = hparams
    self._cleaner_names = [x.strip() for x in hparams.cleaners.split(',')]
    self._train_offset = 0
    self._test_offset = 0

    # Load metadata
    self._mel_dir = os.path.join(os.path.dirname(metadata_filename), 'mels')
    self._linear_dir = os.path.join(os.path.dirname(metadata_filename), 'linear')
    with open(metadata_filename, encoding='utf-8') as f:
        self._metadata = [line.strip().split('|') for line in f]
        frame_shift_ms = hparams.hop_size / hparams.sample_rate
        hours = sum([int(x[4]) for x in self._metadata]) * frame_shift_ms / 3600
        log('Loaded metadata for {} examples ({:.2f} hours)'.format(len(self._metadata), hours))

    # Train/test split
    if hparams.tacotron_test_size is None:
        assert hparams.tacotron_test_batches is not None

    test_size = (hparams.tacotron_test_size if hparams.tacotron_test_size is not None
                 else hparams.tacotron_test_batches * hparams.tacotron_batch_size)
    indices = np.arange(len(self._metadata))
    train_indices, test_indices = train_test_split(
        indices, test_size=test_size, random_state=hparams.tacotron_data_random_state)

    # Make sure the test set size is a multiple of batch size, else round down
    # and push the extra test examples back into the training set
    len_test_indices = self._round_down(len(test_indices), hparams.tacotron_batch_size)
    extra_test = test_indices[len_test_indices:]
    test_indices = test_indices[:len_test_indices]
    train_indices = np.concatenate([train_indices, extra_test])

    self._train_meta = list(np.array(self._metadata)[train_indices])
    self._test_meta = list(np.array(self._metadata)[test_indices])

    self.test_steps = len(self._test_meta) // hparams.tacotron_batch_size

    if hparams.tacotron_test_size is None:
        assert hparams.tacotron_test_batches == self.test_steps

    # Pad input sequences with the <pad_token> 0 ( _ )
    self._pad = 0
    # Explicitly set the padding to a value that doesn't originally exist in the spectrogram,
    # to avoid any possible conflicts, without affecting the output range of the model too much
    if hparams.symmetric_mels:
        self._target_pad = -(hparams.max_abs_value + .1)
    else:
        self._target_pad = -0.1
    # Mark finished sequences with 1s
    self._token_pad = 1.

    with tf.device('/cpu:0'):
        # Create placeholders for inputs and targets. Don't specify batch size because we want
        # to be able to feed different batch sizes at eval time.
        self._placeholders = [
            tf.placeholder(tf.int32, shape=(None, None), name='inputs'),
            tf.placeholder(tf.int32, shape=(None, ), name='input_lengths'),
            tf.placeholder(tf.float32, shape=(None, None, hparams.num_mels), name='mel_targets'),
            tf.placeholder(tf.float32, shape=(None, None), name='token_targets'),
            tf.placeholder(tf.float32, shape=(None, None, hparams.num_freq), name='linear_targets'),
            tf.placeholder(tf.int32, shape=(None, ), name='targets_lengths'),
        ]

        # Create queue for buffering data
        queue = tf.FIFOQueue(8, [tf.int32, tf.int32, tf.float32, tf.float32, tf.float32, tf.int32],
                             name='input_queue')
        self._enqueue_op = queue.enqueue(self._placeholders)
        self.inputs, self.input_lengths, self.mel_targets, self.token_targets, \
            self.linear_targets, self.targets_lengths = queue.dequeue()

        self.inputs.set_shape(self._placeholders[0].shape)
        self.input_lengths.set_shape(self._placeholders[1].shape)
        self.mel_targets.set_shape(self._placeholders[2].shape)
        self.token_targets.set_shape(self._placeholders[3].shape)
        self.linear_targets.set_shape(self._placeholders[4].shape)
        self.targets_lengths.set_shape(self._placeholders[5].shape)

        # Create eval queue for buffering eval data
        eval_queue = tf.FIFOQueue(1, [tf.int32, tf.int32, tf.float32, tf.float32, tf.float32, tf.int32],
                                  name='eval_queue')
        self._eval_enqueue_op = eval_queue.enqueue(self._placeholders)
        self.eval_inputs, self.eval_input_lengths, self.eval_mel_targets, self.eval_token_targets, \
            self.eval_linear_targets, self.eval_targets_lengths = eval_queue.dequeue()

        self.eval_inputs.set_shape(self._placeholders[0].shape)
        self.eval_input_lengths.set_shape(self._placeholders[1].shape)
        self.eval_mel_targets.set_shape(self._placeholders[2].shape)
        self.eval_token_targets.set_shape(self._placeholders[3].shape)
        self.eval_linear_targets.set_shape(self._placeholders[4].shape)
        self.eval_targets_lengths.set_shape(self._placeholders[5].shape)
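# The feeder calls self._round_down, whose body isn't shown in this listing.
# A minimal sketch of the usual rounding helpers it relies on (assumed
# definitions, not copied from the source):
def _round_up(x, multiple):
    remainder = x % multiple
    return x if remainder == 0 else x + multiple - remainder

def _round_down(x, multiple):
    remainder = x % multiple
    return x if remainder == 0 else x - remainder

assert _round_up(34, 32) == 64
assert _round_down(34, 32) == 32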
def initialize(self, y, c, g, input_lengths, x=None, synthesis_length=None):
    '''Initialize wavenet graph for train, eval and test cases.
    '''
    hparams = self._hparams
    self.is_training = x is not None
    self.is_evaluating = not self.is_training and y is not None
    # Set all convolutions to corresponding mode
    self.set_mode(self.is_training)

    log('Initializing Wavenet model. Dimensions (? = dynamic shape): ')
    log('  Train mode:        {}'.format(self.is_training))
    log('  Eval mode:         {}'.format(self.is_evaluating))
    log('  Synthesis mode:    {}'.format(not (self.is_training or self.is_evaluating)))

    with tf.variable_scope('inference') as scope:
        # Training
        if self.is_training:
            batch_size = tf.shape(x)[0]
            # [batch_size, time_length, 1]
            self.mask = self.get_mask(input_lengths, maxlen=tf.shape(x)[-1])  # To be used in loss computation
            # [batch_size, channels, time_length]
            y_hat = self.step(x, c, g, softmax=False)  # softmax is automatically computed inside softmax_cross_entropy if needed

            if is_mulaw_quantize(hparams.input_type):
                # [batch_size, time_length, channels]
                self.y_hat_q = tf.transpose(y_hat, [0, 2, 1])

            self.y_hat = y_hat
            self.y = y
            self.input_lengths = input_lengths

            # Graph extension for log saving
            # [batch_size, time_length]
            shape_control = (batch_size, tf.shape(x)[-1], 1)
            with tf.control_dependencies([tf.assert_equal(tf.shape(y), shape_control)]):
                y_log = tf.squeeze(y, [-1])
                if is_mulaw_quantize(hparams.input_type):
                    self.y = y_log

            y_hat_log = tf.cond(tf.equal(tf.rank(y_hat), 4),
                                lambda: tf.squeeze(y_hat, [-1]),
                                lambda: y_hat)
            y_hat_log = tf.reshape(y_hat_log, [batch_size, hparams.out_channels, -1])

            if is_mulaw_quantize(hparams.input_type):
                # [batch_size, time_length]
                y_hat_log = tf.reduce_max(tf.nn.softmax(y_hat_log, axis=1), 1)
                y_hat_log = util.inv_mulaw_quantize(y_hat_log, hparams.quantize_channels)
                y_log = util.inv_mulaw_quantize(y_log, hparams.quantize_channels)
            else:
                # [batch_size, time_length]
                y_hat_log = sample_from_discretized_mix_logistic(
                    y_hat_log, log_scale_min=hparams.log_scale_min)
                if is_mulaw(hparams.input_type):
                    y_hat_log = util.inv_mulaw(y_hat_log, hparams.quantize_channels)
                    y_log = util.inv_mulaw(y_log, hparams.quantize_channels)

            self.y_hat_log = y_hat_log
            self.y_log = y_log

            log('  inputs:            {}'.format(x.shape))
            if self.local_conditioning_enabled():
                log('  local_condition:   {}'.format(c.shape))
            if self.has_speaker_embedding():
                log('  global_condition:  {}'.format(g.shape))
            log('  targets:           {}'.format(y_log.shape))
            log('  outputs:           {}'.format(y_hat_log.shape))

        # Evaluating
        elif self.is_evaluating:
            # [time_length, ]
            idx = 0
            length = input_lengths[idx]
            y_target = tf.reshape(y[idx], [-1])[:length]

            if c is not None:
                c = tf.expand_dims(c[idx, :, :length], axis=0)
                with tf.control_dependencies([tf.assert_equal(tf.rank(c), 3)]):
                    c = tf.identity(c, name='eval_assert_c_rank_op')
            if g is not None:
                g = g[idx]

            # Start silence frame
            if is_mulaw_quantize(hparams.input_type):
                initial_value = mulaw_quantize(0, hparams.quantize_channels)
            elif is_mulaw(hparams.input_type):
                initial_value = mulaw(0.0, hparams.quantize_channels)
            else:
                initial_value = 0.0

            # [channels, ]
            if is_mulaw_quantize(hparams.input_type):
                initial_input = tf.one_hot(indices=initial_value,
                                           depth=hparams.quantize_channels, dtype=tf.float32)
                initial_input = tf.reshape(initial_input, [1, 1, hparams.quantize_channels])
            else:
                initial_input = tf.ones([1, 1, 1], tf.float32) * initial_value

            # Fast eval
            y_hat = self.incremental(initial_input, c=c, g=g, time_length=length,
                                     softmax=True, quantize=True,
                                     log_scale_min=hparams.log_scale_min)

            # Save targets and length for eval loss computation
            if is_mulaw_quantize(hparams.input_type):
                self.y_eval = tf.reshape(y[idx], [1, -1])[:, :length]
            else:
                self.y_eval = tf.expand_dims(y[idx], axis=0)[:, :length, :]
            self.eval_length = length

            if is_mulaw_quantize(hparams.input_type):
                y_hat = tf.reshape(tf.reduce_max(y_hat, axis=1), [-1])
                y_hat = inv_mulaw_quantize(y_hat, hparams.quantize_channels)
                y_target = inv_mulaw_quantize(y_target, hparams.quantize_channels)
            elif is_mulaw(hparams.input_type):
                y_hat = inv_mulaw(tf.reshape(y_hat, [-1]), hparams.quantize_channels)
                y_target = inv_mulaw(y_target, hparams.quantize_channels)
            else:
                y_hat = tf.reshape(y_hat, [-1])

            self.y_hat = y_hat
            self.y_target = y_target

            if self.local_conditioning_enabled():
                log('  local_condition:   {}'.format(c.shape))
            if self.has_speaker_embedding():
                log('  global_condition:  {}'.format(g.shape))
            log('  targets:           {}'.format(y_target.shape))
            log('  outputs:           {}'.format(y_hat.shape))

        # Synthesizing
        else:
            if c is None:
                assert synthesis_length is not None
            else:
                # [batch_size, local_condition_time, local_condition_dimension(num_mels)]
                message = ('Expected 3 dimension shape [batch_size(1), time_length, {}] for local condition features but found {}'.format(
                    hparams.cin_channels, c.shape))
                with tf.control_dependencies([tf.assert_equal(tf.rank(c), 3, message=message)]):
                    c = tf.identity(c, name='synthesis_assert_c_rank_op')

                Tc = tf.shape(c)[1]
                upsample_factor = audio.get_hop_size(self._hparams)

                # Overwrite length with respect to local condition features
                synthesis_length = Tc * upsample_factor

                # [batch_size, local_condition_dimension, local_condition_time]
                # time_length will be corrected using the upsample network
                c = tf.transpose(c, [0, 2, 1])

            # Start silence frame
            if is_mulaw_quantize(hparams.input_type):
                initial_value = mulaw_quantize(0, hparams.quantize_channels)
            elif is_mulaw(hparams.input_type):
                initial_value = mulaw(0.0, hparams.quantize_channels)
            else:
                initial_value = 0.0

            if is_mulaw_quantize(hparams.input_type):
                assert initial_value >= 0 and initial_value < hparams.quantize_channels
                initial_input = tf.one_hot(indices=initial_value,
                                           depth=hparams.quantize_channels, dtype=tf.float32)
                initial_input = tf.reshape(initial_input, [1, 1, hparams.quantize_channels])
            else:
                initial_input = tf.ones([1, 1, 1], tf.float32) * initial_value

            y_hat = self.incremental(initial_input, c=c, g=g, time_length=synthesis_length,
                                     softmax=True, quantize=True,
                                     log_scale_min=hparams.log_scale_min)

            if is_mulaw_quantize(hparams.input_type):
                y_hat = tf.reshape(tf.reduce_max(y_hat, axis=1), [-1])
                y_hat = util.inv_mulaw_quantize(y_hat, hparams.quantize_channels)
            elif is_mulaw(hparams.input_type):
                y_hat = util.inv_mulaw(tf.reshape(y_hat, [-1]), hparams.quantize_channels)
            else:
                y_hat = tf.reshape(y_hat, [-1])

            self.y_hat = y_hat

            if self.local_conditioning_enabled():
                log('  local_condition:   {}'.format(c.shape))
            if self.has_speaker_embedding():
                log('  global_condition:  {}'.format(g.shape))
            log('  outputs:           {}'.format(y_hat.shape))

    self.variables = tf.trainable_variables()
    self.ema = tf.train.ExponentialMovingAverage(decay=hparams.wavenet_ema_decay)
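# The mu-law companding used above follows the standard formula
# f(x) = sign(x) * ln(1 + mu*|x|) / ln(1 + mu). A minimal numpy sketch of
# mulaw / inv_mulaw and a quantized variant (illustrative, not the repo's
# util implementation; mu = 255, i.e. 256 quantize channels):
import numpy as np

def mulaw(x, mu=255):
    return np.sign(x) * np.log1p(mu * np.abs(x)) / np.log1p(mu)

def inv_mulaw(y, mu=255):
    return np.sign(y) * ((1 + mu) ** np.abs(y) - 1) / mu

def mulaw_quantize(x, quantize_channels=256):
    mu = quantize_channels - 1
    # Map companded [-1, 1] onto integer bins {0, ..., mu}
    return ((mulaw(x, mu) + 1) / 2 * mu + 0.5).astype(np.int32)

x = np.linspace(-1, 1, 5)
assert np.allclose(inv_mulaw(mulaw(x)), x, atol=1e-6)  # companding round-trips
assert mulaw_quantize(np.array(0.0)) == 128            # silence maps to the middle bin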
def train(args, log_dir, hparams):
    log('\n#############################################################\n')
    log('Tacotron Train\n')
    log('#############################################################\n')
    checkpoint = tacotron_train(args, log_dir, hparams)
    tf.reset_default_graph()
    if checkpoint is None:
        # raising a bare string is a TypeError in Python 3; use an exception type
        raise RuntimeError('Error occurred while training Tacotron, exiting!')

    log('\n#############################################################\n')
    log('Tacotron GTA Synthesis\n')
    log('#############################################################\n')
    input_path = tacotron_synthesize(args, hparams, checkpoint)

    log('\n#############################################################\n')
    log('Wavenet Train\n')
    log('#############################################################\n')
    wavenet_train(args, log_dir, hparams, input_path)