def run_eval(args, checkpoint_path, output_dir, hparams, sentences):
    """Synthesize every sentence on its own and list the results in ``map.txt``.

    Args:
        args: Parsed options; ``model``, ``mels_dir`` and ``speaker_id`` are read.
        checkpoint_path: Checkpoint handed to ``Synthesizer.load``.
        output_dir: Root directory; ``eval`` and ``logs-eval`` are created below it.
        hparams: Hyper-parameters forwarded to ``Synthesizer.load``.
        sentences: Texts to synthesize, one ``synthesize`` call per text.

    Returns:
        The path of the ``eval`` directory.

    Raises:
        AssertionError: If the model is ``'Tacotron-2'`` and ``args.mels_dir``
            does not normalise to the ``eval`` directory.
    """
    eval_dir = os.path.join(output_dir, 'eval')
    log_dir = os.path.join(output_dir, 'logs-eval')

    if args.model == 'Tacotron-2':
        # mels_dir = wavenet_input_dir
        assert os.path.normpath(eval_dir) == os.path.normpath(args.mels_dir)

    # Create output paths if they don't exist
    os.makedirs(eval_dir, exist_ok=True)
    os.makedirs(log_dir, exist_ok=True)
    os.makedirs(os.path.join(log_dir, 'wavs'), exist_ok=True)
    os.makedirs(os.path.join(log_dir, 'plots'), exist_ok=True)

    log(hparams_debug_string())
    synth = Synthesizer()
    synth.load(checkpoint_path, hparams, speaker_id=args.speaker_id)

    with open(os.path.join(eval_dir, 'map.txt'), 'w') as file:
        for i, text in enumerate(tqdm(sentences)):
            # args.speaker_id is indexed per sentence when given; otherwise
            # None is forwarded unchanged.
            if args.speaker_id is None:
                requested_speaker = None
            else:
                requested_speaker = [args.speaker_id[i]]
            mel_filename, speaker_id = synth.synthesize(
                [text], [i + 1], eval_dir, log_dir, None,
                speaker_id=requested_speaker)
            file.write('{}|{}|{}\n'.format(text, mel_filename[0], speaker_id[0]))

    log('synthesized mel spectrograms at {}'.format(eval_dir))
    return eval_dir
def run_synthesis_sytle_transfer(args, synth_metadata_filename, checkpoint_path, output_dir, hparams):
    """Synthesize everything listed in a style-transfer metadata file in one call.

    The metadata is parsed by ``get_filenames_from_metadata`` and the resulting
    lists are passed straight to ``Synthesizer.synthesize``. The ``natural``
    directory under ``output_dir`` is used both as output and as log directory.

    Args:
        args: Parsed options; ``input_dir`` and ``flip_spk_emt`` are read here
            and the whole object is also handed to ``Synthesizer.load``.
        synth_metadata_filename: Metadata file describing what to synthesize.
        checkpoint_path: Checkpoint handed to ``Synthesizer.load``.
        output_dir: Root directory; ``natural`` is created below it.
        hparams: Hyper-parameters forwarded to ``Synthesizer.load``.
    """
    synth_dir = os.path.join(output_dir, 'natural')
    # Create output path if it doesn't exist
    os.makedirs(synth_dir, exist_ok=True)

    log(hparams_debug_string())
    synth = Synthesizer()
    synth.load(args, checkpoint_path, hparams)

    parsed = get_filenames_from_metadata(
        synth_metadata_filename, args.input_dir, args.flip_spk_emt)
    # The third field (reference basenames) is returned but not used here.
    (texts, basenames, _basenames_refs, mel_filenames,
     ref_mels_emt, ref_mels_spk, emt_labels, spk_labels) = parsed

    synth.synthesize(
        texts, basenames, synth_dir, synth_dir, mel_filenames,
        mel_ref_filenames_emt=ref_mels_emt,
        mel_ref_filenames_spk=ref_mels_spk,
        emt_labels_synth=emt_labels,
        spk_labels_synth=spk_labels)
def run_single(args, checkpoint_path, output_dir, hparams, sentences):
    """Synthesize sentences batch by batch without writing a ``map.txt``.

    Args:
        args: Parsed options; ``model`` and ``mels_dir`` are read.
        checkpoint_path: Checkpoint handed to ``Synthesizer.load``.
        output_dir: Root directory; ``eval`` and ``logs-eval`` are created below it.
        hparams: Hyper-parameters; ``tacotron_synthesis_batch_size`` sets the
            batch length.
        sentences: Texts to synthesize.

    Returns:
        The path of the ``eval`` directory.

    Raises:
        AssertionError: If the model is ``'Tacotron-2'`` and ``args.mels_dir``
            does not normalise to the ``eval`` directory.
    """
    eval_dir = os.path.join(output_dir, 'eval')
    log_dir = os.path.join(output_dir, 'logs-eval')

    if args.model == 'Tacotron-2':
        assert os.path.normpath(eval_dir) == os.path.normpath(args.mels_dir)

    # Create output paths if they don't exist
    os.makedirs(eval_dir, exist_ok=True)
    os.makedirs(log_dir, exist_ok=True)
    os.makedirs(os.path.join(log_dir, 'wavs'), exist_ok=True)
    os.makedirs(os.path.join(log_dir, 'plots'), exist_ok=True)

    log(hparams_debug_string())
    synth = Synthesizer()
    synth.load(checkpoint_path, hparams)

    # Set inputs batch wise
    batch_size = hparams.tacotron_synthesis_batch_size
    sentences = [
        sentences[i:i + batch_size]
        for i in range(0, len(sentences), batch_size)
    ]
    print(sentences)

    log('Starting Synthesis Single')
    for texts in tqdm(sentences):
        print(texts, eval_dir, log_dir)
        # No basenames are supplied (None); the return value is not used.
        synth.synthesize(texts, None, eval_dir, log_dir, None)

    log('synthesized mel spectrograms at {}'.format(eval_dir))
    return eval_dir
def tacotron_synthesize(sentences):
    """Synthesize ``sentences`` into directory ``A`` using the latest checkpoint.

    The checkpoint path is taken from the checkpoint state found in the
    ``trained_model`` directory.

    Args:
        sentences: Texts to synthesize; they are numbered starting at 1.
    """
    # ignore warnings https://stackoverflow.com/questions/47068709/
    os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'

    output_dir = 'A'
    checkpoint_state = tf.train.get_checkpoint_state('trained_model')
    checkpoint_path = checkpoint_state.model_checkpoint_path
    print('####### checkpoint_path', checkpoint_path)

    synth = Synthesizer()
    synth.load(checkpoint_path)

    os.makedirs(output_dir, exist_ok=True)
    for index, text in enumerate(sentences, start=1):
        synth.synthesize(text, index, output_dir, None)

    print('Results at: {}'.format(output_dir))
def run_eval(args, checkpoint_path, output_dir, hparams, sentences):
    """Synthesize each sentence conditioned on a reference audio clip.

    Args:
        args: Parsed options; ``reference_audio`` (path to a wav file) is required.
        checkpoint_path: Checkpoint handed to ``Synthesizer.load``.
        output_dir: Root directory; ``eval`` and ``logs-eval`` are created below it.
        hparams: Hyper-parameters forwarded to ``Synthesizer.load``.
        sentences: Texts to synthesize, one ``synthesize`` call per text.

    Returns:
        The path of the ``eval`` directory.

    Raises:
        ValueError: If ``args.reference_audio`` is ``None``.
    """
    # Fail before creating directories or loading the model: nothing below
    # can succeed without a reference clip.
    if args.reference_audio is None:
        raise ValueError(
            "Evaluation without reference audio. Please provide path to reference audio."
        )

    eval_dir = os.path.join(output_dir, 'eval')
    log_dir = os.path.join(output_dir, 'logs-eval')

    # Create output paths if they don't exist
    os.makedirs(eval_dir, exist_ok=True)
    os.makedirs(log_dir, exist_ok=True)
    os.makedirs(os.path.join(log_dir, 'wavs'), exist_ok=True)
    os.makedirs(os.path.join(log_dir, 'plots'), exist_ok=True)

    log(hparams_debug_string())
    synth = Synthesizer()
    # NOTE(review): load() receives the audio *path* as reference_mel, while
    # synthesize() below receives the computed mel array - confirm that
    # load() only needs to know whether a reference is present.
    synth.load(checkpoint_path, hparams, reference_mel=args.reference_audio)

    ref_wav = audio.load_wav(args.reference_audio)
    reference_mel = audio.melspectrogram(ref_wav).astype(np.float32).T

    with open(os.path.join(eval_dir, 'map.txt'), 'w') as file:
        for i, text in enumerate(tqdm(sentences)):
            mel_filename = synth.synthesize(
                text, i + 1, eval_dir, log_dir, None, reference_mel=reference_mel)
            file.write('{}|{}\n'.format(text, mel_filename))

    log('synthesized mel spectrograms at {}'.format(eval_dir))
    return eval_dir
def run_eval(args, checkpoint_path, output_dir, hparams, sentences):
    """Synthesize each sentence and dump the mel as a flat binary file.

    After every sentence the saved ``.npy`` mel is reloaded, flattened and
    written to ``f32_for_lpcnet.f32`` in the current working directory.

    Args:
        args: Parsed options; ``model`` and ``mels_dir`` are read.
        checkpoint_path: Checkpoint handed to ``Synthesizer.load``.
        output_dir: Root directory; ``eval`` and ``logs-eval`` are created below it.
        hparams: Hyper-parameters forwarded to ``Synthesizer.load``.
        sentences: Texts to synthesize, one ``synthesize`` call per text.

    Returns:
        The path of the ``eval`` directory.

    Raises:
        AssertionError: If the model is ``'Both'`` or ``'Tacotron-2'`` and
            ``args.mels_dir`` does not normalise to the ``eval`` directory.
    """
    eval_dir = os.path.join(output_dir, 'eval')
    log_dir = os.path.join(output_dir, 'logs-eval')

    if args.model in ('Both', 'Tacotron-2'):
        # mels_dir = wavenet_input_dir
        assert os.path.normpath(eval_dir) == os.path.normpath(args.mels_dir)

    # Create output paths if they don't exist
    os.makedirs(eval_dir, exist_ok=True)
    os.makedirs(log_dir, exist_ok=True)
    os.makedirs(os.path.join(log_dir, 'wavs'), exist_ok=True)
    os.makedirs(os.path.join(log_dir, 'plots'), exist_ok=True)

    log(hparams_debug_string())
    synth = Synthesizer()
    synth.load(checkpoint_path, hparams)

    with open(os.path.join(eval_dir, 'map.txt'), 'w') as file:
        for i, text in enumerate(tqdm(sentences)):
            mel_filename = synth.synthesize(text, i + 1, eval_dir, log_dir, None)
            file.write('{}|{}\n'.format(text, mel_filename))

            # NOTE(review): the same output file is rewritten for every
            # sentence, so only the last sentence's features remain on disk.
            # Kept as-is; confirm this is the intended hand-off.
            npy_data = np.load(mel_filename).reshape((-1,))
            npy_data.tofile("f32_for_lpcnet.f32")

    log('synthesized mel spectrograms at {}'.format(eval_dir))
    return eval_dir
def run_eval(args, checkpoint_path, output_dir, hparams, sentences):
    """Synthesize each sentence, pre-processing Korean text first.

    Sentences for which ``is_korean_text`` is true are passed through
    ``normalize_number`` and ``split_to_jamo`` before synthesis; the processed
    text is what gets written to ``map.txt``.

    Args:
        args: Parsed options; ``model`` and ``mels_dir`` are read.
        checkpoint_path: Checkpoint handed to ``Synthesizer.load``.
        output_dir: Root directory; ``eval`` and ``logs-eval`` are created below it.
        hparams: Hyper-parameters; ``cleaners`` is passed to ``split_to_jamo``.
        sentences: Texts to synthesize, one ``synthesize`` call per text.

    Returns:
        The path of the ``eval`` directory.

    Raises:
        AssertionError: If the model is ``'Both'`` or ``'Tacotron-2'`` and
            ``args.mels_dir`` does not normalise to the ``eval`` directory.
    """
    eval_dir = os.path.join(output_dir, 'eval')
    log_dir = os.path.join(output_dir, 'logs-eval')

    if args.model in ('Both', 'Tacotron-2'):
        # mels_dir = wavenet_input_dir
        assert os.path.normpath(eval_dir) == os.path.normpath(args.mels_dir)

    # Create output paths if they don't exist
    os.makedirs(eval_dir, exist_ok=True)
    os.makedirs(log_dir, exist_ok=True)
    os.makedirs(os.path.join(log_dir, 'wavs'), exist_ok=True)
    os.makedirs(os.path.join(log_dir, 'plots'), exist_ok=True)

    log(hparams_debug_string())
    synth = Synthesizer()
    synth.load(checkpoint_path, hparams)

    # Explicit UTF-8: the processed Korean text is written verbatim, and the
    # locale's default encoding may be unable to represent it.
    with open(os.path.join(eval_dir, 'map.txt'), 'w', encoding='utf-8') as file:
        for i, text in enumerate(tqdm(sentences)):
            if is_korean_text(text):
                text = normalize_number(text)
                # Split Hangul into jamo units.
                text = split_to_jamo(text, hparams.cleaners)
            mel_filename = synth.synthesize(text, i + 1, eval_dir, log_dir, None)
            file.write('{}|{}\n'.format(text, mel_filename))

    log('synthesized mel spectrograms at {}'.format(eval_dir))
    return eval_dir
def run_eval(args, checkpoint_path, output_dir, hparams, sentences):
    """Synthesize sentences in batches and list the results in ``map.txt``.

    Args:
        args: Parsed options; ``model`` and ``mels_dir`` are read.
        checkpoint_path: Checkpoint handed to ``Synthesizer.load``.
        output_dir: Root directory; ``eval`` and ``logs-eval`` are created below it.
        hparams: Hyper-parameters; ``tacotron_synthesis_batch_size`` sets the
            batch length.
        sentences: Texts to synthesize.

    Returns:
        The path of the ``eval`` directory.

    Raises:
        AssertionError: If the model is ``'Tacotron-2'`` and ``args.mels_dir``
            does not normalise to the ``eval`` directory.
    """
    eval_dir = os.path.join(output_dir, 'eval')
    log_dir = os.path.join(output_dir, 'logs-eval')

    if args.model == 'Tacotron-2':
        assert os.path.normpath(eval_dir) == os.path.normpath(args.mels_dir)

    # Create output paths if they don't exist
    os.makedirs(eval_dir, exist_ok=True)
    os.makedirs(log_dir, exist_ok=True)
    os.makedirs(os.path.join(log_dir, 'wavs'), exist_ok=True)
    os.makedirs(os.path.join(log_dir, 'plots'), exist_ok=True)

    log(hparams_debug_string())
    synth = Synthesizer()
    synth.load(checkpoint_path, hparams)

    # Set inputs batch wise
    batch_size = hparams.tacotron_synthesis_batch_size
    sentences = [
        sentences[i:i + batch_size]
        for i in range(0, len(sentences), batch_size)
    ]

    log('Starting Synthesis')
    with open(os.path.join(eval_dir, 'map.txt'), 'w') as file:
        for i, texts in enumerate(tqdm(sentences)):
            basenames = [
                'batch_{}_sentence_{}'.format(i, j) for j in range(len(texts))
            ]
            mel_filenames, speaker_ids = synth.synthesize(
                texts, basenames, eval_dir, log_dir, None)

            # One "text|mel|speaker" row per sentence of the batch.
            for elems in zip(texts, mel_filenames, speaker_ids):
                file.write('|'.join([str(x) for x in elems]) + '\n')

    log('synthesized mel spectrograms at {}'.format(eval_dir))
    return eval_dir
def run_eval(args, checkpoint_path, output_dir, hparams, sentences):
    """Synthesize each sentence and list ``text|mel_filename`` rows in ``map.txt``.

    Args:
        args: Parsed options; ``model`` and ``mels_dir`` are read.
        checkpoint_path: Checkpoint handed to ``Synthesizer.load``.
        output_dir: Root directory; ``eval`` and ``logs-eval`` are created below it.
        hparams: Hyper-parameters forwarded to ``Synthesizer.load``.
        sentences: Texts to synthesize, one ``synthesize`` call per text.

    Returns:
        The path of the ``eval`` directory.

    Raises:
        AssertionError: If the model is ``'Both'`` or ``'Tacotron-2'`` and
            ``args.mels_dir`` does not normalise to the ``eval`` directory.
    """
    eval_dir = os.path.join(output_dir, 'eval')
    log_dir = os.path.join(output_dir, 'logs-eval')

    if args.model in ('Both', 'Tacotron-2'):
        # mels_dir = wavenet_input_dir
        assert os.path.normpath(eval_dir) == os.path.normpath(args.mels_dir)

    # Create output paths if they don't exist
    os.makedirs(eval_dir, exist_ok=True)
    os.makedirs(log_dir, exist_ok=True)
    os.makedirs(os.path.join(log_dir, 'wavs'), exist_ok=True)
    os.makedirs(os.path.join(log_dir, 'plots'), exist_ok=True)

    log(hparams_debug_string())
    synth = Synthesizer()
    synth.load(checkpoint_path, hparams)

    with open(os.path.join(eval_dir, 'map.txt'), 'w') as file:
        for i, text in enumerate(tqdm(sentences)):
            mel_filename = synth.synthesize(text, i + 1, eval_dir, log_dir, None)
            file.write('{}|{}\n'.format(text, mel_filename))

    log('synthesized mel spectrograms at {}'.format(eval_dir))
    return eval_dir
def run_synthesis(args, checkpoint_path, output_dir):
    """Synthesize every utterance listed in ``train.txt`` and write ``map.txt``.

    Reads the module-level ``hparams`` (``hop_size``, ``sample_rate``) to report
    the amount of loaded audio.

    Args:
        args: Parsed options; ``input_dir`` and ``GTA`` are read.
        checkpoint_path: Checkpoint handed to ``Synthesizer.load``.
        output_dir: Root directory; ``gta`` or ``natural`` is created below it.
    """
    metadata_filename = os.path.join(args.input_dir, 'train.txt')
    print(hparams_debug_string())

    synth = Synthesizer()
    synth.load(checkpoint_path, gta=args.GTA)

    with open(metadata_filename, encoding='utf-8') as f:
        metadata = [line.strip().split('|') for line in f]

    # hop_size / sample_rate is the frame shift in seconds (not milliseconds),
    # so frames * shift / 3600 gives hours.
    frame_shift_s = hparams.hop_size / hparams.sample_rate
    hours = sum(int(x[4]) for x in metadata) * frame_shift_s / 3600
    print('Loaded metadata for {} examples ({:.2f} hours)'.format(len(metadata), hours))

    # The comparison with True is kept on purpose: a non-boolean value such
    # as the string 'True' still selects 'natural', exactly as before.
    if args.GTA == True:  # noqa: E712
        synth_dir = os.path.join(output_dir, 'gta')
    else:
        synth_dir = os.path.join(output_dir, 'natural')

    # Create output path if it doesn't exist
    os.makedirs(synth_dir, exist_ok=True)

    print('starting synthesis')
    mel_dir = os.path.join(args.input_dir, 'mels')
    wav_dir = os.path.join(args.input_dir, 'audio')

    # Written as UTF-8 to match how the metadata (and therefore the text
    # column) was read above.
    with open(os.path.join(synth_dir, 'map.txt'), 'w', encoding='utf-8') as file:
        for i, meta in enumerate(tqdm(metadata)):
            text = meta[5]
            mel_filename = os.path.join(mel_dir, meta[1])
            wav_filename = os.path.join(wav_dir, meta[0])
            mel_output_filename = synth.synthesize(
                text, None, i + 1, synth_dir, None, mel_filename)
            file.write('{}|{}|{}|{}\n'.format(
                text, mel_filename, mel_output_filename, wav_filename))

    print('synthesized mel spectrograms at {}'.format(synth_dir))
def run_synthesis(args, checkpoint_path, output_dir, hparams):
    """Synthesize all of ``train.txt`` in batches and write a UTF-8 ``map.txt``.

    Progress messages are sent both to ``print`` and to ``log``.

    Args:
        args: Parsed options; ``GTA`` (compared with the string ``'True'``) and
            ``input_dir`` are read.
        checkpoint_path: Checkpoint handed to ``Synthesizer.load``.
        output_dir: Root directory; ``gta`` or ``natural`` is created below it.
        hparams: Hyper-parameters; ``hop_size``, ``sample_rate`` and
            ``tacotron_synthesis_batch_size`` are read.

    Returns:
        The path of the written ``map.txt``.
    """
    GTA = (args.GTA == 'True')
    synth_dir = os.path.join(output_dir, 'gta' if GTA else 'natural')
    # Create output path if it doesn't exist
    os.makedirs(synth_dir, exist_ok=True)

    metadata_filename = os.path.join(args.input_dir, 'train.txt')
    print(hparams_debug_string())
    log(hparams_debug_string())

    synth = Synthesizer()
    synth.load(checkpoint_path, hparams, gta=GTA)

    with open(metadata_filename, encoding='utf-8') as f:
        metadata = [line.strip().split('|') for line in f]

    seconds_per_frame = hparams.hop_size / hparams.sample_rate
    total_frames = sum([int(row[4]) for row in metadata])
    hours = total_frames * seconds_per_frame / (3600)
    print('Loaded metadata for {} examples ({:.2f} hours)'.format(
        len(metadata), hours))
    log('Loaded metadata for {} examples ({:.2f} hours)'.format(
        len(metadata), hours))

    # Set inputs batch wise
    batches = []
    for offset in range(0, len(metadata), hparams.tacotron_synthesis_batch_size):
        batches.append(
            metadata[offset:offset + hparams.tacotron_synthesis_batch_size])
    metadata = batches

    print('Starting Synthesis')
    log('Starting Synthesis')
    mel_dir = os.path.join(args.input_dir, 'mels')
    wav_dir = os.path.join(args.input_dir, 'audio')
    map_path = os.path.join(synth_dir, 'map.txt')

    with open(map_path, 'w', encoding="utf-8") as file:
        for batch in tqdm(metadata):
            texts = []
            mel_filenames = []
            wav_filenames = []
            basenames = []
            for entry in batch:
                mel_path = os.path.join(mel_dir, entry[1])
                texts.append(entry[5])
                mel_filenames.append(mel_path)
                wav_filenames.append(os.path.join(wav_dir, entry[0]))
                basenames.append(
                    os.path.basename(mel_path).replace('.npy', '').replace('mel-', ''))

            mel_output_filenames, speaker_ids = synth.synthesize(
                texts, basenames, synth_dir, None, mel_filenames)

            rows = zip(wav_filenames, mel_filenames, mel_output_filenames,
                       speaker_ids, texts)
            for row in rows:
                file.write('|'.join(map(str, row)) + '\n')

    print('synthesized mel spectrograms at {}'.format(synth_dir))
    log('synthesized mel spectrograms at {}'.format(synth_dir))
    return os.path.join(synth_dir, 'map.txt')
def run_eval(args, checkpoint_path, output_dir, hparams, text, step, cwd):
    """Run a single synthesis call for ``text`` at ``step``.

    Only the log directory and its ``plots`` sub-directory are created here;
    the ``eval`` directory itself is not.

    Args:
        args: Parsed options; ``model`` and ``mels_dir`` are read.
        checkpoint_path: Checkpoint handed to ``Synthesizer.load``.
        output_dir: Root directory for ``eval`` and ``logs-eval``.
        hparams: Hyper-parameters forwarded to ``Synthesizer.load``.
        text: Passed through to ``Synthesizer.synthesize``.
        step: Passed through to ``Synthesizer.synthesize``.
        cwd: Passed through as the last positional argument of ``synthesize``.

    Returns:
        The path of the ``eval`` directory.

    Raises:
        AssertionError: If the model is ``'Tacotron-2'`` and ``args.mels_dir``
            does not normalise to the ``eval`` directory.
    """
    eval_dir = os.path.join(output_dir, 'eval')
    log_dir = os.path.join(output_dir, 'logs-eval')

    if args.model == 'Tacotron-2':
        expected = os.path.normpath(args.mels_dir)
        assert os.path.normpath(eval_dir) == expected

    # Create output paths if they don't exist
    plots_dir = os.path.join(log_dir, 'plots')
    os.makedirs(log_dir, exist_ok=True)
    os.makedirs(plots_dir, exist_ok=True)

    log(hparams_debug_string())
    synth = Synthesizer()
    synth.load(checkpoint_path, hparams)

    log('Starting Synthesis')
    synth.synthesize(text, step, eval_dir, log_dir, None, cwd)
    log('synthesized mel spectrograms at {}'.format(eval_dir))

    return eval_dir
def run_eval(args, checkpoint_path, output_dir, hparams, sentences):
    """Synthesize sentences in batches, optionally with a reference mel file.

    When ``hparams.tacotron_reference_waveform`` is truthy, ``args.mel_reference``
    is repeated once per sentence of the batch and passed to ``synthesize``;
    otherwise ``None`` is passed.

    Args:
        args: Parsed options; ``model``, ``mels_dir`` and ``mel_reference`` are read.
        checkpoint_path: Checkpoint handed to ``Synthesizer.load``.
        output_dir: Root directory; ``eval`` and ``logs-eval`` are created below it.
        hparams: Hyper-parameters; ``tacotron_synthesis_batch_size`` and
            ``tacotron_reference_waveform`` are read.
        sentences: Texts to synthesize.

    Returns:
        The path of the ``eval`` directory.

    Raises:
        AssertionError: If the model is ``'Tacotron-2'`` and ``args.mels_dir``
            does not normalise to the ``eval`` directory.
    """
    eval_dir = os.path.join(output_dir, 'eval')
    log_dir = os.path.join(output_dir, 'logs-eval')

    if args.model == 'Tacotron-2':
        assert os.path.normpath(eval_dir) == os.path.normpath(args.mels_dir)

    # Create output paths if they don't exist
    os.makedirs(eval_dir, exist_ok=True)
    os.makedirs(log_dir, exist_ok=True)
    os.makedirs(os.path.join(log_dir, 'wavs'), exist_ok=True)
    os.makedirs(os.path.join(log_dir, 'plots'), exist_ok=True)

    log(hparams_debug_string())
    synth = Synthesizer()
    synth.load(checkpoint_path, hparams)

    # Set inputs batch wise
    batch_size = hparams.tacotron_synthesis_batch_size
    sentences = [
        sentences[i:i + batch_size]
        for i in range(0, len(sentences), batch_size)
    ]

    log('Starting Synthesis')
    with open(os.path.join(eval_dir, 'map.txt'), 'w') as file:
        for i, texts in enumerate(tqdm(sentences)):
            basenames = [
                'batch_{}_sentence_{}'.format(i, j) for j in range(len(texts))
            ]

            if hparams.tacotron_reference_waveform:
                mel_reference_filename = [args.mel_reference] * len(basenames)
            else:
                mel_reference_filename = None

            mel_filenames, speaker_ids = synth.synthesize(
                texts, basenames, eval_dir, log_dir, None,
                mel_reference_filename)

            # One "text|mel|speaker" row per sentence of the batch.
            for elems in zip(texts, mel_filenames, speaker_ids):
                file.write('|'.join([str(x) for x in elems]) + '\n')

    log('synthesized mel spectrograms at {}'.format(eval_dir))
    return eval_dir
def run_synthesis(args, checkpoint_path, output_dir, sentences):
    """Synthesize ``sentences`` with a reference clip and save wav/text pairs.

    For each sentence the generated mel file is reloaded, inverted to a
    waveform and stored under ``<synth_dir>/wavs`` next to a text file holding
    the sentence. ``train.txt`` is only read to report how much data it lists.
    Reads the module-level ``hparams``.

    Args:
        args: Parsed options; ``input_dir``, ``GTA`` and ``reference_audio``
            are read.
        checkpoint_path: Checkpoint handed to ``Synthesizer.load``.
        output_dir: Root directory; ``gta`` or ``natural`` is created below it.
        sentences: Texts to synthesize.
    """
    metadata_filename = os.path.join(args.input_dir, 'train.txt')
    print(hparams_debug_string())

    synth = Synthesizer()
    synth.load(checkpoint_path, gta=args.GTA)

    wav = load_wav(args.reference_audio)
    reference_mel = melspectrogram(wav).transpose()

    with open(metadata_filename, encoding='utf-8') as f:
        metadata = [line.strip().split('|') for line in f]

    # hop_size / sample_rate is the frame shift in seconds.
    frame_shift_s = hparams.hop_size / hparams.sample_rate
    hours = sum(int(x[4]) for x in metadata) * frame_shift_s / 3600
    print('Loaded metadata for {} examples ({:.2f} hours)'.format(len(metadata), hours))

    # The comparison with True is kept on purpose so non-boolean values
    # behave exactly as before.
    if args.GTA == True:  # noqa: E712
        synth_dir = os.path.join(output_dir, 'gta')
    else:
        synth_dir = os.path.join(output_dir, 'natural')

    # Create output paths if they don't exist
    os.makedirs(synth_dir, exist_ok=True)
    os.makedirs(os.path.join(synth_dir, 'wavs/'), exist_ok=True)

    if hparams.predict_linear:
        # The previous code inverted an undefined name `linear` here and
        # raised NameError after the first sentence. synthesize() only hands
        # back the mel file path in this function, so there is no linear
        # spectrogram to invert: skip that export and say so.
        print('Warning: hparams.predict_linear is set but no linear output is '
              'available here; skipping linear wav export')

    print('starting synthesis')

    # map.txt is still created (empty): row writing is disabled in this variant.
    with open(os.path.join(synth_dir, 'map.txt'), 'w'):
        pass

    for i, text in enumerate(tqdm(sentences)):
        mel_output_filename = synth.synthesize(
            text=text, index=i + 1, out_dir=synth_dir, log_dir=None,
            mel_filename=None, reference_mel=reference_mel)

        mels = np.load(mel_output_filename)
        wav = audio.inv_mel_spectrogram(mels.T)
        audio.save_wav(
            wav,
            os.path.join(synth_dir, 'wavs/speech-wav-{:05d}-mel.wav'.format(i + 1)))

        # Handle is no longer named `tf`, which shadowed the usual tensorflow
        # alias; UTF-8 so non-ASCII sentences do not depend on the locale.
        text_path = os.path.join(synth_dir, 'wavs/speech-wav-{:05d}.txt'.format(i + 1))
        with open(text_path, 'w', encoding='utf-8') as text_file:
            text_file.write(text)

    print('synthesized mel spectrograms at {}'.format(synth_dir))
def run_eval(args, checkpoint_path, output_dir, hparams, sentences):
    """Synthesize each sentence as a one-element batch and write ``map.txt``.

    Args:
        args: Parsed options (not read by this function).
        checkpoint_path: Checkpoint handed to ``Synthesizer.load``.
        output_dir: Root directory; ``eval`` and ``logs-eval`` are created below it.
        hparams: Hyper-parameters forwarded to ``Synthesizer.load``.
        sentences: Texts to synthesize.
    """
    eval_dir = os.path.join(output_dir, 'eval')
    log_dir = os.path.join(output_dir, 'logs-eval')

    # Create output paths if they don't exist
    os.makedirs(eval_dir, exist_ok=True)
    os.makedirs(log_dir, exist_ok=True)
    os.makedirs(os.path.join(log_dir, 'wavs'), exist_ok=True)
    os.makedirs(os.path.join(log_dir, 'plots'), exist_ok=True)

    log(hparams_debug_string())
    synth = Synthesizer()
    synth.load(checkpoint_path, hparams)

    with open(os.path.join(eval_dir, 'map.txt'), 'w') as file:
        for i, text in enumerate(tqdm(sentences)):
            # synthesize() is called with single-element lists, so the first
            # element of each returned sequence belongs to this sentence.
            mel_filename, speaker_id = synth.synthesize(
                [text], [i + 1], eval_dir, log_dir, None)
            file.write('{}|{}|{}\n'.format(text, mel_filename[0], speaker_id[0]))

    log('synthesized mel spectrograms at {}'.format(eval_dir))
def run_eval(args, checkpoint_path, output_dir):
    """Synthesize the sentences stored in the module-level ``hparams``.

    Args:
        args: Parsed options (not read by this function).
        checkpoint_path: Checkpoint handed to ``Synthesizer.load``.
        output_dir: Root directory; ``eval`` and ``logs-eval`` are created below it.
    """
    synth = Synthesizer()
    synth.load(checkpoint_path)

    eval_dir = os.path.join(output_dir, 'eval')
    log_dir = os.path.join(output_dir, 'logs-eval')

    # Create output paths if they don't exist
    os.makedirs(eval_dir, exist_ok=True)
    os.makedirs(log_dir, exist_ok=True)
    os.makedirs(os.path.join(log_dir, 'wavs'), exist_ok=True)
    os.makedirs(os.path.join(log_dir, 'plots'), exist_ok=True)

    with open(os.path.join(eval_dir, 'map.txt'), 'w') as file:
        for i, text in enumerate(tqdm(hparams.sentences)):
            mel_filename = synth.synthesize(text, i + 1, eval_dir, log_dir, None)
            file.write('{}|{}\n'.format(text, mel_filename))
def run_eval(args, checkpoint_path, output_dir):
    """Synthesize ``hparams.sentences`` conditioned on a reference clip.

    Reads the module-level ``hparams`` for the sentences to synthesize.

    Args:
        args: Parsed options; ``reference_audio`` is loaded with ``load_wav``.
        checkpoint_path: Checkpoint handed to ``Synthesizer.load``.
        output_dir: Root directory; ``eval`` and ``logs-eval`` are created below it.
    """
    print(hparams_debug_string())
    synth = Synthesizer()
    synth.load(checkpoint_path)

    eval_dir = os.path.join(output_dir, 'eval')
    log_dir = os.path.join(output_dir, 'logs-eval')

    wav = load_wav(args.reference_audio)
    reference_mel = melspectrogram(wav).transpose()

    # Create output paths if they don't exist
    os.makedirs(eval_dir, exist_ok=True)
    os.makedirs(log_dir, exist_ok=True)
    os.makedirs(os.path.join(log_dir, 'wavs'), exist_ok=True)
    os.makedirs(os.path.join(log_dir, 'plots'), exist_ok=True)

    with open(os.path.join(eval_dir, 'map.txt'), 'w') as file:
        for i, text in enumerate(tqdm(hparams.sentences)):
            mel_filename = synth.synthesize(
                text, i + 1, eval_dir, log_dir, None, reference_mel)
            file.write('{}|{}\n'.format(text, mel_filename))

    print('synthesized mel spectrograms at {}'.format(eval_dir))
def run_eval(args, checkpoint_path, output_dir, hparams, sentences):
    """Synthesize sentences in batches with an optional reference mel.

    If ``args.reference_audio`` is given, it is loaded at ``hparams.sample_rate``
    and converted to a float32 mel spectrogram that is passed to both
    ``Synthesizer.load`` and ``synthesize``; otherwise ``None`` is passed.

    Args:
        args: Parsed options; ``model``, ``mels_dir`` and ``reference_audio``
            are read.
        checkpoint_path: Checkpoint handed to ``Synthesizer.load``.
        output_dir: Root directory; ``eval`` and ``logs-eval`` are created below it.
        hparams: Hyper-parameters; ``sample_rate`` and
            ``tacotron_synthesis_batch_size`` are read.
        sentences: Texts to synthesize.

    Returns:
        The path of the ``eval`` directory.

    Raises:
        AssertionError: If the model is ``'Tacotron-2'`` and ``args.mels_dir``
            does not normalise to the ``eval`` directory.
    """
    eval_dir = os.path.join(output_dir, 'eval')
    log_dir = os.path.join(output_dir, 'logs-eval')

    if args.model == 'Tacotron-2':
        assert os.path.normpath(eval_dir) == os.path.normpath(args.mels_dir)

    # Create output paths if they don't exist
    os.makedirs(eval_dir, exist_ok=True)
    os.makedirs(log_dir, exist_ok=True)
    os.makedirs(os.path.join(log_dir, 'wavs'), exist_ok=True)
    os.makedirs(os.path.join(log_dir, 'plots'), exist_ok=True)

    log(hparams_debug_string())
    synth = Synthesizer()

    if args.reference_audio is not None:
        ref_wav = audio.load_wav(args.reference_audio, sr=hparams.sample_rate)
        reference_mel = audio.melspectrogram(ref_wav, hparams).astype(np.float32).T
    else:
        # Evaluation without a reference is allowed in this variant.
        reference_mel = None

    synth.load(checkpoint_path, hparams, reference_mel=reference_mel)

    # Set inputs batch wise
    batch_size = hparams.tacotron_synthesis_batch_size
    sentences = [
        sentences[i:i + batch_size]
        for i in range(0, len(sentences), batch_size)
    ]

    log('Starting Synthesis')
    with open(os.path.join(eval_dir, 'map.txt'), 'w') as file:
        for i, texts in enumerate(tqdm(sentences)):
            basenames = [
                'batch_{:03d}_sentence_{:03d}'.format(i, j)
                for j in range(len(texts))
            ]
            mel_filenames, speaker_ids = synth.synthesize(
                texts, basenames, eval_dir, log_dir, None,
                reference_mel=reference_mel)

            # One "text|mel|speaker" row per sentence of the batch.
            for elems in zip(texts, mel_filenames, speaker_ids):
                file.write('|'.join([str(x) for x in elems]) + '\n')

    log('synthesized mel spectrograms at {}'.format(eval_dir))
    return eval_dir
def run_synthesis(args, checkpoint_path, output_dir, hparams):
    """Synthesize every utterance of ``train.txt`` one at a time.

    Args:
        args: Parsed options; ``GTA`` (compared with the string ``'True'``) and
            ``input_dir`` are read.
        checkpoint_path: Checkpoint handed to ``Synthesizer.load``.
        output_dir: Root directory; ``gta`` or ``natural`` is created below it.
        hparams: Hyper-parameters; ``hop_size`` and ``sample_rate`` are read.

    Returns:
        The path of the written ``map.txt``.
    """
    GTA = (args.GTA == 'True')
    synth_dir = os.path.join(output_dir, 'gta' if GTA else 'natural')
    # Create output path if it doesn't exist
    os.makedirs(synth_dir, exist_ok=True)

    metadata_filename = os.path.join(args.input_dir, 'train.txt')
    log(hparams_debug_string())

    synth = Synthesizer()
    synth.load(checkpoint_path, hparams, gta=GTA)

    with open(metadata_filename, encoding='utf-8') as f:
        metadata = [line.strip().split('|') for line in f]

    # hop_size / sample_rate is the frame shift in seconds (not milliseconds),
    # so frames * shift / 3600 gives hours.
    frame_shift_s = hparams.hop_size / hparams.sample_rate
    hours = sum(int(x[4]) for x in metadata) * frame_shift_s / 3600
    log('Loaded metadata for {} examples ({:.2f} hours)'.format(len(metadata), hours))

    log('starting synthesis')
    mel_dir = os.path.join(args.input_dir, 'mels')
    wav_dir = os.path.join(args.input_dir, 'audio')
    map_path = os.path.join(synth_dir, 'map.txt')

    # Written as UTF-8 to match how the metadata (and therefore the text
    # column) was read above.
    with open(map_path, 'w', encoding='utf-8') as file:
        for i, meta in enumerate(tqdm(metadata)):
            text = meta[5]
            mel_filename = os.path.join(mel_dir, meta[1])
            wav_filename = os.path.join(wav_dir, meta[0])
            mel_output_filename = synth.synthesize(
                text, i + 1, synth_dir, None, mel_filename)
            file.write('{}|{}|{}|{}\n'.format(
                wav_filename, mel_filename, mel_output_filename, text))

    log('synthesized mel spectrograms at {}'.format(synth_dir))
    return map_path
def run_eval(args, checkpoint_path, output_dir, hparams, sentences):
    """Synthesize sentences in batches, optionally sweeping VAE dimensions.

    Without ``args.modify_vae_dim`` each batch is synthesized once. With it,
    each batch is synthesized for every requested dimension and every scale in
    ``[-2, -1, 0, 1, 2]``, and ``dim`` and ``scale`` are appended to each
    ``map.txt`` row. ``log_dir`` is set to ``None`` after the first batch.

    Args:
        args: Parsed options; ``modify_vae_dim`` (comma-separated integers or
            ``None``), ``reference_mel``, ``model`` and ``mels_dir`` are read.
        checkpoint_path: Checkpoint handed to ``Synthesizer.load``.
        output_dir: Root directory for the ``eval``/``logs-eval`` folders
            (suffixed with ``-modify`` when ``modify_vae_dim`` is given).
        hparams: Hyper-parameters; ``tacotron_synthesis_batch_size`` is read.
        sentences: Texts to synthesize.

    Returns:
        The path of the evaluation directory.

    Raises:
        AssertionError: If the model is ``'Tacotron-2'`` and ``args.mels_dir``
            does not normalise to the evaluation directory.
    """
    eval_dir = os.path.join(output_dir, 'eval')
    log_dir = os.path.join(output_dir, 'logs-eval')
    if args.modify_vae_dim is not None:
        eval_dir += '-modify'
        log_dir += '-modify'

    modify_vae_dim = (
        [int(dim) for dim in args.modify_vae_dim.split(',')]
        if args.modify_vae_dim else None
    )

    if args.model == 'Tacotron-2':
        assert os.path.normpath(eval_dir) == os.path.normpath(args.mels_dir)

    # Create output paths if they don't exist
    os.makedirs(eval_dir, exist_ok=True)
    os.makedirs(log_dir, exist_ok=True)

    log(hparams_debug_string())
    synth = Synthesizer()

    # 'feed' without a reference mel; otherwise 'auto', or 'modify' when a
    # dimension sweep was requested.
    if args.reference_mel is None:
        vae_code_mode = 'feed'
    elif args.modify_vae_dim is None:
        vae_code_mode = 'auto'
    else:
        vae_code_mode = 'modify'
    synth.load(checkpoint_path, hparams, vae_code_mode=vae_code_mode)

    # Set inputs batch wise
    batch_size = hparams.tacotron_synthesis_batch_size
    sentences = [
        sentences[i:i + batch_size]
        for i in range(0, len(sentences), batch_size)
    ]

    log('Starting Synthesis')
    with open(os.path.join(eval_dir, 'map.txt'), 'w') as file:
        trange = tqdm(sentences)
        for i, texts in enumerate(trange):
            # The same reference mel is used for every sentence of the batch;
            # None is passed when no reference was given.
            if args.reference_mel is not None:
                mel_filenames = [args.reference_mel for _ in range(len(texts))]
            else:
                mel_filenames = None

            if args.modify_vae_dim is None:
                basenames = [
                    'batch_{}_sentence_{}'.format(i, j)
                    for j in range(len(texts))
                ]
                mel_output_filenames, speaker_ids = synth.synthesize(
                    texts, basenames, eval_dir, log_dir, mel_filenames)
                # save plots and wavs for the first batch only, for human inspection
                log_dir = None

                for elems in zip(texts, mel_output_filenames, speaker_ids):
                    file.write('|'.join([str(x) for x in elems]) + '\n')
            else:
                scales = [-2, -1, 0, 1, 2]
                for dim in modify_vae_dim:
                    for scale in scales:
                        basenames = [
                            'dim_{}_batch_{}_sentence_{}_mu+({}*sigma)'.format(
                                dim, i, j, scale) for j in range(len(texts))
                        ]
                        mel_output_filenames, speaker_ids = synth.synthesize(
                            texts, basenames, eval_dir, log_dir,
                            mel_filenames, dim, scale)

                        trange.set_postfix({
                            'modified_dim': dim,
                            'value': 'mu+({}*sigma)'.format(scale)
                        })
                        trange.update(1 / len(scales) / len(modify_vae_dim))
                        trange.refresh()

                        for elems in zip(texts, mel_output_filenames, speaker_ids):
                            file.write('|'.join(
                                [str(x) for x in elems + (dim, scale)]) + '\n')
                # save plots and wavs for the first batch only, for human inspection
                # NOTE(review): reset placed after the whole sweep of the
                # batch, following the comment above - confirm against the
                # original indentation.
                log_dir = None

    log('synthesized mel spectrograms at {}'.format(eval_dir))
    return eval_dir
def run_synthesis_multiple(args, checkpoint_path, output_dir, hparams, model_suffix):
    """Synthesize randomly drawn utterances with cross-accent references.

    For each chosen accent label, ``n_spk_per_accent`` speakers and
    ``n_text_per_spk`` utterances per speaker are drawn at random. Every drawn
    utterance is queued once per chosen accent, paired with a random reference
    mel from that accent, and the whole list is synthesized in one call.

    Args:
        args: Parsed options; ``train_filename``, ``remove_long_samps``,
            ``input_dir`` and ``flip_spk_emt`` are read, and the whole object
            is also handed to ``Synthesizer.load``.
        checkpoint_path: Checkpoint handed to ``Synthesizer.load``.
        output_dir: Root directory; output goes to
            ``wavs/<model_suffix>/<time_string()>`` below it.
        hparams: Hyper-parameters; ``hop_size`` and ``sample_rate`` are read.
        model_suffix: Sub-directory name identifying the model.
    """
    n_spk_per_accent = 2
    n_text_per_spk = 5

    synth_dir = os.path.join(output_dir, 'wavs', model_suffix, time_string())
    os.makedirs(synth_dir, exist_ok=True)

    synth = Synthesizer()
    synth.load(args, checkpoint_path, hparams)

    with open(args.train_filename, encoding='utf-8') as f:
        metadata = [line.strip().split('|') for line in f]

    if args.remove_long_samps:
        len_before = len(metadata)
        # Drop the two excluded utterance ids and anything with 500+ mel frames.
        metadata = [
            m for m in metadata
            if not m[10].endswith(('_023.wav', '_021.wav')) and int(m[6]) < 500
        ]
        print("Removed Long Samples - before: {}, after: {}".format(
            len_before, len(metadata)))

    # only synthesize long samples
    # NOTE(review): applied unconditionally here; the collapsed source does
    # not show whether this filter belonged inside the branch above - confirm.
    metadata = [m for m in metadata if int(m[6]) > 200]

    # hop_size / sample_rate is the frame shift in seconds.
    frame_shift_s = hparams.hop_size / hparams.sample_rate
    hours = sum(int(x[6]) for x in metadata) * frame_shift_s / 3600
    print('Loaded metadata for {} examples ({:.2f} hours)'.format(
        len(metadata), hours))

    df = pd.DataFrame(metadata,
                      columns=[
                          'dataset', 'audio_filename', 'mel_filename',
                          'linear_filename', 'spk_emb_filename', 'time_steps',
                          'mel_frames', 'text', 'emt_label', 'spk_label',
                          'basename', 'sex'
                      ])

    chosen_accents = ['0', '3']
    assert (len(chosen_accents) <= 2)
    # Display names, indexed by the integer value of the 'emt_label' column.
    acc_names = [
        'American', 'Australian', 'Canadian', 'English', 'Indian', 'Irish',
        'NewZealand', 'NorthernIrish', 'Scottish', 'SouthAfrican', 'Welsh'
    ]
    df_acc = df[df['emt_label'].isin(chosen_accents)]

    texts = []
    mel_filenames = []
    mel_ref_filenames_emt = []
    mel_ref_filenames_spk = []
    basenames = []
    basenames_refs = []

    for acc in chosen_accents:
        df_acc_spks = df_acc[df_acc['emt_label'] == acc]['spk_label'].unique()
        chosen_spks = np.random.choice(df_acc_spks, n_spk_per_accent, replace=False)

        for spk in chosen_spks:
            df_spk = df_acc[df_acc['spk_label'] == spk]
            idxs = np.random.choice(df_spk.index, n_text_per_spk, replace=False)

            for idx in idxs:
                source = df_acc.loc[idx]
                mel_filename = os.path.join(
                    args.input_dir, source.dataset, 'mels', source.mel_filename)
                basename = '{}_{}_{}'.format(
                    source.basename.split('.')[0],
                    acc_names[int(acc)][:2], source.sex)

                # The same utterance is queued once per reference accent.
                for acc_ref in chosen_accents:
                    texts.append(source.text)
                    mel_filenames.append(mel_filename)
                    mel_ref_filenames_spk.append(mel_filename)
                    basenames.append(basename)

                    df_other_acc = df_acc[df_acc['emt_label'] == acc_ref]
                    # One-row DataFrame: read scalars with .iloc[0] rather
                    # than calling int() on a Series.
                    row = df_other_acc.loc[np.random.choice(
                        df_other_acc.index, 1)]
                    mel_ref_filenames_emt.append(
                        os.path.join(args.input_dir, row.dataset.iloc[0],
                                     'mels', row.mel_filename.iloc[0]))
                    basenames_refs.append('{}'.format(
                        acc_names[int(row.emt_label.iloc[0])][:2]))

    if args.flip_spk_emt:
        mel_ref_filenames_emt, mel_ref_filenames_spk = (
            mel_ref_filenames_spk, mel_ref_filenames_emt)

    print('Starting Synthesis on {} samples'.format(
        len(mel_filenames) // len(chosen_accents)))
    synth.synthesize(texts,
                     basenames,
                     synth_dir,
                     synth_dir,
                     mel_filenames,
                     basenames_refs=basenames_refs,
                     mel_ref_filenames_emt=mel_ref_filenames_emt,
                     mel_ref_filenames_spk=mel_ref_filenames_spk)
def get_style_embeddings(args, checkpoint_path, output_dir, hparams):
    """Export embeddings for randomly chosen utterances as TSV files.

    ``args.n_spk`` speakers are drawn at random, then ``args.n_per_spk``
    utterances per speaker. The synthesizer is run with ``emb_only=True`` and
    two files are written to ``<output_dir>/embeddings``: ``meta.tsv`` (the
    chosen metadata rows twice, tagged ``real`` and ``synth``) and
    ``emb_spk.tsv`` (the two returned speaker embedding arrays stacked).

    Args:
        args: Parsed options; ``train_filename``, ``n_spk``, ``n_per_spk`` and
            ``input_dir`` are read, and the whole object is also handed to
            ``get_metadata_df`` and ``Synthesizer.load``.
        checkpoint_path: Checkpoint handed to ``Synthesizer.load``.
        output_dir: Root directory; ``embeddings`` is created below it.
        hparams: Hyper-parameters forwarded to ``Synthesizer.load``.
    """
    emb_dir = os.path.join(output_dir, 'embeddings')
    os.makedirs(emb_dir, exist_ok=True)
    meta_path = os.path.join(emb_dir, 'meta.tsv')
    emb_spk_path = os.path.join(emb_dir, 'emb_spk.tsv')

    df_meta = get_metadata_df(args.train_filename, args)

    spk_ids = df_meta.spk_label.unique()
    spk_ids_chosen = np.sort(np.random.choice(spk_ids, args.n_spk))

    chosen_idx = []
    for spk_id in spk_ids_chosen:
        spk_rows = df_meta[df_meta.loc[:, 'spk_label'] == spk_id]
        chosen_idx += list(
            np.random.choice(spk_rows.index.values, args.n_per_spk))

    # chosen_idx holds index *labels*, so select with .loc; positional .iloc
    # only agreed with this when the frame had a default 0..n-1 index.
    df_meta_chosen = df_meta.loc[sorted(chosen_idx)]

    mel_filenames = [
        os.path.join(args.input_dir, row.dataset, 'mels', row.mel_filename)
        for _, row in df_meta_chosen.iterrows()
    ]
    texts = list(df_meta_chosen.text)

    synth = Synthesizer()
    synth.load(args, checkpoint_path, hparams)
    print("getting embedding for {} samples".format(len(mel_filenames)))

    # Five arrays come back; only three of them are used below.
    emb_emt, emb_spk, _emb_mo_emt, emb_mo_spk, _emb_cont_emt = synth.synthesize(
        texts,
        None,
        None,
        None,
        mel_filenames,
        mel_ref_filenames_emt=mel_filenames,
        mel_ref_filenames_spk=mel_filenames,
        emb_only=True)

    # SAVE META + EMBEDDING CSVS
    columns_to_keep = [
        'dataset', 'mel_filename', 'mel_frames', 'emt_label', 'spk_label',
        'basename', 'sex'
    ]
    # Copy before adding a column so the assignment never targets a view of
    # df_meta_chosen.
    df = df_meta_chosen.loc[:, columns_to_keep].copy()
    df['real'] = 'real'
    df_synth = df.copy()
    df_synth['real'] = 'synth'
    df = pd.concat([df, df_synth])
    df.to_csv(meta_path, sep='\t', index=False)

    emb_spk = np.vstack((emb_spk, emb_mo_spk))
    pd.DataFrame(emb_spk).to_csv(emb_spk_path,
                                 sep='\t',
                                 index=False,
                                 header=False)

    print(len(emb_emt))
    print(emb_emt.shape)
import argparse

# Command-line entry: load a checkpoint, synthesize one text and write the
# bytes returned by the synthesizer to the output file.
parser = argparse.ArgumentParser()
parser.add_argument(
    '--checkpoint',
    required=False,
    help='Full path to model checkpoint',
    default="tacotron/tmp/tacotron-20180906/model.ckpt",
)
parser.add_argument(
    '--text',
    required=False,
    help='Text to synthesize',
    default="Hello World",
)
parser.add_argument(
    '--output',
    required=False,
    help='File path of output',
    default="HelloWorld.wav",
)
args = parser.parse_args()

checkpoint = str(args.checkpoint)
text = str(args.text)
output = str(args.output)

# Echo the effective settings before doing any work.
print("Checkpoint: " + checkpoint)
print("Text: " + text)
print("Output: " + output)
print("")

print("Loading model...")
synthesizer = Synthesizer()
synthesizer.load(checkpoint)
print("Loading model completed!")
print("")

print("Sythesizing text...")
# The output file is opened before synthesis runs, as before.
with open(output, 'wb') as file:
    file.write(synthesizer.synthesize(text))
print("Sythesizing text completed!")
print("")
def run_synthesis(args, checkpoint_path, output_dir, hparams):
    """Synthesize mel spectrograms for the entries of ``train.txt``.

    Three modes are supported:

    * GTA (``args.GTA == 'True'``): every batch is synthesized into
      ``<output_dir>/gta``.
    * natural (no VAE dimensions requested): every batch is synthesized into
      ``<output_dir>/natural``.
    * modify (not GTA and ``args.modify_vae_dim`` lists dimensions): only the
      first batch is synthesized, once per (dimension, scale) pair with
      scales -2..2, into ``<output_dir>/natural-modify``.

    An empty ``args.modify_vae_dim`` string is treated like ``None``.
    Previously it selected the modify branch with no dimension list and
    failed while iterating ``None``.

    Each metadata line is split on ``|``; column 0 is joined onto the
    ``audio`` directory, column 1 onto the ``mels`` directory, column 4 is
    summed as a frame count and column 5 is passed as the text.

    Args:
        args: Parsed options; reads ``GTA``, ``modify_vae_dim`` (unless GTA)
            and ``input_dir``.
        checkpoint_path: Checkpoint handed to ``Synthesizer.load``.
        output_dir: Directory under which synthesis and log folders are made.
        hparams: Hyper-parameters; reads ``hop_size``, ``sample_rate`` and
            ``tacotron_synthesis_batch_size``.

    Returns:
        Path of the ``map.txt`` file written into the synthesis directory.
    """
    GTA = (args.GTA == 'True')

    # Parse the requested VAE dimensions only outside GTA mode (GTA never
    # used them). Empty or missing means "no modification".
    modify_vae_dim = None
    if not GTA and args.modify_vae_dim:
        modify_vae_dim = [int(dim) for dim in args.modify_vae_dim.split(',')]
    modify_mode = modify_vae_dim is not None

    if GTA:
        synth_dir = os.path.join(output_dir, 'gta')
        log_dir = os.path.join(output_dir, 'logs-gta')
    else:
        suffix = '-modify' if modify_mode else ''
        synth_dir = os.path.join(output_dir, 'natural') + suffix
        log_dir = os.path.join(output_dir, 'logs-natural') + suffix

    #Create output path if it doesn't exist
    os.makedirs(synth_dir, exist_ok=True)
    os.makedirs(log_dir, exist_ok=True)

    metadata_filename = os.path.join(args.input_dir, 'train.txt')
    log(hparams_debug_string())
    synth = Synthesizer()
    synth.load(checkpoint_path,
               hparams,
               gta=GTA,
               vae_code_mode='modify' if modify_mode else 'auto')

    with open(metadata_filename, encoding='utf-8') as f:
        metadata = [line.strip().split('|') for line in f]

    # hop_size / sample_rate is the frame shift in seconds.
    frame_shift_sec = hparams.hop_size / hparams.sample_rate
    hours = sum(int(x[4]) for x in metadata) * frame_shift_sec / 3600
    log('Loaded metadata for {} examples ({:.2f} hours)'.format(
        len(metadata), hours))

    #Set inputs batch wise
    batch_size = hparams.tacotron_synthesis_batch_size
    metadata = [
        metadata[i:i + batch_size]
        for i in range(0, len(metadata), batch_size)
    ]

    log('Starting Synthesis')
    mel_dir = os.path.join(args.input_dir, 'mels')
    wav_dir = os.path.join(args.input_dir, 'audio')
    map_path = os.path.join(synth_dir, 'map.txt')
    scales = [-2, -1, 0, 1, 2]

    with open(map_path, 'w') as map_file:
        trange = tqdm(metadata)
        for meta in trange:
            # These depend only on the batch, not on the dimension/scale.
            texts = [m[5] for m in meta]
            mel_filenames = [os.path.join(mel_dir, m[1]) for m in meta]
            wav_filenames = [os.path.join(wav_dir, m[0]) for m in meta]
            stems = [
                os.path.basename(m).replace('.npy', '').replace('mel-', '')
                for m in mel_filenames
            ]

            if not modify_mode:
                mel_output_filenames, speaker_ids = synth.synthesize(
                    texts, stems, synth_dir, log_dir, mel_filenames)
                #save plots and wavs for the first batch only, for human inspection
                log_dir = None
                for elems in zip(wav_filenames, mel_filenames,
                                 mel_output_filenames, speaker_ids, texts):
                    map_file.write('|'.join(str(x) for x in elems) + '\n')
                continue

            for dim in modify_vae_dim:
                for scale in scales:
                    basenames = [
                        'dim_{}-'.format(dim) + stem +
                        '-mu+({}*sigma)'.format(scale) for stem in stems
                    ]
                    mel_output_filenames, speaker_ids = synth.synthesize(
                        texts, basenames, synth_dir, log_dir, mel_filenames,
                        dim, scale)
                    trange.set_postfix({
                        'modified_dim': dim,
                        'value': 'mu+({}*sigma)'.format(scale)
                    })
                    # Advance the bar fractionally so that it fills up over
                    # all (dim, scale) combinations of this single batch.
                    trange.update(1 / len(scales) / len(modify_vae_dim) *
                                  len(trange))
                    trange.refresh()
                    for elems in zip(wav_filenames, mel_filenames,
                                     mel_output_filenames, speaker_ids,
                                     texts):
                        map_file.write('|'.join(
                            str(x) for x in elems + (dim, scale)) + '\n')
            #synthesize spectrograms for the first batch only, for human inspection
            break

    log('synthesized mel spectrograms at {}'.format(synth_dir))
    return map_path
def run_eval(args, checkpoint_path, output_dir, hparams, ppgs, speakers, Lf0s):
    """Synthesize mel spectrograms batch by batch and record them in map.txt.

    ``ppgs`` and ``Lf0s`` are cut into chunks of
    ``hparams.tacotron_synthesis_batch_size``. Each chunk is passed to
    ``Synthesizer.synthesize`` together with ``speakers[i]`` (``i`` being the
    chunk number) and, when ``args.reference_audio`` is set, the mel
    spectrogram computed from that audio file.

    Args:
        args: Parsed options; reads ``model``, ``mels_dir`` (only for
            'Tacotron-2') and ``reference_audio``.
        checkpoint_path: Checkpoint handed to ``Synthesizer.load``.
        output_dir: Directory under which ``eval`` and ``logs-eval`` are made.
        hparams: Hyper-parameters; reads ``sample_rate``,
            ``use_style_encoder`` and ``tacotron_synthesis_batch_size``.
        ppgs: Sequence of synthesis inputs, batched here.
        speakers: Indexed once per batch with the batch number.
        Lf0s: Sequence batched the same way as ``ppgs``.

    Returns:
        The evaluation directory ``<output_dir>/eval``.
    """
    eval_dir = os.path.join(output_dir, 'eval')
    log_dir = os.path.join(output_dir, 'logs-eval')

    if args.model == 'Tacotron-2':
        assert os.path.normpath(eval_dir) == os.path.normpath(args.mels_dir)

    #Create output path if it doesn't exist
    for directory in (eval_dir, log_dir, os.path.join(log_dir, 'wavs'),
                      os.path.join(log_dir, 'plots')):
        os.makedirs(directory, exist_ok=True)

    log(hparams_debug_string())
    synth = Synthesizer()
    synth.load(checkpoint_path, hparams, reference_mels=args.reference_audio)

    use_reference = args.reference_audio is not None
    if use_reference:
        print('reference_audio:', args.reference_audio)
        ref_wav = load_wav(args.reference_audio.strip(), hparams.sample_rate)
        reference_mel = melspectrogram(ref_wav, hparams).astype(np.float32).T
    elif hparams.use_style_encoder == True:
        print("*******************************")
        print(
            "TODO: add style weights when there is no reference audio. Now we use random weights, "
            + "which may generate unintelligible audio sometimes.")
        print("*******************************")
    else:
        #raise ValueError("You must set the reference audio if you don't want to use GSTs.")
        print("233")

    #Set inputs batch wise
    batch_size = hparams.tacotron_synthesis_batch_size
    ppgs = [ppgs[k:k + batch_size] for k in range(0, len(ppgs), batch_size)]
    Lf0s = [Lf0s[k:k + batch_size] for k in range(0, len(Lf0s), batch_size)]
    if use_reference:
        # The same reference mel is reused for every batch.
        reference_mels = [reference_mel] * len(ppgs)

    log('Starting Synthesis')
    with open(os.path.join(eval_dir, 'map.txt'), 'w') as file:
        for i, texts in enumerate(tqdm(ppgs)):
            start = time.time()
            basenames = [
                'batch_{}_sentence_{}'.format(i, j)
                for j in range(len(texts))
            ]
            batch_speakers = [speakers[i]]
            batch_reference = [reference_mels[i]] if use_reference else None
            mel_filenames = synth.synthesize(texts, batch_speakers, basenames,
                                             eval_dir, log_dir, None,
                                             batch_reference, Lf0s[i])

            # NOTE(review): zip stops at its shortest input and the speaker
            # list has a single element, so at most one row is written per
            # batch - confirm this is intended for batch sizes above 1.
            for elems in zip(texts, mel_filenames, batch_speakers):
                file.write('|'.join(map(str, elems)) + '\n')

    log('synthesized mel spectrograms at {}'.format(eval_dir))
    return eval_dir
def synthesize_random(args, checkpoint_path, output_dir, hparams, model_suffix,
                      meta_csv_path=None):
    """Synthesize a fixed-seed random selection of test utterances.

    Texts are drawn without replacement from the ``test`` rows of the
    metadata CSV: the ``jessa`` rows, or - when ``args.zo`` is set - the
    ``emt4`` rows with ``emt_label == 0``. In paired mode (``args.paired``)
    10 texts are synthesized, each with its own mel as speaker and emotion
    reference. Otherwise 4 emotions x 5 texts are synthesized; each text gets
    a random speaker reference from the same pool and a random emotion
    reference from the ``emt4`` test rows whose ``emt_label`` equals the
    emotion number.

    Outputs go to ``<output_dir>/random/<model_suffix>/<time_string()>``; the
    selection is also saved there as ``meta.csv``.

    NumPy's global RNG is seeded with 2, so the selection is repeatable.

    Args:
        args: Parsed options; reads ``paired``, ``zo`` and ``input_dir``.
        checkpoint_path: Checkpoint handed to ``Synthesizer.load``.
        output_dir: Root directory for the results.
        hparams: Hyper-parameters handed to ``Synthesizer.load``.
        model_suffix: Sub-directory name below ``random``.
        meta_csv_path: CSV providing the ``train_test``, ``dataset``,
            ``emt_label``, ``spk_label``, ``text``, ``mel_filename`` and
            ``basename`` columns. Defaults to the location that used to be
            hard-coded in this function.

    Returns:
        None.
    """
    if meta_csv_path is None:
        # Historical default, kept so existing callers behave as before.
        meta_csv_path = r'C:\Users\t-mawhit\Documents\code\Tacotron-2\data\zo_jessa_train_test.csv'

    n_emt = 1 if args.paired else 4
    n_txts_per_emotion = 10 if args.paired else 5

    synth_dir = os.path.join(output_dir, 'random', model_suffix, time_string())
    os.makedirs(synth_dir, exist_ok=True)
    synth = Synthesizer()
    synth.load(args, checkpoint_path, hparams)
    meta_save_path = os.path.join(synth_dir, 'meta.csv')

    df = pd.read_csv(meta_csv_path)
    df_test = df[df.train_test == 'test']
    df_test_zo = df_test[df_test.dataset == 'emt4']
    if args.zo:
        df_test_use = df_test_zo[df_test_zo.emt_label == 0]
    else:
        df_test_use = df_test[df_test.dataset == 'jessa']

    np.random.seed(2)
    chosen_texts_idxs = np.random.choice(df_test_use.index,
                                         n_txts_per_emotion * n_emt,
                                         replace=False)
    df_test_use_texts_rows = df_test_use.loc[chosen_texts_idxs]
    meta = df_test_use_texts_rows.copy()
    meta['basename'] = ''

    def mel_path(row):
        # Mel files live under <input_dir>/<dataset>/mels/.
        return os.path.join(args.input_dir, row.dataset, 'mels',
                            row.mel_filename)

    texts = []
    mel_filenames = []
    mel_ref_filenames_emt = []
    mel_ref_filenames_spk = []
    basenames = []
    basenames_refs = []
    emt_labels = []
    spk_labels = []

    for i in range(n_emt):
        df_test_zo_emt = df_test_zo[df_test_zo.emt_label == i]
        for j in range(n_txts_per_emotion):
            idx = i * n_txts_per_emotion + j
            row = df_test_use_texts_rows.iloc[idx]
            texts.append(row.text)
            mel_filenames.append(mel_path(row))

            if args.paired:
                # The utterance is its own speaker and emotion reference.
                row_spk = row_emt = row
            else:
                # Draw order (speaker first, then emotion) is kept so that
                # the fixed seed reproduces the same references.
                row_spk = df_test_use.loc[np.random.choice(
                    df_test_use.index)]
                row_emt = df_test_zo_emt.loc[np.random.choice(
                    df_test_zo_emt.index)]
            mel_ref_filenames_spk.append(mel_path(row_spk))
            mel_ref_filenames_emt.append(mel_path(row_emt))

            basename = '{}'.format(row.basename.split('.')[0])
            basename_ref = 'e{}'.format(i)
            basenames.append(basename)
            basenames_refs.append(basename_ref)

            emt_label = row_emt.emt_label
            spk_label = row_spk.spk_label
            emt_labels.append(int(emt_label))
            spk_labels.append(int(spk_label))

            # NOTE(review): positional columns 8/9/10 presumably are
            # emt_label / spk_label / basename - confirm against the CSV
            # layout.
            meta.iloc[idx, 8] = emt_label
            meta.iloc[idx, 9] = spk_label
            meta.iloc[idx, 10] = 'mel-{}_{}.npy'.format(basename, basename_ref)

    meta.to_csv(meta_save_path, index=False)
    print('Starting Synthesis on {} samples'.format(len(mel_filenames)))
    synth.synthesize(texts,
                     basenames,
                     synth_dir,
                     synth_dir,
                     mel_filenames,
                     basenames_refs=basenames_refs,
                     mel_ref_filenames_emt=mel_ref_filenames_emt,
                     mel_ref_filenames_spk=mel_ref_filenames_spk,
                     emt_labels_synth=emt_labels,
                     spk_labels_synth=spk_labels)
import argparse

from tacotron.utils import makedirs, str2bool
from tacotron.synthesizer import Synthesizer

if __name__ == "__main__":
    # Command-line options; --load_path and --text are mandatory.
    parser = argparse.ArgumentParser()
    parser.add_argument('--load_path', required=True)
    parser.add_argument('--sample_path', default="samples")
    parser.add_argument('--text', required=True)
    parser.add_argument('--num_speakers', type=int, default=1)
    parser.add_argument('--speaker_id', type=int, default=0)
    parser.add_argument('--checkpoint_step', type=int, default=None)
    parser.add_argument('--is_korean', type=str2bool, default=True)
    config = parser.parse_args()

    # Ensure the sample directory exists before anything is written to it.
    makedirs(config.sample_path)

    synthesizer = Synthesizer()
    synthesizer.load(
        config.load_path,
        config.num_speakers,
        config.checkpoint_step,
    )

    # One text is submitted, so only the first returned item is kept.
    results = synthesizer.synthesize(
        texts=[config.text],
        base_path=config.sample_path,
        speaker_ids=[config.speaker_id],
        attention_trim=False,
        isKorean=config.is_korean,
    )
    audio = results[0]