def process_merlin_label(bin_label_fname, text_lab_dir, phonedim=416, subphonedim=9):
    text_label = os.path.join(text_lab_dir, basename(bin_label_fname) + '.lab')
    assert os.path.isfile(text_label), 'No text file for %s ' % (basename(bin_label_fname))

    labfrombin = get_speech(bin_label_fname, phonedim + subphonedim)

    ## fraction through phone (forwards)
    fraction_through_phone_forwards = labfrombin[:, -1]

    ## This is a surprisingly noisy signal which never seems to start at 0.0! Find minima:-
    (minima,) = argrelextrema(fraction_through_phone_forwards, np.less)

    ## first frame is always a start:
    minima = np.insert(minima, 0, 0)

    ## check size against text file:
    labfromtext = merlin_state_label_to_phone(text_label)
    assert labfromtext.shape[0] == minima.shape[0]

    ## discard frame-level feats, and take first frame of each phone:
    lab = labfrombin[minima, :-subphonedim]
    return lab
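## Hedged illustration (not part of the pipeline): how the minima-based phone-start
## detection above behaves on a toy fraction-through-phone signal. Only numpy and
## scipy are assumed; the values are made up for demonstration.
import numpy as np
from scipy.signal import argrelextrema

## two "phones": the fraction ramps up within each phone, then drops at the boundary
toy_fraction = np.array([0.1, 0.4, 0.7, 1.0, 0.2, 0.5, 0.8])
(minima,) = argrelextrema(toy_fraction, np.less)   # interior minima: frame 4
minima = np.insert(minima, 0, 0)                   # first frame is always a phone start
print(minima)                                      # -> [0 4]: one start frame per phone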
def process(fpath, worlddir='', outdir='', scaler=''):
    assert scaler
    speech = load_sentence(fpath, worlddir=worlddir, outdir=outdir)
    norm_speech = standardise_acoustics(speech, scaler)
    np.save('%s/full_world/%s' % (outdir, basename(fpath)), norm_speech.astype(np.float32))
    np.save('%s/coarse_world/%s' % (outdir, basename(fpath)), norm_speech[::4, :].astype(np.float32))
def load_attention(fpath):
    try:
        attention_guide_file = "{}/{}".format(hp.attention_guide_dir, basename(fpath) + ".npy")
    except TypeError:  ## fpath may arrive as bytes (e.g. from a TF input pipeline)
        attention_guide_file = "{}/{}".format(hp.attention_guide_dir, basename(fpath.decode('utf-8')) + ".npy")
    attention_guide = read_floats_from_8bit(attention_guide_file)
    return fpath, attention_guide
def load_merlin_label(fpath):
    try:
        label_file = "{}/{}".format(hp.merlin_label_dir, basename(fpath) + ".npy")
    except TypeError:
        label_file = "{}/{}".format(hp.merlin_label_dir, basename(fpath.decode('utf-8')) + ".npy")
    label = np.load(label_file)  ## TODO: could use read_floats_from_8bit format
    return fpath, label
def main_work():
    #################################################
    # ============= Process command line ============
    a = ArgumentParser()
    a.add_argument('-b', dest='binlabdir', required=True)
    a.add_argument('-t', dest='text_lab_dir', required=True)
    a.add_argument('-n', dest='norm_info_fname', required=True)
    a.add_argument('-o', dest='outdir', required=True)
    a.add_argument('-binext', dest='binext', required=False, default='lab')
    a.add_argument('-skipterminals', action='store_true', default=False)
    opts = a.parse_args()
    # ===============================================

    safe_makedir(opts.outdir)

    norm_info = get_speech(opts.norm_info_fname, 425)[:, :-9]
    data_min = norm_info[0, :]
    data_max = norm_info[1, :]
    data_range = data_max - data_min

    text_label_files = set([basename(f) for f in glob.glob(opts.text_lab_dir + '/*.lab')])
    binary_label_files = sorted(glob.glob(opts.binlabdir + '/*.' + opts.binext))
    print(binary_label_files)

    for binlab in binary_label_files:
        base = basename(binlab)
        if base not in text_label_files:
            continue
        print(base)
        lab = process_merlin_label(binlab, opts.text_lab_dir)
        if opts.skipterminals:
            ## NB: don't remove the 2 last frames as for durations, since the final punctuation doesn't feature here:
            lab = lab[1:-1, :]
        norm_lab = minmax_norm(lab, data_min, data_max)

        if 0:  ## piano-roll style debug plot:
            pl.imshow(norm_lab, interpolation='nearest')
            pl.gray()
            pl.savefig('/afs/inf.ed.ac.uk/user/o/owatts/temp/fig.pdf')
            sys.exit('Saved debug plot -- exiting')

        np.save(opts.outdir + '/' + base, norm_lab)
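## Hedged sketch (assumption, not the project's own code): minmax_norm is presumably
## a plain per-dimension min-max scaling using the stats loaded above. If the real
## helper rescales to another interval (Merlin conventionally targets [0.01, 0.99]),
## adjust accordingly.
import numpy as np

def minmax_norm_sketch(data, data_min, data_max, eps=1e-8):
    """Scale each dimension of data into [0, 1] using precomputed stats."""
    data_range = np.maximum(data_max - data_min, eps)  # guard against zero-range dims
    return (data - data_min) / data_range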
def copy_synth_SSRN_GL(hp, outdir):
    safe_makedir(outdir)

    dataset = load_data(hp, mode="synthesis")
    fnames, texts = dataset['fpaths'], dataset['texts']
    bases = [basename(fname) for fname in fnames]
    mels = [np.load(os.path.join(hp.coarse_audio_dir, base + '.npy')) for base in bases]
    lengths = [a.shape[0] for a in mels]
    mels = list2batch(mels, 0)

    g = SSRNGraph(hp, mode="synthesize"); print("Graph (ssrn) loaded")

    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        ssrn_epoch = restore_latest_model_parameters(sess, hp, 'ssrn')

        print('Run SSRN...')
        Z = synth_mel2mag(hp, mels, g, sess)

        for i, mag in enumerate(Z):
            print("Working on %s" % (bases[i]))
            mag = mag[:lengths[i] * hp.r, :]  ### trim to generated length
            wav = spectrogram2wav(hp, mag)
            soundfile.write(outdir + "/%s.wav" % (bases[i]), wav, hp.sr)
def load_sentence(fpath, worlddir='', outdir=''):
    assert worlddir and outdir, 'must supply both worlddir and outdir'
    mel = np.load(fpath)
    base = basename(fpath)
    streams = []
    for (stream, dim) in [('lf0', 1), ('mgc', 60), ('bap', 1)]:
        fname = '%s/%s/%s.%s' % (worlddir, stream, base, stream)
        speech = get_speech(fname, dim)
        if stream == 'lf0':
            speech, vuv = interpolate_through_unvoiced(speech)
            streams.extend([speech, vuv])
        else:
            streams.append(speech)
    composed = np.hstack(streams)

    ## pad (or reject) so the world features match the mel in frame count:
    target_frames, _ = mel.shape
    actual_frames, _ = composed.shape
    diff = target_frames - actual_frames
    if diff < 0:
        sys.exit('world features too short')
    elif diff > 0:
        composed = np.pad(composed, ((0, diff), (0, 0)), mode='constant')
    return composed
def load_merlin_label(fpath):
    label_file = "{}/{}".format(hp.merlin_label_dir, basename(fpath) + ".npy")
    label = np.load(label_file)  ## TODO: could use read_floats_from_8bit format
    return fpath, label
def main_work():
    #################################################
    # ============= Process command line ============
    a = ArgumentParser()
    a.add_argument('-c', dest='config', required=True, type=str)
    a.add_argument('-speaker', default='', type=str)
    a.add_argument('-N', dest='num_sentences', default=0, type=int)
    a.add_argument('-babble', action='store_true')
    a.add_argument('-ncores', type=int, default=1, help='Number of CPUs for Griffin-Lim stage')
    a.add_argument('-odir', type=str, default='', help='Alternative place to put output samples')
    a.add_argument('-t2m_epoch', default=-1, type=int, help='Default: use latest (-1)')
    a.add_argument('-ssrn_epoch', default=-1, type=int, help='Default: use latest (-1)')
    opts = a.parse_args()
    # ===============================================

    hp = load_config(opts.config)

    outdir = opts.odir
    if outdir:
        outdir = os.path.join(outdir, basename(opts.config))

    if hp.multispeaker:
        assert opts.speaker, 'Please specify a speaker from speaker_list with -speaker flag'
        assert opts.speaker in hp.speaker_list

    if opts.babble:
        babble(hp, num_sentences=opts.num_sentences)
    else:
        synthesize(hp, speaker_id=opts.speaker, num_sentences=opts.num_sentences, \
                   ncores=opts.ncores, topoutdir=outdir, t2m_epoch=opts.t2m_epoch, ssrn_epoch=opts.ssrn_epoch)
def main_work():
    #################################################
    # ============= Process command line ============
    a = ArgumentParser()
    a.add_argument('-b', dest='binlabdir', required=True)
    a.add_argument('-f', dest='audio_dir', required=True)
    a.add_argument('-n', dest='norm_info_fname', required=True)
    a.add_argument('-o', dest='outdir', required=True)
    a.add_argument('-binext', dest='binext', required=False, default='lab')
    a.add_argument('-ir', dest='inrate', type=float, default=5.0)
    a.add_argument('-or', dest='outrate', type=float, default=12.5)
    opts = a.parse_args()
    # ===============================================

    safe_makedir(opts.outdir)

    norm_info = get_speech(opts.norm_info_fname, 425)[:, -9:]
    data_min = norm_info[0, :]
    data_max = norm_info[1, :]
    data_range = data_max - data_min

    audio_files = set([basename(f) for f in glob.glob(opts.audio_dir + '/*.npy')])
    binary_label_files = sorted(glob.glob(opts.binlabdir + '/*.' + opts.binext))

    for binlab in binary_label_files:
        base = basename(binlab)
        if base not in audio_files:
            continue
        print(base)
        positions = process_merlin_positions(binlab, opts.audio_dir,
                                             inrate=opts.inrate, outrate=opts.outrate)
        norm_positions = minmax_norm(positions, data_min, data_max)
        np.save(opts.outdir + '/' + base, norm_positions)
def proc(fpath, text_length, hp):
    base = basename(fpath)
    melfile = hp.coarse_audio_dir + os.path.sep + base + '.npy'
    attfile = hp.attention_guide_dir + os.path.sep + base  # without '.npy'
    if not os.path.isfile(melfile):
        print('file %s not found' % (melfile))
        return
    speech_length = np.load(melfile).shape[0]
    att = get_attention_guide(text_length, speech_length, g=hp.g)
    save_floats_as_8bit(att, attfile)
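## Hedged sketch (assumption): get_attention_guide presumably builds the diagonal
## guided-attention penalty of Tachibana et al. (DCTTS), W[n, t] =
## 1 - exp(-(n/N - t/T)^2 / (2 g^2)), near 0 on the diagonal and approaching 1 away
## from it, which the loss uses to push attention towards the diagonal. The name and
## signature mirror the call above; the body is an illustration, not the project's
## implementation.
import numpy as np

def attention_guide_sketch(text_length, speech_length, g=0.2):
    n = np.arange(text_length)[:, None] / float(text_length)      # normalised text position
    t = np.arange(speech_length)[None, :] / float(speech_length)  # normalised frame position
    return 1.0 - np.exp(-((n - t) ** 2) / (2.0 * g * g))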
def load_merlin_label(fpath):
    label_file = "{}/{}".format(hp.merlin_label_dir, basename(fpath) + ".npy")
    label = np.load(label_file)  ## TODO: could use read_floats_from_8bit format
    label = np.float32(label)
    if hp.select_central:
        central_ind = get_labels_indices(hp.merlin_lab_dim)
        label = label[:, central_ind == 1]
    assert (label.shape[1] == hp.merlin_lab_dim)
    return fpath, label
def load_attention(fpath):
    attention_guide_file = "{}/{}".format(hp.attention_guide_dir, basename(fpath) + ".npy")
    if hp.attention_guide_fa:  # use the MSE attention loss with a forced-alignment (FA) attention matrix
        attention_guide = np.load(attention_guide_file)
        attention_guide = np.transpose(attention_guide)  # FA attention is stored transposed
    else:
        attention_guide = read_floats_from_8bit(attention_guide_file)
    return fpath, attention_guide
def copy_synth_GL(hp, outdir):
    safe_makedir(outdir)

    dataset = load_data(hp, mode="synthesis")
    fnames, texts = dataset['fpaths'], dataset['texts']
    bases = [basename(fname) for fname in fnames]

    for base in bases:
        print("Working on file %s" % (base))
        mag = np.load(os.path.join(hp.full_audio_dir, base + '.npy'))
        wav = spectrogram2wav(hp, mag)
        soundfile.write(outdir + "/%s.wav" % (base), wav, hp.sr)
def process_merlin_positions(bin_label_fname, audio_dir, phonedim=416, subphonedim=9, \
                             inrate=5.0, outrate=12.5):
    audio_fname = os.path.join(audio_dir, basename(bin_label_fname) + '.npy')
    assert os.path.isfile(audio_fname), 'No audio file for %s ' % (basename(bin_label_fname))
    audio = np.load(audio_fname)

    labfrombin = get_speech(bin_label_fname, phonedim + subphonedim)
    positions = labfrombin[:, -subphonedim:]
    nframes, dim = positions.shape
    assert dim == 9

    new_nframes, _ = audio.shape

    ## place points at frame centres:
    old_x = np.linspace((inrate / 2.0), nframes * inrate, nframes, endpoint=False)
    ## nearest-neighbour interpolation to avoid weird averaging effects near segment boundaries:
    f = interpolate.interp1d(old_x, positions, axis=0, kind='nearest',
                             bounds_error=False, fill_value='extrapolate')
    new_x = np.linspace((outrate / 2.0), new_nframes * outrate, new_nframes, endpoint=False)
    new_positions = f(new_x)
    return new_positions
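## Hedged illustration (not part of the pipeline): resampling frame-level features
## from a 5 ms to a 12.5 ms frame shift with nearest-neighbour interpolation, as the
## function above does. Only numpy/scipy are assumed; the data are made up.
import numpy as np
from scipy import interpolate

inrate, outrate = 5.0, 12.5
positions = np.arange(10, dtype=np.float32)[:, None]  # 10 frames at 5 ms, 1-dim feature
nframes = positions.shape[0]
new_nframes = 4                                       # target: 4 frames at 12.5 ms

old_x = np.linspace(inrate / 2.0, nframes * inrate, nframes, endpoint=False)
new_x = np.linspace(outrate / 2.0, new_nframes * outrate, new_nframes, endpoint=False)
f = interpolate.interp1d(old_x, positions, axis=0, kind='nearest',
                         bounds_error=False, fill_value='extrapolate')
print(f(new_x).ravel())  # each output frame takes the value of the nearest input frame centre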
def make_mel_batch(hp, fnames, oracle=True):
    ## TODO: refactor with list2batch?
    lengths = []
    if oracle:
        source = hp.coarse_audio_dir
        bases = [basename(fname) for fname in fnames]
        mels = [os.path.join(hp.coarse_audio_dir, base + '.npy') for base in bases]
    else:
        mels = fnames
    mels = [np.load(melfile) for melfile in mels]
    mel_batch = np.zeros((len(mels), hp.max_T, hp.n_mels), np.float32)
    for (i, mel) in enumerate(mels):
        length, n = mel.shape
        mel_batch[i, :length, :] = mel
        lengths.append(length * hp.r)
    return mel_batch, lengths
def compute_validation(hp, model_type, epoch, inputs, synth_graph, sess, speaker_codes, \
                       valid_filenames, validation_set_reference, duration_data=None,
                       validation_labels=None, position_in_phone_data=None):
    if model_type == 't2m':  ## TODO: coded_text2mel here
        validation_set_predictions_tensor, lengths = synth_text2mel(
            hp, inputs, synth_graph, sess, speaker_data=speaker_codes,
            duration_data=duration_data, labels=validation_labels,
            position_in_phone_data=position_in_phone_data)
        validation_set_predictions = split_batch(validation_set_predictions_tensor, lengths)
        score = compute_dtw_error(validation_set_reference, validation_set_predictions)
    elif model_type == 'ssrn':
        validation_set_predictions_tensor = synth_mel2mag(hp, inputs, synth_graph, sess)
        lengths = [len(ref) for ref in validation_set_reference]
        validation_set_predictions = split_batch(validation_set_predictions_tensor, lengths)
        score = compute_simple_LSD(validation_set_reference, validation_set_predictions)
    else:
        info('compute_validation cannot handle model type %s: dummy value (0.0) supplied as validation score' % (model_type))
        return 0.0

    ## store parameters for later use:-
    valid_dir = '%s-%s/validation_epoch_%s' % (hp.logdir, model_type, epoch)
    safe_makedir(valid_dir)
    ## in case fewer sentences match the validation pattern than hp.validation_sentences_to_synth_params:
    hp.validation_sentences_to_synth_params = min(hp.validation_sentences_to_synth_params, len(valid_filenames))
    for i in range(hp.validation_sentences_to_synth_params):
        np.save(os.path.join(valid_dir, basename(valid_filenames[i])), validation_set_predictions[i])
    return score
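## Hedged sketch (assumption): compute_simple_LSD presumably measures log-spectral
## distortion between reference and predicted magnitude spectrograms -- the RMS of
## the frame-wise log-magnitude difference, averaged over utterances. The real helper
## may differ (e.g. dB scaling); this is an illustration only.
import numpy as np

def simple_lsd_sketch(references, predictions, eps=1e-8):
    per_utt = []
    for ref, pred in zip(references, predictions):
        n = min(len(ref), len(pred))  # compare overlapping frames only
        diff = np.log(ref[:n] + eps) - np.log(pred[:n] + eps)
        per_utt.append(np.sqrt((diff ** 2).mean(axis=1)).mean())
    return float(np.mean(per_utt))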
def synthesize(hp, speaker_id='', num_sentences=0, ncores=1, topoutdir='', t2m_epoch=-1, ssrn_epoch=-1):
    '''
    topoutdir: store samples under here; defaults to hp.sampledir
    t2m_epoch and ssrn_epoch: default -1 means use latest. Otherwise go to archived models.
    '''
    assert hp.vocoder in ['griffin_lim', 'world'], 'Other vocoders than griffin_lim/world not yet supported'

    dataset = load_data(hp, mode="synthesis")  ## since mode != 'train' or 'validation', will load test_transcript rather than transcript
    fpaths, L = dataset['fpaths'], dataset['texts']
    position_in_phone_data = duration_data = labels = None  # defaults

    if hp.use_external_durations:
        duration_data = dataset['durations']
        if num_sentences > 0:
            duration_data = duration_data[:num_sentences, :, :]

    if 'position_in_phone' in hp.history_type:
        ## TODO: combine + deduplicate with relevant code in train.py for making validation set
        def duration2position(duration, fractional=False):
            ### very roundabout -- need to deflate A matrix back to integers:
            duration = duration.sum(axis=0)
            positions = durations_to_position(duration, fractional=fractional)
            ###positions = end_pad_for_reduction_shape_sync(positions, hp)
            positions = positions[0::hp.r, :]
            return positions
        position_in_phone_data = [duration2position(dur, fractional=('fractional' in hp.history_type)) \
                                  for dur in duration_data]
        position_in_phone_data = list2batch(position_in_phone_data, hp.max_T)

    # Ensure we aren't trying to generate more utterances than are actually in our test_transcript
    if num_sentences > 0:
        assert num_sentences <= len(fpaths)
        L = L[:num_sentences, :]
        fpaths = fpaths[:num_sentences]

    bases = [basename(fpath) for fpath in fpaths]

    if hp.merlin_label_dir:
        labels = [np.load("{}/{}".format(hp.merlin_label_dir, basename(fpath) + ".npy")) \
                  for fpath in fpaths]
        labels = list2batch(labels, hp.max_N)

    if speaker_id:
        speaker2ix = dict(zip(hp.speaker_list, range(len(hp.speaker_list))))
        speaker_ix = speaker2ix[speaker_id]
        ## Speaker codes are held in (batch, 1) matrix -- tiling is done inside the graph:
        speaker_data = np.ones((len(L), 1)) * speaker_ix
    else:
        speaker_data = None

    # Load graphs
    ## TODO: generalise to combine other types of models into a synthesis pipeline?
    g1 = Text2MelGraph(hp, mode="synthesize"); print("Graph 1 (t2m) loaded")
    g2 = SSRNGraph(hp, mode="synthesize"); print("Graph 2 (ssrn) loaded")

    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())

        ### TODO: specify epoch from comm line?
        ### TODO: t2m and ssrn from separate configs?
        if t2m_epoch > -1:
            restore_archived_model_parameters(sess, hp, 't2m', t2m_epoch)
        else:
            t2m_epoch = restore_latest_model_parameters(sess, hp, 't2m')

        if ssrn_epoch > -1:
            restore_archived_model_parameters(sess, hp, 'ssrn', ssrn_epoch)
        else:
            ssrn_epoch = restore_latest_model_parameters(sess, hp, 'ssrn')

        # Pass input L through Text2Mel Graph
        t = start_clock('Text2Mel generating...')
        ### TODO: after further efficiency testing, remove this fork
        if 1:  ### efficient route -- only make K&V once  ## 3.86, 3.70, 3.80 seconds (2 sentences)
            text_lengths = get_text_lengths(L)
            K, V = encode_text(hp, L, g1, sess, speaker_data=speaker_data, labels=labels)
            Y, lengths, alignments = synth_codedtext2mel(hp, K, V, text_lengths, g1, sess, \
                                                         speaker_data=speaker_data, duration_data=duration_data, \
                                                         position_in_phone_data=position_in_phone_data, \
                                                         labels=labels)
        else:  ## 5.68, 5.43, 5.38 seconds (2 sentences)
            Y, lengths = synth_text2mel(hp, L, g1, sess, speaker_data=speaker_data, \
                                        duration_data=duration_data, \
                                        position_in_phone_data=position_in_phone_data, \
                                        labels=labels)
        stop_clock(t)

        # Then pass output Y of Text2Mel Graph through SSRN graph to get high res spectrogram Z.
        t = start_clock('Mel2Mag generating...')
        Z = synth_mel2mag(hp, Y, g2, sess)
        stop_clock(t)

        if np.isnan(Z).any():  ### TODO: keep?
            Z = np.nan_to_num(Z)

        # Generate wav files
        if not topoutdir:
            topoutdir = hp.sampledir
        outdir = os.path.join(topoutdir, 't2m%s_ssrn%s' % (t2m_epoch, ssrn_epoch))
        if speaker_id:
            outdir += '_speaker-%s' % (speaker_id)
        safe_makedir(outdir)
        print("Generating wav files, will save to following dir: %s" % (outdir))

        assert hp.vocoder in ['griffin_lim', 'world'], 'Other vocoders than griffin_lim/world not yet supported'

        if ncores == 1:
            for i, mag in tqdm(enumerate(Z)):
                outfile = os.path.join(outdir, bases[i] + '.wav')
                mag = mag[:lengths[i] * hp.r, :]  ### trim to generated length
                synth_wave(hp, mag, outfile)
        else:
            executor = ProcessPoolExecutor(max_workers=ncores)
            futures = []
            for i, mag in tqdm(enumerate(Z)):
                outfile = os.path.join(outdir, bases[i] + '.wav')
                mag = mag[:lengths[i] * hp.r, :]  ### trim to generated length
                futures.append(executor.submit(synth_wave, hp, mag, outfile))
            proc_list = [future.result() for future in tqdm(futures)]

        # Plot attention alignments
        for i in range(num_sentences):
            plot_alignment(hp, alignments[i], utt_idx=i + 1, t2m_epoch=t2m_epoch, dir=outdir)
def load_attention(fpath):
    attention_guide_file = "{}/{}".format(hp.attention_guide_dir, basename(fpath) + ".npy")
    attention_guide = read_floats_from_8bit(attention_guide_file)
    return fpath, attention_guide
def load_data(hp, mode="train", audio_extension='.wav'): '''Loads data Args: mode: "train" / "validation" / "synthesis" / "demo". ''' assert mode in ('train', 'synthesis', 'validation', 'demo') logging.info('Start loading data in mode: %s' % (mode)) get_speaker_codes = (hp.multispeaker != [] ) ## False if hp.multispeaker is empty list #import pdb;pdb.set_trace() dataset_df_path = os.path.join(hp.featuredir, 'dataset_' + mode + '.csv') # In demo mode, we change the "dataset" with only one line each time and do not want to use always the same df #if os.path.exists(dataset_df_path) and mode != 'demo': if 0: dataset_df = pd.read_csv(dataset_df_path) dataset = {} #import pdb;pdb.set_trace() # this does not work in train mode because of problem with doing pd.eval() with bytes try: dataset['texts'] = np.array( [pd.eval(e) for e in dataset_df['texts'].tolist()]) except AttributeError: #that is why we do this dataset['texts'] = np.array( [ast.literal_eval(e) for e in dataset_df['texts'].tolist()]) # I think this cause an error when trying training: # tensorflow.python.framework.errors_impl.InvalidArgumentError: Input to DecodeRaw has length 105 that is not a multiple of 4, the size of int32 dataset['fpaths'] = dataset_df['fpaths'].tolist( ) ## at synthesis, fpaths only a way to get bases -- wav files probably do not exist dataset['text_lengths'] = dataset_df['text_lengths'].tolist( ) ## only used in training (where length information lost due to string format) - TODO: good motivation for this format? dataset['audio_lengths'] = dataset_df['audio_lengths'].tolist( ) ## might be [] dataset['label_lengths'] = dataset_df['label_lengths'].tolist( ) ## might be [] if get_speaker_codes: dataset['speakers'] = dataset_df['speakers'].tolist() if hp.use_external_durations: dataset['durations'] = dataset_df['durations'].tolist() else: if mode in ['synthesis', 'demo']: get_speaker_codes = False ## never read speaker from transcript for synthesis -- take user-specified speaker instead # Load vocabulary char2idx, idx2char = load_vocab(hp) if mode in ["train", "validation"]: transcript = os.path.join(hp.transcript) elif mode == 'synthesis': transcript = os.path.join(hp.test_transcript) else: transcript = './demo/transcript.csv' if hp.multispeaker: speaker2ix = dict(zip(hp.speaker_list, range(len(hp.speaker_list)))) fpaths, text_lengths, texts, speakers, durations = [], [], [], [], [] audio_lengths, label_lengths = [], [] lines = codecs.open(transcript, 'r', 'utf-8').readlines() too_long_count_frames = 0 too_long_count_text = 0 no_data_count = 0 nframes = 0 ## default 'False' value for line in tqdm(lines, desc='load_data'): line = line.strip('\n\r |') if line == '': continue fields = line.strip().split("|") assert len(fields) >= 1, fields if len(fields) > 1: assert len(fields) >= 3, fields fname = fields[0] if len(fields) > 1: unnorm_text, norm_text = fields[1:3] else: norm_text = None # to test if audio only if hp.validpatt: if mode == "train": if hp.validpatt in fname: continue elif mode == "validation": if hp.validpatt not in fname: continue if len(fields) >= 4: phones = fields[3] if norm_text is None: letters_or_phones = [ ] # [0] ## dummy 'text' (1 character of padding) where we are using audio only elif hp.input_type == 'phones': if 'speaker_dependent_phones' in hp.multispeaker: speaker_code = speaker else: speaker_code = '' phones = phones_normalize( phones, char2idx, speaker_code=speaker_code ) # in case of phones, all EOS markers are assumed included letters_or_phones = [char2idx[char] for char in phones] 
elif hp.input_type == 'letters': text = text_normalize(norm_text, hp) + "E" # E: EOS letters_or_phones = [char2idx[char] for char in text] text_length = len(letters_or_phones) if text_length > hp.max_N: #print('number of letters/phones for %s is %s, exceeds max_N %s: skip it'%(fname, text_length, hp.max_N)) too_long_count_text += 1 continue if mode in ["train", "validation"] and os.path.exists( hp.coarse_audio_dir): mel = "{}/{}".format(hp.coarse_audio_dir, fname + ".npy") if not os.path.exists(mel): logging.debug('no file %s' % (mel)) no_data_count += 1 continue nframes = np.load(mel).shape[0] if nframes > hp.max_T: #print('number of frames for %s is %s, exceeds max_T %s: skip it'%(fname, nframes, hp.max_T)) too_long_count_frames += 1 continue audio_lengths.append(nframes) texts.append(np.array(letters_or_phones, np.int32)) fpath = os.path.join(hp.waveforms, fname + audio_extension) fpaths.append(fpath) text_lengths.append(text_length) ## get speaker before phones in case need to get speaker-dependent phones if get_speaker_codes: assert len(fields) >= 5, fields speaker = fields[4] speaker_ix = speaker2ix[speaker] speakers.append(np.array(speaker_ix, np.int32)) if hp.merlin_label_dir: ## only get shape here -- get the data later try: label_length, label_dim = np.load("{}/{}".format( hp.merlin_label_dir, basename(fpath) + ".npy")).shape except TypeError: label_length, label_dim = np.load("{}/{}".format( hp.merlin_label_dir, basename(fpath.decode('utf-8')) + ".npy")).shape label_lengths.append(label_length) assert label_dim == hp.merlin_lab_dim if hp.use_external_durations: assert len(fields) >= 6, fields duration_data = fields[5] duration_data = [ int(value) for value in re.split('\s+', duration_data.strip(' ')) ] duration_data = np.array(duration_data, np.int32) if hp.merlin_label_dir: duration_data = duration_data[ duration_data > 0] ## merlin label contains no skipped items assert len(duration_data) == label_length, ( len(duration_data), label_length, fpath) else: assert len(duration_data) == text_length, ( len(duration_data), text_length, fpath) if nframes: assert duration_data.sum() == nframes * hp.r, ( duration_data.sum(), nframes * hp.r) durations.append(duration_data) # !TODO! check this -- duplicated!? # if hp.merlin_label_dir: ## only get shape here -- get the data later # label_length, _ = np.load("{}/{}".format(hp.merlin_label_dir, basename(fpath)+".npy")).shape # label_lengths.append(label_length) #import pdb;pdb.set_trace() if mode == "validation": if len(texts) == 0: logging.error( 'No validation sentences collected: maybe the validpatt %s matches no training data file names?' 
% (hp.validpatt)) sys.exit(1) logging.info('Loaded data for %s sentences' % (len(texts))) logging.info('Sentences skipped with missing features: %s' % (no_data_count)) logging.info('Sentences skipped with > max_T (%s) frames: %s' % (hp.max_T, too_long_count_frames)) logging.info( 'Additional sentences skipped with > max_N (%s) letters/phones: %s' % (hp.max_N, too_long_count_text)) if mode == 'train' and hp.n_utts > 0: n_utts = hp.n_utts assert n_utts <= len(fpaths) logging.info('Take first %s (n_utts) sentences for training' % (n_utts)) fpaths = fpaths[:n_utts] text_lengths = text_lengths[:n_utts] texts = texts[:n_utts] if get_speaker_codes: speakers = speakers[:n_utts] if audio_lengths: audio_lengths = audio_lengths[:n_utts] if label_lengths: label_lengths = label_lengths[:n_utts] if mode == 'train': ## Return string representation which will be parsed with tf's decode_raw: texts = [text.tostring() for text in texts] if get_speaker_codes: speakers = [speaker.tostring() for speaker in speakers] if hp.use_external_durations: durations = [d.tostring() for d in durations] if mode in ['validation', 'synthesis', 'demo']: ## Prepare a batch of 'stacked texts' (matrix with number of rows==synthesis batch size, and each row an array of integers) stacked_texts = np.zeros((len(texts), hp.max_N), np.int32) for i, text in enumerate(texts): stacked_texts[i, :len(text)] = text texts = stacked_texts if hp.use_external_durations: stacked_durations = np.zeros((len(texts), hp.max_T, hp.max_N), np.int32) for i, dur in enumerate(durations): duration_matrix = durations_to_hard_attention_matrix(dur) duration_matrix = end_pad_for_reduction_shape_sync( duration_matrix, hp) duration_matrix = duration_matrix[0::hp.r, :] m, n = duration_matrix.shape stacked_durations[i, :m, :n] = duration_matrix durations = stacked_durations dataset = {} dataset['texts'] = texts dataset[ 'fpaths'] = fpaths ## at synthesis, fpaths only a way to get bases -- wav files probably do not exist dataset[ 'text_lengths'] = text_lengths ## only used in training (where length information lost due to string format) - TODO: good motivation for this format? dataset['audio_lengths'] = audio_lengths ## might be [] dataset['label_lengths'] = label_lengths ## might be [] dataset_df = dataset.copy() try: dataset_df['texts'] = dataset_df['texts'].tolist() except: # It is already a list pass try: if len(dataset_df['audio_lengths']) == 0: dataset_df['audio_lengths'] = [0] * len(dataset_df['texts']) if len(dataset_df['label_lengths']) == 0: dataset_df['label_lengths'] = [0] * len(dataset_df['texts']) if not os.path.exists(hp.featuredir): os.makedirs(hp.featuredir) pd.DataFrame.to_csv(pd.DataFrame.from_records(dataset_df), dataset_df_path) except: import pdb pdb.set_trace() if get_speaker_codes: dataset['speakers'] = speakers if hp.use_external_durations: dataset['durations'] = durations logging.info('Finished loading data in mode: %s' % (mode)) #import pdb;pdb.set_trace() return dataset
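## Hedged sketch (assumption): durations_to_hard_attention_matrix presumably expands
## per-phone frame counts into a hard (0/1) alignment matrix A of shape
## (total_frames, n_phones), with A[t, n] = 1 when frame t belongs to phone n. This
## is consistent with duration2position's `duration.sum(axis=0)` deflating A back to
## integer durations, but the real helper may differ; illustration only.
import numpy as np

def durations_to_hard_attention_matrix_sketch(durations):
    total = int(np.sum(durations))
    A = np.zeros((total, len(durations)), dtype=np.float32)
    start = 0
    for n, d in enumerate(durations):
        A[start:start + d, n] = 1.0  # phone n is attended for d consecutive frames
        start += d
    return A

## e.g. durations [2, 3, 1] -> a (6, 3) matrix with block-diagonal ones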
def main_work():
    #################################################
    # ============= Process command line ============
    a = ArgumentParser()
    a.add_argument('-c', dest='config', required=True, type=str)
    a.add_argument('-m', dest='model_type', required=True, choices=['t2m', 'ssrn', 'babbler'])
    opts = a.parse_args()
    # ===============================================
    model_type = opts.model_type
    hp = load_config(opts.config)
    logdir = hp.logdir + "-" + model_type
    logger_setup.logger_setup(logdir)
    info('Command line: %s' % (" ".join(sys.argv)))

    ### TODO: move this to its own function somewhere. Can be used also at synthesis time?
    ### Prepare reference data for validation set:  ### TODO: alternative to holding in memory?
    dataset = load_data(hp, mode="validation")
    valid_filenames, validation_text = dataset['fpaths'], dataset['texts']
    speaker_codes = validation_duration_data = position_in_phone_data = None  ## defaults

    if hp.multispeaker:
        speaker_codes = dataset['speakers']
    if hp.use_external_durations:
        validation_duration_data = dataset['durations']

    ## take random subset of validation set to avoid 'This is a librivox recording' type sentences
    random.seed(1234)
    v_indices = list(range(len(valid_filenames)))
    random.shuffle(v_indices)
    v = min(hp.validation_sentences_to_evaluate, len(valid_filenames))
    v_indices = v_indices[:v]

    if hp.multispeaker:  ## now come back to this after v computed
        speaker_codes = np.array(speaker_codes)[v_indices].reshape(-1, 1)
    if hp.use_external_durations:
        validation_duration_data = validation_duration_data[v_indices, :, :]

    valid_filenames = np.array(valid_filenames)[v_indices]
    validation_mags = [np.load(hp.full_audio_dir + os.path.sep + basename(fpath) + '.npy') \
                       for fpath in valid_filenames]
    validation_text = validation_text[v_indices, :]

    validation_labels = None  # default
    if hp.merlin_label_dir:
        validation_labels = [np.load("{}/{}".format(hp.merlin_label_dir, basename(fpath) + ".npy")) \
                             for fpath in valid_filenames]
        validation_labels = list2batch(validation_labels, hp.max_N)

    if 'position_in_phone' in hp.history_type:
        def duration2position(duration, fractional=False):
            ### very roundabout -- need to deflate A matrix back to integers:
            duration = duration.sum(axis=0)
            positions = durations_to_position(duration, fractional=fractional)
            ###positions = end_pad_for_reduction_shape_sync(positions, hp)
            positions = positions[0::hp.r, :]
            return positions
        position_in_phone_data = [duration2position(dur, fractional=('fractional' in hp.history_type)) \
                                  for dur in dataset['durations'][v_indices]]
        position_in_phone_data = list2batch(position_in_phone_data, hp.max_T)

    if model_type == 't2m':
        validation_mels = [np.load(hp.coarse_audio_dir + os.path.sep + basename(fpath) + '.npy') \
                           for fpath in valid_filenames]
        validation_inputs = validation_text
        validation_reference = validation_mels
        validation_lengths = None
    elif model_type == 'ssrn':
        validation_inputs, validation_lengths = make_mel_batch(hp, valid_filenames)
        validation_reference = validation_mags
    else:
        info('Undefined model_type {} for making validation inputs -- supply dummy None values'.format(model_type))
        validation_inputs = None
        validation_reference = None

    ## Get the text and mel inputs for the utts you would like to plot attention graphs for
    if hp.plot_attention_every_n_epochs and model_type == 't2m':  # check if we want to plot attention
        # TODO: do we want to generate and plot attention for validation or training set sentences??? modify attention_inputs accordingly...
        attention_inputs = validation_text[:hp.num_sentences_to_plot_attention]
        attention_mels = validation_mels[:hp.num_sentences_to_plot_attention]
        ## copy the mels into a fixed-size array:
        attention_mels_array = np.zeros((hp.num_sentences_to_plot_attention, hp.max_T, hp.n_mels), np.float32)
        for i in range(hp.num_sentences_to_plot_attention):
            attention_mels_array[i, :attention_mels[i].shape[0], :attention_mels[i].shape[1]] = attention_mels[i]
        attention_mels = attention_mels_array  # rename for convenience

    ## Map to appropriate type of graph depending on model_type:
    AppropriateGraph = {'t2m': Text2MelGraph, 'ssrn': SSRNGraph, 'babbler': BabblerGraph}[model_type]

    g = AppropriateGraph(hp)
    info("Training graph loaded")
    ## reuse=True ensures that 'synth_graph' and 'attention_graph' share weights with training graph 'g':
    synth_graph = AppropriateGraph(hp, mode='synthesize', reuse=True)
    info("Synthesis graph loaded")
    attention_graph = AppropriateGraph(hp, mode='generate_attention', reuse=True)
    info("Attention generating graph loaded")
    # TODO: is loading three graphs a problem for memory usage?

    if 0:
        print(tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, 'Text2Mel'))
        ## [<tf.Variable 'Text2Mel/TextEnc/embed_1/lookup_table:0' shape=(61, 128) dtype=float32_ref>,
        ##  <tf.Variable 'Text2Mel/TextEnc/C_2/conv1d/kernel:0' shape=(1, 128, 512) dtype=float32_ref>, ...

    ## TODO: tensorflow.python.training.supervisor deprecated: --> switch to tf.train.MonitoredTrainingSession
    sv = tf.train.Supervisor(logdir=logdir, save_model_secs=0, global_step=g.global_step)

    ## Get the current training epoch from the name of the model that we have loaded
    latest_checkpoint = tf.train.latest_checkpoint(logdir)
    if latest_checkpoint:
        epoch = int(latest_checkpoint.strip('/ ').split('/')[-1].replace('model_epoch_', ''))
    else:
        ## did not find a model checkpoint, so we start training from scratch
        epoch = 0

    ## If save_every_n_epochs > 0, models will be stored here every n epochs and not
    ## deleted, regardless of validation improvement etc.:--
    safe_makedir(logdir + '/archive/')

    with sv.managed_session() as sess:
        if 0:
            ## Set to 1 to debug NaNs; at tfdbg prompt, type: run -f has_inf_or_nan
            ## later: lt -f has_inf_or_nan -n .*AudioEnc.*
            os.system('rm -rf {}/tmp_tfdbg/'.format(logdir))
            sess = tf_debug.LocalCLIDebugWrapperSession(sess, dump_root=logdir + '/tmp_tfdbg/')

        if hp.initialise_weights_from_existing:
            info('=====Initialise some variables from existing model(s)=====')
            ## see https://stackoverflow.com/questions/41798311/tensorflow-graph-is-finalized-and-cannot-be-modified/41798401
            sess.graph._unsafe_unfinalize()
            for (scope, checkpoint) in hp.initialise_weights_from_existing:
                var_list = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope)
                info('----From existing model %s:----' % (checkpoint))
                if var_list:  ## will be empty when training t2m but looking at ssrn
                    saver = tf.train.Saver(var_list=var_list)
                    saver.restore(sess, checkpoint)
                    for var in var_list:
                        info('   %s' % (var.name))
                else:
                    info('   No variables!')
            info('========================================================')

        if hp.restart_from_savepath:
            ## set this param to list: [path_to_t2m_model_folder, path_to_ssrn_model_folder]
            info('Restart from these paths:')
            info(hp.restart_from_savepath)
            # assert len(hp.restart_from_savepath) == 2
            restart_from_savepath1, restart_from_savepath2 = hp.restart_from_savepath
            restart_from_savepath1 = os.path.abspath(restart_from_savepath1)
            restart_from_savepath2 = os.path.abspath(restart_from_savepath2)

            ## see https://stackoverflow.com/questions/41798311/tensorflow-graph-is-finalized-and-cannot-be-modified/41798401
            sess.graph._unsafe_unfinalize()
            sess.run(tf.global_variables_initializer())

            print('Restore parameters')
            if model_type == 't2m':
                var_list = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, 'Text2Mel')
                saver1 = tf.train.Saver(var_list=var_list)
                latest_checkpoint = tf.train.latest_checkpoint(restart_from_savepath1)
                saver1.restore(sess, restart_from_savepath1)
                print("Text2Mel Restored!")
            elif model_type == 'ssrn':
                var_list = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, 'SSRN') + \
                           tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, 'gs')
                saver2 = tf.train.Saver(var_list=var_list)
                latest_checkpoint = tf.train.latest_checkpoint(restart_from_savepath2)
                saver2.restore(sess, restart_from_savepath2)
                print("SSRN Restored!")
            epoch = int(latest_checkpoint.strip('/ ').split('/')[-1].replace('model_epoch_', ''))
            # TODO: this counter won't work if training restarts in same directory. Get epoch from gs?

        loss_history = []  ## any way to restore loss history too?

        ## plot attention generated from freshly initialised model
        if hp.plot_attention_every_n_epochs and model_type == 't2m' and epoch == 0:
            ## (ssrn model doesn't generate alignments; epoch-1 refers to the freshly initialised model)
            get_and_plot_alignments(hp, epoch - 1, attention_graph, sess, attention_inputs,
                                    attention_mels, logdir + "/alignments")

        current_score = compute_validation(hp, model_type, epoch, validation_inputs,
                                           synth_graph, sess, speaker_codes, valid_filenames,
                                           validation_reference,
                                           duration_data=validation_duration_data,
                                           validation_labels=validation_labels,
                                           position_in_phone_data=position_in_phone_data)
        info('validation epoch {0}: {1:0.3f}'.format(epoch, current_score))

        while 1:
            progress_bar_text = '%s/%s; ep. %s' % (hp.config_name, model_type, epoch)
            for batch_in_current_epoch in tqdm(range(g.num_batch), total=g.num_batch,
                                               ncols=80, leave=True, unit='b', desc=progress_bar_text):
                gs, loss_components, _ = sess.run([g.global_step, g.loss_components, g.train_op])
                loss_history.append(loss_components)

            ### End of epoch: validate?
            if hp.validate_every_n_epochs:
                if epoch % hp.validate_every_n_epochs == 0:
                    loss_history = np.array(loss_history)
                    train_loss_mean_std = np.concatenate([loss_history.mean(axis=0), loss_history.std(axis=0)])
                    loss_history = []
                    train_loss_mean_std = ' '.join(['{:0.3f}'.format(score) for score in train_loss_mean_std])
                    info('train epoch {0}: {1}'.format(epoch, train_loss_mean_std))

                    current_score = compute_validation(hp, model_type, epoch, validation_inputs,
                                                       synth_graph, sess, speaker_codes, valid_filenames,
                                                       validation_reference,
                                                       duration_data=validation_duration_data,
                                                       validation_labels=validation_labels,
                                                       position_in_phone_data=position_in_phone_data)
                    info('validation epoch {0:0}: {1:0.3f}'.format(epoch, current_score))

            ### End of epoch: plot attention matrices? (ssrn model doesn't generate alignments)
            if hp.plot_attention_every_n_epochs and model_type == 't2m' and epoch % hp.plot_attention_every_n_epochs == 0:
                get_and_plot_alignments(hp, epoch, attention_graph, sess, attention_inputs,
                                        attention_mels, logdir + "/alignments")

            ### Save end of each epoch (all but the most recent 5 will be overwritten):
            stem = logdir + '/model_epoch_{0}'.format(epoch)
            sv.saver.save(sess, stem)

            ### Check if we should archive (to files which won't be overwritten):
            if hp.save_every_n_epochs:
                if epoch % hp.save_every_n_epochs == 0:
                    info('Archive model %s' % (stem))
                    for fname in glob.glob(stem + '*'):
                        shutil.copy(fname, logdir + '/archive/')

            epoch += 1
            if epoch > hp.max_epochs:
                info('Max epochs ({}) reached: end training'.format(hp.max_epochs))
                return

    print("Done")
def main_work():
    #################################################
    # ============= Process command line ============
    a = ArgumentParser()
    a.add_argument('-meldir', required=True, type=str,
                   help='existing directory with mels -- features are padded to match the length of these')
    a.add_argument('-worlddir', required=True, type=str,
                   help='existing directory containing world features')
    a.add_argument('-outdir', required=True, type=str)
    a.add_argument('-testpatt', required=False, type=str, default='')
    a.add_argument('-ncores', default=1, type=int, help='Number of cores for parallel processing')
    opts = a.parse_args()
    # ===============================================
    # hp = load_config(opts.config)

    fpaths = sorted(glob.glob(opts.meldir + '/*.npy'))  # [:10]

    normkind = 'meanvar'
    if normkind == 'minmax':
        scaler = MinMaxScaler()
    elif normkind == 'meanvar':
        scaler = StandardScaler()
    else:
        sys.exit('Unknown normkind: %s' % (normkind))

    if opts.testpatt:
        train_fpaths = [p for p in fpaths if opts.testpatt not in basename(p)]
    else:
        train_fpaths = fpaths

    for fpath in tqdm(train_fpaths, desc='First pass to get norm stats'):
        data = load_sentence(fpath, worlddir=opts.worlddir, outdir=opts.outdir)
        scaler = update_normalisation_stats(data, scaler)

    safe_makedir(opts.outdir)
    safe_makedir(opts.outdir + '/full_world/')
    safe_makedir(opts.outdir + '/coarse_world/')

    if 0:  ## single-threaded debug route:
        process(fpaths[0], worlddir=opts.worlddir, outdir=opts.outdir, scaler=scaler)
        sys.exit('Processed first file only -- exiting')

    executor = ProcessPoolExecutor(max_workers=opts.ncores)
    futures = []
    for fpath in fpaths:
        futures.append(executor.submit(process, fpath, worlddir=opts.worlddir,
                                       outdir=opts.outdir, scaler=scaler))
    proc_list = [future.result()
                 for future in tqdm(futures, desc='Second pass (parallel) to do normalisation')]

    if normkind == 'minmax':
        mini = scaler.data_min_  ## TODO: per speaker...
        maxi = scaler.data_max_
        stats = np.vstack([mini, maxi])
    elif normkind == 'meanvar':
        mean = scaler.mean_  ## TODO: per speaker...
        std = scaler.scale_
        stats = np.vstack([mean, std])
    else:
        sys.exit('Unknown normkind: %s' % (normkind))

    np.save(opts.outdir + '/norm_stats', stats)
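## Hedged sketch (assumption): update_normalisation_stats presumably wraps sklearn's
## incremental partial_fit, which lets the two-pass scheme above accumulate statistics
## one sentence at a time without holding the corpus in memory. Illustration only;
## the real helper may differ.
import numpy as np
from sklearn.preprocessing import StandardScaler

def update_normalisation_stats_sketch(data, scaler):
    scaler.partial_fit(data)  # update running mean/variance with this sentence's frames
    return scaler

scaler = StandardScaler()
for sent in [np.random.randn(100, 63), np.random.randn(80, 63)]:  # fake (frames, dims) features
    scaler = update_normalisation_stats_sketch(sent, scaler)
print(scaler.mean_.shape, scaler.scale_.shape)  # per-dimension stats: (63,) (63,)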
def synthesize(self, text=None, emo_code=None, mels=None, speaker_id='', num_sentences=0, ncores=1, topoutdir=''):
    '''
    topoutdir: store samples under here; defaults to hp.sampledir.
    self.t2m_epoch and self.ssrn_epoch are assumed to have been set when the models were restored.
    '''
    assert self.hp.vocoder in ['griffin_lim', 'world'], 'Other vocoders than griffin_lim/world not yet supported'

    if text is not None:
        text_to_phonetic(text=text)
        dataset = load_data(self.hp, mode='demo')
    else:
        dataset = load_data(self.hp, mode="synthesis")  ## since mode != 'train' or 'validation', will load test_transcript rather than transcript

    fpaths, L = dataset['fpaths'], dataset['texts']
    position_in_phone_data = duration_data = labels = None  # defaults

    if self.hp.use_external_durations:
        duration_data = dataset['durations']
        if num_sentences > 0:
            duration_data = duration_data[:num_sentences, :, :]

    if 'position_in_phone' in self.hp.history_type:
        ## TODO: combine + deduplicate with relevant code in train.py for making validation set
        def duration2position(duration, fractional=False):
            ### very roundabout -- need to deflate A matrix back to integers:
            duration = duration.sum(axis=0)
            positions = durations_to_position(duration, fractional=fractional)
            ###positions = end_pad_for_reduction_shape_sync(positions, hp)
            positions = positions[0::self.hp.r, :]
            return positions
        position_in_phone_data = [duration2position(dur, fractional=('fractional' in self.hp.history_type)) \
                                  for dur in duration_data]
        position_in_phone_data = list2batch(position_in_phone_data, self.hp.max_T)

    # Ensure we aren't trying to generate more utterances than are actually in our test_transcript
    if num_sentences > 0:
        assert num_sentences <= len(fpaths)
        L = L[:num_sentences, :]
        fpaths = fpaths[:num_sentences]

    bases = [basename(fpath) for fpath in fpaths]

    if self.hp.merlin_label_dir:
        labels = [np.load("{}/{}".format(self.hp.merlin_label_dir, basename(fpath) + ".npy")) \
                  for fpath in fpaths]
        labels = list2batch(labels, self.hp.max_N)

    if speaker_id:
        speaker2ix = dict(zip(self.hp.speaker_list, range(len(self.hp.speaker_list))))
        speaker_ix = speaker2ix[speaker_id]
        ## Speaker codes are held in (batch, 1) matrix -- tiling is done inside the graph:
        speaker_data = np.ones((len(L), 1)) * speaker_ix
    else:
        speaker_data = None

    # Pass input L through Text2Mel Graph
    t = start_clock('Text2Mel generating...')
    ### TODO: after further efficiency testing, remove this fork
    if 1:  ### efficient route -- only make K&V once  ## 3.86, 3.70, 3.80 seconds (2 sentences)
        text_lengths = get_text_lengths(L)
        if mels is not None:
            emo_code = encode_audio2emo(self.hp, mels, self.g1, self.sess)
        K, V = encode_text(self.hp, L, self.g1, self.sess, emo_mean=emo_code,
                           speaker_data=speaker_data, labels=labels)
        Y, lengths, alignments = synth_codedtext2mel(self.hp, K, V, text_lengths, self.g1, self.sess, \
                                                     speaker_data=speaker_data, duration_data=duration_data, \
                                                     position_in_phone_data=position_in_phone_data, \
                                                     labels=labels)
    else:  ## 5.68, 5.43, 5.38 seconds (2 sentences)
        Y, lengths = synth_text2mel(self.hp, L, self.g1, self.sess, speaker_data=speaker_data, \
                                    duration_data=duration_data, \
                                    position_in_phone_data=position_in_phone_data, \
                                    labels=labels)
    stop_clock(t)

    # Then pass output Y of Text2Mel Graph through SSRN graph to get high res spectrogram Z.
    t = start_clock('Mel2Mag generating...')
    Z = synth_mel2mag(self.hp, Y, self.g2, self.sess)
    stop_clock(t)

    if np.isnan(Z).any():  ### TODO: keep?
        Z = np.nan_to_num(Z)

    # Generate wav files
    if not topoutdir:
        topoutdir = self.hp.sampledir
    outdir = os.path.join(topoutdir, 't2m%s_ssrn%s' % (self.t2m_epoch, self.ssrn_epoch))
    if speaker_id:
        outdir += '_speaker-%s' % (speaker_id)
    safe_makedir(outdir)
    print("Generating wav files, will save to following dir: %s" % (outdir))

    assert self.hp.vocoder in ['griffin_lim', 'world'], 'Other vocoders than griffin_lim/world not yet supported'

    if ncores == 1:
        for i, mag in tqdm(enumerate(Z)):
            outfile = os.path.join(outdir, bases[i] + '.wav')
            mag = mag[:lengths[i] * self.hp.r, :]  ### trim to generated length
            synth_wave(self.hp, mag, outfile)
    else:
        executor = ProcessPoolExecutor(max_workers=ncores)
        futures = []
        for i, mag in tqdm(enumerate(Z)):
            outfile = os.path.join(outdir, bases[i] + '.wav')
            mag = mag[:lengths[i] * self.hp.r, :]  ### trim to generated length
            futures.append(executor.submit(synth_wave, self.hp, mag, outfile))
        proc_list = [future.result() for future in tqdm(futures)]

    # Plot attention alignments
    for i in range(num_sentences):
        plot_alignment(self.hp, alignments[i], utt_idx=i + 1, t2m_epoch=self.t2m_epoch, dir=outdir)

    self.outdir = outdir
def synthesize(hp, speaker_id='', num_sentences=0, ncores=1, topoutdir='', t2m_epoch=-1, ssrn_epoch=-1):
    '''
    topoutdir: store samples under here; defaults to hp.sampledir
    t2m_epoch and ssrn_epoch: default -1 means use latest. Otherwise go to archived models.
    '''
    assert hp.vocoder in ['griffin_lim', 'world'], 'Other vocoders than griffin_lim/world not yet supported'

    dataset = load_data(hp, mode="synthesis")  ## since mode != 'train' or 'validation', will load test_transcript rather than transcript
    fpaths, L = dataset['fpaths'], dataset['texts']
    position_in_phone_data = duration_data = labels = None  # defaults

    if hp.use_external_durations:
        duration_data = dataset['durations']
        if num_sentences > 0:
            duration_data = duration_data[:num_sentences, :, :]

    if 'position_in_phone' in hp.history_type:
        ## TODO: combine + deduplicate with relevant code in train.py for making validation set
        def duration2position(duration, fractional=False):
            ### very roundabout -- need to deflate A matrix back to integers:
            duration = duration.sum(axis=0)
            positions = durations_to_position(duration, fractional=fractional)
            ###positions = end_pad_for_reduction_shape_sync(positions, hp)
            positions = positions[0::hp.r, :]
            return positions
        position_in_phone_data = [duration2position(dur, fractional=('fractional' in hp.history_type)) \
                                  for dur in duration_data]
        position_in_phone_data = list2batch(position_in_phone_data, hp.max_T)

    # Ensure we aren't trying to generate more utterances than are actually in our test_transcript
    if num_sentences > 0:
        assert num_sentences <= len(fpaths)
        L = L[:num_sentences, :]
        fpaths = fpaths[:num_sentences]

    bases = [basename(fpath) for fpath in fpaths]

    if hp.merlin_label_dir:
        labels = []
        for fpath in fpaths:
            label = np.load("{}/{}".format(hp.merlin_label_dir, basename(fpath) + ".npy"))
            if hp.select_central:
                central_ind = get_labels_indices(hp.merlin_lab_dim)
                label = label[:, central_ind == 1]
            labels.append(label)
        labels = list2batch(labels, hp.max_N)

    if speaker_id:
        speaker2ix = dict(zip(hp.speaker_list, range(len(hp.speaker_list))))
        speaker_ix = speaker2ix[speaker_id]
        ## Speaker codes are held in (batch, 1) matrix -- tiling is done inside the graph:
        speaker_data = np.ones((len(L), 1)) * speaker_ix
    else:
        speaker_data = None

    if hp.turn_off_monotonic_for_synthesis:  # if the forced incremental attention (FIA) mechanism is turned off
        text_lengths = get_text_lengths(L)
        hp.text_lengths = text_lengths + 1

    # Load graphs
    ## TODO: generalise to combine other types of models into a synthesis pipeline?
    g1 = Text2MelGraph(hp, mode="synthesize"); print("Graph 1 (t2m) loaded")

    ## The ssrn graph is apparently always built with layer norm and its default
    ## optimiser settings, even when the t2m config disables them: swap those in to
    ## build g2, then restore the t2m settings afterwards:
    if hp.norm is None:
        t2m_layer_norm = False
        hp.norm = 'layer'
        hp.lr = 0.001
        hp.beta1 = 0.9
        hp.beta2 = 0.999
        hp.epsilon = 0.00000001
        hp.decay_lr = True
        hp.batchsize = {'t2m': 32, 'ssrn': 8}
    else:
        t2m_layer_norm = True

    g2 = SSRNGraph(hp, mode="synthesize"); print("Graph 2 (ssrn) loaded")

    if not t2m_layer_norm:
        hp.norm = None
        hp.lr = 0.0002
        hp.beta1 = 0.5
        hp.beta2 = 0.9
        hp.epsilon = 0.000001
        hp.decay_lr = False
        hp.batchsize = {'t2m': 16, 'ssrn': 8}

    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())

        ### TODO: specify epoch from comm line?
        ### TODO: t2m and ssrn from separate configs?
        if t2m_epoch > -1:
            restore_archived_model_parameters(sess, hp, 't2m', t2m_epoch)
        else:
            t2m_epoch = restore_latest_model_parameters(sess, hp, 't2m')

        if ssrn_epoch > -1:
            restore_archived_model_parameters(sess, hp, 'ssrn', ssrn_epoch)
        else:
            ssrn_epoch = restore_latest_model_parameters(sess, hp, 'ssrn')

        # Pass input L through Text2Mel Graph
        t = start_clock('Text2Mel generating...')
        ### TODO: after further efficiency testing, remove this fork
        if 1:  ### efficient route -- only make K&V once  ## 3.86, 3.70, 3.80 seconds (2 sentences)
            text_lengths = get_text_lengths(L)
            K, V = encode_text(hp, L, g1, sess, speaker_data=speaker_data, labels=labels)
            Y, lengths, alignments = synth_codedtext2mel(hp, K, V, text_lengths, g1, sess, \
                                                         speaker_data=speaker_data, duration_data=duration_data, \
                                                         position_in_phone_data=position_in_phone_data, \
                                                         labels=labels)
        else:  ## 5.68, 5.43, 5.38 seconds (2 sentences)
            Y, lengths = synth_text2mel(hp, L, g1, sess, speaker_data=speaker_data, \
                                        duration_data=duration_data, \
                                        position_in_phone_data=position_in_phone_data, \
                                        labels=labels)
        stop_clock(t)

        # Then pass output Y of Text2Mel Graph through SSRN graph to get high res spectrogram Z.
        t = start_clock('Mel2Mag generating...')
        Z = synth_mel2mag(hp, Y, g2, sess)
        stop_clock(t)

        if np.isnan(Z).any():  ### TODO: keep?
            Z = np.nan_to_num(Z)

        # Generate wav files
        if not topoutdir:
            topoutdir = hp.sampledir
        outdir = os.path.join(topoutdir, 't2m%s_ssrn%s' % (t2m_epoch, ssrn_epoch))
        if speaker_id:
            outdir += '_speaker-%s' % (speaker_id)
        safe_makedir(outdir)

        # Plot trimmed attention alignment with filename
        print("Plot attention, will save to following dir: %s" % (outdir))
        print("File | CDP | Ain")
        for i, mag in enumerate(Z):
            outfile = os.path.join(outdir, bases[i])
            trimmed_alignment = alignments[i, :text_lengths[i], :lengths[i]]
            plot_alignment(hp, trimmed_alignment, utt_idx=i + 1, t2m_epoch=t2m_epoch,
                           dir=outdir, outfile=outfile)
            CDP = getCDP(trimmed_alignment)
            APin, APout = getAP(trimmed_alignment)
            print("%s | %.2f | %.2f" % (bases[i], CDP, APin))

        print("Generating wav files, will save to following dir: %s" % (outdir))
        assert hp.vocoder in ['griffin_lim', 'world'], 'Other vocoders than griffin_lim/world not yet supported'

        if ncores == 1:
            for i, mag in tqdm(enumerate(Z)):
                outfile = os.path.join(outdir, bases[i] + '.wav')
                mag = mag[:lengths[i] * hp.r, :]  ### trim to generated length
                synth_wave(hp, mag, outfile)
        else:
            executor = ProcessPoolExecutor(max_workers=ncores)
            futures = []
            for i, mag in tqdm(enumerate(Z)):
                outfile = os.path.join(outdir, bases[i] + '.wav')
                mag = mag[:lengths[i] * hp.r, :]  ### trim to generated length
                futures.append(executor.submit(synth_wave, hp, mag, outfile))
            proc_list = [future.result() for future in tqdm(futures)]