def __init__(self, hp, model_type='t2m', t2m_epoch=-1, ssrn_epoch=-1):
    """Build the two synthesis graphs and restore their weights into a live session.

    hp: hyperparameter/config object.
    model_type: 't2m' for the plain Text2Mel graph, 'unsup' for the
        unsupervised-style graph.
    t2m_epoch / ssrn_epoch: -1 means restore the latest checkpoint;
        any other value restores that archived epoch.
    """
    self.t2m_epoch = t2m_epoch
    self.ssrn_epoch = ssrn_epoch
    self.hp = hp
    # NOTE(review): if model_type is neither 't2m' nor 'unsup', self.g1 is
    # never assigned -- presumably callers only ever pass these two values
    # (they match the argparse choices elsewhere in this file); verify.
    if model_type == 't2m':
        self.g1 = Text2MelGraph(hp, mode="synthesize")
        print("Graph 1 (t2m) loaded")
    elif model_type == 'unsup':
        self.g1 = Graph_style_unsupervised(hp, mode="synthesize")
        print("Graph 1 (unsup) loaded")
    # Second stage: spectrogram super-resolution network (mel -> full mag).
    self.g2 = SSRNGraph(hp, mode="synthesize")
    print("Graph 2 (ssrn) loaded")
    self.sess = tf.Session()
    self.sess.run(tf.global_variables_initializer())
    ### TODO: specify epoch from comm line?
    ### TODO: t2m and ssrn from separate configs?
    # Restore first-stage weights; model_type doubles as the checkpoint
    # subdirectory/name passed to the restore helpers.
    if t2m_epoch > -1:
        restore_archived_model_parameters(self.sess, hp, model_type, t2m_epoch)
    else:
        self.t2m_epoch = restore_latest_model_parameters(
            self.sess, hp, model_type)
    # Restore SSRN weights (always under the 'ssrn' name).
    if ssrn_epoch > -1:
        restore_archived_model_parameters(self.sess, hp, 'ssrn', ssrn_epoch)
    else:
        self.ssrn_epoch = restore_latest_model_parameters(
            self.sess, hp, 'ssrn')
def synthesize(hp, speaker_id='', num_sentences=0, ncores=1, topoutdir='', t2m_epoch=-1, ssrn_epoch=-1):
    '''
    Run text-to-mel (t2m) then mel-to-mag (SSRN) synthesis over the test
    transcript, write wav files and plot attention alignments.

    topoutdir: store samples under here; defaults to hp.sampledir
    t2m_epoch and ssrn_epoch: default -1 means use latest. Otherwise go to archived models.
    num_sentences: 0 means synthesize the whole test transcript.
    ncores: >1 vocodes in parallel via a process pool.
    '''
    assert hp.vocoder in ['griffin_lim', 'world'], 'Other vocoders than griffin_lim/world not yet supported'
    # mode != 'train'/'validation' -> loads test_transcript rather than transcript.
    dataset = load_data(hp, mode="synthesis")
    fpaths, L = dataset['fpaths'], dataset['texts']
    position_in_phone_data = duration_data = labels = None  # defaults

    if hp.use_external_durations:
        duration_data = dataset['durations']
        if num_sentences > 0:
            duration_data = duration_data[:num_sentences, :, :]

    if 'position_in_phone' in hp.history_type:
        ## TODO: combine + deduplicate with relevant code in train.py for making validation set
        def duration2position(duration, fractional=False):
            ### very roundabout -- need to deflate A matrix back to integers:
            duration = duration.sum(axis=0)
            positions = durations_to_position(duration, fractional=fractional)
            positions = positions[0::hp.r, :]
            return positions
        position_in_phone_data = [duration2position(dur, fractional=('fractional' in hp.history_type))
                                  for dur in duration_data]
        position_in_phone_data = list2batch(position_in_phone_data, hp.max_T)

    # Ensure we aren't trying to generate more utterances than are actually
    # in our test_transcript.
    # BUGFIX: use <= -- requesting exactly len(fpaths) sentences is valid
    # (the later duplicate of this function already does this).
    if num_sentences > 0:
        assert num_sentences <= len(fpaths)
        L = L[:num_sentences, :]
        fpaths = fpaths[:num_sentences]

    bases = [basename(fpath) for fpath in fpaths]

    if hp.merlin_label_dir:
        labels = [np.load("{}/{}".format(hp.merlin_label_dir, basename(fpath)+".npy"))
                  for fpath in fpaths]
        labels = list2batch(labels, hp.max_N)

    if speaker_id:
        speaker2ix = dict(zip(hp.speaker_list, range(len(hp.speaker_list))))
        speaker_ix = speaker2ix[speaker_id]
        ## Speaker codes are held in (batch, 1) matrix -- tiling is done inside the graph:
        speaker_data = np.ones((len(L), 1)) * speaker_ix
    else:
        speaker_data = None

    # Load graphs.
    ## TODO: generalise to combine other types of models into a synthesis pipeline?
    g1 = Text2MelGraph(hp, mode="synthesize"); print("Graph 1 (t2m) loaded")
    g2 = SSRNGraph(hp, mode="synthesize"); print("Graph 2 (ssrn) loaded")

    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        ### TODO: specify epoch from comm line?
        ### TODO: t2m and ssrn from separate configs?
        if t2m_epoch > -1:
            restore_archived_model_parameters(sess, hp, 't2m', t2m_epoch)
        else:
            t2m_epoch = restore_latest_model_parameters(sess, hp, 't2m')
        if ssrn_epoch > -1:
            restore_archived_model_parameters(sess, hp, 'ssrn', ssrn_epoch)
        else:
            ssrn_epoch = restore_latest_model_parameters(sess, hp, 'ssrn')

        # Pass input L through the Text2Mel graph.
        t = start_clock('Text2Mel generating...')
        ### TODO: after futher efficiency testing, remove this fork
        if 1:
            ### efficient route -- only make K&V once
            text_lengths = get_text_lengths(L)
            K, V = encode_text(hp, L, g1, sess, speaker_data=speaker_data, labels=labels)
            Y, lengths, alignments = synth_codedtext2mel(hp, K, V, text_lengths, g1, sess,
                                                         speaker_data=speaker_data,
                                                         duration_data=duration_data,
                                                         position_in_phone_data=position_in_phone_data,
                                                         labels=labels)
        else:
            Y, lengths = synth_text2mel(hp, L, g1, sess, speaker_data=speaker_data,
                                        duration_data=duration_data,
                                        position_in_phone_data=position_in_phone_data,
                                        labels=labels)
        stop_clock(t)

        # Then pass output Y of Text2Mel through the SSRN graph to get the
        # high-resolution spectrogram Z.
        t = start_clock('Mel2Mag generating...')
        Z = synth_mel2mag(hp, Y, g2, sess)
        stop_clock(t)

    if (np.isnan(Z).any()):  ### TODO: keep?
        Z = np.nan_to_num(Z)

    # Generate wav files.
    if not topoutdir:
        topoutdir = hp.sampledir
    outdir = os.path.join(topoutdir, 't2m%s_ssrn%s'%(t2m_epoch, ssrn_epoch))
    if speaker_id:
        outdir += '_speaker-%s'%(speaker_id)
    safe_makedir(outdir)
    print("Generating wav files, will save to following dir: %s"%(outdir))

    if ncores==1:
        for i, mag in tqdm(enumerate(Z)):
            outfile = os.path.join(outdir, bases[i] + '.wav')
            mag = mag[:lengths[i]*hp.r,:]  ### trim to generated length
            synth_wave(hp, mag, outfile)
    else:
        executor = ProcessPoolExecutor(max_workers=ncores)
        futures = []
        for i, mag in tqdm(enumerate(Z)):
            outfile = os.path.join(outdir, bases[i] + '.wav')
            mag = mag[:lengths[i]*hp.r,:]  ### trim to generated length
            futures.append(executor.submit(synth_wave, hp, mag, outfile))
        proc_list = [future.result() for future in tqdm(futures)]

    # Plot attention alignments.
    # BUGFIX: iterate over everything actually synthesized. With the old
    # `range(num_sentences)` no plots at all were produced when
    # num_sentences == 0 (i.e. when synthesizing the whole test set).
    for i in range(len(Z)):
        plot_alignment(hp, alignments[i], utt_idx=i+1, t2m_epoch=t2m_epoch, dir=outdir)
def main_work():
    """Command-line entry point: dispatch embedding-analysis tasks.

    Parses -c config, -m model_type, -t task (plus optional -r reduction
    method, -p port, -s dataset set) and runs one of: compute_codes,
    reduce_codes, compute_opensmile_features, show_plot, ICE_TTS,
    ICE_TTS_server, acoustic_analysis.
    """
    # ============= Process command line ============
    a = ArgumentParser()
    a.add_argument('-c', dest='config', required=True, type=str)
    a.add_argument('-m', dest='model_type', required=True, choices=['t2m', 'unsup'])
    a.add_argument('-t', dest='task', required=True, choices=[
        'acoustic_analysis', 'compute_codes', 'reduce_codes',
        'compute_opensmile_features', 'show_plot', 'ICE_TTS', 'ICE_TTS_server'
    ])
    a.add_argument('-r', dest='reduction_method', required=False, choices=['pca', 'tsne', 'umap'])
    a.add_argument('-p', dest='port', required=False, type=int, default=5000)
    a.add_argument('-s', dest='set', required=False, type=str, default='train')
    opts = a.parse_args()
    print('opts')
    print(opts)
    # ===============================================
    model_type = opts.model_type
    method = opts.reduction_method
    hp = load_config(opts.config)
    logdir = hp.logdir + "-" + model_type
    port = opts.port
    mode = opts.set
    config_name = opts.config.split('/')[-1].split('.')[0]
    logger_setup.logger_setup(logdir)
    info('Command line: %s' % (" ".join(sys.argv)))
    print(logdir)

    task = opts.task
    if task == 'compute_codes':
        # Build the appropriate synthesis graph, then dump embedding codes.
        if model_type == 't2m':
            g = Text2MelGraph(hp, mode="synthesize")
            print("Graph 1 (t2m) loaded")
        elif model_type == 'unsup':
            g = Graph_style_unsupervised(hp, mode="synthesize")
            print("Graph 1 (unsup) loaded")
        codes = compute_unsupervised_embeddings(hp, g, model_type, mode=mode)
        save_embeddings(codes, logdir, mode=mode)
    elif task == 'reduce_codes':
        try:
            embed = load_embeddings(logdir, mode=mode)[:, 0, :]
        except IndexError:  # older embedding files were saved 2-D
            embed = load_embeddings(logdir, mode=mode)
        model, results = embeddings_reduction(embed, method=method)
        save_embeddings(results, logdir, filename='emo_codes_' + method, mode=mode)
        save(model, logdir, filename='code_reduction_model_' + method)
    elif task == 'compute_opensmile_features':
        compute_opensmile_features(hp, audio_extension='.wav', mode=mode)
    elif task == 'show_plot':
        embed = load_embeddings(logdir, filename='emo_codes_' + method)
        scatter_plot(embed)
    elif task == 'ICE_TTS':
        from interface import ICE_TTS
        embed = load_embeddings(logdir)[:, 0, :]
        embed_reduc = load_embeddings(logdir, filename='emo_codes_' + method)
        from PyQt5.QtWidgets import QApplication
        app = QApplication(sys.argv)
        ice = ICE_TTS(hp, embed_reduc, embed)
        ice.show()
        sys.exit(app.exec_())
    elif task == 'ICE_TTS_server':
        from server.ice_tts_server import ICE_TTS_server
        try:
            embed = load_embeddings(logdir, mode=mode)[:, 0, :]
        except IndexError:  # older embedding files were saved 2-D
            embed = load_embeddings(logdir, mode=mode)
        print('Loading embeddings')
        embed_reduc = load_embeddings(logdir, filename='emo_codes_' + method, mode=mode)
        from itertools import product
        # Sample a regular 100x100 grid spanning the training PCA codes and
        # map it back to full code space for the interactive server.
        train_codes_pca = np.load(
            os.path.join(logdir, 'emo_codes_pca_train.npy'))
        pca_model = pickle.load(
            open(os.path.join(logdir, 'code_reduction_model_pca.pkl'), 'rb'))
        min_xy = train_codes_pca.min(axis=0)
        max_xy = train_codes_pca.max(axis=0)
        xs = np.mgrid[min_xy[0]:max_xy[0]:100j]
        ys = np.mgrid[min_xy[1]:max_xy[1]:100j]
        X = np.array(list(product(xs, ys)))
        codes = pca_model.inverse_transform(X)
        print('Loading emo cats')
        emo_cats = get_emo_cats(hp)
        ice = ICE_TTS_server(hp, X, codes, emo_cats,
                             model_type=model_type, port=port)
    elif task == 'acoustic_analysis':
        directory = 'results/' + config_name
        if not os.path.exists(directory):
            os.makedirs(directory)
        # NOTE: unused local imports (sklearn, scipy.stats, pandas.plotting,
        # itertools) removed; only seaborn/matplotlib are used below.
        import seaborn as sns
        import matplotlib.pyplot as plt
        print('MODE', mode)
        try:
            embed = load_embeddings(logdir, mode=mode)[:, 0, :]
            embed_valid = load_embeddings(logdir, mode='validation')[:, 0, :]
        except IndexError:  # older embedding files were saved 2-D
            embed = load_embeddings(logdir, mode=mode)
            embed_valid = load_embeddings(logdir, mode='validation')
        conf_name = 'eGeMAPSv01a'
        feature_path = os.path.join(hp.featuredir, 'opensmile_features',
                                    conf_name, 'feat_df_' + mode + '.csv')
        feat_df = pd.read_csv(feature_path)
        feat_df = feat_df.drop(columns=['Unnamed: 0'])
        feature_path = os.path.join(hp.featuredir, 'opensmile_features',
                                    conf_name, 'feat_df_' + 'validation' + '.csv')
        feat_df_valid = pd.read_csv(feature_path)
        feat_df_valid = feat_df_valid.drop(columns=['Unnamed: 0'])
        feat_df = abbridge_column_names(feat_df)
        feat_df_valid = abbridge_column_names(feat_df_valid)

        # Mean normalization (with same mean and variance computed from training data).
        # BUGFIX: capture the stats before overwriting feat_df. Previously
        # feat_df was standardized in place first, so feat_df_valid was
        # scaled by the stats of the already-normalized frame (~mean 0,
        # std 1), i.e. the validation features were never normalized.
        train_mean = feat_df.mean()
        train_std = feat_df.std()
        feat_df = (feat_df - train_mean) / train_std
        feat_df_valid = (feat_df_valid - train_mean) / train_std

        # Regression from full embeddings to acoustic features, evaluated on
        # the validation split.
        model, coeff_df = regression_feat_embed(pd.DataFrame(embed), feat_df)
        corrs_embed_df = test_regression(model, pd.DataFrame(embed_valid),
                                         feat_df_valid)
        print('Correlations:')
        print(corrs_embed_df.sort_values(0)[::-1][:20])
        corrs_embed_df.sort_values(0)[::-1][:20].to_csv(directory +
                                                        '/correlations.csv')
        selected = select_features(corrs_embed_df, feat_df_valid,
                                   intra_corr_thresh=0.7, corr_thresh=0.3)
        print(selected.to_latex().replace('\_sma3', ' ').replace(
            'nz', '').replace('\_', '').replace('amean',
                                                'mean').replace('semitoneFrom27.5Hz', ''))
        selected.to_csv(directory + '/selected_correlations.csv')

        # Same analysis on the reduced (e.g. PCA) embeddings.
        embed_reduc = load_embeddings(logdir, filename='emo_codes_' + method,
                                      mode=mode)
        embed_reduc_valid = load_embeddings(logdir,
                                            filename='emo_codes_' + method,
                                            mode='validation')
        model_reduc, coeff_reduc_df = regression_feat_embed(
            pd.DataFrame(embed_reduc), feat_df)
        corrs_embed_reduc_df = test_regression(model_reduc,
                                               pd.DataFrame(embed_reduc_valid),
                                               feat_df_valid)
        print('Correlations:')
        print(corrs_embed_reduc_df.sort_values(0)[::-1][:20])
        # BUGFIX: was corrs_embed_df (copy-paste), which re-saved the
        # full-embedding correlations under the *_reduc filename.
        corrs_embed_reduc_df.sort_values(0)[::-1][:20].to_csv(
            directory + '/correlations_reduc.csv')
        selected_reduc = select_features(corrs_embed_reduc_df,
                                         feat_df_valid,
                                         intra_corr_thresh=0.7,
                                         corr_thresh=0.25)
        # BUGFIX: was printing `selected` a second time instead of `selected_reduc`.
        print(selected_reduc.to_latex().replace('\_sma3', ' ').replace(
            'nz', '').replace('\_', '').replace('amean',
                                                'mean').replace('semitoneFrom27.5Hz', ''))
        selected_reduc.to_csv(directory + '/selected_correlations_reduc.csv')

        feat_predictions_df = pd.DataFrame(model.predict(embed))
        feat_predictions_df.index = feat_df.index
        feat_predictions_df.columns = feat_df.columns
        feat_df[selected.index]              # sanity check: selected columns exist
        feat_predictions_df[selected.index]  # (raises KeyError otherwise)

        # One true-vs-predicted scatter per selected feature, stacked vertically.
        h = 100
        selected_feats = selected.index.to_list()
        fig, axs = plt.subplots(nrows=len(selected), ncols=1,
                                figsize=(h / len(selected) * 3, h))
        for i in range(len(selected)):
            x = feat_df[selected_feats[i]]
            y = feat_predictions_df[selected_feats[i]]
            axs[i].scatter(x, y, alpha=0.2)
        fig.savefig(directory + '/scatter_plots_feats.png')

        print('Gradients:')
        print(coeff_reduc_df)
        coeff_reduc_df.to_csv(directory + '/gradients.csv')
        # Scale each gradient row to unit Euclidean norm for plotting.
        normalized_gradients = coeff_reduc_df.div(
            ((coeff_reduc_df**2).sum(axis=1))**0.5, axis=0)

        def _scatter_with_gradients(column, outname):
            # Scatter the reduced codes coloured by one acoustic feature,
            # overlay the regression gradients, save under `directory`.
            # (Extracted from four identical copy-pasted sequences.)
            plt.cla()
            plt.clf()
            plt.close()
            sc = scatter_plot(embed_reduc, c=feat_df[column].values)
            plot_gradients(normalized_gradients, selected_reduc,
                           ax=sc.get_figure().gca())
            sc.get_figure().savefig(directory + '/' + outname)

        _scatter_with_gradients('F0 mean',
                                'scatter_F0_mean_' + method + '.png')
        _scatter_with_gradients('F0 percentile50.0',
                                'scatter_F0_percentile50.0_' + method + '.png')
        print(feat_df.columns)
        _scatter_with_gradients('F3amplitudeLogRelF0 stdNorm',
                                'scatter_F3amplitudeLogRelF0_stdNorm_' + method + '.png')
        _scatter_with_gradients('stdVoicedSegmentLengthSec',
                                'scatter_stdVoicedSegmentLengthSec_' + method + '.png')

        plt.cla()
        plt.clf()
        plt.close()
        hist = sns.distplot(feat_df['F0 mean'])
        hist.get_figure().savefig(directory + '/hist_F0_mean_' + method + '.png')

        # Correlation heatmaps: feature/feature, code/code, code+feature.
        plt.close()
        corrs_heatmap_feats = sns.heatmap(feat_df.corr().abs(),
                                          xticklabels=False)
        corrs_heatmap_feats.get_figure().savefig(directory +
                                                 '/corrs_heatmap_feats.pdf',
                                                 bbox_inches='tight')
        plt.close()
        embed_corr = pd.DataFrame(embed).corr().abs()
        embed_corr_heatmap = sns.heatmap(embed_corr)
        embed_corr_heatmap.get_figure().savefig(directory +
                                                '/embed_corr_heatmap.pdf',
                                                bbox_inches='tight')
        plt.close()
        corr_feat_embed = pd.concat([pd.DataFrame(embed), feat_df],
                                    axis=1).corr().abs()
        sns.set(font_scale=0.2)
        corr_feat_embed_heatmap = sns.heatmap(corr_feat_embed,
                                              xticklabels=False)
        corr_feat_embed_heatmap.get_figure().savefig(
            directory + '/corr_feat_embed_heatmap.pdf', bbox_inches='tight')
    else:
        print('Wrong task, does not exist')
def main_work():
    """Command-line entry point for embedding analysis tasks.

    NOTE(review): this file defines `main_work` twice; this later definition
    shadows the earlier one at import time. Unlike the earlier version it
    hard-codes mode='validation' (no -s flag) and offers a
    'compute_gradients' task in place of 'acoustic_analysis'.
    """
    # ============= Process command line ============
    a = ArgumentParser()
    a.add_argument('-c', dest='config', required=True, type=str)
    a.add_argument('-m', dest='model_type', required=True, choices=['t2m', 'unsup'])
    a.add_argument('-t', dest='task', required=True, choices=[
        'compute_gradients', 'compute_codes', 'reduce_codes',
        'compute_opensmile_features', 'show_plot', 'ICE_TTS', 'ICE_TTS_server'
    ])
    a.add_argument('-r', dest='reduction_method', required=False, choices=['pca', 'tsne', 'umap'])
    a.add_argument('-p', dest='port', required=False, type=int, default=5000)
    opts = a.parse_args()
    print('opts')
    print(opts)
    # ===============================================
    model_type = opts.model_type
    method = opts.reduction_method
    hp = load_config(opts.config)
    logdir = hp.logdir + "-" + model_type
    port = opts.port
    mode = 'validation'  # fixed here, unlike the other main_work's -s flag
    logger_setup.logger_setup(logdir)
    info('Command line: %s' % (" ".join(sys.argv)))
    print(logdir)
    task = opts.task
    if task == 'compute_codes':
        # Build the appropriate synthesis graph, then dump embedding codes.
        if model_type == 't2m':
            g = Text2MelGraph(hp, mode="synthesize")
            print("Graph 1 (t2m) loaded")
        elif model_type == 'unsup':
            g = Graph_style_unsupervised(hp, mode="synthesize")
            print("Graph 1 (unsup) loaded")
        codes = compute_unsupervised_embeddings(hp, g, model_type, mode=mode)
        save_embeddings(codes, logdir, mode=mode)
        #emo_cats=get_emo_cats(hp)
        #save(emo_cats, logdir, filename='emo_cats')
    elif task == 'reduce_codes':
        try:
            embed = load_embeddings(logdir, mode=mode)[:, 0, :]
        except IndexError:
            # I may have changed the shape of the matrix ...
            embed = load_embeddings(logdir, mode=mode)
        #import pdb;pdb.set_trace()
        model, results = embeddings_reduction(embed, method=method)
        save_embeddings(results, logdir, filename='emo_codes_' + method,
                        mode=mode)
        save(model, logdir, filename='code_reduction_model_' + method)
    elif task == 'compute_opensmile_features':
        compute_opensmile_features(hp, audio_extension='.wav', mode=mode)
    elif task == 'show_plot':
        embed = load_embeddings(logdir, filename='emo_codes_' + method)
        scatter_plot(embed)
    elif task == 'ICE_TTS':
        # Desktop (PyQt5) interactive interface.
        from interface import ICE_TTS
        embed = load_embeddings(logdir)[:, 0, :]
        embed_reduc = load_embeddings(logdir, filename='emo_codes_' + method)
        from PyQt5.QtWidgets import QApplication
        app = QApplication(sys.argv)
        ice = ICE_TTS(hp, embed_reduc, embed)
        ice.show()
        sys.exit(app.exec_())
    elif task == 'ICE_TTS_server':
        # Web server variant of the interactive interface.
        from server.ice_tts_server import ICE_TTS_server
        try:
            embed = load_embeddings(logdir, mode=mode)[:, 0, :]
        except IndexError:
            # I may have changed the shape of the matrix ...
            embed = load_embeddings(logdir, mode=mode)
        print('Loading embeddings')
        embed_reduc = load_embeddings(logdir, filename='emo_codes_' + method)
        print('Loading emo cats')
        emo_cats = get_emo_cats(hp)
        #emo_cats=load(logdir, filename='emo_cats')
        #import pdb;pdb.set_trace()
        ice = ICE_TTS_server(hp, embed_reduc, embed, emo_cats,
                             model_type=model_type, port=port)
        #ice=ICE_TTS_server(hp, embed_reduc, embed, model_type=model_type)
        #ice=ICE_TTS_server(hp, embed_reduc, embed, n_polar_axes=4, model_type=model_type)
    elif task == 'compute_gradients':
        # Correlate embeddings (full and reduced) with openSMILE features.
        import seaborn as sns
        print('MODE', mode)
        try:
            embed = load_embeddings(logdir, mode=mode)[:, 0, :]
        except IndexError:
            # I may have changed the shape of the matrix ...
            embed = load_embeddings(logdir, mode=mode)
        conf_name = 'eGeMAPSv01a'
        feature_path = os.path.join(hp.featuredir, 'opensmile_features',
                                    conf_name, 'feat_df_' + mode + '.csv')
        feat_df = pd.read_csv(feature_path)
        feat_df = feat_df.drop(columns=['Unnamed: 0'])
        # NOTE(review): here the first return value of regression_feat_embed
        # is treated as correlations, whereas the other main_work in this
        # file treats it as a fitted model -- these look like different
        # revisions of the helper; confirm the current signature.
        corrs_embed_df, coeff_df = regression_feat_embed(
            pd.DataFrame(embed), feat_df)
        print('Correlations:')
        #print(corrs_embed_df)
        # print('Gradients:')
        # print(coeff_df)
        # corrs_heatmap=sns.heatmap(feat_df.corr())
        # corrs_heatmap.get_figure().savefig('corrs_heatmap.png')
        print(corrs_embed_df.sort_values(0)[::-1][:20])
        #method='pca'
        embed_reduc = load_embeddings(logdir, filename='emo_codes_' + method,
                                      mode=mode)
        corrs_embed_reduc_df, coeff_reduc_df = regression_feat_embed(
            pd.DataFrame(embed_reduc), feat_df)
        print('Correlations:')
        #print(corrs_embed_reduc_df)
        #print('Gradients:')
        #print(coeff_reduc_df)
        print(corrs_embed_reduc_df.sort_values(0)[::-1][:20])
        #sc=scatter_plot(embed_reduc, c=feat_df['F0semitoneFrom27.5Hz_sma3nz_amean'].values)
        #sc.get_figure().savefig('scatter_'+method+'.png')
        # Mutual-information regression as a nonlinear complement to the
        # linear correlations above.
        mi = mi_regression_feat_embed(pd.DataFrame(embed_reduc), feat_df)
        print('mi', mi.sort_values(0)[::-1][:20])
        print('mi', mi.sort_values(1)[::-1][:20])
    else:
        print('Wrong task, does not exist')
def synthesize(hp, speaker_id='', num_sentences=0, ncores=1, topoutdir='', t2m_epoch=-1, ssrn_epoch=-1):
    '''
    Run text-to-mel then SSRN synthesis over the test transcript, plot
    per-utterance trimmed attention (with CDP/AP diagnostics) and write wavs.

    topoutdir: store samples under here; defaults to hp.sampledir
    t2m_epoch and ssrn_epoch: default -1 means use latest. Otherwise go to archived models.

    NOTE(review): this file defines `synthesize` twice; this later definition
    shadows the earlier one when the module is imported.
    '''
    assert hp.vocoder in ['griffin_lim', 'world'], 'Other vocoders than griffin_lim/world not yet supported'
    # mode != 'train'/'validation' -> loads test_transcript rather than transcript.
    dataset = load_data(hp, mode="synthesis")
    fpaths, L = dataset['fpaths'], dataset['texts']
    position_in_phone_data = duration_data = labels = None  # defaults

    if hp.use_external_durations:
        duration_data = dataset['durations']
        if num_sentences > 0:
            duration_data = duration_data[:num_sentences, :, :]

    if 'position_in_phone' in hp.history_type:
        ## TODO: combine + deduplicate with relevant code in train.py for making validation set
        def duration2position(duration, fractional=False):
            ### very roundabout -- need to deflate A matrix back to integers:
            duration = duration.sum(axis=0)
            positions = durations_to_position(duration, fractional=fractional)
            positions = positions[0::hp.r, :]
            return positions
        position_in_phone_data = [duration2position(dur, fractional=('fractional' in hp.history_type))
                                  for dur in duration_data]
        position_in_phone_data = list2batch(position_in_phone_data, hp.max_T)

    # Ensure we aren't trying to generate more utterances than are actually
    # in our test_transcript.
    if num_sentences > 0:
        assert num_sentences <= len(fpaths)
        L = L[:num_sentences, :]
        fpaths = fpaths[:num_sentences]

    bases = [basename(fpath) for fpath in fpaths]

    if hp.merlin_label_dir:
        labels = []
        for fpath in fpaths:
            label = np.load("{}/{}".format(hp.merlin_label_dir, basename(fpath)+".npy"))
            if hp.select_central:
                central_ind = get_labels_indices(hp.merlin_lab_dim)
                label = label[:,central_ind==1]
            labels.append(label)
        labels = list2batch(labels, hp.max_N)

    if speaker_id:
        speaker2ix = dict(zip(hp.speaker_list, range(len(hp.speaker_list))))
        speaker_ix = speaker2ix[speaker_id]
        ## Speaker codes are held in (batch, 1) matrix -- tiling is done inside the graph:
        speaker_data = np.ones((len(L), 1)) * speaker_ix
    else:
        speaker_data = None

    if hp.turn_off_monotonic_for_synthesis:
        # FIA (forced incremental attention) mechanism is turned off.
        text_lengths = get_text_lengths(L)
        hp.text_lengths = text_lengths + 1

    # Load graphs.
    ## TODO: generalise to combine other types of models into a synthesis pipeline?
    g1 = Text2MelGraph(hp, mode="synthesize"); print("Graph 1 (t2m) loaded")

    # HACK: when hp.norm is None the t2m graph above was built without layer
    # norm, but the SSRN checkpoint expects the layer-norm training settings.
    # Temporarily swap hp over to those settings while building the SSRN
    # graph, then restore the originals below.
    if hp.norm is None:  # idiom fix: `is None` rather than `== None`
        t2m_layer_norm = False
        hp.norm = 'layer'
        hp.lr = 0.001
        hp.beta1 = 0.9
        hp.beta2 = 0.999
        hp.epsilon = 0.00000001
        hp.decay_lr = True
        hp.batchsize = {'t2m': 32, 'ssrn': 8}
    else:
        t2m_layer_norm = True

    g2 = SSRNGraph(hp, mode="synthesize"); print("Graph 2 (ssrn) loaded")

    if not t2m_layer_norm:
        # Restore the no-layer-norm settings swapped out above.
        hp.norm = None
        hp.lr = 0.0002
        hp.beta1 = 0.5
        hp.beta2 = 0.9
        hp.epsilon = 0.000001
        hp.decay_lr = False
        hp.batchsize = {'t2m': 16, 'ssrn': 8}

    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        ### TODO: specify epoch from comm line?
        ### TODO: t2m and ssrn from separate configs?
        if t2m_epoch > -1:
            restore_archived_model_parameters(sess, hp, 't2m', t2m_epoch)
        else:
            t2m_epoch = restore_latest_model_parameters(sess, hp, 't2m')
        if ssrn_epoch > -1:
            restore_archived_model_parameters(sess, hp, 'ssrn', ssrn_epoch)
        else:
            ssrn_epoch = restore_latest_model_parameters(sess, hp, 'ssrn')

        # Pass input L through the Text2Mel graph.
        t = start_clock('Text2Mel generating...')
        ### TODO: after futher efficiency testing, remove this fork
        if 1:
            ### efficient route -- only make K&V once
            text_lengths = get_text_lengths(L)
            K, V = encode_text(hp, L, g1, sess, speaker_data=speaker_data, labels=labels)
            Y, lengths, alignments = synth_codedtext2mel(hp, K, V, text_lengths, g1, sess,
                                                         speaker_data=speaker_data,
                                                         duration_data=duration_data,
                                                         position_in_phone_data=position_in_phone_data,
                                                         labels=labels)
        else:
            Y, lengths = synth_text2mel(hp, L, g1, sess, speaker_data=speaker_data,
                                        duration_data=duration_data,
                                        position_in_phone_data=position_in_phone_data,
                                        labels=labels)
        stop_clock(t)

        # Then pass output Y of Text2Mel through the SSRN graph to get the
        # high-resolution spectrogram Z.
        t = start_clock('Mel2Mag generating...')
        Z = synth_mel2mag(hp, Y, g2, sess)
        stop_clock(t)

    if (np.isnan(Z).any()):  ### TODO: keep?
        Z = np.nan_to_num(Z)

    # Generate wav files.
    if not topoutdir:
        topoutdir = hp.sampledir
    outdir = os.path.join(topoutdir, 't2m%s_ssrn%s'%(t2m_epoch, ssrn_epoch))
    if speaker_id:
        outdir += '_speaker-%s'%(speaker_id)
    safe_makedir(outdir)

    # Plot trimmed attention alignment with filename, and report the
    # CDP / attention-precision diagnostics for each utterance.
    print("Plot attention, will save to following dir: %s"%(outdir))
    print("File | CDP | Ain")
    for i in range(len(Z)):  # idiom fix: was `for i, mag in enumerate(Z)` with mag unused
        outfile = os.path.join(outdir, bases[i])
        trimmed_alignment = alignments[i,:text_lengths[i],:lengths[i]]
        plot_alignment(hp, trimmed_alignment, utt_idx=i+1,
                       t2m_epoch=t2m_epoch, dir=outdir, outfile=outfile)
        CDP = getCDP(trimmed_alignment)
        APin, APout = getAP(trimmed_alignment)
        print("%s | %.2f | %.2f"%(bases[i], CDP, APin))

    print("Generating wav files, will save to following dir: %s"%(outdir))
    assert hp.vocoder in ['griffin_lim', 'world'], 'Other vocoders than griffin_lim/world not yet supported'
    if ncores==1:
        for i, mag in tqdm(enumerate(Z)):
            outfile = os.path.join(outdir, bases[i] + '.wav')
            mag = mag[:lengths[i]*hp.r,:]  ### trim to generated length
            synth_wave(hp, mag, outfile)
    else:
        executor = ProcessPoolExecutor(max_workers=ncores)
        futures = []
        for i, mag in tqdm(enumerate(Z)):
            outfile = os.path.join(outdir, bases[i] + '.wav')
            mag = mag[:lengths[i]*hp.r,:]  ### trim to generated length
            futures.append(executor.submit(synth_wave, hp, mag, outfile))
        proc_list = [future.result() for future in tqdm(futures)]