예제 #1
0
    def __init__(self, hp, model_type='t2m', t2m_epoch=-1, ssrn_epoch=-1):
        """Build the synthesis graphs and restore their weights.

        hp: hyperparameter/config object.
        model_type: 't2m' for the plain Text2Mel graph, 'unsup' for the
            unsupervised-style variant; also passed through to the restore
            helpers so the matching checkpoints are found.
        t2m_epoch / ssrn_epoch: -1 restores the latest checkpoint; any other
            value restores that archived epoch.
        """
        self.t2m_epoch = t2m_epoch
        self.ssrn_epoch = ssrn_epoch
        self.hp = hp

        # Graph 1: text -> mel, flavour selected by model_type.
        if model_type == 't2m':
            self.g1 = Text2MelGraph(hp, mode="synthesize")
            print("Graph 1 (t2m) loaded")
        elif model_type == 'unsup':
            self.g1 = Graph_style_unsupervised(hp, mode="synthesize")
            print("Graph 1 (unsup) loaded")

        # Graph 2: mel -> magnitude spectrogram.
        self.g2 = SSRNGraph(hp, mode="synthesize")
        print("Graph 2 (ssrn) loaded")

        self.sess = tf.Session()
        self.sess.run(tf.global_variables_initializer())

        ### TODO: specify epoch from comm line?
        ### TODO: t2m and ssrn from separate configs?

        # Restore graph 1: archived epoch when requested, else latest;
        # in the latter case record which epoch was actually loaded.
        if t2m_epoch > -1:
            restore_archived_model_parameters(self.sess, hp, model_type, t2m_epoch)
        else:
            self.t2m_epoch = restore_latest_model_parameters(self.sess, hp, model_type)

        # Restore the SSRN with the same policy.
        if ssrn_epoch > -1:
            restore_archived_model_parameters(self.sess, hp, 'ssrn', ssrn_epoch)
        else:
            self.ssrn_epoch = restore_latest_model_parameters(self.sess, hp, 'ssrn')
예제 #2
0
def synthesize(hp, speaker_id='', num_sentences=0, ncores=1, topoutdir='', t2m_epoch=-1, ssrn_epoch=-1):
    '''
    Run the full synthesis pipeline over the test transcript:
    text -> mel (Text2Mel), mel -> magnitude spectrogram (SSRN),
    spectrogram -> wav (griffin_lim or world vocoder).

    topoutdir: store samples under here; defaults to hp.sampledir
    speaker_id: if given, must be in hp.speaker_list; conditions the graph.
    num_sentences: synthesize only the first N utterances; 0 means all.
    ncores: worker processes for waveform generation (1 = sequential).
    t2m_epoch and ssrn_epoch: default -1 means use latest. Otherwise go to archived models.
    '''
    assert hp.vocoder in ['griffin_lim', 'world'], 'Other vocoders than griffin_lim/world not yet supported'

    dataset = load_data(hp, mode="synthesis") #since mode != 'train' or 'validation', will load test_transcript rather than transcript
    fpaths, L = dataset['fpaths'], dataset['texts']
    position_in_phone_data = duration_data = labels = None # default
    if hp.use_external_durations:
        duration_data = dataset['durations']
        if num_sentences > 0:
            duration_data = duration_data[:num_sentences, :, :]

    if 'position_in_phone' in hp.history_type:
        ## TODO: combine + deduplicate with relevant code in train.py for making validation set
        def duration2position(duration, fractional=False):
            ### very roundabout -- need to deflate A matrix back to integers:
            duration = duration.sum(axis=0)
            positions = durations_to_position(duration, fractional=fractional)
            positions = positions[0::hp.r, :]  # subsample to the reduced (hp.r) frame rate
            return positions

        position_in_phone_data = [duration2position(dur, fractional=('fractional' in hp.history_type)) \
                        for dur in duration_data]
        position_in_phone_data = list2batch(position_in_phone_data, hp.max_T)

    # Ensure we aren't trying to generate more utterances than are actually in our test_transcript
    if num_sentences > 0:
        # <= (not <): requesting exactly all utterances in the transcript is valid.
        assert num_sentences <= len(fpaths)
        L = L[:num_sentences, :]
        fpaths = fpaths[:num_sentences]

    bases = [basename(fpath) for fpath in fpaths]

    if hp.merlin_label_dir:
        labels = [np.load("{}/{}".format(hp.merlin_label_dir, basename(fpath)+".npy")) \
                              for fpath in fpaths ]
        labels = list2batch(labels, hp.max_N)

    if speaker_id:
        speaker2ix = dict(zip(hp.speaker_list, range(len(hp.speaker_list))))
        speaker_ix = speaker2ix[speaker_id]

        ## Speaker codes are held in (batch, 1) matrix -- tiling is done inside the graph:
        speaker_data = np.ones((len(L), 1))  *  speaker_ix
    else:
        speaker_data = None

    # Load graph
    ## TODO: generalise to combine other types of models into a synthesis pipeline?
    g1 = Text2MelGraph(hp, mode="synthesize"); print("Graph 1 (t2m) loaded")
    g2 = SSRNGraph(hp, mode="synthesize"); print("Graph 2 (ssrn) loaded")

    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())

        ### TODO: specify epoch from comm line?
        ### TODO: t2m and ssrn from separate configs?

        # Restore archived epochs when requested, otherwise latest checkpoints
        # (and record which epoch was actually loaded, for the output dir name).
        if t2m_epoch > -1:
            restore_archived_model_parameters(sess, hp, 't2m', t2m_epoch)
        else:
            t2m_epoch = restore_latest_model_parameters(sess, hp, 't2m')

        if ssrn_epoch > -1:
            restore_archived_model_parameters(sess, hp, 'ssrn', ssrn_epoch)
        else:
            ssrn_epoch = restore_latest_model_parameters(sess, hp, 'ssrn')

        # Pass input L through Text2Mel Graph
        t = start_clock('Text2Mel generating...')
        ### TODO: after futher efficiency testing, remove this fork
        if 1:  ### efficient route -- only make K&V once  ## 3.86, 3.70, 3.80 seconds (2 sentences)
            text_lengths = get_text_lengths(L)
            K, V = encode_text(hp, L, g1, sess, speaker_data=speaker_data, labels=labels)
            Y, lengths, alignments = synth_codedtext2mel(hp, K, V, text_lengths, g1, sess, \
                                speaker_data=speaker_data, duration_data=duration_data, \
                                position_in_phone_data=position_in_phone_data,\
                                labels=labels)
        else: ## 5.68, 5.43, 5.38 seconds (2 sentences)
            Y, lengths = synth_text2mel(hp, L, g1, sess, speaker_data=speaker_data, \
                                            duration_data=duration_data, \
                                            position_in_phone_data=position_in_phone_data, \
                                            labels=labels)
        stop_clock(t)

        # Then pass output Y of Text2Mel Graph through SSRN graph to get high res spectrogram Z.
        t = start_clock('Mel2Mag generating...')
        Z = synth_mel2mag(hp, Y, g2, sess)
        stop_clock(t)

        if (np.isnan(Z).any()):  ### TODO: keep?
            Z = np.nan_to_num(Z)

        # Generate wav files
        if not topoutdir:
            topoutdir = hp.sampledir
        outdir = os.path.join(topoutdir, 't2m%s_ssrn%s'%(t2m_epoch, ssrn_epoch))
        if speaker_id:
            outdir += '_speaker-%s'%(speaker_id)
        safe_makedir(outdir)
        print("Generating wav files, will save to following dir: %s"%(outdir))

        assert hp.vocoder in ['griffin_lim', 'world'], 'Other vocoders than griffin_lim/world not yet supported'

        if ncores==1:
            for i, mag in tqdm(enumerate(Z)):
                outfile = os.path.join(outdir, bases[i] + '.wav')
                mag = mag[:lengths[i]*hp.r,:]  ### trim to generated length
                synth_wave(hp, mag, outfile)
        else:
            # Fan waveform generation out over worker processes.
            executor = ProcessPoolExecutor(max_workers=ncores)
            futures = []
            for i, mag in tqdm(enumerate(Z)):
                outfile = os.path.join(outdir, bases[i] + '.wav')
                mag = mag[:lengths[i]*hp.r,:]  ### trim to generated length
                futures.append(executor.submit(synth_wave, hp, mag, outfile))
            proc_list = [future.result() for future in tqdm(futures)]

        # Plot attention alignments
        # NOTE(review): with the default num_sentences=0 ("all"), no alignment
        # plots are produced at all -- confirm that is intended.
        for i in range(num_sentences):
            plot_alignment(hp, alignments[i], utt_idx=i+1, t2m_epoch=t2m_epoch, dir=outdir)
예제 #3
0
def main_work():
    """Command-line driver for embedding analysis and interactive TTS tasks.

    Tasks ('-t'):
      compute_codes              -- run the t2m/unsup graph over a data set and
                                    save the resulting style embeddings
      reduce_codes               -- dimensionality-reduce saved embeddings
                                    (method chosen with '-r': pca/tsne/umap)
      compute_opensmile_features -- extract openSMILE acoustic features
      show_plot                  -- scatter-plot reduced embeddings
      ICE_TTS                    -- PyQt interactive embedding interface
      ICE_TTS_server             -- web-server interactive embedding interface
      acoustic_analysis          -- regress openSMILE features against the
                                    embeddings; save correlation tables and
                                    scatter/heatmap plots under results/<config>
    """
    # ============= Process command line ============
    a = ArgumentParser()
    a.add_argument('-c', dest='config', required=True, type=str)
    a.add_argument('-m',
                   dest='model_type',
                   required=True,
                   choices=['t2m', 'unsup'])
    a.add_argument('-t',
                   dest='task',
                   required=True,
                   choices=[
                       'acoustic_analysis', 'compute_codes', 'reduce_codes',
                       'compute_opensmile_features', 'show_plot', 'ICE_TTS',
                       'ICE_TTS_server'
                   ])
    a.add_argument('-r',
                   dest='reduction_method',
                   required=False,
                   choices=['pca', 'tsne', 'umap'])
    a.add_argument('-p', dest='port', required=False, type=int, default=5000)
    a.add_argument('-s', dest='set', required=False, type=str, default='train')
    opts = a.parse_args()
    print('opts')
    print(opts)
    # ===============================================
    model_type = opts.model_type
    method = opts.reduction_method
    hp = load_config(opts.config)
    logdir = hp.logdir + "-" + model_type
    port = opts.port

    mode = opts.set

    # e.g. 'configs/blizzard.cfg' -> 'blizzard'
    config_name = opts.config.split('/')[-1].split('.')[0]

    logger_setup.logger_setup(logdir)
    info('Command line: %s' % (" ".join(sys.argv)))
    print(logdir)
    task = opts.task
    if task == 'compute_codes':
        # Build the requested graph flavour and dump its embeddings to disk.
        if model_type == 't2m':
            g = Text2MelGraph(hp, mode="synthesize")
            print("Graph 1 (t2m) loaded")
        elif model_type == 'unsup':
            g = Graph_style_unsupervised(hp, mode="synthesize")
            print("Graph 1 (unsup) loaded")
        codes = compute_unsupervised_embeddings(hp, g, model_type, mode=mode)
        save_embeddings(codes, logdir, mode=mode)
    elif task == 'reduce_codes':
        try:
            embed = load_embeddings(logdir, mode=mode)[:, 0, :]
        except IndexError:  # I may have changed the shape of the matrix ...
            embed = load_embeddings(logdir, mode=mode)
        model, results = embeddings_reduction(embed, method=method)
        save_embeddings(results,
                        logdir,
                        filename='emo_codes_' + method,
                        mode=mode)
        save(model, logdir, filename='code_reduction_model_' + method)
    elif task == 'compute_opensmile_features':
        compute_opensmile_features(hp, audio_extension='.wav', mode=mode)
    elif task == 'show_plot':
        embed = load_embeddings(logdir, filename='emo_codes_' + method)
        scatter_plot(embed)
    elif task == 'ICE_TTS':
        from interface import ICE_TTS
        embed = load_embeddings(logdir)[:, 0, :]
        embed_reduc = load_embeddings(logdir, filename='emo_codes_' + method)
        from PyQt5.QtWidgets import QApplication
        app = QApplication(sys.argv)
        ice = ICE_TTS(hp, embed_reduc, embed)
        ice.show()
        sys.exit(app.exec_())
    elif task == 'ICE_TTS_server':

        from server.ice_tts_server import ICE_TTS_server
        try:
            embed = load_embeddings(logdir, mode=mode)[:, 0, :]
        except IndexError:  # I may have changed the shape of the matrix ...
            embed = load_embeddings(logdir, mode=mode)

        print('Loading embeddings')
        embed_reduc = load_embeddings(logdir,
                                      filename='emo_codes_' + method,
                                      mode=mode)

        from itertools import product
        # Build a 100x100 grid spanning the training PCA space and invert it
        # back to code space so the server can synthesize from any grid point.
        train_codes_pca = np.load(
            os.path.join(logdir, 'emo_codes_pca_train.npy'))

        pca_model = pickle.load(
            open(os.path.join(logdir, 'code_reduction_model_pca.pkl'), 'rb'))
        min_xy = train_codes_pca.min(axis=0)
        max_xy = train_codes_pca.max(axis=0)
        xs = np.mgrid[min_xy[0]:max_xy[0]:100j]
        ys = np.mgrid[min_xy[1]:max_xy[1]:100j]
        X = np.array(list(product(xs, ys)))
        codes = pca_model.inverse_transform(X)

        print('Loading emo cats')
        emo_cats = get_emo_cats(hp)
        ice = ICE_TTS_server(hp,
                             X,
                             codes,
                             emo_cats,
                             model_type=model_type,
                             port=port)

    elif task == 'acoustic_analysis':

        directory = 'results/' + config_name
        if not os.path.exists(directory):
            os.makedirs(directory)

        import seaborn as sns
        import matplotlib.pyplot as plt

        print('MODE', mode)
        try:
            embed = load_embeddings(logdir, mode=mode)[:, 0, :]
            embed_valid = load_embeddings(logdir, mode='validation')[:, 0, :]
        except IndexError:  # I may have changed the shape of the matrix ...
            embed = load_embeddings(logdir, mode=mode)
            embed_valid = load_embeddings(logdir, mode='validation')

        conf_name = 'eGeMAPSv01a'
        feature_path = os.path.join(hp.featuredir, 'opensmile_features',
                                    conf_name, 'feat_df_' + mode + '.csv')
        feat_df = pd.read_csv(feature_path)
        feat_df = feat_df.drop(columns=['Unnamed: 0'])

        feature_path = os.path.join(hp.featuredir, 'opensmile_features',
                                    conf_name,
                                    'feat_df_' + 'validation' + '.csv')
        feat_df_valid = pd.read_csv(feature_path)
        feat_df_valid = feat_df_valid.drop(columns=['Unnamed: 0'])

        feat_df = abbridge_column_names(feat_df)
        feat_df_valid = abbridge_column_names(feat_df_valid)

        # Mean normalization (with same mean and variance computed from training data).
        # Capture the raw training stats BEFORE normalizing feat_df; otherwise the
        # validation set gets scaled by the already-normalized (~0 mean, ~1 std)
        # statistics, contradicting the stated intent.
        train_mean = feat_df.mean()
        train_std = feat_df.std()
        feat_df = (feat_df - train_mean) / train_std
        feat_df_valid = (feat_df_valid - train_mean) / train_std

        # Linear regression of features on embeddings; correlations are measured
        # on the held-out validation set.
        model, coeff_df = regression_feat_embed(pd.DataFrame(embed), feat_df)
        corrs_embed_df = test_regression(model, pd.DataFrame(embed_valid),
                                         feat_df_valid)
        print('Correlations:')
        print(corrs_embed_df.sort_values(0)[::-1][:20])
        corrs_embed_df.sort_values(0)[::-1][:20].to_csv(directory +
                                                        '/correlations.csv')

        selected = select_features(corrs_embed_df,
                                   feat_df_valid,
                                   intra_corr_thresh=0.7,
                                   corr_thresh=0.3)
        # Strip openSMILE name boilerplate for a compact LaTeX table.
        print(selected.to_latex().replace(r'\_sma3', ' ').replace(
            'nz',
            '').replace(r'\_',
                        '').replace('amean',
                                    'mean').replace('semitoneFrom27.5Hz', ''))
        selected.to_csv(directory + '/selected_correlations.csv')

        # Same analysis on the dimensionality-reduced embeddings.
        embed_reduc = load_embeddings(logdir,
                                      filename='emo_codes_' + method,
                                      mode=mode)
        embed_reduc_valid = load_embeddings(logdir,
                                            filename='emo_codes_' + method,
                                            mode='validation')

        model_reduc, coeff_reduc_df = regression_feat_embed(
            pd.DataFrame(embed_reduc), feat_df)
        corrs_embed_reduc_df = test_regression(model_reduc,
                                               pd.DataFrame(embed_reduc_valid),
                                               feat_df_valid)
        print('Correlations:')
        print(corrs_embed_reduc_df.sort_values(0)[::-1][:20])
        # Fixed copy-paste: save the *reduced* correlations (was corrs_embed_df).
        corrs_embed_reduc_df.sort_values(0)[::-1][:20].to_csv(
            directory + '/correlations_reduc.csv')

        selected_reduc = select_features(corrs_embed_reduc_df,
                                         feat_df_valid,
                                         intra_corr_thresh=0.7,
                                         corr_thresh=0.25)
        # Fixed copy-paste: print selected_reduc (was re-printing `selected`).
        print(selected_reduc.to_latex().replace(r'\_sma3', ' ').replace(
            'nz',
            '').replace(r'\_',
                        '').replace('amean',
                                    'mean').replace('semitoneFrom27.5Hz', ''))
        selected_reduc.to_csv(directory + '/selected_correlations_reduc.csv')

        # Per-feature scatter of true vs predicted values for selected features.
        feat_predictions_df = pd.DataFrame(model.predict(embed))
        feat_predictions_df.index = feat_df.index
        feat_predictions_df.columns = feat_df.columns

        h = 100
        selected_feats = selected.index.to_list()
        fig, axs = plt.subplots(nrows=len(selected),
                                ncols=1,
                                figsize=(h / len(selected) * 3, h))
        for i in range(len(selected)):
            x = feat_df[selected_feats[i]]
            y = feat_predictions_df[selected_feats[i]]
            axs[i].scatter(x, y, alpha=0.2)
        fig.savefig(directory + '/scatter_plots_feats.png')

        print('Gradients:')
        print(coeff_reduc_df)
        coeff_reduc_df.to_csv(directory + '/gradients.csv')

        # L2-normalize each gradient row so they plot as unit-length arrows.
        normalized_gradients = coeff_reduc_df.div(
            ((coeff_reduc_df**2).sum(axis=1))**0.5, axis=0)

        def _gradient_scatter(colour_col, outname):
            # Scatter of reduced embeddings coloured by one acoustic feature,
            # overlaid with the normalized regression gradients.
            plt.cla()
            plt.clf()
            plt.close()
            sc = scatter_plot(embed_reduc, c=feat_df[colour_col].values)
            plot_gradients(normalized_gradients,
                           selected_reduc,
                           ax=sc.get_figure().gca())
            sc.get_figure().savefig(directory + '/' + outname)

        _gradient_scatter('F0 mean', 'scatter_F0_mean_' + method + '.png')
        _gradient_scatter('F0 percentile50.0',
                          'scatter_F0_percentile50.0_' + method + '.png')
        print(feat_df.columns)
        _gradient_scatter('F3amplitudeLogRelF0 stdNorm',
                          'scatter_F3amplitudeLogRelF0_stdNorm_' + method + '.png')
        _gradient_scatter('stdVoicedSegmentLengthSec',
                          'scatter_stdVoicedSegmentLengthSec_' + method + '.png')

        plt.cla()
        plt.clf()
        plt.close()
        hist = sns.distplot(feat_df['F0 mean'])
        hist.get_figure().savefig(directory + '/hist_F0_mean_' + method +
                                  '.png')

        # Plot corrs heatmaps
        plt.close()
        corrs_heatmap_feats = sns.heatmap(feat_df.corr().abs(),
                                          xticklabels=False)
        corrs_heatmap_feats.get_figure().savefig(directory +
                                                 '/corrs_heatmap_feats.pdf',
                                                 bbox_inches='tight')

        plt.close()
        embed_corr = pd.DataFrame(embed).corr().abs()
        embed_corr_heatmap = sns.heatmap(embed_corr)
        embed_corr_heatmap.get_figure().savefig(directory +
                                                '/embed_corr_heatmap.pdf',
                                                bbox_inches='tight')

        plt.close()
        corr_feat_embed = pd.concat([pd.DataFrame(embed), feat_df],
                                    axis=1).corr().abs()
        sns.set(font_scale=0.2)
        corr_feat_embed_heatmap = sns.heatmap(corr_feat_embed,
                                              xticklabels=False)
        corr_feat_embed_heatmap.get_figure().savefig(
            directory + '/corr_feat_embed_heatmap.pdf', bbox_inches='tight')

    else:
        print('Wrong task, does not exist')
def main_work():
    """Command-line driver for embedding/feature analysis tasks.

    Tasks ('-t'): compute_codes, reduce_codes, compute_opensmile_features,
    show_plot, ICE_TTS, ICE_TTS_server, compute_gradients. Operates on the
    'validation' set only (hard-coded below).

    NOTE(review): this is a second definition of main_work() in this file and
    shadows the earlier one at import time -- confirm which is intended.
    """

    # ============= Process command line ============
    a = ArgumentParser()
    a.add_argument('-c', dest='config', required=True, type=str)
    a.add_argument('-m',
                   dest='model_type',
                   required=True,
                   choices=['t2m', 'unsup'])
    a.add_argument('-t',
                   dest='task',
                   required=True,
                   choices=[
                       'compute_gradients', 'compute_codes', 'reduce_codes',
                       'compute_opensmile_features', 'show_plot', 'ICE_TTS',
                       'ICE_TTS_server'
                   ])
    a.add_argument('-r',
                   dest='reduction_method',
                   required=False,
                   choices=['pca', 'tsne', 'umap'])
    a.add_argument('-p', dest='port', required=False, type=int, default=5000)
    opts = a.parse_args()
    print('opts')
    print(opts)
    # ===============================================
    model_type = opts.model_type
    method = opts.reduction_method
    hp = load_config(opts.config)
    logdir = hp.logdir + "-" + model_type
    port = opts.port

    # Data set is fixed here (no '-s' flag in this variant of the driver).
    mode = 'validation'
    logger_setup.logger_setup(logdir)
    info('Command line: %s' % (" ".join(sys.argv)))
    print(logdir)
    task = opts.task
    if task == 'compute_codes':
        # Build the requested graph flavour and save its style embeddings.
        if model_type == 't2m':
            g = Text2MelGraph(hp, mode="synthesize")
            print("Graph 1 (t2m) loaded")
        elif model_type == 'unsup':
            g = Graph_style_unsupervised(hp, mode="synthesize")
            print("Graph 1 (unsup) loaded")
        codes = compute_unsupervised_embeddings(hp, g, model_type, mode=mode)
        save_embeddings(codes, logdir, mode=mode)
        #emo_cats=get_emo_cats(hp)
        #save(emo_cats, logdir, filename='emo_cats')
    elif task == 'reduce_codes':
        # Embeddings may be saved as (N, 1, D) or (N, D) depending on version.
        try:
            embed = load_embeddings(logdir, mode=mode)[:, 0, :]
        except IndexError:  # I may have changed the shape of the matrix ...
            embed = load_embeddings(logdir, mode=mode)
        #import pdb;pdb.set_trace()
        model, results = embeddings_reduction(embed, method=method)
        save_embeddings(results,
                        logdir,
                        filename='emo_codes_' + method,
                        mode=mode)
        save(model, logdir, filename='code_reduction_model_' + method)
    elif task == 'compute_opensmile_features':
        compute_opensmile_features(hp, audio_extension='.wav', mode=mode)
    elif task == 'show_plot':
        embed = load_embeddings(logdir, filename='emo_codes_' + method)
        scatter_plot(embed)
    elif task == 'ICE_TTS':
        # PyQt interactive interface; blocks in the Qt event loop until closed.
        from interface import ICE_TTS
        embed = load_embeddings(logdir)[:, 0, :]
        embed_reduc = load_embeddings(logdir, filename='emo_codes_' + method)
        from PyQt5.QtWidgets import QApplication
        app = QApplication(sys.argv)
        ice = ICE_TTS(hp, embed_reduc, embed)
        ice.show()
        sys.exit(app.exec_())
    elif task == 'ICE_TTS_server':

        from server.ice_tts_server import ICE_TTS_server
        try:
            embed = load_embeddings(logdir, mode=mode)[:, 0, :]
        except IndexError:  # I may have changed the shape of the matrix ...
            embed = load_embeddings(logdir, mode=mode)

        print('Loading embeddings')
        embed_reduc = load_embeddings(logdir, filename='emo_codes_' + method)
        print('Loading emo cats')
        emo_cats = get_emo_cats(hp)
        #emo_cats=load(logdir, filename='emo_cats')
        #import pdb;pdb.set_trace()
        ice = ICE_TTS_server(hp,
                             embed_reduc,
                             embed,
                             emo_cats,
                             model_type=model_type,
                             port=port)
        #ice=ICE_TTS_server(hp, embed_reduc, embed, model_type=model_type)
        #ice=ICE_TTS_server(hp, embed_reduc, embed, n_polar_axes=4, model_type=model_type)

    elif task == 'compute_gradients':
        import seaborn as sns
        print('MODE', mode)
        try:
            embed = load_embeddings(logdir, mode=mode)[:, 0, :]
        except IndexError:  # I may have changed the shape of the matrix ...
            embed = load_embeddings(logdir, mode=mode)

        conf_name = 'eGeMAPSv01a'
        feature_path = os.path.join(hp.featuredir, 'opensmile_features',
                                    conf_name, 'feat_df_' + mode + '.csv')
        feat_df = pd.read_csv(feature_path)
        feat_df = feat_df.drop(columns=['Unnamed: 0'])

        # NOTE(review): regression_feat_embed is unpacked here as
        # (correlations, coefficients), but elsewhere in this file as
        # (model, coefficients) -- confirm which API version is current.
        corrs_embed_df, coeff_df = regression_feat_embed(
            pd.DataFrame(embed), feat_df)
        print('Correlations:')
        #print(corrs_embed_df)
        # print('Gradients:')
        # print(coeff_df)
        # corrs_heatmap=sns.heatmap(feat_df.corr())
        # corrs_heatmap.get_figure().savefig('corrs_heatmap.png')

        print(corrs_embed_df.sort_values(0)[::-1][:20])

        #method='pca'

        # Repeat the regression on the dimensionality-reduced embeddings.
        embed_reduc = load_embeddings(logdir,
                                      filename='emo_codes_' + method,
                                      mode=mode)

        corrs_embed_reduc_df, coeff_reduc_df = regression_feat_embed(
            pd.DataFrame(embed_reduc), feat_df)
        print('Correlations:')
        #print(corrs_embed_reduc_df)
        #print('Gradients:')
        #print(coeff_reduc_df)

        print(corrs_embed_reduc_df.sort_values(0)[::-1][:20])

        #sc=scatter_plot(embed_reduc, c=feat_df['F0semitoneFrom27.5Hz_sma3nz_amean'].values)
        #sc.get_figure().savefig('scatter_'+method+'.png')

        # Mutual-information regression as a nonlinear complement to the
        # linear correlations above.
        mi = mi_regression_feat_embed(pd.DataFrame(embed_reduc), feat_df)

        print('mi', mi.sort_values(0)[::-1][:20])
        print('mi', mi.sort_values(1)[::-1][:20])

    else:
        print('Wrong task, does not exist')
예제 #5
0
def synthesize(hp, speaker_id='', num_sentences=0, ncores=1, topoutdir='', t2m_epoch=-1, ssrn_epoch=-1):
    '''
    Run the full synthesis pipeline over the test transcript:
    text -> Text2Mel (coarse mels + attention) -> SSRN (full-res magnitudes) -> vocoder -> wav.

    topoutdir: store samples under here; defaults to hp.sampledir
    t2m_epoch and ssrn_epoch: default -1 means use latest. Otherwise go to archived models.
    speaker_id: if non-empty, synthesize with this speaker's code (must be in hp.speaker_list).
    num_sentences: if > 0, synthesize only the first num_sentences test utterances.
    ncores: number of worker processes for waveform generation (1 = serial).
    '''
    assert hp.vocoder in ['griffin_lim', 'world'], 'Other vocoders than griffin_lim/world not yet supported'

    # mode="synthesis" (i.e. not 'train'/'validation') loads test_transcript rather than transcript
    dataset = load_data(hp, mode="synthesis")
    fpaths, L = dataset['fpaths'], dataset['texts']
    position_in_phone_data = duration_data = labels = None  # defaults
    if hp.use_external_durations:
        duration_data = dataset['durations']
        if num_sentences > 0:
            duration_data = duration_data[:num_sentences, :, :]

    if 'position_in_phone' in hp.history_type:
        ## TODO: combine + deduplicate with relevant code in train.py for making validation set
        def duration2position(duration, fractional=False):
            # Convert a (one-hot-inflated) duration matrix to per-frame
            # position-in-phone features, downsampled by the reduction factor r.
            ### very roundabout -- need to deflate A matrix back to integers:
            duration = duration.sum(axis=0)
            positions = durations_to_position(duration, fractional=fractional)
            ###positions = end_pad_for_reduction_shape_sync(positions, hp)
            positions = positions[0::hp.r, :]
            return positions

        position_in_phone_data = [duration2position(dur, fractional=('fractional' in hp.history_type)) \
                        for dur in duration_data]
        position_in_phone_data = list2batch(position_in_phone_data, hp.max_T)

    # Ensure we aren't trying to generate more utterances than are actually in our test_transcript
    if num_sentences > 0:
        assert num_sentences <= len(fpaths)
        L = L[:num_sentences, :]
        fpaths = fpaths[:num_sentences]

    bases = [basename(fpath) for fpath in fpaths]

    if hp.merlin_label_dir:
        # Load externally produced (Merlin-style) linguistic labels, one .npy per utterance.
        labels = []
        for fpath in fpaths:
            label = np.load("{}/{}".format(hp.merlin_label_dir, basename(fpath)+".npy"))
            if hp.select_central:
                central_ind = get_labels_indices(hp.merlin_lab_dim)
                label = label[:,central_ind==1]
            labels.append(label)

        labels = list2batch(labels, hp.max_N)

    if speaker_id:
        speaker2ix = dict(zip(hp.speaker_list, range(len(hp.speaker_list))))
        speaker_ix = speaker2ix[speaker_id]

        ## Speaker codes are held in (batch, 1) matrix -- tiling is done inside the graph:
        speaker_data = np.ones((len(L), 1))  *  speaker_ix
    else:
        speaker_data = None

    if hp.turn_off_monotonic_for_synthesis: # if FIA mechanism is turned off
        text_lengths = get_text_lengths(L)
        hp.text_lengths = text_lengths + 1

    # Load graphs
    ## TODO: generalise to combine other types of models into a synthesis pipeline?
    g1 = Text2MelGraph(hp, mode="synthesize"); print("Graph 1 (t2m) loaded")

    # The t2m and ssrn models may have been trained with different normalisation /
    # optimiser settings: temporarily switch hp to the ssrn ('layer' norm) values
    # before building the SSRN graph, then restore the t2m values afterwards.
    if hp.norm is None:  # fixed: was `== None` (PEP 8 E711; `is` avoids __eq__ surprises)
        t2m_layer_norm = False
        hp.norm = 'layer'
        hp.lr = 0.001
        hp.beta1 = 0.9
        hp.beta2 = 0.999
        hp.epsilon = 0.00000001
        hp.decay_lr = True
        hp.batchsize = {'t2m': 32, 'ssrn': 8}
    else:
        t2m_layer_norm = True

    g2 = SSRNGraph(hp, mode="synthesize"); print("Graph 2 (ssrn) loaded")

    if not t2m_layer_norm:
        # Restore the original (non-layer-norm) t2m hyperparameter values.
        hp.norm = None
        hp.lr = 0.0002
        hp.beta1 = 0.5
        hp.beta2 = 0.9
        hp.epsilon = 0.000001
        hp.decay_lr = False
        hp.batchsize = {'t2m': 16, 'ssrn': 8}

    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())

        ### TODO: specify epoch from comm line?
        ### TODO: t2m and ssrn from separate configs?

        # Restore an archived epoch if requested (> -1), otherwise the latest checkpoint.
        if t2m_epoch > -1:
            restore_archived_model_parameters(sess, hp, 't2m', t2m_epoch)
        else:
            t2m_epoch = restore_latest_model_parameters(sess, hp, 't2m')

        if ssrn_epoch > -1:
            restore_archived_model_parameters(sess, hp, 'ssrn', ssrn_epoch)
        else:
            ssrn_epoch = restore_latest_model_parameters(sess, hp, 'ssrn')

        # Pass input L through Text2Mel graph. Efficient route: encode the text to
        # keys/values once, then decode. (The slower synth_text2mel fork that the old
        # TODO flagged for removal has been dropped; it also failed to produce the
        # `alignments` needed by the plotting loop below.)
        t = start_clock('Text2Mel generating...')
        text_lengths = get_text_lengths(L)
        K, V = encode_text(hp, L, g1, sess, speaker_data=speaker_data, labels=labels)
        Y, lengths, alignments = synth_codedtext2mel(hp, K, V, text_lengths, g1, sess, \
                            speaker_data=speaker_data, duration_data=duration_data, \
                            position_in_phone_data=position_in_phone_data,\
                            labels=labels)
        stop_clock(t)

        # Then pass output Y of Text2Mel Graph through SSRN graph to get high res spectrogram Z.
        t = start_clock('Mel2Mag generating...')
        Z = synth_mel2mag(hp, Y, g2, sess)
        stop_clock(t)

        if (np.isnan(Z).any()):  ### TODO: keep?
            Z = np.nan_to_num(Z)

        # Generate wav files
        if not topoutdir:
            topoutdir = hp.sampledir
        outdir = os.path.join(topoutdir, 't2m%s_ssrn%s'%(t2m_epoch, ssrn_epoch))
        if speaker_id:
            outdir += '_speaker-%s'%(speaker_id)
        safe_makedir(outdir)

        # Plot trimmed attention alignment with filename, and print per-utterance
        # attention diagnostics (CDP, AP-in).
        print("Plot attention, will save to following dir: %s"%(outdir))
        print("File |  CDP | Ain")
        for i, mag in enumerate(Z):
            outfile = os.path.join(outdir, bases[i])
            trimmed_alignment = alignments[i,:text_lengths[i],:lengths[i]]
            plot_alignment(hp, trimmed_alignment, utt_idx=i+1, t2m_epoch=t2m_epoch, dir=outdir, outfile=outfile)
            CDP = getCDP(trimmed_alignment)
            APin, APout = getAP(trimmed_alignment)
            print("%s | %.2f | %.2f"%( bases[i], CDP, APin))

        print("Generating wav files, will save to following dir: %s"%(outdir))

        # (Duplicate vocoder assert removed -- already checked at function entry.)
        if ncores==1:
            for i, mag in tqdm(enumerate(Z)):
                outfile = os.path.join(outdir, bases[i] + '.wav')
                mag = mag[:lengths[i]*hp.r,:]  ### trim to generated length
                synth_wave(hp, mag, outfile)
        else:
            # Context manager guarantees worker processes are shut down even on error;
            # the .result() calls below already awaited completion, so behavior is unchanged.
            with ProcessPoolExecutor(max_workers=ncores) as executor:
                futures = []
                for i, mag in tqdm(enumerate(Z)):
                    outfile = os.path.join(outdir, bases[i] + '.wav')
                    mag = mag[:lengths[i]*hp.r,:]  ### trim to generated length
                    futures.append(executor.submit(synth_wave, hp, mag, outfile))
                proc_list = [future.result() for future in tqdm(futures)]