def process_merlin_label(bin_label_fname,
                         text_lab_dir,
                         phonedim=416,
                         subphonedim=9):

    text_label = os.path.join(text_lab_dir, basename(bin_label_fname) + '.lab')
    assert os.path.isfile(
        text_label), 'No text file for %s ' % (basename(bin_label_fname))

    labfrombin = get_speech(bin_label_fname, phonedim + subphonedim)

    ## fraction through phone (forwards)
    fraction_through_phone_forwards = labfrombin[:, -1]

    ## This is a surprisingly noisy signal which never seems to start at 0.0! Find minima:-
    (minima, ) = argrelextrema(fraction_through_phone_forwards, np.less)

    ## first frame is always a start:
    minima = np.insert(minima, 0, 0)

    ## check size against text file:
    labfromtext = merlin_state_label_to_phone(text_label)
    assert labfromtext.shape[0] == minima.shape[0]

    lab = labfrombin[minima, :-subphonedim]  ## discard frame-level feats, and take first frame of each phone

    return lab
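
## Sketch: get_speech() used above is defined elsewhere in the repository. Assuming the usual
## Merlin convention of headerless float32 binary feature files (an assumption -- the real
## helper may differ), a minimal version could be:
import numpy as np

def get_speech(fname, dim):
    ## Read a flat float32 binary file and reshape it to (frames, dim).
    data = np.fromfile(fname, dtype=np.float32)
    assert data.size % dim == 0, 'size of %s is not a multiple of dim %s' % (fname, dim)
    return data.reshape(-1, dim)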
Example #2
def process(fpath, worlddir='', outdir='', scaler=''):
    assert scaler
    speech = load_sentence(fpath, worlddir=worlddir, outdir=outdir)
    norm_speech = standardise_acoustics(speech, scaler)
    np.save('%s/full_world/%s' % (outdir, basename(fpath)),
            norm_speech.astype(np.float32))
    np.save('%s/coarse_world/%s' % (outdir, basename(fpath)),
            norm_speech[::4, :].astype(np.float32))
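
## Sketch: standardise_acoustics() is not shown in this listing. If the scaler passed in is a
## fitted sklearn scaler (an assumption), a minimal version would simply delegate to it:
def standardise_acoustics(speech, scaler):
    ## Apply per-dimension normalisation with a previously fitted scaler.
    return scaler.transform(speech)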
Example #3
def load_attention(fpath):
    try:
        attention_guide_file = "{}/{}".format(
            hp.attention_guide_dir,
            basename(fpath) + ".npy")
    except TypeError:
        attention_guide_file = "{}/{}".format(
            hp.attention_guide_dir,
            basename(fpath.decode('utf-8')) + ".npy")
    attention_guide = read_floats_from_8bit(attention_guide_file)
    return fpath, attention_guide
Example #4
def load_merlin_label(fpath):
    try:
        label_file = "{}/{}".format(hp.merlin_label_dir,
                                    basename(fpath) + ".npy")
    except TypeError:
        label_file = "{}/{}".format(
            hp.merlin_label_dir,
            basename(fpath.decode('utf-8')) + ".npy")
    label = np.load(label_file)  ## TODO: could use read_floats_from_8bit format
    return fpath, label
def main_work():

    #################################################

    # ============= Process command line ============

    a = ArgumentParser()

    a.add_argument('-b', dest='binlabdir', required=True)
    a.add_argument('-t', dest='text_lab_dir', required=True)
    a.add_argument('-n', dest='norm_info_fname', required=True)
    a.add_argument('-o', dest='outdir', required=True)
    a.add_argument('-binext', dest='binext', required=False, default='lab')
    a.add_argument('-skipterminals', action='store_true', default=False)

    opts = a.parse_args()

    # ===============================================

    safe_makedir(opts.outdir)

    norm_info = get_speech(opts.norm_info_fname, 425)[:, :-9]
    data_min = norm_info[0, :]
    data_max = norm_info[1, :]
    data_range = data_max - data_min

    text_label_files = set(
        [basename(f) for f in glob.glob(opts.text_lab_dir + '/*.lab')])
    binary_label_files = sorted(glob.glob(opts.binlabdir + '/*.' +
                                          opts.binext))
    print(binary_label_files)
    for binlab in binary_label_files:
        base = basename(binlab)
        if base not in text_label_files:
            continue
        print(base)
        lab = process_merlin_label(binlab, opts.text_lab_dir)
        if opts.skipterminals:
            lab = lab[1:-1, :]  ## NB: don't remove the last 2 as in durations, as the final punct doesn't feature here
        norm_lab = minmax_norm(lab, data_min, data_max)

        if 0:  ## piano roll style plot:
            pl.imshow(norm_lab, interpolation='nearest')
            pl.gray()
            pl.savefig('/afs/inf.ed.ac.uk/user/o/owatts/temp/fig.pdf')
            sys.exit('abckdubv')

        np.save(opts.outdir + '/' + base, norm_lab)
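
## Sketch: minmax_norm() used above is defined elsewhere. Assuming the common Merlin-style
## mapping of each dimension from [data_min, data_max] to a fixed interval (the [0.01, 0.99]
## target range here is an assumption), it might look like:
import numpy as np

def minmax_norm(data, data_min, data_max, target_min=0.01, target_max=0.99):
    data_range = data_max - data_min
    data_range[data_range == 0.0] = 1.0  ## avoid division by zero for constant dimensions
    scaled = (data - data_min) / data_range
    return scaled * (target_max - target_min) + target_min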
Example #6
def copy_synth_SSRN_GL(hp, outdir):

    safe_makedir(outdir)

    dataset = load_data(hp, mode="synthesis") 
    fnames, texts = dataset['fpaths'], dataset['texts']
    bases = [basename(fname) for fname in fnames]
    mels = [np.load(os.path.join(hp.coarse_audio_dir, base + '.npy')) for base in bases]
    lengths = [a.shape[0] for a in mels]
    mels = list2batch(mels, 0)

    g = SSRNGraph(hp, mode="synthesize"); print("Graph (ssrn) loaded")

    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        ssrn_epoch = restore_latest_model_parameters(sess, hp, 'ssrn')

        print('Run SSRN...')
        Z = synth_mel2mag(hp, mels, g, sess)

        for i, mag in enumerate(Z):
            print("Working on %s"%(bases[i]))
            mag = mag[:lengths[i]*hp.r,:]  ### trim to generated length             
            wav = spectrogram2wav(hp, mag)
            soundfile.write(outdir + "/%s.wav" % (bases[i]), wav, hp.sr)
Example #7
def load_sentence(fpath, worlddir='', outdir=''):
    assert worlddir and outdir, 'worlddir and outdir must both be specified'
    mel = np.load(fpath)

    #print (mel.shape)

    base = basename(fpath)
    streams = []
    for (stream, dim) in [('lf0', 1), ('mgc', 60), ('bap', 1)]:
        fname = '%s/%s/%s.%s' % (worlddir, stream, base, stream)
        speech = get_speech(fname, dim)
        #print (fname)
        #print (speech.shape)
        if stream == 'lf0':
            speech, vuv = interpolate_through_unvoiced(speech)
            streams.extend([speech, vuv])
        else:
            streams.append(speech)
    composed = np.hstack(streams)

    target_frames, _ = mel.shape
    actual_frames, _ = composed.shape

    #print (target_frames, actual_frames)
    diff = target_frames - actual_frames
    if diff < 0:
        sys.exit('world features too short')
    elif diff > 0:
        composed = np.pad(composed, ((0, diff), (0, 0)), mode='constant')

    return composed
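
## Sketch: interpolate_through_unvoiced() is assumed to return the lf0 track with unvoiced
## gaps filled by interpolation, plus a voiced/unvoiced mask as a second stream. A minimal
## version under that assumption (the unvoiced marker value is also an assumption):
import numpy as np

def interpolate_through_unvoiced(lf0, unvoiced_value=-1.0e10):
    lf0 = lf0.reshape(-1)
    voiced = lf0 > unvoiced_value
    vuv = voiced.astype(np.float32).reshape(-1, 1)  ## 1.0 = voiced frame, 0.0 = unvoiced
    idx = np.arange(len(lf0))
    filled = np.interp(idx, idx[voiced], lf0[voiced])  ## linear interpolation across gaps
    return filled.reshape(-1, 1), vuv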
Example #8
def load_merlin_label(fpath):
    label_file = "{}/{}".format(hp.merlin_label_dir,
                                basename(fpath) + ".npy")
    label = np.load(label_file)  ## TODO: could use read_floats_from_8bit format
    return fpath, label
Example #9
def main_work():

    #################################################
      
    # ============= Process command line ============

    a = ArgumentParser()
    a.add_argument('-c', dest='config', required=True, type=str)
    a.add_argument('-speaker', default='', type=str)
    a.add_argument('-N', dest='num_sentences', default=0, type=int)
    a.add_argument('-babble', action='store_true')
    a.add_argument('-ncores', type=int, default=1, help='Number of CPUs for Griffin-Lim stage')
    a.add_argument('-odir', type=str, default='', help='Alternative place to put output samples')

    a.add_argument('-t2m_epoch', default=-1, type=int, help='Default: use latest (-1)')
    a.add_argument('-ssrn_epoch', default=-1, type=int, help='Default: use latest (-1)')
    
    opts = a.parse_args()
    
    # ===============================================
    hp = load_config(opts.config)
    
    outdir = opts.odir
    if outdir:
        outdir = os.path.join(outdir, basename(opts.config))

    if hp.multispeaker:
        assert opts.speaker, 'Please specify a speaker from speaker_list with -speaker flag'
        assert opts.speaker in hp.speaker_list

    if opts.babble:
        babble(hp, num_sentences=opts.num_sentences)
    else:
        synthesize(hp, speaker_id=opts.speaker, num_sentences=opts.num_sentences, \
                ncores=opts.ncores, topoutdir=outdir, t2m_epoch=opts.t2m_epoch, ssrn_epoch=opts.ssrn_epoch)
def main_work():

    #################################################

    # ============= Process command line ============

    a = ArgumentParser()

    a.add_argument('-b', dest='binlabdir', required=True)
    a.add_argument('-f', dest='audio_dir', required=True)
    a.add_argument('-n', dest='norm_info_fname', required=True)
    a.add_argument('-o', dest='outdir', required=True)
    a.add_argument('-binext', dest='binext', required=False, default='lab')

    a.add_argument('-ir', dest='inrate', type=float, default=5.0)
    a.add_argument('-or', dest='outrate', type=float, default=12.5)

    opts = a.parse_args()

    # ===============================================

    safe_makedir(opts.outdir)

    norm_info = get_speech(opts.norm_info_fname, 425)[:, -9:]
    data_min = norm_info[0, :]
    data_max = norm_info[1, :]
    data_range = data_max - data_min

    audio_files = set(
        [basename(f) for f in glob.glob(opts.audio_dir + '/*.npy')])
    binary_label_files = sorted(glob.glob(opts.binlabdir + '/*.' +
                                          opts.binext))

    for binlab in binary_label_files:
        base = basename(binlab)
        if base not in audio_files:
            continue
        print(base)
        positions = process_merlin_positions(binlab,
                                             opts.audio_dir,
                                             inrate=opts.inrate,
                                             outrate=opts.outrate)
        norm_positions = minmax_norm(positions, data_min, data_max)

        np.save(opts.outdir + '/' + base, norm_positions)
Example #11
def proc(fpath, text_length, hp):

    base = basename(fpath)
    melfile = hp.coarse_audio_dir + os.path.sep + base + '.npy'
    attfile = hp.attention_guide_dir + os.path.sep + base  # without '.npy'
    if not os.path.isfile(melfile):
        print('file %s not found' % (melfile))
        return
    speech_length = np.load(melfile).shape[0]
    att = get_attention_guide(text_length, speech_length, g=hp.g)
    save_floats_as_8bit(att, attfile)
Example #12
def load_merlin_label(fpath):
    label_file = "{}/{}".format(hp.merlin_label_dir,
                                basename(fpath) + ".npy")
    label = np.load(label_file)  ## TODO: could use read_floats_from_8bit format
    label = np.float32(label)
    if hp.select_central:
        central_ind = get_labels_indices(hp.merlin_lab_dim)
        label = label[:, central_ind == 1]
    assert label.shape[1] == hp.merlin_lab_dim
    return fpath, label
Example #13
def load_attention(fpath):
    attention_guide_file = "{}/{}".format(hp.attention_guide_dir,
                                          basename(fpath) + ".npy")
    if hp.attention_guide_fa:  # To use the MSE Attention loss with FA attention matrix
        attention_guide = np.load(attention_guide_file)
        attention_guide = np.transpose(attention_guide)  # FA attention is transposed
    else:
        attention_guide = read_floats_from_8bit(attention_guide_file)

    return fpath, attention_guide
Example #14
def copy_synth_GL(hp, outdir):

    safe_makedir(outdir)

    dataset = load_data(hp, mode="synthesis")
    fnames, texts = dataset['fpaths'], dataset['texts']
    bases = [basename(fname) for fname in fnames]

    for base in bases:
        print("Working on file %s" % (base))
        mag = np.load(os.path.join(hp.full_audio_dir, base + '.npy'))
        wav = spectrogram2wav(hp, mag)
        soundfile.write(outdir + "/%s.wav" % (base), wav, hp.sr)
def process_merlin_positions(bin_label_fname, audio_dir, phonedim=416, subphonedim=9, \
                    inrate=5.0, outrate=12.5):

    audio_fname = os.path.join(audio_dir, basename(bin_label_fname) + '.npy')
    assert os.path.isfile(
        audio_fname), 'No audio file for %s ' % (basename(bin_label_fname))
    audio = np.load(audio_fname)

    labfrombin = get_speech(bin_label_fname, phonedim + subphonedim)

    positions = labfrombin[:, -subphonedim:]

    nframes, dim = positions.shape
    assert dim == 9

    new_nframes, _ = audio.shape

    old_x = np.linspace((inrate / 2.0),
                        nframes * inrate,
                        nframes,
                        endpoint=False)  ## place points at frame centres

    f = interpolate.interp1d(
        old_x,
        positions,
        axis=0,
        kind='nearest',
        bounds_error=False,
        fill_value='extrapolate'
    )  ## nearest to avoid weird averaging effects near segment boundaries

    new_x = np.linspace((outrate / 2.0),
                        new_nframes * outrate,
                        new_nframes,
                        endpoint=False)
    new_positions = f(new_x)

    return new_positions
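
## Toy check of the nearest-neighbour resampling used above (all values are illustrative only):
import numpy as np
from scipy import interpolate

positions = np.array([[0.0], [1.0], [2.0], [3.0]])  ## 4 label frames, 1 feature
old_x = np.array([2.5, 7.5, 12.5, 17.5])            ## approx. centres of 5 ms label frames
new_x = np.array([6.25, 18.75])                     ## approx. centres of 12.5 ms output frames
f = interpolate.interp1d(old_x, positions, axis=0, kind='nearest',
                         bounds_error=False, fill_value='extrapolate')
print(f(new_x))  ## [[1.], [3.]] -- each output frame copies the nearest label frame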
Example #16
def make_mel_batch(hp, fnames, oracle=True): ## TODO: refactor with list2batch ?
    lengths = []
    if oracle:
        source = hp.coarse_audio_dir
        bases = [basename(fname) for fname in fnames]
        mels = [os.path.join(hp.coarse_audio_dir, base + '.npy') for base in bases]
    else:
        mels = fnames
    mels = [np.load(melfile) for melfile in mels] 
    mel_batch = np.zeros((len(mels), hp.max_T, hp.n_mels), np.float32)
    for (i,mel) in enumerate(mels):
        length,n = mel.shape
        mel_batch[i,:length,:] = mel
        lengths.append(length * hp.r)
    return mel_batch, lengths
def compute_validation(hp, model_type, epoch, inputs, synth_graph, sess, speaker_codes, \
         valid_filenames, validation_set_reference, duration_data=None, validation_labels=None, position_in_phone_data=None):
    if model_type == 't2m': ## TODO: coded_text2mel here
        validation_set_predictions_tensor, lengths = synth_text2mel(hp, inputs, synth_graph, sess, speaker_data=speaker_codes, duration_data=duration_data, labels=validation_labels, position_in_phone_data=position_in_phone_data)
        validation_set_predictions = split_batch(validation_set_predictions_tensor, lengths)  
        score = compute_dtw_error(validation_set_reference, validation_set_predictions)   
    elif model_type == 'ssrn':
        validation_set_predictions_tensor = synth_mel2mag(hp, inputs, synth_graph, sess)
        lengths = [len(ref) for ref in validation_set_reference]
        validation_set_predictions = split_batch(validation_set_predictions_tensor, lengths)  
        score = compute_simple_LSD(validation_set_reference, validation_set_predictions)
    else:
        info('compute_validation cannot handle model type %s: dummy value (0.0) supplied as validation score'%(model_type)); return 0.0
    ## store parameters for later use:-
    valid_dir = '%s-%s/validation_epoch_%s'%(hp.logdir, model_type, epoch)
    safe_makedir(valid_dir)
    hp.validation_sentences_to_synth_params = min(hp.validation_sentences_to_synth_params, len(valid_filenames))  ## in case fewer sentences match the validation pattern than hp.validation_sentences_to_synth_params
    for i in range(hp.validation_sentences_to_synth_params):
        np.save(os.path.join(valid_dir, basename(valid_filenames[i])), validation_set_predictions[i])
    return score
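
## Sketch: split_batch() above presumably undoes the zero-padding applied when batching
## predictions; a minimal version under that assumption:
def split_batch(batch, lengths):
    ## batch: (batch, time, dim) array; lengths: true length of each item in frames.
    return [batch[i, :length, :] for (i, length) in enumerate(lengths)]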
Example #18
def synthesize(hp, speaker_id='', num_sentences=0, ncores=1, topoutdir='', t2m_epoch=-1, ssrn_epoch=-1):
    '''
    topoutdir: store samples under here; defaults to hp.sampledir
    t2m_epoch and ssrn_epoch: default -1 means use latest. Otherwise go to archived models.
    '''
    assert hp.vocoder in ['griffin_lim', 'world'], 'Other vocoders than griffin_lim/world not yet supported'

    dataset = load_data(hp, mode="synthesis") #since mode != 'train' or 'validation', will load test_transcript rather than transcript
    fpaths, L = dataset['fpaths'], dataset['texts']
    position_in_phone_data = duration_data = labels = None # default
    if hp.use_external_durations:
        duration_data = dataset['durations']
        if num_sentences > 0:
            duration_data = duration_data[:num_sentences, :, :]

    if 'position_in_phone' in hp.history_type:
        ## TODO: combine + deduplicate with relevant code in train.py for making validation set
        def duration2position(duration, fractional=False):     
            ### very roundabout -- need to deflate A matrix back to integers:
            duration = duration.sum(axis=0)
            #print(duration)
            # sys.exit('evs')   
            positions = durations_to_position(duration, fractional=fractional)
            ###positions = end_pad_for_reduction_shape_sync(positions, hp)
            positions = positions[0::hp.r, :]         
            #print(positions)
            return positions

        position_in_phone_data = [duration2position(dur, fractional=('fractional' in hp.history_type)) \
                        for dur in duration_data]       
        position_in_phone_data = list2batch(position_in_phone_data, hp.max_T)



    # Ensure we aren't trying to generate more utterances than are actually in our test_transcript
    if num_sentences > 0:
        assert num_sentences < len(fpaths)
        L = L[:num_sentences, :]
        fpaths = fpaths[:num_sentences]

    bases = [basename(fpath) for fpath in fpaths]

    if hp.merlin_label_dir:
        labels = [np.load("{}/{}".format(hp.merlin_label_dir, basename(fpath)+".npy")) \
                              for fpath in fpaths ]
        labels = list2batch(labels, hp.max_N)


    if speaker_id:
        speaker2ix = dict(zip(hp.speaker_list, range(len(hp.speaker_list))))
        speaker_ix = speaker2ix[speaker_id]

        ## Speaker codes are held in (batch, 1) matrix -- tiling is done inside the graph:
        speaker_data = np.ones((len(L), 1))  *  speaker_ix
    else:
        speaker_data = None

    # Load graph 
    ## TODO: generalise to combine other types of models into a synthesis pipeline?
    g1 = Text2MelGraph(hp, mode="synthesize"); print("Graph 1 (t2m) loaded")
    g2 = SSRNGraph(hp, mode="synthesize"); print("Graph 2 (ssrn) loaded")

    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())

        ### TODO: specify epoch from comm line?
        ### TODO: t2m and ssrn from separate configs?

        if t2m_epoch > -1:
            restore_archived_model_parameters(sess, hp, 't2m', t2m_epoch)
        else:
            t2m_epoch = restore_latest_model_parameters(sess, hp, 't2m')

        if ssrn_epoch > -1:    
            restore_archived_model_parameters(sess, hp, 'ssrn', ssrn_epoch)
        else:
            ssrn_epoch = restore_latest_model_parameters(sess, hp, 'ssrn')

        # Pass input L through Text2Mel Graph
        t = start_clock('Text2Mel generating...')
        ### TODO: after further efficiency testing, remove this fork
        if 1:  ### efficient route -- only make K&V once  ## 3.86, 3.70, 3.80 seconds (2 sentences)
            text_lengths = get_text_lengths(L)
            K, V = encode_text(hp, L, g1, sess, speaker_data=speaker_data, labels=labels)
            Y, lengths, alignments = synth_codedtext2mel(hp, K, V, text_lengths, g1, sess, \
                                speaker_data=speaker_data, duration_data=duration_data, \
                                position_in_phone_data=position_in_phone_data,\
                                labels=labels)
        else: ## 5.68, 5.43, 5.38 seconds (2 sentences)
            Y, lengths = synth_text2mel(hp, L, g1, sess, speaker_data=speaker_data, \
                                            duration_data=duration_data, \
                                            position_in_phone_data=position_in_phone_data, \
                                            labels=labels)
        stop_clock(t)

        ### TODO: useful to test this?
        # print(Y[0,:,:])
        # print (np.isnan(Y).any())
        # print('nan1')
        # Then pass output Y of Text2Mel Graph through SSRN graph to get high res spectrogram Z.
        t = start_clock('Mel2Mag generating...')
        Z = synth_mel2mag(hp, Y, g2, sess)
        stop_clock(t) 

        if (np.isnan(Z).any()):  ### TODO: keep?
            Z = np.nan_to_num(Z)

        # Generate wav files
        if not topoutdir:
            topoutdir = hp.sampledir
        outdir = os.path.join(topoutdir, 't2m%s_ssrn%s'%(t2m_epoch, ssrn_epoch))
        if speaker_id:
            outdir += '_speaker-%s'%(speaker_id)
        safe_makedir(outdir)
        print("Generating wav files, will save to following dir: %s"%(outdir))

        
        assert hp.vocoder in ['griffin_lim', 'world'], 'Other vocoders than griffin_lim/world not yet supported'

        if ncores==1:
            for i, mag in tqdm(enumerate(Z)):
                outfile = os.path.join(outdir, bases[i] + '.wav')
                mag = mag[:lengths[i]*hp.r,:]  ### trim to generated length
                synth_wave(hp, mag, outfile)
        else:
            executor = ProcessPoolExecutor(max_workers=ncores)    
            futures = []
            for i, mag in tqdm(enumerate(Z)):
                outfile = os.path.join(outdir, bases[i] + '.wav')
                mag = mag[:lengths[i]*hp.r,:]  ### trim to generated length
                futures.append(executor.submit(synth_wave, hp, mag, outfile))
            proc_list = [future.result() for future in tqdm(futures)]

        # for i, mag in enumerate(Z):
        #     print("Working on %s"%(bases[i]))
        #     mag = mag[:lengths[i]*hp.r,:]  ### trim to generated length
            
        #     if hp.vocoder=='magphase_compressed':
        #         mag = denorm(mag, s, hp.normtype)
        #         streams = split_streams(mag, ['mag', 'lf0', 'vuv', 'real', 'imag'], [60,1,1,45,45])
        #         wav = magphase_synth_from_compressed(streams, samplerate=hp.sr)
        #     elif hp.vocoder=='griffin_lim':                
        #         wav = spectrogram2wav(hp, mag)
        #     else:
        #         sys.exit('Unsupported vocoder type: %s'%(hp.vocoder))
        #     #write(outdir + "/{}.wav".format(bases[i]), hp.sr, wav)
        #     soundfile.write(outdir + "/{}.wav".format(bases[i]), wav, hp.sr)
            

            
        # Plot attention alignments 
        for i in range(num_sentences):
            plot_alignment(hp, alignments[i], utt_idx=i+1, t2m_epoch=t2m_epoch, dir=outdir)
Example #19
def load_attention(fpath):
    attention_guide_file = "{}/{}".format(hp.attention_guide_dir,
                                          basename(fpath) + ".npy")
    attention_guide = read_floats_from_8bit(attention_guide_file)
    return fpath, attention_guide
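
## Sketch: read_floats_from_8bit() (and save_floats_as_8bit() used in proc() above) are not
## shown. A plausible pair, assuming the attention guides lie in [0, 1] and are quantised to
## uint8 on disk (the quantisation scheme is an assumption):
import numpy as np

def save_floats_as_8bit(data, fname):
    np.save(fname, (np.clip(data, 0.0, 1.0) * 255.0).astype(np.uint8))

def read_floats_from_8bit(fname):
    return np.load(fname).astype(np.float32) / 255.0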
Example #20
def load_data(hp, mode="train", audio_extension='.wav'):
    '''Loads data
      Args:
          mode: "train" / "validation" / "synthesis" / "demo".
    '''
    assert mode in ('train', 'synthesis', 'validation', 'demo')
    logging.info('Start loading data in mode: %s' % (mode))
    get_speaker_codes = (hp.multispeaker != []
                         )  ## False if hp.multispeaker is empty list
    #import pdb;pdb.set_trace()
    dataset_df_path = os.path.join(hp.featuredir, 'dataset_' + mode + '.csv')

    # In demo mode, we change the "dataset" with only one line each time and do not want to use always the same df
    #if os.path.exists(dataset_df_path) and mode != 'demo':
    if 0:
        dataset_df = pd.read_csv(dataset_df_path)

        dataset = {}
        #import pdb;pdb.set_trace()

        # this does not work in train mode because of a problem with doing pd.eval() with bytes
        try:
            dataset['texts'] = np.array(
                [pd.eval(e) for e in dataset_df['texts'].tolist()])
        except AttributeError:
            #that is why we do this
            dataset['texts'] = np.array(
                [ast.literal_eval(e) for e in dataset_df['texts'].tolist()])
            # I think this cause an error when trying training:
            # tensorflow.python.framework.errors_impl.InvalidArgumentError: Input to DecodeRaw has length 105 that is not a multiple of 4, the size of int32

        dataset['fpaths'] = dataset_df['fpaths'].tolist(
        )  ## at synthesis, fpaths only a way to get bases -- wav files probably do not exist
        dataset['text_lengths'] = dataset_df['text_lengths'].tolist(
        )  ## only used in training (where length information lost due to string format) - TODO: good motivation for this format?
        dataset['audio_lengths'] = dataset_df['audio_lengths'].tolist(
        )  ## might be []
        dataset['label_lengths'] = dataset_df['label_lengths'].tolist(
        )  ## might be []

        if get_speaker_codes:
            dataset['speakers'] = dataset_df['speakers'].tolist()
        if hp.use_external_durations:
            dataset['durations'] = dataset_df['durations'].tolist()

    else:
        if mode in ['synthesis', 'demo']:
            get_speaker_codes = False  ## never read speaker from transcript for synthesis -- take user-specified speaker instead

        # Load vocabulary
        char2idx, idx2char = load_vocab(hp)

        if mode in ["train", "validation"]:
            transcript = os.path.join(hp.transcript)
        elif mode == 'synthesis':
            transcript = os.path.join(hp.test_transcript)
        else:
            transcript = './demo/transcript.csv'

        if hp.multispeaker:
            speaker2ix = dict(zip(hp.speaker_list,
                                  range(len(hp.speaker_list))))

        fpaths, text_lengths, texts, speakers, durations = [], [], [], [], []
        audio_lengths, label_lengths = [], []
        lines = codecs.open(transcript, 'r', 'utf-8').readlines()

        too_long_count_frames = 0
        too_long_count_text = 0
        no_data_count = 0

        nframes = 0  ## default 'False' value
        for line in tqdm(lines, desc='load_data'):
            line = line.strip('\n\r |')
            if line == '':
                continue
            fields = line.strip().split("|")

            assert len(fields) >= 1, fields
            if len(fields) > 1:
                assert len(fields) >= 3, fields

            fname = fields[0]
            if len(fields) > 1:
                unnorm_text, norm_text = fields[1:3]
            else:
                norm_text = None  # to test if audio only

            if hp.validpatt:
                if mode == "train":
                    if hp.validpatt in fname:
                        continue
                elif mode == "validation":
                    if hp.validpatt not in fname:
                        continue

            if len(fields) >= 4:
                phones = fields[3]

            if norm_text is None:
                letters_or_phones = []  #  [0] ## dummy 'text' (1 character of padding) where we are using audio only
            elif hp.input_type == 'phones':
                if 'speaker_dependent_phones' in hp.multispeaker:
                    speaker_code = speaker
                else:
                    speaker_code = ''
                phones = phones_normalize(
                    phones, char2idx, speaker_code=speaker_code
                )  # in case of phones, all EOS markers are assumed included
                letters_or_phones = [char2idx[char] for char in phones]
            elif hp.input_type == 'letters':
                text = text_normalize(norm_text, hp) + "E"  # E: EOS
                letters_or_phones = [char2idx[char] for char in text]

            text_length = len(letters_or_phones)

            if text_length > hp.max_N:
                #print('number of letters/phones for %s is %s, exceeds max_N %s: skip it'%(fname, text_length, hp.max_N))
                too_long_count_text += 1
                continue

            if mode in ["train", "validation"] and os.path.exists(
                    hp.coarse_audio_dir):
                mel = "{}/{}".format(hp.coarse_audio_dir, fname + ".npy")
                if not os.path.exists(mel):
                    logging.debug('no file %s' % (mel))
                    no_data_count += 1
                    continue
                nframes = np.load(mel).shape[0]
                if nframes > hp.max_T:
                    #print('number of frames for %s is %s, exceeds max_T %s: skip it'%(fname, nframes, hp.max_T))
                    too_long_count_frames += 1
                    continue
                audio_lengths.append(nframes)

            texts.append(np.array(letters_or_phones, np.int32))

            fpath = os.path.join(hp.waveforms, fname + audio_extension)
            fpaths.append(fpath)
            text_lengths.append(text_length)

            ## get speaker before phones in case need to get speaker-dependent phones
            if get_speaker_codes:
                assert len(fields) >= 5, fields
                speaker = fields[4]
                speaker_ix = speaker2ix[speaker]
                speakers.append(np.array(speaker_ix, np.int32))

            if hp.merlin_label_dir:  ## only get shape here -- get the data later
                try:
                    label_length, label_dim = np.load("{}/{}".format(
                        hp.merlin_label_dir,
                        basename(fpath) + ".npy")).shape
                except TypeError:
                    label_length, label_dim = np.load("{}/{}".format(
                        hp.merlin_label_dir,
                        basename(fpath.decode('utf-8')) + ".npy")).shape
                label_lengths.append(label_length)
                assert label_dim == hp.merlin_lab_dim

            if hp.use_external_durations:
                assert len(fields) >= 6, fields
                duration_data = fields[5]
                duration_data = [
                    int(value)
                    for value in re.split('\s+', duration_data.strip(' '))
                ]
                duration_data = np.array(duration_data, np.int32)
                if hp.merlin_label_dir:
                    duration_data = duration_data[
                        duration_data >
                        0]  ## merlin label contains no skipped items
                    assert len(duration_data) == label_length, (
                        len(duration_data), label_length, fpath)
                else:
                    assert len(duration_data) == text_length, (
                        len(duration_data), text_length, fpath)
                if nframes:
                    assert duration_data.sum() == nframes * hp.r, (
                        duration_data.sum(), nframes * hp.r)
                durations.append(duration_data)

            # !TODO! check this -- duplicated!?
            # if hp.merlin_label_dir: ## only get shape here -- get the data later
            #     label_length, _ = np.load("{}/{}".format(hp.merlin_label_dir, basename(fpath)+".npy")).shape
            #     label_lengths.append(label_length)

        #import pdb;pdb.set_trace()

        if mode == "validation":
            if len(texts) == 0:
                logging.error(
                    'No validation sentences collected: maybe the validpatt %s matches no training data file names?'
                    % (hp.validpatt))
                sys.exit(1)

        logging.info('Loaded data for %s sentences' % (len(texts)))
        logging.info('Sentences skipped with missing features: %s' %
                     (no_data_count))
        logging.info('Sentences skipped with > max_T (%s) frames: %s' %
                     (hp.max_T, too_long_count_frames))
        logging.info(
            'Additional sentences skipped with > max_N (%s) letters/phones: %s'
            % (hp.max_N, too_long_count_text))

        if mode == 'train' and hp.n_utts > 0:
            n_utts = hp.n_utts
            assert n_utts <= len(fpaths)
            logging.info('Take first %s (n_utts) sentences for training' %
                         (n_utts))
            fpaths = fpaths[:n_utts]
            text_lengths = text_lengths[:n_utts]
            texts = texts[:n_utts]
            if get_speaker_codes:
                speakers = speakers[:n_utts]
            if audio_lengths:
                audio_lengths = audio_lengths[:n_utts]
            if label_lengths:
                label_lengths = label_lengths[:n_utts]

        if mode == 'train':
            ## Return string representation which will be parsed with tf's decode_raw:
            texts = [text.tostring() for text in texts]
            if get_speaker_codes:
                speakers = [speaker.tostring() for speaker in speakers]
            if hp.use_external_durations:
                durations = [d.tostring() for d in durations]

        if mode in ['validation', 'synthesis', 'demo']:
            ## Prepare a batch of 'stacked texts' (matrix with number of rows==synthesis batch size, and each row an array of integers)
            stacked_texts = np.zeros((len(texts), hp.max_N), np.int32)
            for i, text in enumerate(texts):
                stacked_texts[i, :len(text)] = text
            texts = stacked_texts

            if hp.use_external_durations:
                stacked_durations = np.zeros((len(texts), hp.max_T, hp.max_N),
                                             np.int32)
                for i, dur in enumerate(durations):
                    duration_matrix = durations_to_hard_attention_matrix(dur)
                    duration_matrix = end_pad_for_reduction_shape_sync(
                        duration_matrix, hp)
                    duration_matrix = duration_matrix[0::hp.r, :]
                    m, n = duration_matrix.shape
                    stacked_durations[i, :m, :n] = duration_matrix
                durations = stacked_durations

        dataset = {}
        dataset['texts'] = texts
        dataset['fpaths'] = fpaths  ## at synthesis, fpaths only a way to get bases -- wav files probably do not exist
        dataset['text_lengths'] = text_lengths  ## only used in training (where length information lost due to string format) - TODO: good motivation for this format?
        dataset['audio_lengths'] = audio_lengths  ## might be []
        dataset['label_lengths'] = label_lengths  ## might be []

        dataset_df = dataset.copy()

        try:
            dataset_df['texts'] = dataset_df['texts'].tolist()
        except AttributeError:
            # It is already a list
            pass
        try:
            if len(dataset_df['audio_lengths']) == 0:
                dataset_df['audio_lengths'] = [0] * len(dataset_df['texts'])
            if len(dataset_df['label_lengths']) == 0:
                dataset_df['label_lengths'] = [0] * len(dataset_df['texts'])
            if not os.path.exists(hp.featuredir): os.makedirs(hp.featuredir)
            pd.DataFrame.to_csv(pd.DataFrame.from_records(dataset_df),
                                dataset_df_path)
        except:
            import pdb
            pdb.set_trace()

        if get_speaker_codes:
            dataset['speakers'] = speakers
        if hp.use_external_durations:
            dataset['durations'] = durations

    logging.info('Finished loading data in mode: %s' % (mode))
    #import pdb;pdb.set_trace()
    return dataset
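
## Usage sketch for load_data() (the config path and printed fields are illustrative only):
hp = load_config('config/example_voice.cfg')  ## hypothetical config path
dataset = load_data(hp, mode='validation')
print('%s sentences loaded' % (len(dataset['texts'])))
print(dataset['fpaths'][:3])        ## paths are mainly used to derive basenames
print(dataset['text_lengths'][:3])  ## per-sentence lengths in letters/phones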
Example #21
def main_work():

    #################################################

    # ============= Process command line ============
    a = ArgumentParser()
    a.add_argument('-c', dest='config', required=True, type=str)
    a.add_argument('-m',
                   dest='model_type',
                   required=True,
                   choices=['t2m', 'ssrn', 'babbler'])
    opts = a.parse_args()

    # ===============================================
    model_type = opts.model_type
    hp = load_config(opts.config)
    logdir = hp.logdir + "-" + model_type
    logger_setup.logger_setup(logdir)
    info('Command line: %s' % (" ".join(sys.argv)))

    ### TODO: move this to its own function somewhere. Can be used also at synthesis time?
    ### Prepare reference data for validation set:  ### TODO: alternative to holding in memory?
    dataset = load_data(hp, mode="validation")
    valid_filenames, validation_text = dataset['fpaths'], dataset['texts']

    speaker_codes = validation_duration_data = position_in_phone_data = None  ## defaults
    if hp.multispeaker:
        speaker_codes = dataset['speakers']
    if hp.use_external_durations:
        validation_duration_data = dataset['durations']

    ## take random subset of validation set to avoid 'This is a librivox recording' type sentences
    random.seed(1234)
    v_indices = list(range(len(valid_filenames)))  ## list() so it can be shuffled in place under Python 3
    random.shuffle(v_indices)
    v = min(hp.validation_sentences_to_evaluate, len(valid_filenames))
    v_indices = v_indices[:v]

    if hp.multispeaker:  ## now come back to this after v computed
        speaker_codes = np.array(speaker_codes)[v_indices].reshape(-1, 1)
    if hp.use_external_durations:
        validation_duration_data = validation_duration_data[v_indices, :, :]

    valid_filenames = np.array(valid_filenames)[v_indices]
    validation_mags = [np.load(hp.full_audio_dir + os.path.sep + basename(fpath)+'.npy') \
                                for fpath in valid_filenames]
    validation_text = validation_text[v_indices, :]
    validation_labels = None  # default
    if hp.merlin_label_dir:
        validation_labels = [np.load("{}/{}".format(hp.merlin_label_dir, basename(fpath)+".npy")) \
                              for fpath in valid_filenames ]
        validation_labels = list2batch(validation_labels, hp.max_N)

    if 'position_in_phone' in hp.history_type:

        def duration2position(duration, fractional=False):
            ### very roundabout -- need to deflate A matrix back to integers:
            duration = duration.sum(axis=0)
            #print(duration)
            # sys.exit('evs')
            positions = durations_to_position(duration, fractional=fractional)
            ###positions = end_pad_for_reduction_shape_sync(positions, hp)
            positions = positions[0::hp.r, :]
            #print(positions)
            return positions

        position_in_phone_data = [duration2position(dur, fractional=('fractional' in hp.history_type)) \
                        for dur in dataset['durations'][v_indices]]
        position_in_phone_data = list2batch(position_in_phone_data, hp.max_T)

    if model_type == 't2m':
        validation_mels = [np.load(hp.coarse_audio_dir + os.path.sep + basename(fpath)+'.npy') \
                                    for fpath in valid_filenames]
        validation_inputs = validation_text
        validation_reference = validation_mels
        validation_lengths = None
    elif model_type == 'ssrn':
        validation_inputs, validation_lengths = make_mel_batch(
            hp, valid_filenames)
        validation_reference = validation_mags
    else:
        info(
            'Undefined model_type {} for making validation inputs -- supply dummy None values'
            .format(model_type))
        validation_inputs = None
        validation_reference = None

    ## Get the text and mel inputs for the utts you would like to plot attention graphs for
    if hp.plot_attention_every_n_epochs and model_type == 't2m':  #check if we want to plot attention
        # TODO do we want to generate and plot attention for validation or training set sentences??? modify attention_inputs accordingly...
        attention_inputs = validation_text[:hp.num_sentences_to_plot_attention]
        attention_mels = validation_mels[:hp.num_sentences_to_plot_attention]
        attention_mels = np.array(
            attention_mels)  #TODO should be able to delete this line...?
        attention_mels_array = np.zeros(
            (hp.num_sentences_to_plot_attention, hp.max_T, hp.n_mels),
            np.float32)  # create fixed size array to hold attention mels
        for i in range(hp.num_sentences_to_plot_attention):  # copy data into this fixed-size array
            attention_mels_array[i, :attention_mels[i].shape[0], :attention_mels[i].shape[1]] = attention_mels[i]
        attention_mels = attention_mels_array  # rename for convenience

    ## Map to appropriate type of graph depending on model_type:
    AppropriateGraph = {
        't2m': Text2MelGraph,
        'ssrn': SSRNGraph,
        'babbler': BabblerGraph
    }[model_type]

    g = AppropriateGraph(hp)
    info("Training graph loaded")
    synth_graph = AppropriateGraph(hp, mode='synthesize', reuse=True)
    info(
        "Synthesis graph loaded"
    )  #reuse=True ensures that 'synth_graph' and 'attention_graph' share weights with training graph 'g'
    attention_graph = AppropriateGraph(hp,
                                       mode='generate_attention',
                                       reuse=True)
    info("Atttention generating graph loaded")
    #TODO is loading three graphs a problem for memory usage?

    if 0:
        print(tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, 'Text2Mel'))
        ## [<tf.Variable 'Text2Mel/TextEnc/embed_1/lookup_table:0' shape=(61, 128) dtype=float32_ref>, <tf.Variable 'Text2Mel/TextEnc/C_2/conv1d/kernel:0' shape=(1, 128, 512) dtype=float32_ref>, ...

    ## TODO: tensorflow.python.training.supervisor deprecated: --> switch to tf.train.MonitoredTrainingSession
    sv = tf.train.Supervisor(logdir=logdir,
                             save_model_secs=0,
                             global_step=g.global_step)

    ## Get the current training epoch from the name of the model that we have loaded
    latest_checkpoint = tf.train.latest_checkpoint(logdir)
    if latest_checkpoint:
        epoch = int(
            latest_checkpoint.strip('/ ').split('/')[-1].replace(
                'model_epoch_', ''))
    else:  #did not find a model checkpoint, so we start training from scratch
        epoch = 0

    ## If save_every_n_epochs > 0, models will be stored here every n epochs and not
    ## deleted, regardless of validation improvement etc.:--
    safe_makedir(logdir + '/archive/')

    with sv.managed_session() as sess:
        if 0:  ## Set to 1 to debug NaNs; at tfdbg prompt, type:    run -f has_inf_or_nan
            ## later:    lt  -f has_inf_or_nan -n .*AudioEnc.*
            os.system('rm -rf {}/tmp_tfdbg/'.format(logdir))
            sess = tf_debug.LocalCLIDebugWrapperSession(sess,
                                                        dump_root=logdir +
                                                        '/tmp_tfdbg/')

        if hp.initialise_weights_from_existing:
            info('=====Initialise some variables from existing model(s)=====')
            sess.graph._unsafe_unfinalize()  ## !!! https://stackoverflow.com/questions/41798311/tensorflow-graph-is-finalized-and-cannot-be-modified/41798401
            for (scope, checkpoint) in hp.initialise_weights_from_existing:
                var_list = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES,
                                             scope)
                info('----From existing model %s:----' % (checkpoint))
                if var_list:  ## will be empty when training t2m but looking at ssrn
                    saver = tf.train.Saver(var_list=var_list)
                    saver.restore(sess, checkpoint)
                    for var in var_list:
                        info('   %s' % (var.name))
                else:
                    info('   No variables!')
                info(
                    '========================================================')

        if hp.restart_from_savepath:  #set this param to list: [path_to_t2m_model_folder, path_to_ssrn_model_folder]
            # info('Restart from these paths:')
            info(hp.restart_from_savepath)

            # assert len(hp.restart_from_savepath) == 2
            restart_from_savepath1, restart_from_savepath2 = hp.restart_from_savepath
            restart_from_savepath1 = os.path.abspath(restart_from_savepath1)
            restart_from_savepath2 = os.path.abspath(restart_from_savepath2)

            sess.graph._unsafe_unfinalize()  ## !!! https://stackoverflow.com/questions/41798311/tensorflow-graph-is-finalized-and-cannot-be-modified/41798401
            sess.run(tf.global_variables_initializer())

            print('Restore parameters')
            if model_type == 't2m':
                var_list = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES,
                                             'Text2Mel')
                saver1 = tf.train.Saver(var_list=var_list)
                latest_checkpoint = tf.train.latest_checkpoint(
                    restart_from_savepath1)
                saver1.restore(sess, restart_from_savepath1)
                print("Text2Mel Restored!")
            elif model_type == 'ssrn':
                var_list = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, 'SSRN') + \
                           tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, 'gs')
                saver2 = tf.train.Saver(var_list=var_list)
                latest_checkpoint = tf.train.latest_checkpoint(
                    restart_from_savepath2)
                saver2.restore(sess, restart_from_savepath2)
                print("SSRN Restored!")
            epoch = int(
                latest_checkpoint.strip('/ ').split('/')[-1].replace(
                    'model_epoch_', ''))
            # TODO: this counter won't work if training restarts in same directory.
            ## Get epoch from gs?

        loss_history = []  #any way to restore loss history too?

        #plot attention generated from freshly initialised model
        if hp.plot_attention_every_n_epochs and model_type == 't2m' and epoch == 0:  # ssrn model doesn't generate alignments
            get_and_plot_alignments(
                hp, epoch - 1, attention_graph, sess, attention_inputs,
                attention_mels, logdir +
                "/alignments")  # epoch-1 refers to freshly initialised model

        current_score = compute_validation(
            hp,
            model_type,
            epoch,
            validation_inputs,
            synth_graph,
            sess,
            speaker_codes,
            valid_filenames,
            validation_reference,
            duration_data=validation_duration_data,
            validation_labels=validation_labels,
            position_in_phone_data=position_in_phone_data)
        info('validation epoch {0}: {1:0.3f}'.format(epoch, current_score))

        while 1:
            progress_bar_text = '%s/%s; ep. %s' % (hp.config_name, model_type,
                                                   epoch)
            for batch_in_current_epoch in tqdm(range(g.num_batch),
                                               total=g.num_batch,
                                               ncols=80,
                                               leave=True,
                                               unit='b',
                                               desc=progress_bar_text):
                gs, loss_components, _ = sess.run(
                    [g.global_step, g.loss_components, g.train_op])
                loss_history.append(loss_components)

            ### End of epoch: validate?
            if hp.validate_every_n_epochs:
                if epoch % hp.validate_every_n_epochs == 0:

                    loss_history = np.array(loss_history)
                    train_loss_mean_std = np.concatenate(
                        [loss_history.mean(axis=0),
                         loss_history.std(axis=0)])
                    loss_history = []

                    train_loss_mean_std = ' '.join([
                        '{:0.3f}'.format(score)
                        for score in train_loss_mean_std
                    ])
                    info('train epoch {0}: {1}'.format(epoch,
                                                       train_loss_mean_std))

                    current_score = compute_validation(
                        hp,
                        model_type,
                        epoch,
                        validation_inputs,
                        synth_graph,
                        sess,
                        speaker_codes,
                        valid_filenames,
                        validation_reference,
                        duration_data=validation_duration_data,
                        validation_labels=validation_labels,
                        position_in_phone_data=position_in_phone_data)
                    info('validation epoch {0:0}: {1:0.3f}'.format(
                        epoch, current_score))

            ### End of epoch: plot attention matrices? #################################
            if hp.plot_attention_every_n_epochs and model_type == 't2m' and epoch % hp.plot_attention_every_n_epochs == 0:  # ssrn model doesn't generate alignments
                get_and_plot_alignments(hp, epoch, attention_graph, sess,
                                        attention_inputs, attention_mels,
                                        logdir + "/alignments")

            ### Save end of each epoch (all but the most recent 5 will be overwritten):
            stem = logdir + '/model_epoch_{0}'.format(epoch)
            sv.saver.save(sess, stem)

            ### Check if we should archive (to files which won't be overwritten):
            if hp.save_every_n_epochs:
                if epoch % hp.save_every_n_epochs == 0:
                    info('Archive model %s' % (stem))
                    for fname in glob.glob(stem + '*'):
                        shutil.copy(fname, logdir + '/archive/')

            epoch += 1
            if epoch > hp.max_epochs:
                info('Max epochs ({}) reached: end training'.format(
                    hp.max_epochs))
                return

    print("Done")
Example #22
def main_work():

    #################################################

    # ============= Process command line ============

    a = ArgumentParser()
    a.add_argument(
        '-meldir',
        required=True,
        type=str,
        help='existing directory with mels - features are padded to match the length of these')
    a.add_argument('-worlddir',
                   required=True,
                   type=str,
                   help='existing directory containing world features')
    a.add_argument('-outdir', required=True, type=str)

    a.add_argument('-testpatt', required=False, type=str, default='')

    a.add_argument('-ncores',
                   default=1,
                   type=int,
                   help='Number of cores for parallel processing')
    opts = a.parse_args()

    # ===============================================

    # hp = load_config(opts.config)

    fpaths = sorted(glob.glob(opts.meldir + '/*.npy'))  # [:10]

    normkind = 'meanvar'

    if normkind == 'minmax':
        scaler = MinMaxScaler()
    elif normkind == 'meanvar':
        scaler = StandardScaler()
    else:
        sys.exit('Unknown normkind: %s' % normkind)

    if opts.testpatt:
        train_fpaths = [p for p in fpaths if opts.testpatt not in basename(p)]
    else:
        train_fpaths = fpaths

    for fpath in tqdm(train_fpaths, desc='First pass to get norm stats'):

        data = load_sentence(fpath, worlddir=opts.worlddir, outdir=opts.outdir)
        scaler = update_normalisation_stats(data, scaler)

    safe_makedir(opts.outdir)
    safe_makedir(opts.outdir + '/full_world/')
    safe_makedir(opts.outdir + '/coarse_world/')

    if 0:
        process(fpaths[0],
                worlddir=opts.worlddir,
                outdir=opts.outdir,
                scaler=scaler)
        sys.exit('aedvsfv')

    executor = ProcessPoolExecutor(max_workers=opts.ncores)
    futures = []
    for fpath in fpaths:
        futures.append(
            executor.submit(process,
                            fpath,
                            worlddir=opts.worlddir,
                            outdir=opts.outdir,
                            scaler=scaler))

    proc_list = [
        future.result()
        for future in tqdm(futures,
                           desc='Second pass (parallel) to do normalisation')
    ]

    if normkind == 'minmax':
        mini = scaler.data_min_  ## TODO: per speaker...
        maxi = scaler.data_max_
        stats = np.vstack([mini, maxi])
    elif normkind == 'meanvar':
        mean = scaler.mean_  ## TODO: per speaker...
        std = scaler.scale_
        stats = np.vstack([mean, std])
    else:
        sys.exit('Unknown normkind: %s' % normkind)
    np.save(opts.outdir + '/norm_stats', stats)
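
## Sketch: update_normalisation_stats() is not shown; given the two-pass structure above it
## plausibly wraps the scaler's incremental fitting (both MinMaxScaler and StandardScaler
## support partial_fit). A minimal version under that assumption:
def update_normalisation_stats(data, scaler):
    scaler.partial_fit(data)
    return scaler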
Example #23
    def synthesize(self,
                   text=None,
                   emo_code=None,
                   mels=None,
                   speaker_id='',
                   num_sentences=0,
                   ncores=1,
                   topoutdir=''):
        '''
        topoutdir: store samples under here; defaults to hp.sampledir
        t2m_epoch and ssrn_epoch: default -1 means use latest. Otherwise go to archived models.
        '''
        assert self.hp.vocoder in [
            'griffin_lim', 'world'
        ], 'Other vocoders than griffin_lim/world not yet supported'

        if text is not None:
            text_to_phonetic(text=text)
            dataset = load_data(self.hp, mode='demo')
        else:
            dataset = load_data(
                self.hp, mode="synthesis"
            )  #since mode != 'train' or 'validation', will load test_transcript rather than transcript
        fpaths, L = dataset['fpaths'], dataset['texts']
        position_in_phone_data = duration_data = labels = None  # default
        if self.hp.use_external_durations:
            duration_data = dataset['durations']
            if num_sentences > 0:
                duration_data = duration_data[:num_sentences, :, :]

        if 'position_in_phone' in self.hp.history_type:
            ## TODO: combine + deduplicate with relevant code in train.py for making validation set
            def duration2position(duration, fractional=False):
                ### very roundabout -- need to deflate A matrix back to integers:
                duration = duration.sum(axis=0)
                #print(duration)
                # sys.exit('evs')
                positions = durations_to_position(duration,
                                                  fractional=fractional)
                ###positions = end_pad_for_reduction_shape_sync(positions, hp)
                positions = positions[0::hp.r, :]
                #print(positions)
                return positions

            position_in_phone_data = [duration2position(dur, fractional=('fractional' in hp.history_type)) \
                            for dur in duration_data]
            position_in_phone_data = list2batch(position_in_phone_data,
                                                hp.max_T)

        # Ensure we aren't trying to generate more utterances than are actually in our test_transcript
        if num_sentences > 0:
            assert num_sentences < len(fpaths)
            L = L[:num_sentences, :]
            fpaths = fpaths[:num_sentences]

        bases = [basename(fpath) for fpath in fpaths]

        if self.hp.merlin_label_dir:
            labels = [np.load("{}/{}".format(hp.merlin_label_dir, basename(fpath)+".npy")) \
                                for fpath in fpaths ]
            labels = list2batch(labels, hp.max_N)

        if speaker_id:
            speaker2ix = dict(zip(hp.speaker_list,
                                  range(len(hp.speaker_list))))
            speaker_ix = speaker2ix[speaker_id]

            ## Speaker codes are held in (batch, 1) matrix -- tiling is done inside the graph:
            speaker_data = np.ones((len(L), 1)) * speaker_ix
        else:
            speaker_data = None

        # Pass input L through Text2Mel Graph
        t = start_clock('Text2Mel generating...')
        ### TODO: after further efficiency testing, remove this fork
        if 1:  ### efficient route -- only make K&V once  ## 3.86, 3.70, 3.80 seconds (2 sentences)
            text_lengths = get_text_lengths(L)
            if mels is not None:
                emo_code = encode_audio2emo(self.hp, mels, self.g1, self.sess)
            K, V = encode_text(self.hp,
                               L,
                               self.g1,
                               self.sess,
                               emo_mean=emo_code,
                               speaker_data=speaker_data,
                               labels=labels)
            Y, lengths, alignments = synth_codedtext2mel(self.hp, K, V, text_lengths, self.g1, self.sess, \
                                speaker_data=speaker_data, duration_data=duration_data, \
                                position_in_phone_data=position_in_phone_data,\
                                labels=labels)
        else:  ## 5.68, 5.43, 5.38 seconds (2 sentences)
            Y, lengths = synth_text2mel(self.hp, L, self.g1, self.sess, speaker_data=speaker_data, \
                                            duration_data=duration_data, \
                                            position_in_phone_data=position_in_phone_data, \
                                            labels=labels)
        stop_clock(t)

        ### TODO: useful to test this?
        # print(Y[0,:,:])
        # print (np.isnan(Y).any())
        # print('nan1')
        # Then pass output Y of Text2Mel Graph through SSRN graph to get high res spectrogram Z.
        t = start_clock('Mel2Mag generating...')
        Z = synth_mel2mag(self.hp, Y, self.g2, self.sess)
        stop_clock(t)

        if (np.isnan(Z).any()):  ### TODO: keep?
            Z = np.nan_to_num(Z)

        # Generate wav files
        if not topoutdir:
            topoutdir = self.hp.sampledir
        outdir = os.path.join(
            topoutdir, 't2m%s_ssrn%s' % (self.t2m_epoch, self.ssrn_epoch))
        if speaker_id:
            outdir += '_speaker-%s' % (speaker_id)
        safe_makedir(outdir)
        print("Generating wav files, will save to following dir: %s" %
              (outdir))

        assert self.hp.vocoder in [
            'griffin_lim', 'world'
        ], 'Other vocoders than griffin_lim/world not yet supported'

        if ncores == 1:
            for i, mag in tqdm(enumerate(Z)):
                outfile = os.path.join(outdir, bases[i] + '.wav')
                mag = mag[:lengths[i] *
                          self.hp.r, :]  ### trim to generated length
                synth_wave(self.hp, mag, outfile)
        else:
            executor = ProcessPoolExecutor(max_workers=ncores)
            futures = []
            for i, mag in tqdm(enumerate(Z)):
                outfile = os.path.join(outdir, bases[i] + '.wav')
                mag = mag[:lengths[i] *
                          self.hp.r, :]  ### trim to generated length
                futures.append(
                    executor.submit(synth_wave, self.hp, mag, outfile))
            proc_list = [future.result() for future in tqdm(futures)]

        # Plot attention alignments
        for i in range(len(Z)):
            plot_alignment(self.hp,
                           alignments[i],
                           utt_idx=i + 1,
                           t2m_epoch=self.t2m_epoch,
                           dir=outdir)

        self.outdir = outdir
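
The `duration2position` helper above relies on the project's `durations_to_position` to expand per-phone frame counts into a per-frame "position in phone" feature. As a point of reference only, the following minimal sketch (using the hypothetical name `positions_from_durations`, not the project's implementation) shows what such an expansion can look like, including the fractional variant selected via 'fractional' in hp.history_type:

import numpy as np

def positions_from_durations(durations, fractional=False):
    ## durations: 1-D array of frame counts per phone, e.g. np.array([3, 2]).
    ## Returns an (n_frames, 1) array counting 1..d within each phone,
    ## or the fraction of the way through the phone when fractional=True.
    chunks = []
    for d in durations.astype(int):
        pos = np.arange(1, d + 1, dtype=float)
        if fractional:
            pos /= d
        chunks.append(pos)
    return np.concatenate(chunks).reshape(-1, 1)

## positions_from_durations(np.array([3, 2]), fractional=True)
## -> [[0.333...], [0.666...], [1.0], [0.5], [1.0]]
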
Exemple #24
0
def synthesize(hp, speaker_id='', num_sentences=0, ncores=1, topoutdir='', t2m_epoch=-1, ssrn_epoch=-1):
    '''
    topoutdir: store samples under here; defaults to hp.sampledir
    t2m_epoch and ssrn_epoch: default -1 means use latest. Otherwise go to archived models.
    '''
    assert hp.vocoder in ['griffin_lim', 'world'], 'Other vocoders than griffin_lim/world not yet supported'

    dataset = load_data(hp, mode="synthesis") #since mode != 'train' or 'validation', will load test_transcript rather than transcript
    fpaths, L = dataset['fpaths'], dataset['texts']
    position_in_phone_data = duration_data = labels = None # default
    if hp.use_external_durations:
        duration_data = dataset['durations']
        if num_sentences > 0:
            duration_data = duration_data[:num_sentences, :, :]

    if 'position_in_phone' in hp.history_type:
        ## TODO: combine + deduplicate with relevant code in train.py for making validation set
        def duration2position(duration, fractional=False):
            ### very roundabout -- need to deflate A matrix back to integers:
            duration = duration.sum(axis=0)
            positions = durations_to_position(duration, fractional=fractional)
            ###positions = end_pad_for_reduction_shape_sync(positions, hp)
            positions = positions[0::hp.r, :]
            return positions

        position_in_phone_data = [duration2position(dur, fractional=('fractional' in hp.history_type)) \
                        for dur in duration_data]       
        position_in_phone_data = list2batch(position_in_phone_data, hp.max_T)



    # Ensure we aren't trying to generate more utterances than are actually in our test_transcript
    if num_sentences > 0:
        assert num_sentences <= len(fpaths)
        L = L[:num_sentences, :]
        fpaths = fpaths[:num_sentences]

    bases = [basename(fpath) for fpath in fpaths]

    if hp.merlin_label_dir:
        labels = []
        for fpath in fpaths:
            label = np.load("{}/{}".format(hp.merlin_label_dir, basename(fpath)+".npy"))
            if hp.select_central:
                central_ind = get_labels_indices(hp.merlin_lab_dim)
                label = label[:,central_ind==1] 
            labels.append(label)

        labels = list2batch(labels, hp.max_N)


    if speaker_id:
        speaker2ix = dict(zip(hp.speaker_list, range(len(hp.speaker_list))))
        speaker_ix = speaker2ix[speaker_id]

        ## Speaker codes are held in (batch, 1) matrix -- tiling is done inside the graph:
        speaker_data = np.ones((len(L), 1))  *  speaker_ix
    else:
        speaker_data = None
   
    if hp.turn_off_monotonic_for_synthesis: # if the FIA mechanism is turned off
        text_lengths = get_text_lengths(L)
        hp.text_lengths = text_lengths + 1
     
    # Load graph 
    ## TODO: generalise to combine other types of models into a synthesis pipeline?
    g1 = Text2MelGraph(hp, mode="synthesize")
    print("Graph 1 (t2m) loaded")

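    ## Note: if the t2m model was trained without normalisation (hp.norm is None),
    ## hp.norm and the optimiser settings are temporarily switched to the layer-norm
    ## configuration before the SSRN graph is built below, then restored afterwards.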
    if hp.norm is None:
        t2m_layer_norm = False
        hp.norm = 'layer'
        hp.lr = 0.001
        hp.beta1 = 0.9
        hp.beta2 = 0.999
        hp.epsilon = 0.00000001
        hp.decay_lr = True
        hp.batchsize = {'t2m': 32, 'ssrn': 8}
    else:
        t2m_layer_norm = True

    g2 = SSRNGraph(hp, mode="synthesize")
    print("Graph 2 (ssrn) loaded")

    if t2m_layer_norm == False:
        hp.norm = None
        hp.lr = 0.0002
        hp.beta1 = 0.5
        hp.beta2 = 0.9
        hp.epsilon = 0.000001
        hp.decay_lr = False
        hp.batchsize = {'t2m': 16, 'ssrn': 8}
    
    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())

        ### TODO: specify epoch from comm line?
        ### TODO: t2m and ssrn from separate configs?

        if t2m_epoch > -1:
            restore_archived_model_parameters(sess, hp, 't2m', t2m_epoch)
        else:
            t2m_epoch = restore_latest_model_parameters(sess, hp, 't2m')

        if ssrn_epoch > -1:    
            restore_archived_model_parameters(sess, hp, 'ssrn', ssrn_epoch)
        else:
            ssrn_epoch = restore_latest_model_parameters(sess, hp, 'ssrn')

        # Pass input L through Text2Mel Graph
        t = start_clock('Text2Mel generating...')
        ### TODO: after further efficiency testing, remove this fork
        if 1:  ### efficient route -- only make K&V once  ## 3.86, 3.70, 3.80 seconds (2 sentences)
            text_lengths = get_text_lengths(L)
            K, V = encode_text(hp, L, g1, sess, speaker_data=speaker_data, labels=labels)
            Y, lengths, alignments = synth_codedtext2mel(hp, K, V, text_lengths, g1, sess, \
                                speaker_data=speaker_data, duration_data=duration_data, \
                                position_in_phone_data=position_in_phone_data,\
                                labels=labels)
        else: ## 5.68, 5.43, 5.38 seconds (2 sentences)
            Y, lengths = synth_text2mel(hp, L, g1, sess, speaker_data=speaker_data, \
                                            duration_data=duration_data, \
                                            position_in_phone_data=position_in_phone_data, \
                                            labels=labels)
        stop_clock(t)

        ### TODO: useful to test this?
        # print(Y[0,:,:])
        # print (np.isnan(Y).any())
        # print('nan1')
        # Then pass output Y of Text2Mel Graph through SSRN graph to get high res spectrogram Z.
        t = start_clock('Mel2Mag generating...')
        Z = synth_mel2mag(hp, Y, g2, sess)
        stop_clock(t) 

        if (np.isnan(Z).any()):  ### TODO: keep?
            Z = np.nan_to_num(Z)

        # Generate wav files
        if not topoutdir:
            topoutdir = hp.sampledir
        outdir = os.path.join(topoutdir, 't2m%s_ssrn%s'%(t2m_epoch, ssrn_epoch))
        if speaker_id:
            outdir += '_speaker-%s'%(speaker_id)
        safe_makedir(outdir)

        # Plot trimmed attention alignment with filename
        print("Plot attention, will save to following dir: %s"%(outdir))
        print("File |  CDP | Ain")
        for i, mag in enumerate(Z):
            outfile = os.path.join(outdir, bases[i])
            trimmed_alignment = alignments[i,:text_lengths[i],:lengths[i]]
            plot_alignment(hp, trimmed_alignment, utt_idx=i+1, t2m_epoch=t2m_epoch, dir=outdir, outfile=outfile)
            CDP = getCDP(trimmed_alignment)
            APin, APout = getAP(trimmed_alignment)
            print("%s | %.2f | %.2f"%( bases[i], CDP, APin))

        print("Generating wav files, will save to following dir: %s"%(outdir))

        
        assert hp.vocoder in ['griffin_lim', 'world'], 'Other vocoders than griffin_lim/world not yet supported'

        if ncores==1:
            for i, mag in tqdm(enumerate(Z)):
                outfile = os.path.join(outdir, bases[i] + '.wav')
                mag = mag[:lengths[i]*hp.r,:]  ### trim to generated length
                synth_wave(hp, mag, outfile)
        else:
            executor = ProcessPoolExecutor(max_workers=ncores)    
            futures = []
            for i, mag in tqdm(enumerate(Z)):
                outfile = os.path.join(outdir, bases[i] + '.wav')
                mag = mag[:lengths[i]*hp.r,:]  ### trim to generated length
                futures.append(executor.submit(synth_wave, hp, mag, outfile))
            proc_list = [future.result() for future in tqdm(futures)]
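
A minimal usage sketch for this entry point (assuming `hp` has already been populated by the project's own configuration loading, which is not shown here, and that hp.vocoder is 'griffin_lim' or 'world'):

## Hypothetical driver -- `hp` must come from the project's config machinery.
synthesize(hp,
           speaker_id='',        # empty for a single-speaker model
           num_sentences=2,      # only synthesise the first 2 test sentences
           ncores=4,             # vocode the waveforms in parallel
           topoutdir='./samples',
           t2m_epoch=-1,         # -1 -> restore the latest checkpoints
           ssrn_epoch=-1)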