import os
from os.path import join

import numpy as np
import soundfile as sf

# Assumed module-level context (defined elsewhere in the project, not shown in
# these examples): dic_pho_map, syllableTextgridExtraction, getMFCCBandsMadmom,
# getMFCCBands2DMadmom, fs, hopsize and hopsize_t.


def dumpFeaturePho(wav_path,
                   textgrid_path,
                   recordings,
                   syllableTierName,
                   phonemeTierName):
    """
    dump the log-mel features for each phoneme
    :param wav_path: path of the wavs
    :param textgrid_path: path of the textgrids
    :param recordings: recording file names
    :param syllableTierName: syllable textgrid tier name - dianSilence
    :param phonemeTierName: phoneme textgrid tier name - details
    :return: dictionary mapping each phoneme label to a list of log-mel segments
    """

    # feature dictionary
    dic_pho_embedding = {}

    for pho in set(dic_pho_map.values()):
        dic_pho_embedding[pho] = []

    for artist_path, recording in recordings:
        nestedPhonemeLists, numSyllables, numPhonemes \
            = syllableTextgridExtraction(textgrid_path,
                                         join(artist_path, recording),
                                         syllableTierName,
                                         phonemeTierName)

        wav_full_filename = join(wav_path, artist_path, recording+'.wav')

        log_mel = getMFCCBandsMadmom(audio_fn=wav_full_filename, fs=fs, hopsize_t=hopsize_t)

        # go through all the phonemes in the list
        for ii, pho in enumerate(nestedPhonemeLists):
            print('calculating', recording, 'phoneme', ii, 'of', len(nestedPhonemeLists))
            for p in pho[1]:
                # map from annotated xsampa to a readable notation with a dictionary
                try:
                    key = dic_pho_map[p[2]]
                except KeyError:
                    print(artist_path, recording)
                    print(ii, p[2])
                    raise

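                # seconds -> frame index: time * fs gives samples; dividing by
                # the hopsize (in samples) gives the frame number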
                sf = int(round(p[0] * fs / float(hopsize)))  # starting frame
                ef = int(round(p[1] * fs / float(hopsize)))  # ending frame

                log_mel_p = log_mel[sf:ef, :]  # log-mel phoneme

                if len(log_mel_p):
                    dic_pho_embedding[key].append(log_mel_p)

    return dic_pho_embedding
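

# Hypothetical usage sketch (not part of the original source): the paths and the
# recordings list are placeholders; the tier names follow the docstring above.
def _demo_dumpFeaturePho():
    recordings = [('artist_folder', 'recording_name')]
    dic_pho = dumpFeaturePho(wav_path='/path/to/wavs',
                             textgrid_path='/path/to/textgrids',
                             recordings=recordings,
                             syllableTierName='dianSilence',
                             phonemeTierName='details')
    # each value is a list of (n_frames, n_bands) log-mel arrays
    print({pho: len(segs) for pho, segs in dic_pho.items()})
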
def dumpAudioPhn(wav_path,
                 textgrid_path,
                 recordings,
                 lineTierName,
                 phonemeTierName):
    """
    dump the audio samples of each phoneme
    :param wav_path: path of the wavs
    :param textgrid_path: path of the textgrids
    :param recordings: recording file names
    :param lineTierName: line textgrid tier name
    :param phonemeTierName: phoneme textgrid tier name - details
    :return: dictionary mapping each phoneme label to a list of waveform segments
    """

    # dictionary of phoneme waveform segments
    dic_pho_wav = {}

    for pho in set(dic_pho_map.values()):
        dic_pho_wav[pho] = []

    for artist_path, recording in recordings:
        nestedPhonemeLists, numSyllables, numPhonemes \
            = syllableTextgridExtraction(textgrid_path,
                                         join(artist_path, recording),
                                         lineTierName,
                                         phonemeTierName)

        # audio
        wav_full_filename = join(wav_path, artist_path, recording + '.wav')

        data_wav, fs_wav = sf.read(wav_full_filename)

        for ii, pho in enumerate(nestedPhonemeLists):
            print('calculating', recording, 'phoneme', ii, 'of', len(nestedPhonemeLists))
            for p in pho[1]:
                # map from annotated xsampa to readable notation
                try:
                    key = dic_pho_map[p[2]]
                except KeyError:
                    print(artist_path, recording)
                    print(ii, p[2])
                    raise

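                # seconds -> sample index at the file's native sample rate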
                st = int(round(p[0] * fs_wav))  # starting time
                et = int(round(p[1] * fs_wav))  # ending time

                pho_wav = data_wav[st: et]

                if len(pho_wav):
                    dic_pho_wav[key].append(pho_wav)

    return dic_pho_wav
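

# Hypothetical usage sketch (not part of the original source): writes the first
# collected snippet of each phoneme class to disk. The paths, the recordings
# list and the 44100 Hz output rate are placeholder assumptions.
def _demo_dumpAudioPhn():
    dic_wav = dumpAudioPhn(wav_path='/path/to/wavs',
                           textgrid_path='/path/to/textgrids',
                           recordings=[('artist_folder', 'recording_name')],
                           lineTierName='line',
                           phonemeTierName='details')
    for pho, segments in dic_wav.items():
        if segments:
            sf.write('phn_{}.wav'.format(pho), segments[0], 44100)
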
def dumpFeaturePho(wav_path, textgrid_path, recordings, syllableTierName,
                   phonemeTierName):
    """
    dump the MFCC features for each phoneme
    :param wav_path: path of the wavs
    :param textgrid_path: path of the textgrids
    :param recordings: recording file names
    :param syllableTierName: syllable textgrid tier name
    :param phonemeTierName: phoneme textgrid tier name
    :return: dictionary mapping each phoneme label to a stacked feature matrix
    """

    # dictionary of stacked MFCC features, one matrix per phoneme
    dic_pho_feature = {}

    for pho in set(dic_pho_map.values()):
        dic_pho_feature[pho] = np.array([])

    for artist_path, recording in recordings:
        nestedPhonemeLists, numSyllables, numPhonemes \
            = syllableTextgridExtraction(textgrid_path,
                                         join(artist_path, recording),
                                         syllableTierName,
                                         phonemeTierName)

        # audio
        wav_full_filename = join(wav_path, artist_path, recording + '.wav')

        mfcc = getMFCCBands2DMadmom(wav_full_filename,
                                    fs,
                                    hopsize_t,
                                    channel=1)

        for ii, pho in enumerate(nestedPhonemeLists):
            print('calculating', recording, 'phoneme', ii, 'of', len(nestedPhonemeLists))
            for p in pho[1]:
                # map from annotated xsampa to readable notation
                try:
                    key = dic_pho_map[p[2]]
                except KeyError:
                    print(artist_path, recording)
                    print(ii, p[2])
                    raise

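                # seconds -> frame index, as in the log-mel variant above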
                sf = int(round(p[0] * fs / float(hopsize)))  # starting frame
                ef = int(round(p[1] * fs / float(hopsize)))  # ending frame

                mfcc_p = mfcc[sf:ef, :]  # MFCC frames of the phoneme

                if not len(dic_pho_feature[key]):
                    dic_pho_feature[key] = mfcc_p
                else:
                    dic_pho_feature[key] = np.vstack(
                        (dic_pho_feature[key], mfcc_p))

    return dic_pho_feature
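

# Hypothetical usage sketch (not part of the original source): unlike the
# list-based variant above, this function stacks all frames of a phoneme class
# into one (total_frames, n_bands) matrix, handy for frame-level training.
def _demo_dumpFeaturePho_mfcc():
    dic_feat = dumpFeaturePho(wav_path='/path/to/wavs',
                              textgrid_path='/path/to/textgrids',
                              recordings=[('artist_folder', 'recording_name')],
                              syllableTierName='dianSilence',
                              phonemeTierName='details')
    for pho, mat in dic_feat.items():
        if len(mat):
            print(pho, mat.shape)
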
def extract_log_mel_of_line(wav_path, textgrid_path, filename, num_line):
    """
    extract the log-mel features of one annotated line
    :param wav_path: path of the wavs
    :param textgrid_path: path of the textgrids
    :param filename: recording file name
    :param num_line: index of the line to extract
    :return: (log-mel frames of the line, phoneme list of the line)
    """
    nestedPhonemeLists, numlines, numPhonemes \
        = syllableTextgridExtraction(textgrid_path,
                                     filename,
                                     "line",
                                     "details")

    # audio filename
    wav_full_filename = os.path.join(wav_path, filename + '.wav')

    log_mel = getMFCCBandsMadmom(audio_fn=wav_full_filename, fs=fs, hopsize_t=hopsize_t)

    line_list = nestedPhonemeLists[num_line]

    sf = int(round(line_list[0][0] * fs / float(hopsize)))  # starting frame
    ef = int(round(line_list[0][1] * fs / float(hopsize)))  # ending frame

    return (log_mel[sf:ef], line_list[1])
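

# Hypothetical usage sketch (not part of the original source): the filename is
# a placeholder relative to wav_path/textgrid_path; num_line=0 takes the first
# annotated line.
def _demo_extract_log_mel_of_line():
    log_mel_line, phn_list = extract_log_mel_of_line(
        wav_path='/path/to/wavs',
        textgrid_path='/path/to/textgrids',
        filename='artist_folder/recording_name',
        num_line=0)
    print(log_mel_line.shape, len(phn_list))
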
def s_check(textgrid_path,
            recordings,
            parentTierName,
            childTierName):
    """
    sanity check: print the number of child intervals in each parent interval
    and verify that every phoneme label on the 'details' tier has an entry in
    dic_pho_map
    """

    for artist_path, recording in recordings:
        nestedLists, _, _ \
            = syllableTextgridExtraction(textgrid_path=textgrid_path,
                                         recording=os.path.join(artist_path, recording),
                                         tier0=parentTierName,
                                         tier1=childTierName)

        for ii, line_list in enumerate(nestedLists):
            print(artist_path, recording, ii, len(line_list[1]))

            if childTierName == 'details':
                for phn in line_list[1]:
                    try:
                        dic_pho_map[phn[2]]  # raises KeyError for unmapped labels
                    except KeyError:
                        print(artist_path, ii, recording, phn[2])
                        raise
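

# Hypothetical usage sketch (not part of the original source): tier names follow
# the other examples; s_check prints counts and raises on unmapped phoneme labels.
def _demo_s_check():
    s_check(textgrid_path='/path/to/textgrids',
            recordings=[('artist_folder', 'recording_name')],
            parentTierName='line',
            childTierName='details')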