Code Example #1
def data_parser(artist_path,
                wav_path,
                textgrid_path,
                rn,
                score_file,
                lab):
    """parse the wav filename, text grid and score"""

    if not lab:
        # ground truth text grid
        ground_truth_text_grid_file = join(textgrid_path, artist_path, rn + '.TextGrid')

        # wav
        wav_file = join(wav_path, artist_path, rn + '.wav')

        # parse line
        line_list = textGrid2WordList(ground_truth_text_grid_file, whichTier='line')

        # parse syllable
        syllable_list = textGrid2WordList(ground_truth_text_grid_file, whichTier='dianSilence')

        # parse lines of ground truth
        nested_syllable_lists, _, _ = wordListsParseByLines(line_list, syllable_list)

        # parse score
        syllables, pinyins, syllable_durations, bpm = csvScorePinyinParser(score_file)
    else:
        ground_truth_text_grid_file = join(textgrid_path, artist_path, rn + '.lab')
        wav_file = join(wav_path, artist_path, rn + '.mp3')
        line_list = [lab2WordList(ground_truth_text_grid_file, label=True)]
        syllables, syllable_durations, bpm = csvDurationScoreParser(score_file)
        nested_syllable_lists = None
        pinyins = None

    return nested_syllable_lists, wav_file, line_list, syllables, syllable_durations, bpm, pinyins
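
For context, the later examples index the nested line/syllable structure returned here as line[0] = (start, end, text) for the whole line and line[1] = a list of (start, end, label) syllable intervals. A minimal sketch with made-up toy values, assuming only that nesting, looks like this:

# Toy stand-in for the output of wordListsParseByLines: each entry pairs a
# line interval (start, end, text) with the list of its syllable intervals.
# The values are made up; only the nesting mirrors how the later examples index it.
nested_syllable_lists = [
    [(0.0, 2.0, 'line 1'), [(0.0, 0.75, 'syl a'), (0.75, 1.25, 'syl b'), (1.25, 2.0, 'syl c')]],
    [(2.0, 3.5, 'line 2'), [(2.0, 2.75, 'syl d'), (2.75, 3.5, 'syl e')]],
]

for line in nested_syllable_lists:
    time_line = line[0][1] - line[0][0]                 # line duration in seconds
    lyrics_line = [syl[2] for syl in line[1]]           # syllable labels
    onsets = [syl[0] - line[0][0] for syl in line[1]]   # onsets relative to line start
    print(time_line, lyrics_line, onsets)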
Code Example #2
def phoDurCollection(full_path_textgrids):
    '''
    collect the duration of each phoneme into a dictionary
    :param full_path_textgrids: list of TextGrid file paths
    :return: dict mapping phoneme symbol to a list of durations in seconds
    '''
    dict_duration_pho = {}
    for full_path_textgrid in full_path_textgrids:

        lineList = textGrid2WordList(full_path_textgrid, whichTier='dian')
        utteranceList = textGrid2WordList(full_path_textgrid,
                                          whichTier='details')

        # parse lines of groundtruth
        nestedPhonemeLists, _, _ = wordListsParseByLines(
            lineList, utteranceList)

        for pho in nestedPhonemeLists:
            for p in pho[1]:
                dur_pho = p[1] - p[0]
                sampa_pho = dic_pho_map[p[2]]

                if sampa_pho not in dict_duration_pho.keys():
                    dict_duration_pho[sampa_pho] = [dur_pho]
                else:
                    dict_duration_pho[sampa_pho].append(dur_pho)
    return dict_duration_pho
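
The same duration-collection loop can be written more compactly with collections.defaultdict. The sketch below is a minimal, self-contained version on toy phoneme intervals, with a made-up dic_pho_map stand-in since the real mapping is defined elsewhere:

from collections import defaultdict

# Made-up stand-in for dic_pho_map (annotated symbol -> readable notation),
# which is defined elsewhere in the project.
dic_pho_map = {'a': 'a', 'ts': 'c'}

# Toy nested phoneme lists: [line_interval, [(start, end, label), ...]]
nestedPhonemeLists = [
    [(0.0, 1.0, 'line'), [(0.0, 0.25, 'a'), (0.25, 0.5, 'ts'), (0.5, 1.0, 'a')]],
]

dict_duration_pho = defaultdict(list)
for pho in nestedPhonemeLists:
    for p in pho[1]:
        dict_duration_pho[dic_pho_map[p[2]]].append(p[1] - p[0])  # duration = end - start

print(dict(dict_duration_pho))  # {'a': [0.25, 0.5], 'c': [0.25]}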
Code Example #3
def parse_syllable_line_list(ground_truth_text_grid_file, parent_tier,
                             child_tier):

    if not os.path.isfile(ground_truth_text_grid_file):
        is_file_exist = False
        return False, is_file_exist, False
    else:
        is_file_exist = True

        # parse line
        line_list, _ = textGrid2WordList(ground_truth_text_grid_file,
                                         whichTier=parent_tier)

        # parse syllable
        syllable_list, is_syllable_found = textGrid2WordList(
            ground_truth_text_grid_file, whichTier=child_tier)

        # parse lines of ground truth
        nested_syllable_lists, _, _ = wordListsParseByLines(
            line_list, syllable_list)

        return nested_syllable_lists, is_file_exist, is_syllable_found
Code Example #4
def onsetFunctionAllRecordings(recordings,
                               textgrid_path,
                               dict_recording_name_mapping,
                               dataset_path,
                               feature_type='mfcc',
                               dmfcc=False,
                               nbf=True,
                               mth='jordi',
                               late_fusion=True):
    """
    ODF and Viterbi decoding
    :param recordings:
    :param textgrid_path:
    :param dict_recording_name_mapping: mapping from "fem_01" to standard format, see filePath.py
    :param dataset_path:
    :param feature_type: 'mfcc', 'mfccBands1D' or 'mfccBands2D'
    :param dmfcc: delta for 'mfcc'
    :param nbf: context frames
    :param mth: jordi, jordi_horizontal_timbral, jan, jan_chan3
    :param late_fusion: Bool
    :return:
    """

    scaler = pickle.load(open(full_path_mfccBands_2D_scaler_onset, 'rb'))

    # kerasModel = _LRHMM.kerasModel(full_path_keras_cnn_am)

    for i_recording, recording_name in enumerate(recordings):

        groundtruth_textgrid_file   = join(textgrid_path, dict_recording_name_mapping[recording_name]+'.TextGrid')
        score_file                  = join(aCapella_root, dataset_path, score_path,      recording_name+'.csv')
        wav_file                    = join(aCapella_root, dataset_path, audio_path,      recording_name+'.wav')

        if not isfile(score_file):
            print('Score not found: ' + score_file)
            continue

        lineList        = textGrid2WordList(groundtruth_textgrid_file, whichTier='line')
        utteranceList   = textGrid2WordList(groundtruth_textgrid_file, whichTier='dianSilence')

        # parse lines of groundtruth
        nestedUtteranceLists, numLines, numUtterances = wordListsParseByLines(lineList, utteranceList)

        # parse score
        syllables, pinyins, syllable_durations, bpm = generatePinyin(score_file)

        # print(pinyins)
        # print(syllable_durations)

        if varin['obs'] == 'tocal':
            # load audio
            audio_monoloader               = ess.MonoLoader(downmix = 'left', filename = wav_file, sampleRate = fs)()
            audio_eqloudloder              = ess.EqloudLoader(filename=wav_file, sampleRate = fs)()

            if mth == 'jordi' or mth == 'jordi_horizontal_timbral' or mth == 'jan':
                mfcc, mfcc_reshaped = featureExtraction(audio_monoloader,
                                                              scaler,
                                                              int(round(0.025 * fs)),
                                                              dmfcc=dmfcc,
                                                              nbf=nbf,
                                                              feature_type='mfccBands2D')

        for i_obs, lineList in enumerate(nestedUtteranceLists):
            if int(bpm[i_obs]):
                sample_start    = int(round(lineList[0][0] * fs))
                sample_end      = int(round(lineList[0][1] * fs))
                frame_start     = int(round(lineList[0][0] * fs / hopsize))
                frame_end       = int(round(lineList[0][1] * fs / hopsize))
                # print(feature.shape)

                obs_path = join('./obs', cnnModel_name, dataset_path)
                obs_filename = recording_name + '_' + str(i_obs + 1) + '.pkl'
                full_obs_name = join(obs_path, obs_filename)

                if varin['obs'] == 'tocal':
                    if mth == 'jordi' or mth == 'jordi_horizontal_timbral' or mth == 'jan':
                        audio_eqloudloder_line = audio_eqloudloder[sample_start:sample_end]
                        mfcc_line          = mfcc[frame_start:frame_end]
                        mfcc_reshaped_line = mfcc_reshaped[frame_start:frame_end]

                    mfcc_reshaped_line = np.expand_dims(mfcc_reshaped_line, axis=1)
                    obs     = getOnsetFunction(observations=mfcc_reshaped_line,
                                               model=model_keras_cnn_0,
                                               method=mth)
                    # obs_i   = obs[:,1]
                    obs_i = obs[:, 0]

                    hann = np.hanning(5)
                    hann /= np.sum(hann)

                    obs_i = np.convolve(hann, obs_i, mode='same')

                    # save onset curve
                    print('save onset curve ... ...')
                    obs_dirpath = dirname(full_obs_name)
                    if not exists(obs_dirpath):
                        makedirs(obs_dirpath)
                    pickle.dump(obs_i, open(full_obs_name, 'wb'))
                else:
                    obs_i = pickle.load(open(full_obs_name, 'rb'))

                if late_fusion:
                    if varin['obs'] == 'viterbi':
                        obs_2 = getOnsetFunction(observations=mfcc_reshaped_line,
                                                 path_keras_cnn=full_path_keras_cnn_1,
                                                 method=mth)
                        obs_2_i = obs_2[:, 1]
                        obs_2_i = np.convolve(hann, obs_2_i, mode='same')
                    else:
                        obs_path_1 = join('./obs', cnnModel_name_1, dataset_path)
                        full_obs_name_1 = join(obs_path_1, obs_filename)
                        obs_2_i = pickle.load(open(full_obs_name_1, 'rb'))

                    obs_i = late_fusion_calc(obs_i, obs_2_i, mth=2)

                # organize score
                print('Calculating: '+recording_name+' phrase '+str(i_obs))
                print('ODF Methods: '+mth_ODF+' Late fusion: '+str(fusion))

                time_line      = lineList[0][1] - lineList[0][0]

                lyrics_line    = [ll[2] for ll in lineList[1]]
                groundtruth_syllable = [ll[0]-lineList[0][0] for ll in lineList[1]]

                print('Syllable:')
                print(lyrics_line)

                print('Length of syllables, length of ground truth syllables:')
                print(len(lyrics_line), len(groundtruth_syllable))

                pinyin_score   = pinyins[i_obs]
                pinyin_score   = [ps for ps in pinyin_score if len(ps)]
                duration_score = syllable_durations[i_obs]
                duration_score = np.array([float(ds) for ds in duration_score if len(ds)])
                duration_score = duration_score * (time_line/np.sum(duration_score))

                if varin['decoding'] == 'viterbi':
                    # segmental decoding
                    obs_i[0] = 1.0
                    obs_i[-1] = 1.0
                    i_boundary = viterbiSegmental2(obs_i, duration_score, varin)
                    # # uncomment this section if we want to write boundaries to .syll.lab file
                    filename_syll_lab = join(eval_results_path, dataset_path, recording_name+'_'+str(i_obs+1)+'.syll.lab')
                    label = True

                else:
                    i_boundary = peakPicking(1.0-obs_i)

                    # arg_pp = {'threshold': 0.54, 'smooth': 0, 'fps': 1. / hopsize_t, 'pre_max': hopsize_t,
                    #           'post_max': hopsize_t}
                    # # peak_picking = OnsetPeakPickingProcessor(threshold=threshold,smooth=smooth,fps=fps,pre_max=pre_max,post_max=post_max)
                    # peak_picking = OnsetPeakPickingProcessor(**arg_pp)
                    # i_boundary = peak_picking.process(obs_i)
                    # i_boundary = np.append(i_boundary, (len(obs_i) - 1) * hopsize_t)
                    # i_boundary /= hopsize_t
                    filename_syll_lab = join(eval_results_path + '_peakPicking', dataset_path,
                                             recording_name + '_' + str(i_obs + 1) + '.syll.lab')
                    label = False

                time_boundray_start = np.array(i_boundary[:-1]) * hopsize_t
                time_boundray_end = np.array(i_boundary[1:]) * hopsize_t

                eval_results_data_path = dirname(filename_syll_lab)

                if not exists(eval_results_data_path):
                    makedirs(eval_results_data_path)

                if varin['decoding'] == 'viterbi':
                    boundaryList = zip(time_boundray_start.tolist(), time_boundray_end.tolist(), lyrics_line)
                else:
                    boundaryList = zip(time_boundray_start.tolist(), time_boundray_end.tolist())

                # write boundary lab file
                boundaryLabWriter(boundaryList=boundaryList,
                                  outputFilename=filename_syll_lab,
                                    label=label)

                # print(i_boundary)
                # print(len(obs_i))
                # print(np.array(groundtruth_syllable)*fs/hopsize)

                if varin['plot']:
                    # plot Error analysis figures
                    plt.figure(figsize=(16, 6))
                    # plt.figure(figsize=(8, 4))
                    # class weight
                    ax1 = plt.subplot(3,1,1)
                    y = np.arange(0, 80)
                    x = np.arange(0, mfcc_line.shape[0])*(hopsize/float(fs))
                    cax = plt.pcolormesh(x, y, np.transpose(mfcc_line[:, 80 * 11:80 * 12]))
                    for gs in groundtruth_syllable:
                        plt.axvline(gs, color='r', linewidth=2)
                    # cbar = fig.colorbar(cax)
                    ax1.set_ylabel('Mel bands', fontsize=12)
                    ax1.get_xaxis().set_visible(False)
                    ax1.axis('tight')
                    plt.title('Calculating: '+recording_name+' phrase '+str(i_obs))

                    ax2 = plt.subplot(312, sharex=ax1)
                    plt.plot(np.arange(0,len(obs_i))*(hopsize/float(fs)), obs_i)
                    for ib in i_boundary:
                        plt.axvline(ib * (hopsize / float(fs)), color='r', linewidth=2)

                    ax2.set_ylabel('ODF', fontsize=12)
                    ax2.axis('tight')


                    ax3 = plt.subplot(313, sharex=ax1)
                    print(duration_score)
                    time_start = 0
                    for ii_ds, ds in enumerate(duration_score):
                        ax3.add_patch(
                            patches.Rectangle(
                                (time_start, ii_ds),  # (x,y)
                                ds,  # width
                                1,  # height
                            ))
                        time_start += ds
                    ax3.set_ylim((0,len(duration_score)))
                    # plt.xlabel('Time (s)')
                    # plt.tight_layout()

                    plt.show()
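
The post-processing in this example amounts to smoothing the raw onset detection function (ODF) with a normalized 5-point Hanning window, picking boundary frames, and converting frame indices to seconds via hopsize / fs. Below is a self-contained sketch of that pipeline on a synthetic curve; the simple local-maximum picker is only a stand-in for peakPicking / viterbiSegmental2, and the fs and hopsize values are assumptions:

import numpy as np

fs = 44100          # assumed sample rate
hopsize = 512       # assumed hop size in samples
hopsize_t = hopsize / float(fs)

# synthetic onset detection function with two clear peaks
obs_i = np.zeros(200)
obs_i[50] = 1.0
obs_i[120] = 0.8

# smooth with a normalized 5-point Hanning window, as in the example above
hann = np.hanning(5)
hann /= np.sum(hann)
obs_i = np.convolve(hann, obs_i, mode='same')

# simplified local-maximum picking (stand-in for peakPicking / viterbiSegmental2)
threshold = 0.1
is_peak = (obs_i[1:-1] > obs_i[:-2]) & (obs_i[1:-1] >= obs_i[2:]) & (obs_i[1:-1] > threshold)
i_boundary = np.where(is_peak)[0] + 1

# convert boundary frame indices to seconds
time_boundary = i_boundary * hopsize_t
print(i_boundary, time_boundary)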
Code Example #5
def dumpFeatureOnset(recordings,
                     dataset_path,
                     feature_type='mfcc',
                     dmfcc=True,
                     nbf=False):
    '''
    dump MFCC features and sample weights for onset / non-onset frames
    :param recordings: list of recording names
    :param dataset_path: dataset sub-path under aCapella_root
    :return: positive features, negative features and their sample weights
    '''

    # p: positive, n: negative, 75: 0.75 sample weight
    mfcc_p_all = []
    mfcc_n_all = []
    sample_weights_p_all = []
    sample_weights_n_all = []

    for i_recording, recording_name in enumerate(recordings):
        groundtruth_textgrid_file = os.path.join(aCapella_root, dataset_path,
                                                 annotation_path,
                                                 recording_name + '.TextGrid')
        score_file = os.path.join(aCapella_root, dataset_path, score_path,
                                  recording_name + '.csv')
        wav_file = os.path.join(aCapella_root, dataset_path, audio_path,
                                recording_name + '.wav')

        if not os.path.isfile(score_file):
            print('Score not found: ' + score_file)
            continue

        lineList = textGrid2WordList(groundtruth_textgrid_file,
                                     whichTier='line')
        utteranceList = textGrid2WordList(groundtruth_textgrid_file,
                                          whichTier='dianSilence')

        # parse lines of groundtruth
        nestedUtteranceLists, numLines, numUtterances = wordListsParseByLines(
            lineList, utteranceList)

        # parse score
        utterance_durations, bpm = csvDurationScoreParser(score_file)

        # load audio
        audio = ess.MonoLoader(downmix='left',
                               filename=wav_file,
                               sampleRate=fs)()

        if feature_type == 'mfcc':
            # MFCC feature
            mfcc = getFeature(audio, d=dmfcc, nbf=nbf)
        elif feature_type == 'mfccBands1D':
            mfcc = getMFCCBands1D(audio, nbf=nbf)
            mfcc = np.log(100000 * mfcc + 1)
        elif feature_type == 'mfccBands2D':
            mfcc = getMFCCBands2D(audio,
                                  framesize,
                                  nbf=nbf,
                                  nlen=varin['nlen'])
            mfcc = np.log(100000 * mfcc + 1)
        else:
            raise ValueError(feature_type + ' does not exist.')

        # create the ground truth lab files
        for idx, u_list in enumerate(nestedUtteranceLists):
            if int(bpm[idx]):
                print('Processing feature collection ... ' + recording_name + ' phrase ' + str(idx + 1))

                times_onset = [u[0] for u in u_list[1]]
                # syllable onset frames
                frames_onset = np.array(np.around(
                    np.array(times_onset) * fs / hopsize),
                                        dtype=int)

                # line start and end frames
                frame_start = frames_onset[0]
                frame_end = int(u_list[0][1] * fs / hopsize)

                frames_onset_p75 = np.hstack(
                    (frames_onset - 1, frames_onset + 1))
                frames_onset_p50 = np.hstack(
                    (frames_onset - 2, frames_onset + 2))
                frames_onset_p25 = np.hstack(
                    (frames_onset - 3, frames_onset + 3))

                frames_onset_p75 = removeOutOfRange(frames_onset_p75,
                                                    frame_start, frame_end)
                frames_onset_p50 = removeOutOfRange(frames_onset_p50,
                                                    frame_start, frame_end)
                frames_onset_p25 = removeOutOfRange(frames_onset_p25,
                                                    frame_start, frame_end)
                # print(frames_onset_p75, frames_onset_p50, frames_onset_p25)

                # mfcc positive
                mfcc_p100 = mfcc[frames_onset, :]
                mfcc_p75 = mfcc[frames_onset_p75, :]
                mfcc_p50 = mfcc[frames_onset_p50, :]
                mfcc_p25 = mfcc[frames_onset_p25, :]

                # print(mfcc_p100.shape, mfcc_p75.shape, mfcc_p50.shape)

                frames_n25 = np.hstack((frames_onset - 4, frames_onset + 4))
                frames_n50 = np.hstack((frames_onset - 5, frames_onset + 5))
                frames_n75 = np.hstack((frames_onset - 6, frames_onset + 6))

                frames_n25 = removeOutOfRange(frames_n25, frame_start,
                                              frame_end)
                frames_n50 = removeOutOfRange(frames_n50, frame_start,
                                              frame_end)
                frames_n75 = removeOutOfRange(frames_n75, frame_start,
                                              frame_end)

                # mfcc negative
                mfcc_n25 = mfcc[frames_n25, :]
                mfcc_n50 = mfcc[frames_n50, :]
                mfcc_n75 = mfcc[frames_n75, :]

                frames_all = np.arange(frame_start, frame_end)
                frames_n100 = np.setdiff1d(
                    frames_all,
                    np.hstack((frames_onset, frames_onset_p75,
                               frames_onset_p50, frames_onset_p25, frames_n25,
                               frames_n50, frames_n75)))
                # print(frames_n100.shape, frames_all.shape)
                mfcc_n100 = mfcc[frames_n100, :]

                mfcc_p = np.concatenate(
                    (mfcc_p100, mfcc_p75, mfcc_p50, mfcc_p25), axis=0)
                sample_weights_p = np.concatenate(
                    (np.ones(
                        (mfcc_p100.shape[0], )), np.ones(
                            (mfcc_p75.shape[0], )) * 0.75,
                     np.ones((mfcc_p50.shape[0], )) * 0.5,
                     np.ones((mfcc_p25.shape[0], )) * 0.25))
                # print(sample_weights_p)
                # print(mfcc_p.shape)

                mfcc_n = np.concatenate(
                    (mfcc_n100, mfcc_n75, mfcc_n50, mfcc_n25), axis=0)
                sample_weights_n = np.concatenate(
                    (np.ones(
                        (mfcc_n100.shape[0], )), np.ones(
                            (mfcc_n75.shape[0], )) * 0.75,
                     np.ones((mfcc_n50.shape[0], )) * 0.5,
                     np.ones((mfcc_n25.shape[0], )) * 0.25))

                mfcc_p_all.append(mfcc_p)
                mfcc_n_all.append(mfcc_n)
                sample_weights_p_all.append(sample_weights_p)
                sample_weights_n_all.append(sample_weights_n)

                # print(len(mfcc_p_all), len(mfcc_n_all), len(sample_weights_p_all), len(sample_weights_n_all))

    return np.concatenate(mfcc_p_all), \
           np.concatenate(mfcc_n_all), \
           np.concatenate(sample_weights_p_all), \
           np.concatenate(sample_weights_n_all)
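
The sampling scheme above treats the annotated onset frames as full-weight positives, frames at +-1/+-2/+-3 as positives weighted 0.75/0.5/0.25, frames at +-4/+-5/+-6 as down-weighted negatives, and all remaining frames in the line as full-weight negatives. The toy sketch below reproduces that bookkeeping; remove_out_of_range is a guessed re-implementation of removeOutOfRange (keep only frames inside the line), whose definition is not shown here:

import numpy as np

def remove_out_of_range(frames, frame_start, frame_end):
    # guessed behaviour of removeOutOfRange: keep only frames inside the line
    return frames[(frames >= frame_start) & (frames <= frame_end)]

frames_onset = np.array([12, 40, 73])   # toy syllable onset frames
frame_start, frame_end = 10, 100        # toy line boundaries, in frames

# positives: frames at +-1/2/3 of the true onsets, with decreasing sample weight
frames_p75 = remove_out_of_range(np.hstack((frames_onset - 1, frames_onset + 1)), frame_start, frame_end)
frames_p50 = remove_out_of_range(np.hstack((frames_onset - 2, frames_onset + 2)), frame_start, frame_end)
frames_p25 = remove_out_of_range(np.hstack((frames_onset - 3, frames_onset + 3)), frame_start, frame_end)

# near-miss negatives at +-4 frames (the +-5/+-6 sets follow the same pattern),
# then every remaining frame of the line as a weight-1 negative
frames_n25 = remove_out_of_range(np.hstack((frames_onset - 4, frames_onset + 4)), frame_start, frame_end)
frames_n100 = np.setdiff1d(np.arange(frame_start, frame_end),
                           np.hstack((frames_onset, frames_p75, frames_p50, frames_p25, frames_n25)))

# per-frame sample weights for the positive set: 1.0, 0.75, 0.5, 0.25
sample_weights_p = np.concatenate((np.ones(len(frames_onset)),
                                   np.ones(len(frames_p75)) * 0.75,
                                   np.ones(len(frames_p50)) * 0.5,
                                   np.ones(len(frames_p25)) * 0.25))

print(len(frames_n100), sample_weights_p)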
Code Example #6
def dumpFeaturePhoneme(full_path_recordings,
                       full_path_textgrids,
                       syllableTierName,
                       phonemeTierName,
                       feature_type='mfcc',
                       dmfcc=True,
                       nbf=False):
    '''
    dump the MFCC features of each phoneme
    :param full_path_recordings: list of audio file paths
    :param full_path_textgrids: list of TextGrid file paths
    :return: dict mapping phoneme symbol to its stacked feature frames
    '''

    ##-- dictionary feature
    dic_pho_feature = {}

    for _, pho in enumerate(set(dic_pho_map.values())):
        dic_pho_feature[pho] = np.array([])

    for ii_rec, recording in enumerate(full_path_recordings):

        lineList = textGrid2WordList(full_path_textgrids[ii_rec],
                                     whichTier=syllableTierName)
        utteranceList = textGrid2WordList(full_path_textgrids[ii_rec],
                                          whichTier=phonemeTierName)

        # parse lines of groundtruth
        nestedPhonemeLists, _, _ = wordListsParseByLines(
            lineList, utteranceList)

        # audio
        wav_full_filename = recording
        audio = ess.MonoLoader(downmix='left',
                               filename=wav_full_filename,
                               sampleRate=fs)()

        if feature_type == 'mfcc':
            # MFCC feature
            mfcc = getFeature(audio, d=dmfcc, nbf=nbf)
        elif feature_type == 'mfccBands1D':
            mfcc = getMFCCBands1D(audio, nbf=nbf)
            mfcc = np.log(100000 * mfcc + 1)
        elif feature_type == 'mfccBands2D':
            mfcc = getMFCCBands2D(audio,
                                  framesize,
                                  nbf=nbf,
                                  nlen=varin['nlen'])
            mfcc = np.log(100000 * mfcc + 1)
        else:
            raise ValueError(feature_type + ' does not exist.')

        for ii, pho in enumerate(nestedPhonemeLists):
            print('calculating ' + recording + ', phoneme ' + str(ii) + ' of ' + str(len(nestedPhonemeLists)))
            for p in pho[1]:
                # map from annotated xsampa to readable notation
                key = dic_pho_map[p[2]]

                sf = int(round(p[0] * fs / float(hopsize)))  # starting frame
                ef = int(round(p[1] * fs / float(hopsize)))  # ending frame

                mfcc_p = mfcc[sf:ef, :]  # phoneme syllable

                if not len(dic_pho_feature[key]):
                    dic_pho_feature[key] = mfcc_p
                else:
                    dic_pho_feature[key] = np.vstack(
                        (dic_pho_feature[key], mfcc_p))

    return dic_pho_feature
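
The inner loop above converts each phoneme's start and end times to frame indices with fs / hopsize, slices those rows out of the feature matrix, and stacks them into a per-phoneme dictionary. A minimal, self-contained sketch of that time-to-frame slicing, using a random feature matrix and assumed fs / hopsize values:

import numpy as np

fs = 44100       # assumed sample rate
hopsize = 512    # assumed hop size in samples

# stand-in feature matrix: one 80-dimensional frame per hop
mfcc = np.random.rand(500, 80)

dic_pho_feature = {}
# toy phoneme intervals: (start in seconds, end in seconds, readable label)
for start_s, end_s, key in [(0.10, 0.32, 'a'), (0.32, 0.55, 'c'), (1.20, 1.48, 'a')]:
    sf = int(round(start_s * fs / float(hopsize)))   # starting frame
    ef = int(round(end_s * fs / float(hopsize)))     # ending frame
    mfcc_p = mfcc[sf:ef, :]                          # frames belonging to this phoneme

    if key not in dic_pho_feature:
        dic_pho_feature[key] = mfcc_p
    else:
        dic_pho_feature[key] = np.vstack((dic_pho_feature[key], mfcc_p))

for key, feats in dic_pho_feature.items():
    print(key, feats.shape)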