def data_parser(artist_path, wav_path, textgrid_path, rn, score_file, lab):
    """Parse the wav filename, TextGrid annotation and score."""
    if not lab:
        # ground truth TextGrid
        ground_truth_text_grid_file = join(textgrid_path, artist_path, rn + '.TextGrid')
        # wav
        wav_file = join(wav_path, artist_path, rn + '.wav')
        # parse line tier
        line_list = textGrid2WordList(ground_truth_text_grid_file, whichTier='line')
        # parse syllable tier
        syllable_list = textGrid2WordList(ground_truth_text_grid_file, whichTier='dianSilence')
        # parse lines of ground truth
        nested_syllable_lists, _, _ = wordListsParseByLines(line_list, syllable_list)
        # parse score
        syllables, pinyins, syllable_durations, bpm = csvScorePinyinParser(score_file)
    else:
        ground_truth_text_grid_file = join(textgrid_path, artist_path, rn + '.lab')
        wav_file = join(wav_path, artist_path, rn + '.mp3')
        line_list = [lab2WordList(ground_truth_text_grid_file, label=True)]
        syllables, syllable_durations, bpm = csvDurationScoreParser(score_file)
        nested_syllable_lists = None
        pinyins = None

    return nested_syllable_lists, wav_file, line_list, syllables, syllable_durations, bpm, pinyins

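
# Hypothetical usage sketch for data_parser (the artist name, recording name and root
# directories below are placeholders, not files shipped with this code): parse the
# TextGrid annotation and pinyin score of one recording in TextGrid mode (lab=False).
# nested_lists, wav_file, line_list, syllables, syllable_durations, bpm, pinyins = \
#     data_parser(artist_path='danAll',
#                 wav_path='/path/to/wav',
#                 textgrid_path='/path/to/textgrid',
#                 rn='recording_01',
#                 score_file='/path/to/score.csv',
#                 lab=False)
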
def phoDurCollection(full_path_textgrids):
    """
    Collect phoneme durations into a dictionary.
    :param full_path_textgrids: full paths of the TextGrid annotations
    :return: dictionary mapping SAMPA phoneme -> list of durations (seconds)
    """
    dict_duration_pho = {}
    for full_path_textgrid in full_path_textgrids:
        lineList = textGrid2WordList(full_path_textgrid, whichTier='dian')
        utteranceList = textGrid2WordList(full_path_textgrid, whichTier='details')

        # parse lines of ground truth
        nestedPhonemeLists, _, _ = wordListsParseByLines(lineList, utteranceList)

        for pho in nestedPhonemeLists:
            for p in pho[1]:
                dur_pho = p[1] - p[0]
                sampa_pho = dic_pho_map[p[2]]
                if sampa_pho not in dict_duration_pho:
                    dict_duration_pho[sampa_pho] = [dur_pho]
                else:
                    dict_duration_pho[sampa_pho].append(dur_pho)

    return dict_duration_pho

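
# Illustrative helper (not part of the original pipeline): summarise the duration
# dictionary returned by phoDurCollection into per-phoneme mean and standard deviation,
# e.g. as input for a duration model. Assumes numpy is imported as np, as elsewhere in
# this module.
def phoDurStatistics(dict_duration_pho):
    """Return {phoneme: (mean duration, std duration, count)} with durations in seconds."""
    dict_stats = {}
    for sampa_pho, durations in dict_duration_pho.items():
        durations = np.array(durations)
        dict_stats[sampa_pho] = (np.mean(durations), np.std(durations), len(durations))
    return dict_stats
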
def parse_syllable_line_list(ground_truth_text_grid_file, parent_tier, child_tier):
    if not os.path.isfile(ground_truth_text_grid_file):
        is_file_exist = False
        return False, is_file_exist, False
    else:
        is_file_exist = True

    # parse line tier
    line_list, _ = textGrid2WordList(ground_truth_text_grid_file, whichTier=parent_tier)
    # parse syllable tier
    syllable_list, is_syllable_found = textGrid2WordList(ground_truth_text_grid_file, whichTier=child_tier)
    # parse lines of ground truth
    nested_syllable_lists, _, _ = wordListsParseByLines(line_list, syllable_list)

    return nested_syllable_lists, is_file_exist, is_syllable_found

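
# Hypothetical usage sketch for parse_syllable_line_list (the path and tier names are
# placeholders): when the TextGrid is missing, the function returns early with
# (False, False, False), so the caller should check is_file_exist before using the lists.
# nested_lists, is_file_exist, is_syllable_found = \
#     parse_syllable_line_list('/path/to/recording.TextGrid',
#                              parent_tier='line',
#                              child_tier='dianSilence')
# if is_file_exist and is_syllable_found:
#     for line in nested_lists:
#         pass  # line[0] is the line interval, line[1] the list of syllable intervals
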
def onsetFunctionAllRecordings(recordings,
                               textgrid_path,
                               dict_recording_name_mapping,
                               dataset_path,
                               feature_type='mfcc',
                               dmfcc=False,
                               nbf=True,
                               mth='jordi',
                               late_fusion=True):
    """
    ODF calculation and viterbi decoding.
    :param recordings:
    :param textgrid_path:
    :param dict_recording_name_mapping: mapping from "fem_01" to the standard format, see filePath.py
    :param dataset_path:
    :param feature_type: 'mfcc', 'mfccBands1D' or 'mfccBands2D'
    :param dmfcc: delta for 'mfcc'
    :param nbf: context frames
    :param mth: 'jordi', 'jordi_horizontal_timbral', 'jan' or 'jan_chan3'
    :param late_fusion: bool
    :return:
    """
    scaler = pickle.load(open(full_path_mfccBands_2D_scaler_onset, 'rb'))
    # kerasModel = _LRHMM.kerasModel(full_path_keras_cnn_am)

    for i_recording, recording_name in enumerate(recordings):

        groundtruth_textgrid_file = join(textgrid_path, dict_recording_name_mapping[recording_name] + '.TextGrid')
        score_file = join(aCapella_root, dataset_path, score_path, recording_name + '.csv')
        wav_file = join(aCapella_root, dataset_path, audio_path, recording_name + '.wav')

        if not isfile(score_file):
            print('Score not found: ' + score_file)
            continue

        lineList = textGrid2WordList(groundtruth_textgrid_file, whichTier='line')
        utteranceList = textGrid2WordList(groundtruth_textgrid_file, whichTier='dianSilence')

        # parse lines of ground truth
        nestedUtteranceLists, numLines, numUtterances = wordListsParseByLines(lineList, utteranceList)

        # parse score
        syllables, pinyins, syllable_durations, bpm = generatePinyin(score_file)
        # print(pinyins)
        # print(syllable_durations)

        if varin['obs'] == 'tocal':
            # load audio
            audio_monoloader = ess.MonoLoader(downmix='left', filename=wav_file, sampleRate=fs)()
            audio_eqloudloder = ess.EqloudLoader(filename=wav_file, sampleRate=fs)()

            if mth == 'jordi' or mth == 'jordi_horizontal_timbral' or mth == 'jan':
                mfcc, mfcc_reshaped = featureExtraction(audio_monoloader,
                                                        scaler,
                                                        int(round(0.025 * fs)),
                                                        dmfcc=dmfcc,
                                                        nbf=nbf,
                                                        feature_type='mfccBands2D')

        for i_obs, lineList in enumerate(nestedUtteranceLists):
            if int(bpm[i_obs]):
                sample_start = int(round(lineList[0][0] * fs))
                sample_end = int(round(lineList[0][1] * fs))
                frame_start = int(round(lineList[0][0] * fs / hopsize))
                frame_end = int(round(lineList[0][1] * fs / hopsize))
                # print(feature.shape)

                obs_path = join('./obs', cnnModel_name, dataset_path)
                obs_filename = recording_name + '_' + str(i_obs + 1) + '.pkl'
                full_obs_name = join(obs_path, obs_filename)

                if varin['obs'] == 'tocal':
                    if mth == 'jordi' or mth == 'jordi_horizontal_timbral' or mth == 'jan':
                        audio_eqloudloder_line = audio_eqloudloder[sample_start:sample_end]
                        mfcc_line = mfcc[frame_start:frame_end]
                        mfcc_reshaped_line = mfcc_reshaped[frame_start:frame_end]

                    mfcc_reshaped_line = np.expand_dims(mfcc_reshaped_line, axis=1)
                    obs = getOnsetFunction(observations=mfcc_reshaped_line,
                                           model=model_keras_cnn_0,
                                           method=mth)
                    # obs_i = obs[:, 1]
                    obs_i = obs[:, 0]

                    # smooth the onset curve with a normalised Hann window
                    hann = np.hanning(5)
                    hann /= np.sum(hann)
                    obs_i = np.convolve(hann, obs_i, mode='same')

                    # save onset curve
                    print('save onset curve ...')
                    obs_dirpath = dirname(full_obs_name)
                    if not exists(obs_dirpath):
                        makedirs(obs_dirpath)
                    pickle.dump(obs_i, open(full_obs_name, 'w'))
                else:
                    obs_i = pickle.load(open(full_obs_name, 'r'))

                if late_fusion:
                    if varin['obs'] == 'viterbi':
                        obs_2 = getOnsetFunction(observations=mfcc_reshaped_line,
                                                 path_keras_cnn=full_path_keras_cnn_1,
                                                 method=mth)
                        obs_2_i = obs_2[:, 1]
                        obs_2_i = np.convolve(hann, obs_2_i, mode='same')
                    else:
                        obs_path_1 = join('./obs', cnnModel_name_1, dataset_path)
                        full_obs_name_1 = join(obs_path_1, obs_filename)
                        obs_2_i = pickle.load(open(full_obs_name_1, 'r'))

                    obs_i = late_fusion_calc(obs_i, obs_2_i, mth=2)

                # organize score
                print('Calculating: ' + recording_name + ' phrase ' + str(i_obs))
                print('ODF Methods: ' + mth_ODF + ' Late fusion: ' + str(fusion))

                time_line = lineList[0][1] - lineList[0][0]
                lyrics_line = [ll[2] for ll in lineList[1]]
                groundtruth_syllable = [ll[0] - lineList[0][0] for ll in lineList[1]]

                print('Syllable:')
                print(lyrics_line)
                print('Length of syllables, length of ground truth syllables:')
                print(len(lyrics_line), len(groundtruth_syllable))

                pinyin_score = pinyins[i_obs]
                pinyin_score = [ps for ps in pinyin_score if len(ps)]
                duration_score = syllable_durations[i_obs]
                duration_score = np.array([float(ds) for ds in duration_score if len(ds)])
                duration_score = duration_score * (time_line / np.sum(duration_score))

                if varin['decoding'] == 'viterbi':
                    # segmental decoding
                    obs_i[0] = 1.0
                    obs_i[-1] = 1.0
                    i_boundary = viterbiSegmental2(obs_i, duration_score, varin)
                    # uncomment this section if we want to write boundaries to .syll.lab file
                    filename_syll_lab = join(eval_results_path, dataset_path, recording_name + '_' + str(i_obs + 1) + '.syll.lab')
                    label = True
                else:
                    i_boundary = peakPicking(1.0 - obs_i)
                    # arg_pp = {'threshold': 0.54, 'smooth': 0, 'fps': 1. / hopsize_t,
                    #           'pre_max': hopsize_t, 'post_max': hopsize_t}
                    # peak_picking = OnsetPeakPickingProcessor(**arg_pp)
                    # i_boundary = peak_picking.process(obs_i)
                    # i_boundary = np.append(i_boundary, (len(obs_i) - 1) * hopsize_t)
                    # i_boundary /= hopsize_t
                    filename_syll_lab = join(eval_results_path + '_peakPicking', dataset_path, recording_name + '_' + str(i_obs + 1) + '.syll.lab')
                    label = False

                time_boundray_start = np.array(i_boundary[:-1]) * hopsize_t
                time_boundray_end = np.array(i_boundary[1:]) * hopsize_t

                eval_results_data_path = dirname(filename_syll_lab)
                if not exists(eval_results_data_path):
                    makedirs(eval_results_data_path)

                if varin['decoding'] == 'viterbi':
                    boundaryList = zip(time_boundray_start.tolist(), time_boundray_end.tolist(), lyrics_line)
                else:
                    boundaryList = zip(time_boundray_start.tolist(), time_boundray_end.tolist())

                # write boundary lab file
                boundaryLabWriter(boundaryList=boundaryList,
                                  outputFilename=filename_syll_lab,
                                  label=label)

                # print(i_boundary)
                # print(len(obs_i))
                # print(np.array(groundtruth_syllable) * fs / hopsize)

                if varin['plot']:
                    # plot error analysis figures
                    plt.figure(figsize=(16, 6))
                    # plt.figure(figsize=(8, 4))

                    # Mel-band features with ground truth syllable onsets
                    ax1 = plt.subplot(3, 1, 1)
                    y = np.arange(0, 80)
                    x = np.arange(0, mfcc_line.shape[0]) * (hopsize / float(fs))
                    cax = plt.pcolormesh(x, y, np.transpose(mfcc_line[:, 80 * 11:80 * 12]))
                    for gs in groundtruth_syllable:
                        plt.axvline(gs, color='r', linewidth=2)
                    # cbar = fig.colorbar(cax)
                    ax1.set_ylabel('Mel bands', fontsize=12)
                    ax1.get_xaxis().set_visible(False)
                    ax1.axis('tight')
                    plt.title('Calculating: ' + recording_name + ' phrase ' + str(i_obs))

                    # onset detection function with decoded boundaries
                    ax2 = plt.subplot(312, sharex=ax1)
                    plt.plot(np.arange(0, len(obs_i)) * (hopsize / float(fs)), obs_i)
                    for ib in i_boundary:
                        plt.axvline(ib * (hopsize / float(fs)), color='r', linewidth=2)
                    ax2.set_ylabel('ODF', fontsize=12)
                    ax2.axis('tight')

                    # score syllable durations
                    ax3 = plt.subplot(313, sharex=ax1)
                    print(duration_score)
                    time_start = 0
                    for ii_ds, ds in enumerate(duration_score):
                        ax3.add_patch(
                            patches.Rectangle(
                                (time_start, ii_ds),  # (x, y)
                                ds,                   # width
                                1,                    # height
                            ))
                        time_start += ds
                    ax3.set_ylim((0, len(duration_score)))
                    # plt.xlabel('Time (s)')
                    # plt.tight_layout()
                    plt.show()

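
# Minimal standalone sketch (illustration only, not part of the original code) of the
# ODF post-processing step used above: the raw onset curve is smoothed with a
# normalised Hann window so the curve keeps its overall scale. Assumes numpy is
# imported as np, as elsewhere in this module.
def smoothObsHann(obs_i, win_len=5):
    """Smooth a 1-D onset detection function with a unit-sum Hann window."""
    hann = np.hanning(win_len)
    hann /= np.sum(hann)
    return np.convolve(hann, obs_i, mode='same')
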
def dumpFeatureOnset(recordings, dataset_path, feature_type='mfcc', dmfcc=True, nbf=False):
    """
    Dump the onset (positive) and non-onset (negative) feature frames and their sample weights.
    :param recordings:
    :param dataset_path:
    :param feature_type: 'mfcc', 'mfccBands1D' or 'mfccBands2D'
    :param dmfcc: delta for 'mfcc'
    :param nbf: context frames
    :return: positive features, negative features, positive sample weights, negative sample weights
    """
    # p: positive, n: negative, 75: 0.75 sample weight
    mfcc_p_all = []
    mfcc_n_all = []
    sample_weights_p_all = []
    sample_weights_n_all = []

    for i_recording, recording_name in enumerate(recordings):

        groundtruth_textgrid_file = os.path.join(aCapella_root, dataset_path, annotation_path, recording_name + '.TextGrid')
        score_file = os.path.join(aCapella_root, dataset_path, score_path, recording_name + '.csv')
        wav_file = os.path.join(aCapella_root, dataset_path, audio_path, recording_name + '.wav')

        if not os.path.isfile(score_file):
            print('Score not found: ' + score_file)
            continue

        lineList = textGrid2WordList(groundtruth_textgrid_file, whichTier='line')
        utteranceList = textGrid2WordList(groundtruth_textgrid_file, whichTier='dianSilence')

        # parse lines of ground truth
        nestedUtteranceLists, numLines, numUtterances = wordListsParseByLines(lineList, utteranceList)

        # parse score
        utterance_durations, bpm = csvDurationScoreParser(score_file)

        # load audio
        audio = ess.MonoLoader(downmix='left', filename=wav_file, sampleRate=fs)()

        if feature_type == 'mfcc':
            # MFCC feature
            mfcc = getFeature(audio, d=dmfcc, nbf=nbf)
        elif feature_type == 'mfccBands1D':
            mfcc = getMFCCBands1D(audio, nbf=nbf)
            mfcc = np.log(100000 * mfcc + 1)
        elif feature_type == 'mfccBands2D':
            mfcc = getMFCCBands2D(audio, framesize, nbf=nbf, nlen=varin['nlen'])
            mfcc = np.log(100000 * mfcc + 1)
        else:
            raise ValueError(feature_type + ' does not exist.')

        # collect positive and negative feature frames for each phrase
        for idx, u_list in enumerate(nestedUtteranceLists):
            if int(bpm[idx]):
                print('Processing feature collecting ... ' + recording_name + ' phrase ' + str(idx + 1))

                times_onset = [u[0] for u in u_list[1]]

                # syllable onset frames
                frames_onset = np.array(np.around(np.array(times_onset) * fs / hopsize), dtype=int)

                # line start and end frames
                frame_start = frames_onset[0]
                frame_end = int(u_list[0][1] * fs / hopsize)

                frames_onset_p75 = np.hstack((frames_onset - 1, frames_onset + 1))
                frames_onset_p50 = np.hstack((frames_onset - 2, frames_onset + 2))
                frames_onset_p25 = np.hstack((frames_onset - 3, frames_onset + 3))

                frames_onset_p75 = removeOutOfRange(frames_onset_p75, frame_start, frame_end)
                frames_onset_p50 = removeOutOfRange(frames_onset_p50, frame_start, frame_end)
                frames_onset_p25 = removeOutOfRange(frames_onset_p25, frame_start, frame_end)
                # print(frames_onset_p75, frames_onset_p50, frames_onset_p25)

                # mfcc positive
                mfcc_p100 = mfcc[frames_onset, :]
                mfcc_p75 = mfcc[frames_onset_p75, :]
                mfcc_p50 = mfcc[frames_onset_p50, :]
                mfcc_p25 = mfcc[frames_onset_p25, :]
                # print(mfcc_p100.shape, mfcc_p75.shape, mfcc_p50.shape)

                frames_n25 = np.hstack((frames_onset - 4, frames_onset + 4))
                frames_n50 = np.hstack((frames_onset - 5, frames_onset + 5))
                frames_n75 = np.hstack((frames_onset - 6, frames_onset + 6))

                frames_n25 = removeOutOfRange(frames_n25, frame_start, frame_end)
                frames_n50 = removeOutOfRange(frames_n50, frame_start, frame_end)
                frames_n75 = removeOutOfRange(frames_n75, frame_start, frame_end)

                # mfcc negative
                mfcc_n25 = mfcc[frames_n25, :]
                mfcc_n50 = mfcc[frames_n50, :]
                mfcc_n75 = mfcc[frames_n75, :]

                frames_all = np.arange(frame_start, frame_end)
                frames_n100 = np.setdiff1d(frames_all,
                                           np.hstack((frames_onset,
                                                      frames_onset_p75,
                                                      frames_onset_p50,
                                                      frames_onset_p25,
                                                      frames_n25,
                                                      frames_n50,
                                                      frames_n75)))
                # print(frames_n100.shape, frames_all.shape)
                mfcc_n100 = mfcc[frames_n100, :]

                mfcc_p = np.concatenate((mfcc_p100, mfcc_p75, mfcc_p50, mfcc_p25), axis=0)
                sample_weights_p = np.concatenate((np.ones((mfcc_p100.shape[0],)),
                                                   np.ones((mfcc_p75.shape[0],)) * 0.75,
                                                   np.ones((mfcc_p50.shape[0],)) * 0.5,
                                                   np.ones((mfcc_p25.shape[0],)) * 0.25))
                # print(sample_weights_p)
                # print(mfcc_p.shape)

                mfcc_n = np.concatenate((mfcc_n100, mfcc_n75, mfcc_n50, mfcc_n25), axis=0)
                sample_weights_n = np.concatenate((np.ones((mfcc_n100.shape[0],)),
                                                   np.ones((mfcc_n75.shape[0],)) * 0.75,
                                                   np.ones((mfcc_n50.shape[0],)) * 0.5,
                                                   np.ones((mfcc_n25.shape[0],)) * 0.25))

                mfcc_p_all.append(mfcc_p)
                mfcc_n_all.append(mfcc_n)
                sample_weights_p_all.append(sample_weights_p)
                sample_weights_n_all.append(sample_weights_n)

                # print(len(mfcc_p_all), len(mfcc_n_all), len(sample_weights_p_all), len(sample_weights_n_all))

    return np.concatenate(mfcc_p_all), \
           np.concatenate(mfcc_n_all), \
           np.concatenate(sample_weights_p_all), \
           np.concatenate(sample_weights_n_all)

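
# Illustrative helper (an assumption, not part of the original training code): stack the
# positive and negative outputs of dumpFeatureOnset into a single feature matrix with
# binary labels (1 = onset, 0 = non-onset) and per-sample weights, e.g. for a classifier
# fit(..., sample_weight=...) call. Assumes numpy is imported as np.
def buildOnsetTrainingSet(mfcc_p, mfcc_n, sample_weights_p, sample_weights_n):
    """Return (features, labels, sample_weights) with onset frames labelled 1."""
    features = np.concatenate((mfcc_p, mfcc_n), axis=0)
    labels = np.concatenate((np.ones((mfcc_p.shape[0],)),
                             np.zeros((mfcc_n.shape[0],))))
    sample_weights = np.concatenate((sample_weights_p, sample_weights_n))
    return features, labels, sample_weights
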
def dumpFeaturePhoneme(full_path_recordings,
                       full_path_textgrids,
                       syllableTierName,
                       phonemeTierName,
                       feature_type='mfcc',
                       dmfcc=True,
                       nbf=False):
    """
    Dump the MFCC features for each phoneme.
    :param full_path_recordings: full paths of the audio recordings
    :param full_path_textgrids: full paths of the TextGrid annotations
    :param syllableTierName: parent tier name
    :param phonemeTierName: child tier name
    :param feature_type: 'mfcc', 'mfccBands1D' or 'mfccBands2D'
    :param dmfcc: delta for 'mfcc'
    :param nbf: context frames
    :return: dictionary mapping phoneme -> stacked feature frames
    """
    # dictionary of per-phoneme features
    dic_pho_feature = {}
    for pho in set(dic_pho_map.values()):
        dic_pho_feature[pho] = np.array([])

    for ii_rec, recording in enumerate(full_path_recordings):

        lineList = textGrid2WordList(full_path_textgrids[ii_rec], whichTier=syllableTierName)
        utteranceList = textGrid2WordList(full_path_textgrids[ii_rec], whichTier=phonemeTierName)

        # parse lines of ground truth
        nestedPhonemeLists, _, _ = wordListsParseByLines(lineList, utteranceList)

        # audio
        wav_full_filename = recording
        audio = ess.MonoLoader(downmix='left', filename=wav_full_filename, sampleRate=fs)()

        if feature_type == 'mfcc':
            # MFCC feature
            mfcc = getFeature(audio, d=dmfcc, nbf=nbf)
        elif feature_type == 'mfccBands1D':
            mfcc = getMFCCBands1D(audio, nbf=nbf)
            mfcc = np.log(100000 * mfcc + 1)
        elif feature_type == 'mfccBands2D':
            mfcc = getMFCCBands2D(audio, framesize, nbf=nbf, nlen=varin['nlen'])
            mfcc = np.log(100000 * mfcc + 1)
        else:
            raise ValueError(feature_type + ' does not exist.')

        for ii, pho in enumerate(nestedPhonemeLists):
            print('calculating ' + recording + ' and phoneme ' + str(ii) + ' of ' + str(len(nestedPhonemeLists)))
            for p in pho[1]:
                # map from annotated X-SAMPA to readable notation
                key = dic_pho_map[p[2]]

                sf = int(round(p[0] * fs / float(hopsize)))  # starting frame
                ef = int(round(p[1] * fs / float(hopsize)))  # ending frame

                mfcc_p = mfcc[sf:ef, :]  # frames of this phoneme

                if not len(dic_pho_feature[key]):
                    dic_pho_feature[key] = mfcc_p
                else:
                    dic_pho_feature[key] = np.vstack((dic_pho_feature[key], mfcc_p))

    return dic_pho_feature

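
# Illustrative helper (an assumption, not part of the original code): convert the
# phoneme -> feature-frame dictionary returned by dumpFeaturePhoneme into flat (X, y)
# arrays with integer class labels, e.g. for training a frame-level phoneme classifier.
# Assumes numpy is imported as np.
def buildPhonemeTrainingSet(dic_pho_feature):
    """Return (X, y, label_map) where label_map maps phoneme string -> integer label."""
    label_map = {pho: ii for ii, pho in enumerate(sorted(dic_pho_feature.keys()))}
    X_list, y_list = [], []
    for pho, frames in dic_pho_feature.items():
        if not len(frames):
            continue  # skip phonemes with no collected frames
        X_list.append(frames)
        y_list.append(np.ones((frames.shape[0],), dtype=int) * label_map[pho])
    X = np.concatenate(X_list, axis=0)
    y = np.concatenate(y_list)
    return X, y, label_map
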