def dump_feature_onset_helper(lab, wav_path, textgrid_path, score_path,
                              artist_name, recording_name, feature_type):
    """
    load or parse audio, textgrid
    :param lab: bool, True to parse .lab annotations (Riyaz), False to parse TextGrid (jingju)
    :param wav_path: string, path of the audio files
    :param textgrid_path: string, path of the TextGrid or .lab annotations
    :param score_path: string, path of the csv scores
    :param artist_name: string, artist folder name
    :param recording_name: string, recording filename without extension
    :param feature_type: string, only 'madmom' is supported
    :return: nested utterance lists, utterance durations, bpm, mfcc feature
    """
    if not lab:
        ground_truth_textgrid_file = os.path.join(textgrid_path, artist_name, recording_name + '.TextGrid')
        wav_file = os.path.join(wav_path, artist_name, recording_name + '.wav')
        line_list = textGrid2WordList(ground_truth_textgrid_file, whichTier='line')
        utterance_list = textGrid2WordList(ground_truth_textgrid_file, whichTier='dianSilence')

        # parse lines of the groundtruth
        nested_utterance_lists, num_lines, num_utterances = wordListsParseByLines(line_list, utterance_list)
    else:
        ground_truth_textgrid_file = os.path.join(textgrid_path, artist_name, recording_name + '.lab')
        wav_file = os.path.join(wav_path, artist_name, recording_name + '.mp3')
        nested_utterance_lists = [lab2WordList(ground_truth_textgrid_file, label=True)]

    # parse score
    score_file = os.path.join(score_path, artist_name, recording_name + '.csv')
    _, utterance_durations, bpm = csvDurationScoreParser(score_file)

    # load audio
    if feature_type == 'madmom':
        mfcc = getMFCCBands2DMadmom(wav_file, fs, hopsize_t, channel=1)
    else:
        raise ValueError(feature_type + ' is not a supported feature type.')

    return nested_utterance_lists, utterance_durations, bpm, mfcc
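
# A minimal usage sketch (not part of the original module) of dump_feature_onset_helper,
# assuming the dataset layout the function expects: one TextGrid, wav and csv score per
# recording under <path>/<artist_name>/<recording_name>.*, and module-level fs / hopsize_t.
# All paths and names below are hypothetical placeholders.
def example_dump_feature_onset():
    nested_utterance_lists, utterance_durations, bpm, mfcc = \
        dump_feature_onset_helper(lab=False,
                                  wav_path='./data/wav',
                                  textgrid_path='./data/textgrid',
                                  score_path='./data/score',
                                  artist_name='some_artist',
                                  recording_name='some_recording',
                                  feature_type='madmom')
    # mfcc is a (n_frames, n_bands) madmom log-mel feature matrix
    print(len(nested_utterance_lists), mfcc.shape)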
def odf_calculation_schluter_phrase(audio_filename,
                                    scaler_0,
                                    model_keras_cnn_0,
                                    fs,
                                    hopsize_t,
                                    len_seq,
                                    stateful):
    mfcc = getMFCCBands2DMadmom(audio_filename, fs=fs, hopsize_t=hopsize_t, channel=1)
    mfcc_scaled = scaler_0.transform(mfcc)

    # length of the padded sequence
    len_2_pad = int(len_seq * np.ceil(len(mfcc_scaled) / float(len_seq)))
    len_padded = len_2_pad - len(mfcc_scaled)

    # pad feature, label and sample weights
    mfcc_line_pad = np.zeros((len_2_pad, mfcc_scaled.shape[1]), dtype='float32')
    mfcc_line_pad[:mfcc_scaled.shape[0], :] = mfcc_scaled
    mfcc_line_pad = featureReshape(mfcc_line_pad, nlen=7)

    iter_time = len(mfcc_line_pad) // len_seq
    obs_i = np.array([])
    for ii in range(iter_time):
        # evaluate for each segment
        mfcc_line_tensor = mfcc_line_pad[ii * len_seq:(ii + 1) * len_seq]
        mfcc_line_tensor = np.expand_dims(mfcc_line_tensor, axis=0)
        mfcc_line_tensor = np.expand_dims(mfcc_line_tensor, axis=2)

        y_pred = model_keras_cnn_0.predict_on_batch(mfcc_line_tensor)

        # remove the padded samples of the last segment
        if ii == iter_time - 1 and len_padded > 0:
            y_pred = y_pred[:, :len_seq - len_padded, :]

        if stateful and ii == iter_time - 1:
            model_keras_cnn_0.reset_states()

        # reduce the label dimension
        y_pred = y_pred.reshape((y_pred.shape[1],))

        obs_i = np.append(obs_i, y_pred)

    return obs_i, mfcc
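
# A self-contained sketch (not from the original code) of the padding arithmetic used
# above: the feature matrix is zero-padded to a multiple of len_seq so it can be split
# into fixed-length segments, and the padded tail of the last segment is discarded from
# the prediction. Only numpy is required; the sizes are illustrative.
def example_pad_and_trim(n_frames=250, n_bands=80, len_seq=100):
    feature = np.random.rand(n_frames, n_bands).astype('float32')
    len_2_pad = int(len_seq * np.ceil(len(feature) / float(len_seq)))   # 300
    len_padded = len_2_pad - len(feature)                               # 50
    feature_pad = np.zeros((len_2_pad, n_bands), dtype='float32')
    feature_pad[:len(feature), :] = feature
    iter_time = len(feature_pad) // len_seq                             # 3 segments
    # the last segment carries len_padded zero frames that must be trimmed
    last_segment = feature_pad[(iter_time - 1) * len_seq:]
    last_segment_valid = last_segment[:len_seq - len_padded]
    return last_segment_valid.shape                                     # (50, 80)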
def dump_feature_onset_helper(audio_path, annotation_path, fn, channel):
    audio_fn = join(audio_path, fn + '.flac')
    annotation_fn = join(annotation_path, fn + '.onsets')

    mfcc = getMFCCBands2DMadmom(audio_fn, fs, hopsize_t, channel)

    print('Feature collecting ...', fn)

    times_onset = annotationCvParser(annotation_fn)
    times_onset = [float(to) for to in times_onset]

    # syllable onset frames
    frames_onset = np.array(np.around(np.array(times_onset) / hopsize_t), dtype=int)

    # line start and end frames
    frame_start = 0
    frame_end = mfcc.shape[0] - 1

    return mfcc, frames_onset, frame_start, frame_end
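
# A self-contained sketch (not from the original code) of the time-to-frame conversion
# used above: onset times in seconds are divided by the hop size in seconds and rounded
# to the nearest frame index. hopsize_t = 0.01 is only an illustrative value.
def example_times_to_frames(hopsize_t=0.01):
    times_onset = [0.1234, 0.5, 1.987]
    frames_onset = np.array(np.around(np.array(times_onset) / hopsize_t), dtype=int)
    return frames_onset  # array([ 12,  50, 199])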
def batch_process_onset_detection(wav_path,
                                  textgrid_path,
                                  score_path,
                                  scaler,
                                  test_recordings,
                                  model_keras_cnn_0,
                                  cnnModel_name,
                                  detection_results_path,
                                  architecture,
                                  len_seq,
                                  lab=False,
                                  threshold=0.54,
                                  obs_cal='tocal',
                                  decoding_method='viterbi',
                                  stateful=True):
    """
    experiment process, evaluate a whole jingju dataset using the CRNN model
    :param wav_path: string, where we store the wav
    :param textgrid_path: string, where we store the textgrid
    :param score_path: string, where we store the score
    :param scaler: sklearn object, StandardScaler
    :param test_recordings: list of strings, testing recording filenames
    :param model_keras_cnn_0: keras .h5, model weights
    :param cnnModel_name: string, model name
    :param detection_results_path: string, where we store the evaluation results
    :param architecture: string, model architecture name
    :param len_seq: int, input sequence frame length
    :param lab: bool, True for the Riyaz dataset, not used in the paper
    :param threshold: float, threshold for peak-picking onset selection
    :param obs_cal: string, 'tocal' or 'toload', for saving running time
    :param decoding_method: string, 'viterbi' or 'peakPicking'
    :param stateful: bool, whether to use the stateful RNN
    :return: string, path of the evaluation results
    """
    eval_results_decoding_path = \
        get_results_decoding_path(decoding_method=decoding_method,
                                  bool_corrected_score_duration=varin['corrected_score_duration'],
                                  eval_results_path=detection_results_path)

    for artist_path, rn in test_recordings:

        score_file = join(score_path, artist_path, rn + '.csv')

        if not isfile(score_file):
            print('Score not found: ' + score_file)
            continue

        nested_syllable_lists, wav_file, line_list, syllables, syllable_durations, bpm, pinyins = \
            data_parser(artist_path=artist_path,
                        wav_path=wav_path,
                        textgrid_path=textgrid_path,
                        rn=rn,
                        score_file=score_file,
                        lab=lab)

        if obs_cal == 'tocal':
            # load audio
            mfcc = getMFCCBands2DMadmom(wav_file, fs, hopsize_t, channel=1)
            mfcc_scaled = scaler.transform(mfcc)

        i_line = -1
        for i_obs, line in enumerate(line_list):
            # line without lyrics will be ignored
            if not lab and len(line[2]) == 0:
                continue

            i_line += 1

            # line without score duration will be ignored
            try:
                print(syllable_durations[i_line])
            except IndexError:
                continue

            # line with non-fixed tempo will be ignored
            if float(bpm[i_line]) == 0:
                continue

            time_line, lyrics_line, frame_start, frame_end = \
                get_line_properties(lab=lab, line=line, hopsize_t=hopsize_t)

            obs_path = join('./obs', cnnModel_name, artist_path)
            obs_filename = rn + '_' + str(i_line + 1) + '.pkl'

            if obs_cal == 'tocal':
                obs_i, mfcc_line = odf_calculation_crnn(mfcc=mfcc,
                                                        mfcc_scaled=mfcc_scaled,
                                                        model_keras_cnn_0=model_keras_cnn_0,
                                                        frame_start=frame_start,
                                                        frame_end=frame_end,
                                                        len_seq=len_seq,
                                                        stateful=stateful)

                # save onset curve
                print('save onset curve ...')
                if not exists(obs_path):
                    makedirs(obs_path)
                pickle.dump(obs_i, open(join(obs_path, obs_filename), 'wb'))
            else:
                obs_i = pickle.load(open(join(obs_path, obs_filename), 'rb'))

            obs_i = np.squeeze(obs_i)
            obs_i = smooth_obs(obs_i)

            # organize score
            print('Calculating: ', rn, ' phrase', str(i_obs))
            print('ODF Methods: ', architecture)

            duration_score = syllable_durations[i_line]
            # only keep the durations that exist
            duration_score = np.array([float(ds) for ds in duration_score if len(ds)])
            # normalize the score durations to the line duration
            duration_score *= (time_line / np.sum(duration_score))

            i_boundary, label = boundary_decoding(decoding_method=decoding_method,
                                                  obs_i=obs_i,
                                                  duration_score=duration_score,
                                                  varin=varin,
                                                  threshold=threshold,
                                                  hopsize_t=hopsize_t,
                                                  viterbiDecoding=viterbiDecoding,
                                                  OnsetPeakPickingProcessor=OnsetPeakPickingProcessor)

            time_boundary_start = np.array(i_boundary[:-1]) * hopsize_t
            time_boundary_end = np.array(i_boundary[1:]) * hopsize_t

            boundary_list = get_boundary_list(lab=lab,
                                              decoding_method=decoding_method,
                                              time_boundary_start=time_boundary_start,
                                              time_boundary_end=time_boundary_end,
                                              pinyins=pinyins,
                                              syllables=syllables,
                                              i_line=i_line)

            filename_syll_lab = join(eval_results_decoding_path, artist_path,
                                     rn + '_' + str(i_line + 1) + '.syll.lab')

            boundaryLabWriter(boundaryList=boundary_list,
                              outputFilename=filename_syll_lab,
                              label=label)

            if varin['plot'] and obs_cal == 'tocal':
                plot_jingju(nested_syllable_lists,
                            i_line,
                            mfcc_line,
                            hopsize_t,
                            obs_i,
                            i_boundary,
                            duration_score)

    return eval_results_decoding_path
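
# A self-contained sketch (not from the original code) of the score-duration
# normalization used above: the syllable durations read from the csv score are
# rescaled so that they sum to the annotated line duration before being passed to
# the boundary decoding. The values are illustrative only.
def example_normalize_durations(time_line=8.0):
    syllable_durations = ['1', '0.5', '', '2.5']             # raw csv fields, '' is empty
    duration_score = np.array([float(ds) for ds in syllable_durations if len(ds)])
    duration_score *= (time_line / np.sum(duration_score))   # now sums to time_line
    return duration_score                                     # array([2., 1., 5.])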
def batch_process_onset_detection(wav_path,
                                  textgrid_path,
                                  score_path,
                                  scaler,
                                  test_recordings,
                                  model_keras_cnn_0,
                                  cnnModel_name,
                                  detection_results_path,
                                  architecture='baseline',
                                  lab=False,
                                  threshold=0.54,
                                  obs_cal='tocal',
                                  decoding_method='viterbi'):
    """
    experiment process, evaluate a whole jingju dataset using the CNN model
    :param wav_path: string, path where we have the audio files
    :param textgrid_path: string, path where we have the TextGrid ground truth
    :param score_path: string, path where we have the scores
    :param scaler: sklearn object, StandardScaler
    :param test_recordings: list of strings, test recording filenames
    :param model_keras_cnn_0: keras .h5, CNN onset detection model
    :param cnnModel_name: string, CNN model name
    :param detection_results_path: string, path where we save the evaluation results
    :param architecture: string, the model architecture
    :param lab: bool, used for the Riyaz dataset
    :param threshold: float, used for peak picking
    :param obs_cal: string, 'tocal' to calculate the ODF, otherwise load it from disk
    :param decoding_method: string, 'viterbi' or 'peakPicking'
    :return: string, path of the evaluation results
    """
    eval_results_decoding_path = \
        get_results_decoding_path(decoding_method=decoding_method,
                                  bool_corrected_score_duration=varin['corrected_score_duration'],
                                  eval_results_path=detection_results_path)

    # loop through all recordings
    for artist_path, rn in test_recordings:

        score_file = join(score_path, artist_path, rn + '.csv')

        if not isfile(score_file):
            print('Score not found: ' + score_file)
            continue

        nested_syllable_lists, wav_file, line_list, syllables, syllable_durations, bpm, pinyins = \
            data_parser(artist_path=artist_path,
                        wav_path=wav_path,
                        textgrid_path=textgrid_path,
                        rn=rn,
                        score_file=score_file,
                        lab=lab)

        if obs_cal == 'tocal':
            # load audio
            mfcc = getMFCCBands2DMadmom(wav_file, fs, hopsize_t, channel=1)
            mfcc_scaled = scaler.transform(mfcc)
            mfcc_reshaped = featureReshape(mfcc_scaled, nlen=7)

        i_line = -1
        for i_obs, line in enumerate(line_list):
            # line without lyrics will be ignored
            if not lab and len(line[2]) == 0:
                continue

            i_line += 1

            # line without duration will be ignored
            try:
                print(syllable_durations[i_line])
            except IndexError:
                continue

            # line with non-fixed tempo will be ignored
            if float(bpm[i_line]) == 0:
                continue

            time_line, lyrics_line, frame_start, frame_end = \
                get_line_properties(lab=lab, line=line, hopsize_t=hopsize_t)

            # initialize ODF path and filename
            obs_path = join('./obs', cnnModel_name, artist_path)
            obs_filename = rn + '_' + str(i_line + 1) + '.pkl'

            if obs_cal == 'tocal':
                obs_i, mfcc_line = odf_calculation_no_crnn(mfcc=mfcc,
                                                           mfcc_reshaped=mfcc_reshaped,
                                                           model_name=cnnModel_name,
                                                           model_keras_cnn_0=model_keras_cnn_0,
                                                           architecture=architecture,
                                                           frame_start=frame_start,
                                                           frame_end=frame_end)

                # save onset curve
                print('save onset curve ...')
                if not exists(obs_path):
                    makedirs(obs_path)
                pickle.dump(obs_i, open(join(obs_path, obs_filename), 'wb'))
            else:
                obs_i = pickle.load(open(join(obs_path, obs_filename), 'rb'))

            obs_i = np.squeeze(obs_i)
            obs_i = smooth_obs(obs_i)

            # organize score
            print('Calculating: ', rn, ' phrase ', str(i_obs))
            print('ODF Methods: ', architecture)

            # process the score duration
            duration_score = syllable_durations[i_line]
            # only keep the durations that exist
            duration_score = np.array([float(ds) for ds in duration_score if len(ds)])
            # normalize the durations to the line duration
            duration_score *= (time_line / np.sum(duration_score))

            i_boundary, label = boundary_decoding(decoding_method=decoding_method,
                                                  obs_i=obs_i,
                                                  duration_score=duration_score,
                                                  varin=varin,
                                                  threshold=threshold,
                                                  hopsize_t=hopsize_t,
                                                  viterbiDecoding=viterbiDecoding,
                                                  OnsetPeakPickingProcessor=OnsetPeakPickingProcessor)

            # create the detected syllable result filename
            filename_syll_lab = join(eval_results_decoding_path, artist_path,
                                     rn + '_' + str(i_line + 1) + '.syll.lab')

            time_boundary_start = np.array(i_boundary[:-1]) * hopsize_t
            time_boundary_end = np.array(i_boundary[1:]) * hopsize_t

            boundary_list = get_boundary_list(lab=lab,
                                              decoding_method=decoding_method,
                                              time_boundary_start=time_boundary_start,
                                              time_boundary_end=time_boundary_end,
                                              pinyins=pinyins,
                                              syllables=syllables,
                                              i_line=i_line)

            boundaryLabWriter(boundaryList=boundary_list,
                              outputFilename=filename_syll_lab,
                              label=label)

            if varin['plot'] and obs_cal == 'tocal':
                plot_jingju(nested_syllable_lists=nested_syllable_lists,
                            i_line=i_line,
                            mfcc_line=mfcc_line,
                            hopsize_t=hopsize_t,
                            obs_i=obs_i,
                            i_boundary=i_boundary,
                            duration_score=duration_score)

    return eval_results_decoding_path
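
# A minimal usage sketch (not part of the original module) of the CNN evaluation loop
# above. The scaler/model filenames and dataset paths are hypothetical placeholders;
# a pickled StandardScaler and a Keras .h5 model are assumed to exist on disk.
def example_batch_process_onset_detection():
    from keras.models import load_model

    scaler = pickle.load(open('./pretrained/scaler_jingju.pkl', 'rb'))
    model = load_model('./pretrained/cnn_jingju.h5')

    eval_path = batch_process_onset_detection(wav_path='./data/wav',
                                              textgrid_path='./data/textgrid',
                                              score_path='./data/score',
                                              scaler=scaler,
                                              test_recordings=[('some_artist', 'some_recording')],
                                              model_keras_cnn_0=model,
                                              cnnModel_name='baseline_cnn',
                                              detection_results_path='./eval_results',
                                              architecture='baseline',
                                              obs_cal='tocal',
                                              decoding_method='viterbi')
    print('results written to', eval_path)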