def syllable_prob(wav_file):
    """
    Load the joint model and its feature scaler, then compute the syllable
    probability curve of one audio file.

    The model and scaler locations come from the module-level
    ``model_joint_path`` and ``model_scaler_path``.

    :param wav_file: path of the audio file, passed to get_log_mel_madmom
    :return: syllable probability (numpy array, 1-dim according to the
        original note); the first element is forced to 1.0 and the last
        element to 0.0
    """
    model_joint = load_model(model_joint_path)

    # Pickle data is binary: open with 'rb' (text mode fails on Python 3) and
    # close the handle deterministically.
    # NOTE(review): pickle.load must only be used on trusted files.
    with open(model_scaler_path, 'rb') as scaler_file:
        scaler_joint = pickle.load(scaler_file)

    log_mel_old = get_log_mel_madmom(wav_file,
                                     fs=fs_wav,
                                     hopsize_t=hopsize_t,
                                     channel=1)
    log_mel = scaler_joint.transform(log_mel_old)
    log_mel = feature_reshape(log_mel, nlen=7)
    log_mel = np.expand_dims(log_mel, axis=1)

    # the joint model returns two outputs; only the syllable one is used here
    obs_syllable, _ = model_joint.predict(log_mel, batch_size=128, verbose=2)

    obs_syllable = np.squeeze(obs_syllable)
    obs_syllable = smooth_obs(obs_syllable)

    # pin the curve at both ends: 1.0 at the start, 0.0 at the end
    obs_syllable[0] = 1.0
    obs_syllable[-1] = 0.0

    return obs_syllable
def _main(wav_file, input_json, output_json, mode):
    """
    Run pitch tracking and CNN onset detection on a recording, align the
    result with a score and hand everything to ``post_proprocess``.

    :param wav_file: path of the audio file (loaded with librosa at 44100 Hz)
    :param input_json: score file, passed to parse_musescore
    :param output_json: output target, passed to post_proprocess
    :param mode: forwarded unchanged to post_proprocess
    :return: None
    """
    root_path = os.path.dirname(__file__)
    joint_cnn_model_path = os.path.join(root_path, 'cnnModels', 'joint')

    # load keras joint cnn model
    model_joint = load_model(os.path.join(joint_cnn_model_path, 'jan_joint0.h5'))

    # load log mel feature scaler; the context manager closes the file handle
    # that the original one-liner leaked
    scaler_file_path = os.path.join(joint_cnn_model_path, 'scaler_joint.pkl')
    with open(scaler_file_path, 'rb') as scaler_file:
        scaler_joint = pickle.load(scaler_file)

    # pitch tracking on the raw samples
    data_wav, fs_wav = librosa.load(wav_file, sr=44100)
    mfshs = MFSHS(data_wav)
    mfshs.frame()
    pitches = mfshs.pitches
    zeroAmploc = mfshs.zeroAmploc

    # log-mel features -> scaled -> reshaped -> extra axis for the CNN input
    log_mel_old = get_log_mel_madmom(wav_file,
                                     fs=fs_wav,
                                     hopsize_t=hopsize_t,
                                     channel=1)
    log_mel = scaler_joint.transform(log_mel_old)
    log_mel = feature_reshape(log_mel, nlen=7)
    log_mel = np.expand_dims(log_mel, axis=1)

    # the joint model returns two outputs; only the syllable one is used here
    obs_syllable, _ = model_joint.predict(log_mel, batch_size=128, verbose=2)

    obs_syllable = np.squeeze(obs_syllable)
    obs_syllable = smooth_obs(obs_syllable)
    obs_syllable[0] = 1.0
    obs_syllable[-1] = 0.0

    # score parsing, onset picking and note alignment
    score_note, pauseLoc = parse_musescore(input_json)
    resultOnset = findPeak(obs_syllable, pitches, score_note)
    Note_and_onset = pitch_Note(pitches, resultOnset['onset_frame'], score_note)
    score_note = np.array(score_note)
    result_loc_info = sw_alignment(score_note, Note_and_onset['notes'])

    post_proprocess(output_json, pitches, resultOnset['onset_frame'],
                    zeroAmploc, score_note, pauseLoc, result_loc_info, mode)
def dump_feature_phn(wav_path, textgrid_path, recordings, syllableTierName,
                     phonemeTierName):
    """
    Dump the log-mel feature frames of every phoneme, grouped by phoneme label.

    :param wav_path: root directory of the .wav files
    :param textgrid_path: root directory of the text grid annotations
    :param recordings: iterable of (artist_path, recording) pairs
    :param syllableTierName: tier name passed as ``tier0`` to
        syllableTextgridExtraction
    :param phonemeTierName: tier name passed as ``tier1`` to
        syllableTextgridExtraction
    :return: dict mapping each value of ``dic_pho_map`` to the vertically
        stacked feature rows of all its occurrences; labels that never occur
        map to an empty ``np.array([])``
    :raises KeyError: when an annotated phoneme is missing from
        ``dic_pho_map`` (the offending recording and label are printed first)
    """
    # Collect the slices per label and stack once at the end; stacking inside
    # the loop re-copies the whole accumulated array for every phoneme.
    chunks_by_pho = {pho: [] for pho in set(dic_pho_map.values())}

    for artist_path, recording in recordings:
        nestedPhonemeLists, _, _ = syllableTextgridExtraction(
            textgrid_path=textgrid_path,
            recording=join(artist_path, recording),
            tier0=syllableTierName,
            tier1=phonemeTierName)

        # audio
        wav_full_filename = join(wav_path, artist_path, recording + '.wav')
        log_mel = get_log_mel_madmom(wav_full_filename, fs, hopsize_t, channel=1)

        num_lists = len(nestedPhonemeLists)
        for ii, pho in enumerate(nestedPhonemeLists):
            # Single pre-formatted argument: prints the same text as the old
            # Python 2 print statement and is valid on Python 2 and 3.
            print('calculating  %s  and phoneme  %s  of  %s'
                  % (recording, ii, num_lists))

            for p in pho[1]:
                # map from annotated xsampa to readable notation
                try:
                    key = dic_pho_map[p[2]]
                except KeyError:
                    print(artist_path, recording)
                    print(ii, p[2])
                    raise

                # p[0] / p[1] are scaled by fs / hopsize to get frame indices
                start_frame = int(round(p[0] * fs / float(hopsize)))
                end_frame = int(round(p[1] * fs / float(hopsize)))

                chunks_by_pho[key].append(log_mel[start_frame:end_frame, :])

    dic_pho_feature = {}
    for pho, chunks in chunks_by_pho.items():
        dic_pho_feature[pho] = np.vstack(chunks) if chunks else np.array([])
    return dic_pho_feature
def det_syllable_prob(wav_file, model_joint, scaler_joint):
    """
    Compute the smoothed syllable observation curve of one audio file with an
    already loaded joint model and feature scaler.

    :param wav_file: path of the audio file, passed to get_log_mel_madmom
    :param model_joint: model whose ``predict`` returns two outputs
    :param scaler_joint: scaler whose ``transform`` is applied to the features
    :return: syllable observation array, first element set to 1.0 and last
        element set to 0.0
    """
    raw_features = get_log_mel_madmom(wav_file,
                                      fs=fs_wav,
                                      hopsize_t=hopsize_t,
                                      channel=1)
    print(raw_features.shape)

    # scale -> reshape -> add the extra axis expected by the model
    scaled_features = scaler_joint.transform(raw_features)
    model_input = np.expand_dims(feature_reshape(scaled_features, nlen=7),
                                 axis=1)

    syllable_curve, _phoneme_curve = model_joint.predict(model_input,
                                                         batch_size=128,
                                                         verbose=2)

    syllable_curve = smooth_obs(np.squeeze(syllable_curve))

    # pin both ends of the curve
    syllable_curve[0] = 1.0
    syllable_curve[-1] = 0.0
    return syllable_curve
def dump_feature_onset_helper(wav_path, textgrid_path, artist_name,
                              recording_name):
    """
    Read the annotation tiers and the audio feature of one recording.

    :param wav_path: root directory of the .wav files
    :param textgrid_path: root directory of the .TextGrid files
    :param artist_name: sub-directory of the recording
    :param recording_name: file name without extension
    :return: (utterances nested by line, phonemes nested by line,
        feature returned by get_log_mel_madmom, flat 'details' tier list)
    """
    annotation_file = os.path.join(textgrid_path, artist_name,
                                   recording_name + '.TextGrid')
    audio_file = os.path.join(wav_path, artist_name, recording_name + '.wav')

    # read the three tiers in the same order as before
    line_tier, utterance_tier, phoneme_tier = [
        textGrid2WordList(annotation_file, whichTier=tier_name)
        for tier_name in ('line', 'dianSilence', 'details')
    ]

    # parse lines of groundtruth
    utterances_by_line, _num_lines, _num_utterances = wordListsParseByLines(
        line_tier, utterance_tier)
    phonemes_by_line, _, _ = wordListsParseByLines(line_tier, phoneme_tier)

    # load audio
    feature = get_log_mel_madmom(audio_file, fs, hopsize_t, channel=1)

    return utterances_by_line, phonemes_by_line, feature, phoneme_tier
def _onset_pickle_dump(obj, file_path, protocol=None):
    """Pickle obj to file_path in binary mode and close the file."""
    with open(file_path, 'wb') as pickle_file:
        pickle.dump(obj, pickle_file, protocol=protocol)


def _onset_pickle_load(file_path):
    """Unpickle file_path in binary mode and close the file."""
    # NOTE(review): pickle.load must only be used on trusted files.
    with open(file_path, 'rb') as pickle_file:
        return pickle.load(pickle_file)


def _group_score_phonemes_by_syllable(phoneme_list_score, syllable_score_onsets):
    """Split score phoneme durations and labels into one group per syllable."""
    durs_grouped = []
    labels_grouped = []
    durs_syllable = []
    labels_syllable = []
    later_syllable_onsets = syllable_score_onsets[1:]

    for pls in phoneme_list_score:
        # when the phoneme onset time is also the syllable onset time,
        # the previous syllable is complete
        if pls[0] in later_syllable_onsets:
            durs_grouped.append(durs_syllable)
            labels_grouped.append(labels_syllable)
            durs_syllable = []
            labels_syllable = []

        durs_syllable.append(pls[1] - pls[0])
        labels_syllable.append(pls[2])

    # Flush the last syllable after the loop. Comparing each item with the
    # last one by value would flush early if an equal entry appeared before.
    if len(phoneme_list_score):
        durs_grouped.append(durs_syllable)
        labels_grouped.append(labels_syllable)

    return durs_grouped, labels_grouped


def onset_function_all_recordings(wav_path,
                                  textgrid_path,
                                  scaler,
                                  test_recordings,
                                  model_keras_cnn_0,
                                  cnnModel_name,
                                  eval_results_path,
                                  obs_cal='tocal',
                                  plot=False,
                                  save_data=False,
                                  missing_phn=False):
    """
    ODF and viterbi decoding

    For every line of every test recording, the syllable and phoneme onset
    functions are computed (or loaded from ``./obs``), syllable boundaries are
    decoded, phoneme boundaries are decoded inside each syllable, and the
    result is passed to results_aggregation_save_helper.

    :param wav_path: string, the path of the .wav files
    :param textgrid_path: string, the path of the .textgrid files
    :param scaler: sklearn scaler object
    :param test_recordings: list of (artist_path, filename) pairs
    :param model_keras_cnn_0: loaded keras CNN model
    :param cnnModel_name: string, sub-directory of ./obs used to cache the
        onset functions
    :param eval_results_path: string, where to put the evaluation results
    :param obs_cal: string, tocal or toload, if to calculate the observation
        function
    :param plot: bool
    :param save_data: bool, whether to save the wav, duration and label data
        (written below ./temp)
    :param missing_phn: bool, whether to consider the missing phonemes in
        actual singing, for experiment (not in the paper)
    :return: None
    :raises RuntimeError: when a syllable onset of the ground truth or of the
        score is not also a phoneme onset
    """
    compute_obs = obs_cal == 'tocal' or plot

    for artist_path, fn in test_recordings:

        print('Calculating for artist:', artist_path, 'filename:', fn)

        # use the teacher's text grid as the score
        score_text_grid_file = join(textgrid_path, artist_path,
                                    'teacher.TextGrid')

        # student text grid
        ground_truth_text_grid_file = join(textgrid_path, artist_path,
                                           fn + '.TextGrid')

        # student .wav
        wav_file = join(wav_path, artist_path, fn + '.wav')

        # parse teacher (score) and student (ground truth) text grid file
        score_syllable_lists, score_phoneme_lists = \
            textgrid_syllable_phoneme_parser(score_text_grid_file,
                                             'dianSilence',
                                             'detailsSilence')
        gt_syllable_lists, gt_phoneme_lists = \
            textgrid_syllable_phoneme_parser(ground_truth_text_grid_file,
                                             'dianSilence',
                                             'details')

        # do audio processing
        if compute_obs:
            mfcc = get_log_mel_madmom(wav_file, fs, hopsize_t, channel=1)
            mfcc_scaled = scaler.transform(mfcc)
            mfcc_reshaped = feature_reshape(mfcc_scaled, nlen=7)

        # observation path, save the onset function for the next time
        # calculation (does not depend on the line, so computed once)
        obs_path = join('./obs', cnnModel_name, artist_path)

        for ii_line in range(len(gt_syllable_lists)):
            print('line:', ii_line)

            obs_syllable_filename = fn + '_syllable_' + str(ii_line + 1) + '.pkl'
            obs_phoneme_filename = fn + '_phoneme_' + str(ii_line + 1) + '.pkl'

            # sometimes the score and ground truth text grids are not started
            # from the same phrase, ii_aug is the offset
            ii_aug = findShiftOffset(gt_syllable_lists, score_syllable_lists,
                                     ii_line)

            # calculate necessary information from the text grid
            frame_start, frame_end, \
            time_start, time_end, \
            syllable_gt_onsets, syllable_gt_labels, \
            phoneme_gt_onsets, phoneme_gt_labels, \
            syllable_score_onsets, syllable_score_labels, \
            phoneme_score_onsets, phoneme_score_labels, \
            syllable_score_durs, phoneme_list_score = \
                gt_score_preparation_helper(gt_syllable_lists,
                                            score_syllable_lists,
                                            gt_phoneme_lists,
                                            score_phoneme_lists,
                                            ii_line,
                                            ii_aug)

            # collect phoneme durations and labels
            phoneme_score_durs_grouped_by_syllables, \
            phoneme_score_labels_grouped_by_syllables = \
                _group_score_phonemes_by_syllable(phoneme_list_score,
                                                  syllable_score_onsets)

            # onsets start from time 0
            syllable_gt_onsets_0start = np.array(
                syllable_gt_onsets) - syllable_gt_onsets[0]
            phoneme_gt_onsets_0start = np.array(
                phoneme_gt_onsets) - phoneme_gt_onsets[0]
            phoneme_gt_onsets_0start_without_syllable_onsets = \
                np.setdiff1d(phoneme_gt_onsets_0start, syllable_gt_onsets_0start)

            # Check the annotations: every syllable onset must also be a
            # phoneme onset. RuntimeError is what the former bare `raise`
            # produced on Python 3; it now carries a message.
            if not set(syllable_gt_onsets).issubset(set(phoneme_gt_onsets)):
                raise RuntimeError(
                    'ground truth syllable onsets are not a subset of the '
                    'phoneme onsets: {} {} line {}'.format(
                        artist_path, fn, ii_line))

            if not set(syllable_score_onsets).issubset(
                    set(phoneme_score_onsets)):
                raise RuntimeError(
                    'score syllable onsets are not a subset of the '
                    'phoneme onsets: {} {} line {}'.format(
                        artist_path, fn, ii_line))

            # the frame bounds returned by the helper are replaced by values
            # derived from the line times
            frame_start = int(round(time_start / hopsize_t))
            frame_end = int(round(time_end / hopsize_t))

            # rescale the score durations to the duration of the line
            syllable_score_durs = np.array(syllable_score_durs)
            syllable_score_durs *= (time_end - time_start) / np.sum(syllable_score_durs)

            if compute_obs:
                mfcc_line = mfcc[frame_start:frame_end]
                mfcc_reshaped_line = mfcc_reshaped[frame_start:frame_end]
                mfcc_reshaped_line = np.expand_dims(mfcc_reshaped_line, axis=1)

                # calculate syllable and phoneme onset functions
                obs_syllable, obs_phoneme = model_keras_cnn_0.predict(
                    mfcc_reshaped_line, batch_size=128, verbose=2)

                # save onset functions into obs_path
                print('save onset curve ... ...')
                if not exists(obs_path):
                    makedirs(obs_path)
                _onset_pickle_dump(obs_syllable,
                                   join(obs_path, obs_syllable_filename))
                _onset_pickle_dump(obs_phoneme,
                                   join(obs_path, obs_phoneme_filename))
            else:
                obs_syllable = _onset_pickle_load(
                    join(obs_path, obs_syllable_filename))
                obs_phoneme = _onset_pickle_load(
                    join(obs_path, obs_phoneme_filename))

            obs_syllable = np.squeeze(obs_syllable)
            obs_phoneme = np.squeeze(obs_phoneme)

            obs_syllable = smooth_obs(obs_syllable)
            obs_phoneme = smooth_obs(obs_phoneme)

            # decoding syllable boundaries
            obs_syllable[0] = 1.0
            obs_syllable[-1] = 1.0
            boundaries_syllable = viterbiDecodingPhonemeSeg.viterbiSegmental2(
                obs_syllable, syllable_score_durs, varin)

            # syllable boundaries
            boundaries_syllable_start_time = np.array(
                boundaries_syllable[:-1]) * hopsize_t
            boundaries_syllable_end_time = np.array(
                boundaries_syllable[1:]) * hopsize_t

            # initialize phoneme boundaries arrays
            boundaries_phoneme_start_time = np.array([])
            boundaries_phoneme_end_time = np.array([])

            # array of the phoneme durations to be concatenated
            phoneme_score_durs = np.array([])
            phoneme_score_labels = []

            # decode phoneme onsets
            for ii_syl_boundary in range(len(boundaries_syllable) - 1):

                dur_syl = boundaries_syllable_end_time[ii_syl_boundary] - \
                    boundaries_syllable_start_time[ii_syl_boundary]

                frame_start_syl = boundaries_syllable[ii_syl_boundary]
                frame_end_syl = boundaries_syllable[ii_syl_boundary + 1]

                obs_phoneme_syl = obs_phoneme[frame_start_syl:frame_end_syl]
                obs_phoneme_syl[0] = 1.0
                obs_phoneme_syl[-1] = 1.0

                # phoneme score durs and labels for the current syllable,
                # used in the decoding
                phoneme_score_durs_syl = np.array(
                    phoneme_score_durs_grouped_by_syllables[ii_syl_boundary])

                # a syllable with fewer than two phonemes is not decoded
                if len(phoneme_score_durs_syl) < 2:
                    continue

                # init the durs_syl_vars
                phoneme_score_durs_syl_vars = [phoneme_score_durs_syl]
                if missing_phn:
                    phoneme_score_labels_syl = \
                        phoneme_score_labels_grouped_by_syllables[ii_syl_boundary]
                    phoneme_score_labels_syl_vars, phoneme_score_durs_syl_vars = \
                        score_variations_phn(phoneme_score_labels_syl,
                                             phoneme_score_durs_syl)

                # missing phoneme decoding, only for experiment, not included
                # in the paper
                if missing_phn and len(phoneme_score_durs_syl_vars) > 1:
                    boundaries_phoneme_syl_vars = []
                    phoneme_score_durs_syl_vars_norm = []
                    posterior_vars = []
                    for ii, durs_var in enumerate(phoneme_score_durs_syl_vars):
                        phoneme_score_labels_syl_vars_ii = \
                            phoneme_score_labels_syl_vars[ii]
                        phoneme_score_durs_syl_vars_ii = np.array(durs_var)
                        phoneme_score_durs_syl_vars_ii *= dur_syl / np.sum(
                            phoneme_score_durs_syl_vars_ii)
                        boundaries_phoneme_syl_vars_ii, pp_ii = \
                            viterbiDecodingPhonemeSeg.viterbiSegmentalPenalized(
                                obs_phoneme_syl,
                                phoneme_score_durs_syl_vars_ii,
                                varin)

                        posterior = pp_ii / np.power(
                            len(phoneme_score_labels_syl_vars_ii),
                            varin['posterior_norm'])

                        boundaries_phoneme_syl_vars.append(
                            boundaries_phoneme_syl_vars_ii)
                        phoneme_score_durs_syl_vars_norm.append(
                            phoneme_score_durs_syl_vars_ii)
                        posterior_vars.append(posterior)

                    # posterior vars either contain inf or nan
                    if len(posterior_vars) and np.all(
                            np.isinf(posterior_vars) + np.isnan(posterior_vars)):
                        continue

                    idx_max_posterior = np.argmax(posterior_vars)
                    boundaries_phoneme_syl = boundaries_phoneme_syl_vars[
                        idx_max_posterior]
                    phoneme_score_labels += phoneme_score_labels_syl_vars[
                        idx_max_posterior]
                    phoneme_score_durs_syl = phoneme_score_durs_syl_vars_norm[
                        idx_max_posterior]
                else:
                    phoneme_score_durs_syl *= dur_syl / np.sum(
                        phoneme_score_durs_syl)
                    phoneme_score_labels += \
                        phoneme_score_labels_grouped_by_syllables[ii_syl_boundary]
                    boundaries_phoneme_syl = \
                        viterbiDecodingPhonemeSeg.viterbiSegmental2(
                            obs_phoneme_syl, phoneme_score_durs_syl, varin)

                # phoneme boundaries, shifted by the syllable start frame
                boundaries_phoneme_syl_start_time = \
                    (np.array(boundaries_phoneme_syl[:-1]) + frame_start_syl) * hopsize_t
                boundaries_phoneme_syl_end_time = \
                    (np.array(boundaries_phoneme_syl[1:]) + frame_start_syl) * hopsize_t

                boundaries_phoneme_start_time = \
                    np.concatenate((boundaries_phoneme_start_time,
                                    boundaries_phoneme_syl_start_time))
                boundaries_phoneme_end_time = \
                    np.concatenate((boundaries_phoneme_end_time,
                                    boundaries_phoneme_syl_end_time))

                phoneme_score_durs = np.concatenate(
                    (phoneme_score_durs, phoneme_score_durs_syl))

            phoneme_score_durs *= (time_end - time_start) / np.sum(phoneme_score_durs)

            # save the results
            results_aggregation_save_helper(
                syllable_gt_onsets_0start, syllable_gt_labels,
                boundaries_syllable_start_time, syllable_score_labels,
                phoneme_gt_onsets_0start, phoneme_gt_labels,
                boundaries_phoneme_start_time, phoneme_score_labels,
                eval_results_path, artist_path, fn, ii_line,
                time_end - time_start)

            if plot:
                figure_plot_joint(
                    mfcc_line, syllable_gt_onsets_0start,
                    phoneme_gt_onsets_0start_without_syllable_onsets,
                    obs_syllable, boundaries_syllable_start_time, obs_phoneme,
                    boundaries_phoneme_start_time, syllable_score_durs,
                    phoneme_score_durs)

            if save_data:
                line_tag = str(ii_line)

                # save wav line
                data_wav, fs_wav = sf.read(wav_file)
                sf.write(
                    './temp/wav_line_' + line_tag + '.wav',
                    data_wav[int(time_start * fs_wav):int(time_end * fs_wav)],
                    fs_wav)
                print(time_start, time_end)

                # save durations:
                _onset_pickle_dump(
                    syllable_score_durs,
                    './temp/syllable_score_durs_' + line_tag + '.pkl',
                    protocol=2)
                _onset_pickle_dump(
                    phoneme_score_durs_grouped_by_syllables,
                    './temp/phoneme_score_durs_grouped_by_syllables_' +
                    line_tag + '.pkl',
                    protocol=2)
                print(syllable_score_durs)
                print(phoneme_score_durs_grouped_by_syllables)

                # save labels:
                _onset_pickle_dump(
                    syllable_score_labels,
                    './temp/syllable_score_labels_' + line_tag + '.pkl',
                    protocol=2)
                _onset_pickle_dump(
                    phoneme_score_labels_grouped_by_syllables,
                    './temp/phoneme_score_labels_grouped_by_syllables_' +
                    line_tag + '.pkl',
                    protocol=2)
                print(syllable_score_labels)
                print(phoneme_score_labels_grouped_by_syllables)
def _main(wav_file, score_file, est_file=None):
    """
    Score one recording against a score file and write the intermediate
    results next to the audio file.

    Files written (same stem as ``wav_file``): ``.json`` (via
    post_proprocess), ``_pitch.txt``, ``_onset.txt``, ``_score.txt`` and
    ``_detnote.txt`` (via ``mfshs.saveArray``).

    :param wav_file: path of the audio file (loaded with librosa at 44100 Hz)
    :param score_file: score file, passed to parse_musescore
    :param est_file: optional, forwarded to findPeak
    :return: ``result_info['score']`` as returned by post_proprocess
    """
    print(wav_file)

    # pitch tracking on the raw samples
    data_wav, fs_wav = librosa.load(wav_file, sr=44100)
    mfshs = MFSHS(data_wav)
    mfshs.frame()
    pitches = mfshs.pitches
    zeroAmploc = mfshs.zeroAmploc

    root_path = os.path.dirname(__file__)
    joint_cnn_model_path = os.path.join(root_path, 'cnnModels', 'joint')

    # load keras joint cnn model
    model_joint = load_model(os.path.join(joint_cnn_model_path, 'jan_joint0.h5'))

    # load log mel feature scaler; the context manager closes the file handle
    # that the original one-liner leaked
    scaler_file_path = os.path.join(joint_cnn_model_path, 'scaler_joint.pkl')
    with open(scaler_file_path, 'rb') as scaler_file:
        scaler_joint = pickle.load(scaler_file)

    # log-mel features -> scaled -> reshaped -> extra axis for the CNN input
    log_mel_old = get_log_mel_madmom(wav_file,
                                     fs=fs_wav,
                                     hopsize_t=hopsize_t,
                                     channel=1)
    log_mel = scaler_joint.transform(log_mel_old)
    log_mel = feature_reshape(log_mel, nlen=7)
    log_mel = np.expand_dims(log_mel, axis=1)

    # the joint model returns two outputs; only the syllable one is used here
    obs_syllable, _ = model_joint.predict(log_mel, batch_size=128, verbose=2)

    obs_syllable = np.squeeze(obs_syllable)
    obs_syllable = smooth_obs(obs_syllable)
    obs_syllable[0] = 1.0
    obs_syllable[-1] = 0.0

    # score parsing, onset picking and note alignment
    score_note, pauseLoc = parse_musescore(score_file)
    resultOnset = findPeak(obs_syllable, pitches, score_note, est_file)

    # every output file shares the stem of the input audio file
    file_stem = os.path.splitext(wav_file)[0]
    filename_json = file_stem + ".json"

    Note_and_onset = pitch_Note(pitches, resultOnset['onset_frame'], score_note)
    score_note = np.array(score_note)
    result_loc_info = sw_alignment(score_note, Note_and_onset['notes'])

    result_info, det_Note = post_proprocess(filename_json, pitches,
                                            resultOnset['onset_frame'],
                                            zeroAmploc, score_note, pauseLoc,
                                            result_loc_info, 0)

    mfshs.saveArray(file_stem + "_pitch.txt", pitches)
    mfshs.saveArray(file_stem + "_onset.txt", resultOnset['onset_time'])
    mfshs.saveArray(file_stem + "_score.txt", score_note)
    mfshs.saveArray(file_stem + "_detnote.txt",
                    np.round(np.array(det_Note), 2))

    return result_info['score']
# get wav duration data_wav, fs_wav = sf.read(wav_file) time_wav = len(data_wav) / float(fs_wav) onset_time_ref, syllable_durations_ref = get_onset_time_syllable_duration_ref( syllable_durations=syllable_durations, len_audio=time_wav) print(onset_time_ref) print(syllable_durations_ref) results_vad = VAD(wav_file=wav_file, hopsize_t=hopsize_t) # calculate log mel feature log_mel_old = get_log_mel_madmom(wav_file, fs=fs_wav, hopsize_t=hopsize_t, channel=1) log_mel = scaler_joint.transform(log_mel_old) log_mel = feature_reshape(log_mel, nlen=7) log_mel = np.expand_dims(log_mel, axis=1) # get the onset detection function obs_syllable, obs_phoneme = model_joint.predict(log_mel, batch_size=128, verbose=2) # post-processing the detection function obs_syllable = np.squeeze(obs_syllable) obs_phoneme = np.squeeze(obs_phoneme) obs_syllable = smooth_obs(obs_syllable)
def phoneme_seg_all_recordings(wav_path,
                               textgrid_path,
                               scaler,
                               scaler_joint,
                               test_recordings,
                               model_keras_cnn_0,
                               model_joint,
                               eval_results_path,
                               use_joint_obs=False,
                               plot=False,
                               debug_mode=False):
    """
    HSMM phoneme segmentation of every line of every test recording.

    For each line the score phoneme sequence is decoded with an LRHSMM, the
    detected phoneme and syllable onsets are derived from the decoded path and
    passed to results_aggregation_save_helper.

    :param wav_path: string, the path of the .wav files
    :param textgrid_path: string, the path of the .TextGrid files
    :param scaler: scaler applied to the features for model_keras_cnn_0
    :param scaler_joint: onset detection joint model scaler, for experiment,
        not included in the paper
    :param test_recordings: list of (artist_path, filename) pairs
    :param model_keras_cnn_0: keras model passed to hsmm.mapBKeras
    :param model_joint: onset detection joint model, for experiment, not
        included in the paper
    :param eval_results_path: string, where to put the evaluation results
    :param use_joint_obs: bool, whether to use the joint model phoneme onset
        observation
    :param plot: bool
    :param debug_mode: bool, forwarded to hsmm.mapBKeras
    :return: None
    :raises RuntimeError: when a syllable onset of the ground truth or of the
        score is not also a phoneme onset
    """
    for artist_path, fn in test_recordings:

        print('Calculating for artist:', artist_path, 'filename:', fn)

        # the teacher's text grid is the score, the student's is the ground truth
        score_textgrid_file = join(textgrid_path, artist_path,
                                   'teacher.TextGrid')
        groundtruth_textgrid_file = join(textgrid_path, artist_path,
                                         fn + '.TextGrid')
        wav_file = join(wav_path, artist_path, fn + '.wav')

        scoreSyllableLists, scorePhonemeLists = textgrid_syllable_phoneme_parser(
            score_textgrid_file, 'dianSilence', 'details')
        gtSyllableLists, gtPhonemeLists = textgrid_syllable_phoneme_parser(
            groundtruth_textgrid_file, 'dianSilence', 'details')

        # calculate the feature of the whole recording
        mfcc = get_log_mel_madmom(wav_file, fs, hopsize_t, channel=1)
        mfcc_scaled = scaler.transform(mfcc)
        mfcc_reshaped = feature_reshape(mfcc_scaled, nlen=7)

        if use_joint_obs:
            mfcc_scaled_joint = scaler_joint.transform(mfcc)
            mfcc_reshaped_joint = feature_reshape(mfcc_scaled_joint, nlen=7)

        for ii_line in range(len(gtSyllableLists)):
            print('line:', ii_line)

            # search for the corresponding score line
            ii_aug = findShiftOffset(gtSyllableLists, scoreSyllableLists,
                                     ii_line)

            frame_start, frame_end, \
            time_start, time_end, \
            syllable_gt_onsets, syllable_gt_labels, \
            phoneme_gt_onsets, phoneme_gt_labels, \
            syllable_score_onsets, syllable_score_labels, \
            phoneme_score_onsets, phoneme_score_labels, \
            syllable_score_durs, phoneme_list_score = \
                gt_score_preparation_helper(gtSyllableLists,
                                            scoreSyllableLists,
                                            gtPhonemeLists,
                                            scorePhonemeLists,
                                            ii_line,
                                            ii_aug)

            # phoneme durations and labels
            phoneme_score_durs = []
            # index of syllable onsets in phoneme onsets list
            idx_syllable_score_phoneme = []
            for ii_pls, pls in enumerate(phoneme_list_score):
                phoneme_score_durs.append(pls[1] - pls[0])
                # when the phoneme onset time is also the syllable onset time
                if pls[0] in syllable_score_onsets:
                    idx_syllable_score_phoneme.append(ii_pls)

            # map the phone labels
            phoneme_score_labels_mapped = [
                dic_pho_map[label] for label in phoneme_score_labels
            ]

            # normalize phoneme score durations to the duration of the line
            phoneme_score_durs = np.array(phoneme_score_durs)
            phoneme_score_durs *= (time_end - time_start) / np.sum(phoneme_score_durs)

            # onsets start from time 0, syllable and phoneme onsets
            syllable_gt_onsets_0start = np.array(
                syllable_gt_onsets) - syllable_gt_onsets[0]
            phoneme_gt_onsets_0start = np.array(
                phoneme_gt_onsets) - phoneme_gt_onsets[0]
            phoneme_gt_onsets_0start_without_syllable_onsets = \
                np.setdiff1d(phoneme_gt_onsets_0start, syllable_gt_onsets_0start)

            # Check the annotations, if syllable onsets are also phoneme
            # onsets. RuntimeError is what the former bare `raise` produced
            # on Python 3; it now carries a message.
            if not set(syllable_gt_onsets).issubset(set(phoneme_gt_onsets)):
                raise RuntimeError(
                    'ground truth syllable onsets are not a subset of the '
                    'phoneme onsets: {} {} line {}'.format(
                        artist_path, fn, ii_line))

            if not set(syllable_score_onsets).issubset(
                    set(phoneme_score_onsets)):
                raise RuntimeError(
                    'score syllable onsets are not a subset of the '
                    'phoneme onsets: {} {} line {}'.format(
                        artist_path, fn, ii_line))

            # line level feature
            mfcc_line = mfcc[frame_start:frame_end]
            mfcc_reshaped_line = mfcc_reshaped[frame_start:frame_end]
            mfcc_reshaped_line = np.expand_dims(mfcc_reshaped_line, axis=1)

            if use_joint_obs:
                mfcc_reshaped_line_joint = mfcc_reshaped_joint[
                    frame_start:frame_end]
                mfcc_reshaped_line_joint = np.expand_dims(
                    mfcc_reshaped_line_joint, axis=1)
                _, obs_joint_phoneme = model_joint.predict(
                    mfcc_reshaped_line_joint, batch_size=128, verbose=2)
                obs_joint_phoneme = obs_joint_phoneme[:, 0]
            else:
                obs_joint_phoneme = None

            # transition matrix
            mat_tran = singleTransMatBuild(phoneme_score_labels_mapped)

            # initialize the HSMM
            # set proportionality to 0.2 in some sample will break
            hsmm = LRHSMM(mat_tran,
                          phoneme_score_labels_mapped,
                          phoneme_score_durs,
                          proportionality_std=0.2)

            # calculate observation
            hsmm.mapBKeras(observations=mfcc_reshaped_line,
                           kerasModel=model_keras_cnn_0,
                           obs_onset_phn=obs_joint_phoneme,
                           use_joint_obs=use_joint_obs,
                           debug_mode=debug_mode)

            forwardDelta, \
            previousState, \
            state, \
            stateIn, \
            occupancy, \
            tau = hsmm._inferenceInit(observations=mfcc_reshaped_line)

            path, posteri_proba = hsmm._viterbiHSMM(
                forwardDelta,
                previousState,
                state,
                stateIn,
                occupancy,
                tau,
                obsOnsetPhn=obs_joint_phoneme)

            num_frames = len(path)

            # construct ground truth path
            # NOTE(review): path_gt is not consumed anywhere below; it is kept
            # so the computation performed by this function stays the same.
            phoneme_gt_onsets_0start_frame = list(
                np.floor(phoneme_gt_onsets_0start *
                         (num_frames / (time_end - time_start))))
            # build the lookup once instead of slicing the list per frame
            gt_onset_frames = set(phoneme_gt_onsets_0start_frame[1:])
            path_gt = np.zeros((num_frames, ), dtype='int')
            state_num = 0
            for ii_path in range(num_frames):
                if ii_path in gt_onset_frames:
                    state_num += 1
                path_gt[ii_path] = state_num

            # detected phoneme onsets: frame 0 plus every frame where the
            # decoded state changes
            phoneme_start_frame = [0]
            for ii_path in range(num_frames - 1):
                if path[ii_path] != path[ii_path + 1]:
                    phoneme_start_frame.append(ii_path + 1)

            boundaries_phoneme_start_time = list(
                np.array(phoneme_start_frame) * (time_end - time_start) /
                num_frames)

            # keep the phoneme boundaries that are also syllable onsets
            syllable_onset_indices = set(idx_syllable_score_phoneme)
            boundaries_syllable_start_time = [
                start_time
                for ii_bpst, start_time in enumerate(
                    boundaries_phoneme_start_time)
                if ii_bpst in syllable_onset_indices
            ]

            # remove the silence from the score and the ground truth onset time
            if u'' in phoneme_gt_labels:
                phoneme_gt_onsets_0start, phoneme_gt_labels = remove_silence(
                    phoneme_gt_onsets_0start, phoneme_gt_labels)

            if u'' in phoneme_score_labels:
                boundaries_phoneme_start_time, phoneme_score_labels = remove_silence(
                    boundaries_phoneme_start_time, phoneme_score_labels)

            results_aggregation_save_helper(
                syllable_gt_onsets_0start, syllable_gt_labels,
                boundaries_syllable_start_time, syllable_score_labels,
                phoneme_gt_onsets_0start, phoneme_gt_labels,
                boundaries_phoneme_start_time, phoneme_score_labels,
                eval_results_path, artist_path, fn, ii_line,
                time_end - time_start)

            if plot:
                figure_plot_hsmm(
                    mfcc_line, syllable_gt_onsets_0start,
                    phoneme_gt_onsets_0start_without_syllable_onsets, hsmm,
                    phoneme_score_labels_mapped, path,
                    boundaries_phoneme_start_time,
                    boundaries_syllable_start_time, syllable_score_durs,
                    phoneme_score_durs, obs_joint_phoneme)