Example 1
def syllable_prob(wav_file):
    """
    Load the joint CNN model and compute the syllable onset probability.

    :return: syllable onset probability, 1-D numpy array
    """
    model_joint = load_model(model_joint_path)
    scaler_joint = pickle.load(open(model_scaler_path, 'rb'))
    log_mel_old = get_log_mel_madmom(wav_file,
                                     fs=fs_wav,
                                     hopsize_t=hopsize_t,
                                     channel=1)
    log_mel = scaler_joint.transform(log_mel_old)
    log_mel = feature_reshape(log_mel, nlen=7)
    log_mel = np.expand_dims(log_mel, axis=1)

    obs_syllable, obs_phoneme = model_joint.predict(log_mel,
                                                    batch_size=128,
                                                    verbose=2)

    obs_syllable = np.squeeze(obs_syllable)
    obs_syllable = smooth_obs(obs_syllable)
    obs_syllable[0] = 1.0
    obs_syllable[-1] = 0.0
    return obs_syllable
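
# A minimal usage sketch (illustrative only): syllable_prob relies on the
# module-level globals model_joint_path, model_scaler_path, fs_wav and
# hopsize_t; the paths and values below are assumptions, not taken from
# the project.
model_joint_path = './cnnModels/joint/jan_joint0.h5'        # assumed path
model_scaler_path = './cnnModels/joint/scaler_joint.pkl'    # assumed path
fs_wav, hopsize_t = 44100, 0.01                             # assumed values
obs = syllable_prob('student_take.wav')                     # assumed input file
print(obs.shape, float(obs.min()), float(obs.max()))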
Example 2
def _main(wav_file,input_json,output_json,mode):
	root_path = os.path.join(os.path.dirname(__file__))
	joint_cnn_model_path = os.path.join(root_path, 'cnnModels', 'joint')
	# load keras joint cnn model
	model_joint = load_model(os.path.join(joint_cnn_model_path, 'jan_joint0.h5'))
	# load log mel feature scaler
	scaler_joint = pickle.load(open(os.path.join(joint_cnn_model_path, 'scaler_joint.pkl'), 'rb'))
	data_wav, fs_wav = librosa.load(wav_file,sr=44100)
	mfshs = MFSHS(data_wav)
	mfshs.frame()
	pitches = mfshs.pitches
	zeroAmploc = mfshs.zeroAmploc
	#frequency = np.array(pitchResult['frequency'])

	log_mel_old = get_log_mel_madmom(wav_file, fs=fs_wav, hopsize_t=hopsize_t, channel=1)
	log_mel = scaler_joint.transform(log_mel_old)
	log_mel = feature_reshape(log_mel, nlen=7)
	log_mel = np.expand_dims(log_mel, axis=1)
	obs_syllable, obs_phoneme = model_joint.predict(log_mel, batch_size=128, verbose=2)
	obs_syllable = np.squeeze(obs_syllable)
	obs_syllable = smooth_obs(obs_syllable)
	obs_syllable[0] = 1.0
	obs_syllable[-1] = 0.0

	#print sf_onset_frame
	score_note,pauseLoc = parse_musescore(input_json)
	resultOnset = findPeak(obs_syllable,pitches,score_note)
	Note_and_onset = pitch_Note(pitches,resultOnset['onset_frame'],score_note)
	score_note = np.array(score_note)
	result_loc_info = sw_alignment(score_note,Note_and_onset['notes'])

	#result_info = saveJson(filename_json,pitches,resultOnset['onset_frame'],score_note,pauseLoc,mode)
	post_proprocess(output_json,pitches,resultOnset['onset_frame'],zeroAmploc,score_note,pauseLoc,result_loc_info,mode)
Example 3
def dump_feature_phn(wav_path, textgrid_path, recordings, syllableTierName,
                     phonemeTierName):
    """
    Dump feature for each phoneme
    :param wav_path:
    :param textgrid_path:
    :param recordings:
    :param syllableTierName:
    :param phonemeTierName:
    :return:
    """

    dic_pho_feature = {}

    for _, pho in enumerate(set(dic_pho_map.values())):
        dic_pho_feature[pho] = np.array([])

    for artist_path, recording in recordings:
        nestedPhonemeLists, numSyllables, numPhonemes   \
            = syllableTextgridExtraction(textgrid_path=textgrid_path,
                                         recording=join(artist_path,recording),
                                         tier0=syllableTierName,
                                         tier1=phonemeTierName)

        # audio
        wav_full_filename = join(wav_path, artist_path, recording + '.wav')

        log_mel = get_log_mel_madmom(wav_full_filename,
                                     fs,
                                     hopsize_t,
                                     channel=1)

        for ii, pho in enumerate(nestedPhonemeLists):
            print('calculating', recording, 'and phoneme', str(ii), 'of',
                  str(len(nestedPhonemeLists)))
            for p in pho[1]:
                # map from annotated xsampa to readable notation
                try:
                    key = dic_pho_map[p[2]]
                except KeyError:
                    print(artist_path, recording)
                    print(ii, p[2])
                    raise

                sf = int(round(p[0] * fs / float(hopsize)))  # starting frame
                ef = int(round(p[1] * fs / float(hopsize)))  # ending frame

                log_mel_phn = log_mel[sf:ef, :]  # log-mel frames of this phoneme

                if not len(dic_pho_feature[key]):
                    dic_pho_feature[key] = log_mel_phn
                else:
                    dic_pho_feature[key] = np.vstack(
                        (dic_pho_feature[key], log_mel_phn))

    return dic_pho_feature
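
# A minimal usage sketch (illustrative only): recordings is a list of
# (artist_folder, recording_name) pairs, matching the join(artist_path,
# recording) calls above; the paths, folder names and tier names below are
# assumptions, and dic_pho_map, fs, hopsize and hopsize_t are expected as
# module-level globals in the original project.
recordings = [('artist_01', 'recording_01')]
dic_pho_feature = dump_feature_phn(wav_path='./wav',
                                   textgrid_path='./textgrid',
                                   recordings=recordings,
                                   syllableTierName='dianSilence',
                                   phonemeTierName='details')
for phn, feat in dic_pho_feature.items():
    print(phn, feat.shape)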
Example 4
def det_syllable_prob(wav_file, model_joint, scaler_joint):
    log_mel_old = get_log_mel_madmom(wav_file,
                                     fs=fs_wav,
                                     hopsize_t=hopsize_t,
                                     channel=1)
    print(log_mel_old.shape)
    log_mel = scaler_joint.transform(log_mel_old)
    log_mel = feature_reshape(log_mel, nlen=7)
    log_mel = np.expand_dims(log_mel, axis=1)

    obs_syllable, obs_phoneme = model_joint.predict(log_mel,
                                                    batch_size=128,
                                                    verbose=2)

    obs_syllable = np.squeeze(obs_syllable)
    obs_syllable = smooth_obs(obs_syllable)
    obs_syllable[0] = 1.0
    obs_syllable[-1] = 0.0
    return obs_syllable
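
# A minimal usage sketch (illustrative only): the model and scaler are loaded
# the same way as in the other examples; the paths and the input file are
# assumptions, and fs_wav / hopsize_t are expected as module-level globals.
model_joint = load_model('./cnnModels/joint/jan_joint0.h5')                   # assumed path
scaler_joint = pickle.load(open('./cnnModels/joint/scaler_joint.pkl', 'rb'))  # assumed path
obs_syllable = det_syllable_prob('student_take.wav', model_joint, scaler_joint)
print(obs_syllable.shape)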
Example 5
def dump_feature_onset_helper(wav_path, textgrid_path, artist_name,
                              recording_name):

    groundtruth_textgrid_file = os.path.join(textgrid_path, artist_name,
                                             recording_name + '.TextGrid')
    wav_file = os.path.join(wav_path, artist_name, recording_name + '.wav')

    lineList = textGrid2WordList(groundtruth_textgrid_file, whichTier='line')
    utteranceList = textGrid2WordList(groundtruth_textgrid_file,
                                      whichTier='dianSilence')
    phonemeList = textGrid2WordList(groundtruth_textgrid_file,
                                    whichTier='details')

    # parse lines of groundtruth
    nestedUtteranceLists, numLines, numUtterances = wordListsParseByLines(
        lineList, utteranceList)
    nestedPhonemeLists, _, _ = wordListsParseByLines(lineList, phonemeList)

    # compute the log-mel features
    mfcc = get_log_mel_madmom(wav_file, fs, hopsize_t, channel=1)

    return nestedUtteranceLists, nestedPhonemeLists, mfcc, phonemeList
Example 6
def onset_function_all_recordings(wav_path,
                                  textgrid_path,
                                  scaler,
                                  test_recordings,
                                  model_keras_cnn_0,
                                  cnnModel_name,
                                  eval_results_path,
                                  obs_cal='tocal',
                                  plot=False,
                                  save_data=False,
                                  missing_phn=False):
    """
    ODF and viterbi decoding
    :param wav_path: string, the path of the .wav files
    :param textgrid_path: string, the path of the .textgrid files
    :param scaler: sklearn scaler object
    :param test_recordings: list, the test recording names
    :param model_keras_cnn_0: loaded keras CNN model
    :param cnnModel_name: string, CNN model name, used to build the path for saving the observation functions
    :param eval_results_path: string, where to put the evaluation results
    :param obs_cal: string, 'tocal' or 'toload', whether to calculate or load the observation function
    :param plot: bool
    :param save_data: bool, whether to save the wav, duration and label data
    :param missing_phn: bool, whether to consider the missing phonemes in actual singing, for experiment (not in the paper)
    :return:
    """

    for artist_path, fn in test_recordings:

        print('Calculating for artist:', artist_path, 'filename:', fn)

        # use the teacher's text grid as the score
        score_text_grid_file = join(textgrid_path, artist_path,
                                    'teacher.TextGrid')
        # student text grid
        ground_truth_text_grid_file = join(textgrid_path, artist_path,
                                           fn + '.TextGrid')

        # student .wav
        wav_file = join(wav_path, artist_path, fn + '.wav')

        # parse teacher (score) and student (ground truth) text grid file
        score_syllable_lists, score_phoneme_lists = \
            textgrid_syllable_phoneme_parser(score_text_grid_file, 'dianSilence', 'detailsSilence')
        gt_syllable_lists, gt_phoneme_lists = \
            textgrid_syllable_phoneme_parser(ground_truth_text_grid_file, 'dianSilence', 'details')

        # do audio preprocessing
        if obs_cal == 'tocal' or plot:
            mfcc = get_log_mel_madmom(wav_file, fs, hopsize_t, channel=1)
            mfcc_scaled = scaler.transform(mfcc)
            mfcc_reshaped = feature_reshape(mfcc_scaled, nlen=7)

        for ii_line in range(len(gt_syllable_lists)):
            print('line:', ii_line)

            # observation path, save the onset function for the next time calculation
            obs_path = join('./obs', cnnModel_name, artist_path)
            obs_syllable_filename = fn + '_syllable_' + str(ii_line +
                                                            1) + '.pkl'
            obs_phoneme_filename = fn + '_phoneme_' + str(ii_line + 1) + '.pkl'

            # sometimes the score and ground truth text grids do not start from the same phrase,
            # ii_aug is the offset
            ii_aug = findShiftOffset(gt_syllable_lists, score_syllable_lists,
                                     ii_line)

            # calculate necessary information from the text grid
            frame_start, frame_end, \
            time_start, time_end, \
            syllable_gt_onsets, syllable_gt_labels, \
            phoneme_gt_onsets, phoneme_gt_labels, \
            syllable_score_onsets, syllable_score_labels, \
            phoneme_score_onsets, phoneme_score_labels, \
            syllable_score_durs, phoneme_list_score = \
                gt_score_preparation_helper(gt_syllable_lists,
                                            score_syllable_lists,
                                            gt_phoneme_lists,
                                            score_phoneme_lists,
                                            ii_line,
                                            ii_aug)

            # collect phoneme durations and labels
            phoneme_score_durs_grouped_by_syllables = []
            phoneme_score_labels_grouped_by_syllables = []
            phoneme_score_durs_syllable = []
            phoneme_score_labels_syllable = []
            for pls in phoneme_list_score:

                # when the phoneme onset time is also the syllable onset time
                if pls[0] in syllable_score_onsets[1:]:
                    phoneme_score_durs_grouped_by_syllables.append(
                        phoneme_score_durs_syllable)
                    phoneme_score_labels_grouped_by_syllables.append(
                        phoneme_score_labels_syllable)
                    phoneme_score_durs_syllable = []
                    phoneme_score_labels_syllable = []

                phoneme_score_durs_syllable.append(pls[1] - pls[0])
                phoneme_score_labels_syllable.append(pls[2])

                if pls == phoneme_list_score[-1]:
                    phoneme_score_durs_grouped_by_syllables.append(
                        phoneme_score_durs_syllable)
                    phoneme_score_labels_grouped_by_syllables.append(
                        phoneme_score_labels_syllable)

            # onsets start from time 0
            syllable_gt_onsets_0start = np.array(
                syllable_gt_onsets) - syllable_gt_onsets[0]
            phoneme_gt_onsets_0start = np.array(
                phoneme_gt_onsets) - phoneme_gt_onsets[0]
            phoneme_gt_onsets_0start_without_syllable_onsets = \
                np.setdiff1d(phoneme_gt_onsets_0start, syllable_gt_onsets_0start)

            if not set(syllable_gt_onsets).issubset(set(phoneme_gt_onsets)):
                raise ValueError('the ground truth syllable onsets are not a subset of the phoneme onsets')
            if not set(syllable_score_onsets).issubset(
                    set(phoneme_score_onsets)):
                raise ValueError('the score syllable onsets are not a subset of the phoneme onsets')

            frame_start = int(round(time_start / hopsize_t))
            frame_end = int(round(time_end / hopsize_t))

            syllable_score_durs = np.array(syllable_score_durs)
            syllable_score_durs *= (time_end -
                                    time_start) / np.sum(syllable_score_durs)

            if obs_cal == 'tocal' or plot:
                mfcc_line = mfcc[frame_start:frame_end]
                mfcc_reshaped_line = mfcc_reshaped[frame_start:frame_end]
                mfcc_reshaped_line = np.expand_dims(mfcc_reshaped_line, axis=1)

                # calculate syllable and phoneme onset functions
                obs_syllable, obs_phoneme = model_keras_cnn_0.predict(
                    mfcc_reshaped_line, batch_size=128, verbose=2)

                # save onset functions into obs_path
                print('save onset curve ... ...')
                if not exists(obs_path):
                    makedirs(obs_path)
                pickle.dump(obs_syllable,
                            open(join(obs_path, obs_syllable_filename), 'wb'))
                pickle.dump(obs_phoneme,
                            open(join(obs_path, obs_phoneme_filename), 'wb'))

            else:
                obs_syllable = pickle.load(
                    open(join(obs_path, obs_syllable_filename), 'rb'))
                obs_phoneme = pickle.load(
                    open(join(obs_path, obs_phoneme_filename), 'rb'))

            obs_syllable = np.squeeze(obs_syllable)
            obs_phoneme = np.squeeze(obs_phoneme)

            obs_syllable = smooth_obs(obs_syllable)
            obs_phoneme = smooth_obs(obs_phoneme)

            # decoding syllable boundaries
            obs_syllable[0] = 1.0
            obs_syllable[-1] = 1.0
            boundaries_syllable = viterbiDecodingPhonemeSeg.viterbiSegmental2(
                obs_syllable, syllable_score_durs, varin)

            # syllable boundaries
            boundaries_syllable_start_time = np.array(
                boundaries_syllable[:-1]) * hopsize_t
            boundaries_syllable_end_time = np.array(
                boundaries_syllable[1:]) * hopsize_t

            # initialize phoneme boundaries arrays
            boundaries_phoneme_start_time = np.array([])
            boundaries_phoneme_end_time = np.array([])

            # array of the phoneme durations to be concatenated
            phoneme_score_durs = np.array([])
            phoneme_score_labels = []

            # decode phoneme onsets
            for ii_syl_boundary in range(len(boundaries_syllable) - 1):

                dur_syl = boundaries_syllable_end_time[
                    ii_syl_boundary] - boundaries_syllable_start_time[
                        ii_syl_boundary]

                frame_start_syl = boundaries_syllable[ii_syl_boundary]
                frame_end_syl = boundaries_syllable[ii_syl_boundary + 1]

                obs_phoneme_syl = obs_phoneme[frame_start_syl:frame_end_syl]
                obs_phoneme_syl[0] = 1.0
                obs_phoneme_syl[-1] = 1.0

                # phoneme score durs and labels for the current syllable, used in the decoding
                phoneme_score_durs_syl = np.array(
                    phoneme_score_durs_grouped_by_syllables[ii_syl_boundary])

                if len(phoneme_score_durs_syl) < 2:
                    continue

                phoneme_score_durs_syl_vars = [phoneme_score_durs_syl
                                               ]  # init the durs_syl_vars
                if missing_phn:
                    phoneme_score_labels_syl = phoneme_score_labels_grouped_by_syllables[
                        ii_syl_boundary]
                    phoneme_score_labels_syl_vars, phoneme_score_durs_syl_vars = \
                        score_variations_phn(phoneme_score_labels_syl, phoneme_score_durs_syl)

                # missing phoneme decoding, only for experiment, not included in the paper
                if missing_phn and len(phoneme_score_durs_syl_vars) > 1:
                    boundaries_phoneme_syl_vars = []
                    phoneme_score_durs_syl_vars_norm = []
                    posterior_vars = []
                    for ii in range(len(phoneme_score_durs_syl_vars)):
                        phoneme_score_labels_syl_vars_ii = phoneme_score_labels_syl_vars[
                            ii]
                        phoneme_score_durs_syl_vars_ii = np.array(
                            phoneme_score_durs_syl_vars[ii])
                        phoneme_score_durs_syl_vars_ii *= dur_syl / np.sum(
                            phoneme_score_durs_syl_vars_ii)
                        boundaries_phoneme_syl_vars_ii, pp_ii = \
                            viterbiDecodingPhonemeSeg.viterbiSegmentalPenalized(obs_phoneme_syl,
                                                                                phoneme_score_durs_syl_vars_ii,
                                                                                varin)
                        posterior = pp_ii / np.power(
                            len(phoneme_score_labels_syl_vars_ii),
                            varin['posterior_norm'])

                        boundaries_phoneme_syl_vars.append(
                            boundaries_phoneme_syl_vars_ii)
                        phoneme_score_durs_syl_vars_norm.append(
                            phoneme_score_durs_syl_vars_ii)
                        posterior_vars.append(posterior)

                    # skip the syllable when all candidate posteriors are inf or nan
                    if len(posterior_vars) and np.all(
                            np.isinf(posterior_vars) +
                            np.isnan(posterior_vars)):
                        continue

                    idx_max_posterior = np.argmax(posterior_vars)
                    boundaries_phoneme_syl = boundaries_phoneme_syl_vars[
                        idx_max_posterior]
                    phoneme_score_labels += phoneme_score_labels_syl_vars[
                        idx_max_posterior]
                    phoneme_score_durs_syl = phoneme_score_durs_syl_vars_norm[
                        idx_max_posterior]
                    # print(idx_max_posterior)

                else:
                    phoneme_score_durs_syl *= dur_syl / np.sum(
                        phoneme_score_durs_syl)
                    phoneme_score_labels += phoneme_score_labels_grouped_by_syllables[
                        ii_syl_boundary]
                    boundaries_phoneme_syl = \
                        viterbiDecodingPhonemeSeg.viterbiSegmental2(obs_phoneme_syl, phoneme_score_durs_syl, varin)

                # phoneme boundaries
                boundaries_phoneme_syl_start_time = \
                    (np.array(boundaries_phoneme_syl[:-1]) + frame_start_syl) * hopsize_t
                boundaries_phoneme_syl_end_time = (np.array(
                    boundaries_phoneme_syl[1:]) + frame_start_syl) * hopsize_t

                boundaries_phoneme_start_time = \
                    np.concatenate((boundaries_phoneme_start_time, boundaries_phoneme_syl_start_time))
                boundaries_phoneme_end_time = \
                    np.concatenate((boundaries_phoneme_end_time, boundaries_phoneme_syl_end_time))

                phoneme_score_durs = np.concatenate(
                    (phoneme_score_durs, phoneme_score_durs_syl))

            phoneme_score_durs *= (time_end -
                                   time_start) / np.sum(phoneme_score_durs)

            # save the results
            results_aggregation_save_helper(
                syllable_gt_onsets_0start, syllable_gt_labels,
                boundaries_syllable_start_time, syllable_score_labels,
                phoneme_gt_onsets_0start, phoneme_gt_labels,
                boundaries_phoneme_start_time, phoneme_score_labels,
                eval_results_path, artist_path, fn, ii_line,
                time_end - time_start)

            if plot:
                figure_plot_joint(
                    mfcc_line, syllable_gt_onsets_0start,
                    phoneme_gt_onsets_0start_without_syllable_onsets,
                    obs_syllable, boundaries_syllable_start_time, obs_phoneme,
                    boundaries_phoneme_start_time, syllable_score_durs,
                    phoneme_score_durs)

            if save_data:
                # save wav line
                data_wav, fs_wav = sf.read(wav_file)
                sf.write(
                    './temp/wav_line_' + str(ii_line) + '.wav',
                    data_wav[int(time_start * fs_wav):int(time_end * fs_wav)],
                    fs_wav)
                print(time_start, time_end)

                # save durations:
                pickle.dump(
                    syllable_score_durs,
                    open('./temp/syllable_score_durs_' + str(ii_line) + '.pkl',
                         'wb'),
                    protocol=2)
                pickle.dump(
                    phoneme_score_durs_grouped_by_syllables,
                    open(
                        './temp/phoneme_score_durs_grouped_by_syllables_' +
                        str(ii_line) + '.pkl', 'wb'),
                    protocol=2)
                print(syllable_score_durs)
                print(phoneme_score_durs_grouped_by_syllables)

                # save labels:
                pickle.dump(syllable_score_labels,
                            open(
                                './temp/syllable_score_labels_' +
                                str(ii_line) + '.pkl', 'wb'),
                            protocol=2)
                pickle.dump(
                    phoneme_score_labels_grouped_by_syllables,
                    open(
                        './temp/phoneme_score_labels_grouped_by_syllables_' +
                        str(ii_line) + '.pkl', 'wb'),
                    protocol=2)
                print(syllable_score_labels)
                print(phoneme_score_labels_grouped_by_syllables)
Example 7
def _main(wav_file,score_file,est_file=None):
	print(wav_file)
	data_wav, fs_wav = librosa.load(wav_file,sr=44100)
	#start_time = time.time()
	start_time = time.time()
	
	mfshs = MFSHS(data_wav)
	mfshs.frame()

	pitches = mfshs.pitches
	energes = mfshs.energes
	zeroAmploc = mfshs.zeroAmploc
	#print('pitch detection time:',time.time()-start_time)

	root_path = os.path.join(os.path.dirname(__file__))
	joint_cnn_model_path = os.path.join(root_path, 'cnnModels', 'joint')

	# load keras joint cnn model
	model_joint = load_model(os.path.join(joint_cnn_model_path, 'jan_joint0.h5'))
	# load log mel feature scaler
	scaler_joint = pickle.load(open(os.path.join(joint_cnn_model_path, 'scaler_joint.pkl'), 'rb'))


	log_mel_old = get_log_mel_madmom(wav_file, fs=fs_wav, hopsize_t=hopsize_t, channel=1)
	log_mel = scaler_joint.transform(log_mel_old)
	log_mel = feature_reshape(log_mel, nlen=7)
	log_mel = np.expand_dims(log_mel, axis=1)

	#start_time = time.time()
	obs_syllable, obs_phoneme = model_joint.predict(log_mel, batch_size=128, verbose=2)
	#print('cnn detection time: ',time.time()-start_time)

	obs_syllable = np.squeeze(obs_syllable)
	obs_syllable = smooth_obs(obs_syllable)
	obs_syllable[0] = 1.0
	obs_syllable[-1] = 0.0

	#start_time = time.time()

	score_note,pauseLoc = parse_musescore(score_file)

	resultOnset = findPeak(obs_syllable,pitches,score_note,est_file)
	filename_json = os.path.splitext(wav_file)[0]+".json"
	#print('post-processing time :' ,time.time()-start_time)
	
	Note_and_onset = pitch_Note(pitches,resultOnset['onset_frame'],score_note)
	#draw_energe(energes,resultOnset['onset_frame'],zeroAmploc)
	score_note = np.array(score_note)
	result_loc_info = sw_alignment(score_note,Note_and_onset['notes'])

	#result_info,paddingzero_frame = saveJson(filename_json,pitches,resultOnset['onset_frame'],score_note,pauseLoc,0)
	result_info,det_Note = post_proprocess(filename_json,pitches,resultOnset['onset_frame'],zeroAmploc,score_note,pauseLoc,result_loc_info,0)

	#print("total time:",time.time()-start_time)
	filename_pitch = os.path.splitext(wav_file)[0]+"_pitch.txt"
	mfshs.saveArray(filename_pitch,pitches)
	filename_onset = os.path.splitext(wav_file)[0]+"_onset.txt"
	mfshs.saveArray(filename_onset,resultOnset['onset_time'])
	filename_score = os.path.splitext(wav_file)[0]+"_score.txt"
	mfshs.saveArray(filename_score,score_note)
	filename_detnote = os.path.splitext(wav_file)[0]+"_detnote.txt"
	mfshs.saveArray(filename_detnote,np.round(np.array(det_Note),2))

	return result_info['score']
Example 8
# get wav duration
data_wav, fs_wav = sf.read(wav_file)
time_wav = len(data_wav) / float(fs_wav)

onset_time_ref, syllable_durations_ref = get_onset_time_syllable_duration_ref(
    syllable_durations=syllable_durations, len_audio=time_wav)

print(onset_time_ref)
print(syllable_durations_ref)

results_vad = VAD(wav_file=wav_file, hopsize_t=hopsize_t)

# calculate log mel feature
log_mel_old = get_log_mel_madmom(wav_file,
                                 fs=fs_wav,
                                 hopsize_t=hopsize_t,
                                 channel=1)
log_mel = scaler_joint.transform(log_mel_old)
log_mel = feature_reshape(log_mel, nlen=7)
log_mel = np.expand_dims(log_mel, axis=1)

# get the onset detection function
obs_syllable, obs_phoneme = model_joint.predict(log_mel,
                                                batch_size=128,
                                                verbose=2)

# post-processing the detection function
obs_syllable = np.squeeze(obs_syllable)
obs_phoneme = np.squeeze(obs_phoneme)

obs_syllable = smooth_obs(obs_syllable)
Example 9
def phoneme_seg_all_recordings(wav_path,
                               textgrid_path,
                               scaler,
                               scaler_joint,
                               test_recordings,
                               model_keras_cnn_0,
                               model_joint,
                               eval_results_path,
                               use_joint_obs=False,
                               plot=False,
                               debug_mode=False):
    """
    :param wav_path:
    :param textgrid_path:
    :param scaler:
    :param scaler_joint: onset detection joing model scaler, for experiment, not included in the paper
    :param test_recordings:
    :param model_keras_cnn_0:
    :param model_joint: onset detection joint model, for experiment, not included in the paper
    :param eval_results_path:
    :param use_joint_obs: bool
    :param plot: bool
    :param debug_mode: bool
    :return:
    """

    for artist_path, fn in test_recordings:

        print('Calculating for artist:', artist_path, 'filename:', fn)

        score_textgrid_file = join(textgrid_path, artist_path,
                                   'teacher.TextGrid')
        groundtruth_textgrid_file = join(textgrid_path, artist_path,
                                         fn + '.TextGrid')
        wav_file = join(wav_path, artist_path, fn + '.wav')
        scoreSyllableLists, scorePhonemeLists = textgrid_syllable_phoneme_parser(
            score_textgrid_file, 'dianSilence', 'details')
        gtSyllableLists, gtPhonemeLists = textgrid_syllable_phoneme_parser(
            groundtruth_textgrid_file, 'dianSilence', 'details')

        # calculate mfcc
        mfcc = get_log_mel_madmom(wav_file, fs, hopsize_t, channel=1)
        mfcc_scaled = scaler.transform(mfcc)
        mfcc_reshaped = feature_reshape(mfcc_scaled, nlen=7)

        if use_joint_obs:
            mfcc_scaled_joint = scaler_joint.transform(mfcc)
            mfcc_reshaped_joint = feature_reshape(mfcc_scaled_joint, nlen=7)

        for ii_line in range(len(gtSyllableLists)):
            print('line:', ii_line)

            # search for the corresponding score line
            ii_aug = findShiftOffset(gtSyllableLists, scoreSyllableLists,
                                     ii_line)

            frame_start, frame_end, \
            time_start, time_end, \
            syllable_gt_onsets, syllable_gt_labels, \
            phoneme_gt_onsets, phoneme_gt_labels, \
            syllable_score_onsets, syllable_score_labels, \
            phoneme_score_onsets, phoneme_score_labels, \
            syllable_score_durs, phoneme_list_score = \
                            gt_score_preparation_helper(gtSyllableLists,
                                                        scoreSyllableLists,
                                                        gtPhonemeLists,
                                                        scorePhonemeLists,
                                                        ii_line,
                                                        ii_aug)

            # phoneme durations and labels
            phoneme_score_durs = []
            # index of syllable onsets in phoneme onsets list
            idx_syllable_score_phoneme = []
            for ii_pls, pls in enumerate(phoneme_list_score):
                # when the phoneme onset time is also the syllable onset time
                phoneme_score_durs.append(pls[1] - pls[0])

                if pls[0] in syllable_score_onsets:
                    idx_syllable_score_phoneme.append(ii_pls)

            # map the phone labels
            phoneme_score_labels_mapped = [
                dic_pho_map[l] for l in phoneme_score_labels
            ]

            # normalize phoneme score durations
            phoneme_score_durs = np.array(phoneme_score_durs)
            phoneme_score_durs *= (time_end -
                                   time_start) / np.sum(phoneme_score_durs)

            # onsets start from time 0, syllable and phoneme onsets
            syllable_gt_onsets_0start = np.array(
                syllable_gt_onsets) - syllable_gt_onsets[0]
            phoneme_gt_onsets_0start = np.array(
                phoneme_gt_onsets) - phoneme_gt_onsets[0]
            phoneme_gt_onsets_0start_without_syllable_onsets = \
                np.setdiff1d(phoneme_gt_onsets_0start, syllable_gt_onsets_0start)

            # check the annotations: every syllable onset must also be a phoneme onset
            if not set(syllable_gt_onsets).issubset(set(phoneme_gt_onsets)):
                raise ValueError('the ground truth syllable onsets are not a subset of the phoneme onsets')
            if not set(syllable_score_onsets).issubset(
                    set(phoneme_score_onsets)):
                raise ValueError('the score syllable onsets are not a subset of the phoneme onsets')

            # line level mfcc
            mfcc_line = mfcc[frame_start:frame_end]
            mfcc_reshaped_line = mfcc_reshaped[frame_start:frame_end]
            mfcc_reshaped_line = np.expand_dims(mfcc_reshaped_line, axis=1)

            if use_joint_obs:
                mfcc_reshaped_line_joint = mfcc_reshaped_joint[
                    frame_start:frame_end]
                mfcc_reshaped_line_joint = np.expand_dims(
                    mfcc_reshaped_line_joint, axis=1)
                _, obs_joint_phoneme = model_joint.predict(
                    mfcc_reshaped_line_joint, batch_size=128, verbose=2)

                obs_joint_phoneme = obs_joint_phoneme[:, 0]
                # obs_joint_phoneme[:20] = 0.0
            else:
                obs_joint_phoneme = None

            # transition matrix
            mat_tran = singleTransMatBuild(phoneme_score_labels_mapped)

            # initialize the HSMM
            # note: setting proportionality_std to 0.2 breaks on some samples
            hsmm = LRHSMM(mat_tran,
                          phoneme_score_labels_mapped,
                          phoneme_score_durs,
                          proportionality_std=0.2)

            # calculate observation
            hsmm.mapBKeras(observations=mfcc_reshaped_line,
                           kerasModel=model_keras_cnn_0,
                           obs_onset_phn=obs_joint_phoneme,
                           use_joint_obs=use_joint_obs,
                           debug_mode=debug_mode)

            forwardDelta, \
            previousState, \
            state, \
            stateIn, \
            occupancy, \
            tau = hsmm._inferenceInit(observations=mfcc_reshaped_line)

            path, posteri_proba = hsmm._viterbiHSMM(
                forwardDelta,
                previousState,
                state,
                stateIn,
                occupancy,
                tau,
                obsOnsetPhn=obs_joint_phoneme)

            # construct ground truth path
            phoneme_gt_onsets_0start_frame = list(
                np.floor(phoneme_gt_onsets_0start * (len(path) /
                                                     (time_end - time_start))))
            path_gt = np.zeros((len(path), ), dtype='int')
            state_num = 0
            for ii_path in range(len(path)):
                if ii_path in phoneme_gt_onsets_0start_frame[1:]:
                    state_num += 1
                path_gt[ii_path] = state_num

            # detected phoneme onsets
            phoneme_start_frame = [0]
            for ii_path in range(len(path) - 1):
                if path[ii_path] != path[ii_path + 1]:
                    phoneme_start_frame.append(ii_path + 1)

            boundaries_phoneme_start_time = list(
                np.array(phoneme_start_frame) * (time_end - time_start) /
                len(path))
            boundaries_syllable_start_time = [
                boundaries_phoneme_start_time[ii_bpst]
                for ii_bpst in range(len(boundaries_phoneme_start_time))
                if ii_bpst in idx_syllable_score_phoneme
            ]

            # remove the silence from the score and the ground truth onset time
            if u'' in phoneme_gt_labels:
                phoneme_gt_onsets_0start, phoneme_gt_labels = remove_silence(
                    phoneme_gt_onsets_0start, phoneme_gt_labels)

            if u'' in phoneme_score_labels:
                boundaries_phoneme_start_time, phoneme_score_labels = remove_silence(
                    boundaries_phoneme_start_time, phoneme_score_labels)

            results_aggregation_save_helper(
                syllable_gt_onsets_0start, syllable_gt_labels,
                boundaries_syllable_start_time, syllable_score_labels,
                phoneme_gt_onsets_0start, phoneme_gt_labels,
                boundaries_phoneme_start_time, phoneme_score_labels,
                eval_results_path, artist_path, fn, ii_line,
                time_end - time_start)

            if plot:
                figure_plot_hsmm(
                    mfcc_line, syllable_gt_onsets_0start,
                    phoneme_gt_onsets_0start_without_syllable_onsets, hsmm,
                    phoneme_score_labels_mapped, path,
                    boundaries_phoneme_start_time,
                    boundaries_syllable_start_time, syllable_score_durs,
                    phoneme_score_durs, obs_joint_phoneme)