Code Example #1
def dump_feature_onset_helper(lab, wav_path, textgrid_path, score_path,
                              artist_name, recording_name, feature_type):
    """
    load or parse audio, textgrid
    :param lab:
    :param wav_path:
    :param textgrid_path:
    :param score_path:
    :param artist_name:
    :param recording_name:
    :param feature_type:
    :return:
    """
    if not lab:
        ground_truth_textgrid_file = os.path.join(textgrid_path, artist_name,
                                                  recording_name + '.TextGrid')
        wav_file = os.path.join(wav_path, artist_name, recording_name + '.wav')
        line_list = textGrid2WordList(ground_truth_textgrid_file,
                                      whichTier='line')
        utterance_list = textGrid2WordList(ground_truth_textgrid_file,
                                           whichTier='dianSilence')

        # parse lines of groundtruth
        nested_utterance_lists, num_lines, num_utterances = wordListsParseByLines(
            line_list, utterance_list)
    else:
        ground_truth_textgrid_file = os.path.join(textgrid_path, artist_name,
                                                  recording_name + '.lab')
        wav_file = os.path.join(wav_path, artist_name, recording_name + '.mp3')
        nested_utterance_lists = [
            lab2WordList(ground_truth_textgrid_file, label=True)
        ]

    # parse score
    score_file = os.path.join(score_path, artist_name, recording_name + '.csv')
    _, utterance_durations, bpm = csvDurationScoreParser(score_file)

    # load audio
    if feature_type == 'madmom':
        mfcc = getMFCCBands2DMadmom(wav_file, fs, hopsize_t, channel=1)
    else:
        raise ValueError(feature_type + ' feature type does not exist.')

    return nested_utterance_lists, utterance_durations, bpm, mfcc
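This snippet references fs and hopsize_t without passing them in; in the original project they are module-level constants. A minimal sketch with plausible values (the numbers below are assumptions for illustration, not taken from the source):

# hypothetical constants, assumed for illustration only
fs = 44100                       # sampling rate in Hz
hopsize = 441                    # hop size in samples
hopsize_t = hopsize / float(fs)  # hop size in seconds (0.01 s = 10 ms)

# a frame index i then corresponds to the time i * hopsize_t
print('frame 150 starts at %.2f s' % (150 * hopsize_t))  # 1.50 s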
Code Example #2
def odf_calculation_schluter_phrase(audio_filename,
                                    scaler_0,
                                    model_keras_cnn_0,
                                    fs,
                                    hopsize_t,
                                    len_seq,
                                    stateful):

    mfcc = getMFCCBands2DMadmom(audio_filename, fs=fs, hopsize_t=hopsize_t, channel=1)
    mfcc_scaled = scaler_0.transform(mfcc)

    # round the sequence length up to a multiple of len_seq
    len_2_pad = int(len_seq * np.ceil(len(mfcc_scaled) / float(len_seq)))
    len_padded = len_2_pad - len(mfcc_scaled)  # number of zero frames to append

    # zero-pad the feature to the padded length
    mfcc_line_pad = np.zeros((len_2_pad, mfcc_scaled.shape[1]), dtype='float32')
    mfcc_line_pad[:mfcc_scaled.shape[0], :] = mfcc_scaled
    mfcc_line_pad = featureReshape(mfcc_line_pad, nlen=7)

    iter_time = len(mfcc_line_pad) // len_seq
    obs_i = np.array([])
    for ii in range(iter_time):

        # evaluate for each segment
        mfcc_line_tensor = mfcc_line_pad[ii * len_seq:(ii + 1) * len_seq]
        mfcc_line_tensor = np.expand_dims(mfcc_line_tensor, axis=0)
        mfcc_line_tensor = np.expand_dims(mfcc_line_tensor, axis=2)

        y_pred = model_keras_cnn_0.predict_on_batch(mfcc_line_tensor)

        # remove the padded samples
        if ii == iter_time - 1 and len_padded > 0:
            y_pred = y_pred[:, :len_seq - len_padded, :]

        if stateful and ii == iter_time - 1:
            model_keras_cnn_0.reset_states()

        # reduce the label dimension
        y_pred = y_pred.reshape((y_pred.shape[1],))

        obs_i = np.append(obs_i, y_pred)

    return obs_i, mfcc
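The padding step above rounds the feature length up to a multiple of len_seq so the sequence splits into equally sized segments for the network. A self-contained sketch of just that arithmetic, with toy sizes chosen for illustration:

import numpy as np

len_seq = 100                          # segment length in frames (toy value)
mfcc_scaled = np.random.rand(230, 80)  # toy feature matrix: 230 frames, 80 bands

# round the length up to the next multiple of len_seq
len_2_pad = int(len_seq * np.ceil(len(mfcc_scaled) / float(len_seq)))
len_padded = len_2_pad - len(mfcc_scaled)  # number of zero frames appended

mfcc_line_pad = np.zeros((len_2_pad, mfcc_scaled.shape[1]), dtype='float32')
mfcc_line_pad[:mfcc_scaled.shape[0], :] = mfcc_scaled

print(len_2_pad, len_padded)           # 300 70
print(len(mfcc_line_pad) // len_seq)   # 3 segments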
Code Example #3
def dump_feature_onset_helper(audio_path, annotation_path, fn, channel):

    audio_fn = join(audio_path, fn + '.flac')
    annotation_fn = join(annotation_path, fn + '.onsets')

    mfcc = getMFCCBands2DMadmom(audio_fn, fs, hopsize_t, channel)

    print('Feature collecting ...', fn)

    times_onset = annotationCvParser(annotation_fn)
    times_onset = [float(to) for to in times_onset]
    # syllable onset frames
    frames_onset = np.array(np.around(np.array(times_onset) / hopsize_t), dtype=int)

    # line start and end frames
    frame_start = 0
    frame_end = mfcc.shape[0] - 1

    return mfcc, frames_onset, frame_start, frame_end
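The onset annotations are given in seconds and quantized to frame indices by dividing by the hop size and rounding, as in the function above. A minimal sketch of that quantization, assuming a 10 ms hop (an illustrative value):

import numpy as np

hopsize_t = 0.01                      # assumed hop size in seconds
times_onset = [0.125, 0.5009, 1.234]  # toy onset times in seconds

frames_onset = np.array(np.around(np.array(times_onset) / hopsize_t), dtype=int)
print(frames_onset)                   # [ 12  50 123]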
Code Example #4
def batch_process_onset_detection(wav_path,
                                  textgrid_path,
                                  score_path,
                                  scaler,
                                  test_recordings,
                                  model_keras_cnn_0,
                                  cnnModel_name,
                                  detection_results_path,
                                  architecture,
                                  len_seq,
                                  lab=False,
                                  threshold=0.54,
                                  obs_cal='tocal',
                                  decoding_method='viterbi',
                                  stateful=True):
    """
    Experiment process: evaluate a whole jingju dataset using the CRNN model.
    :param wav_path: string, where we store the wav
    :param textgrid_path: string, where we store the textgrid
    :param score_path: string, where we store the score
    :param scaler: sklearn object, StandardScaler
    :param test_recordings: list of strings, testing recording filename
    :param model_keras_cnn_0: keras .h5, model weights
    :param cnnModel_name: string, model name
    :param detection_results_path: string, where we store the evaluation results
    :param architecture: string, model architecture name
    :param len_seq: int, input sequence length in frames
    :param lab: bool, True for the Riyaz dataset, not used in the paper
    :param threshold: float, threshold for peak picking onset selection
    :param obs_cal: string, tocal or toload; toload reuses saved ODFs to save running time
    :param decoding_method: string, viterbi or peakPicking
    :param stateful: bool, whether to use the stateful RNN
    :return: string, path where the decoding results are written
    """

    eval_results_decoding_path = \
        get_results_decoding_path(decoding_method=decoding_method,
                                  bool_corrected_score_duration=varin['corrected_score_duration'],
                                  eval_results_path=detection_results_path)

    for artist_path, rn in test_recordings:

        score_file = join(score_path, artist_path, rn + '.csv')

        if not isfile(score_file):
            print('Score not found: ' + score_file)
            continue

        nested_syllable_lists, wav_file, line_list, syllables, syllable_durations, bpm, pinyins = \
            data_parser(artist_path=artist_path,
                        wav_path=wav_path,
                        textgrid_path=textgrid_path,
                        rn=rn,
                        score_file=score_file,
                        lab=lab)

        if obs_cal == 'tocal':
            # load audio
            mfcc = getMFCCBands2DMadmom(wav_file, fs, hopsize_t, channel=1)
            mfcc_scaled = scaler.transform(mfcc)

        i_line = -1
        for i_obs, line in enumerate(line_list):
            # lines without lyrics are ignored
            if not lab and len(line[2]) == 0:
                continue

            i_line += 1

            # lines without a duration are ignored
            try:
                print(syllable_durations[i_line])
            except IndexError:
                continue

            # lines with non-fixed tempo are ignored
            if float(bpm[i_line]) == 0:
                continue

            time_line, lyrics_line, frame_start, frame_end = get_line_properties(
                lab=lab, line=line, hopsize_t=hopsize_t)

            obs_path = join('./obs', cnnModel_name, artist_path)
            obs_filename = rn + '_' + str(i_line + 1) + '.pkl'

            if obs_cal == 'tocal':

                obs_i, mfcc_line = odf_calculation_crnn(
                    mfcc=mfcc,
                    mfcc_scaled=mfcc_scaled,
                    model_keras_cnn_0=model_keras_cnn_0,
                    frame_start=frame_start,
                    frame_end=frame_end,
                    len_seq=len_seq,
                    stateful=stateful)

                # save onset curve
                print('saving onset curve ...')
                if not exists(obs_path):
                    makedirs(obs_path)
                with open(join(obs_path, obs_filename), 'wb') as f:
                    pickle.dump(obs_i, f)
            else:
                with open(join(obs_path, obs_filename), 'rb') as f:
                    obs_i = pickle.load(f)

            obs_i = np.squeeze(obs_i)
            obs_i = smooth_obs(obs_i)

            # organize score
            print('Calculating: ', rn, ' phrase', str(i_obs))
            print('ODF Methods: ', architecture)

            duration_score = syllable_durations[i_line]
            duration_score = np.array(
                [float(ds) for ds in duration_score if len(ds)])
            duration_score *= (time_line / np.sum(duration_score))

            i_boundary, label = boundary_decoding(
                decoding_method=decoding_method,
                obs_i=obs_i,
                duration_score=duration_score,
                varin=varin,
                threshold=threshold,
                hopsize_t=hopsize_t,
                viterbiDecoding=viterbiDecoding,
                OnsetPeakPickingProcessor=OnsetPeakPickingProcessor)

            time_boundary_start = np.array(i_boundary[:-1]) * hopsize_t
            time_boundary_end = np.array(i_boundary[1:]) * hopsize_t

            boundary_list = get_boundary_list(
                lab=lab,
                decoding_method=decoding_method,
                time_boundary_start=time_boundary_start,
                time_boundary_end=time_boundary_end,
                pinyins=pinyins,
                syllables=syllables,
                i_line=i_line)

            filename_syll_lab = join(eval_results_decoding_path, artist_path,
                                     rn + '_' + str(i_line + 1) + '.syll.lab')

            boundaryLabWriter(boundaryList=boundary_list,
                              outputFilename=filename_syll_lab,
                              label=label)

            if varin['plot'] and obs_cal == 'tocal':
                plot_jingju(nested_syllable_lists, i_line, mfcc_line,
                            hopsize_t, obs_i, i_boundary, duration_score)

    return eval_results_decoding_path
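The score durations parsed from the csv are relative values; the function rescales them so that they sum to the measured line duration time_line before decoding. A minimal sketch of that normalization with toy numbers:

import numpy as np

time_line = 4.0                             # toy line duration in seconds
duration_score = ['1.0', '2.0', '', '1.0']  # toy csv fields; empty entries are dropped

duration_score = np.array([float(ds) for ds in duration_score if len(ds)])
duration_score *= (time_line / np.sum(duration_score))
print(duration_score)                       # [1. 2. 1.]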
Code Example #5
def batch_process_onset_detection(wav_path,
                                  textgrid_path,
                                  score_path,
                                  scaler,
                                  test_recordings,
                                  model_keras_cnn_0,
                                  cnnModel_name,
                                  detection_results_path,
                                  architecture='baseline',
                                  lab=False,
                                  threshold=0.54,
                                  obs_cal='tocal',
                                  decoding_method='viterbi'):
    """
    :param wav_path: string, path where we have the audio files
    :param textgrid_path: string, path where we have the TextGrid ground truth
    :param score_path: string, path where we have the scores
    :param scaler: scaler object sklearn
    :param test_recordings: list of strings, test recording filenames
    :param model_keras_cnn_0: keras .h5, CNN onset detection model
    :param cnnModel_name: string, CNN model name
    :param detection_results_path: string, path where we save the evaluation results
    :param architecture: string, the model architecture
    :param lab: bool, used for the Riyaz dataset
    :param threshold: float, threshold used for peak picking
    :param obs_cal: string, tocal or toload; whether to calculate the ODF or load it from disk
    :param decoding_method: string, viterbi or peakPicking
    :return: string, path where the decoding results are written
    """

    eval_results_decoding_path = \
        get_results_decoding_path(decoding_method=decoding_method,
                                  bool_corrected_score_duration=varin['corrected_score_duration'],
                                  eval_results_path=detection_results_path)

    # loop through all recordings
    for artist_path, rn in test_recordings:

        score_file = join(score_path, artist_path, rn + '.csv')

        if not isfile(score_file):
            print('Score not found: ' + score_file)
            continue

        nested_syllable_lists, wav_file, line_list, syllables, syllable_durations, bpm, pinyins = \
            data_parser(artist_path=artist_path,
                        wav_path=wav_path,
                        textgrid_path=textgrid_path,
                        rn=rn,
                        score_file=score_file,
                        lab=lab)

        if obs_cal == 'tocal':
            # load audio
            mfcc = getMFCCBands2DMadmom(wav_file, fs, hopsize_t, channel=1)
            mfcc_scaled = scaler.transform(mfcc)
            mfcc_reshaped = featureReshape(mfcc_scaled, nlen=7)

        i_line = -1
        for i_obs, line in enumerate(line_list):
            # lines without lyrics are ignored
            if not lab and len(line[2]) == 0:
                continue

            i_line += 1

            # lines without a duration are ignored
            try:
                print(syllable_durations[i_line])
            except IndexError:
                continue

            # lines with non-fixed tempo are ignored
            if float(bpm[i_line]) == 0:
                continue

            time_line, lyrics_line, frame_start, frame_end = get_line_properties(
                lab=lab, line=line, hopsize_t=hopsize_t)

            # initialize ODF path and filename
            obs_path = join('./obs', cnnModel_name, artist_path)
            obs_filename = rn + '_' + str(i_line + 1) + '.pkl'

            if obs_cal == 'tocal':

                obs_i, mfcc_line = odf_calculation_no_crnn(
                    mfcc=mfcc,
                    mfcc_reshaped=mfcc_reshaped,
                    model_name=cnnModel_name,
                    model_keras_cnn_0=model_keras_cnn_0,
                    architecture=architecture,
                    frame_start=frame_start,
                    frame_end=frame_end)

                # save onset curve
                print('saving onset curve ...')
                if not exists(obs_path):
                    makedirs(obs_path)
                with open(join(obs_path, obs_filename), 'wb') as f:
                    pickle.dump(obs_i, f)
            else:
                with open(join(obs_path, obs_filename), 'rb') as f:
                    obs_i = pickle.load(f)

            obs_i = np.squeeze(obs_i)
            obs_i = smooth_obs(obs_i)

            # organize score
            print('Calculating: ', rn, ' phrase ', str(i_obs))
            print('ODF Methods: ', architecture)

            # process the score duration
            duration_score = syllable_durations[i_line]
            # keep only the non-empty duration fields
            duration_score = np.array(
                [float(ds) for ds in duration_score if len(ds)])
            # rescale the durations so they sum to the line duration
            duration_score *= (time_line / np.sum(duration_score))

            i_boundary, label = boundary_decoding(
                decoding_method=decoding_method,
                obs_i=obs_i,
                duration_score=duration_score,
                varin=varin,
                threshold=threshold,
                hopsize_t=hopsize_t,
                viterbiDecoding=viterbiDecoding,
                OnsetPeakPickingProcessor=OnsetPeakPickingProcessor)

            # create detected syllable result filename
            filename_syll_lab = join(eval_results_decoding_path, artist_path,
                                     rn + '_' + str(i_line + 1) + '.syll.lab')
            time_boundary_start = np.array(i_boundary[:-1]) * hopsize_t
            time_boundary_end = np.array(i_boundary[1:]) * hopsize_t

            boundary_list = get_boundary_list(
                lab=lab,
                decoding_method=decoding_method,
                time_boundary_start=time_boundary_start,
                time_boundary_end=time_boundary_end,
                pinyins=pinyins,
                syllables=syllables,
                i_line=i_line)

            boundaryLabWriter(boundaryList=boundary_list,
                              outputFilename=filename_syll_lab,
                              label=label)

            if varin['plot'] and obs_cal == 'tocal':
                plot_jingju(nested_syllable_lists=nested_syllable_lists,
                            i_line=i_line,
                            mfcc_line=mfcc_line,
                            hopsize_t=hopsize_t,
                            obs_i=obs_i,
                            i_boundary=i_boundary,
                            duration_score=duration_score)

    return eval_results_decoding_path
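Both variants convert the decoded boundary frame indices into (start, end) time pairs by pairing each boundary with its successor and multiplying by the hop size. A short self-contained sketch, again assuming a 10 ms hop:

import numpy as np

hopsize_t = 0.01                # assumed hop size in seconds
i_boundary = [0, 52, 118, 200]  # toy decoded boundary frames

time_boundary_start = np.array(i_boundary[:-1]) * hopsize_t
time_boundary_end = np.array(i_boundary[1:]) * hopsize_t

for s, e in zip(time_boundary_start, time_boundary_end):
    print('segment %.2f - %.2f s' % (s, e))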