Python featureReshapeの例、src.audio_preprocessing.featureReshape Pythonの例

コード例 #1

0

ファイルを表示

ファイル: tsne_plot.py プロジェクト: malgabri/phoneticSimilarity

def embedding_frame_tsne(filename_feature, filename_list_key, filename_scaler):
    """Compute frame-level embeddings and save them, with their labels, to disk.

    The pickled features are replicated via ``feature_replication``, reshaped
    with ``featureReshape(nlen=7)`` and passed through the saved Keras model
    ``wide_frame_level_emb_0.h5``. For each sample the model outputs are
    averaged over axis 0 to give one embedding vector. The labels and the
    embedding matrix are written with ``np.save`` into ``path_eval``
    (presumably for later t-SNE plotting -- inferred from the function name).

    :param filename_feature: path of the pickled feature list
    :param filename_list_key: path of the pickled key list
    :param filename_scaler: path of the pickled scaler
    :return: None
    """
    logger = logging.getLogger(__name__)

    # Context managers close the files deterministically instead of leaving
    # the handles to the garbage collector.
    # NOTE(review): pickle can execute arbitrary code on load; only feed this
    # function files from a trusted source.
    with open(filename_feature, 'rb') as f:
        list_feature = pickle.load(f)
    with open(filename_list_key, 'rb') as f:
        list_key = pickle.load(f)
    with open(filename_scaler, 'rb') as f:
        scaler = pickle.load(f)

    path_model = '/Users/gong/Documents/pycharmProjects/phoneticSimilarity/models/phoneme_embedding_frame_level'
    path_eval = '/Users/gong/Documents/pycharmProjects/phoneticSimilarity/eval/phoneme_embedding_frame_level'
    model_name = 'wide_frame_level_emb'

    # must equal the output width of the loaded model, otherwise the row
    # assignment into `embeddings` below fails
    embedding_dim = 29

    array_feature_replicated, _, labels = \
        feature_replication(list_feature=list_feature, list_key=list_key, scaler=scaler)

    # reshape every sample in place
    for ii, feature in enumerate(array_feature_replicated):
        array_feature_replicated[ii] = featureReshape(feature, nlen=7)

    np.save(file=os.path.join(path_eval, model_name + '_labels'), arr=labels)

    num_samples = len(array_feature_replicated)

    # only the first trained model (index 0) is used here
    for ii in range(1):
        filename_model = os.path.join(path_model, model_name + '_' + str(ii) + '.h5')
        model = load_model(filepath=filename_model)

        embeddings = np.zeros((num_samples, embedding_dim))
        for ii_emb, feature in enumerate(array_feature_replicated):
            logger.info('calculating..., %s, total, %s, round, %s', ii_emb, num_samples, ii)

            # insert a singleton axis at position 1 before feeding the model
            feature = np.expand_dims(feature, axis=1)
            y_pred = model.predict_on_batch(feature)
            # one embedding per sample: average the model outputs over axis 0
            embeddings[ii_emb, :] = np.mean(y_pred, axis=0)

        np.save(file=os.path.join(path_eval, model_name + '_embedding_' + str(ii)), arr=embeddings)

コード例 #2

0

ファイルを表示

ファイル: trainingSampleCollectionGOP.py プロジェクト: ronggong/phoneticSimilarity

def featureAggregator(dic_pho_feature_train):
    """
    Stack the per-key features of a dictionary into one array, build the
    matching label array, standardize the features and reshape them.

    Keys whose feature has length 0 are skipped. The label of a key is looked
    up in the module-level ``dic_pho_label`` and repeated once per feature row.

    :param dic_pho_feature_train: mapping from key to its feature array
    :return: (reshaped scaled features, int64 label array, fitted StandardScaler)
    """
    stacked = np.array([], dtype='float32')
    labels = []

    for pho in dic_pho_feature_train:
        pho_feature = dic_pho_feature_train[pho]
        # the label lookup happens for every key, even those skipped below
        pho_labels = [dic_pho_label[pho]] * len(pho_feature)

        if not len(pho_feature):
            continue

        if len(stacked):
            stacked = np.vstack((stacked, pho_feature))
        else:
            # first non-empty feature becomes the initial stack
            stacked = pho_feature
        labels += pho_labels

    label_all = np.array(labels, dtype='int64')

    # fit the scaler on all stacked frames, then apply it to the same data
    scaler = preprocessing.StandardScaler().fit(stacked)
    feature_all = featureReshape(scaler.transform(stacked), nlen=7)

    return feature_all, label_all, scaler

コード例 #3

0

ファイルを表示

ファイル: phonetic_assessment.py プロジェクト: ronggong/phoneticSimilarity

def measureEmbFrameLevelDissimilarity(model_keras_cnn_0, log_mel_phn_teacher,
                                      log_mel_phn_student):
    """Compare the teacher and student phone via their mean frame embeddings.

    Each log-mel input is reshaped with ``featureReshape(nlen=7)``, given a
    singleton axis at position 1, passed through ``model_keras_cnn_0`` and
    averaged over axis 0.

    :param model_keras_cnn_0: model exposing ``predict_on_batch``
    :param log_mel_phn_teacher: log-mel feature of the teacher phone
    :param log_mel_phn_student: log-mel feature of the student phone
    :return: ``1.0 - cosine(teacher_embedding, student_embedding)``
        (``cosine`` is presumably scipy's cosine distance -- TODO confirm)
    """

    def _mean_embedding(log_mel_phn):
        # reshape -> add singleton axis -> predict -> average over axis 0
        batch = np.expand_dims(featureReshape(log_mel_phn, nlen=7), axis=1)
        frame_embeddings = model_keras_cnn_0.predict_on_batch(batch)
        return np.mean(frame_embeddings, axis=0)

    teacher_embedding = _mean_embedding(log_mel_phn_teacher)
    student_embedding = _mean_embedding(log_mel_phn_student)

    return 1.0 - cosine(teacher_embedding, student_embedding)

コード例 #4

0

ファイルを表示

ファイル: eval_embedding.py プロジェクト: malgabri/phoneticSimilarity

def embedding_frame_ap(filename_feature, filename_list_key, filename_scaler):
    """Evaluate frame-level embeddings of five saved models.

    For each of the models ``wide_frame_level_emb_<0..4>.h5`` every replicated
    feature is embedded (model outputs averaged over axis 0). A pairwise
    matrix ``(2 - cosine_distance) / 2`` is built, saved with ``np.save`` as
    ``dist_mat_<i>``, and scored against the ground-truth matrix with
    ``eval_embeddings``. The mean and standard deviation of the five scores
    are written as one CSV row to ``<path_eval>/wide_frame_level_emb.csv``.

    :param filename_feature: path of the pickled feature list
    :param filename_list_key: path of the pickled key list
    :param filename_scaler: path of the pickled scaler
    :return: None
    """
    logger = logging.getLogger(__name__)

    # Context managers close the files deterministically instead of leaving
    # the handles to the garbage collector.
    # NOTE(review): pickle can execute arbitrary code on load; only feed this
    # function files from a trusted source.
    with open(filename_feature, 'rb') as f:
        list_feature = pickle.load(f)
    with open(filename_list_key, 'rb') as f:
        list_key = pickle.load(f)
    with open(filename_scaler, 'rb') as f:
        scaler = pickle.load(f)

    list_ap = []
    # must equal the output width of the loaded models
    embedding_dim = 29

    array_feature_replicated, _, labels = \
        feature_replication(list_feature=list_feature, list_key=list_key, scaler=scaler)

    # reshape every sample in place
    for ii, feature in enumerate(array_feature_replicated):
        array_feature_replicated[ii] = featureReshape(feature, nlen=7)

    path_model = '/Users/gong/Documents/pycharmProjects/phoneticSimilarity/models/phoneme_embedding_frame_level'
    path_eval = '/Users/gong/Documents/pycharmProjects/phoneticSimilarity/eval/phoneme_embedding_frame_level'
    model_name = 'wide_frame_level_emb'

    num_samples = len(array_feature_replicated)

    for ii in range(5):
        filename_model = os.path.join(path_model,
                                      model_name + '_' + str(ii) + '.h5')
        model = load_model(filepath=filename_model)

        embeddings = np.zeros((num_samples, embedding_dim))
        for ii_emb, feature in enumerate(array_feature_replicated):
            logger.info('calculating..., %s, total, %s, round, %s', ii_emb,
                        num_samples, ii)

            # insert a singleton axis at position 1 before feeding the model
            feature = np.expand_dims(feature, axis=1)
            y_pred = model.predict_on_batch(feature)
            # one embedding per sample: average the model outputs over axis 0
            embeddings[ii_emb, :] = np.mean(y_pred, axis=0)

        # cosine distance lies in [0, 2]; this maps it to a [0, 1] score where
        # larger means more similar
        dist_mat = (2.0 - squareform(pdist(embeddings, 'cosine'))) / 2.0
        gt_mat = ground_truth_matrix(labels)

        np.save(file=os.path.join(path_eval, 'dist_mat_' + str(ii)),
                arr=dist_mat)

        ap = eval_embeddings(dist_mat=dist_mat, gt_mat=gt_mat)

        list_ap.append(ap)

    filename_eval = os.path.join(path_eval, model_name + '.csv')
    with open(filename_eval, 'w') as csvfile:
        csvwriter = csv.writer(
            csvfile,
            delimiter=',',
        )
        csvwriter.writerow([np.mean(list_ap), np.std(list_ap)])

コード例 #5

0

ファイルを表示

ファイル: baseline1_oracle_GOP.py プロジェクト: ronggong/phoneticSimilarity

def runProcess(val_test, plot):
    """Compute oracle GOP scores per line for the validation or test recordings.

    For every (artist, recording) pair the student audio is converted to
    scaled, reshaped log-mel features. Each student line is aligned to the
    teacher's phone sequence, and a GOP value is computed for every paired
    phone from the model observations. Per line, the mean GOP over all
    phones ("total"), over the phones whose index is in ``idx_syl_heads``
    ("head") and over the remaining phones ("belly") is stored, together with
    the raw per-phone GOP arrays.

    Outputs:
      * ``./data/rating_GOP_oracle_{total,head,belly}.json`` -- only when
        ``val_test == 'test'``
      * ``./data/training_features/GOP_oracle_<val_test>_{total,head,belly}.pkl``

    :param val_test: 'val' selects the validation recordings; any other value
        selects the test recordings
    :param plot: if truthy, plot the total/head/belly GOP of each line with
        ``disLinePlot``
    :return: None
    """
    model_keras_cnn_0 = load_model(kerasModels_path)

    # open a pickle from python 2 in python 3, requires to add encoding;
    # the context manager closes the file handle deterministically
    with open(kerasScaler_path, 'rb') as scaler_file:
        scaler = pickle.load(scaler_file, encoding='latin1')

    # the test dataset filenames
    primarySchool_val_recordings, primarySchool_test_recordings = getTestRecordingsJoint()

    if val_test == 'val':
        recordings = primarySchool_val_recordings
    else:
        recordings = primarySchool_test_recordings

    # mean GOP per line, keyed by artist then line key
    dict_total = {}
    dict_head = {}
    dict_belly = {}

    # per-phone GOP arrays per line, keyed by artist then line key
    dict_feature_phns_total = {}
    dict_feature_phns_head = {}
    dict_feature_phns_belly = {}

    for artist, fn in recordings:

        # teacher's textgrid file
        teacher_textgrid_file = os.path.join(primarySchool_textgrid_path, artist, 'teacher.TextGrid')
        # textgrid path, to get the line onset offset
        student_textgrid_file = os.path.join(primarySchool_textgrid_path, artist, fn + '.TextGrid')

        # parse the textgrid to phoneme list
        teacherSyllableLists, teacherPhonemeLists = textgridSyllablePhonemeParser(teacher_textgrid_file,
                                                                                  'dianSilence',
                                                                                  'details')
        studentSyllableLists, studentPhonemeLists = textgridSyllablePhonemeParser(student_textgrid_file,
                                                                                  'dianSilence',
                                                                                  'details')

        student_wav_file = os.path.join(primarySchool_wav_path, artist, fn + '.wav')

        # calculate log mel
        log_mel = getMFCCBands2DMadmom(student_wav_file, fs, hopsize_t, channel=1)
        log_mel_scaled = scaler.transform(log_mel)
        log_mel_reshaped = featureReshape(log_mel_scaled, nlen=7)

        if artist not in dict_total:
            dict_total[artist] = {}
            dict_head[artist] = {}
            dict_belly[artist] = {}

            dict_feature_phns_total[artist] = {}
            dict_feature_phns_head[artist] = {}
            dict_feature_phns_belly[artist] = {}

        for ii_line in range(len(studentPhonemeLists)):  # iterate each line

            # find the right line index for the teacher's textgrid,
            # ``student02_first_half'' only corresponds to a part of the teacher's textgrid,
            # we need to shift the index of the teacher's textgrid to find the right line
            ii_aug = findShiftOffset(gtSyllableLists=studentSyllableLists,
                                     scoreSyllableLists=teacherSyllableLists,
                                     ii_line=ii_line)

            list_phn_teacher, list_phn_student, list_syl_teacher, list_syl_onsets_time_teacher = \
                getListsSylPhn(teacherSyllableLists=teacherSyllableLists,
                               teacherPhonemeLists=teacherPhonemeLists,
                               studentPhonemeLists=studentPhonemeLists,
                               ii_line=ii_line,
                               ii_aug=ii_aug)

            # element 2 of each phone entry is its label
            phns_teacher = [lpt[2] for lpt in list_phn_teacher]
            phns_student = [lpt[2] for lpt in list_phn_student]

            insertion_indices_student, deletion_indices_teacher, teacher_student_indices_pair, dict_student_idx_2_teacher_phn = \
                phnSequenceAlignment(phns_teacher=phns_teacher, phns_student=phns_student)

            list_phn_teacher_pair, list_phn_student_pair, idx_syl_heads, phn_tails_missing, num_tails_missing = \
                getIdxHeadsMissingTails(teacher_student_indices_pair=teacher_student_indices_pair,
                                        list_phn_teacher=list_phn_teacher,
                                        list_phn_student=list_phn_student,
                                        list_syl_onsets_time_teacher=list_syl_onsets_time_teacher,
                                        deletion_indices_teacher=deletion_indices_teacher,
                                        phns_tails=phns_tails)

            print('these phone indices are inserted in student phone list', insertion_indices_student)
            print('these phone indices are deleted in teacher phone list', deletion_indices_teacher)
            print('these phone tails are deleted in teacher phone list', phn_tails_missing)

            obs_line = getObsLine(studentPhonemeLists=studentPhonemeLists,
                                  ii_line=ii_line,
                                  hopsize_t=hopsize_t,
                                  log_mel_reshaped=log_mel_reshaped,
                                  model_keras_cnn_0=model_keras_cnn_0)

            # entries are [phone index, GOP value, teacher phone label]
            GOP_line = []
            for ii_phn in range(len(list_phn_student_pair)):

                # frame boundaries relative to the start time of the first
                # paired student phone of the line
                phn_start_frame = int(round((list_phn_student_pair[ii_phn][0] - list_phn_student_pair[0][0]) / hopsize_t))
                phn_end_frame = int(round((list_phn_student_pair[ii_phn][1] - list_phn_student_pair[0][0]) / hopsize_t))

                phn_label = list_phn_teacher_pair[ii_phn][2]

                # the case of the phn length is 0: mark with -inf so it is
                # filtered out of the statistics below
                if phn_end_frame == phn_start_frame:
                    GOP_line.append([ii_phn, -np.inf, phn_label])
                    continue

                obs_line_phn = obs_line[phn_start_frame:phn_end_frame]

                # calculate GOP
                GOP_phn = GOP_phn_level(phn_label=phn_label, obs_line_phn=obs_line_phn)
                GOP_line.append([ii_phn, GOP_phn, phn_label])

            # drop the infinite placeholders once, then split into
            # syllable heads and the rest ("belly")
            gop_finite = [gop for gop in GOP_line if not np.isinf(gop[1])]
            gop_finite_head = [gop for gop in gop_finite if gop[0] in idx_syl_heads]
            gop_finite_belly = [gop for gop in gop_finite if gop[0] not in idx_syl_heads]

            gop_total = [gop[1] for gop in gop_finite]
            gop_head = [gop[1] for gop in gop_finite_head]
            gop_belly = [gop[1] for gop in gop_finite_belly]

            if plot:
                disLinePlot(gop_total, [gop[2] for gop in gop_finite])
                disLinePlot(gop_head, [gop[2] for gop in gop_finite_head])
                disLinePlot(gop_belly, [gop[2] for gop in gop_finite_belly])

            # NOTE: np.mean of an empty list yields nan (with a RuntimeWarning)
            total_distortion = np.mean(gop_total)
            head_distortion = np.mean(gop_head)
            belly_distortion = np.mean(gop_belly)

            # the line index is shifted by ii_aug (the offset into the
            # teacher's lines found above)
            line_key = fn + '_' + str(ii_line+ii_aug)

            dict_total[artist][line_key] = total_distortion
            dict_head[artist][line_key] = head_distortion
            dict_belly[artist][line_key] = belly_distortion

            dict_feature_phns_total[artist][line_key] = {'distortion_phns': np.array(gop_total), 'num_tails_missing': num_tails_missing}
            dict_feature_phns_head[artist][line_key] = {'distortion_phns': np.array(gop_head), 'num_tails_missing': num_tails_missing}
            dict_feature_phns_belly[artist][line_key] = {'distortion_phns': np.array(gop_belly), 'num_tails_missing': num_tails_missing}

    if val_test == 'test':
        for part, dict_rating in (('total', dict_total),
                                  ('head', dict_head),
                                  ('belly', dict_belly)):
            with open('./data/rating_GOP_oracle_' + part + '.json', 'w') as savefile:
                json.dump(dict_rating, savefile)

    for part, dict_feature in (('total', dict_feature_phns_total),
                               ('head', dict_feature_phns_head),
                               ('belly', dict_feature_phns_belly)):
        with open('./data/training_features/GOP_oracle_'+val_test+'_' + part + '.pkl', 'wb') as savefile:
            pickle.dump(dict_feature, savefile)
コード例 #6

0

ファイルを表示

ファイル: eval_grad_cam.py プロジェクト: ronggong/phoneticSimilarity

def embedding_frame_ap(filename_feature_teacher,
                       filename_list_key_teacher,
                       filename_feature_student,
                       filename_list_key_student,
                       filename_scaler,
                       embedding_dim,
                       val_test):
    """Show Grad-CAM heatmaps of the frame-level embedding models.

    Teacher and student features are replicated, concatenated and reshaped
    with ``featureReshape(nlen=7)``. For each of five saved models and each
    sample, ``grad_cam`` is evaluated frame by frame on layer ``conv2d_2``.
    Column 7 of every resized heatmap is collected into an ``(80, n_frames)``
    image, which is post-processed and displayed with matplotlib
    (``plt.show()`` is called once per sample).

    Despite its name, this function computes no average precision and
    returns nothing.

    :param filename_feature_teacher: path of the pickled teacher feature list
    :param filename_list_key_teacher: path of the pickled teacher key list
    :param filename_feature_student: path of the pickled student feature list
    :param filename_list_key_student: path of the pickled student key list
    :param filename_scaler: path of the pickled scaler
    :param embedding_dim: 2 selects the ``..._2_class`` model name, any other
        value the plain teacher-student model name
    :param val_test: not used by this function
    :return: None
    """
    logger = logging.getLogger(__name__)

    # Context managers close the files deterministically instead of leaving
    # the handles to the garbage collector.
    # NOTE(review): pickle can execute arbitrary code on load; only feed this
    # function files from a trusted source.
    with open(filename_feature_teacher, 'rb') as f:
        list_feature_teacher = pickle.load(f)
    with open(filename_list_key_teacher, 'rb') as f:
        list_key_teacher = pickle.load(f)
    with open(filename_feature_student, 'rb') as f:
        list_feature_student = pickle.load(f)
    with open(filename_list_key_student, 'rb') as f:
        list_key_student = pickle.load(f)
    with open(filename_scaler, 'rb') as f:
        scaler = pickle.load(f)

    # only the replicated features are needed here; the label outputs of the
    # replication helper are discarded
    array_feature_replicated_teacher, _, _ = \
        feature_replication_teacher_student(list_feature=list_feature_teacher,
                                            list_key=list_key_teacher,
                                            scaler=scaler,
                                            data_str='_teacher')

    array_feature_replicated_student, _, _ = \
        feature_replication_teacher_student(list_feature=list_feature_student,
                                            list_key=list_key_student,
                                            scaler=scaler,
                                            data_str='_student')

    array_feature_replicated = array_feature_replicated_teacher + array_feature_replicated_student

    # reshape every sample in place
    for ii, feature in enumerate(array_feature_replicated):
        array_feature_replicated[ii] = featureReshape(feature, nlen=7)

    path_model = '/home/gong/Documents/pycharmProjects/phoneticSimilarity/models/phoneme_embedding_frame_level'

    model_name = 'wide_frame_level_emb_teacher_student_2_class' if embedding_dim == 2 \
        else 'wide_frame_level_emb_teacher_student'

    num_samples = len(array_feature_replicated)

    for ii in range(5):
        filename_model = os.path.join(path_model, model_name + '_' + str(ii) + '.h5')
        model = load_model(filepath=filename_model)

        # summary() prints the table itself and returns None, so wrapping it
        # in print() only added a stray "None" line
        model.summary()

        for ii_emb, feature in enumerate(array_feature_replicated):
            logger.info('calculating..., %s, total, %s, round, %s', ii_emb, num_samples, ii)

            # insert a singleton axis at position 1 before feeding the model
            feature = np.expand_dims(feature, axis=1)

            print(feature.shape)

            feature_len = feature.shape[0]

            # one heatmap column per frame; frames whose heatmap contains NaN
            # keep their zero column
            heatmap_concatenate = np.zeros((80, feature_len))

            # a dedicated index name: reusing `ii` here would overwrite the
            # model-round index logged above
            for ii_frame in range(feature_len):
                heatmap = grad_cam(model, feature[ii_frame:ii_frame+1], layer_name='conv2d_2')

                if not np.isnan(np.sum(heatmap)):
                    # cv2.resize takes (width, height): the result has 80
                    # rows and 15 columns, and column 7 is its middle column
                    heatmap = cv2.resize(heatmap, (15, 80))

                    heatmap_concatenate[:, ii_frame] = heatmap[:, 7]

            print(heatmap_concatenate)

            heatmap_concatenate = heatmap_postprocessing(heatmap=heatmap_concatenate)

            plt.imshow(heatmap_concatenate)

            plt.show()

コード例 #7

0

ファイルを表示

ファイル: eval_embedding_teacher_student_pairs.py プロジェクト: ronggong/phoneticSimilarity

def embedding_frame_ap(filename_feature_teacher, filename_list_key_teacher,
                       filename_feature_student, filename_list_key_student,
                       filename_scaler, embedding_dim, val_test):
    """Average precision of frame-level embeddings on teacher/student pairs.

    Teacher and student features are replicated, concatenated and reshaped
    with ``featureReshape(nlen=7)``. For each of five saved models every
    sample is embedded (model outputs averaged over axis 0). For each class
    ``c`` in ``0..26`` the samples labelled ``c`` or ``c + 27`` are compared
    pairwise with the score ``(2 - cosine_distance) / 2``. The upper-triangle
    scores and ground-truth entries of all classes are concatenated and
    scored with ``average_precision_score``. The mean and standard deviation
    over the five models are written as one CSV row.

    :param filename_feature_teacher: path of the pickled teacher feature list
    :param filename_list_key_teacher: path of the pickled teacher key list
    :param filename_feature_student: path of the pickled student feature list
    :param filename_list_key_student: path of the pickled student key list
    :param filename_scaler: path of the pickled scaler
    :param embedding_dim: output width of the models; 2 selects the
        ``..._2_class`` model name
    :param val_test: 'val' writes ``<model_name>_pairs.csv``, any other value
        ``<model_name>_extra_pairs.csv``
    :return: None
    """
    logger = logging.getLogger(__name__)

    # Context managers close the files deterministically instead of leaving
    # the handles to the garbage collector.
    # NOTE(review): pickle can execute arbitrary code on load; only feed this
    # function files from a trusted source.
    with open(filename_feature_teacher, 'rb') as f:
        list_feature_teacher = pickle.load(f)
    with open(filename_list_key_teacher, 'rb') as f:
        list_key_teacher = pickle.load(f)
    with open(filename_feature_student, 'rb') as f:
        list_feature_student = pickle.load(f)
    with open(filename_list_key_student, 'rb') as f:
        list_key_student = pickle.load(f)
    with open(filename_scaler, 'rb') as f:
        scaler = pickle.load(f)

    list_ap = []

    array_feature_replicated_teacher, _, labels_teacher = \
        feature_replication_teacher_student(list_feature=list_feature_teacher,
                                            list_key=list_key_teacher,
                                            scaler=scaler,
                                            data_str='_teacher')

    array_feature_replicated_student, _, labels_student = \
        feature_replication_teacher_student(list_feature=list_feature_student,
                                            list_key=list_key_student,
                                            scaler=scaler,
                                            data_str='_student')

    array_feature_replicated = array_feature_replicated_teacher + array_feature_replicated_student

    labels = np.array(labels_teacher + labels_student)

    # reshape every sample in place
    for ii, feature in enumerate(array_feature_replicated):
        array_feature_replicated[ii] = featureReshape(feature, nlen=7)

    path_model = '/home/gong/Documents/pycharmProjects/phoneticSimilarity/models/phoneme_embedding_frame_level'
    path_eval = '/home/gong/Documents/pycharmProjects/phoneticSimilarity/eval/phoneme_embedding_frame_level'

    model_name = 'wide_frame_level_emb_teacher_student_2_class' if embedding_dim == 2 \
        else 'wide_frame_level_emb_teacher_student'

    # label c and label c + num_classes are evaluated together; presumably
    # these are the teacher and student variants of one phone class
    # -- TODO confirm against feature_replication_teacher_student
    num_classes = 27

    num_samples = len(array_feature_replicated)

    for ii in range(5):
        filename_model = os.path.join(path_model,
                                      model_name + '_' + str(ii) + '.h5')
        model = load_model(filepath=filename_model)

        embeddings = np.zeros((num_samples, embedding_dim))
        for ii_emb, feature in enumerate(array_feature_replicated):
            logger.info('calculating..., %s, total, %s, round, %s', ii_emb,
                        num_samples, ii)

            # insert a singleton axis at position 1 before feeding the model
            feature = np.expand_dims(feature, axis=1)
            y_pred = model.predict_on_batch(feature)
            # one embedding per sample: average the model outputs over axis 0
            embeddings[ii_emb, :] = np.mean(y_pred, axis=0)

        list_dist = []
        list_gt = []
        for ii_class in range(num_classes):
            idx_ii_class = \
                np.where(np.logical_or(labels == ii_class,
                                       labels == ii_class + num_classes))[0]
            # cosine distance lies in [0, 2]; this maps it to a [0, 1] score
            # where larger means more similar
            dist_mat = (2.0 - squareform(
                pdist(embeddings[idx_ii_class, :], 'cosine'))) / 2.0
            labels_ii_class = [labels[idx] for idx in idx_ii_class]
            gt_mat = ground_truth_matrix(labels_ii_class)

            sample_num = dist_mat.shape[0]
            # strict upper triangle: each unordered pair once, no diagonal
            iu1 = np.triu_indices(sample_num, 1)

            list_dist.append(dist_mat[iu1])
            list_gt.append(gt_mat[iu1])

        list_dist = np.concatenate(list_dist)
        list_gt = np.concatenate(list_gt)

        ap = average_precision_score(y_true=np.abs(list_gt),
                                     y_score=np.abs(list_dist),
                                     average='weighted')

        list_ap.append(ap)

    post_fix = '_pairs' if val_test == 'val' else '_extra_pairs'

    filename_eval = os.path.join(path_eval, model_name + post_fix + '.csv')

    with open(filename_eval, 'w') as csvfile:
        csvwriter = csv.writer(
            csvfile,
            delimiter=',',
        )
        csvwriter.writerow([np.mean(list_ap), np.std(list_ap)])

コード例 #8

0

ファイルを表示

        # textgrid path, to get the line onset offset
        groundtruth_textgrid_file = os.path.join(primarySchool_textgrid_path,
                                                 artist, fn + '.TextGrid')

        # parse the TextGrid
        list_line = textGrid2WordList(groundtruth_textgrid_file,
                                      whichTier='line')

        wav_file = os.path.join(primarySchool_wav_path, artist, fn + '.wav')

        vad_results = VAD(wav_file)

        # calculate log mel
        log_mel = getMFCCBands2DMadmom(wav_file, fs, hopsize_t, channel=1)
        log_mel_scaled = scaler.transform(log_mel)
        log_mel_reshaped = featureReshape(log_mel_scaled, nlen=7)

        ii_line = 0
        for line in list_line:  # iterate each line
            if len(line[2].strip()):

                # start and end time
                time_start = line[0]
                time_end = line[1]
                frame_start = int(round(time_start / hopsize_t))
                frame_end = int(round(time_end / hopsize_t))
                frame_end = frame_end if frame_end <= len(
                    vad_results) else len(vad_results)

                # log_mel_reshape line
                log_mel_reshaped_line = log_mel_reshaped[frame_start:frame_end]