def embedding_frame_tsne(filename_feature, filename_list_key, filename_scaler):
    """
    compute averaged frame-level embeddings and save them with their labels

    The pickled feature list, key list and scaler are loaded, passed through
    ``feature_replication`` and ``featureReshape``, and every sample is fed to
    the saved model ``wide_frame_level_emb_0.h5``.  The model outputs of one
    sample are averaged over axis 0 into a single vector of length 29.
    The labels and the embedding matrix are written with ``np.save`` into the
    evaluation directory.  No t-SNE is computed here; judging from the
    function name the saved arrays are presumably consumed by a later
    t-SNE step.

    :param filename_feature: path of the pickled feature list
    :param filename_list_key: path of the pickled key list
    :param filename_scaler: path of the pickled scaler
    :return: None, the results are only written to disk
    """
    logger = logging.getLogger(__name__)

    # NOTE: unpickling can execute arbitrary code, only load trusted files.
    # The context managers make sure the file handles are closed again.
    with open(filename_feature, 'rb') as file_feature:
        list_feature = pickle.load(file_feature)
    with open(filename_list_key, 'rb') as file_list_key:
        list_key = pickle.load(file_list_key)
    with open(filename_scaler, 'rb') as file_scaler:
        scaler = pickle.load(file_scaler)

    path_model = '/Users/gong/Documents/pycharmProjects/phoneticSimilarity/models/phoneme_embedding_frame_level'
    path_eval = '/Users/gong/Documents/pycharmProjects/phoneticSimilarity/eval/phoneme_embedding_frame_level'
    model_name = 'wide_frame_level_emb'
    embedding_dim = 29

    array_feature_replicated, array_labels, labels = \
        feature_replication(list_feature=list_feature, list_key=list_key, scaler=scaler)

    # reshape every sample in place before it is given to the model
    for ii, feature in enumerate(array_feature_replicated):
        array_feature_replicated[ii] = featureReshape(feature, nlen=7)

    np.save(file=os.path.join(path_eval, model_name + '_labels'), arr=labels)

    num_sample = len(array_feature_replicated)

    # only the first trained model (index 0) is used
    for ii in range(1):
        filename_model = os.path.join(path_model, model_name + '_' + str(ii) + '.h5')
        model = load_model(filepath=filename_model)

        embeddings = np.zeros((num_sample, embedding_dim))
        for ii_emb, feature in enumerate(array_feature_replicated):
            logger.info('calculating..., %s, total, %s, round, %s', ii_emb, num_sample, ii)
            feature = np.expand_dims(feature, axis=1)
            y_pred = model.predict_on_batch(feature)
            # one embedding per sample: the mean of the model outputs
            embeddings[ii_emb, :] = np.mean(y_pred, axis=0)

        np.save(file=os.path.join(path_eval, model_name + '_embedding_' + str(ii)), arr=embeddings)
def featureAggregator(dic_pho_feature_train):
    """
    aggregate feature dictionary into numpy feature, label lists,
    standardize and reshape the feature

    Every non-empty entry of the dictionary contributes its rows to the
    feature matrix and ``len(feature)`` copies of ``dic_pho_label[key]`` to
    the label array.  A ``StandardScaler`` is fitted on the stacked matrix,
    the matrix is transformed with it and finally passed through
    ``featureReshape(..., nlen=7)``.

    :param dic_pho_feature_train: mapping from a phoneme key to its features,
        presumably a 2-d array of frames by feature bins (TODO confirm)
    :return: reshaped feature array, int64 label array, fitted scaler
    """
    feature_chunks = []
    label_all = []
    for key, feature in dic_pho_feature_train.items():
        # the label is looked up even for an empty entry, so that an unknown
        # key is still reported as a KeyError
        label = dic_pho_label[key]
        num_row = len(feature)
        if not num_row:
            continue
        feature_chunks.append(feature)
        label_all += [label] * num_row

    # Stack once after the loop. Calling np.vstack inside the loop copies the
    # whole accumulated matrix at each iteration, which is quadratic.
    if not feature_chunks:
        feature_all = np.array([], dtype='float32')
    elif len(feature_chunks) == 1:
        # a single entry is used as it is, without stacking
        feature_all = feature_chunks[0]
    else:
        feature_all = np.vstack(feature_chunks)

    label_all = np.array(label_all, dtype='int64')

    scaler = preprocessing.StandardScaler().fit(feature_all)
    feature_all = scaler.transform(feature_all)
    feature_all = featureReshape(feature_all, nlen=7)

    return feature_all, label_all, scaler
def measureEmbFrameLevelDissimilarity(model_keras_cnn_0, log_mel_phn_teacher, log_mel_phn_student):
    """
    compare the averaged frame level embeddings of a teacher and a student phoneme

    Both inputs go through the same steps: ``featureReshape(..., nlen=7)``,
    a new axis inserted at position 1, ``predict_on_batch`` of the model and
    a mean over axis 0.  The returned value is ``1.0 - cosine(teacher, student)``.

    NOTE(review): if ``cosine`` is scipy's cosine distance, the returned value
    is a cosine similarity (larger means more alike) in spite of the function
    name -- confirm against the callers.

    :param model_keras_cnn_0: model object offering ``predict_on_batch``
    :param log_mel_phn_teacher: feature of the teacher phoneme
    :param log_mel_phn_student: feature of the student phoneme
    :return: 1.0 minus the cosine measure of the two averaged embeddings
    """
    def _mean_embedding(log_mel_phn):
        # reshape, add the axis expected by the model, average its outputs
        model_input = np.expand_dims(featureReshape(log_mel_phn, nlen=7), axis=1)
        frame_embeddings = model_keras_cnn_0.predict_on_batch(model_input)
        return np.mean(frame_embeddings, axis=0)

    mean_emb_teacher = _mean_embedding(log_mel_phn_teacher)
    mean_emb_student = _mean_embedding(log_mel_phn_student)

    return 1.0 - cosine(mean_emb_teacher, mean_emb_student)
def embedding_frame_ap(filename_feature, filename_list_key, filename_scaler):
    """
    frame level embedding average precision

    The pickled feature list, key list and scaler are loaded, passed through
    ``feature_replication`` and ``featureReshape``.  For each of the 5 saved
    models ``wide_frame_level_emb_<ii>.h5`` every sample is embedded as the
    mean of the model outputs, the pairwise matrix
    ``(2 - cosine pdist) / 2`` is saved as ``dist_mat_<ii>`` and scored
    against the ground truth matrix with ``eval_embeddings``.
    The mean and the standard deviation of the 5 average precisions are
    written as one row into ``wide_frame_level_emb.csv``.

    :param filename_feature: path of the pickled feature list
    :param filename_list_key: path of the pickled key list
    :param filename_scaler: path of the pickled scaler
    :return: None, the results are only written to disk
    """
    logger = logging.getLogger(__name__)

    # NOTE: unpickling can execute arbitrary code, only load trusted files.
    # The context managers make sure the file handles are closed again.
    with open(filename_feature, 'rb') as file_feature:
        list_feature = pickle.load(file_feature)
    with open(filename_list_key, 'rb') as file_list_key:
        list_key = pickle.load(file_list_key)
    with open(filename_scaler, 'rb') as file_scaler:
        scaler = pickle.load(file_scaler)

    list_ap = []
    embedding_dim = 29

    array_feature_replicated, array_labels, labels = \
        feature_replication(list_feature=list_feature, list_key=list_key, scaler=scaler)

    # reshape every sample in place before it is given to the model
    for ii, feature in enumerate(array_feature_replicated):
        array_feature_replicated[ii] = featureReshape(feature, nlen=7)

    path_model = '/Users/gong/Documents/pycharmProjects/phoneticSimilarity/models/phoneme_embedding_frame_level'
    path_eval = '/Users/gong/Documents/pycharmProjects/phoneticSimilarity/eval/phoneme_embedding_frame_level'
    model_name = 'wide_frame_level_emb'

    num_sample = len(array_feature_replicated)

    for ii in range(5):
        filename_model = os.path.join(path_model, model_name + '_' + str(ii) + '.h5')
        model = load_model(filepath=filename_model)

        embeddings = np.zeros((num_sample, embedding_dim))
        for ii_emb, feature in enumerate(array_feature_replicated):
            logger.info('calculating..., %s, total, %s, round, %s', ii_emb, num_sample, ii)
            feature = np.expand_dims(feature, axis=1)
            y_pred = model.predict_on_batch(feature)
            # one embedding per sample: the mean of the model outputs
            embeddings[ii_emb, :] = np.mean(y_pred, axis=0)

        # turn the pairwise cosine measure d into the score (2 - d) / 2
        dist_mat = (2.0 - squareform(pdist(embeddings, 'cosine'))) / 2.0
        gt_mat = ground_truth_matrix(labels)

        np.save(file=os.path.join(path_eval, 'dist_mat_' + str(ii)), arr=dist_mat)

        ap = eval_embeddings(dist_mat=dist_mat, gt_mat=gt_mat)
        list_ap.append(ap)

    # one row: mean and standard deviation over the model rounds
    filename_eval = os.path.join(path_eval, model_name + '.csv')
    with open(filename_eval, 'w') as csvfile:
        csvwriter = csv.writer(csvfile, delimiter=',', )
        csvwriter.writerow([np.mean(list_ap), np.std(list_ap)])
def runProcess(val_test, plot):
    """
    calculate the oracle GOP of each line of the val or test recordings

    For every (artist, recording) pair the teacher and student TextGrids are
    parsed, the log mel feature of the student wav is scaled and reshaped,
    and for every student line the teacher and student phoneme sequences are
    aligned.  Each aligned phoneme gets the value of ``GOP_phn_level`` on its
    slice of the line observations; a phoneme whose start and end frame
    coincide gets ``-np.inf`` and is left out of the statistics.
    The mean GOP of all phonemes (total), of the phonemes whose index is in
    ``idx_syl_heads`` (head) and of the remaining ones (belly) are stored per
    artist under the key ``<fn>_<ii_line + ii_aug>``.

    Output files: the three mean dictionaries are dumped as json only when
    ``val_test == 'test'``; the three per-phoneme feature dictionaries are
    pickled to file names which contain ``val_test``.

    :param val_test: 'val' selects the validation recordings, any other value
        the test recordings
    :param plot: if true, ``disLinePlot`` is called for the total, head and
        belly GOP of each line
    :return: None, the results are only written to disk
    """
    model_keras_cnn_0 = load_model(kerasModels_path)

    # open a pickle from python 2 in python 3, requires to add encoding
    # NOTE: unpickling can execute arbitrary code, only load trusted files.
    with open(kerasScaler_path, 'rb') as scaler_file:
        scaler = pickle.load(scaler_file, encoding='latin1')

    # the test dataset filenames
    primarySchool_val_recordings, primarySchool_test_recordings = getTestRecordingsJoint()

    recordings = primarySchool_val_recordings if val_test == 'val' else primarySchool_test_recordings

    dict_total = {}
    dict_head = {}
    dict_belly = {}

    dict_feature_phns_total = {}
    dict_feature_phns_head = {}
    dict_feature_phns_belly = {}

    for artist, fn in recordings:

        # teacher's textgrid file
        teacher_textgrid_file = os.path.join(primarySchool_textgrid_path, artist, 'teacher.TextGrid')
        # textgrid path, to get the line onset offset
        student_textgrid_file = os.path.join(primarySchool_textgrid_path, artist, fn + '.TextGrid')

        # parse the textgrid to phoneme list
        teacherSyllableLists, teacherPhonemeLists = \
            textgridSyllablePhonemeParser(teacher_textgrid_file, 'dianSilence', 'details')
        studentSyllableLists, studentPhonemeLists = \
            textgridSyllablePhonemeParser(student_textgrid_file, 'dianSilence', 'details')

        student_wav_file = os.path.join(primarySchool_wav_path, artist, fn + '.wav')

        # calculate log mel
        log_mel = getMFCCBands2DMadmom(student_wav_file, fs, hopsize_t, channel=1)
        log_mel_scaled = scaler.transform(log_mel)
        log_mel_reshaped = featureReshape(log_mel_scaled, nlen=7)

        # the six dictionaries are always filled together, testing one is enough
        if artist not in dict_total:
            dict_total[artist] = {}
            dict_head[artist] = {}
            dict_belly[artist] = {}

            dict_feature_phns_total[artist] = {}
            dict_feature_phns_head[artist] = {}
            dict_feature_phns_belly[artist] = {}

        for ii_line in range(len(studentPhonemeLists)):  # iterate each line

            # find the right line index for the teacher's textgrid,
            # ``student02_first_half'' only corresponds to a part of the teacher's textgrid,
            # we need to shift the index of the teacher's textgrid to find the right line
            ii_aug = findShiftOffset(gtSyllableLists=studentSyllableLists,
                                     scoreSyllableLists=teacherSyllableLists,
                                     ii_line=ii_line)

            list_phn_teacher, list_phn_student, list_syl_teacher, list_syl_onsets_time_teacher = \
                getListsSylPhn(teacherSyllableLists=teacherSyllableLists,
                               teacherPhonemeLists=teacherPhonemeLists,
                               studentPhonemeLists=studentPhonemeLists,
                               ii_line=ii_line,
                               ii_aug=ii_aug)

            # element 2 of each phoneme entry is the phoneme label
            phns_teacher = [lpt[2] for lpt in list_phn_teacher]
            phns_student = [lpt[2] for lpt in list_phn_student]

            insertion_indices_student, deletion_indices_teacher, teacher_student_indices_pair, dict_student_idx_2_teacher_phn = \
                phnSequenceAlignment(phns_teacher=phns_teacher, phns_student=phns_student)

            list_phn_teacher_pair, list_phn_student_pair, idx_syl_heads, phn_tails_missing, num_tails_missing = \
                getIdxHeadsMissingTails(teacher_student_indices_pair=teacher_student_indices_pair,
                                        list_phn_teacher=list_phn_teacher,
                                        list_phn_student=list_phn_student,
                                        list_syl_onsets_time_teacher=list_syl_onsets_time_teacher,
                                        deletion_indices_teacher=deletion_indices_teacher,
                                        phns_tails=phns_tails)

            print('these phone indices are inserted in student phone list', insertion_indices_student)
            print('these phone indices are deleted in teacher phone list', deletion_indices_teacher)
            print('these phone tails are deleted in teacher phone list', phn_tails_missing)

            obs_line = getObsLine(studentPhonemeLists=studentPhonemeLists,
                                  ii_line=ii_line,
                                  hopsize_t=hopsize_t,
                                  log_mel_reshaped=log_mel_reshaped,
                                  model_keras_cnn_0=model_keras_cnn_0)

            # entries are [phoneme index, GOP value, teacher phoneme label]
            GOP_line = []
            for ii_phn in range(len(list_phn_student_pair)):
                # frame indices relative to the onset of the first paired student phoneme
                line_onset_time = list_phn_student_pair[0][0]
                phn_start_frame = int(round((list_phn_student_pair[ii_phn][0] - line_onset_time) / hopsize_t))
                phn_end_frame = int(round((list_phn_student_pair[ii_phn][1] - line_onset_time) / hopsize_t))

                phn_label = list_phn_teacher_pair[ii_phn][2]

                # the case of the phn length is 0
                if phn_end_frame == phn_start_frame:
                    GOP_line.append([ii_phn, -np.inf, phn_label])
                    continue

                obs_line_phn = obs_line[phn_start_frame:phn_end_frame]

                # calculate GOP
                GOP_phn = GOP_phn_level(phn_label=phn_label, obs_line_phn=obs_line_phn)
                GOP_line.append([ii_phn, GOP_phn, phn_label])

            # drop the zero length phonemes once, then split into head and belly
            gop_valid = [gop for gop in GOP_line if not np.isinf(gop[1])]
            gop_valid_head = [gop for gop in gop_valid if gop[0] in idx_syl_heads]
            gop_valid_belly = [gop for gop in gop_valid if gop[0] not in idx_syl_heads]

            gop_total = [gop[1] for gop in gop_valid]
            gop_head = [gop[1] for gop in gop_valid_head]
            gop_belly = [gop[1] for gop in gop_valid_belly]

            if plot:
                disLinePlot(gop_total, [gop[2] for gop in gop_valid])
                disLinePlot(gop_head, [gop[2] for gop in gop_valid_head])
                disLinePlot(gop_belly, [gop[2] for gop in gop_valid_belly])

            # NOTE: np.mean of an empty list is nan (with a RuntimeWarning),
            # this is kept as it is so that the stored values do not change
            total_distortion = np.mean(gop_total)
            head_distortion = np.mean(gop_head)
            belly_distortion = np.mean(gop_belly)

            # the line index is shifted to the line index of the teacher's textgrid
            line_key = fn + '_' + str(ii_line+ii_aug)

            dict_total[artist][line_key] = total_distortion
            dict_head[artist][line_key] = head_distortion
            dict_belly[artist][line_key] = belly_distortion

            dict_feature_phns_total[artist][line_key] = \
                {'distortion_phns':np.array(gop_total), 'num_tails_missing':num_tails_missing}
            dict_feature_phns_head[artist][line_key] = \
                {'distortion_phns':np.array(gop_head), 'num_tails_missing':num_tails_missing}
            dict_feature_phns_belly[artist][line_key] = \
                {'distortion_phns':np.array(gop_belly), 'num_tails_missing':num_tails_missing}

    if val_test == 'test':
        with open('./data/rating_GOP_oracle_total.json', 'w') as savefile:
            json.dump(dict_total, savefile)
        with open('./data/rating_GOP_oracle_head.json', 'w') as savefile:
            json.dump(dict_head, savefile)
        with open('./data/rating_GOP_oracle_belly.json', 'w') as savefile:
            json.dump(dict_belly, savefile)

    # the file name carries val_test, so this is written for both data splits
    with open('./data/training_features/GOP_oracle_'+val_test+'_total.pkl', 'wb') as savefile:
        pickle.dump(dict_feature_phns_total, savefile)
    with open('./data/training_features/GOP_oracle_'+val_test+'_head.pkl', 'wb') as savefile:
        pickle.dump(dict_feature_phns_head, savefile)
    with open('./data/training_features/GOP_oracle_'+val_test+'_belly.pkl', 'wb') as savefile:
        pickle.dump(dict_feature_phns_belly, savefile)
def embedding_frame_ap(filename_feature_teacher,
                       filename_list_key_teacher,
                       filename_feature_student,
                       filename_list_key_student,
                       filename_scaler,
                       embedding_dim,
                       val_test):
    """
    plot the grad-cam heatmap of each sample for the teacher student models

    In spite of its name this variant does not calculate any average
    precision.  The teacher and student features are loaded, replicated and
    reshaped.  For each of the 5 saved models and for each sample,
    ``grad_cam`` is run frame by frame on the layer 'conv2d_2'; the heatmap of
    a frame is resized to 80 rows by 15 columns and its column 7 becomes the
    column of that frame in an 80 by number-of-frames map.  The map is
    printed, passed through ``heatmap_postprocessing`` and shown with
    matplotlib; ``plt.show()`` is called once per sample.

    :param filename_feature_teacher: path of the pickled teacher feature list
    :param filename_list_key_teacher: path of the pickled teacher key list
    :param filename_feature_student: path of the pickled student feature list
    :param filename_list_key_student: path of the pickled student key list
    :param filename_scaler: path of the pickled scaler
    :param embedding_dim: 2 selects the '_2_class' models, any other value
        the plain teacher student models
    :param val_test: not used by this variant
    :return: None
    """
    logger = logging.getLogger(__name__)

    # NOTE: unpickling can execute arbitrary code, only load trusted files.
    # The context managers make sure the file handles are closed again.
    with open(filename_feature_teacher, 'rb') as file_feature_teacher:
        list_feature_teacher = pickle.load(file_feature_teacher)
    with open(filename_list_key_teacher, 'rb') as file_list_key_teacher:
        list_key_teacher = pickle.load(file_list_key_teacher)
    with open(filename_feature_student, 'rb') as file_feature_student:
        list_feature_student = pickle.load(file_feature_student)
    with open(filename_list_key_student, 'rb') as file_list_key_student:
        list_key_student = pickle.load(file_list_key_student)
    with open(filename_scaler, 'rb') as file_scaler:
        scaler = pickle.load(file_scaler)

    array_feature_replicated_teacher, array_labels_teacher, labels_teacher = \
        feature_replication_teacher_student(list_feature=list_feature_teacher,
                                            list_key=list_key_teacher,
                                            scaler=scaler,
                                            data_str='_teacher')

    array_feature_replicated_student, array_labels_student, labels_student = \
        feature_replication_teacher_student(list_feature=list_feature_student,
                                            list_key=list_key_student,
                                            scaler=scaler,
                                            data_str='_student')

    array_feature_replicated = array_feature_replicated_teacher + array_feature_replicated_student

    # reshape every sample in place before it is given to the model
    for ii, feature in enumerate(array_feature_replicated):
        array_feature_replicated[ii] = featureReshape(feature, nlen=7)

    path_model = '/home/gong/Documents/pycharmProjects/phoneticSimilarity/models/phoneme_embedding_frame_level'

    model_name = 'wide_frame_level_emb_teacher_student_2_class' if embedding_dim == 2 \
        else 'wide_frame_level_emb_teacher_student'

    num_sample = len(array_feature_replicated)

    for ii in range(5):
        filename_model = os.path.join(path_model, model_name + '_' + str(ii) + '.h5')
        model = load_model(filepath=filename_model)

        # summary() prints by itself and returns None, so it is not wrapped in print
        model.summary()

        for ii_emb, feature in enumerate(array_feature_replicated):
            logger.info('calculating..., %s, total, %s, round, %s', ii_emb, num_sample, ii)

            feature = np.expand_dims(feature, axis=1)
            print(feature.shape)

            feature_len = feature.shape[0]
            heatmap_concatenate = np.zeros((80, feature_len))

            # The frame index has its own name. Reusing ``ii`` here overwrote
            # the model round index reported by the log message above.
            for ii_frame in range(feature_len):
                heatmap = grad_cam(model, feature[ii_frame:ii_frame+1], layer_name='conv2d_2')

                # a heatmap containing nan leaves the column of this frame at zero
                if np.isnan(np.sum(heatmap)):
                    continue

                # cv2 takes the size as (width, height): 15 columns, 80 rows
                heatmap = cv2.resize(heatmap, (15, 80))
                # column 7 is the middle one of the 15 columns
                heatmap_concatenate[:, ii_frame] = heatmap[:, 7]

            print(heatmap_concatenate)

            heatmap_concatenate = heatmap_postprocessing(heatmap=heatmap_concatenate)
            plt.imshow(heatmap_concatenate)
            plt.show()
def embedding_frame_ap(filename_feature_teacher,
                       filename_list_key_teacher,
                       filename_feature_student,
                       filename_list_key_student,
                       filename_scaler,
                       embedding_dim,
                       val_test):
    """
    frame level embedding average precision of the teacher student models

    The teacher and student features are loaded, replicated and reshaped.
    For each of the 5 saved models every sample is embedded as the mean of
    the model outputs.  Then, for each class index ``c`` in ``range(27)``, the
    samples labelled ``c`` or ``c + 27`` are selected, their pairwise matrix
    ``(2 - cosine pdist) / 2`` and their ground truth matrix are computed and
    the strict upper triangles of both are collected.  The average precision
    of one model is calculated on the concatenation over all the classes.
    The mean and the standard deviation of the 5 average precisions are
    written as one row into a csv file.

    :param filename_feature_teacher: path of the pickled teacher feature list
    :param filename_list_key_teacher: path of the pickled teacher key list
    :param filename_feature_student: path of the pickled student feature list
    :param filename_list_key_student: path of the pickled student key list
    :param filename_scaler: path of the pickled scaler
    :param embedding_dim: length of the embedding vector; 2 selects the
        '_2_class' models, any other value the plain teacher student models
    :param val_test: 'val' writes to '<model_name>_pairs.csv', any other value
        to '<model_name>_extra_pairs.csv'
    :return: None, the results are only written to disk
    """
    logger = logging.getLogger(__name__)

    # NOTE: unpickling can execute arbitrary code, only load trusted files.
    # The context managers make sure the file handles are closed again.
    with open(filename_feature_teacher, 'rb') as file_feature_teacher:
        list_feature_teacher = pickle.load(file_feature_teacher)
    with open(filename_list_key_teacher, 'rb') as file_list_key_teacher:
        list_key_teacher = pickle.load(file_list_key_teacher)
    with open(filename_feature_student, 'rb') as file_feature_student:
        list_feature_student = pickle.load(file_feature_student)
    with open(filename_list_key_student, 'rb') as file_list_key_student:
        list_key_student = pickle.load(file_list_key_student)
    with open(filename_scaler, 'rb') as file_scaler:
        scaler = pickle.load(file_scaler)

    list_ap = []

    array_feature_replicated_teacher, array_labels_teacher, labels_teacher = \
        feature_replication_teacher_student(list_feature=list_feature_teacher,
                                            list_key=list_key_teacher,
                                            scaler=scaler,
                                            data_str='_teacher')

    array_feature_replicated_student, array_labels_student, labels_student = \
        feature_replication_teacher_student(list_feature=list_feature_student,
                                            list_key=list_key_student,
                                            scaler=scaler,
                                            data_str='_student')

    array_feature_replicated = array_feature_replicated_teacher + array_feature_replicated_student

    labels = np.array(labels_teacher + labels_student)

    # reshape every sample in place before it is given to the model
    for ii, feature in enumerate(array_feature_replicated):
        array_feature_replicated[ii] = featureReshape(feature, nlen=7)

    path_model = '/home/gong/Documents/pycharmProjects/phoneticSimilarity/models/phoneme_embedding_frame_level'
    path_eval = '/home/gong/Documents/pycharmProjects/phoneticSimilarity/eval/phoneme_embedding_frame_level'

    model_name = 'wide_frame_level_emb_teacher_student_2_class' if embedding_dim == 2 \
        else 'wide_frame_level_emb_teacher_student'

    num_sample = len(array_feature_replicated)

    # labels c and c + num_class are evaluated together; presumably they are
    # the teacher and the student version of the same phoneme class -- TODO confirm
    num_class = 27

    for ii in range(5):
        filename_model = os.path.join(path_model, model_name + '_' + str(ii) + '.h5')
        model = load_model(filepath=filename_model)

        embeddings = np.zeros((num_sample, embedding_dim))
        for ii_emb, feature in enumerate(array_feature_replicated):
            logger.info('calculating..., %s, total, %s, round, %s', ii_emb, num_sample, ii)

            feature = np.expand_dims(feature, axis=1)
            y_pred = model.predict_on_batch(feature)
            # one embedding per sample: the mean of the model outputs
            embeddings[ii_emb, :] = np.mean(y_pred, axis=0)

        list_dist = []
        list_gt = []
        for ii_class in range(num_class):
            idx_ii_class = \
                np.where(np.logical_or(labels == ii_class, labels == ii_class + num_class))[0]

            # turn the pairwise cosine measure d into the score (2 - d) / 2
            dist_mat = (2.0 - squareform(pdist(embeddings[idx_ii_class, :], 'cosine'))) / 2.0
            labels_ii_class = [labels[idx] for idx in idx_ii_class]
            gt_mat = ground_truth_matrix(labels_ii_class)

            # keep each pair once: the upper triangle without the diagonal
            sample_num = dist_mat.shape[0]
            iu1 = np.triu_indices(sample_num, 1)

            list_dist.append(dist_mat[iu1])
            list_gt.append(gt_mat[iu1])

        list_dist = np.concatenate(list_dist)
        list_gt = np.concatenate(list_gt)

        ap = average_precision_score(y_true=np.abs(list_gt),
                                     y_score=np.abs(list_dist),
                                     average='weighted')
        list_ap.append(ap)

    post_fix = '_pairs' if val_test == 'val' else '_extra_pairs'

    # one row: mean and standard deviation over the model rounds
    filename_eval = os.path.join(path_eval, model_name + post_fix + '.csv')
    with open(filename_eval, 'w') as csvfile:
        csvwriter = csv.writer(csvfile, delimiter=',', )
        csvwriter.writerow([np.mean(list_ap), np.std(list_ap)])
# textgrid path, to get the line onset offset groundtruth_textgrid_file = os.path.join(primarySchool_textgrid_path, artist, fn + '.TextGrid') # parse the TextGrid list_line = textGrid2WordList(groundtruth_textgrid_file, whichTier='line') wav_file = os.path.join(primarySchool_wav_path, artist, fn + '.wav') vad_results = VAD(wav_file) # calculate log mel log_mel = getMFCCBands2DMadmom(wav_file, fs, hopsize_t, channel=1) log_mel_scaled = scaler.transform(log_mel) log_mel_reshaped = featureReshape(log_mel_scaled, nlen=7) ii_line = 0 for line in list_line: # iterate each line if len(line[2].strip()): # start and end time time_start = line[0] time_end = line[1] frame_start = int(round(time_start / hopsize_t)) frame_end = int(round(time_end / hopsize_t)) frame_end = frame_end if frame_end <= len( vad_results) else len(vad_results) # log_mel_reshape line log_mel_reshaped_line = log_mel_reshaped[frame_start:frame_end]