def index_train(feat_filename, index_filename, index_type="IVF1024,Flat"):
    """Build a faiss index over all frames of a Kaldi ark file and run one
    nearest-neighbour sanity query against it.

    Args:
        feat_filename: Path of the ark file handed to ``kaldi_io.readArk``.
        index_filename: Currently unused.
            NOTE(review): the index is never written to disk -- presumably it
            was meant to be stored here; confirm before relying on it.
        index_type: faiss ``index_factory`` description string.

    Returns:
        None. All results are only printed.
    """
    print('Loading feature data...')
    max_feats = 10000
    feats, utt_ids = kaldi_io.readArk(feat_filename, limit=max_feats)

    frame_counts = [feat.shape[0] for feat in feats]
    complete_feat_len = sum(frame_counts)

    # utt_map / pos_map translate a global frame id into (utterance index,
    # frame position inside that utterance). They are filled slice-wise, one
    # utterance at a time.
    # NOTE(review): neither map is used or returned below.
    utt_map = np.zeros(complete_feat_len, dtype=np.int32)
    pos_map = np.zeros(complete_feat_len, dtype=np.int32)
    pos = 0
    for i, n_frames in enumerate(frame_counts):
        utt_map[pos:pos + n_frames] = i
        pos_map[pos:pos + n_frames] = np.arange(n_frames)
        pos += n_frames

    complete_feats = np.concatenate(feats, axis=0)

    print('complete_feats:', complete_feats)
    print('complete_feats.min():', complete_feats.min())
    print('complete_feats.max():', complete_feats.max())
    print('complete_feats.sum()', complete_feats.sum())
    print('Going to index features of shape:', complete_feats.shape)

    index = faiss.index_factory(complete_feats.shape[1],
                                index_type.encode("ascii"))
    index.train(complete_feats)
    index.add(complete_feats)

    print('is_trained:', index.is_trained)
    print('ntotal:', index.ntotal)
    print('Indexing finished.')

    # Query with frame 100 as before, but clamp to the last available frame so
    # that small inputs no longer raise an IndexError.
    search_vec_index = min(100, complete_feats.shape[0] - 1)
    search_vec = complete_feats[search_vec_index]
    search_vec = search_vec.reshape((1, len(search_vec)))
    print("Shape of search_vec:", search_vec.shape)

    D, I = index.search(search_vec, 50)
    print("Neighboors of ", search_vec_index)
    print("D:")
    print(D)
    print("I:")
    print(I)
def visualize_kaldi_bin_feats(feat_filename,
                              max_frames,
                              num_feat=0,
                              phn_file='',
                              phn_offset=5,
                              wav_file='',
                              do_tsne=False):
    """Plot the feature matrix of one utterance from a Kaldi ark file.

    Args:
        feat_filename: Ark file read with ``kaldi_io.readArk`` (limit 10000).
        max_frames: Number of leading frames shown in the matrix plot.
        num_feat: Index of the utterance to show.
        phn_file: Optional phone file; when non-empty, a dashed vertical line
            is drawn per loaded position that falls before ``max_frames``.
        phn_offset: Subtracted from every line position (in frames).
        wav_file: Unused.
        do_tsne: Additionally show a 2-D t-SNE plot of the utterance frames.

    Relies on the module-level name ``samples_per_frame`` to convert the
    positions of the phone file into frame numbers.
    """
    feats, utt_ids = kaldi_io.readArk(feat_filename, limit=10000)
    print([feat.shape for feat in feats], utt_ids)
    print('showing features for utt_id:', utt_ids[num_feat])

    # Per-frame statistics of the selected utterance.
    for caption, reduce_fn in (('min vector:', np.min),
                               ('max vector:', np.max),
                               ('sum vector:', np.sum)):
        print(caption)
        print(reduce_fn(feats[num_feat], axis=1))
    print(feats[num_feat].shape)

    # The matrix plot is the same with or without phone boundaries.
    plt.matshow(feats[num_feat][:max_frames].T)
    if phn_file != '':
        positions, names = utils.loadPhnFile(phn_file)
        boundary_frames = []
        for entry in positions:
            if float(entry[1]) / samples_per_frame < max_frames:
                boundary_frames.append(
                    float(entry[1]) / samples_per_frame - phn_offset)
        for boundary in boundary_frames:
            plt.axvline(x=boundary, color='k', linestyle='--')
    plt.show()

    if not do_tsne:
        return

    plt.figure(1)
    print('Calculating TSNE:')
    embedder = TSNE(n_components=2, random_state=0)
    embedded = embedder.fit_transform(feats[num_feat])
    plt.plot(embedded[:, 0], embedded[:, 1], '--')
    print('Now showing tsne plot:')
    plt.show()
def cluster_rnn_phn(n_clusters, wav_files, ark_file, hopping_size, window_size, subsample, n_jobs=4):
    """Unfinished sketch of a segment-mean computation on ark features.

    Only ``ark_file`` is used; all other parameters are ignored and nothing
    is returned.

    NOTE(review): this looks like work in progress and is not expected to
    run as written -- ``tf.segment_mean()`` is called without arguments and
    ``x`` / ``ids`` are not defined inside this function (confirm whether
    they exist at module level).
    """
    feats, uttids = kaldi_io.readArk(ark_file)
    tf.segment_mean()
    #tf.
    #from https://github.com/tensorflow/tensorflow/issues/7389
    # Segment mean built from two unsorted segment sums (element count and
    # value sum per segment id); the segment count is hard-coded to 2.
    ones = tf.ones_like(x)
    count = tf.unsorted_segment_sum(ones, ids, 2)
    sums = tf.unsorted_segment_sum(x, ids, 2)
    mean = tf.divide(sums, count)
def load_feats_flat(ark_file, hopping_size=1.0):
    """Load an ark file and flatten all utterances into one frame matrix.

    The previous implementation could not run: it passed the feature
    dimension as the dtype of ``np.zeros``, unpacked ``zip(feats)`` into two
    names, built the positions from a generator and overwrote them for every
    utterance.

    Args:
        ark_file: Path handed to ``kaldi_io.readArk``.
        hopping_size: Factor applied to the frame index to obtain the
            position of a frame. The old code read a free name
            ``hopping_size``; it is now a keyword argument and the default of
            1.0 yields plain frame indices.

    Returns:
        Tuple ``(feats, feats_flat, uttids_flat, pos_flat)``:
        ``feats`` is the list returned by ``readArk``; ``feats_flat`` stacks
        all frames row-wise; ``uttids_flat`` holds the utterance id of every
        row; ``pos_flat`` holds the position of every row inside its
        utterance.

    Assumes every utterance is a 2-D array (frames, dim), as the other
    functions in this file do -- TODO confirm for this ark format.
    """
    feats, uttids = kaldi_io.readArk(ark_file)

    if len(feats) == 0:
        return feats, np.zeros((0, 0)), [], np.zeros(0)

    # Preallocate the outputs once the total number of frames is known.
    inner_dim = feats[0].shape[1]
    sum_len = sum(feat.shape[0] for feat in feats)
    feats_flat = np.zeros((sum_len, inner_dim))
    pos_flat = np.zeros(sum_len)
    uttids_flat = []

    pos = 0
    for uttid, feat in zip(uttids, feats):
        n_frames = feat.shape[0]
        feats_flat[pos:pos + n_frames] = feat
        pos_flat[pos:pos + n_frames] = (
            np.arange(n_frames, dtype=float) * hopping_size)
        # Repeat the utterance id once per frame.
        uttids_flat += [uttid] * n_frames
        pos += n_frames

    return feats, feats_flat, uttids_flat, pos_flat
def visualize_stats(feat_filename,
                    max_feats,
                    abs_feats=True,
                    reverse_sort=True):
    """Plot the mean activation of every feature dimension of an ark file.

    Each utterance is averaged over its frames (optionally after taking the
    absolute value), the utterance means are averaged again, and the result
    is shown unsorted and sorted.

    Args:
        feat_filename: Ark file read with ``kaldi_io.readArk``.
        max_feats: Passed to ``readArk`` as ``limit``.
        abs_feats: Average absolute values instead of raw values.
        reverse_sort: Show the sorted means in descending order.
    """
    feats, utt_ids = kaldi_io.readArk(feat_filename, limit=max_feats)
    n_utts = len(feats)
    print("Loaded:" + str(n_utts) + "feats")

    utt_means = []
    for utt_feat in feats:
        frames = np.abs(utt_feat) if abs_feats else utt_feat
        utt_mean = np.sum(frames, axis=0) / float(len(frames))
        print(utt_mean.shape)
        utt_means.append(utt_mean)

    stacked_means = np.stack(utt_means, axis=0)
    print(stacked_means.shape)

    finalsum = np.sum(stacked_means, axis=0) / float(n_utts)
    ordered = np.sort(np.array(finalsum))
    if reverse_sort:
        ordered = ordered[::-1]

    print(finalsum)
    print(ordered)

    plt.plot(ordered)
    plt.figure(1)
    plt.matshow([finalsum])
    plt.figure(2)
    plt.matshow([ordered])
    plt.show()
def get_vectors():
    """Flask handler: return the vectors of a Kaldi ark file as JSON.

    POST form parameters:
        feat_file    -- path of the feature file (required); per the original
                        note it must be one returned by /list_avail_reps.
        half_index   -- cut vectors at this position (optional, default -1,
                        disabled). Only applied when averaging is enabled.
        limit        -- passed to ``readArk`` as ``limit`` (optional,
                        default -1, disabled).
        average_utts -- average the frames of every utterance into a single
                        vector (optional, default true). An empty value or
                        'false' (any case) disables averaging.
        stride       -- only read when averaging is disabled: keep every
                        stride-th frame (optional, default 1).
        normalize    -- when the field is present (with any value), vectors
                        are scaled to unit length.

    Returns:
        A JSON ``Response``, either
        ``{'status': 'success', 'vectors': {utt_id: vector(s)}}`` or
        ``{'status': 'fail', 'reason': <message>}``.

    Fixes over the previous version: ``average_utts`` is parsed into a real
    boolean once (the string 'false' used to be truthy, so the averaging
    branch was always taken), and non-integer or zero numeric fields produce
    a 'fail' response instead of an unhandled exception.
    """
    form = flask.request.form

    def _fail(reason):
        # Same payload shape as the original missing-feat_file answer.
        response_str = json.dumps({'status': 'fail', 'reason': reason})
        return Response(response_str, mimetype='application/json')

    if 'feat_file' not in form:
        return _fail('You must supply a feat_file for /get_vectors')
    # NOTE(review): feat_file comes straight from the request and is used as
    # a file path without being checked against the list of available
    # representations -- consider validating it.
    feat_filename = form['feat_file']

    stride = 1
    try:
        if 'half_index' in form:
            half_index = int(form['half_index'])
        else:
            print(
                'POST /get_vectors called without half_index parameter, setting to default -1 (disable)'
            )
            half_index = -1

        if 'limit' in form:
            limit = int(form['limit'])
        else:
            print(
                'POST /get_vectors called without limit parameter, setting to default -1 (disable)'
            )
            limit = -1

        if 'average_utts' in form:
            average_utts = form['average_utts'].strip().lower() not in (
                '', 'false')
        else:
            print(
                'POST /get_vectors called without average_utts parameter, setting to default true (enable)'
            )
            average_utts = True

        if not average_utts:
            if 'stride' in form:
                stride = int(form['stride'])
            else:
                print(
                    'POST /get_vectors called with average_utts = False, but stride parameter is not set, setting it to the default value (1)'
                )
    except ValueError:
        return _fail('half_index, limit and stride must be integers')

    if stride == 0:
        return _fail('stride must not be 0')

    normalize = ('normalize' in form)

    feats, utt_ids = kaldi_io.readArk(feat_filename, limit=limit)
    feats_len = len(feats)
    assert (len(utt_ids) == len(feats))
    print("Loaded:" + str(feats_len) + " feats.")

    if average_utts:
        # One mean vector per utterance.
        feats = [feat.mean(0) for feat in feats]
        if half_index != -1:
            print('Cutting vectors at ', half_index,
                  'and normalize to unit length' if normalize else '')
            feats = [
                feat[:half_index] /
                (np.linalg.norm(feat[:half_index]) if normalize else 1.0)
                for feat in feats
            ]
        elif normalize:
            print('Normalize to unit length.')
            feats = [feat / np.linalg.norm(feat) for feat in feats]
    else:
        # Frame-level output, optionally thinned out by the stride.
        if stride != 1:
            feats = [feat[::stride] for feat in feats]
        if half_index != -1:
            print('Cutting vectors at ', half_index)
            print('Not yet supported')
        if normalize:
            # Normalize every frame (row) separately.
            feats = [(feat.T / np.linalg.norm(feat, axis=1)).T
                     for feat in feats]

    response_vec_dict = {}
    for utt_id, feat in zip(utt_ids, feats):
        response_vec_dict[utt_id] = feat.tolist()

    response_str = json.dumps({
        'status': 'success',
        'vectors': response_vec_dict
    })
    return Response(response_str, mimetype='application/json')
def visualize_classes_tsne(feat_filename,
                           utt_2_class_filename,
                           half_index=-1,
                           normalize=True,
                           class_mean_vector=False):
    """Show a t-SNE scatter plot of utterance mean vectors coloured by class.

    Args:
        feat_filename: Ark file read with ``kaldi_io.readArk`` (limit 25000).
        utt_2_class_filename: Utterance-to-class mapping loaded with
            ``utils.loadUtt2Spk``.
        half_index: If not -1, keep only the first ``half_index`` dimensions.
        normalize: Scale every vector to unit length.
        class_mean_vector: Plot one mean vector per class instead of the
            sampled utterances.

    Only classes with more than 100 utterances are used, and at most 1000
    randomly sampled utterances per class.

    Changes over the previous version: ``np.vstack`` receives a list instead
    of a generator, the run stops with a message when no class qualifies
    (instead of crashing in ``max([])``), the printed class count is the real
    count (it used to be the highest class index), and utterances are grouped
    per class in a single pass.
    """
    feats, utt_ids = kaldi_io.readArk(feat_filename, limit=25000)
    feats_len = len(feats)
    assert (len(utt_ids) == len(feats))
    print("Loaded:" + str(feats_len) + " feats.")

    feats = [feat.mean(0) for feat in feats]
    if half_index != -1:
        print('Cutting vectors at ', half_index,
              'and normalize to unit length' if normalize else '')
        feats = [
            feat[:half_index] /
            (np.linalg.norm(feat[:half_index]) if normalize else 1.0)
            for feat in feats
        ]
    elif normalize:
        print('Normalize to unit length.')
        feats = [feat / np.linalg.norm(feat) for feat in feats]

    utt_2_class = utils.loadUtt2Spk(utt_2_class_filename)

    # Keep only utterances with a known class and group them per class,
    # in order of first appearance.
    dataset = {}
    class_2_utts = {}
    for feat, utt in zip(feats, utt_ids):
        if utt in utt_2_class:
            dataset[utt] = feat
            class_2_utts.setdefault(utt_2_class[utt], []).append(utt)

    myclass_2_samples = {}
    for myclass, class_utts in class_2_utts.items():
        if len(class_utts) > 100:
            myclass_2_samples[myclass] = random.sample(
                class_utts, min(1000, len(class_utts)))

    if not myclass_2_samples:
        print('No class has more than 100 utterances, nothing to plot.')
        return

    feats_samples = []
    feats_samples_classes = []
    for myclass, sampled_utts in myclass_2_samples.items():
        if class_mean_vector:
            feats_samples.append(
                np.vstack([dataset[utt] for utt in sampled_utts]).mean(0))
            feats_samples_classes.append(myclass)
        else:
            feats_samples += [dataset[utt] for utt in sampled_utts]
            feats_samples_classes += [myclass] * len(sampled_utts)
            print('Added', len(sampled_utts), 'entries for', myclass)
            print([
                utt.replace('train-sample', 'train/sample') + '.mp3'
                for utt in sampled_utts
            ])

    class_2_num = {
        myclass: num for num, myclass in enumerate(myclass_2_samples)
    }
    print(class_2_num)
    feats_samples_classes_num = [
        class_2_num[myclass] for myclass in feats_samples_classes
    ]

    num_classes = len(class_2_num)
    print('Num classes=', num_classes)
    print(feats_samples)
    print('shape:', feats_samples[0].shape)

    print('Calculating TSNE:')
    model = TSNE(n_components=2, random_state=0, metric='euclidean')
    tsne_data = model.fit_transform(np.vstack(feats_samples))

    # One colour per class, taken from the lower 90% of the colormap.
    colormap = plt.cm.gist_ncar
    colorst = colormap(np.linspace(0, 0.9, num_classes))
    cs = [colorst[class_num] for class_num in feats_samples_classes_num]

    plt.scatter(tsne_data[:, 0], tsne_data[:, 1], color=cs)
    plt.legend()
    print('Now showing tsne plot:')
    plt.show()
def convert(alignments, spectrograms):
    """Convert alignment and spectrogram data into a fast-loading layout.

    Input is read from ``../dat/speech_tokenizer`` and output is written to
    ``../dat/fast_load`` (both relative to this file).

    Args:
        alignments: If true, write ``tags.json``, one ``<utterance>.json``
            with ``(start, duration, tag)`` triples per utterance, and
            ``utterances.json`` with the sorted utterance ids.
        spectrograms: If true, load the fbank ark file and store every
            utterance as ``<utterance>.npy``.

    Changes over the previous version: the regular expressions are raw
    strings (no invalid escape sequences), the final progress bar uses the
    item count instead of a loop variable that is undefined for empty input,
    the builtin ``id`` is no longer shadowed, and the output directory is
    created with ``exist_ok=True``.
    """
    here = os.path.dirname(os.path.abspath(__file__))
    input_dir = os.path.abspath(here + '/../dat/speech_tokenizer')
    output_dir = os.path.abspath(here + '/../dat/fast_load')
    utterances_output_dir = output_dir + '/utterances'
    os.makedirs(utterances_output_dir, exist_ok=True)

    if alignments:
        print('Storing tag information...', end='', flush=True)
        tags = []
        with open(input_dir + '/new_alignments/phones.txt',
                  'r') as file_handle:
            for i, line in enumerate(file_handle):
                tag, tag_id = line.strip('\n').split(' ')
                # The phone table must be numbered consecutively from 0.
                assert i == int(tag_id)
                parts = tag.split('_')
                # Stored as (suffix after '_' or None, base tag).
                tags.append((parts[1] if len(parts) > 1 else None, parts[0]))
        with open(output_dir + '/tags.json', 'w') as file_handle:
            json.dump(tags, file_handle)
        print(' DONE')

        print('Converting alignments...', end='', flush=True)
        tag_dict = dict()
        with open(input_dir + '/new_alignments/merged_alignment.txt',
                  'r') as file_handle:
            for line in file_handle:
                # Get the data on one tag
                fields = line.strip('\n').split(' ')
                # Convert starts and durations from seconds to numbers of
                # frames without risking floating point errors: both values
                # must have exactly three decimals; dropping the last decimal
                # and the dot leaves an integer count of hundredths.
                assert len(re.sub(r'.*\.', '', fields[2])) == 3
                assert len(re.sub(r'.*\.', '', fields[3])) == 3
                utt_id = fields[0]
                start = int(re.sub(r'\.', '', fields[2][:-1]))
                duration = int(re.sub(r'\.', '', fields[3][:-1]))
                tag = int(fields[4])
                tag_dict.setdefault(utt_id, []).append((start, duration, tag))
        n_ids = len(tag_dict)
        print(' DONE')

        start_time = time()
        for i, (key, value) in enumerate(tag_dict.items()):
            progress.print_bar(i, n_ids, 20, 'Storing alignment data... ┃',
                               '┃')
            with open(utterances_output_dir + '/%s.json' % (key, ),
                      'w') as file_handle:
                json.dump(value, file_handle)
        progress.print_bar(n_ids, n_ids, 20, 'Storing alignment data... ┃',
                           '┃ DONE %.4fs' % (time() - start_time))

        print('Storing sequence IDs...', end='', flush=True)
        with open(output_dir + '/utterances.json', 'w') as file_handle:
            # Asserts the same IDs in both alignments and spectrograms
            json.dump(sorted(tag_dict), file_handle, indent=4)
        print(' DONE')

    if spectrograms:
        # Convert spectrograms
        print('Loading spectrogram data (this may take some time)...',
              end='',
              flush=True)
        feats, ids = readArk(
            input_dir + '/TEDLIUM_fbank_train_cleaned/unnormalized.feats.ark')
        n_ids = len(ids)
        print(' DONE')

        start_time = time()
        for i, (feat, utt_id) in enumerate(zip(feats, ids)):
            progress.print_bar(i, n_ids, 20, 'Storing spectrogram data... ┃',
                               '┃')
            np.save(utterances_output_dir + '/%s.npy' % (utt_id, ), feat)
        progress.print_bar(n_ids, n_ids, 20, 'Storing spectrogram data... ┃',
                           '┃ DONE %.4fs' % (time() - start_time))
def _path_is_given(path):
    """Return True if an optional path argument is set (not None, '' or 'none')."""
    return path is not None and path.lower() != 'none' and path.strip() != ''


def _outliers_as_singletons(labels):
    """Give every noise point (label -1) its own negative cluster id.

    Returns the relabelled list and the number of noise points.
    """
    relabelled = []
    next_outlier_label = -1
    for label in labels:
        if label == -1:
            relabelled.append(next_outlier_label)
            next_outlier_label -= 1
        else:
            relabelled.append(label)
    return relabelled, -(next_outlier_label + 1)


def cluster_speaker(ark_file,
                    cluster_algo='HDBSCAN',
                    half_index=-1,
                    dbscan_eps=0.0005,
                    dbscan_min_samples=3,
                    min_cluster_sizes_str="5",
                    min_samples_str="3",
                    utt_2_spk=None,
                    output_utt_2_spk=None,
                    fileset='dev',
                    tsne_viz=False,
                    n_jobs=4,
                    db_scan_range_search=False,
                    hdb_scan_range_search=False,
                    normalize=True,
                    do_save_result=True,
                    use_gpu=False):
    """Cluster utterance mean vectors into speakers and score the result.

    Args:
        ark_file: Ark file path; '%set' is replaced by ``fileset``. The third
            path component from the end is used as key for saved results.
        cluster_algo: 'HDBSCAN' or 'DBSCAN' ('kmeans' and anything else exit).
        half_index: If not -1, keep the dimensions from ``half_index`` on.
            NOTE(review): the sibling functions keep ``[:half_index]``
            instead -- confirm which half is intended here.
        dbscan_eps, dbscan_min_samples: Parameters of the 'DBSCAN' algorithm.
        min_cluster_sizes_str, min_samples_str: Comma separated integers; the
            clustering runs once per combination.
        utt_2_spk: Optional ground truth utt2spk file ('%set' is replaced).
            Scores are only computed when it is given.
        output_utt_2_spk: Optional output path for the predicted utt2spk
            file; supports the placeholders '%minclustersize', '%minsample',
            '%set' and '%feat'. Only written for a single configuration.
        fileset: Name of the data set, see above.
        tsne_viz: Show a t-SNE plot of the vectors after clustering.
        n_jobs, hdb_scan_range_search, use_gpu: Currently unused.
        db_scan_range_search: Run a DBSCAN eps/min_samples grid first
            (requires ``utt_2_spk``).
        normalize: Scale every vector to unit length.
        do_save_result: Store parameters and scores via ``save_result``.

    Raises:
        ValueError: If ``db_scan_range_search`` is requested without
            ``utt_2_spk``.

    Fixes over the previous version:
      * the ``cluster_algo`` string is no longer overwritten by the estimator
        object, so every grid configuration builds its own estimator (the
        old code silently reused the first one);
      * ``dpscan_eps`` typo (NameError) in the DBSCAN branch;
      * ground truth availability is decided once, before ``utt_2_spk`` is
        rebound to the loaded mapping (``.lower()`` was later called on it);
      * the DBSCAN range search no longer clobbers the ``dbscan_eps`` /
        ``dbscan_min_samples`` parameters;
      * an unsupported ``cluster_algo`` really exits;
      * the HDBSCAN log line prints the ``min_samples`` value actually used;
      * precision / recall / f1 fall back to 0.0 instead of dividing by zero.
    """
    postfix = ''

    print('Loading feats now:')
    feats, uttids = kaldi_io.readArk(ark_file.replace('%set', fileset))
    print('feat[0] shape: ', feats[0].shape)

    print('Generating mean vector.')
    feats = np.vstack([feat.mean(0) for feat in feats])

    if half_index != -1:
        print('Cutting vectors at ', half_index,
              'and normalize to unit length' if normalize else '')
        feats = np.vstack([
            feat[half_index:] /
            (np.linalg.norm(feat[half_index:]) if normalize else 1.0)
            for feat in feats
        ])
    elif normalize:
        print('Normalize to unit length.')
        feats = np.vstack([feat / np.linalg.norm(feat) for feat in feats])

    print('Done. feats shape:', feats.shape)
    print('feats shape:', feats.shape)
    print('feat[0] shape: ', feats[0].shape)
    print('halfindex:', half_index)

    # Decide once whether ground truth is available; utt_2_spk is rebound to
    # the loaded mapping below and must not be string-tested afterwards.
    have_ground_truth = _path_is_given(utt_2_spk)
    ground_truth_utt_2_spk, ground_truth_utt_2_spk_int = None, None
    if have_ground_truth:
        utt_2_spk = utils.loadUtt2Spk(utt_2_spk.replace('%set', fileset))
        ground_truth_utt_2_spk = [utt_2_spk[utt_id] for utt_id in uttids]
        le = preprocessing.LabelEncoder()
        le.fit(ground_truth_utt_2_spk)
        ground_truth_utt_2_spk_int = le.transform(ground_truth_utt_2_spk)
        print("Ground truth speaker classes available:")
        print(ground_truth_utt_2_spk_int)

    print('Now running DBSCAN clustering on', len(uttids), 'entries.')

    if db_scan_range_search:
        if not have_ground_truth:
            raise ValueError(
                'db_scan_range_search needs utt_2_spk to compute the ARI')
        bestARI = 0.0
        bestConf = {}
        eps_range = [x / 100.0 for x in range(1, 100)]
        min_samples_range = [1, 2, 3, 4, 5, 6, 7, 8, 10, 20, 30, 50, 100]
        range_result_mat = np.zeros((len(eps_range), len(min_samples_range)))
        print('shape result mat:', range_result_mat.shape)
        # Separate loop names keep the function parameters intact.
        for i_eps, eps in enumerate(eps_range):
            for i_min_samples, range_min_samples in enumerate(
                    min_samples_range):
                dbscan_algo = DBSCAN(eps=eps,
                                     min_samples=range_min_samples,
                                     metric=pairwise_pos_neg_dot_distance,
                                     n_jobs=1)
                range_labels = list(dbscan_algo.fit(feats).labels_)
                print('dbscan_eps', eps, 'dbscan_min_samples',
                      range_min_samples)
                print('num clusters:', len(set(range_labels)))
                ARI = metrics.adjusted_rand_score(ground_truth_utt_2_spk_int,
                                                  range_labels)
                range_result_mat[i_eps][i_min_samples] = float(ARI)
                print('ARI:', ARI)
                if ARI > bestARI:
                    print('Found new best conf:', ARI)
                    bestConf = {'eps': eps, 'min_samples': range_min_samples}
                    bestARI = ARI
        plt.matshow(range_result_mat)
        plt.show()
        np.save(ark_file + '.dbrangescan_cluster_ARI' + postfix,
                range_result_mat)
        print('bestARI:', bestARI)
        print('bestConf:', bestConf)

    min_cluster_sizes = [int(x) for x in min_cluster_sizes_str.split(',')]
    min_samples = [int(x) for x in min_samples_str.split(',')]

    # Score matrices of the grid; filled below.
    # NOTE(review): they are currently neither saved nor plotted.
    result_mat = np.zeros((len(min_cluster_sizes), len(min_samples)))
    result_mat_outliers = np.zeros_like(result_mat)
    result_mat_n = np.zeros_like(result_mat)

    best_pairwise_f1 = 0.0
    bestConf = {}
    number_format = "%.4f"

    # previous good config: min_cluster_size=5, min_samples=3
    for i, min_cluster_size in enumerate(min_cluster_sizes):
        for j, min_sample in enumerate(min_samples):
            feat_key = ark_file.split('/')[-3] + '_' + str(
                min_cluster_size) + '_' + str(min_sample)
            if do_save_result:
                save_result(feat_key, "cl_size", str(min_cluster_size))
                save_result(feat_key, "min_s", str(min_sample))

            # Build a fresh estimator for every configuration.
            if cluster_algo == 'HDBSCAN':
                print('Running HDBSCAN with min_cluster_size',
                      min_cluster_size, 'min_samples', min_sample)
                clusterer = HDBSCAN(min_cluster_size=min_cluster_size,
                                    min_samples=min_sample,
                                    metric='euclidean',
                                    algorithm='best',
                                    core_dist_n_jobs=28)
            elif cluster_algo == 'DBSCAN':
                print('Running DBSCAN with dbscan_eps', dbscan_eps,
                      'dbscan_min_samples', dbscan_min_samples)
                clusterer = DBSCAN(eps=dbscan_eps,
                                   min_samples=dbscan_min_samples,
                                   metric='euclidean',
                                   n_jobs=28)
            elif cluster_algo == 'kmeans':
                print(
                    'kmeans clustering not available for speaker clustering yet. Exiting.'
                )
                sys.exit(-1)
            else:
                print('cluster_algo:', cluster_algo,
                      'not supported. Exiting.')
                sys.exit(-1)

            clustering = clusterer.fit(feats)
            clustering_labels = list(clustering.labels_)
            n_clusters_found = len(set(clustering_labels))
            print('Num of clusters (as determined by density clustering):',
                  n_clusters_found)
            print(clustering_labels)
            sys.stdout.flush()

            # Scores below treat every outlier as its own cluster.
            clustering_labels2, num_outliers = _outliers_as_singletons(
                clustering_labels)

            if not have_ground_truth:
                continue

            if do_save_result:
                save_result(feat_key, 'outliers_' + fileset,
                            str(num_outliers))
                save_result(feat_key, 'clusters_' + fileset,
                            str(n_clusters_found))
            print(
                'Number of outliers:', num_outliers, '(', number_format %
                (float(num_outliers) * 100.0 / float(len(uttids))), '%)')

            ARI2 = metrics.adjusted_rand_score(ground_truth_utt_2_spk_int,
                                               clustering_labels2)
            print('ARI score (each outlier its own cluster):',
                  number_format % ARI2)
            vmeasure2 = metrics.v_measure_score(ground_truth_utt_2_spk_int,
                                                clustering_labels2)
            print('NMI / V-measure (each outlier its own cluster):',
                  number_format % vmeasure2)
            if do_save_result:
                save_result(feat_key, 'ARI_' + fileset, number_format % ARI2)
                save_result(feat_key, 'NMI_' + fileset,
                            number_format % vmeasure2)

            print('Calculating pairwise recall:')
            # Chebyshev distance < 1 between integer labels means "same
            # label", i.e. a boolean vector over all utterance pairs.
            cluster_pairwise = pdist(
                np.asarray(clustering_labels2)[:, np.newaxis],
                metric='chebyshev') < 1
            groundtruth_pairwise = pdist(
                np.asarray(ground_truth_utt_2_spk_int)[:, np.newaxis],
                metric='chebyshev') < 1

            # Efficient binary comparison, since the pairwise vector can be
            # huge for large n (scikit-learn's scorers were too slow here).
            tp = np.sum(groundtruth_pairwise & cluster_pairwise)
            fp = np.sum(~groundtruth_pairwise & cluster_pairwise)
            fn = np.sum(groundtruth_pairwise & ~cluster_pairwise)

            pairwise_precision = tp / (tp + fp) if (tp + fp) > 0 else 0.0
            pairwise_recall = tp / (tp + fn) if (tp + fn) > 0 else 0.0
            if pairwise_recall + pairwise_precision > 0:
                pairwise_f1 = (2.0 * pairwise_recall * pairwise_precision /
                               (pairwise_recall + pairwise_precision))
            else:
                pairwise_f1 = 0.0

            print(
                'pairwise recall / precision / f1-score (each outlier its own cluster):',
                number_format % pairwise_recall,
                number_format % pairwise_precision,
                number_format % pairwise_f1)
            if do_save_result:
                save_result(feat_key, 'recall_' + fileset,
                            number_format % pairwise_recall)
                save_result(feat_key, 'prec_' + fileset,
                            number_format % pairwise_precision)
                save_result(feat_key, 'f1_' + fileset,
                            number_format % pairwise_f1)

            if pairwise_f1 > best_pairwise_f1:
                print('Found new best pairwise f1:', pairwise_f1)
                bestConf = {
                    'min_cluster_size': min_cluster_size,
                    'min_sample': min_sample,
                    'n': n_clusters_found,
                    'outliers': num_outliers
                }
                best_pairwise_f1 = pairwise_f1

            result_mat[i][j] = float(pairwise_f1)
            result_mat_outliers[i][j] = num_outliers
            result_mat_n[i][j] = n_clusters_found

            print('Clustering predicted classes:', n_clusters_found)
            print('Ground truth classes',
                  len(set(ground_truth_utt_2_spk_int)))

    # Everything below works on the labels of the last configuration.
    if tsne_viz:
        print('Calculating TSNE:')
        model = TSNE(n_components=2, random_state=0, metric='euclidean')
        tsne_data = model.fit_transform(feats)

        if have_ground_truth:
            num_speakers = max(ground_truth_utt_2_spk_int) + 1
            color_ids = ground_truth_utt_2_spk_int
        else:
            num_speakers = len(set(clustering_labels))
            color_ids = clustering_labels

        colormap = plt.cm.gist_ncar
        colorst = colormap(np.linspace(0, 0.9, num_speakers))
        cs = [colorst[color_ids[k]] for k in range(len(clustering_labels))]

        plt.scatter(tsne_data[:, 0], tsne_data[:, 1], color=cs)
        plt.legend()
        print('Now showing tsne plot:')
        plt.show()

    if _path_is_given(output_utt_2_spk):
        if len(min_cluster_sizes) > 1 or len(min_samples) > 1:
            print(
                'Not saving clustering result, since we searched a full range. Rerun with a single min_cluster_size and min_samples parameter.'
            )
        else:
            output_utt_2_spk = output_utt_2_spk.replace(
                '%minclustersize', str(min_cluster_size))
            output_utt_2_spk = output_utt_2_spk.replace(
                '%minsample', str(min_sample))
            output_utt_2_spk = output_utt_2_spk.replace('%set', fileset)
            featstr = ark_file.split('/')[-3]
            featstr = featstr.replace(
                'featinput_unnormalized.feats.ark_dot_combine_tied_embs',
                'std_end_conf').replace('feats_', '')
            print('featstr:', featstr)
            output_utt_2_spk = output_utt_2_spk.replace('%feat', featstr)
            output_utt_2_spk += ('_l2norm' if normalize else '')
            print('Saving result to:', output_utt_2_spk)
            with open(output_utt_2_spk, 'w') as output_utt_2_spk_out:
                for utt, label in zip(uttids, clustering_labels2):
                    # Negative (outlier) ids are written with 'o' instead of
                    # the minus sign.
                    output_utt_2_spk_out.write(utt +
                                               (' spk%07d' %
                                                label).replace('-', 'o') +
                                               '\n')
def corr_phn(ark_file,
             alignment_dir,
             fileset='train',
             limit=5000,
             cmudict_sort=True):
    """Plot the mutual information between feature dimensions and phones.

    Args:
        ark_file: Ark file path; '%set' is replaced by ``fileset``.
        alignment_dir: Directory prefix (string-concatenated) containing
            'phones.txt' and 'all.ctm'.
        fileset: Name of the data set, see ``ark_file``.
        limit: Passed as ``limit`` to ``readArk`` and ``load_all_ctm_pos``.
        cmudict_sort: Order the phones as silence, vowels, consonants (from
            the cmudict lists) instead of alphabetically.

    Change over the previous version: the worker pool is used as a context
    manager, so its processes are released after the map (it was never
    closed before). The unused local ``phone_len`` was removed.
    """
    print('len cmudict arpabet:',
          len(cmudict_silence + cmudict_vowels + cmudict_consonants))
    feats, uttids = kaldi_io.readArk(ark_file.replace('%set', fileset),
                                     limit=limit)

    phones_txt = alignment_dir + 'phones.txt'
    num2phone_orig = load_phones_txt(phones_txt)
    print(num2phone_orig)

    phoneset = list(set(num2phone_orig.values()))
    not_vowels = [elem for elem in phoneset if elem not in cmudict_vowels]
    for elem in cmudict_vowels:
        if elem not in phoneset:
            print('vowel NA:', elem)
    print(not_vowels)
    print('Loaded num2phone_orig for: ', len(set(num2phone_orig.values())),
          'phones')

    if cmudict_sort:
        phone_order = cmudict_silence + cmudict_vowels + cmudict_consonants
    else:
        phone_order = sorted(set(num2phone_orig.values()))
    num2phone = dict(enumerate(phone_order))
    phone2num = {phone: num for num, phone in enumerate(phone_order)}

    # Map the original phone numbering onto the display numbering.
    num2phoneid = {i: phone2num[x] for i, x in num2phone_orig.items()}
    print(num2phoneid)

    ctm_file = alignment_dir + 'all.ctm'
    all_ctm_db = load_all_ctm_pos(ctm_file, num2phoneid, limit=limit)

    feat_dim = len(feats[0][0])
    print(feat_dim)

    phone_mutual_info_classifs = []

    # Local switches: one mutual-information row per phone (2-D result),
    # computed in parallel.
    do_2d = True
    parallel = True

    if do_2d:
        if not parallel:
            for phone in sorted(num2phone.keys()):
                phone_mutual_info_classif = get_phone_mutual_info(
                    phone, feats, uttids, all_ctm_db)
                phone_mutual_info_classifs += [phone_mutual_info_classif]
            phone_mutual_info_classifs = np.vstack(phone_mutual_info_classifs)
        else:
            get_phone_mutual_info_partial = partial(get_phone_mutual_info,
                                                    feats=feats,
                                                    uttids=uttids,
                                                    all_ctm_db=all_ctm_db)
            # map() blocks until all results are there, so the pool can be
            # torn down when the with-block ends.
            with multiprocessing.Pool(28) as pool:
                phone_mutual_info_classifs = pool.map(
                    get_phone_mutual_info_partial, sorted(num2phone.keys()))
    else:
        feat_vars = []
        phone_vars = []
        for feat, uttid in zip(feats, uttids):
            ctm = all_ctm_db[uttid]
            if len(ctm) > len(feat):
                print(uttid,
                      'warning, ctm is slightly longer than feature len:',
                      len(ctm), 'vs.', len(feat))
                ctm = ctm[:len(feat)]
            elif len(ctm) < len(feat):
                print(uttid,
                      'warning, ctm is slightly shorter than feature len:',
                      len(ctm), 'vs.', len(feat))
            assert (len(feat) == len(ctm))
            feat_vars += [feat]
            phone_vars += [ctm]
        feat_vars = np.vstack(feat_vars)
        phone_vars = np.hstack(phone_vars)
        print(feat_vars.shape)
        print(phone_vars.shape)
        phone_mutual_info_classifs = sklearn.feature_selection.mutual_info_classif(
            feat_vars, phone_vars)

    print(phone_mutual_info_classifs)
    print(plt.rcParams["figure.figsize"])

    fig = plt.figure(figsize=(11, 7))
    ax = fig.add_subplot(111)
    # Scale down the first row before plotting.
    # NOTE(review): presumably the silence row, damped so it does not
    # dominate the colour scale -- confirm.
    phone_mutual_info_classifs[0] *= 0.7
    ax.matshow(phone_mutual_info_classifs, aspect='auto')
    plt.yticks(sorted(num2phone.keys()),
               [num2phone[x] for x in sorted(num2phone.keys())])
    plt.show()
def tsne(ark_file,
         alignment_dir,
         fileset='train',
         limit=4000,
         subsample=1000,
         only_common_bigrams=True,
         cmudict_sort=True,
         normalize=True):
    """Show a t-SNE plot of phone-aligned frames, coloured by phone n-gram.

    Args:
        ark_file: Ark file path; '%set' is replaced by ``fileset``.
        alignment_dir: Directory prefix (string-concatenated) containing
            'phones.txt' and 'all.ctm'.
        fileset: Name of the data set, see ``ark_file``.
        limit: Passed as ``limit`` to ``readArk`` and ``load_all_ctm_pos``.
        subsample: Number of examples drawn at random for the plot; -1
            disables subsampling.
        only_common_bigrams: Keep only examples that start one of the ten
            most common phone n-grams (``fivegrams`` is used as the n-gram
            function) and colour by n-gram instead of by phone.
        cmudict_sort: Order the phones as silence, vowels, consonants.
        normalize: Divide every utterance matrix by its norm.

    Changes over the previous version: ``subsample`` is clamped to the
    number of available examples (``np.random.choice`` with
    ``replace=False`` raised a ValueError otherwise) and the n-grams of the
    concatenated phone sequence are computed once instead of twice.
    """
    feats, uttids = kaldi_io.readArk(ark_file.replace('%set', fileset),
                                     limit=limit)

    if normalize:
        print('Normalize to unit length.')
        # NOTE(review): np.linalg.norm without an axis scales the whole
        # utterance matrix by a single number rather than every frame to
        # unit length -- confirm this is intended.
        feats = [feat / np.linalg.norm(feat) for feat in feats]

    phones_txt = alignment_dir + 'phones.txt'
    num2phone_orig = load_phones_txt(phones_txt)
    print(num2phone_orig)
    phoneset = list(set(num2phone_orig.values()))

    if cmudict_sort:
        phone_order = cmudict_silence + cmudict_vowels + cmudict_consonants
    else:
        phone_order = sorted(set(num2phone_orig.values()))
    num2phone = dict(enumerate(phone_order))
    phone2num = {phone: num for num, phone in enumerate(phone_order)}

    num2phoneid = {i: phone2num[x] for i, x in num2phone_orig.items()}
    print(num2phoneid)

    ctm_file = alignment_dir + 'all.ctm'
    all_ctm_db = load_all_ctm_pos(ctm_file, num2phoneid, limit=limit)

    phn_ngram_counts = Counter()
    ngram_func = fivegrams

    # Collect one feature frame and one phone id per aligned position.
    feat_vars = []
    phone_vars = []
    for feat, uttid in zip(feats, uttids):
        pos_array, phone_array = all_ctm_db[uttid]
        assert (len(pos_array) == len(phone_array))
        feat_vars += [feat[list(pos_array)]]
        phone_vars += [phone_array]
        phn_ngram_counts.update(ngram_func(phone_array))
    feat_vars = np.vstack(feat_vars)
    phone_vars = np.hstack(phone_vars)

    most_common_ngrams = []
    print('Most common phone ngrams:')
    for phone_ngram, count in phn_ngram_counts.most_common(10):
        print(phone_ngram, [num2phone[elem] for elem in phone_ngram], count)
        most_common_ngrams.append(phone_ngram)

    most_common_bigrams_to_num = {
        ngram: num for num, ngram in enumerate(most_common_ngrams)
    }
    print('Loaded', len(phone_vars), 'phoneme examples')

    if only_common_bigrams:
        # Single pass over the n-grams: remember the position and the class
        # number of every n-gram that is among the most common ones.
        select_pos = []
        ngram_classes = []
        for i, elem in enumerate(ngram_func(phone_vars)):
            if elem in most_common_bigrams_to_num:
                select_pos.append(i)
                ngram_classes.append(most_common_bigrams_to_num[elem])
        feat_vars = feat_vars[select_pos]
        phone_vars = np.asarray(ngram_classes)
        print(phone_vars)
        print(feat_vars)

    assert (len(feat_vars) == len(phone_vars))

    if subsample != -1:
        # Never ask for more examples than exist.
        n_samples = min(subsample, len(feat_vars))
        feat_phone_vars_sampled_idx = np.random.choice(np.arange(
            len(feat_vars)),
                                                       n_samples,
                                                       replace=False)
        feat_vars = feat_vars[feat_phone_vars_sampled_idx]
        phone_vars = phone_vars[feat_phone_vars_sampled_idx]
        print('Subsampled to:', len(phone_vars), 'phoneme examples')

    model = TSNE(n_components=2, random_state=0, metric='cosine')
    tsne_data = model.fit_transform(feat_vars)

    num_classes = len(phoneset)
    if only_common_bigrams:
        num_classes = len(set(phone_vars))

    colormap = plt.cm.gist_ncar
    colorst = colormap(np.linspace(0, 1.0, num_classes + 1))
    cs = [colorst[class_id] for class_id in phone_vars]

    # Hand-made legend: print every class number in its colour.
    for i, color in enumerate(colorst):
        plt.text(30 + float(i) * 3, 30, str(i), color=color)

    plt.scatter(tsne_data[:, 0], tsne_data[:, 1], color=cs)
    plt.show()
def same_different_experiment(ark_file,
                              utt_2_spk,
                              half_index=-1,
                              normalize=False,
                              fileset='',
                              use_metric='cosine',
                              max_spks=-1,
                              random_seed=42):
    """Run a same/different speaker experiment on utterance mean vectors.

    All pairwise distances between the utterance vectors are compared with
    the pairwise "same speaker" ground truth; the distributions and the DET
    curve are plotted with pyannote.metrics and the equal error rate is
    appended to 'samedifferent_results.csv'.

    Args:
        ark_file: Ark file path; '%set' is replaced by ``fileset``. The third
            path component from the end is used in result key and plot name.
        utt_2_spk: utt2spk file ('%set' is replaced); required.
        half_index: If not -1, keep only the first ``half_index`` dimensions.
        normalize: Scale every vector to unit length.
        fileset: Name of the data set, see ``ark_file``.
        use_metric: Distance metric handed to ``pdist``.
        max_spks: If not -1, restrict the experiment to a random subset of at
            most this many speakers.
        random_seed: Seed of the speaker selection.

    Raises:
        ValueError: If no ``utt_2_spk`` file is given (the experiment cannot
            run without ground truth; it used to fail later with an
            unrelated error).

    Further changes over the previous version: speaker membership is tested
    against a set, and the 'Reduced uttids' message reports the number of
    utterance ids rather than the number of features.
    """
    results_file = 'samedifferent_results.csv'

    from pyannote.metrics.plot.binary_classification import plot_det_curve, plot_distributions

    if utt_2_spk is None or utt_2_spk.lower() == 'none' or utt_2_spk.strip(
    ) == '':
        raise ValueError(
            'same_different_experiment requires an utt_2_spk file')

    print('Loading feats now:')
    feats, uttids = kaldi_io.readArk(ark_file.replace('%set', fileset))
    print('loaded: ' + str(len(feats)) + ' feats')
    print('feat[0] shape: ', feats[0].shape)

    print('Generating mean vector.')
    feats = np.vstack([feat.mean(0) for feat in feats])

    if half_index != -1:
        print('Cutting vectors at ', half_index,
              'and normalize to unit length' if normalize else '')
        feats = np.vstack([
            feat[:half_index] /
            (np.linalg.norm(feat[:half_index]) if normalize else 1.0)
            for feat in feats
        ])
    elif normalize:
        print('Normalize to unit length.')
        feats = np.vstack([feat / np.linalg.norm(feat) for feat in feats])

    utt_2_spk = utils.loadUtt2Spk(utt_2_spk.replace('%set', fileset))

    if max_spks != -1:
        # Sample a subset of speakers; sorting first makes the selection
        # reproducible for a given seed.
        spk_set = sorted(set(utt_2_spk.values()))
        random.seed(random_seed)
        selected_spks = random.sample(spk_set, min(len(spk_set), max_spks))
        print('Selecting random subset of speakers with random seed',
              random_seed, ':', len(selected_spks), 'speakers')
        print(selected_spks)

        # New utt2spk dictionary restricted to the selected speakers.
        selected_spk_set = set(selected_spks)
        utt_2_spk_new = {
            utt: spk
            for utt, spk in utt_2_spk.items() if spk in selected_spk_set
        }

        # Filter feats and uttids accordingly.
        feats = [
            feat for feat, uttid in zip(feats, uttids)
            if uttid in utt_2_spk_new
        ]
        uttids = [uttid for uttid in uttids if uttid in utt_2_spk_new]
        print('Reduced feats to: ' + str(len(feats)) + ' feats')
        print('Reduced uttids to: ' + str(len(uttids)) + ' uttids')
        utt_2_spk = utt_2_spk_new
    else:
        print('Using all speakers:', len(set(utt_2_spk.values())))

    ground_truth_utt_2_spk = [utt_2_spk[utt_id] for utt_id in uttids]
    le = preprocessing.LabelEncoder()
    le.fit(ground_truth_utt_2_spk)
    ground_truth_utt_2_spk_int = le.transform(ground_truth_utt_2_spk)
    print("Ground truth speaker classes available:")
    print(ground_truth_utt_2_spk_int)

    print('Calculating', use_metric, 'distance matrix...')
    distances = pdist(feats, metric=use_metric)

    print('Calculating ground thruth distance matrix...')
    # Chebyshev distance < 1 between integer labels means "same speaker".
    y_true = pdist(np.asarray(ground_truth_utt_2_spk_int)[:, np.newaxis],
                   metric='chebyshev') < 1

    feat_name = ark_file.split('/')[-3]
    fileset_suffix = '.' + fileset if fileset != '' else ''
    result_key = feat_name + fileset_suffix + '.' + use_metric
    prefix = ('plots/plot.' + feat_name + '.' + use_metric + fileset_suffix +
              '.seed_' + str(random_seed))

    plot_distributions(y_true,
                       distances,
                       prefix,
                       xlim=(0, 2),
                       ymax=3,
                       nbins=100)
    # The DET curve is computed on negated distances, i.e. a larger score
    # means more similar.
    eer = plot_det_curve(y_true, -distances, prefix)
    print('EER = {eer:.2f}%'.format(eer=100 * eer))

    with open(results_file, 'a') as outfile:
        outfile.write(result_key + ' ' + '{eer:.2f}%'.format(eer=100 * eer) +
                      '\n')