def test_adjusted_mutual_info_score():
    """Check MI, expected MI and adjusted MI against known reference values."""
    labels_a = np.array([1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3])
    labels_b = np.array([1, 1, 1, 1, 2, 1, 2, 2, 2, 2, 3, 1, 3, 3, 3, 2, 2])
    reference_mi = 0.41022

    # Mutual information computed directly from the two labelings.
    assert_almost_equal(mutual_info_score(labels_a, labels_b), reference_mi, 5)

    # Same value when a precomputed sparse contingency table is passed in.
    sparse_table = contingency_matrix(labels_a, labels_b, sparse=True)
    assert_almost_equal(
        mutual_info_score(labels_a, labels_b, contingency=sparse_table),
        reference_mi, 5)

    # ... and when the table is dense.
    dense_table = contingency_matrix(labels_a, labels_b)
    assert_almost_equal(
        mutual_info_score(labels_a, labels_b, contingency=dense_table),
        reference_mi, 5)

    # Expected mutual information, from the dense table and its total count.
    emi = expected_mutual_information(dense_table, dense_table.sum())
    assert_almost_equal(emi, 0.15042, 5)

    # Adjusted mutual information.
    assert_almost_equal(
        adjusted_mutual_info_score(labels_a, labels_b), 0.27502, 5)
    assert_equal(adjusted_mutual_info_score([1, 1, 2, 2], [2, 2, 3, 3]), 1.0)

    # Very large input: both labelings repeated 110 times.
    tiled_a = np.tile(labels_a, 110)
    tiled_b = np.tile(labels_b, 110)
    # This is not accurate to more than 2 places.
    assert_almost_equal(adjusted_mutual_info_score(tiled_a, tiled_b), 0.37, 2)
def test_contingency_matrix():
    """Compare contingency_matrix with a 2-D histogram of the same labels."""
    labels_a = np.array([1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3])
    labels_b = np.array([1, 1, 1, 1, 2, 1, 2, 2, 2, 2, 3, 1, 3, 3, 3, 2, 2])

    # Bin edges 1..4 give one bin per label value 1, 2, 3.
    edges = np.arange(1, 5)
    expected = np.histogram2d(labels_a, labels_b, bins=(edges, edges))[0]

    assert_array_almost_equal(contingency_matrix(labels_a, labels_b), expected)

    # With eps set, every cell is expected to be offset by eps.
    assert_array_almost_equal(
        contingency_matrix(labels_a, labels_b, eps=0.1), expected + 0.1)
def test_contingency_matrix_sparse():
    """Sparse and dense contingency tables agree; eps is rejected when sparse."""
    labels_a = np.array([1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3])
    labels_b = np.array([1, 1, 1, 1, 2, 1, 2, 2, 2, 2, 3, 1, 3, 3, 3, 2, 2])

    dense = contingency_matrix(labels_a, labels_b)
    densified = contingency_matrix(labels_a, labels_b, sparse=True).toarray()
    assert_array_almost_equal(dense, densified)

    # Combining eps with sparse=True must raise with this exact message.
    assert_raise_message(
        ValueError,
        "Cannot set 'eps' when sparse=True",
        contingency_matrix,
        labels_a,
        labels_b,
        eps=1e-10,
        sparse=True,
    )
def munkres_score(gt, pred):
    """
    Accuracy of a clustering under the best one-to-one cluster/label matching.

    :param gt: a list of lists, each containing ints
    :param pred: a list of lists, each containing ints
    :return: accuracy
    """
    # Combine all the sequences into one long sequence for both gt and pred
    gt_combined = np.concatenate(gt)
    pred_combined = np.concatenate(pred)

    # Make sure we're comparing the right shapes
    assert (gt_combined.shape == pred_combined.shape)

    # Build out the contingency matrix
    # This follows the methodology suggested by Zhou, De la Torre & Hodgkins,
    # PAMI 2013.
    mat = contingency_matrix(gt_combined, pred_combined)

    # Make the cost matrix
    # Use the fact that no entry can exceed the total length of the sequence
    cost_mat = make_cost_matrix(mat, lambda x: gt_combined.shape[0] - x)

    # Apply the Munkres method (also called the Hungarian method) to find the
    # optimal cluster correspondence
    m = Munkres()
    indexes = m.compute(cost_mat)

    # Pull out the cluster overlaps for the correspondences found.  The
    # (row, col) pairs are split into two explicit index lists: indexing with
    # a single list of tuples is not interpreted as a (rows, cols) pair by
    # current NumPy and would select whole rows instead of single cells.
    rows, cols = zip(*indexes)
    cluster_overlaps = mat[list(rows), list(cols)]

    # Now compute the accuracy
    accuracy = np.sum(cluster_overlaps) / float(np.sum(mat))
    return accuracy
def test_dbscan_optics_parity(eps, min_samples):
    """OPTICS labels (DBSCAN extraction) differ from DBSCAN by at most 5%."""
    blob_centers = [[1, 1], [-1, -1], [1, -1]]
    X, labels_true = make_blobs(n_samples=750, centers=blob_centers,
                                cluster_std=0.4, random_state=0)

    # OPTICS, then a DBSCAN-style extraction at the requested eps.
    optics = OPTICS(min_samples=min_samples).fit(X)
    core_optics, labels_optics = optics.extract_dbscan(eps)

    # Reference DBSCAN labels for the same eps / min_samples.
    dbscan = DBSCAN(eps=eps, min_samples=min_samples).fit(X)

    # Agreement is the smaller of the two "best overlap" totals taken along
    # either axis of the contingency table.
    table = contingency_matrix(dbscan.labels_, labels_optics)
    best_by_column = np.sum(np.max(table, axis=0))
    best_by_row = np.sum(np.max(table, axis=1))
    agree = min(best_by_column, best_by_row)
    disagree = X.shape[0] - agree

    # verify core_labels match
    assert_array_equal(core_optics, dbscan.core_sample_indices_)

    non_core_count = len(labels_optics) - len(core_optics)
    percent_mismatch = np.round((disagree - 1) / non_core_count, 2)

    # verify label mismatch is <= 5% labels
    assert percent_mismatch <= 0.05
def clustering_evaluation(model, labels, data):
    """Return a text report of three clustering scores plus the contingency table.

    The report lists the adjusted Rand index, the homogeneity score and the
    silhouette score (metric 'l2') of ``model.labels_``, one per line.  The
    second returned value is ``contingency_matrix(labels, model.labels_)``.
    """
    ari = metrics.adjusted_rand_score(labels, model.labels_)
    homogeneity = metrics.homogeneity_score(labels, model.labels_)
    silhouette = metrics.silhouette_score(data, model.labels_, metric='l2')

    report_lines = [
        " Adjusted Rand Index : " + str(ari),
        " Homogeneity Score : " + str(homogeneity),
        " Silhoutte Score : " + str(silhouette),
    ]
    return "\n".join(report_lines), contingency_matrix(labels, model.labels_)
def calculate_clusteringAccuracy(labels_true, labels_pred):
    """Flatten both labelings and pass their contingency table on.

    The result is whatever ``get_IndicesClusterxClass`` returns for the
    contingency matrix of the flattened labels.
    """
    flat_true = np.array(labels_true).reshape(-1)
    flat_pred = np.array(labels_pred).reshape(-1)
    return get_IndicesClusterxClass(contingency_matrix(flat_true, flat_pred))
def print_5_measure(y_true, y_pred):
    """Return five clustering scores and the contingency matrix.

    Despite the name nothing is printed.  The first returned value is the list
    [homogeneity, completeness, v-measure, adjusted Rand index, adjusted
    mutual information]; the second is ``contingency_matrix(y_true, y_pred)``.
    """
    scorers = (
        metrics.homogeneity_score,
        metrics.completeness_score,
        metrics.v_measure_score,
        metrics.adjusted_rand_score,
        metrics.adjusted_mutual_info_score,
    )
    scores = [scorer(y_true, y_pred) for scorer in scorers]
    return scores, contingency_matrix(y_true, y_pred)
def RandIndex(labels_true, labels_pred):
    """Rand index of two labelings, computed from pair counts.

    ``a`` counts pairs that share a contingency cell, ``b`` and ``c`` are the
    pairs sharing only a column or only a row, and ``d`` is every remaining
    pair.  The score is ``(a + d) / comb(n_samples, 2)``.
    """
    n_samples = len(labels_true)
    table = contingency_matrix(labels_true, labels_pred, sparse=True)
    column_totals = np.ravel(table.sum(axis=0))
    row_totals = np.ravel(table.sum(axis=1))

    a = sum(comb2(cell) for cell in table.data)
    b = sum(comb2(total) for total in column_totals) - a
    c = sum(comb2(total) for total in row_totals) - a

    all_pairs = comb(n_samples, 2)
    d = all_pairs - a - b - c
    return (a + d) / all_pairs
def confusion_matrix(labels_true, labels_pred):
    """Pair-counting confusion values for two labelings.

    All counts are over ordered sample pairs, so the grand total is
    ``n_samples * (n_samples - 1)``.  Returns ``(tp, tn, fp, fn)``.
    """
    n_samples = len(labels_true)
    table = contingency_matrix(labels_true, labels_pred, sparse=True)

    # Sums of squared marginals along each axis of the table.
    column_sq = np.sum(np.asarray(table.sum(axis=0)).ravel()**2)
    row_sq = np.sum(np.asarray(table.sum(axis=1)).ravel()**2)

    tp = np.dot(table.data, table.data) - n_samples
    fp = column_sq - n_samples - tp
    fn = row_sq - n_samples - tp
    tn = n_samples * (n_samples - 1) - (tp + fn + fp)
    return tp, tn, fp, fn
def purity(labels_true, labels_pred):
    '''
    Purity between a reference partition and a clustering.

    input:
        labels_true: an array of labels of the given partitions
        labels_pred: an array of labels of the clusters
    return:
        the sum, over clusters, of the largest cell in each column of the
        contingency matrix divided by the total count
    '''
    cmat = contingency_matrix(labels_true, labels_pred)
    # Share of all samples taken by the dominant partition of each cluster.
    dominant_share = cmat.max(axis=0) / cmat.sum()
    return dominant_share.sum()
def purity_weights(gt, pred):
    """Per-sample 0/1 weights: 1 where a sample's cluster is 'pure' for its label.

    Each cluster is assigned the ground-truth label it overlaps most (argmax
    over its contingency column).  A sample gets weight 1 when its own
    ground-truth label equals the label assigned to its cluster, else 0.

    Labels may be arbitrary values.  The contingency matrix orders rows and
    columns by the sorted unique labels, so indices are translated through
    ``np.unique`` rather than using label values as indices.  For labels that
    already are 0..k-1 the result is unchanged.

    :param gt: 1-D sequence of ground-truth labels
    :param pred: 1-D sequence of cluster labels, same length as ``gt``
    :return: integer array of 0/1 weights, one per sample
    """
    gt = np.asarray(gt)

    # Build the contingency matrix (rows: gt labels, columns: clusters)
    cmat = contingency_matrix(gt, pred)

    # Sorted label values give the meaning of each row; the inverse index
    # gives the column of every sample's cluster.
    gt_values = np.unique(gt)
    _, pred_columns = np.unique(pred, return_inverse=True)

    # Find assignments based on a purity criterion: cluster -> gt label
    pure_assignments = gt_values[np.argmax(cmat, axis=0)]

    # A weight for each time-step (= 1 if the cluster's assigned gt label
    # matches the sample's gt label, otherwise = 0)
    return (gt == pure_assignments[pred_columns]).astype(int)
def purity_score(y_true, y_pred):
    """Print the contingency matrix and return the matched fraction.

    Rows and columns are paired one-to-one with ``linear_sum_assignment`` on
    the negated table, i.e. maximising total overlap.  The return value is the
    matched overlap divided by the total count.  A label line is printed
    without a trailing newline, so a caller's print continues on that line.
    """
    table = contingency_matrix(y_true, y_pred)
    print("contingency_matrix")
    print(table)

    matched_rows, matched_cols = linear_sum_assignment(-table)
    print("Purity-score is:", end='')

    matched_total = table[matched_rows, matched_cols].sum()
    return matched_total / (np.sum(table))
def ConditionalEntropy(clusters, partitions):
    """Entropy (base 10) of the partition distribution inside each cluster.

    For cluster ``i`` with row counts ``n_ij`` and total ``n_i`` this computes
    ``-sum_j (n_ij / n_i) * log10(n_ij / n_i)``.

    Two defects of the earlier version are fixed:
    * empty cells are skipped (convention ``0 * log 0 = 0``) instead of
      turning the result into nan;
    * the accumulator is per cluster, so entry ``i`` no longer also contains
      the entropies of clusters ``0..i-1``.

    :param clusters: cluster label of every sample
    :param partitions: reference partition label of every sample
    :return: list with one entropy value per cluster (contingency row order)
    """
    contigencyTable = contingency_matrix(clusters, partitions)
    H = []
    for row in contigencyTable:
        ni = np.sum(row)
        fractions = row[row > 0] / ni
        # "0.0 - x" rather than "-x" so a pure cluster yields 0.0, not -0.0.
        H.append(0.0 - np.sum(fractions * np.log10(fractions)))
    return H
def Fmeasure(clusters, partitions):
    """Mean per-cluster F-measure against a reference partition.

    For every cluster ``i`` the best-matching partition ``j`` is the column
    with the largest count in row ``i``.  With ``nij`` that count, ``ni`` the
    row total and ``mji`` the column total, the cluster's score is
    ``2 * nij / (ni + mji)``.  The scores are averaged over clusters.

    The earlier version used the maximum *count* as a column index and added
    the whole vector of row maxima for every cluster; both are corrected.

    :param clusters: cluster label of every sample
    :param partitions: reference partition label of every sample
    :return: scalar mean F-measure
    """
    contigencyTable = contingency_matrix(clusters, partitions)
    # Column index (not value) of the dominant partition for each cluster.
    best_partition = contigencyTable.argmax(axis=1)

    F = 0
    for i, j in enumerate(best_partition):
        nij = contigencyTable[i, j]
        ni = np.sum(contigencyTable[i])
        mji = np.sum(contigencyTable[:, j])
        F += 2 * nij / (ni + mji)
    return F / contigencyTable.shape[0]
def get_cluster_data(X, y, name, km_k, gmm_k, rdir, pdir, perplexity=30):
    """Generates 2D dataset that contains cluster labels for K-Means and GMM,
    as well as the class labels for the given dataset.

    Contingency matrices of the class labels against the K-Means and GMM
    clusterings of the original ``X`` are handed to
    ``generate_contingency_matrix``.  Both models are then re-fitted on the
    t-SNE embedding to produce the cluster columns of the saved CSV.

    Args:
        X (Numpy.Array): Attributes.
        y (Numpy.Array): Labels.
        name (str): Dataset name.
        km_k (int): Number of clusters for K-Means.
        gmm_k (int): Number of components for GMM.
        rdir (str): Folder to save results CSV.
        pdir (str): Passed through to ``generate_contingency_matrix``
            (presumably the output folder for its plots -- confirm there).
        perplexity (int): Perplexity parameter for t-SNE.

    """
    print('get_cluster_data: %s' % name)

    # generate 2D X dataset
    X2D = TSNE(n_iter=5000, perplexity=perplexity).fit_transform(X)

    # get cluster labels using best k
    km = KMeans(random_state=0).set_params(n_clusters=km_k)
    gmm = GMM(random_state=0).set_params(n_components=gmm_k)
    km_contingency_matrix = contingency_matrix(y, km.fit(X).labels_)
    gm_contingency_matrix = contingency_matrix(y, gmm.fit(X).predict(X))

    # Function-call form: the bare `print x` statement is a SyntaxError on
    # Python 3, and this form prints the same thing on Python 2.
    print(km.cluster_centers_)

    generate_contingency_matrix(
        km_contingency_matrix,
        gm_contingency_matrix,
        name,
        pdir,
    )

    # Re-fit on the 2D embedding; labels become column vectors for hstack.
    km_cl = np.atleast_2d(km.fit(X2D).labels_).T
    gmm_cl = np.atleast_2d(gmm.fit(X2D).predict(X2D)).T
    y = np.atleast_2d(y).T

    # create concatenated dataset
    cols = ['x1', 'x2', 'km', 'gmm', 'class']
    df = pd.DataFrame(np.hstack((X2D, km_cl, gmm_cl, y)), columns=cols)

    # save as CSV
    filename = '{}_2D.csv'.format(name)
    save_dataset(df, filename, sep=',', subdir=rdir, header=True)
def class_cluster_match(y_true, y_pred):
    r"""Translate prediction labels to maximize the accuracy.

    Translate the prediction labels of a clustering output to enable calc
    of external metrics (eg. accuracy, f1_score, ...). Translation is done by
    maximization of the contingency matrix :math:`C` main diagonal sum
    :math:`\sum_{i=0}^{K}C_{i, i}`, i.e. clusters and classes are paired
    one-to-one with ``linear_sum_assignment`` so total overlap is maximal.

    When there are more clusters than classes, the surplus clusters are
    mapped to placeholder names ``'DEF_CLASS<i>'``.

    Parameters
    ----------
    y_true : array, shape = [n_samples]
        Ground truth (correct) target values.

    y_pred : array, shape = [n_samples]
        Estimated targets as returned by a clustering algorithm.

    Returns
    -------
    trans : list, length n_samples
        For every entry of ``y_pred``, the class paired with its cluster.
    """
    classes = unique_labels(y_true).tolist()
    n_classes = len(classes)
    clusters = unique_labels(y_pred).tolist()
    n_clusters = len(clusters)

    # Pad the shorter side with placeholder names so both lists end up with
    # the same length.
    if n_clusters > n_classes:
        classes += [
            'DEF_CLASS' + str(i) for i in range(n_clusters - n_classes)
        ]
    elif n_classes > n_clusters:
        clusters += [
            'DEF_CLUSTER' + str(i) for i in range(n_classes - n_clusters)
        ]

    C = contingency_matrix(y_true, y_pred)
    true_idx, pred_idx = linear_sum_assignment(-C)

    # Optimally matched pairs first, then the leftovers in sorted order.
    matched_classes = [classes[idx] for idx in true_idx.tolist()]
    matched_classes += sorted(set(classes) - set(matched_classes))
    matched_clusters = [clusters[idx] for idx in pred_idx.tolist()]
    matched_clusters += sorted(set(clusters) - set(matched_clusters))

    # Cluster names are unique, so a dict gives the same answer as a
    # per-sample list.index() lookup in O(1).
    translation = dict(zip(matched_clusters, matched_classes))
    return [translation[y] for y in y_pred]
def doClusters(num_clusters, reducer, X, opt_file, i):
    """Reduce ``X`` with the chosen reducer, cluster with K-Means, log results.

    ``reducer`` is one of 'pca', 'kpca,lin', 'kpca,poly', 'kpca,cos',
    'kpca,sig' or 'none'; ``i`` is the number of components.  With
    'none' the function returns immediately unless ``i == 141``; in that case
    ``X`` is clustered unreduced and reported as 'tfidf' with 18872 dims.

    Timings go to the module-level ``time_file``, a summary row to the
    module-level ``table`` and the contingency matrix to ``write2d``.  For
    'pca' with ``i == 141`` every component's per-word loading is also dumped
    to one text file per component.

    Relies on module-level names not visible here: ``folder_name``, ``words``,
    ``time_file``, ``table``, ``actuallabels`` and ``write2d``.
    """
    kpca_kernels = {
        'kpca,lin': 'linear',
        'kpca,poly': 'poly',
        'kpca,cos': 'cosine',
        'kpca,sig': 'sigmoid',
    }

    start = time.time()
    if reducer == 'pca':
        pca = PCA(n_components=i)
        X = pca.fit_transform(X)
        if i == 141:
            for j in range(i):
                path = folder_name + "-out" + "\\pca\\pca-" + str(j) + ".txt"
                # `with` closes each dump file; they were previously left
                # open, leaking one handle per component.
                with io.open(path, 'w', encoding="utf-8") as out:
                    out.write("num_words " + str(len(words)) + "\n\n")
                    for val in range(len(pca.components_[j])):
                        out.write(words[val] + " :" +
                                  str(pca.components_[j][val]) + "\n")
    elif reducer in kpca_kernels:
        kpca = KernelPCA(n_components=i, kernel=kpca_kernels[reducer])
        X = kpca.fit_transform(X)
    elif reducer == 'none' and i != 141:
        return
    rt = time.time() - start

    start = time.time()
    km = KMeans(n_clusters=num_clusters, init='k-means++', n_init=20,
                random_state=0)
    y = km.fit_predict(X)
    ct = time.time() - start

    if reducer == 'none':
        reducer = 'tfidf'
        i = 18872

    time_file.write("\n" + reducer + "--" + str(i) + "\n" + str(rt) + "\n" +
                    str(ct) + "\n" + str(rt + ct) + "\n")
    print("reducer: " + reducer + ": " + str(i) + " dims - done")

    confusion = contingency_matrix(actuallabels, y)
    dr = (i / 141) * 100
    db = round(metrics.davies_bouldin_score(X, y), 4)
    table.write(
        str(i) + ", " + str(round(100 - dr, 4)) + ", " + str(db) + ", ")
    write2d(opt_file, reducer + "--" + str(i), confusion, actuallabels, y, db)
def fowlkes_mallows_score(gt_labels, pred_labels, sparse=True):
    """Pairwise precision, pairwise recall and their F-score for two labelings.

    :param gt_labels: 1-D array of ground-truth labels
    :param pred_labels: 1-D array of predicted labels
    :param sparse: build the contingency matrix in sparse form (default) or
        dense form; both give the same scores
    :return: ``(avg_pre, avg_rec, fscore)`` where ``fscore`` comes from
        ``_compute_fscore(avg_pre, avg_rec)``

    There is no guard for a labeling in which every sample is alone in its
    cluster (``pk`` or ``qk`` equal to 0).
    """
    n_samples, = gt_labels.shape
    c = contingency_matrix(gt_labels, pred_labels, sparse=sparse)

    # A sparse matrix stores its non-zero counts in `.data`.  On a dense
    # ndarray `.data` is a raw buffer, so flatten the array instead; zero
    # cells add nothing to the dot product.
    counts = c.data if sparse else np.asarray(c).ravel()

    tk = np.dot(counts, counts) - n_samples
    pk = np.sum(np.asarray(c.sum(axis=0)).ravel()**2) - n_samples
    qk = np.sum(np.asarray(c.sum(axis=1)).ravel()**2) - n_samples

    avg_pre = tk / pk
    avg_rec = tk / qk
    fscore = _compute_fscore(avg_pre, avg_rec)
    return avg_pre, avg_rec, fscore
def k_means(k, dimensions):
    """Run a plain k-means on ``dimensions`` and return the cluster purity.

    Centres are drawn from a normal distribution scaled to the per-column
    mean and standard deviation of the data.  Iteration stops once the
    centres no longer move.  Purity is then measured against the module-level
    ``lclass`` labels and divided by ``len(label)`` (another module-level
    name); per-cluster purities are printed on the way.

    :param k: number of clusters
    :param dimensions: 2-D array, one row per sample (assumed to be a NumPy
        array -- confirm against callers)
    :return: overall purity
    """
    rows = dimensions.shape[0]
    cols = dimensions.shape[1]

    mn = np.mean(dimensions, axis=0)
    std = np.std(dimensions, axis=0)
    centers = np.random.randn(k, cols) * std + mn

    co = np.zeros(centers.shape)  # previous centres
    cn = deepcopy(centers)        # current centres

    clusters = np.zeros(rows)
    distances = np.zeros((rows, k))

    error = np.linalg.norm(cn - co)

    # When, after an update, the estimate of every centre stays the same,
    # exit the loop.
    while error != 0:
        # Measure the distance to every centre
        for i in range(k):
            distances[:, i] = np.linalg.norm(dimensions - cn[i], axis=1)
        # Assign all training data to the closest centre
        clusters = np.argmin(distances, axis=1)

        co = deepcopy(cn)
        # Move each centre to the mean of its members.  A cluster with no
        # members keeps its previous centre: the mean of an empty selection
        # is nan, which would poison `error` and keep the loop from ending.
        for i in range(k):
            members = dimensions[clusters == i]
            if len(members):
                cn[i] = np.mean(members, axis=0)
        error = np.linalg.norm(cn - co)

    cmat = contingency_matrix(clusters, lclass)
    for i, item in enumerate(cmat):
        print("Purity of clusters :", i, " :", max(item) * 100 / sum(item))

    pure = 0
    for row in cmat:
        pure += max(row)
    purity0 = pure / len(label)
    return purity0
def calculate_accuracy(labels, pred_labels):
    """Accuracy after translating predicted cluster ids to class ids.

    ``label_map[r]`` is the contingency column with the largest count in row
    ``r``.  A predicted value found at position ``r`` of that list is
    translated to ``r + 1``; a value that does not occur is translated to 0.
    The translated predictions are scored with ``accuracy_score``.
    """
    label_map = np.argmax(contingency_matrix(labels, pred_labels),
                          axis=1).tolist()

    def to_class(cluster):
        # First row whose dominant column equals this cluster id, 1-based.
        try:
            position = label_map.index(cluster)
        except ValueError:
            return 0
        return position + 1

    mapped_pred_labels = [to_class(cluster) for cluster in pred_labels]
    return accuracy_score(labels, mapped_pred_labels)
def fms_compare(XX, YY, npoints, plot_title, plot_save):
    """Plot Fowlkes-Mallows B_k for two hierarchical clusterings, with bounds.

    Both inputs are clustered with ``hierarchy.linkage`` using the
    module-level ``clustering_method``.  For every k in 1..npoints the two
    dendrograms are cut into k clusters and scored with ``fms``.  For
    k < npoints a mean and a two-standard-deviation bound are computed from
    the contingency marginals; both are fixed to 0 at k == npoints.  The
    scatter of scores and the mean / upper / lower curves are saved to
    ``path_fm_plot + plot_save + '.jpg'`` and the figure is cleared.
    """
    # Clustering
    linkage_x = hierarchy.linkage(XX, method=clustering_method)
    linkage_y = hierarchy.linkage(YY, method=clustering_method)

    # Cut dendrogram to obtain labelling for each k value.
    # Warning (from the original author): hierarchy.cut_tree has a known bug.
    fms_dict = {}
    mean_dict = {npoints: 0}
    varbound_dict = {npoints: 0}

    for i in range(1, npoints + 1):
        cut_x = [l for sublist in hierarchy.cut_tree(linkage_x, i)
                 for l in sublist]
        cut_y = [l for sublist in hierarchy.cut_tree(linkage_y, i)
                 for l in sublist]

        # Compute FM scores
        fms_dict[i] = fms(cut_x, cut_y)

        # Compute moments for plotting and analysis
        c = contingency_matrix(cut_x, cut_y, sparse=True)
        column_sums = np.asarray(c.sum(axis=0)).ravel()
        row_sums = np.asarray(c.sum(axis=1)).ravel()

        pk = np.sum(column_sums ** 2) - npoints
        qk = np.sum(row_sums ** 2) - npoints
        pk2 = (np.sum(column_sums ** 3) - 3*(np.sum(column_sums ** 2))
               + 2*(np.sum(column_sums)))
        qk2 = (np.sum(row_sums ** 3) - 3*(np.sum(row_sums ** 2))
               + 2*(np.sum(row_sums)))

        if i < npoints:
            mean_dict[i] = (np.sqrt(pk*qk)) / (npoints*(npoints-1))

            term_pairs = 2/(npoints*(npoints-1))
            term_triples = ((4*pk2*qk2)
                            / (npoints*(npoints-1)*(npoints-2)*pk*qk))
            term_quads = (((pk-2-((4*pk2)/pk))*(qk-2-((4*qk2)/qk)))
                          / (npoints*(npoints-1)*(npoints-2)*(npoints-3)))
            term_mean_sq = (pk*qk)/((npoints**2)*((npoints-1)**2))
            variance = term_pairs + term_triples + term_quads - term_mean_sq

            varbound_dict[i] = 2 * (variance**0.5)

    # Plot Bk and variance bounds
    x, z = zip(*sorted(fms_dict.items()))
    upper = [mean_dict[i]+varbound_dict[i] for i in x]
    lower = [mean_dict[i]-varbound_dict[i] for i in x]
    means = [mean_dict[i] for i in x]

    plt.scatter(x, z)
    plt.plot(x, upper)
    plt.plot(x, means)
    plt.plot(x, lower)
    plt.title(plot_title)
    plt.xlabel('# clusters')
    plt.ylabel('B_k')
    plt.savefig(path_fm_plot + plot_save + '.jpg')
    plt.clf()
def eval_cluster(gt_labels, pred_labels, sparse=True):
    """Pairwise recall, precision and F-score of a clustering.

    :param gt_labels: 1-D array of ground-truth labels
    :param pred_labels: 1-D array of predicted labels
    :param sparse: build the contingency matrix in sparse form (default) or
        dense form; both give the same scores
    :return: ``(rec, pre, fscore)`` -- note recall comes first

    There is no guard for a zero pair count (``pk`` or ``qk`` equal to 0) or
    for ``pre + rec == 0``.
    """
    # Take the sample count as a scalar.  It used to be the whole shape
    # tuple, which only worked through broadcasting to a length-1 array that
    # was unpacked again with [0].
    n_samples = gt_labels.shape[0]
    c = contingency_matrix(gt_labels, pred_labels, sparse=sparse)

    # `.data` holds the non-zero counts only for a sparse matrix; a dense
    # array must be flattened instead.
    counts = c.data if sparse else np.asarray(c).ravel()

    tk = np.dot(counts, counts) - n_samples
    pk = np.sum(np.asarray(c.sum(axis=0)).ravel() ** 2) - n_samples
    qk = np.sum(np.asarray(c.sum(axis=1)).ravel() ** 2) - n_samples

    pre = tk / pk
    rec = tk / qk
    fscore = 2. * pre * rec / (pre + rec)
    return rec, pre, fscore
def calculate_purity(labels_true, labels_pred):
    """Overall purity and per-cluster purity of a clustering.

    For each predicted cluster (contingency column) the purity is the largest
    cell divided by the column total.  The overall purity is the average of
    these values weighted by cluster size.

    :param labels_true: ground-truth labels, any shape (flattened)
    :param labels_pred: predicted labels, any shape (flattened)
    :return: ``(purity, purityVector)`` -- a scalar and an array with one
        entry per distinct predicted label
    """
    labels_true = np.array(labels_true)
    labels_true = labels_true.reshape(labels_true.size)
    labels_pred = np.array(labels_pred)
    labels_pred = labels_pred.reshape(labels_pred.size)

    k = np.size(np.unique(labels_pred))
    purityVector = np.zeros(k)
    purity = 0
    matrix = as_float_array(contingency_matrix(labels_true, labels_pred))
    n_samples = np.size(labels_pred)

    # `range` / `float` replace `xrange` / `np.float`, which do not exist on
    # Python 3 and current NumPy; the values computed are the same.
    for i in range(k):
        column_total = np.sum(matrix[:, i])
        purityVector[i] = float(np.max(matrix[:, i])) / column_total
        purity += purityVector[i] * column_total / n_samples
    return purity, purityVector
def _compute_counts(y_true, y_pred):
    """Derive four pair-style counts from the contingency matrix.

    The dominant cell of each row and of each column is compared with the
    rest of that row or column.  With ``total`` the number of samples, the
    four counts add up to ``total * (total - 1)``.

    Returns:
      Tuple ``(true_positives, false_positives, false_negatives,
      true_negatives)``.
    """
    table = contingency_matrix(y_true, y_pred)

    row_peak = table.max(axis=1)
    column_peak = table.max(axis=0)
    row_rest = table.sum(axis=1) - row_peak
    column_rest = table.sum(axis=0) - column_peak
    total = table.sum()

    true_positives = (row_peak * (row_peak - 1)).sum()
    false_positives = (row_rest * row_peak * 2).sum()
    false_negatives = (column_rest * column_peak * 2).sum()
    true_negatives = (total * (total - 1)
                      - true_positives - false_positives - false_negatives)
    return true_positives, false_positives, false_negatives, true_negatives
def IrisKNN(K):
    """Fit a KNN classifier and return the negated label-agnostic hit rate.

    ``K`` supplies, in order, ``n_neighbors``, ``leaf_size`` and ``p`` for
    ``KNeighborsClassifier`` (each cast to int).  The model is trained on the
    module-level ``X_train`` / ``y_train`` and evaluated on ``X_test`` /
    ``y_test``.

    The score is the sum of each contingency row's largest cell divided by
    the number of predictions, i.e. the share of hits regardless of the names
    given to the groups.  It is negated before being returned (presumably so
    a minimiser can maximise it -- confirm against the caller).
    """
    # Fit the KNN algorithm
    neigh = KNeighborsClassifier(n_neighbors=int(K[0]), leaf_size=int(K[1]),
                                 p=int(K[2]), weights="uniform")
    neigh.fit(X_train, y_train)

    # Predict the values of the test portion
    y_pred = neigh.predict(X_test)

    # Contingency matrix: rows are predicted labels, columns are test labels.
    contMatrix = contingency_matrix(y_pred, y_test)

    # Hits are the dominant cell of every row.  Taking the maximum per row
    # does not assume a square matrix.  The previous inner loop used the row
    # count as the column range, which fails for non-square tables, and its
    # per-row percentages were never used.
    totalHits = contMatrix.max(axis=1).sum()

    # Overall share of hits, ignoring the names given to the clusters.
    globalScore = totalHits / len(y_pred)
    return -globalScore
def fit_labels(gt_labels, tested_labels):
    """Greedily rename tested labels to the ground-truth labels they overlap most.

    Tested classes are processed from most to least frequent.  Each one is
    renamed to the ground-truth class with the largest remaining overlap;
    that ground-truth class and the processed tested class are then removed
    from further consideration by overwriting their row / column with -1.

    Labels may be arbitrary values.  Columns are addressed by their position
    in the sorted unique tested labels, not by the label value itself.  For
    labels 0..k-1 this is the same column as before.

    :param gt_labels: 1-D sequence of ground-truth labels
    :param tested_labels: 1-D sequence of labels to rename
    :return: array shaped like ``tested_labels`` holding the renamed labels
    """
    # As an array, so the boolean masks below work for list input as well.
    tested_labels = np.asarray(tested_labels)

    gt_unique_classes = np.unique(gt_labels)
    tested_unique_classes, tested_classes_count = np.unique(
        tested_labels, return_counts=True)
    # Column positions ordered from most to least frequent tested class.
    column_order = np.argsort(-tested_classes_count)

    con_mat = contingency_matrix(gt_labels, tested_labels)
    tested_labels_remap = np.copy(tested_labels)

    for col in column_order:
        new_label_idx = np.argmax(con_mat[:, col])
        tested_labels_remap[
            tested_labels == tested_unique_classes[col]
        ] = gt_unique_classes[new_label_idx]
        # Take this tested class and the chosen gt class out of the running.
        con_mat[:, col] = -1
        con_mat[new_label_idx, :] = -1
    return tested_labels_remap
def k_means_clustering(training_data, target_labels, title='Contingency Matrix',
                       n_clusters=20, random_state=0, max_iter=1000, n_init=30):
    """Fit K-Means, plot the reordered contingency matrix, report five scores.

    The contingency matrix of ``target_labels`` against the cluster labels is
    permuted with ``linear_sum_assignment(..., maximize=True)`` before being
    passed to ``plot_contingency_table_20``.  Homogeneity, completeness,
    V-measure, adjusted Rand index and adjusted mutual information are printed
    and returned.

    Returns:
        ``(results, km)``: the dict of scores and the fitted estimator.
    """
    started = time.time()
    km = KMeans(n_clusters=n_clusters, random_state=random_state,
                max_iter=max_iter, n_init=n_init)
    km.fit(training_data)
    print("Finished clustering in %f seconds" % (time.time() - started))

    cm = contingency_matrix(target_labels, km.labels_)
    # reorder to maximize along diagonal
    rows, cols = linear_sum_assignment(cm, maximize=True)
    reordered = cm[np.ix_(rows, cols)]
    print("Show Contingency Matrix:")
    plot_contingency_table_20(reordered, title=title)

    print("Report 5 Measures for K-Means Clustering")
    results = {
        "homogeneity": homogeneity_score(target_labels, km.labels_),
        "completeness": completeness_score(target_labels, km.labels_),
        "v_measure": v_measure_score(target_labels, km.labels_),
        "adjusted_rand_index": adjusted_rand_score(target_labels, km.labels_),
        "adjusted_mutual_info": adjusted_mutual_info_score(target_labels,
                                                           km.labels_),
    }

    print("Homogeneity Score: %f" % results["homogeneity"])
    print("Completeness Score: %f" % results["completeness"])
    print("V-Measure Score: %f" % results["v_measure"])
    print("Adjusted Rand Index: %f" % results["adjusted_rand_index"])
    print("Adjusted Mutual Information: %f" % results["adjusted_mutual_info"])

    return results, km
def annotation(cellname_train, cellname_test, Y_pred_train, Y_pred_test):
    """Annotate clusters with cell types learned on the training split.

    Every training cell type is tied to the cluster holding most of its
    cells.  Its score is the fraction of its cells in that cluster, and a
    type scoring below 0.5 is replaced by "unassigned".  When several types
    pick the same cluster, the best-scoring one wins.  The resulting
    cluster -> cell type map is applied to both splits.

    Inputs are assumed to be NumPy arrays, since boolean-mask indexing is
    used on them -- confirm against callers.

    :return: ``(annotated_train_accuracy, annotated_test_accuracy,
        test_annotation_label)``.  The train accuracy is over all training
        cells.  The test accuracy is over test cells whose type also occurs
        in training.  Test cells in a cluster without annotation are labelled
        "unassigned".
    """
    placeholder = "original versions for unassigned cell ontology types"

    train_confusion_matrix = contingency_matrix(cellname_train, Y_pred_train)
    annotated_cluster = np.unique(Y_pred_train)[
        train_confusion_matrix.argmax(axis=1)]

    # Object dtype: a fixed-width unicode array would silently truncate
    # "unassigned" when all cell type names are shorter than that word.
    annotated_celltype = np.unique(cellname_train).astype(object)
    annotated_score = (np.max(train_confusion_matrix, axis=1)
                       / np.sum(train_confusion_matrix, axis=1))
    annotated_celltype[annotated_score < 0.5] = "unassigned"

    # Resolve clusters claimed by several cell types: keep the best score.
    final_annotated_cluster = []
    final_annotated_celltype = []
    for i in np.unique(annotated_cluster):
        claimed = annotated_cluster == i
        candidate_celltype = annotated_celltype[claimed]
        candidate_score = annotated_score[claimed]
        final_annotated_cluster.append(i)
        final_annotated_celltype.append(
            candidate_celltype[np.argmax(candidate_score)])
    annotated_cluster = np.array(final_annotated_cluster)
    annotated_celltype = np.array(final_annotated_celltype)

    succeed_annotated_train = 0
    succeed_annotated_test = 0
    # The long placeholder marks cells not yet labelled; it is rewritten to
    # "unassigned" at the end.
    test_annotation_label = np.array([placeholder] * len(cellname_test))
    for i in range(len(annotated_cluster)):
        succeed_annotated_train += (
            cellname_train[Y_pred_train == annotated_cluster[i]]
            == annotated_celltype[i]).sum()
        succeed_annotated_test += (
            cellname_test[Y_pred_test == annotated_cluster[i]]
            == annotated_celltype[i]).sum()
        test_annotation_label[
            Y_pred_test == annotated_cluster[i]] = annotated_celltype[i]

    annotated_train_accuracy = np.around(
        succeed_annotated_train / len(cellname_train), 4)

    # Only test cells whose type was seen in training can be annotated.
    total_overlop_test = 0
    for celltype in np.unique(cellname_train):
        total_overlop_test += (cellname_test == celltype).sum()
    annotated_test_accuracy = np.around(
        succeed_annotated_test / total_overlop_test, 4)

    test_annotation_label[test_annotation_label == placeholder] = "unassigned"
    return (annotated_train_accuracy, annotated_test_accuracy,
            test_annotation_label)
def plot_contingency_matrix(ax, labels_audio, labels, cmap=plt.cm.Blues,
                            normalize=True):
    """Draw the contingency matrix of two labelings as an annotated heat map.

    :param ax: matplotlib axes to draw on
    :param labels_audio: labels shown on the y axis
    :param labels: labels shown on the x axis
    :param cmap: colour map passed to ``imshow``
    :param normalize: if True (default) each row is divided by its sum and
        cells are printed with two decimals; if False the raw counts are
        shown as integers.  Previously the matrix was normalised regardless,
        and ``normalize=False`` failed when formatting floats with 'd'.
    :return: ``ax``

    Side effect: sets NumPy's global print precision to 2.
    """
    np.set_printoptions(precision=2)

    # Compute contingency matrix
    matrix = contingency_matrix(labels_audio, labels)
    if normalize:
        cm = np.array([i / np.sum(i) for i in matrix])
        title = 'Normalized contingency matrix'
    else:
        cm = np.asarray(matrix)
        title = 'Contingency matrix'

    # Only use the labels that appear in the data
    labels_audio = np.unique(labels_audio)  # ylabels
    labels = np.unique(labels)  # xlabels

    im = ax.imshow(cm, interpolation='nearest', cmap=cmap)
    ax.figure.colorbar(im, ax=ax)
    # We want to show all ticks...
    ax.set(
        xticks=np.arange(cm.shape[1]),
        yticks=np.arange(cm.shape[0]),
        # ... and label them with the respective list entries
        xticklabels=labels,
        yticklabels=labels_audio,
        title=title,
        ylabel='True label [Audio]',
        xlabel='Predicted label [Subjetive]')

    # Set the alignment of the x tick labels.
    plt.setp(ax.get_xticklabels(), ha="right", rotation_mode="anchor")

    # Loop over data dimensions and create text annotations; text is white
    # on cells above half of the maximum so it stays readable.
    fmt = '.2f' if normalize else 'd'
    thresh = cm.max() / 2.
    for i in range(cm.shape[0]):
        for j in range(cm.shape[1]):
            ax.text(j, i, format(cm[i, j], fmt), ha="center", va="center",
                    color="white" if cm[i, j] > thresh else "black")
    ax.figure.tight_layout()
    return ax
def over_division_cluster_ratio(labels_true, labels_pred):
    """Fraction of true clusters that are spread over several predicted ones.

    A true cluster counts as over-divided when its contingency row has more
    than one non-zero cell.  The count is divided by the number of distinct
    true labels.
    """
    from sklearn.metrics.cluster import contingency_matrix

    a_true = np.array(labels_true)
    a_pred = np.array(labels_pred)
    n_cluster = np.unique(a_true).shape[0]

    # Row index of every non-zero cell; a row repeated more than once means
    # that true cluster overlaps several predicted clusters.
    nz_true, _ = contingency_matrix(a_true, a_pred, sparse=True).nonzero()
    _, cells_per_true = np.unique(nz_true, return_counts=True)
    n_fail_cluster = int(np.count_nonzero(cells_per_true > 1))

    return 1. * n_fail_cluster / n_cluster
def rel_purity(y_true, y_pred):
    """Purity in which each cluster's class is picked by relative share.

    Every contingency row is divided by its row total.  For each column the
    row with the largest relative value is selected, and the raw counts at
    the selected cells are summed and divided by the total count.
    """
    cm = contingency_matrix(y_true, y_pred)
    row_totals = np.sum(cm, axis=1)

    # Relative contingency matrix: share of each true class per cluster.
    relative = np.zeros(cm.shape)
    relative[:] = cm / row_totals[:, np.newaxis]

    best_rows = np.argmax(relative, axis=0)
    matched = cm[best_rows, np.arange(cm.shape[1])].sum()
    return matched / np.sum(cm)
def fowlkes_mallows_score(gt_labels, pred_labels, sparse=True):
    '''
    The original function is from `sklearn.metrics.fowlkes_mallows_score`.
    We output the pairwise precision, pairwise recall and F-measure,
    instead of calculating the geometric mean of precision and recall.

    gt_labels and pred_labels are 1-D arrays of equal length. `sparse`
    selects a sparse (default) or dense contingency matrix; both give the
    same scores. Returns (avg_pre, avg_rec, fscore), where fscore comes from
    `_compute_fscore(avg_pre, avg_rec)`.

    There is no guard for a zero pair count (pk or qk equal to 0).
    '''
    n_samples, = gt_labels.shape
    c = contingency_matrix(gt_labels, pred_labels, sparse=sparse)

    # A sparse matrix stores its non-zero counts in `.data`.  On a dense
    # ndarray `.data` is a raw buffer, so flatten the array instead; zero
    # cells add nothing to the dot product.
    counts = c.data if sparse else np.asarray(c).ravel()

    tk = np.dot(counts, counts) - n_samples
    pk = np.sum(np.asarray(c.sum(axis=0)).ravel()**2) - n_samples
    qk = np.sum(np.asarray(c.sum(axis=1)).ravel()**2) - n_samples

    avg_pre = tk / pk
    avg_rec = tk / qk
    fscore = _compute_fscore(avg_pre, avg_rec)
    return avg_pre, avg_rec, fscore
def GMM_fun(dimensions):
    """Fit a 5-component Gaussian mixture and print cluster purities.

    The predicted component of every sample is compared with the
    module-level ``lclass`` labels.  One purity percentage is printed per
    component, followed by the overall purity (sum of row maxima divided by
    ``len(label)``, another module-level name).  Nothing is returned.
    """
    mixture = GaussianMixture(n_components=5).fit(dimensions)
    gmmlabel = mixture.predict(dimensions)

    cmat = contingency_matrix(gmmlabel, lclass)
    for index, row in enumerate(cmat):
        print("Purity of clusters :", index, " :", max(row) * 100 / sum(row))

    dominant_total = sum(max(row) for row in cmat)
    purity1 = dominant_total / len(label)
    print('GMM Purity:', purity1)
def test_dbscan_optics_parity(eps, min_samples):
    """OPTICS with cluster_method='dbscan' differs from DBSCAN by at most 5%."""
    blob_centers = [[1, 1], [-1, -1], [1, -1]]
    X, labels_true = make_blobs(n_samples=750, centers=blob_centers,
                                cluster_std=0.4, random_state=0)

    # OPTICS configured to extract DBSCAN-style clusters at the given eps.
    optics = OPTICS(min_samples=min_samples, cluster_method='dbscan',
                    eps=eps).fit(X)

    # Reference DBSCAN labels for the same parameters.
    dbscan = DBSCAN(eps=eps, min_samples=min_samples).fit(X)

    # Agreement is the smaller of the two "best overlap" totals taken along
    # either axis of the contingency table.
    table = contingency_matrix(dbscan.labels_, optics.labels_)
    best_by_column = np.sum(np.max(table, axis=0))
    best_by_row = np.sum(np.max(table, axis=1))
    agree = min(best_by_column, best_by_row)
    disagree = X.shape[0] - agree

    percent_mismatch = np.round((disagree - 1) / X.shape[0], 2)

    # verify label mismatch is <= 5% labels
    assert percent_mismatch <= 0.05
def test_adjusted_rand_score_sparse():
    """ARI from labels equals ARI from a precomputed sparse contingency table."""
    labels_a = np.array([1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3])
    labels_b = np.array([1, 1, 1, 1, 2, 1, 2, 2, 2, 2, 3, 1, 3, 3, 3, 2, 2])

    sparse_table = contingency_matrix(labels_a, labels_b, sparse=True)

    score_from_labels = adjusted_rand_score(labels_a, labels_b)
    score_from_table = adjusted_rand_score(None, None,
                                           contingency=sparse_table)
    assert_almost_equal(score_from_labels, score_from_table)
def test_contingency_matrix_sparse():
    """The sparse contingency table, densified, equals the dense one."""
    labels_a = np.array([1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3])
    labels_b = np.array([1, 1, 1, 1, 2, 1, 2, 2, 2, 2, 3, 1, 3, 3, 3, 2, 2])

    dense = contingency_matrix(labels_a, labels_b)
    densified = contingency_matrix(labels_a, labels_b, sparse=True).toarray()
    assert_array_almost_equal(dense, densified)