def _insert_stats(stats_dict, docs, d_ref, z_ref, z_lda, phi, idx):
    phi_flat = np.array(phi).flatten()
    docs_flat = np.array([doc['w'] for doc in docs]).flatten()
    stats_dict['doc_ari'][idx] = ari(d_ref, docs_flat)
    stats_dict['ari'][idx] = ari(z_ref, z_lda)
    stats_dict['phi_std'][idx] = np.std(phi_flat)
    return stats_dict
def get_accuracy(cluster_assignments, y_true, n_clusters):
    '''
    Computes the accuracy based on the provided kmeans cluster assignments
    and true labels, using the Munkres algorithm.

    cluster_assignments: array of labels, output by kmeans
    y_true: true labels
    n_clusters: number of clusters in the dataset

    returns: a tuple containing the accuracy and confusion matrix, in that order
    '''
    y_pred, confusion_matrix = get_y_preds(cluster_assignments, y_true, n_clusters)

    from sklearn.metrics import normalized_mutual_info_score as nmi
    nmi_score = nmi(y_true, y_pred)
    print('NMI: ' + str(np.round(nmi_score, 4)))

    from sklearn.metrics import adjusted_rand_score as ari
    ari_score = ari(y_true, y_pred)
    print('ARI: ' + str(np.round(ari_score, 4)))

    ProjectDir = get_project_root()
    with open(os.path.join(ProjectDir, 'Results.txt'), 'a') as my_file:
        my_file.write("\n")
        my_file.write('NMI: ' + str(np.round(nmi_score, 4)))
        my_file.write("\n")
        my_file.write('ARI: ' + str(np.round(ari_score, 4)))
        my_file.write("\n")

    # calculate the accuracy
    return np.mean(y_pred == y_true), confusion_matrix
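# get_y_preds is not shown in this snippet. A minimal sketch, assuming it aligns
# cluster ids to class labels with the Munkres (Hungarian) algorithm as the docstring
# suggests, written here with scipy's linear_sum_assignment rather than the munkres
# package; the actual helper in this codebase may differ.
import numpy as np
from scipy.optimize import linear_sum_assignment
from sklearn.metrics import confusion_matrix as sk_confusion_matrix

def get_y_preds(cluster_assignments, y_true, n_clusters):
    """Map cluster ids to class labels so that the resulting accuracy is maximized."""
    # Rows index true classes, columns index cluster ids.
    cm = sk_confusion_matrix(y_true, cluster_assignments, labels=np.arange(n_clusters))
    # The Hungarian algorithm on the negated matrix finds the best one-to-one mapping.
    row_ind, col_ind = linear_sum_assignment(-cm)
    mapping = {cluster: label for label, cluster in zip(row_ind, col_ind)}
    y_pred = np.array([mapping[c] for c in cluster_assignments])
    return y_pred, cm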
def form_matrix(self, result, dataset):
    n = int(sqrt(len(result)))
    res_matrix = np.empty((n, n))
    res_matrix[:] = np.nan
    if self.type == "sw":
        for result_row in result:
            p, beta, sw = result_row
            row = beta_to_row(beta)  # 50 - int(beta * 10)
            col = p_to_col(p)        # int(p * 10) - 10
            res_matrix[row, col] = sw
    if self.type == "ari":
        for result_row in result:
            p, beta, labels = result_row
            labels = labels.replace("[", "").replace("]", "")
            labels = np.fromstring(labels, sep=" ", dtype=int)
            row = beta_to_row(beta)  # 50 - int(beta * 10)
            col = p_to_col(p)        # int(p * 10) - 10
            labels_true = np.loadtxt(ds_directory + "/" + cut_extention(dataset) + ".lbs",
                                     skiprows=1)
            labels_true = labels_true.astype(int)
            res_matrix[row, col] = ari(labels_true, labels)
    return res_matrix
def benchmark(self, name: str, features: np.ndarray, labels: np.ndarray) -> Tuple[str, Dict]:
    """
    Returns the clustering performance results in str and dict format.

    The metrics used are as follows:
        1. Duration
        2. Adjusted Rand Score
        3. Normalized Mutual Information
        4. Davies-Bouldin Index
        5. Silhouette Score
        6. Calinski-Harabasz Score
        7. Clustering Accuracy

    Parameters
    ----------
    name: str
        The name of the benchmark.
    features: np.ndarray
        The test instances to cluster.
    labels: np.ndarray
        The test labels.

    Returns
    -------
    str
        The formatted string of the benchmark results.
    results: Dict
        The dictionary of benchmark results.
    """
    start_time = time.time()
    predictions = self.predict(features)
    results = {}
    results["name"] = name
    results["duration"] = time.time() - start_time
    results["ari"] = ari(labels_true=labels, labels_pred=predictions)
    results["nmi"] = nmi(labels_true=labels, labels_pred=predictions)
    results["dbi"] = davies_bouldin_score(features, predictions)
    results["silhouette"] = silhouette_score(features, predictions, metric="euclidean")
    results["ch_score"] = calinski_harabasz_score(features, predictions)
    results["clustering_accuracy"] = clustering_accuracy(target=labels, prediction=predictions)
    return (
        "%-9s\t%.2fs\t%.3f\t\t%.3f\t\t%.3f\t\t%.3f\t\t%.3f\t\t%.3f" % (
            results.get("name"),
            results.get("duration"),
            results.get("dbi"),
            results.get("silhouette"),
            results.get("ch_score"),
            results.get("nmi"),
            results.get("ari"),
            results.get("clustering_accuracy"),
        ),
        results,
    )
def metriques(model, y_true, y_pred):
    pred1 = model.row_labels_
    nmi_ = nmi(y_true, pred1)
    ari_ = ari(y_true, pred1)
    accuracy = ACCURACY(y_true, pred1)
    print("NMI: {}\nARI: {}".format(nmi_, ari_))
    print("ACCURACY: %s" % accuracy)
    return nmi_, ari_, accuracy
def tensfact_baseline():
    n_clusters = 81
    X_buzz = np.load('buzz_user_tensor_45.npy')
    print(X_buzz.shape)
    X_buzz = X_buzz[list(buzz_ground.keys())]
    buzz_ground1 = list(buzz_ground.values())
    km = KMeans(n_clusters=81, init='k-means++', n_init=1, verbose=False)
    sc = 0.0
    sc1 = 0.0
    sc2 = 0.0
    for i in range(10):
        km.fit(X_buzz)
        sc += nmi(buzz_ground1, km.labels_)
        sc1 += ari(buzz_ground1, km.labels_)
        sc2 += ami(buzz_ground1, km.labels_)
    print("BUZZ")
    print("nmi score %f" % (sc / 10.0))
    print("ari score %f" % (sc1 / 10.0))
    print("ami score %f" % (sc2 / 10.0))

    X_poli = np.load('poli_user_tensor_75.npy')
    print(X_poli.shape)
    X_poli = X_poli[list(poli_ground.keys())]
    poli_ground1 = list(poli_ground.values())
    km1 = KMeans(n_clusters=310, init='k-means++', n_init=1, verbose=False)
    sc = 0.0
    sc1 = 0.0
    sc2 = 0.0
    for i in range(10):
        km1.fit(X_poli)
        sc += nmi(poli_ground1, km1.labels_)
        sc1 += ari(poli_ground1, km1.labels_)
        sc2 += ami(poli_ground1, km1.labels_)
    print("poli")
    print("nmi score %f" % (sc / 10.0))
    print("ari score %f" % (sc1 / 10.0))
    print("ami score %f" % (sc2 / 10.0))
def execute_algo(model, model_name, X, y, verbose=True):
    print("##############\n# {}\n##############".format(model_name))
    model.fit(X)
    res_nmi = nmi(model.row_labels_, y)
    res_ari = ari(model.row_labels_, y)
    res_acc = accuracy(model.row_labels_, y)
    if verbose:
        print("NMI =", res_nmi)
        print("ARI =", res_ari)
        print("ACC =", res_acc)
    return res_nmi, res_ari, res_acc
def train(self):
    x, y = np.load('images/64px_image_x.npy'), np.load('images/64px_image_y.npy')
    x = np.reshape(x, (40000, 64, 64, 1))
    kmeans = KMeans(n_clusters=2, n_init=20)
    y_pred = kmeans.fit_predict(self.encoder.predict(x))
    y_pred_last = np.copy(y_pred)
    self.model.get_layer(name='clustering').set_weights([kmeans.cluster_centers_])

    loss = 0
    ae_loss = 0
    index = 0
    maxiter = 80000
    update_interval = 100
    index_array = np.arange(x.shape[0])
    batch_size = 16
    tol = 0.001
    # model.load_weights('DEC_model_final.h5')

    for ite in range(int(maxiter)):
        if ite % update_interval == 0:
            q = self.model.predict(x, verbose=0)
            # update the auxiliary target distribution p
            p = self.target_distribution(q)

            # evaluate the clustering performance
            y_pred = q.argmax(1)
            if y is not None:
                acc = np.round(metrics.acc(y, y_pred), 5)
                nmi = np.round(metrics.nmi(y, y_pred), 5)
                ari = np.round(metrics.ari(y, y_pred), 5)
                loss = np.round(loss, 5)
                print('Iter %d: acc = %.5f, nmi = %.5f, ari = %.5f, loss=%.5f'
                      % (ite, acc, nmi, ari, loss))

            # check stop criterion - model convergence
            delta_label = np.sum(y_pred != y_pred_last).astype(np.float32) / y_pred.shape[0]
            y_pred_last = np.copy(y_pred)
            if ite > 0 and delta_label < tol:
                print('delta_label ', delta_label, '< tol ', tol)
                print('Reached tolerance threshold. Stopping training.')
                break

        idx = np.random.randint(low=0, high=x.shape[0], size=batch_size)
        # ae_loss = ae.train_on_batch(x=x[idx], y=x[idx])
        loss = self.model.train_on_batch(x=x[idx], y=p[idx])
        index = index + 1 if (index + 1) * batch_size <= x.shape[0] else 0

    self.model.save_weights('DEC_model_final_64px.h5')
    self.test_model()
def for_loop_function(combo, X_hat, est_labels, true_labels, gclust_model, M):
    print(combo)
    n, d = X_hat.shape
    unique_labels, counts = np.unique(est_labels, return_counts=True)
    K = len(unique_labels)
    class_idx = np.array([np.where(est_labels == u)[0] for u in unique_labels])
    temp_quad_labels = np.concatenate(class_idx[combo])
    surface_count = np.sum(counts[combo])
    surface_prop = surface_count / n
    temp_n = len(temp_quad_labels)

    temp_K = K - len(combo)
    temp_mean_params = temp_K * d
    temp_cov_params = temp_K * d * (d + 1) / 2
    temp_quad_params = (d - 1) * 2 + d - 1 + (d - 1) * (d - 2) / 2 + 1
    temp_n_params = temp_mean_params + temp_cov_params
    temp_n_params += temp_quad_params + temp_K - 1

    temp_label = min(combo)
    new_counts = np.zeros(temp_K)
    for i in range(temp_K):
        if unique_labels[i] == temp_label:
            new_counts[i] = surface_count
        else:
            new_counts[i] = counts[i]
    new_props = (new_counts / n) ** new_counts
    prop_log_likelihoods = np.sum(np.log(new_props))

    temp_c_hat = est_labels.copy()
    temp_c_hat[temp_quad_labels] = temp_label

    params, pcov = optimize.curve_fit(func, X_hat[temp_quad_labels, :2],
                                      X_hat[temp_quad_labels, 2])
    # integral = abs(monte_carlo_integration(X_hat[temp_quad_labels], func, params, M))
    delly = Delaunay(X_hat[temp_quad_labels, :-1])
    content = np.sum([calculate_simplex_content(X_hat[temp_quad_labels][del_])
                      for del_ in delly.simplices])

    quad_log_likelihood = quadratic_log_likelihood(X_hat[temp_quad_labels], params,
                                                   curve_density=False)
    quad_log_likelihood -= temp_n * np.log(content)
    # score the points that are *not* on the surface under the fitted Gaussian mixture
    gmm_log_likelihood = np.sum(
        gclust_model.model_.score_samples(np.delete(X_hat, temp_quad_labels, axis=0)))

    log_likeli = quad_log_likelihood + gmm_log_likelihood + prop_log_likelihoods
    bic_ = 2 * log_likeli - temp_n_params * np.log(n)
    ari_ = ari(true_labels, temp_c_hat)
    print(log_likeli, ari_, bic_)
    return [combo, log_likeli, ari_, bic_]
def calculate_ari(p, beta, dataset):
    start = time()
    dataset = ".".join([get_basename(dataset), "pts"])
    data_file = "/".join([data_directory, dataset])
    k_star = get_k_star(dataset)
    algorithm, t1, t2, t3, labels, cluster_structure = single_run(data_file, p, beta, k_star)
    labels_file = ".".join([cut_extention(data_file), "lbs"])
    labels_true = pd.read_csv(labels_file).values.astype(int).flatten()
    assert len(labels_true) == 1000
    ari_value = ari(labels_true=labels_true, labels_pred=labels)
    end = time()
    print("calculate ARI = {} in {:5.2f} sec".format(ari_value, end - start))
    return ari_value
def run_trial(X, labels, k):
    errors = '"'

    # Run k-means
    start = time()
    db = KMeans(k, n_jobs=12)
    pred_labels = db.fit_predict(X)
    elapsed = time() - start

    try:
        ari_score = ari(pred_labels, labels)
    except Exception as e:
        errors += str(e) + '; '
        ari_score = np.nan
    try:
        nmi_score = nmi(pred_labels, labels, average_method='arithmetic')
    except Exception as e:
        errors += str(e) + '; '
        nmi_score = np.nan
    try:
        ss_score = ss(X, pred_labels)
    except Exception as e:
        errors += str(e) + '; '
        ss_score = np.nan
    try:
        vrc_score = vrc(X, pred_labels)
    except Exception as e:
        errors += str(e) + '; '
        vrc_score = np.nan
    try:
        dbs_score = dbs(X, pred_labels)
    except Exception as e:
        errors += str(e) + '; '
        dbs_score = np.nan
    errors += '"'

    return [k, elapsed, ari_score, nmi_score, ss_score, vrc_score, dbs_score, errors]
def get_scores(x, y, n, k, dtr, dev):
    tx = tens0((n, k), dt=dtr, dev=dev)
    ty = tens0((n, k), dt=dtr, dev=dev)
    tx = tens_sel_set(tx, x, 1)
    ty = tens_sel_set(ty, y, 1)
    t = tx.t().matmul(ty)
    del tx, ty
    tt = t.max() - t
    tt = tt.cpu().numpy()
    row, col = ass(tt)
    del tt
    t = t.cpu().numpy()
    t = t[row, col].sum()
    t = t.tolist() / n
    x = x.cpu().numpy()
    y = y.cpu().numpy()
    s = {
        'nmi': nmi(x, y, average_method='geometric'),
        'ari': ari(x, y),
        'acc': t,
    }
    return s
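# The helpers tens0, tens_sel_set, and ass above appear to build one-hot label matrices
# on the GPU and solve the assignment problem. A plain NumPy/SciPy sketch of the same
# best-match accuracy computation (clustering_accuracy_np is a hypothetical name, not a
# helper used elsewhere in this code) might look like this:
import numpy as np
from scipy.optimize import linear_sum_assignment

def clustering_accuracy_np(x, y, k):
    """Best-match clustering accuracy between integer label vectors x and y in [0, k)."""
    n = len(x)
    # Contingency matrix: counts[i, j] = number of points with x == i and y == j.
    counts = np.zeros((k, k), dtype=np.int64)
    np.add.at(counts, (x, y), 1)
    # The Hungarian algorithm maximizes the total matched counts.
    row, col = linear_sum_assignment(counts.max() - counts)
    return counts[row, col].sum() / n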
for index in range(1, 101):
    goldtopics = [gold[r] for r in results if math.floor(float(r)) == index]
    hyper_topics = [HyperLex[r] for r in results if math.floor(float(r)) == index]
    spinglass_topics = [spinglass[r] for r in results if math.floor(float(r)) == index]
    gold_count.append(len(set(goldtopics)))
    HyperLex_count.append(len(set(hyper_topics)))
    spinglass_count.append(len(set(spinglass_topics)))
    # total_clusters.append(max(int(number.split(".")[1]) + 1 for number in hyper_topics))
    scores.append((index, ari(goldtopics, spinglass_topics)))

# HyperLex_count = [count for count in HyperLex_count if count > 1]
# spinglass_count = [count for count in spinglass_count if count > 1]
histogram(gold_count, "Gold standard")
histogram(HyperLex_count, "HyperLex")
histogram(spinglass_count, "Spinglass")
# histogram(total_clusters, "total")

print(scores)
print(gold_count)
print(HyperLex_count)
# print(total_clusters)
# print("ARI:", numpy.average(scores))
scores = sorted(scores, key=itemgetter(1))
topics = numpy.genfromtxt("topics.txt", dtype=None, skip_header=1)
print(scores)
def fit(self, x, y=None, maxiter=2e4, batch_size=256, tol=1e-3,
        update_interval=140, save_dir='./results/temp'):
    print('Update interval', update_interval)
    save_interval = int(x.shape[0] / batch_size) * 5  # 5 epochs
    print('Save interval', save_interval)

    # Step 1: initialize cluster centers using k-means
    t1 = time()
    print('Initializing cluster centers with k-means.')
    kmeans = KMeans(n_clusters=self.n_clusters, n_init=20)
    y_pred = kmeans.fit_predict(self.encoder.predict(x))
    y_pred_last = np.copy(y_pred)
    self.model.get_layer(name='clustering').set_weights([kmeans.cluster_centers_])

    # Step 2: deep clustering
    # logging file
    import csv
    logfile = open(save_dir + '/dec_log.csv', 'w')
    logwriter = csv.DictWriter(logfile, fieldnames=['iter', 'acc', 'nmi', 'ari', 'loss'])
    logwriter.writeheader()

    loss = 0
    index = 0
    index_array = np.arange(x.shape[0])
    for ite in range(int(maxiter)):
        if ite % update_interval == 0:
            q = self.model.predict(x, verbose=0)
            p = self.target_distribution(q)  # update the auxiliary target distribution p

            # evaluate the clustering performance
            y_pred = q.argmax(1)
            if y is not None:
                acc = np.round(metrics.acc(y, y_pred), 5)
                nmi = np.round(metrics.nmi(y, y_pred), 5)
                ari = np.round(metrics.ari(y, y_pred), 5)
                loss = np.round(loss, 5)
                logdict = dict(iter=ite, acc=acc, nmi=nmi, ari=ari, loss=loss)
                logwriter.writerow(logdict)
                print('Iter %d: acc = %.5f, nmi = %.5f, ari = %.5f' % (ite, acc, nmi, ari),
                      ' ; loss=', loss)

            # check stop criterion
            delta_label = np.sum(y_pred != y_pred_last).astype(np.float32) / y_pred.shape[0]
            y_pred_last = np.copy(y_pred)
            if ite > 0 and delta_label < tol:
                print('delta_label ', delta_label, '< tol ', tol)
                print('Reached tolerance threshold. Stopping training.')
                logfile.close()
                break

        # train on batch
        # if index == 0:
        #     np.random.shuffle(index_array)
        idx = index_array[index * batch_size: min((index + 1) * batch_size, x.shape[0])]
        loss = self.model.train_on_batch(x=x[idx], y=p[idx])
        index = index + 1 if (index + 1) * batch_size <= x.shape[0] else 0

        # save intermediate model
        if ite % save_interval == 0:
            print('saving model to:', save_dir + '/DEC_model_' + str(ite) + '.h5')
            self.model.save_weights(save_dir + '/DEC_model_' + str(ite) + '.h5')

        ite += 1

    # save the trained model
    logfile.close()
    print('saving model to:', save_dir + '/DEC_model_final.h5')
    self.model.save_weights(save_dir + '/DEC_model_final.h5')
    return y_pred
def for_loop_function(combo, X_hat, est_labels, true_labels, gclust_model, M):
    print(combo)
    # Grab number of data points and dimension of data
    n, d = X_hat.shape

    # Grab cluster labels, corresponding counts, and the number of clusters
    unique_labels, counts = np.unique(est_labels, return_counts=True)
    K = len(unique_labels)

    # Partition the data by cluster
    class_idx = np.array([np.where(est_labels == u)[0] for u in unique_labels])

    # Combine clusters in combo into a single cluster
    temp_quad_labels = np.concatenate(class_idx[combo])

    # Grab the number of data included in the surface
    surface_count = np.sum(counts[combo])
    temp_n = len(temp_quad_labels)
    assert temp_n == surface_count

    temp_K = K - len(combo) + 1  # new number of "clusters"
    temp_mean_params = (temp_K - 1) * d  # temp_K - 1 means in d space
    temp_cov_params = (temp_K - 1) * d * (d - 1) / 2  # temp_K - 1 symmetric covariances in M^{d x d}
    temp_quad_params = (d - 1) + 1 + 1  # polynomial parameters + variance off surface

    # Total parameters = parameters from Gaussians + parameters from surface
    temp_n_params = temp_mean_params + temp_cov_params
    temp_n_params += temp_quad_params + temp_K - 1  # include mixing proportions

    # Give each cluster in combo the label of the smallest label in combo
    temp_label = min(combo)
    temp_c_hat = est_labels.copy()
    temp_c_hat[temp_quad_labels] = temp_label

    # New counts vector
    new_counts = np.zeros(temp_K)
    for i in range(temp_K):
        if unique_labels[i] == temp_label:
            new_counts[i] = surface_count
        else:
            new_counts[i] = counts[i]

    # New proportions vector (for BIC)
    new_props = (new_counts / n) ** new_counts
    prop_log_likelihoods = np.sum(np.log(new_props))

    guess = (1, 1, 1, 1, 1, 1, 1, 1, 1, 1)

    # Find fitted surface
    params, pcov = optimize.curve_fit(func, X_hat[temp_quad_labels, :-1],
                                      X_hat[temp_quad_labels, -1], guess)

    # Estimate surface area (computationally expensive!)
    integral = abs(monte_carlo_integration(X_hat[temp_quad_labels], func, params, M))

    # Find likelihood of surface
    surface_log_likelihood = quadratic_log_likelihood(X_hat[temp_quad_labels], params,
                                                      curve_density=False)
    surface_log_likelihood -= surface_count * np.log(integral)

    # Find likelihood of Gaussians (points not assigned to the surface)
    gmm_log_likelihood = np.sum(
        gclust_model.model_.score(np.delete(X_hat, temp_quad_labels, axis=0)))

    # Total likelihood
    likeli = surface_log_likelihood + gmm_log_likelihood + prop_log_likelihoods

    # BIC
    bic_ = 2 * likeli - temp_n_params * np.log(n)

    # ARI
    ari_ = ari(true_labels, temp_c_hat)
    return [combo, likeli, ari_, bic_]
np.random.seed(16661)

A = binarize(right_adj)
X_hat = np.concatenate(ASE(n_components=3).fit_transform(A), axis=1)
n, d = X_hat.shape

gclust = GCLUST(max_components=15)
est_labels = gclust.fit_predict(X_hat)

loglikelihoods = [np.sum(gclust.model_.score_samples(X_hat))]
combos = [None]
aris = [ari(right_labels, est_labels)]
bic = [gclust.model_.bic(X_hat)]

unique_labels = np.unique(est_labels)
class_idx = np.array([np.where(est_labels == u)[0] for u in unique_labels])

for k in range(len(unique_labels)):
    for combo in list(combinations(np.unique(est_labels), k + 1)):
        combo = np.array(list(combo)).astype(int)
        combos.append(combo)

M = 10**8
condensed_func = lambda combo: for_loop_function(combo, X_hat, est_labels,
                                                 right_labels, gclust, M)
def tensfact_baseline():
    G_buzz, N_buzz, C_buzz, G_poli, N_poli, C_poli = parse_graphs()
    n_news1 = N_buzz.shape[0]
    n_news2 = N_poli.shape[0]
    y_buzz = np.array([0] * n_news1)
    y_poli = np.array([0] * n_news2)
    y_buzz[91:] = 1
    y_poli[120:] = 1
    n_clusters = 81

    if not os.path.isfile('tensor_buzz.npy'):
        T = np.zeros((N_buzz.shape[0], G_buzz.shape[0], C_buzz.shape[1]))
        n_users = G_buzz.shape[0]
        n_news = N_buzz.shape[0]
        n_comm = C_buzz.shape[1]
        for i in range(n_news):
            for j in range(n_users):
                for k in range(n_comm):
                    T[i, j, k] = N_buzz[i, j] * C_buzz[j, k]
        np.save('tensor_buzz.npy', T)
    else:
        T_buzz = np.load('tensor_buzz.npy')
        print(T_buzz.shape)
        print("Buzz tensor loaded")

    # T = dtensor(T_buzz)
    # factors = parafac(T_buzz, rank=25, init='random')
    # T_buzz = tl.tensor(T_buzz)
    # Best so far [50, 100, 5]
    core, factors = tucker(T_buzz, ranks=[45, 100, 5])
    print(core.shape)
    print(factors[0].shape)
    print(factors[1].shape)
    # P, fit, itr, exectimes = cp_als(T, 35, init='random')
    # P, F, D, A, fit, itr, exectimes = parafac2.parafac2(T, 10, init=42)

    # Extracting news embeddings
    # X_buzz = T_buzz
    X_buzz = factors[1]
    # X_buzz = P.U[0]
    buzz_lsi = np.load('buzz_lsi.npy')
    # X_buzz = np.hstack((X_buzz, buzz_lsi))
    print(X_buzz.shape)
    # scaler = MinMaxScaler()
    # X_buzz = preprocessing.scale(X_buzz)
    # X_buzz = scaler.fit_transform(X_buzz)
    # assert np.where(np.isnan(X_buzz) == True)[0].shape[0] == 0
    # X_buzz = X_buzz[buzz_ground.keys()]
    buzz_ground1 = list(buzz_ground.values())
    km = KMeans(n_clusters=81, init='k-means++', n_init=1, verbose=False)
    print("Buzzfeed dataset's feat. extracted")
    # X_buzz, y_buzz = shuffle(X_buzz, y_buzz, random_state=42)

    sc = 0.0
    sc1 = 0.0
    sc2 = 0.0
    for i in range(10):
        km.fit(X_buzz)
        sc += nmi(buzz_ground1, km.labels_)
        sc1 += ari(buzz_ground1, km.labels_)
        sc2 += ami(buzz_ground1, km.labels_)
    print("BUZZ")
    print("nmi score %f" % (sc / 10.0))
    print("ari score %f" % (sc1 / 10.0))
    print("ami score %f" % (sc2 / 10.0))

    if not os.path.isfile('tensor_poli.npy'):
        T = np.zeros((N_poli.shape[0], G_poli.shape[0], C_poli.shape[1]))
        n_users = G_poli.shape[0]
        n_news = N_poli.shape[0]
        n_comm = C_poli.shape[1]
        for i in range(n_news):
            for j in range(n_users):
                for k in range(n_comm):
                    T[i, j, k] = N_poli[i, j] * C_poli[j, k]
        np.save('tensor_poli.npy', T)
    else:
        T_poli = np.load('tensor_poli.npy')
        print(T_poli.shape)
        print("Politifact tensor loaded")

    T = dtensor(T_poli)
    # factors = parafac(T_poli, rank=50)
    # P, fit, itr, exectimes = cp_als(T, 35, init='random')
    # Best so far: [50, 100, 5]
    T_poli = tl.tensor(T_poli)
    core, factors = tucker(T_poli, ranks=[45, 100, 5])
    # print("Fit value, Itr and Exectimes are:", fit, itr, exectimes)

    # Extracting news embeddings
    X_poli = factors[1]
    # X_poli = P.U[0]
    poli_lsi = np.load('poli_lsi.npy')
    # X_poli = X_poli[poli_ground.keys()]
    # X_poli = np.hstack((X_poli, poli_lsi))
    print(X_poli.shape)
    # X_poli = preprocessing.scale(X_poli)
    # X_poli = scaler.fit_transform(X_poli)
    assert np.where(np.isnan(X_buzz) == True)[0].shape[0] == 0
    print(X_poli.shape)
    print("Politifact news feats. extracted")

    poli_ground1 = list(poli_ground.values())
    km = KMeans(n_clusters=310, init='k-means++', n_init=1, verbose=False)
    print("Politifact dataset's feat. extracted")
    # X_poli, y_poli = shuffle(X_poli, y_poli, random_state=42)

    sc = 0.0
    sc1 = 0.0
    sc2 = 0.0
    for i in range(10):
        km.fit(X_poli)
        sc += nmi(poli_ground1, km.labels_)
        sc1 += ari(poli_ground1, km.labels_)
        sc2 += ami(poli_ground1, km.labels_)
    print("POLI")
    print("nmi score %f" % (sc / 10.0))
    print("ari score %f" % (sc1 / 10.0))
    print("ami score %f" % (sc2 / 10.0))
T = np.zeros((n, m, l))
y = np.zeros(m)
for index, row in final.iterrows():
    T[row['user_le'], row['movie_le'], row['tag_le']] = 1
    y[row['movie_le']] = row['genre_le']

sparsity = 1 - (np.sum(T > 0) / np.product(T.shape))

model = CoClust(n_iterations=np.sum(T.shape) * 100, optimization_strategy=alg, path=output_path)
model.fit(T)

tau = model.final_tau_
n = nmi(model.y_, y, average_method='arithmetic')
a = ari(model.y_, y)
f.write(f"{T.shape[0]},{T.shape[1]},{T.shape[2]},,{len(set(y))},,,{tau[0]},{tau[1]},{tau[2]},,{n},,,{a},,{model._n_clusters[0]},{model._n_clusters[1]},{model._n_clusters[2]},{model.execution_time_},{sparsity},{alg}\n")
f.close()

gy = open(output_path + alg + "_assignments_ML_" + k + "_y.txt", 'w')
for i in range(T.shape[1]):
    gy.write(f"{i}\t{model._assignment[1][i]}\n")
gy.close()

gz = open(output_path + alg + "_assignments_ML_" + k + "_z.txt", 'w')
for i in range(T.shape[2]):
    gz.write(f"{i}\t{model._assignment[2][i]}\n")
y_pred = torch.argmax(result['y_pred'], dim=-1)
y_true = result['y_true']
embeddings = result['embedding'].cpu().numpy()
print(y_pred.size())
print(y_true.size())
print(embeddings.shape)

'''
calculate NMI and ARI
normalized_mutual_info_score(labels_true, labels_pred, *, average_method='arithmetic')
adjusted_rand_score(labels_true, labels_pred)
'''
kmeans = KMeans(n_clusters=n_class, random_state=0).fit(embeddings)
y_cluster = kmeans.labels_
nmi_score = nmi(y_true.cpu().numpy(), y_cluster)
ari_score = ari(y_true.cpu().numpy(), y_cluster)
print('nmi_score={}, ari_score={}'.format(nmi_score, ari_score))

'''
visualize (using t-SNE)
'''
time_start = time.time()
tsne = TSNE(n_components=2, verbose=1, perplexity=10, n_iter=500)
tsne_results = tsne.fit_transform(embeddings)
print('t-SNE done! Time elapsed: {} seconds'.format(time.time() - time_start))

df_subset = pd.DataFrame(index=list(range(embeddings.shape[0])), columns=['axis_x', 'axis_y'])
df_subset['axis_x'] = tsne_results[:, 0]
df_subset['axis_y'] = tsne_results[:, 1]
df_subset['y'] = y_true.cpu().numpy()
# plt.figure(figsize=(16,7))
gmm = GMM(random_state=5)
st = clock()
for k in clusters:
    km.set_params(n_clusters=k)
    gmm.set_params(n_components=k)

    km.fit(wineX)
    gmm.fit(wineX)
    SSE[k]['Wine'] = km.score(wineX)
    ll[k]['Wine'] = gmm.score(wineX)
    acc[k]['Wine']['Kmeans'] = cluster_acc(wineY.ravel(), km.predict(wineX))
    acc[k]['Wine']['GMM'] = cluster_acc(wineY.ravel(), gmm.predict(wineX))
    adjMI[k]['Wine']['Kmeans'] = ami(wineY.ravel(), km.predict(wineX))
    adjMI[k]['Wine']['GMM'] = ami(wineY.ravel(), gmm.predict(wineX))
    adjRI[k]['Wine']['Kmeans'] = ari(wineY.ravel(), km.predict(wineX))
    adjRI[k]['Wine']['GMM'] = ari(wineY.ravel(), gmm.predict(wineX))
    bic[k]['Wine']['Kmeans'] = -compute_bic(km, wineX)
    bic[k]['Wine']['GMM'] = gmm.bic(wineX)
    silh[k]['Wine']['Kmeans'] = silhouette_score(wineX, km.predict(wineX))
    silh[k]['Wine']['GMM'] = silhouette_score(wineX, gmm.predict(wineX))

    km.fit(digitX)
    gmm.fit(digitX)
    SSE[k]['Digit'] = km.score(digitX)
    ll[k]['Digit'] = gmm.score(digitX)
    acc[k]['Digit']['Kmeans'] = cluster_acc(digitY.ravel(), km.predict(digitX))
    acc[k]['Digit']['GMM'] = cluster_acc(digitY.ravel(), gmm.predict(digitX))
    adjMI[k]['Digit']['Kmeans'] = ami(digitY.ravel(), km.predict(digitX))
# Spectral clustering
# ================================================================================
from sklearn import metrics
from sklearn.cluster import SpectralClustering
chs = metrics.calinski_harabasz_score
from sklearn.metrics import adjusted_rand_score as ari

neig = np.arange(5, 50, 5)
ychs = []
for k in neig:
    y_pred = SpectralClustering(n_clusters=3, affinity='nearest_neighbors',
                                n_neighbors=k).fit_predict(X)
    s = ari(y_true, y_pred)
    ychs.append(s)

fig = plt.figure()
ax = fig.add_subplot(1, 1, 1)
ax.plot(neig, ychs)
ax.set_xlabel('number of neighbors')
ax.set_ylabel('ARI')
plt.show()  # inspect the plot to choose the best number of neighbors

t1 = time.time()
sc_pred = SpectralClustering(n_clusters=3, affinity='nearest_neighbors',
                             n_neighbors=20).fit_predict(X)
t2 = time.time()
sc_t = t2 - t1
# ===================================================================
def run_trial(X, labels, eps, minPts, metric, V):
    errors = '"'

    # Run our dbscan
    start = time()
    if metric == 'seuclidean':
        db = DBSCAN(eps, minPts, metric=metric, metric_params={'V': V}, n_jobs=6)
    else:
        db = DBSCAN(eps, minPts, metric=metric, n_jobs=6)
    pred_labels = db.fit_predict(X)
    elapsed = time() - start

    perc_noise = np.sum(pred_labels == -1) / len(pred_labels)
    n_clust = pred_labels.max()

    # Remove noisy points
    clean_idx = np.where(pred_labels != -1)
    nn_preds = pred_labels[clean_idx]
    nn_labels = labels[clean_idx]
    nn_X = X[clean_idx]

    try:
        ari_score = ari(pred_labels, labels)
    except Exception as e:
        errors += str(e) + '; '
        ari_score = np.nan
    try:
        nmi_score = nmi(pred_labels, labels, average_method='arithmetic')
    except Exception as e:
        errors += str(e) + '; '
        nmi_score = np.nan
    try:
        if metric == 'seuclidean':
            ss_score = ss(X, pred_labels, metric=metric, V=V)
        else:
            ss_score = ss(X, pred_labels, metric=metric)
    except Exception as e:
        errors += str(e) + '; '
        ss_score = np.nan
    try:
        vrc_score = vrc(X, pred_labels)
    except Exception as e:
        errors += str(e) + '; '
        vrc_score = np.nan
    try:
        dbs_score = dbs(X, pred_labels)
    except Exception as e:
        errors += str(e) + '; '
        dbs_score = np.nan

    try:
        nn_ari_score = ari(nn_preds, nn_labels)
    except Exception as e:
        errors += str(e) + '; '
        nn_ari_score = np.nan
    try:
        nn_nmi_score = nmi(nn_preds, nn_labels, average_method='arithmetic')
    except Exception as e:
        errors += str(e) + '; '
        nn_nmi_score = np.nan
    try:
        if metric == 'seuclidean':
            nn_ss_score = ss(nn_X, nn_preds, metric=metric, V=V)
        else:
            nn_ss_score = ss(nn_X, nn_preds, metric=metric)
    except Exception as e:
        errors += str(e) + '; '
        nn_ss_score = np.nan
    try:
        nn_vrc_score = vrc(nn_X, nn_preds)
    except Exception as e:
        errors += str(e) + '; '
        nn_vrc_score = np.nan
    try:
        nn_dbs_score = dbs(nn_X, nn_preds)
    except Exception as e:
        errors += str(e) + '; '
        nn_dbs_score = np.nan
    errors += '"'

    return [
        metric, eps, minPts, n_clust, perc_noise, elapsed,
        ari_score, nn_ari_score, nmi_score, nn_nmi_score,
        ss_score, nn_ss_score, vrc_score, nn_vrc_score,
        dbs_score, nn_dbs_score, errors
    ]
def get_metrics(metrics, x, y_true, mu_true, y_pred, mu_pred, outliers_identified_by=-1):
    '''
    This function computes all metrics that are available in known_metrics (see below).
    It outputs the desired metrics in the same order as in the argument 'metrics'.
    '''
    from sklearn.metrics import adjusted_rand_score as ari

    known_metrics = ["RMSE", "ACC", "ARI", "DISTORSION", "n_sample", "OT_CENTERS"]
    if not any(np.isin(element=known_metrics, test_elements=metrics)):
        raise Exception('all desired metrics are unknown of the function get_metrics')
    if outliers_identified_by != -1:
        raise Exception(
            "outliers must be identified by '-1' in the assignment arrays y_true and y_pred")
    if type(y_pred[0]) != np.int64 or type(y_true[0]) != np.int64:
        raise Exception("y_pred and y_true must contain integers")

    nb_outliers_true = np.sum(y_true == -1)
    # nb_outliers_pred = np.sum(y_pred == -1)  # in case it is useful one day, but now it's not used

    res = np.zeros(len(metrics))
    for metric in metrics:
        # take one metric after another and compute the corresponding piece of code
        if metric == "RMSE":
            mapp = mapping(y_true, y_pred)
            # position of RMSE
            position = np.isin(element=metrics, test_elements="RMSE")
            # compute RMSE, in which a map indicates how points should be associated
            rmse = RMSE(mu_true, mu_pred, mapp)
            # store RMSE
            res[position] = rmse
        elif metric == "RMSE_ot":
            # position of RMSE
            position = np.isin(element=metrics, test_elements="RMSE")
            # compute RMSE, in which the points are automatically associated
            # so that the distance is minimal
            rmse = RMSE_ot(mu_true, mu_pred)
            # store RMSE
            res[position] = rmse
        elif metric == "ACC":
            mapp = mapping(y_true, y_pred)
            # position of ACC
            position = np.isin(element=metrics, test_elements="ACC")
            # compute ACC
            acc = accuracy(y_true[nb_outliers_true:], y_pred[nb_outliers_true:], mapp)
            # store ACC
            res[position] = acc
        elif metric == "ARI":
            # position of ARI
            position = np.isin(element=metrics, test_elements="ARI")
            # compute ARI (do not shadow the imported score function)
            ari_value = ari(y_true[nb_outliers_true:], y_pred[nb_outliers_true:])
            # store ARI
            res[position] = ari_value
        elif metric == "DISTORSION" or metric == "n_sample":
            # position of DISTORSION
            position = np.isin(element=metrics, test_elements="DISTORSION")
            # distorsion value obtained with data that are inliers for both y_pred and
            # y_true, against the empirical probability measure Pn (= against the data)
            distorsion, inliers_qtty = my_inlier_distorsion(
                data=x, y_true=y_true, y_pred=y_pred, centers_pred=mu_pred)
            # store DISTORSION
            res[position] = distorsion
            if 'n_sample' in metrics:
                position = np.isin(element=metrics, test_elements='n_sample')
                res[position] = inliers_qtty
        elif metric == "OT_CENTERS":
            raise Exception("OT_CENTERS is not yet available in get_metrics")
            mapp = mapping(y_true, y_pred)
            # position of OT_CENTERS
            position = np.isin(element=metrics, test_elements="OT_CENTERS")
            # compute OT_CENTERS
            ot_centers = RMSE(mu_true, mu_pred, mapp)
            # store OT_CENTERS
            res[position] = ot_centers
        else:
            raise Exception("the desired metric " + str(metric) +
                            " is not supported in the function get_metrics")
    return res
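# A minimal usage sketch for get_metrics. The toy arrays below are hypothetical and only
# the "ARI" metric is requested, since it needs none of the external helpers (mapping,
# accuracy, RMSE, my_inlier_distorsion) whose definitions are not shown in this snippet.
import numpy as np

x = np.random.default_rng(0).normal(size=(6, 2))           # 6 points in 2-D
y_true = np.array([0, 0, 0, 1, 1, 1], dtype=np.int64)       # true assignments
y_pred = np.array([1, 1, 1, 0, 0, 0], dtype=np.int64)       # predicted assignments
mu_true = np.array([[0.0, 0.0], [5.0, 5.0]])                 # true centers
mu_pred = np.array([[5.1, 4.9], [0.1, -0.1]])                # predicted centers

scores = get_metrics(["ARI"], x, y_true, mu_true, y_pred, mu_pred)
print(scores)  # expected: [1.0], since the two labelings agree up to renaming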
def evaluate(labels, labels_pred):
    # Compare predicted labels to known ground truths.
    # Returns error rate, 1 - NMI, and 1 - ARI.
    return (err_rate(labels, labels_pred),
            1 - nmi(labels, labels_pred, average_method="geometric"),
            1 - ari(labels, labels_pred))
buzz_featvec = buzz_featvec[list(buzz_ground.keys())]
poli_featvec = poli_featvec[list(poli_ground.keys())]
buzz_ground = list(buzz_ground.values())
poli_ground = list(poli_ground.values())

km = KMeans(n_clusters=81, n_init=1)
km1 = KMeans(n_clusters=310, n_init=1)

sc = 0.0
sc1 = 0.0
sc2 = 0.0
for i in range(10):
    km.fit(buzz_featvec)
    sc += nmi(buzz_ground, km.labels_)
    sc1 += ari(buzz_ground, km.labels_)
    sc2 += ami(buzz_ground, km.labels_)
print("BUZZ")
print("nmi score %f" % (sc / 10.0))
print("ari score %f" % (sc1 / 10.0))
print("ami score %f" % (sc2 / 10.0))

sc = 0.0
sc1 = 0.0
sc2 = 0.0
for i in range(10):
    km1.fit(poli_featvec)
    sc += nmi(poli_ground, km1.labels_)
    sc1 += ari(poli_ground, km1.labels_)
    sc2 += ami(poli_ground, km1.labels_)
sparsity = 1 - (np.sum(T > 0) / np.product(T.shape))

f, dt = CreateOutputFile("yelp", date=True)
output_path = "./output/_yelp/" + dt[:10] + "_" + dt[11:13] + "." + dt[14:16] + "." + dt[17:19] + "/"
directory = os.path.dirname(output_path)
if not os.path.exists(directory):
    os.makedirs(directory)

model = CoClust(np.sum(T.shape) * 10, optimization_strategy=alg, path=output_path)
model.fit(T)

tau = model.final_tau_
nmi_x = nmi(y, model.x_, average_method='arithmetic')
ari_x = ari(y, model.x_)
f.write(f"{T.shape[0]},{T.shape[1]},{T.shape[2]},{len(set(y))},,,,{tau[0]},{tau[1]},{tau[2]},{nmi_x},,,{ari_x},,,{model._n_clusters[0]},{model._n_clusters[1]},{model._n_clusters[2]},{model.execution_time_},{sparsity},{alg}\n")
f.close()

gx = open(output_path + alg + "_assignments_" + tensor + "_x.txt", 'w')
for i in range(T.shape[0]):
    gx.write(f"{i}\t{model._assignment[0][i]}\n")
gx.close()

gy = open(output_path + alg + "_assignments_" + tensor + "_y.txt", 'w')
for i in range(T.shape[1]):
    gy.write(f"{i}\t{model._assignment[1][i]}\n")
gy.close()
tsne_ari, pca_ari, km_ari, sc_ari = [], [], [], []
n_features_range = range(50, n_features_max + 1, step)
for n_features in n_features_range:
    samples = 300
    X, y = generate_data(samples, n_features)

    tsne_proj = TSNE(random_state=1).fit_transform(X)
    tsne_pred = KMeans(n_clusters=2, random_state=9).fit_predict(tsne_proj)

    pca = PCA(n_components=int(n_features / 2)).fit_transform(X)
    pca_pred = KMeans(n_clusters=2, n_init=1).fit_predict(pca)

    km_pred = KMeans(init='random', n_clusters=2, n_init=10, algorithm='full').fit_predict(X)
    sc_pred = SpectralClustering(n_clusters=2, affinity='nearest_neighbors',
                                 n_neighbors=20).fit_predict(X)

    t_a = ari(y, tsne_pred)
    p_a = ari(y, pca_pred)
    k_a = ari(y, km_pred)
    s_a = ari(y, sc_pred)
    tsne_ari.append(t_a)
    pca_ari.append(p_a)
    km_ari.append(k_a)
    sc_ari.append(s_a)

plt.plot(n_features_range, tsne_ari, linewidth=2, label="ARI of t-SNE")
plt.plot(n_features_range, pca_ari, linewidth=2, label="ARI of PCA")
plt.plot(n_features_range, km_ari, linewidth=2, label="ARI of k-means")
plt.plot(n_features_range, sc_ari, linewidth=2, label="ARI of SC")
plt.xlabel('n_features')
# In[326]:

acc(y_test, y_pred_init)

# In[327]:

nmi(y_test, y_pred_init, average_method='arithmetic')

# In[328]:

ari(y_test, y_pred_init)

# ##### K-means on the encoded images

# In[329]:

encoder.compile(optimizer='adam', loss='categorical_crossentropy', metrics=["acc"])

# In[330]:

y_train_hot = to_categorical(y_train, 10)
y_test_hot = to_categorical(y_test, 10)
num_classes_y = len(set(y))
num_classes_z = len(set(z))

f, dt = CreateOutputFile("DBLP4A", date=True)
output_path = "./output/_DBLP4A/" + dt[:10] + "_" + dt[11:13] + "." + dt[14:16] + "." + dt[17:19] + "/"
directory = os.path.dirname(output_path)
if not os.path.exists(directory):
    os.makedirs(directory)

model = CoClust(np.sum(np.shape(T)) * 10, optimization_strategy=alg, path=output_path)
model.fit(T)

tau = model.final_tau_
nmi_y = nmi(y, model.y_, average_method='arithmetic')
ari_y = ari(y, model.y_)
nmi_z = nmi(z, model.z_, average_method='arithmetic')
ari_z = ari(z, model.z_)
sparsity = 1 - (np.sum(T > 0) / np.product(T.shape))
f.write(f"{T.shape[0]},{T.shape[1]},{T.shape[2]},,{num_classes_y},{num_classes_z},,{tau[0]},{tau[1]},{tau[2]},,{nmi_y},{nmi_z},,{ari_y},{ari_z},{model._n_clusters[0]},{model._n_clusters[1]},{model._n_clusters[2]},{model.execution_time_},{sparsity},{alg}\n")
f.close()

gx = open(output_path + alg + "_assignments_x.txt", 'w')
gy = open(output_path + alg + "_assignments_y.txt", 'w')
gz = open(output_path + alg + "_assignments_z.txt", 'w')
for i in range(T.shape[0]):
    gx.write(f"{i}\t{model._assignment[0][i]}\n")
for i in range(T.shape[1]):
    gy.write(f"{i}\t{model._assignment[1][i]}\n")
A, counts = generate_cyclops(X, n, pi, None)
c = [0] * counts[0]
c += [1] * counts[1]
true_labels = c

ase = ASE(n_components=3)
X_hat = ase.fit_transform(A)

gclust_model = GCLUST(max_components=8)
est_labels = gclust_model.fit_predict(X_hat)

loglikelihoods = [np.sum(gclust_model.model_.score_samples(X_hat))]
combos = [None]
aris = [ari(c, est_labels)]
bic = [gclust_model.model_.bic(X_hat)]

unique_labels = np.unique(est_labels)
class_idx = np.array([np.where(est_labels == u)[0] for u in unique_labels])

for k in range(len(unique_labels)):
    for combo in list(combinations(np.unique(est_labels), k + 1)):
        combo = np.array(list(combo)).astype(int)
        combos.append(combo)

M = 10**8
condensed_func = lambda combo: for_loop_function(combo, X_hat, est_labels,
                                                 true_labels, gclust_model, M)
results = Parallel(n_jobs=15)(delayed(condensed_func)(combo) for combo in combos[1:])