def run_silhouette_metrics(adata, batch_key='orig.ident', cell_label='paper.cell.type', embed='X_pca'):
    """Compute silhouette-based integration metrics on an AnnData object.

    Parameters
    ----------
    adata : AnnData with the embedding in ``adata.obsm[embed]``.
    batch_key : obs column identifying the batch.
    cell_label : obs column identifying the cell-type label.
    embed : key of the embedding to evaluate.

    Returns
    -------
    tuple of (global silhouette, mean per-batch silhouette,
    isolated-label F1 score, isolated-label silhouette score).
    """
    # Global silhouette coefficient over cell-type labels.
    # BUG FIX: group_key was hard-coded to 'paper.cell.type', silently
    # ignoring a caller-supplied cell_label; the default preserves the
    # old behavior for existing callers.
    sil_global = me.silhouette(adata, group_key=cell_label, embed=embed,
                               metric='euclidean')
    # Silhouette coefficient per batch; keep only the mean score across
    # the per-(batch, label) table.
    _, sil_clus = me.silhouette_batch(adata, batch_key=batch_key,
                                      group_key=cell_label, embed=embed,
                                      metric='euclidean', verbose=False)
    sil_clus = sil_clus['silhouette_score'].mean()
    # Isolated-label scores: silhouette-based (cluster=False) and
    # cluster-based (cluster=True), each for the single most isolated label.
    il_score_sil = me.isolated_labels(adata, label_key=cell_label,
                                      batch_key=batch_key, cluster=False,
                                      n=1, verbose=False)
    il_score_clus = me.isolated_labels(adata, label_key=cell_label,
                                       batch_key=batch_key, cluster=True,
                                       n=1, verbose=False)
    return (sil_global, sil_clus, il_score_clus, il_score_sil)
def cost_func(args):
    """Evaluate clustering quality for a candidate scale parameter.

    ``args[0]`` is the scale vector handed to the feature function ``ff``;
    ``args[1:4]`` were unpacked as weights (w1..w3) in the original code
    but never used, so they are ignored here.
    NOTE(review): if a weighted fitness was intended, fold them into
    ``fit`` explicitly — confirm against the optimizer that builds args.

    Returns (mean silhouette, Davies-Bouldin, Calinski-Harabasz, fitness),
    where lower fitness is better.
    """
    partial_ff = partial(ff, s=args[0])
    res = pool.map(partial_ff, cnt)
    # Hoist the scaled feature matrix and the 0-based labels: the original
    # rebuilt both for every metric call.
    feats = scale(np.array(res))
    labels = np.array(Y) - 1
    si = metrics.silhouette(feats, labels)
    db = metrics.db(feats, labels)
    ch = metrics.ch(feats, labels)
    # Spread plus distance-from-perfect silhouette, scaled by DB (lower is
    # better) and divided by CH (higher is better); epsilon guards div-by-0.
    fit = (np.std(si) + np.mean(1. - si)) * db / (ch + 1e-12)
    return (np.mean(si), db, ch, fit)
# casestudy_iris_pca.py
# Case study: project the Iris data to 2-D with PCA, cluster it
# hierarchically, and report purity / Davies-Bouldin / silhouette.
import data_iris
import hierarchical
import matplotlib.pyplot as plt
import metrics
import numpy as np
import pca
import plot_data

# (1) load the data and reduce it to 2 dimensions with PCA
loader = data_iris.iris()
features, labels = loader.load()
pca_model = pca.pca()
pca_model.fit(features)
projected = pca_model.data_reduced_dimension(reduced_dim=2)
plot_data.plot_scatter_class(projected, labels, "Iris Data Projected to 2 Dimensions using PCA", "u0", "u1")

# (2) create the model and (3) fit it on the projected data
clusterer = hierarchical.hierarchical()
clusterer.fit(projected)
print("Time fit: {}".format(clusterer.time_fit))

# (4) evaluate the clustering at a fixed level and plot the results
# NOTE(review): level = -3 indexes clustersave from the end — confirm
# against the hierarchical model's save convention.
level = -3
print("Purity: {}".format(metrics.purity(clusterer.clustersave[level], labels)))
print("Davies-Bouldin: {}".format(metrics.davies_bouldin(projected, clusterer.clustersave[level])))
print("Silhouette: {}".format(metrics.silhouette(projected, clusterer.clustersave[level])))
clusterer.plot_cluster(nlevel=level, title="Hierarchical Clustering for Iris Dataset reduced to 2d", xlabel="u0", ylabel="u1")
metrics.plot_cluster_distribution(clusterer.clustersave[level], labels)
plt.show()
#!/usr/bin/python import sys import descritores import pylab import cPickle import metrics db = cPickle.load(open(sys.argv[1] + "/classes.txt")) #names = cPickle.load(open(sys.argv[1]+"/names.pkl")) names = db.keys() scales = pylab.loadtxt(sys.argv[2]) X = [ pylab.vstack(([db[f] for f in names], pylab.array([ pylab.log(descritores.bendenergy(sys.argv[1] + f, s)()) for f in names ]).T)).T for s in scales ] for x in X: s = metrics.silhouette(x[:, 1:], x[:, 0].astype(int) - 1) print pylab.mean(s), pylab.std(s) for n, x in zip(['nmbe_pso.pkl', 'nmbe_de.pkl', 'nmbe_sa.pkl'], X): with open(n, "wb") as f: cPickle.dump(dict(zip(names, x)), f)
# NOTE(review): this chunk begins inside an if/elif chain dispatching on
# `model`; the opening `if` branch (and the surrounding loop defining
# `i`, `j`, `dataset`, `axes`, `X`, `cm`) is outside the visible span.
    mod = kmeans.kmeans(ncluster=2, initialization='kmeans++')
elif model == "GaussianMM":
    mod = gaussianmm.gaussianmm(ncluster=2, initialization='kmeans++')
elif model == "DBSCAN":
    # minpts/epsilon appear tuned to this dataset's density — TODO confirm
    mod = dbscan.dbscan(minpts=5, epsilon=0.18)
# fit model
print("Model: {}".format(model))
if model == "DBSCAN":
    # DBSCAN takes only the data; the iterative models take extra args
    mod.fit(X[dataset])
else:
    # NOTE(review): extra fit args look like (max iterations, tolerance,
    # verbose) — confirm against the model classes' fit signatures.
    mod.fit(X[dataset], 100, 1e-5, False)
print("Time fit: {}".format(mod.time_fit))
# davies-bouldin and silhouette on the final saved clustering
db = metrics.davies_bouldin(X[dataset], mod.clustersave[-1])
s = metrics.silhouette(X[dataset], mod.clustersave[-1])
print("Davies-Bouldin: {}".format(db))
print("Silhouette: {}".format(s))
# map cluster ids into (0, 1] for the jet colormap; note the row-wise
# indexing — X[dataset] stores coordinates in rows, samples in columns
colors = (mod.clustersave[-1] + 1) / mod.ncluster
axes[i, j].scatter(X[dataset][0, :], X[dataset][1, :], color=cm.jet(colors), s=15)
axes[i, j].set_xticklabels([])
axes[i, j].set_yticklabels([])
# only the first grid row shows the model name in the subplot title
if i == 0:
    title = model + "\ndb:{:.2f} s:{:.2f} t:{:.3f}".format(
        db, s, mod.time_fit)
else:
    title = "db: {:.2f} s:{:.2f} t:{:.3f}".format(db, s, mod.time_fit)
axes[i, j].set_title(title)
# Trailing call from the preceding (unseen) section: plot the ground-truth
# labelling. The string argument is Russian for "True labels".
display_clusters(y, "Настоящие метки")


def display_metrics(n_clusters, metrics, title):
    """Line-plot a metric value against the number of clusters.

    n_clusters: sequence of cluster counts (x axis).
    metrics: sequence of metric values of the same length (y axis).
    title: plot title.
    """
    plt.figure(figsize=(8, 6))
    plt.grid(linestyle='--')
    plt.plot(n_clusters, metrics, linestyle='-', marker='.', color='r')
    plt.title(title)
    # Axis labels are Russian: "Number of clusters" / "Metric value".
    plt.xlabel("Количество кластеров")
    plt.ylabel("Значение метрики")
    plt.show()


# Sweep k = 1..10: fit k-means, record an internal (silhouette) and an
# external (adjusted Rand index) quality metric, and plot each clustering.
external_metrics = []
internal_metrics = []
for i in range(1, 11):
    kMean = KMeans(k=i)
    centroids = kMean.fit(X_norm)
    y_pred = kMean.predict(X_norm)
    if i == 1:
        # silhouette is undefined for a single cluster
        internal_metrics.append(0.0)
    else:
        internal_metrics.append(silhouette(X_norm, y_pred, centroids))
    external_metrics.append(adjusted_rand_index(y, y_pred))
    # plot title: "<i> clusters" (Russian)
    display_clusters(y_pred, str(i) + ' кластеров')
# Titles translate to "External metric" / "Internal metric".
display_metrics(range(1, 11), external_metrics, 'Внешняя метрика')
display_metrics(range(1, 11), internal_metrics, 'Внутренняя метрика')
def cost_func(args):
    """Cost of scale vector ``args``: median of |1 - silhouette|, so a
    perfectly separated clustering scores 0."""
    features = pool.map(partial(ff, s=args), cnt)
    sil = metrics.silhouette(scale(np.array(features)), np.array(Y) - 1)
    return np.median(np.abs(1. - sil))
#!/usr/bin/python import sys import descritores import pylab import cPickle import metrics db = cPickle.load(open(sys.argv[1]+"/classes.txt")) #names = cPickle.load(open(sys.argv[1]+"/names.pkl")) names = db.keys() scales = pylab.loadtxt(sys.argv[2]) X = [pylab.vstack(([db[f] for f in names],pylab.array([pylab.log(descritores.bendenergy(sys.argv[1]+f,s)()) for f in names]).T)).T for s in scales] for x in X: s = metrics.silhouette(x[:,1:],x[:,0].astype(int)-1) print pylab.mean(s),pylab.std(s) for n,x in zip(['nmbe_pso.pkl','nmbe_de.pkl','nmbe_sa.pkl'],X): with open(n,"wb") as f: cPickle.dump(dict(zip(names,x)),f)
#!/usr/bin/python import cPickle import sys import metrics import pylab color = ["b","r","g"] db = cPickle.load(open(sys.argv[1])) X,Y = db[:,1:],db[:,0].astype(int) print "CE = {0}".format(metrics.CE(X,Y-1)) print "PC = {0}".format(metrics.PC(X,Y-1)) print "CS = {0}".format(metrics.CS(X,Y-1)) print "DB = {0}".format(metrics.db(X,Y-1)) print "DI = {0}".format(metrics.di(X,Y)) print "Silhouette = {0}".format(metrics.silhouette(X,Y-1).mean()) for c,i in zip(color,range(1,4)): pylab.plot(X[pylab.where(Y == i)][:,0],X[pylab.where(Y == i)][:,1],"o"+c) pylab.show()