def run_silhouette_metrics(adata,
                           batch_key='orig.ident',
                           cell_label='paper.cell.type',
                           embed='X_pca'):
    """Compute silhouette-based integration metrics on an embedding.

    Parameters
    ----------
    adata : annotated data object with the embedding stored under ``embed``
        (presumably an AnnData — confirm against the ``me`` metrics module).
    batch_key : obs column naming the batch of each cell.
    cell_label : obs column naming the cell-type label of each cell.
    embed : key of the embedding to evaluate (default ``'X_pca'``).

    Returns
    -------
    tuple
        (global silhouette, mean per-batch silhouette,
         isolated-label clustering score, isolated-label silhouette score).
    """
    # Global silhouette coefficient over cell-type labels.
    # FIX: group_key was hard-coded to 'paper.cell.type', silently ignoring
    # the cell_label parameter; it now honours the caller's label column.
    sil_global = me.silhouette(adata,
                               group_key=cell_label,
                               embed=embed,
                               metric='euclidean')
    # Batch-aware silhouette, reported per cluster; reduce to a single mean.
    _, sil_clus = me.silhouette_batch(adata,
                                      batch_key=batch_key,
                                      group_key=cell_label,
                                      embed=embed,
                                      metric='euclidean',
                                      verbose=False)
    sil_clus = sil_clus['silhouette_score'].mean()
    # Isolated-label scores: silhouette-based (cluster=False) and
    # clustering-based (cluster=True) variants, with n=1 isolated batch.
    il_score_sil = me.isolated_labels(adata,
                                      label_key=cell_label,
                                      batch_key=batch_key,
                                      cluster=False,
                                      n=1,
                                      verbose=False)
    il_score_clus = me.isolated_labels(adata,
                                       label_key=cell_label,
                                       batch_key=batch_key,
                                       cluster=True,
                                       n=1,
                                       verbose=False)

    return (sil_global, sil_clus, il_score_clus, il_score_sil)
Example #2
0
    def cost_func(args):
        """Score one candidate scale parameter for the optimizer.

        ``args[0]`` is the descriptor scale forwarded to ``ff``. The
        remaining entries of ``args`` were unpacked into unused weight
        locals (w1, w2, w3) in the original code; that dead code is removed
        and the extra entries are simply ignored.

        Returns
        -------
        tuple
            (mean silhouette, Davies-Bouldin, Calinski-Harabasz, fitness).
        """
        partial_ff = partial(ff, s=args[0])
        # Evaluate the descriptor over all inputs in parallel, then scale
        # once instead of recomputing scale(np.array(res)) per metric.
        features = scale(np.array(pool.map(partial_ff, cnt)))
        labels = np.array(Y) - 1  # presumably 1-based class ids — TODO confirm
        si = metrics.silhouette(features, labels)
        db = metrics.db(features, labels)
        ch = metrics.ch(features, labels)
        # Composite fitness: low silhouette spread and high mean silhouette,
        # low Davies-Bouldin, and high Calinski-Harabasz all reduce `fit`.
        fit = (np.std(si) + np.mean(1. - si)) * db / (ch + 1e-12)

        return (np.mean(si), db, ch, fit)
Example #3
0
# casestudy_iris_pca.py

import data_iris
import hierarchical
import matplotlib.pyplot as plt
import metrics
import numpy as np
import pca
import plot_data

# (1) load the iris dataset
dataset = data_iris.iris()
X, class_label = dataset.load()
# project onto 2 principal components so the clusters can be visualised
model_pca = pca.pca()
model_pca.fit(X)
R = model_pca.data_reduced_dimension(reduced_dim=2)
plot_data.plot_scatter_class(R, class_label, "Iris Data Projected to 2 Dimensions using PCA", "u0", "u1")
# (2) create and (3) fit the hierarchical clustering model
model = hierarchical.hierarchical()
model.fit(R)
print(f"Time fit: {model.time_fit}")
# (4) report quality metrics at the chosen merge level
level = -3
print(f"Purity: {metrics.purity(model.clustersave[level], class_label)}")
print(f"Davies-Bouldin: {metrics.davies_bouldin(R, model.clustersave[level])}")
print(f"Silhouette: {metrics.silhouette(R, model.clustersave[level])}")
model.plot_cluster(nlevel=level, title="Hierarchical Clustering for Iris Dataset reduced to 2d", xlabel="u0", ylabel="u1")
metrics.plot_cluster_distribution(model.clustersave[level], class_label)
plt.show()
Example #4
0
#!/usr/bin/python

import sys
import descritores
import pylab
import cPickle
import metrics

db = cPickle.load(open(sys.argv[1] + "/classes.txt"))
#names = cPickle.load(open(sys.argv[1]+"/names.pkl"))
names = db.keys()
scales = pylab.loadtxt(sys.argv[2])

X = [
    pylab.vstack(([db[f] for f in names],
                  pylab.array([
                      pylab.log(descritores.bendenergy(sys.argv[1] + f, s)())
                      for f in names
                  ]).T)).T for s in scales
]

for x in X:
    s = metrics.silhouette(x[:, 1:], x[:, 0].astype(int) - 1)
    print pylab.mean(s), pylab.std(s)

for n, x in zip(['nmbe_pso.pkl', 'nmbe_de.pkl', 'nmbe_sa.pkl'], X):
    with open(n, "wb") as f:
        cPickle.dump(dict(zip(names, x)), f)
         mod = kmeans.kmeans(ncluster=2, initialization='kmeans++')
     elif model == "GaussianMM":
         mod = gaussianmm.gaussianmm(ncluster=2,
                                     initialization='kmeans++')
     elif model == "DBSCAN":
         mod = dbscan.dbscan(minpts=5, epsilon=0.18)
 # fit model
 print("Model: {}".format(model))
 if model == "DBSCAN":
     mod.fit(X[dataset])
 else:
     mod.fit(X[dataset], 100, 1e-5, False)
 print("Time fit: {}".format(mod.time_fit))
 # davies-bouldin and silhouette
 db = metrics.davies_bouldin(X[dataset], mod.clustersave[-1])
 s = metrics.silhouette(X[dataset], mod.clustersave[-1])
 print("Davies-Bouldin: {}".format(db))
 print("Silhouette: {}".format(s))
 colors = (mod.clustersave[-1] + 1) / mod.ncluster
 axes[i, j].scatter(X[dataset][0, :],
                    X[dataset][1, :],
                    color=cm.jet(colors),
                    s=15)
 axes[i, j].set_xticklabels([])
 axes[i, j].set_yticklabels([])
 if i == 0:
     title = model + "\ndb:{:.2f} s:{:.2f} t:{:.3f}".format(
         db, s, mod.time_fit)
 else:
     title = "db: {:.2f} s:{:.2f} t:{:.3f}".format(db, s, mod.time_fit)
 axes[i, j].set_title(title)
Example #6
0
# Show the ground-truth labelling for visual reference
# ("Настоящие метки" is Russian for "true labels").
display_clusters(y, "Настоящие метки")


def display_metrics(n_clusters, metrics, title):
    """Plot a clustering metric as a function of the number of clusters.

    n_clusters: sequence of cluster counts (x axis).
    metrics: sequence of metric values of the same length (y axis).
    title: figure title, shown verbatim.
    """
    plt.figure(figsize=(8, 6))
    plt.grid(linestyle='--')
    # red solid line with point markers
    plt.plot(n_clusters, metrics, linestyle='-', marker='.', color='r')
    plt.title(title)
    plt.xlabel("Количество кластеров")  # "Number of clusters"
    plt.ylabel("Значение метрики")  # "Metric value"
    plt.show()


# Sweep k = 1..10, collecting an external (label-based) and an internal
# (silhouette-based) quality score for each clustering.
external_metrics = []
internal_metrics = []
for k in range(1, 11):
    model = KMeans(k=k)
    centers = model.fit(X_norm)
    labels = model.predict(X_norm)
    # silhouette is undefined for a single cluster, so score k=1 as 0.0
    score = 0.0 if k == 1 else silhouette(X_norm, labels, centers)
    internal_metrics.append(score)
    external_metrics.append(adjusted_rand_index(y, labels))
    display_clusters(labels, str(k) + ' кластеров')

display_metrics(range(1, 11), external_metrics, 'Внешняя метрика')
display_metrics(range(1, 11), internal_metrics, 'Внутренняя метрика')
Example #7
0
 def cost_func(args):
     """Cost for the scale optimizer: median of |1 - silhouette|.

     ``args`` is the candidate scale handed to ``ff``; a lower cost means
     the resulting descriptors separate the (1-based) classes in Y better.
     """
     evaluate = partial(ff, s=args)
     responses = pool.map(evaluate, cnt)
     sil = metrics.silhouette(scale(np.array(responses)), np.array(Y) - 1)
     return np.median(np.abs(1. - sil))
Example #8
0
#!/usr/bin/python

import sys
import descritores
import pylab
import cPickle
import metrics

db = cPickle.load(open(sys.argv[1]+"/classes.txt"))
#names = cPickle.load(open(sys.argv[1]+"/names.pkl"))
names = db.keys()
scales = pylab.loadtxt(sys.argv[2])

X = [pylab.vstack(([db[f] for f in names],pylab.array([pylab.log(descritores.bendenergy(sys.argv[1]+f,s)()) for f in names]).T)).T for s in scales]

for x in X:
 s = metrics.silhouette(x[:,1:],x[:,0].astype(int)-1)
 print pylab.mean(s),pylab.std(s)

for n,x in zip(['nmbe_pso.pkl','nmbe_de.pkl','nmbe_sa.pkl'],X): 
 with open(n,"wb") as f:
  cPickle.dump(dict(zip(names,x)),f)


Example #9
0
#!/usr/bin/python

import cPickle
import sys
import metrics
import pylab

color = ["b","r","g"]
db = cPickle.load(open(sys.argv[1]))
X,Y = db[:,1:],db[:,0].astype(int)
print "CE = {0}".format(metrics.CE(X,Y-1))
print "PC = {0}".format(metrics.PC(X,Y-1))
print "CS = {0}".format(metrics.CS(X,Y-1))
print "DB = {0}".format(metrics.db(X,Y-1))
print "DI = {0}".format(metrics.di(X,Y))
print "Silhouette = {0}".format(metrics.silhouette(X,Y-1).mean())
for c,i in zip(color,range(1,4)):
 pylab.plot(X[pylab.where(Y == i)][:,0],X[pylab.where(Y == i)][:,1],"o"+c)
 
pylab.show()