def _print_consecutive_partition_scores(histograms, datatype, ari_label, ami_label):
    """Print ARI and AMI for each pair of consecutive histogram partitions.

    Both label sequences of a pair are truncated to 90% of the length of the
    shorter histogram before they are scored.

    histograms -- list of histogram partitions read from the stream
    datatype   -- second argument handed to tocluster() ("Text" or "Dict")
    ari_label  -- text printed in front of the Adjusted Rand Index
    ami_label  -- text printed in front of the Adjusted Mutual Info score
    """
    for previous, current in zip(histograms, histograms[1:]):
        truncatedlen = int(min(len(previous), len(current)) * 0.9)
        # Cluster each partition once and reuse the labels for both metrics.
        # NOTE(review): assumes tocluster() is deterministic - confirm.
        previous_labels = tocluster(previous, datatype)[:truncatedlen]
        current_labels = tocluster(current, datatype)[:truncatedlen]
        print("%s %s" % (ari_label, adjusted_rand_score(previous_labels, current_labels)))
        print("%s %s" % (ami_label, adjusted_mutual_info_score(previous_labels, current_labels)))


def adjusted_rand_index():
    """Print similarity scores between consecutive streamed histogram partitions.

    First the "TextHistogramPartition" stream (built from a fixed list of log
    files) is scored, then the "DictionaryHistogramPartition" stream read from
    Streaming_SetPartitionAnalytics.txt. Nothing is returned; all results are
    printed.
    """
    # Other stream sources that were tried here earlier:
    #   StreamAbsGen("USBWWAN_stream","USBWWAN")
    #   StreamAbsGen("file","StreamingData.txt")
    #   StreamAbsGen("Spark_Parquet","Spark_Streaming")
    #   StreamAbsGen("AsFer_Encoded_Strings","NeuronRain")
    #   StreamAbsGen("Socket_Streaming","localhost")
    inputf1 = Streaming_AbstractGenerator.StreamAbsGen("TextHistogramPartition",["/var/log/kern.log","/var/log/syslog","/var/log/ufw.log","/var/log/dmesg","/var/log/kern.log"])
    histograms = list(inputf1)
    # The headline score needs two partitions; skip it instead of raising
    # IndexError when the stream produced fewer.
    if len(histograms) >= 2:
        ari = adjusted_rand_score(tocluster(histograms[0], "Text")[:20000], tocluster(histograms[1], "Text")[:20000])
        print("Adjusted Rand Index of first two histogram set partitions(truncated): %s" % (ari,))
    _print_consecutive_partition_scores(
        histograms, "Text",
        "Adjusted Rand Index(truncated):",
        "Adjusted Mutual Info Index(truncated):")
    #################################################################
    inputf2 = Streaming_AbstractGenerator.StreamAbsGen("DictionaryHistogramPartition","Streaming_SetPartitionAnalytics.txt")
    histograms = list(inputf2)
    print("histograms: %s" % (histograms,))
    _print_consecutive_partition_scores(
        histograms, "Dict",
        "Adjusted Rand Index (truncated):",
        "Adjusted Mutual Info Index (truncated):")
def kmeans(input_file, n_clusters, Output):
    """Run K-Means on a comma-separated file, report the scores and plot the result.

    input_file -- path of the CSV file; after dropping its last column, column 0
                  is taken as the true value and the other columns as features
    n_clusters -- number of clusters to fit
    Output     -- path prefix; "kmeans_scores.txt" and "kmeans.png" are appended
                  to it for the score report and the scatter plot

    The scores are printed and written to the report file together with one
    "true value, cluster, iteration" line per sample. Nothing is returned.
    """
    lvltrace.lvltrace("LVLEntree dans kmeans unsupervised")
    ncol = tools.file_col_coma(input_file)
    data = np.loadtxt(input_file, delimiter=',', usecols=range(ncol-1))
    X = data[:, 1:]
    y = data[:, 0]
    sample_size = X.shape[0]
    k_means = cluster.KMeans(init='k-means++', n_clusters=n_clusters, n_init=10)
    k_means.fit(X)
    labels = k_means.labels_
    centers = k_means.cluster_centers_

    # Compute every score once and reuse it for the console and the report file.
    homogeneity = metrics.homogeneity_score(y, labels)
    completeness = metrics.completeness_score(y, labels)
    v_measure = metrics.v_measure_score(y, labels)
    adjusted_rand = metrics.adjusted_rand_score(y, labels)
    adjusted_mutual_info = metrics.adjusted_mutual_info_score(y, labels)
    silhouette = metrics.silhouette_score(X, labels, metric='euclidean', sample_size=sample_size)

    banner = "#########################################################################################################\n"
    print(banner)
    print("K-MEANS\n")
    print('homogeneity_score: %f' % homogeneity)
    print('completeness_score: %f' % completeness)
    print('v_measure_score: %f' % v_measure)
    print('adjusted_rand_score: %f' % adjusted_rand)
    print('adjusted_mutual_info_score: %f' % adjusted_mutual_info)
    print('silhouette_score: %f' % silhouette)
    print('\n')
    print(banner)

    results = Output + "kmeans_scores.txt"
    # The context manager closes the report even if a write fails.
    with open(results, "w") as report:
        report.write("K-Means Scores\n")
        report.write("Homogeneity Score: %f\n" % homogeneity)
        report.write("Completeness Score: %f\n" % completeness)
        report.write("V-Measure: %f\n" % v_measure)
        report.write("The adjusted Rand index: %f\n" % adjusted_rand)
        report.write("Adjusted Mutual Information: %f\n" % adjusted_mutual_info)
        report.write("Silhouette Score: %f\n" % silhouette)
        report.write("\n")
        report.write("True Value, Cluster numbers, Iteration\n")
        for iteration, (true_value, label) in enumerate(zip(y, labels), 1):
            report.write("%f, %f, %i\n" % (true_value, label, iteration))

    # Plot the first two feature columns coloured by cluster, plus the centres.
    fig, ax = plt.subplots()
    im = ax.scatter(X[:, 0], X[:, 1], c=labels, marker='.')
    for center in centers:
        # Only the keyword colour is given: combining it with a colour format
        # string makes matplotlib warn about a redundant colour definition.
        ax.plot(center[0], center[1], color='b', marker='x', markersize=6)
    fig.colorbar(im)
    plt.title("Number of clusters: %i" % n_clusters)
    save = Output + "kmeans.png"
    plt.savefig(save)
    lvltrace.lvltrace("LVLsortie dans kmeans unsupervised")
def compute_cluster_metrics_raw(chains, cells):
    """Score the cell assignment of every chain against the true cell types.

    chains -- iterable of dicts; each provides
              chain['state']['domains']['d1']['assignment'] and chain['scores']
    cells  -- table with 'type_id' and 'coarse' columns (used with groupby(),
              so presumably a pandas DataFrame - confirm against the caller).
              As before, its 'cluster' column is overwritten for every chain,
              so on return it holds the last chain's assignment.

    Returns a pandas DataFrame with one row per chain holding ARI / AMI /
    Jaccard against the fine and coarse ground truth, the 'n11' similarity
    statistic, per-cluster variances, the cluster count, the chain index, the
    last score of the chain and a snapshot of ``cells`` for that chain.
    """
    # The ground truth does not depend on the chain, so canonicalize it once.
    # NOTE(review): assumes canonicalize_assignment() has no side effects.
    canon_true_fine = irm.util.canonicalize_assignment(cells['type_id'])
    canon_true_coarse = irm.util.canonicalize_assignment(cells['coarse'])

    all_chains = []
    for chain_i, chain in enumerate(chains):
        sample_latent = chain['state']
        cell_assignment = np.array(sample_latent['domains']['d1']['assignment'])
        ca = irm.util.canonicalize_assignment(cell_assignment)

        cells['cluster'] = ca

        ss = rand.compute_similarity_stats(canon_true_fine, ca)

        chain_info = {
            'ari': metrics.adjusted_rand_score(canon_true_fine, ca),
            'ari_coarse': metrics.adjusted_rand_score(canon_true_coarse, ca),
            'ami': metrics.adjusted_mutual_info_score(canon_true_fine, ca),
            'ami_coarse': metrics.adjusted_mutual_info_score(canon_true_coarse, ca),
            'jaccard': rand.compute_jaccard(canon_true_fine, ca),
            'jaccard_coarse': rand.compute_jaccard(canon_true_coarse, ca),
            'n11': ss['n11'],
            # variance of every column within each cluster
            'vars': cells.groupby('cluster').var(),
            'cluster_n': len(np.unique(cells['cluster'])),
            'chain_i': chain_i,
            'score': chain['scores'][-1],
            # Store a snapshot: keeping a reference to ``cells`` itself would
            # make every row show the 'cluster' column of the last chain.
            'df': cells.copy(),
        }
        all_chains.append(chain_info)
    return pandas.DataFrame(all_chains)
Beispiel #4
0
  def results(self, algo, hasgnc = False, filename="_"):
    """Run ``algo`` with and without edge weights and compare both results.

    algo     -- community detection callable; invoked as
                algo(graph, weights=...) and as algo(graph)
    hasgnc   -- when true, run 10 rounds (otherwise 1), tag each vertex with
                its community and score both runs against the "comm" vertex
                attribute
    filename -- suffix appended to the names handed to
                write_vertex_clustering()

    Negative edge weights are replaced by 0 for the weighted run.
    Returns the list of per-round AMI gains (weighted minus unweighted) when
    ``hasgnc`` is true, otherwise None.
    """
    def to_clustering(vd):
      # Results exposing as_clustering() are converted; anything else is
      # assumed to be a flat clustering already and is used as is. Errors
      # raised inside as_clustering() are no longer swallowed.
      as_clustering = getattr(vd, "as_clustering", None)
      return vd if as_clustering is None else as_clustering()

    def tag_vertices(vc, attribute):
      # Store the community index of every vertex as a string attribute.
      for cc in range(len(vc)):
        for cci in vc[cc]:
          self.g.vs[cci][attribute] = str(cc)

    AMI_increase = []
    ARI_increase = []
    rounds = 10 if hasgnc else 1
    print("Runing  %s for %s rounds" % (algo.__name__, rounds))
    for _ in range(rounds):
      weights = [max(w, 0) for w in self.g.es["weight"]]
      vc = to_clustering(algo(self.g, weights = weights))
      self.write_vertex_clustering(vc, "_weighted%s" % filename)
      if hasgnc:
        tag_vertices(vc, "fastgreedy_withweight")

      vc = to_clustering(algo(self.g))
      self.write_vertex_clustering(vc, "_unweighted%s" % filename)
      if hasgnc:
        tag_vertices(vc, "fastgreedy_withoutweight")

        ami_weight = metrics.adjusted_mutual_info_score(self.g.vs["fastgreedy_withweight"], self.g.vs["comm"])
        ari_weight = metrics.adjusted_rand_score(self.g.vs["fastgreedy_withweight"], self.g.vs["comm"])
        ami_unweight = metrics.adjusted_mutual_info_score(self.g.vs["fastgreedy_withoutweight"], self.g.vs["comm"])
        ari_unweight = metrics.adjusted_rand_score(self.g.vs["fastgreedy_withoutweight"], self.g.vs["comm"])

        AMI_increase.append(ami_weight - ami_unweight)
        ARI_increase.append(ari_weight - ari_unweight)
    if hasgnc:
      print("Adjusted Mutual Information increases by %s" % (1.0 * sum(AMI_increase) / len(AMI_increase),))
      print("Adjusted Rand index increases by %s" % (1.0 * sum(ARI_increase) / len(ARI_increase),))
      print("-" * 20)
      return AMI_increase
Beispiel #5
0
    def tracking(self, d_start=gb.D_START_TRACKING, d_end=gb.D_END_TRACKING, path=""):
        """Run both predictors over [d_start, d_end) in one-hour chunks.

        For every chunk ``predict_fsp`` and then ``predict_ssp`` (with
        ``update=True``) are called and their times, axes and labels are
        accumulated. ``path`` is accepted but not used here.

        When ``gb.ARTIFICIAL`` is set, the true labels are read from the first
        signal reader and the function prints and returns
        ``(scores, series)`` where ``scores`` holds the (FSP, SSP) pairs for
        ARI, AMI, homogeneity, completeness and V-measure, and ``series`` is
        the six accumulated lists. Otherwise it returns ``(0., 0.)``.
        """
        print("\n --------- tracking ...")

        fsp_times, fsp_axes, fsp_labels = [], [], []
        ssp_times, ssp_axes, ssp_labels = [], [], []

        # Chunk length: one hour, expressed in milliseconds.
        step = datetime.timedelta(milliseconds=60 * 60 * 1000)
        cursor = d_start
        while cursor < d_end:
            # Shorten the final chunk so that it ends exactly at d_end.
            if cursor + step >= d_end:
                step = d_end - cursor
            chunk_end = cursor + step

            chunk_times, chunk_axes, chunk_labels = self.predict_fsp(d_start=cursor, d_end=chunk_end)
            fsp_times.extend(chunk_times)
            fsp_axes.extend(chunk_axes)
            fsp_labels.extend(chunk_labels)

            chunk_times, chunk_axes, chunk_labels = self.predict_ssp(d_start=cursor, d_end=chunk_end, update=True)
            ssp_times.extend(chunk_times)
            ssp_axes.extend(chunk_axes)
            ssp_labels.extend(chunk_labels)

            cursor = chunk_end

        # Without ground truth there is nothing to score.
        if not gb.ARTIFICIAL:
            return 0., 0.

        _, _, true_labels = self.sigReaders[0].getSignal(start=d_start, end=d_end, dated=gb.DATED,
                                                         get_modes=True)

        ari_pair = (adjusted_rand_score(true_labels, fsp_labels),
                    adjusted_rand_score(true_labels, ssp_labels))
        ami_pair = (adjusted_mutual_info_score(true_labels, fsp_labels),
                    adjusted_mutual_info_score(true_labels, ssp_labels))
        ho_fsp, com_fsp, vm_fsp = homogeneity_completeness_v_measure(true_labels, fsp_labels)
        ho_ssp, com_ssp, vm_ssp = homogeneity_completeness_v_measure(true_labels, ssp_labels)
        ho_pair = (ho_fsp, ho_ssp)
        com_pair = (com_fsp, com_ssp)
        vm_pair = (vm_fsp, vm_ssp)

        print("---------------------------------------------------")
        print("adjusted_rand_score \t (ari_fps, ari_sps)", ari_pair)
        print("adjusted_mutual_info \t (ami_fps, ami_sps)", ami_pair)
        print("homogeneity \t (ho_fps, ho_sps)", ho_pair)
        print("completeness \t (com_fps, com_sps)", com_pair)
        print("v_measure \t (vm_fps, vm_sps)", vm_pair)

        scores = (ari_pair, ami_pair, ho_pair, com_pair, vm_pair)
        series = (fsp_times, fsp_axes, fsp_labels, ssp_times, ssp_axes, ssp_labels)
        return scores, series
Beispiel #6
0
def kmeans_setup(data):
    """Fit and return a KMeans estimator configured from module-level settings.

    Reads the globals ``pca_f``, ``num_clusters``, ``num_init``,
    ``num_iterations`` and ``debug``. With ``pca_f == 1`` the centres are
    initialised from the PCA components of ``data``, otherwise 'k-means++' is
    used. With ``debug == True`` one line of scores against the global
    ``labels`` is printed.
    """
    if pca_f == 1:
        name = 'PCA'
        initializer = PCA(n_components = num_clusters).fit(data).components_
    else:
        name = initializer = 'k-means++'

    t0 = time()

    estimator = KMeans(init=initializer, n_clusters=num_clusters, n_init = num_init, max_iter = num_iterations)
    estimator.fit(data)

    if debug == True:
        predicted = estimator.labels_
        # The elapsed time is taken first so it covers the fit only.
        row = (
            name,
            time() - t0,
            estimator.inertia_,
            metrics.homogeneity_score(labels, predicted),
            metrics.completeness_score(labels, predicted),
            metrics.v_measure_score(labels, predicted),
            metrics.adjusted_rand_score(labels, predicted),
            metrics.adjusted_mutual_info_score(labels, predicted),
            metrics.silhouette_score(data, predicted, metric='euclidean', sample_size=300),
        )
        print('% 9s   %.2fs    %i   %.3f   %.3f   %.3f   %.3f   %.3f    %.3f' % row)
    return estimator
Beispiel #7
0
def affin_test():
    """Cluster a slice of the pickled training data with Affinity Propagation.

    Loads ``(x_train, y_train, t1)`` from 'traindata.pkl', keeps the 10% train
    part of a fixed-seed split, standardises the features, fits
    AffinityPropagation(preference=-50) and prints the cluster count and the
    clustering scores against the true labels. Nothing is returned.
    """
    # NOTE: unpickling can execute arbitrary code - only load trusted files.
    # The context manager closes the file even if loading fails.
    with open('traindata.pkl', 'rb') as savefile:
        x_train, y_train, _ = cPickle.load(savefile)

    # test_size=0.9: only one tenth of the samples is clustered.
    x_train, _, y_train, _ = cross_validation.train_test_split(
        x_train, y_train, test_size=0.9, random_state=42)

    labels_true = y_train

    x_train = StandardScaler().fit_transform(x_train)
    af = AffinityPropagation(preference=-50).fit(x_train)
    cluster_centers_indices = af.cluster_centers_indices_
    labels = af.labels_

    n_clusters_ = len(cluster_centers_indices)

    print('Estimated number of clusters: %d' % n_clusters_)
    print("Homogeneity: %0.3f" % metrics.homogeneity_score(labels_true, labels))
    print("Completeness: %0.3f" % metrics.completeness_score(labels_true, labels))
    print("V-measure: %0.3f" % metrics.v_measure_score(labels_true, labels))
    print("Adjusted Rand Index: %0.3f"
          % metrics.adjusted_rand_score(labels_true, labels))
    print("Adjusted Mutual Information: %0.3f"
          % metrics.adjusted_mutual_info_score(labels_true, labels))
    print("Silhouette Coefficient: %0.3f"
          % metrics.silhouette_score(x_train, labels, metric='sqeuclidean'))
 def evaluate(self):
     """Print ARI, AMI and NMI of ``self.pred`` against ``self.labels``.

     Each score is rounded to four decimals before it is printed.
     """
     scorers = (
         ("Adjusted Rand index:", metrics.adjusted_rand_score),
         ("Adjusted Mutual Information:", metrics.adjusted_mutual_info_score),
         ("Normalized Mutual Information:", metrics.normalized_mutual_info_score),
     )
     # All scores are computed before anything is printed.
     rounded = [(caption, round(scorer(self.labels, self.pred), 4))
                for caption, scorer in scorers]
     for caption, value in rounded:
         print(caption, "%.4f" % value)
Beispiel #9
0
def predictAffinityPropagation(X, labels_true):
    """Fit Affinity Propagation on ``X``, print its scores and plot the clusters.

    X           -- sample matrix; columns 0 and 1 are used for the plot
    labels_true -- ground-truth labels the clustering is scored against

    Nothing is returned; the figure is displayed with ``plt.show()``.
    """
    model = AffinityPropagation(preference=-50).fit(X)
    cluster_centers_indices = model.cluster_centers_indices_
    labels = model.labels_
    n_clusters_ = len(cluster_centers_indices)

    print('Estimated number of clusters: %d' % n_clusters_)
    print("Homogeneity: %0.3f" % metrics.homogeneity_score(labels_true, labels))
    print("Completeness: %0.3f" % metrics.completeness_score(labels_true, labels))
    print("V-measure: %0.3f" % metrics.v_measure_score(labels_true, labels))
    print("Adjusted Rand Index: %0.3f"
          % metrics.adjusted_rand_score(labels_true, labels))
    print("Adjusted Mutual Information: %0.3f"
          % metrics.adjusted_mutual_info_score(labels_true, labels))
    print("Silhouette Coefficient: %0.3f"
          % metrics.silhouette_score(X, labels, metric='sqeuclidean'))

    # Start from a clean figure number 1.
    plt.close('all')
    plt.figure(1)
    plt.clf()

    palette = cycle('bgrcmykbgrcmykbgrcmykbgrcmyk')
    for k, col in zip(range(n_clusters_), palette):
        in_cluster = labels == k
        exemplar = X[cluster_centers_indices[k]]
        # members as dots, exemplar as a large circle
        plt.plot(X[in_cluster, 0], X[in_cluster, 1], col + '.')
        plt.plot(exemplar[0], exemplar[1], 'o', markerfacecolor=col,
                 markeredgecolor='k', markersize=14)
        # one line from the exemplar to each member
        for point in X[in_cluster]:
            plt.plot([exemplar[0], point[0]], [exemplar[1], point[1]], col)

    plt.title('Estimated number of clusters: %d' % n_clusters_)
    plt.show()
def bench_k_means(estimator, name, data, sample_size, labels,postIds):
    """Reduce ``data`` with LSA, fit ``estimator`` and export the clusters.

    estimator   -- clustering estimator exposing fit(), inertia_ and labels_
    name        -- label printed at the start of the score line
    data        -- feature matrix; converted to CSR before the reduction
    sample_size -- accepted but not used by this function
    labels      -- ground-truth labels for the printed scores
    postIds     -- ids passed to extractCluster() together with the labels

    The printed time covers the LSA step as well as the fit.
    """
    sparse_data = sparse.csr_matrix(data)
    print("Performing dimensionality reduction using LSA")
    t0 = time()
    svd = TruncatedSVD(500)

    reduced = svd.fit_transform(sparse_data)
    reduced = Normalizer(copy=False).fit_transform(reduced)

    print("done in %fs" % (time() - t0))
    print()

    estimator.fit(reduced)
    predicted = estimator.labels_
    row = (
        name,
        time() - t0,
        estimator.inertia_,
        metrics.homogeneity_score(labels, predicted),
        metrics.completeness_score(labels, predicted),
        metrics.v_measure_score(labels, predicted),
        metrics.adjusted_rand_score(labels, predicted),
        metrics.adjusted_mutual_info_score(labels, predicted),
    )
    print('% 9s   %.2fs    %i   %.3f   %.3f   %.3f   %.3f   %.3f ' % row)

    print("Parsing USer File:")
    parseUserFile()
    print("extracting User File:")
    clusterDict = extractCluster(postIds, estimator.labels_)
    print("writing Cluster Data to File")
    writeCluterToFile(clusterDict)
def bench_k_means(estimator, name, data, target_labels, sample_size):
  """Fit a K-Means estimator and print one tab-separated line of scores.

  The line holds: name, elapsed seconds, inertia, homogeneity, completeness,
  V-measure, adjusted Rand index, adjusted mutual information, silhouette
  coefficient and the train accuracy.

  ARGS
    estimator: K-Means clustering algorithm <sklearn.cluster.KMeans>
    name: estimator name <str>
    data: array-like or sparse matrix, shape=(n_samples, n_features)
    target_labels: labels of data points <number array>
    sample_size: size of the sample used for the Silhouette Coefficient <int>
  """
  t0 = time()
  estimator.fit(data)
  predicted = estimator.labels_

  # Only the third returned value (the train accuracy) is reported.
  train_accuracy = compute_residuals_and_rsquared(predicted, target_labels)[2]

  row = (
      name,
      time() - t0,
      estimator.inertia_,
      metrics.homogeneity_score(target_labels, predicted),
      metrics.completeness_score(target_labels, predicted),
      metrics.v_measure_score(target_labels, predicted),
      metrics.adjusted_rand_score(target_labels, predicted),
      metrics.adjusted_mutual_info_score(target_labels, predicted),
      metrics.silhouette_score(data, predicted, metric='euclidean', sample_size=sample_size),
      train_accuracy,
  )
  print('% 9s\t%.2fs\t%i\t%.3f\t%.3f\t%.3f\t%.3f\t%.3f\t%.3f\t%.3f' % row)
Beispiel #12
0
def my_clustering(X, y, n_clusters, pca):
    """Cluster ``X`` with K-Means and score the result against ``y``.

    After scoring, ``show_images(n_clusters, model, pca)`` is called.

    Returns ``[ARI, AMI, V-measure, silhouette coefficient]``; the silhouette
    coefficient is computed over all samples.
    """
    from sklearn.cluster import KMeans
    from sklearn import metrics

    model = KMeans(n_clusters)
    model.fit(X)
    predicted = model.labels_

    scores = [
        metrics.adjusted_rand_score(y, predicted),
        metrics.adjusted_mutual_info_score(y, predicted),
        metrics.v_measure_score(y, predicted),
        metrics.silhouette_score(X, predicted),
    ]

    show_images(n_clusters, model, pca)

    return scores
Beispiel #13
0
def bench_k_means(estimator, data, labels):
    """Fit ``estimator`` on ``data``, print its clustering scores and return them.

    estimator -- clustering estimator exposing fit() and labels_
    data      -- samples to cluster
    labels    -- ground-truth labels the clustering is scored against

    Returns ``[homogeneity, completeness, v_measure, adjusted_rand,
    adjusted_mutual_info, silhouette]``.
    """
    t0 = time()
    estimator.fit(data)
    print("time to fit: {:.5}".format(time() - t0))

    predicted = estimator.labels_
    homogenity = metrics.homogeneity_score(labels, predicted)
    completeness = metrics.completeness_score(labels, predicted)
    v_measure = metrics.v_measure_score(labels, predicted)
    print("homogenity {:.5}, completeness {:.5}, v_measure_score {:.5}".format(
        homogenity, completeness, v_measure)
    )

    adj_rand_score = metrics.adjusted_rand_score(labels, predicted)
    print("adjusted_rand_score {:.5}".format(adj_rand_score))

    adj_mutual_info_score = metrics.adjusted_mutual_info_score(labels, predicted)
    print("adjusted_mutual_info_score {:.5}".format(adj_mutual_info_score))

    # The silhouette needs all pairwise distances, so compute it once and
    # reuse the value for both the printout and the return value.
    silhouette_score = metrics.silhouette_score(
        data, predicted, metric='euclidean'
    )
    print("silhouette_score {:.5}".format(silhouette_score))

    return [
        homogenity, completeness, v_measure, adj_rand_score,
        adj_mutual_info_score, silhouette_score
    ]
Beispiel #14
0
def intersubjectconsensus():
    """Compute inter-subjects clustering consensus.

    For every cluster number from 2 to 60 the merged 4-D cluster image
    'merged_cluster_<n>.nii.gz' is loaded, the voxels inside the mask are
    extracted for each subject (last axis) and the adjusted mutual information
    between every pair of subjects is written to 'consensus_<n>.csv'.
    """
    base_dir = r'/nfs/h1/workingshop/huanglijie/uni_mul_analysis'
    db_dir = os.path.join(base_dir, 'multivariate', 'detection', 'mvpcluster')

    n_clusters = 60

    mask_file = os.path.join(base_dir, 'multivariate', 'detection',
                             'mask.nii.gz')
    mask = nib.load(mask_file).get_data()
    # The mask never changes, so locate its voxels once instead of once per
    # subject pair.
    in_mask = np.nonzero(mask)

    for n in range(2, n_clusters + 1):
        merged_file = os.path.join(db_dir, 'merged_cluster_'+str(n)+'.nii.gz')
        merged_data = nib.load(merged_file).get_data()
        n_subjs = merged_data.shape[3]
        # One label vector per subject, extracted once.
        subj_labels = [merged_data[..., i][in_mask] for i in range(n_subjs)]
        mtx = np.zeros((n_subjs, n_subjs))
        for i in range(n_subjs):
            # The score is symmetric in its two arguments, so only the upper
            # triangle is computed and then mirrored.
            for j in range(i, n_subjs):
                score = metrics.adjusted_mutual_info_score(subj_labels[i],
                                                           subj_labels[j])
                mtx[i, j] = score
                mtx[j, i] = score
        outfile = os.path.join(db_dir, 'consensus_'+str(n)+'.csv')
        np.savetxt(outfile, mtx, delimiter=',')
Beispiel #15
0
def cluster(Z, K=4, algo='kmeans', labels_true=None):
    """Impute missing values in ``Z`` and cluster it with DBSCAN or K-Means.

    Z           -- data to cluster; passed through Imputer().fit_transform
    K           -- number of clusters (used by 'kmeans' only)
    algo        -- 'dbscan' or 'kmeans'
    labels_true -- optional ground-truth labels. The 'dbscan' branch used to
                   read an undefined ``labels_true`` name; the supervised
                   scores are now printed only when labels are supplied.

    Returns the fitted estimator for 'dbscan' and 'kmeans'. Any other ``algo``
    returns None, as before.
    """
    X = Imputer().fit_transform(Z)

    if algo == 'dbscan':
        db = DBSCAN(eps=0.3, min_samples=10).fit(X)
        labels = db.labels_

        # Number of clusters in labels, ignoring noise (-1) if present.
        n_clusters_ = len(set(labels)) - (1 if -1 in labels else 0)

        print('Estimated number of clusters: %d' % n_clusters_)
        if labels_true is not None:
            print("Homogeneity: %0.3f" % metrics.homogeneity_score(labels_true, labels))
            print("Completeness: %0.3f" % metrics.completeness_score(labels_true, labels))
            print("V-measure: %0.3f" % metrics.v_measure_score(labels_true, labels))
            print("Adjusted Rand Index: %0.3f" % metrics.adjusted_rand_score(labels_true, labels))
            print("Adjusted Mutual Information: %0.3f" % metrics.adjusted_mutual_info_score(labels_true, labels))
        print("Silhouette Coefficient: %0.3f" % metrics.silhouette_score(X, labels))
        return db

    elif algo == 'kmeans':
        km = KMeans(n_clusters=K)
        km.fit(X)
        print(km.labels_)
        return km
Beispiel #16
0
def cluster(model, uids):
    """Cluster the document vectors of ``uids`` with Affinity Propagation.

    model -- object whose ``docvecs`` mapping yields one vector per uid
    uids  -- ids to cluster; they are also used as the "true" labels for the
             printed scores

    The fitted estimator is pickled to 'data/af.pick' and the scores are
    printed. Nothing is returned.
    """
    X = [model.docvecs[uid] for uid in uids]
    labels_true = uids

    ##############################################################################
    # Compute Affinity Propagation
    af = AffinityPropagation(preference=-50).fit(X)
    # Pickle data is binary: write it in 'wb' mode and close the file
    # deterministically.
    with open('data/af.pick', 'wb') as pickle_file:
        pickle.dump(af, pickle_file)
    cluster_centers_indices = af.cluster_centers_indices_
    labels = af.labels_

    n_clusters_ = len(cluster_centers_indices)

    print('Estimated number of clusters: %d' % n_clusters_)
    print("Homogeneity: %0.3f" % metrics.homogeneity_score(labels_true, labels))
    print("Completeness: %0.3f" % metrics.completeness_score(labels_true, labels))
    print("V-measure: %0.3f" % metrics.v_measure_score(labels_true, labels))
    print("Adjusted Rand Index: %0.3f"
          % metrics.adjusted_rand_score(labels_true, labels))
    print("Adjusted Mutual Information: %0.3f"
          % metrics.adjusted_mutual_info_score(labels_true, labels))
    print("Silhouette Coefficient: %0.3f"
          % metrics.silhouette_score(X, labels, metric='sqeuclidean'))
def cluster_evaluation(D, y_true, n_clusters, eps=0.8, min_samples=10):
    """Cluster a distance matrix with Ward and print the evaluation scores.

    D          -- distance matrix; also used (as 'precomputed') for the
                  silhouette coefficient
    y_true     -- ground-truth labels
    n_clusters -- number of clusters requested from Ward
    eps, min_samples -- accepted but unused; the DBSCAN call that consumed
                  them has been replaced by Ward

    Nothing is returned; the scores are printed.
    """
    labels_true = y_true

    # Ward is fitted on the similarity matrix derived from the distances.
    similarity = 1 - D
    model = Ward(n_clusters=n_clusters).fit(similarity)
    labels = model.labels_

    # Count the distinct labels, leaving out the noise label -1 if present.
    found = set(labels)
    found.discard(-1)
    n_clusters_ = len(found)

    print('Number of clusters: %d' % n_clusters_)
    print('Homogeneity: %0.3f' % metrics.homogeneity_score(labels_true, labels))
    print('Completeness: %0.3f' % metrics.completeness_score(labels_true, labels))
    print('V-meassure: %0.3f' % metrics.v_measure_score(labels_true, labels))
    print('Adjusted Rand Index: %0.3f' % metrics.adjusted_rand_score(labels_true, labels))
    print('Adjusted Mutual Information: %0.3f' % metrics.adjusted_mutual_info_score(labels_true, labels))
    print('Silhouette Coefficient: %0.3f' % metrics.silhouette_score(D, labels, metric='precomputed'))
Beispiel #18
0
def run_clustering( clusterer, data, labels ):
    """
    Fit an already configured clustering algorithm on a dataset and log how
    long the fit took and how well the result matches the ground truth.

        clusterer: the clustering algorithm, from sklearn
        data:      array-like dataset input
        labels:    vector of ground-truth labels

    Nothing is returned; all results go to the root logger at INFO level.
    """
    # Only the fit itself is timed.
    started = time()
    clusterer.fit(data)
    runtime = time() - started

    predicted = clusterer.labels_
    report = [
        ("  |-        Execution time: %fs", runtime),
        ("  |-           Homogeneity: %0.3f", metrics.homogeneity_score(labels, predicted)),
        ("  |-          Completeness: %0.3f", metrics.completeness_score(labels, predicted)),
        ("  |-             V-measure: %0.3f", metrics.v_measure_score(labels, predicted)),
        ("  |-   Adjusted Rand-Index: %.3f", metrics.adjusted_rand_score(labels, predicted)),
        ("  |-  Adjusted Mutual Info: %.3f", metrics.adjusted_mutual_info_score(labels, predicted)),
    ]

    # Every metric is computed before the first line is logged.
    for template, value in report:
        logging.info(template % value)
Beispiel #19
0
def get_result(km, labels):
    """Score the labels of a fitted estimator against the ground truth.

    Returns the tuple ``(homogeneity, completeness, v_measure,
    adjusted_rand, adjusted_mutual_info)``.
    """
    scorers = (
        metrics.homogeneity_score,
        metrics.completeness_score,
        metrics.v_measure_score,
        metrics.adjusted_rand_score,
        metrics.adjusted_mutual_info_score,
    )
    return tuple(scorer(labels, km.labels_) for scorer in scorers)
def get_constant_height_labels(clustering, n_clusters=None):
    """Cut a linkage matrix into flat clusters and compare with the stored labels.

    clustering -- dict providing 'linkage', 'distance_df' (precomputed
                  distances), 'reorder_vec' and 'labels'
    n_clusters -- number of clusters to cut into. When None, every k from 2 up
                  to (but excluding) len(reorder_vec) // 3 is tried and the k
                  with the highest silhouette score is used; candidates for
                  which the silhouette score raises ValueError are skipped.

    Returns ``(labels, scores, MI)``: the reordered flat labels, the list of
    ``(k, silhouette score)`` pairs that were evaluated, and the adjusted
    mutual information between the new labels and ``clustering['labels']``.

    Raises ValueError when ``n_clusters`` is None and no candidate k could be
    scored.
    """
    N_variables = len(clustering['reorder_vec'])
    scores = []
    if n_clusters is None:
        for k_clusters in range(2,N_variables//3):
            labels = cut_tree(clustering['linkage'], n_clusters=k_clusters)
            try:
                score = silhouette_score(clustering['distance_df'],
                                         labels.ravel(), metric='precomputed')
            except ValueError:
                continue
            scores.append((k_clusters,score))
        if not scores:
            # Same exception type max() would raise on an empty list, but
            # with a message that names the actual problem.
            raise ValueError('no candidate number of clusters could be '
                             'scored for %d variables' % N_variables)
        best_k = max(scores, key=lambda x: x[1])[0]
        labels = cut_tree(clustering['linkage'], n_clusters=best_k)

    else:
        labels = cut_tree(clustering['linkage'], n_clusters=n_clusters)
        # cut_tree returns a column per cut; flatten it like the search
        # branch does so the scorer receives a 1-D label vector.
        score = silhouette_score(clustering['distance_df'],
                                 labels.ravel(), metric='precomputed')
        scores.append((n_clusters, score))
    labels = reorder_labels(labels.flatten(), clustering['linkage'])
    # comparison with the labels stored in the clustering
    MI = adjusted_mutual_info_score(labels, clustering['labels'])
    return labels, scores, MI
Beispiel #21
0
def compareClusters(labelsA, labelsB, method='ARI', alignFirst=True, useCommon=False):
    """Compare two cluster labelings that share the same index.

    labelsA, labelsB -- pandas Series of cluster labels; both must have the
                        same index and the same set of unique labels
    method           -- 'ARI' (adjusted Rand index), 'AMI' (adjusted mutual
                        information) or 'overlap'
    alignFirst       -- relabel B with alignClusters() before comparing
    useCommon        -- first restrict both Series to their common index

    Returns a scalar score for 'ARI' and 'AMI'. For 'overlap' it returns an
    array with one value per unique label: the mean of the fraction of A's
    members found in B and the fraction of B's members found in A.

    Raises ValueError for an unknown ``method``.
    """
    if useCommon:
        labelsA, labelsB = labelsA.align(labelsB, join='inner')
    assert len(labelsA.index) == len(labelsB.index)
    assert (labelsA.index == labelsB.index).sum() == len(labelsA.index)
    uLabels = np.unique(labelsA)
    assert (uLabels == np.unique(labelsB)).sum() == uLabels.shape[0]

    if alignFirst:
        alignedB = alignClusters(labelsA, labelsB)
    else:
        alignedB = labelsB

    if method == 'ARI':
        s = metrics.adjusted_rand_score(labelsA.values, alignedB.values)
    elif method == 'AMI':
        s = metrics.adjusted_mutual_info_score(labelsA.values, alignedB.values)
    elif method == 'overlap':
        s = np.zeros(uLabels.shape[0])
        for labi, lab in enumerate(uLabels):
            membersA = labelsA.index[(labelsA == lab).values]
            membersB = alignedB.index[(alignedB == lab).values]
            # float() keeps the ratios fractional under Python 2 as well.
            accA = np.sum([1 for cy in membersA if cy in membersB]) / float(len(membersA))
            accB = np.sum([1 for cy in membersB if cy in membersA]) / float(len(membersB))
            s[labi] = (accA + accB) / 2
    else:
        # Previously an unknown method fell through to ``return s`` and
        # failed with an UnboundLocalError.
        raise ValueError("unknown method %r; expected 'ARI', 'AMI' or 'overlap'" % (method,))

    return s
def drawlableCLuster(yyaxis,twitterlabel,cityname):
    """Cluster ``yyaxis`` with DBSCAN, save a scatter plot and return the scores.

    yyaxis       -- samples to cluster; columns 0 and 1 are plotted
    twitterlabel -- reference labels for the adjusted mutual information
    cityname     -- inserted into the name of the saved image

    The plot is saved as
    './clusterimage/hourcondimention/hour_dimention_twitterinfo_<cityname>.png'.

    Returns ``[n_clusters, silhouette coefficient, adjusted mutual information]``.
    """
    ##############################################################################
    # Compute DBSCAN
    X = yyaxis
    db = DBSCAN(eps=0.3, min_samples=10).fit(X)
    core_samples_mask = np.zeros_like(db.labels_, dtype=bool)
    core_samples_mask[db.core_sample_indices_] = True
    labels = db.labels_

    # Number of clusters in labels, ignoring noise if present.
    n_clusters_ = len(set(labels)) - (1 if -1 in labels else 0)

    # Both scores are printed and returned, so compute each of them once.
    silhouette = metrics.silhouette_score(X, labels)
    adjusted_mutual_info = metrics.adjusted_mutual_info_score(twitterlabel, labels)

    print('Estimated number of clusters: %d' % n_clusters_)
    print("Silhouette Coefficient: %0.3f"
          % silhouette)
    print("Adjusted Mutual Information: %0.3f"
          % adjusted_mutual_info)

    ##############################################################################
    # Plot result
    # NOTE(review): this draws on the current pyplot figure, which is neither
    # cleared nor closed here - confirm the caller resets it between cities.
    matplotlib.style.use('ggplot')
    # Black removed and is used for noise instead.
    unique_labels = set(labels)
    colors = plt.cm.Spectral(np.linspace(0, 1, len(unique_labels)))
    for k, col in zip(unique_labels, colors):
        if k == -1:
            # Black used for noise.
            col = 'k'

        class_member_mask = (labels == k)

        # core samples: large markers
        xy = X[class_member_mask & core_samples_mask]
        plt.plot(xy[:, 0], xy[:, 1], 'o', markerfacecolor=col,
                 markeredgecolor='k', markersize=14)

        # non-core samples: small markers
        xy = X[class_member_mask & ~core_samples_mask]
        plt.plot(xy[:, 0], xy[:, 1], 'o', markerfacecolor=col,
                 markeredgecolor='k', markersize=6)

    plt.title('Estimated number of clusters: %d' % n_clusters_)

    imgname = "./clusterimage/hourcondimention/" +"hour_dimention_twitterinfo_"+cityname+'.png'
    fig = plt.gcf()
    fig.set_size_inches(16.5, 12.5)
    fig.savefig(imgname)
    return [n_clusters_, silhouette, adjusted_mutual_info]
def plot_clustering_similarity(results, plot_dir=None, verbose=False, ext='png'):
    """Plot pairwise agreement between the clustering solutions in ``results.HCA``.

    Two square heatmaps are drawn, with one row/column per clustering:

    * cluster similarity -- adjusted Rand index above the diagonal and
      adjusted mutual information below it;
    * distance similarity -- correlation between the condensed forms of the
      ``'distance_df'`` matrices.

    Parameters
    ----------
    results : object
        Must expose ``results.HCA`` with a ``results`` mapping whose values
        provide ``'distance_df'`` and ``'labels'``, plus a ``dist_metric``
        attribute used in the plot title.
    plot_dir : str or None
        When given, both figures are saved there and then closed.
    verbose : bool
        When true, print the correlation between the two agreement measures.
    ext : str
        File extension for the saved figures.
    """
    HCA = results.HCA
    # get all clustering solutions
    clusterings = HCA.results.items()
    # Label rows/columns with the same short name (text after the last '-')
    # that the loop below uses for its .loc writes.  Labelling with the full
    # key made those writes append new rows/columns whenever a key contained
    # a '-', leaving the intended cells at zero.
    names = [k.split('-')[-1] for k, v in clusterings]
    n_clusterings = len(names)
    cluster_similarity = pd.DataFrame(np.zeros((n_clusterings, n_clusterings)),
                                      index=names,
                                      columns=names)
    distance_similarity = pd.DataFrame(np.zeros((n_clusterings, n_clusterings)),
                                       index=names,
                                       columns=names)
    for clustering1, clustering2 in combinations(clusterings, 2):
        name1 = clustering1[0].split('-')[-1]
        name2 = clustering2[0].split('-')[-1]
        # record similarity of distance_df
        dist_corr = np.corrcoef(squareform(clustering1[1]['distance_df']),
                                squareform(clustering2[1]['distance_df']))[1,0]
        distance_similarity.loc[name1, name2] = dist_corr
        distance_similarity.loc[name2, name1] = dist_corr
        # record similarity of clustering of dendrogram
        clusters1 = clustering1[1]['labels']
        clusters2 = clustering2[1]['labels']
        rand_score = adjusted_rand_score(clusters1, clusters2)
        MI_score = adjusted_mutual_info_score(clusters1, clusters2)
        # upper triangle holds the Rand score, lower triangle the MI score
        cluster_similarity.loc[name1, name2] = rand_score
        cluster_similarity.loc[name2, name1] = MI_score

    with sns.plotting_context(context='notebook', font_scale=1.4):
        clust_fig = plt.figure(figsize = (12,12))
        sns.heatmap(cluster_similarity, square=True)
        plt.title('Cluster Similarity: TRIL: Adjusted MI, TRIU: Adjusted Rand',
                  y=1.02)

        dist_fig = plt.figure(figsize = (12,12))
        sns.heatmap(distance_similarity, square=True)
        plt.title('Distance Similarity, metric: %s' % HCA.dist_metric,
                  y=1.02)

    if plot_dir is not None:
        save_figure(clust_fig, path.join(plot_dir,
                                   'cluster_similarity_across_measures.%s' % ext),
                    {'bbox_inches': 'tight'})
        save_figure(dist_fig, path.join(plot_dir,
                                   'distance_similarity_across_measures.%s' % ext),
                    {'bbox_inches': 'tight'})
        plt.close(clust_fig)
        plt.close(dist_fig)

    if verbose:
        # assess relationship between two measurements: the transpose moves
        # the MI scores (lower triangle) into upper-triangle positions so the
        # two vectors line up pair by pair
        upper = np.triu_indices_from(cluster_similarity, k=1)
        rand_scores = cluster_similarity.values[upper]
        MI_scores = cluster_similarity.T.values[upper]
        score_consistency = np.corrcoef(rand_scores, MI_scores)[0,1]
        print('Correlation between measures of cluster consistency: %.2f' \
              % score_consistency)
def evaluate(labels_true, labels):
    """Score a predicted labelling against the reference labelling.

    Returns a 5-tuple, in this order: homogeneity, completeness, V-measure,
    adjusted Rand index, adjusted mutual information.
    """
    scorers = (
        metrics.homogeneity_score,
        metrics.completeness_score,
        metrics.v_measure_score,
        metrics.adjusted_rand_score,
        metrics.adjusted_mutual_info_score,
    )
    return tuple(scorer(labels_true, labels) for scorer in scorers)
Beispiel #25
0
def print_cluster(clusterTrainClass, labels, clusterTestStory):
    """Print clustering quality scores for ``labels``.

    ``clusterTrainClass`` is the reference labelling the supervised scores
    are computed against; ``clusterTestStory`` is the sample data passed to
    the (unsupervised) silhouette score.
    """
    print("Homogeneity: %0.3f" % metrics.homogeneity_score(clusterTrainClass, labels))
    print("Completeness: %0.3f" % metrics.completeness_score(clusterTrainClass, labels))
    print("V-measure: %0.3f" % metrics.v_measure_score(clusterTrainClass, labels))
    print("Adjusted Rand Index: %0.3f" % metrics.adjusted_rand_score(clusterTrainClass, labels))
    print("Adjusted Mutual Information: %0.3f" % metrics.adjusted_mutual_info_score(clusterTrainClass, labels))
    # Single-argument print(...) behaves the same as a print statement on
    # Python 2 and as a function call on Python 3, so the block parses on both.
    print("Silhouette Coefficient:")
    print(metrics.silhouette_score(clusterTestStory, labels, metric='euclidean'))
Beispiel #26
0
def ami_score_op(s, s_hat):
    """Adjusted mutual information between ``s`` and ``s_hat``, per slice.

    Both inputs are indexed as ``[:, i, :]``; one score is produced for each
    index ``i`` along axis 1.  Labels are the arg-max over axis 0, and only
    positions where the maximum of ``s`` over axis 0 exceeds 0.9 are scored.
    (Presumably axis 0 enumerates classes/sources -- confirm with the caller.)

    Returns a 1-D ``float32`` array with one score per slice.
    """
    def _slice_score(idx):
        reference = s[:, idx, :]
        confident = reference.max(0) > 0.9
        expected = reference.argmax(0)
        predicted = s_hat[:, idx, :].argmax(0)
        return adjusted_mutual_info_score(expected[confident],
                                          predicted[confident])

    per_slice = [_slice_score(idx) for idx in range(s.shape[1])]
    return np.array(per_slice, dtype=np.float32)
Beispiel #27
0
def cluseval(label, truth):
    """Evaluate predicted cluster labels against the ground truth.

    Returns a list ``[adjusted_rand, adjusted_mutual_info, homogeneity,
    completeness, v_measure]``.
    """
    rand = metrics.adjusted_rand_score(truth, label)
    mutual = metrics.adjusted_mutual_info_score(truth, label)
    # The original local name had been mangled into `h**o`, which is not a
    # valid assignment target; use a plain identifier.
    homogeneity = metrics.homogeneity_score(truth, label)
    complete = metrics.completeness_score(truth, label)
    v = metrics.v_measure_score(truth, label)
    result = [rand, mutual, homogeneity, complete, v]
    return result
 def compare_clusters(self):
     """Return the adjusted mutual information between the original and the
     consensus clustering labels.

     Prints a hint and returns ``None`` when no consensus clustering has
     been stored on the instance yet.
     """
     if self.consensus_clustering is None:
         print("First run consensusCluster!")
         return None
     original = self.orig_clustering['labels']
     consensus = self.consensus_clustering['labels']
     return adjusted_mutual_info_score(original, consensus)
Beispiel #29
0
def main():
    """Command-line entry point: spectral clustering of a similarity matrix.

    Loads the matrix given by ``--sm`` with ``np.load``, optionally
    normalizes it, clusters it with ``SpectralClustering`` and writes one
    label per line to ``--output``.  When ``--true_labels`` is supplied,
    clustering quality scores against those labels are printed.
    """
    # Parse command line arguments
    parser = argparse.ArgumentParser(usage=__doc__,
            formatter_class=argparse.ArgumentDefaultsHelpFormatter,
            description='Perform spectral clustering.')
    parser.add_argument("--clusters", "-c", type=int, help='Number of clusters.')
    parser.add_argument("--knn", "-k", type=int, default=0,
            help='Number of nearest neighbors, 0 means all.')
    # --sm and --output are used unconditionally below, so let argparse
    # reject a missing value up front instead of failing inside np.load /
    # open() (the latter only after the clustering has already run).
    parser.add_argument("--sm", "-s", required=True,
            help='File containing similarity matrix')
    parser.add_argument("--iterations", "-i", type=int, default=10,
            help='Number of KMeans iterations.')
    parser.add_argument("--true_labels", "-t",
            help='File containing the true labels.')
    parser.add_argument("--output", "-o", required=True,
            help='Name of the file to write' + ' the labels to.')
    parser.add_argument("--normalize", "-n", action='store_true',
            help='Normalize each row so that the max value is one.')
    args = parser.parse_args()

    sm = np.load(args.sm)
    if args.normalize:
        # Out-of-place division: the in-place form raises a casting error
        # when the stored matrix has an integer dtype.
        sm = sm / sm.max(axis=1)[:, np.newaxis]
        # Ensure symmetric
        sm = (sm + sm.T) / 2

    if args.knn > 0:
        clusterer = SpectralClustering(n_clusters=args.clusters,
                affinity='nearest_neighbors', n_neighbors=args.knn,
                n_init=args.iterations)
    else:
        clusterer = SpectralClustering(n_clusters=args.clusters,
                affinity='precomputed',
                n_init=args.iterations)
    labels = clusterer.fit(sm).labels_

    with open(args.output, 'w') as fout:
        for l in labels:
            fout.write(str(l) + '\n')

    # Load the true labels.
    if args.true_labels:
        with open(args.true_labels, 'r') as fin:
            # Skip blank lines (e.g. a trailing newline) rather than crash
            # in int('').
            true_labels = [int(line.strip()) for line in fin if line.strip()]
        # Run the metrics.
        print("Homogeneity: %0.3f" % metrics.homogeneity_score(true_labels, labels))
        print("Completeness: %0.3f" % metrics.completeness_score(true_labels, labels))
        print("V-measure: %0.3f" % metrics.v_measure_score(true_labels, labels))
        print("Adjusted Rand Index: %0.3f"
                      % metrics.adjusted_rand_score(true_labels, labels))
        print("Adjusted Mutual Information: %0.3f"
                      % metrics.adjusted_mutual_info_score(true_labels, labels))
        # NOTE(review): the rows of the similarity matrix are passed as
        # feature vectors here -- confirm this is the intended silhouette input.
        print("Silhouette Coefficient: %0.3f"
                      % metrics.silhouette_score(sm, labels))
Beispiel #30
0
 def evaluateAllAlgorithms(self):
   """Print clustering scores of both stored labelings against ``self.labels_gt``.

   ``self.labels_db`` and ``self.labels_ap`` are each compared with the
   ground-truth labels; nothing is returned.
   """
   algs = [self.labels_db,self.labels_ap]
   # The original name of this list had been mangled into `t**s`, which is
   # not a valid identifier.
   titles = ['DBASE','AP']
   for title, alg_labels in zip(titles, algs):
     # %-formatting inside print(...) yields the same text on Python 2 and 3.
     print('Algorithm: %s' % title)
     print("\tHomogeneity: %0.3f" % metrics.homogeneity_score(self.labels_gt, alg_labels))
     print("\tCompleteness: %0.3f" % metrics.completeness_score(self.labels_gt, alg_labels))
     print("\tV-measure: %0.3f" % metrics.v_measure_score(self.labels_gt, alg_labels))
     print("\tAdjusted Rand Index: %0.3f"% metrics.adjusted_rand_score(self.labels_gt, alg_labels))
     print("\tAdjusted Mutual Information: %0.3f"% metrics.adjusted_mutual_info_score(self.labels_gt, alg_labels))
Beispiel #31
0
def ami_score(U, V):
    """Return the adjusted mutual information between labelings ``U`` and ``V``."""
    score = metrics.adjusted_mutual_info_score(U, V)
    return score
Beispiel #32
0
def calcMaxState(G_data, B_data, name, encoder):
    """Find the K-Means ``random_state`` that gives the highest AMI.

    K-Means is fitted on the encoder output once per candidate seed and the
    resulting labels are compared with the ground-truth communities using
    the adjusted mutual information.

    Parameters
    ----------
    G_data : graph
        Passed to ``get_clusters`` and, for datasets other than ``'karate'``
        and ``'email'``, read for its ``'value'`` node attribute.
    B_data
        Not used by this function.
    name : str
        Dataset name; selects the seed range and the ground-truth labels.
    encoder
        Object supporting ``.detach().numpy()`` (presumably a torch tensor
        of node embeddings -- confirm with the caller).

    Returns
    -------
    int
        The seed with the highest AMI; 0 if no seed scored above 0.
    """
    index = 0
    max_value = 0

    # This used to read `name not in 'email'`, a substring test that also
    # treated '', 'e', 'mail', ... as the email dataset.  Equality is meant.
    if name != 'email':
        iterations = 1001
    else:
        iterations = 300

    # The embedding matrix and the ground truth do not depend on the seed,
    # so they are built once instead of on every iteration.
    B_data_X = encoder.detach().numpy()

    # Finding truth values
    if name == 'karate':
        c_groups = [
            0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 1,
            0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1
        ]
    elif name == 'email':
        c_groups = [
            1, 1, 21, 21, 21, 25, 25, 14, 14, 14, 9, 14, 14, 26, 4, 17, 34,
            1, 1, 14, 9, 9, 9, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,
            11, 11, 11, 11, 11, 11, 11, 5, 34, 14, 14, 17, 17, 10, 10, 36,
            37, 5, 7, 4, 22, 22, 21, 21, 21, 21, 7, 7, 36, 21, 25, 4, 8,
            15, 15, 15, 37, 37, 9, 1, 1, 10, 10, 3, 3, 3, 29, 15, 36, 36,
            37, 1, 36, 34, 20, 20, 8, 15, 9, 4, 5, 4, 20, 16, 16, 16, 16,
            16, 38, 7, 7, 34, 38, 36, 8, 27, 8, 8, 8, 10, 10, 13, 13, 6,
            26, 10, 1, 36, 0, 13, 16, 16, 22, 6, 5, 4, 0, 28, 28, 4, 2, 13,
            13, 21, 21, 17, 17, 14, 36, 8, 40, 35, 15, 23, 0, 0, 7, 10, 37,
            27, 35, 35, 0, 0, 19, 19, 36, 14, 37, 24, 17, 13, 36, 4, 4, 13,
            13, 10, 4, 38, 32, 32, 4, 1, 0, 0, 0, 7, 7, 4, 15, 16, 40, 15,
            15, 15, 15, 0, 21, 21, 21, 21, 5, 4, 4, 4, 4, 4, 4, 4, 5, 5, 4,
            4, 22, 19, 19, 22, 34, 14, 0, 1, 17, 37, 1, 1, 1, 1, 1, 1, 1,
            1, 1, 1, 1, 10, 23, 0, 4, 19, 19, 19, 19, 19, 19, 19, 19, 19,
            19, 19, 19, 10, 14, 14, 1, 14, 7, 13, 20, 31, 40, 6, 4, 0, 8,
            9, 9, 10, 0, 10, 14, 14, 14, 14, 39, 17, 4, 28, 17, 17, 17, 4,
            4, 0, 0, 23, 4, 21, 36, 36, 0, 22, 21, 15, 37, 0, 4, 4, 4, 14,
            4, 7, 7, 1, 15, 15, 38, 26, 20, 20, 20, 21, 9, 1, 1, 1, 1, 1,
            1, 1, 1, 1, 1, 1, 10, 19, 7, 7, 17, 16, 14, 9, 9, 9, 8, 8, 13,
            39, 14, 10, 17, 17, 13, 13, 13, 13, 2, 1, 0, 0, 0, 0, 0, 0, 0,
            0, 0, 0, 0, 16, 16, 27, 8, 8, 14, 14, 14, 10, 14, 35, 37, 14,
            36, 10, 7, 20, 10, 16, 36, 36, 14, 8, 7, 7, 7, 7, 7, 7, 7, 7,
            7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 4, 9, 4, 0, 4, 16,
            38, 14, 14, 21, 26, 27, 28, 21, 4, 1, 1, 9, 10, 15, 4, 26, 14,
            35, 10, 34, 4, 4, 12, 17, 17, 14, 37, 37, 37, 34, 6, 13, 13,
            13, 13, 4, 14, 10, 10, 10, 3, 17, 17, 17, 1, 4, 14, 14, 6, 27,
            22, 21, 4, 4, 1, 34, 17, 30, 30, 4, 23, 14, 15, 1, 22, 12, 31,
            6, 15, 15, 8, 15, 8, 8, 1, 15, 22, 2, 3, 4, 10, 4, 14, 14, 25,
            6, 6, 40, 4, 36, 23, 14, 3, 14, 14, 14, 14, 14, 14, 14, 14, 14,
            31, 15, 15, 14, 0, 23, 35, 8, 4, 1, 1, 35, 23, 21, 2, 4, 4, 9,
            14, 4, 10, 25, 14, 14, 3, 21, 35, 4, 9, 15, 6, 9, 3, 15, 23, 4,
            4, 4, 11, 35, 10, 6, 15, 15, 15, 22, 2, 2, 14, 4, 3, 14, 27,
            31, 34, 4, 4, 19, 14, 14, 4, 4, 14, 14, 21, 4, 14, 4, 0, 4, 27,
            27, 17, 3, 15, 2, 4, 4, 21, 21, 11, 23, 11, 23, 17, 5, 36, 15,
            23, 23, 2, 19, 4, 36, 14, 1, 22, 1, 21, 34, 14, 13, 6, 4, 37,
            6, 24, 35, 6, 17, 16, 6, 4, 0, 21, 4, 26, 21, 4, 15, 7, 1, 20,
            19, 7, 21, 21, 21, 19, 38, 19, 16, 23, 6, 37, 25, 1, 22, 6, 14,
            1, 26, 8, 37, 4, 0, 17, 6, 17, 14, 16, 4, 32, 14, 15, 0, 23,
            21, 29, 14, 14, 1, 17, 26, 15, 0, 0, 0, 22, 34, 21, 6, 16, 4,
            15, 21, 0, 36, 4, 1, 1, 22, 14, 14, 30, 4, 9, 10, 4, 4, 14, 16,
            16, 15, 21, 0, 4, 15, 29, 24, 21, 14, 11, 11, 9, 13, 10, 31, 4,
            22, 14, 23, 1, 4, 9, 17, 27, 28, 22, 14, 20, 7, 23, 1, 6, 15,
            15, 23, 4, 20, 5, 36, 10, 21, 39, 41, 31, 17, 7, 21, 34, 1, 14,
            2, 18, 16, 27, 16, 38, 7, 38, 21, 1, 9, 15, 15, 15, 0, 6, 23,
            28, 11, 23, 34, 24, 4, 4, 4, 24, 23, 17, 10, 17, 1, 1, 15, 15,
            4, 21, 14, 14, 20, 28, 20, 22, 26, 3, 32, 4, 0, 21, 13, 4, 15,
            17, 5, 4, 14, 0, 9, 21, 14, 38, 4, 14, 31, 21, 14, 6, 4, 4, 6,
            17, 0, 4, 7, 16, 4, 4, 21, 1, 10, 3, 21, 4, 0, 1, 7, 17, 15,
            14, 0, 9, 32, 13, 5, 2, 21, 28, 21, 22, 22, 7, 7, 33, 0, 1, 15,
            4, 31, 30, 15, 11, 19, 21, 9, 21, 13, 21, 9, 32, 9, 32, 38, 9,
            38, 38, 14, 9, 10, 38, 10, 22, 21, 13, 21, 4, 0, 1, 1, 23, 0,
            5, 4, 4, 15, 14, 14, 13, 11, 1, 5, 5, 10, 23, 21, 14, 9, 20,
            10, 19, 19, 21, 17, 19, 19, 36, 17, 35, 16, 4, 16, 4, 6, 4, 41,
            6, 7, 23, 9, 23, 7, 6, 22, 36, 14, 15, 11, 35, 5, 14, 14, 15,
            4, 6, 4, 9, 19, 11, 4, 29, 14, 15, 15, 5, 32, 15, 14, 5, 9, 10,
            19, 13, 23, 12, 10, 21, 10, 35, 7, 22, 22, 22, 8, 21, 32, 4,
            21, 21, 6, 14, 11, 14, 15, 4, 21, 1, 6, 22
        ]
    else:
        c_attributes = nx.get_node_attributes(G_data, 'value')
        c_groups = list(c_attributes.values())

    X_gt = np.array(c_groups)

    for r_state in range(0, iterations):
        kmeans = KMeans(n_clusters=get_clusters(G_data, name),
                        init='k-means++',
                        random_state=r_state)
        kmeans.fit(B_data_X)

        X_ae = kmeans.labels_  # Calculated labels

        ami = metrics.adjusted_mutual_info_score(X_gt,
                                                 X_ae,
                                                 average_method='arithmetic')

        # Strict '>' keeps the earliest seed on ties.
        if (ami > max_value):
            index = r_state
            max_value = ami

        # Progress report every 100 seeds.
        if (r_state % 100 == 0):
            print("Index:{}\tMax AMI till now:{}".format(index, max_value))

    return index
Beispiel #33
0
import rcc
import pdb
import numpy as np
from sklearn.metrics import adjusted_mutual_info_score

# Load pendigits.txt: comma-separated integers per line, where the last
# field is the class label and the preceding fields are the features.
X = []
Y = []

with open('pendigits.txt', 'r') as f:
    for line in f:
        line_split = line.strip().replace(' ', '').split(',')
        if line_split == ['']:
            # Blank line (e.g. trailing newline at end of file): nothing to
            # parse, and int('') would raise.
            continue
        x = np.array([int(s) for s in line_split[:-1]])
        y = int(line_split[-1])
        X.append(x)
        Y.append(y)

X = np.array(X).astype(np.float32)
Y = np.array(Y)

# Cluster the samples and compare the result with the true labels.
clusterer = rcc.rcc_cluster(measure='cosine')
# The return value of fit() used to be bound to P and then immediately
# overwritten; only the fitted labels are needed.
clusterer.fit(X)
P = clusterer.labels_
print('AMI: {}'.format(adjusted_mutual_info_score(Y, P)))
Beispiel #34
0
# Fit the model on the training split (`fcm` is created outside this
# excerpt; presumably a fuzzy c-means estimator -- confirm).
fcm.fit(X_train)

# outputs
fcm_centers = fcm.centers  # the first is the 'Benign' centroid, the second is the 'attack' centroid
# Hard label per training sample: column index of the largest entry in fcm.u.
fcm_labels = fcm.u.argmax(axis=1)
probability = fcm.predict(X_test)

# NOTE(review): three column names are given, so `probability` is expected
# to have three columns with the last one being the predicted class -- confirm.
result_df = pd.DataFrame(data=probability, columns=[0, 1, 'pre_class'])
result_df['class'] = y_test
print(color.BOLD + "Result" + color.END)
print(result_df.head())

# Compare the predicted class with the true class.
print(color.BOLD + "\nScoring" + color.END)
h_score = homogeneity_score(result_df['class'], result_df['pre_class'])
ar_score = adjusted_rand_score(result_df['class'], result_df['pre_class'])
ami_socre = adjusted_mutual_info_score(result_df['class'],
                                       result_df['pre_class'])
print("Homogeneity_score : %.4f" % h_score)
print("Adjusted_rand_score : %.4f" % ar_score)
print("Adjusted_mutual_info_score : %.4f" % ami_socre)
print('Accuracy : %0.4f' %
      accuracy_score(result_df['class'], result_df['pre_class']))
# average=None yields one score per class; summing and dividing by 2 is an
# unweighted mean that assumes exactly two classes.
Precision = precision_score(result_df['class'],
                            result_df['pre_class'],
                            average=None)
Precision = sum(Precision) / 2
print('Precision : %0.4f' % Precision)
Recall = recall_score(result_df['class'], result_df['pre_class'], average=None)
Recall = sum(Recall) / 2
print('Recall : %0.4f' % Recall)
F1 = f1_score(result_df['class'], result_df['pre_class'], average=None)
F1 = sum(F1) / 2
Beispiel #35
0
# Flag the core samples of the fitted DBSCAN model (`db` and the mask array
# are created outside this excerpt).
core_samples_mask[db.core_sample_indices_] = True
labels = db.labels_

# Number of clusters in labels, ignoring noise if present.
n_clusters_ = len(set(labels)) - (1 if -1 in labels else 0)
# Noise points are the samples labelled -1.
n_noise_ = list(labels).count(-1)

print('Estimated number of clusters: %d' % n_clusters_)
print('Estimated number of noise points: %d' % n_noise_)
# Scores that compare the clustering with the known labels.
print("Homogeneity: %0.3f" % metrics.homogeneity_score(labels_true, labels))
print("Completeness: %0.3f" % metrics.completeness_score(labels_true, labels))
print("V-measure: %0.3f" % metrics.v_measure_score(labels_true, labels))
print("Adjusted Rand Index: %0.3f" %
      metrics.adjusted_rand_score(labels_true, labels))
print("Adjusted Mutual Information: %0.3f" %
      metrics.adjusted_mutual_info_score(
          labels_true, labels, average_method='arithmetic'))
# The silhouette coefficient needs only the data and the predicted labels.
print("Silhouette Coefficient: %0.3f" % metrics.silhouette_score(X, labels))

# #############################################################################
# Plot result
import matplotlib.pyplot as plt

# Black removed and is used for noise instead.
unique_labels = set(labels)
# One colour per distinct label, evenly spaced over the Spectral colormap.
colors = [
    plt.cm.Spectral(each) for each in np.linspace(0, 1, len(unique_labels))
]
for k, col in zip(unique_labels, colors):
    if k == -1:
        # Black used for noise.
        col = [0, 0, 0, 1]
Beispiel #36
0
# Generate four Gaussian blobs in 2-D and deliberately cluster them with
# k=2, then report clustering quality with and without the true labels.
X, y = make_blobs(n_samples=500,
                  n_features=2,
                  centers=4,
                  cluster_std=1,
                  center_box=(-10.0, 10.0),
                  shuffle=True,
                  random_state=1)
plot_data(X, y)

kmeans_model = cluster.KMeans(n_clusters=2, random_state=1)
kmeans_model.fit(X)
# Bare attribute accesses: they only display something in an interactive
# session and are no-ops when run as a script.
kmeans_model.cluster_centers_
kmeans_model.labels_

#metrics when target labels are not known
silhouette_avg = metrics.silhouette_score(X,
                                          kmeans_model.labels_,
                                          metric='euclidean')
print(silhouette_avg)
silhouette_samples = metrics.silhouette_samples(X,
                                                kmeans_model.labels_,
                                                metric='euclidean')
print(silhouette_samples)
# scikit-learn renamed the misspelled `calinski_harabaz_score` to
# `calinski_harabasz_score` and later removed the old name; prefer the
# current name and fall back to the old one on older installations.
_calinski_harabasz = getattr(metrics, 'calinski_harabasz_score', None)
if _calinski_harabasz is None:
    _calinski_harabasz = metrics.calinski_harabaz_score
ch_score = _calinski_harabasz(X, kmeans_model.labels_)
print(ch_score)

#metrics when target labels are known
print(metrics.adjusted_rand_score(y, kmeans_model.labels_))
print(metrics.adjusted_mutual_info_score(y, kmeans_model.labels_))
Beispiel #37
0
def ami(X1, X2):
    """Return the adjusted mutual information between labelings ``X1`` and ``X2``."""
    score = adjusted_mutual_info_score(X1, X2)
    return score
Beispiel #38
0
                        elist.append([v_o, v_t])
            vlist.remove(v_o)
    g.add_edge_list(elist)

    state = gt.minimize_blockmodel_dl(g, deg_corr=False)
    #write_classes('sim/sim_SBM.tsv', g, state)
    #state.draw(output="sim/sim_SBM.png")
    blocks = state.get_blocks()
    preds = get_blocksCC(g, blocks)
    nmi_sbm.append([normalized_mutual_info_score(g.vp.RealClass.a, blocks.a), normalized_mutual_info_score(g.vp.RealClass.a, list(preds))])
    print("  NMI_SBM = %.5f\tNMI_SBMCC = %.5f" % (nmi_sbm[i][0], nmi_sbm[i][1]), flush=True)
    print("  NMI_SBMavg = %.5f\tNMI_SBMCCavg = %.5f" % (np.mean(np.asarray(nmi_sbm), 0)[0], np.mean(np.asarray(nmi_sbm), 0)[1]), flush=True)
    if i > 2:
        print("  NMI_SBMstd = %.5f\tNMI_SBMCCstd = %.5f" % (np.std(np.asarray(nmi_sbm), 0, ddof=1)[0], np.std(np.asarray(nmi_sbm), 0, ddof=1)[1]), flush=True)
    print(flush=True)
    ami_sbm.append([adjusted_mutual_info_score(g.vp.RealClass.a, blocks.a), adjusted_mutual_info_score(g.vp.RealClass.a, list(preds))])
    print("  AMI_SBM = %.5f\tAMI_SBMCC = %.5f" % (ami_sbm[i][0], ami_sbm[i][1]), flush=True)
    print("  AMI_SBMavg = %.5f\tAMI_SBMCCavg = %.5f" % (np.mean(np.asarray(ami_sbm), 0)[0], np.mean(np.asarray(ami_sbm), 0)[1]), flush=True)
    if i > 2:
        print("  AMI_SBMstd = %.5f\tAMI_SBMCCstd = %.5f" % (np.std(np.asarray(ami_sbm), 0, ddof=1)[0], np.std(np.asarray(ami_sbm), 0, ddof=1)[1]), flush=True)
    print(flush=True)
    ar_sbm.append([adjusted_rand_score(g.vp.RealClass.a, blocks.a), adjusted_rand_score(g.vp.RealClass.a, list(preds))])
    print("  AR_SBM = %.5f\tAR_SBMCC = %.5f" % (ar_sbm[i][0], ar_sbm[i][1]), flush=True)
    print("  AR_SBMavg = %.5f\tAR_SBMCCavg = %.5f" % (np.mean(np.asarray(ar_sbm), 0)[0], np.mean(np.asarray(ar_sbm), 0)[1]), flush=True)
    if i > 2:
        print("  AR_SBMstd = %.5f\tAR_SBMCCstd = %.5f" % (np.std(np.asarray(ar_sbm), 0, ddof=1)[0], np.std(np.asarray(ar_sbm), 0, ddof=1)[1]), flush=True)
    print(flush=True)

    state_nested = gt.minimize_nested_blockmodel_dl(g, deg_corr=False)
    #write_classes_hierarchical('sim/sim_NSBM.tsv', g, state_nested)
    state_nested_l0 = state_nested.get_levels()[0]
Beispiel #39
0
        continue

    # Split 20% test - 80% training
    #train, test = train_test_split(dmn, test_size=0.1, stratify=c.loc[:,l])

    # What we try to predict is the count of the activity l (i.e., target)
    y_train = c.loc[:, l]

    tree = DecisionTreeClassifier(max_leaf_nodes=max_leaf_nodes)
    tree.fit(dmn, y_train)
    prediction = tree.predict(dmn)

    f1 = f1_score(prediction, y_train, average='micro')
    mutual_info = round(
        adjusted_mutual_info_score(prediction,
                                   y_train,
                                   average_method='arithmetic'), 3)
    tree_score = tree.score(dmn, y_train)
    if f1 > 0.95 and mutual_info > 0.2 and tree_score > 0.95:
        print()
        print(l)
        print('tree1:::::')
        print('tree score:', tree_score)
        print('f1_score:', f1)
        print('adjusted_mutual_info_score:', mutual_info)
        print(
            list(
                reversed(
                    sorted(zip(tree.feature_importances_.round(2),
                               dmn.columns)))))
        print(prediction)
    def kmeans_model(self, test_size, random_state,show=None):
        """Fit K-Means on a train split, optionally plot it, and print scores.

        ``self.data`` is standardized and split together with ``self.target``
        and ``self.images``.  K-Means is fitted on the training part with one
        cluster per distinct training target.  The test part is then
        predicted and compared with the true targets.

        Parameters
        ----------
        test_size, random_state
            Forwarded to ``train_test_split``; ``random_state`` also seeds
            the K-Means model.
        show
            When truthy, display the cluster-centre images and the Isomap /
            PCA scatter plots of the training data.

        Returns
        -------
        None
            All results are printed (and optionally plotted).
        """
        # pre-process the data
        standardized_data = scale(self.data)

        # splitting the data into training and testing sets
        # x is the data you are testing : y is the target values of the corresponding data
        x_train, x_test, y_train, y_test, images_train, images_test = train_test_split(standardized_data, self.target,
                                                                                       self.images,
                                                                                       test_size=test_size,
                                                                                       random_state=random_state)
        # gets the number of training samples and features
        n_samples, n_features = x_train.shape

        # print out the number of samples and features
        print("# of training samples: ", n_samples)
        print("# of training features: ", n_features)

        # n_digits is the amount of unique targets
        n_digits = len(np.unique(y_train))

        # create the KMeans model.
        clf = cluster.KMeans(init='k-means++', n_clusters=n_digits, random_state=random_state)

        # fit the x_train data to the model
        clf.fit(x_train)

        if show:
            #  create the figure with a size of 8x4 inches
            fig = plt.figure(figsize=(8, 4))

            # Add title
            fig.suptitle('Cluster Center Images', fontsize=14, fontweight='bold')

            # The grid has 2x5 cells; never index past the centres that
            # actually exist (there is one centre per distinct target).
            for i in range(min(n_digits, 10)):
                # Initialize subplots in a grid of 2X5, at i+1th position
                ax = fig.add_subplot(2, 5, 1 + i)
                # Display images (assumes 64 features, i.e. 8x8 images)
                ax.imshow(clf.cluster_centers_[i].reshape((8, 8)), cmap=plt.cm.binary, interpolation="nearest")
                # Don't show the axes
                plt.axis('off')

            # Show the plot
            plt.show()

        # predict the labels for x_test
        y_pred = clf.predict(x_test)

        # print out the first 50 predicted and test values
        print("Predicted Values:\n",y_pred[:50])
        print("Target Values:\n",y_test[:50])
        print("Shape of Data:\n",clf.cluster_centers_.shape)

        if show:
            # Cluster index of each training sample.  The model is already
            # fitted on x_train, so its labels_ are reused instead of
            # refitting: a refit is redundant with a fixed seed, and with
            # random_state=None it would make the plots and the printed
            # inertia describe a different model than y_pred.
            clusters = clf.labels_

            # The 2-D embeddings are only needed for plotting, so they are
            # computed only when plots were requested.
            # Create an isomap and fit the training data to it
            x_iso = Isomap(n_neighbors=10).fit_transform(x_train)

            # Create a plot with subplots in a grid of 1X2
            fig = plt.figure(1, (8, 4))
            gs = gridspec.GridSpec(1, 2)
            ax = [fig.add_subplot(ss) for ss in gs]

            # Adjust layout
            fig.suptitle('Predicted Versus Training Labels(ISOMAP)', fontsize=14, fontweight='bold')

            # Add scatterplots to the subplots
            ax[0].scatter(x_iso[:, 0], x_iso[:, 1], c=clusters, edgecolors='black')
            ax[0].set_title('Predicted Training Labels')
            ax[1].scatter(x_iso[:, 0], x_iso[:, 1], c=y_train, edgecolors='black')
            ax[1].set_title('Actual Training Labels')

            gs.tight_layout(fig, rect=[0, 0.03, 1, 0.95])

            # Show the plots
            plt.show()

            # Project the training data onto its first two principal components
            x_pca = PCA(n_components=2).fit_transform(x_train)

            # Create a plot with subplots in a grid of 1X2
            fig = plt.figure(1, (8, 4))
            gs = gridspec.GridSpec(1, 2)
            ax = [fig.add_subplot(ss) for ss in gs]

            # Adjust layout
            fig.suptitle('Predicted Versus Training Labels (PCA)', fontsize=14, fontweight='bold')
            fig.subplots_adjust(top=0.85)

            # Add scatterplots to the subplots
            ax[0].scatter(x_pca[:, 0], x_pca[:, 1], c=clusters, edgecolors='black')
            ax[0].set_title('Predicted Training Labels')
            ax[1].scatter(x_pca[:, 0], x_pca[:, 1], c=y_train, edgecolors='black')
            ax[1].set_title('Actual Training Labels')

            gs.tight_layout(fig, rect=[0, 0.03, 1, 0.95])

            # Show the plots (previously this call sat outside the `if show`
            # block and ran even when no plots were requested)
            plt.show()

        # Print out the confusion matrix to see how the model is incorrect
        print("Classification Report:\n",metrics.classification_report(y_test, y_pred))
        print("Confusion Matrix:\n",metrics.confusion_matrix(y_test, y_pred))

        # Summary table of clustering scores on the test split.  The column
        # header for homogeneity had been mangled to 'h**o'; restored.
        print('% 9s' % 'inertia    homo   compl  v-meas     ARI AMI  silhouette')
        print('%i   %.3f   %.3f   %.3f   %.3f   %.3f    %.3f'
              % (clf.inertia_,
                 homogeneity_score(y_test, y_pred),
                 completeness_score(y_test, y_pred),
                 v_measure_score(y_test, y_pred),
                 adjusted_rand_score(y_test, y_pred),
                 adjusted_mutual_info_score(y_test, y_pred),
                 silhouette_score(x_test, y_pred, metric='euclidean')))
Beispiel #41
0
# NOTE(review): no `action`/`type` is given for --show_fig, so if `op` is an
# optparse.OptionParser any value supplied on the command line is stored as
# a string (and is therefore truthy, even "False") -- confirm intent.
op.add_option("--show_fig", default=False, help="Show visual quality assessment.")
(opts, args) = op.parse_args(sys.argv[1:])

# Load the original samples and their adversarially perturbed counterparts.
X = np.load("comparison/MNIST/wip_MNIST_X_org_41.npy")
Xadv_s = np.load("comparison/MNIST/wip_MNIST_X_adv_41.npy")

# Convert to tensors and insert a singleton dimension at position 2.
X = torch.from_numpy(X).unsqueeze(2)
Xadv_s = torch.from_numpy(Xadv_s).unsqueeze(2)

# Element-wise perturbation between adversarial and original samples.
eps_s = Xadv_s - X

# Cluster both data sets with the same wrapped two-cluster model.
h = Hierarchical(n_clusters=2)
model = ClusteringWrapper3Dto2D(h)
yhat = model.fit_predict(X)
yadv_s = model.fit_predict(Xadv_s)
# Agreement between the two labelings, then the number of samples whose
# label differs.
print(adjusted_mutual_info_score(yhat, yadv_s))
print((yhat != yadv_s).sum())

set_seed(4)
T = ConstrainedAdvPoisoningGlobal(
    delta=(Xadv_s - X).norm(float("inf")),
    s=1,
    clst_model=model,
    lb=1.0,
    G=150,
    mutation_rate=0.01,
    crossover_rate=0.85,
    zero_rate=0.10,
    domain_cons=[0, 255],
    objective="AMI",
    mode="guided",
Beispiel #42
0
    # KMeans
    km = KMeans(n_clusters=100, n_init=1)
    itime = time.perf_counter()
    kmlabels = km.fit_predict(citypos)
    etime = time.perf_counter()
    print('K-means Time = ', etime - itime)

    # Minibatch Kmeans
    itime = time.perf_counter()
    mbkm = MiniBatchKMeans(n_clusters=100,
                           batch_size=1000,
                           n_init=1,
                           max_iter=5000)
    mbkmlabels = mbkm.fit_predict(citypos)
    etime = time.perf_counter()
    print('MB K-means Time = ', etime - itime)

    print('Similarity Km vs MBKm',
          adjusted_mutual_info_score(kmlabels, mbkmlabels))

    # Birch
    itime = time.perf_counter()
    birch = Birch(threshold=0.02, n_clusters=100, branching_factor=100)
    birchlabels = birch.fit_predict(citypos)
    etime = time.perf_counter()
    print('BIRCH Time = ', etime - itime)

    print('Similarity Km vs BIRCH',
          adjusted_mutual_info_score(kmlabels, birchlabels))
# Report for a fitted k-means model `km`; `kvalue`, `order_centroids`,
# `terms`, `cm`, `labels`, `class_names` and `cluster_names` are defined
# outside this excerpt.
print("Top 10 terms per cluster:")
for i in range(kvalue):
    print("Cluster %d:" % i, end='')
    # The first ten entries of row i of order_centroids index into `terms`.
    for j in order_centroids[i, :10]:
        print(' %s' % terms[j], end='')
    print()
print("Confusion matrix:")
print(cm)
print("Homogeneity score: %0.3f" %
      metrics.homogeneity_score(labels, km.labels_))
print("Completeness score: %0.3f" %
      metrics.completeness_score(labels, km.labels_))
print("Adjusted rand score: %.3f" %
      metrics.adjusted_rand_score(labels, km.labels_))
print("Adjusted mutual info score: %0.3f" %
      metrics.adjusted_mutual_info_score(labels, km.labels_))
print(
    "------------------------------------------------------------------------")
print()

# Plot information
plt.figure()
plot_confusion_matrix(cm,
                      classes=class_names,
                      clusters=cluster_names,
                      title='Confusion matrix after LSA without normalization')

# Print information
print("Clustering sparse data with k-means with k = 2...")
print()
Beispiel #44
0
# #############################################################################
# Compute Affinity Propagation
# (`X` and `labels_true` are defined outside this excerpt.)
af = AffinityPropagation(preference=-10).fit(X)
cluster_centers_indices = af.cluster_centers_indices_
labels = af.labels_

# The cluster count is taken as the number of centre indices found.
n_clusters_ = len(cluster_centers_indices)

print('Estimated number of clusters: %d' % n_clusters_)
# Scores that compare the clustering with the known labels.
print("Homogeneity: %0.3f" % metrics.homogeneity_score(labels_true, labels))
print("Completeness: %0.3f" % metrics.completeness_score(labels_true, labels))
print("V-measure: %0.3f" % metrics.v_measure_score(labels_true, labels))
print("Adjusted Rand Index: %0.3f"
              % metrics.adjusted_rand_score(labels_true, labels))
print("Adjusted Mutual Information: %0.3f"
              % metrics.adjusted_mutual_info_score(labels_true, labels))
# The silhouette is computed with squared Euclidean distances.
print("Silhouette Coefficient: %0.3f"
              % metrics.silhouette_score(X, labels, metric='sqeuclidean'))

# #############################################################################
# Plot result
import matplotlib.pyplot as plt
from itertools import cycle

# Start from a clean figure 1.
plt.close('all')
plt.figure(1)
plt.clf()

# Cycle through the colour codes so any number of clusters gets a colour.
colors = cycle('bgrcmykbgrcmykbgrcmykbgrcmyk')
for k, col in zip(range(n_clusters_), colors):
    # Boolean mask of the samples assigned to cluster k.
    class_members = labels == k
Beispiel #45
0
        plt.title('Dominant set + SVM Clustering')

    return labels


# Script entry point: build a synthetic classification data set, cluster it
# with ds_svm_clustering and report the adjusted mutual information against
# the generated class labels.
# NOTE(review): the print statement below is Python 2 syntax.
if __name__ == '__main__':
    # Fixed seed so the generated data set is reproducible.
    np.random.seed(6)

    nclust = 3
    N = 1000  # number of samples
    d = 2  # dimension of samples (number of features)
    # Equal class proportions: a vector of ones normalised to sum to 1.
    weights = np.ones(nclust)
    weights /= sum(weights)

    X, y = make_classification(weights=weights.tolist(),
                               n_classes=nclust,
                               n_samples=N,
                               n_features=d,
                               n_redundant=0,
                               class_sep=1,
                               n_clusters_per_class=1,
                               n_informative=d)

    dist_metric = 'mahalanobis'  #cosine, euclidean, l1, l2, manhattan, mahalanobis
    labels = ds_svm_clustering(X,
                               n_clust=nclust,
                               plot=True,
                               metric=dist_metric)
    print 'Adjusted Mutual Information Score: ', adjusted_mutual_info_score(
        y, labels)
    # Display whatever figures were created during clustering (plot=True).
    plt.show()
Beispiel #46
0
from sklearn.cluster import KMeans
from sklearn.datasets import load_iris
from sklearn.metrics import adjusted_mutual_info_score

# Load the iris data set: X holds the feature matrix, y the reference targets.
data = load_iris()
X = data.data
y = data.target

# Fit k-means with three clusters (fit() returns the estimator itself).
cl = KMeans(3).fit(X)

# Dump the fitted centres, the inertia and the assigned labels, followed by
# the reference targets for a visual comparison.
for shown in (cl.cluster_centers_, cl.inertia_, cl.labels_, y):
    print(shown)

# Agreement between the reference targets and the cluster assignment.
print(adjusted_mutual_info_score(y, cl.labels_))
# Header for the benchmark table: data set size, a ruler, then column names.
print("\t n_samples %d, \t n_features %d" % (n_samples, n_features))

print(82 * '_')
print('init\t\ttime\tinertia\thomo\tcompl\tv-meas\tARI\tAMI\tsilhouette')

# Time a k-means fit with random initialisation and print one table row
# holding the elapsed time, the inertia and the clustering metrics.
t0 = time.time()
kmeans = KMeans(init='random', n_clusters=10, n_init=10)
kmeans.fit(data)
print('%-9s\t%.2fs\t%i\t%.3f\t%.3f\t%.3f\t%.3f\t%.3f\t%.3f' %
      ('Random', (time.time() - t0), kmeans.inertia_,
       metrics.homogeneity_score(labels, kmeans.labels_),
       metrics.completeness_score(labels, kmeans.labels_),
       metrics.v_measure_score(labels, kmeans.labels_),
       metrics.adjusted_rand_score(labels, kmeans.labels_),
       metrics.adjusted_mutual_info_score(
           labels, kmeans.labels_, average_method='arithmetic'),
       metrics.silhouette_score(
           data, kmeans.labels_, metric='euclidean', sample_size=sample_size)))

print(82 * '_')

# Visualize the results on PCA-reduced data - random_raw

# Project the data onto two principal components and refit k-means on the
# projection; this rebinds `kmeans` to the model fitted on reduced_data.
reduced_data = PCA(n_components=2).fit_transform(data)
kmeans = KMeans(init='random', n_clusters=10, n_init=10)
kmeans.fit(reduced_data)

# Step size of the mesh. Decrease to increase the quality of the VQ.
h = .02  # point in the mesh [x_min, x_max]x[y_min, y_max].

# Plot the decision boundary. For that, we will assign a color to each
 def calculate_AMI(self, query_labels, cluster_labels, **kwargs):
     """Return the adjusted mutual information of two labelings.

     ``query_labels`` is first converted with ``c_f.to_numpy`` and then
     scored against ``cluster_labels``.  Extra keyword arguments are
     accepted but not used.
     """
     query_array = c_f.to_numpy(query_labels)
     return adjusted_mutual_info_score(query_array, cluster_labels)
# Fixed seed so repeated runs give the same clustering.
np.random.seed(1)

# Get your mentioned graph
G = nx.karate_club_graph()

# Get ground-truth: club-labels -> transform to 0/1 np-array
#     (possible overcomplicated networkx usage here)
gt_dict = nx.get_node_attributes(G, 'club')
gt = [gt_dict[i] for i in G.nodes()]
gt = np.array([0 if i == 'Mr. Hi' else 1 for i in gt])

# Get adjacency-matrix as numpy-array.
# nx.to_numpy_matrix was removed in NetworkX 3.0; to_numpy_array returns the
# same adjacency values as a plain ndarray.
adj_mat = nx.to_numpy_array(G)

print('ground truth')
print(gt)

# Cluster: the adjacency matrix itself is used as the precomputed affinity.
sc = SpectralClustering(2, affinity='precomputed', n_init=100)
sc.fit(adj_mat)

# Compare ground-truth and clustering-results
print('spectral clustering')
print(sc.labels_)
print('just for better-visualization: invert clusters (permutation)')
print(np.abs(sc.labels_ - 1))

# Calculate some clustering metrics
print(metrics.adjusted_rand_score(gt, sc.labels_))
print(metrics.adjusted_mutual_info_score(gt, sc.labels_))
Beispiel #50
0
def experiments(PORCENTAJE_VECINOS, ALGORITHM, MODELO, normalizar=None):
    """Run one clustering experiment and write its report to disk.

    The input matrix is built from the module-level cosine matrix ``coseno``,
    optionally combined with a pairwise "neighbours" matrix computed by
    ``algorithms[ALGORITHM]`` (cached as ``<sys.argv[2]>/<ALGORITHM>.npy``).
    The report goes to ``<sys.argv[2]>/<title>.out``; if that file already
    exists the experiment is skipped.

    Args:
        PORCENTAJE_VECINOS: ``0`` to use ``coseno`` alone, a numeric weight
            for a linear blend of ``coseno`` and the neighbours matrix, or
            one of ``"boost"``, ``"maxsim"``, ``"dist"`` to pick a named
            combination.
        ALGORITHM: key into the module-level ``algorithms`` mapping.
        MODELO: ``'kmedoids'``, ``'kmedoids470'``, ``'ap'`` or ``'dbscan'``.
        normalizar: optional normalisation applied to the neighbours matrix:
            ``'minmax'``, ``'scale'``, ``'robust'``, ``'softmax'``,
            ``'matrixminmax'`` or ``'matrixmax'``.

    Raises:
        ValueError: if ``MODELO`` is not one of the supported model names.
    """
    vecinos = algorithms[ALGORITHM]

    # Build the experiment title, which also names the output file.
    algoritmos = "coseno"
    if PORCENTAJE_VECINOS in ["boost", "maxsim", "dist"]:
        algoritmos = ALGORITHM + "-" + PORCENTAJE_VECINOS
    elif PORCENTAJE_VECINOS != 0:
        algoritmos = "%s-%.1f" % (ALGORITHM, PORCENTAJE_VECINOS)

    titulo = MODELO + "-" + algoritmos
    if normalizar is not None:
        titulo += "-" + normalizar

    fname = sys.argv[2] + "/" + titulo + ".out"

    # An existing report means this configuration was already run.
    if os.path.isfile(fname):
        return

    print(titulo)
    print("-" * 20)

    if PORCENTAJE_VECINOS == 0:
        X = coseno
        if MODELO == "dbscan":
            # Only valid for cosine: turn the similarity into a distance.
            X = 1 - X
    else:
        # Load the cached neighbours matrix or compute and cache it.
        neighbour_file_name = sys.argv[2] + "/" + ALGORITHM + ".npy"
        if os.path.isfile(neighbour_file_name):
            NEIGHBOURS = np.load(neighbour_file_name)
        else:
            print("Calculando vecinos")
            NEIGHBOURS = np.zeros((len(service_number), len(service_number)))
            # Fill the upper triangle and mirror it into the lower one.
            for i in range(0, len(service_number)):
                for j in range(i, len(service_number)):
                    NEIGHBOURS[i][j] = vecinos(followers, users, i, j)
                    if i != j:
                        NEIGHBOURS[j][i] = NEIGHBOURS[i][j]
            np.save(neighbour_file_name, NEIGHBOURS)

        if normalizar is not None:
            print("Normalizando Vecinos")
            if normalizar == 'minmax':
                NEIGHBOURS = preprocessing.minmax_scale(NEIGHBOURS)
            elif normalizar == 'scale':
                NEIGHBOURS = preprocessing.scale(NEIGHBOURS)
            elif normalizar == 'robust':
                NEIGHBOURS = preprocessing.robust_scale(NEIGHBOURS)
            elif normalizar == 'softmax':
                NEIGHBOURS = np.exp(NEIGHBOURS) / np.sum(np.exp(NEIGHBOURS), axis=1, keepdims=True)
            elif normalizar == 'matrixminmax':
                NEIGHBOURS = (NEIGHBOURS - np.min(NEIGHBOURS)) / (np.max(NEIGHBOURS) - np.min(NEIGHBOURS))
            elif normalizar == 'matrixmax':
                NEIGHBOURS = NEIGHBOURS / np.max(NEIGHBOURS)
        if MODELO == "dbscan":  # The model expects a distance
            if normalizar is not None:
                NEIGHBOURS = 1 - NEIGHBOURS
            else:
                NEIGHBOURS = - NEIGHBOURS
            # NOTE(review): this arithmetic needs a numeric PORCENTAJE_VECINOS;
            # the string modes ("boost", "maxsim", "dist") would raise a
            # TypeError here when combined with MODELO == "dbscan".
            X = (1 - PORCENTAJE_VECINOS) * (1 - coseno) + PORCENTAJE_VECINOS * NEIGHBOURS
        else:  # The model expects an affinity
            if PORCENTAJE_VECINOS == "boost":
                X = np.multiply(coseno, NEIGHBOURS)
            elif PORCENTAJE_VECINOS == "maxsim":
                X = np.maximum(coseno, NEIGHBOURS)
            elif PORCENTAJE_VECINOS == "dist":
                # argsort of argsort yields the rank of each entry in its row.
                NEIGHBOURS_SORTED = np.argsort(np.argsort(NEIGHBOURS))
                COSINE_SORTED = np.argsort(np.argsort(coseno))
                POS_BOOST = np.log(1 / (1 + np.abs(NEIGHBOURS_SORTED - COSINE_SORTED)))
                X = POS_BOOST
            else:
                X = (1 - PORCENTAJE_VECINOS) * coseno + PORCENTAJE_VECINOS * NEIGHBOURS

    print("Generando Modelo")

    # A single elif chain with an explicit failure: an unknown model name
    # previously fell through and crashed later with a NameError on `model`.
    if MODELO == 'kmedoids':
        model = KMedoids(n_clusters=1500).fit(X)
    elif MODELO == 'kmedoids470':
        model = KMedoids(n_clusters=470).fit(X)
    elif MODELO == 'ap':
        model = AffinityPropagation(affinity='precomputed').fit(X)
    elif MODELO == 'dbscan':
        model = DBSCAN(metric='precomputed').fit(X)
    else:
        raise ValueError("Unknown MODELO: %r" % (MODELO,))

    labels = model.labels_

    # Group sample indices by their assigned cluster label.
    clusters = defaultdict(list)
    for index, classif in enumerate(labels):
        clusters[classif].append(index)

    n_clusters_ = len(clusters)

    info = ""
    info += 'Clusters: %d\n' % n_clusters_
    # info += 'Cohesiveness: %0.3f\n' % cohesiveness(X, labels)
    info += 'Entropy: %0.3f\n' % entropy(labels_true, labels)
    info += "Homogeneity: %0.3f\n" % metrics.homogeneity_score(labels_true, labels)
    info += "Completeness: %0.3f\n" % metrics.completeness_score(labels_true, labels)
    info += "V-measure: %0.3f\n" % metrics.v_measure_score(labels_true, labels)
    info += 'Purity: %0.3f\n' % purity(labels_true, labels)
    info += "F-Measure: %0.3f\n" % fmeasure(labels_true, labels)
    info += "Adjusted Rand Index: %0.3f\n" % metrics.adjusted_rand_score(labels_true, labels)
    info += "Adjusted Mutual Information: %0.3f\n" % metrics.adjusted_mutual_info_score(labels_true, labels)

    clustersize = Counter(labels)

    # The context manager closes the report even if a write fails.
    with open(fname, 'w', encoding='UTF-8') as salida:
        print(info)

        salida.write(titulo + "\n")
        for cluster, services in clusters.items():
            # Most frequent true category among the members of this cluster.
            countcat = Counter([labels_true[svc] for svc in services])
            max_key, num = countcat.most_common(1)[0]
            salida.write("%i (%s - %i/%i): %s \n" % (
                cluster, max_key, num, clustersize[cluster], ",".join([service_list[svc] for svc in services])))
        salida.write("-" * 20 + "\n")
        salida.write(info)
Beispiel #51
0
def Clu_Eval_Givenlabels(labels_true, labels_pred):
    """Score a predicted labeling against the reference labeling.

    Both arguments should be lists (per the original author's note).

    Returns:
        A ``(ARI, AMI)`` tuple: the adjusted Rand score followed by the
        adjusted mutual information score.
    """
    scorers = (metrics.adjusted_rand_score,
               metrics.adjusted_mutual_info_score)
    ari, ami = [scorer(labels_true, labels_pred) for scorer in scorers]
    return ari, ami
# In[447]:

# Check out just the target cells: keep only the "blast" and "healthy" rows.
# Example of the boolean-mask pattern used below:
#d[(d['x']>2) & (d['y']>7)]
temp = subset[(subset['cell_type']=="blast") | (subset['cell_type']=="healthy")]
kmeans_temp, scores_temp = k_means_optimized(temp[colsOfInterest].as_matrix(),scale=True)    

# Store the cluster assignment next to the original rows.
temp['kmeans_temp'] = kmeans_temp.labels_
print kmeans_temp
print scores_temp
# NOTE(review): the dict keys are used as bar heights and the values as tick
# labels -- confirm against what k_means_optimized returns; this may be swapped.
plt.bar(range(len(scores_temp)), scores_temp.keys(), align='center')
plt.xticks(range(len(scores_temp)), scores_temp.values())

plt.show()
# NOTE(review): the printed label says "NMI" but the value is computed with
# adjusted_mutual_info_score (AMI).
print 'KMEANS DAD NMI:', adjusted_mutual_info_score(temp['cell_type'], kmeans_temp.labels_)
temp.groupby(['cell_type',"kmeans_temp"]).count()


# In[448]:

# Check out just the target cells with k forced to 2.
kmeans_temp = KMeans(2)
kmeans_temp.fit(temp[colsOfInterestFlow].as_matrix())
temp['kmeans_temp2'] = kmeans_temp.labels_
print kmeans_temp
# NOTE(review): scores_temp is not recomputed in this cell, so the output and
# the bar chart below still show the scores from the previous cell.
print scores_temp
plt.bar(range(len(scores_temp)), scores_temp.keys(), align='center')
plt.xticks(range(len(scores_temp)), scores_temp.values())

plt.show()
     continue
 feat_name = 'gsdmm'
 best_acc = 0.0
 best_pred = None
 all_pred = []
 all_acc = []
 all_nmi = []
 all_ari = []
 for i in range(trial_num):
     print(corpora_id, n_topics, i)
     # pred = gsdmm_cluster_alg(corpora_name, n_topics, alpha, beta, iter_nums)
     pred = gsdmm_cluster_alg(train_path, n_topics, alpha, beta,
                              iter_nums)
     acc = cluster_acc(labels, pred)
     nmi = normalized_mutual_info_score(labels, pred)
     ari = adjusted_mutual_info_score(labels, pred)
     all_pred.append(pred.tolist())
     all_acc.append(acc)
     all_nmi.append(nmi)
     all_ari.append(ari)
     if acc > best_acc:
         best_pred = pred
         best_acc = acc
 print('{} best acc is {}'.format(feat_name, best_acc))
 dump_mongo(corpora=corpora_name,
            feat_name=feat_name,
            n_topics=n_topics,
            pred=best_pred.tolist(),
            acc=best_acc,
            all_pred=all_pred,
            all_acc=all_acc,
Beispiel #54
0
def compute_results(G_data,
                    B_data,
                    name,
                    encoder,
                    r_state=0,
                    only_kmeans=False):
    """Cluster with K-Means and return the AMI against ground-truth groups.

    K-Means is fitted on ``encoder.detach().numpy()`` unless ``only_kmeans``
    is true, in which case it is fitted on ``B_data`` directly.  The number
    of clusters comes from ``get_clusters(G_data, name)``.

    Ground truth is a hard-coded table for ``name == 'karate'`` and
    ``name == 'email'``; for any other name it is read from the ``'value'``
    node attribute of ``G_data``.

    Args:
        G_data: graph passed to ``get_clusters`` and, for names without a
            hard-coded table, to ``nx.get_node_attributes``.
        B_data: data K-Means is fitted on when ``only_kmeans`` is true.
        name: data set name selecting the ground-truth source.
        encoder: object exposing ``.detach().numpy()``
            (presumably a torch tensor of embeddings -- TODO confirm).
        r_state: ``random_state`` forwarded to ``KMeans``.
        only_kmeans: fit on ``B_data`` instead of the encoder output.

    Returns:
        The adjusted mutual information score (arithmetic averaging) between
        the ground-truth groups and the K-Means labels.
    """
    # Evaluated unconditionally, even when only_kmeans is true.
    B_data_X = encoder.detach().numpy()

    kmeans = KMeans(n_clusters=get_clusters(G_data, name),
                    init='k-means++',
                    random_state=r_state)

    if not only_kmeans:
        kmeans.fit(B_data_X)
    else:
        kmeans.fit(B_data)

    X_ae = kmeans.labels_  # Calculated labels

    # Finding truth values
    if name == 'karate':
        # Hard-coded group id per node, in node order.
        c_groups = [
            0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 1, 0,
            1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1
        ]
    elif name == 'email':
        # Hard-coded group id per node, in node order.
        c_groups = [
            1, 1, 21, 21, 21, 25, 25, 14, 14, 14, 9, 14, 14, 26, 4, 17, 34, 1,
            1, 14, 9, 9, 9, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,
            11, 11, 11, 11, 11, 5, 34, 14, 14, 17, 17, 10, 10, 36, 37, 5, 7, 4,
            22, 22, 21, 21, 21, 21, 7, 7, 36, 21, 25, 4, 8, 15, 15, 15, 37, 37,
            9, 1, 1, 10, 10, 3, 3, 3, 29, 15, 36, 36, 37, 1, 36, 34, 20, 20, 8,
            15, 9, 4, 5, 4, 20, 16, 16, 16, 16, 16, 38, 7, 7, 34, 38, 36, 8,
            27, 8, 8, 8, 10, 10, 13, 13, 6, 26, 10, 1, 36, 0, 13, 16, 16, 22,
            6, 5, 4, 0, 28, 28, 4, 2, 13, 13, 21, 21, 17, 17, 14, 36, 8, 40,
            35, 15, 23, 0, 0, 7, 10, 37, 27, 35, 35, 0, 0, 19, 19, 36, 14, 37,
            24, 17, 13, 36, 4, 4, 13, 13, 10, 4, 38, 32, 32, 4, 1, 0, 0, 0, 7,
            7, 4, 15, 16, 40, 15, 15, 15, 15, 0, 21, 21, 21, 21, 5, 4, 4, 4, 4,
            4, 4, 4, 5, 5, 4, 4, 22, 19, 19, 22, 34, 14, 0, 1, 17, 37, 1, 1, 1,
            1, 1, 1, 1, 1, 1, 1, 1, 10, 23, 0, 4, 19, 19, 19, 19, 19, 19, 19,
            19, 19, 19, 19, 19, 10, 14, 14, 1, 14, 7, 13, 20, 31, 40, 6, 4, 0,
            8, 9, 9, 10, 0, 10, 14, 14, 14, 14, 39, 17, 4, 28, 17, 17, 17, 4,
            4, 0, 0, 23, 4, 21, 36, 36, 0, 22, 21, 15, 37, 0, 4, 4, 4, 14, 4,
            7, 7, 1, 15, 15, 38, 26, 20, 20, 20, 21, 9, 1, 1, 1, 1, 1, 1, 1, 1,
            1, 1, 1, 10, 19, 7, 7, 17, 16, 14, 9, 9, 9, 8, 8, 13, 39, 14, 10,
            17, 17, 13, 13, 13, 13, 2, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 16,
            16, 27, 8, 8, 14, 14, 14, 10, 14, 35, 37, 14, 36, 10, 7, 20, 10,
            16, 36, 36, 14, 8, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
            7, 7, 7, 7, 7, 7, 7, 4, 9, 4, 0, 4, 16, 38, 14, 14, 21, 26, 27, 28,
            21, 4, 1, 1, 9, 10, 15, 4, 26, 14, 35, 10, 34, 4, 4, 12, 17, 17,
            14, 37, 37, 37, 34, 6, 13, 13, 13, 13, 4, 14, 10, 10, 10, 3, 17,
            17, 17, 1, 4, 14, 14, 6, 27, 22, 21, 4, 4, 1, 34, 17, 30, 30, 4,
            23, 14, 15, 1, 22, 12, 31, 6, 15, 15, 8, 15, 8, 8, 1, 15, 22, 2, 3,
            4, 10, 4, 14, 14, 25, 6, 6, 40, 4, 36, 23, 14, 3, 14, 14, 14, 14,
            14, 14, 14, 14, 14, 31, 15, 15, 14, 0, 23, 35, 8, 4, 1, 1, 35, 23,
            21, 2, 4, 4, 9, 14, 4, 10, 25, 14, 14, 3, 21, 35, 4, 9, 15, 6, 9,
            3, 15, 23, 4, 4, 4, 11, 35, 10, 6, 15, 15, 15, 22, 2, 2, 14, 4, 3,
            14, 27, 31, 34, 4, 4, 19, 14, 14, 4, 4, 14, 14, 21, 4, 14, 4, 0, 4,
            27, 27, 17, 3, 15, 2, 4, 4, 21, 21, 11, 23, 11, 23, 17, 5, 36, 15,
            23, 23, 2, 19, 4, 36, 14, 1, 22, 1, 21, 34, 14, 13, 6, 4, 37, 6,
            24, 35, 6, 17, 16, 6, 4, 0, 21, 4, 26, 21, 4, 15, 7, 1, 20, 19, 7,
            21, 21, 21, 19, 38, 19, 16, 23, 6, 37, 25, 1, 22, 6, 14, 1, 26, 8,
            37, 4, 0, 17, 6, 17, 14, 16, 4, 32, 14, 15, 0, 23, 21, 29, 14, 14,
            1, 17, 26, 15, 0, 0, 0, 22, 34, 21, 6, 16, 4, 15, 21, 0, 36, 4, 1,
            1, 22, 14, 14, 30, 4, 9, 10, 4, 4, 14, 16, 16, 15, 21, 0, 4, 15,
            29, 24, 21, 14, 11, 11, 9, 13, 10, 31, 4, 22, 14, 23, 1, 4, 9, 17,
            27, 28, 22, 14, 20, 7, 23, 1, 6, 15, 15, 23, 4, 20, 5, 36, 10, 21,
            39, 41, 31, 17, 7, 21, 34, 1, 14, 2, 18, 16, 27, 16, 38, 7, 38, 21,
            1, 9, 15, 15, 15, 0, 6, 23, 28, 11, 23, 34, 24, 4, 4, 4, 24, 23,
            17, 10, 17, 1, 1, 15, 15, 4, 21, 14, 14, 20, 28, 20, 22, 26, 3, 32,
            4, 0, 21, 13, 4, 15, 17, 5, 4, 14, 0, 9, 21, 14, 38, 4, 14, 31, 21,
            14, 6, 4, 4, 6, 17, 0, 4, 7, 16, 4, 4, 21, 1, 10, 3, 21, 4, 0, 1,
            7, 17, 15, 14, 0, 9, 32, 13, 5, 2, 21, 28, 21, 22, 22, 7, 7, 33, 0,
            1, 15, 4, 31, 30, 15, 11, 19, 21, 9, 21, 13, 21, 9, 32, 9, 32, 38,
            9, 38, 38, 14, 9, 10, 38, 10, 22, 21, 13, 21, 4, 0, 1, 1, 23, 0, 5,
            4, 4, 15, 14, 14, 13, 11, 1, 5, 5, 10, 23, 21, 14, 9, 20, 10, 19,
            19, 21, 17, 19, 19, 36, 17, 35, 16, 4, 16, 4, 6, 4, 41, 6, 7, 23,
            9, 23, 7, 6, 22, 36, 14, 15, 11, 35, 5, 14, 14, 15, 4, 6, 4, 9, 19,
            11, 4, 29, 14, 15, 15, 5, 32, 15, 14, 5, 9, 10, 19, 13, 23, 12, 10,
            21, 10, 35, 7, 22, 22, 22, 8, 21, 32, 4, 21, 21, 6, 14, 11, 14, 15,
            4, 21, 1, 6, 22
        ]
    else:
        # No hard-coded table: collect the 'value' attribute of every node,
        # in the iteration order of the attribute mapping.
        c_attributes = nx.get_node_attributes(G_data, 'value')
        c_groups = []
        for i, val in enumerate(c_attributes.values()):
            c_groups.append(val)

    X_gt = np.array(c_groups)
    # print(X_ae)
    # print(X_gt)

    return metrics.adjusted_mutual_info_score(X_gt,
                                              X_ae,
                                              average_method='arithmetic')
Beispiel #55
0
def unSupervised(x_data, y_data, x, n):
    """Sweep 2..6 clusters with K-Means and a Gaussian mixture and plot the results.

    For each cluster count the K-Means labels are scored (silhouette plus
    five external metrics against ``y_data``) and the Gaussian mixture's BIC
    and AIC are recorded.  Both labelings are passed to ``clusterVisuals``.
    Two line charts are then drawn and saved as PNG files in the current
    working directory.

    Args:
        x_data: feature matrix to cluster.
        y_data: reference labels used by the external metrics.
        x: index into the data set names (0 -> 'Heart', 1 -> 'Credit Card').
        n: string prefix used in plot titles and output file names.

    Returns:
        None.
    """
    fNames = ['Heart', 'Credit Card']

    clusterValues = []
    silScores = []
    noComponents = []
    bic = []
    aic = []

    arScore = []
    amiScore = []
    homogeneityScore = []
    completenessScore = []
    fmScore = []

    for a in range(2, 7):
        ## K-means
        kmeans = KMeans(n_clusters=a)
        kmeans.fit(x_data)
        # fit() already stores the training assignment in labels_, so the
        # former extra predict() call (whose result was discarded) is gone.
        labels = kmeans.labels_
        silScore = silhouette_score(x_data, labels)

        clusterVisuals(x_data, labels, a, fNames[x] + ': ' + n + ': K-Means')

        clusterValues.append(a)
        silScores.append(silScore)
        arScore.append(adjusted_rand_score(y_data, labels))
        amiScore.append(adjusted_mutual_info_score(y_data, labels))
        homogeneityScore.append(homogeneity_score(y_data, labels))
        completenessScore.append(completeness_score(y_data, labels))
        fmScore.append(fowlkes_mallows_score(y_data, labels))

        ### Expectation maximization (Gaussian mixture)
        em = GaussianMixture(n_components=a)
        em.fit(x_data)
        labels = em.predict(x_data)

        noComponents.append(a)
        bic.append(em.bic(x_data))
        aic.append(em.aic(x_data))
        clusterVisuals(x_data, labels, a, fNames[x] + ': ' + n + ': EM')

    # Chart 1: every K-Means score as a function of the cluster count.
    plt.plot(clusterValues, silScores, label='Silhouette')
    plt.plot(clusterValues, arScore, label='Adjusted Rand Index')
    plt.plot(clusterValues, amiScore, label='Ajusted Mutual Index')
    plt.plot(clusterValues, homogeneityScore, label='Homogeneity')
    plt.plot(clusterValues, completenessScore, label='Completeness')
    plt.plot(clusterValues, fmScore, label='Fowlkes-Mallows')
    plt.xlabel('No. of Clusters')
    plt.ylabel('Scores')
    plt.legend()
    plt.title(n + ': K-Means: ' + fNames[x])
    plt.savefig(n + ' K-Means ' + fNames[x] + '.png')
    # Open a fresh figure so the next chart does not draw over this one.
    plt.figure()

    # Chart 2: BIC and AIC of the Gaussian mixture per component count.
    plt.title(n + ': ' + 'Expected Maximzation: ' + fNames[x])
    plt.plot(noComponents, bic, label="BIC")
    plt.plot(noComponents, aic, label="AIC")
    plt.xlabel("No. of Components")
    plt.ylabel("BIC & AIC")
    plt.legend()
    plt.savefig(n + ' ' + 'Expected Maximzation: ' + fNames[x] + '.png')
    plt.figure()
# Fit both clusterers on the transformed features.
kmeans = KMeans(n_clusters=2, random_state=0, n_init=20)
kmeans.fit(X_scaled_transformed)
# NOTE(review): the return value is ignored, so align_clusters_labels
# presumably relabels its argument in place -- confirm.
align_clusters_labels(kmeans.labels_)
gmm = GaussianMixture(n_components=2).fit_predict(X_scaled_transformed)
align_clusters_labels(gmm)

# One result dict per clusterer, plus the labels each of them produced.
metrics_report = {'kmeans': {}, 'gmm': {}}
labels = {'kmeans': kmeans.labels_, 'gmm': gmm}

for each in metrics_report:
    assigned = labels[each]
    report = metrics_report[each]
    report['ARI'] = round(metrics.adjusted_rand_score(y, assigned), 2)
    report['AMI'] = round(metrics.adjusted_mutual_info_score(y, assigned), 2)
    report['homogeneity'] = round(metrics.homogeneity_score(y, assigned), 2)
    report['completeness'] = round(metrics.completeness_score(y, assigned), 2)
    report['v_measure'] = round(metrics.v_measure_score(y, assigned), 2)
    # The silhouette is computed on X, not on X_scaled_transformed.
    report['silhouette'] = round(metrics.silhouette_score(X, assigned), 2)
    # Accuracy is reported as a percentage.
    report['accuracy'] = round(metrics.accuracy_score(y, assigned) * 100, 2)

print(metrics_report)

#visualizing - k-means clustering of ICA transformed dataset
plt.scatter(X_scaled_transformed[kmeans.labels_ == 1, 0],
Beispiel #57
0
    # ----------------------------------------------------------------------
    # stats
    # Number of clusters in labels, ignoring noise if present.
    n_clusters = len(set(labels)) - (1 if -1 in labels else 0)
    n_clusters_true = len(set(labels_true)) - (1 if -1 in labels else 0)

    printlog('\t Estimated number of clusters: {0}'.format(n_clusters))
    # print stats
    args = [labels_true, labels]
    pargs = [
        metrics.homogeneity_score(*args),
        metrics.completeness_score(*args),
        metrics.v_measure_score(*args),
        metrics.adjusted_rand_score(*args),
        metrics.adjusted_mutual_info_score(*args)
    ]
    printlog("\t Homogeneity: {0:.3f}\n\t Completeness: {1:.3f}"
             "\n\t V-measure: {2:.3f}\n\t Adjusted Rand Index: {3:.3f}"
             "\n\t Adjusted Mutual Information: {4:.3f}".format(*pargs))

    # ----------------------------------------------------------------------
    # comparing results
    printlog('Comparing results...')
    merged = compare_results(groups, labels_true, labels)

    # ----------------------------------------------------------------------
    # Plot result
    printlog('Plotting graphs...')

    if PLOT_3D_ALL:
Beispiel #58
0
def train_test_model(run_id, hparams, X_train, y_train, X_test, y_test):
    """Train one Gmvae configuration and score its cluster assignments.

    Builds the model from ``hparams``, trains it with ``train`` and assigns
    every sample to the arg-max column of ``m1.predict``.

    Args:
        run_id: passed to ``train`` as its ``tensorboard`` argument.
        hparams: mapping indexed by the module-level ``HP_*`` keys.
        X_train, y_train: training features and reference labels.
        X_test, y_test: test features and reference labels.

    Returns:
        ``(ami_tr, ami_te, purity_train, purity_test)``: adjusted mutual
        information (arithmetic averaging) and purity on the training and
        test sets.
    """
    # hp.hparams(hparams) # record the values used in this trial
    seed = hparams[HP_seed]
    tf.random.set_seed(seed)

    # NOTE(security): eval() executes arbitrary code.  It is kept so that any
    # expression accepted so far keeps working, but these strings must only
    # come from trusted configuration.  Each string is now parsed once
    # instead of once per use.
    encoder_dims = eval(hparams[HP_encoder_dims])
    mixture_dims = eval(hparams[HP_mixture_dims])

    params = {
        "components": hparams[HP_components],
        "input_dimension": X_train.shape[1],
        "embedding_dimensions": encoder_dims[0],
        "latent_dimensions": encoder_dims[1],
        "mixture_embedding_dimensions": mixture_dims[0],
        "mixture_latent_dimensions": mixture_dims[1],
        "embedding_activations": tf.nn.relu,
        "kind": "binary",
        "learning_rate": 1.0,
        "gradient_clip": None,
        "bn_before": True if hparams[HP_bn] == "before" else False,
        "bn_after": True if hparams[HP_bn] == "after" else False,
        "categorical_epsilon": 0.0,
        "reconstruction_epsilon": 0.0,
        "latent_epsilon": 0.0,
        "latent_prior_epsilon": 0.0,
        "z_kl_lambda": 1.0,
        "c_kl_lambda": 1.0,
        "cat_latent_bias_initializer": None,
        "connected_weights": hparams[HP_connected_weights],
        # "optimizer":tf.keras.optimizers.Adam(lr_schedule, epsilon=1e-16),
        "optimizer": tf.keras.optimizers.Adam(1e-3, epsilon=1e-16),
        "categorical_latent_embedding_dropout": 0.2,
        "mixture_latent_mu_embedding_dropout": 0.2,
        "mixture_latent_var_embedding_dropout": 0.2,
        "mixture_posterior_mu_dropout": 0.2,
        "mixture_posterior_var_dropout": 0.2,
        # NOTE(review): the spelling must match the keyword Gmvae accepts;
        # left unchanged -- confirm against the model definition.
        "recon_dropouut": 0.2,
        #'latent_fixed_var': 0.01,
    }

    # Constant KL weights: no annealing of either term.
    z_cooling = lambda: 1.0
    y_cooling = lambda: 1.0

    m1 = model.Gmvae(**params)

    train(
        m1,
        X_train,
        y_train,
        X_test,
        y_test,
        num=100,
        samples=hparams[HP_samples],
        epochs=110,
        iter_train=1,
        num_inference=1000,
        save="model_w_5",
        batch=True,
        temperature_function=lambda x: exponential_multiplicative_cooling(
            x, 1.0, 0.5, 0.99),
        # temperature_function = lambda x: 0.1
        save_results="./gumble_results.txt",
        beta_z_method=z_cooling,
        beta_y_method=y_cooling,
        tensorboard=run_id,
    )

    # Hard assignment: index of the largest value in each prediction row.
    idx_tr = m1.predict(X_train).numpy().argmax(1)
    idx_te = m1.predict(X_test).numpy().argmax(1)

    ami_tr = adjusted_mutual_info_score(y_train,
                                        idx_tr,
                                        average_method="arithmetic")
    ami_te = adjusted_mutual_info_score(y_test,
                                        idx_te,
                                        average_method="arithmetic")

    purity_train = purity_score(y_train, idx_tr)
    purity_test = purity_score(y_test, idx_te)

    return ami_tr, ami_te, purity_train, purity_test
raw_data = np.loadtxt('cluster.txt')  # load the data file
X = raw_data[:, :-1]  # every column but the last: the features to cluster
y_true = raw_data[:, -1]  # last column: the reference labels

# Train the clustering model
n_clusters = 3  # number of clusters
model_kmeans = KMeans(n_clusters=n_clusters, random_state=0)  # build the model object
model_kmeans.fit(X)  # fit the model
y_pre = model_kmeans.predict(X)  # cluster assignment of every sample

# scikit-learn renamed calinski_harabaz_score to calinski_harabasz_score and
# removed the old spelling in 0.23; prefer the new name and fall back to the
# old one on releases that predate the rename.
try:
    _calinski_harabasz_score = metrics.calinski_harabasz_score
except AttributeError:
    _calinski_harabasz_score = metrics.calinski_harabaz_score

# Evaluate the model
n_samples, n_features = X.shape  # number of samples, number of features
inertias = model_kmeans.inertia_  # inertia of the fitted model
adjusted_rand_s = metrics.adjusted_rand_score(y_true, y_pre)  # adjusted Rand index
mutual_info_s = metrics.mutual_info_score(y_true, y_pre)  # mutual information
adjusted_mutual_info_s = metrics.adjusted_mutual_info_score(y_true,
                                                            y_pre)  # adjusted mutual information
homogeneity_s = metrics.homogeneity_score(y_true, y_pre)  # homogeneity score
completeness_s = metrics.completeness_score(y_true, y_pre)  # completeness score
v_measure_s = metrics.v_measure_score(y_true, y_pre)  # V-measure score
silhouette_s = metrics.silhouette_score(X, y_pre, metric='euclidean')  # mean silhouette coefficient
calinski_harabaz_s = _calinski_harabasz_score(
    X, y_pre)  # Calinski-Harabasz score
print('samples: %d \t features: %d' % (n_samples, n_features))  # sample and feature counts
print(70 * '-')  # separator line
print('ine\tARI\tMI\tAMI\thomo\tcomp\tv_m\tsilh\tc&h')  # metric column headers
print('%d\t%.2f\t%.2f\t%.2f\t%.2f\t%.2f\t%.2f\t%.2f\t%d' %
      (inertias, adjusted_rand_s, mutual_info_s, adjusted_mutual_info_s,
       homogeneity_s, completeness_s, v_measure_s, silhouette_s,
       calinski_harabaz_s))  # metric values
print(70 * '-')  # separator line
print('short name \t full name')  # header of the abbreviation legend
Beispiel #60
0
def main(cvfold=0,
         alpha_T=1.0,
         alpha_E=1.0,
         lambda_TE=1.0,
         root_node='n88',
         start_i=0,
         stop_i=11000,
         embedding='zE',
         latent_dim=3,
         rand_seed=0,
         exp_name='LR_v2_bal'):
    """Fit a logistic-regression classifier per candidate classification and log its scores.

    For every classification ``all_classifications[root_node][i]`` with
    ``start_i <= i < min(stop_i, number of classifications)`` that has more
    than one class, a balanced multinomial logistic regression is trained on
    the chosen embedding and evaluated on the train/val/test splits.  One row
    per classification is appended to a CSV file in ``dir_pth['result']``.

    Args:
        cvfold: cross-validation fold; selects the fold file and is logged.
        alpha_T, alpha_E, lambda_TE: values used to build the fold and result
            file names (``alpha_M`` is set equal to ``alpha_E``).
        root_node: key into the classifications JSON.
        start_i, stop_i: half-open range of classification indices to run.
        embedding: key of the embedding inside each split of the inputs.
        latent_dim: latent dimensionality; part of the experiment and file names.
        rand_seed: ``random_state`` of the classifier.
        exp_name: experiment name prefix passed to ``set_paths``.

    Returns:
        None.
    """
    exp_name = exp_name+'_ld'+str(latent_dim)
    alpha_M=alpha_E
    cvfold_fname='v2_aT_'+str(alpha_T)+\
                '_aE_'+str(alpha_E)+\
                '_aM_'+str(alpha_M)+\
                '_cs_'+str(lambda_TE)+\
                '_ld_'+str(latent_dim)+\
                '_bs_200_se_500_ne_1500_cv_'+str(cvfold)+\
                '_ri_0500_ft-summary'
    # Dots from the float formatting are replaced so only the extension has one.
    cvfold_fname=cvfold_fname.replace('.','-')+'.mat'
    dir_pth = set_paths(exp_name=exp_name)

    #Load pruned tree, embeddings, and cell type annotations
    with open(dir_pth['data']+"PS_v4_beta_0-4_matched_well-sampled_dend_RData_Tree_20181220_pruned_n88_n60_classifications.json") as f:
        all_classifications = json.load(f)
    O = sio.loadmat(dir_pth['data']+'PS_v4_beta_0-4_matched_well-sampled.mat',squeeze_me=True)
    CV = sio.loadmat(dir_pth['cvfolds']+cvfold_fname,squeeze_me=True)
    htree_df = pd.read_csv(dir_pth['data']+'dend_RData_Tree_20181220_pruned.csv')
    htree = HTree(htree_df=htree_df)
    all_descendants = htree.get_all_descendants()

    result_fname = 'cv_classification_results_' + \
                    embedding + \
                    '_aT_'+str(alpha_T) + \
                    '_aE_'+str(alpha_E) + \
                    '_aM_'+str(alpha_M)+ \
                    '_csTE_'+str(lambda_TE) + \
                    '_ld_'+str(latent_dim) + \
                    '_randseed_'+str(rand_seed) + \
                    '_start_'+str(start_i) + \
                    '_stop_'+str(stop_i) + \
                    '_cv_'+str(cvfold) +\
                    '_rn_'+root_node
    result_fname = result_fname.replace('.','-')+'.csv'

    # Single source of truth for the CSV layout: metric-major, split-minor,
    # followed by the run identifiers.  Header and rows are both derived
    # from it so they cannot drift apart.
    splits = ['train', 'val', 'test']
    columns = [ds + '_' + metric
               for metric in ['acc', 'ari', 'ami', 'nmi', 'samples']
               for ds in splits]
    columns += ['cvfold', 'classification_id', 'n_classes']

    max_i = min(stop_i,len(all_classifications[root_node]))
    write_header=True
    for i in range(start_i,max_i,1):
        print('Iter {:6d} in range {:6d} to {:6d}'.format(i,start_i,max_i))
        classification_id = root_node+'_'+str(i)
        this_classification = all_classifications[root_node][i]
        n_classes=len(this_classification)

        #Classifier only works for n_classes > 1
        if n_classes>1:
            X = relabel_restrict_inputs(CV=CV,O=O,this_classification=this_classification,descendant_dict=all_descendants)
            # NOTE(review): penalty='none' and multi_class are deprecated in
            # recent scikit-learn releases; left unchanged to stay compatible
            # with the version this script targets -- confirm before upgrading.
            clf = LogisticRegression(penalty='none',
                                     random_state=rand_seed,
                                     solver='lbfgs',
                                     max_iter=10000,
                                     multi_class='multinomial',
                                     class_weight='balanced').fit(X['train'][embedding], X['train']['cluster'])

            result={'cvfold': cvfold,
                    'classification_id': classification_id,
                    'n_classes': n_classes}
            for ds in splits:
                pred_label = clf.predict(X[ds][embedding])
                result[ds+'_acc'] = np.sum(pred_label==X[ds]['cluster'])/X[ds]['cluster'].size
                result[ds+'_ari'] = adjusted_rand_score(X[ds]['cluster'], pred_label)
                result[ds+'_ami'] = adjusted_mutual_info_score(X[ds]['cluster'], pred_label)
                result[ds+'_nmi'] = normalized_mutual_info_score(X[ds]['cluster'], pred_label)
                result[ds+'_samples'] = pred_label.size

            # newline='' lets the csv module control line endings; without it
            # blank rows appear on platforms that translate '\n'.
            with open(dir_pth['result']+result_fname,'a',newline='') as f:
                writer = csv.writer(f)
                # The header is written once per call, before the first row.
                if write_header:
                    writer.writerow(columns)
                    write_header=False
                writer.writerow([result[col] for col in columns])