def aggregate_stats(infiles, outfile):
    """
    Combine all the aggstats into a single file
    
    Compute summary statistics
    """

    res = []
    for infile in infiles:
        d = pickle.load(open(infile, 'rb'))
        print("The file is", infile)
        assigndf = d['df']
        meta = d['meta']
        neurons = meta['neurons']


        m = extract_metadata(infile)
        if len(m) == 0:
            # skip the non-replicated runs
            continue 

        for k, v in m.items():
            assigndf[k] = v
        

        assigndf['true_assign_role'] = [np.array(neurons['role']) for _ in range(len(assigndf))]
        # compute the statistics
        assigndf['ari'] = assigndf.apply(lambda x : metrics.adjusted_rand_score(x['true_assign'], irm.util.canonicalize_assignment(x['assign'])), axis=1)

        assigndf['homogeneity'] = assigndf.apply(lambda x : metrics.homogeneity_score(x['true_assign'], irm.util.canonicalize_assignment(x['assign'])), axis=1)

        assigndf['completeness'] = assigndf.apply(lambda x : metrics.completeness_score(x['true_assign'], irm.util.canonicalize_assignment(x['assign'])), axis=1)


        # don't consider the ones where the role is "none" as these are multi-role ones
        neurons.loc[neurons['role'].isnull(), 'role'] = 'I'
        
        assigndf['role_ari'] = assigndf.apply(lambda x : metrics.adjusted_rand_score(neurons['role'], 
                                                                                     irm.util.canonicalize_assignment(x['assign'])), axis=1)

        assigndf['role_homogeneity'] = assigndf.apply(lambda x : metrics.homogeneity_score(neurons['role'], 
                                                                                           irm.util.canonicalize_assignment(x['assign'])), axis=1)

        assigndf['role_completeness'] = assigndf.apply(lambda x : metrics.completeness_score(neurons['role'], 
                                                                                             irm.util.canonicalize_assignment(x['assign'])), axis=1)



        assigndf['type_n_true'] = assigndf.apply(lambda x : len(np.unique(x['true_assign'])), axis=1)
        assigndf['type_n_learned'] = assigndf.apply(lambda x : len(np.unique(x['assign'])), axis=1)
        assigndf['auc'] = assigndf.apply(lambda x: metrics.roc_auc_score(x['heldout_link_truth'], x['heldout_link_predprob']), axis=1)
        #assigndf['f1'] = assigndf.apply(lambda x: metrics.f1_score(x['heldout_link_truth'], x['heldout_link_predprob']), axis=1)

        # 

        # fraction of mass in top N types
        
        res.append(assigndf)
    alldf = pandas.concat(res)
    pickle.dump(alldf, open(outfile, 'wb'), -1)
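
# A minimal usage sketch, not from the original project: the glob pattern and
# output path below are assumptions. aggregate_stats expects each input pickle
# to hold a dict with 'df' (a DataFrame of sampled assignments) and 'meta'
# (whose 'neurons' table has a 'role' column).
if __name__ == "__main__":
    import glob
    infiles = sorted(glob.glob("sweeps/*.aggstats.pickle"))  # hypothetical layout
    aggregate_stats(infiles, "all_aggstats.pickle")
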
def kmeans(input_file, n_clusters, Output):
    lvltrace.lvltrace("LVLEntree dans kmeans unsupervised")
    ncol=tools.file_col_coma(input_file)
    data = np.loadtxt(input_file, delimiter=',', usecols=range(ncol-1))
    X = data[:,1:]
    y = data[:,0]
    sample_size, n_features = X.shape
    k_means=cluster.KMeans(init='k-means++', n_clusters=n_clusters, n_init=10)
    k_means.fit(X)
    reduced_data = k_means.transform(X)
    values = k_means.cluster_centers_.squeeze()
    labels = k_means.labels_
    k_means_cluster_centers = k_means.cluster_centers_
    print "#########################################################################################################\n"
    #print y
    #print labels
    print "K-MEANS\n"
    print('homogeneity_score: %f'%metrics.homogeneity_score(y, labels))
    print('completeness_score: %f'%metrics.completeness_score(y, labels))
    print('v_measure_score: %f'%metrics.v_measure_score(y, labels))
    print('adjusted_rand_score: %f'%metrics.adjusted_rand_score(y, labels))
    print('adjusted_mutual_info_score: %f'%metrics.adjusted_mutual_info_score(y,  labels))
    print('silhouette_score: %f'%metrics.silhouette_score(X, labels, metric='euclidean', sample_size=sample_size))
    print('\n')
    print "#########################################################################################################\n"
    results = Output+"kmeans_scores.txt"
    file = open(results, "w")
    file.write("K-Means Scores\n")
    file.write("Homogeneity Score: %f\n"%metrics.homogeneity_score(y, labels))
    file.write("Completeness Score: %f\n"%metrics.completeness_score(y, labels))
    file.write("V-Measure: %f\n"%metrics.v_measure_score(y, labels))
    file.write("The adjusted Rand index: %f\n"%metrics.adjusted_rand_score(y, labels))
    file.write("Adjusted Mutual Information: %f\n"%metrics.adjusted_mutual_info_score(y,  labels))
    file.write("Silhouette Score: %f\n"%metrics.silhouette_score(X, labels, metric='euclidean', sample_size=sample_size))
    file.write("\n")
    file.write("True Value, Cluster numbers, Iteration\n")
    for n in range(len(y)):
        file.write("%f, %f, %i\n"%(y[n],labels[n],(n+1)))
    file.close()
    import matplotlib.pyplot as plt
    # plot the results along with the labels
    k_means_cluster_centers = k_means.cluster_centers_
    fig, ax = plt.subplots()
    im = ax.scatter(X[:, 0], X[:, 1], c=labels, marker='.')
    for k in range(n_clusters):
        cluster_center = k_means_cluster_centers[k]
        ax.plot(cluster_center[0], cluster_center[1], color='b',
                marker='x', markersize=6)
    fig.colorbar(im)
    plt.title("Number of clusters: %i"%n_clusters)
    save = Output + "kmeans.png"
    plt.savefig(save)
    lvltrace.lvltrace("LVLsortie dans kmeans unsupervised")
Example 3
def run_clustering( clusterer, data, labels ):
    """
    Cluster: using a predefined and parameterized clustering algorithm, fit
    a dataset and compute clustering metrics against a set of ground-truth labels.

        clusterer: the clustering algorithm, from sklearn
        data:      array-like dataset input
        labels:    vector of ground-truth labels

    """

    # Time the operation
    t0 = time()
    clusterer.fit(data)
    t1 = time()

    # Compute metrics
    runtime         = (t1 - t0)
    homogeneity     = metrics.homogeneity_score(   labels, clusterer.labels_ )
    completeness    = metrics.completeness_score(  labels, clusterer.labels_ )
    v_measure       = metrics.v_measure_score(     labels, clusterer.labels_ )
    adjusted_rand   = metrics.adjusted_rand_score( labels, clusterer.labels_ )
    adjusted_mutual = metrics.adjusted_mutual_info_score( labels,
                                                          clusterer.labels_ )

    # Output to logs
    logging.info("  |-        Execution time: %fs"   % runtime)
    logging.info("  |-           Homogeneity: %0.3f" % homogeneity)
    logging.info("  |-          Completeness: %0.3f" % completeness)
    logging.info("  |-             V-measure: %0.3f" % v_measure)
    logging.info("  |-   Adjusted Rand-Index: %.3f"  % adjusted_rand)
    logging.info("  |-  Adjusted Mutual Info: %.3f"  % adjusted_mutual)
Example 4
    def test_KMeans_scores(self):
        digits = datasets.load_digits()
        df = pdml.ModelFrame(digits)

        scaled = pp.scale(digits.data)
        df.data = df.data.pp.scale()
        self.assert_numpy_array_almost_equal(df.data.values, scaled)

        clf1 = cluster.KMeans(init='k-means++', n_clusters=10,
                              n_init=10, random_state=self.random_state)
        clf2 = df.cluster.KMeans(init='k-means++', n_clusters=10,
                                 n_init=10, random_state=self.random_state)
        clf1.fit(scaled)
        df.fit_predict(clf2)

        expected = m.homogeneity_score(digits.target, clf1.labels_)
        self.assertEqual(df.metrics.homogeneity_score(), expected)

        expected = m.completeness_score(digits.target, clf1.labels_)
        self.assertEqual(df.metrics.completeness_score(), expected)

        expected = m.v_measure_score(digits.target, clf1.labels_)
        self.assertEqual(df.metrics.v_measure_score(), expected)

        expected = m.adjusted_rand_score(digits.target, clf1.labels_)
        self.assertEqual(df.metrics.adjusted_rand_score(), expected)

        expected = m.homogeneity_score(digits.target, clf1.labels_)
        self.assertEqual(df.metrics.homogeneity_score(), expected)

        expected = m.silhouette_score(scaled, clf1.labels_, metric='euclidean',
                                      sample_size=300, random_state=self.random_state)
        result = df.metrics.silhouette_score(metric='euclidean', sample_size=300,
                                             random_state=self.random_state)
        self.assertAlmostEqual(result, expected)
Example 5
def clustering(dataset):
    vectorizer = dataset.vectorizer
    X = dataset.X
    true_k = dataset.n_classes
    labels = dataset.target

    km = cluster.KMeans(n_clusters=true_k, max_iter=100, n_init=1)

    print("Clustering sparse data with %s" % km)
    t0 = time()
    km.fit(X)
    print("done in %0.3fs" % (time() - t0))
    print()

    print("Homogeneity: %0.3f" % metrics.homogeneity_score(labels, km.labels_))
    print("Completeness: %0.3f" % metrics.completeness_score(labels, km.labels_))
    print("V-measure: %0.3f" % metrics.v_measure_score(labels, km.labels_))
    print("Adjusted Rand-Index: %.3f"
          % metrics.adjusted_rand_score(labels, km.labels_))
    print("Silhouette Coefficient: %0.3f"
          % metrics.silhouette_score(X, km.labels_, sample_size=1000))
    print()

    print("Top terms per cluster:")
    order_centroids = km.cluster_centers_.argsort()[:, ::-1]
    terms = vectorizer.get_feature_names()
    sizes = np.sum(km.labels_[:, np.newaxis] == np.arange(true_k), axis=0)
    for i in range(true_k):
        print("Cluster %d (%d):" % (i, sizes[i]), end='')
        for ind in order_centroids[i, :10]:
            print(' %s' % terms[ind], end='')
        print()
Example 6
def compare(method1, method2, fig=False):
    X1 = np.load('{0}_{1}_X_2d.npy'.format(species, method1))
    X2 = np.load('{0}_{1}_X_2d.npy'.format(species, method2))
    
    print('n_cluster\tHomo\tCompl\tNMI\tARI')
    for i in range(2, 6):
        clust1 = Clustering(species, method1, X1, None, n_clusters=i)
        clust2 = Clustering(species, method2, X2, None, n_clusters=i)
        
        clust1.agglomerative(linkage='ward')
        clust2.agglomerative(linkage='ward')
        
        label1 = clust1.pred_labels('ward')
        label2 = clust2.pred_labels('ward')
        
        
        if i == 3 and fig:
            names = np.unique(label1)
            figName = '{0}_{1}_on_{2}'.format(species, method1, method2)
            plot2d(X2, label1, names, figName, figName)

            names = np.unique(label2)
            figName = '{0}_{1}_on_{2}'.format(species, method2, method1)
            plot2d(X1, label2, names, figName, figName)
    
        print('{0}\t{1}\t{2}\t{3}\t{4}\n'.format(i, metrics.homogeneity_score(label1, label2),
                                                metrics.completeness_score(label1, label2),
                                                metrics.normalized_mutual_info_score(label1, label2),
                                                metrics.adjusted_rand_score(label1, label2)))
def cluster_evaluation(D, y_true, n_clusters, eps=0.8, min_samples=10):
    ##############################################################################
    # Extract Y true
    labels_true = y_true

    ##############################################################################
    # transform distance matrix into a similarity matrix
    S = 1 - D 

    ##############################################################################
    # compute DBSCAN
    #db = DBSCAN(eps=eps, min_samples=min_samples).fit(S)
    db = Ward(n_clusters=n_clusters).fit(S)
    #core_samples = db.core_sample_indices_
    labels = db.labels_

    # number of clusters in labels, ignoring noise if present
    n_clusters_ = len(set(labels)) - (1 if -1 in labels else 0)

    print('Number of clusters: %d' % n_clusters_)
    print('Homogeneity: %0.3f' % metrics.homogeneity_score(labels_true, labels))
    print('Completeness: %0.3f' % metrics.completeness_score(labels_true, labels))
    print('V-measure: %0.3f' % metrics.v_measure_score(labels_true, labels))
    print('Adjusted Rand Index: %0.3f' % metrics.adjusted_rand_score(labels_true, labels))
    print('Adjusted Mutual Information: %0.3f' % metrics.adjusted_mutual_info_score(labels_true, labels))
    print('Silhouette Coefficient: %0.3f' % metrics.silhouette_score(D, labels, metric='precomputed'))
Example 8
def cluster(algorithm, data, topics, make_silhouette=False):
  print(str(algorithm))
  clusters = algorithm.fit_predict(data)
  labels = algorithm.labels_
  print('Homogeneity: %0.3f' % metrics.homogeneity_score(topics, labels))
  print('Completeness: %0.3f' % metrics.completeness_score(topics, labels))
  print('V-measure: %0.3f' % metrics.v_measure_score(topics, labels))
  print('Adjusted Rand index: %0.3f' % metrics.adjusted_rand_score(topics, labels))
  print('Silhouette test: %0.3f' % metrics.silhouette_score(data, labels))
  print(' ***************** ')

  silhouettes = metrics.silhouette_samples(data, labels)
  num_clusters = len(set(clusters))
  print('num clusters: %d' % num_clusters)
  print('num fitted: %d' % len(clusters))

  # Make a silhouette plot if the flag is set
  if make_silhouette:
    order = numpy.lexsort((-silhouettes, clusters)) 
    indices = [numpy.flatnonzero(clusters[order] == k) for k in range(num_clusters)]
    ytick = [(numpy.max(ind)+numpy.min(ind))/2 for ind in indices]
    ytickLabels = ["%d" % x for x in range(num_clusters)]
    cmap = cm.jet( numpy.linspace(0,1,num_clusters) ).tolist()
    clr = [cmap[i] for i in clusters[order]]

    fig = plt.figure()
    ax = fig.add_subplot(111)
    ax.barh(range(data.shape[0]), silhouettes[order], height=1.0,   
            edgecolor='none', color=clr)
    ax.set_ylim(ax.get_ylim()[::-1])
    plt.yticks(ytick, ytickLabels)
    plt.xlabel('Silhouette Value')
    plt.ylabel('Cluster')
    plt.savefig('cluster.png')
Example 9
def predictAffinityPropagation(X, labels_true):
	#ranX, ranY = shuffle(X, y, random_state=0)
	af = AffinityPropagation(preference=-50).fit(X)
	cluster_centers_indices = af.cluster_centers_indices_
	labels = af.labels_

	n_clusters_ = len(cluster_centers_indices)

	print('Estimated number of clusters: %d' % n_clusters_)
	print("Homogeneity: %0.3f" % metrics.homogeneity_score(labels_true, labels))
	print("Completeness: %0.3f" % metrics.completeness_score(labels_true, labels))
	print("V-measure: %0.3f" % metrics.v_measure_score(labels_true, labels))
	print("Adjusted Rand Index: %0.3f"
      % metrics.adjusted_rand_score(labels_true, labels))
	print("Adjusted Mutual Information: %0.3f"
      % metrics.adjusted_mutual_info_score(labels_true, labels))
	print("Silhouette Coefficient: %0.3f"
      % metrics.silhouette_score(X, labels, metric='sqeuclidean'))

	plt.close('all')
	plt.figure(1)
	plt.clf()

	colors = cycle('bgrcmykbgrcmykbgrcmykbgrcmyk')
	for k, col in zip(range(n_clusters_), colors):
	    class_members = labels == k
	    cluster_center = X[cluster_centers_indices[k]]
	    plt.plot(X[class_members, 0], X[class_members, 1], col + '.')
	    plt.plot(cluster_center[0], cluster_center[1], 'o', markerfacecolor=col,
	             markeredgecolor='k', markersize=14)
	    for x in X[class_members]:
	        plt.plot([cluster_center[0], x[0]], [cluster_center[1], x[1]], col)

	plt.title('Estimated number of clusters: %d' % n_clusters_)
	plt.show()
def bench_k_means(estimator, name, data, sample_size, labels,postIds):
    data=sparse.csr_matrix(data)
    t0 = time()
    print("Performing dimensionality reduction using LSA")
    t0 = time()
    lsa = TruncatedSVD(500)

    data = lsa.fit_transform(data)
    data = Normalizer(copy=False).fit_transform(data)

    print("done in %fs" % (time() - t0))
    print()

    #sData=sparse.csr_matrix(data)
    val=estimator.fit(data)
    print('% 9s   %.2fs    %i   %.3f   %.3f   %.3f   %.3f   %.3f '
          % (name, (time() - t0), estimator.inertia_,
             metrics.homogeneity_score(labels, estimator.labels_),
             metrics.completeness_score(labels, estimator.labels_),
             metrics.v_measure_score(labels, estimator.labels_),
             metrics.adjusted_rand_score(labels, estimator.labels_),
             metrics.adjusted_mutual_info_score(labels,  estimator.labels_)))

    print("Parsing USer File:")
    parseUserFile()
    print("extracting User File:")
    clusterDict=extractCluster(postIds,estimator.labels_)
    print("writing Cluster Data to File")
    writeCluterToFile(clusterDict)
Example 11
def bench_k_means(estimator, name, data, target_labels, sample_size):
  """For benchmarking K-Means estimators. Prints different clustering metrics and train accuracy
  ARGS
    estimator: K-Means clustering algorithm <sklearn.cluster.KMeans>
    name: estimator name <str>
    data: array-like or sparse matrix, shape=(n_samples, n_features)
    target_labels: labels of data points <number array>
    sample_size: size of the sample to use when computing the Silhouette Coefficient <int>
  """ 
  t0 = time()
  estimator.fit(data)

  _, _, train_accuracy = compute_residuals_and_rsquared(estimator.labels_, target_labels)

  print('% 9s\t%.2fs\t%i\t%.3f\t%.3f\t%.3f\t%.3f\t%.3f\t%.3f\t%.3f'
        % (name, (time() - t0), estimator.inertia_,
           metrics.homogeneity_score(target_labels, estimator.labels_),
           metrics.completeness_score(target_labels, estimator.labels_),
           metrics.v_measure_score(target_labels, estimator.labels_),
           metrics.adjusted_rand_score(target_labels, estimator.labels_),
           metrics.adjusted_mutual_info_score(target_labels,  estimator.labels_),
           metrics.silhouette_score(data, estimator.labels_,metric='euclidean',sample_size=sample_size),
           train_accuracy
          )
        )
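
# A hypothetical call for illustration only: it assumes the project-specific
# helper compute_residuals_and_rsquared is importable, so it is left commented.
# from sklearn import cluster, datasets
# digits = datasets.load_digits()
# bench_k_means(cluster.KMeans(init='k-means++', n_clusters=10, n_init=10),
#               name='k-means++', data=digits.data,
#               target_labels=digits.target, sample_size=300)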
Example 12
def bench_k_means(estimator, data, labels):
    t0 = time()
    estimator.fit(data)
    print("time to fit: {:.5}".format(time() - t0))
    homogeneity = metrics.homogeneity_score(labels, estimator.labels_)
    completeness = metrics.completeness_score(labels, estimator.labels_)
    v_measure = metrics.v_measure_score(labels, estimator.labels_)
    print("homogeneity {:.5}, completeness {:.5}, v_measure_score {:.5}".format(
        homogeneity, completeness, v_measure)
    )

    adj_rand_score = metrics.adjusted_rand_score(
        labels, estimator.labels_
    )
    print("adjusted_rand_score {:.5}".format(adj_rand_score))

    adj_mutual_info_score = metrics.adjusted_mutual_info_score(
        labels,  estimator.labels_
    )
    print("adjusted_mutual_info_score {:.5}".format(
        adj_mutual_info_score)
    )

    silhouette_score = metrics.silhouette_score(
        data, estimator.labels_, metric='euclidean'
    )
    print("silhouette_score {:.5}".format(
        metrics.silhouette_score(data, estimator.labels_,
                                 metric='euclidean'))
    )

    return [
        homogeneity, completeness, v_measure, adj_rand_score,
        adj_mutual_info_score, silhouette_score
    ]
Example 13
def cluster(Z, K=4, algo='kmeans'):
	descr = Z.columns
	X = Imputer().fit_transform(Z)

	##############################################################################
	if algo == 'dbscan':
		# Compute DBSCAN
		db = DBSCAN(eps=0.3, min_samples=10).fit(X)
		core_samples_mask = np.zeros_like(db.labels_, dtype=bool)
		core_samples_mask[db.core_sample_indices_] = True
		labels = db.labels_
        
		# Number of clusters in labels, ignoring noise if present.
		n_clusters_ = len(set(labels)) - (1 if -1 in labels else 0)
        
		print('Estimated number of clusters: %d' % n_clusters_)
		print("Homogeneity: %0.3f" % metrics.homogeneity_score(labels_true, labels))
		print("Completeness: %0.3f" % metrics.completeness_score(labels_true, labels))
		print("V-measure: %0.3f" % metrics.v_measure_score(labels_true, labels))
		print("Adjusted Rand Index: %0.3f" % metrics.adjusted_rand_score(labels_true, labels))
		print("Adjusted Mutual Information: %0.3f" % metrics.adjusted_mutual_info_score(labels_true, labels))
		print("Silhouette Coefficient: %0.3f" % metrics.silhouette_score(X, labels))
	
	elif algo == 'kmeans':
		km = KMeans(n_clusters=K)
		km.fit(X)
		print(km.labels_)
		return km
Example 14
File: a.py Project: chengxwcq/ee219
def get_result(km, labels):
    homo_score = metrics.homogeneity_score(labels, km.labels_)
    complete_score = metrics.completeness_score(labels, km.labels_)
    v_score = metrics.v_measure_score(labels, km.labels_)
    rand_score = metrics.adjusted_rand_score(labels, km.labels_)
    mutual_info = metrics.adjusted_mutual_info_score(labels, km.labels_)
    return homo_score, complete_score, v_score, rand_score, mutual_info
Example 15
    def run(self):
        meandist=[]
        homogeneity_scores=[]
        completeness_scores=[]
        rand_scores=[]
        silhouettes=[]

        for k in self.clusters:
            model = KMeans(n_clusters=k, max_iter=5000, init='k-means++')
            labels = model.fit_predict(self.X)

            if k == self.targetcluster and self.stats:
                nd_data = np.concatenate((self.X, np.expand_dims(labels, axis=1),np.expand_dims(self.y, axis=1)), axis=1)
                pd_data = pd.DataFrame(nd_data)
                pd_data.to_csv("cluster.csv", index=False, index_label=False, header=False)
                print(model.cluster_centers_)

                for i in range(0, 3):
                    print("Cluster {}".format(i))
                    cluster = pd_data.loc[pd_data.iloc[:,-2]==i].iloc[:,-2:]
                    print(cluster.shape[0])
                    print(float(cluster.loc[cluster.iloc[:,-1]==0].shape[0])/cluster.shape[0])
                    print(float(cluster.loc[cluster.iloc[:,-1]==1].shape[0])/cluster.shape[0])

            meandist.append(sum(np.min(cdist(self.X, model.cluster_centers_, 'euclidean'), axis=1))/ self.X.shape[0])
            homogeneity_scores.append(metrics.homogeneity_score(self.y, labels))
            completeness_scores.append(metrics.completeness_score(self.y, labels))
            rand_scores.append(metrics.adjusted_rand_score(self.y, labels))

        if self.gen_plot:
            #self.visualize()

            self.plot(meandist, homogeneity_scores, completeness_scores, rand_scores, silhouettes)
Example 16
def cluster(model, uids):
    ##############################################################################
    # Generate sample data
    X = []
    for uid in uids:
        X.append(model.docvecs[uid])
    labels_true = uids

    ##############################################################################
    # Compute Affinity Propagation
    af = AffinityPropagation(preference=-50).fit(X)
    pickle.dump(af, open('data/af.pick', 'wb'))
    cluster_centers_indices = af.cluster_centers_indices_
    labels = af.labels_

    n_clusters_ = len(cluster_centers_indices)

    print('Estimated number of clusters: %d' % n_clusters_)
    print("Homogeneity: %0.3f" % metrics.homogeneity_score(labels_true, labels))
    print("Completeness: %0.3f" % metrics.completeness_score(labels_true, labels))
    print("V-measure: %0.3f" % metrics.v_measure_score(labels_true, labels))
    print("Adjusted Rand Index: %0.3f"
          % metrics.adjusted_rand_score(labels_true, labels))
    print("Adjusted Mutual Information: %0.3f"
          % metrics.adjusted_mutual_info_score(labels_true, labels))
    print("Silhouette Coefficient: %0.3f"
          % metrics.silhouette_score(X, labels, metric='sqeuclidean'))
Example 17
def affin_test():
    savefile = open('traindata.pkl', 'rb')
    (x_train, y_train, t1) = pickle.load(savefile)
    savefile.close()
    
     
    x_train, X_valid, y_train, y_valid = cross_validation.train_test_split(
        x_train, y_train, test_size=0.9, random_state=42)    
    
    
    labels_true = y_train 
    
    x_train = StandardScaler().fit_transform(x_train)
    af = AffinityPropagation(preference=-50).fit(x_train)
    cluster_centers_indices = af.cluster_centers_indices_
    labels = af.labels_
    
    n_clusters_ = len(cluster_centers_indices)
    
    print('Estimated number of clusters: %d' % n_clusters_)
    print("Homogeneity: %0.3f" % metrics.homogeneity_score(labels_true, labels))
    print("Completeness: %0.3f" % metrics.completeness_score(labels_true, labels))
    print("V-measure: %0.3f" % metrics.v_measure_score(labels_true, labels))
    print("Adjusted Rand Index: %0.3f"
          % metrics.adjusted_rand_score(labels_true, labels))
    print("Adjusted Mutual Information: %0.3f"
          % metrics.adjusted_mutual_info_score(labels_true, labels))
    print("Silhouette Coefficient: %0.3f"
          % metrics.silhouette_score(x_train, labels, metric='sqeuclidean'))
def bestClassify(X,Y):
	"Best classifier function"
	tfidf = True

	if tfidf:
		vec = TfidfVectorizer(preprocessor = identity,
							tokenizer = identity, sublinear_tf = True)
	else:
		vec = CountVectorizer(preprocessor = identity,
							tokenizer = identity)

	km = KMeans(n_clusters=2, n_init=100, verbose=1)
	clusterer = Pipeline( [('vec', vec),
								('cls', km)] )

	prediction = clusterer.fit_predict(X,Y)

	checker = defaultdict(list)
	for pred,truth in zip(prediction,Y):
		checker[pred].append(truth)

	labeldict = {}
	for pred, label in checker.items():
		labeldict[pred] = Counter(label).most_common(1)[0][0]
		#print(pred, Counter(label).most_common(1)[0][0])

	prediction = [labeldict[p] for p in prediction]
	labels = list(labeldict.values())
	print(labels)
	print(confusion_matrix(Y, prediction, labels=labels))

	print("Homogeneity:", homogeneity_score(Y,prediction))
	print("Completeness:", completeness_score(Y,prediction))
	print("V-measure:", v_measure_score(Y,prediction))
	print("Rand-Index:", adjusted_rand_score(Y,prediction))
Example 19
def kmeans_setup(data):
	

	if pca_f == 1:
		pca = PCA(n_components = num_clusters).fit(data)
		initializer = pca.components_
		name = 'PCA'
	else:
		initializer = 'k-means++'
		name = 'k-means++'

	t0 = time()
	
	estimator = KMeans(init=initializer, n_clusters=num_clusters, n_init = num_init, max_iter = num_iterations)
	estimator.fit(data)
	
	if debug == True:
		sample_size = 300
		print('% 9s   %.2fs    %i   %.3f   %.3f   %.3f   %.3f   %.3f    %.3f'
	          % (name, (time() - t0), estimator.inertia_,
	             metrics.homogeneity_score(labels, estimator.labels_),
	             metrics.completeness_score(labels, estimator.labels_),
	             metrics.v_measure_score(labels, estimator.labels_),
	             metrics.adjusted_rand_score(labels, estimator.labels_),
	             metrics.adjusted_mutual_info_score(labels,  estimator.labels_),
	             metrics.silhouette_score(data, estimator.labels_,
	                                      metric='euclidean',
	                                      sample_size=sample_size)))
	return estimator
Example 20
def clustering_by_kmeans(vectorizer, X, true_k):
    print "Clustering in " + str(true_k) + " groups by K-means..."
    km = KMeans(n_clusters=true_k, init='k-means++', max_iter=500, n_init=1)
    km.fit_predict(X)

    print "Measuring..."

    print("Homogeneity: %0.3f" % metrics.homogeneity_score(documents, km.labels_))
    print("Completeness: %0.3f" % metrics.completeness_score(documents, km.labels_))
    print("V-measure: %0.3f" % metrics.v_measure_score(documents, km.labels_))  #V-measure is an entropy-based measure which explicitly measures how successfully the criteria of homogeneity and completeness have been satisfied.
    print("Adjusted Rand-Index: %.3f"   % metrics.adjusted_rand_score(documents, km.labels_))
    print("Silhouette Coefficient: %0.3f" % metrics.silhouette_score(X, km.labels_, sample_size=1000))
    # print top terms per cluster

    clusters = km.labels_.tolist()  # 0 iff term is in cluster 0, 1 iff term is in cluster 1 ... (list of terms)
    # print("List of terms belonging to the clusters: " + str(clusters))
    print("Total of " + str(len(km.labels_)) + " documents")

    #Example to get all documents in cluster 0
    #cluster_0 = np.where(clusters==0) # don't forget import numpy as np
    #print cluster_0
    #cluster_0 now contains all indices of the documents in this cluster, to get the actual documents you'd do:
    #X_cluster_0 = documents[cluster_0]
    terms = vectorizer.get_feature_names()

    #print terms
    measuring_kmeans(true_k,clusters)
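
# As noted above, V-measure is the harmonic mean of homogeneity and
# completeness; a small illustrative check on toy labels (not from the
# original snippet):
from sklearn import metrics as _m

_truth = [0, 0, 1, 1, 2, 2]
_pred = [0, 0, 1, 2, 2, 2]
_h = _m.homogeneity_score(_truth, _pred)
_c = _m.completeness_score(_truth, _pred)
assert abs(_m.v_measure_score(_truth, _pred) - 2 * _h * _c / (_h + _c)) < 1e-12
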
def evaluate(labels_true, labels):
    homogeneity = metrics.homogeneity_score(labels_true, labels)
    completeness = metrics.completeness_score(labels_true, labels)
    v_measure = metrics.v_measure_score(labels_true, labels)
    adjusted_rand = metrics.adjusted_rand_score(labels_true, labels)
    adjusted_mutual_info = metrics.adjusted_mutual_info_score(labels_true, labels)
    #silhouette = metrics.silhouette_score(data, labels, metric='sqeuclidean')
    return homogeneity, completeness, v_measure, adjusted_rand, adjusted_mutual_info#, silhouette
Example 22
def print_cluster(clusterTrainClass, labels, clusterTestStory):
    print("Homogeneity: %0.3f" % metrics.homogeneity_score(clusterTrainClass, labels))
    print("Completeness: %0.3f" % metrics.completeness_score(clusterTrainClass, labels))
    print("V-measure: %0.3f" % metrics.v_measure_score(clusterTrainClass, labels))
    print("Adjusted Rand Index: %0.3f" % metrics.adjusted_rand_score(clusterTrainClass, labels))
    print("Adjusted Mutual Information: %0.3f" % metrics.adjusted_mutual_info_score(clusterTrainClass, labels))
    print("Silhouette Coefficient:")
    print(metrics.silhouette_score(clusterTestStory, labels, metric='euclidean'))
Example 23
def cluseval(label, truth):
    rand = metrics.adjusted_rand_score(truth, label)
    mutual = metrics.adjusted_mutual_info_score(truth, label)
    homo = metrics.homogeneity_score(truth, label)
    complete = metrics.completeness_score(truth, label)
    v = metrics.v_measure_score(truth, label)
    result = [rand, mutual, homo, complete, v]
    return result
def cluster_metrics(labels_1, labels_2):
    print("\n".join(
        [
            "Normalized Mutual Information: %f" % (normalized_mutual_info_score(labels_1, labels_2)),
            "Adjusted Rand Score: %f" % (adjusted_rand_score(labels_1, labels_2)),
            "Homogeneity: %f" % (homogeneity_score(labels_1, labels_2)),
            "Completeness: %f" % (completeness_score(labels_1, labels_2))
        ]
    ))
Example 25
def main():

    # Parse command line arguments
    parser = argparse.ArgumentParser(usage=__doc__,
            formatter_class=argparse.ArgumentDefaultsHelpFormatter,
            description='Perform spectral clustering.')
    parser.add_argument("--clusters", "-c", type=int, help='Number of clusters.')
    parser.add_argument("--knn", "-k", type=int, default=0, 
            help='Number of nearest neighbors, 0 means all.')
    parser.add_argument("--sm", "-s", 
            help='File containing similarity matrix')
    parser.add_argument("--iterations", "-i", type=int, default=10,
            help='Number of KMeans iterations.')
    parser.add_argument("--true_labels", "-t", 
            help='File containing the true labels.')
    parser.add_argument("--output", "-o", help='Name of the file to write' +
            ' the labels to.')
    parser.add_argument("--normalize", "-n", action='store_true', 
            help='Normalize each row so that the max value is one.')
    args = parser.parse_args()


    sm = np.load(args.sm)
    if args.normalize:
        sm /= sm.max(axis=1)[:, np.newaxis]
        # Ensure symmetric
        sm = (sm + sm.T) / 2
    labels = []
    if args.knn > 0:
        labels = SpectralClustering(n_clusters=args.clusters, 
                affinity='nearest_neighbors', n_neighbors=args.knn,
                n_init=args.iterations).fit(sm).labels_
    else:
        labels = SpectralClustering(n_clusters=args.clusters, 
                affinity='precomputed',
                n_init=args.iterations).fit(sm).labels_
    
    with open(args.output, 'w') as fout:
        for l in labels:
            fout.write(str(l) + '\n')

    # Load the true labels.
    if args.true_labels:
        true_labels = []
        with open(args.true_labels, 'r') as fin:
            for line in fin:
                true_labels.append(int(line.strip()))
        # Run the metrics.
        print("Homogeneity: %0.3f" % metrics.homogeneity_score(true_labels, labels))
        print("Completeness: %0.3f" % metrics.completeness_score(true_labels, labels))
        print("V-measure: %0.3f" % metrics.v_measure_score(true_labels, labels))
        print("Adjusted Rand Index: %0.3f"
                      % metrics.adjusted_rand_score(true_labels, labels))
        print("Adjusted Mutual Information: %0.3f"
                      % metrics.adjusted_mutual_info_score(true_labels, labels))
        print("Silhouette Coefficient: %0.3f"
                      % metrics.silhouette_score(sm, labels))
Example 26
 def eval_clusters(self):
     """calculates the adjusted rand index of the clustering
     based on the label of the points
     """
     _, labels_true, labels_pred = self.get_labels()
     ari = metrics.adjusted_rand_score(labels_true, labels_pred)
     hom = metrics.homogeneity_score(labels_true, labels_pred)
     comp = metrics.completeness_score(labels_true, labels_pred)
     return ari, hom, comp
Example 27
File: exp3.py Project: xulesc/algos
def bench_k_means(estimator, name, data):
    t0 = time()
    estimator.fit(data)
    print('% 9s   %.2fs    %i   %.3f   %.3f   %.3f   %.3f   %.3f'
          % (name, (time() - t0), estimator.inertia_,
             metrics.homogeneity_score(labels, estimator.labels_),
             metrics.completeness_score(labels, estimator.labels_),
             metrics.v_measure_score(labels, estimator.labels_),
             metrics.adjusted_rand_score(labels, estimator.labels_),
             metrics.adjusted_mutual_info_score(labels,  estimator.labels_)))
Example 28
def print_stats(truth, pred):
    print('Homogeneity Score: ' + str(metrics.homogeneity_score(truth, pred)))
    print('Completeness Score: ' +
          str(metrics.completeness_score(truth, pred)))
    print('Adjusted Mutual Information Score: ' +
          str(metrics.adjusted_mutual_info_score(truth, pred)))
    print('Adjusted Rand Index Score: ' +
          str(metrics.adjusted_rand_score(truth, pred)))
    print('Purity: ' +
          str(purity(truth, pred)))
Example 29
 def evaluateAllAlgorithms(self):
   algs = [self.labels_db,self.labels_ap]
   titles = ['DBASE', 'AP']
   for i in range(2):
     print('Algorithm:', titles[i])
     print("\tHomogeneity: %0.3f" % metrics.homogeneity_score(self.labels_gt, algs[i]))
     print("\tCompleteness: %0.3f" % metrics.completeness_score(self.labels_gt, algs[i]))
     print("\tV-measure: %0.3f" % metrics.v_measure_score(self.labels_gt, algs[i]))
     print("\tAdjusted Rand Index: %0.3f"% metrics.adjusted_rand_score(self.labels_gt, algs[i]))
     print("\tAdjusted Mutual Information: %0.3f"% metrics.adjusted_mutual_info_score(self.labels_gt, algs[i]))
Example 30
def get_cluster_metrics(X, labels, labels_true=None):
    metrics_dict = dict()
    metrics_dict['Silhouette coefficient'] = metrics.silhouette_score(X,
                                                                      labels,
                                                                      metric='precomputed')
    if labels_true:
        metrics_dict['Completeness score'] = metrics.completeness_score(labels_true, labels)
        metrics_dict['Homogeneity score'] = metrics.homogeneity_score(labels_true, labels)

    return metrics_dict
Example 31
# KMeans now manages its own OpenMP-based parallelism instead of relying on
# joblib, so the `n_jobs` parameter has no effect anymore. For more details on
# how to control the number of threads, please refer to our :ref:`parallelism`
# notes.
import scipy
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.cluster import KMeans
from sklearn.datasets import make_blobs
from sklearn.metrics import completeness_score

rng = np.random.RandomState(0)
X, y = make_blobs(random_state=rng)
X = scipy.sparse.csr_matrix(X)
X_train, X_test, _, y_test = train_test_split(X, y, random_state=rng)
kmeans = KMeans(algorithm='elkan').fit(X_train)
print(completeness_score(kmeans.predict(X_test), y_test))

##############################################################################
# Improvements to the histogram-based Gradient Boosting estimators
# ----------------------------------------------------------------
# Various improvements were made to
# :class:`~sklearn.ensemble.HistGradientBoostingClassifier` and
# :class:`~sklearn.ensemble.HistGradientBoostingRegressor`. On top of the
# Poisson loss mentioned above, these estimators now support :ref:`sample
# weights <sw_hgbdt>`. Also, an automatic early-stopping criterion was added:
# early-stopping is enabled by default when the number of samples exceeds 10k.
# Finally, users can now define :ref:`monotonic constraints
# <monotonic_cst_gbdt>` to constrain the predictions based on the variations of
# specific features. In the following example, we construct a target that is
# generally positively correlated with the first feature, with some noise.
# Applying monotonic constraints allows the prediction to capture the global
# effect of the first feature, instead of fitting the noise.
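
# A minimal sketch of the constrained fit described above; the synthetic data
# and variable names here are illustrative, not the original example (on
# scikit-learn < 1.0 this also needs
# `from sklearn.experimental import enable_hist_gradient_boosting`).
import numpy as np
from sklearn.ensemble import HistGradientBoostingRegressor

rng_cst = np.random.RandomState(42)
X_cst = rng_cst.randn(500, 2)
# target generally increases with the first feature, plus noise
y_cst = 3 * X_cst[:, 0] + rng_cst.randn(500)

# 1 = enforce a positive (non-decreasing) effect of the first feature,
# 0 = no constraint on the second feature
gbdt_cst = HistGradientBoostingRegressor(monotonic_cst=[1, 0]).fit(X_cst, y_cst)
gbdt_no_cst = HistGradientBoostingRegressor().fit(X_cst, y_cst)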
Example 32
printf("data: %d instances  %d parameters\n", data_n, data_p)

#--------------------------------------------------------------------------------------------
printf("#%-5s %-5s %-5s %-5s %-5s %-5s %-5s\n", "ACC", "H**O", "COMPL", "VM",
       "ARAND", "MI", "CH-idx")

for i in range(opts["iterations"]):
    em = GaussianMixture(n_components=components,
                         n_init=13,
                         covariance_type="full").fit(data)
    guess = em.predict(data)

    acc = metrics.accuracy_score(labels, guess)
    homo = metrics.homogeneity_score(
        labels, guess)  # compare the true labels to those EM predicted
    comp = metrics.completeness_score(labels, guess)
    vm = metrics.v_measure_score(labels, guess)
    arand = metrics.adjusted_rand_score(labels, guess)
    mi = metrics.adjusted_mutual_info_score(labels,
                                            guess,
                                            average_method="arithmetic")
    ch = metrics.calinski_harabaz_score(data, guess)

    printf(" %6.3f %6.3f %6.3f %6.3f %6.3f %6.3f %6.3f\n", acc, homo, comp, vm,
           arand, mi, ch)

    if i == 0:  # just plot the first
        tokens = train_fn.split("/")
        # build file name as emax_<data-type>_<clusters>.eps
        tokens = tokens[-1].split("_")
        title = sprintf("Exp Max %s k=%d", tokens[0], components)
Example 33
hdb_labels = hdb.labels_
hdb_elapsed_time = time.time() - hdb_t1

db_t1 = time.time()
db = DBSCAN(eps=0.1).fit(X)
db_labels = db.labels_
db_elapsed_time = time.time() - db_t1

# Number of clusters in labels, ignoring noise if present.
n_clusters_hdb_ = len(set(hdb_labels)) - (1 if -1 in hdb_labels else 0)

print('\n\n++ HDBSCAN Results')
print('Estimated number of clusters: %d' % n_clusters_hdb_)
print('Elapsed time to cluster: %.4f s' % hdb_elapsed_time)
print('Homogeneity: %0.3f' % metrics.homogeneity_score(labels_true, hdb_labels))
print('Completeness: %0.3f' % metrics.completeness_score(labels_true, hdb_labels))
print('V-measure: %0.3f' % metrics.v_measure_score(labels_true, hdb_labels))
print('Adjusted Rand Index: %0.3f'
      % metrics.adjusted_rand_score(labels_true, hdb_labels))
print('Adjusted Mutual Information: %0.3f'
      % metrics.adjusted_mutual_info_score(labels_true, hdb_labels))
print('Silhouette Coefficient: %0.3f'
      % metrics.silhouette_score(X, hdb_labels))

n_clusters_db_ = len(set(db_labels)) - (1 if -1 in db_labels else 0)

print('\n\n++ DBSCAN Results')
print('Estimated number of clusters: %d' % n_clusters_db_)
print('Elapsed time to cluster: %.4f s' % db_elapsed_time)
print('Homogeneity: %0.3f' % metrics.homogeneity_score(labels_true, db_labels))
print('Completeness: %0.3f' % metrics.completeness_score(labels_true, db_labels))
    print("Explained variance of the SVD step: {}%".format(
        int(explained_variance * 100)))

    print()

###############################################################################
# K-Means clustering
km = KMeans(n_clusters=true_k, init='k-means++', n_init=20)

print("Clustering with %s" % km)
km.fit(X)
print()

table.append([
    'k-means',
    metrics.homogeneity_score(labels, km.labels_),
    metrics.completeness_score(labels, km.labels_),
    metrics.v_measure_score(labels, km.labels_),
    metrics.adjusted_rand_score(labels, km.labels_),
    metrics.adjusted_mutual_info_score(labels, km.labels_),
    metrics.silhouette_score(X, km.labels_, metric='cosine')
])

print("Homogeneity: %0.3f" % metrics.homogeneity_score(labels, km.labels_))
print("Completeness: %0.3f" % metrics.completeness_score(labels, km.labels_))
print("V-measure: %0.3f" % metrics.v_measure_score(labels, km.labels_))
print("Adjusted Rand-Index: %.3f" %
      metrics.adjusted_rand_score(labels, km.labels_))
print("Adjusted Mututal Information: %.3f" %
      metrics.adjusted_mutual_info_score(labels, km.labels_))
print("Silhouette Coefficient (euclidean): %0.3f" %
      metrics.silhouette_score(X, km.labels_, metric='euclidean'))
Example 35
# # X_all=np.hstack((X_all,X_add))
# 
# X_add=fft_shape_analysis(n1,n2,n_fft)
# X_all=np.hstack((X_all,X_add))

# X_add=lbp_analysis(n1,n2,1e3)
# X_all=np.hstack((X_all,X_add))

classes=clustering_kmeans(X_all,n_clusters_)


print("**************Clustering_results*********************")
print('adjusted_rand_score=%0.2f' % metrics.adjusted_rand_score(classes_true,classes))
# print('normalized_mutual_info_score=%0.2f' % metrics.normalized_mutual_info_score(classes_true,classes))
print('homogeneity_score=%0.2f' % metrics.homogeneity_score(classes_true,classes))
print('completeness_score=%0.2f' % metrics.completeness_score(classes_true,classes))
print('v_measure_score=%0.2f' % metrics.v_measure_score(classes_true,classes))
print('fowlkes_mallows_score=%0.2f' % metrics.fowlkes_mallows_score(classes_true,classes))
print("\n")

pca = PCA(n_components=2,svd_solver='auto')
X_pca = pca.fit_transform(X_all)
classes_pca = clustering_kmeans(X_pca, n_clusters_)


print("**************Clustering_results_after_pca*********************")
print('adjusted_rand_score=%0.2f' % metrics.adjusted_rand_score(classes_true,classes_pca))
# print('normalized_mutual_info_score=%0.2f' % metrics.normalized_mutual_info_score(classes_true,classes_pca))
print('homogeneity_score=%0.2f' % metrics.homogeneity_score(classes_true,classes_pca))
print('completeness_score=%0.2f' % metrics.completeness_score(classes_true,classes_pca))
print('v_measure_score=%0.2f' % metrics.v_measure_score(classes_true,classes_pca))
print('fowlkes_mallows_score=%0.2f' % metrics.fowlkes_mallows_score(classes_true,classes_pca))
Example 36
# step 3 - create an instance of sIB and run the actual clustering
# n_init = the number of random initializations to perform
# max_iter = the maximal number of iterations in each initialization
# n_jobs = the maximal number of initializations to run in parallel
clustering_start_t = time()
n_init = 1 if speed_test_mode else 4
sib = SIB(n_clusters=n_clusters, random_state=128, n_init=n_init,
          n_jobs=-1, max_iter=15, verbose=True)
sib.fit(vectors)
clustering_end_t = time()

print("Clustering time: %.3f secs." % (clustering_end_t - clustering_start_t))

# step 4 - some evaluation
homogeneity = metrics.homogeneity_score(gold_labels, sib.labels_)
completeness = metrics.completeness_score(gold_labels, sib.labels_)
v_measure = metrics.v_measure_score(gold_labels, sib.labels_)
ami = metrics.adjusted_mutual_info_score(gold_labels, sib.labels_)
ari = metrics.adjusted_rand_score(gold_labels, sib.labels_)
print("Homogeneity: %0.3f" % homogeneity)
print("Completeness: %0.3f" % completeness)
print("V-measure: %0.3f" % v_measure)
print("Adjusted Mutual-Information: %.3f" % ami)
print("Adjusted Rand-Index: %.3f" % ari)

# save a heatmap
clustering_utils.create_heatmap(gold_labels, sib.labels_,
                                topics, 'sIB clustering heatmap',
                                os.path.join(output_path, 'sib_heatmap'))

# save a report
Example 37
    fmt='%.6f',
    newline='\n')

fastcluster.linkage(X_test, method='ward', metric='euclidean')

km = KMeans(n_clusters=20,
            init='k-means++',
            max_iter=100,
            n_init=5,
            verbose=True,
            random_state=10)
t0 = time()
km.fit(X_test)
print("done in %0.3fs" % (time() - t0))

#y_test = [int(i) for i in labels]
#pred_test = [int(i) for i in km.labels_]

print("Homogeneity: %0.3f" %
      metrics.homogeneity_score(labels_test, km.labels_))
print("Completeness: %0.3f" %
      metrics.completeness_score(labels_test, km.labels_))
print("V-measure: %0.3f" % metrics.v_measure_score(labels_test, km.labels_))

#spec = SpectralClustering(n_clusters=20, eigen_solver='arpack', random_state=0, n_init=10, gamma=1.0, affinity='rbf', n_neighbors=10, eigen_tol=0.0, assign_labels='kmeans', degree=3, coef0=1, n_jobs=1)
#spec_labels = spec.fit_predict(X_test)

#print("Homogeneity: %0.3f" % metrics.homogeneity_score(labels_test, spec_labels))
#print("Completeness: %0.3f" % metrics.completeness_score(labels_test, spec_labels))
#print("V-measure: %0.3f" % metrics.v_measure_score(labels_test, spec_labels))
# In[ ]:

data = []
for algo in algorithms:
    algo.fit(X)
    data.append(({
        'ARI':
        metrics.adjusted_rand_score(y, algo.labels_),
        'AMI':
        metrics.adjusted_mutual_info_score(y,
                                           algo.labels_,
                                           average_method='arithmetic'),
        'Homogeneity':
        metrics.homogeneity_score(y, algo.labels_),
        'Completeness':
        metrics.completeness_score(y, algo.labels_),
        'V-measure':
        metrics.v_measure_score(y, algo.labels_),
        'Silhouette':
        metrics.silhouette_score(X, algo.labels_)
    }))

results = pd.DataFrame(
    data=data,
    columns=[
        'ARI', 'AMI', 'Homogeneity', 'Completeness', 'V-measure', 'Silhouette'
    ],
    index=['K-means', 'Affinity', 'Spectral', 'Agglomerative'])
results

# ### Experiment summary
Example 39
                            random_state=0)

X = StandardScaler().fit_transform(X)

# Compute DBSCAN
db = DBSCAN(eps=0.3, min_samples=10).fit(X)
core_samples_mask = np.zeros_like(db.labels_, dtype=bool)
core_samples_mask[db.core_sample_indices_] = True
labels = db.labels_

# Number of clusters in labels, ignoring noise if present.
n_clusters_ = len(set(labels)) - (1 if -1 in labels else 0)

print('Estimated number of clusters: %d' % n_clusters_)
print("Homogeneity: %0.3f" % metrics.homogeneity_score(labels_true, labels))
print("Completeness: %0.3f" % metrics.completeness_score(labels_true, labels))
print("V-measure: %0.3f" % metrics.v_measure_score(labels_true, labels))
print("Adjusted Rand Index: %0.3f" %
      metrics.adjusted_rand_score(labels_true, labels))
print("Adjusted Mutual Information: %0.3f" %
      metrics.adjusted_mutual_info_score(labels_true, labels))
print("Silhouette Coefficient: %0.3f" % metrics.silhouette_score(X, labels))

# Plot result
import matplotlib.pyplot as plt

# Black removed and is used for noise instead.
unique_labels = set(labels)
colors = [
    plt.cm.Spectral(each) for each in np.linspace(0, 1, len(unique_labels))
]
                break

        oracle2.append(NUTS2_solution)
        oracle3.append(NUTS3_solution)

    # if args.retrofit:
    #     retro_pred.append(retro_cluster_ids[c])

    gold.append(solution)
    pred.append(cluster_ids[c])
    for i in range(KMEANS_AVG):
        dumb_pred[i][c] = dumb_cluster_ids[i][c]

dumb_pred_v = np.array([v_measure_score(gold, dumb_pred[i, :]) for i in range(KMEANS_AVG)]).mean()
dumb_pred_h = np.array([homogeneity_score(gold, dumb_pred[i, :]) for i in range(KMEANS_AVG)]).mean()
dumb_pred_c = np.array([completeness_score(gold, dumb_pred[i, :]) for i in range(KMEANS_AVG)]).mean()

# if args.retrofit:
#     retro_pred_v = v_measure_score(gold, retro_pred)
#     retro_pred_h = homogeneity_score(gold, retro_pred)
#     retro_pred_c = completeness_score(gold, retro_pred)
#     print(retro_pred_v, retro_pred_h, retro_pred_c)

# print('clusters\tV-measure\thomogeneity\tcompleteness')
print('%s\t&\t%.2f\t&\t%.2f\t&\t%.2f\t&\t%.2f\t&\t%.2f\t&\t%.2f' % (args.clusters, v_measure_score(gold, pred), homogeneity_score(gold, pred), completeness_score(gold, pred), dumb_pred_v, dumb_pred_h, dumb_pred_c))
# print('%.2f\t&\t%.2f\t&\t%.2f\t&\t%.2f\t&\t%.2f\t&\t%.2f' % (v_measure_score(gold, oracle2), homogeneity_score(gold, oracle2), completeness_score(gold, oracle2), v_measure_score(gold, oracle3), homogeneity_score(gold, oracle3), completeness_score(gold, oracle3)))


# m.readshapefile('/Users/dirkhovy/Dropbox/working/lowlands/sociolinguistics/playground/Lameli maps/Lameli', 'de', drawbounds=True)
#
# x, y, z = zip(*[(locations[city][0][1], locations[city][0][0], city_density_scaled[city]) for city in eligible_cities if locations[city][-2] == "DE"])
Example 41
def wordVec():
    dataset = fetch_20newsgroups(subset='all',
                                 categories=categories,
                                 shuffle=True,
                                 random_state=42)

    print("%d documents" % len(dataset.data))
    print("%d categories" % len(dataset.target_names))
    print()

    labels = dataset.target
    true_k = len(dataset.target_names)

    print("Extracting features from the training dataset "
          "using a sparse vectorizer")
    t0 = time()

    if opts.use_hashing:
        if opts.use_idf:
            hasher = HashingVectorizer(n_features=opts.n_features,
                                       stop_words='english',
                                       alternate_sign=False,
                                       norm=None)
            vectorizer = make_pipeline(hasher, TfidfTransformer())
        else:
            vectorizer = HashingVectorizer(n_features=opts.n_features,
                                           stop_words='english',
                                           alternate_sign=False,
                                           norm='l2')
    else:
        vectorizer = TfidfVectorizer(max_df=0.5,
                                     max_features=opts.n_features,
                                     min_df=2,
                                     stop_words='english',
                                     use_idf=opts.use_idf)
    X = vectorizer.fit_transform(dataset.data)

    print("done in %fs" % (time.time() - t0))
    print("n_samples: %d, n_features: %d" % X.shape)
    print()

    if opts.n_components:
        print("Performing dimensionality reduction using LSA")
        t0 = time()
        svd = TruncatedSVD(opts.n_components)
        normalizer = Normalizer(copy=False)
        lsa = make_pipeline(svd, normalizer)

        X = lsa.fit_transform(X)

        print("done in %fs" % (time() - t0))

        explained_variance = svd.explained_variance_ratio_.sum()
        print("Explained variance of the SVD step: {}%".format(
            int(explained_variance * 100)))

        print()

    if opts.minibatch:
        km = MiniBatchKMeans(n_clusters=true_k,
                             init='k-means++',
                             n_init=1,
                             init_size=1000,
                             batch_size=1000,
                             verbose=opts.verbose)
    else:
        km = KMeans(n_clusters=true_k,
                    init='k-means++',
                    max_iter=100,
                    n_init=1,
                    verbose=opts.verbose)

    print("Clustering sparse data with %s" % km)
    t0 = time()
    km.fit(X)
    print("done in %0.3fs" % (time() - t0))
    print()

    print("Homogeneity: %0.3f" % metrics.homogeneity_score(labels, km.labels_))
    print("Completeness: %0.3f" %
          metrics.completeness_score(labels, km.labels_))
    print("V-measure: %0.3f" % metrics.v_measure_score(labels, km.labels_))
    print("Adjusted Rand-Index: %.3f" %
          metrics.adjusted_rand_score(labels, km.labels_))
    print("Silhouette Coefficient: %0.3f" %
          metrics.silhouette_score(X, km.labels_, sample_size=1000))

    print()

    if not opts.use_hashing:
        print("Top terms per cluster:")

        if opts.n_components:
            original_space_centroids = svd.inverse_transform(
                km.cluster_centers_)
            order_centroids = original_space_centroids.argsort()[:, ::-1]
        else:
            order_centroids = km.cluster_centers_.argsort()[:, ::-1]

        terms = vectorizer.get_feature_names()
        for i in range(true_k):
            print("Cluster %d:" % i, end='')
            for ind in order_centroids[i, :10]:
                print(' %s' % terms[ind], end='')
            print()
Example 42
def only6_NMF_NLT():

	english_stemmer = Stemmer.Stemmer('en')
	class StemmedTfidfVectorizer(TfidfVectorizer):

		def build_analyzer(self):
			analyzer = super(TfidfVectorizer, self).build_analyzer()
			return lambda doc: english_stemmer.stemWords(analyzer(doc))

	print("Loading 20 newsgroups dataset for all categories...")

	newsgroups = fetch_20newsgroups(subset='all')
	
	print("%d documents" % len(newsgroups.data))
	print("%d categories" % len(newsgroups.target_names))

	print("Creating stemmed TFxIDF representation...")
	t0 = time()

	vect = StemmedTfidfVectorizer(stop_words='english')
	vectors = vect.fit_transform(newsgroups.data) # TFxIDF representation

	print("Done in %fs" % (time() - t0))
	print("n_samples: %d, n_features: %d" % vectors.shape)

	purityMetricsNames = ['Homogeneity', 'Completeness', 'V-measure', 'Adjusted Rand-Index', 'Adjusted Mutual Information Score']

	# Reducing the dimensionality with NMF NLT

	nmf_nlt_dim_bank = range(1,21)

	workbook = xlsxwriter.Workbook('part6_pt2_NMF_NLT.xlsx')

	for dims in nmf_nlt_dim_bank:

		print("Implementing NMF of dimension %d on data..." % dims)
		nmf_ = NMF(n_components=dims) # alpha value? l1 value?
		nmf_data = nmf_.fit_transform(vectors)
		print("Done.")

		print("Implementing non-linear transform on data...")
		offset = 0.001
		nmf_data_off=np.add(nmf_data,offset)
		log_nmf_data=np.log(nmf_data_off)
		print("Done.")

		k = 6
		km = KMeans(n_clusters=k, init='k-means++', max_iter=100, n_init=1)

		print("Clustering sparse data with %s" % km)
		t0 = time()
		km.fit(log_nmf_data)
		print("done in %0.3fs" % (time() - t0))

		print_results(newsgroups.target,km.labels_)
		purityMetrics = [metrics.homogeneity_score(newsgroups.target, km.labels_), metrics.completeness_score(newsgroups.target, km.labels_),metrics.v_measure_score(newsgroups.target, km.labels_),metrics.adjusted_rand_score(newsgroups.target, km.labels_),metrics.adjusted_mutual_info_score(newsgroups.target, km.labels_)]

		# Writing to .xlsx file (For Stats)
		worksheet = workbook.add_worksheet()

		row = 0
		col = 0

		worksheet.write(row,col,'Dimension')
		worksheet.write(row,col+1,dims)

		metric_list = dict(zip(purityMetricsNames,purityMetrics))
		pprint(dict(metric_list))

		for key in metric_list.keys():
			row += 1
			worksheet.write(row,col+11,key)
			worksheet.write(row,col+12,metric_list[key])
Example 43

# In[5]:


clustering = AgglomerativeClustering(n_clusters = 3).fit(data)

num_clusters = clustering.n_clusters_
clusterDest = clustering.labels_
takePlot(clusterDest, data, num_clusters)


# In[6]:


print("Completeness: %0.3f" % metrics.completeness_score(labels_true, clusterDest))
print("Homogeneity: %0.3f" % metrics.homogeneity_score(labels_true, clusterDest))
print("Adjusted Rand index: %0.3f" % metrics.adjusted_rand_score(labels_true, clusterDest))
print("Adjusted Mutual information: %0.3f" % metrics.adjusted_mutual_info_score(labels_true, clusterDest))


# In[7]:


print("Very small distance between groups")
samples = 1000
density = 0.4
centers = [[0, 1], [-1, -1], [1, -1]]
n_clusters = len(centers)
data, labels_true = make_blobs(n_samples=samples, centers=centers, cluster_std=density)
plt.scatter(data[:,0],data[:,1], c=labels_true)
Example 44
Cancer_EM_train_acc = []
Cancer_EM_cv_acc = []

for i in n_components:
    print(i)
    EM.set_params(random_state=7641, n_components=i)
    EM.fit(Cancer_X)
    Cancer_EM_score.append(EM.score(Cancer_X_train))
    Cancer_EM_bic.append(EM.bic(Cancer_X_train))
    Cancer_EM_aic.append(EM.aic(Cancer_X_train))
    Cancer_EM_log.append(
        silhouette_score(Cancer_X_train, EM.predict(Cancer_X_train)))
    Cancer_EM_homogeneity_score.append(
        homogeneity_score(Cancer_y_train, EM.predict(Cancer_X_train)))
    Cancer_EM_complete_score.append(
        completeness_score(Cancer_y_train, EM.predict(Cancer_X_train)))
    Cancer_scores = cross_validate(EM,
                                   Cancer_X_train,
                                   Cancer_y_train,
                                   cv=5,
                                   scoring=make_scorer(my_custom_acc,
                                                       greater_is_better=True),
                                   n_jobs=-1,
                                   return_train_score=True)
    Cancer_EM_train_acc.append(np.mean(Cancer_scores['train_score']))
    Cancer_EM_cv_acc.append(np.mean(Cancer_scores['test_score']))

PlotEm(6, n_components, Cancer_EM_aic, 'AIC', 'Cancer')
PlotEm(7, n_components, Cancer_EM_bic, 'BIC', 'Cancer')
PlotEm(8, n_components, Cancer_EM_score, 'SSE', 'Cancer')
PlotEm(9, n_components, Cancer_EM_log, 'Log-Likelihood', 'Cancer')
Example no. 45
#km = MiniBatchKMeans(n_clusters=20, init='k-means++', n_init=1, init_size=1000, batch_size=1000, verbose=opts.verbose)
km = KMeans(n_clusters=20,
            init='k-means++',
            max_iter=100,
            n_init=1,
            verbose=True)
t0 = time()
km.fit(rdata)
print("done in %0.3fs" % (time() - t0))

#y_test = [int(i) for i in labels]
#pred_test = [int(i) for i in km.labels_]

print("Homogeneity: %0.3f" % metrics.homogeneity_score(labels, km.labels_))
print("Completeness: %0.3f" % metrics.completeness_score(labels, km.labels_))
print("V-measure: %0.3f" % metrics.v_measure_score(labels, km.labels_))

##############cluster large data########
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans, MiniBatchKMeans
from sklearn import metrics
import numpy as np
from time import time

file = '/users/grad/rakib/dr.norbert/dataset/shorttext/agnews/agnews-w2vec-glove-vector-127600'
data = np.loadtxt(file, dtype='float', delimiter=' ')

data1 = np.delete(data, [0], axis=1)
labels = data[:, 0]

kmeans_model = KMeans(n_clusters=3, max_iter=10000).fit(data)
kmeans_model.labels_
centroids = kmeans_model.cluster_centers_
centroids

fig, ax = plt.subplots(figsize=(12, 8))
plt.scatter(centroids[:, 0], centroids[:, 1], c='r', s=250, marker='s')
for i in range(len(centroids)):
    plt.annotate(i, (centroids[i][0] + 7, centroids[i][1] + 7), fontsize=20)

print("Homegenity score : ",
      metrics.homogeneity_score(labels, kmeans_model.labels_))
print("Completeness score : ",
      metrics.completeness_score(labels, kmeans_model.labels_))
print("V_measure_score : ",
      metrics.v_measure_score(labels, kmeans_model.labels_))
print("Adjusted rand score : ",
      metrics.adjusted_rand_score(labels, kmeans_model.labels_))
print("Adjusted_mutual_info_score : ",
      metrics.adjusted_mutual_info_score(labels, kmeans_model.labels_))
print("Silhouette score : ",
      metrics.silhouette_score(data, kmeans_model.labels_))

colors = ['green', 'blue', 'purple']
plt.figure(figsize=(12, 8))
plt.scatter(data[:, 0],
            data[:, 1],
            c=df['labels'],
            s=200,
Example no. 47
def Output_result(labels_true,labels_pred,name):
    print('%-30s\t%.2fs\t%.3f\t%.3f\t%.3f' % (
        name, (time() - t0),
        metrics.homogeneity_score(labels_true, labels_pred),
        metrics.completeness_score(labels_true, labels_pred),
        metrics.normalized_mutual_info_score(labels_true, labels_pred, average_method='arithmetic')))
Example no. 48
    def show_metrics(self, truth, k_labels):
        print("Homogeneity: %0.3f" % metrics.homogeneity_score(truth, k_labels))
        print("Completeness: %0.3f" % metrics.completeness_score(truth, k_labels))
        print("V-measure: %0.3f" % metrics.v_measure_score(truth, k_labels))
        print("Adjusted Rand Index: %0.3f" % metrics.adjusted_rand_score(truth, k_labels))
        print("Adjusted Mutual Information: %0.3f" % metrics.adjusted_mutual_info_score(truth, k_labels))
bench_k_means(MeanShift(), name='MeanShift', data=data)
bench_k_means(SpectralClustering(n_clusters=n_digits, n_init=10),
              name="SpectralClustering",
              data=data)
bench_k_means(AgglomerativeClustering(n_clusters=n_digits),
              name="AgglomerativeClustering",
              data=data)
bench_k_means(DBSCAN(), name="DBSCAN", data=data)

t0 = time()
gm = GaussianMixture(n_components=n_digits)  # one mixture component per digit class
gm.fit(data)
gm_pred = gm.predict(data)
print('%-9s\t%.2fs\t%.3f\t%.3f\t%.3f' %
      ('GaussianMixture',
       (time() - t0),
       metrics.homogeneity_score(labels, gm_pred),
       metrics.completeness_score(labels, gm_pred),
       metrics.normalized_mutual_info_score(labels, gm_pred)))
# in this case the seeding of the centers is deterministic, hence we run the
# kmeans algorithm only once with n_init=1

pca = PCA(n_components=n_digits).fit(data)
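# The deterministic, PCA-based seeding referred to in the comment above would look
# roughly like the sketch below (an assumption modelled on the standard scikit-learn
# digits benchmark, not code present in this script): the fitted components are passed
# as explicit initial centers, so a single initialization suffices.
bench_k_means(KMeans(init=pca.components_, n_clusters=n_digits, n_init=1),
              name="PCA-based",
              data=data)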

print(82 * '_')

# #############################################################################
# Visualize the results on PCA-reduced data

reduced_data = PCA(n_components=2).fit_transform(data)

kmeans = KMeans(init='k-means++', n_clusters=n_digits, n_init=10)
Example no. 50
#Basic steps of the k-means algorithm:
#(1) Choose k objects arbitrarily from the n data objects as the initial cluster centers (the data will ultimately be grouped into k clusters).
#(2) For each object, compute its distance to these center objects (the mean of each cluster) and reassign it to the nearest center.
#(3) Recompute the mean (center object) of every cluster that changed.
#(4) Evaluate the criterion function; if a convergence condition is met, stop, otherwise go back to step (2).
############################
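# A minimal NumPy sketch of the four steps above (illustrative only; the names
# simple_kmeans, X, k and n_iter are hypothetical and are not used by the sklearn
# code below):
import numpy as np

def simple_kmeans(X, k, n_iter=100, seed=0):
    rng = np.random.RandomState(seed)
    centers = X[rng.choice(len(X), size=k, replace=False)]              # step (1)
    for _ in range(n_iter):
        # distance of every point to every center, shape (n_samples, k)
        dists = np.linalg.norm(X[:, None, :] - centers[None, :, :], axis=2)
        assign = dists.argmin(axis=1)                                   # step (2)
        new_centers = np.array([X[assign == j].mean(axis=0) if np.any(assign == j)
                                else centers[j] for j in range(k)])     # step (3)
        if np.allclose(new_centers, centers):                           # step (4): converged
            break
        centers = new_centers
    return assign, centers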


from sklearn.cluster import KMeans
kms = KMeans(n_clusters=3) # initialization: we know a priori there are 3 plant species, so we cluster into 3 groups
#kmeans = KMeans(k=3, init='random') # would fail: the keyword is n_clusters, not k
kms.fit(data) # actual execution
c = kms.predict(data)

from sklearn.metrics import completeness_score, homogeneity_score
print completeness_score(t,c)
#output:0.764986151449
print homogeneity_score(t,c)
#output:0.751485402199

#Note: t only needs to contain 3 distinct class values; they do not have to be 1, 2, 3.
#The completeness score tends to 1 when most data points of a given class end up in the same cluster.
#The homogeneity score tends to 1 when every cluster contains data points of (almost) only a single class.
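# A tiny illustration of the two comments above (toy_true and toy_pred are
# hypothetical toy labels, not part of the original script): splitting one true
# class across two clusters keeps homogeneity at 1.0 but lowers completeness.
toy_true = [0, 0, 0, 1, 1, 1]
toy_pred = [0, 0, 1, 2, 2, 2]   # class 0 is split over clusters 0 and 1
print homogeneity_score(toy_true, toy_pred)   # 1.0: every cluster is single-class
print completeness_score(toy_true, toy_pred)  # < 1.0: class 0 spans two clusters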
from pylab import figure, subplot, plot
figure()
subplot(211) # top figure with the real classes
plot(data[t==1,0],data[t==1,2],'bo')
plot(data[t==2,0],data[t==2,2],'ro')
plot(data[t==3,0],data[t==3,2],'go')
subplot(212) # bottom figure with classes assigned automatically
plot(data[c==1,0],data[c==1,2],'bo',alpha=.5)
plot(data[c==2,0],data[c==2,2],'go',alpha=.5)
Example no. 51
km_sse = []
km_silhouette = []
km_vmeasure = []
km_ami = []
km_homogeneity = []
km_completeness = []

cluster_range = (2,11)

for i in range(cluster_range[0],cluster_range[1]):
    km = KMeans(n_clusters=i, random_state=0).fit(X_scaled)
    preds = km.predict(X_scaled)
    km_sse.append(-km.score(X_scaled))
    km_silhouette.append(silhouette_score(X_scaled,preds))
    km_vmeasure.append(v_measure_score(y,preds))
    km_ami.append(adjusted_mutual_info_score(y,preds))
    km_homogeneity.append(homogeneity_score(y,preds))
    km_completeness.append(completeness_score(y,preds))
    print(f"Done for cluster {i}")


# ### Plotting various cluster evaluation metrics as function of number of clusters

# In[33]:


plt.figure(figsize=(21,10))

#SSE
plt.subplot(2,3,1)
plt.plot([i for i in range(cluster_range[0],cluster_range[1])],km_sse,'b-o',linewidth=3,markersize=12)
plt.grid(True)
plt.title("SSE score vs. number of clusters",fontsize=15)
Example no. 52

homo = []
comp = []
v_mea = []
sil = []
man = []
numPoints = 8
for i in range(2, numPoints):
    ipca = decomposition.FastICA(n_components=6, whiten=True)
    X_new = ipca.fit_transform(X, y)
    gm = mixture.GMM(n_components=i, covariance_type='diag')
    gm.fit(X_new)
    y_pred = gm.predict(X_new)
    homo.append(metrics.homogeneity_score(y, y_pred))
    comp.append(metrics.completeness_score(y, y_pred))
    v_mea.append(metrics.v_measure_score(y, y_pred))
    sil.append(
        metrics.silhouette_score(projected_data, y_pred, metric='euclidean'))
    man.append(
        metrics.silhouette_score(projected_data, y_pred, metric='manhattan'))

x = xrange(2, numPoints)
fig = plt.figure()
plt.plot(x, homo, label='homogeneity score')
plt.plot(x, comp, label='completeness score')
plt.plot(x, v_mea, label='v measure score')
plt.plot(x, sil, label='Silhouette Score euclidean')
plt.plot(x, man, label='Silhouette Score manhattan')
plt.legend(loc='upper right', shadow=True)
plt.show()
    classifier = KMedoids(dataset_norm, list_label, 2)
    classifier.traindata(200)
    list_clustered = classifier.classes
    print(
        "---------------- K-MEDOIDS SCORE USING DATA TRAIN ------------------------"
    )
    print("ARI SCORE: " + str(
        adjusted_rand_score(np.array(list_label), np.array(list_clustered))))
    print("MUTUAL INFO SCORE: " + str(
        adjusted_mutual_info_score(np.array(list_label),
                                   np.array(list_clustered))))
    print(
        "HOMOGENEITY SCORE: " +
        str(homogeneity_score(np.array(list_label), np.array(list_clustered))))
    print("COMPLETENESS SCORE: " + str(
        completeness_score(np.array(list_label), np.array(list_clustered))))
    print("V MEASURE SCORE: " +
          str(v_measure_score(np.array(list_label), np.array(list_clustered))))
    print("FOWLKES-MALLOWS SCORE: " + str(
        fowlkes_mallows_score(np.array(list_label), np.array(list_clustered))))
    # print("SILHOUETTE SCORE: " + str(silhouette_score(np.array(dataset_norm), np.array(list_label), metric="euclidean")))
    print("CALINSKI-HARABAZ SCORE: " + str(
        calinski_harabaz_score(np.array(dataset_norm), np.array(list_label))))

    datatest = utils.create_list_dataset("CencusIncome.test.txt")
    datatest_norm = utils.normalize_attr(datatest)
    list_label_test = utils.create_list_label("CencusIncome.test.txt")
    list_clustered_test = []
    for instance in datatest_norm:
        list_clustered_test.append(classifier.predict(instance))
    print(
Example no. 54
        clusters = [x.strip() for x in f.readlines()]
        for i, cluster in enumerate(clusters):
            for image in cluster.split(','):
                true_labels += [
                    s0 for (s0, s1) in cameras if image.startswith(s1)
                ]
                pred_labels.append(i)

    # Table as in reference papers
    x = PrettyTable()
    x.field_names = ['Model'] + list(range(1, len(clusters) + 1))
    for _, camera in cameras:
        l = []
        for i, cluster in enumerate(clusters):
            l.append(0)
            for image in cluster.split(','):
                if image.startswith(camera):
                    l[i] += 1
        x.add_row([camera] + l)
    print(x)

    # Table with errors
    y = PrettyTable()
    y.field_names = [
        'Error', 'ARI', 'MIBS', 'Homogeneity', 'Completeness', 'V-Measure',
        'Fowlkes-Mallows'
    ]
    y.add_row(['',
               metrics.adjusted_rand_score(true_labels, pred_labels),
               metrics.mutual_info_score(true_labels, pred_labels),
               metrics.homogeneity_score(true_labels, pred_labels),
               metrics.completeness_score(true_labels, pred_labels),
               metrics.v_measure_score(true_labels, pred_labels),
               metrics.fowlkes_mallows_score(true_labels, pred_labels)])
    print(y)
Example no. 55
    def completeness(self, labels):
        return float(metrics.completeness_score(labels, self.model.labels_))
Example no. 56
from sklearn.metrics import classification_report
#print(classification_report(classifier.predict(test), t_test, target_names=['setosa', 'versicolor', 'virginica']))

from sklearn.model_selection import cross_val_score
#scores = cross_val_score(classifier, data, t, cv=6)
#print(scores)

#from numpy import mean
#print(mean(scores))

from sklearn.cluster import KMeans

kmeans = KMeans(n_clusters=3, init='random')# initialization
kmeans.fit(data)# actual execution

c = kmeans.predict(data)

from sklearn.metrics import completeness_score, homogeneity_score
print(completeness_score(t, c))
print(homogeneity_score(t, c))

from pylab import subplot, plot, show, figure
figure()
subplot(211) # top figure with the real classes
plot(data[t==1,0],data[t==1,2],'bo')
plot(data[t==2,0],data[t==2,2],'ro')
plot(data[t==3,0],data[t==3,2],'go')
subplot(212) # bottom figure with classes assigned automatically
show()
            y_upper = y_lower + ct_values.shape[0]

            color = cm.Accent(float(t) / n)
            ax[mapping[i]].fill_betweenx(np.arange(y_lower, y_upper), 0, ct_values, facecolor=color, edgecolor=color)

            y_lower = y_upper + 20

    plt.show()

    # Compute the other metrics for K=2
    km = KMeans(n_clusters=2, max_iter=1000, random_state=1000)
    Y_pred = km.fit_predict(cdf)
    df_km = pd.DataFrame(Y_pred, columns=['prediction'], index=cdf.index)
    kmdff = pd.concat([dff, df_km], axis=1)

    print('Completeness: {}'.format(completeness_score(kmdff['diagnosis'], kmdff['prediction'])))
    print('Homogeneity: {}'.format(homogeneity_score(kmdff['diagnosis'], kmdff['prediction'])))
    print('Adj. Mutual info: {}'.format(adjusted_mutual_info_score(kmdff['diagnosis'], kmdff['prediction'])))
    print('Adj. Rand score: {}'.format(adjusted_rand_score(kmdff['diagnosis'], kmdff['prediction'])))

    # Perform a K-Means clustering with K=8
    km = KMeans(n_clusters=8, max_iter=1000, random_state=1000)
    Y_pred = km.fit_predict(cdf)

    df_km = pd.DataFrame(Y_pred, columns=['prediction'], index=cdf.index)
    kmdff = pd.concat([dff, df_km], axis=1)

    # Show the result
    fig, ax = plt.subplots(figsize=(18, 11))

    with sns.plotting_context("notebook", font_scale=1.5):
Example no. 58
def test_using_sklearn(label_true, label_true_test, dataset, datatest):
    X = numpy.array(dataset)
    kmeans = KMeans(n_clusters=2, random_state=0).fit(X)
    cluster_train = kmeans.labels_
    arr_test = numpy.array(datatest)
    cluster_test = kmeans.predict(arr_test)

    # Evaluation for Full Training
    print(
        "\n------------------------ SCIKIT LEARN --------------------------------"
    )
    print(
        "--------------- K-MEANS SCORE USING DATA TRAIN -----------------------"
    )
    print("ARI SCORE: " + str(
        adjusted_rand_score(numpy.array(label_true), numpy.array(
            cluster_train))))
    print("MUTUAL INFO SCORE: " + str(
        adjusted_mutual_info_score(numpy.array(label_true),
                                   numpy.array(cluster_train))))
    print("HOMOGENEITY SCORE: " + str(
        homogeneity_score(numpy.array(label_true), numpy.array(cluster_train)))
          )
    print("COMPLETENESS SCORE: " + str(
        completeness_score(numpy.array(label_true), numpy.array(
            cluster_train))))
    print("V MEASURE SCORE: " + str(
        v_measure_score(numpy.array(label_true), numpy.array(cluster_train))))
    print("FOWLKES-MALLOWS SCORE: " + str(
        fowlkes_mallows_score(numpy.array(label_true),
                              numpy.array(cluster_train))))
    # print("SILHOUETTE SCORE: " + str(silhouette_score(numpy.array(dataset), numpy.array(label_true), metric="euclidean")))
    print("CALINSKI-HARABAZ SCORE: " + str(
        calinski_harabaz_score(numpy.array(dataset), numpy.array(label_true))))

    # Evaluation for Split Validation
    print(
        "--------------- K-MEANS SCORE USING DATA TEST -----------------------"
    )
    print("ARI SCORE: " + str(
        adjusted_rand_score(numpy.array(label_true_test),
                            numpy.array(cluster_test))))
    print("MUTUAL INFO SCORE: " + str(
        adjusted_mutual_info_score(numpy.array(label_true_test),
                                   numpy.array(cluster_test))))
    print("HOMOGENEITY SCORE: " + str(
        homogeneity_score(numpy.array(label_true_test),
                          numpy.array(cluster_test))))
    print("COMPLETENESS SCORE: " + str(
        completeness_score(numpy.array(label_true_test),
                           numpy.array(cluster_test))))
    print("V MEASURE SCORE: " + str(
        v_measure_score(numpy.array(label_true_test), numpy.array(
            cluster_test))))
    print("FOWLKES-MALLOWS SCORE: " + str(
        fowlkes_mallows_score(numpy.array(label_true_test),
                              numpy.array(cluster_test))))
    # print("SILHOUETTE SCORE: " + str(silhouette_score(numpy.array(dataset), numpy.array(label_true_test), metric="euclidean")))
    print("CALINSKI-HARABAZ SCORE: " + str(
        calinski_harabaz_score(numpy.array(datatest),
                               numpy.array(label_true_test))))

    return None
"""
"""
Measuring the Performance of K-Means
Homogeneity and Completeness: if you have pre-existing class labels that you are trying to reproduce with k-means clustering, you can use two measures, homogeneity and completeness.

Homogeneity means each cluster contains only observations from a single class.
Completeness means all observations of a given class are assigned to the same cluster.
Scikit-Learn (Python) has an excellent write-up on these two measures.

"""
#These concepts are exposed as the scores homogeneity_score and completeness_score. Both are bounded below by 0.0 and above by 1.0 (higher is better):

from sklearn import metrics

labels_true = [0, 0, 0, 1, 1, 1]
labels_pred = [0, 0, 1, 1, 2, 2]

metrics.homogeneity_score(labels_true, labels_pred)

metrics.completeness_score(labels_true, labels_pred)

#Their harmonic mean, called V-measure, is computed by v_measure_score
metrics.v_measure_score(labels_true, labels_pred)

#All calculated together
metrics.homogeneity_completeness_v_measure(labels_true, labels_pred)
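# As a cross-check on the definitions above, both scores can be derived from conditional
# entropies: homogeneity = 1 - H(C|K)/H(C) and completeness = 1 - H(K|C)/H(K).
# A minimal sketch (entropy_based_scores is a hypothetical helper, assuming scipy is
# available; it is not part of the original notes):
import numpy as np
from scipy import stats

def entropy_based_scores(labels_true, labels_pred):
    labels_true = np.asarray(labels_true)
    labels_pred = np.asarray(labels_pred)
    # Contingency table between classes (rows) and clusters (columns)
    classes, c_idx = np.unique(labels_true, return_inverse=True)
    clusters, k_idx = np.unique(labels_pred, return_inverse=True)
    cont = np.zeros((len(classes), len(clusters)))
    np.add.at(cont, (c_idx, k_idx), 1)
    p = cont / cont.sum()                      # joint distribution P(C, K)
    h_c = stats.entropy(p.sum(axis=1))         # H(C)
    h_k = stats.entropy(p.sum(axis=0))         # H(K)
    h_joint = -np.sum(p[p > 0] * np.log(p[p > 0]))
    h_c_given_k = h_joint - h_k                # H(C|K)
    h_k_given_c = h_joint - h_c                # H(K|C)
    return 1 - h_c_given_k / h_c, 1 - h_k_given_c / h_k

print(entropy_based_scores(labels_true, labels_pred))  # matches the sklearn scores above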

#https://scikit-learn.org/stable/modules/clustering.html#clustering-performance-evaluation

#http://www.learnbymarketing.com/methods/k-means-clustering/
cluster_centers = {
    'X': kmeans_X.cluster_centers_,
    'X_scaled': kmeans_X_scaled.cluster_centers_,
    'total_data': kmeans_total_data.cluster_centers_,
    'total_data_scaled': kmeans_total_data_scaled.cluster_centers_,
}

for each in metrics_report.keys():
    metrics_report[each]['ARI'] = round(
        metrics.adjusted_rand_score(y, labels[each]), 2)
    metrics_report[each]['AMI'] = round(
        metrics.adjusted_mutual_info_score(y, labels[each]), 2)
    metrics_report[each]['homogeneity'] = round(
        metrics.homogeneity_score(y, labels[each]), 2)
    metrics_report[each]['completeness'] = round(
        metrics.completeness_score(y, labels[each]), 2)
    metrics_report[each]['v_measure'] = round(
        metrics.v_measure_score(y, labels[each]), 2)
    metrics_report[each]['silhouette'] = round(
        metrics.silhouette_score(X, labels[each]), 2)
    metrics_report[each]['accuracy'] = round(
        metrics.accuracy_score(y, labels[each]) * 100, 2)

print(metrics_report)

#visualizing - clustering of X_scaled dataset
plt.scatter(X_scaled[kmeans_X_scaled.labels_ == 1, 4],
            X_scaled[kmeans_X_scaled.labels_ == 1, 8],
            s=20,
            c='blue',
            label='Cluster 1')