Example #1
0
    def test_KMeans_scores(self):
        """Compare ModelFrame clustering metrics with plain scikit-learn.

        The digits data is scaled once through ``pp.scale`` and once through
        the ModelFrame accessor; identically configured KMeans estimators are
        then fitted on each side and every metric exposed by ``df.metrics``
        is checked against the matching function in ``m``.
        """
        digits = datasets.load_digits()
        df = pdml.ModelFrame(digits)

        scaled = pp.scale(digits.data)
        df.data = df.data.pp.scale()
        self.assert_numpy_array_almost_equal(df.data.values, scaled)

        # One shared configuration keeps both estimators in lock-step.
        kmeans_kwargs = dict(init='k-means++', n_clusters=10, n_init=10,
                             random_state=self.random_state)
        clf1 = cluster.KMeans(**kmeans_kwargs)
        clf2 = df.cluster.KMeans(**kmeans_kwargs)
        clf1.fit(scaled)
        df.fit_predict(clf2)

        # Label-only scores are compared for exact equality.  Each metric is
        # checked once (the homogeneity check used to be repeated verbatim).
        for score_name in ('homogeneity_score', 'completeness_score',
                           'v_measure_score', 'adjusted_rand_score'):
            expected = getattr(m, score_name)(digits.target, clf1.labels_)
            result = getattr(df.metrics, score_name)()
            self.assertEqual(result, expected, msg=score_name)

        # The silhouette score is sampled, so both sides get the same sample
        # size and seed and are compared approximately.
        expected = m.silhouette_score(scaled, clf1.labels_, metric='euclidean',
                                      sample_size=300, random_state=self.random_state)
        result = df.metrics.silhouette_score(metric='euclidean', sample_size=300,
                                             random_state=self.random_state)
        self.assertAlmostEqual(result, expected)
Example #2
0
def main():
    """Run the digits experiment by calling this module's helpers in order.

    Every helper used here is defined outside this block, so the step
    descriptions in the comments below are inferred from the helper names
    only -- TODO confirm against the helper implementations.
    """
    digits = datasets.load_digits()
    # Optional exploratory steps, currently disabled.
    #print_digit_data(digits)    # tok
    #plot_training_data(digits)  # tok
    #plot_target_data(digits)    # tok
    #show_PCA_training(digits)   # tok

    data = preprocess_data(digits)   # tok
    #print(data)   # tok

    X_train, X_test, y_train, y_test = split_data_into_training_and_test(data, digits)   # tok


    # First model: presumably a clustering estimator fitted on the training
    # split ("cluser_digits" is the helper's actual name).
    clf = cluser_digits(X_train)   # tok
    # show_cluster_digits(clf)  # TOK

    y_pred = predict_labels(clf, X_test, y_test, X_train, y_train)   # tok
    show_prediction_confusion_matrix(y_test, y_pred)   # tok

    homogeneity_score(clf, X_test, y_test, X_train, y_train, y_pred)   # tok

    ##########################################
    # try a different model
    # NOTE: this rebinds X_train/X_test/y_train/y_test to the split returned
    # by model_SVC, so every call below works on that new split.
    svc_model, X_train, X_test, y_train, y_test, images_train, images_test = model_SVC(digits)   # tok

    # grid_search - use this to tune parameters
    grid_search(digits)   # tok
    # NOTE(review): this passes the first model `clf`, not `svc_model` --
    # confirm that is intended.
    apply_grid_search(clf, X_test, y_test, X_train, y_train)
    predicted = classify_rbf(svc_model, X_test, y_test, images_test)
    check_model_performance(y_test, predicted)

    show_model2_results(svc_model, X_train, y_train)
def _score_rows(assigndf, score_fn, truth_of_row):
    """Score each row's canonicalized 'assign' against a per-row truth.

    Returns the Series obtained by applying
    ``score_fn(truth_of_row(row), canonicalize_assignment(row['assign']))``
    to every row of ``assigndf``.
    """
    return assigndf.apply(
        lambda row: score_fn(truth_of_row(row),
                             irm.util.canonicalize_assignment(row['assign'])),
        axis=1)


def aggregate_stats(infiles, outfile):
    """
    Combine all the aggstats into a single file

    Compute summary statistics

    Each input pickle must hold a dict with a 'df' DataFrame and a 'meta'
    dict containing a 'neurons' table with a 'role' column.  Files for which
    ``extract_metadata`` returns nothing are skipped.  The per-file frames,
    extended with the score columns computed below, are concatenated and
    pickled to ``outfile``.

    NOTE: unpickling executes arbitrary code; only use trusted input files.
    """

    res = []
    for infile in infiles:
        # Binary mode plus a context manager: pickles are byte streams and
        # the handle must be closed even if loading fails.
        with open(infile, 'rb') as fid:
            d = pickle.load(fid)
        print("The file is %s" % (infile,))
        assigndf = d['df']
        neurons = d['meta']['neurons']

        m = extract_metadata(infile)
        if len(m) == 0:
            # skip the non-replicated ones
            continue

        # Broadcast every metadata field onto all rows of this file's frame.
        for k, v in m.items():
            assigndf[k] = v

        # Snapshot of the roles taken *before* missing roles are filled in.
        assigndf['true_assign_role'] = [np.array(neurons['role'])
                                        for _ in range(len(assigndf))]

        # compute the statistics against the per-row true assignment
        def true_assign(row):
            return row['true_assign']

        assigndf['ari'] = _score_rows(
            assigndf, metrics.adjusted_rand_score, true_assign)
        assigndf['homogeneity'] = _score_rows(
            assigndf, metrics.homogeneity_score, true_assign)
        assigndf['completeness'] = _score_rows(
            assigndf, metrics.completeness_score, true_assign)

        # don't consider the ones where the role is "none" as these are
        # multi-role ones: they are all relabelled 'I' (in place).
        # `.loc` replaces the deprecated `.ix` indexer.
        neurons.loc[neurons['role'].isnull(), 'role'] = 'I'

        def role_truth(row):
            return neurons['role']

        assigndf['role_ari'] = _score_rows(
            assigndf, metrics.adjusted_rand_score, role_truth)
        assigndf['role_homogeneity'] = _score_rows(
            assigndf, metrics.homogeneity_score, role_truth)
        assigndf['role_completeness'] = _score_rows(
            assigndf, metrics.completeness_score, role_truth)

        assigndf['type_n_true'] = assigndf.apply(
            lambda x: len(np.unique(x['true_assign'])), axis=1)
        assigndf['type_n_learned'] = assigndf.apply(
            lambda x: len(np.unique(x['assign'])), axis=1)
        assigndf['auc'] = assigndf.apply(
            lambda x: metrics.roc_auc_score(x['heldout_link_truth'],
                                            x['heldout_link_predprob']),
            axis=1)

        res.append(assigndf)

    alldf = pandas.concat(res)
    # Protocol -1 is a binary protocol, so the file must be opened as 'wb'.
    with open(outfile, 'wb') as fid:
        pickle.dump(alldf, fid, -1)
def kmeans(input_file, n_clusters, Output):
    """Cluster a CSV data set with K-Means, report the scores and plot it.

    Args:
        input_file: comma-separated numeric file.  All columns except the
            last are read; column 0 is used as the reference label and the
            remaining columns as features.
        n_clusters: number of clusters passed to KMeans.
        Output: path prefix; "kmeans_scores.txt" and "kmeans.png" are
            appended to it to build the two output file names.

    The scores are printed to stdout and written to the score file together
    with one "true value, cluster, index" line per sample.  The plot shows
    the first two feature columns coloured by cluster.
    """
    lvltrace.lvltrace("LVLEntree dans kmeans unsupervised")
    ncol = tools.file_col_coma(input_file)
    data = np.loadtxt(input_file, delimiter=',', usecols=range(ncol-1))
    X = data[:, 1:]
    y = data[:, 0]
    sample_size = X.shape[0]
    k_means = cluster.KMeans(init='k-means++', n_clusters=n_clusters, n_init=10)
    k_means.fit(X)
    labels = k_means.labels_
    centers = k_means.cluster_centers_

    # Each score is computed once and reused for both the console and the
    # file report (the silhouette score in particular is expensive).
    scores = [
        ('homogeneity_score: %f', "Homogeneity Score: %f\n",
         metrics.homogeneity_score(y, labels)),
        ('completeness_score: %f', "Completeness Score: %f\n",
         metrics.completeness_score(y, labels)),
        ('v_measure_score: %f', "V-Measure: %f\n",
         metrics.v_measure_score(y, labels)),
        ('adjusted_rand_score: %f', "The adjusted Rand index: %f\n",
         metrics.adjusted_rand_score(y, labels)),
        ('adjusted_mutual_info_score: %f', "Adjusted Mutual Information: %f\n",
         metrics.adjusted_mutual_info_score(y, labels)),
        ('silhouette_score: %f', "Silhouette Score: %f\n",
         metrics.silhouette_score(X, labels, metric='euclidean',
                                  sample_size=sample_size)),
    ]

    banner = "#########################################################################################################\n"
    print(banner)
    print("K-MEANS\n")
    for console_fmt, _, value in scores:
        print(console_fmt % value)
    print('\n')
    print(banner)

    results = Output+"kmeans_scores.txt"
    # The context manager closes the report even if a write fails.
    with open(results, "w") as report:
        report.write("K-Means Scores\n")
        for _, file_fmt, value in scores:
            report.write(file_fmt % value)
        report.write("\n")
        report.write("True Value, Cluster numbers, Iteration\n")
        for n, (truth, found) in enumerate(zip(y, labels), 1):
            report.write("%f, %f, %i\n" % (truth, found, n))

    # plot the results along with the labels
    fig, ax = plt.subplots()
    im = ax.scatter(X[:, 0], X[:, 1], c=labels, marker='.')
    for cluster_center in centers:
        # The colour is given only through `color`; the former extra 'w'
        # format string contradicted it.
        ax.plot(cluster_center[0], cluster_center[1], color='b',
                marker='x', markersize=6)
    fig.colorbar(im)
    plt.title("Number of clusters: %i" % n_clusters)
    save = Output + "kmeans.png"
    plt.savefig(save)
    lvltrace.lvltrace("LVLsortie dans kmeans unsupervised")
Example #5
0
def clustering(dataset):
    """Cluster ``dataset.X`` with K-Means and print quality metrics.

    ``dataset`` must provide ``vectorizer``, ``X``, ``n_classes`` and
    ``target``.  One cluster per class is requested.  After fitting, the
    label-based scores, the silhouette coefficient of the clustering and the
    ten highest-weighted terms of every centroid are printed.
    """
    vectorizer = dataset.vectorizer
    X = dataset.X
    true_k = dataset.n_classes
    labels = dataset.target

    km = cluster.KMeans(n_clusters=true_k, max_iter=100, n_init=1)

    print("Clustering sparse data with %s" % km)
    t0 = time()
    km.fit(X)
    print("done in %0.3fs" % (time() - t0))
    print()

    print("Homogeneity: %0.3f" % metrics.homogeneity_score(labels, km.labels_))
    print("Completeness: %0.3f" % metrics.completeness_score(labels, km.labels_))
    print("V-measure: %0.3f" % metrics.v_measure_score(labels, km.labels_))
    print("Adjusted Rand-Index: %.3f"
          % metrics.adjusted_rand_score(labels, km.labels_))
    # The silhouette coefficient must describe the clustering that was just
    # produced, so it takes the predicted labels, not the ground truth.
    print("Silhouette Coefficient: %0.3f"
          % metrics.silhouette_score(X, km.labels_, sample_size=1000))
    print()

    print("Top terms per cluster:")
    # Feature indices of each centroid, ordered by decreasing weight.
    order_centroids = km.cluster_centers_.argsort()[:, ::-1]
    # Newer scikit-learn only offers get_feature_names_out(); older releases
    # only offer get_feature_names().
    if hasattr(vectorizer, 'get_feature_names_out'):
        terms = vectorizer.get_feature_names_out()
    else:
        terms = vectorizer.get_feature_names()
    # Number of samples assigned to each cluster.
    sizes = np.sum(km.labels_[:, np.newaxis] == np.arange(true_k), axis=0)
    for i in range(true_k):
        print("Cluster %d (%d):" % (i, sizes[i]), end='')
        for ind in order_centroids[i, :10]:
            print(' %s' % terms[ind], end='')
        print()
Example #6
0
def kmeans_setup(data):
    """Fit and return a KMeans estimator configured from module globals.

    Reads ``pca_f``, ``num_clusters``, ``num_init``, ``num_iterations`` and
    ``debug`` (plus ``labels`` when debugging) from the enclosing module.
    When ``pca_f`` equals 1 the PCA components of ``data`` seed the
    centroids; otherwise 'k-means++' initialisation is used.  With
    ``debug == True`` one line of timing and clustering scores is printed.
    """
    seed_with_pca = pca_f == 1
    if seed_with_pca:
        initializer = PCA(n_components=num_clusters).fit(data).components_
    else:
        initializer = 'k-means++'
    name = 'PCA' if seed_with_pca else 'k-means++'

    # The timer starts after the optional PCA step.
    t0 = time()

    estimator = KMeans(init=initializer, n_clusters=num_clusters,
                       n_init=num_init, max_iter=num_iterations)
    estimator.fit(data)

    if not (debug == True):
        return estimator

    sample_size = 300
    elapsed = time() - t0
    inertia = estimator.inertia_
    row = (
        name, elapsed, inertia,
        metrics.homogeneity_score(labels, estimator.labels_),
        metrics.completeness_score(labels, estimator.labels_),
        metrics.v_measure_score(labels, estimator.labels_),
        metrics.adjusted_rand_score(labels, estimator.labels_),
        metrics.adjusted_mutual_info_score(labels, estimator.labels_),
        metrics.silhouette_score(data, estimator.labels_,
                                 metric='euclidean',
                                 sample_size=sample_size),
    )
    print('% 9s   %.2fs    %i   %.3f   %.3f   %.3f   %.3f   %.3f    %.3f' % row)
    return estimator
Example #7
0
def affin_test():
    """Run Affinity Propagation on a slice of 'traindata.pkl' and print scores.

    The pickle must contain a 3-tuple whose first two items are the samples
    and their labels.  Only 10% of the data is kept (``test_size=0.9``); it
    is standardised, clustered with ``preference=-50`` and evaluated against
    the kept labels.

    NOTE: unpickling executes arbitrary code; only load trusted files.
    """
    # The context manager closes the file even when unpickling fails.
    with open('traindata.pkl', 'rb') as savefile:
        x_train, y_train, _ = cPickle.load(savefile)

    # The held-out 90% is discarded; only the small remaining part is used.
    x_train, _, y_train, _ = cross_validation.train_test_split(
        x_train, y_train, test_size=0.9, random_state=42)

    labels_true = y_train

    x_train = StandardScaler().fit_transform(x_train)
    af = AffinityPropagation(preference=-50).fit(x_train)
    cluster_centers_indices = af.cluster_centers_indices_
    labels = af.labels_

    n_clusters_ = len(cluster_centers_indices)

    print('Estimated number of clusters: %d' % n_clusters_)
    print("Homogeneity: %0.3f" % metrics.homogeneity_score(labels_true, labels))
    print("Completeness: %0.3f" % metrics.completeness_score(labels_true, labels))
    print("V-measure: %0.3f" % metrics.v_measure_score(labels_true, labels))
    print("Adjusted Rand Index: %0.3f"
          % metrics.adjusted_rand_score(labels_true, labels))
    print("Adjusted Mutual Information: %0.3f"
          % metrics.adjusted_mutual_info_score(labels_true, labels))
    print("Silhouette Coefficient: %0.3f"
          % metrics.silhouette_score(x_train, labels, metric='sqeuclidean'))
Example #8
0
    def run(self):
        """Fit KMeans for every k in ``self.clusters`` and collect scores.

        For each k the mean distance of the samples to their nearest
        centroid and the homogeneity, completeness and adjusted Rand scores
        against ``self.y`` are recorded.  When k equals ``self.targetcluster``
        and ``self.stats`` is set, the labelled data is dumped to
        "cluster.csv" and a short per-cluster summary is printed.  The
        collected curves are passed to ``self.plot`` if ``self.gen_plot``.
        """
        meandist = []
        homogeneity_scores = []
        completeness_scores = []
        rand_scores = []
        # NOTE(review): nothing is ever appended to this list, so the plot
        # always receives it empty -- confirm whether that is intended.
        silhouettes = []

        for k in self.clusters:
            model = KMeans(n_clusters=k, max_iter=5000, init='k-means++')
            labels = model.fit_predict(self.X)

            if k == self.targetcluster and self.stats:
                # Columns: features..., predicted cluster, reference label.
                label_col = np.expand_dims(labels, axis=1)
                truth_col = np.expand_dims(self.y, axis=1)
                nd_data = np.concatenate((self.X, label_col, truth_col), axis=1)
                pd_data = pd.DataFrame(nd_data)
                pd_data.to_csv("cluster.csv", index=False, index_label=False, header=False)
                print(model.cluster_centers_)

                for i in range(0, 3):
                    print("Cluster {}".format(i))
                    in_cluster = pd_data.iloc[:, -2] == i
                    members = pd_data.loc[in_cluster].iloc[:, -2:]
                    print(members.shape[0])
                    # Share of members whose reference label is 0, then 1.
                    for ref_label in (0, 1):
                        matching = members.loc[members.iloc[:, -1] == ref_label]
                        print(float(matching.shape[0]) / members.shape[0])

            nearest = np.min(cdist(self.X, model.cluster_centers_, 'euclidean'), axis=1)
            meandist.append(sum(nearest) / self.X.shape[0])
            for bucket, scorer in ((homogeneity_scores, metrics.homogeneity_score),
                                   (completeness_scores, metrics.completeness_score),
                                   (rand_scores, metrics.adjusted_rand_score)):
                bucket.append(scorer(self.y, labels))

        if self.gen_plot:
            self.plot(meandist, homogeneity_scores, completeness_scores, rand_scores, silhouettes)
def bench_k_means(estimator, name, data, target_labels, sample_size):
    """Fit a K-Means estimator and print one tab-separated line of metrics.

    ARGS
      estimator: K-Means clustering algorithm <sklearn.cluster.KMeans>
      name: estimator name <str>
      data: array-like or sparse matrix, shape=(n_samples, n_features)
      target_labels: labels of data points <number array>
      sample_size: size of the sample to use when computing the Silhouette Coefficient <int>

    The line holds: name, elapsed seconds, inertia, homogeneity,
    completeness, V-measure, adjusted Rand index, adjusted mutual
    information, silhouette coefficient and train accuracy.
    """
    t0 = time()
    estimator.fit(data)

    _first, _second, train_accuracy = compute_residuals_and_rsquared(
        estimator.labels_, target_labels)

    # The elapsed time is taken here, i.e. it covers the fit and the
    # accuracy computation but none of the metrics below.
    elapsed = time() - t0
    inertia = estimator.inertia_

    label_scorers = (
        metrics.homogeneity_score,
        metrics.completeness_score,
        metrics.v_measure_score,
        metrics.adjusted_rand_score,
        metrics.adjusted_mutual_info_score,
    )
    label_scores = tuple(scorer(target_labels, estimator.labels_)
                         for scorer in label_scorers)
    silhouette = metrics.silhouette_score(data, estimator.labels_,
                                          metric='euclidean',
                                          sample_size=sample_size)

    row = (name, elapsed, inertia) + label_scores + (silhouette, train_accuracy)
    print('% 9s\t%.2fs\t%i\t%.3f\t%.3f\t%.3f\t%.3f\t%.3f\t%.3f\t%.3f' % row)
def bench_k_means(estimator, name, data, sample_size, labels, postIds):
    """Reduce ``data`` with LSA, fit ``estimator`` and export the clusters.

    Args:
        estimator: clustering estimator exposing ``fit``, ``inertia_`` and
            ``labels_``.
        name: label printed in the first column of the score line.
        data: feature matrix; it is converted to CSR, reduced to 500
            components with TruncatedSVD and then normalised.
        sample_size: accepted for interface compatibility; not used here.
        labels: reference labels for the label-based scores.
        postIds: identifiers handed to ``extractCluster`` together with the
            predicted labels.

    Prints the LSA duration, then one line with the fit duration, inertia
    and the label-based scores, and finally runs the project helpers that
    parse the user file and write the cluster assignment to disk.
    """
    data = sparse.csr_matrix(data)
    print("Performing dimensionality reduction using LSA")
    t0 = time()
    lsa = TruncatedSVD(500)

    data = lsa.fit_transform(data)
    data = Normalizer(copy=False).fit_transform(data)

    print("done in %fs" % (time() - t0))
    print()

    # Restart the clock so the reported time covers the fit only; it used
    # to include the LSA step that is already reported above.
    t0 = time()
    estimator.fit(data)
    fit_seconds = time() - t0
    print('% 9s   %.2fs    %i   %.3f   %.3f   %.3f   %.3f   %.3f '
          % (name, fit_seconds, estimator.inertia_,
             metrics.homogeneity_score(labels, estimator.labels_),
             metrics.completeness_score(labels, estimator.labels_),
             metrics.v_measure_score(labels, estimator.labels_),
             metrics.adjusted_rand_score(labels, estimator.labels_),
             metrics.adjusted_mutual_info_score(labels, estimator.labels_)))

    print("Parsing USer File:")
    parseUserFile()
    print("extracting User File:")
    clusterDict = extractCluster(postIds, estimator.labels_)
    print("writing Cluster Data to File")
    writeCluterToFile(clusterDict)
Example #11
0
def main(argv):
    """Cluster word vectors and write the scores and labels to a result file.

    The settings come from ``get_arguments(argv)``.  The result file is
    named after the vector file's base name joined with all settings by
    '.', and is created in the current directory.  It receives the
    V-measure, adjusted Rand and homogeneity scores followed by one
    "word : label" line per word.
    """
    file_vectors, clust_type, clusters, distance, cluster_param, std = get_arguments(argv)
    fname = '.'.join(map(str, [file_vectors.split('/')[-1], clust_type, clusters,
                               distance, cluster_param, std]))
    # The context manager closes the result file even if loading or
    # clustering raises.  (better to put in EX1, EX2, .. folders)
    with open(fname, 'w') as writer:
        print('clustering: %s' % (clust_type,))
        print('clusters: %s' % (clusters,))
        print('cluster_param: %s' % (cluster_param,))
        print('std: %s' % (std,))

        X, words, truth = load_data(file_vectors, True)
        X = np.array(X)

        if clust_type == 'affin':
            # NOTE(review): bool() of any non-empty string is True, so a
            # textual "False"/"0" would still enable std -- confirm what
            # get_arguments returns.
            labels = affin_sclustering(X, n_clust=int(clusters), distance=distance,
                                       gamma=float(cluster_param), std=bool(std))
        else:
            labels = knn_sclustering(X, n_clust=int(clusters), k=int(cluster_param))

        writer.write('\nVMeas:' + str(v_measure_score(truth, labels)))
        writer.write('\nRand:' + str(adjusted_rand_score(truth, labels)))
        writer.write('\nHomogen:' + str(homogeneity_score(truth, labels)) + '\n')

        for i, word in enumerate(words):
            writer.write(word + ' : ' + str(labels[i]) + '\n')
Example #12
0
def bench_k_means(estimator, data, labels):
    """Fit ``estimator`` on ``data``, print and return its clustering scores.

    Args:
        estimator: clustering estimator exposing ``fit`` and ``labels_``.
        data: samples to cluster.
        labels: reference labels for the label-based scores.

    Returns:
        list: [homogeneity, completeness, v_measure, adjusted_rand,
        adjusted_mutual_info, silhouette].
    """
    t0 = time()
    estimator.fit(data)
    print("time to fit: {:.5}".format(time() - t0))
    homogenity = metrics.homogeneity_score(labels, estimator.labels_)
    completeness = metrics.completeness_score(labels, estimator.labels_)
    v_measure = metrics.v_measure_score(labels, estimator.labels_)
    print("homogenity {:.5}, completeness {:.5}, v_measure_score {:.5}".format(
        homogenity, completeness, v_measure)
    )

    adj_rand_score = metrics.adjusted_rand_score(
        labels, estimator.labels_
    )
    print("adjusted_rand_score {:.5}".format(adj_rand_score))

    adj_mutual_info_score = metrics.adjusted_mutual_info_score(
        labels, estimator.labels_
    )
    print("adjusted_mutual_info_score {:.5}".format(
        adj_mutual_info_score)
    )

    # The silhouette score is the most expensive metric here, so it is
    # computed once and reused for both the report and the return value.
    silhouette_score = metrics.silhouette_score(
        data, estimator.labels_, metric='euclidean'
    )
    print("silhouette_score {:.5}".format(silhouette_score))

    return [
        homogenity, completeness, v_measure, adj_rand_score,
        adj_mutual_info_score, silhouette_score
    ]
Example #13
0
def cluster(Z, K=4, algo='kmeans', labels_true=None):
    """Impute missing values in ``Z`` and cluster it.

    Args:
        Z: samples to cluster; missing values are filled by ``Imputer``.
        K: number of clusters for the 'kmeans' algorithm.
        algo: 'kmeans' or 'dbscan'.
        labels_true: optional reference labels for the 'dbscan' report.
            When omitted, a module-level ``labels_true`` is used if one
            exists (what earlier versions relied on); otherwise the
            label-based scores are skipped instead of raising NameError.

    Returns:
        The fitted estimator for 'kmeans' and 'dbscan'; None for any other
        ``algo``.
    """
    X = Imputer().fit_transform(Z)

    if algo == 'dbscan':
        # Compute DBSCAN
        db = DBSCAN(eps=0.3, min_samples=10).fit(X)
        labels = db.labels_

        # Number of clusters in labels, ignoring noise if present.
        n_clusters_ = len(set(labels)) - (1 if -1 in labels else 0)
        print('Estimated number of clusters: %d' % n_clusters_)

        if labels_true is None:
            # Backward compatibility with callers that define the
            # reference labels as a module global.
            labels_true = globals().get('labels_true')
        if labels_true is not None:
            print("Homogeneity: %0.3f" % metrics.homogeneity_score(labels_true, labels))
            print("Completeness: %0.3f" % metrics.completeness_score(labels_true, labels))
            print("V-measure: %0.3f" % metrics.v_measure_score(labels_true, labels))
            print("Adjusted Rand Index: %0.3f" % metrics.adjusted_rand_score(labels_true, labels))
            print("Adjusted Mutual Information: %0.3f" % metrics.adjusted_mutual_info_score(labels_true, labels))
        print("Silhouette Coefficient: %0.3f" % metrics.silhouette_score(X, labels))
        return db

    elif algo == 'kmeans':
        km = KMeans(n_clusters=K)
        km.fit(X)
        print(km.labels_)
        return km
def cluster_evaluation(D, y_true, n_clusters, eps=0.8, min_samples=10):
    """Cluster a distance matrix with Ward and print evaluation scores.

    ``D`` is turned into the similarity matrix ``1 - D``, on which a
    ``Ward`` model with ``n_clusters`` clusters is fitted.  The label-based
    scores are computed against ``y_true``; the silhouette coefficient uses
    ``D`` itself as a precomputed distance matrix.  ``eps`` and
    ``min_samples`` belong to a disabled DBSCAN variant and are not used.
    """
    labels_true = y_true

    # Similarity is the complement of the distance.
    similarity = 1 - D

    # Ward clustering (a DBSCAN alternative was tried here previously).
    model = Ward(n_clusters=n_clusters).fit(similarity)
    labels = model.labels_

    # Distinct labels, not counting the noise label -1 if it occurs.
    distinct = set(labels)
    n_clusters_ = len(distinct) - (1 if -1 in labels else 0)

    print('Number of clusters: %d' % n_clusters_)
    label_reports = (
        ('Homogeneity: %0.3f', metrics.homogeneity_score),
        ('Completeness: %0.3f', metrics.completeness_score),
        ('V-meassure: %0.3f', metrics.v_measure_score),
        ('Adjusted Rand Index: %0.3f', metrics.adjusted_rand_score),
        ('Adjusted Mutual Information: %0.3f', metrics.adjusted_mutual_info_score),
    )
    # Each score is computed right before it is printed.
    for template, scorer in label_reports:
        print(template % scorer(labels_true, labels))
    silhouette = metrics.silhouette_score(D, labels, metric='precomputed')
    print('Silhouette Coefficient: %0.3f' % silhouette)
Example #15
0
def predictAffinityPropagation(X, labels_true):
    """Cluster ``X`` with Affinity Propagation, print scores and plot it.

    The model uses ``preference=-50``.  The label-based scores compare the
    predicted labels with ``labels_true``.  The plot uses the first two
    columns of ``X``: members as dots, exemplars as large circles, and a
    line from every member to its exemplar.
    """
    af = AffinityPropagation(preference=-50).fit(X)
    exemplar_indices = af.cluster_centers_indices_
    labels = af.labels_
    n_clusters_ = len(exemplar_indices)

    print('Estimated number of clusters: %d' % n_clusters_)
    label_reports = (
        ("Homogeneity: %0.3f", metrics.homogeneity_score),
        ("Completeness: %0.3f", metrics.completeness_score),
        ("V-measure: %0.3f", metrics.v_measure_score),
        ("Adjusted Rand Index: %0.3f", metrics.adjusted_rand_score),
        ("Adjusted Mutual Information: %0.3f", metrics.adjusted_mutual_info_score),
    )
    # Each score is computed right before it is printed.
    for template, scorer in label_reports:
        print(template % scorer(labels_true, labels))
    print("Silhouette Coefficient: %0.3f"
          % metrics.silhouette_score(X, labels, metric='sqeuclidean'))

    plt.close('all')
    plt.figure(1)
    plt.clf()

    palette = cycle('bgrcmykbgrcmykbgrcmykbgrcmyk')
    for k in range(n_clusters_):
        col = next(palette)
        members = labels == k
        center = X[exemplar_indices[k]]
        plt.plot(X[members, 0], X[members, 1], col + '.')
        plt.plot(center[0], center[1], 'o', markerfacecolor=col,
                 markeredgecolor='k', markersize=14)
        # Connect every member of the cluster to its exemplar.
        for point in X[members]:
            plt.plot([center[0], point[0]], [center[1], point[1]], col)

    plt.title('Estimated number of clusters: %d' % n_clusters_)
    plt.show()
Example #16
0
def compare(method1, method2, fig=False):
    """Compare the ward clusterings obtained from two methods' 2-D data.

    Loads "<species>_<method>_X_2d.npy" for both methods (``species`` is a
    module-level name) and, for 2 to 5 clusters, prints the homogeneity,
    completeness, normalized mutual information and adjusted Rand index
    between the two label sets.  With ``fig`` set, the 3-cluster labels of
    each method are additionally plotted on the other method's data.
    """
    file_pattern = '{0}_{1}_X_2d.npy'
    X1 = np.load(file_pattern.format(species, method1))
    X2 = np.load(file_pattern.format(species, method2))

    print('n_cluster\tHomo\tCompl\tNMI\tARI')
    for n_clusters in range(2, 6):
        clust1 = Clustering(species, method1, X1, None, n_clusters=n_clusters)
        clust2 = Clustering(species, method2, X2, None, n_clusters=n_clusters)

        clust1.agglomerative(linkage='ward')
        clust2.agglomerative(linkage='ward')

        label1 = clust1.pred_labels('ward')
        label2 = clust2.pred_labels('ward')

        if n_clusters == 3 and fig:
            # Each method's labels are drawn on the *other* method's data.
            figName = '{0}_{1}_on_{2}'.format(species, method1, method2)
            plot2d(X2, label1, np.unique(label1), figName, figName)

            figName = '{0}_{1}_on_{2}'.format(species, method2, method1)
            plot2d(X1, label2, np.unique(label2), figName, figName)

        homogeneity = metrics.homogeneity_score(label1, label2)
        completeness = metrics.completeness_score(label1, label2)
        nmi = metrics.normalized_mutual_info_score(label1, label2)
        ari = metrics.adjusted_rand_score(label1, label2)
        print('{0}\t{1}\t{2}\t{3}\t{4}\n'.format(n_clusters, homogeneity,
                                                completeness, nmi, ari))
Example #17
0
def clustering_by_kmeans(vectorizer, X, true_k):
    """Cluster ``X`` into ``true_k`` groups with K-Means and print scores.

    The label-based scores use the module-level ``documents`` as reference
    labels.  The per-sample cluster assignment is finally handed to
    ``measuring_kmeans`` together with ``true_k``.
    """
    print("Clustering in " + str(true_k) + " groups by K-means...")
    km = KMeans(n_clusters=true_k, init='k-means++', max_iter=500, n_init=1)
    km.fit_predict(X)

    print("Measuring...")

    # V-measure is an entropy-based measure which explicitly measures how
    # successfully the criteria of homogeneity and completeness have been
    # satisfied.
    label_reports = (
        ("Homogeneity: %0.3f", metrics.homogeneity_score),
        ("Completeness: %0.3f", metrics.completeness_score),
        ("V-measure: %0.3f", metrics.v_measure_score),
        ("Adjusted Rand-Index: %.3f", metrics.adjusted_rand_score),
    )
    # Each score is computed right before it is printed.
    for template, scorer in label_reports:
        print(template % scorer(documents, km.labels_))
    print("Silhouette Coefficient: %0.3f" % metrics.silhouette_score(X, km.labels_, sample_size=1000))

    # Cluster index of every sample: 0 if it is in cluster 0, 1 if it is
    # in cluster 1, ...
    clusters = km.labels_.tolist()
    print("Total de " + str(len(km.labels_)) + " documents")

    # To get all documents of cluster 0 one could use
    #   cluster_0 = np.where(np.array(clusters) == 0)
    # and then index the documents with the resulting indices.
    terms = vectorizer.get_feature_names()

    measuring_kmeans(true_k, clusters)
Example #18
0
def cluster(algorithm, data, topics, make_silhouette=False):
    """Fit ``algorithm`` on ``data``, print scores, optionally plot silhouettes.

    Args:
        algorithm: clustering estimator exposing ``fit_predict`` and
            ``labels_``.
        data: samples to cluster.
        topics: reference labels for the label-based scores.
        make_silhouette: when true, a horizontal bar chart of the
            per-sample silhouette values, grouped by cluster, is saved to
            'cluster.png'.
    """
    print(str(algorithm))
    clusters = algorithm.fit_predict(data)
    labels = algorithm.labels_
    print('Homogeneity: %0.3f' % metrics.homogeneity_score(topics, labels))
    print('Completeness: %0.3f' % metrics.completeness_score(topics, labels))
    print('V-measure: %0.3f' % metrics.v_measure_score(topics, labels))
    print('Adjusted Rand index: %0.3f' % metrics.adjusted_rand_score(topics, labels))
    print('Silhouette test: %0.3f' % metrics.silhouette_score(data, labels))
    print(' ***************** ')

    silhouettes = metrics.silhouette_samples(data, labels)
    cluster_ids = numpy.unique(clusters)
    num_clusters = len(cluster_ids)
    print('num clusters: %d' % num_clusters)
    print('num fitted: %d' % len(clusters))

    # Make a silhouette plot if the flag is set
    if make_silhouette:
        # Sort by cluster first, then by decreasing silhouette value.
        order = numpy.lexsort((-silhouettes, clusters))
        sorted_clusters = clusters[order]
        # Bar positions of each cluster.  The comparison must use the
        # individual cluster id; comparing with the cluster *count* left
        # every group empty and made numpy.max() fail.
        indices = [numpy.flatnonzero(sorted_clusters == cid) for cid in cluster_ids]
        ytick = [(numpy.max(ind)+numpy.min(ind))/2 for ind in indices]
        ytickLabels = ["%d" % cid for cid in cluster_ids]
        cmap = cm.jet(numpy.linspace(0, 1, num_clusters)).tolist()
        # Colours are looked up by the rank of the cluster id, so label
        # sets that do not start at 0 still map to a valid colour.
        rank = dict((cid, pos) for pos, cid in enumerate(cluster_ids))
        clr = [cmap[rank[cid]] for cid in sorted_clusters]

        fig = plt.figure()
        ax = fig.add_subplot(111)
        ax.barh(range(data.shape[0]), silhouettes[order], height=1.0,
                edgecolor='none', color=clr)
        ax.set_ylim(ax.get_ylim()[::-1])
        plt.yticks(ytick, ytickLabels)
        plt.xlabel('Silhouette Value')
        plt.ylabel('Cluster')
        plt.savefig('cluster.png')
Example #19
0
def cluster(model, uids):
    """Cluster document vectors with Affinity Propagation and print scores.

    Args:
        model: object whose ``docvecs`` mapping yields one vector per uid.
        uids: identifiers of the documents to cluster.  They are also used
            as the reference labels of the label-based scores.

    The fitted estimator is pickled to 'data/af.pick'.
    """
    # Collect one vector per requested document.
    X = [model.docvecs[uid] for uid in uids]
    # NOTE(review): using the uids themselves as ground truth means every
    # distinct uid is its own class -- confirm this is intended.
    labels_true = uids

    # Compute Affinity Propagation
    af = AffinityPropagation(preference=-50).fit(X)
    # Binary mode and a context manager: the pickle is a byte stream and
    # the handle must be flushed and closed before returning.
    with open('data/af.pick', 'wb') as fid:
        pickle.dump(af, fid)
    cluster_centers_indices = af.cluster_centers_indices_
    labels = af.labels_

    n_clusters_ = len(cluster_centers_indices)

    print('Estimated number of clusters: %d' % n_clusters_)
    print("Homogeneity: %0.3f" % metrics.homogeneity_score(labels_true, labels))
    print("Completeness: %0.3f" % metrics.completeness_score(labels_true, labels))
    print("V-measure: %0.3f" % metrics.v_measure_score(labels_true, labels))
    print("Adjusted Rand Index: %0.3f"
          % metrics.adjusted_rand_score(labels_true, labels))
    print("Adjusted Mutual Information: %0.3f"
          % metrics.adjusted_mutual_info_score(labels_true, labels))
    print("Silhouette Coefficient: %0.3f"
          % metrics.silhouette_score(X, labels, metric='sqeuclidean'))
def bestClassify(X, Y):
    """Cluster X into two groups and score the clusters against Y.

    X is vectorised (TF-IDF by default) and clustered with KMeans.  Every
    cluster is then renamed to the most frequent true label among its
    members, and the confusion matrix plus the homogeneity, completeness,
    V-measure and adjusted Rand scores are printed.
    """
    tfidf = True

    vectorizer_kwargs = dict(preprocessor=identity, tokenizer=identity)
    if tfidf:
        vec = TfidfVectorizer(sublinear_tf=True, **vectorizer_kwargs)
    else:
        vec = CountVectorizer(**vectorizer_kwargs)

    km = KMeans(n_clusters=2, n_init=100, verbose=1)
    clusterer = Pipeline([('vec', vec), ('cls', km)])

    prediction = clusterer.fit_predict(X, Y)

    # True labels grouped by the cluster they were assigned to.
    truths_by_cluster = defaultdict(list)
    for cluster_id, truth in zip(prediction, Y):
        truths_by_cluster[cluster_id].append(truth)

    # Majority label of every cluster.
    labeldict = {
        cluster_id: Counter(truths).most_common(1)[0][0]
        for cluster_id, truths in truths_by_cluster.items()
    }

    prediction = [labeldict[cluster_id] for cluster_id in prediction]
    labels = list(labeldict.values())
    print(labels)
    print(confusion_matrix(Y, prediction, labels=labels))

    print("Homogeneity:", homogeneity_score(Y, prediction))
    print("Completeness:", completeness_score(Y, prediction))
    print("V-measure:", v_measure_score(Y, prediction))
    print("Rand-Index:", adjusted_rand_score(Y, prediction))
Example #21
0
File: a.py Project: chengxwcq/ee219
def get_result(km, labels):
    """Score ``km.labels_`` against the reference ``labels``.

    Returns a 5-tuple: (homogeneity, completeness, V-measure, adjusted Rand
    index, adjusted mutual information).
    """
    scorers = (
        metrics.homogeneity_score,
        metrics.completeness_score,
        metrics.v_measure_score,
        metrics.adjusted_rand_score,
        metrics.adjusted_mutual_info_score,
    )
    return tuple(scorer(labels, km.labels_) for scorer in scorers)
Example #22
0
def run_clustering( clusterer, data, labels ):
    """
    Fit an already configured clustering algorithm on a dataset, time the
    fit, and log clustering metrics computed against ground-truth labels.

        clusterer: the clustering algorithm, from sklearn
        data:      array-like dataset input
        labels:    vector of ground-truth labels

    """

    # Only the fit itself is timed.
    started = time()
    clusterer.fit(data)
    runtime = time() - started

    found = clusterer.labels_

    # All metrics are computed before anything is logged.
    report = [
        ("  |-        Execution time: %fs",   runtime),
        ("  |-           Homogeneity: %0.3f", metrics.homogeneity_score(labels, found)),
        ("  |-          Completeness: %0.3f", metrics.completeness_score(labels, found)),
        ("  |-             V-measure: %0.3f", metrics.v_measure_score(labels, found)),
        ("  |-   Adjusted Rand-Index: %.3f",  metrics.adjusted_rand_score(labels, found)),
        ("  |-  Adjusted Mutual Info: %.3f",  metrics.adjusted_mutual_info_score(labels, found)),
    ]

    # Output to logs
    for template, value in report:
        logging.info(template % value)
Example #23
0
def cluseval(label, truth):
    """Score predicted cluster labels against the ground truth.

    Args:
        label: predicted cluster label of every sample.
        truth: reference label of every sample.

    Returns:
        list: [adjusted Rand index, adjusted mutual information,
        homogeneity, completeness, V-measure].
    """
    rand = metrics.adjusted_rand_score(truth, label)
    mutual = metrics.adjusted_mutual_info_score(truth, label)
    # The variable was previously spelled `h**o`, which is not a valid
    # assignment target and made the whole module fail to compile.
    homogeneity = metrics.homogeneity_score(truth, label)
    complete = metrics.completeness_score(truth, label)
    v = metrics.v_measure_score(truth, label)
    result = [rand, mutual, homogeneity, complete, v]
    return result
Example #24
0
def print_cluster(clusterTrainClass, labels, clusterTestStory):
    """Print clustering scores for ``labels``.

    The label-based scores compare ``labels`` with ``clusterTrainClass``;
    the silhouette coefficient is computed from ``clusterTestStory`` and
    ``labels`` with the euclidean metric.
    """
    label_reports = (
        ("Homogeneity: %0.3f", metrics.homogeneity_score),
        ("Completeness: %0.3f", metrics.completeness_score),
        ("V-measure: %0.3f", metrics.v_measure_score),
        ("Adjusted Rand Index: %0.3f", metrics.adjusted_rand_score),
        ("Adjusted Mutual Information: %0.3f", metrics.adjusted_mutual_info_score),
    )
    # Each score is computed right before it is printed.
    for template, scorer in label_reports:
        print(template % scorer(clusterTrainClass, labels))
    print("Silhouette Coefficient:")
    print(metrics.silhouette_score(clusterTestStory, labels, metric='euclidean'))
def evaluate(labels_true, labels):
    """Compute the label-based clustering scores for one labelling.

    Args:
        labels_true: Reference class of every sample.
        labels: Predicted cluster assignment of every sample.

    Returns:
        tuple: ``(homogeneity, completeness, v_measure, adjusted_rand,
        adjusted_mutual_info)``. The silhouette score is not part of the
        result; it would additionally need the sample data.
    """
    scorers = (
        metrics.homogeneity_score,
        metrics.completeness_score,
        metrics.v_measure_score,
        metrics.adjusted_rand_score,
        metrics.adjusted_mutual_info_score,
    )
    return tuple(scorer(labels_true, labels) for scorer in scorers)
def cluster_metrics(labels_1, labels_2):
    """Print four agreement scores between two labellings, one per line.

    Args:
        labels_1: First labelling (passed as the first argument of every
            scorer).
        labels_2: Second labelling (passed as the second argument).
    """
    nmi = normalized_mutual_info_score(labels_1, labels_2)
    ari = adjusted_rand_score(labels_1, labels_2)
    homogeneity = homogeneity_score(labels_1, labels_2)
    completeness = completeness_score(labels_1, labels_2)

    report_lines = (
        "Normalized Mutual Information: %f" % nmi,
        "Adjusted Rand Score: %f" % ari,
        "Homogeneity: %f" % homogeneity,
        "Completeness: %f" % completeness,
    )
    print("\n".join(report_lines))
Example #27
0
def main():
    """Command-line entry point: spectral clustering of a similarity matrix.

    Loads the matrix given by ``--sm``, optionally normalizes and
    symmetrizes it, clusters it with ``SpectralClustering`` and, when
    requested, writes the labels to ``--output`` and prints quality metrics
    against ``--true_labels``.
    """
    # Parse command line arguments
    parser = argparse.ArgumentParser(
        usage=__doc__,
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description='Perform spectral clustering.')
    parser.add_argument("--clusters", "-c", type=int,
                        help='Number of clusters.')
    parser.add_argument("--knn", "-k", type=int, default=0,
                        help='Number of nearest neighbors, 0 means all.')
    # Nothing can be done without the matrix, so fail in argparse with a
    # clear message instead of inside np.load(None).
    parser.add_argument("--sm", "-s", required=True,
                        help='File containing similarity matrix')
    parser.add_argument("--iterations", "-i", type=int, default=10,
                        help='Number of KMeans iterations.')
    parser.add_argument("--true_labels", "-t",
                        help='File containing the true labels.')
    parser.add_argument("--output", "-o", help='Name of the file to write' +
                        ' the labels to.')
    parser.add_argument("--normalize", "-n", action='store_true',
                        help='Normalize each row so that the max value is one.')
    args = parser.parse_args()

    sm = np.load(args.sm)
    if args.normalize:
        # Out-of-place division: an in-place "/=" raises for integer-typed
        # matrices because the float result cannot be stored back.
        sm = sm / sm.max(axis=1)[:, np.newaxis]
        # Row-wise scaling breaks symmetry; restore it.
        sm = (sm + sm.T) / 2

    if args.knn > 0:
        clusterer = SpectralClustering(n_clusters=args.clusters,
                                       affinity='nearest_neighbors',
                                       n_neighbors=args.knn,
                                       n_init=args.iterations)
    else:
        clusterer = SpectralClustering(n_clusters=args.clusters,
                                       affinity='precomputed',
                                       n_init=args.iterations)
    labels = clusterer.fit(sm).labels_

    # --output is optional; only write when a file name was given.
    if args.output:
        with open(args.output, 'w') as fout:
            fout.writelines('%s\n' % label for label in labels)

    # Load the true labels.
    if args.true_labels:
        with open(args.true_labels, 'r') as fin:
            # Skip blank lines (e.g. a trailing newline at end of file).
            true_labels = [int(line.strip()) for line in fin if line.strip()]
        # Run the metrics.
        print("Homogeneity: %0.3f" % metrics.homogeneity_score(true_labels, labels))
        print("Completeness: %0.3f" % metrics.completeness_score(true_labels, labels))
        print("V-measure: %0.3f" % metrics.v_measure_score(true_labels, labels))
        print("Adjusted Rand Index: %0.3f"
              % metrics.adjusted_rand_score(true_labels, labels))
        print("Adjusted Mutual Information: %0.3f"
              % metrics.adjusted_mutual_info_score(true_labels, labels))
        # NOTE(review): the matrix is passed with the default metric, so its
        # rows are treated as feature vectors here -- confirm this is the
        # intended silhouette for a similarity matrix.
        print("Silhouette Coefficient: %0.3f"
              % metrics.silhouette_score(sm, labels))
Example #28
0
 def eval_clusters(self):
     """Score the current clustering against the true point labels.

     Returns:
         tuple: ``(ari, hom, comp)`` -- adjusted Rand index, homogeneity and
         completeness of the predicted labels with respect to the true
         labels, both taken from ``self.get_labels()``.
     """
     _ignored, truth, predicted = self.get_labels()
     scorers = (
         metrics.adjusted_rand_score,
         metrics.homogeneity_score,
         metrics.completeness_score,
     )
     return tuple(scorer(truth, predicted) for scorer in scorers)
Example #29
0
def get_cluster_metrics(X, labels, labels_true=None):
    """Collect clustering quality metrics in a dictionary.

    Args:
        X: Passed to ``silhouette_score`` with ``metric='precomputed'``.
        labels: Predicted cluster assignment of every sample.
        labels_true: Optional reference labels. When given (and non-empty),
            completeness and homogeneity are added to the result.

    Returns:
        dict: Metric name -> value. Always contains
        ``'Silhouette coefficient'``; ``'Completeness score'`` and
        ``'Homogeneity score'`` only when reference labels were supplied.
    """
    metrics_dict = {
        'Silhouette coefficient': metrics.silhouette_score(
            X, labels, metric='precomputed'),
    }
    # Explicit None/length test: the truth value of a multi-element NumPy
    # array is ambiguous and raises ValueError.
    if labels_true is not None and len(labels_true) > 0:
        metrics_dict['Completeness score'] = metrics.completeness_score(labels_true, labels)
        metrics_dict['Homogeneity score'] = metrics.homogeneity_score(labels_true, labels)

    return metrics_dict
Example #30
0
 def evaluateAllAlgorithms(self):
   """Print label-based quality metrics for each stored clustering.

   Compares ``self.labels_db`` and ``self.labels_ap`` against
   ``self.labels_gt`` and prints one metric block per algorithm. Every
   ``print`` uses the single-argument call form so the output is the same
   on Python 2 and Python 3.
   """
   algs = [self.labels_db, self.labels_ap]
   titles = ['DBASE', 'AP']
   for title, predicted in zip(titles, algs):
     print('Algorithm: %s' % title)
     print("\tHomogeneity: %0.3f" % metrics.homogeneity_score(self.labels_gt, predicted))
     print("\tCompleteness: %0.3f" % metrics.completeness_score(self.labels_gt, predicted))
     print("\tV-measure: %0.3f" % metrics.v_measure_score(self.labels_gt, predicted))
     print("\tAdjusted Rand Index: %0.3f" % metrics.adjusted_rand_score(self.labels_gt, predicted))
     print("\tAdjusted Mutual Information: %0.3f" % metrics.adjusted_mutual_info_score(self.labels_gt, predicted))
# Fit a two-component Gaussian mixture on the training matrix, label the test
# matrix with it, and print the model parameters followed by one row of
# label-based clustering scores.
gmmClusterer = GaussianMixture(n_components=2)
t0 = time()  # start of the timed section reported in the table below
# NOTE(review): despite the name, this holds whatever fit() returns; in
# scikit-learn that is normally the fitted estimator, not labels -- confirm.
gmmTrainedLabels = gmmClusterer.fit(Train_Matrix)

gmmTestLabels = gmmClusterer.predict(Test_Matrix)

print(82 * '*')

# Fitted model parameters, separated by 82-character rules.
print("Cluster Means: ", str(gmmClusterer.means_))
print(82 * '-')
print("Cluster Covariance: ", gmmClusterer.covariances_)
print(82 * '-')
print("Precisions: ", str(gmmClusterer.precisions_))
print(82 * '-')

# Score table. The elapsed time is taken here, so it covers fit, predict and
# the parameter prints above.
print('Model\t\ttime\thomo\tcompl\tv-meas\tARI  \tAMI')
print('%-9s\t%.2fs\t%.3f\t%.3f\t%.3f\t%.3f\t%.3f' %
      ('GMM', (time() - t0),
       metrics.homogeneity_score(Test_Target_Matrix, gmmTestLabels),
       metrics.completeness_score(Test_Target_Matrix, gmmTestLabels),
       metrics.v_measure_score(Test_Target_Matrix, gmmTestLabels),
       metrics.adjusted_rand_score(Test_Target_Matrix, gmmTestLabels),
       metrics.adjusted_mutual_info_score(Test_Target_Matrix, gmmTestLabels)))

# Disabled plotting code, kept for reference:
# plt.scatter(Test_Matrix.iloc[0,:], Test_Matrix.iloc[1,:], color='black')
# # Prediction and draw the diagram
# #plt.plot(range(len(testData)), y_testDataPrediction_tuned, color='red', linewidth=1)
# #plt.legend(["predict", "true"], loc='upper right')
# plt.title('GMM Clustering')
# plt.show()
Example #32
0
# Report the fitted von Mises-Fisher parameters next to the true ones, then
# compare all four clusterings on homogeneity and completeness. Every print
# uses the single-argument call form, so the output is the same on Python 2
# and Python 3.
print('---')
print('true kappas {}'.format(kappas))
print('vmf-soft kappas {}'.format(
    vmf_soft.concentrations_[[vmf_soft_mu_0_idx, vmf_soft_mu_1_idx]]))
print('vmf-hard kappas {}'.format(
    vmf_hard.concentrations_[[vmf_hard_mu_0_idx, vmf_hard_mu_1_idx]]))

print('---')
print('vmf-soft weights {}'.format(
    vmf_soft.weights_[[vmf_soft_mu_0_idx, vmf_soft_mu_1_idx]]))
print('vmf-hard weights {}'.format(
    vmf_hard.weights_[[vmf_hard_mu_0_idx, vmf_hard_mu_1_idx]]))

print('---')
print("Homogeneity: %0.3f (k-means)" %
      metrics.homogeneity_score(labels, km.labels_))
print("Homogeneity: %0.3f (spherical k-means)" %
      metrics.homogeneity_score(labels, skm.labels_))
print("Homogeneity: %0.3f (vmf-soft)" %
      metrics.homogeneity_score(labels, vmf_soft.labels_))
print("Homogeneity: %0.3f (vmf-hard)" %
      metrics.homogeneity_score(labels, vmf_hard.labels_))

print('---')
print("Completeness: %0.3f (k-means)" %
      metrics.completeness_score(labels, km.labels_))
print("Completeness: %0.3f (spherical k-means)" %
      metrics.completeness_score(labels, skm.labels_))
# The next line reports the vmf-soft clustering (its label carries no tag).
print("Completeness: %0.3f" %
      metrics.completeness_score(labels, vmf_soft.labels_))
print("Completeness: %0.3f" %
Example #33
0

# Give every cluster centre the class of its nearest test sample, then
# translate the cluster ids in `clusters` into those classes and score the
# result as if it were a classifier.
class_list = []
data_test = np.array(data_test)
centers_arr = np.asarray(centers)
print(shape(centers))

# Only the feature columns the centres have take part in the distance.
n_features = centers_arr.shape[1]
for center in centers_arr:
    # Euclidean distance from this centre to every test sample. argmin always
    # picks a sample for the current centre, so no sentinel distance is
    # needed and no index can leak over from the previous centre.
    dists = np.linalg.norm(data_test[:, :n_features] - center, axis=1)
    index = int(np.argmin(dists))
    class_list.append(label_test[index])

# Build the predictions as a new array so `clusters` keeps the raw cluster ids.
pred = np.array([class_list[cluster_id] for cluster_id in clusters])

print('Homogeneity score :\n', metrics.homogeneity_score(label_test, pred))
print('F1 score :\n', f1_score(label_test, pred, average=None))
print ('ACCURACY :\n', metrics.classification_report(label_test, pred))
print('Confusion matrix:\n', confusion_matrix(label_test, pred))
def kmeans(principalDf, NbCluster, finalDf):
    """Cluster the data with K-Means, score it against known labels, plot it.

    Each cluster is mapped to the ``Analysis`` value that occurs most often
    among its members; the mapped labels are compared with the true ones
    (accuracy plus five clustering metrics, all printed). Finally the cluster
    regions, the points and the centroids are drawn.

    Args:
        principalDf: DataFrame of the points to cluster. The plotting code
            reads columns 0 and 1 and predicts on a 2-D mesh, so it expects
            two feature columns.
        NbCluster: Number of clusters to fit and to evaluate.
        finalDf: DataFrame with an ``Analysis`` column holding the true label
            of every row.

    Returns:
        None. Results are printed and a matplotlib figure is shown.
    """
    model = KMeans(n_clusters=NbCluster, init='k-means++').fit(principalDf)
    KM_clustered = pd.DataFrame(principalDf.copy())
    KM_clustered.loc[:, 'Cluster'] = model.labels_  # append labels to points

    result = pd.concat([finalDf['Analysis'], KM_clustered['Cluster']], axis=1)
    print('-' * 60)
    print("Kmeans résultat")
    print('-' * 60)
    print("Shape: {}".format(result.shape))
    print(result.sample(5))

    # =============================================================================
    # Assigning a label to each cluster
    # As there is no relation between a cluster number and the true label, map
    # each cluster to the label that appears most often in that cluster.
    #
    # These corrected predicted labels are needed below to calculate model
    # performance against the true labels.
    # =============================================================================
    print('\n')
    for ClusterNum in range(NbCluster):
        members = result['Cluster'] == ClusterNum
        sizes = result[members].groupby('Analysis').size()
        if sizes.empty:
            # Nothing was assigned to this cluster; there is no label to map.
            continue

        # idxmax returns the first label reaching the maximum count.
        majority_label = sizes.idxmax()
        result.loc[members, 'TransLabel'] = majority_label

        print(ClusterNum, majority_label)

    # =============================================================================
    # Check performance of the cluster-to-label classification
    # =============================================================================
    print('-' * 60)
    print('K-Means performance')
    print('-' * 60)

    Correct = (finalDf['Analysis'] == result['TransLabel']).sum()
    Accuracy = round(Correct / finalDf.shape[0], 3)
    print('Accuracy ', Accuracy)

    # =============================================================================
    # METRICS for clustering algorithms
    # =============================================================================
    scorers = (
        ('homogeneity_score: ', metrics.homogeneity_score),
        ('completeness_score: ', metrics.completeness_score),
        ('v_measure_score: ', metrics.v_measure_score),
        ('adjusted_rand_score: ', metrics.adjusted_rand_score),
        ('adjusted_mutual_info_score: ', metrics.adjusted_mutual_info_score),
    )
    for caption, scorer in scorers:
        print(caption,
              round(scorer(finalDf['Analysis'], result['TransLabel']), 3))

    # Step size of the mesh. Decrease to increase the quality of the VQ.
    h = .02  # point in the mesh [x_min, x_max]x[y_min, y_max].

    # Plot the decision boundary. For that, we will assign a color to each
    # mesh point; the mesh extends one unit beyond the data on every side.
    points = principalDf.to_numpy()
    x_min, x_max = points[:, 0].min() - 1, points[:, 0].max() + 1
    y_min, y_max = points[:, 1].min() - 1, points[:, 1].max() + 1
    xx, yy = np.meshgrid(np.arange(x_min, x_max, h),
                         np.arange(y_min, y_max, h))

    # Obtain labels for each point in mesh. Use last trained model.
    Z = model.predict(np.c_[xx.ravel(), yy.ravel()])

    # Put the result into a color plot
    Z = Z.reshape(xx.shape)
    plt.figure(1)
    plt.clf()
    plt.imshow(Z,
               interpolation='nearest',
               extent=(xx.min(), xx.max(), yy.min(), yy.max()),
               cmap=plt.cm.Paired,
               aspect='auto',
               origin='lower')

    plt.plot(points[:, 0], points[:, 1], 'k.', markersize=2)
    # Plot the centroids as a white X
    centroids = model.cluster_centers_
    plt.scatter(centroids[:, 0],
                centroids[:, 1],
                marker='x',
                s=169,
                linewidths=3,
                color='w',
                zorder=10)
    plt.title('K-means clustering on the digits dataset (PCA-reduced data)\n'
              'Centroids are marked with white cross')
    plt.xlim(x_min, x_max)
    plt.ylim(y_min, y_max)
    plt.xticks(())
    plt.yticks(())
    plt.show()
Example #35
0
from sklearn.cluster import KMeans

km3 = KMeans(n_clusters=3, init='k-means++', max_iter=100, n_init=1)
# run_line_magic(name, line) is the supported way to invoke a line magic from
# code; the one-string magic() helper is deprecated.
get_ipython().run_line_magic('time', 'km3.fit(X_lsa)')

# In[11]:

# How do we know the clustering result is good or not?
# If we have labels available, we can use this to derive how coherent the clusters are.
# Homogeneity: each cluster contains only members of a single class

from sklearn import metrics

labels = subnews['Class']
print("Homogeneity for 3 clusters: %0.3f" %
      metrics.homogeneity_score(labels, km3.labels_))

# In[12]:

# Let's try some other K values to compare their metrics
km2 = KMeans(n_clusters=2, init='k-means++', max_iter=100, n_init=1)
get_ipython().run_line_magic('time', 'km2.fit(X_lsa)')

km4 = KMeans(n_clusters=4, init='k-means++', max_iter=100, n_init=1)
get_ipython().run_line_magic('time', 'km4.fit(X_lsa)')

km5 = KMeans(n_clusters=5, init='k-means++', max_iter=100, n_init=1)
get_ipython().run_line_magic('time', 'km5.fit(X_lsa)')

# In[13]:
Example #36
0
# Per-model score histories, keyed by 'km' / 'gmm'; one entry per tried k.
Score = defaultdict(list)
adjMI = defaultdict(list)
S_homog = defaultdict(list)
S_adjMI = defaultdict(list)
S_vm = defaultdict(list)

for k in clusters:
    km.set_params(n_clusters=k)
    gmm.set_params(n_components=k)
    km.fit(X_scaled)
    gmm.fit(X_scaled)
    Score['km'].append(km.score(X_scaled))
    Score['gmm'].append(gmm.score(X_scaled))

    # Predict once per fitted model and reuse the labels for all three
    # metrics instead of re-running predict() for each of them.
    predictions = (('km', km.predict(X_scaled)),
                   ('gmm', gmm.predict(X_scaled)))
    for model_name, predicted in predictions:
        S_homog[model_name].append(
            metrics.homogeneity_score(labels, predicted))
        S_adjMI[model_name].append(
            metrics.adjusted_mutual_info_score(labels, predicted))
        S_vm[model_name].append(metrics.v_measure_score(labels, predicted))

plt.figure(figsize=(9.6, 7.2))
plt.xlabel('Number of clusters')
plt.ylabel('Score value')
plt.title('Score vs. Cluster number for K-mean and Gaussian Mixture (species)')
plt.grid(True)
#plt.legend(['Train', 'Test'], loc='lower right')
Example #37
0
# Bare expressions: their values are only displayed in an interactive session
# and have no effect when this runs as a script.
df.max()
df.min()
type(predictors)
#==============================================================================
# Clustering using KMeans
#==============================================================================
# Define the object that will carry out the k-means clustering
# (17 clusters, k-means++ initialisation, 10 restarts).
KMeans_object = KMeans(init = 'k-means++', n_clusters = 17, n_init= 10)
# Run the k-means clustering on the predictors.
KMeans_object.fit(predictors)

# Print the inertia of the fitted model.
print('The Inertia is:', KMeans_object.inertia_)

# Calculate the homogeneity score against the known outcomes.
print('Homogenity Score is:', metrics.homogeneity_score(outcomes, KMeans_object.labels_))
# Calculate the completeness score.
print('Completeness Score is:', metrics.completeness_score(outcomes, KMeans_object.labels_))

# Calculate the V-measure score.
print('V-Measure Score is:', metrics.v_measure_score(outcomes, KMeans_object.labels_))

# Calculate the adjusted Rand score.
print('Adjusted Rand Score is:', metrics.adjusted_rand_score(outcomes, KMeans_object.labels_))

# Calculate the adjusted mutual information score.
print('Adjusted Mututal Info Score is:', metrics.adjusted_mutual_info_score(outcomes, KMeans_object.labels_))

# Calculate the silhouette score. The dataset is not sampled to calculate it.
# NOTE(review): this uses `predictors2` while the model was fitted on
# `predictors` -- confirm both hold the same rows.
print('Silhoutte Score is:', metrics.silhouette_score(predictors2, KMeans_object.labels_, metric='euclidean'))
Example #38
0
def predict_and_cluster(opts, mode):
    """Vectorize the tokenized news corpus, cluster it with K-Means, report.

    The corpus is turned into a TF-IDF (or hashed) matrix, optionally reduced
    with LSA, clustered, and scored against the hard-coded reference labels.
    Afterwards every article gets its reference category and its cosine
    similarity row, a fully connected weighted similarity graph is built, and
    ``bestpart`` is called on it.

    Args:
        opts: Options object; the attributes read here are ``use_hashing``,
            ``use_idf``, ``n_features`` and ``n_components``.
        mode: Not used by this function.
    """
    # Reference categories, indexed in parallel with `news` below.
    labels = array([0,1,2,1,1,2,2,1,2,0,0,0,1,1,2,1,1,1,1,1,1,1,1,2,1])
    true_k = np.unique(labels).shape[0]

    corpus, news = jieba_tokenizer()

    print("Extracting features from the training dataset using a sparse vectorizer")
    t0 = time()
    if opts.use_hashing:
        if opts.use_idf:
            # Perform an IDF normalization on the output of HashingVectorizer
            hasher = HashingVectorizer(n_features=opts.n_features,
                                       stop_words='english', non_negative=True,
                                       norm=None, binary=False)
            vectorizer = make_pipeline(hasher, TfidfTransformer())
        else:
            vectorizer = HashingVectorizer(n_features=opts.n_features,
                                           stop_words='english',
                                           non_negative=False, norm='l2',
                                           binary=False)
    else:
        vectorizer = TfidfVectorizer(max_df=0.5, max_features=opts.n_features,
                                     min_df=2, stop_words='english',
                                     use_idf=opts.use_idf)
    X = vectorizer.fit_transform(corpus)

    print("done in %fs" % (time() - t0))
    # n_samples: how many articles are there
    # n_features: how many different words in all articles are there
    print("n_samples: %d, n_features: %d" % X.shape)
    print()

    if opts.n_components:
        print("Performing dimensionality reduction using LSA")
        t0 = time()
        # Vectorizer results are normalized, which makes KMeans behave as
        # spherical k-means for better results. Since LSA/SVD results are
        # not normalized, we have to redo the normalization.
        svd = TruncatedSVD(opts.n_components)
        lsa = make_pipeline(svd, Normalizer(copy=False))

        X = lsa.fit_transform(X)

        print("done in %fs" % (time() - t0))

        # A second, default-sized SVD on the reduced data is used only to
        # report how much variance its first component explains.
        svd = TruncatedSVD().fit(X)
        X_proj = svd.transform(X)
        explained_variances = np.var(X_proj, axis=0) / np.var(X, axis=0).sum()

        print("Explained variance of the SVD step: {}%".format(
            int(explained_variances[0] * 100)))

        print()

    # =================================================
    # clustering

    # if opts.minibatch:
    #     km = MiniBatchKMeans(n_clusters=true_k, init='k-means++', n_init=1,
    #                          init_size=1000, batch_size=1000, verbose=True)
    # else:
    km = KMeans(n_clusters=true_k, init='k-means++', max_iter=100, n_init=1,
                verbose=True)  # always better

    print("Clustering sparse data with %s" % km)
    t0 = time()
    km.fit(X)
    print("done in %0.3fs" % (time() - t0))
    print()

    print("Homogeneity: %0.3f" % metrics.homogeneity_score(labels, km.labels_))
    print("Completeness: %0.3f" % metrics.completeness_score(labels, km.labels_))
    print("V-measure: %0.3f" % metrics.v_measure_score(labels, km.labels_))
    print("Adjusted Rand-Index: %.3f"
          % metrics.adjusted_rand_score(labels, km.labels_))

    # The silhouette describes the clustering that was just fitted, so it is
    # computed on the predicted labels, like the other metrics' second
    # argument, not on the reference labels.
    print("Silhouette Coefficient: %0.3f"
          % metrics.silhouette_score(X, km.labels_, sample_size=None))

    print()

    if not (opts.n_components or opts.use_hashing):
        print("Top terms per cluster:")
        order_centroids = km.cluster_centers_.argsort()[:, ::-1]
        terms = vectorizer.get_feature_names()
        for i in range(true_k):
            print("Cluster %d:" % i, end='')
            for ind in order_centroids[i, :10]:
                print(' %s' % terms[ind], end='')
            print()

    for i, article in enumerate(news):
        article.category = labels[i]

    from sklearn.metrics.pairwise import cosine_similarity

    FG = nx.Graph()

    # Connect every pair of distinct articles, weighted by cosine similarity.
    n_news = len(news)
    for i, article in enumerate(news):
        article.similarity = cosine_similarity(X[i:i+1], X)[0]
        cs = article.similarity
        FG.add_weighted_edges_from(
            [(i, j, cs[j]) for j in range(n_news) if i != j])

    # for i in range(len(news)):
    #     print(news[i].number, news[i].title, news[i].time, news[i].category, news[i].url, news[i].similarity)

    bestpart(FG, labels, km.labels_)
Example #39
0
    # Subplot titles (runtime strings, Chinese). In order: original data,
    # KMeans++ clustering, rotated data, KMeans++ clustering after rotation,
    # unequal-variance data, KMeans++ clustering of it, unequal-size data,
    # KMeans++ clustering of it.
    titles = u'原始数据', u'KMeans++聚类', u'旋转后数据', u'旋转后KMeans++聚类',\
             u'方差不相等数据', u'方差不相等KMeans++聚类', u'数量不相等数据', u'数量不相等KMeans++聚类'

    model = KMeans(n_clusters=4, init='k-means++', n_init=5)
    plt.figure(figsize=(9, 10), facecolor='w')
    # Subplots are numbered from 1: odd ones show the reference labels,
    # even ones show the K-Means prediction for the same data.
    for i, (x, y, title) in enumerate(zip(data_list, y_list, titles), start=1):
        plt.subplot(4, 2, i)
        plt.title(title)
        if i % 2 == 1:
            y_pred = y
        else:
            y_pred = model.fit_predict(x)
            # Bare attribute access; the value is not used.
            model.cluster_centers_

        # Python 2 print statements: subplot index, then the label-based
        # scores and the silhouette of y_pred on x.
        print i
        print 'Homogeneity:', homogeneity_score(y, y_pred)
        print 'completeness:', completeness_score(y, y_pred)
        print 'V measure:', v_measure_score(y, y_pred)
        print 'AMI:', adjusted_mutual_info_score(y, y_pred)
        print 'ARI:', adjusted_rand_score(y, y_pred)
        print 'Silhouette:', silhouette_score(x, y_pred), '\n'
        # Scatter the first two feature columns, coloured by y_pred.
        plt.scatter(x[:, 0],
                    x[:, 1],
                    c=y_pred,
                    s=30,
                    cmap=cm,
                    edgecolors='none')
        # Per-column data range, passed through expand() for each axis.
        x1_min, x2_min = np.min(x, axis=0)
        x1_max, x2_max = np.max(x, axis=0)
        x1_min, x1_max = expand(x1_min, x1_max)
        x2_min, x2_max = expand(x2_min, x2_max)
Example #40
0
# Print four label-based clustering scores for the predictions `y_pred`
# against the reference labels `y`. Each section prints a short heading and
# then the score; the commented-out lines are per-split variants.
# use: https://scikit-learn.org/stable/modules/clustering.html#clustering-performance-evaluation

# ADJUSTED RAND SCORE
# https://scikit-learn.org/stable/modules/clustering.html#adjusted-rand-index
print('--> adjusted rand score')
#print('adjusted rand score on training set: {}'.format(metrics.adjusted_rand_score(y_train, y_train_pred)))
#print('adjusted rand score on testing set: {}'.format(metrics.adjusted_rand_score(y_test, y_test_pred)))
print('adjusted rand score: {}'.format(metrics.adjusted_rand_score(y, y_pred)))

# HOMOGENEITY
# https://scikit-learn.org/stable/modules/clustering.html#homogeneity-completeness-and-v-measure
# homogeneity: each cluster contains only members of a single class.
print('--> homogeneity: each cluster contains only members of a single class.')
# print('homogeneity score on training set: {}'.format(metrics.homogeneity_score(y_train, y_train_pred)))
# print('homogeneity score on testing set: {}'.format(metrics.homogeneity_score(y_test, y_test_pred)))
print('homogeneity score: {}'.format(metrics.homogeneity_score(y, y_pred)))

# COMPLETENESS
# https://scikit-learn.org/stable/modules/clustering.html#homogeneity-completeness-and-v-measure
# completeness: all members of a given class are assigned to the same cluster.
print('--> completeness: all members of a given class are assigned to the same cluster.')
# print('completeness score on training set: {}'.format(metrics.completeness_score(y_train, y_train_pred)))
# print('completeness score on testing set: {}'.format(metrics.completeness_score(y_test, y_test_pred)))
print('completeness score: {}'.format(metrics.completeness_score(y, y_pred)))

# FOWLKES MALLOWS SCORES
# https://scikit-learn.org/stable/modules/clustering.html#fowlkes-mallows-scores
print('--> fowlkes mallows score: The Fowlkes-Mallows score FMI is defined as the geometric mean of the pairwise precision and recall.')
# print('fowlkes mallows score on training set: {}'.format(metrics.fowlkes_mallows_score(y_train, y_train_pred)))
# print('fowlkes mallows  score on testing set: {}'.format(metrics.fowlkes_mallows_score(y_test, y_test_pred)))
print('fowlkes mallows score: {}'.format(metrics.fowlkes_mallows_score(y, y_pred)))
Example #41
0
# -*- coding: utf-8 -*-
"""
Created on Wed May  2 21:53:10 2018
meanshift with iris data
@author: shifuddin
"""
from load_data import load_csv
from sklearn.cluster import MeanShift, estimate_bandwidth
from sklearn.metrics import homogeneity_score
# Load X, y from uri
uri = 'https://archive.ics.uci.edu/ml/machine-learning-databases/spect/SPECTF.test'
X, y = load_csv(uri, ',', 1, 45, 0, 1, True)

# Calculate bandwidth / radius of each cluster centroid from data
bandwidth = estimate_bandwidth(X, quantile=.1, n_samples=100)

# Fit mean shift with the estimated bandwidth.
ms = MeanShift(bandwidth=bandwidth, bin_seeding=True)
ms.fit(X)

# Cluster assignment of every sample and the cluster centres.
labels, centroids = ms.labels_, ms.cluster_centers_

# Homogeneity of the clusters with respect to the flattened reference labels.
homogeneity = homogeneity_score(y.ravel(), labels)
Example #42
0
def show_db(label_data,
            db,
            corpus,
            corpus_embeddings,
            corpus_file=None,
            label_file=None):
    """Print and plot the result of a fitted density-based clustering.

    Sentences are grouped by cluster and printed, followed by the cluster and
    noise counts and six quality metrics. Finally the embeddings are
    scattered, coloured by cluster, with core samples drawn larger and noise
    drawn in black.

    Args:
        label_data: Object whose ``flag`` attribute holds the true labels
            (replaced by the contents of ``label_file`` when that is given).
        db: Fitted clusterer; ``labels_`` and ``core_sample_indices_`` are
            read. The label -1 is treated as noise.
        corpus: Sentences, indexed in parallel with ``db.labels_`` (replaced
            by the ``content`` column of ``corpus_file`` when that is given).
        corpus_embeddings: Array of sample embeddings; columns 0 and 1 are
            plotted.
        corpus_file: Optional CSV path with a ``content`` column.
        label_file: Optional CSV path with a ``flag`` column.
    """
    if label_file is not None:
        # Read the ground-truth label data.
        label_data = pd.read_csv(label_file)
    labels_true = label_data.flag.to_list()
    labels = db.labels_
    n_clusters_ = len(set(labels)) - (1 if -1 in labels else 0)
    n_noise_ = list(labels).count(-1)

    if corpus_file is not None:
        # Read the original text.
        corpus = pd.read_csv(corpus_file).content.to_list()

    clustered_sentences = [[] for _ in range(n_clusters_)]
    for sentence_id, cluster_id in enumerate(labels):
        if cluster_id == -1:
            # Noise belongs to no cluster; indexing with -1 would silently
            # file it under the last cluster.
            continue
        clustered_sentences[cluster_id].append(corpus[sentence_id])

    for i, cluster in enumerate(clustered_sentences):
        print("Cluster ", i + 1)
        print(cluster)
        print("")

    print('Estimated number of clusters: %d' % n_clusters_)
    print('Estimated number of noise points: %d' % n_noise_)
    print("Homogeneity: %0.3f" %
          metrics.homogeneity_score(labels_true, labels))
    print("Completeness: %0.3f" %
          metrics.completeness_score(labels_true, labels))
    print("V-measure: %0.3f" % metrics.v_measure_score(labels_true, labels))
    print("Adjusted Rand Index: %0.3f" %
          metrics.adjusted_rand_score(labels_true, labels))
    print("Adjusted Mutual Information: %0.3f" %
          metrics.adjusted_mutual_info_score(labels_true, labels))
    print("Silhouette Coefficient: %0.3f" %
          metrics.silhouette_score(corpus_embeddings, labels))

    # #############################################################################
    # Plot result
    import matplotlib.pyplot as plt

    core_samples_mask = np.zeros_like(db.labels_, dtype=bool)
    core_samples_mask[db.core_sample_indices_] = True
    # Black removed and is used for noise instead.
    unique_labels = set(labels)
    colors = [
        plt.cm.Spectral(each) for each in np.linspace(0, 1, len(unique_labels))
    ]
    for k, col in zip(unique_labels, colors):
        if k == -1:
            # Black used for noise.
            col = [0, 0, 0, 1]

        class_member_mask = (labels == k)

        # Core samples of this cluster: large markers.
        xy = corpus_embeddings[class_member_mask & core_samples_mask]
        plt.plot(xy[:, 0],
                 xy[:, 1],
                 'o',
                 markerfacecolor=tuple(col),
                 markeredgecolor='k',
                 markersize=14)

        # Remaining (non-core) samples of this cluster: small markers.
        xy = corpus_embeddings[class_member_mask & ~core_samples_mask]
        plt.plot(xy[:, 0],
                 xy[:, 1],
                 'o',
                 markerfacecolor=tuple(col),
                 markeredgecolor='k',
                 markersize=6)

    plt.title('Estimated number of clusters: %d' % n_clusters_)
    plt.show()
Example #43
0
def main():
    """Cluster the stemmed corpus with LSA + K-Means and report the quality.

    Every file in the corpus directory is read as tab-separated
    ``word<TAB>weight`` lines. A term-document matrix is built, decomposed
    with SVD, reduced by ``dimensionality_reduction`` and clustered. The
    part of each file name before the first ``-`` is used as the true class.
    """
    # initialization
    retval = os.getcwd()
    corpus_path = retval + "/../vector_model_w_stem/corpus3/"
    word_set = {}
    term_list = []
    num_dimensionality = 25
    num_clusters = 5

    # get word list for each document
    for file_name in os.listdir(corpus_path):
        vector = {}
        # The context manager closes the file even if a line fails to parse.
        with open(corpus_path + file_name) as corpus_file:
            for line in corpus_file.read().split("\n"):
                fields = line.split("\t")
                word = fields[0]
                if word:
                    vector[word] = float(fields[1])
                    term_list.append(word)
        word_set[file_name] = vector

    # get term list (vocabulary) based on documents
    print("------------------ Parameters Detail ---------------------")
    print("The length of total word list: " + str(len(term_list)))
    # remove duplicate
    term_list = list(set(term_list))
    print("Remove duplicate the length of vocabulary: " + str(len(term_list)))

    # generate term-document matrix and document list
    term_document_matrix = []
    doc_list = []
    ground_truth = []
    for doc_name, vector in word_set.items():
        ground_truth.append(doc_name.split("-")[0])
        # Terms missing from this document get weight 0.
        term_document_matrix.append(
            [vector.get(word_voc, 0) for word_voc in term_list])
        doc_list.append(doc_name)
    print("The number of document: " + str(len(doc_list)))
    print("The number of clusters: " + str(num_clusters))
    print("The dimensionality: " + str(num_dimensionality))

    # Rows were built per document; transpose to terms x documents.
    term_document_matrix = np.array(term_document_matrix).transpose()
    # SVD
    U, Sigma, V = np.linalg.svd(term_document_matrix)

    # dimensionality reduction for each document
    doc_matrix_reduced = dimensionality_reduction(Sigma, V, num_dimensionality)

    # k-means clustering
    km = KMeans(n_clusters=num_clusters)
    km.fit(doc_matrix_reduced.transpose())

    # evaluate the quality of k-means clustering
    print("------------------ Clustering Result ---------------------")
    # List the documents grouped by cluster id.
    for label in range(num_clusters):
        for doc_name, doc_label in zip(doc_list, km.labels_):
            if doc_label == label:
                print(doc_name + "\t" + str(doc_label))

    print("------------------- Evaluation Score ---------------------")
    print("Homogeneity: %0.3f" %
          metrics.homogeneity_score(ground_truth, km.labels_))
    print("Completeness: %0.3f" %
          metrics.completeness_score(ground_truth, km.labels_))
    print("V-measure: %0.3f" %
          metrics.v_measure_score(ground_truth, km.labels_))
    print("Adjusted Rand-Index: %.3f" %
          metrics.adjusted_rand_score(ground_truth, km.labels_))

    # visualization the result of clustering
    visualization_clustering(doc_matrix_reduced, ground_truth)
    print("----------------------- All Set -------------------------")
def ClusterByHDbScan(listtuple_pred_true_text, avgItemsInCluster_in_a_batch):
    """Re-cluster a batch of texts with HDBSCAN / DBSCAN and print the scores.

    The texts are vectorised with TF-IDF and clustered three times: HDBSCAN
    on the raw TF-IDF matrix, HDBSCAN on a 2-component normalised LSA
    projection, and DBSCAN on that same projection. For each clustering the
    number of clusters found and the agreement with the true labels are
    printed. Nothing is returned.

    Args:
        listtuple_pred_true_text: sequence of items indexed as
            ``item[0]`` (predicted label), ``item[1]`` (true label) and
            ``item[2]`` (raw text).
        avgItemsInCluster_in_a_batch: only echoed in the log output; see the
            note on ``min_cluster_size_in_a_batch`` below.
    """
    print("\nClusterByHDbScan")
    printClusterEvaluation_list(listtuple_pred_true_text)
    print(len(listtuple_pred_true_text), avgItemsInCluster_in_a_batch)

    # presumably the boolean selects grouping by true (True) or predicted
    # (False) label -- verify against groupTxtByClass.
    dic_tupple_class_predicted = groupTxtByClass(listtuple_pred_true_text,
                                                 False)
    numberOfClusters_predicted = len(dic_tupple_class_predicted)

    dic_tupple_class_true = groupTxtByClass(listtuple_pred_true_text, True)
    numberOfClusters_true = len(dic_tupple_class_true)

    print("numberOfClusters_true=" + str(numberOfClusters_true) +
          ", numberOfClusters_predicted=" + str(numberOfClusters_predicted))

    train_trueLabels = [item[1] for item in listtuple_pred_true_text]
    train_data = [item[2] for item in listtuple_pred_true_text]

    vectorizer = TfidfVectorizer(max_df=1.0,
                                 min_df=1,
                                 stop_words='english',
                                 use_idf=True,
                                 smooth_idf=True,
                                 norm='l2')
    X = vectorizer.fit_transform(train_data)

    # LSA: truncated SVD to 2 components followed by L2 normalisation.
    lsa = make_pipeline(TruncatedSVD(2), Normalizer(copy=False))
    X_svd = lsa.fit_transform(X)

    # NOTE(review): the original derived this from
    # ceil(avgItemsInCluster_in_a_batch) and then immediately overwrote it
    # with 2, so 2 has always been the effective value. Kept as-is; confirm
    # whether the batch average was meant to be used.
    min_cluster_size_in_a_batch = 2

    clusterer = hdbscan.HDBSCAN(min_cluster_size=min_cluster_size_in_a_batch)
    clusterer.fit(X)
    _print_clustering_scores("X-total-clusters=", train_trueLabels,
                             clusterer.labels_)

    clusterer_svd = hdbscan.HDBSCAN(
        min_cluster_size=min_cluster_size_in_a_batch)
    clusterer_svd.fit(X_svd)

    db = DBSCAN().fit(X_svd)

    _print_clustering_scores("X-svd-total-clusters=", train_trueLabels,
                             clusterer_svd.labels_)
    _print_clustering_scores("X-svd-dbscan-total-clusters=", train_trueLabels,
                             db.labels_)


def _print_clustering_scores(count_prefix, true_labels, pred_labels):
    """Print the cluster count and label-agreement scores of one clustering.

    The count is the number of distinct labels excluding -1, the label that
    HDBSCAN and DBSCAN give to noise points. (The highest label index, which
    was printed before, is one less than the count and is -1 when every
    point is noise.)
    """
    found = {int(label) for label in pred_labels}
    found.discard(-1)
    print(count_prefix + str(len(found)))
    print("Homogeneity: %0.4f" %
          metrics.homogeneity_score(true_labels, pred_labels))
    print("Completeness: %0.4f" %
          metrics.completeness_score(true_labels, pred_labels))
    print("V-measure: %0.4f" %
          metrics.v_measure_score(true_labels, pred_labels))
    print("Adjusted Rand-Index: %.4f" %
          metrics.adjusted_rand_score(true_labels, pred_labels))
    print("nmi_score-whole-data:   %0.4f" %
          metrics.normalized_mutual_info_score(
              true_labels, pred_labels, average_method='arithmetic'))
Example #45
0
            print float(clus.loc[clus.iloc[:,
                                           -1] == 1].shape[0]) / clus.shape[0]

# Score lists, one entry per tested number of mixture components.
# (`homo` had been mangled to `h**o`, which is not a valid assignment target.)
homo = []
comp = []
v_mea = []
sil = []
man = []
numPoints = 8
for i in range(2, numPoints):
    # Project the data to 6 dimensions, then fit a diagonal-covariance
    # mixture with i components on the projection.
    rp = SparseRandomProjection(n_components=6)
    projected_data = rp.fit_transform(X)
    # NOTE(review): mixture.GMM only exists in old scikit-learn releases
    # (GaussianMixture replaced it) -- confirm the targeted version.
    gm = mixture.GMM(n_components=i, covariance_type='diag')
    gm.fit(projected_data)
    # Predict once and reuse the labels for every metric below.
    y_pred = gm.predict(projected_data)
    homo.append(metrics.homogeneity_score(y, y_pred))
    comp.append(metrics.completeness_score(y, y_pred))
    v_mea.append(metrics.v_measure_score(y, y_pred))
    sil.append(
        metrics.silhouette_score(projected_data, y_pred, metric='euclidean'))
    man.append(
        metrics.silhouette_score(projected_data, y_pred, metric='manhattan'))

# `range` (not the Python-2-only `xrange`) keeps this runnable on both
# major versions; list() gives matplotlib a plain sequence.
x = list(range(2, numPoints))
fig = plt.figure()
plt.plot(x, homo, label='homogeneity score')
plt.plot(x, comp, label='completeness score')
Example #46
0
from sklearn import metrics
# Toy labelings for trying out the label-agreement metrics.
labels_true = [0, 0, 0, 1, 1, 1]
labels_pred = [0, 0, 1, 1, 2, 2]
# labels_pred = [1, 1, 0, 0, 3, 3]
# The assignment below overrides the candidates above, so the scores printed
# are those of a prediction identical to the ground truth.
labels_pred = labels_true[:]
# metrics.adjusted_rand_score(labels_true, labels_pred)
# print() with a single argument behaves the same on Python 2 and Python 3,
# unlike the former Python-2-only print statements.
print(metrics.adjusted_rand_score(labels_true, labels_pred))
print(metrics.adjusted_mutual_info_score(labels_true, labels_pred))
print(metrics.homogeneity_score(labels_true, labels_pred))
print(metrics.completeness_score(labels_true, labels_pred))


Example #47
0
def analyze_k_means(estimator, name, data):
    """Fit *estimator* on *data* and print one row of clustering scores.

    The row holds, in order: *name*, the fit time in seconds, the
    estimator's inertia, homogeneity, completeness, V-measure, adjusted Rand
    index, adjusted mutual information and the silhouette coefficient.

    Reads the module-level names ``labels`` (reference labels) and
    ``samples`` (silhouette ``sample_size``). Returns nothing.
    """
    t0 = time()
    estimator.fit(data)
    # Take the timing before any metric is computed so it covers fit() only.
    fit_seconds = time() - t0

    row = (
        name,
        fit_seconds,
        estimator.inertia_,
        metrics.homogeneity_score(labels, estimator.labels_),
        metrics.completeness_score(labels, estimator.labels_),
        metrics.v_measure_score(labels, estimator.labels_),
        metrics.adjusted_rand_score(labels, estimator.labels_),
        metrics.adjusted_mutual_info_score(labels, estimator.labels_),
        metrics.silhouette_score(data,
                                 estimator.labels_,
                                 metric='euclidean',
                                 sample_size=samples),
    )
    print(" %9s %.2fs %i %.3f %.3f %.3f %.3f %.3f %.3f" % row)
Example #48
0
# Build a contrast-enhanced copy of the training images: each row is cast to
# uint8 and passed through cv2.equalizeHist.
X_contrast = np.zeros(np.shape(X_train))
for i in range(len(X_contrast)):
    image = X_train[i, :].astype(np.uint8)
    X_contrast[i] = cv2.equalizeHist(image).reshape(1, NUMBER_OF_PIXELS)

# Normalise both versions of the training data the same way.
X_contrast = X_contrast.astype('float32') / MAX_BRIGHTNESS - MEAN
X_train = X_train.astype('float32') / MAX_BRIGHTNESS - MEAN

# 19 clusters, as there are 19 letters left in the data.
kmeans = KMeans(init="k-means++", n_clusters=19, n_init=4)

# Fit on the full contrast-enhanced training set, then label the same data.
kmeans_full = kmeans.fit(X_contrast)
labels = kmeans.predict(X_contrast)

# Report how many iterations the fit needed.
print('Number of iterations Full Kmeans train data {}'.format(
    kmeans_full.n_iter_))

# Report agreement between the cluster labels and the training targets.
for caption, scorer in (
        ('Homogeneity Score Full Train Dataset: {}', homogeneity_score),
        ('Completeness Score Full Train Dataset: {}', completeness_score),
        ('V-score Score Full train Dataset: {}', v_measure_score),
):
    print(caption.format(scorer(y_train, labels)))
Example #49
0
def Evaluate_old(listtuple_pred_true_text, ignoreMinusOne=False):
    """Print clustering-quality statistics for (pred, true, text) items.

    Args:
        listtuple_pred_true_text: iterable of items indexed as ``item[0]``
            (predicted label), ``item[1]`` (true label) and ``item[2]``
            (the text; only its ``len()`` is used here).
        ignoreMinusOne: when ``True``, items whose true label stringifies to
            ``'-1'`` are left out of every statistic.
    """

    preds = []
    trues = []

    # Filtered copy of the input, used for the grouping and size statistics.
    new_listtuple_pred_true_text = []

    totalwords = 0

    for pred_true_text in listtuple_pred_true_text:
        if str(pred_true_text[1]) == '-1' and ignoreMinusOne == True:
            continue

        preds.append(pred_true_text[0])
        trues.append(pred_true_text[1])
        new_listtuple_pred_true_text.append(
            [pred_true_text[0], pred_true_text[1], pred_true_text[2]])

        # NOTE(review): this counts words only if item[2] is a token list;
        # for a plain string it counts characters -- confirm with callers.
        totalwords += len(pred_true_text[2])
        #print(pred_true_text[2], totalwords)

    print("evaluate total texts=" + str(len(new_listtuple_pred_true_text)))

    # Label-agreement scores over the kept items.
    score = metrics.homogeneity_score(trues, preds)
    print("homogeneity_score-whole-data:   %0.8f" % score)

    score = metrics.completeness_score(trues, preds)
    print("completeness_score-whole-data:   %0.8f" % score)

    score = metrics.v_measure_score(trues, preds)
    print("v_measure_score-whole-data:   %0.8f" % score)

    score = metrics.normalized_mutual_info_score(trues,
                                                 preds,
                                                 average_method='arithmetic')
    print("nmi_score-whole-data:   %0.8f" % score)

    #score=metrics.adjusted_mutual_info_score(trues, preds)
    #print ("adjusted_mutual_info_score-whole-data:   %0.4f" % score)

    #score=metrics.adjusted_rand_score(trues, preds)
    #print ("adjusted_rand_score-whole-data:   %0.4f" % score)

    # Group the kept items by index 0 (predicted label) and by index 1
    # (true label); presumably each call returns a dict of key -> items.
    dic_tupple_class = groupItemsBySingleKeyIndex(new_listtuple_pred_true_text,
                                                  0)  #before 0
    dic_tupple_class_true = groupItemsBySingleKeyIndex(
        new_listtuple_pred_true_text, 1)  #before 1
    print("pred clusters=" + str(len(dic_tupple_class)) + ", true clusters=" +
          str(len(dic_tupple_class_true)))
    ComputePurity(dic_tupple_class)
    # Sizes of the true-label groups (only entries whose value is a list).
    li = [
        len(dic_tupple_class_true[x]) for x in dic_tupple_class_true
        if isinstance(dic_tupple_class_true[x], list)
    ]
    # NOTE(review): min()/max() need at least one group and
    # statistics.stdev needs at least two, otherwise these lines raise.
    print('min', min(li), 'max', max(li), 'median', statistics.median(li),
          'avg', statistics.mean(li), 'std', statistics.stdev(li), 'sum of li',
          sum(li))
    # Divides by the number of kept items, so an empty selection raises
    # ZeroDivisionError here.
    print('avg words per text', totalwords / len(new_listtuple_pred_true_text),
          'totalwords', totalwords, '#texts',
          len(new_listtuple_pred_true_text))
    '''print("---Pred distribution")
Example #50
0
###################################################################

# Candidate cluster / component counts shared by both sweeps below.
Kclusters = range(2, 50, 2)

# --- k-means sweep: one entry per k in each list ---
km_sil_scores, km_homo_scores = [], []
km_inertia_scores, km_fitness_times = [], []

for k in Kclusters:
    t1 = time.time()
    km = KMeans(
        n_clusters=k,
        n_init=10,
        random_state=100,
        n_jobs=-1,
    ).fit(X1)
    t2 = time.time()

    # Wall-clock fit time, then the quality scores for this k.
    km_fitness_times.append(t2 - t1)
    km_sil_scores.append(silhouette_score(X1, km.labels_))
    km_homo_scores.append(homogeneity_score(Y1, km.labels_))
    km_inertia_scores.append(km.inertia_)

# --- Gaussian-mixture (EM) sweep over the same counts ---
em_sil_scores, em_homo_scores = [], []
em_aic_scores, em_bic_scores = [], []
em_fitness_times = []

for k in Kclusters:
    t1 = time.time()
    em = GaussianMixture(
        n_components=k,
        covariance_type='diag',
        n_init=1,
        warm_start=True,
        random_state=100,
    ).fit(X1)
    t2 = time.time()

    em_fitness_times.append(t2 - t1)
Example #51
0
    6: "total sulfur dioxide (mg/dm^3)",
    7: "density(g/cm^3)",
    8: "pH",
    9: "sulphates (g/dm^3)",
    10: "alcohol (vol.%)",
    11: "quality"
}
# Apply the readable column names, then drop the target column so that
# clustering only sees the remaining features.
wine_2 = wine_2.rename(columns=mapping_2).drop(['quality'], axis=1)

kmeans = KMeans(n_clusters=7, random_state=0).fit(wine_2)
print("For 7 clusters, comparing them to the wine quality:")
print("Silhouette score", metrics.silhouette_score(wine_2, kmeans.labels_))
print("Completeness score",
      metrics.completeness_score(wine["quality"], kmeans.labels_))
print("Homogeneity score",
      metrics.homogeneity_score(wine["quality"], kmeans.labels_))

# Try several cluster counts; collect (silhouette score, cluster count).
store = []
for i in range(3, 10):
    kmeans = KMeans(n_clusters=i, random_state=0).fit(wine_2)
    store.append((metrics.silhouette_score(wine_2, kmeans.labels_), i))

# Scatter the silhouette score (y) against the cluster count (x).
cluster_counts = [entry[1] for entry in store]
silhouette_values = [entry[0] for entry in store]
plt.scatter(cluster_counts, silhouette_values)

plt.xlabel("Clusters")
plt.ylabel("Silhouette score")
plt.savefig("clusters.png")
plt.close()

#graphs showing the groupings obtained
Example #52
0
                         batch_size=1000,
                         verbose=opts.verbose)
else:
    km = KMeans(n_clusters=true_k,
                init='k-means++',
                max_iter=100,
                n_init=1,
                verbose=opts.verbose)

# Fit the selected k-means estimator on X and report how long it took.
print("Clustering sparse data with %s" % km)
t0 = time()
km.fit(X)
print("done in %0.3fs" % (time() - t0))
print()

# Agreement between the reference `labels` and the fitted cluster labels.
print("Homogeneity: %0.3f" % metrics.homogeneity_score(labels, km.labels_))
print("Completeness: %0.3f" % metrics.completeness_score(labels, km.labels_))
print("V-measure: %0.3f" % metrics.v_measure_score(labels, km.labels_))
print("Adjusted Rand-Index: %.3f" %
      metrics.adjusted_rand_score(labels, km.labels_))
# The silhouette coefficient is computed on X itself, using sample_size=1000.
print("Silhouette Coefficient: %0.3f" %
      metrics.silhouette_score(X, km.labels_, sample_size=1000))

print()

if not opts.use_hashing:
    print("Top terms per cluster:")

    if opts.n_components:
        original_space_centroids = svd.inverse_transform(km.cluster_centers_)
        order_centroids = original_space_centroids.argsort()[:, ::-1]
Example #53
0
        np.sum(evidenceList[allClusters[i] == lowClusterNo[i]] == "N"))  # TN
    allResults[i][3] = float(
        np.sum(evidenceList[allClusters[i] == lowClusterNo[i]] == "Y"))  # FN

# Evaluating cluster validation scores
# Each valList element contains [Adjusted Rand, Mutual Info, Adjusted Mutual Info, Normalized Mutual Info, Homogeneity, Completeness, V Measure]
# One row per clustering in allClusters, scored against evidenceList.
# Row layout: [Adjusted Rand, Mutual Info, Adjusted Mutual Info,
#              Normalized Mutual Info, Homogeneity, Completeness, V Measure]
valList = []
for i in range(0, len(allClusters)):
    clustering = allClusters[i]
    valItem = [
        metrics.adjusted_rand_score(evidenceList, clustering),
        metrics.mutual_info_score(evidenceList, clustering),
        metrics.adjusted_mutual_info_score(evidenceList, clustering),
        metrics.normalized_mutual_info_score(evidenceList, clustering),
        metrics.homogeneity_score(evidenceList, clustering),
        metrics.completeness_score(evidenceList, clustering),
        metrics.v_measure_score(evidenceList, clustering),
    ]
    valList.append(valItem)

# Writing results
statFile = open(
    outputLocationStat + inputFileName.split("/")[-1].split(".")[0] + "_" +
    algorithm + "_" + method + "_" + distance + "_" + str(noClust) + ".stat",
    "w")
for i in range(0, len(allResults)):
    statFile.write("# " + allLabels[i] + "\n")
    tp = allResults[i][0]
    fp = allResults[i][1]
    tn = allResults[i][2]
    fn = allResults[i][3]
     #for each k, calculate the silhouette_coefficient by using: silhouette_score(X_training, kmeans.labels_)
     #find which k maximizes the silhouette_coefficient
     silhouette_coeff = silhouette_score(X_training, kmeans.labels_)
     silhouette_scores[k] = (silhouette_coeff)
     

# Plot the silhouette coefficient for each k tried by k-means so the best k
# can be read off the curve.
k_value = list(range(2, 21))
# list(...) turns the dict views into plain sequences, so the call behaves
# the same on Python 2 and Python 3.
plt.plot(list(silhouette_scores.keys()), list(silhouette_scores.values()))
# plt.show()
# Single-entry dict {k: score} holding the k with the highest silhouette.
best_k = dict(sorted(silhouette_scores.items(), key=itemgetter(1), reverse=True)[:1])

# Read the validation data (cluster labels) with pandas.
df1 = pd.read_csv('testing.csv', header=None)

# Flatten the loaded values into a 1-D label vector.
labels = np.array(df1.values).reshape(1, -1)[0]

# Homogeneity of the k-means clustering.
# NOTE(review): `kmeans` is whatever model the sweep above left bound,
# which is not necessarily the one fitted with best_k -- confirm.
print("K-Means Homogeneity Score = " + str(metrics.homogeneity_score(labels, kmeans.labels_)))

# Run agglomerative clustering with the best k found by k-means.
# next(iter(best_k)) fetches the only key; `best_k.keys()[0]` fails on
# Python 3 because dict views cannot be indexed.
agg = AgglomerativeClustering(n_clusters=next(iter(best_k)), linkage='ward')
agg.fit(X_training)

# Homogeneity of the agglomerative clustering.
print("Agglomerative Clustering Homogeneity Score = " + str(metrics.homogeneity_score(labels, agg.labels_)))
# In[ ]:

# Take columns '0' and '1' of `data` as a plain array.
x = data[['0', '1']].values
# Bare expression: shows the array shape when run as a notebook cell and has
# no effect when run as a script.
x.shape

# In[ ]:

#printing results
print('labels:')
# print(labelsPred)

# tEnd = datetime.datetime.now()
# print('Time: ' + str(tEnd - tStart))
# Agreement between the reference labels `y` and the predicted `labelsPred`:
# homogeneity, completeness, V-measure, adjusted mutual information and
# adjusted Rand index.
print('Measures:')
print('HS: ' + str(metrics.homogeneity_score(y, labelsPred)))
print('CS: ' + str(metrics.completeness_score(y, labelsPred)))
print('VM: ' + str(metrics.v_measure_score(y, labelsPred)))
print('AMI: ' + str(metrics.adjusted_mutual_info_score(y, labelsPred)))
print('ARI: ' + str(metrics.adjusted_rand_score(y, labelsPred)))

# In[ ]:

import matplotlib.pyplot as plt
from itertools import cycle, islice

fig = plt.figure()
colors = np.array(
    list(
        islice(
            cycle([
Example #56
0
# Affinity propagation is run on a dense array here, so densify X first.
X = X.toarray()
#y = np.array(labels)

# print() calls replace the Python-2-only print statements so the script
# uses one form throughout; single-argument print() works on both versions.
print("Affinity Clustering...")
print(X)

##############################################################################
# Compute Affinity Propagation
af = AffinityPropagation().fit(X)
cluster_centers_indices = af.cluster_centers_indices_
labels = af.labels_

# One exemplar index per cluster, so the length is the cluster count.
n_clusters_ = len(cluster_centers_indices)

print('Estimated number of clusters: %d' % n_clusters_)
print("Homogeneity: %0.3f" % metrics.homogeneity_score(labels_true, labels))
print("Completeness: %0.3f" % metrics.completeness_score(labels_true, labels))
print("V-measure: %0.3f" % metrics.v_measure_score(labels_true, labels))
print("Adjusted Rand Index: %0.3f"
      % metrics.adjusted_rand_score(labels_true, labels))
print("Adjusted Mutual Information: %0.3f"
      % metrics.adjusted_mutual_info_score(labels_true, labels))
print("Silhouette Coefficient: %0.3f"
      % metrics.silhouette_score(X, labels, metric='sqeuclidean'))


##############################################################################
# Plot result
import pylab as pl
from itertools import cycle
Example #57
0
def calc_measures_avg(measures, n_imgs, ignore_classes, for_final_result):
    """Reduce accumulated per-image measures to their final values.

    Args:
        measures: mapping from ``Constants`` keys to accumulated values.
        n_imgs: number of images; divisor for the averaged measures.
        ignore_classes: forwarded unchanged to ``calc_iou``.
        for_final_result: when truthy, also evaluates detection AP and the
            clustering scores (AMI, homogeneity, completeness).

    Returns:
        dict mapping ``Constants`` keys to the reduced values. When both the
        cluster-id branch and the embedding branch run, the embedding branch
        writes last and so determines the clustering scores.
    """
    averaged = {}

    # Measures that are simply summed and divided by the image count.
    plain_average_keys = (
        Constants.ERRORS, Constants.IOU, Constants.BINARY_IOU,
        Constants.AP, Constants.MOTA, Constants.MOTP,
        Constants.AP_INTERPOLATED, Constants.FALSE_POSITIVES,
        Constants.FALSE_NEGATIVES, Constants.ID_SWITCHES,
    )
    for key in plain_average_keys:
        if key in measures:
            averaged[key] = numpy.sum(measures[key]) / n_imgs

    # TODO: This has to be added as IOU instead of conf matrix.
    if Constants.CONFUSION_MATRIX in measures:
        averaged[Constants.IOU] = calc_iou(measures, n_imgs, ignore_classes)

    if Constants.CLICKS in measures:
        # Each entry ends in ":<count>"; keep only the integer after the
        # last colon.
        clicks = [
            int(entry.rsplit(':', 1)[-1])
            for entry in measures[Constants.CLICKS]
        ]
        averaged[Constants.CLICKS] = float(numpy.sum(clicks)) / n_imgs

    if for_final_result and Constants.DETECTION_AP in measures:
        from object_detection.utils.object_detection_evaluation import ObjectDetectionEvaluation
        detection = measures[Constants.DETECTION_AP]
        if isinstance(detection, ObjectDetectionEvaluation):
            evaluator = detection
        else:
            # Otherwise the second-to-last element holds the class count;
            # build a fresh evaluator and feed it the stored values.
            evaluator = ObjectDetectionEvaluation(detection[-2],
                                                  matching_iou_threshold=0.5)
            evaluator.next_image_key = 0  # add a new field which we will use
            _add_aps(evaluator, detection)

        aps, mAP, _, _, _, _ = evaluator.evaluate()
        averaged[Constants.DETECTION_APS] = aps
        averaged[Constants.DETECTION_AP] = mAP

    if (for_final_result and Constants.CLUSTER_IDS in measures
            and Constants.ORIGINAL_LABELS in measures):
        from sklearn.metrics import adjusted_mutual_info_score, homogeneity_score, completeness_score
        labels_true = numpy.reshape(
            numpy.array(measures[Constants.ORIGINAL_LABELS],
                        dtype=numpy.int32), [-1])
        labels_pred = numpy.reshape(
            numpy.array(measures[Constants.CLUSTER_IDS], dtype=numpy.int32),
            [-1])
        averaged[Constants.ADJUSTED_MUTUAL_INFORMATION] = \
            adjusted_mutual_info_score(labels_true, labels_pred)
        averaged[Constants.HOMOGENEITY] = \
            homogeneity_score(labels_true, labels_pred)
        averaged[Constants.COMPLETENESS] = \
            completeness_score(labels_true, labels_pred)

    # Hard-coded switch for skipping the embedding-based evaluation below.
    NO_EVAL = False
    if (not NO_EVAL and for_final_result
            and Constants.ORIGINAL_LABELS in measures
            and Constants.EMBEDDING in measures):
        # `mixture` is only needed by a GMM alternative that is currently
        # disabled; the import is kept as it was.
        from sklearn import mixture
        from sklearn.cluster import KMeans
        from sklearn.metrics import adjusted_mutual_info_score, homogeneity_score, completeness_score
        import time

        # NOTE(review): the int32 cast truncates any fractional embedding
        # values -- confirm this is intended.
        embeddings = numpy.array(measures[Constants.EMBEDDING],
                                 dtype=numpy.int32)
        # Flatten to (n_points, embedding_dim).
        embeddings = numpy.reshape(embeddings, [-1, embeddings.shape[-1]])
        labels_true = numpy.reshape(
            numpy.array(measures[Constants.ORIGINAL_LABELS],
                        dtype=numpy.int32), [-1])

        # Other values that were tried: 80, 400, 1000.
        n_components = 3000

        start = time.time()
        kmeans = KMeans(n_clusters=n_components, n_jobs=-1)
        labels_pred = kmeans.fit_predict(embeddings)
        print("km took ", time.time() - start)

        averaged[Constants.ADJUSTED_MUTUAL_INFORMATION] = \
            adjusted_mutual_info_score(labels_true, labels_pred)
        averaged[Constants.HOMOGENEITY] = \
            homogeneity_score(labels_true, labels_pred)
        averaged[Constants.COMPLETENESS] = \
            completeness_score(labels_true, labels_pred)

    return averaged
Example #58
0
            # Shuffle the index list, then reorder `reads` and
            # `read_parent_id` together by sorting on the shuffled index so
            # both lists stay aligned.
            shuffle(shuffleind)
            zipper = sorted((zip(shuffleind, reads, read_parent_id)))
            z, reads, read_parent_id = (list(t) for t in zip(*zipper))

            #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
            #HOMOGENEITY AND COMPLETENESS FOR FULL DATASET WITH PREVIOUSLY CALCULATED GOOD_THRESH!

            #Find suitable threshold
            print("\n\nThreshold approximation:")
            # NOTE(review): the meaning of the four numeric arguments is
            # defined by find_threshold, which is not visible here.
            good_thresh = find_threshold(50, 15, 40, 1)
            print("\n\nFull clustering:")
            clus_N, cluster_sizes, read_labels = simplesim_cluster(good_thresh)


            print('\n\n~~~~~~~INFO~~~~~~~')
            # Score the cluster labels against the parent id of each read.
            hom = metrics.homogeneity_score(read_parent_id, read_labels)
            comp = metrics.completeness_score(read_parent_id, read_labels)
            print("Homogeneity: %0f" % hom)
            print("Completeness: %0f" % comp)
            homogeneity_lst.append(hom)
            completeness_lst.append(comp)

        print("\nDONE TRIPLICATES LOOP\n")

        # Store this round's per-repeat lists in the outer collections.
        cluster_eff_lst_lst.append(cluster_eff_lst)
        homogeneity_lst_lst.append(homogeneity_lst)
        completeness_lst_lst.append(completeness_lst)



#PANDAS DATAFRAME FROM THIS?!
                             max_features=10000,
                             min_df=2,
                             stop_words='english',
                             use_idf=True)
matrix = vectorizer.fit_transform(dataset.data)

print("n_samples: %d, n_features: %d" % matrix.shape)
print()

# Dimensionality reduction
print("Performing dimensionality reduction using LSA")
t0 = time()
svd = TruncatedSVD(2)  # number of output dimensions
normalizer = Normalizer(copy=False)
lsa = make_pipeline(svd, normalizer)

matrix_l = lsa.fit_transform(matrix)

# #############################################################################
# Do the actual clustering

# Fit a 50-component full-covariance Gaussian mixture on the reduced data
# and label every sample with its predicted component.
gmm = mixture.GaussianMixture(n_components=50, covariance_type='full')
gmm.fit(matrix_l)
labels = labels_pred = gmm.predict(matrix_l)

# Agreement between the reference labels and the predicted components.
print("Homogeneity: %0.3f" %
      metrics.homogeneity_score(labels_ture, labels_pred))
print("Completeness: %0.3f" %
      metrics.completeness_score(labels_ture, labels_pred))
print("NMI: %0.3f" %
      metrics.normalized_mutual_info_score(labels_ture,
                                           labels_pred,
                                           average_method='arithmetic'))
Example #60
0
ax[1].set_title('Actual Training Labels')

# Show the plots
plt.show()

# Evaluation of Clustering Model
# Import `metrics` from `sklearn`
from sklearn import metrics

# Print out the confusion matrix with `confusion_matrix()`
print(metrics.confusion_matrix(y_test, y_pred))

from sklearn.metrics import (homogeneity_score, completeness_score,
                             v_measure_score, adjusted_rand_score,
                             adjusted_mutual_info_score, silhouette_score)
# Header row for the score table; the "homo" column label had been mangled
# to "h**o".
print('% 9s' % 'inertia    homo   compl  v-meas     ARI AMI  silhouette')
# One row of scores, in the same order as the header columns.
print('%i   %.3f   %.3f   %.3f   %.3f   %.3f    %.3f' %
      (clf.inertia_,
       homogeneity_score(y_test, y_pred),
       completeness_score(y_test, y_pred),
       v_measure_score(y_test, y_pred),
       adjusted_rand_score(y_test, y_pred),
       adjusted_mutual_info_score(y_test, y_pred),
       silhouette_score(X_test, y_pred, metric='euclidean')))

# try out Support Vector Machines
# Import `train_test_split`. Current scikit-learn releases provide it in
# `sklearn.model_selection`; the old `sklearn.cross_validation` module no
# longer exists there, so it is used only as a fallback.
try:
    from sklearn.model_selection import train_test_split
except ImportError:
    from sklearn.cross_validation import train_test_split

# Split the data into training and test sets
X_train, X_test, y_train, y_test, images_train, images_test = train_test_split(
    digits.data, digits.target, digits.images, test_size=0.25, random_state=42)

# Import the `svm` model
from sklearn import svm