def kmeans(input_file, n_clusters, Output):
    lvltrace.lvltrace("LVLEntree dans kmeans unsupervised")
    ncol=tools.file_col_coma(input_file)
    data = np.loadtxt(input_file, delimiter=',', usecols=range(ncol-1))
    X = data[:,1:]
    y = data[:,0]
    sample_size, n_features = X.shape
    k_means=cluster.KMeans(init='k-means++', n_clusters=n_clusters, n_init=10)
    k_means.fit(X)
    reduced_data = k_means.transform(X)
    values = k_means.cluster_centers_.squeeze()
    labels = k_means.labels_
    k_means_cluster_centers = k_means.cluster_centers_
    print "#########################################################################################################\n"
    #print y
    #print labels
    print "K-MEANS\n"
    print('homogeneity_score: %f'%metrics.homogeneity_score(y, labels))
    print('completeness_score: %f'%metrics.completeness_score(y, labels))
    print('v_measure_score: %f'%metrics.v_measure_score(y, labels))
    print('adjusted_rand_score: %f'%metrics.adjusted_rand_score(y, labels))
    print('adjusted_mutual_info_score: %f'%metrics.adjusted_mutual_info_score(y,  labels))
    print('silhouette_score: %f'%metrics.silhouette_score(X, labels, metric='euclidean', sample_size=sample_size))
    print('\n')
    print "#########################################################################################################\n"
    results = Output+"kmeans_scores.txt"
    file = open(results, "w")
    file.write("K-Means Scores\n")
    file.write("Homogeneity Score: %f\n"%metrics.homogeneity_score(y, labels))
    file.write("Completeness Score: %f\n"%metrics.completeness_score(y, labels))
    file.write("V-Measure: %f\n"%metrics.v_measure_score(y, labels))
    file.write("The adjusted Rand index: %f\n"%metrics.adjusted_rand_score(y, labels))
    file.write("Adjusted Mutual Information: %f\n"%metrics.adjusted_mutual_info_score(y,  labels))
    file.write("Silhouette Score: %f\n"%metrics.silhouette_score(X, labels, metric='euclidean', sample_size=sample_size))
    file.write("\n")
    file.write("True Value, Cluster numbers, Iteration\n")
    for n in xrange(len(y)):
        file.write("%f, %f, %i\n"%(y[n],labels[n],(n+1)))
    file.close()
    import matplotlib.pyplot as plt
    # plot the results along with the labels
    k_means_cluster_centers = k_means.cluster_centers_
    fig, ax = plt.subplots()
    im=ax.scatter(X[:, 0], X[:, 1], c=labels, marker='.')
    for k in xrange(n_clusters):
        my_members = labels == k
        cluster_center = k_means_cluster_centers[k]
        ax.plot(cluster_center[0], cluster_center[1], 'w', color='b',
                marker='x', markersize=6)
    fig.colorbar(im)
    plt.title("Number of clusters: %i"%n_clusters)
    save = Output + "kmeans.png"
    plt.savefig(save)
    lvltrace.lvltrace("LVLsortie dans kmeans unsupervised")
Example #2
File: bfc.py Project: audy/bfc
def main():
    ''' docstring for main '''

    args = parse_args()

    setup_logging(verbose = args.verbose)

    records = consume_fasta(args.fasta_file)

    # setup Hasher, Vectorizer and Classifier

    hasher = HashingVectorizer(analyzer='char',
                               n_features = 2 ** 18,
                               ngram_range=(args.ngram_min, args.ngram_max),
                               )

    logging.info(hasher)

    encoder, classes = get_classes(records, args.tax_level)
    n_clusters = len(classes)

    logging.info('using taxonomic level %s' % args.tax_level)
    logging.info('Using %s clusters' % n_clusters)

    classifier = MiniBatchKMeans(n_clusters = n_clusters)

    records = records[0:args.n_iters]

    chunk_generator = iter_chunk(records, args.chunk_size, args.tax_level)

    logging.info('ngram range: [%s-%s]' % (args.ngram_min, args.ngram_max))

    for labels, features in chunk_generator:

        logging.info('transforming training chunk')
        labels = encoder.transform(labels)
        vectors = hasher.transform(features)

        logging.info('fitting training chunk')
        classifier.partial_fit(vectors)

        pred_labels = classifier.predict(vectors)

        score = v_measure_score(labels, pred_labels)
        shuffled_score = v_measure_score(labels, sample(pred_labels, len(pred_labels)))

        logging.info('score: %.2f' % (score))
        logging.info('shuffled score: %.2f' % (shuffled_score))
Example #3
def bench_k_means(estimator, name, data, target_labels, sample_size):
  """For benchmarking K-Means estimators. Prints different clustering metrics and train accuracy
  ARGS
    estimator: K-Means clustering algorithm <sklearn.cluster.KMeans>
    name: estimator name <str>
    data: array-like or sparse matrix, shape=(n_samples, n_features)
    target_labels: labels of data points <number array>
    sample_size: size of the sample to use when computing the Silhouette Coefficient <int>
  """ 
  t0 = time()
  estimator.fit(data)

  _, _, train_accuracy = compute_residuals_and_rsquared(estimator.labels_, target_labels)

  print('% 9s\t%.2fs\t%i\t%.3f\t%.3f\t%.3f\t%.3f\t%.3f\t%.3f\t%.3f'
        % (name, (time() - t0), estimator.inertia_,
           metrics.homogeneity_score(target_labels, estimator.labels_),
           metrics.completeness_score(target_labels, estimator.labels_),
           metrics.v_measure_score(target_labels, estimator.labels_),
           metrics.adjusted_rand_score(target_labels, estimator.labels_),
           metrics.adjusted_mutual_info_score(target_labels,  estimator.labels_),
           metrics.silhouette_score(data, estimator.labels_,metric='euclidean',sample_size=sample_size),
           train_accuracy
          )
        )
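For reference, a self-contained variant of this benchmarking pattern on scikit-learn's digits data; it drops the project-specific compute_residuals_and_rsquared helper and reports only the standard clustering metrics. The sketch and its names are illustrative, not part of the project above.

from time import time

from sklearn import cluster, datasets, metrics
from sklearn.preprocessing import scale


def bench_k_means_sketch(estimator, name, data, target_labels, sample_size=300):
  """Fits the estimator and prints the same metric row as above, minus train accuracy."""
  t0 = time()
  estimator.fit(data)
  print('% 9s\t%.2fs\t%i\t%.3f\t%.3f\t%.3f\t%.3f\t%.3f\t%.3f'
        % (name, (time() - t0), estimator.inertia_,
           metrics.homogeneity_score(target_labels, estimator.labels_),
           metrics.completeness_score(target_labels, estimator.labels_),
           metrics.v_measure_score(target_labels, estimator.labels_),
           metrics.adjusted_rand_score(target_labels, estimator.labels_),
           metrics.adjusted_mutual_info_score(target_labels, estimator.labels_),
           metrics.silhouette_score(data, estimator.labels_,
                                    metric='euclidean', sample_size=sample_size)))


digits = datasets.load_digits()
bench_k_means_sketch(cluster.KMeans(init='k-means++', n_clusters=10, n_init=10),
                     'k-means++', scale(digits.data), digits.target)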
Example #4
def main(argv):
    file_vectors,clust_type, clusters, distance, cluster_param, std = get_arguments(argv)
    fname='.'.join(map(str,[file_vectors.split('/')[-1],clust_type, clusters, distance, cluster_param, std]))
    writer=open(fname,'w') ## better to put in EX1, EX2, .. folders
    print 'clustering:',clust_type
    print 'clusters:',clusters
    print 'cluster_param:',cluster_param
    print 'std:',std
        
    X,words,truth=load_data(file_vectors,True)
    X=np.array(X)
    
    if clust_type=='affin':
        labels=affin_sclustering(X, n_clust=int(clusters), distance=distance, gamma=float(cluster_param), std=bool(std)) 
    else:
        labels=knn_sclustering(X, n_clust=int(clusters), k=int(cluster_param)) 
    
    writer.write('\nVMeas:'+ str(v_measure_score(truth,labels)))
    writer.write('\nRand:'+str(adjusted_rand_score(truth,labels)))
    writer.write('\nHomogen:'+str(homogeneity_score(truth,labels))+'\n')
        
    i=0
    for word in words:
        writer.write(word+' : '+str(labels[i])+'\n')
        i+=1   
    writer.close()       
Example #5
def my_clustering(X, y, n_clusters, pca):
    # =======================================
    # Complete the code here.
    # return scores like this: return [score, score, score, score]
    # =======================================
    from sklearn.cluster import KMeans
    clf = KMeans(n_clusters)
    clf.fit(X)

    from sklearn import metrics
    ari = metrics.adjusted_rand_score(y, clf.labels_)
    mri = metrics.adjusted_mutual_info_score(y, clf.labels_)
    v_measure = metrics.v_measure_score(y, clf.labels_)
    '''
    silhouette_coeff = metrics.silhouette_score(X, clf.labels_,
                                      metric='euclidean',
                                      sample_size=300)
    '''
    silhouette_coeff = metrics.silhouette_score(X, clf.labels_)

    show_images(n_clusters, clf, pca)


    return [ari,mri,v_measure,silhouette_coeff]
Example #6
def bench_k_means(estimator, data, labels):
    t0 = time()
    estimator.fit(data)
    print("time to fit: {:.5}".format(time() - t0))
    homogeneity = metrics.homogeneity_score(labels, estimator.labels_)
    completeness = metrics.completeness_score(labels, estimator.labels_)
    v_measure = metrics.v_measure_score(labels, estimator.labels_)
    print("homogenity {:.5}, completeness {:.5}, v_measure_score {:.5}".format(
        homogenity, completeness, v_measure)
    )

    adj_rand_score = metrics.adjusted_rand_score(
        labels, estimator.labels_
    )
    print("adjusted_rand_score {:.5}".format(adj_rand_score))

    adj_mutual_info_score = metrics.adjusted_mutual_info_score(
        labels,  estimator.labels_
    )
    print("adjusted_mutual_info_score {:.5}".format(
        adj_mutual_info_score)
    )

    silhouette_score = metrics.silhouette_score(
        data, estimator.labels_, metric='euclidean'
    )
    print("silhouette_score {:.5}".format(
        metrics.silhouette_score(data, estimator.labels_,
                                 metric='euclidean'))
    )

    return [
        homogeneity, completeness, v_measure, adj_rand_score,
        adj_mutual_info_score, silhouette_score
    ]
Example #7
def cluster(Z, K=4, algo='kmeans'):
	descr = Z.columns
	X = Imputer().fit_transform(Z)

	##############################################################################
	if algo == 'dbscan':
		# Compute DBSCAN
		db = DBSCAN(eps=0.3, min_samples=10).fit(X)
		core_samples_mask = np.zeros_like(db.labels_, dtype=bool)
		core_samples_mask[db.core_sample_indices_] = True
		labels = db.labels_
        
		# Number of clusters in labels, ignoring noise if present.
		n_clusters_ = len(set(labels)) - (1 if -1 in labels else 0)
        
		print('Estimated number of clusters: %d' % n_clusters_)
		print("Homogeneity: %0.3f" % metrics.homogeneity_score(labels_true, labels))
		print("Completeness: %0.3f" % metrics.completeness_score(labels_true, labels))
		print("V-measure: %0.3f" % metrics.v_measure_score(labels_true, labels))
		print("Adjusted Rand Index: %0.3f" % metrics.adjusted_rand_score(labels_true, labels))
		print("Adjusted Mutual Information: %0.3f" % metrics.adjusted_mutual_info_score(labels_true, labels))
		print("Silhouette Coefficient: %0.3f" % metrics.silhouette_score(X, labels))
	
	elif algo == 'kmeans':
		km = KMeans(n_clusters=K)
		km.fit(X)
		print(km.labels_)
		return km
Example #8
def bench_k_means(estimator, name, data, sample_size, labels,postIds):
    data=sparse.csr_matrix(data)
    t0 = time()
    print("Performing dimensionality reduction using LSA")
    t0 = time()
    lsa = TruncatedSVD(500)

    data = lsa.fit_transform(data)
    data = Normalizer(copy=False).fit_transform(data)

    print("done in %fs" % (time() - t0))
    print()

    #sData=sparse.csr_matrix(data)
    val=estimator.fit(data)
    print('% 9s   %.2fs    %i   %.3f   %.3f   %.3f   %.3f   %.3f '
          % (name, (time() - t0), estimator.inertia_,
             metrics.homogeneity_score(labels, estimator.labels_),
             metrics.completeness_score(labels, estimator.labels_),
             metrics.v_measure_score(labels, estimator.labels_),
             metrics.adjusted_rand_score(labels, estimator.labels_),
             metrics.adjusted_mutual_info_score(labels,  estimator.labels_)))

    print("Parsing USer File:")
    parseUserFile()
    print("extracting User File:")
    clusterDict=extractCluster(postIds,estimator.labels_)
    print("writing Cluster Data to File")
    writeCluterToFile(clusterDict)
Example #9
def clustering_by_kmeans(vectorizer, X, true_k):
    print "Clustering in " + str(true_k) + " groups by K-means..."
    km = KMeans(n_clusters=true_k, init='k-means++', max_iter=500, n_init=1)
    km.fit_predict(X)

    print "Measuring..."

    print("Homogeneity: %0.3f" % metrics.homogeneity_score(documents, km.labels_))
    print("Completeness: %0.3f" % metrics.completeness_score(documents, km.labels_))
    print("V-measure: %0.3f" % metrics.v_measure_score(documents, km.labels_))  #V-measure is an entropy-based measure which explicitly measures how successfully the criteria of homogeneity and completeness have been satisfied.
    print("Adjusted Rand-Index: %.3f"   % metrics.adjusted_rand_score(documents, km.labels_))
    print("Silhouette Coefficient: %0.3f" % metrics.silhouette_score(X, km.labels_, sample_size=1000))
    #print top terms per cluster

    clusters = km.labels_.tolist()  # 0 iff term is in cluster0, 1 iff term is in cluster1 ...  (list of terms)
    #print "List of terms belonging to the clusters " + str(clusters)
    print "Total of " + str(len(km.labels_)) + " documents"

    #Example to get all documents in cluster 0
    #cluster_0 = np.where(clusters==0) # don't forget import numpy as np
    #print cluster_0
    #cluster_0 now contains all indices of the documents in this cluster, to get the actual documents you'd do:
    #X_cluster_0 = documents[cluster_0]
    terms = vectorizer.get_feature_names()

    #print terms
    measuring_kmeans(true_k,clusters)
Example #10
def bestClassify(X,Y):
	"Best classifier function"
	tfidf = True

	if tfidf:
		vec = TfidfVectorizer(preprocessor = identity,
							tokenizer = identity, sublinear_tf = True)
	else:
		vec = CountVectorizer(preprocessor = identity,
							tokenizer = identity)

	km = KMeans(n_clusters=2, n_init=100, verbose=1)
	clusterer = Pipeline( [('vec', vec),
								('cls', km)] )

	prediction = clusterer.fit_predict(X,Y)

	checker = defaultdict(list)
	for pred,truth in zip(prediction,Y):
		checker[pred].append(truth)

	labeldict = {}
	for pred, label in checker.items():
		labeldict[pred] = Counter(label).most_common(1)[0][0]
		#print(pred, Counter(label).most_common(1)[0][0])

	prediction = [labeldict[p] for p in prediction]
	labels = list(labeldict.values())
	print(labels)
	print(confusion_matrix(Y, prediction, labels=labels))

	print("Homogeneity:", homogeneity_score(Y,prediction))
	print("Completeness:", completeness_score(Y,prediction))
	print("V-measure:", v_measure_score(Y,prediction))
	print("Rand-Index:", adjusted_rand_score(Y,prediction))
Example #11
def run_clustering( clusterer, data, labels ):
    """
    Cluster: Using a predefined and parameterized clustering algorithm, fit
    a dataset and compute metrics against a set of ground-truth labels.

        clusterer: the clustering algorithm, from sklearn
        data:      array-like dataset input
        labels:    vector of ground-truth labels

    """

    # Time the operation
    t0 = time()
    clusterer.fit(data)
    t1 = time()

    # Perform metrics
    runtime         = (t1 - t0)
    homogeneity     = metrics.homogeneity_score(   labels, clusterer.labels_ )
    completeness    = metrics.completeness_score(  labels, clusterer.labels_ )
    v_measure       = metrics.v_measure_score(     labels, clusterer.labels_ )
    adjusted_rand   = metrics.adjusted_rand_score( labels, clusterer.labels_ )
    adjusted_mutual = metrics.adjusted_mutual_info_score( labels,
                                                          clusterer.labels_ )

    # Output to logs
    logging.info("  |-        Execution time: %fs"   % runtime)
    logging.info("  |-           Homogeneity: %0.3f" % homogeneity)
    logging.info("  |-          Completeness: %0.3f" % completeness)
    logging.info("  |-             V-measure: %0.3f" % v_measure)
    logging.info("  |-   Adjusted Rand-Index: %.3f"  % adjusted_rand)
    logging.info("  |-  Adjusted Mutual Info: %.3f"  % adjusted_mutual)
Example #12
def cluster(model, uids):
    ##############################################################################
    # Generate sample data
    X = []
    for uid in uids:
        X.append(model.docvecs[uid])
    labels_true = uids

    ##############################################################################
    # Compute Affinity Propagation
    af = AffinityPropagation(preference=-50).fit(X)
    pickle.dump(af, open('data/af.pick', 'wb'))
    cluster_centers_indices = af.cluster_centers_indices_
    labels = af.labels_

    n_clusters_ = len(cluster_centers_indices)

    print('Estimated number of clusters: %d' % n_clusters_)
    print("Homogeneity: %0.3f" % metrics.homogeneity_score(labels_true, labels))
    print("Completeness: %0.3f" % metrics.completeness_score(labels_true, labels))
    print("V-measure: %0.3f" % metrics.v_measure_score(labels_true, labels))
    print("Adjusted Rand Index: %0.3f"
          % metrics.adjusted_rand_score(labels_true, labels))
    print("Adjusted Mutual Information: %0.3f"
          % metrics.adjusted_mutual_info_score(labels_true, labels))
    print("Silhouette Coefficient: %0.3f"
          % metrics.silhouette_score(X, labels, metric='sqeuclidean'))
Example #13
def predictAffinityPropagation(X, labels_true):
	#ranX, ranY = shuffle(X, y, random_state=0)
	af = AffinityPropagation(preference=-50).fit(X)
	cluster_centers_indices = af.cluster_centers_indices_
	labels = af.labels_

	n_clusters_ = len(cluster_centers_indices)

	print('Estimated number of clusters: %d' % n_clusters_)
	print("Homogeneity: %0.3f" % metrics.homogeneity_score(labels_true, labels))
	print("Completeness: %0.3f" % metrics.completeness_score(labels_true, labels))
	print("V-measure: %0.3f" % metrics.v_measure_score(labels_true, labels))
	print("Adjusted Rand Index: %0.3f"
      % metrics.adjusted_rand_score(labels_true, labels))
	print("Adjusted Mutual Information: %0.3f"
      % metrics.adjusted_mutual_info_score(labels_true, labels))
	print("Silhouette Coefficient: %0.3f"
      % metrics.silhouette_score(X, labels, metric='sqeuclidean'))

	plt.close('all')
	plt.figure(1)
	plt.clf()

	colors = cycle('bgrcmykbgrcmykbgrcmykbgrcmyk')
	for k, col in zip(range(n_clusters_), colors):
	    class_members = labels == k
	    cluster_center = X[cluster_centers_indices[k]]
	    plt.plot(X[class_members, 0], X[class_members, 1], col + '.')
	    plt.plot(cluster_center[0], cluster_center[1], 'o', markerfacecolor=col,
	             markeredgecolor='k', markersize=14)
	    for x in X[class_members]:
	        plt.plot([cluster_center[0], x[0]], [cluster_center[1], x[1]], col)

	plt.title('Estimated number of clusters: %d' % n_clusters_)
	plt.show()
Example #14
def cluster(algorithm, data, topics, make_silhouette=False):
  print str(algorithm)
  clusters = algorithm.fit_predict(data)
  labels = algorithm.labels_
  print 'Homogeneity: %0.3f' % metrics.homogeneity_score(topics, labels)
  print 'Completeness: %0.3f' % metrics.completeness_score(topics, labels)
  print 'V-measure: %0.3f' % metrics.v_measure_score(topics, labels)
  print 'Adjusted Rand index: %0.3f' % metrics.adjusted_rand_score(topics, labels)
  print 'Silhouette test: %0.3f' % metrics.silhouette_score(data, labels)
  print ' ***************** '
  
  silhouettes = metrics.silhouette_samples(data, labels)
  num_clusters = len(set(clusters))
  print 'num clusters: %d' % num_clusters
  print 'num fitted: %d' % len(clusters)

  # Make a silhouette plot if the flag is set
  if make_silhouette:
    order = numpy.lexsort((-silhouettes, clusters)) 
    indices = [numpy.flatnonzero(clusters[order] == k) for k in range(num_clusters)]
    ytick = [(numpy.max(ind)+numpy.min(ind))/2 for ind in indices]
    ytickLabels = ["%d" % x for x in range(num_clusters)]
    cmap = cm.jet( numpy.linspace(0,1,num_clusters) ).tolist()
    clr = [cmap[i] for i in clusters[order]]

    fig = plt.figure()
    ax = fig.add_subplot(111)
    ax.barh(range(data.shape[0]), silhouettes[order], height=1.0,   
            edgecolor='none', color=clr)
    ax.set_ylim(ax.get_ylim()[::-1])
    plt.yticks(ytick, ytickLabels)
    plt.xlabel('Silhouette Value')
    plt.ylabel('Cluster')
    plt.savefig('cluster.png')
Example #15
    def test_KMeans_scores(self):
        digits = datasets.load_digits()
        df = pdml.ModelFrame(digits)

        scaled = pp.scale(digits.data)
        df.data = df.data.pp.scale()
        self.assert_numpy_array_almost_equal(df.data.values, scaled)

        clf1 = cluster.KMeans(init='k-means++', n_clusters=10,
                              n_init=10, random_state=self.random_state)
        clf2 = df.cluster.KMeans(init='k-means++', n_clusters=10,
                                 n_init=10, random_state=self.random_state)
        clf1.fit(scaled)
        df.fit_predict(clf2)

        expected = m.homogeneity_score(digits.target, clf1.labels_)
        self.assertEqual(df.metrics.homogeneity_score(), expected)

        expected = m.completeness_score(digits.target, clf1.labels_)
        self.assertEqual(df.metrics.completeness_score(), expected)

        expected = m.v_measure_score(digits.target, clf1.labels_)
        self.assertEqual(df.metrics.v_measure_score(), expected)

        expected = m.adjusted_rand_score(digits.target, clf1.labels_)
        self.assertEqual(df.metrics.adjusted_rand_score(), expected)

        expected = m.homogeneity_score(digits.target, clf1.labels_)
        self.assertEqual(df.metrics.homogeneity_score(), expected)

        expected = m.silhouette_score(scaled, clf1.labels_, metric='euclidean',
                                      sample_size=300, random_state=self.random_state)
        result = df.metrics.silhouette_score(metric='euclidean', sample_size=300,
                                             random_state=self.random_state)
        self.assertAlmostEqual(result, expected)
Example #16
def cluster_evaluation(D, y_true, n_clusters, eps=0.8, min_samples=10):
    ##############################################################################
    # Extract Y true
    labels_true = y_true

    ##############################################################################
    # transform distance matrix into a similarity matrix
    S = 1 - D 

    ##############################################################################
    # compute DBSCAN
    #db = DBSCAN(eps=eps, min_samples=min_samples).fit(S)
    db = Ward(n_clusters=n_clusters).fit(S)
    #core_samples = db.core_sample_indices_
    labels = db.labels_

    # number of clusters in labels, ignoring noise if present
    n_clusters_ = len(set(labels)) - (1 if -1 in labels else 0)

    print 'Number of clusters: %d' % n_clusters_
    print 'Homogeneity: %0.3f' % metrics.homogeneity_score(labels_true, labels)
    print 'Completeness: %0.3f' % metrics.completeness_score(labels_true, labels)
    print 'V-measure: %0.3f' % metrics.v_measure_score(labels_true, labels)
    print 'Adjusted Rand Index: %0.3f' % metrics.adjusted_rand_score(labels_true, labels)
    print 'Adjusted Mutual Information: %0.3f' % metrics.adjusted_mutual_info_score(labels_true, labels)
    print 'Silhouette Coefficient: %0.3f' % metrics.silhouette_score(D, labels, metric='precomputed')
Example #17
def clustering(dataset):
    vectorizer = dataset.vectorizer
    X = dataset.X
    true_k = dataset.n_classes
    labels = dataset.target

    km = cluster.KMeans(n_clusters=true_k, max_iter=100, n_init=1)

    print("Clustering sparse data with %s" % km)
    t0 = time()
    km.fit(X)
    print("done in %0.3fs" % (time() - t0))
    print()

    print("Homogeneity: %0.3f" % metrics.homogeneity_score(labels, km.labels_))
    print("Completeness: %0.3f" % metrics.completeness_score(labels, km.labels_))
    print("V-measure: %0.3f" % metrics.v_measure_score(labels, km.labels_))
    print("Adjusted Rand-Index: %.3f"
          % metrics.adjusted_rand_score(labels, km.labels_))
    print("Silhouette Coefficient: %0.3f"
          % metrics.silhouette_score(X, labels, sample_size=1000))
    print()

    print("Top terms per cluster:")
    order_centroids = km.cluster_centers_.argsort()[:, ::-1]
    terms = vectorizer.get_feature_names()
    sizes = np.sum(km.labels_[:, np.newaxis] == np.arange(true_k), axis=0)
    for i in range(true_k):
        print("Cluster %d (%d):" % (i, sizes[i]), end='')
        for ind in order_centroids[i, :10]:
            print(' %s' % terms[ind], end='')
        print()
Example #18
def kmeans_setup(data):
	

	if pca_f == 1:
		pca = PCA(n_components = num_clusters).fit(data)
		initializer = pca.components_
		name = 'PCA'
	else:
		initializer = 'k-means++'
		name = 'k-means++'

	t0 = time()
	
	estimator = KMeans(init=initializer, n_clusters=num_clusters, n_init = num_init, max_iter = num_iterations)
	estimator.fit(data)
	
	if debug == True:
		sample_size = 300
		print('% 9s   %.2fs    %i   %.3f   %.3f   %.3f   %.3f   %.3f    %.3f'
	          % (name, (time() - t0), estimator.inertia_,
	             metrics.homogeneity_score(labels, estimator.labels_),
	             metrics.completeness_score(labels, estimator.labels_),
	             metrics.v_measure_score(labels, estimator.labels_),
	             metrics.adjusted_rand_score(labels, estimator.labels_),
	             metrics.adjusted_mutual_info_score(labels,  estimator.labels_),
	             metrics.silhouette_score(data, estimator.labels_,
	                                      metric='euclidean',
	                                      sample_size=sample_size)))
	return estimator
Example #19
def affin_test():
    savefile = open('traindata.pkl', 'rb')
    (x_train, y_train, t1) = cPickle.load(savefile)
    savefile.close()
    
     
    x_train, X_valid, y_train, y_valid = cross_validation.train_test_split(
        x_train, y_train, test_size=0.9, random_state=42)    
    
    
    labels_true = y_train 
    
    x_train = StandardScaler().fit_transform(x_train)
    af = AffinityPropagation(preference=-50).fit(x_train)
    cluster_centers_indices = af.cluster_centers_indices_
    labels = af.labels_
    
    n_clusters_ = len(cluster_centers_indices)
    
    print('Estimated number of clusters: %d' % n_clusters_)
    print("Homogeneity: %0.3f" % metrics.homogeneity_score(labels_true, labels))
    print("Completeness: %0.3f" % metrics.completeness_score(labels_true, labels))
    print("V-measure: %0.3f" % metrics.v_measure_score(labels_true, labels))
    print("Adjusted Rand Index: %0.3f"
          % metrics.adjusted_rand_score(labels_true, labels))
    print("Adjusted Mutual Information: %0.3f"
          % metrics.adjusted_mutual_info_score(labels_true, labels))
    print("Silhouette Coefficient: %0.3f"
          % metrics.silhouette_score(x_train, labels, metric='sqeuclidean'))
Example #20
File: a.py Project: chengxwcq/ee219
def get_result(km, labels):
    homo_score = metrics.homogeneity_score(labels, km.labels_)
    complete_score = metrics.completeness_score(labels, km.labels_)
    v_score = metrics.v_measure_score(labels, km.labels_)
    rand_score = metrics.adjusted_rand_score(labels, km.labels_)
    mutual_info = metrics.adjusted_mutual_info_score(labels, km.labels_)
    return homo_score, complete_score, v_score, rand_score, mutual_info
Example #21
def compute_metrics(answers, predictions):
    aris = []
    vscores = []
    fscores = []
    weights = []
    for k in answers.keys():
        idx = np.argsort(np.array(answers[k][0]))
        true = np.array(answers[k][1])[idx]
        pred = np.array(predictions[k][1])
        weights.append(pred.shape[0])
        if len(np.unique(true)) > 1:
            aris.append(adjusted_rand_score(true, pred))
        vscores.append(v_measure_score(true, pred))
        fscores.append(compute_fscore(true, pred))
#        print '%s: ari=%f, vscore=%f, fscore=%f' % (k, aris[-1], vscores[-1], fscores[-1])
    aris = np.array(aris)
    vscores = np.array(vscores)
    fscores = np.array(fscores)
    weights = np.array(weights)
    print 'number of one-sense words: %d' % (len(vscores) - len(aris))
    print 'mean ari: %f' % np.mean(aris)
    print 'mean vscore: %f' % np.mean(vscores)
    print 'weighted vscore: %f' % np.sum(vscores * (weights / float(np.sum(weights))))
    print 'mean fscore: %f' % np.mean(fscores)
    print 'weighted fscore: %f' % np.sum(fscores * (weights / float(np.sum(weights))))
    return np.mean(aris),np.mean(vscores)
Example #22
def print_cluster(clusterTrainClass, labels, clusterTestStory):
	print("Homogeneity: %0.3f" % metrics.homogeneity_score(clusterTrainClass, labels))
	print("Completeness: %0.3f" % metrics.completeness_score(clusterTrainClass, labels))
	print("V-measure: %0.3f" % metrics.v_measure_score(clusterTrainClass, labels))
	print("Adjusted Rand Index: %0.3f" % metrics.adjusted_rand_score(clusterTrainClass, labels))
	print("Adjusted Mutual Information: %0.3f" % metrics.adjusted_mutual_info_score(clusterTrainClass, labels))
	print "Silhouette Coefficient:"
	print metrics.silhouette_score(clusterTestStory, labels, metric='euclidean')
Example #23
def evaluate(labels_true, labels):
    homogeneity = metrics.homogeneity_score(labels_true, labels)
    completeness = metrics.completeness_score(labels_true, labels)
    v_measure = metrics.v_measure_score(labels_true, labels)
    adjusted_rand = metrics.adjusted_rand_score(labels_true, labels)
    adjusted_mutual_info = metrics.adjusted_mutual_info_score(labels_true, labels)
    #silhouette = metrics.silhouette_score(data, labels, metric='sqeuclidean')
    return homogeneity, completeness, v_measure, adjusted_rand, adjusted_mutual_info#, silhouette
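A quick usage sketch, assuming the evaluate function above is defined; the toy label vectors are illustrative.

labels_true = [0, 0, 0, 1, 1, 1]
labels_pred = [0, 0, 1, 1, 2, 2]
homogeneity, completeness, v_measure, ari, ami = evaluate(labels_true, labels_pred)
print(v_measure)  # about 0.52 for this toy example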
Example #24
def cluseval(label, truth):
    rand = metrics.adjusted_rand_score(truth, label)
    mutual = metrics.adjusted_mutual_info_score(truth, label)
    homo = metrics.homogeneity_score(truth, label)
    complete = metrics.completeness_score(truth, label)
    v = metrics.v_measure_score(truth, label)
    result = [rand, mutual, homo, complete, v]
    return result
Example #25
def main():

    # Parse command line arguments
    parser = argparse.ArgumentParser(usage=__doc__,
            formatter_class=argparse.ArgumentDefaultsHelpFormatter,
            description='Perform spectral clustering.')
    parser.add_argument("--clusters", "-c", type=int, help='Number of clusters.')
    parser.add_argument("--knn", "-k", type=int, default=0, 
            help='Number of nearest neighbors, 0 means all.')
    parser.add_argument("--sm", "-s", 
            help='File containing similarity matrix')
    parser.add_argument("--iterations", "-i", type=int, default=10,
            help='Number of KMeans iterations.')
    parser.add_argument("--true_labels", "-t", 
            help='File containing the true labels.')
    parser.add_argument("--output", "-o", help='Name of the file to write' +
            ' the labels to.')
    parser.add_argument("--normalize", "-n", action='store_true', 
            help='Normalize each row so that the max value is one.')
    args = parser.parse_args()


    sm = np.load(args.sm)
    if args.normalize:
        sm /= sm.max(axis=1)[:, np.newaxis]
        # Ensure symmetric
        sm = (sm + sm.T) / 2
    labels = []
    if args.knn > 0:
        labels = SpectralClustering(n_clusters=args.clusters, 
                affinity='nearest_neighbors', n_neighbors=args.knn,
                n_init=args.iterations).fit(sm).labels_
    else:
        labels = SpectralClustering(n_clusters=args.clusters, 
                affinity='precomputed',
                n_init=args.iterations).fit(sm).labels_
    
    with open(args.output, 'w') as fout:
        for l in labels:
            fout.write(str(l) + '\n')

    # Load the true labels.
    if args.true_labels:
        true_labels = []
        with open(args.true_labels, 'r') as fin:
            for line in fin:
                true_labels.append(int(line.strip()))
        # Run the metrics.
        print("Homogeneity: %0.3f" % metrics.homogeneity_score(true_labels, labels))
        print("Completeness: %0.3f" % metrics.completeness_score(true_labels, labels))
        print("V-measure: %0.3f" % metrics.v_measure_score(true_labels, labels))
        print("Adjusted Rand Index: %0.3f"
                      % metrics.adjusted_rand_score(true_labels, labels))
        print("Adjusted Mutual Information: %0.3f"
                      % metrics.adjusted_mutual_info_score(true_labels, labels))
        print("Silhouette Coefficient: %0.3f"
                      % metrics.silhouette_score(sm, labels))
Example #26
def evaluate(km, labels):

    print("Homogeneity: %.3f" % metrics.homogeneity_score(labels, km.labels_))
    print("Completeness: %.3f" % metrics.completeness_score(labels, km.labels_))
    print("V-measure: %.3f" % metrics.v_measure_score(labels, km.labels_))

    print("Adjusted Rand-Index: %.3f" % metrics.adjusted_rand_score(labels, km.labels_))
    print("Silhouette Coefficient: %.3f" % metrics.silhouette_score(X, \
                                                                    labels, \
                                                                    sample_size=1000))
Example #27
 def evaluateAllAlgorithms(self):
   algs = [self.labels_db,self.labels_ap]
   names = ['DBASE','AP']
   for i in range(2):
     print 'Algorithm:',names[i]
     print("\tHomogeneity: %0.3f" % metrics.homogeneity_score(self.labels_gt, algs[i]))
     print("\tCompleteness: %0.3f" % metrics.completeness_score(self.labels_gt, algs[i]))
     print("\tV-measure: %0.3f" % metrics.v_measure_score(self.labels_gt, algs[i]))
     print("\tAdjusted Rand Index: %0.3f"% metrics.adjusted_rand_score(self.labels_gt, algs[i]))
     print("\tAdjusted Mutual Information: %0.3f"% metrics.adjusted_mutual_info_score(self.labels_gt, algs[i]))
Example #28
File: exp3.py Project: xulesc/algos
def bench_k_means(estimator, name, data):
    t0 = time()
    estimator.fit(data)
    print('% 9s   %.2fs    %i   %.3f   %.3f   %.3f   %.3f   %.3f'
          % (name, (time() - t0), estimator.inertia_,
             metrics.homogeneity_score(labels, estimator.labels_),
             metrics.completeness_score(labels, estimator.labels_),
             metrics.v_measure_score(labels, estimator.labels_),
             metrics.adjusted_rand_score(labels, estimator.labels_),
             metrics.adjusted_mutual_info_score(labels,  estimator.labels_)))
Example #29
 def print_metrics(self, data, labels, labels_real):
     print("Homogeneity: %0.3f" % metrics.homogeneity_score(labels_real, labels))
     print("Completeness: %0.3f" % metrics.completeness_score(labels_real, labels))
     print("V-measure: %0.3f" % metrics.v_measure_score(labels_real, labels))
     print("Adjusted Rand Index: %0.3f"
           % metrics.adjusted_rand_score(labels_real, labels))
     print("Adjusted Mutual Information: %0.3f"
           % metrics.adjusted_mutual_info_score(labels_real, labels))
     print("Silhouette Coefficient: %0.3f"
           % metrics.silhouette_score(data, labels))
Example #30
def bench(estimator, name):
    # Lifted from http://scikit-learn.org/stable/auto_examples/cluster/plot_kmeans_digits.html#example-cluster-plot-kmeans-digits-py
    t0 = time()
    print('% 9s   %.2fs   %.3f   %.3f   %.3f   %.3f   %.3f'
          % (name, (time() - t0),
             metrics.homogeneity_score(labels, estimator.labels_),
             metrics.completeness_score(labels, estimator.labels_),
             metrics.v_measure_score(labels, estimator.labels_),
             metrics.adjusted_rand_score(labels, estimator.labels_),
             metrics.adjusted_mutual_info_score(labels,  estimator.labels_)
            ))
Example #31
File: iris.py Project: d12597/-
def cluster1(X, y):
    a = {}
    pca = PCA(n_components=2)  # reduce to 2 dimensions
    pca = pca.fit(X)
    X_dr = pca.transform(X)

    # clustering algorithm names
    clustering_names = [
        'MiniBatchKMeans', 'MeanShift', 'AgglomerativeClustering', 'DBSCAN',
        'Birch'
    ]

    x = X_dr
    # normalize the dataset for easier parameter selection
    x = StandardScaler().fit_transform(x)
    # estimate the bandwidth for mean shift
    bandwidth = cluster.estimate_bandwidth(x, quantile=0.3)
    # kneighbors_graph returns, for each sample, the positions of its K nearest training samples
    connectivity = kneighbors_graph(x, n_neighbors=10, include_self=False)
    # make connectivity symmetric
    connectivity = 0.5 * (connectivity + connectivity.T)

    # create the clustering estimators

    two_means = cluster.MiniBatchKMeans(n_clusters=3,
                                        n_init=10)  #MiniBatchKMeans
    ms = cluster.MeanShift(bandwidth=bandwidth, bin_seeding=True)  #MeanShift
    average_linkage = cluster.AgglomerativeClustering(
        n_clusters=3)  #AgglomerativeClustering
    dbscan = cluster.DBSCAN(eps=0.5)  #DBSCAN
    birch = cluster.Birch(n_clusters=3)  #Birch

    # clustering algorithms
    clustering_algorithms = [two_means, ms, average_linkage, dbscan, birch]

    colors = np.array([x for x in "bgrcmykbgrcmykbgrcmykbgrcmyk"])
    # hstack() stacks arrays horizontally
    colors = np.hstack([colors] * 20)

    num = []
    for name, algorithm in zip(clustering_names, clustering_algorithms):
        r = []
        g = []
        b = []
        # t0 = time.time()  # time() returns the current timestamp
        algorithm.fit(X)
        # t1 = time.time()
        # hasattr() checks whether the object has the given attribute
        if hasattr(algorithm, 'labels_'):
            y_pred = algorithm.labels_.astype(np.int)
        else:
            y_pred = algorithm.predict(x)

        # if hasattr(algorithm, 'cluster_centers_'):
        # centers = algorithm.cluster_centers_
        # center_colors = colors[:len(centers)]

        for i in range(len(colors[y_pred].tolist())):  # collect x, y, color for each point of the clustering result
            if colors[y_pred].tolist()[i] == 'r':
                r.append([x[:, 0][i], x[:, 1][i]])
            if colors[y_pred].tolist()[i] == 'g':
                g.append([x[:, 0][i], x[:, 1][i]])
            if colors[y_pred].tolist()[i] == 'b':
                b.append([x[:, 0][i], x[:, 1][i]])
        # create a key-value pair of the clustering name and its result
        a.update({"%s" % name: {'r': r, 'g': g, 'b': b}})
        num.append(metrics.v_measure_score(y, y_pred))
    return x, a, num
Example #32
def cluster_to_find_similar_products():
    df = pd.read_csv(
        '/Users/srinath/playground/data-science/BimboInventoryDemand/producto_tabla.csv'
    )
    labels = df['Producto_ID']
    extracted_features = [extract_data(p) for p in df['NombreProducto'].values]
    extracted_features_np = np.row_stack(extracted_features)

    extracted_features_df = pd.DataFrame(extracted_features_np,
                                         columns=[
                                             'description', 'brand', 'weight',
                                             'pieces', "has_choco",
                                             "has_vanilla", "has_multigrain"
                                         ])

    print "have " + str(df.shape[0]) + "products"

    #vectorize names
    vectorizer = TfidfVectorizer(max_df=0.5,
                                 max_features=200,
                                 min_df=2,
                                 stop_words='english',
                                 use_idf=True)
    X = vectorizer.fit_transform(extracted_features_df['description'])

    print X

    print("n_samples: %d, n_features: %d" % X.shape)

    print("Performing dimensionality reduction using LSA")
    # Vectorizer results are normalized, which makes KMeans behave as
    # spherical k-means for better results. Since LSA/SVD results are
    # not normalized, we have to redo the normalization.
    svd = TruncatedSVD(5)
    normalizer = Normalizer(copy=False)
    lsa = make_pipeline(svd, normalizer)

    X = lsa.fit_transform(X)

    explained_variance = svd.explained_variance_ratio_.sum()
    print("Explained variance of the SVD step: {}%".format(
        int(explained_variance * 100)))

    print("new size", X.shape)
    print type(X)

    extracted_features_df = encode_onehot(extracted_features_df, ['brand'])
    extracted_features_df = drop_feilds_1df(extracted_features_df,
                                            ['description'])

    print "X,df", X.shape, extracted_features_df.values.shape

    X = np.hstack((X, extracted_features_df.values))

    # Do the actual clustering
    km = KMeans(n_clusters=10,
                init='k-means++',
                max_iter=100,
                n_init=1,
                verbose=True)

    print("Clustering sparse data with %s" % km)
    #km.fit(X)

    results = km.fit_predict(X)
    print len(results), results

    print("Homogeneity: %0.3f" % metrics.homogeneity_score(labels, km.labels_))
    print("Completeness: %0.3f" %
          metrics.completeness_score(labels, km.labels_))
    print("V-measure: %0.3f" % metrics.v_measure_score(labels, km.labels_))
    print("Adjusted Rand-Index: %.3f" %
          metrics.adjusted_rand_score(labels, km.labels_))
    print("Silhouette Coefficient: %0.3f" %
          metrics.silhouette_score(X, km.labels_, sample_size=1000))

    print()

    products_clusters = np.column_stack([labels, results])
    to_saveDf = pd.DataFrame(products_clusters,
                             columns=["Producto_ID", "Cluster"])
    to_saveDf.to_csv('product_clusters.csv', index=False)

    to_saveDf['NombreProducto'] = df['NombreProducto']
Example #33
# tally confusion-matrix counts over (predicted, actual) label pairs
true_pos = false_pos = false_neg = 0
for p, g in zip(predicted, actual):
    if p==1 and g==1:
        true_pos+=1
    if p==1 and g==0:
        false_pos+=1 #311
    if p==0 and g==1:
        false_neg+=1 #171
#655 : actual positives    ; #593  : actual negative
#795 : predicted positive  ; #453  : predicted negative   

precision_model= true_pos/(true_pos+false_pos)  #0.6088
recall_model=true_pos/(true_pos+false_neg)  #0.7389
f_1=2*(precision_model *recall_model)/ (precision_model+recall_model) #0.667586
print("Accuracy:",metrics.accuracy_score(actual, predicted))  #0.61378 (accuracy calculation)

##---------metrics to evaluate performance of clustering-------------
# zero is bad; 1 is good ;
homogeneity=metrics.homogeneity_score(actual, predicted)    
completeness= metrics.completeness_score(actual, predicted)  
v_measure=metrics.v_measure_score(actual, predicted)   
#perfect labelling==1, bad labelling closer to 0
ami= metrics.adjusted_mutual_info_score(actual, predicted) 
nmi=metrics.normalized_mutual_info_score(actual, predicted) 
mutual_i= metrics.mutual_info_score(actual, predicted)  #this can be either positive or negative (negative is bad)
#between -1 and 1; negative values are bad (independent labelings), similar clusterings have a positive ARI, 1.0 is the perfect match score.
ari=metrics.adjusted_rand_score(actual, predicted)

# ROC curve
fpr, tpr, thresholds = roc_curve(actual, predicted, pos_label = 1)
roc_auc = auc(fpr, tpr)
plt.figure(1, figsize = (15, 10))
plt.plot(fpr, tpr, lw=2, label='ROC curve (area = %0.2f)' % roc_auc)
plt.plot([0, 1], [0, 1], lw=2, linestyle='--')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
Example #34
    labels = [int(x / 4) for x in dataset.target]
    vectorizer = CountVectorizer(min_df=3, stop_words="english")
    dataset_array = vectorizer.fit_transform(dataset.data)
    tfidf_transformer = TfidfTransformer()
    dataset_tfidf = tfidf_transformer.fit_transform(dataset_array)
    print("p1: dimensions of the TF-IDF matrix is: ", dataset_tfidf.shape)

    # Q2: contingency table of clustering result
    km = KMeans(n_clusters=2, random_state=0, max_iter=1000, n_init=30)
    km.fit(dataset_tfidf)
    get_contingency_table(labels, km.labels_)

    # Q3: 5 measures
    print("Homogeneity: %0.4f" % homogeneity_score(labels, km.labels_))
    print("Completeness: %0.4f" % completeness_score(labels, km.labels_))
    print("V-measure: %0.4f" % v_measure_score(labels, km.labels_))
    print("Adjusted Rand Index: %.4f" %
          adjusted_rand_score(labels, km.labels_))
    print("Adjusted mutual info score: %.4f" %
          adjusted_mutual_info_score(labels, km.labels_))

    # Q4: plot variance
    plot_variance(dataset_tfidf)

    # Q5,6: SVD and NMF
    best_r_svd = plot_r_choice(dataset_tfidf, labels, "SVD")
    print("The best r for SVD is " + str(best_r_svd))

    best_r_nmf = plot_r_choice(dataset_tfidf, labels, "NMF")
    print("The best r for NMF is " + str(best_r_nmf))
Example #35
clusterize = KMeans(n_clusters=3, random_state=42)
output = clusterize.fit_predict(data)

data_res = []
#clusterize.labels_ = [str(label + 1) for label in clusterize.labels_]
data_res.append(({
    'ARI':
    metrics.adjusted_rand_score(expert_labels, clusterize.labels_),
    'AMI':
    metrics.adjusted_mutual_info_score(expert_labels, clusterize.labels_),
    'Homogeneity':
    metrics.homogeneity_score(expert_labels, clusterize.labels_),
    'Completeness':
    metrics.completeness_score(expert_labels, clusterize.labels_),
    'V-measure':
    metrics.v_measure_score(expert_labels, clusterize.labels_),
    'Silhouette':
    metrics.silhouette_score(data, clusterize.labels_)
}))

results = pd.DataFrame(data=data_res,
                       columns=[
                           'ARI', 'AMI', 'Homogeneity', 'Completeness',
                           'V-measure', 'Silhouette'
                       ],
                       index=['K-means'])

print(results)

if vizualize:
    pca = PCA(n_components=2)
Example #36
from sklearn import metrics

if __name__ == "__main__":
    y = [0, 0, 0, 1, 1, 1]
    y_hat = [0, 0, 1, 1, 2, 2]
    h = metrics.homogeneity_score(y, y_hat)
    c = metrics.completeness_score(y, y_hat)
    print(u'Homogeneity:', h)
    print(u'Completeness:', c)
    v2 = 2 * c * h / (c + h)
    v = metrics.v_measure_score(y, y_hat)
    print(u'V-Measure:', v2, v)

    y = [0, 0, 0, 1, 1, 1]
    y_hat = [0, 0, 1, 3, 3, 3]
    h = metrics.homogeneity_score(y, y_hat)
    c = metrics.completeness_score(y, y_hat)
    v = metrics.v_measure_score(y, y_hat)
    print(u'Homogeneity:', h)
    print(u'Completeness:', c)
    print(u'V-Measure:', v)

    # different label values are allowed (only the partition matters)
    y = [0, 0, 0, 1, 1, 1]
    y_hat = [1, 1, 1, 0, 0, 0]
    h = metrics.homogeneity_score(y, y_hat)
    c = metrics.completeness_score(y, y_hat)
    v = metrics.v_measure_score(y, y_hat)
    print(u'Homogeneity:', h)
    print(u'Completeness:', c)
    print(u'V-Measure:', v)
"""
#We can turn those concepts into scores: homogeneity_score and completeness_score. Both are bounded below by 0.0 and above by 1.0 (higher is better):

from sklearn import metrics

labels_true = [0, 0, 0, 1, 1, 1]
labels_pred = [0, 0, 1, 1, 2, 2]

metrics.homogeneity_score(labels_true, labels_pred)  


metrics.completeness_score(labels_true, labels_pred) 

#Their harmonic mean called V-measure is computed by v_measure_score
metrics.v_measure_score(labels_true, labels_pred) 

#All calculated together
metrics.homogeneity_completeness_v_measure(labels_true, labels_pred)



#https://scikit-learn.org/stable/modules/clustering.html#clustering-performance-evaluation

#http://www.learnbymarketing.com/methods/k-means-clustering/


"""
Q1. (Create a program that fulfills the following specification.)
deliveryfleet.csv
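The commented block above runs as-is; a minimal live version follows, with the approximate values these calls return for this toy example.

from sklearn import metrics

labels_true = [0, 0, 0, 1, 1, 1]
labels_pred = [0, 0, 1, 1, 2, 2]

print(metrics.homogeneity_score(labels_true, labels_pred))    # ~0.67
print(metrics.completeness_score(labels_true, labels_pred))   # ~0.42
print(metrics.v_measure_score(labels_true, labels_pred))      # ~0.52

# homogeneity, completeness and V-measure in a single call
print(metrics.homogeneity_completeness_v_measure(labels_true, labels_pred))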
Example #38
labels_true = numpy.array(iris_target)
data = numpy.array(iris_data)

X = PCA(n_components=2).fit_transform(iris_data)
# #############################################################################
# Compute Affinity Propagation
af = AffinityPropagation(preference=-10).fit(X)
cluster_centers_indices = af.cluster_centers_indices_
labels = af.labels_

n_clusters_ = len(cluster_centers_indices)

print('Estimated number of clusters: %d' % n_clusters_)
print("Homogeneity: %0.3f" % metrics.homogeneity_score(labels_true, labels))
print("Completeness: %0.3f" % metrics.completeness_score(labels_true, labels))
print("V-measure: %0.3f" % metrics.v_measure_score(labels_true, labels))
print("Adjusted Rand Index: %0.3f"
              % metrics.adjusted_rand_score(labels_true, labels))
print("Adjusted Mutual Information: %0.3f"
              % metrics.adjusted_mutual_info_score(labels_true, labels))
print("Silhouette Coefficient: %0.3f"
              % metrics.silhouette_score(X, labels, metric='sqeuclidean'))

# #############################################################################
# Plot result
import matplotlib.pyplot as plt
from itertools import cycle

plt.close('all')
plt.figure(1)
plt.clf()
Example #39
#clf.transform(X_test)
train_time = time() - t0
print("train time: %0.3fs" % train_time)

t0 = time()
pred = clf.predict(X_test)
test_time = time() - t0
print("test time:  %0.3fs" % test_time)

y_test = [int(i) for i in test_labels]
pred_test = [int(i) for i in pred]
score = metrics.homogeneity_score(y_test, pred_test)
print("homogeneity_score:   %0.3f" % score)
score = metrics.completeness_score(y_test, pred_test)
print("completeness_score:   %0.3f" % score)
score = metrics.v_measure_score(y_test, pred_test)
print("v_measure_score:   %0.3f" % score)
score = metrics.accuracy_score(y_test, pred_test)
print("acc_score:   %0.3f" % score)
score = metrics.normalized_mutual_info_score(y_test, pred_test)
print("nmi_score:   %0.3f" % score)

#file=open("D:/PhD/dr.norbert/dataset/shorttext/biomedical/semisupervised/biomedicalraw_ensembele_traintest","w")
file = open(
    "/home/owner/PhD/dr.norbert/dataset/shorttext/agnews/semisupervised/agnewsraw_ensembele_traintest",
    "w")
#file=open("D:/PhD/dr.norbert/dataset/shorttext/stackoverflow/semisupervised/stackoverflowraw_ensembele_traintest","w")
#file=open("D:/PhD/dr.norbert/dataset/shorttext/data-web-snippets/semisupervised/data-web-snippetsraw_ensembele_traintest","w")

for i in range(len(train_labels)):
    file.write(train_labels[i] + "\t" + train_trueLabels[i] + "\t" +
Example #40
metrics_report = {'kmeans': {}, 'gmm': {}}

labels = {'kmeans': kmeans.labels_, 'gmm': gmm}

for each in metrics_report.keys():
    metrics_report[each]['ARI'] = round(
        metrics.adjusted_rand_score(y, labels[each]), 2)
    metrics_report[each]['AMI'] = round(
        metrics.adjusted_mutual_info_score(y, labels[each]), 2)
    metrics_report[each]['homogeneity'] = round(
        metrics.homogeneity_score(y, labels[each]), 2)
    metrics_report[each]['completeness'] = round(
        metrics.completeness_score(y, labels[each]), 2)
    metrics_report[each]['v_measure'] = round(
        metrics.v_measure_score(y, labels[each]), 2)
    metrics_report[each]['silhouette'] = round(
        metrics.silhouette_score(X, labels[each]), 2)
    metrics_report[each]['accuracy'] = round(
        metrics.accuracy_score(y, labels[each]) * 100, 2)

print(metrics_report)

#visualizing - k-means clustering of ICA transformed dataset
plt.scatter(X_scaled_transformed[kmeans.labels_ == 1, 0],
            X_scaled_transformed[kmeans.labels_ == 1, 1],
            s=40,
            c='red',
            label='Cluster 1')
plt.scatter(X_scaled_transformed[kmeans.labels_ == 0, 0],
            X_scaled_transformed[kmeans.labels_ == 0, 1],
Example #41
# train the clustering model
n_clusters = 3  # set the number of clusters
model_kmeans = KMeans(n_clusters=n_clusters, random_state=0)  # build the clustering model object
model_kmeans.fit(X)  # fit the clustering model
y_pre = model_kmeans.predict(X)  # predict cluster labels

# evaluate the model with clustering metrics
n_samples, n_features = X.shape  # number of samples, number of features
inertias = model_kmeans.inertia_  # sum of distances of samples to their closest cluster center
adjusted_rand_s = metrics.adjusted_rand_score(y_true, y_pre)  # adjusted Rand index
mutual_info_s = metrics.mutual_info_score(y_true, y_pre)  # mutual information
adjusted_mutual_info_s = metrics.adjusted_mutual_info_score(y_true,
                                                            y_pre)  # adjusted mutual information
homogeneity_s = metrics.homogeneity_score(y_true, y_pre)  # homogeneity score
completeness_s = metrics.completeness_score(y_true, y_pre)  # completeness score
v_measure_s = metrics.v_measure_score(y_true, y_pre)  # V-measure score
silhouette_s = metrics.silhouette_score(X, y_pre, metric='euclidean')  # mean silhouette coefficient
calinski_harabaz_s = metrics.calinski_harabaz_score(
    X, y_pre)  # Calinski-Harabasz score
print('samples: %d \t features: %d' % (n_samples, n_features))  # print sample and feature counts
print(70 * '-')  # print a separator line
print('ine\tARI\tMI\tAMI\thomo\tcomp\tv_m\tsilh\tc&h')  # print metric headers
print('%d\t%.2f\t%.2f\t%.2f\t%.2f\t%.2f\t%.2f\t%.2f\t%d' %
      (inertias, adjusted_rand_s, mutual_info_s, adjusted_mutual_info_s,
       homogeneity_s, completeness_s, v_measure_s, silhouette_s,
       calinski_harabaz_s))  # print metric values
print(70 * '-')  # print a separator line
print('short name \t full name')  # print abbreviation and full-name header
print('ine \t inertias')
print('ARI \t adjusted_rand_s')
print('MI \t mutual_info_s')
Example #42
plt.scatter(X[:, 0], X[:, 1], s=50)
plt.show()

# homogeneity, completeness, and v-measure
k = [2, 3, 4, 5, 6, 7, 8]

homo_score = []
comp_score = []
vm_score = []

for n_cluster in k:
    y_pred = KMeans(n_clusters=n_cluster, max_iter=1000,
                    random_state=47).fit_predict(X)
    homo = metrics.homogeneity_score(y, y_pred)
    comp = metrics.completeness_score(y, y_pred)
    vm = metrics.v_measure_score(y, y_pred)

    homo_score.append(homo)
    comp_score.append(comp)
    vm_score.append(vm)
plt.plot(k, homo_score, 'r', label='Homogeneity')
plt.plot(k, comp_score, 'b', label='Completeness')
plt.plot(k, vm_score, 'y', label='V- Measure')
plt.xlabel('Value of K')
plt.ylabel('homogeneity_completeness_v_measure')
plt.legend(loc=4)
plt.show()

# Adjusted Rand Index
k = [2, 3, 4, 5, 6, 7, 8]
scores = []
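The snippet breaks off after scores = []; presumably the Adjusted Rand Index loop mirrors the metric loop above. A hedged sketch of that continuation (not the original code):

for n_cluster in k:
    y_pred = KMeans(n_clusters=n_cluster, max_iter=1000,
                    random_state=47).fit_predict(X)
    scores.append(metrics.adjusted_rand_score(y, y_pred))

plt.plot(k, scores, 'g', label='Adjusted Rand Index')
plt.xlabel('Value of K')
plt.ylabel('adjusted_rand_score')
plt.legend(loc=4)
plt.show()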
Example #43
def experiments(PORCENTAJE_VECINOS, ALGORITHM, MODELO, normalizar=None):
    vecinos = algorithms[ALGORITHM]

    algoritmos = "coseno"
    if PORCENTAJE_VECINOS in ["boost", "maxsim", "dist"]:
        algoritmos = ALGORITHM + "-" + PORCENTAJE_VECINOS
    elif PORCENTAJE_VECINOS != 0:
        algoritmos = "%s-%.1f" % (ALGORITHM, PORCENTAJE_VECINOS)

    titulo = MODELO + "-" + algoritmos
    if normalizar is not None:
        titulo += "-" + normalizar

    fname = sys.argv[2] + "/" + titulo + ".out"

    if os.path.isfile(fname):
        return

    print(titulo)
    print("-" * 20)

    if PORCENTAJE_VECINOS == 0:
        X = coseno
        if MODELO == "dbscan":
            # Only works for cosine!
            X = 1 - X
    else:
        neighbour_file_name = sys.argv[2] + "/" + ALGORITHM + ".npy"
        if os.path.isfile(neighbour_file_name):
            NEIGHBOURS = np.load(neighbour_file_name)
        else:
            print("Calculando vecinos")
            NEIGHBOURS = np.zeros((len(service_number), len(service_number)))
            for i in range(0, len(service_number)):
                for j in range(i, len(service_number)):
                    NEIGHBOURS[i][j] = vecinos(followers, users, i, j)
                    if i != j:
                        NEIGHBOURS[j][i] = NEIGHBOURS[i][j]
            np.save(neighbour_file_name, NEIGHBOURS)

        if normalizar is not None:
            print("Normalizando Vecinos")
            if normalizar == 'minmax':
                NEIGHBOURS = preprocessing.minmax_scale(NEIGHBOURS)
            elif normalizar == 'scale':
                NEIGHBOURS = preprocessing.scale(NEIGHBOURS)
            elif normalizar == 'robust':
                NEIGHBOURS = preprocessing.robust_scale(NEIGHBOURS)
            elif normalizar == 'softmax':
                NEIGHBOURS = np.exp(NEIGHBOURS) / np.sum(np.exp(NEIGHBOURS), axis=1, keepdims=True)
            elif normalizar == 'matrixminmax':
                NEIGHBOURS = (NEIGHBOURS - np.min(NEIGHBOURS)) / (np.max(NEIGHBOURS) - np.min(NEIGHBOURS))
            elif normalizar == 'matrixmax':
                NEIGHBOURS = NEIGHBOURS / np.max(NEIGHBOURS)
        if MODELO == "dbscan":  # Si es distancia
            if normalizar is not None:
                NEIGHBOURS = 1 - NEIGHBOURS
            else:
                NEIGHBOURS = - NEIGHBOURS
            X = (1 - PORCENTAJE_VECINOS) * (1 - coseno) + PORCENTAJE_VECINOS * NEIGHBOURS
        else:  # if it is an affinity
            if PORCENTAJE_VECINOS == "boost":
                X = np.multiply(coseno, NEIGHBOURS)
            elif PORCENTAJE_VECINOS == "maxsim":
                X = np.maximum(coseno, NEIGHBOURS)
            elif PORCENTAJE_VECINOS == "dist":
                NEIGHBOURS_SORTED = np.argsort(np.argsort(NEIGHBOURS))
                COSINE_SORTED = np.argsort(np.argsort(coseno))
                POS_BOOST = np.log(1 / (1 + np.abs(NEIGHBOURS_SORTED - COSINE_SORTED)))
                X = POS_BOOST
            else:
                X = (1 - PORCENTAJE_VECINOS) * coseno + PORCENTAJE_VECINOS * NEIGHBOURS

    print("Generando Modelo")

    if MODELO == 'kmedoids':
        model = KMedoids(n_clusters=1500).fit(X)
    if MODELO == 'kmedoids470':
        model = KMedoids(n_clusters=470).fit(X)
    elif MODELO == 'ap':
        model = AffinityPropagation(affinity='precomputed').fit(X)
    elif MODELO == 'dbscan':
        model = DBSCAN(metric='precomputed').fit(X)

    labels = model.labels_

    clusters = defaultdict(list)
    for index, classif in enumerate(labels):
        clusters[classif].append(index)

    n_clusters_ = len(clusters)

    info = ""
    info += 'Clusters: %d\n' % n_clusters_
    # info += 'Cohesiveness: %0.3f\n' % cohesiveness(X, labels)
    info += 'Entropy: %0.3f\n' % entropy(labels_true, labels)
    info += "Homogeneity: %0.3f\n" % metrics.homogeneity_score(labels_true, labels)
    info += "Completeness: %0.3f\n" % metrics.completeness_score(labels_true, labels)
    info += "V-measure: %0.3f\n" % metrics.v_measure_score(labels_true, labels)
    info += 'Purity: %0.3f\n' % purity(labels_true, labels)
    info += "F-Measure: %0.3f\n" % fmeasure(labels_true, labels)
    info += "Adjusted Rand Index: %0.3f\n" % metrics.adjusted_rand_score(labels_true, labels)
    info += "Adjusted Mutual Information: %0.3f\n" % metrics.adjusted_mutual_info_score(labels_true, labels)

    clustersize = Counter(labels)

    salida = open(fname, 'w', encoding='UTF-8')

    print(info)

    salida.write(titulo + "\n")
    for cluster, services in clusters.items():
        countcat = Counter([labels_true[svc] for svc in services])
        max_key, num = countcat.most_common(1)[0]
        salida.write("%i (%s - %i/%i): %s \n" % (
            cluster, max_key, num, clustersize[cluster], ",".join([service_list[svc] for svc in services])))
    salida.write("-" * 20 + "\n")
    salida.write(info)
    salida.close()
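The script above calls entropy(), purity() and fmeasure(), none of which appear in this excerpt. A minimal sketch of plausible purity and weighted cluster-entropy helpers, assuming both label arrays are 1-D and integer- or string-coded (fmeasure has several competing definitions and is not reconstructed here):

# Plausible stand-ins (not the original helpers) built from the contingency matrix.
import numpy as np
from sklearn.metrics.cluster import contingency_matrix

def purity(labels_true, labels_pred):
    # fraction of samples that fall into their cluster's majority class
    cm = contingency_matrix(labels_true, labels_pred)
    return np.sum(np.max(cm, axis=0)) / np.sum(cm)

def entropy(labels_true, labels_pred):
    # class entropy of each cluster, weighted by cluster size
    cm = contingency_matrix(labels_true, labels_pred).astype(float)
    cluster_sizes = cm.sum(axis=0)
    probs = cm / cluster_sizes
    with np.errstate(divide='ignore', invalid='ignore'):
        per_cluster = -np.nansum(probs * np.log2(probs), axis=0)
    return float(np.sum(per_cluster * cluster_sizes) / cm.sum())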
    def kmeans_model(self, test_size, random_state, show=None):
        # pre-process the data
        standardized_data = scale(self.data)

        # splitting the data into training and testing sets
        # typically 3/4 of the data is used to train, 1/4 of the data is used to test
        # x holds the feature vectors; y holds the corresponding target values
        x_train, x_test, y_train, y_test, images_train, images_test = train_test_split(standardized_data, self.target,
                                                                                       self.images,
                                                                                       test_size=test_size,
                                                                                       random_state=random_state)
        # get the number of training samples and features
        n_samples, n_features = x_train.shape

        # print out the number of samples and features
        print("# of training samples: ", n_samples)
        print("# of training features: ", n_features)

        # n_digits is the number of unique target classes
        n_digits = len(np.unique(y_train))

        # create the KMeans model.
        # init defaults to 'k-means++'
        # n_init (default 10) controls how many different centroid seedings the algorithm tries
        clf = cluster.KMeans(init='k-means++', n_clusters=n_digits, random_state=random_state)

        # fit the x_train data to the model
        clf.fit(x_train)

        if show:
            # create the figure with a size of 8x4 inches
            fig = plt.figure(figsize=(8, 4))

            # Add title
            fig.suptitle('Cluster Center Images', fontsize=14, fontweight='bold')

            # For all labels (0-9)
            for i in range(10):
                # Initialize subplots in a grid of 2X5, at i+1th position
                ax = fig.add_subplot(2, 5, 1 + i)
                # Display images
                ax.imshow(clf.cluster_centers_[i].reshape((8, 8)), cmap=plt.cm.binary, interpolation="nearest")
                # Don't show the axes
                plt.axis('off')

            # Show the plot
            plt.show()

        # predict the labels for x_test
        y_pred = clf.predict(x_test)

        # print out the first 50 predicted and test values
        print("Predicted Values:\n",y_pred[:50])
        print("Target Values:\n",y_test[:50])
        print("Shape of Data:\n",clf.cluster_centers_.shape)

        # Create an isomap and fit the `digits` data to it
        x_iso = Isomap(n_neighbors=10).fit_transform(x_train)

        # Compute cluster centers and predict cluster index for each sample
        clusters = clf.fit_predict(x_train)

        if show:
            # Create a plot with subplots in a grid of 1X2
            fig = plt.figure(1, (8, 4))
            gs = gridspec.GridSpec(1, 2)
            ax = [fig.add_subplot(ss) for ss in gs]

            # Adjust layout
            fig.suptitle('Predicted Versus Training Labels(ISOMAP)', fontsize=14, fontweight='bold')

            # Add scatterplots to the subplots
            ax[0].scatter(x_iso[:, 0], x_iso[:, 1], c=clusters, edgecolors='black')
            ax[0].set_title('Predicted Training Labels')
            ax[1].scatter(x_iso[:, 0], x_iso[:, 1], c=y_train, edgecolors='black')
            ax[1].set_title('Actual Training Labels')

            gs.tight_layout(fig, rect=[0, 0.03, 1, 0.95])

            # Show the plots
            plt.show()

        # Project the `digits` data onto its first two principal components
        x_pca = PCA(n_components=2).fit_transform(x_train)

        # Compute cluster centers and predict cluster index for each sample
        clusters = clf.fit_predict(x_train)

        if show:
            # Create a plot with subplots in a grid of 1X2
            fig = plt.figure(1, (8, 4))
            gs = gridspec.GridSpec(1, 2)
            ax = [fig.add_subplot(ss) for ss in gs]

            # Adjust layout
            fig.suptitle('Predicted Versus Training Labels (PCA)', fontsize=14, fontweight='bold')
            fig.subplots_adjust(top=0.85)

            # Add scatterplots to the subplots
            ax[0].scatter(x_pca[:, 0], x_pca[:, 1], c=clusters, edgecolors='black')
            ax[0].set_title('Predicted Training Labels')
            ax[1].scatter(x_pca[:, 0], x_pca[:, 1], c=y_train, edgecolors='black')
            ax[1].set_title('Actual Training Labels')

            gs.tight_layout(fig, rect=[0, 0.03, 1, 0.95])

        if show:
            # Show the plots
            plt.show()

        # Print the classification report and confusion matrix to see where the model goes wrong
        print("Classification Report:\n",metrics.classification_report(y_test, y_pred))
        print("Confusion Matrix:\n",metrics.confusion_matrix(y_test, y_pred))

        # These scores show that plain k-means is a poor fit for this problem,
        # so a different model should be considered for this data
        print('% 9s' % 'inertia    homo   compl  v-meas     ARI AMI  silhouette')
        print('%i   %.3f   %.3f   %.3f   %.3f   %.3f    %.3f'
              % (clf.inertia_,
                 homogeneity_score(y_test, y_pred),
                 completeness_score(y_test, y_pred),
                 v_measure_score(y_test, y_pred),
                 adjusted_rand_score(y_test, y_pred),
                 adjusted_mutual_info_score(y_test, y_pred),
                 silhouette_score(x_test, y_pred, metric='euclidean')))
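The kmeans_model() method above relies on a number of imports that do not appear in this excerpt. A plausible set, assuming scikit-learn and matplotlib under their usual names:

# Imports the method above appears to rely on (not shown in the excerpt).
import numpy as np
import matplotlib.pyplot as plt
from matplotlib import gridspec
from sklearn import cluster, metrics
from sklearn.preprocessing import scale
from sklearn.model_selection import train_test_split
from sklearn.manifold import Isomap
from sklearn.decomposition import PCA
from sklearn.metrics import (homogeneity_score, completeness_score,
                             v_measure_score, adjusted_rand_score,
                             adjusted_mutual_info_score, silhouette_score)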
Example #45
0
n_samples, n_features = data.shape

print("\t n_samples %d, \t n_features %d" % (n_samples, n_features))

print(82 * '_')
print('init\t\ttime\tinertia\thomo\tcompl\tv-meas\tARI\tAMI\tsilhouette')

t0 = time.time()
kmeans = KMeans(init='random', n_clusters=10, n_init=10)
kmeans.fit(data)
print('%-9s\t%.2fs\t%i\t%.3f\t%.3f\t%.3f\t%.3f\t%.3f\t%.3f' %
      ('Random', (time.time() - t0), kmeans.inertia_,
       metrics.homogeneity_score(labels, kmeans.labels_),
       metrics.completeness_score(labels, kmeans.labels_),
       metrics.v_measure_score(labels, kmeans.labels_),
       metrics.adjusted_rand_score(labels, kmeans.labels_),
       metrics.adjusted_mutual_info_score(
           labels, kmeans.labels_, average_method='arithmetic'),
       metrics.silhouette_score(
           data, kmeans.labels_, metric='euclidean', sample_size=sample_size)))

print(82 * '_')

# Visualize the results on PCA-reduced data - random_raw

reduced_data = PCA(n_components=2).fit_transform(data)
kmeans = KMeans(init='random', n_clusters=10, n_init=10)
kmeans.fit(reduced_data)

# Step size of the mesh. Decrease to increase the quality of the VQ.
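The example is cut off at this point. In the scikit-learn digits benchmark this code follows, the usual continuation builds a mesh over the PCA plane, predicts the cluster of every grid point and plots the resulting regions; a sketch under that assumption, reusing reduced_data and kmeans from the lines above:

import numpy as np
import matplotlib.pyplot as plt

h = .02  # mesh step; smaller values give a finer picture of the regions
x_min, x_max = reduced_data[:, 0].min() - 1, reduced_data[:, 0].max() + 1
y_min, y_max = reduced_data[:, 1].min() - 1, reduced_data[:, 1].max() + 1
xx, yy = np.meshgrid(np.arange(x_min, x_max, h), np.arange(y_min, y_max, h))

# label every mesh point with its nearest centroid and show the regions
Z = kmeans.predict(np.c_[xx.ravel(), yy.ravel()]).reshape(xx.shape)
plt.figure()
plt.imshow(Z, interpolation='nearest',
           extent=(xx.min(), xx.max(), yy.min(), yy.max()),
           cmap=plt.cm.Paired, aspect='auto', origin='lower')
plt.plot(reduced_data[:, 0], reduced_data[:, 1], 'k.', markersize=2)
centroids = kmeans.cluster_centers_
plt.scatter(centroids[:, 0], centroids[:, 1], marker='x', s=169,
            linewidths=3, color='w', zorder=10)
plt.title('K-means clustering on PCA-reduced data')
plt.show()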
Example #46
0
    # x_data = x_data[:,params]
    # print(x_data.shape)

    km = KMeans(n_clusters=3)
    km.fit(x_data)
    # Cluster assignment for each sample
    predict_pre = km.labels_
    print("===========================================")
    print("Clustering result:")
    print(predict_pre)

    # Rand index: measures how well two label assignments agree
    print("Adjusted Rand index: " + str(metrics.adjusted_rand_score(y_data, predict_pre)))
    # V-measure
    print("Homogeneity: " + str(metrics.homogeneity_score(y_data, predict_pre)))
    print("Completeness: " + str(metrics.completeness_score(y_data, predict_pre)))
    print("V-measure (harmonic mean of the two): " + str(metrics.v_measure_score(y_data, predict_pre)))
    # Silhouette coefficient
    print("Silhouette coefficient: " + str(metrics.silhouette_score(x_data, predict_pre)))

    # Build a list of colours, one per cluster
    color = ['orange', 'green', 'blue']
    # Map each predicted cluster to its colour
    colr1 = [color[i] for i in predict_pre]
    plt.scatter(x_data[:, 1], x_data[:, 2], color=colr1)
    plt.xlabel("Feature 1")
    plt.ylabel("Feature 2")
    plt.show()
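As the comment above notes, the V-measure is the harmonic mean of homogeneity and completeness (with the default beta=1); a quick sanity check against sklearn, reusing y_data and predict_pre from the snippet:

from sklearn import metrics

h = metrics.homogeneity_score(y_data, predict_pre)
c = metrics.completeness_score(y_data, predict_pre)
v_manual = 2 * h * c / (h + c) if (h + c) > 0 else 0.0
print("manual V-measure:", v_manual)
print("sklearn V-measure:", metrics.v_measure_score(y_data, predict_pre))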


Example #47
0
    plt.savefig(path.join(PLOT_DIR, abbrev + "_em-nmf_scatter.png"),
                bbox_inches='tight')
    plt.show()
    plt.close()

    # parallel coordinates plot
    print("# Parallel Coordinates Plot for " + label)
    visualizer = ParallelCoordinates(features=feature_names,
                                     sample=0.1,
                                     shuffle=True,
                                     fast=True)
    visualizer.fit_transform(X, y_pred)
    visualizer.ax.set_xticklabels(visualizer.ax.get_xticklabels(),
                                  rotation=45,
                                  horizontalalignment='right')
    visualizer.finalize()
    plt.savefig(path.join(PLOT_DIR, abbrev + "_em-nmf_parallel.png"),
                bbox_inches='tight')
    visualizer.show()
    plt.close()

    # compare with ground truth (classes)
    print(label + ": Homogeneity Score = " +
          str(metrics.homogeneity_score(y, y_pred)))
    print(label + ": V Measure Score = " +
          str(metrics.v_measure_score(y, y_pred)))
    print(label + ": Mutual Info Score = " +
          str(metrics.mutual_info_score(y, y_pred)))
    print(label + ": Adjusted Rand Index = " +
          str(metrics.adjusted_rand_score(y, y_pred)))
                 n_jobs=None,
                 p=None)
y_method1 = cluster.fit_predict(X_pca)

labels = cluster.labels_

noOfClusters = len(set(labels))
print(noOfClusters)

print("Method1: ", Counter(y_method1))

s_score = silhouette_score(X_pca, y_method1)
print(s_score)

# avoid shadowing the imported metric functions
homogeneity = homogeneity_score(target, labels)
v_measure = v_measure_score(target, labels, beta=20.0)
completeness = completeness_score(target, labels)
error_matrix = contingency_matrix(target, labels)

print("homogeneity_score: ", homogeneity)
print("v_measure_score: ", v_measure)
print("completeness_score: ", completeness)
print("contingency_matrix: ", error_matrix)

print(datetime.now() - startTime)

#DBSCAN Guiding Question 3:
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder, LabelEncoder, scale
from sklearn.decomposition import PCA
Example #49
0
#*****************************calculation**************************************
num_cluster = 3
clusters = tfidf_kmeans(TF_X, k=num_cluster)
#print(len(clusters))
#print(len(labels))

print(82 * '_')
print(
    'init\t\ttime\thomo\tcompl\tv-meas\tARI\tAMI\tkappa\tcorr\tsilh_Clus\tsilh_HMN'
)
print('%-9s\t%.2fs\t%.3f\t%.3f\t%.3f\t%.3f\t%.3f\t%.3f\t%-9s\t%.3f\t%.3f' % (
    name,
    (time() - t0),
    metrics.homogeneity_score(labels, clusters),
    metrics.completeness_score(labels, clusters),
    metrics.v_measure_score(labels, clusters),
    metrics.adjusted_rand_score(labels, clusters),
    metrics.adjusted_mutual_info_score(labels, clusters),
    metrics.cohen_kappa_score(labels, clusters, weights='linear'),
    str(spearmanr(labels, clusters)),
    metrics.silhouette_score(TF_X, clusters, metric='euclidean'),
    metrics.silhouette_score(TF_X, labels, metric='euclidean'),
))

#**************************error analysis**************************************
from sklearn.metrics.cluster import contingency_matrix
x = labels  #actual labels
y = clusters  #predicted labels
error_analysis = contingency_matrix(x, y)
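One optional way to make the contingency matrix easier to read during error analysis is to label its rows and columns, assuming pandas is available; rows are the true labels, columns the predicted clusters:

import pandas as pd

cm_df = pd.DataFrame(error_analysis,
                     index=sorted(set(x)),      # true labels
                     columns=sorted(set(y)))    # predicted clusters
print(cm_df)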
#***************************plot************************************************
from sklearn.metrics.pairwise import cosine_similarity
Example #50
0
    # report timing
    printlog('\t Time taken = {0} s'.format(end - start))

    # ----------------------------------------------------------------------
    # stats
    # Number of clusters in labels, ignoring noise if present.
    n_clusters = len(set(labels)) - (1 if -1 in labels else 0)
    n_clusters_true = len(set(labels_true)) - (1 if -1 in labels_true else 0)

    printlog('\t Estimated number of clusters: {0}'.format(n_clusters))
    # print stats
    args = [labels_true, labels]
    pargs = [
        metrics.homogeneity_score(*args),
        metrics.completeness_score(*args),
        metrics.v_measure_score(*args),
        metrics.adjusted_rand_score(*args),
        metrics.adjusted_mutual_info_score(*args)
    ]
    printlog("\t Homogeneity: {0:.3f}\n\t Completeness: {1:.3f}"
             "\n\t V-measure: {2:.3f}\n\t Adjusted Rand Index: {3:.3f}"
             "\n\t Adjusted Mutual Information: {4:.3f}".format(*pargs))

    # ----------------------------------------------------------------------
    # comparing results
    printlog('Comparing results...')
    merged = compare_results(groups, labels_true, labels)

    # ----------------------------------------------------------------------
    # Plot result
    printlog('Plotting graphs...')
Example #51
0
                             stop_words='english',
                             use_idf=True,
                             smooth_idf=True,
                             norm='l2')
#vectorizer = TfidfVectorizer(max_df=0.15, min_df=1, stop_words=stopwords, use_idf=True, smooth_idf=True, norm='l2')
X_test = vectorizer.fit_transform(test_data)

km = KMeans(n_clusters=20, init='k-means++', max_iter=100, n_init=5)
km.fit(X_test)
print(len(km.labels_), len(test_data))

score = metrics.homogeneity_score(test_labels, km.labels_)
print("homogeneity_score:   %0.3f" % score)
score = metrics.completeness_score(test_labels, km.labels_)
print("completeness_score:   %0.3f" % score)
score = metrics.v_measure_score(test_labels, km.labels_)
print("v_measure_score:   %0.3f" % score)
score = metrics.accuracy_score(test_labels, km.labels_)
print("acc_score:   %0.3f" % score)
score = metrics.normalized_mutual_info_score(test_labels, km.labels_)
print("nmi_score:   %0.3f" % score)

file = open(
    "D:/PhD/dr.norbert/dataset/shorttext/biomedical/semisupervised/biomedicalraw_ensembele_traintest",
    "w")

for i in range(len(train_labels)):
    file.write(train_labels[i] + "\t" + train_trueLabels[i] + "\t" +
               train_data[i])

for i in range(len(km.labels_)):
if opts.minibatch:
    km = MiniBatchKMeans(n_clusters=true_k,
                         init='k-means++',
                         n_init=1,
                         init_size=1000,
                         batch_size=1000,
                         verbose=opts.verbose)
else:
    km = KMeans(n_clusters=true_k,
                init='k-means++',
                max_iter=100,
                n_init=1,
                verbose=opts.verbose)

print("Clustering sparse data with %s" % km)
t0 = time()
km.fit(X)
print("done in %0.3fs" % (time() - t0))
print()

print("Homogeneity: %0.3f" % metrics.homogeneity_score(labels, km.labels_))
print("Completeness: %0.3f" % metrics.completeness_score(labels, km.labels_))
print("V-measure: %0.3f" % metrics.v_measure_score(labels, km.labels_))
print("Adjusted Rand-Index: %.3f" %
      metrics.adjusted_rand_score(labels, km.labels_))
print("Silhouette Coefficient: %0.3f" %
      metrics.silhouette_score(X, labels, sample_size=1000))

print()
Example #53
0
bestChromosomeInAllGenerations, bestLabelsPredInAllGenerations, bestFitnessInAllGenerations, allBestFitness = EvoNP.run(
    points, nPoints, k, nChromosomes, nGenerations, crossoverProbability,
    mutationProbability)

print("HS: " + str(
    float("%0.2f" % metrics.homogeneity_score(
        labelsTrue,
        bestLabelsPredInAllGenerations[bestChromosomeInAllGenerations]))))
print("CS: " + str(
    float("%0.2f" % metrics.completeness_score(
        labelsTrue,
        bestLabelsPredInAllGenerations[bestChromosomeInAllGenerations]))))
print("VM: " + str(
    float("%0.2f" % metrics.v_measure_score(
        labelsTrue,
        bestLabelsPredInAllGenerations[bestChromosomeInAllGenerations]))))
print("AMI: " + str(
    float("%0.2f" % metrics.adjusted_mutual_info_score(
        labelsTrue,
        bestLabelsPredInAllGenerations[bestChromosomeInAllGenerations]))))
print("ARI: " + str(
    float("%0.2f" % metrics.adjusted_rand_score(
        labelsTrue,
        bestLabelsPredInAllGenerations[bestChromosomeInAllGenerations]))))

# plot fitness progression
allGenerations = [x + 1 for x in range(nGenerations)]
plt.plot(allGenerations, allBestFitness)
plt.title(filename[:-4])
plt.xlabel('Generations')
Example #54
0
    titles = 'Original data', 'KMeans++ clustering', 'Rotated data', 'KMeans++ on rotated data',\
             'Unequal-variance data', 'KMeans++ on unequal variances', 'Unequal-size data', 'KMeans++ on unequal sizes'

    model = KMeans(n_clusters=4, init='k-means++', n_init=5)
    plt.figure(figsize=(8, 9), facecolor='w')
    for i, (x, y, title) in enumerate(zip(data_list, y_list, titles), start=1):
        plt.subplot(4, 2, i)
        plt.title(title)
        if i % 2 == 1:
            y_pred = y
        else:
            y_pred = model.fit_predict(x)
        print(i)
        print('Homogeneity:', homogeneity_score(y, y_pred))
        print('completeness:', completeness_score(y, y_pred))
        print('V measure:', v_measure_score(y, y_pred))
        print('AMI:', adjusted_mutual_info_score(y, y_pred))
        print('ARI:', adjusted_rand_score(y, y_pred))
        print('Silhouette:', silhouette_score(x, y_pred), '\n')
        plt.scatter(x[:, 0], x[:, 1], c=y_pred, s=30, cmap=cm, edgecolors='none')
        x1_min, x2_min = np.min(x, axis=0)
        x1_max, x2_max = np.max(x, axis=0)
        x1_min, x1_max = expand(x1_min, x1_max)
        x2_min, x2_max = expand(x2_min, x2_max)
        plt.xlim((x1_min, x1_max))
        plt.ylim((x2_min, x2_max))
        plt.grid(b=True, ls=':')
    plt.tight_layout(2, rect=(0, 0, 1, 0.97))
    plt.suptitle('Effect of data distribution on KMeans clustering', fontsize=18)
    plt.show()
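The demo above calls expand() to pad the axis limits, but that helper is not part of this excerpt. A plausible implementation that widens the range by a fixed ratio on each side:

def expand(vmin, vmax, ratio=0.05):
    # pad an axis range by `ratio` of its width on both sides
    pad = (vmax - vmin) * ratio
    return vmin - pad, vmax + pad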
Example #55
0
# bench_k_means(KMeans(init='random', n_clusters=n_digits, n_init=10),name="random", data=data)
for i in range(5):
    bench_k_means(GaussianMixture(n_components=n_digits_i[i], random_state=0),
                  name="GaussianMixture",
                  data=PCA_data_trans)

#ICA----------------------------------
kmeans_ICA = KMeans(n_clusters=2, random_state=0).fit(ICA_data_trans)
float(sum(kmeans_ICA.labels_ == labels)) / float(len(labels))
metrics.homogeneity_score(labels, kmeans_ICA.labels_)
EMax_ICA = GaussianMixture(n_components=2, random_state=0).fit(ICA_data_trans)
EMax_ICA.labels_ = EMax_ICA.predict(ICA_data_trans)
float(sum(EMax_ICA.labels_ == labels)) / float(len(labels))
metrics.homogeneity_score(labels, EMax_ICA.labels_)
metrics.completeness_score(labels, EMax_ICA.labels_)
metrics.v_measure_score(labels, EMax_ICA.labels_)
metrics.adjusted_rand_score(labels, EMax_ICA.labels_)
metrics.adjusted_mutual_info_score(labels, EMax_ICA.labels_)
metrics.silhouette_score(ICA_data_trans,
                         EMax_ICA.labels_,
                         metric='euclidean',
                         sample_size=sample_size)

n_digits_i = [2, 5, 10, 20, 50]
for i in range(5):
    bench_k_means(KMeans(init='k-means++', n_clusters=n_digits_i[i],
                         n_init=10),
                  name="k-means++",
                  data=ICA_data_trans)
# bench_k_means(KMeans(init='random', n_clusters=n_digits, n_init=10),name="random", data=data)
for i in range(5):
Example #56
0
def em(tx, ty, rx, ry, reduced_data, add="", times=5, dataset="", alg=""):
    clf = EM(n_components=times)
    clf.fit(reduced_data)
    # Step size of the mesh. Decrease to increase the quality of the VQ.
    h = .02     # point in the mesh [x_min, x_max]x[y_min, y_max].
    x_min, x_max = reduced_data[:, 0].min() - 1, reduced_data[:, 0].max() + 1
    y_min, y_max = reduced_data[:, 1].min() - 1, reduced_data[:, 1].max() + 1
    xx, yy = np.meshgrid(np.arange(x_min, x_max, h), np.arange(y_min, y_max, h))
    Z = clf.predict(np.c_[xx.ravel(), yy.ravel()])
    Z = Z.reshape(xx.shape)
    plt.figure()
    plt.clf()
    plt.imshow(Z, interpolation='nearest',
               extent=(xx.min(), xx.max(), yy.min(), yy.max()),
               cmap=plt.cm.Paired,
               aspect='auto', origin='lower')

    plt.plot(reduced_data[:, 0], reduced_data[:, 1], 'k.', markersize=2)
    # centroids = clf.cluster_centers_
    # plt.scatter(centroids[:, 0], centroids[:, 1],
    #             marker='x', s=169, linewidths=3,
    #             color='w', zorder=10)
    plt.title(dataset + ': EM clustering (' + alg + '-reduced data)')
    plt.xlim(x_min, x_max)
    plt.ylim(y_min, y_max)
    plt.xticks(())
    plt.yticks(())
    plt.show()

    clf = EM(n_components=times)
    clf.fit(tx)  #fit it to our data
    test = clf.predict(tx)
    result = clf.predict(rx)
    checker = EM(n_components=times)
    ry = ry.reshape(-1,1)
    checker.fit(ry)
    truth = checker.predict(ry)
    td = np.reshape(test, (test.size, 1))
    rd = np.reshape(result, (result.size, 1))
    # newtx = np.append(td)
    # newrx = np.append(rd)
    myNN(test, ty, result, ry, alg="EM_"+alg)
    errs = []
    scores = []
    # this is what we will compare to
    checker = EM(n_components=2)
    ry = ry.reshape(-1,1)
    checker.fit(ry)
    truth = checker.predict(ry)
    adj_rand = []
    v_meas = []
    mutual_info = []
    adj_mutual_info = []
    # so we do this a bunch of times
    for i in range(2,times):
        clusters = {x:[] for x in range(i)}
        # create a clusterer
        clf = EM(n_components=i)
        clf.fit(tx)  #fit it to our data
        test = clf.predict(tx)
        result = clf.predict(rx)  # and test it on the testing set
        for index, val in enumerate(result):
            clusters[val].append(index)
        mapper = {x: round(sum(truth[v] for v in clusters[x])/float(len(clusters[x]))) if clusters[x] else 0 for x in range(i)}
        processed = [mapper[val] for val in result]
        errs.append(sum((processed-truth)**2) / float(len(ry)))
        scores.append(clf.score(tx, ty))
        adj_rand.append(metrics.adjusted_rand_score(ry.ravel(), result))
        v_meas.append(metrics.v_measure_score(ry.ravel(), result))
        mutual_info.append(metrics.fowlkes_mallows_score(ry.ravel(), result))
        adj_mutual_info.append(metrics.homogeneity_score(ry.ravel(), result))
    # plot([0, times, min(scores)-.1, max(scores)+.1],[range(2, times), scores, "-"], "Number of Clusters", "Log Likelihood", dataset+": EM Log Likelihood - " + alg, dataset+"_EM_"+alg)

    # other metrics
    # names = ["Adjusted Random", "V Measure", "Mutual Info", "Adjusted Mutual Info"]
    plt.figure()
    plt.title(dataset+": EM Clustering measures - "+alg)
    plt.xlabel('Number of clusters')
    plt.ylabel('Score value')
    plt.plot(range(2,times),adj_rand, label="Adjusted Random")
    plt.plot(range(2,times),v_meas, label="V Measure")
    plt.plot(range(2,times),mutual_info, label = "Fowlkes Mallows Score")
    plt.plot(range(2,times),adj_mutual_info, label="Homogeneity Score")
    plt.legend()
    plt.savefig("EMMetrics"+dataset+"_"+alg+".png")

    kmeans = KM(n_clusters=2)
    kmeans.fit(reduced_data)

    # Step size of the mesh. Decrease to increase the quality of the VQ.
    h = .02     # point in the mesh [x_min, x_max]x[y_min, y_max].

    # Plot the decision boundary. For that, we will assign a color to each point in the mesh.
    x_min, x_max = reduced_data[:, 0].min() - 1, reduced_data[:, 0].max() + 1
    y_min, y_max = reduced_data[:, 1].min() - 1, reduced_data[:, 1].max() + 1
    xx, yy = np.meshgrid(np.arange(x_min, x_max, h), np.arange(y_min, y_max, h))

    # Obtain labels for each point in mesh. Use last trained model.
    Z = kmeans.predict(np.c_[xx.ravel(), yy.ravel()])

    # Put the result into a color plot
    Z = Z.reshape(xx.shape)
    plt.figure()
    plt.clf()
    plt.imshow(Z, interpolation='nearest',
               extent=(xx.min(), xx.max(), yy.min(), yy.max()),
               cmap=plt.cm.Paired,
               aspect='auto', origin='lower')

    plt.plot(reduced_data[:, 0], reduced_data[:, 1], 'k.', markersize=2)
    # Plot the centroids as a white X
    centroids = kmeans.cluster_centers_
    plt.scatter(centroids[:, 0], centroids[:, 1],
                marker='x', s=169, linewidths=3,
                color='w', zorder=10)
    plt.title(dataset + ': K-means clustering (' + alg + '-reduced data)\n'
              'Centroids are marked with a white cross')
    plt.xlim(x_min, x_max)
    plt.ylim(y_min, y_max)
    plt.xticks(())
    plt.yticks(())
    plt.show()
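The em() function above uses the aliases EM and KM as well as plotting and metrics modules that are not imported in this excerpt; a plausible preamble (the project-specific myNN and plot helpers are not reconstructed here):

import numpy as np
import matplotlib.pyplot as plt
from sklearn import metrics
from sklearn.mixture import GaussianMixture as EM   # EM(n_components=...) matches GaussianMixture's API
from sklearn.cluster import KMeans as KM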
Example #57
0
def find_similar_products():
    df = pd.read_csv(
        '/Users/srinath/playground/data-science/BimboInventoryDemand/producto_tabla.csv'
    )

    labels = df['Producto_ID']
    print "have " + str(df.shape[0]) + "products"

    vectorizer = TfidfVectorizer(max_df=0.5,
                                 max_features=200,
                                 min_df=2,
                                 stop_words='english',
                                 use_idf=True)
    X = vectorizer.fit_transform(df['NombreProducto'])

    print("n_samples: %d, n_features: %d" % X.shape)

    print(type(X))
    print(X)

    print("Performing dimensionality reduction using LSA")
    # Vectorizer results are normalized, which makes KMeans behave as
    # spherical k-means for better results. Since LSA/SVD results are
    # not normalized, we have to redo the normalization.
    svd = TruncatedSVD(5)
    normalizer = Normalizer(copy=False)
    lsa = make_pipeline(svd, normalizer)

    X = lsa.fit_transform(X)

    explained_variance = svd.explained_variance_ratio_.sum()
    print("Explained variance of the SVD step: {}%".format(
        int(explained_variance * 100)))

    print("new size", X.shape)
    print(type(X))
    print(X)

    # Do the actual clustering
    km = KMeans(n_clusters=30,
                init='k-means++',
                max_iter=100,
                n_init=1,
                verbose=True)

    print("Clustering sparse data with %s" % km)
    #km.fit(X)

    results = km.fit_predict(X)
    print(len(results), results)

    print("Homogeneity: %0.3f" % metrics.homogeneity_score(labels, km.labels_))
    print("Completeness: %0.3f" %
          metrics.completeness_score(labels, km.labels_))
    print("V-measure: %0.3f" % metrics.v_measure_score(labels, km.labels_))
    print("Adjusted Rand-Index: %.3f" %
          metrics.adjusted_rand_score(labels, km.labels_))
    print("Silhouette Coefficient: %0.3f" %
          metrics.silhouette_score(X, km.labels_, sample_size=1000))

    print()

    products_clusters = np.column_stack([labels, results])
    to_saveDf = pd.DataFrame(products_clusters,
                             columns=["Producto_ID", "Cluster"])
    to_saveDf.to_csv('product_clusters.csv', index=False)

    to_saveDf['NombreProducto'] = df['NombreProducto']

    grouped = to_saveDf.groupby(['Cluster'])['NombreProducto']
    grouped.apply(print_cluster)
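print_cluster() is not defined in this excerpt; a plausible helper that reports each cluster's size together with a few example product names (grouped passes one Series of NombreProducto values per cluster):

def print_cluster(names):
    # 'names' is the Series of product names belonging to one cluster
    print("%d products: %s ..." % (len(names), ", ".join(names.head(5))))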
Example #58
0
def km(tx, ty, rx, ry, reduced_data, add="", times=5, dataset="", alg=""):
    processed = []
    adj_rand = []
    v_meas = []
    mutual_info = []
    adj_mutual_info = []
    sil = []
    inertia = []
    for i in range(2,times):
        clusters = {x:[] for x in range(i)}
        clf = KM(n_clusters=i)
        clf.fit(tx)
        test = clf.predict(tx)
        result = clf.predict(rx)

        adj_rand.append(metrics.adjusted_rand_score(ry.ravel(), result))
        v_meas.append(metrics.v_measure_score(ry.ravel(), result))
        mutual_info.append(metrics.fowlkes_mallows_score(ry.ravel(), result))
        adj_mutual_info.append(metrics.homogeneity_score(ry.ravel(), result))
        inertia.append(clf.inertia_)
    plots = [adj_rand, v_meas, mutual_info, adj_mutual_info]
    plt.title(dataset+": KM Clustering measures - "+alg)
    plt.xlabel('Number of clusters')
    plt.ylabel('Score value')
    plt.plot(range(2,times), adj_rand, label="Adjusted Random")
    plt.plot(range(2,times), v_meas, label="V Measure")
    plt.plot(range(2,times), mutual_info, label = "Fowlkes Mallows Score")
    plt.plot(range(2,times), adj_mutual_info, label="Homogeneity Score")
    plt.legend()
    plt.ylim(ymin=-0.05, ymax=1.05)
    plt.savefig("KMeansMetric"+dataset+"_"+alg+".png")

    plt.figure()
    plt.title(dataset+": KMeans Inertia - "+alg)
    plt.xlabel('Number of clusters')
    plt.ylabel('Inertia')
    plt.plot(range(2,times), inertia)
    plt.savefig("KM-Inertia-"+dataset+"-"+alg+".png")

    td = np.reshape(test, (test.size, 1))
    rd = np.reshape(result, (result.size, 1))
    newtx = np.append(tx, td, 1)
    newrx = np.append(rx, rd, 1)

    h = .02     # point in the mesh [x_min, x_max]x[y_min, y_max].
    # Plot the decision boundary. For that, we will assign a color to each point in the mesh.
    x_min, x_max = X[:, 0].min() - 1, X[:, 0].max() + 1
    y_min, y_max = X[:, 1].min() - 1, X[:, 1].max() + 1
    best_clusterer = KM(n_clusters=4)
    best_clusterer.fit(X)
    Z = best_clusterer.predict(X)
    print(len(Z))
    print(len(X))
    plt.figure(1)
    plt.clf()
    colors = ['r', 'g', 'b', 'y', 'c', 'm','#eeefff', '#317c15', '#4479b4', '#6b2b9c',
'#63133b', '#6c0d22', '#0c7c8c', '#67c50e','#c5670e', '#946c47', '#58902a', '#54b4e4',
'#e4549e', '#2b2e85'  ]
    for i in range(0, len(X)):
        plt.plot(X[i][0], X[i][1], marker='.', color=colors[Z[i]], markersize=2)
    #plt.plot(X[:, 0], X[:, 1], 'k.', markersize=2)
    # Plot the centroids as a white X
    centroids = best_clusterer.cluster_centers_
    plt.scatter(centroids[:, 0], centroids[:, 1],
                marker='x', s=169, linewidths=3,
                color='k', zorder=10)
    plt.title('K-means Clusters ' + alg)
    plt.xlim(x_min, x_max)
    plt.ylim(y_min, y_max)
    plt.xticks(())
    plt.yticks(())
    plt.show()
    kmeans = KM(n_clusters=3)
    kmeans.fit(tx)
    result=pd.DataFrame(kmeans.transform(tx), columns=['KM%i' % i for i in range(3)])
    my_color = pd.Series(ty).astype('category').cat.codes
    fig = plt.figure()
    ax = fig.add_subplot(111, projection='3d')
    ax.scatter(result['KM0'], result['KM1'], result['KM2'], c=my_color, cmap="Dark2_r", s=60)
    plt.show()
    reduced_data = PCA(n_components=2).fit_transform(tx)
    kmeans = KM(n_clusters=4)
    kmeans.fit(reduced_data)
    # Step size of the mesh. Decrease to increase the quality of the VQ.
    h = .02     # point in the mesh [x_min, x_max]x[y_min, y_max].
    x_min, x_max = reduced_data[:, 0].min() - 1, reduced_data[:, 0].max() + 1
    y_min, y_max = reduced_data[:, 1].min() - 1, reduced_data[:, 1].max() + 1
    xx, yy = np.meshgrid(np.arange(x_min, x_max, h), np.arange(y_min, y_max, h))
    Z = kmeans.predict(np.c_[xx.ravel(), yy.ravel()])
    Z = Z.reshape(xx.shape)
    plt.figure()
    plt.clf()
    plt.imshow(Z, interpolation='nearest',
               extent=(xx.min(), xx.max(), yy.min(), yy.max()),
               cmap=plt.cm.Paired,
               aspect='auto', origin='lower')

    plt.plot(reduced_data[:, 0], reduced_data[:, 1], 'k.', markersize=2)
    centroids = kmeans.cluster_centers_
    plt.scatter(centroids[:, 0], centroids[:, 1],
                marker='x', s=169, linewidths=3,
                color='w', zorder=10)
    plt.title(dataset + ': K-means clustering (' + alg + '-reduced data)\n'
              'Centroids are marked with a white cross')
    plt.xlim(x_min, x_max)
    plt.ylim(y_min, y_max)
    plt.xticks(())
    plt.yticks(())
    plt.show()

    checker = KM(n_clusters=2)
    ry = ry.reshape(-1,1)
    checker.fit(ry)
    truth = checker.predict(ry)
    clusters = {x:[] for x in range(4)}
    clf = KM(n_clusters=4)
    clf.fit(tx)  #fit it to our data
    test = clf.predict(tx)
    result = clf.predict(rx)  # and test it on the testing set
    for index, val in enumerate(result):
        clusters[val].append(index)
    mapper = {x: round(sum(truth[v] for v in clusters[x])/float(len(clusters[x]))) if clusters[x] else 0 for x in range(4)}
    processed = [mapper[val] for val in result]
    print(sum((processed-truth)**2) / float(len(ry)))
    clf = KM(n_clusters=times)
    clf.fit(tx)  #fit it to our data
    test = clf.predict(tx)
    result = clf.predict(rx)
    checker = KM(n_clusters=times)
    ry = ry.reshape(-1,1)
    checker.fit(ry)
    truth = checker.predict(ry)
    td = np.reshape(test, (test.size, 1))
    rd = np.reshape(result, (result.size, 1))
    newtx = np.append(tx, td, 1)
    newrx = np.append(rx, rd, 1)
    myNN(test, ty, result, ry, alg="KM_"+alg)
    nn(newtx, ty, newrx, ry, add="onKM"+add)
Example #59
0
km = categ.fit_predict(X)

#Agglomerative Clustering Technique
'''
from sklearn.cluster import AgglomerativeClustering
categ = AgglomerativeClustering(n_clusters = 3, affinity = 'euclidean', linkage = 'ward')
km = categ.fit_predict(X)
'''
#Confusion matrix

from sklearn.metrics import confusion_matrix
cm = confusion_matrix(Y, km)

#Accuracy score (V-measure is reported here)
from sklearn.metrics import v_measure_score
print("v_measure_score", v_measure_score(Y, km))
#print("accuracy_score  = {:.2f}%".format(accuracy_score(Y,km)*100))

#Visualising the cluster
'''
plt.scatter(X[Y == 0,0], X[Y == 0,1], s=100, c='red', label = 'cluster 1')
plt.scatter(X[Y == 1,0], X[Y == 1,1], s=100, c='green', label = 'cluster 2')
plt.scatter(X[Y == 2,0], X[Y == 2,1], s=100, c='black', label = 'cluster 3')
#plt.scatter(y_pred.cluster_centers_[:,0], y_pred.cluster_centers_[:,1], s = 300, c='yellow')
plt.title('K-mean_cluster(real)')
plt.xlabel('pc1')
plt.ylabel('pc2')
plt.show()

#Visualising the cluster
plt.scatter(X[km == 0,0], X[km == 0,1], s=100, c='red', label = 'cluster 1')
km_sse= []
km_silhouette = []
km_vmeasure =[]
km_ami = []
km_homogeneity = []
km_completeness = []

cluster_range = (2,12)

for i in range(cluster_range[0],cluster_range[1]):
    km = KMeans(n_clusters=i, random_state=0).fit(X_random_proj)
    preds = km.predict(X_random_proj)
    km_sse.append(-km.score(X_random_proj))
    km_silhouette.append(silhouette_score(X_random_proj,preds))
    km_vmeasure.append(v_measure_score(y,preds))
    km_ami.append(adjusted_mutual_info_score(y,preds))
    km_homogeneity.append(homogeneity_score(y,preds))
    km_completeness.append(completeness_score(y,preds))
    print(f"Done for cluster {i}")


# In[100]:


plt.figure(figsize=(21,11))

#SSE
plt.subplot(2,3,1)
plt.plot([i for i in range(cluster_range[0],cluster_range[1])],km_sse,'b-o',linewidth=3,markersize=12)
plt.grid(True)