def kmeans(input_file, n_clusters, Output): lvltrace.lvltrace("LVLEntree dans kmeans unsupervised") ncol=tools.file_col_coma(input_file) data = np.loadtxt(input_file, delimiter=',', usecols=range(ncol-1)) X = data[:,1:] y = data[:,0] sample_size, n_features = X.shape k_means=cluster.KMeans(init='k-means++', n_clusters=n_clusters, n_init=10) k_means.fit(X) reduced_data = k_means.transform(X) values = k_means.cluster_centers_.squeeze() labels = k_means.labels_ k_means_cluster_centers = k_means.cluster_centers_ print "#########################################################################################################\n" #print y #print labels print "K-MEANS\n" print('homogeneity_score: %f'%metrics.homogeneity_score(y, labels)) print('completeness_score: %f'%metrics.completeness_score(y, labels)) print('v_measure_score: %f'%metrics.v_measure_score(y, labels)) print('adjusted_rand_score: %f'%metrics.adjusted_rand_score(y, labels)) print('adjusted_mutual_info_score: %f'%metrics.adjusted_mutual_info_score(y, labels)) print('silhouette_score: %f'%metrics.silhouette_score(X, labels, metric='euclidean', sample_size=sample_size)) print('\n') print "#########################################################################################################\n" results = Output+"kmeans_scores.txt" file = open(results, "w") file.write("K-Means Scores\n") file.write("Homogeneity Score: %f\n"%metrics.homogeneity_score(y, labels)) file.write("Completeness Score: %f\n"%metrics.completeness_score(y, labels)) file.write("V-Measure: %f\n"%metrics.v_measure_score(y, labels)) file.write("The adjusted Rand index: %f\n"%metrics.adjusted_rand_score(y, labels)) file.write("Adjusted Mutual Information: %f\n"%metrics.adjusted_mutual_info_score(y, labels)) file.write("Silhouette Score: %f\n"%metrics.silhouette_score(X, labels, metric='euclidean', sample_size=sample_size)) file.write("\n") file.write("True Value, Cluster numbers, Iteration\n") for n in xrange(len(y)): file.write("%f, %f, %i\n"%(y[n],labels[n],(n+1))) file.close() import pylab as pl from itertools import cycle # plot the results along with the labels k_means_cluster_centers = k_means.cluster_centers_ fig, ax = plt.subplots() im=ax.scatter(X[:, 0], X[:, 1], c=labels, marker='.') for k in xrange(n_clusters): my_members = labels == k cluster_center = k_means_cluster_centers[k] ax.plot(cluster_center[0], cluster_center[1], 'w', color='b', marker='x', markersize=6) fig.colorbar(im) plt.title("Number of clusters: %i"%n_clusters) save = Output + "kmeans.png" plt.savefig(save) lvltrace.lvltrace("LVLsortie dans kmeans unsupervised")
def main(): ''' doctsring for main ''' args = parse_args() setup_logging(verbose = args.verbose) records = consume_fasta(args.fasta_file) # setup Hasher, Vectorizer and Classifier hasher = HashingVectorizer(analyzer='char', n_features = 2 ** 18, ngram_range=(args.ngram_min, args.ngram_max), ) logging.info(hasher) encoder, classes = get_classes(records, args.tax_level) n_clusters = len(classes) logging.info('using taxonomic level %s' % args.tax_level) logging.info('Using %s clusters' % n_clusters) classifier = MiniBatchKMeans(n_clusters = n_clusters) records = records[0:args.n_iters] chunk_generator = iter_chunk(records, args.chunk_size, args.tax_level) logging.info('ngram range: [%s-%s]' % (args.ngram_min, args.ngram_max)) for labels, features in chunk_generator: logging.info('transforming training chunk') labels = encoder.transform(labels) vectors = hasher.transform(features) logging.info('fitting training chunk') classifier.partial_fit(vectors) pred_labels = classifier.predict(vectors) score = v_measure_score(labels, pred_labels) shuffled_score = v_measure_score(labels, sample(pred_labels, len(pred_labels))) logging.info('score: %.2f' % (score)) logging.info('shuffled score: %.2f' % (shuffled_score))
def bench_k_means(estimator, name, data, target_labels, sample_size): """For benchmarking K-Means estimators. Prints different clustering metrics and train accuracy ARGS estimator: K-Means clustering algorithm <sklearn.cluster.KMeans> name: estimator name <str> data: array-like or sparse matrix, shape=(n_samples, n_features) target_labels: labels of data points <number array> sample_size: size of the sample to use when computing the Silhouette Coefficient <int> """ t0 = time() estimator.fit(data) _, _, train_accuracy = compute_residuals_and_rsquared(estimator.labels_, target_labels) print('% 9s\t%.2fs\t%i\t%.3f\t%.3f\t%.3f\t%.3f\t%.3f\t%.3f\t%.3f' % (name, (time() - t0), estimator.inertia_, metrics.homogeneity_score(target_labels, estimator.labels_), metrics.completeness_score(target_labels, estimator.labels_), metrics.v_measure_score(target_labels, estimator.labels_), metrics.adjusted_rand_score(target_labels, estimator.labels_), metrics.adjusted_mutual_info_score(target_labels, estimator.labels_), metrics.silhouette_score(data, estimator.labels_,metric='euclidean',sample_size=sample_size), train_accuracy ) )
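# A benchmark like bench_k_means above is usually driven from a small harness.
# The sketch below is a self-contained approximation on scikit-learn's digits
# data; it omits the project-specific compute_residuals_and_rsquared accuracy
# column (that helper is not shown here) and otherwise mirrors the printed
# metric columns. Names such as bench() are illustrative, not the original code.
from time import time

import numpy as np
from sklearn import metrics
from sklearn.cluster import KMeans
from sklearn.datasets import load_digits
from sklearn.preprocessing import scale

digits = load_digits()
data = scale(digits.data)
target_labels = digits.target
n_digits = len(np.unique(target_labels))

def bench(estimator, name, data, target_labels, sample_size=300):
    t0 = time()
    estimator.fit(data)
    print('% 9s\t%.2fs\t%i\t%.3f\t%.3f\t%.3f\t%.3f\t%.3f\t%.3f'
          % (name, (time() - t0), estimator.inertia_,
             metrics.homogeneity_score(target_labels, estimator.labels_),
             metrics.completeness_score(target_labels, estimator.labels_),
             metrics.v_measure_score(target_labels, estimator.labels_),
             metrics.adjusted_rand_score(target_labels, estimator.labels_),
             metrics.adjusted_mutual_info_score(target_labels, estimator.labels_),
             metrics.silhouette_score(data, estimator.labels_,
                                      metric='euclidean', sample_size=sample_size)))

bench(KMeans(init='k-means++', n_clusters=n_digits, n_init=10), "k-means++", data, target_labels)
bench(KMeans(init='random', n_clusters=n_digits, n_init=10), "random", data, target_labels)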
def main(argv): file_vectors,clust_type, clusters, distance, cluster_param, std = get_arguments(argv) fname='.'.join(map(str,[file_vectors.split('/')[-1],clust_type, clusters, distance, cluster_param, std])) writer=open(fname,'w') ## better to put in EX1, EX2, .. folders print 'clustering:',clust_type print 'clusters:',clusters print 'cluster_param:',cluster_param print 'std:',std X,words,truth=load_data(file_vectors,True) X=np.array(X) if clust_type=='affin': labels=affin_sclustering(X, n_clust=int(clusters), distance=distance, gamma=float(cluster_param), std=bool(std)) else: labels=knn_sclustering(X, n_clust=int(clusters), k=int(cluster_param)) writer.write('\nVMeas:'+ str(v_measure_score(truth,labels))) writer.write('\nRand:'+str(adjusted_rand_score(truth,labels))) writer.write('\nHomogen:'+str(homogeneity_score(truth,labels))+'\n') i=0 for word in words: writer.write(word+' : '+str(labels[i])+'\n') i+=1 writer.close()
def my_clustering(X, y, n_clusters, pca):
    # =======================================
    # Returns scores as: [ari, ami, v_measure, silhouette]
    # =======================================
    from sklearn.cluster import KMeans
    from sklearn import metrics

    clf = KMeans(n_clusters=n_clusters)
    clf.fit(X)

    ari = metrics.adjusted_rand_score(y, clf.labels_)
    ami = metrics.adjusted_mutual_info_score(y, clf.labels_)
    v_measure = metrics.v_measure_score(y, clf.labels_)
    # the silhouette can also be estimated on a subsample, e.g.
    # metrics.silhouette_score(X, clf.labels_, metric='euclidean', sample_size=300)
    silhouette_coeff = metrics.silhouette_score(X, clf.labels_)

    show_images(n_clusters, clf, pca)
    return [ari, ami, v_measure, silhouette_coeff]
def bench_k_means(estimator, data, labels):
    t0 = time()
    estimator.fit(data)
    print("time to fit: {:.5}".format(time() - t0))

    homogeneity = metrics.homogeneity_score(labels, estimator.labels_)
    completeness = metrics.completeness_score(labels, estimator.labels_)
    v_measure = metrics.v_measure_score(labels, estimator.labels_)
    print("homogeneity {:.5}, completeness {:.5}, v_measure_score {:.5}".format(
        homogeneity, completeness, v_measure)
    )

    adj_rand_score = metrics.adjusted_rand_score(labels, estimator.labels_)
    print("adjusted_rand_score {:.5}".format(adj_rand_score))

    adj_mutual_info_score = metrics.adjusted_mutual_info_score(labels, estimator.labels_)
    print("adjusted_mutual_info_score {:.5}".format(adj_mutual_info_score))

    silhouette_score = metrics.silhouette_score(
        data, estimator.labels_, metric='euclidean'
    )
    print("silhouette_score {:.5}".format(silhouette_score))

    return [
        homogeneity,
        completeness,
        v_measure,
        adj_rand_score,
        adj_mutual_info_score,
        silhouette_score
    ]
def cluster(Z, K=4, algo='kmeans'): descr = Z.columns X = Imputer().fit_transform(Z) ############################################################################## if algo == 'dbscan': # Compute DBSCAN db = DBSCAN(eps=0.3, min_samples=10).fit(X) core_samples_mask = np.zeros_like(db.labels_, dtype=bool) core_samples_mask[db.core_sample_indices_] = True labels = db.labels_ # Number of clusters in labels, ignoring noise if present. n_clusters_ = len(set(labels)) - (1 if -1 in labels else 0) print('Estimated number of clusters: %d' % n_clusters_) print("Homogeneity: %0.3f" % metrics.homogeneity_score(labels_true, labels)) print("Completeness: %0.3f" % metrics.completeness_score(labels_true, labels)) print("V-measure: %0.3f" % metrics.v_measure_score(labels_true, labels)) print("Adjusted Rand Index: %0.3f" % metrics.adjusted_rand_score(labels_true, labels)) print("Adjusted Mutual Information: %0.3f" % metrics.adjusted_mutual_info_score(labels_true, labels)) print("Silhouette Coefficient: %0.3f" % metrics.silhouette_score(X, labels)) elif algo == 'kmeans': km = KMeans(n_clusters=K) km.fit(X) print(km.labels_) return km
def bench_k_means(estimator, name, data, sample_size, labels,postIds): data=sparse.csr_matrix(data) t0 = time() print("Performing dimensionality reduction using LSA") t0 = time() lsa = TruncatedSVD(500) data = lsa.fit_transform(data) data = Normalizer(copy=False).fit_transform(data) print("done in %fs" % (time() - t0)) print() #sData=sparse.csr_matrix(data) val=estimator.fit(data) print('% 9s %.2fs %i %.3f %.3f %.3f %.3f %.3f ' % (name, (time() - t0), estimator.inertia_, metrics.homogeneity_score(labels, estimator.labels_), metrics.completeness_score(labels, estimator.labels_), metrics.v_measure_score(labels, estimator.labels_), metrics.adjusted_rand_score(labels, estimator.labels_), metrics.adjusted_mutual_info_score(labels, estimator.labels_))) print("Parsing USer File:") parseUserFile() print("extracting User File:") clusterDict=extractCluster(postIds,estimator.labels_) print("writing Cluster Data to File") writeCluterToFile(clusterDict)
def clustering_by_kmeans(vectorizer, X, true_k): print "Clustering in " + str(true_k) + " groups by K-means..." km = KMeans(n_clusters=true_k, init='k-means++', max_iter=500, n_init=1) km.fit_predict(X) print "Measuring..." print("Homogeneity: %0.3f" % metrics.homogeneity_score(documents, km.labels_)) print("Completeness: %0.3f" % metrics.completeness_score(documents, km.labels_)) print("V-measure: %0.3f" % metrics.v_measure_score(documents, km.labels_)) #V-measure is an entropy-based measure which explicitly measures how successfully the criteria of homogeneity and completeness have been satisfied. print("Adjusted Rand-Index: %.3f" % metrics.adjusted_rand_score(documents, km.labels_)) print("Silhouette Coefficient: %0.3f" % metrics.silhouette_score(X, km.labels_, sample_size=1000)) #print top terms per cluster clusters clusters = km.labels_.tolist() # 0 iff term is in cluster0, 1 iff term is in cluster1 ... (lista de termos) #print "Lista de termos pertencentes aos clusters " + str(clusters) print "Total de " + str(len(km.labels_)) + " documents" #Example to get all documents in cluster 0 #cluster_0 = np.where(clusters==0) # don't forget import numpy as np #print cluster_0 #cluster_0 now contains all indices of the documents in this cluster, to get the actual documents you'd do: #X_cluster_0 = documents[cluster_0] terms = vectorizer.get_feature_names() #print terms measuring_kmeans(true_k,clusters)
def bestClassify(X,Y): "Best classifier function" tfidf = True if tfidf: vec = TfidfVectorizer(preprocessor = identity, tokenizer = identity, sublinear_tf = True) else: vec = CountVectorizer(preprocessor = identity, tokenizer = identity) km = KMeans(n_clusters=2, n_init=100, verbose=1) clusterer = Pipeline( [('vec', vec), ('cls', km)] ) prediction = clusterer.fit_predict(X,Y) checker = defaultdict(list) for pred,truth in zip(prediction,Y): checker[pred].append(truth) labeldict = {} for pred, label in checker.items(): labeldict[pred] = Counter(label).most_common(1)[0][0] #print(pred, Counter(label).most_common(1)[0][0]) prediction = [labeldict[p] for p in prediction] labels = list(labeldict.values()) print(labels) print(confusion_matrix(Y, prediction, labels=labels)) print("Homogeneity:", homogeneity_score(Y,prediction)) print("Completeness:", completeness_score(Y,prediction)) print("V-measure:", v_measure_score(Y,prediction)) print("Rand-Index:", adjusted_rand_score(Y,prediction))
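# bestClassify above makes an unsupervised clustering comparable with a confusion
# matrix by assigning each cluster the majority ground-truth label of its members.
# A minimal, stand-alone sketch of just that mapping step, on toy labels rather
# than the author's vectorizer pipeline:
from collections import Counter, defaultdict

from sklearn.metrics import confusion_matrix, v_measure_score

Y = ['pos', 'pos', 'neg', 'neg', 'pos', 'neg']   # toy ground truth
prediction = [0, 0, 1, 1, 0, 0]                  # toy cluster assignments

# collect the true labels that fall into each cluster
checker = defaultdict(list)
for pred, truth in zip(prediction, Y):
    checker[pred].append(truth)

# map every cluster id to the majority true label inside it
labeldict = {pred: Counter(found).most_common(1)[0][0] for pred, found in checker.items()}
mapped = [labeldict[p] for p in prediction]

print(confusion_matrix(Y, mapped, labels=list(labeldict.values())))
print("V-measure:", v_measure_score(Y, mapped))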
def run_clustering( clusterer, data, labels ): """ Cluster: Using a predefined and parameterized clustering algorithm, fit some dataset and perform metrics given a set of ground-truth labels. clusterer: the clustering algorithm, from sklearn data: array-like dataset input labels: vector of ground-truth labels """ # Time the operation t0 = time() clusterer.fit(data) t1 = time() # Perform metrics runtime = (t1 - t0) homogeneity = metrics.homogeneity_score( labels, clusterer.labels_ ) completeness = metrics.completeness_score( labels, clusterer.labels_ ) v_measure = metrics.v_measure_score( labels, clusterer.labels_ ) adjusted_rand = metrics.adjusted_rand_score( labels, clusterer.labels_ ) adjusted_mutual = metrics.adjusted_mutual_info_score( labels, clusterer.labels_ ) # Output to logs logging.info(" |- Execution time: %fs" % runtime) logging.info(" |- Homogeneity: %0.3f" % homogeneity) logging.info(" |- Completeness: %0.3f" % completeness) logging.info(" |- V-measure: %0.3f" % v_measure) logging.info(" |- Adjusted Rand-Index: %.3f" % adjusted_rand) logging.info(" |- Adjusted Mutual Info: %.3f" % adjusted_mutual)
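# A possible way to drive run_clustering above, assuming its module already has
# time, logging and sklearn.metrics imported as the function body implies; the
# iris data and KMeans estimator are stand-ins, not the original experiment.
import logging

from sklearn.cluster import KMeans
from sklearn.datasets import load_iris

logging.basicConfig(level=logging.INFO, format='%(message)s')

iris = load_iris()
# any estimator that exposes .fit() and .labels_ can be passed in
run_clustering(KMeans(n_clusters=3, n_init=10), iris.data, iris.target)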
def cluster(model, uids): ############################################################################## # Generate sample data X = [] for uid in uids: X.append(model.docvecs[uid]) labels_true = uids ############################################################################## # Compute Affinity Propagation af = AffinityPropagation(preference=-50).fit(X) pickle.dump(af, open('data/af.pick', 'w')) cluster_centers_indices = af.cluster_centers_indices_ labels = af.labels_ n_clusters_ = len(cluster_centers_indices) print('Estimated number of clusters: %d' % n_clusters_) print("Homogeneity: %0.3f" % metrics.homogeneity_score(labels_true, labels)) print("Completeness: %0.3f" % metrics.completeness_score(labels_true, labels)) print("V-measure: %0.3f" % metrics.v_measure_score(labels_true, labels)) print("Adjusted Rand Index: %0.3f" % metrics.adjusted_rand_score(labels_true, labels)) print("Adjusted Mutual Information: %0.3f" % metrics.adjusted_mutual_info_score(labels_true, labels)) print("Silhouette Coefficient: %0.3f" % metrics.silhouette_score(X, labels, metric='sqeuclidean'))
def predictAffinityPropagation(X, labels_true): #ranX, ranY = shuffle(X, y, random_state=0) af = AffinityPropagation(preference=-50).fit(X) cluster_centers_indices = af.cluster_centers_indices_ labels = af.labels_ n_clusters_ = len(cluster_centers_indices) print('Estimated number of clusters: %d' % n_clusters_) print("Homogeneity: %0.3f" % metrics.homogeneity_score(labels_true, labels)) print("Completeness: %0.3f" % metrics.completeness_score(labels_true, labels)) print("V-measure: %0.3f" % metrics.v_measure_score(labels_true, labels)) print("Adjusted Rand Index: %0.3f" % metrics.adjusted_rand_score(labels_true, labels)) print("Adjusted Mutual Information: %0.3f" % metrics.adjusted_mutual_info_score(labels_true, labels)) print("Silhouette Coefficient: %0.3f" % metrics.silhouette_score(X, labels, metric='sqeuclidean')) plt.close('all') plt.figure(1) plt.clf() colors = cycle('bgrcmykbgrcmykbgrcmykbgrcmyk') for k, col in zip(range(n_clusters_), colors): class_members = labels == k cluster_center = X[cluster_centers_indices[k]] plt.plot(X[class_members, 0], X[class_members, 1], col + '.') plt.plot(cluster_center[0], cluster_center[1], 'o', markerfacecolor=col, markeredgecolor='k', markersize=14) for x in X[class_members]: plt.plot([cluster_center[0], x[0]], [cluster_center[1], x[1]], col) plt.title('Estimated number of clusters: %d' % n_clusters_) plt.show()
def cluster(algorithm, data, topics, make_silhouette=False):
    print str(algorithm)
    clusters = algorithm.fit_predict(data)
    labels = algorithm.labels_
    print 'Homogeneity: %0.3f' % metrics.homogeneity_score(topics, labels)
    print 'Completeness: %0.3f' % metrics.completeness_score(topics, labels)
    print 'V-measure: %0.3f' % metrics.v_measure_score(topics, labels)
    print 'Adjusted Rand index: %0.3f' % metrics.adjusted_rand_score(topics, labels)
    print 'Silhouette test: %0.3f' % metrics.silhouette_score(data, labels)
    print ' ***************** '

    silhouettes = metrics.silhouette_samples(data, labels)
    num_clusters = len(set(clusters))
    print 'num clusters: %d' % num_clusters
    print 'num fitted: %d' % len(clusters)

    # Make a silhouette plot if the flag is set
    if make_silhouette:
        order = numpy.lexsort((-silhouettes, clusters))
        # indices of the samples belonging to cluster k, in silhouette order
        indices = [numpy.flatnonzero(clusters[order] == k) for k in range(num_clusters)]
        ytick = [(numpy.max(ind) + numpy.min(ind)) / 2 for ind in indices]
        ytickLabels = ["%d" % x for x in range(num_clusters)]
        cmap = cm.jet(numpy.linspace(0, 1, num_clusters)).tolist()
        clr = [cmap[i] for i in clusters[order]]

        fig = plt.figure()
        ax = fig.add_subplot(111)
        ax.barh(range(data.shape[0]), silhouettes[order], height=1.0,
                edgecolor='none', color=clr)
        ax.set_ylim(ax.get_ylim()[::-1])
        plt.yticks(ytick, ytickLabels)
        plt.xlabel('Silhouette Value')
        plt.ylabel('Cluster')
        plt.savefig('cluster.png')
def test_KMeans_scores(self): digits = datasets.load_digits() df = pdml.ModelFrame(digits) scaled = pp.scale(digits.data) df.data = df.data.pp.scale() self.assert_numpy_array_almost_equal(df.data.values, scaled) clf1 = cluster.KMeans(init='k-means++', n_clusters=10, n_init=10, random_state=self.random_state) clf2 = df.cluster.KMeans(init='k-means++', n_clusters=10, n_init=10, random_state=self.random_state) clf1.fit(scaled) df.fit_predict(clf2) expected = m.homogeneity_score(digits.target, clf1.labels_) self.assertEqual(df.metrics.homogeneity_score(), expected) expected = m.completeness_score(digits.target, clf1.labels_) self.assertEqual(df.metrics.completeness_score(), expected) expected = m.v_measure_score(digits.target, clf1.labels_) self.assertEqual(df.metrics.v_measure_score(), expected) expected = m.adjusted_rand_score(digits.target, clf1.labels_) self.assertEqual(df.metrics.adjusted_rand_score(), expected) expected = m.homogeneity_score(digits.target, clf1.labels_) self.assertEqual(df.metrics.homogeneity_score(), expected) expected = m.silhouette_score(scaled, clf1.labels_, metric='euclidean', sample_size=300, random_state=self.random_state) result = df.metrics.silhouette_score(metric='euclidean', sample_size=300, random_state=self.random_state) self.assertAlmostEqual(result, expected)
def cluster_evaluation(D, y_true, n_clusters, eps=0.8, min_samples=10): ############################################################################## # Extract Y true labels_true = y_true ############################################################################## # transform distance matrix into a similarity matrix S = 1 - D ############################################################################## # compute DBSCAN #db = DBSCAN(eps=eps, min_samples=min_samples).fit(S) db = Ward(n_clusters=n_clusters).fit(S) #core_samples = db.core_sample_indices_ labels = db.labels_ # number of clusters in labels, ignoring noise if present n_clusters_ = len(set(labels)) - (1 if -1 in labels else 0) print 'Number of clusters: %d' % n_clusters_ print 'Homogeneity: %0.3f' % metrics.homogeneity_score(labels_true, labels) print 'Completeness: %0.3f' % metrics.completeness_score(labels_true, labels) print 'V-meassure: %0.3f' % metrics.v_measure_score(labels_true, labels) print 'Adjusted Rand Index: %0.3f' % metrics.adjusted_rand_score(labels_true, labels) print 'Adjusted Mutual Information: %0.3f' % metrics.adjusted_mutual_info_score(labels_true, labels) print 'Silhouette Coefficient: %0.3f' % metrics.silhouette_score(D, labels, metric='precomputed')
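# cluster_evaluation above hands silhouette_score a precomputed matrix together
# with metric='precomputed'. The self-contained sketch below (make_blobs and
# KMeans are stand-ins for the original data) shows that this path agrees with
# computing the silhouette directly from the feature matrix.
from sklearn import metrics
from sklearn.cluster import KMeans
from sklearn.datasets import make_blobs
from sklearn.metrics import pairwise_distances

X, y_true = make_blobs(n_samples=300, centers=3, random_state=0)
labels = KMeans(n_clusters=3, n_init=10, random_state=0).fit_predict(X)

# precomputed pairwise distance matrix; metric='precomputed' expects distances,
# not similarities
D = pairwise_distances(X, metric='euclidean')
sil_precomputed = metrics.silhouette_score(D, labels, metric='precomputed')
sil_direct = metrics.silhouette_score(X, labels, metric='euclidean')
print("precomputed vs direct: %.6f vs %.6f" % (sil_precomputed, sil_direct))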
def clustering(dataset):
    vectorizer = dataset.vectorizer
    X = dataset.X
    true_k = dataset.n_classes
    labels = dataset.target

    km = cluster.KMeans(n_clusters=true_k, max_iter=100, n_init=1)
    print("Clustering sparse data with %s" % km)
    t0 = time()
    km.fit(X)
    print("done in %0.3fs" % (time() - t0))
    print()

    print("Homogeneity: %0.3f" % metrics.homogeneity_score(labels, km.labels_))
    print("Completeness: %0.3f" % metrics.completeness_score(labels, km.labels_))
    print("V-measure: %0.3f" % metrics.v_measure_score(labels, km.labels_))
    print("Adjusted Rand-Index: %.3f" % metrics.adjusted_rand_score(labels, km.labels_))
    # the silhouette is computed on the fitted cluster assignments, not the true labels
    print("Silhouette Coefficient: %0.3f"
          % metrics.silhouette_score(X, km.labels_, sample_size=1000))
    print()

    print("Top terms per cluster:")
    order_centroids = km.cluster_centers_.argsort()[:, ::-1]
    terms = vectorizer.get_feature_names()
    sizes = np.sum(km.labels_[:, np.newaxis] == np.arange(true_k), axis=0)
    for i in range(true_k):
        print("Cluster %d (%d):" % (i, sizes[i]), end='')
        for ind in order_centroids[i, :10]:
            print(' %s' % terms[ind], end='')
        print()
def kmeans_setup(data): if pca_f == 1: pca = PCA(n_components = num_clusters).fit(data) initializer = pca.components_ name = 'PCA' else: initializer = 'k-means++' name = 'k-means++' t0 = time() estimator = KMeans(init=initializer, n_clusters=num_clusters, n_init = num_init, max_iter = num_iterations) estimator.fit(data) if debug == True: sample_size = 300 print('% 9s %.2fs %i %.3f %.3f %.3f %.3f %.3f %.3f' % (name, (time() - t0), estimator.inertia_, metrics.homogeneity_score(labels, estimator.labels_), metrics.completeness_score(labels, estimator.labels_), metrics.v_measure_score(labels, estimator.labels_), metrics.adjusted_rand_score(labels, estimator.labels_), metrics.adjusted_mutual_info_score(labels, estimator.labels_), metrics.silhouette_score(data, estimator.labels_, metric='euclidean', sample_size=sample_size))) return estimator
def affin_test(): savefile = open('traindata.pkl', 'rb') (x_train, y_train, t1) = cPickle.load(savefile) savefile.close() x_train, X_valid, y_train, y_valid = cross_validation.train_test_split( x_train, y_train, test_size=0.9, random_state=42) labels_true = y_train x_train = StandardScaler().fit_transform(x_train) af = AffinityPropagation(preference=-50).fit(x_train) cluster_centers_indices = af.cluster_centers_indices_ labels = af.labels_ n_clusters_ = len(cluster_centers_indices) print('Estimated number of clusters: %d' % n_clusters_) print("Homogeneity: %0.3f" % metrics.homogeneity_score(labels_true, labels)) print("Completeness: %0.3f" % metrics.completeness_score(labels_true, labels)) print("V-measure: %0.3f" % metrics.v_measure_score(labels_true, labels)) print("Adjusted Rand Index: %0.3f" % metrics.adjusted_rand_score(labels_true, labels)) print("Adjusted Mutual Information: %0.3f" % metrics.adjusted_mutual_info_score(labels_true, labels)) print("Silhouette Coefficient: %0.3f" % metrics.silhouette_score(x_train, labels, metric='sqeuclidean'))
def get_result(km, labels):
    homo_score = metrics.homogeneity_score(labels, km.labels_)
    complete_score = metrics.completeness_score(labels, km.labels_)
    v_score = metrics.v_measure_score(labels, km.labels_)
    rand_score = metrics.adjusted_rand_score(labels, km.labels_)
    mutual_info = metrics.adjusted_mutual_info_score(labels, km.labels_)
    return homo_score, complete_score, v_score, rand_score, mutual_info
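# One way get_result above might be called, assuming its module imports
# sklearn.metrics as metrics (as the function body implies); the digits data and
# MiniBatchKMeans estimator are only illustrative.
from sklearn.cluster import MiniBatchKMeans
from sklearn.datasets import load_digits

digits = load_digits()
km = MiniBatchKMeans(n_clusters=10, random_state=0).fit(digits.data)

homo_score, complete_score, v_score, rand_score, mutual_info = get_result(km, digits.target)
print("homogeneity=%.3f completeness=%.3f v=%.3f ARI=%.3f AMI=%.3f"
      % (homo_score, complete_score, v_score, rand_score, mutual_info))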
def compute_metrics(answers, predictions): aris = [] vscores = [] fscores = [] weights = [] for k in answers.keys(): idx = np.argsort(np.array(answers[k][0])) true = np.array(answers[k][1])[idx] pred = np.array(predictions[k][1]) weights.append(pred.shape[0]) if len(np.unique(true)) > 1: aris.append(adjusted_rand_score(true, pred)) vscores.append(v_measure_score(true, pred)) fscores.append(compute_fscore(true, pred)) # print '%s: ari=%f, vscore=%f, fscore=%f' % (k, aris[-1], vscores[-1], fscores[-1]) aris = np.array(aris) vscores = np.array(vscores) fscores = np.array(fscores) weights = np.array(weights) print 'number of one-sense words: %d' % (len(vscores) - len(aris)) print 'mean ari: %f' % np.mean(aris) print 'mean vscore: %f' % np.mean(vscores) print 'weighted vscore: %f' % np.sum(vscores * (weights / float(np.sum(weights)))) print 'mean fscore: %f' % np.mean(fscores) print 'weighted fscore: %f' % np.sum(fscores * (weights / float(np.sum(weights)))) return np.mean(aris),np.mean(vscores)
def print_cluster(clusterTrainClass, labels, clusterTestStory):
    print("Homogeneity: %0.3f" % metrics.homogeneity_score(clusterTrainClass, labels))
    print("Completeness: %0.3f" % metrics.completeness_score(clusterTrainClass, labels))
    print("V-measure: %0.3f" % metrics.v_measure_score(clusterTrainClass, labels))
    print("Adjusted Rand Index: %0.3f" % metrics.adjusted_rand_score(clusterTrainClass, labels))
    print("Adjusted Mutual Information: %0.3f" % metrics.adjusted_mutual_info_score(clusterTrainClass, labels))
    print("Silhouette Coefficient: %0.3f" % metrics.silhouette_score(clusterTestStory, labels, metric='euclidean'))
def evaluate(labels_true, labels):
    homogeneity = metrics.homogeneity_score(labels_true, labels)
    completeness = metrics.completeness_score(labels_true, labels)
    v_measure = metrics.v_measure_score(labels_true, labels)
    adjusted_rand = metrics.adjusted_rand_score(labels_true, labels)
    adjusted_mutual_info = metrics.adjusted_mutual_info_score(labels_true, labels)
    # silhouette = metrics.silhouette_score(data, labels, metric='sqeuclidean')
    return homogeneity, completeness, v_measure, adjusted_rand, adjusted_mutual_info  # , silhouette
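# All of the scores returned by evaluate above compare partitions, not the
# particular integer ids of the clusters, so relabelling clusters leaves every
# score unchanged. A small stand-alone illustration using the plain
# sklearn.metrics functions:
from sklearn import metrics

labels_true = [0, 0, 1, 1, 2, 2]
labels_pred = [1, 1, 0, 0, 2, 2]   # clusters found by some algorithm
relabelled = [5, 5, 9, 9, 7, 7]    # same partition, different cluster ids

for name, fn in [("homogeneity", metrics.homogeneity_score),
                 ("completeness", metrics.completeness_score),
                 ("v_measure", metrics.v_measure_score),
                 ("adjusted_rand", metrics.adjusted_rand_score),
                 ("adjusted_mutual_info", metrics.adjusted_mutual_info_score)]:
    a, b = fn(labels_true, labels_pred), fn(labels_true, relabelled)
    assert abs(a - b) < 1e-12
    print(name, "%.3f" % a)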
def cluseval(label, truth):
    rand = metrics.adjusted_rand_score(truth, label)
    mutual = metrics.adjusted_mutual_info_score(truth, label)
    homo = metrics.homogeneity_score(truth, label)
    complete = metrics.completeness_score(truth, label)
    v = metrics.v_measure_score(truth, label)
    result = [rand, mutual, homo, complete, v]
    return result
def main(): # Parse command line arguments parser = argparse.ArgumentParser(usage=__doc__, formatter_class=argparse.ArgumentDefaultsHelpFormatter, description='Perform spectral clustering.') parser.add_argument("--clusters", "-c", type=int, help='Number of clusters.') parser.add_argument("--knn", "-k", type=int, default=0, help='Number of nearest neighbors, 0 means all.') parser.add_argument("--sm", "-s", help='File containing similarity matrix') parser.add_argument("--iterations", "-i", type=int, default=10, help='Number of KMeans iterations.') parser.add_argument("--true_labels", "-t", help='File containing the true labels.') parser.add_argument("--output", "-o", help='Name of the file to write' + ' the labels to.') parser.add_argument("--normalize", "-n", action='store_true', help='Normalize each row so that the max value is one.') args = parser.parse_args() sm = np.load(args.sm) if args.normalize: sm /= sm.max(axis=1)[:, np.newaxis] # Ensure symmetric sm = (sm + sm.T) / 2 labels = [] if args.knn > 0: labels = SpectralClustering(n_clusters=args.clusters, affinity='nearest_neighbors', n_neighbors=args.knn, n_init=args.iterations).fit(sm).labels_ else: labels = SpectralClustering(n_clusters=args.clusters, affinity='precomputed', n_init=args.iterations).fit(sm).labels_ with open(args.output, 'w') as fout: for l in labels: fout.write(str(l) + '\n') # Load the true labels. if args.true_labels: true_labels = [] with open(args.true_labels, 'r') as fin: for line in fin: true_labels.append(int(line.strip())) # Run the metrics. print("Homogeneity: %0.3f" % metrics.homogeneity_score(true_labels, labels)) print("Completeness: %0.3f" % metrics.completeness_score(true_labels, labels)) print("V-measure: %0.3f" % metrics.v_measure_score(true_labels, labels)) print("Adjusted Rand Index: %0.3f" % metrics.adjusted_rand_score(true_labels, labels)) print("Adjusted Mutual Information: %0.3f" % metrics.adjusted_mutual_info_score(true_labels, labels)) print("Silhouette Coefficient: %0.3f" % metrics.silhouette_score(sm, labels))
def evaluate(km, labels):
    print("Homogeneity: %.3f" % metrics.homogeneity_score(labels, km.labels_))
    print("Completeness: %.3f" % metrics.completeness_score(labels, km.labels_))
    print("V-measure: %.3f" % metrics.v_measure_score(labels, km.labels_))
    print("Adjusted Rand-Index: %.3f" % metrics.adjusted_rand_score(labels, km.labels_))
    # X comes from the enclosing scope; the silhouette is computed on the fitted
    # cluster assignments rather than the ground-truth labels
    print("Silhouette Coefficient: %.3f" % metrics.silhouette_score(X, km.labels_,
                                                                    sample_size=1000))
def evaluateAllAlgorithms(self):
    algs = [self.labels_db, self.labels_ap]
    names = ['DBASE', 'AP']
    for i in range(2):
        print 'Algorithm:', names[i]
        print("\tHomogeneity: %0.3f" % metrics.homogeneity_score(self.labels_gt, algs[i]))
        print("\tCompleteness: %0.3f" % metrics.completeness_score(self.labels_gt, algs[i]))
        print("\tV-measure: %0.3f" % metrics.v_measure_score(self.labels_gt, algs[i]))
        print("\tAdjusted Rand Index: %0.3f" % metrics.adjusted_rand_score(self.labels_gt, algs[i]))
        print("\tAdjusted Mutual Information: %0.3f" % metrics.adjusted_mutual_info_score(self.labels_gt, algs[i]))
def bench_k_means(estimator, name, data): t0 = time() estimator.fit(data) print('% 9s %.2fs %i %.3f %.3f %.3f %.3f %.3f' % (name, (time() - t0), estimator.inertia_, metrics.homogeneity_score(labels, estimator.labels_), metrics.completeness_score(labels, estimator.labels_), metrics.v_measure_score(labels, estimator.labels_), metrics.adjusted_rand_score(labels, estimator.labels_), metrics.adjusted_mutual_info_score(labels, estimator.labels_)))
def print_metrics(self, data, labels, labels_real): print("Homogeneity: %0.3f" % metrics.homogeneity_score(labels_real, labels)) print("Completeness: %0.3f" % metrics.completeness_score(labels_real, labels)) print("V-measure: %0.3f" % metrics.v_measure_score(labels_real, labels)) print("Adjusted Rand Index: %0.3f" % metrics.adjusted_rand_score(labels_real, labels)) print("Adjusted Mutual Information: %0.3f" % metrics.adjusted_mutual_info_score(labels_real, labels)) print("Silhouette Coefficient: %0.3f" % metrics.silhouette_score(data, labels))
def bench(estimator, name): # Lifted from http://scikit-learn.org/stable/auto_examples/cluster/plot_kmeans_digits.html#example-cluster-plot-kmeans-digits-py t0 = time() print('% 9s %.2fs %.3f %.3f %.3f %.3f %.3f' % (name, (time() - t0), metrics.homogeneity_score(labels, estimator.labels_), metrics.completeness_score(labels, estimator.labels_), metrics.v_measure_score(labels, estimator.labels_), metrics.adjusted_rand_score(labels, estimator.labels_), metrics.adjusted_mutual_info_score(labels, estimator.labels_) ))
def cluster1(X, y): a = {} pca = PCA(n_components=2) #降为2维 pca = pca.fit(X) X_dr = pca.transform(X) #聚类种类及名称 clustering_names = [ 'MiniBatchKMeans', 'MeanShift', 'AgglomerativeClustering', 'DBSCAN', 'Birch' ] x = X_dr #规范化数据集以便于参数选择 x = StandardScaler().fit_transform(x) #均值漂移估计带宽 bandwidth = cluster.estimate_bandwidth(x, quantile=0.3) #kneighbors_graph类返回用KNN时和每个样本最近的K个训练集样本的位置 connectivity = kneighbors_graph(x, n_neighbors=10, include_self=False) #使连接对称 connectivity = 0.5 * (connectivity + connectivity.T) # 创建聚类估计器 two_means = cluster.MiniBatchKMeans(n_clusters=3, n_init=10) #MiniBatchKMeans ms = cluster.MeanShift(bandwidth=bandwidth, bin_seeding=True) #MeanShift average_linkage = cluster.AgglomerativeClustering( n_clusters=3) #AgglomerativeClustering dbscan = cluster.DBSCAN(eps=0.5) #DBSCAN birch = cluster.Birch(n_clusters=3) #Birch #聚类算法 clustering_algorithms = [two_means, ms, average_linkage, dbscan, birch] colors = np.array([x for x in "bgrcmykbgrcmykbgrcmykbgrcmyk"]) #hstack()函数水平把数组堆叠起来 colors = np.hstack([colors] * 20) num = [] for name, algorithm in zip(clustering_names, clustering_algorithms): r = [] g = [] b = [] # t0 = time.time() #time()函数返回当前时间的时间戳 algorithm.fit(X) # t1 = time.time() #hasattr()函数用于判断对象是否包含对应的属性 if hasattr(algorithm, 'labels_'): y_pred = algorithm.labels_.astype(np.int) else: y_pred = algorithm.predict(x) # if hasattr(algorithm, 'cluster_centers_'): # centers = algorithm.cluster_centers_ # center_colors = colors[:len(centers)] for i in range(len(colors[y_pred].tolist())): #循环获取聚类结果的各个点的x,y,color if colors[y_pred].tolist()[i] == 'r': r.append([x[:, 0][i], x[:, 1][i]]) if colors[y_pred].tolist()[i] == 'g': g.append([x[:, 0][i], x[:, 1][i]]) if colors[y_pred].tolist()[i] == 'b': b.append([x[:, 0][i], x[:, 1][i]]) #创建聚类名称与结果的键值对 a.update({"%s" % name: {'r': r, 'g': g, 'b': b}}) num.append(metrics.v_measure_score(y, y_pred)) return x, a, num
def cluster_to_find_similar_products(): df = pd.read_csv( '/Users/srinath/playground/data-science/BimboInventoryDemand/producto_tabla.csv' ) labels = df['Producto_ID'] extracted_features = [extract_data(p) for p in df['NombreProducto'].values] extracted_features_np = np.row_stack(extracted_features) extracted_features_df = pd.DataFrame(extracted_features_np, columns=[ 'description', 'brand', 'weight', 'pieces', "has_choco", "has_vanilla", "has_multigrain" ]) print "have " + str(df.shape[0]) + "products" #vectorize names vectorizer = TfidfVectorizer(max_df=0.5, max_features=200, min_df=2, stop_words='english', use_idf=True) X = vectorizer.fit_transform(extracted_features_df['description']) print X print("n_samples: %d, n_features: %d" % X.shape) print("Performing dimensionality reduction using LSA") # Vectorizer results are normalized, which makes KMeans behave as # spherical k-means for better results. Since LSA/SVD results are # not normalized, we have to redo the normalization. svd = TruncatedSVD(5) normalizer = Normalizer(copy=False) lsa = make_pipeline(svd, normalizer) X = lsa.fit_transform(X) explained_variance = svd.explained_variance_ratio_.sum() print("Explained variance of the SVD step: {}%".format( int(explained_variance * 100))) print("new size", X.shape) print type(X) extracted_features_df = encode_onehot(extracted_features_df, ['brand']) extracted_features_df = drop_feilds_1df(extracted_features_df, ['description']) print "X,df", X.shape, extracted_features_df.values.shape X = np.hstack((X, extracted_features_df.values)) # Do the actual clustering km = KMeans(n_clusters=10, init='k-means++', max_iter=100, n_init=1, verbose=True) print("Clustering sparse data with %s" % km) #km.fit(X) results = km.fit_predict(X) print len(results), results print("Homogeneity: %0.3f" % metrics.homogeneity_score(labels, km.labels_)) print("Completeness: %0.3f" % metrics.completeness_score(labels, km.labels_)) print("V-measure: %0.3f" % metrics.v_measure_score(labels, km.labels_)) print("Adjusted Rand-Index: %.3f" % metrics.adjusted_rand_score(labels, km.labels_)) print("Silhouette Coefficient: %0.3f" % metrics.silhouette_score(X, km.labels_, sample_size=1000)) print() products_clusters = np.column_stack([labels, results]) to_saveDf = pd.DataFrame(products_clusters, columns=["Producto_ID", "Cluster"]) to_saveDf.to_csv('product_clusters.csv', index=False) to_saveDf['NombreProducto'] = df['NombreProducto']
false_pos+=1 #311 if p==0 and g==1: false_neg+=1 #171 #655 : actual positives ; #593 : actual negative #795 : predicted positive ; #453 : predicted negative precision_model= true_pos/(true_pos+false_pos) #0.6088 recall_model=true_pos/(true_pos+false_neg) #0.7389 f_1=2*(precision_model *recall_model)/ (precision_model+recall_model) #0.667586 print("Accuracy:",metrics.accuracy_score(actual, predicted)) #0.61378 (accuracy calculation) ##---------metrics to evaluate performance of clustering------------- # zero is bad; 1 is good ; homogeneity=metrics.homogeneity_score(actual, predicted) completeness= metrics.completeness_score(actual, predicted) v_measure=metrics.v_measure_score(actual, predicted) #perfect labelling==1, bad labelling closer to 0 ami= metrics.adjusted_mutual_info_score(actual, predicted) nmi=metrics.normalized_mutual_info_score(actual, predicted) mutual_i= metrics.mutual_info_score(actual, predicted) #this can be either positive or negaive (negative is bad) #btwn -1 and 1; negative values are bad (independent labelings), similar clusterings have a positive ARI, 1.0 is the perfect match score. ari=metrics.adjusted_rand_score(actual, predicted) # ROC curve fpr, tpr, thresholds = roc_curve(actual, predicted, pos_label = 1) roc_auc = auc(fpr, tpr) plt.figure(1, figsize = (15, 10)) plt.plot(fpr, tpr, lw=2, label='ROC curve (area = %0.2f)' % roc_auc) plt.plot([0, 1], [0, 1], lw=2, linestyle='--') plt.xlabel('False Positive Rate') plt.ylabel('True Positive Rate')
labels = [int(x / 4) for x in dataset.target] vectorizer = CountVectorizer(min_df=3, stop_words="english") dataset_array = vectorizer.fit_transform(dataset.data) tfidf_transformer = TfidfTransformer() dataset_tfidf = tfidf_transformer.fit_transform(dataset_array) print("p1: dimensions of the TF-IDF matrix is: ", dataset_tfidf.shape) # Q2: contingency table of clustering result km = KMeans(n_clusters=2, random_state=0, max_iter=1000, n_init=30) km.fit(dataset_tfidf) get_contingency_table(labels, km.labels_) # Q3: 5 measures print("Homogeneity: %0.4f" % homogeneity_score(labels, km.labels_)) print("Completeness: %0.4f" % completeness_score(labels, km.labels_)) print("V-measure: %0.4f" % v_measure_score(labels, km.labels_)) print("Adjusted Rand Index: %.4f" % adjusted_rand_score(labels, km.labels_)) print("Adjusted mutual info score: %.4f" % adjusted_mutual_info_score(labels, km.labels_)) # Q4: plot variance plot_variance(dataset_tfidf) # Q5,6: SVD and NMF best_r_svd = plot_r_choice(dataset_tfidf, labels, "SVD") print("The best r for SVD is " + str(best_r_svd)) best_r_nmf = plot_r_choice(dataset_tfidf, labels, "NMF") print("The best r for NMF is " + str(best_r_nmf))
clusterize = KMeans(n_clusters=3, random_state=42) output = clusterize.fit_predict(data) data_res = [] #clusterize.labels_ = [str(label + 1) for label in clusterize.labels_] data_res.append(({ 'ARI': metrics.adjusted_rand_score(expert_labels, clusterize.labels_), 'AMI': metrics.adjusted_mutual_info_score(expert_labels, clusterize.labels_), 'Homogenity': metrics.homogeneity_score(expert_labels, clusterize.labels_), 'Completeness': metrics.completeness_score(expert_labels, clusterize.labels_), 'V-measure': metrics.v_measure_score(expert_labels, clusterize.labels_), 'Silhouette': metrics.silhouette_score(data, clusterize.labels_) })) results = pd.DataFrame(data=data_res, columns=[ 'ARI', 'AMI', 'Homogenity', 'Completeness', 'V-measure', 'Silhouette' ], index=['K-means']) print(results) if vizualize: pca = PCA(n_components=2)
from sklearn import metrics

if __name__ == "__main__":
    y = [0, 0, 0, 1, 1, 1]
    y_hat = [0, 0, 1, 1, 2, 2]
    h = metrics.homogeneity_score(y, y_hat)
    c = metrics.completeness_score(y, y_hat)
    print('Homogeneity:', h)
    print('Completeness:', c)
    v2 = 2 * c * h / (c + h)
    v = metrics.v_measure_score(y, y_hat)
    print('V-Measure:', v2, v)

    y = [0, 0, 0, 1, 1, 1]
    y_hat = [0, 0, 1, 3, 3, 3]
    h = metrics.homogeneity_score(y, y_hat)
    c = metrics.completeness_score(y, y_hat)
    v = metrics.v_measure_score(y, y_hat)
    print('Homogeneity:', h)
    print('Completeness:', c)
    print('V-Measure:', v)

    # the predicted labels may use different values than the true labels
    y = [0, 0, 0, 1, 1, 1]
    y_hat = [1, 1, 1, 0, 0, 0]
    h = metrics.homogeneity_score(y, y_hat)
    c = metrics.completeness_score(y, y_hat)
    v = metrics.v_measure_score(y, y_hat)
    print('Homogeneity:', h)
    print('Completeness:', c)
    print('V-Measure:', v)
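# The example above checks by hand that V-measure is the harmonic mean of
# homogeneity and completeness. As a complement, a minimal sketch (plain
# scikit-learn API, nothing project-specific) showing that
# homogeneity_completeness_v_measure returns all three numbers in one call:
from sklearn import metrics

labels_true = [0, 0, 0, 1, 1, 1]
labels_pred = [0, 0, 1, 1, 2, 2]

h = metrics.homogeneity_score(labels_true, labels_pred)
c = metrics.completeness_score(labels_true, labels_pred)
v = metrics.v_measure_score(labels_true, labels_pred)

h2, c2, v2 = metrics.homogeneity_completeness_v_measure(labels_true, labels_pred)

assert abs(h - h2) < 1e-12 and abs(c - c2) < 1e-12 and abs(v - v2) < 1e-12
print("harmonic mean check:", abs(v - 2 * h * c / (h + c)) < 1e-12)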
""" #We can turn those concept as scores homogeneity_score and completeness_score. Both are bounded below by 0.0 and above by 1.0 (higher is better): from sklearn import metrics labels_true = [0, 0, 0, 1, 1, 1] labels_pred = [0, 0, 1, 1, 2, 2] metrics.homogeneity_score(labels_true, labels_pred) metrics.completeness_score(labels_true, labels_pred) #Their harmonic mean called V-measure is computed by v_measure_score metrics.v_measure_score(labels_true, labels_pred) #All calculated together metrics.homogeneity_completeness_v_measure(labels_true, labels_pred) #https://scikit-learn.org/stable/modules/clustering.html#clustering-performance-evaluation #http://www.learnbymarketing.com/methods/k-means-clustering/ """ Q1. (Create a program that fulfills the following specification.) deliveryfleet.csv
labels_true = numpy.array(iris_target) data = numpy.array(iris_data) X = PCA(n_components=2).fit_transform(iris_data) # ############################################################################# # Compute Affinity Propagation af = AffinityPropagation(preference=-10).fit(X) cluster_centers_indices = af.cluster_centers_indices_ labels = af.labels_ n_clusters_ = len(cluster_centers_indices) print('Estimated number of clusters: %d' % n_clusters_) print("Homogeneity: %0.3f" % metrics.homogeneity_score(labels_true, labels)) print("Completeness: %0.3f" % metrics.completeness_score(labels_true, labels)) print("V-measure: %0.3f" % metrics.v_measure_score(labels_true, labels)) print("Adjusted Rand Index: %0.3f" % metrics.adjusted_rand_score(labels_true, labels)) print("Adjusted Mutual Information: %0.3f" % metrics.adjusted_mutual_info_score(labels_true, labels)) print("Silhouette Coefficient: %0.3f" % metrics.silhouette_score(X, labels, metric='sqeuclidean')) # ############################################################################# # Plot result import matplotlib.pyplot as plt from itertools import cycle plt.close('all') plt.figure(1) plt.clf()
#clf.transform(X_test) train_time = time() - t0 print("train time: %0.3fs" % train_time) t0 = time() pred = clf.predict(X_test) test_time = time() - t0 print("test time: %0.3fs" % test_time) y_test = [int(i) for i in test_labels] pred_test = [int(i) for i in pred] score = metrics.homogeneity_score(y_test, pred_test) print("homogeneity_score: %0.3f" % score) score = metrics.completeness_score(y_test, pred_test) print("completeness_score: %0.3f" % score) score = metrics.v_measure_score(y_test, pred_test) print("v_measure_score: %0.3f" % score) score = metrics.accuracy_score(y_test, pred_test) print("acc_score: %0.3f" % score) score = metrics.normalized_mutual_info_score(y_test, pred_test) print("nmi_score: %0.3f" % score) #file=open("D:/PhD/dr.norbert/dataset/shorttext/biomedical/semisupervised/biomedicalraw_ensembele_traintest","w") file = open( "/home/owner/PhD/dr.norbert/dataset/shorttext/agnews/semisupervised/agnewsraw_ensembele_traintest", "w") #file=open("D:/PhD/dr.norbert/dataset/shorttext/stackoverflow/semisupervised/stackoverflowraw_ensembele_traintest","w") #file=open("D:/PhD/dr.norbert/dataset/shorttext/data-web-snippets/semisupervised/data-web-snippetsraw_ensembele_traintest","w") for i in range(len(train_labels)): file.write(train_labels[i] + "\t" + train_trueLabels[i] + "\t" +
metrics_report = {'kmeans': {}, 'gmm': {}} labels = {'kmeans': kmeans.labels_, 'gmm': gmm} for each in metrics_report.keys(): metrics_report[each]['ARI'] = round( metrics.adjusted_rand_score(y, labels[each]), 2) metrics_report[each]['AMI'] = round( metrics.adjusted_mutual_info_score(y, labels[each]), 2) metrics_report[each]['homogeneity'] = round( metrics.homogeneity_score(y, labels[each]), 2) metrics_report[each]['completeness'] = round( metrics.completeness_score(y, labels[each]), 2) metrics_report[each]['v_measure'] = round( metrics.v_measure_score(y, labels[each]), 2) metrics_report[each]['silhouette'] = round( metrics.silhouette_score(X, labels[each]), 2) metrics_report[each]['accuracy'] = round( metrics.accuracy_score(y, labels[each]) * 100, 2) print(metrics_report) #visualizing - k-means clustering of ICA transformed dataset plt.scatter(X_scaled_transformed[kmeans.labels_ == 1, 0], X_scaled_transformed[kmeans.labels_ == 1, 1], s=40, c='red', label='Cluster 1') plt.scatter(X_scaled_transformed[kmeans.labels_ == 0, 0], X_scaled_transformed[kmeans.labels_ == 0, 1],
# Train the clustering model
n_clusters = 3  # number of clusters
model_kmeans = KMeans(n_clusters=n_clusters, random_state=0)  # build the clustering model object
model_kmeans.fit(X)  # fit the clustering model
y_pre = model_kmeans.predict(X)  # predict cluster assignments

# Evaluate the model
n_samples, n_features = X.shape  # total samples, total features
inertias = model_kmeans.inertia_  # sum of distances of samples to their closest cluster center
adjusted_rand_s = metrics.adjusted_rand_score(y_true, y_pre)  # adjusted Rand index
mutual_info_s = metrics.mutual_info_score(y_true, y_pre)  # mutual information
adjusted_mutual_info_s = metrics.adjusted_mutual_info_score(y_true, y_pre)  # adjusted mutual information
homogeneity_s = metrics.homogeneity_score(y_true, y_pre)  # homogeneity score
completeness_s = metrics.completeness_score(y_true, y_pre)  # completeness score
v_measure_s = metrics.v_measure_score(y_true, y_pre)  # V-measure score
silhouette_s = metrics.silhouette_score(X, y_pre, metric='euclidean')  # mean silhouette coefficient
calinski_harabaz_s = metrics.calinski_harabaz_score(
    X, y_pre)  # Calinski-Harabasz score (calinski_harabasz_score in newer scikit-learn)

print('samples: %d \t features: %d' % (n_samples, n_features))  # print sample and feature counts
print(70 * '-')  # separator
print('ine\tARI\tMI\tAMI\thomo\tcomp\tv_m\tsilh\tc&h')  # metric headers
print('%d\t%.2f\t%.2f\t%.2f\t%.2f\t%.2f\t%.2f\t%.2f\t%d' %
      (inertias, adjusted_rand_s, mutual_info_s, adjusted_mutual_info_s,
       homogeneity_s, completeness_s, v_measure_s, silhouette_s,
       calinski_harabaz_s))  # metric values
print(70 * '-')  # separator
print('short name \t full name')  # abbreviation legend
print('ine \t inertias')
print('ARI \t adjusted_rand_s')
print('MI \t mutual_info_s')
plt.scatter(X[:, 0], X[:, 1], s=50)
plt.show()

# homogeneity, completeness, and v-measure
k = [2, 3, 4, 5, 6, 7, 8]
homo_score = []
comp_score = []
vm_score = []
for n_cluster in k:
    y_pred = KMeans(n_clusters=n_cluster, max_iter=1000,
                    random_state=47).fit_predict(X)
    homo = metrics.homogeneity_score(y, y_pred)
    comp = metrics.completeness_score(y, y_pred)
    vm = metrics.v_measure_score(y, y_pred)
    homo_score.append(homo)
    comp_score.append(comp)
    vm_score.append(vm)

plt.plot(k, homo_score, 'r', label='Homogeneity')
plt.plot(k, comp_score, 'b', label='Completeness')
plt.plot(k, vm_score, 'y', label='V-Measure')
plt.xlabel('Value of K')
plt.ylabel('homogeneity_completeness_v_measure')
plt.legend(loc=4)
plt.show()

# Adjusted Rand Index
k = [2, 3, 4, 5, 6, 7, 8]
scores = []
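# The excerpt above breaks off just as the Adjusted Rand Index sweep begins.
# Below is a self-contained sketch of how such a sweep is typically completed;
# the make_blobs data is a stand-in for the X, y used above, not the original
# dataset.
import matplotlib.pyplot as plt
from sklearn import metrics
from sklearn.cluster import KMeans
from sklearn.datasets import make_blobs

X_demo, y_demo = make_blobs(n_samples=500, centers=4, random_state=47)

k_values = [2, 3, 4, 5, 6, 7, 8]
ari_scores = []
for n_cluster in k_values:
    y_pred = KMeans(n_clusters=n_cluster, max_iter=1000,
                    random_state=47).fit_predict(X_demo)
    ari_scores.append(metrics.adjusted_rand_score(y_demo, y_pred))

plt.plot(k_values, ari_scores, 'g', label='Adjusted Rand Index')
plt.xlabel('Value of K')
plt.ylabel('adjusted_rand_score')
plt.legend(loc=4)
plt.show()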
def experiments(PORCENTAJE_VECINOS, ALGORITHM, MODELO, normalizar=None): vecinos = algorithms[ALGORITHM] algoritmos = "coseno" if PORCENTAJE_VECINOS in ["boost", "maxsim", "dist"]: algoritmos = ALGORITHM + "-" + PORCENTAJE_VECINOS elif PORCENTAJE_VECINOS != 0: algoritmos = "%s-%.1f" % (ALGORITHM, PORCENTAJE_VECINOS) titulo = MODELO + "-" + algoritmos if normalizar is not None: titulo += "-" + normalizar fname = sys.argv[2] + "/" + titulo + ".out" if os.path.isfile(fname): return print(titulo) print("-" * 20) if PORCENTAJE_VECINOS == 0: X = coseno if MODELO == "dbscan": # Solo sirve para coseno! X = 1 - X else: neighbour_file_name = sys.argv[2] + "/" + ALGORITHM + ".npy" if os.path.isfile(neighbour_file_name): NEIGHBOURS = np.load(neighbour_file_name) else: print("Calculando vecinos") NEIGHBOURS = np.zeros((len(service_number), len(service_number))) for i in range(0, len(service_number)): for j in range(i, len(service_number)): NEIGHBOURS[i][j] = vecinos(followers, users, i, j) if i != j: NEIGHBOURS[j][i] = NEIGHBOURS[i][j] np.save(neighbour_file_name, NEIGHBOURS) if normalizar is not None: print("Normalizando Vecinos") if normalizar == 'minmax': NEIGHBOURS = preprocessing.minmax_scale(NEIGHBOURS) elif normalizar == 'scale': NEIGHBOURS = preprocessing.scale(NEIGHBOURS) elif normalizar == 'robust': NEIGHBOURS = preprocessing.robust_scale(NEIGHBOURS) elif normalizar == 'softmax': NEIGHBOURS = np.exp(NEIGHBOURS) / np.sum(np.exp(NEIGHBOURS), axis=1, keepdims=True) elif normalizar == 'matrixminmax': NEIGHBOURS = (NEIGHBOURS - np.min(NEIGHBOURS)) / (np.max(NEIGHBOURS) - np.min(NEIGHBOURS)) elif normalizar == 'matrixmax': NEIGHBOURS = NEIGHBOURS / np.max(NEIGHBOURS) if MODELO == "dbscan": # Si es distancia if normalizar is not None: NEIGHBOURS = 1 - NEIGHBOURS else: NEIGHBOURS = - NEIGHBOURS X = (1 - PORCENTAJE_VECINOS) * (1 - coseno) + PORCENTAJE_VECINOS * NEIGHBOURS else: # Si es afinidad if PORCENTAJE_VECINOS == "boost": X = np.multiply(coseno, NEIGHBOURS) elif PORCENTAJE_VECINOS == "maxsim": X = np.maximum(coseno, NEIGHBOURS) elif PORCENTAJE_VECINOS == "dist": NEIGHBOURS_SORTED = np.argsort(np.argsort(NEIGHBOURS)) COSINE_SORTED = np.argsort(np.argsort(coseno)) POS_BOOST = np.log(1 / (1 + np.abs(NEIGHBOURS_SORTED - COSINE_SORTED))) X = POS_BOOST else: X = (1 - PORCENTAJE_VECINOS) * coseno + PORCENTAJE_VECINOS * NEIGHBOURS print("Generando Modelo") if MODELO == 'kmedoids': model = KMedoids(n_clusters=1500).fit(X) if MODELO == 'kmedoids470': model = KMedoids(n_clusters=470).fit(X) elif MODELO == 'ap': model = AffinityPropagation(affinity='precomputed').fit(X) elif MODELO == 'dbscan': model = DBSCAN(metric='precomputed').fit(X) labels = model.labels_ clusters = defaultdict(list) for index, classif in enumerate(labels): clusters[classif].append(index) n_clusters_ = len(clusters) info = "" info += 'Clusters: %d\n' % n_clusters_ # info += 'Cohesiveness: %0.3f\n' % cohesiveness(X, labels) info += 'Entropy: %0.3f\n' % entropy(labels_true, labels) info += "Homogeneity: %0.3f\n" % metrics.homogeneity_score(labels_true, labels) info += "Completeness: %0.3f\n" % metrics.completeness_score(labels_true, labels) info += "V-measure: %0.3f\n" % metrics.v_measure_score(labels_true, labels) info += 'Purity: %0.3f\n' % purity(labels_true, labels) info += "F-Measure: %0.3f\n" % fmeasure(labels_true, labels) info += "Adjusted Rand Index: %0.3f\n" % metrics.adjusted_rand_score(labels_true, labels) info += "Adjusted Mutual Information: %0.3f\n" % metrics.adjusted_mutual_info_score(labels_true, labels) clustersize = 
Counter(labels) salida = open(fname, 'w', encoding='UTF-8') print(info) salida.write(titulo + "\n") for cluster, services in clusters.items(): countcat = Counter([labels_true[svc] for svc in services]) max_key, num = countcat.most_common(1)[0] salida.write("%i (%s - %i/%i): %s \n" % ( cluster, max_key, num, clustersize[cluster], ",".join([service_list[svc] for svc in services]))) salida.write("-" * 20 + "\n") salida.write(info) salida.close()
def kmeans_model(self, test_size, random_state,show=None): # pre-process the data standardized_data = scale(self.data) # splitting the data into training and testing sets # typically 3/4 of the data is used to train, 1/4 of the data is used to test # x is the data you are testing : y is the target values of the corresponding data x_train, x_test, y_train, y_test, images_train, images_test = train_test_split(standardized_data, self.target, self.images, test_size=test_size, random_state=random_state) # gets the number of training features n_samples, n_features = x_train.shape # print out the number of samples and features print("# of training samples: ", n_samples) print("# of training features: ", n_features) # num_digits is the amount of unique targets n_digits = len(np.unique(y_train)) # create the KMeans model. # init defaults to init='k-means++' # add n-init argument to determine how many different centroid configurations the algorithm will try clf = cluster.KMeans(init='k-means++', n_clusters=n_digits, random_state=random_state) # fit the x_train data to the model clf.fit(x_train) if show: # create the figure with a size of 8x3 inches fig = plt.figure(figsize=(8, 4)) # Add title fig.suptitle('Cluster Center Images', fontsize=14, fontweight='bold') # For all labels (0-9) for i in range(10): # Initialize subplots in a grid of 2X5, at i+1th position ax = fig.add_subplot(2, 5, 1 + i) # Display images ax.imshow(clf.cluster_centers_[i].reshape((8, 8)), cmap=plt.cm.binary, interpolation="nearest") # Don't show the axes plt.axis('off') # Show the plot plt.show() # predict the labels for x_test y_pred = clf.predict(x_test) # print out the first 50 predicted and test values print("Predicted Values:\n",y_pred[:50]) print("Target Values:\n",y_test[:50]) print("Shape of Data:\n",clf.cluster_centers_.shape) # Create an isomap and fit the `digits` data to it x_iso = Isomap(n_neighbors=10).fit_transform(x_train) # Compute cluster centers and predict cluster index for each sample clusters = clf.fit_predict(x_train) if show: # Create a plot with subplots in a grid of 1X2 fig = plt.figure(1, (8, 4)) gs = gridspec.GridSpec(1, 2) ax = [fig.add_subplot(ss) for ss in gs] # Adjust layout fig.suptitle('Predicted Versus Training Labels(ISOMAP)', fontsize=14, fontweight='bold') # Add scatterplots to the subplots ax[0].scatter(x_iso[:, 0], x_iso[:, 1], c=clusters, edgecolors='black') ax[0].set_title('Predicted Training Labels') ax[1].scatter(x_iso[:, 0], x_iso[:, 1], c=y_train, edgecolors='black') ax[1].set_title('Actual Training Labels') gs.tight_layout(fig, rect=[0, 0.03, 1, 0.95]) # Show the plots plt.show() # Model and fit the `digits` data to the PCA model x_pca = PCA(n_components=2).fit_transform(x_train) # Compute cluster centers and predict cluster index for each sample clusters = clf.fit_predict(x_train) if show: # Create a plot with subplots in a grid of 1X2 fig = plt.figure(1, (8, 4)) gs = gridspec.GridSpec(1, 2) ax = [fig.add_subplot(ss) for ss in gs] # Adjust layout fig.suptitle('Predicted Versus Training Labels (PCA)', fontsize=14, fontweight='bold') fig.subplots_adjust(top=0.85) # Add scatterplots to the subplots ax[0].scatter(x_pca[:, 0], x_pca[:, 1], c=clusters, edgecolors='black') ax[0].set_title('Predicted Training Labels') ax[1].scatter(x_pca[:, 0], x_pca[:, 1], c=y_train, edgecolors='black') ax[1].set_title('Actual Training Labels') gs.tight_layout(fig, rect=[0, 0.03, 1, 0.95]) # Show the plots plt.show() # Print out the confusion matrix to see how the model is incorrect print("Classification 
Report:\n",metrics.classification_report(y_test, y_pred)) print("Confusion Matrix:\n",metrics.confusion_matrix(y_test, y_pred)) # So looking at these numbers we can see that the kmeans model is not a good fit for our problem # this means that we must pick a different model for our data print('% 9s' % 'inertia h**o compl v-meas ARI AMI silhouette') print('%i %.3f %.3f %.3f %.3f %.3f %.3f' % (clf.inertia_, homogeneity_score(y_test, y_pred), completeness_score(y_test, y_pred), v_measure_score(y_test, y_pred), adjusted_rand_score(y_test, y_pred), adjusted_mutual_info_score(y_test, y_pred), silhouette_score(x_test, y_pred, metric='euclidean')))
n_samples, n_features = data.shape print("\t n_samples %d, \t n_features %d" % (n_samples, n_features)) print(82 * '_') print('init\t\ttime\tinertia\thomo\tcompl\tv-meas\tARI\tAMI\tsilhouette') t0 = time.time() kmeans = KMeans(init='random', n_clusters=10, n_init=10) kmeans.fit(data) print('%-9s\t%.2fs\t%i\t%.3f\t%.3f\t%.3f\t%.3f\t%.3f\t%.3f' % ('Random', (time.time() - t0), kmeans.inertia_, metrics.homogeneity_score(labels, kmeans.labels_), metrics.completeness_score(labels, kmeans.labels_), metrics.v_measure_score(labels, kmeans.labels_), metrics.adjusted_rand_score(labels, kmeans.labels_), metrics.adjusted_mutual_info_score( labels, kmeans.labels_, average_method='arithmetic'), metrics.silhouette_score( data, kmeans.labels_, metric='euclidean', sample_size=sample_size))) print(82 * '_') # Visualize the results on PCA-reduced data - random_raw reduced_data = PCA(n_components=2).fit_transform(data) kmeans = KMeans(init='random', n_clusters=10, n_init=10) kmeans.fit(reduced_data) # Step size of the mesh. Decrease to increase the quality of the VQ.
# x_data = x_data[:,params]
# print(x_data.shape)
km = KMeans(n_clusters=3)
km.fit(x_data)

# cluster assignment of each sample
predict_pre = km.labels_
print("===========================================")
print("Clustering result:")
print(predict_pre)

# Rand index: measures how well the two label assignments agree
print("Adjusted Rand index: " + str(metrics.adjusted_rand_score(y_data, predict_pre)))
# V-measure
print("Homogeneity: " + str(metrics.homogeneity_score(y_data, predict_pre)))
print("Completeness: " + str(metrics.completeness_score(y_data, predict_pre)))
print("V-measure (harmonic mean of the two): " + str(metrics.v_measure_score(y_data, predict_pre)))
# silhouette coefficient
print("Silhouette coefficient: " + str(metrics.silhouette_score(x_data, predict_pre)))

# list of colors, one per cluster
color = ['orange', 'green', 'blue']
# map each predicted cluster to its color
colr1 = [color[i] for i in predict_pre]
plt.scatter(x_data[:, 1], x_data[:, 2], color=colr1)
plt.xlabel("Feature 1")
plt.ylabel("Feature 2")
plt.show()
plt.savefig(path.join(PLOT_DIR, abbrev + "_em-nmf_scatter.png"), bbox_inches='tight') plt.show() plt.close() # parallel coordinates plot print("# Parallel Coordinates Plot for " + label) visualizer = ParallelCoordinates(features=feature_names, sample=0.1, shuffle=True, fast=True) visualizer.fit_transform(X, y_pred) visualizer.ax.set_xticklabels(visualizer.ax.get_xticklabels(), rotation=45, horizontalalignment='right') visualizer.finalize() plt.savefig(path.join(PLOT_DIR, abbrev + "_em-nmf_parallel.png"), bbox_inches='tight') visualizer.show() plt.close() # compare with ground truth (classes) print(label + ": Homogeneity Score = " + str(metrics.homogeneity_score(y, y_pred))) print(label + ": V Measure Score = " + str(metrics.v_measure_score(y, y_pred))) print(label + ": Mutual Info Score = " + str(metrics.mutual_info_score(y, y_pred))) print(label + ": Adjusted Rand Index = " + str(metrics.adjusted_rand_score(y, y_pred)))
                 n_jobs=None, p=None)
y_method1 = cluster.fit_predict(X_pca)
labels = cluster.labels_
noOfClusters = len(set(labels))
print(noOfClusters)
print("Method1: ", Counter(y_method1))

s_score = silhouette_score(X_pca, y_method1)
print(s_score)

# store results under new names so the imported metric functions are not shadowed
homogeneity = homogeneity_score(target, labels)
v_measure = v_measure_score(target, labels, beta=20.0)
completeness = completeness_score(target, labels)
contingency = contingency_matrix(target, labels)
print("homogeneity_score: ", homogeneity)
print("v_measure_score: ", v_measure)
print("completeness_score: ", completeness)
print("contingency_matrix: ", contingency)

print(datetime.now() - startTime)

# DBSCAN Guiding Question 3:
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder, LabelEncoder, scale
from sklearn.decomposition import PCA
#*****************************calculation**************************************
num_cluster = 3
clusters = tfidf_kmeans(TF_X, k=num_cluster)
#print(len(clusters))
#print(len(labels))

print(82 * '_')
print('init\t\ttime\thomo\tcompl\tv-meas\tARI\tAMI\tkappa\tcorr\tsilh_Clus\tsilh_HMN')
print('%-9s\t%.2fs\t%.3f\t%.3f\t%.3f\t%.3f\t%.3f\t%.3f\t%-9s\t%.3f\t%.3f' % (
    name,
    (time() - t0),
    metrics.homogeneity_score(labels, clusters),
    metrics.completeness_score(labels, clusters),
    metrics.v_measure_score(labels, clusters),
    metrics.adjusted_rand_score(labels, clusters),
    metrics.adjusted_mutual_info_score(labels, clusters),
    metrics.cohen_kappa_score(labels, clusters, weights='linear'),
    str(spearmanr(labels, clusters)),
    metrics.silhouette_score(TF_X, clusters, metric='euclidean'),
    metrics.silhouette_score(TF_X, labels, metric='euclidean'),
))

#**************************error analysis**************************************
from sklearn.metrics.cluster import contingency_matrix
x = labels    # actual labels
y = clusters  # predicted labels
error_analysis = contingency_matrix(x, y)

#***************************plot************************************************
from sklearn.metrics.pairwise import cosine_similarity
# report timing
printlog('\t Time taken = {0} s'.format(end - start))

# ----------------------------------------------------------------------
# stats
# Number of clusters in labels, ignoring noise if present.
n_clusters = len(set(labels)) - (1 if -1 in labels else 0)
n_clusters_true = len(set(labels_true)) - (1 if -1 in labels_true else 0)
printlog('\t Estimated number of clusters: {0}'.format(n_clusters))

# print stats
args = [labels_true, labels]
pargs = [
    metrics.homogeneity_score(*args),
    metrics.completeness_score(*args),
    metrics.v_measure_score(*args),
    metrics.adjusted_rand_score(*args),
    metrics.adjusted_mutual_info_score(*args)
]
printlog("\t Homogeneity: {0:.3f}\n\t Completeness: {1:.3f}"
         "\n\t V-measure: {2:.3f}\n\t Adjusted Rand Index: {3:.3f}"
         "\n\t Adjusted Mutual Information: {4:.3f}".format(*pargs))

# ----------------------------------------------------------------------
# comparing results
printlog('Comparing results...')
merged = compare_results(groups, labels_true, labels)

# ----------------------------------------------------------------------
# Plot result
printlog('Plotting graphs...')
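# The -1 entries stripped out above are DBSCAN's noise label. For context, a
# small self-contained illustration of where `labels` / `labels_true` of this
# shape typically come from (synthetic data; `printlog`, `groups` and
# `compare_results` from the original are not reproduced here):
import numpy as np
from sklearn import metrics
from sklearn.cluster import DBSCAN
from sklearn.datasets import make_blobs

X_demo, labels_true_demo = make_blobs(n_samples=300, centers=3,
                                      cluster_std=0.6, random_state=0)
labels_demo = DBSCAN(eps=0.5, min_samples=5).fit_predict(X_demo)

# DBSCAN marks outliers with -1, so they are excluded from the cluster count.
n_clusters_demo = len(set(labels_demo)) - (1 if -1 in labels_demo else 0)
print('Estimated number of clusters:', n_clusters_demo)
print('V-measure: %.3f' % metrics.v_measure_score(labels_true_demo, labels_demo))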
                             stop_words='english',
                             use_idf=True, smooth_idf=True, norm='l2')
#vectorizer = TfidfVectorizer(max_df=0.15, min_df=1, stop_words=stopwords, use_idf=True, smooth_idf=True, norm='l2')
X_test = vectorizer.fit_transform(test_data)

km = KMeans(n_clusters=20, init='k-means++', max_iter=100, n_init=5)
km.fit(X_test)
print(len(km.labels_), len(test_data))

score = metrics.homogeneity_score(test_labels, km.labels_)
print("homogeneity_score: %0.3f" % score)
score = metrics.completeness_score(test_labels, km.labels_)
print("completeness_score: %0.3f" % score)
score = metrics.v_measure_score(test_labels, km.labels_)
print("v_measure_score: %0.3f" % score)
score = metrics.accuracy_score(test_labels, km.labels_)
print("acc_score: %0.3f" % score)
score = metrics.normalized_mutual_info_score(test_labels, km.labels_)
print("nmi_score: %0.3f" % score)

file = open(
    "D:/PhD/dr.norbert/dataset/shorttext/biomedical/semisupervised/biomedicalraw_ensembele_traintest",
    "w")
for i in range(len(train_labels)):
    file.write(train_labels[i] + "\t" + train_trueLabels[i] + "\t" + train_data[i])
for i in range(len(km.labels_)):
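# Note that accuracy_score(test_labels, km.labels_) above treats the arbitrary
# cluster ids as if they were class labels, so the reported accuracy is only
# meaningful after mapping each cluster to its best-matching class. A hedged
# sketch of that mapping with the Hungarian algorithm follows; aligned_accuracy
# is not part of the original script.
import numpy as np
from scipy.optimize import linear_sum_assignment
from sklearn import metrics
from sklearn.metrics.cluster import contingency_matrix

def aligned_accuracy(y_true, cluster_ids):
    # Map each cluster to the true class it overlaps most, then score accuracy.
    y_true = np.asarray(y_true)
    cluster_ids = np.asarray(cluster_ids)
    cont = contingency_matrix(y_true, cluster_ids)   # rows: classes, cols: clusters
    row_ind, col_ind = linear_sum_assignment(-cont)  # maximize total overlap
    classes = np.unique(y_true)
    clusters = np.unique(cluster_ids)
    mapping = {clusters[c]: classes[r] for r, c in zip(row_ind, col_ind)}
    # Clusters left unassigned (more clusters than classes) fall back to classes[0].
    remapped = np.array([mapping.get(c, classes[0]) for c in cluster_ids])
    return metrics.accuracy_score(y_true, remapped)

# e.g. print("aligned acc: %0.3f" % aligned_accuracy(test_labels, km.labels_))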
if opts.minibatch:
    km = MiniBatchKMeans(n_clusters=true_k, init='k-means++', n_init=1,
                         init_size=1000, batch_size=1000, verbose=opts.verbose)
else:
    km = KMeans(n_clusters=true_k, init='k-means++', max_iter=100, n_init=1,
                verbose=opts.verbose)

print("Clustering sparse data with %s" % km)
t0 = time()
km.fit(X)
print("done in %0.3fs" % (time() - t0))
print()

print("Homogeneity: %0.3f" % metrics.homogeneity_score(labels, km.labels_))
print("Completeness: %0.3f" % metrics.completeness_score(labels, km.labels_))
print("V-measure: %0.3f" % metrics.v_measure_score(labels, km.labels_))
print("Adjusted Rand-Index: %.3f" % metrics.adjusted_rand_score(labels, km.labels_))
# silhouette is computed on the predicted cluster assignment, not the true labels
print("Silhouette Coefficient: %0.3f" % metrics.silhouette_score(X, km.labels_, sample_size=1000))
print()
bestChromosomeInAllGenerations, bestLabelsPredInAllGenerations, bestFitnessInAllGenerations, allBestFitness = EvoNP.run(
    points, nPoints, k, nChromosomes, nGenerations, crossoverProbability, mutationProbability)

bestLabelsPred = bestLabelsPredInAllGenerations[bestChromosomeInAllGenerations]
print("HS: " + str(float("%0.2f" % metrics.homogeneity_score(labelsTrue, bestLabelsPred))))
print("CS: " + str(float("%0.2f" % metrics.completeness_score(labelsTrue, bestLabelsPred))))
print("VM: " + str(float("%0.2f" % metrics.v_measure_score(labelsTrue, bestLabelsPred))))
print("AMI: " + str(float("%0.2f" % metrics.adjusted_mutual_info_score(labelsTrue, bestLabelsPred))))
print("ARI: " + str(float("%0.2f" % metrics.adjusted_rand_score(labelsTrue, bestLabelsPred))))

# plot fitness progression
allGenerations = [x + 1 for x in range(nGenerations)]
plt.plot(allGenerations, allBestFitness)
plt.title(filename[:-4])
plt.xlabel('Generations')
titles = ('Original data', 'KMeans++ clustering',
          'Rotated data', 'KMeans++ on rotated data',
          'Unequal-variance data', 'KMeans++ on unequal-variance data',
          'Unequal-size data', 'KMeans++ on unequal-size data')
model = KMeans(n_clusters=4, init='k-means++', n_init=5)
plt.figure(figsize=(8, 9), facecolor='w')
for i, (x, y, title) in enumerate(zip(data_list, y_list, titles), start=1):
    plt.subplot(4, 2, i)
    plt.title(title)
    if i % 2 == 1:
        # odd panels show the data with its true labels
        y_pred = y
    else:
        # even panels show the KMeans++ prediction and its scores
        y_pred = model.fit_predict(x)
        print(i)
        print('Homogeneity:', homogeneity_score(y, y_pred))
        print('Completeness:', completeness_score(y, y_pred))
        print('V-measure:', v_measure_score(y, y_pred))
        print('AMI:', adjusted_mutual_info_score(y, y_pred))
        print('ARI:', adjusted_rand_score(y, y_pred))
        print('Silhouette:', silhouette_score(x, y_pred), '\n')
    plt.scatter(x[:, 0], x[:, 1], c=y_pred, s=30, cmap=cm, edgecolors='none')
    x1_min, x2_min = np.min(x, axis=0)
    x1_max, x2_max = np.max(x, axis=0)
    x1_min, x1_max = expand(x1_min, x1_max)
    x2_min, x2_max = expand(x2_min, x2_max)
    plt.xlim((x1_min, x1_max))
    plt.ylim((x2_min, x2_max))
    plt.grid(True, ls=':')
plt.tight_layout(pad=2, rect=(0, 0, 1, 0.97))
plt.suptitle('Effect of the data distribution on KMeans clustering', fontsize=18)
plt.show()
# bench_k_means(KMeans(init='random', n_clusters=n_digits, n_init=10), name="random", data=data)
for i in range(5):
    bench_k_means(GaussianMixture(n_components=n_digits_i[i], random_state=0),
                  name="GaussianMixture", data=PCA_data_trans)

#ICA----------------------------------
kmeans_ICA = KMeans(n_clusters=2, random_state=0).fit(ICA_data_trans)
float(sum(kmeans_ICA.labels_ == labels)) / float(len(labels))
metrics.homogeneity_score(labels, kmeans_ICA.labels_)

EMax_ICA = GaussianMixture(n_components=2, random_state=0).fit(ICA_data_trans)
EMax_ICA.labels_ = EMax_ICA.predict(ICA_data_trans)
float(sum(EMax_ICA.labels_ == labels)) / float(len(labels))
metrics.homogeneity_score(labels, EMax_ICA.labels_)
metrics.completeness_score(labels, EMax_ICA.labels_)
metrics.v_measure_score(labels, EMax_ICA.labels_)
metrics.adjusted_rand_score(labels, EMax_ICA.labels_)
metrics.adjusted_mutual_info_score(labels, EMax_ICA.labels_)
metrics.silhouette_score(ICA_data_trans, EMax_ICA.labels_,
                         metric='euclidean', sample_size=sample_size)

n_digits_i = [2, 5, 10, 20, 50]
for i in range(5):
    bench_k_means(KMeans(init='k-means++', n_clusters=n_digits_i[i], n_init=10),
                  name="k-means++", data=ICA_data_trans)

# bench_k_means(KMeans(init='random', n_clusters=n_digits, n_init=10), name="random", data=data)
for i in range(5):
def em(tx, ty, rx, ry, reduced_data, add="", times=5, dataset="", alg=""):
    clf = EM(n_components=times)
    clf.fit(reduced_data)

    # Step size of the mesh. Decrease to increase the quality of the VQ.
    h = .02  # point in the mesh [x_min, x_max]x[y_min, y_max].

    x_min, x_max = reduced_data[:, 0].min() - 1, reduced_data[:, 0].max() + 1
    y_min, y_max = reduced_data[:, 1].min() - 1, reduced_data[:, 1].max() + 1
    xx, yy = np.meshgrid(np.arange(x_min, x_max, h), np.arange(y_min, y_max, h))
    Z = clf.predict(np.c_[xx.ravel(), yy.ravel()])
    Z = Z.reshape(xx.shape)

    plt.figure()
    plt.clf()
    plt.imshow(Z, interpolation='nearest',
               extent=(xx.min(), xx.max(), yy.min(), yy.max()),
               cmap=plt.cm.Paired, aspect='auto', origin='lower')
    plt.plot(reduced_data[:, 0], reduced_data[:, 1], 'k.', markersize=2)
    # centroids = clf.cluster_centers_
    # plt.scatter(centroids[:, 0], centroids[:, 1],
    #             marker='x', s=169, linewidths=3,
    #             color='w', zorder=10)
    plt.title(dataset + ': EM clustering (' + alg + '-reduced data)')
    plt.xlim(x_min, x_max)
    plt.ylim(y_min, y_max)
    plt.xticks(())
    plt.yticks(())
    plt.show()

    clf = EM(n_components=times)
    clf.fit(tx)  # fit it to our data
    test = clf.predict(tx)
    result = clf.predict(rx)

    checker = EM(n_components=times)
    ry = ry.reshape(-1, 1)
    checker.fit(ry)
    truth = checker.predict(ry)

    td = np.reshape(test, (test.size, 1))
    rd = np.reshape(result, (result.size, 1))
    # newtx = np.append(td)
    # newrx = np.append(rd)
    myNN(test, ty, result, ry, alg="EM_" + alg)

    errs = []
    scores = []

    # this is what we will compare to
    checker = EM(n_components=2)
    ry = ry.reshape(-1, 1)
    checker.fit(ry)
    truth = checker.predict(ry)

    adj_rand = []
    v_meas = []
    mutual_info = []
    adj_mutual_info = []

    # so we do this a bunch of times
    for i in range(2, times):
        clusters = {x: [] for x in range(i)}

        # create a clusterer
        clf = EM(n_components=i)
        clf.fit(tx)  # fit it to our data
        test = clf.predict(tx)
        result = clf.predict(rx)  # and test it on the testing set

        for index, val in enumerate(result):
            clusters[val].append(index)
        mapper = {x: round(sum(truth[v] for v in clusters[x]) / float(len(clusters[x])))
                  if clusters[x] else 0
                  for x in range(i)}
        processed = [mapper[val] for val in result]
        errs.append(sum((processed - truth) ** 2) / float(len(ry)))
        scores.append(clf.score(tx, ty))
        adj_rand.append(metrics.adjusted_rand_score(ry.ravel(), result))
        v_meas.append(metrics.v_measure_score(ry.ravel(), result))
        mutual_info.append(metrics.fowlkes_mallows_score(ry.ravel(), result))
        adj_mutual_info.append(metrics.homogeneity_score(ry.ravel(), result))

    # plot([0, times, min(scores)-.1, max(scores)+.1], [range(2, times), scores, "-"], "Number of Clusters", "Log Likelihood", dataset+": EM Log Likelihood - " + alg, dataset+"_EM_"+alg)

    # other metrics
    # names = ["Adjusted Random", "V Measure", "Mutual Info", "Adjusted Mutual Info"]
    plt.figure()
    plt.title(dataset + ": EM Clustering measures - " + alg)
    plt.xlabel('Number of clusters')
    plt.ylabel('Score value')
    plt.plot(range(2, times), adj_rand, label="Adjusted Random")
    plt.plot(range(2, times), v_meas, label="V Measure")
    plt.plot(range(2, times), mutual_info, label="Fowlkes Mallows Score")
    plt.plot(range(2, times), adj_mutual_info, label="Homogeneity Score")
    plt.legend()
    plt.savefig("EMMetrics" + dataset + "_" + alg + ".png")

    kmeans = KM(n_clusters=2)
    kmeans.fit(reduced_data)

    # Step size of the mesh. Decrease to increase the quality of the VQ.
    h = .02  # point in the mesh [x_min, x_max]x[y_min, y_max].

    # Plot the decision boundary.
    # For that, we will assign a color to each
    x_min, x_max = reduced_data[:, 0].min() - 1, reduced_data[:, 0].max() + 1
    y_min, y_max = reduced_data[:, 1].min() - 1, reduced_data[:, 1].max() + 1
    xx, yy = np.meshgrid(np.arange(x_min, x_max, h), np.arange(y_min, y_max, h))

    # Obtain labels for each point in mesh. Use last trained model.
    Z = kmeans.predict(np.c_[xx.ravel(), yy.ravel()])

    # Put the result into a color plot
    Z = Z.reshape(xx.shape)
    plt.figure()
    plt.clf()
    plt.imshow(Z, interpolation='nearest',
               extent=(xx.min(), xx.max(), yy.min(), yy.max()),
               cmap=plt.cm.Paired, aspect='auto', origin='lower')
    plt.plot(reduced_data[:, 0], reduced_data[:, 1], 'k.', markersize=2)

    # Plot the centroids as a white X
    centroids = kmeans.cluster_centers_
    plt.scatter(centroids[:, 0], centroids[:, 1],
                marker='x', s=169, linewidths=3,
                color='w', zorder=10)
    plt.title(dataset + ': EM clustering (' + alg + '-reduced data)\n'
              'Centroids are marked with a white cross')
    plt.xlim(x_min, x_max)
    plt.ylim(y_min, y_max)
    plt.xticks(())
    plt.yticks(())
    plt.show()
def find_similar_products():
    df = pd.read_csv(
        '/Users/srinath/playground/data-science/BimboInventoryDemand/producto_tabla.csv')
    labels = df['Producto_ID']
    print "have " + str(df.shape[0]) + " products"

    vectorizer = TfidfVectorizer(max_df=0.5, max_features=200, min_df=2,
                                 stop_words='english', use_idf=True)
    X = vectorizer.fit_transform(df['NombreProducto'])
    print("n_samples: %d, n_features: %d" % X.shape)
    print type(X)
    print X

    print("Performing dimensionality reduction using LSA")
    # Vectorizer results are normalized, which makes KMeans behave as
    # spherical k-means for better results. Since LSA/SVD results are
    # not normalized, we have to redo the normalization.
    svd = TruncatedSVD(5)
    normalizer = Normalizer(copy=False)
    lsa = make_pipeline(svd, normalizer)
    X = lsa.fit_transform(X)
    explained_variance = svd.explained_variance_ratio_.sum()
    print("Explained variance of the SVD step: {}%".format(int(explained_variance * 100)))
    print("new size", X.shape)
    print type(X)
    print X

    # Do the actual clustering
    km = KMeans(n_clusters=30, init='k-means++', max_iter=100, n_init=1, verbose=True)
    print("Clustering sparse data with %s" % km)
    # km.fit(X)
    results = km.fit_predict(X)
    print len(results), results
    print("Homogeneity: %0.3f" % metrics.homogeneity_score(labels, km.labels_))
    print("Completeness: %0.3f" % metrics.completeness_score(labels, km.labels_))
    print("V-measure: %0.3f" % metrics.v_measure_score(labels, km.labels_))
    print("Adjusted Rand-Index: %.3f" % metrics.adjusted_rand_score(labels, km.labels_))
    print("Silhouette Coefficient: %0.3f" % metrics.silhouette_score(X, km.labels_, sample_size=1000))
    print()

    products_clusters = np.column_stack([labels, results])
    to_saveDf = pd.DataFrame(products_clusters, columns=["Producto_ID", "Cluster"])
    to_saveDf.to_csv('product_clusters.csv', index=False)

    to_saveDf['NombreProducto'] = df['NombreProducto']
    grouped = to_saveDf.groupby(['Cluster'])['NombreProducto']
    grouped.apply(print_cluster)
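# The comment about spherical k-means above relies on every row of the LSA
# output having unit L2 norm. If a sanity check is wanted right after the
# lsa.fit_transform(X) call, something like the following (an illustrative
# addition, not part of the original function) would confirm it:
#
#     row_norms = np.linalg.norm(X, axis=1)
#     print("min/max row norm: %f %f" % (row_norms.min(), row_norms.max()))  # both ~1.0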
def km(tx, ty, rx, ry, reduced_data, add="", times=5, dataset="", alg=""):
    processed = []
    adj_rand = []
    v_meas = []
    mutual_info = []
    adj_mutual_info = []
    sil = []
    inertia = []

    for i in range(2, times):
        clusters = {x: [] for x in range(i)}
        clf = KM(n_clusters=i)
        clf.fit(tx)
        test = clf.predict(tx)
        result = clf.predict(rx)
        adj_rand.append(metrics.adjusted_rand_score(ry.ravel(), result))
        v_meas.append(metrics.v_measure_score(ry.ravel(), result))
        mutual_info.append(metrics.fowlkes_mallows_score(ry.ravel(), result))
        adj_mutual_info.append(metrics.homogeneity_score(ry.ravel(), result))
        inertia.append(clf.inertia_)

    plots = [adj_rand, v_meas, mutual_info, adj_mutual_info]
    plt.title(dataset + ": KM Clustering measures - " + alg)
    plt.xlabel('Number of clusters')
    plt.ylabel('Score value')
    plt.plot(range(2, times), adj_rand, label="Adjusted Random")
    plt.plot(range(2, times), v_meas, label="V Measure")
    plt.plot(range(2, times), mutual_info, label="Fowlkes Mallows Score")
    plt.plot(range(2, times), adj_mutual_info, label="Homogeneity Score")
    plt.legend()
    plt.ylim(-0.05, 1.05)
    plt.savefig("KMeansMetric" + dataset + "_" + alg + ".png")

    plt.figure()
    plt.title(dataset + ": KMeans Inertia - " + alg)
    plt.xlabel('Number of clusters')
    plt.ylabel('Inertia')
    plt.plot(range(2, times), inertia)
    plt.savefig("KM-Inertia-" + dataset + "-" + alg + ".png")

    td = np.reshape(test, (test.size, 1))
    rd = np.reshape(result, (result.size, 1))
    newtx = np.append(tx, td, 1)
    newrx = np.append(rx, rd, 1)

    h = .02  # point in the mesh [x_min, x_max]x[y_min, y_max].

    # Plot the decision boundary. For that, we will assign a color to each
    x_min, x_max = X[:, 0].min() - 1, X[:, 0].max() + 1
    y_min, y_max = X[:, 1].min() - 1, X[:, 1].max() + 1

    best_clusterer = KM(n_clusters=4)
    best_clusterer.fit(X)
    Z = best_clusterer.predict(X)
    print(len(Z))
    print(len(X))

    plt.figure(1)
    plt.clf()
    colors = ['r', 'g', 'b', 'y', 'c', 'm', '#eeefff', '#317c15', '#4479b4',
              '#6b2b9c', '#63133b', '#6c0d22', '#0c7c8c', '#67c50e', '#c5670e',
              '#946c47', '#58902a', '#54b4e4', '#e4549e', '#2b2e85']
    for i in range(0, len(X)):
        plt.plot(X[i][0], X[i][1], marker='.', color=colors[Z[i]], markersize=2)
    # plt.plot(X[:, 0], X[:, 1], 'k.', markersize=2)

    # Plot the centroids as a white X
    centroids = best_clusterer.cluster_centers_
    plt.scatter(centroids[:, 0], centroids[:, 1],
                marker='x', s=169, linewidths=3,
                color='k', zorder=10)
    plt.title('K-means Clusters ' + alg)
    plt.xlim(x_min, x_max)
    plt.ylim(y_min, y_max)
    plt.xticks(())
    plt.yticks(())
    plt.show()

    kmeans = KM(n_clusters=3)
    kmeans.fit(tx)
    result = pd.DataFrame(kmeans.transform(tx), columns=['KM%i' % i for i in range(3)])
    my_color = pd.Series(ty).astype('category').cat.codes
    fig = plt.figure()
    ax = fig.add_subplot(111, projection='3d')
    ax.scatter(result['KM0'], result['KM1'], result['KM2'], c=my_color, cmap="Dark2_r", s=60)
    plt.show()

    reduced_data = PCA(n_components=2).fit_transform(tx)
    kmeans = KM(n_clusters=4)
    kmeans.fit(reduced_data)

    # Step size of the mesh. Decrease to increase the quality of the VQ.
    h = .02  # point in the mesh [x_min, x_max]x[y_min, y_max].
    x_min, x_max = reduced_data[:, 0].min() - 1, reduced_data[:, 0].max() + 1
    y_min, y_max = reduced_data[:, 1].min() - 1, reduced_data[:, 1].max() + 1
    xx, yy = np.meshgrid(np.arange(x_min, x_max, h), np.arange(y_min, y_max, h))
    Z = kmeans.predict(np.c_[xx.ravel(), yy.ravel()])
    Z = Z.reshape(xx.shape)

    plt.figure()
    plt.clf()
    plt.imshow(Z, interpolation='nearest',
               extent=(xx.min(), xx.max(), yy.min(), yy.max()),
               cmap=plt.cm.Paired, aspect='auto', origin='lower')
    plt.plot(reduced_data[:, 0], reduced_data[:, 1], 'k.', markersize=2)

    centroids = kmeans.cluster_centers_
    plt.scatter(centroids[:, 0], centroids[:, 1],
                marker='x', s=169, linewidths=3,
                color='w', zorder=10)
    plt.title(dataset + ': K-means clustering (' + alg + '-reduced data)\n'
              'Centroids are marked with a white cross')
    plt.xlim(x_min, x_max)
    plt.ylim(y_min, y_max)
    plt.xticks(())
    plt.yticks(())
    plt.show()

    checker = KM(n_clusters=2)
    ry = ry.reshape(-1, 1)
    checker.fit(ry)
    truth = checker.predict(ry)

    clusters = {x: [] for x in range(4)}
    clf = KM(n_clusters=4)
    clf.fit(tx)  # fit it to our data
    test = clf.predict(tx)
    result = clf.predict(rx)  # and test it on the testing set
    for index, val in enumerate(result):
        clusters[val].append(index)
    mapper = {x: round(sum(truth[v] for v in clusters[x]) / float(len(clusters[x])))
              if clusters[x] else 0
              for x in range(4)}
    processed = [mapper[val] for val in result]
    print(sum((processed - truth) ** 2) / float(len(ry)))

    clf = KM(n_clusters=times)
    clf.fit(tx)  # fit it to our data
    test = clf.predict(tx)
    result = clf.predict(rx)

    checker = KM(n_clusters=times)
    ry = ry.reshape(-1, 1)
    checker.fit(ry)
    truth = checker.predict(ry)

    td = np.reshape(test, (test.size, 1))
    rd = np.reshape(result, (result.size, 1))
    newtx = np.append(tx, td, 1)
    newrx = np.append(rx, rd, 1)
    myNN(test, ty, result, ry, alg="KM_" + alg)
    nn(newtx, ty, newrx, ry, add="onKM" + add)
km = categ.fit_predict(X)

# Agglomerative Clustering Technique
'''
from sklearn.cluster import AgglomerativeClustering
categ = AgglomerativeClustering(n_clusters=3, affinity='euclidean', linkage='ward')
km = categ.fit_predict(X)
'''

# Confusion matrix
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(Y, km)

# accuracy score
from sklearn.metrics import v_measure_score
print("v_measure_score", v_measure_score(Y, km))
#print("accuracy_score = {:.2f}%".format(accuracy_score(Y,km)*100))

# Visualising the clusters
'''
plt.scatter(X[Y == 0, 0], X[Y == 0, 1], s=100, c='red', label='cluster 1')
plt.scatter(X[Y == 1, 0], X[Y == 1, 1], s=100, c='green', label='cluster 2')
plt.scatter(X[Y == 2, 0], X[Y == 2, 1], s=100, c='black', label='cluster 3')
#plt.scatter(y_pred.cluster_centers_[:,0], y_pred.cluster_centers_[:,1], s=300, c='yellow')
plt.title('K-mean_cluster(real)')
plt.xlabel('pc1')
plt.ylabel('pc2')
plt.show()

# Visualising the clusters
plt.scatter(X[km == 0, 0], X[km == 0, 1], s=100, c='red', label='cluster 1')
km_sse = []
km_silhouette = []
km_vmeasure = []
km_ami = []
km_homogeneity = []
km_completeness = []
cluster_range = (2, 12)

for i in range(cluster_range[0], cluster_range[1]):
    km = KMeans(n_clusters=i, random_state=0).fit(X_random_proj)
    preds = km.predict(X_random_proj)
    km_sse.append(-km.score(X_random_proj))
    km_silhouette.append(silhouette_score(X_random_proj, preds))
    km_vmeasure.append(v_measure_score(y, preds))
    km_ami.append(adjusted_mutual_info_score(y, preds))
    km_homogeneity.append(homogeneity_score(y, preds))
    km_completeness.append(completeness_score(y, preds))
    print(f"Done for cluster {i}")


# In[100]:


plt.figure(figsize=(21, 11))

# SSE
plt.subplot(2, 3, 1)
plt.plot([i for i in range(cluster_range[0], cluster_range[1])],
         km_sse, 'b-o', linewidth=3, markersize=12)
plt.grid(True)
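# KMeans.score returns the negative inertia (sum of squared distances to the
# closest centroid), which is why it is negated above to obtain an SSE curve
# for the elbow plot. A sketch of how the remaining metric lists could fill
# the rest of the 2x3 grid (the exact layout of the original notebook is a guess):
metric_curves = [
    ("Silhouette", km_silhouette),
    ("V-measure", km_vmeasure),
    ("Adjusted mutual information", km_ami),
    ("Homogeneity", km_homogeneity),
    ("Completeness", km_completeness),
]
ks = list(range(cluster_range[0], cluster_range[1]))
for pos, (name, values) in enumerate(metric_curves, start=2):
    plt.subplot(2, 3, pos)
    plt.plot(ks, values, 'b-o', linewidth=3, markersize=12)
    plt.title(name + " vs. number of clusters")
    plt.grid(True)
plt.show()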