Example #1
def compute_silhouette_score(clusters):
    """
    Compute the euclidean silhouette score and the cosine silhouette score. Return the scores.

    :param clusters: clusters assignment for each tweet
    :type clusters: list

    :return: the silhouette scores
    :rtype: tuple
    """

    # Load the files
    tfidf_matrix = pickle.load(open('TF-IDF Matrix - ' + str(n_data) + ' Tweets.p', 'rb'))

    # Compute the Silhouette Score
    start = timer()
    distance = 1 - cosine_similarity(tfidf_matrix)
    euclidean_silhouette_score = silhouette_score(tfidf_matrix, clusters, metric='euclidean')
    cosine_silhouette_score = silhouette_score(distance, clusters, metric='precomputed')
    end = timer()
    print('Silhouette Score (Euclidean): %.4f' % euclidean_silhouette_score)
    print('Silhouette Score (Cosine):  %.4f' % cosine_silhouette_score)
    print('Obtained the Silhouette Score in %.2f seconds' % (end - start))
    txt_file.write('Silhouette Score (Euclidean): %.4f. \n' % euclidean_silhouette_score)
    txt_file.write('Silhouette Score (Cosine):  %.4f. \n' % cosine_silhouette_score)
    txt_file.write('Obtained the Silhouette Score in %.2f seconds. \n' % (end - start))

    return euclidean_silhouette_score, cosine_silhouette_score
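A minimal, self-contained sketch of the same pattern used above: Euclidean silhouette on the raw TF-IDF matrix versus cosine silhouette via a precomputed distance matrix. The toy documents and the two-cluster labels are made up purely for illustration:

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import silhouette_score
from sklearn.metrics.pairwise import cosine_similarity

docs = ["cats purr softly", "kittens purr", "dogs bark loudly", "puppies bark"]
labels = [0, 0, 1, 1]  # hypothetical cluster assignment for each document

tfidf_matrix = TfidfVectorizer().fit_transform(docs)
euclidean = silhouette_score(tfidf_matrix, labels, metric='euclidean')
cosine = silhouette_score(1 - cosine_similarity(tfidf_matrix), labels, metric='precomputed')
print('Silhouette (Euclidean): %.4f, Silhouette (Cosine): %.4f' % (euclidean, cosine))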
Example #2
 def test_silhouette(self):
     n1 = np.array([[1,2,1], [1,3,1], [7,8,2], [7,9,2], [13,19,3]])
     print(Silhouette.score(n1))
     print(silhouette_score(n1, n1[:,-1]))
     n2 = np.array([[1,2,1], [1,3,2], [7,8,2], [7,9,1], [13,19,3]])
     print(Silhouette.score(n2))
     print(silhouette_score(n2, n2[:,-1]))
def get_constant_height_labels(clustering, n_clusters=None):
    """
    Use silhouette analysis to select the best height at which to cut a linkage matrix.
    :param clustering: dict with a 'linkage' matrix, a precomputed 'distance_df',
                       a 'reorder_vec' and reference 'labels'
    :param n_clusters: int (optional). If given, cut the dendrogram at exactly this
                       many clusters instead of searching for the best cut.
    """
    N_variables = len(clustering['reorder_vec'])
    scores = []
    if n_clusters is None:
        for k_clusters in range(2,N_variables//3):
            labels = cut_tree(clustering['linkage'], n_clusters=k_clusters)
            try:
                score = silhouette_score(clustering['distance_df'], 
                                         labels.ravel(), metric='precomputed')
            except ValueError:
                continue
            scores.append((k_clusters,score))
        best_k = max(scores, key=lambda x: x[1])[0]
        labels = cut_tree(clustering['linkage'], n_clusters=best_k)

    else:
        labels = cut_tree(clustering['linkage'], n_clusters=n_clusters)
        score = silhouette_score(clustering['distance_df'], 
                                         labels, metric='precomputed')
        scores.append((n_clusters, score))
    labels = reorder_labels(labels.flatten(), clustering['linkage'])
    # comparison
    MI = adjusted_mutual_info_score(labels, clustering['labels'])
    return labels, scores, MI
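A hedged sketch of the clustering dict this helper expects; the key names simply mirror the lookups in the function body, the random correlation matrix is illustrative, and the actual call is left commented out because reorder_labels (along with cut_tree, silhouette_score and adjusted_mutual_info_score) is a project-specific import assumed elsewhere:

import numpy as np
import pandas as pd
from scipy.cluster.hierarchy import linkage
from scipy.spatial.distance import squareform

corr = pd.DataFrame(np.corrcoef(np.random.RandomState(0).rand(10, 50)))
distance_df = 1 - corr.abs()
clustering = {
    'linkage': linkage(squareform(distance_df.values, checks=False), method='average'),
    'distance_df': distance_df,
    'reorder_vec': np.arange(len(corr)),
    'labels': np.arange(len(corr)) % 2,  # placeholder reference labels for the AMI comparison
}
# labels, scores, MI = get_constant_height_labels(clustering)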
Example #4
def bench_k_means(estimator, data, labels):
    t0 = time()
    estimator.fit(data)
    print("time to fit: {:.5}".format(time() - t0))
    homogeneity = metrics.homogeneity_score(labels, estimator.labels_)
    completeness = metrics.completeness_score(labels, estimator.labels_)
    v_measure = metrics.v_measure_score(labels, estimator.labels_)
    print("homogeneity {:.5}, completeness {:.5}, v_measure_score {:.5}".format(
        homogeneity, completeness, v_measure)
    )

    adj_rand_score = metrics.adjusted_rand_score(
        labels, estimator.labels_
    )
    print("adjusted_rand_score {:.5}".format(adj_rand_score))

    adj_mutual_info_score = metrics.adjusted_mutual_info_score(
        labels,  estimator.labels_
    )
    print("adjusted_mutual_info_score {:.5}".format(
        adj_mutual_info_score)
    )

    silhouette_score = metrics.silhouette_score(
        data, estimator.labels_, metric='euclidean'
    )
    print("silhouette_score {:.5}".format(silhouette_score))

    return [
        homogeneity, completeness, v_measure, adj_rand_score,
        adj_mutual_info_score, silhouette_score
    ]
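A hedged usage sketch for the benchmark above, assuming the function is in scope along with the time and sklearn.metrics imports its body relies on; the digits dataset and k=10 are illustrative:

from sklearn import datasets
from sklearn.cluster import KMeans
from sklearn.preprocessing import scale

digits = datasets.load_digits()
data = scale(digits.data)
scores = bench_k_means(KMeans(init='k-means++', n_clusters=10, n_init=10), data, digits.target)
print(scores)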
Example #5
def clustering_drawing():
    X,Tag = getData()

    n = 3
    kmeans_model = KMeans(n_clusters = n).fit(X)
    labels = kmeans_model.labels_
    score = metrics.silhouette_score(X, labels, metric='euclidean')

    scoreList = [score] 
    nList = [3,4,5,6,7,8,9]

    for i in range(4,10):  # try cluster counts 4 through 9
        # print i
        kmeans_model_temp = KMeans(n_clusters=i).fit(X)
        labels_temp = kmeans_model_temp.labels_
        score_temp = metrics.silhouette_score(X, labels_temp, metric='euclidean')
        print(i, score_temp)
        scoreList.append(float(score_temp))
        if float(score_temp) > score:
            kmeans_model = kmeans_model_temp
            labels = labels_temp
            n = i
            score = float(score_temp)
    print(n, labels)
    plt.axis([3,9,0.8,1.0])
    plt.plot(nList, scoreList, 'r--')
    plt.show()	
Example #6
def print_cluster(clusterTrainClass, labels, clusterTestStory):
	print("Homogeneity: %0.3f" % metrics.homogeneity_score(clusterTrainClass, labels))
	print("Completeness: %0.3f" % metrics.completeness_score(clusterTrainClass, labels))
	print("V-measure: %0.3f" % metrics.v_measure_score(clusterTrainClass, labels))
	print("Adjusted Rand Index: %0.3f" % metrics.adjusted_rand_score(clusterTrainClass, labels))
	print("Adjusted Mutual Information: %0.3f" % metrics.adjusted_mutual_info_score(clusterTrainClass, labels))
	print "Silhouette Coefficient:"
	print metrics.silhouette_score(clusterTestStory, labels, metric='euclidean')
def benchmark(estimator, name, data):
    t0 = time()
    estimator.fit(data)
    fit_time = time() - t0
    silhouette = metrics.silhouette_score(data, estimator.labels_,
                                          metric='euclidean', sample_size=None)
    print('% 9s   %.2fs    %i   %.3f'
          % (name, fit_time, estimator.inertia_, silhouette))
    return [fit_time, estimator.inertia_, silhouette]
def drawwDBSCAN(newarray,comparearray,cityname):
    X = StandardScaler().fit_transform(newarray)
    # print newarray
    # print "#########"
    # print X
    # X = newarray
    ##############################################################################
    # Compute DBSCAN
    db = DBSCAN(eps=0.3, min_samples=10).fit(X)
    core_samples_mask = np.zeros_like(db.labels_, dtype=bool)
    core_samples_mask[db.core_sample_indices_] = True
    labels = db.labels_

    # Number of clusters in labels, ignoring noise if present.
    n_clusters_ = len(set(labels)) - (1 if -1 in labels else 0)

    print('Estimated number of clusters: %d' % n_clusters_)
    print("Silhouette Coefficient: %0.3f"
          % metrics.silhouette_score(X, labels))

    ##############################################################################
    # Plot result
    matplotlib.style.use('ggplot')
    # Black removed and is used for noise instead.
    unique_labels = set(labels)
    colors = plt.cm.Spectral(np.linspace(0, 1, len(unique_labels)))
    for k, col in zip(unique_labels, colors):
        if k == -1:
            # Black used for noise.
            col = 'k'

        class_member_mask = (labels == k)

        xy = X[class_member_mask & core_samples_mask]
        plt.plot(xy[:, 0], xy[:, 1], 'o', markerfacecolor=col,
                 markeredgecolor='k', markersize=14)

        xy = X[class_member_mask & ~core_samples_mask]
        plt.plot(xy[:, 0], xy[:, 1], 'o', markerfacecolor=col,
                 markeredgecolor='k', markersize=6)

    plt.title('Estimated number of clusters: %d' % n_clusters_)

    imgname = "./clusterimage/hourcondimention/" +cityname+'.png'
    fig = plt.gcf()
    fig.set_size_inches(16.5, 12.5)
    fig.savefig(imgname)

    ScandARI = drawlableCLuster(comparearray,labels,cityname.split('_')[3])
    print(ScandARI)

    with open('summary_hour_total_dimention.csv','a') as f:
        write = csv.writer(f)
        # write.writerow(['name','clusters','SC'])
        write.writerow([cityname,n_clusters_,metrics.silhouette_score(X, labels, metric='sqeuclidean')])
        write.writerow(["hour_dimention_twitterinfo"+cityname.split('_')[3],ScandARI[0],ScandARI[1],ScandARI[2]])
Example #9
    def eval_perf(self):
        X_tst, y_tst = self.data.get_test_set()
        code = self.dbn.f_code(X_tst)

        from sklearn import metrics
        sil_c = metrics.silhouette_score(code, y_tst)
        sil_X = metrics.silhouette_score(X_tst, y_tst)
        
        print 'Silhouette code y', sil_c
        print 'Silhouette X y', sil_X
Example #10
def cluster_kmeans1():
    dataset = datasets.load_iris()
    X = dataset.data
    kmeans_model = KMeans(n_clusters=4,random_state=1).fit(X)

    labels = kmeans_model.labels_
    print X
    print kmeans_model.cluster_centers_
    print labels
    print metrics.silhouette_score(X,labels,metric="euclidean")
def kmeans(input_file, n_clusters, Output):
    lvltrace.lvltrace("LVLEntree dans kmeans unsupervised")
    ncol=tools.file_col_coma(input_file)
    data = np.loadtxt(input_file, delimiter=',', usecols=range(ncol-1))
    X = data[:,1:]
    y = data[:,0]
    sample_size, n_features = X.shape
    k_means=cluster.KMeans(init='k-means++', n_clusters=n_clusters, n_init=10)
    k_means.fit(X)
    reduced_data = k_means.transform(X)
    values = k_means.cluster_centers_.squeeze()
    labels = k_means.labels_
    k_means_cluster_centers = k_means.cluster_centers_
    print "#########################################################################################################\n"
    #print y
    #print labels
    print "K-MEANS\n"
    print('homogeneity_score: %f'%metrics.homogeneity_score(y, labels))
    print('completeness_score: %f'%metrics.completeness_score(y, labels))
    print('v_measure_score: %f'%metrics.v_measure_score(y, labels))
    print('adjusted_rand_score: %f'%metrics.adjusted_rand_score(y, labels))
    print('adjusted_mutual_info_score: %f'%metrics.adjusted_mutual_info_score(y,  labels))
    print('silhouette_score: %f'%metrics.silhouette_score(X, labels, metric='euclidean', sample_size=sample_size))
    print('\n')
    print "#########################################################################################################\n"
    results = Output+"kmeans_scores.txt"
    file = open(results, "w")
    file.write("K-Means Scores\n")
    file.write("Homogeneity Score: %f\n"%metrics.homogeneity_score(y, labels))
    file.write("Completeness Score: %f\n"%metrics.completeness_score(y, labels))
    file.write("V-Measure: %f\n"%metrics.v_measure_score(y, labels))
    file.write("The adjusted Rand index: %f\n"%metrics.adjusted_rand_score(y, labels))
    file.write("Adjusted Mutual Information: %f\n"%metrics.adjusted_mutual_info_score(y,  labels))
    file.write("Silhouette Score: %f\n"%metrics.silhouette_score(X, labels, metric='euclidean', sample_size=sample_size))
    file.write("\n")
    file.write("True Value, Cluster numbers, Iteration\n")
    for n in xrange(len(y)):
        file.write("%f, %f, %i\n"%(y[n],labels[n],(n+1)))
    file.close()
    import pylab as pl
    from itertools import cycle
    # plot the results along with the labels
    k_means_cluster_centers = k_means.cluster_centers_
    fig, ax = plt.subplots()
    im=ax.scatter(X[:, 0], X[:, 1], c=labels, marker='.')
    for k in xrange(n_clusters):
        my_members = labels == k
        cluster_center = k_means_cluster_centers[k]
        ax.plot(cluster_center[0], cluster_center[1], 'w', color='b',
                marker='x', markersize=6)
    fig.colorbar(im)
    plt.title("Number of clusters: %i"%n_clusters)
    save = Output + "kmeans.png"
    plt.savefig(save)
    lvltrace.lvltrace("LVLsortie dans kmeans unsupervised")
Example #12
def acc_silhouette(X, lbls_true, lbls_pred, reject, strat_lbl_inds, use_strat=False, metric='euclidean'):
    if use_strat:
        dists = sc.distances(X[:, strat_lbl_inds], gene_ids=np.arange(strat_lbl_inds.size), metric=metric )
        sil = metrics.silhouette_score(dists, lbls_pred[strat_lbl_inds], metric='precomputed')
        perc = np.int(np.float(len(strat_lbl_inds))/np.float(lbls_true.size) * 100.0)
        desc = ('Silhouette (strat={0},{1})'.format(perc, metric), 'Silhouette ({0})'.format(metric))
    else:
        dists = sc.distances(X, gene_ids=np.arange(X.shape[1]), metric=metric )
        sil = metrics.silhouette_score(dists, lbls_pred, metric='precomputed')
        desc = ('Silhouette ({0})'.format(metric), 'Silhouette ({0})'.format(metric))
    return sil, desc
Example #13
 def fit(self, X, Y=None):
     proj = skl_cluster.KMeans(**self.params)
     if isinstance(X, Table):
         proj = proj.fit(X.X, Y)
         proj.silhouette = silhouette_score(X.X, proj.labels_)
     else:
         proj = proj.fit(X, Y)
         proj.silhouette = silhouette_score(X, proj.labels_)
     proj.inertia = proj.inertia_ / len(X)
     cluster_dist = Euclidean(proj.cluster_centers_)
     proj.inter_cluster = np.mean(cluster_dist[np.triu_indices_from(cluster_dist, 1)])
     return KMeansModel(proj, self.preprocessors)
Example #14
def bench_k_means(estimator, name, data, silhouette_results):
    t0 = time()
    estimator.fit(data)
    print('init\t\ttime\tinertia\thomo\tcompl\tv-meas\tARI\tAMI\tsilhouette')
    print('% 9s\t %.2fs\t%i\t%.3f\t%.3f\t%.3f\t%.3f\t%.3f\t%.3f' % \
        (name, (time() - t0), \
        estimator.inertia_, \
        metrics.homogeneity_score(labels, estimator.labels_), \
        metrics.completeness_score(labels, estimator.labels_), \
        metrics.v_measure_score(labels, estimator.labels_), \
        metrics.adjusted_rand_score(labels, estimator.labels_), \
        metrics.adjusted_mutual_info_score(labels,  estimator.labels_), \
        metrics.silhouette_score(data, estimator.labels_, metric='euclidean')))
    return str(metrics.silhouette_score(data,estimator.labels_, metric='euclidean')) 
Example #15
def cluster_driver(a_driver):
    
#    print a_driver['DStats']
#    print "#############################DStats Above#################################ValueError: zero-size array to reduction operation minimum which has no identity#################"

    X = StandardScaler().fit_transform(a_driver['DStats'])
#    print X
#    print "DStats are.....::" , a_driver['DStats']
#    print "X is...........::" ,['AvgDistDel', 'AvgACosDel', 'SDevDistDel', 'SDevACosDel','TotalTime','SkewDistDel','SkewACosDel'] X
#    print "############################Scaled X Above###################################################"
    
#    db = KMeans(n_clusters=20,n_jobs = -1).fit(X)
    db = DBSCAN(eps=0.45).fit(X)
#    core_samples_mask = np.zeros_like(db.labels_, dtype=bool)
#    core_samples_mask[db.core_sample_indices_] = True
    labels = db.labels_
    
#    n_clusters_ = len(set(labels)) - (1 if -1 in labels else 0)

    print "###############################################################################"
#    print('Estimated number of clusters: %d' % n_clusters_)
#    print 'Count of Predicts::', len(X)
    print("Silhouette Coefficient: %0.3f"    % metrics.silhouette_score(X, labels,metric="mahalanobis"))
#    print "##############################DBSCAN  X Below#################################################"
#    print X    G:/Continuing Education/Research & Presentations/Self - Machine Learning/Kaggle/DriverTelemetricAnalysis-AXA/'
#    try:
    return (metrics.silhouette_samples(X, labels,metric="mahalanobis")+1)/2
Example #16
	def save_cluster_metrics(self, points, predictions, key, level2_mode = False):

		try:
			silhoutte_global = metrics.silhouette_score(points, predictions, metric='euclidean')
			silhoutte_weighted = utils.silhoutte_weighted(points, predictions)

			self.silhouette_scores_global[key] = silhoutte_global
			self.silhouette_scores_weighted[key] = silhoutte_weighted

			if level2_mode:
				self.level2_silhoutte_global.append(silhoutte_global)
				self.level2_silhoutte_weighted.append(silhoutte_weighted)

		except ValueError as e:
			pass

		# dunn_scores = cluster_evaluation.dunn_index(points, predictions, means)

		dunn_scores = [0, 0, 0]

		if (dunn_scores[0] is not None) and (dunn_scores[1] is not None) and (dunn_scores[2] is not None):

			self.dunn_scores_1[key] = dunn_scores[0]
			self.dunn_scores_2[key] = dunn_scores[1]
			self.dunn_scores_3[key] = dunn_scores[2]

			if level2_mode:
				self.level2_dunn_1.append(dunn_scores[0])
				self.level2_dunn_2.append(dunn_scores[1])
				self.level2_dunn_3.append(dunn_scores[2])
def find_best(df, cls, norm):
    '''
    INPUTS: Pandas DataFrame, String of which dataset is being used,
            Boolean if data is normalized
    OUTPUTS: Prints score to screen. Saves model to RESULTS_DIR
    '''
    if norm == True:
        df = StandardScaler(copy=False).fit_transform(df)

    files = [f for f in os.listdir(RESULTS_DIR)
             if f.endswith('{}_{}.pkl'.format(cls, norm)) and
             f.startswith('k')]
    scores = []
    for f in files:
        model = pickle.load(open(RESULTS_DIR + f, 'rb'))
        labels = model.predict(df)
        score = silhouette_score(df.values, labels, sample_size=10000)

        name = f.split('_')[1]
        scores.append((score, name))
        print "{} {} {} {}".format(f.split('_')[1], float(score), cls, norm)

        del labels
        del model

    ranked_scores = sorted(scores, reverse=True)
    ranked_scores = [(item[1], item[0]) for item in ranked_scores]

    with open('{}_{}.pkl'.format(cls, norm), 'wb') as f:
        pickle.dump(ranked_scores, f)

    for item in ranked_scores: print item
Example #18
 def Create_Ext_Agg_cluster(self,stem,stop,processing,remS): 
      
     Allrow_dicts=data_pkg.FileHandling.read_csv(self.ExtStringCSv)
     Allstrings=list()
     #Allstrings=[rowdict_str["Text_original"] for rowdict_str in Allrow_dicts]
     for row_dict in Allrow_dicts:
         if self.POS =="ALL_EXT":
             Stringrow=row_dict["Text_original"]+row_dict["Adj_Extended"]+row_dict["Noun_Extended"] +row_dict["Verb_Extended"]
             Allstrings.append(Stringrow)
         else:
             Stringrow=row_dict["Adj"]+row_dict["Adj_Extended"]+row_dict["Noun"]+row_dict["Noun_Extended"]#+row_dict["Verb"]#+row_dict["Verb_Extended"]
             Allstrings.append(Stringrow)
              
     Allstrings_process=[preprocess_text.preprocess(string_text, stem,stop) for string_text in Allstrings]  
      
     if remS:
         Allstrings_process=[preprocess_text.removeS(text) for text in Allstrings_process]            
     vectorizer = CountVectorizer()    
     term_doc=vectorizer.fit_transform(Allstrings_process)
     #-------------------------- feature_names=vectorizer.get_feature_names()
     #--z---------------------------------------------- Array=term_doc.toarray
      
     if self.affinity=='euclidean':
         Agg_cluster=AgglomerativeClustering(n_clusters=self.num_cluster,affinity='euclidean')
     if self.affinity=='cosine':
         Agg_cluster=AgglomerativeClustering(n_clusters=self.num_cluster,linkage='average',affinity='cosine')
     Res_Labels=Agg_cluster.fit_predict(term_doc.toarray())
     self.cluster_tup_list=self.tuple_Ext_cluster_doc(Res_Labels,Allstrings,Allrow_dicts)
     #term_doc_lsa = lsa.fit_transform(term_doc)
     print(type(term_doc))
     self.metric=metrics.silhouette_score(term_doc.toarray(), Res_Labels, metric=self.affinity)
     print(Res_Labels)
     print("n_samples: %d, n_features: %d" % term_doc.shape) 
def ap(data):
    X = data
    af = AffinityPropagation(
        damping=0.8,
        max_iter=200,
        convergence_iter=15,
        preference=None,
        affinity='euclidean',
        verbose=True).fit(X)
    cluster_centers_indices = af.cluster_centers_indices_
    labels = af.labels_

    n_clusters_ = len(cluster_centers_indices)

    print('Estimated number of clusters: %d' % n_clusters_)
    # print(
    #     "Homogeneity: %0.3f" % metrics.homogeneity_score(labels_true, labels))
    # print("Completeness: %0.3f" % metrics.completeness_score(
    #     labels_true, labels))
    # print("V-measure: %0.3f" % metrics.v_measure_score(labels_true, labels))
    # print("Adjusted Rand Index: %0.3f" % metrics.adjusted_rand_score(
    #     labels_true, labels))
    # print("Adjusted Mutual Information: %0.3f" %
    #       metrics.adjusted_mutual_info_score(labels_true, labels))
    print("Silhouette Coefficient: %0.3f" % metrics.silhouette_score(
        X, labels, metric='sqeuclidean'))
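A hedged usage sketch for ap() above; the synthetic blobs are illustrative, and the function's module is assumed to already import AffinityPropagation and sklearn's metrics:

from sklearn.datasets import make_blobs

X, _ = make_blobs(n_samples=150, centers=3, cluster_std=0.6, random_state=0)
ap(X)  # prints the estimated number of clusters and the silhouette coefficient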
    def __initialize_clusters(self):
        # Load the data
        self.companies, self.descriptions, self.company_idx_map = load_data(self.data_dir, has_header=True)

        # Vectorize the data using TF-IDF to help reduce dimensionality
        self.vectorizer = TfidfVectorizer(max_df=0.5, stop_words='english', use_idf=True, ngram_range=(1, 2)) #min_df=2
        X = self.vectorizer.fit_transform(self.descriptions)
        self.instance_vector_array = X.toarray()

        print("n_samples: %d, n_features: %d" % X.shape)

        # Initialize K-means algorithm preferences
        print("Initializing clusters, this takes a few seconds ...")
        self.km = KMeans(n_clusters=self.k, init='k-means++', max_iter=100, n_init=1, verbose=False)

        # Use k-means to generate clusters
        self.km.fit(X)

        # initialize results dictionary
        labels = self.km.labels_
        for i in range(0, self.k, 1):
            self.results[i] = []

        # assign results by label
        for i in range(0, len(labels), 1):
            self.results[labels[i]].append(self.companies[i])
            self.company_to_cluster[self.companies[i].name] = labels[i]

        print("Silhouette Coefficient: %0.3f"
              % metrics.silhouette_score(X, self.km.labels_, sample_size=1000))

        # Write cluster results to Output file
        output_results(self.km, self.k, self.vectorizer, self.results)
        pass
def cluster_evaluation(D, y_true, n_clusters, eps=0.8, min_samples=10):
    ##############################################################################
    # Extract Y true
    labels_true = y_true

    ##############################################################################
    # transform distance matrix into a similarity matrix
    S = 1 - D 

    ##############################################################################
    # compute DBSCAN
    #db = DBSCAN(eps=eps, min_samples=min_samples).fit(S)
    db = Ward(n_clusters=n_clusters).fit(S)
    #core_samples = db.core_sample_indices_
    labels = db.labels_

    # number of clusters in labels, ignoring noise if present
    n_clusters_ = len(set(labels)) - (1 if -1 in labels else 0)

    print 'Number of clusters: %d' % n_clusters_
    print 'Homogeneity: %0.3f' % metrics.homogeneity_score(labels_true, labels)
    print 'Completeness: %0.3f' % metrics.completeness_score(labels_true, labels)
    print 'V-measure: %0.3f' % metrics.v_measure_score(labels_true, labels)
    print 'Adjusted Rand Index: %0.3f' % metrics.adjusted_rand_score(labels_true, labels)
    print 'Adjusted Mutual Information: %0.3f' % metrics.adjusted_mutual_info_score(labels_true, labels)
    print 'Silhouette Coefficient: %0.3f' % metrics.silhouette_score(D, labels, metric='precomputed')
Example #22
    def test_KMeans_scores(self):
        digits = datasets.load_digits()
        df = pdml.ModelFrame(digits)

        scaled = pp.scale(digits.data)
        df.data = df.data.pp.scale()
        self.assert_numpy_array_almost_equal(df.data.values, scaled)

        clf1 = cluster.KMeans(init='k-means++', n_clusters=10,
                              n_init=10, random_state=self.random_state)
        clf2 = df.cluster.KMeans(init='k-means++', n_clusters=10,
                                 n_init=10, random_state=self.random_state)
        clf1.fit(scaled)
        df.fit_predict(clf2)

        expected = m.homogeneity_score(digits.target, clf1.labels_)
        self.assertEqual(df.metrics.homogeneity_score(), expected)

        expected = m.completeness_score(digits.target, clf1.labels_)
        self.assertEqual(df.metrics.completeness_score(), expected)

        expected = m.v_measure_score(digits.target, clf1.labels_)
        self.assertEqual(df.metrics.v_measure_score(), expected)

        expected = m.adjusted_rand_score(digits.target, clf1.labels_)
        self.assertEqual(df.metrics.adjusted_rand_score(), expected)

        expected = m.homogeneity_score(digits.target, clf1.labels_)
        self.assertEqual(df.metrics.homogeneity_score(), expected)

        expected = m.silhouette_score(scaled, clf1.labels_, metric='euclidean',
                                      sample_size=300, random_state=self.random_state)
        result = df.metrics.silhouette_score(metric='euclidean', sample_size=300,
                                             random_state=self.random_state)
        self.assertAlmostEqual(result, expected)
Example #23
def optimal_cutoff(Y,dist_mat,min_size):
    labels = np.array([sch.fcluster(Y,c,criterion='distance') for c in Y[:,2]])
    score = np.array([metrics.silhouette_score(dist_mat,l) for l in labels[:-min_size]])
    c = Y[:-min_size,2]
    f = interp(c,-score,kind='linear')
    opt_c = opt.fmin(f,x0=c[2*min_size])
    return opt_c
def getBestGMM(X, n_components_range, cv_types):
    
    '''
    Function that finds the best GMM cluster trying different gaussians and 
    different number of clusters    
    '''
    
    lowest_bic = np.infty
    bic = []
    silhouette = []
    for cv_type in cv_types:
        for n_components in n_components_range:
            # Fit a mixture of Gaussians with EM
            gmm = mixture.GMM(n_components=n_components, covariance_type=cv_type)
            gmm.fit(X)
            bic.append(gmm.bic(X))
            Y_predicted=gmm.predict(X)
            if cv_type =='tied':
                silhouette.append(metrics.silhouette_score(X, Y_predicted,  metric='euclidean'))
                # I only save the values for tied, because I know from the first run that it's the best covariance type
            if n_components>=1:
                if bic[-1] < lowest_bic:
                    lowest_bic = bic[-1]
                    best_gmm = gmm
                    
    bic = np.array(bic)
    color_iter = itertools.cycle(['k', 'r', 'g', 'b', 'c', 'm','y'])
    return best_gmm,color_iter,bic,silhouette
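A hedged usage sketch for getBestGMM above. The function targets the legacy sklearn.mixture.GMM API (removed in scikit-learn 0.20), so the call below assumes an old install; the iris data, component range and covariance types are purely illustrative:

import itertools
import numpy as np
from sklearn import datasets, metrics, mixture

X = datasets.load_iris().data
best_gmm, color_iter, bic, silhouette = getBestGMM(X, range(2, 7), ['spherical', 'tied', 'diag', 'full'])
print(best_gmm, silhouette)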
Example #25
def kmean_clusters(min_list, d, show, thresh):
    np.random.seed(0)

    k_rng = range(1, len(min_list))
    est = [KMeans(n_clusters=k).fit(d) for k in k_rng]

    silhouette_score = [
        metrics.silhouette_score(d, e.labels_, metric='euclidean') for e in est[1:]]
    within_sum_squares = [e.inertia_ for e in est]

    diff_sq = [sq / within_sum_squares[0] for sq in within_sum_squares]

    diff_sq_pd = pd.Series(diff_sq)

    k_list = list(k_rng)
    select_k = k_list[len(k_list) - 1]
    thresh_pd = diff_sq_pd[diff_sq_pd < thresh]
    if thresh_pd.shape[0] > 0:
        select_k = k_list[thresh_pd.index[0]]

    if show:
        TLineDrawer.plot_elow_k_choice(k_rng, silhouette_score, within_sum_squares, select_k)

    select_est = est[select_k - 1]
    y_kmean = select_est.predict(d)
    return y_kmean, select_k
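A hedged usage sketch for kmean_clusters above. The synthetic blobs are illustrative, show=False skips the project-specific TLineDrawer plot, thresh=0.3 is just an illustrative inertia-ratio cutoff, and the function's module is assumed to already import numpy as np, pandas as pd, KMeans and sklearn's metrics:

from sklearn.datasets import make_blobs

d, _ = make_blobs(n_samples=200, centers=3, random_state=7)
min_list = list(range(8))  # len(min_list) - 1 is the largest K that gets tried
y_kmean, select_k = kmean_clusters(min_list, d, show=False, thresh=0.3)
print(select_k, y_kmean[:10])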
Example #26
File: kmeans.py Project: awm182/494
def findClusterSize(data):  # also returns the value of the Silhouette Coefficient (I trust it more than the elbow method)
	K = range(2,10)
	meandistortions = []
	silCoeffs = []

	for k in K:
		kmeans = KMeans(n_clusters=k)
		kmeans.fit(data)
		meandistortions.append(sum(np.min(cdist(data,kmeans.cluster_centers_,'euclidean'),axis=1))/data.shape[0])
		silCoeffs.append(metrics.silhouette_score(data,labels=kmeans.labels_,metric='euclidean'))

	print "\n\nMean Distortions"
	print "------------------"
	for i in K:
		print 'K:',i,'\t',meandistortions[i-2],'\t Silhouette Coeff: ',silCoeffs[i-2]

	kDistTuple = []
	print "\nDistortion Decline"
	print "-----------------------"
	for i in range(1,len(meandistortions)):
		difference = meandistortions[i-1]-meandistortions[i]
		kDistTuple.append([difference,i+1])
	for i in range(len(kDistTuple)-1):
		print 'K: %2d -> %2d %10.5f'%(kDistTuple[i][1],kDistTuple[i+1][1],kDistTuple[i][0])

	kDistTuple.sort()
	print "\n\nElbow Method Suggestion for Clusters: ",kDistTuple[0][1]

	kSilCoeff = zip(silCoeffs,K)
	kSilCoeff.sort()
	print "Best Silhouette Coeff and cluster size: ",kSilCoeff[-1]
	return kSilCoeff[-1]
def compute_silhouette_score(X, tree, metric_measure):
	'''
	n : sample size |X|
	number of clusters, k = 2..n
	for each value of k:
	      P_k: partition of X into k clusters (obtained by cutting the ward tree)
	      compute the silhouette score for P_k

	input:  X : data
		tree: ward tree
		metric_measure ('euclidean', ...)
	output: float array 1D of size n-1
		value of the silhouette score of each partition P_k
	'''
	n = len(X)
	score = np.zeros(n - 1)
	print 'Length : ', n

	for i in range(n - 1):
		# cannot calculate the silhouette score for only one cluster,
		# so start from 2 clusters
		k = i + 2
		print '\n Cutting at k = ', k
		label = _hc_cut(k, tree.children_, tree.n_leaves_)
		print '\n Compute score ...'
		s = metrics.silhouette_score(X, label, metric=metric_measure)
		#s = silhouette_score_block(X, label, metric=metric_measure, sample_size=None)
		score[k - 2] = s
	return score
def fit_dbscan(data, eps, min_samples, normalize=True,
               show=True, juxta_cluster_indices_grouped=None, threshold_legend=None):
    X = np.transpose(data)

    if normalize:
        from sklearn.preprocessing import minmax_scale
        minmax_scale(X, feature_range=(-1, 1), axis=0, copy=False)

    from sklearn.cluster import DBSCAN
    from sklearn import metrics
    db = DBSCAN(eps=eps, min_samples=min_samples).fit(X)
    core_samples_mask = np.zeros_like(db.labels_, dtype=bool)
    core_samples_mask[db.core_sample_indices_] = True
    labels = db.labels_

    # Number of clusters in labels, ignoring noise if present.
    n_clusters_ = len(set(labels)) - (1 if -1 in labels else 0)
    score = metrics.silhouette_score(X, labels, sample_size=5000)
    print('For eps={}, min_samples={}, estimated number of clusters={}'.format(eps, min_samples, n_clusters_))
    print("Silhouette Coefficient: {}".format(score))

    if show:
        pf.show_clustered_tsne(db, X, juxta_cluster_indices_grouped, threshold_legend)

    return db, n_clusters_, labels, core_samples_mask, score
def calculateNumberOfIdealClusters(maxAmount, corpus):
	print "Initializing silhouette analysis"
	range_n_clusters = range(2, maxAmount) # max amount of clusters equal to amount of jobs

	silhouette_high = 0;
	silhouette_high_n_clusters = 2;

	for n_clusters in range_n_clusters:
		# Initialize the clusterer with n_clusters value
		cluster = AgglomerativeClustering(n_clusters=n_clusters, linkage="ward", affinity="euclidean")
		cluster_labels = cluster.fit_predict(corpus)

		# The silhouette_score gives the average value for all the samples.
		# This gives a perspective into the density and separation of the formed clusters
		silhouette_avg = silhouette_score(corpus, cluster_labels)

		print "For n_clusters = %d, the average silhouette_score is: %.5f" % (n_clusters, silhouette_avg)

		if (silhouette_avg > silhouette_high):
		    silhouette_high = silhouette_avg
		    silhouette_high_n_clusters = n_clusters

		# Compute the silhouette scores for each sample
		sample_silhouette_values = silhouette_samples(corpus, cluster_labels)

	print ("Highest score = %f for n_clusters = %d" % (silhouette_high, silhouette_high_n_clusters))
	return silhouette_high_n_clusters
Example #30
def _cluster(params):
    cls = None
    method = sh.getConst('method')
    if method=='kmedoid':
        assert False
        # from kmedoid import kmedsoid
        # cls = kmedoid
    elif method=='dbscan':
        from sklearn.cluster import DBSCAN
        cls = DBSCAN(eps=params['eps'],min_samples=params['min_samples'],
                     metric='precomputed')
    else:
        assert False, 'FATAL: unknown cluster method'

    ##
    mat = sh.getConst('mat')
    labels = cls.fit_predict(mat)
    nLabels = len(set(labels))

    ##
    sil = None; cal = None
    if (nLabels >= 2)and(nLabels <= len(labels)-1):
        sil = met.silhouette_score(mat,labels,'precomputed')
        cal = met.calinski_harabaz_score(mat,labels)
    perf = dict(silhouette_score=sil,calinski_harabaz_score=cal)

    return (labels,perf)
pca = PCA(n_components=0.95)
X_pca = pca.fit_transform(cc_clean)

# KMeans for 2 to 9 clusters
KS = range(2, 10)

# storage
inertia = []
silo = []

for k in KS:
  km = KMeans(k)
  km.fit(X_pca)
  labs = km.predict(X_pca)
  inertia.append(km.inertia_)
  silo.append(silhouette_score(X_pca, labs))

print(silo)

# plot 
plt.figure(figsize=(15,5))


plt.subplot(1, 2, 1)
plt.title("Inertia")
sns.lineplot(KS, inertia)

plt.subplot(1, 2, 2)
plt.title("Silohouette Score")
sns.lineplot(KS, silo)
for i in range(49):
    cluster_num = i + 2
    print('the cluster num is: %d' % cluster_num)

    # Set the K-Means input vector here #
    Kmeans = KMeans(n_clusters=cluster_num,
                    random_state=None).fit(costKmeans_norm)

    labels = np.array(Kmeans.labels_)

    BC, WC = My_harabaz_score(costKmeans_norm, Kmeans.labels_, cluster_num)
    temp_score = metrics.calinski_harabaz_score(costKmeans_norm,
                                                Kmeans.labels_)
    print('the harabaz score is: %f' % temp_score)
    harabaz_score.append(temp_score)
    temp_score1 = metrics.silhouette_score(costKmeans_norm,
                                           Kmeans.labels_,
                                           metric='euclidean')
    sil_coe.append(temp_score1)
    print('the sil_coe is: %f' % temp_score1)
    BC_score.append(BC)
    WC_score.append(WC)
    print(BC)
    print(WC)
    print(labels)
    labelsforKmeans.append(labels)

sil_coe_diff = np.diff(sil_coe)
harabaz_score_diff = np.diff(harabaz_score)
Example #33
            centers = pca.transform(clusterer.means_)
            figname = create_path("fig",
                                  sys.argv[1],
                                  "GMM",
                                  sys.argv[2],
                                  filename=("%d_%s_gmm_vis.png" %
                                            (n_clusters, covariance_type)))
            visualize_cluster(X_vis, cluster_labels, n_clusters, centers,
                              figname)

            ari = metrics.adjusted_rand_score(y, cluster_labels)
            ami = metrics.adjusted_mutual_info_score(y, cluster_labels)
            nmi = metrics.normalized_mutual_info_score(y, cluster_labels)
            fms = metrics.fowlkes_mallows_score(y, cluster_labels)
            sil = metrics.silhouette_score(X,
                                           cluster_labels,
                                           metric='euclidean')
            chi = metrics.calinski_harabaz_score(X, cluster_labels)
            dbi = metrics.davies_bouldin_score(X, cluster_labels)

            print("Adjusted Rand index: %.6f" % ari)
            print("Adjusted Mutual Information: %.6f" % ami)
            print("Normalized Mutual Information: %.6f" % nmi)
            print("Fowlkes-Mallows score: %.6f" % fms)
            print("Silhouette Coefficient: %.6f" % sil)
            print("Calinski-Harabaz Index: %.6f" % chi)
            print("Davies-Bouldin Index: %.6f" % dbi)

            ari_score.append(ari)
            ami_score.append(ami)
            nmi_score.append(nmi)
core_samples = db.core_sample_indices_
labels = db.labels_

# Number of clusters in labels, ignoring noise if present.
n_clusters_ = len(set(labels)) - (1 if -1 in labels else 0)

print('Estimated number of clusters: %d' % n_clusters_)
print("Homogeneity: %0.3f" % metrics.homogeneity_score(labels_true, labels))
print("Completeness: %0.3f" % metrics.completeness_score(labels_true, labels))
print("V-measure: %0.3f" % metrics.v_measure_score(labels_true, labels))
print("Adjusted Rand Index: %0.3f"
      % metrics.adjusted_rand_score(labels_true, labels))
print("Adjusted Mutual Information: %0.3f"
      % metrics.adjusted_mutual_info_score(labels_true, labels))
print("Silhouette Coefficient: %0.3f"
      % metrics.silhouette_score(X, labels))

##############################################################################
# Plot result
import pylab as pl

# Black removed and is used for noise instead.
unique_labels = set(labels)
colors = pl.cm.Spectral(np.linspace(0, 1, len(unique_labels)))
for k, col in zip(unique_labels, colors):
    if k == -1:
        # Black used for noise.
        col = 'k'
        markersize = 6
    class_members = [index[0] for index in np.argwhere(labels == k)]
    cluster_core_samples = [index for index in core_samples
threedee.set_ylabel('y')
threedee.set_zlabel('z')
fig_2 = threedee.get_figure()
fig_2.suptitle('3D Clustered Data', fontsize=16)
fig_2.savefig('Q6_1.png')
plt.show()

#given that we know/can know the averages of these can also work an error from this
#and see if this is a better indicator

#compare the created samples mean and the predicted means 

means_3 = np.array(clf_3.means_)
covariance_3 = clf_3.covariances_

print("The silhouette score is "+ str(silhouette_score(gen_df, labels_3, metric = 'euclidean')))

print("examining the means of the generated GMMs and the predicted ones")
print("counter mean")
print(counter_mean)
print("predicted mean")
print(means_3)
mean_difference = means_3 - counter_mean
print("The differences between the means in this model")
print(mean_difference)
print("This is because the large values dominate the mean")
#Large values are over represented in the mean

#The problem here is that there is far too much overlap in these distributions
#so while the model can fit it the means it provides mean nothing
#want to test that the model would work if the distribution were spread out
Example #36
    elapsed = timeit.default_timer() - start_time
    print('Execution time: {0:.4f} sec'.format(elapsed))

    x, y = zip(*sorted(
        sil_coef.items()))  # unpack a list of pairs into two tuples

    plt.plot(x, y)
    plt.xlabel('Number of Clusters')
    plt.ylabel('Silhouette Score')

    plt.show()

    return sil_coef


sil_coef = cluster(df_N, 10)

#%%

print('Estimated number of clusters: %d' % n_clusters_)
print("Homogeneity: %0.3f" % metrics.homogeneity_score(phi_true, phi_predict))
print("Completeness: %0.3f" %
      metrics.completeness_score(phi_true, phi_predict))
print("V-measure: %0.3f" % metrics.v_measure_score(phi_true, phi_predict))
print("Adjusted Rand Index: %0.3f" %
      metrics.adjusted_rand_score(phi_true, phi_predict))
print("Adjusted Mutual Information: %0.3f" %
      metrics.adjusted_mutual_info_score(phi_true, phi_predict))
print("Silhouette Coefficient: %0.3f" %
      metrics.silhouette_score(dfHeurN, phi_predict, metric='sqeuclidean'))
Example #37
def silhouette_coefficient(dataSet):

    # List of number of clusters
    range_n_clusters = [2, 3, 4, 5, 6]
    X = dataSet
    #  pca = decomposition.PCA(n_components=2)
    # pca.fit(X)
    # X = pca.transform(X)
    # For each number of clusters, perform Silhouette analysis and visualize the results.
    for n_clusters in range_n_clusters:

        # Perform k-means.

        kmeans = KMeans(n_clusters=n_clusters, random_state=10)
        y_pred = kmeans.fit_predict(X)
        # Compute the cluster homogeneity and completeness.
        homogeneity = metrics.homogeneity_score(y_pred, y_pred)
        completeness = metrics.completeness_score(y_pred, y_pred)

        # Compute the Silhouette Coefficient for each sample.
        s = metrics.silhouette_samples(X, y_pred)

        # Compute the mean Silhouette Coefficient of all data points.
        s_mean = metrics.silhouette_score(X, y_pred)

        # For plot configuration -----------------------------------------------------------------------------------
        fig, (ax1, ax2) = plt.subplots(1, 2)
        fig.set_size_inches(18, 7)

        # Configure plot.
        plt.suptitle(
            'Silhouette analysis for K-Means clustering with n_clusters: {}'.
            format(n_clusters),
            fontsize=14,
            fontweight='bold')

        # Configure 1st subplot.
        ax1.set_title('Silhouette Coefficient for each sample')
        ax1.set_xlabel("The silhouette coefficient values")
        ax1.set_ylabel("Cluster label")
        ax1.set_xlim([-1, 1])
        ax1.set_ylim([0, len(X) + (n_clusters + 1) * 10])

        # Configure 2nd subplot.
        ax2.set_title(
            'Homogeneity: {}, Completeness: {}, Mean Silhouette score: {}'.
            format(homogeneity, completeness, s_mean))
        ax2.set_xlabel("Feature space for the 1st feature")
        ax2.set_ylabel("Feature space for the 2nd feature")

        # For 1st subplot ------------------------------------------------------------------------------------------

        # Plot Silhouette Coefficient for each sample
        y_lower = 10
        for i in range(n_clusters):
            ith_s = s[y_pred == i]
            ith_s.sort()
            size_cluster_i = ith_s.shape[0]
            y_upper = y_lower + size_cluster_i
            color = cm.nipy_spectral(float(i) / n_clusters)
            ax1.fill_betweenx(np.arange(y_lower, y_upper),
                              0,
                              ith_s,
                              facecolor=color,
                              edgecolor=color,
                              alpha=0.7)
            ax1.text(-0.05, y_lower + 0.5 * size_cluster_i, str(i))
            y_lower = y_upper + 10

        # Plot the mean Silhouette Coefficient using red vertical dash line.
        ax1.axvline(x=s_mean, color="red", linestyle="--")

        # For 2nd subplot -------------------------------------------------------------------------------------------
        #pca = decomposition.PCA(n_components=2)
        #pca.fit(X)
        #plot_X = pca.transform(X)
        # Plot the predictions
        colors = cm.nipy_spectral(y_pred.astype(float) / n_clusters)
        ax2.scatter(X[:, 0], X[:, 1], c=colors)
    return X
Example #38
centers = kmeans.cluster_centers_
score = kmeans.score(df_scaled)

# Compute Clustering Metrics
n_clusters_ = len(centers)

print('Number of clusters: %d' % n_clusters_)
#print("Homogeneity: %0.3f" % metrics.homogeneity_score(phi_true, phi_predict))
#print("Completeness: %0.3f" % metrics.completeness_score(phi_true, phi_predict))
#print("V-measure: %0.3f" % metrics.v_measure_score(phi_true, phi_predict))
#print("Adjusted Rand Index: %0.3f"
#      % metrics.adjusted_rand_score(phi_true, phi_predict))
#print("Adjusted Mutual Information: %0.3f"
#      % metrics.adjusted_mutual_info_score(phi_true, phi_predict))
print("Silhouette Coefficient: %0.3f" %
      metrics.silhouette_score(df_scaled, phi_predict, metric='sqeuclidean'))

# timeit statement
elapsed = timeit.default_timer() - start_time

#%%

import math

df = df_metrics
plt.scatter(df.AvgStart, df.AvgEnergy)


#%%
def round10(x):
    return int(math.ceil(x / 5.0)) * 5
Example #39
def main():
    dataset = pd.read_csv('dataset.csv')
    positive = dataset.loc[dataset['Lab Status'] == 'Positive ID']
    latitude = get_column_value(positive, 'Latitude').tolist()
    longitude = get_column_value(positive, 'Longitude').tolist()
    date = get_column_value(positive, 'Detection Date').tolist()

    date = pd.to_datetime(date)
    interval = (date - date[0]).days
    interval = interval - np.min(interval)

    data = []
    for i, la in enumerate(latitude):
        data.append([latitude[i], longitude[i], interval[i]])
    data = np.array(data)
    data = data[np.argsort(data[:, 2])]
    data_scale = preprocessing.scale(data)

    SSE = []
    for k in range(2, 9):
        kmeans = KMeans(n_clusters=k, random_state=0).fit(data_scale)
        SSE.append(kmeans.inertia_)
    X = range(2, 9)
    plt.xlabel('Number of Clusters(k)')
    plt.ylabel('SSE')
    plt.title('SSE vs k')
    plt.plot(X, SSE, 'o-')
    plt.show()

    Scores = []
    for k in range(2, 9):
        kmeans = KMeans(n_clusters=k, random_state=0).fit(data)
        Scores.append(
            silhouette_score(data, kmeans.labels_, metric='euclidean'))
    X = range(2, 9)
    plt.xlabel('Number of Clusters(k)')
    plt.ylabel('Silhouette Coefficient')
    plt.title('Silhouette Coefficient vs k')
    plt.plot(X, Scores, 'o-')
    plt.show()

    cluster_num = 3
    kmeans = KMeans(n_clusters=cluster_num, random_state=0).fit(data_scale)
    label = kmeans.labels_
    centers = []
    label_list = []

    for i in range(cluster_num):
        label_list.append(data[label == i, 0:2].tolist())
        centers.append(np.mean(data[label == i], axis=0).tolist())

    centers = np.array(centers)
    centers_list = np.delete(centers, -1, axis=1).tolist()
    centers = centers[np.argsort(centers[:, 2])]
    print(centers)

    ax1 = plt.axes(projection='3d')
    ax1.scatter3D(data[:, 1],
                  data[:, 0],
                  data[:, 2],
                  c=kmeans.labels_,
                  cmap='rainbow')

    ax1.scatter3D(centers[:, 1],
                  centers[:, 0],
                  centers[:, 2],
                  c='black',
                  s=150,
                  alpha=0.5)
    plt.show()

    x = centers[:, 1].reshape((-1, 1))
    y = centers[:, 0]

    reg = LinearRegression().fit(x, y)
    k = reg.coef_[0]
    b = reg.intercept_
    print("Y = %.5fX + (%.5f)" % (k, b))

    plt.scatter(data[:, 1], data[:, 0], c=label, cmap='rainbow')
    plt.scatter(centers[:, 1], centers[:, 0], c='black', s=150, alpha=0.5)
    data = data[np.argsort(data[:, 1])]
    plt.plot(data[np.argsort(data[:, 1])][:, 1].reshape((-1, 1)),
             reg.predict(data[np.argsort(data[:, 1])][:, 1].reshape((-1, 1))),
             c='b',
             linestyle='--')
    plt.xlabel('Longitude')
    plt.ylabel('Latitude')
    plt.title('Linear Regression of Cluster Centers(k=%d)' % cluster_num)

    plt.grid()
    plt.show()

    cluster_foot_x, cluster_foot_y = get_foot_point(centers[-1, 1],
                                                    centers[-1, 0], k, b)

    print("center-%d distance to line:%.5f" %
          (cluster_num,
           get_distance([centers[-1, 1], centers[-1, 0]],
                        [cluster_foot_x, cluster_foot_y])))
    sum_dis = 0
    for i in range(data.shape[0]):
        foot_x, foot_y = get_foot_point(data[i, 1], data[i, 0], k, b)
        sum_dis += get_distance([data[i, 1], data[i, 0]], [foot_x, foot_y])
    print("sum_dis:%.5f" % sum_dis)

    colors = ['blue', 'green', 'orange', 'pink', 'purple', 'red']
    map = folium.Map(location=[48.9938, -122.702],
                     zoom_start=8,
                     tiles="OpenStreetMap")
    for i in range(len(label_list)):
        point_list = label_list[i]
        for point in range(len(point_list)):
            folium.CircleMarker(radius=2.5,
                                location=label_list[i][point],
                                color=colors[i],
                                fill=True,
                                fill_color=colors[i],
                                fill_opacity=1).add_to(map)

    for i in range(len(centers_list)):
        folium.CircleMarker(radius=6,
                            location=centers_list[i],
                            color=colors[i],
                            fill=True,
                            fill_color=colors[i],
                            fill_opacity=0.3).add_to(map)

    map.save('map_cluster%d.html' % cluster_num)
labels = d.labels_

# Number of clusters in labels, ignoring noise if present.
n_clusters_ = len(set(labels)) - (1 if -1 in labels else 0)
n_noise_ = list(labels).count(-1)

print('Estimated number of clusters: %d' % n_clusters_)
print('Estimated number of noise points: %d' % n_noise_)
print("Homogeneity: %0.3f" % metrics.homogeneity_score(labels_true, labels))
print("Completeness: %0.3f" % metrics.completeness_score(labels_true, labels))
print("V-measure: %0.3f" % metrics.v_measure_score(labels_true, labels))
print("Adjusted Rand Index: %0.3f" %
      metrics.adjusted_rand_score(labels_true, labels))
print("Adjusted Mutual Information: %0.3f" %
      metrics.adjusted_mutual_info_score(labels_true, labels))
print("Silhouette Coefficient: %0.3f" % metrics.silhouette_score(X, labels))
print(db.core_sample_indices_)
print(db.labels_ == d.labels_)
if (db.labels_ == d.labels_).all():
    print('lists the same')
else:
    print('lists differ')
print('self.x from Dbscan')
print(d.x)
# #############################################################################
# Plot result
import matplotlib.pyplot as plt

# Black removed and is used for noise instead.
unique_labels = set(labels)
colors = [
def silhouette_elbow__analysis(X, range_n_clusters, all_cluster_labels, all_centers):
    """
    :param X:  原始样本
    :param range_n_clusters:  K的取值情况, list
    :param all_cluster_labels:  簇标签结果 list
    :param all_centers: 每种K值情况下的簇中心 list
    :return:
    """
    assert len(all_cluster_labels) == len(all_centers) == len(range_n_clusters)
    plt.figure(figsize=(10, 8))
    row_plot = 3  # 子图的行数
    all_dist = []
    for n, n_clusters in enumerate(range_n_clusters):
        # ================= 轮廓分析法 ============================
        cluster_labels = all_cluster_labels[n]
        plt.subplot(row_plot, (len(range_n_clusters) + 1) // row_plot, n + 1)
        plt.xlim([-0.1, 1])  # 设置x轴的范围(轮廓系数)
        plt.ylim([0, len(X) + (n_clusters + 1) * 10])  # 顶端的间隙
        silhouette_avg = silhouette_score(X, cluster_labels)  # 所有样本的轮廓系数均值
        print(" 当 n_clusters = ", n_clusters, "时,轮廓系数为: ", silhouette_avg)
        # 计算每个样本对应的轮廓系数
        sample_silhouette_values = silhouette_samples(X, cluster_labels)
        y_lower = 10
        for i in range(n_clusters):  # 遍历每一个簇
            # 取第i个簇中对应所有样本的轮廓系数,并进行排序
            s_values = sample_silhouette_values[cluster_labels == i]
            s_values.sort()
            size_cluster_i = s_values.shape[0]  # 得到第i个簇的样本数量
            y_upper = y_lower + size_cluster_i  # 图中每个簇在y轴上的宽度
            # 限定y的范围,填充x1和x2所围成的区域
            plt.fill_betweenx(y=np.arange(y_lower, y_upper), x1=0, x2=s_values, alpha=0.7)
            # 在y轴右侧标记每个簇的序号
            plt.text(-0.05, y_lower + 0.5 * size_cluster_i, str(i))
            # 计算下一个条形图y轴的其实值
            y_lower = y_upper + 10  # 10 for the 0 samples
        fm.fontManager.addfont('../data/SimHei.ttf')
        plt.rcParams['font.sans-serif'] = ['SimHei']  # 用来  正常显示中文标签
        plt.title(f"K = {n_clusters} 时的轮廓系数图", fontsize=12)
        plt.xlabel("轮廓系数", fontsize=12)
        plt.ylabel("聚类簇序号", fontsize=12)
        # 以x=silhouette_avg 画一条平行于y轴的线
        plt.axvline(x=silhouette_avg, color="red", linestyle="--")
        plt.yticks([])  # 去掉y轴的刻度
        plt.xticks([-0.1, 0, 0.2, 0.4, 0.6, 0.8, 1])  # 设置x轴的刻度

        # ============  肘部法计算簇内距离和并可视化  =========================
        dist = 0
        centers = all_centers[n]
        for i in range(n_clusters):  # 遍历每一个簇,计算当前簇的簇内距离
            x_data = X[cluster_labels == i]
            tmp = np.sum((x_data - centers[i]) ** 2, axis=1)
            dist += np.sum(np.sqrt(tmp))  # 累计当前聚类结果下所有簇的簇内距离和
        all_dist.append(dist)
    plt.subplot(row_plot, (len(range_n_clusters) + 1) // row_plot, len(range_n_clusters) + 1)
    plt.title("肘部法结果")
    plt.plot(range_n_clusters, all_dist)  # 绘制肘部曲线
    plt.scatter(range_n_clusters, all_dist)  # 绘制各个点
    for i in range(len(range_n_clusters)):  # 在图上进行K值标记
        plt.annotate(f"k = {range_n_clusters[i]}",
                     xy=(range_n_clusters[i], all_dist[i]), fontsize=14,
                     xytext=(range_n_clusters[i] + 0.1, all_dist[i]))
        plt.hlines(all_dist[i], xmin=0, xmax=range_n_clusters[i], color="red", linestyle="--")

    plt.xlim(range_n_clusters[0] - 0.5, range_n_clusters[-1] + 0.8)  # 调整范围
    plt.ylim(all_dist[-1] * 0.9, all_dist[0] + all_dist[-1] * 0.1)
    plt.yticks([])  # 去掉y轴上的刻度显示
    plt.xlabel("K", fontsize=12)
    plt.ylabel("distance", fontsize=12)
    plt.tight_layout()
    plt.show()
Example #42
def getSilhouetteCoeff(data, labels):
    silCoeff = silhouette_score(data, labels, metric='euclidean')
    return silCoeff
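A short usage sketch for the wrapper above; iris and k=3 are illustrative, and silhouette_score is assumed to be imported in the wrapper's own module:

from sklearn import datasets
from sklearn.cluster import KMeans

X = datasets.load_iris().data
labels = KMeans(n_clusters=3, random_state=0).fit_predict(X)
print(getSilhouetteCoeff(X, labels))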
Example #43
            def my_score(X, y):
                return mutual_info_classif(X, y, random_state=724)

            selectedfeatures = SelectKBest(my_score, k=25)
            selectedfeatures.fit(data_train, label_train)
            smalldata_train = selectedfeatures.transform(
                data_train)  # the 25-dimensional training data after feature selection
            #kmeans = KMeans(n_clusters=50, random_state=0).fit_predict(data_train)
            smalldata_train = np.hstack(
                (smalldata_train, data_train[0:, original_length:]))
            best_silhouette_score = -2
            for nclusters in range(40, 120, 20):
                kmeans_model = KMeans(n_clusters=nclusters, random_state=926)
                kmeans_model.fit(smalldata_train)
                tmp = silhouette_score(smalldata_train,
                                       kmeans_model.labels_,
                                       sample_size=40000)
                print("n score:  %0.3f %0.3f" % (nclusters, tmp))
                if (tmp > best_silhouette_score):
                    best_silhouette_score = tmp
                    best_n_clusters = nclusters
            print("%0.3f best_n_clusters" % (best_n_clusters))
            kmeans_model = KMeans(n_clusters=best_n_clusters, random_state=926)
            kmeans_model.fit(smalldata_train)
            kmeans = kmeans_model.labels_
            #   print kmeans
            #print>>f, kmeans.size
            split = np.zeros(10000)
            nosplit = np.zeros(10000)
            clusters = 0
            len1 = int(data_train.size /
Example #44
def vis_cluster(X,X_pca,n_clusters):
    y_lower = 10  # baseline y position for the first silhouette band
    kmeans = KMeans(n_clusters= n_clusters, random_state=42)
    cluster_labels = kmeans.fit_predict(X_pca)
    
    # clustering 
    silhouette_avg = silhouette_score(X, cluster_labels)
    sample_silhouette_values = silhouette_samples(X, cluster_labels)
    

    # visual 
    fig, (ax1, ax2) = plt.subplots(1, 2)
    fig.set_size_inches(18, 7)

    # The 1st subplot is the silhouette plot
    # The silhouette coefficient can range from -1, 1 but in this example all
    # lie within [-0.1, 1]
    ax1.set_xlim([-0.1, 1])
    # The (n_clusters+1)*10 is for inserting blank space between silhouette
    # plots of individual clusters, to demarcate them clearly.
    ax1.set_ylim([0, len(X) + (n_clusters + 1) * 10])    
    
    
    
    for i in range(n_clusters):
        # Aggregate the silhouette scores for samples belonging to
        # cluster i, and sort them
        ith_cluster_silhouette_values = \
            sample_silhouette_values[cluster_labels == i]

        ith_cluster_silhouette_values.sort()

        size_cluster_i = ith_cluster_silhouette_values.shape[0]
        y_upper = y_lower + size_cluster_i

        color = cm.nipy_spectral(float(i) / n_clusters)
        ax1.fill_betweenx(np.arange(y_lower, y_upper),
                          0, ith_cluster_silhouette_values,
                          facecolor=color, edgecolor=color, alpha=0.7)

        # Label the silhouette plots with their cluster numbers at the middle
        ax1.text(-0.05, y_lower + 0.5 * size_cluster_i, str(i))

        # Compute the new y_lower for next plot
        y_lower = y_upper + 10  # 10 for the 0 samples

        
        
        
    ax1.set_title("The silhouette plot for the various clusters.")
    ax1.set_xlabel("The silhouette coefficient values")
    ax1.set_ylabel("Cluster label")

    # The vertical line for average silhouette score of all the values
    ax1.axvline(x=silhouette_avg, color="red", linestyle="--")

    ax1.set_yticks([])  # Clear the yaxis labels / ticks
    ax1.set_xticks([-0.1, 0, 0.2, 0.4, 0.6, 0.8, 1])

    # 2nd Plot showing the actual clusters formed
    colors = cm.nipy_spectral(cluster_labels.astype(float) / n_clusters)
    ax2.scatter(X_pca[:, 0], X_pca[:, 1], marker='.', s=30, lw=0, alpha=0.7,
                c=colors, edgecolor='k')

    # Labeling the clusters
    centers = kmeans.cluster_centers_
    # Draw white circles at cluster centers
    ax2.scatter(centers[:, 0], centers[:, 1], marker='o',
                c="white", alpha=1, s=200, edgecolor='k')

    for i, c in enumerate(centers):
        ax2.scatter(c[0], c[1], marker='$%d$' % i, alpha=1,
                    s=50, edgecolor='k')

    ax2.set_title("The visualization of the clustered data.")
    ax2.set_xlabel("Feature space for the 1st feature")
    ax2.set_ylabel("Feature space for the 2nd feature")

    plt.suptitle(("Silhouette analysis for KMeans clustering on sample data "
                  "with n_clusters = %d" % n_clusters),
                 fontsize=14, fontweight='bold')

    plt.show()
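# A hypothetical call of vis_cluster (not part of the original snippet): the blob data,
# the PCA projection and the choice of n_clusters=4 are assumptions for illustration,
# and the imports cover what the function body above relies on.
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.cm as cm
from sklearn.cluster import KMeans
from sklearn.datasets import make_blobs
from sklearn.decomposition import PCA
from sklearn.metrics import silhouette_score, silhouette_samples

# Toy data: 500 points in 5-D with 4 well-separated blobs, projected to 2-D for plotting.
X, _ = make_blobs(n_samples=500, n_features=5, centers=4, random_state=42)
X_pca = PCA(n_components=2).fit_transform(X)

vis_cluster(X, X_pca, n_clusters=4)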
clusters = fcluster(distance, k, criterion='maxclust')
plt.figure(figsize=(10, 8))
plt.scatter(x[:, 0], x[:, 1], c=clusters, cmap='prism')
'''K-Means'''
from sklearn.cluster import KMeans
model = KMeans(n_clusters=5)
model.fit(x)
y_kmeans = model.predict(x)

plt.scatter(x[:, 0], x[:, 1], c=y_kmeans, s=10, cmap='inferno')
centers = model.cluster_centers_
plt.scatter(centers[:, 0], centers[:, 1], c='cyan', s=300)

from mlxtend.plotting import plot_decision_regions

print(model.inertia_)

elbow = []
for i in range(1, 15):
    kmeans = KMeans(n_clusters=i).fit(x)
    elbow.append([i, kmeans.inertia_])

plt.plot(pd.DataFrame(elbow)[0], pd.DataFrame(elbow)[1])

from sklearn.metrics import silhouette_score
silhouettes = []
for i in range(2, 8):
    kmeans = KMeans(n_clusters=i).fit(x)
    silhouettes.append([i, silhouette_score(x, kmeans.labels_)])

plt.plot(pd.DataFrame(silhouettes)[0], pd.DataFrame(silhouettes)[1])
Example #46
0
    model.fit(X)
    # append model to cluster list
    clusters.append(model)
    inertia_vals.append(model.inertia_)

# plot the inertia vs K values
plt.plot(range(2, 15, 1), inertia_vals, marker='*')
plt.show()

from sklearn.metrics import silhouette_score
#
# print(clusters[1])
# print("Silhouette score for k=4", silhouette_score(X, clusters[1].predict(X)))

print(clusters[2])
print("Silhouette score for k=4", silhouette_score(X, clusters[2].predict(X)))

print(clusters[3])
print("Silhouette score for k=5", silhouette_score(X, clusters[3].predict(X)))

print(clusters[4])
print("Silhouette score for k=6", silhouette_score(X, clusters[4].predict(X)))

print(clusters[5])
print("Silhouette score for k=7", silhouette_score(X, clusters[5].predict(X)))

# K means clustering using the term vector
kmeans = KMeans(n_clusters=6, random_state=rs).fit(X)


# function to visualise text cluster. Useful for the assignment too
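# The visualiser referred to above is not shown in this snippet; the sketch below is an
# assumption of what such a function could look like, taking the TF-IDF matrix X and the
# fitted cluster labels, with a TruncatedSVD projection used purely for plotting.
import matplotlib.pyplot as plt
from sklearn.decomposition import TruncatedSVD

def visualise_text_cluster(X, labels):
    # Project the sparse term vectors to 2-D for display only.
    coords = TruncatedSVD(n_components=2, random_state=0).fit_transform(X)
    plt.scatter(coords[:, 0], coords[:, 1], c=labels, s=10, cmap='tab10')
    plt.title("Text clusters in 2-D SVD space")
    plt.show()

# e.g. visualise_text_cluster(X, kmeans.labels_)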
Example #47
0
# In[9]:

from sklearn.cluster import KMeans
Cluster = KMeans(n_clusters=3, random_state=2)
Cluster.fit(data)
y_pred = Cluster.predict(data)

plt.scatter(data_arr[:, 0], data_arr[:, 1], c=y_pred, s=50, cmap='plasma')
plt.rcParams.update({'figure.figsize': (10, 7.5), 'figure.dpi': 100})

# In[10]:

Cluster.fit(data)
y_pred = Cluster.predict(test)

plt.scatter(test[:, 0], test[:, 1], c=y_pred, s=50, cmap='plasma')
plt.rcParams.update({'figure.figsize': (10, 7.5), 'figure.dpi': 100})

# In[11]:

from sklearn.metrics import silhouette_score

for i in range(2, 10):
    clusterer = KMeans(n_clusters=i, random_state=i)
    cluster_labels = clusterer.fit_predict(data)
    silhouette_avg = silhouette_score(data, cluster_labels)
    print("For n_clusters =", i, "The average silhouette_score is :",
          silhouette_avg)

# In[ ]:
    # The silhouette coefficient can range from -1, 1 but in this example all
    # lie within [-0.1, 1]
    ax1.set_xlim([-0.1, 1])
    # The (n_clusters+1)*10 is for inserting blank space between silhouette
    # plots of individual clusters, to demarcate them clearly.
    ax1.set_ylim([0, len(X) + (n_clusters + 1) * 10])

    # Initialize the clusterer with n_clusters value and a random generator
    # seed of 10 for reproducibility.
    clusterer = KMeans(n_clusters=n_clusters, random_state=10)
    cluster_labels = clusterer.fit_predict(X)

    # The silhouette_score gives the average value for all the samples.
    # This gives a perspective into the density and separation of the formed
    # clusters
    silhouette_avg = silhouette_score(X, cluster_labels)
    print("For n_clusters =", n_clusters,
          "The average silhouette_score is :", silhouette_avg)

    # Compute the silhouette scores for each sample
    sample_silhouette_values = silhouette_samples(X, cluster_labels)

    y_lower = 10
    for i in range(n_clusters):
        # Aggregate the silhouette scores for samples belonging to
        # cluster i, and sort them
        ith_cluster_silhouette_values = \
            sample_silhouette_values[cluster_labels == i]

        ith_cluster_silhouette_values.sort()
Example #49
0
def tune_hyperparameters(data_list, if_tune_m=True, m_range=None, if_tune_dim=True, dim_range=None, if_tune_p=False, p_range=None, log_norm=True, l2_norm=True, true_labels=None, verbose=True):
    # Specify data normalization
    data_list = preprocess(data_list, log_norm=log_norm, l2_norm=l2_norm)
    num_datasets = len(data_list)
    # Impute m if None
    if m_range is None:
        m_est = max(m_estimate(data_list))
        if if_tune_m:
            m_range = [m_est+i*5 for i in range(-3, 3)]
        else:
            m_range = [m_est]
            print('WARNING: no value of m was given; using the estimated default m={} for the dataset(s).'.format(m_est))
    # Impute dim if None
    if dim_range is None:
        dim_est = dim_estimate(data_list)
        if if_tune_dim:
            dim_range = [dim_est+i*10 for i in range(-2, 2)]
        else:
            dim_range = [dim_est]
            print('WARNING: no value of dim was given; using the estimated default dim={} for the dataset(s).'.format(dim_est))
    # Impute p if None
    if p_range is None:
        if if_tune_p:
            p_range = [0.1, 0.3, 0.5]
        else:
            p_range = [0.3]
            print('WARNING: no value of p was given; using the default p=0.3 for the dataset(s).')
    # If ground truth given, find n_clusters
    if true_labels is not None:
        n_clusters = len(np.unique(true_labels))
    out = []
    if verbose:
        print('Testing hyperparameters in the range below:')
        print('Range for m: {}'.format(m_range))
        print('Range for dim: {}'.format(dim_range))
        print('Range for p: {}'.format(p_range))
    for m in m_range:
        for n_dim in dim_range:
            for p in p_range:
                if m*p < 3:
                    print('Skip m={} and p={} as the number of ghost cells is smaller than 3.'.format(m, p))
                    continue
                ZW = run_OCAT(data_list=data_list, m_list=[m]*num_datasets, dim=n_dim, p=p, log_norm=False, l2_norm=False)
                if true_labels is None:
                    labels_pred, n_clusters = evaluate_clusters(ZW, return_num_cluster=True)
                    sil_score = silhouette_score(ZW, labels_pred)
                    out.append([m, n_dim, p, n_clusters, sil_score])
                else:
                    labels_pred = evaluate_clusters(ZW, num_cluster=n_clusters)
                    NMI_cell = normalized_mutual_info_score(true_labels, labels_pred)

                    AMI_cell = adjusted_mutual_info_score(true_labels, labels_pred)

                    ARI_cell = adjusted_rand_score(true_labels, labels_pred)
                    out.append([m, n_dim, p, NMI_cell, AMI_cell, ARI_cell])
    out = np.array(out)
    if true_labels is not None:
        df = pd.DataFrame(data=out, columns=['m', 'n_dim', 'p', 'NMI_score', 'AMI_score', 'ARI_score'])
    else:
        df = pd.DataFrame(data=out, columns=['m', 'n_dim', 'p', 'n_clusters', 'silhouette_score'])
    if verbose:
        print(df)
    return df
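# A hypothetical call of tune_hyperparameters (not from the original source): the two
# random count matrices below are placeholders, and running them still requires the OCAT
# helpers used above (preprocess, m_estimate, dim_estimate, run_OCAT, evaluate_clusters).
import numpy as np

rng = np.random.default_rng(0)
data_list = [rng.poisson(1.0, size=(200, 1000)).astype(float) for _ in range(2)]

results = tune_hyperparameters(data_list, if_tune_m=True, if_tune_dim=True, verbose=True)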
import matplotlib.pyplot as plt
for i in clases:
    dlabels = np.where(dy[:, 0] == i)[0]
    plt.plot(data1[dlabels, 0], data1[dlabels, 1], 'x')

plt.xlabel('Original clustering')
plt.show()

# ### Silhouette score of the data with the original classification
#

# In[527]:

from sklearn.metrics import silhouette_score

silhouette_score(data1, dy[:, 0] - 1, metric='sqeuclidean')

# #### Classify with the 2 best features and plot

# In[528]:

cm = cmeans(nclusters=clases.shape[0])
cm.fit(datas[0], m=3)

# ## Membership matrix of the data points over the clusters

# In[531]:

membership = cm.predict(datas[0], m=3)
np.round(membership, 1)
Example #51
0
def illustration(data, range_n_clusters):
    """
    TBD
    """

    # Scaling the data is required before dimensionality reduction
    std_scale = preprocessing.StandardScaler().fit(data)
    X = std_scale.transform(data)

    for n_clusters in range_n_clusters:
        # Create a subplot with 1 row and 2 columns
        fig, (ax1, ax2) = plt.subplots(1, 2)
        fig.set_size_inches(36, 14)

        # The 1st subplot is the silhouette plot
        # The silhouette coefficient can range from -1, 1 but in this example all
        # lie within [-0.1, 1]
        ax1.set_xlim([-0.1, 1])
        # The (n_clusters+1)*10 is for inserting blank space between silhouette
        # plots of individual clusters, to demarcate them clearly.
        ax1.set_ylim([0, len(X) + (n_clusters + 1) * 10])

        # Initialize the clusterer with n_clusters value and a random generator
        # seed of 10 for reproducibility.
        clusterer = KMeans(n_clusters=n_clusters, random_state=10)
        cluster_labels = clusterer.fit_predict(X)

        # The silhouette_score gives the average value for all the samples.
        # This gives a perspective into the density and separation of the formed
        # clusters
        silhouette_avg = silhouette_score(X, cluster_labels)
        print("For n_clusters =", n_clusters,
              "The average silhouette_score is :", silhouette_avg)

        # Compute the silhouette scores for each sample
        sample_silhouette_values = silhouette_samples(X, cluster_labels)

        y_lower = 10

        for i in range(n_clusters):
            # Aggregate the silhouette scores for samples belonging to
            # cluster i, and sort them
            ith_cluster_silhouette_values = sample_silhouette_values[
                cluster_labels == i]

            ith_cluster_silhouette_values.sort()

            size_cluster_i = ith_cluster_silhouette_values.shape[0]
            y_upper = y_lower + size_cluster_i

            color = cm.nipy_spectral(float(i) / n_clusters)
            ax1.fill_betweenx(np.arange(y_lower, y_upper),
                              0,
                              ith_cluster_silhouette_values,
                              facecolor=color,
                              edgecolor=color,
                              alpha=0.7)

            # Label the silhouette plots with their cluster numbers at the middle
            ax1.text(-0.05, y_lower + 0.5 * size_cluster_i, str(i))

            # Compute the new y_lower for next plot
            y_lower = y_upper + 10  # 10 for the 0 samples

        ax1.set_title("The silhouette plot for the various clusters.")
        ax1.set_xlabel("The silhouette coefficient values")
        ax1.set_ylabel("Cluster label")

        # The vertical line for average silhouette score of all the values
        ax1.axvline(x=silhouette_avg, color="red", linestyle="--")

        ax1.set_yticks([])  # Clear the yaxis labels / ticks
        ax1.set_xticks([-0.1, 0, 0.2, 0.4, 0.6, 0.8, 1])

        # 2nd Plot showing the actual clusters formed
        colors = cm.nipy_spectral(cluster_labels.astype(float) / n_clusters)
        ax2.scatter(X[:, 0],
                    X[:, 1],
                    marker='.',
                    s=30,
                    lw=0,
                    alpha=0.7,
                    c=colors,
                    edgecolor='k')

        # Labeling the clusters
        centers = clusterer.cluster_centers_
        # Draw white circles at cluster centers
        ax2.scatter(centers[:, 0],
                    centers[:, 1],
                    marker='o',
                    c="white",
                    alpha=1,
                    s=200,
                    edgecolor='k')

        for i, c in enumerate(centers):
            ax2.scatter(c[0],
                        c[1],
                        marker='$%d$' % i,
                        alpha=1,
                        s=50,
                        edgecolor='k')

        ax2.set_title("The visualization of the clustered data.")
        ax2.set_xlabel("Feature space for the 1st feature")
        ax2.set_ylabel("Feature space for the 2nd feature")

        plt.suptitle(
            ("Silhouette analysis for KMeans clustering on sample data "
             "with n_clusters = %d" % n_clusters),
            fontsize=14,
            fontweight='bold')

        plt.show()
Example #52
0
silhouette = {}

D = pairwise_distances(training_dataset, metric='euclidean')

for n_clusters in range_n_clusters:
    print('number of clusters :{}'.format(n_clusters))

    M, C = kmedoids.kMedoids(D, n_clusters)
    labels = np.zeros(1173)

    for label in C:
        #print(label)
        for point_idx in C[label]:
            labels[point_idx] = label

    silhouette_avg = silhouette_score(training_dataset, labels)
    vcr_avg = calinski_harabasz_score(training_dataset, labels)
    silhouette[n_clusters] = silhouette_avg

    print("For n_clusters =", n_clusters, "The average silhouette_score is :",
          silhouette_avg)

    print("For n_clusters =", n_clusters, "The average silhouette_score is :",
          vcr_avg)

    #sample_silhouette_values = silhouette_samples(training_dataset, cluster_labels)

plt.figure()
plt.plot(list(inertia.keys()), list(inertia.values()))
plt.xlabel("Number of cluster")
plt.ylabel("SSE")
Example #53
0
plt.ylim([0, 10])
plt.title('Instances')
plt.scatter(x1, x2)

colors = ['b', 'g', 'r', 'c', 'm', 'y', 'k', 'b']
markers = ['o', 's', 'D', 'v', '^', 'p', '*', '+']

clusters = [2, 3, 4, 5, 8]
subplot_counter = 1
sc_scores = []
for t in clusters:
    subplot_counter += 1
    plt.subplot(3, 2, subplot_counter)
    kmeans_model = KMeans(n_clusters=t).fit(X)
    for i, l in enumerate(kmeans_model.labels_):
        plt.plot(x1[i], x2[i], color=colors[l], marker=markers[l], ls='None')
    plt.xlim([0, 10])
    plt.ylim([0, 10])
    sc_score = silhouette_score(X, kmeans_model.labels_, metric='euclidean')
    sc_scores.append(sc_score)

    # Show the silhouette coefficient for this number of clusters on the subplot.
    plt.title('K = %s, silhouette coefficient= %0.03f' % (t, sc_score))

# Plot the silhouette coefficient as a function of the number of clusters.
plt.figure()
plt.plot(clusters, sc_scores, '*-')
plt.xlabel('Number of Clusters')
plt.ylabel('Silhouette Coefficient Score')

plt.show()
Example #54
0
    cah = AgglomerativeClustering(n_clusters=k)
    cah.fit(centroids)
    labels = cah.labels_

    nv_centroids = pd.DataFrame(centroids)
    nv_centroids["labels"] = labels
    nv_centroids = nv_centroids.groupby("labels").mean()

    # Consolidate the KMeans (re-initialise it with the aggregated centroids)
    clf_2 = KMeans(n_clusters=k, init=nv_centroids)
    clf_2.fit(sv_data_scaled)

    labels_final = clf_2.labels_

    s_score = silhouette_score(sv_data_scaled,
                               labels_final,
                               metric="sqeuclidean")
    s_scores.append(s_score)

plt.plot(k_clust, s_scores)

# The silhouette score is highest for n_clusters = 2
# => the best number for intra-cluster homogeneity and inter-cluster separation

# BUT it does not give a good inter-cluster separation, as seen above (van and bus are very close)
# Choosing 3 clusters therefore seems relevant, even at the cost of some intra-cluster homogeneity

clf = KMeans(n_clusters=3)
clf.fit(sv_data_scaled)

labels = clf.labels_
Example #55
0
                init='k-means++',
                max_iter=500,
                n_init=1,
                verbose=opts.verbose)

print("Clustering sparse data with %s" % km)
t0 = time()
km.fit(X)
print("done in %0.3fs" % (time() - t0))
print()

print("Homogeneity: %0.3f" % metrics.homogeneity_score(labels, km.labels_))
print("Completeness: %0.3f" % metrics.completeness_score(labels, km.labels_))
print("V-measure: %0.3f" % metrics.v_measure_score(labels, km.labels_))
print("Adjusted Rand-Index: %.3f" %
      metrics.adjusted_rand_score(labels, km.labels_))
print("Silhouette Coefficient: %0.3f" %
      metrics.silhouette_score(X, labels, sample_size=1000))

print()

if not (opts.n_components or opts.use_hashing):
    print("Top terms per cluster:")
    order_centroids = km.cluster_centers_.argsort()[:, ::-1]
    terms = vectorizer.get_feature_names()
    for i in range(true_k):
        print("Cluster %d:" % i, end='')
        for ind in order_centroids[i, :10]:
            print(' %s' % terms[ind], end='')
        print()
Example #56
0
 def get(self, x, labels):
     return silhouette_score(x, labels, **self.kwargs)
print(i)
W = W[0:4]
biases = biases[0:4]

def encode(encoder_weights, encoder_biases, data):
    res = data
    for index, (w, b) in enumerate(zip(encoder_weights, encoder_biases)):
        if index+1 == len(encoder_weights):
            # Last layer: linear output
            res = np.dot(res, w) + b
        else:
            # Hidden layers: ReLU activation
            res = np.maximum(0, np.dot(res, w) + b)
    return res

res = encode(W,biases,X_test)
print(res.shape)

unique_labels = np.unique(y_test)
for index,unique_label in enumerate(unique_labels):
    data_latent_space = res[y_test==unique_label]
    plt.scatter(data_latent_space[:,0],data_latent_space[:,1],alpha=0.3,c =cmap(index))

plt.xlabel("Latent X")
plt.ylabel("Latest Y")
plt.title("Autoencoder results")

print(silhouette_score(res,y_test))
print("PCA silouette score( how good the clustering is made")
print(silhouette_score(res_pca,y_test))
Example #58
0
def silhouette(pairWisePointDistance, clusterLabels):
    # pairWisePointDistance: feature array (or a precomputed pairwise distance
    #                        matrix, in which case pass metric='precomputed').
    # clusterLabels:         predicted cluster labels for each sample.
    return silhouette_score(pairWisePointDistance, clusterLabels)
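# A hedged illustration of the comment above (toy data, not from the original source):
# silhouette_score accepts either a feature array plus a metric name, or a precomputed
# square distance matrix together with metric='precomputed'.
import numpy as np
from sklearn.metrics import silhouette_score, pairwise_distances

X_demo = np.array([[0, 0], [0, 1], [1, 0], [10, 10], [10, 11], [11, 10]])
labels_demo = np.array([0, 0, 0, 1, 1, 1])

# Option 1: feature array, sklearn computes the distances internally.
print(silhouette_score(X_demo, labels_demo, metric='euclidean'))

# Option 2: precomputed pairwise distance matrix.
D_demo = pairwise_distances(X_demo, metric='euclidean')
print(silhouette_score(D_demo, labels_demo, metric='precomputed'))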
df_cluster[features] = scaler.fit_transform(df_cluster[features])
df_cluster.describe().transpose()

# Elbow method and silhouette scores: determine an appropriate number of clusters K
inertias = {}
silhouettes = {}
for k in range(2, 11):
    kmeans = KMeans(init='k-means++',
                    n_init=10,
                    n_clusters=k,
                    max_iter=1000,
                    random_state=42).fit(df_cluster)
    # Inertia: sum of squared distances of samples to their closest cluster center
    inertias[k] = kmeans.inertia_
    silhouettes[k] = silhouette_score(df_cluster,
                                      kmeans.labels_,
                                      metric='euclidean')

plt.figure()
plt.grid(True)
plt.plot(list(inertias.keys()), list(inertias.values()))
plt.title('K-Means, Elbow Method')
plt.xlabel("Number of clusters, K")
plt.ylabel("Inertia")

plt.figure()
plt.grid(True)
plt.plot(list(silhouettes.keys()), list(silhouettes.values()))
plt.title('K-Means, Silhouette Scores')
plt.xlabel("Number of clusters, K")
plt.ylabel("Silhouette")
print("Clustering sparse data with %s" % km)
t0 = time()
km.fit(X)
print("done in %0.3fs" % (time() - t0))
print()

print("Homogeneity: %0.3f" % metrics.homogeneity_score(labels, km.labels_))
print("Completeness: %0.3f" % metrics.completeness_score(labels, km.labels_))
print("V-measure: %0.3f" % metrics.v_measure_score(labels, km.labels_))
print(
    "Adjusted Rand-Index: %.3f"
    % metrics.adjusted_rand_score(labels, km.labels_)
)
print(
    "Silhouette Coefficient: %0.3f"
    % metrics.silhouette_score(X, km.labels_, sample_size=1000)
)

print()


if not opts.use_hashing:
    print("Top terms per cluster:")

    if opts.n_components:
        original_space_centroids = svd.inverse_transform(km.cluster_centers_)
        order_centroids = original_space_centroids.argsort()[:, ::-1]
    else:
        order_centroids = km.cluster_centers_.argsort()[:, ::-1]

    terms = vectorizer.get_feature_names()