Example 1
    def _global_clustering(self, X=None):

        # Global clustering for the subclusters obtained after fitting.
        clusterer = self.n_clusters
        centroids = self.subcluster_centers_
        compute_labels = (X is not None) and self.compute_labels

        # Preprocessing for the global clustering.
        not_enough_centroids = False
        if isinstance(clusterer, int):
            clusterer = AgglomerativeClustering(n_clusters=self.n_clusters)
            if len(centroids) < self.n_clusters:
                not_enough_centroids = True
        elif (clusterer is not None and not
              hasattr(clusterer, 'fit_predict')):
            raise ValueError("n_clusters should be an instance of "
                             "ClusterMixin or an int")

        # To use in predict to avoid recalculation.
        self._subcluster_norms = row_norms(
            self.subcluster_centers_, squared=True)

        if clusterer is None or not_enough_centroids:
            self.subcluster_labels_ = np.arange(len(centroids))
            if not_enough_centroids:
                warnings.warn(
                    "Number of subclusters found (%d) by Birch is less "
                    "than (%d). Decrease the threshold."
                    % (len(centroids), self.n_clusters))
        else:
            # The global clustering step clusters the subclusters of the
            # leaves, taking the subcluster centroids as samples and
            # finding the final centroids.
            self.subcluster_labels_ = clusterer.fit_predict(
                self.subcluster_centers_)

        if compute_labels:
            self.labels_ = self.predict(X)
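The branching on clusterer above mirrors what Birch accepts publicly for n_clusters. A minimal sketch of the three cases, on made-up toy data:

import numpy as np
from sklearn.cluster import Birch, AgglomerativeClustering

X = np.random.RandomState(0).rand(100, 2)

# int: an AgglomerativeClustering with that many clusters is built internally.
Birch(n_clusters=3).fit(X)

# ClusterMixin instance: used as-is for the global clustering step.
Birch(n_clusters=AgglomerativeClustering(n_clusters=3)).fit(X)

# None: the subclusters are kept as-is and labelled 0..n_subclusters-1.
Birch(n_clusters=None).fit(X)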
Example 2
import operator
from os.path import basename

import matplotlib.pyplot as plt
import numpy as np
import seaborn as sb
from sklearn.cluster import AgglomerativeClustering
from sklearn.metrics.pairwise import cosine_similarity


def f1():
    model = AgglomerativeClustering(n_clusters=cluster_number,
                                    affinity='euclidean',
                                    linkage='complete')

    preds = model.fit_predict(matrix.toarray())

    res = dict()
    for i, p in enumerate(preds):
        res[basename(dataset['filenames'][i])] = dataset['target_names'][p]
    prev = None
    for k, v in sorted(res.items(), key=operator.itemgetter(1)):
        if prev != v:
            print(v, ':')
            prev = v
        print('\t\t', k)

    # Cosine-distance matrix, row-normalized for the heatmap.
    dist = 1 - cosine_similarity(matrix.toarray())
    row_sums = dist.sum(axis=1)
    new_matrix = dist / row_sums[:, np.newaxis]
    plt.figure(figsize=(20, 20), dpi=300)
    sb.heatmap(new_matrix)
    # Axis labels: file names with the 4-character extension stripped.
    lbls = list()
    for fn in dataset['filenames']:
        lbls.append(basename(fn)[:-4])
    plt.xticks(np.arange(0, article_number), lbls, rotation='vertical')
    plt.yticks(np.arange(0, article_number), lbls, rotation='horizontal')
    plt.show()
    print()
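The function above relies on module-level dataset, matrix, cluster_number and article_number that the excerpt never defines. Given the filenames/target_names fields it reads, one plausible (entirely hypothetical) setup is sklearn's load_files plus a TF-IDF matrix:

from sklearn.datasets import load_files
from sklearn.feature_extraction.text import TfidfVectorizer

dataset = load_files('corpus/')  # hypothetical directory-per-class corpus
matrix = TfidfVectorizer().fit_transform(dataset['data'])
cluster_number = len(dataset['target_names'])
article_number = matrix.shape[0]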
Example 3
from sklearn.cluster import AgglomerativeClustering


def agglomerative_clustering(matrix):
    print("====== Agglomerative Clustering ===============")

    # A cosine affinity requires a non-'ward' linkage; 'ward' only accepts
    # euclidean distances.
    model = AgglomerativeClustering(n_clusters=3, affinity='cosine', linkage='complete')
    preds = model.fit_predict(matrix)
    clusters = model.labels_.tolist()

    return (preds, clusters)
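A hypothetical call with any 2-D feature matrix; note that fit_predict and labels_ yield the same assignment, so the two return values are redundant but kept for the caller's convenience:

import numpy as np

preds, clusters = agglomerative_clustering(np.random.rand(10, 5))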
Example 4
    def _global_clustering(self, X=None):
        """
        Global clustering for the subclusters obtained after fitting
        """
        clusterer = self.n_clusters
        centroids = self.subcluster_centers_
        compute_labels = (X is not None) and self.compute_labels

        # Preprocessing for the global clustering.
        not_enough_centroids = False
        if isinstance(clusterer, int):
            clusterer = AgglomerativeClustering(
                n_clusters=self.n_clusters)
            # There is no need to perform the global clustering step.
            if len(centroids) < self.n_clusters:
                not_enough_centroids = True
        elif (clusterer is not None and not
              hasattr(clusterer, 'fit_predict')):
            raise ValueError("n_clusters should be an instance of "
                             "ClusterMixin or an int")

        # To use in predict to avoid recalculation.
        self._subcluster_norms = row_norms(
            self.subcluster_centers_, squared=True)

        if clusterer is None or not_enough_centroids:
            self.subcluster_labels_ = np.arange(len(centroids))
            if not_enough_centroids:
                warnings.warn(
                    "Number of subclusters found (%d) by Birch is less "
                    "than (%d). Decrease the threshold."
                    % (len(centroids), self.n_clusters))
        else:
            # The global clustering step that clusters the subclusters of
            # the leaves. It assumes the centroids of the subclusters as
            # samples and finds the final centroids.
            self.subcluster_labels_ = clusterer.fit_predict(
                self.subcluster_centers_)

        if compute_labels:
            self.labels_ = self.predict(X)
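A minimal sketch of the not_enough_centroids path above, on made-up data: a large threshold collapses all points into very few subclusters, so requesting more clusters than that skips the global step and emits the warning instead.

import numpy as np
from sklearn.cluster import Birch

X = np.random.RandomState(0).rand(30, 2)
# threshold=5.0 on data inside the unit square merges everything into one
# subcluster, which is fewer than the 20 clusters requested.
Birch(threshold=5.0, n_clusters=20).fit(X)  # warns: decrease the threshold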
Example 5
import matplotlib.pyplot as plt

#kMeans clustering
from sklearn.cluster import KMeans
km = KMeans(init='random', max_iter=150, n_clusters=2, random_state=0)
y_km = km.fit_predict(X)

plt.scatter(X[y_km == 0, 0], X[y_km == 0, 1], c='green')
plt.scatter(X[y_km == 1, 0], X[y_km == 1, 1], c='red')
plt.title("KMeans")
plt.show()

#Agglomerative Clustering with complete linkage
from sklearn.cluster import AgglomerativeClustering
aggcl = AgglomerativeClustering(n_clusters=2, linkage='complete')
y_agcl = aggcl.fit_predict(X)

plt.scatter(X[y_agcl == 0, 0], X[y_agcl == 0, 1], c='green')
plt.scatter(X[y_agcl == 1, 0], X[y_agcl == 1, 1], c='red')
plt.title("Aggolomerative Clustering")
plt.show()

#Demonstrating clustering using a density-based approach (DBSCAN)
from sklearn.cluster import DBSCAN
dbs = DBSCAN(eps=0.2, min_samples=5)
y_dbs = dbs.fit_predict(X)

# DBSCAN labels noise points -1; only the two clusters are plotted here.
plt.scatter(X[y_dbs == 0, 0], X[y_dbs == 0, 1], c='green')
plt.scatter(X[y_dbs == 1, 0], X[y_dbs == 1, 1], c='red')
plt.title("Density based(DBSCAN) Clustering")
plt.show()
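Every step above assumes a 2-D array X that is never defined in the excerpt. A plausible, purely illustrative setup is two-moons data, a standard choice for contrasting these three algorithms and a scale at which eps=0.2 is reasonable:

from sklearn.datasets import make_moons

X, _ = make_moons(n_samples=200, noise=0.05, random_state=0)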
Example 6

# In[24]:


# Per-cluster feature means on the original (unscaled) data.
for cluster in dbscan_clusters:
    print(data.loc[cluster].mean())


# In[11]:


import numpy as np
from sklearn.cluster import AgglomerativeClustering

model = AgglomerativeClustering(n_clusters=4, linkage='average', affinity='manhattan')
aggl_preds = model.fit_predict(scaled_features)


# In[12]:


# Group sample indices by their agglomerative cluster label.
clusters_aggl = []
for lbl in np.unique(aggl_preds):
    indices = [i for i, x in enumerate(aggl_preds) if x == lbl]
    clusters_aggl.append(indices)


# In[25]:


for cluster in clusters_aggl:
    print(data.loc[cluster].mean())
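These cells assume data, scaled_features and dbscan_clusters from earlier in the notebook, none of which appear in the excerpt. One plausible reconstruction, assuming a hypothetical features.csv and a default DBSCAN run:

import numpy as np
import pandas as pd
from sklearn.cluster import DBSCAN
from sklearn.preprocessing import StandardScaler

data = pd.read_csv('features.csv')  # hypothetical input file
scaled_features = StandardScaler().fit_transform(data)

# Group sample indices by DBSCAN label, mirroring clusters_aggl above.
dbs_preds = DBSCAN().fit_predict(scaled_features)
dbscan_clusters = [[i for i, x in enumerate(dbs_preds) if x == lbl]
                   for lbl in np.unique(dbs_preds)]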