Example #1
def cluster(y, X, n_clusters):
    if n_clusters != 1:
        # Cocluster the series in y via the Kendall rank correlation of their columns.
        y_ = onp.array(y, dtype='float64')
        # mask = onp.clip((onp.diff(y, n=1, axis=0) == 0).argmin(axis=0), a_max=2 * (y.shape[0] // 3), a_min=0)
        # for i in range(mask.size):
        #     y_[range(mask[i]), i] = np.nan
        corr = pd.DataFrame(y_).corr(method='kendall')
        model = SpectralCoclustering(n_clusters=n_clusters)
        model.fit(corr)
        # Row indices of each bicluster give the groups of columns in y/X.
        clusters = [model.get_indices(i)[0] for i in range(n_clusters)]

        def fn_by_cluster(x, fn, weights=None):
            # Apply fn (e.g. np.sum or np.average) within each cluster along the last axis.
            if weights is not None:
                return np.concatenate([fn(x[..., rng], axis=-1, weights=weights[i])[..., np.newaxis]
                                       for i, rng in enumerate(clusters)], axis=-1)
            else:
                return np.concatenate([fn(x[..., rng], axis=-1)[..., np.newaxis]
                                       for i, rng in enumerate(clusters)], axis=-1)

        fn_market_share = lambda x: [np.sum(x[..., rng], axis=0) / np.sum(x[..., rng]) for rng in clusters]

        # Sum y within each cluster.
        y_ = fn_by_cluster(y, np.sum)
        # Compute within-group market shares and use them to weight X inside each cluster.
        y_weights = fn_market_share(y)
        X_ = fn_by_cluster(X, np.average, y_weights)
        return y_, X_, clusters
    else:
        # Single cluster: collapse all columns into one aggregate series.
        return np.sum(y, axis=-1)[..., np.newaxis], np.mean(X, axis=-1)[..., np.newaxis], 1
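A minimal usage sketch for the function above, with hypothetical data. The onp/np split suggests plain NumPy alongside an autodiff array library (e.g. jax.numpy) aliased to np; plain NumPy is used for both names here so the sketch stands on its own.

import numpy as onp
import numpy as np   # assumption: the original likely aliases jax.numpy (or similar) to np
import pandas as pd
from sklearn.cluster import SpectralCoclustering

# Trending series (cumulative sums) keep the Kendall correlation matrix nonnegative,
# which suits the coclustering step.
y = onp.cumsum(onp.random.rand(100, 12), axis=0)   # 100 observations of 12 series (hypothetical)
X = onp.random.rand(100, 12)                       # matching exogenous features (hypothetical)
y_agg, X_agg, groups = cluster(y, X, n_clusters=3)
print(y_agg.shape, X_agg.shape, len(groups))       # (100, 3) (100, 3) 3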
Example #2
def rearrange_confusion_matrix(cm, n_clusters):
    """Reorder a confusion matrix so that coclustered classes end up adjacent."""
    from sklearn.cluster import SpectralCoclustering

    clst = SpectralCoclustering(n_clusters=n_clusters).fit(cm)

    # Concatenate the row indices of each bicluster to obtain the new class order.
    idx = []
    for c in range(n_clusters):
        idx.append(clst.get_indices(c)[0])
    idx = np.concatenate(idx)

    cm_clustered = np.zeros(cm.shape, dtype=int)

    # Permute both rows and columns of cm according to that order.
    for i, idxi in enumerate(idx):
        for j, idxj in enumerate(idx):
            cm_clustered[i, j] = cm[idxi, idxj]

    return cm_clustered, idx
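A short usage sketch with hypothetical labels; it assumes scikit-learn's confusion_matrix for building the input matrix.

import numpy as np
from sklearn.metrics import confusion_matrix

rng = np.random.RandomState(0)
y_true = rng.randint(0, 6, size=500)   # hypothetical 6-class labels
y_pred = rng.randint(0, 6, size=500)
cm = confusion_matrix(y_true, y_pred)
cm_reordered, new_order = rearrange_confusion_matrix(cm, n_clusters=3)
print(new_order)      # permutation applied to the class indices
print(cm_reordered)   # rows/columns regrouped into coclustered blocks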
Example #3
def cocluster(np_sums, matrix_diags, vectorizer):
    ''' Perform the coclustering '''
    x = np.array(np_sums)
    # print(x)
    n_clusters = 20
    clustering = SpectralCoclustering(n_clusters=n_clusters,
                                      random_state=0).fit(x)

    # Look up the feature names once instead of once per column index.
    feature_names = vectorizer.get_feature_names()
    for i in range(n_clusters):
        row_nums, col_nums = clustering.get_indices(i)
        row_words = [matrix_diags[num] for num in row_nums]
        col_words = [feature_names[num] for num in col_nums]

        print("Cluster: ", i)
        print("===========")
        print("Diagnoses: ", row_words)
        print()
        print("n-grams: ", col_words)
        print()
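A hedged sketch of how cocluster might be called. The diagnoses and texts are hypothetical, and because the function hard-codes n_clusters=20 the input matrix needs at least 20 rows and 20 columns.

import numpy as np
from sklearn.cluster import SpectralCoclustering
from sklearn.feature_extraction.text import CountVectorizer

# Hypothetical data: one aggregated note per diagnosis (30 rows) over a synthetic vocabulary.
texts = ["term%d term%d term%d" % (i, (3 * i) % 40, (7 * i) % 40) for i in range(30)]
matrix_diags = ["diagnosis_%d" % i for i in range(30)]
vectorizer = CountVectorizer()
np_sums = vectorizer.fit_transform(texts).toarray()
cocluster(np_sums, matrix_diags, vectorizer)   # prints the diagnoses and n-grams in each bicluster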
Example #4
def most_common(d):
    """Items of a defaultdict(int) with the highest values.

    Like Counter.most_common in Python >=2.7.
    """
    return sorted(d.items(), key=operator.itemgetter(1), reverse=True)


bicluster_ncuts = list(
    bicluster_ncut(i) for i in range(len(newsgroups.target_names)))
best_idx = np.argsort(bicluster_ncuts)[:5]

print()
print("Best biclusters:")
print("----------------")
for idx, cluster in enumerate(best_idx):
    n_rows, n_cols = cocluster.get_shape(cluster)
    cluster_docs, cluster_words = cocluster.get_indices(cluster)
    if not len(cluster_docs) or not len(cluster_words):
        continue

    # categories
    counter = defaultdict(int)
    for i in cluster_docs:
        counter[document_names[i]] += 1
    cat_string = ", ".join("{:.0f}% {}".format(float(c) / n_rows * 100, name)
                           for name, c in most_common(counter)[:3])

    # words
    out_of_cluster_docs = cocluster.row_labels_ != cluster
    out_of_cluster_docs = np.where(out_of_cluster_docs)[0]
    word_col = X[:, cluster_words]
    word_scores = np.array(word_col[cluster_docs, :].sum(axis=0) -
                           word_col[out_of_cluster_docs, :].sum(axis=0))
Example #5
class DocumentClustering:
    def __init__(self, k=5):
        self.name = 'k-means'
        self.k = k
        self.X = None
        self.clustering = None
        self.vectorizer = None
        self.dataset_size = 0
        self.doc2vec_matrix = False

    def make_matrix(self,
                    documents=None,
                    n_components=-1,
                    doc2vec_matrix=None):
        if not isinstance(doc2vec_matrix, np.ndarray):
            self.vectorizer = TfidfVectorizer()
            # self.vectorizer = CountVectorizer()
            self.X = self.vectorizer.fit_transform(documents)
            self.dataset_size = len(documents)
        else:
            self.X = doc2vec_matrix
            self.dataset_size = len(doc2vec_matrix)
            self.doc2vec_matrix = True

        if n_components != -1:
            if n_components > len(self.vectorizer.get_feature_names()):
                n_components = len(self.vectorizer.get_feature_names())
            print('n_components ' + str(n_components))
            # Vectorizer results are normalized, which makes KMeans behave as
            # spherical k-means for better results. Since LSA/SVD results are
            # not normalized, we have to redo the normalization.
            print("Performing dimensionality reduction using LSA")
            t0 = time()
            svd = TruncatedSVD(n_components)
            normalizer = Normalizer(copy=False)
            lsa = make_pipeline(svd, normalizer)

            self.X = lsa.fit_transform(self.X)

            print("done in %fs" % (time() - t0))

            explained_variance = svd.explained_variance_ratio_.sum()
            print("Explained variance of the SVD step: {}%".format(
                int(explained_variance * 100)))

            print()

    def cluster(self, cluster_name):
        self.name = cluster_name.strip()
        print('cluster_name ' + self.name)
        if self.name == 'k-means':
            print('cluster_name: ' + self.name)
            self.clustering = KMeans(n_clusters=self.k,
                                     init='k-means++',
                                     max_iter=500,
                                     n_init=1)
            print("Clustering sparse data with %s" % self.clustering)
            t0 = time()
            self.clustering.fit(self.X)
            print("done in %0.3fs" % (time() - t0))
            print()
        elif self.name == 'agglo':
            self.clustering = AgglomerativeClustering(n_clusters=self.k,
                                                      affinity='euclidean',
                                                      memory=None,
                                                      connectivity=None,
                                                      compute_full_tree='auto',
                                                      linkage='ward',
                                                      distance_threshold=None)

            print("Clustering sparse data with %s" % self.clustering)
            t0 = time()

            # AgglomerativeClustering needs a dense array, so convert the sparse TF-IDF matrix
            if not self.doc2vec_matrix:
                self.X = self.X.toarray()
            self.clustering.fit(self.X)
            print("done in %0.3fs" % (time() - t0))
            print()
        elif self.name == 'spectral_cocluster':
            self.clustering = SpectralCoclustering(n_clusters=self.k,
                                                   svd_method='arpack',
                                                   random_state=0)
            print("Clustering sparse data with %s" % self.clustering)
            t0 = time()

            self.clustering.fit(self.X)
            print("done in %0.3fs" % (time() - t0))
            print()

    def print_results(self):
        # print the clustering result
        print(self.name)
        if self.name == 'k-means':
            cluster_labels = self.clustering.labels_
            clustering_dict = self.clustering.__dict__
            clusters = {}
            for document_id, cluster_label in enumerate(cluster_labels):
                if cluster_label not in clusters:
                    clusters[cluster_label] = []
                clusters[cluster_label].append(document_id)
                print(str(cluster_label) + " -- " + str(document_id))
            order_centroids = self.clustering.cluster_centers_.argsort(
            )[:, ::-1]
            terms = self.vectorizer.get_feature_names()
            for i in range(self.k):
                print("Cluster %d:" % i, end='')
                for ind in order_centroids[i, :10]:
                    print(' %s' % terms[ind], end='')
                print()

        elif self.name == 'agglo':
            cluster_labels = self.clustering.labels_
            clustering_dict = self.clustering.__dict__
            clusters = {}

            for document_id, cluster_label in enumerate(cluster_labels):
                if cluster_label not in clusters:
                    clusters[cluster_label] = []
                clusters[cluster_label].append(document_id)
                #print(str(cluster_label) + " -- " + str(document_id))

            results = self.get_cluster_top_keywords(clusters)
            for _cluster in results:
                key_terms = results[_cluster]
                print("Cluster " + str(_cluster) + " : " +
                      str(len(clusters[_cluster])) + " documents")
                print(key_terms)
            print()

        elif self.name == 'spectral_cocluster':
            target_number = 10
            bicluster_ncuts = list(
                self.bicluster_ncut(i) for i in range(self.k))
            best_idx = np.argsort(bicluster_ncuts)[:target_number]

            feature_names = self.vectorizer.get_feature_names()
            print()
            print("Best biclusters:")
            print("----------------")
            for idx, cluster in enumerate(best_idx):
                n_rows, n_cols = self.clustering.get_shape(cluster)
                cluster_docs, cluster_words = self.clustering.get_indices(
                    cluster)
                if not len(cluster_docs) or not len(cluster_words):
                    continue

                # categories
                counter = defaultdict(int)
                for i in cluster_docs:
                    counter[str(i)] += 1
                cat_string = ", ".join(
                    "{:.0f}% {}".format(float(c) / n_rows * 100, name)
                    for name, c in self.most_common(counter)[:3])

                # words
                out_of_cluster_docs = self.clustering.row_labels_ != cluster
                out_of_cluster_docs = np.where(out_of_cluster_docs)[0]
                word_col = self.X[:, cluster_words]
                word_scores = np.array(word_col[cluster_docs, :].sum(
                    axis=0) - word_col[out_of_cluster_docs, :].sum(axis=0))
                word_scores = word_scores.ravel()
                important_words = list(feature_names[cluster_words[i]]
                                       for i in word_scores.argsort()[:-11:-1])

                print("bicluster {} : {} documents, {} words".format(
                    idx, n_rows, n_cols))
                print("categories   : {}".format(cat_string))
                print("words        : {}\n".format(', '.join(important_words)))

    def bicluster_ncut(self, i):
        rows, cols = self.clustering.get_indices(i)
        if not (np.any(rows) and np.any(cols)):
            import sys
            return sys.float_info.max

        row_complement = np.nonzero(np.logical_not(
            self.clustering.rows_[i]))[0]
        col_complement = np.nonzero(np.logical_not(
            self.clustering.columns_[i]))[0]
        # Note: the following is identical to X[rows[:, np.newaxis],
        # cols].sum() but much faster in scipy <= 0.16
        weight = self.X[rows][:, cols].sum()
        cut = (self.X[row_complement][:, cols].sum() +
               self.X[rows][:, col_complement].sum())

        return cut / weight

    def most_common(self, d):
        """Items of a defaultdict(int) with the highest values.
        """
        return sorted(d.items(), key=operator.itemgetter(1), reverse=True)

    def get_cluster_top_keywords(self, clusters, keywords_per_cluster=10):
        """Shows the top k words for each cluster
        Keyword Arguments:
            keywords_per_cluster {int} -- The k words to show for each cluster (default: {10})
        Returns:
            dict of lists -- Returns a dict of {cluster_id: ['top', 'k', 'words', 'for', 'cluster']}
        """
        terms = self.vectorizer.get_feature_names()
        out = {}
        docs_for_cluster = {}
        # clusters maps cluster_id -> indices of the documents assigned to that cluster, e.g. len(clusters[6]) == 508
        for cluster in clusters:
            # To flatten/combine all documents into one
            docs_for_cluster[cluster] = np.array(
                [self.X[i] for i in clusters[cluster]])
            # Cluster vectors to feature words
            out[cluster] = np.array(terms)[np.flip(
                np.argsort(docs_for_cluster[cluster]), -1)]
            cluster_shape = out[cluster].shape
            out[cluster] = out[cluster].reshape(
                cluster_shape[0] *
                cluster_shape[1])[:keywords_per_cluster].tolist()

        return out

    def visualize(self):
        # The fitted labels form a one-dimensional array of length N, one cluster label per
        # document; project the data to 2D with PCA and colour the points by label.
        if self.name == 'spectral_cocluster':
            if not self.doc2vec_matrix:
                pca_t = PCA().fit_transform(self.X.toarray())
            else:
                pca_t = PCA().fit_transform(self.X)
            #pca_t = PCA().fit_transform(self.X)
            # print(self.clustering.labels_)
            plt.scatter(pca_t[:, 0],
                        pca_t[:, 1],
                        c=self.clustering.row_labels_,
                        cmap='rainbow')
            plt.show()
        elif self.name == 'agglo':
            pca_t = PCA().fit_transform(self.X)
            plt.scatter(pca_t[:, 0],
                        pca_t[:, 1],
                        c=self.clustering.labels_,
                        cmap='rainbow')
            plt.show()
        elif self.name == 'k-means':
            if not self.doc2vec_matrix:
                self.X = self.X.toarray()

            pca_t = PCA().fit_transform(self.X)
            # print(self.clustering.labels_)
            plt.scatter(pca_t[:, 0],
                        pca_t[:, 1],
                        c=self.clustering.labels_,
                        cmap='rainbow')
            plt.show()
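A hedged end-to-end sketch for DocumentClustering on a few toy documents. The class itself relies on the usual imports (TfidfVectorizer, KMeans, AgglomerativeClustering, SpectralCoclustering, TruncatedSVD, Normalizer, make_pipeline, PCA, matplotlib.pyplot as plt, numpy as np, defaultdict, operator, time); the documents below are hypothetical.

docs = [
    "the cat sat on the mat",
    "dogs and cats make good pets",
    "the dog chased the cat around the house",
    "stock markets fell sharply today",
    "investors worry about rising interest rates",
    "central banks raised interest rates again",
]
dc = DocumentClustering(k=2)
dc.make_matrix(documents=docs)   # TF-IDF matrix; n_components=-1 skips the LSA step
dc.cluster('k-means')            # alternatives: 'agglo' or 'spectral_cocluster'
dc.print_results()               # prints the top terms of each cluster
# dc.visualize()                 # optional PCA scatter plot of the assignments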