Example 1
def cluster(y, X, n_clusters):
    """Group the columns of y into n_clusters via spectral co-clustering of
    their Kendall correlation matrix, then aggregate y and X per group."""
    if n_clusters != 1:
        y_ = onp.array(y, dtype='float64')
        # mask = onp.clip((onp.diff(y, n=1, axis=0) == 0).argmin(axis=0),a_max=2 * (y.shape[0] // 3),a_min=0)
        # for i in range(mask.size):
        #     y_[range(mask[i]),i] = np.nan
        corr = pd.DataFrame(y_).corr(method='kendall')
        model = SpectralCoclustering(n_clusters=n_clusters)
        model.fit(corr)
        clusters = [model.get_indices(i)[0] for i in range(n_clusters)]
        def fn_by_cluster(x,fn,weights=None):
            if weights is not None:
                return np.concatenate([fn(x[..., rng], axis=-1, weights=weights[i])[..., np.newaxis]
                                       for i, rng in enumerate(clusters)], axis=-1)
            else:
                return np.concatenate([fn(x[..., rng], axis=-1)[..., np.newaxis]
                                       for i, rng in enumerate(clusters)], axis=-1)

        fn_market_share = lambda x: [np.sum(x[..., rng], axis=0) / np.sum(x[..., rng])
                                     for rng in clusters]

        y_ = fn_by_cluster(y,np.sum)
        #Compute within-group market shares
        y_weights = fn_market_share(y)
        X_ = fn_by_cluster(X,np.average,y_weights)
        return y_, X_, clusters
    else:
        return np.sum(y, axis=-1)[..., np.newaxis], np.mean(X, axis=-1)[..., np.newaxis], 1
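
A minimal usage sketch (not part of the original project): the mixed onp/np names suggest the older JAX convention of `import jax.numpy as np` alongside `import numpy as onp`; binding plain NumPy to both names is enough to exercise the function. Shapes and data below are illustrative assumptions.

import numpy as onp
import numpy as np  # assumption: plain NumPy standing in for jax.numpy
import pandas as pd
from sklearn.cluster import SpectralCoclustering

gen = onp.random.default_rng(0)
y = gen.random((100, 12))  # 100 observations of 12 series
X = gen.random((100, 12))  # matching exogenous data
y_agg, X_agg, groups = cluster(y, X, n_clusters=3)
print(y_agg.shape, X_agg.shape)  # (100, 3) (100, 3)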
Example 2
def test_spectral_coclustering():
    # Test Dhillon's Spectral CoClustering on a simple problem.
    param_grid = {
        "svd_method": ["randomized", "arpack"],
        "n_svd_vecs": [None, 20],
        "mini_batch": [False, True],
        "init": ["k-means++"],
        "n_init": [10],
    }
    random_state = 0
    S, rows, cols = make_biclusters((30, 30),
                                    3,
                                    noise=0.5,
                                    random_state=random_state)
    S -= S.min()  # needs to be nonnegative before making it sparse
    S = np.where(S < 1, 0, S)  # threshold some values
    for mat in (S, csr_matrix(S)):
        for kwargs in ParameterGrid(param_grid):
            model = SpectralCoclustering(n_clusters=3,
                                         random_state=random_state,
                                         **kwargs)
            model.fit(mat)

            assert model.rows_.shape == (3, 30)
            assert_array_equal(model.rows_.sum(axis=0), np.ones(30))
            assert_array_equal(model.columns_.sum(axis=0), np.ones(30))
            assert consensus_score(model.biclusters_, (rows, cols)) == 1

            _test_shape_indices(model)
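
For reference, a hedged guess at the imports this sklearn-style test assumes (`_test_shape_indices` is a private helper defined elsewhere in the same test module):

import numpy as np
from numpy.testing import assert_array_equal
from scipy.sparse import csr_matrix
from sklearn.cluster import SpectralCoclustering
from sklearn.datasets import make_biclusters
from sklearn.metrics import consensus_score
from sklearn.model_selection import ParameterGrid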
Example 3
class Spectral(object):
    def __init__(self, dataset):
        """initialize spectral class
        
        Arguments:
            dataset {np.array} -- dataset to fetch
        """
        data_map = {'classic3': 1, 'cstr': 3, 'mnist': 2}
        self.dataset = dataset
        print("Fetching ", dataset)
        self.data, self.labels = get_data_set(data_map[dataset])
        if (~self.data.any(axis=0)).any():
            print("Found empty features. Deleting...")
            self.data = np.delete(self.data,
                                  np.where(~self.data.any(axis=0))[0],
                                  axis=1)

    def view_dataset(self, title, data, markersize=0.001):
        """plot data matrix
        
        Arguments:
            title {str} -- title of plot
            data {np.array} -- dataset to plot
        
        Keyword Arguments:
            markersize {float} -- size of datapoints (default: {0.001})
        """
        plt.spy(data, markersize=markersize)
        plt.title(title)
        plt.show()

    def shuffle_data(self):
        """shuffles self.data
        """
        print("Shuffling")
        self.data, self.labels = shuffle(self.data, self.labels)
        self.view_dataset(data=self.data, title='shuffled data')

    def form_biclusters(self):
        """generates spectral bi-clusters from self.data
        """
        n_clusters = len(np.unique(self.labels))
        print("Generating {} clusters".format(n_clusters))
        self.bicluster = SpectralCoclustering(n_clusters=n_clusters, n_jobs=-1)
        self.bicluster.fit(self.data)

    def get_accuracy(self):
        """calculates NMI between self.bicluster rows and data labels
        """
        nmi = normalized_mutual_info_score(self.bicluster.row_labels_,
                                           self.labels)
        print("NMI is ", nmi)

    def show_clusters(self):
        """sorts data according to bicluster row and col labels and plots
        """
        fit_data = self.data[np.argsort(self.bicluster.row_labels_)]
        fit_data = fit_data[:, np.argsort(self.bicluster.column_labels_)]
        self.view_dataset(data=fit_data, title='co-clusters')
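
A hedged usage sketch; get_data_set and the dataset names ('classic3', 'cstr', 'mnist') come from the original project:

spectral = Spectral('cstr')
spectral.view_dataset(title='raw data', data=spectral.data)
spectral.shuffle_data()
spectral.form_biclusters()
spectral.get_accuracy()
spectral.show_clusters()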
Example 4
def spect(input_data, n):
    # NB: `labels`, `acc`, and `tsnePlot` are module-level globals in the
    # original script.
    spec_instance = SpectralCoclustering(n_clusters=n)
    spec_instance.fit(input_data)
    pred = spec_instance.row_labels_
    print(pred)
    print("ACC: " + str(accuracy_score(pred, labels)))
    acc["SPECT" + str(n)] = str(accuracy_score(pred, labels))
    tsnePlot(pred, n, input_data, 'SPECT')
def compute_coclustering(
        fit_data,
        num_clusters=1,
        tol_bicluster=0.005,  # sparsity otherwise annoyingly causes underflows w/ sklearn
):
    if num_clusters == 1:
        num_clusters = min(fit_data.shape[0], 5)
    model = SpectralCoclustering(n_clusters=num_clusters, random_state=0)
    model.fit(fit_data + tol_bicluster)
    ordered_rows = np.argsort(model.row_labels_)
    ordered_cols = np.argsort(model.column_labels_)
    return (ordered_rows, ordered_cols, model.row_labels_[ordered_rows],
            model.column_labels_[ordered_cols])
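
A usage sketch for compute_coclustering (data and shapes are illustrative assumptions):

import numpy as np
from sklearn.cluster import SpectralCoclustering

mat = np.abs(np.random.RandomState(0).randn(20, 15))
rows, cols, row_labels, col_labels = compute_coclustering(mat, num_clusters=3)
reordered = mat[rows][:, cols]  # rows/columns grouped so biclusters appear as blocks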
Example 6
class Cluster:
    def __init__(self, n_clusters, feature_vectors):
        self.n_clusters = n_clusters
        self.feature_vectors = feature_vectors

    def kmeans(self):
        self.model = KMeans(n_clusters=self.n_clusters)

    def agglomerative(self, linkage, affinity):
        self.model = AgglomerativeClustering(
            n_clusters=self.n_clusters, linkage=linkage, affinity=affinity)

    def birch(self):
        #acc is 0.87
        self.model = Birch(n_clusters=self.n_clusters)

    def spectral(self, affinity, n_neighbors=None):
        self.model = SpectralClustering(
            n_clusters=self.n_clusters, affinity=affinity, n_neighbors=n_neighbors)

    def spectral_biclustering(self):
        self.model = SpectralBiclustering(n_clusters=self.n_clusters)

    def spectral_coclustering(self):
        self.model = SpectralCoclustering(n_clusters=self.n_clusters)

    def fit_model(self):
        # fit model and predict
        self.model.fit(self.feature_vectors)
        try:
            self.predicted_labels = self.model.labels_
        except AttributeError:
            # spectral_biclustering and Coclustering
            print(self.model.row_labels_.shape)
            self.predicted_labels = self.model.row_labels_
        except Exception as e:
            print(e)

    def save_result(self, file_path):
        np.savetxt('{}'.format(file_path),
                   self.predicted_labels.astype(int), fmt='%i')

    def goodness(self, true_labels, base_precision, improved_precision, verbose=False):
        self.fit_model()
        # evaluate performance
        normalized_mutual_info = normalized_mutual_info_score(
            true_labels, self.predicted_labels)
        points = (normalized_mutual_info-base_precision)/improved_precision + 1
        if verbose:
            print('current project can get {:d} points'.format(int(points)))
        return normalized_mutual_info
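
A hedged usage sketch of the Cluster class (data and output path are illustrative):

import numpy as np

X = np.random.RandomState(0).rand(100, 8)
clusterer = Cluster(n_clusters=4, feature_vectors=X)
clusterer.spectral_coclustering()
clusterer.fit_model()
clusterer.save_result('predicted_labels.txt')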
Example 7
def visualizeCorr(sgp, args):
    # NB: `data` and `idMap` are globals defined elsewhere in the original script.
    sgp.cpu()
    if args.file.split('/')[-2] == 'simulation':
        final_corr = data['corr']
        allX = torch.tensor(data['data']).type(torch.float)
        allIid = data['iid'].reshape(-1)

        plt.figure()
        sns.heatmap(
            final_corr,
            cmap="YlGnBu",
            square=True,
            robust=True,
            xticklabels=False,
            yticklabels=False,
        )

        corr = sgp.deepkernel(allX, allIid).detach().cpu().numpy()
        plt.figure()
        sns.heatmap(
            corr,
            cmap='YlGnBu',
            square=True,
            robust=True,
            xticklabels=False,
            yticklabels=False,
        )
        plt.show()
    else:
        from sklearn.cluster import SpectralCoclustering
        indv_corr = sgp.deepkernel.indv_kernel(torch.arange(
            len(idMap))).detach().cpu().numpy()
        num_c = args.number_cluster
        model = SpectralCoclustering(n_clusters=num_c, random_state=0)
        model.fit(indv_corr)
        # NB: fit_data and the random `rows` subsample below are computed but
        # never used; the heatmap is drawn from indv_corr reordered by `cl`.
        fit_data = indv_corr[np.argsort(model.row_labels_)]
        fit_data = fit_data[:, np.argsort(model.row_labels_)]
        rows = np.random.permutation(np.arange(len(fit_data)))
        rows = rows[:3300]
        rows = np.sort(rows)
        clusterRes = model.row_labels_
        cl = np.argsort(clusterRes)
        ax = sns.heatmap(
            indv_corr[cl][:, cl],
            cmap='YlGnBu',
            square=True,
            robust=True,
            xticklabels=False,
            yticklabels=False,
        )
        plt.show()
Example 8
def bicluster_correlation_matrix(X, n_clusters=10, figsize=None):
    """
    Group similar variables together by running Spectral coclustering algorithm on a dataset's correlation matrix.
    See https://bit.ly/2QgXZB2 for more details.

    Spectral coclustering finds groups of similar (row, column) subsets where each column can only belong to
    a single bicluster. This is different than "checkerboard" biclustering.

    Parameters
    ------------
    X: {pd.DataFrame} numeric feature data. Shape {observations} x {features}
    n_clusters: {int} number of biclusters to construct
    figsize: {2-tuple of int} pyplot Figure size. Default [10, 6].

    Returns
    ------------
    coclust: {fitted sklearn.cluster.SpectralCoclustering object}
    """

    # -- get estimate of correlation matrix using median-imputed version of data,
    # -- and then downsample to at most 100k rows for speed.
    num_df = X.iloc[np.random.choice(range(X.shape[0])
                                       , size=min(100000, X.shape[0])
                                       , replace=False)]
    cor_mat = num_df.fillna(num_df.median()).corr()

    # -- run coclustering.
    coclust = SpectralCoclustering(n_clusters=n_clusters
                                   , random_state=666)
    coclust.fit(cor_mat)

    # -- re-order correlation matrix by cluster indices.
    biclust_dat = cor_mat.iloc[np.argsort(coclust.row_labels_)]
    biclust_dat = biclust_dat.iloc[:, np.argsort(coclust.column_labels_)]

    # -- display biclustering pattern.
    fig = plt.figure(figsize=figsize if figsize else [10, 10])
    ax = fig.add_subplot(111)
    ax.matshow(biclust_dat, cmap='cool')
    ax.set_title(f'Correlation matrix post-biclustering: {n_clusters} clusters')

    ax.set_yticks(range(biclust_dat.shape[0]))
    ax.set_yticklabels(biclust_dat.index.tolist())

    plt.show()

    return coclust
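
A hedged usage sketch (column count and data are illustrative; the function plots the reordered matrix and returns the fitted model):

import numpy as np
import pandas as pd

df = pd.DataFrame(np.random.RandomState(0).randn(500, 12),
                  columns=['x{}'.format(i) for i in range(12)])
coclust = bicluster_correlation_matrix(df, n_clusters=3)
print(coclust.row_labels_)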
Example 9
import numpy as np
import matplotlib.pyplot as plt
from sklearn.cluster import SpectralCoclustering
from sklearn.datasets import make_biclusters
from sklearn.metrics import consensus_score

# Reconstructed opening call (truncated in the source); these are the
# parameters of the standard scikit-learn co-clustering demo.
data, rows, columns = make_biclusters(shape=(300, 300),
                                      n_clusters=5,
                                      noise=5,
                                      shuffle=False,
                                      random_state=0)

plt.matshow(data, cmap=plt.cm.Blues)
plt.title("Original dataset")

# shuffle clusters
rng = np.random.RandomState(0)
row_idx = rng.permutation(data.shape[0])
col_idx = rng.permutation(data.shape[1])
data = data[row_idx][:, col_idx]

plt.matshow(data, cmap=plt.cm.Blues)
plt.title("Shuffled dataset")

model = SpectralCoclustering(n_clusters=5, random_state=0)
model.fit(data)
score = consensus_score(model.biclusters_,
                        (rows[:, row_idx], columns[:, col_idx]))

print("consensus score: {:.3f}".format(score))

fit_data = data[np.argsort(model.row_labels_)]
fit_data = fit_data[:, np.argsort(model.column_labels_)]

plt.matshow(fit_data, cmap=plt.cm.Blues)
plt.title("After biclustering; rearranged to show biclusters")

#plt.show()
Example 10
from bokeh.models import HoverTool, ColumnDataSource
from bokeh.plotting import figure, output_file, show
from bokeh.io import output_notebook
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.cluster import SpectralCoclustering

whisky = pd.read_csv('whiskies.txt')
whisky['Region'] = pd.read_csv('regions.txt')
flavors = whisky.iloc[:, 2:14]
corr_flavors = pd.DataFrame.corr(flavors)
corr_whisky = pd.DataFrame.corr(flavors.transpose())

model = SpectralCoclustering(n_clusters=6, random_state=0)
model.fit(corr_whisky)

whisky['Group'] = pd.Series(model.row_labels_, index=whisky.index)
whisky = whisky.iloc[np.argsort(model.row_labels_)]
whisky = whisky.reset_index(drop=True)

correlations = pd.DataFrame.corr(whisky.iloc[:, 2:14].transpose())
correlations = np.array(correlations)

Example 11
clustering.column_labels_  #doctest: +SKIP

clustering

# In[12]:

from sklearn.metrics import consensus_score
from matplotlib import pyplot as plt
# shuffle clusters (NB: row_idx/col_idx are never applied to X in this excerpt)
rng = np.random.RandomState(0)
row_idx = rng.permutation(X.shape[0])
col_idx = rng.permutation(X.shape[1])

# From here on, our experimental data!
model = SpectralCoclustering(n_clusters=7, random_state=0)
model.fit(X)

# In[19]:

fig = plt.figure(figsize=(40, 55))
ax = fig.add_subplot(1, 1, 1)

fit_data = X[np.argsort(model.row_labels_)]
fit_data = fit_data[:, np.argsort(model.column_labels_)]

#plt.matshow(fit_data, cmap=plt.cm.Blues)
ax.matshow(fit_data, cmap=plt.cm.Blues)

#plt.title("After biclustering")
ax.set_title("After biclustering")
Example 12
print(corr_whisky)

plt.figure(figsize = (10,10))
plt.pcolor(corr_whisky)
plt.axis("tight")
plt.colorbar()
plt.show()
'''


# Spectral co-clustering

from sklearn.cluster import SpectralCoclustering

model = SpectralCoclustering(n_clusters=6, random_state=0)
model.fit(corr_whisky) # Data from the correlation matrix

# Each row of model.rows_ corresponds to a cluster, each column
# to a row of the data matrix
print(np.sum(model.rows_, axis=1))  # we sum across the columns: cluster sizes

# How many clusters each element belongs to
print(np.sum(model.rows_, axis=0))

# Each element is assigned the cluster number at its position
print(model.row_labels_)


# Comparing the correlation tables
whisky['Group'] = pd.Series(model.row_labels_, index = whisky.index)
Example 13
y_true = newsgroups.target

vectorizer = NumberNormalizingVectorizer(stop_words='english', min_df=5)
cocluster = SpectralCoclustering(n_clusters=len(categories),
                                 svd_method='arpack',
                                 random_state=0)
kmeans = MiniBatchKMeans(n_clusters=len(categories),
                         batch_size=20000,
                         random_state=0)

print("Vectorizing...")
X = vectorizer.fit_transform(newsgroups.data)

print("Coclustering...")
start_time = time()
cocluster.fit(X)
y_cocluster = cocluster.row_labels_
print("Done in {:.2f}s. V-measure: {:.4f}".format(
    time() - start_time, v_measure_score(y_cocluster, y_true)))

print("MiniBatchKMeans...")
start_time = time()
y_kmeans = kmeans.fit_predict(X)
print("Done in {:.2f}s. V-measure: {:.4f}".format(
    time() - start_time, v_measure_score(y_kmeans, y_true)))

feature_names = vectorizer.get_feature_names()
document_names = list(newsgroups.target_names[i] for i in newsgroups.target)


def bicluster_ncut(i):
    # Body reconstructed from the identical method in Example 19 (this is the
    # helper from scikit-learn's newsgroups co-clustering demo).
    rows, cols = cocluster.get_indices(i)
    if not (np.any(rows) and np.any(cols)):
        import sys
        return sys.float_info.max
    row_complement = np.nonzero(np.logical_not(cocluster.rows_[i]))[0]
    col_complement = np.nonzero(np.logical_not(cocluster.columns_[i]))[0]
    weight = X[rows][:, cols].sum()
    cut = (X[row_complement][:, cols].sum() +
           X[rows][:, col_complement].sum())
    return cut / weight
Example 14
#Prints out the grid shape of the genres
print(visGrid.shape)
print(len(Genre_ID_to_name.keys()))

#Code that illustrates the heat map of co-occurring genre of movies
annot_lookup = []
for i in range(len(nr_ids)):
    annot_lookup.append(Genre_ID_to_name[nr_ids[i]])

sns.heatmap(visGrid, xticklabels=annot_lookup, yticklabels=annot_lookup)
plt.title("Heat map of Co-occurring Movie Genres")
plt.show()

#Bi-clustering to show genres that occur together and genres that don't occur together
model = SpectralCoclustering(n_clusters=5)
model.fit(visGrid)

fit_data = visGrid[np.argsort(model.row_labels_)]
fit_data = fit_data[:, np.argsort(model.column_labels_)]

annot_lookup_sorted = []
for i in np.argsort(model.row_labels_):
    annot_lookup_sorted.append(Genre_ID_to_name[nr_ids[i]])

sns.heatmap(fit_data,
            xticklabels=annot_lookup_sorted,
            yticklabels=annot_lookup_sorted,
            annot=False)
plt.title("After biclustering; rearranged to show biclusters")
plt.show()
Example 15
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.cluster import SpectralCoclustering
from bokeh.models import HoverTool, ColumnDataSource
from bokeh.plotting import figure, output_file, show

#whisky = pd.read_csv(r'C:/Users/Daria/Documents/PythonScripts/whiskies.txt')
whisky = pd.read_csv(r"whiskies.csv", index_col=0)
whisky["Region"] = pd.read_csv(
    r"C:/Users/Daria/Documents/PythonScripts/regions.txt")
flavors = whisky.iloc[:, 2:14]
correlation_flavors = pd.DataFrame.corr(flavors)
correlation_whisky = pd.DataFrame.corr(flavors.transpose())

model = SpectralCoclustering(n_clusters=6, random_state=0)
model.fit(correlation_whisky)
np.sum(model.rows_, 1)  # number of whiskies in each of the 6 clusters
np.sum(model.rows_, 0)  # does each whisky belong to only 1 cluster?
model.row_labels_  # the cluster each observation belongs to
whisky = whisky.iloc[np.argsort(model.row_labels_)]
whisky = whisky.reset_index(drop=True)
#correlations = pd.DataFrame.corr(whisky.iloc[:,2:14].transpose())
#correlations = np.array(correlations)
corr = pd.DataFrame.corr(whisky.iloc[:, 2:14].transpose())  #DataFrame
correl = np.array(corr)  # convert to NumPy array

plt.figure(figsize=(14, 7))
plt.subplot(121)
plt.pcolor(correlation_whisky)
plt.colorbar()
plt.title('Original')
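
The excerpt stops after the first subplot; a plausible completion (an assumption, mirroring the usual version of this whisky exercise) draws the rearranged correlation matrix alongside the original:

plt.subplot(122)
plt.pcolor(correl)
plt.colorbar()
plt.title('Rearranged')
plt.show()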
Example 16
        value = 100 * float(item)
        matrix[row_index][column_index] = value
        if value < min_list[column_index]:
            min_list[column_index] = value
        if value > max_list[column_index]:
            max_list[column_index] = value
        if value != 0:
            ave_list[column_index] += 1  # ave_list is reused here to count nonzero entries
        column_index += 1
print(unsta_max)
print("row_num", row_num)

#  Run block-diagonal co-clustering and give each gene a cluster label

model = SpectralCoclustering(n_clusters=10, random_state=0)
model.fit(matrix)
for i in range(len(row_dict)):
    print(i, '.', row_list[i], ':', model.row_labels_[i])
print(model.column_labels_)

fit_data = matrix[np.argsort(model.row_labels_)]
fit_data = fit_data[:, np.argsort(model.column_labels_)]

#  Random forest on clinical variables
con_num = len(file9.readline().split()) - 1
print("con_num:", con_num)
lines = file9.readlines()
sam_num = len(lines)
print("sam_num:", sam_num)
x_train = np.empty(shape=(sam_num, con_num), dtype=int)
y_train = np.empty(sam_num, dtype=int)
Example 17
def test_spectralcoclustering_parameter_validation(params, type_err, err_msg):
    """Check parameter validation in `SpectralCoclustering`."""
    data = np.arange(25).reshape((5, 5))
    model = SpectralCoclustering(**params)
    with pytest.raises(type_err, match=err_msg):
        model.fit(data)
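
The params/type_err/err_msg arguments come from a @pytest.mark.parametrize decorator that the excerpt omits; a hedged sketch of what it could look like (the concrete cases are illustrative, not from the source):

import pytest

@pytest.mark.parametrize(
    "params, type_err, err_msg",
    [
        ({"n_clusters": 0}, ValueError, "n_clusters"),          # illustrative case
        ({"svd_method": "unknown"}, ValueError, "svd_method"),  # illustrative case
    ],
)
def test_spectralcoclustering_parameter_validation(params, type_err, err_msg):
    ...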
Example 18
plt.figure(figsize=(10, 10))
plt.pcolor(corrFlavours)
plt.colorbar()
plt.savefig("CorrFlavours.pdf")

corrWhisky = pd.DataFrame.corr(flavours.transpose())
plt.figure(figsize=(10, 10))
plt.pcolor(corrWhisky)
plt.axis("tight")
plt.colorbar()
plt.savefig("CorrWhisky.pdf")

# -- 4.1.4 Clustering Whiskies by Flavour Profile
model = SpectralCoclustering(n_clusters=6, random_state=0)
model.fit(corrWhisky)
# print(model.rows_)  # output -> array with dimensions number of row clusters * number of rows
# ^^ CORRELATION MATRIX ^^
# Each row in this array identifies a cluster, here ranging from 0 to 5
# Each column identifies a row in the correlation matrix, here ranging from 0 to 85
"""
[[False False False False False False False False False False False False
  False False False False False False False  True False False False False
  False False False False False False False False False False False False
  False False False  True False False False False False False False False
  False False False False False False False  True False False False False
  False False False False False False  True False  True False False False
  False False False False False False False False False False False False
  False False]
 [False False False False False  True False False False False False  True
  False  True False False  True False  True False False False False False
  ...]
"""
Example 19
class DocumentClustering:
    def __init__(self, k=5):
        self.name = 'k-means'
        self.k = k
        self.X = None
        self.clustering = None
        self.vectorizer = None
        self.dataset_size = 0
        self.doc2vec_matrix = False

    def make_matrix(self,
                    documents=None,
                    n_components=-1,
                    doc2vec_matrix=None):
        if not isinstance(doc2vec_matrix, np.ndarray):
            self.vectorizer = TfidfVectorizer()
            # self.vectorizer = CountVectorizer()
            self.X = self.vectorizer.fit_transform(documents)
            self.dataset_size = len(documents)
        else:
            self.X = doc2vec_matrix
            self.dataset_size = len(doc2vec_matrix)
            self.doc2vec_matrix = True

        if (n_components != -1):
            if n_components > len(self.vectorizer.get_feature_names()):
                n_components = len(self.vectorizer.get_feature_names())
            print('n_components ' + str(n_components))
            # Vectorizer results are normalized, which makes KMeans behave as
            # spherical k-means for better results. Since LSA/SVD results are
            # not normalized, we have to redo the normalization.
            print("Performing dimensionality reduction using LSA")
            t0 = time()
            svd = TruncatedSVD(n_components)
            normalizer = Normalizer(copy=False)
            lsa = make_pipeline(svd, normalizer)

            self.X = lsa.fit_transform(self.X)

            print("done in %fs" % (time() - t0))

            explained_variance = svd.explained_variance_ratio_.sum()
            print("Explained variance of the SVD step: {}%".format(
                int(explained_variance * 100)))

            print()

    def cluster(self, cluster_name):
        self.name = cluster_name.strip()
        print('cluster_name ' + self.name)
        if self.name == 'k-means':
            print('cluster_name: ' + self.name)
            self.clustering = KMeans(n_clusters=self.k,
                                     init='k-means++',
                                     max_iter=500,
                                     n_init=1)
            print("Clustering sparse data with %s" % self.clustering)
            t0 = time()
            self.clustering.fit(self.X)
            print("done in %0.3fs" % (time() - t0))
            print()
        elif self.name == 'agglo':
            self.clustering = AgglomerativeClustering(n_clusters=self.k,
                                                      affinity='euclidean',
                                                      memory=None,
                                                      connectivity=None,
                                                      compute_full_tree='auto',
                                                      linkage='ward',
                                                      distance_threshold=None)

            print("Clustering sparse data with %s" % self.clustering)
            t0 = time()

            #to make dense matrix
            if not self.doc2vec_matrix:
                self.X = self.X.toarray()
            self.clustering.fit(self.X)
            print("done in %0.3fs" % (time() - t0))
            print()
        elif self.name == 'spectral_cocluster':
            self.clustering = SpectralCoclustering(n_clusters=self.k,
                                                   svd_method='arpack',
                                                   random_state=0)
            print("Clustering sparse data with %s" % self.clustering)
            t0 = time()

            self.clustering.fit(self.X)
            print("done in %0.3fs" % (time() - t0))
            print()

    def print_results(self):
        # print the clustering result
        print(self.name)
        if self.name == 'k-means':
            cluster_labels = self.clustering.labels_
            clustering_dict = self.clustering.__dict__
            clusters = {}
            for document_id, cluster_label in enumerate(cluster_labels):
                if cluster_label not in clusters:
                    clusters[cluster_label] = []
                clusters[cluster_label].append(document_id)
                print(str(cluster_label) + " -- " + str(document_id))
            order_centroids = self.clustering.cluster_centers_.argsort(
            )[:, ::-1]
            terms = self.vectorizer.get_feature_names()
            for i in range(self.k):
                print("Cluster %d:" % i, end='')
                for ind in order_centroids[i, :10]:
                    print(' %s' % terms[ind], end='')
                print()

        elif self.name == 'agglo':
            cluster_labels = self.clustering.labels_
            clustering_dict = self.clustering.__dict__
            clusters = {}

            for document_id, cluster_label in enumerate(cluster_labels):
                if cluster_label not in clusters:
                    clusters[cluster_label] = []
                clusters[cluster_label].append(document_id)
                #print(str(cluster_label) + " -- " + str(document_id))

            results = self.get_cluster_top_keywords(clusters)
            for _cluster in results:
                key_terms = results[_cluster]
                print("Cluster " + str(_cluster) + " : " +
                      str(len(clusters[_cluster])) + " documents")
                print(key_terms)
            print()

        elif self.name == 'spectral_cocluster':
            target_number = 10
            bicluster_ncuts = list(
                self.bicluster_ncut(i) for i in range(self.k))
            best_idx = np.argsort(bicluster_ncuts)[:target_number]

            feature_names = self.vectorizer.get_feature_names()
            print()
            print("Best biclusters:")
            print("----------------")
            for idx, cluster in enumerate(best_idx):
                n_rows, n_cols = self.clustering.get_shape(cluster)
                cluster_docs, cluster_words = self.clustering.get_indices(
                    cluster)
                if not len(cluster_docs) or not len(cluster_words):
                    continue

                # categories (NB: this counts raw document indices, so every
                # key occurs exactly once; sklearn's demo counts category names)
                counter = defaultdict(int)
                for i in cluster_docs:
                    counter[str(i)] += 1
                cat_string = ", ".join(
                    "{:.0f}% {}".format(float(c) / n_rows * 100, name)
                    for name, c in self.most_common(counter)[:3])

                # words
                out_of_cluster_docs = self.clustering.row_labels_ != cluster
                out_of_cluster_docs = np.where(out_of_cluster_docs)[0]
                word_col = self.X[:, cluster_words]
                word_scores = np.array(word_col[cluster_docs, :].sum(
                    axis=0) - word_col[out_of_cluster_docs, :].sum(axis=0))
                word_scores = word_scores.ravel()
                important_words = list(feature_names[cluster_words[i]]
                                       for i in word_scores.argsort()[:-11:-1])

                print("bicluster {} : {} documents, {} words".format(
                    idx, n_rows, n_cols))
                print("categories   : {}".format(cat_string))
                print("words        : {}\n".format(', '.join(important_words)))

    def bicluster_ncut(self, i):
        rows, cols = self.clustering.get_indices(i)
        if not (np.any(rows) and np.any(cols)):
            import sys
            return sys.float_info.max

        row_complement = np.nonzero(np.logical_not(
            self.clustering.rows_[i]))[0]
        col_complement = np.nonzero(np.logical_not(
            self.clustering.columns_[i]))[0]
        # Note: the following is identical to X[rows[:, np.newaxis],
        # cols].sum() but much faster in scipy <= 0.16
        weight = self.X[rows][:, cols].sum()
        cut = (self.X[row_complement][:, cols].sum() +
               self.X[rows][:, col_complement].sum())

        return cut / weight

    def most_common(self, d):
        """Items of a defaultdict(int) with the highest values.
        """
        return sorted(d.items(), key=operator.itemgetter(1), reverse=True)

    def get_cluster_top_keywords(self, clusters, keywords_per_cluster=10):
        """Shows the top k words for each cluster
        Keyword Arguments:
            keywords_per_cluster {int} -- The k words to show for each cluster (default: {10})
        Returns:
            dict of lists -- Returns a dict of {cluster_id: ['top', 'k', 'words', 'for', 'cluster']}
        """
        terms = self.vectorizer.get_feature_names()
        out = {}
        docs_for_cluster = {}
        # self.clusters = 10 clusters,containing the index of the document_vectors document in that cluster, ex len(self.clusters[6]) == 508
        for cluster in clusters:
            # To flatten/combine all documents into one
            docs_for_cluster[cluster] = np.array(
                [self.X[i] for i in clusters[cluster]])
            # Cluster vectors to feature words
            out[cluster] = np.array(terms)[np.flip(
                np.argsort(docs_for_cluster[cluster]), -1)]
            cluster_shape = out[cluster].shape
            out[cluster] = out[cluster].reshape(
                cluster_shape[0] *
                cluster_shape[1])[:keywords_per_cluster].tolist()

        return out

    def visualize(self):
        # The output is a one-dimensional array of N documents corresponding to the clusters
        # assigned to our N data points.
        if self.name == 'spectral_cocluster':
            pca_t = None
            if not self.doc2vec_matrix:
                pca_t = PCA().fit_transform(self.X.toarray())
            else:
                pca_t = PCA().fit_transform(self.X)
            #pca_t = PCA().fit_transform(self.X)
            # print(self.clustering.labels_)
            plt.scatter(pca_t[:, 0],
                        pca_t[:, 1],
                        c=self.clustering.row_labels_,
                        cmap='rainbow')
            plt.show()
        elif self.name == 'agglo':
            pca_t = PCA().fit_transform(self.X)
            plt.scatter(pca_t[:, 0],
                        pca_t[:, 1],
                        c=self.clustering.labels_,
                        cmap='rainbow')
            plt.show()
        elif self.name == 'k-means':
            if not self.doc2vec_matrix:
                self.X = self.X.toarray()

            pca_t = PCA().fit_transform(self.X)
            # print(self.clustering.labels_)
            plt.scatter(pca_t[:, 0],
                        pca_t[:, 1],
                        c=self.clustering.labels_,
                        cmap='rainbow')
            plt.show()
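
A hedged usage sketch (documents are illustrative; the class also assumes module-level imports such as operator, numpy, matplotlib, and the sklearn estimators it instantiates):

documents = ["the cat sat on the mat", "dogs bark loudly",
             "cats purr and sit", "a loud dog barks at cats"]
doc_clustering = DocumentClustering(k=2)
doc_clustering.make_matrix(documents=documents)
doc_clustering.cluster('spectral_cocluster')
doc_clustering.print_results()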
Example 20
def type_consistent_cocluster(topic_word_dict0,
                              ename2embed_bert,
                              n_cluster_min,
                              print_cls=False,
                              save_file=None):
    """Regroup topic words by co-clustering a topic-by-cluster incidence
    matrix (rows: topics, columns: AffinityPropagation word clusters),
    growing n_clusters until a co-cluster block reaches 70% coverage."""
    topic_word_dict = {}
    all_words = []

    for topic in topic_word_dict0:
        topic_word_dict[topic] = []
        for ename in topic_word_dict0[topic]:
            if ename in ename2embed_bert:
                topic_word_dict[topic].append(ename)
                all_words.append(ename)

    topics = list(topic_word_dict0.keys())
    #     print("topics")
    #     print(topics)

    all_children = [x for x in all_words]  # NB: unused later in the function
    #     all_words.extend([x for x in topics if x in ename2embed_bert])
    all_embed = [ename2embed_bert[x][0] for x in all_words]
    #     print(all_children)

    all_words_and_their_parents = []
    for word in all_words:
        for topic in topic_word_dict:
            if word in topic_word_dict[topic]:
                word0 = (topic, word)
                break
        all_words_and_their_parents.append(word0)


    #     print(all_words_and_their_parents)

    # Affinity Propagation (AP) over the word embeddings
    clustering = AffinityPropagation().fit(all_embed)
    n_clusters = max(clustering.labels_) + 1
    clusters = {}
    col_vectors = np.zeros((len(topic_word_dict), n_clusters), dtype=float)
    for i in range(n_clusters):
        clusters[i] = [
            all_words_and_their_parents[x]
            for x in range(len(clustering.labels_))
            if clustering.labels_[x] == i
        ]
        for word0 in clusters[i]:
            word0_col = int(word0[0])
            col_vectors[word0_col, i] = 1
    col_vectors = np.array(col_vectors)
    col_vectors += 0.1 * np.ones((len(topic_word_dict), n_clusters), dtype=int)

    for n_cluster in range(n_cluster_min, n_cluster_min + 10):

        model = SpectralCoclustering(n_clusters=n_cluster, random_state=0)
        model.fit(col_vectors)

        new_topic_word_dict = {}
        coverage_list = []
        for ind in range(n_cluster):
            # print(ind)
            small_matrix = col_vectors[[
                x for x in range(len(model.row_labels_))
                if model.row_labels_[x] == ind
            ]]
            small_matrix = small_matrix[:, [
                x for x in range(len(model.column_labels_))
                if model.column_labels_[x] == ind
            ]]
            coverage_list.append(
                np.sum(small_matrix) / np.sum(np.ones_like(small_matrix)))
        if max(coverage_list) >= 0.7:
            break

    fit_data = col_vectors[np.argsort(model.row_labels_)]
    fit_data = fit_data[:, np.argsort(model.column_labels_)]

    cluster_count = [sum(model.row_labels_ == x) for x in range(n_cluster)]
    # print("row cluster count: ", cluster_count)

    cluster_count = [sum(model.column_labels_ == x) for x in range(n_cluster)]
    # print("column cluster count: ", cluster_count)

    coverage_thre = min(max(coverage_list), 0.4)
    # print('coverage: ',coverage_list)

    for ind in range(n_cluster):
        if coverage_list[ind] < coverage_thre:
            # print("del cluster ",ind)
            continue
        for topic in topic_word_dict:
            if model.row_labels_[int(topic)] == ind:
                new_topic_word_dict[topic] = [
                    x for x in topic_word_dict[topic]
                ]

    return new_topic_word_dict
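
For orientation, a hedged sketch (hypothetical shapes) of the incidence matrix this function co-clusters: rows are topics keyed by numeric strings, columns are AffinityPropagation clusters, and an entry is 1 when any word of that topic landed in that cluster, plus 0.1 smoothing:

import numpy as np

col_vectors = np.array([
    [1, 1, 0, 0],  # topic "0": words fell in AP clusters 0 and 1
    [0, 0, 1, 0],  # topic "1": words fell in AP cluster 2
    [0, 1, 0, 1],  # topic "2": words fell in AP clusters 1 and 3
], dtype=float) + 0.1
# SpectralCoclustering is then refit with growing n_clusters over this matrix
# until one diagonal block reaches the 0.7 coverage threshold.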