Example #1
    def form_biclusters(self):
        """generates spectral bi-clusters from self.data
        """
        n_clusters = len(np.unique(self.labels))
        print("Generating {} clusters".format(n_clusters))
        # n_jobs was deprecated in scikit-learn 0.23 and has since been removed
        self.bicluster = SpectralCoclustering(n_clusters=n_clusters)
        self.bicluster.fit(self.data)
Example #2
    def spectral(Y, X, dtype=sp.float32):
        # assumes module-level imports: import scipy as sp; import scipy.sparse as smat
        # (sp.float32 is the legacy scipy alias of np.float32)
        from sklearn.cluster import SpectralCoclustering

        def scale_normalize(X):
            " from https://github.com/scikit-learn/scikit-learn/blob/b194674c4/sklearn/cluster/_bicluster.py#L108"
            row_diag = sp.asarray(sp.sqrt(X.sum(axis=1))).squeeze()
            col_diag = sp.asarray(sp.sqrt(X.sum(axis=0))).squeeze()
            row_diag[row_diag == 0] = 1.0
            col_diag[col_diag == 0] = 1.0
            row_diag = 1.0 / row_diag
            col_diag = 1.0 / col_diag
            if smat.issparse(X):
                n_rows, n_cols = X.shape
                r = smat.dia_matrix((row_diag, [0]), shape=(n_rows, n_rows))
                c = smat.dia_matrix((col_diag, [0]), shape=(n_cols, n_cols))
                an = r * X * c
            else:
                an = row_diag[:, sp.newaxis] * X * col_diag
            return an, row_diag, col_diag

        coclustering = SpectralCoclustering(n_clusters=16384, random_state=1)
        normalized_data, row_diag, col_diag = scale_normalize(Y.T)
        # Dhillon's algorithm keeps 1 + ceil(log2(k)) singular vectors; _svd is
        # sklearn's private helper, and the first (trivial) vector is discarded
        n_sv = 1 + int(sp.ceil(sp.log2(coclustering.n_clusters)))
        u, v = coclustering._svd(normalized_data, n_sv, n_discard=1)
        label_embedding = smat.csr_matrix(u, dtype=dtype)
        return label_embedding
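For intuition, the normalization above is the D_r^{-1/2} X D_c^{-1/2} rescaling from Dhillon's co-clustering algorithm. A self-contained sketch of the same computation on a tiny dense matrix (toy values, illustrative names only):

import numpy as np

X = np.array([[2.0, 0.0, 1.0],
              [0.0, 3.0, 1.0]])
row_diag = 1.0 / np.sqrt(X.sum(axis=1))  # diagonal of D_r^{-1/2}
col_diag = 1.0 / np.sqrt(X.sum(axis=0))  # diagonal of D_c^{-1/2}
an = row_diag[:, np.newaxis] * X * col_diag  # same as the dense branch of scale_normalize
print(an)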
Example #3
def test_spectral_coclustering():
    # Test Dhillon's Spectral CoClustering on a simple problem.
    param_grid = {
        "svd_method": ["randomized", "arpack"],
        "n_svd_vecs": [None, 20],
        "mini_batch": [False, True],
        "init": ["k-means++"],
        "n_init": [10],
    }
    random_state = 0
    S, rows, cols = make_biclusters((30, 30),
                                    3,
                                    noise=0.5,
                                    random_state=random_state)
    S -= S.min()  # needs to be nonnegative before making it sparse
    S = np.where(S < 1, 0, S)  # threshold some values
    for mat in (S, csr_matrix(S)):
        for kwargs in ParameterGrid(param_grid):
            model = SpectralCoclustering(n_clusters=3,
                                         random_state=random_state,
                                         **kwargs)
            model.fit(mat)

            assert model.rows_.shape == (3, 30)
            assert_array_equal(model.rows_.sum(axis=0), np.ones(30))
            assert_array_equal(model.columns_.sum(axis=0), np.ones(30))
            assert consensus_score(model.biclusters_, (rows, cols)) == 1

            _test_shape_indices(model)
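ParameterGrid in the loop above simply enumerates the cartesian product of the option lists. A minimal illustration with a subset of the grid (values chosen here for brevity):

from sklearn.model_selection import ParameterGrid

grid = ParameterGrid({"svd_method": ["randomized", "arpack"],
                      "mini_batch": [False, True]})
for kwargs in grid:
    print(kwargs)  # 2 x 2 = 4 combinations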
Example #4
def cluster(y, X, n_clusters):
    if n_clusters != 1:
        y_ = onp.array(y, dtype='float64')
        # mask = onp.clip((onp.diff(y, n=1, axis=0) == 0).argmin(axis=0), a_max=2 * (y.shape[0] // 3), a_min=0)
        # for i in range(mask.size):
        #     y_[range(mask[i]), i] = np.nan
        corr = pd.DataFrame(y_).corr(method='kendall')
        model = SpectralCoclustering(n_clusters=n_clusters)
        model.fit(corr)
        clusters = [model.get_indices(i)[0] for i in range(n_clusters)]

        def fn_by_cluster(x, fn, weights=None):
            if weights is not None:
                return np.concatenate([fn(x[..., rng], axis=-1, weights=weights[i])[..., np.newaxis]
                                       for i, rng in enumerate(clusters)], axis=-1)
            else:
                return np.concatenate([fn(x[..., rng], axis=-1)[..., np.newaxis]
                                       for i, rng in enumerate(clusters)], axis=-1)

        fn_market_share = lambda x: [np.sum(x[..., rng], axis=0) / np.sum(x[..., rng]) for rng in clusters]

        y_ = fn_by_cluster(y, np.sum)
        # Compute within-group market shares
        y_weights = fn_market_share(y)
        X_ = fn_by_cluster(X, np.average, y_weights)
        return y_, X_, clusters
    else:
        return np.sum(y, axis=-1)[..., np.newaxis], np.mean(X, axis=-1)[..., np.newaxis], 1
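A hypothetical driver for this function, treating both np and onp as plain NumPy and building two groups of correlated columns so that the Kendall correlation matrix fed to SpectralCoclustering is essentially nonnegative (all data and sizes here are toy stand-ins):

import numpy as onp
import numpy as np
import pandas as pd
from sklearn.cluster import SpectralCoclustering

rng = onp.random.RandomState(0)
T = 50
base = rng.rand(T, 2)
y = np.column_stack([base[:, i // 3] for i in range(6)]) + 0.05 * rng.rand(T, 6)
X = rng.rand(T, 6)  # e.g. per-product covariates
y_, X_, clusters = cluster(y, X, n_clusters=2)
print(y_.shape, X_.shape, [len(c) for c in clusters])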
Example #5
    def cluster(self, cluster_name):
        self.name = cluster_name.strip()
        print('cluster_name ' + self.name)
        if self.name == 'k-means':
            print('cluster_name: ' + self.name)
            self.clustering = KMeans(n_clusters=self.k, init='k-means++', max_iter=500, n_init=1)
            print("Clustering sparse data with %s" % self.clustering)
            t0 = time()
            self.clustering.fit(self.X)
            print("done in %0.3fs" % (time() - t0))
            print()
        elif self.name == 'agglo':
            self.clustering = AgglomerativeClustering(n_clusters=self.k, affinity='euclidean', memory=None,
                                                      connectivity=None,
                                                      compute_full_tree='auto',
                                                      linkage='ward',
                                                      distance_threshold=None)

            print("Clustering sparse data with %s" % self.clustering)
            t0 = time()

            #to make dense matrix
            self.X = self.X.toarray()
            self.clustering.fit(self.X)
            print("done in %0.3fs" % (time() - t0))
            print()
        elif self.name == 'spectral_cocluster':
            self.clustering = SpectralCoclustering(n_clusters=self.k,svd_method='arpack', random_state=0)
            print("Clustering sparse data with %s" % self.clustering)
            t0 = time()

            self.clustering.fit(self.X)
            print("done in %0.3fs" % (time() - t0))
            print()
Example #6
class Spectral(object):
    def __init__(self, dataset):
        """initialize spectral class
        
        Arguments:
            dataset {np.array} -- dataset to fetch
        """
        data_map = {'classic3': 1, 'cstr': 3, 'mnist': 2}
        self.dataset = dataset
        print("Fetching ", dataset)
        self.data, self.labels = get_data_set(data_map[dataset])
        if (~self.data.any(axis=0)).any():
            print("Found empty features. deleting...")
            self.data = np.delete(self.data,
                                  np.where(~self.data.any(axis=0))[0],
                                  axis=1)

    def view_dataset(self, title, data, markersize=0.001):
        """plot data matrix
        
        Arguments:
            title {str} -- title of plot
            data {np.array} -- dataset to plot
        
        Keyword Arguments:
            markersize {float} -- size of datapoints (default: {0.001})
        """
        plt.spy(data, markersize=markersize)
        plt.title(title)
        plt.show()

    def shuffle_data(self):
        """shuffles self.data
        """
        print("Shuffling")
        self.data, self.labels = shuffle(self.data, self.labels)
        self.view_dataset(data=self.data, title='shuffled data')

    def form_biclusters(self):
        """generates spectral bi-clusters from self.data
        """
        n_clusters = len(np.unique(self.labels))
        print("Generating {} clusters".format(n_clusters))
        # n_jobs was deprecated in scikit-learn 0.23 and has since been removed
        self.bicluster = SpectralCoclustering(n_clusters=n_clusters)
        self.bicluster.fit(self.data)

    def get_accuracy(self):
        """calculates NMI between self.bicluster rows and data labels
        """
        nmi = normalized_mutual_info_score(self.bicluster.row_labels_,
                                           self.labels)
        print("Accuracy is ", nmi)

    def show_clusters(self):
        """sorts data according to bicluster row and col labels and plots
        """
        fit_data = self.data[np.argsort(self.bicluster.row_labels_)]
        fit_data = fit_data[:, np.argsort(self.bicluster.column_labels_)]
        self.view_dataset(data=fit_data, title='co-clusters')
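A hypothetical driver for the class above, assuming get_data_set and the module-level imports (numpy, matplotlib, scikit-learn utilities) that the methods rely on are available:

spec = Spectral('cstr')
spec.view_dataset(title='original data', data=spec.data)
spec.shuffle_data()        # destroy any existing row/column order
spec.form_biclusters()     # fit SpectralCoclustering
spec.get_accuracy()        # NMI of row labels vs. true labels
spec.show_clusters()       # re-sorted matrix reveals the block structure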
Example #7
def spect(input_data, n):
    spec_instance = SpectralCoclustering(n_clusters=n)
    spec_instance.fit(input_data)
    pred = spec_instance.row_labels_
    print(pred)
    acc_value = accuracy_score(pred, labels)
    print("ACC: " + str(acc_value))
    acc["SPECT" + str(n)] = str(acc_value)
    tsnePlot(pred, n, input_data, 'SPECT')
Example #8
def compute_coclustering(
        fit_data,
        num_clusters=1,
        tol_bicluster=0.005,  # sparsity otherwise annoyingly causes underflows w/ sklearn
):
    if num_clusters == 1:
        num_clusters = min(fit_data.shape[0], 5)
    model = SpectralCoclustering(n_clusters=num_clusters, random_state=0)
    model.fit(fit_data + tol_bicluster)
    ordered_rows = np.argsort(model.row_labels_)
    ordered_cols = np.argsort(model.column_labels_)
    return (ordered_rows, ordered_cols, model.row_labels_[ordered_rows],
            model.column_labels_[ordered_cols])
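A small usage sketch with a toy matrix, showing how the returned orderings re-block the input for display:

import numpy as np

rng = np.random.RandomState(0)
mat = rng.rand(12, 8)
rows, cols, row_labels, col_labels = compute_coclustering(mat, num_clusters=3)
reordered = mat[rows][:, cols]  # rows/columns of the same bicluster are now contiguous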
Example #9
def visualizeCorr(sgp, args):
    sgp.cpu()
    if args.file.split('/')[-2] == 'simulation':
        final_corr = data['corr']
        allX = torch.tensor(data['data']).type(torch.float)
        allIid = data['iid'].reshape(-1)

        plt.figure()
        sns.heatmap(
            final_corr,
            cmap="YlGnBu",
            square=True,
            robust=True,
            xticklabels=False,
            yticklabels=False,
        )

        corr = sgp.deepkernel(allX, allIid).detach().cpu().numpy()
        plt.figure()
        sns.heatmap(
            corr,
            cmap='YlGnBu',
            square=True,
            robust=True,
            xticklabels=False,
            yticklabels=False,
        )
        plt.show()
    else:
        from sklearn.cluster import SpectralCoclustering
        indv_corr = sgp.deepkernel.indv_kernel(torch.arange(
            len(idMap))).detach().cpu().numpy()
        num_c = args.number_cluster
        model = SpectralCoclustering(n_clusters=num_c, random_state=0)
        model.fit(indv_corr)
        fit_data = indv_corr[np.argsort(model.row_labels_)]
        fit_data = fit_data[:, np.argsort(model.row_labels_)]
        rows = np.random.permutation(np.arange(len(fit_data)))
        rows = rows[:3300]
        rows = np.sort(rows)
        clusterRes = model.row_labels_
        cl = np.argsort(clusterRes)
        ax = sns.heatmap(
            indv_corr[cl][:, cl],
            cmap='YlGnBu',
            square=True,
            robust=True,
            xticklabels=False,
            yticklabels=False,
        )
        plt.show()
Example #10
class Cluster:
    def __init__(self, n_clusters, feature_vectors):
        self.n_clusters = n_clusters
        self.feature_vectors = feature_vectors

    def kmeans(self):
        self.model = KMeans(n_clusters=self.n_clusters)

    def agglomerative(self, linkage, affinity):
        self.model = AgglomerativeClustering(
            n_clusters=self.n_clusters, linkage=linkage, affinity=affinity)

    def birch(self):
        #acc is 0.87
        self.model = Birch(n_clusters=self.n_clusters)

    def spectral(self, affinity, n_neighbors=None):
        self.model = SpectralClustering(
            n_clusters=self.n_clusters, affinity=affinity, n_neighbors=n_neighbors)

    def spectral_biclustering(self):
        self.model = SpectralBiclustering(n_clusters=self.n_clusters)

    def spectral_coclustering(self):
        self.model = SpectralCoclustering(n_clusters=self.n_clusters)

    def fit_model(self):
        # fit model and predict
        self.model.fit(self.feature_vectors)
        try:
            self.predicted_labels = self.model.labels_
        except AttributeError:
            # SpectralBiclustering / SpectralCoclustering expose row_labels_ instead
            print(self.model.row_labels_.shape)
            self.predicted_labels = self.model.row_labels_
        except Exception as e:
            print(e)

    def save_result(self, file_path):
        np.savetxt('{}'.format(file_path),
                   self.predicted_labels.astype(int), fmt='%i')

    def goodness(self, true_labels, base_precision, improved_precision, verbose=False):
        self.fit_model()
        # evaluate performance
        normalized_mutual_info = normalized_mutual_info_score(
            true_labels, self.predicted_labels)
        points = (normalized_mutual_info - base_precision) / improved_precision + 1
        if verbose:
            print('current project can get {:d} points'.format(int(points)))
        return normalized_mutual_info
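A minimal sketch of driving the class above (toy feature vectors and labels; the surrounding module is assumed to import numpy, the sklearn estimators, and normalized_mutual_info_score):

import numpy as np

rng = np.random.RandomState(0)
X = np.abs(rng.randn(60, 8))  # nonnegative, as spectral co-clustering expects
true_labels = np.repeat([0, 1, 2], 20)
c = Cluster(n_clusters=3, feature_vectors=X)
c.spectral_coclustering()
print(c.goodness(true_labels, base_precision=0.0, improved_precision=1.0))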
Example #11
def bicluster_correlation_matrix(X, n_clusters=10, figsize=None):
    """
    Group similar variables together by running Spectral coclustering algorithm on a dataset's correlation matrix.
    See https://bit.ly/2QgXZB2 for more details.

    Spectral coclustering finds groups of similar (row, column) subsets where each column can only belong to
    a single bicluster. This is different than "checkerboard" biclustering.

    Parameters
    ------------
    X: {pd.DataFrame} numeric feature data. Shape {observations} x {features}
    n_clusters: {int} number of biclusters to construct
    figsize: {2-tuple of int} pyplot Figure size. Default [10, 10].

    Returns
    ------------
    coclust: {fitted sklearn.cluster.SpectralCoclustering object}
    """

    # -- estimate the correlation matrix from a median-imputed version of the data,
    # -- downsampled to at most 100k rows for speed.
    num_df = X.iloc[np.random.choice(range(X.shape[0])
                                       , size=min(100000, X.shape[0])
                                       , replace=False)]
    cor_mat = num_df.fillna(num_df.median()).corr()

    # -- run coclustering.
    coclust = SpectralCoclustering(n_clusters=n_clusters
                                   , random_state=666)
    coclust.fit(cor_mat)

    # -- re-order correlation matrix by cluster indices.
    biclust_dat = cor_mat.iloc[np.argsort(coclust.row_labels_)]
    biclust_dat = biclust_dat.iloc[:, np.argsort(coclust.column_labels_)]

    # -- display the biclustering pattern.
    fig = plt.figure(figsize=figsize if figsize else [10, 10])
    ax = fig.add_subplot(111)
    ax.matshow(biclust_dat, cmap='cool')
    ax.set_title(f'Correlation matrix post-biclustering: {n_clusters} clusters')

    ax.set_yticks(range(biclust_dat.shape[0]))
    ax.set_yticklabels(biclust_dat.index.tolist())

    plt.show()

    return coclust
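Hypothetical usage, with a random numeric DataFrame standing in for real feature data:

import numpy as np
import pandas as pd

rng = np.random.RandomState(0)
df = pd.DataFrame(rng.normal(size=(500, 12)),
                  columns=['x{}'.format(i) for i in range(12)])
coclust = bicluster_correlation_matrix(df, n_clusters=3)
print(coclust.row_labels_)  # bicluster id assigned to each variable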
Example #12
def rearrange_confusion_matrix(cm, n_clusters):
    from sklearn.cluster import SpectralCoclustering

    clst = SpectralCoclustering(n_clusters=n_clusters).fit(cm)

    idx = []
    for c in range(n_clusters):
        idx.append(clst.get_indices(c)[0])
    idx = np.concatenate(idx)

    cm_clustered = np.zeros(cm.shape, dtype=int)

    for i, idxi in enumerate(idx):
        for j, idxj in enumerate(idx):
            cm_clustered[i,j] = cm[idxi, idxj]

    return cm_clustered, idx
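A sketch with a toy confusion matrix (assumed values), grouping mutually confusable classes into adjacent blocks:

import numpy as np
from sklearn.cluster import SpectralCoclustering

cm = np.array([[50,  2,  1,  0],
               [ 3, 40,  0,  1],
               [ 0,  1, 30, 12],
               [ 1,  0, 10, 25]])
cm_clustered, idx = rearrange_confusion_matrix(cm, n_clusters=2)
print(idx)           # original class indices in block order
print(cm_clustered)  # permuted matrix with near-block-diagonal structure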
Example #13
def cocluster(np_sums, matrix_diags, vectorizer):
    ''' Perform the coclustering '''
    x = np.array(np_sums)
    # print(x)
    n_clusters = 20
    clustering = SpectralCoclustering(n_clusters=n_clusters,
                                      random_state=0).fit(x)

    for i in range(n_clusters):
        row_nums, col_nums = clustering.get_indices(i)
        row_words = [matrix_diags[num] for num in row_nums]
        col_words = [vectorizer.get_feature_names()[num] for num in col_nums]

        print("Cluster: ", i)
        print("===========")
        print("Diagnoses: ", row_words)
        print()
        print("n-grams: ", col_words)
        print()
Example #14
def plot_matrix(all_feature_names_arg,
                mat,
                filename,
                force_no_cocluster=False):
    print(datetime.datetime.now(), 'plot_matrix')
    print('  mat.shape=', mat.shape)
    plt.figure(figsize=(10, 4))

    # set the x-axis to only include the biggest words
    if not args.no_biggest_words:
        l2_norms = np.linalg.norm(mat, axis=0, ord=args.norm)
        indices = l2_norms.argsort()[-args.num_words:]
        mat = mat[:, indices]
        all_feature_names = all_feature_names_arg
        words = [all_feature_names[i] for i in indices]
        plt.xticks(range(0, len(words)), words, rotation=-90)

    # cocluster the axes
    if not args.no_cocluster and not force_no_cocluster:
        clustering = SpectralCoclustering(n_clusters=6,
                                          random_state=1).fit(mat)
        col_indices = np.argsort(clustering.column_labels_)
        mat = mat[:, col_indices]
        try:
            words = [words[i] for i in col_indices]
            plt.xticks(range(0, len(words)), words, rotation=-90)
        except NameError:
            # `words` only exists when the biggest-words block above ran
            pass

    # plot the figure
    plt.imshow(mat,
               aspect='auto',
               cmap='RdBu',
               norm=colors.SymLogNorm(linthresh=0.03,
                                      linscale=0.03,
                                      vmin=-1e6,
                                      vmax=1e6))

    plt.yticks(ticks=[0, 1, 2, 3, 4, 5, 6, 7, 8], labels=model.classes_)
    plt.ylim(-0.5, 8.5)

    plt.colorbar()
    plt.tight_layout()
    plt.savefig(filename)
Example #15
def spectral_clustering_experiment_helper(args):
    dataset, lr, seed, ModelClass, n_clusters = args

    np.random.seed(seed)

    choice_sets, choices, _ = dataset.load_pytorch()

    spec_clusters = SpectralCoclustering(
        n_clusters=n_clusters,
        random_state=seed).fit(choice_sets.squeeze().numpy()).row_labels_
    rand_clusters = np.random.permutation(spec_clusters)

    spec_results = []
    rand_results = []

    n_items = choice_sets.size(1)

    for cluster in sorted(set(spec_clusters)):
        for clusters, results in zip([spec_clusters, rand_clusters],
                                     [spec_results, rand_results]):

            cluster_idx = clusters == cluster
            cluster_choice_sets = choice_sets[cluster_idx]
            cluster_choices = choices[cluster_idx]
            w = torch.ones(len(cluster_choices))

            model = ModelClass(n_items)

            loss = fit(model, (cluster_choice_sets, cluster_choices, w),
                       epochs=EPOCHS,
                       learning_rate=lr,
                       l2_lambda=L2_LAMBDA,
                       show_progress=False)

            results.append((len(cluster_choices), loss, model.state_dict(),
                            model.num_params))

    return args, spec_results, rand_results
Example #16
import numpy as np
import pytest

from sklearn.datasets import make_biclusters
from sklearn.cluster import SpectralBiclustering, SpectralCoclustering


@pytest.mark.parametrize(
    "args",
    [
        # ... earlier parameter combinations truncated in the original snippet ...
        {
            "n_components": 3,
            "n_best": 4
        },
    ],
)
def test_errors(args):
    data = np.arange(25).reshape((5, 5))

    model = SpectralBiclustering(**args)
    with pytest.raises(ValueError):
        model.fit(data)


def test_wrong_shape():
    model = SpectralBiclustering()
    data = np.arange(27).reshape((3, 3, 3))
    with pytest.raises(ValueError):
        model.fit(data)


@pytest.mark.parametrize("est",
                         (SpectralBiclustering(), SpectralCoclustering()))
def test_n_features_in_(est):

    X, _, _ = make_biclusters((3, 3), 3, random_state=0)

    assert not hasattr(est, "n_features_in_")
    est.fit(X)
    assert est.n_features_in_ == 3
Example #17
# opening truncated in the original; reconstructed from the standard
# scikit-learn spectral co-clustering demo that this snippet follows
import numpy as np
from matplotlib import pyplot as plt
from sklearn.datasets import make_biclusters
from sklearn.cluster import SpectralCoclustering
from sklearn.metrics import consensus_score

data, rows, columns = make_biclusters(shape=(300, 300),
                                      n_clusters=5,
                                      noise=5,
                                      shuffle=False,
                                      random_state=0)

plt.matshow(data, cmap=plt.cm.Blues)
plt.title("Original dataset")

# shuffle clusters
rng = np.random.RandomState(0)
row_idx = rng.permutation(data.shape[0])
col_idx = rng.permutation(data.shape[1])
data = data[row_idx][:, col_idx]

plt.matshow(data, cmap=plt.cm.Blues)
plt.title("Shuffled dataset")

model = SpectralCoclustering(n_clusters=5, random_state=0)
model.fit(data)
score = consensus_score(model.biclusters_,
                        (rows[:, row_idx], columns[:, col_idx]))

print("consensus score: {:.3f}".format(score))

fit_data = data[np.argsort(model.row_labels_)]
fit_data = fit_data[:, np.argsort(model.column_labels_)]

plt.matshow(fit_data, cmap=plt.cm.Blues)
plt.title("After biclustering; rearranged to show biclusters")

#plt.show()
Example #18
def spectral_coclustering(tfidf_matrix, n_clusters=100):
    return SpectralCoclustering(n_clusters=n_clusters).fit(tfidf_matrix)
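A minimal sketch with a toy corpus; n_clusters is lowered from the default of 100, which only makes sense for large document collections:

from sklearn.feature_extraction.text import TfidfVectorizer

docs = ["the cat sat on the mat",
        "the dog chased the cat",
        "stocks fell sharply today",
        "markets rallied after the report"]
tfidf = TfidfVectorizer().fit_transform(docs)
model = spectral_coclustering(tfidf, n_clusters=2)
print(model.row_labels_)     # one cluster id per document
print(model.column_labels_)  # one cluster id per vocabulary term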
Example #19
from bokeh.models import HoverTool, ColumnDataSource
from bokeh.plotting import figure, output_file, show
from bokeh.io import output_notebook
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.cluster import SpectralCoclustering

whisky = pd.read_csv('whiskies.txt')
whisky['Region'] = pd.read_csv('regions.txt')
flavors = whisky.iloc[:, 2:14]
corr_flavors = pd.DataFrame.corr(flavors)
corr_whisky = pd.DataFrame.corr(flavors.transpose())

model = SpectralCoclustering(n_clusters=6, random_state=0)
model.fit(corr_whisky)

whisky['Group'] = pd.Series(model.row_labels_, index=whisky.index)
whisky = whisky.iloc[np.argsort(model.row_labels_)]
whisky = whisky.reset_index(drop=True)

correlations = pd.DataFrame.corr(whisky.iloc[:, 2:14].transpose())
correlations = np.array(correlations)

# First, we import a tool to allow text to pop up on a plot when the cursor
# hovers over it.  Also, we import a data structure used to store arguments
# of what to plot in Bokeh.  Finally, we will use numpy for this section as well!

from bokeh.models import HoverTool, ColumnDataSource
import numpy as np
Example #20
predict.tail()

# In[10]:

# concatenate labels to df as a new column
r = pd.concat([data, predict], axis=1)

print(r)
r.tail()

# In[11]:

import numpy as np
from sklearn.cluster import SpectralCoclustering
X = data.to_numpy()
clustering = SpectralCoclustering(n_clusters=5, random_state=0).fit(X)
clustering.row_labels_  #doctest: +SKIP

clustering.column_labels_  #doctest: +SKIP

clustering

# In[12]:

from sklearn.metrics import consensus_score
from matplotlib import pyplot as plt
# shuffle clusters
rng = np.random.RandomState(0)
row_idx = rng.permutation(X.shape[0])
col_idx = rng.permutation(X.shape[1])
Example #21
def test_spectralcoclustering_parameter_validation(params, type_err, err_msg):
    """Check parameters validation in `SpectralBiClustering`"""
    data = np.arange(25).reshape((5, 5))
    model = SpectralCoclustering(**params)
    with pytest.raises(type_err, match=err_msg):
        model.fit(data)
Example #22
# exclude 'comp.os.ms-windows.misc'
categories = [
    'alt.atheism', 'comp.graphics', 'comp.sys.ibm.pc.hardware',
    'comp.sys.mac.hardware', 'comp.windows.x', 'misc.forsale', 'rec.autos',
    'rec.motorcycles', 'rec.sport.baseball', 'rec.sport.hockey', 'sci.crypt',
    'sci.electronics', 'sci.med', 'sci.space', 'soc.religion.christian',
    'talk.politics.guns', 'talk.politics.mideast', 'talk.politics.misc',
    'talk.religion.misc'
]
newsgroups = fetch_20newsgroups(categories=categories)
y_true = newsgroups.target

vectorizer = NumberNormalizingVectorizer(stop_words='english', min_df=5)
cocluster = SpectralCoclustering(n_clusters=len(categories),
                                 svd_method='arpack',
                                 random_state=0)
kmeans = MiniBatchKMeans(n_clusters=len(categories),
                         batch_size=20000,
                         random_state=0)

print("Vectorizing...")
X = vectorizer.fit_transform(newsgroups.data)

print("Coclustering...")
start_time = time()
cocluster.fit(X)
y_cocluster = cocluster.row_labels_
print("Done in {:.2f}s. V-measure: {:.4f}".format(
    time() - start_time, v_measure_score(y_cocluster, y_true)))
Example #23
    def spectral_coclustering(self):
        self.model = SpectralCoclustering(n_clusters=self.n_clusters)

Example #24
categories = [
    'alt.atheism', 'comp.graphics', 'comp.sys.ibm.pc.hardware',
    'comp.sys.mac.hardware', 'comp.windows.x', 'misc.forsale', 'rec.autos',
    'rec.motorcycles', 'rec.sport.baseball', 'rec.sport.hockey', 'sci.crypt',
    'sci.electronics', 'sci.med', 'sci.space', 'soc.religion.christian',
    'talk.politics.guns', 'talk.politics.mideast', 'talk.politics.misc',
    'talk.religion.misc'
]
newsgroups = fetch_20newsgroups(categories=categories)
y_true = newsgroups.target

vectorizer = NumberNormalizingVectorizer(stop_words='english', min_df=5)
cocluster = SpectralCoclustering(n_clusters=len(categories),
                                 svd_method='arpack',
                                 random_state=0)
kmeans = MiniBatchKMeans(n_clusters=len(categories),
                         batch_size=20000,
                         random_state=0)

print("Vectoeizing...")
X = vectorizer.fit_transform(newsgroups.data)

print("Coclustering...")
start_time = time()
cocluster.fit(X)
y_cocluster = cocluster.row_labels_
print("Done in {: .2}s. V-measure: {: .4f}".format(
    time() - start_time, v_measure_score(y_cocluster, y_true)))
Example #25
# Print the grid shape of the genres
print(visGrid.shape)
print(len(Genre_ID_to_name.keys()))

#Code that illustrates the heat map of co-occurring genre of movies
annot_lookup = []
for i in range(len(nr_ids)):
    annot_lookup.append(Genre_ID_to_name[nr_ids[i]])

sns.heatmap(visGrid, xticklabels=annot_lookup, yticklabels=annot_lookup)
plt.title("Heat map of Co-occurring Movie Genres")
plt.show()

#Bi-clustering to show genres that occur together and genres that don't occur together
model = SpectralCoclustering(n_clusters=5)
model.fit(visGrid)

fit_data = visGrid[np.argsort(model.row_labels_)]
fit_data = fit_data[:, np.argsort(model.column_labels_)]

annot_lookup_sorted = []
for i in np.argsort(model.row_labels_):
    annot_lookup_sorted.append(Genre_ID_to_name[nr_ids[i]])

sns.heatmap(fit_data,
            xticklabels=annot_lookup_sorted,
            yticklabels=annot_lookup_sorted,
            annot=False)
plt.title("After biclustering; rearranged to show biclusters")
plt.show()
Example #26
    for item in word[1:]:
        value = 100 * float(item)
        matrix[row_index][column_index] = value
        if value < min_list[column_index]:
            min_list[column_index] = value
        if value > max_list[column_index]:
            max_list[column_index] = value
        if value != 0:
            ave_list[column_index] += 1  # ave_list is reused here to count non-zero entries
        column_index += 1
print(unsta_max)
print("row_num", row_num)

#  run diagonal biclustering and give each gene a cluster label

model = SpectralCoclustering(n_clusters=10, random_state=0)
model.fit(matrix)
for i in range(len(row_dict)):
    print(i, '.', row_list[i], ':', model.row_labels_[i])
print(model.column_labels_)

fit_data = matrix[np.argsort(model.row_labels_)]
fit_data = fit_data[:, np.argsort(model.column_labels_)]

#  random forest on the clinical variables
con_num = len(file9.readline().split()) - 1
print("con_num:", con_num)
lines = file9.readlines()
sam_num = len(lines)
print("sam_num:", sam_num)
x_train = np.empty(shape=(sam_num, con_num), dtype=int)  # np.int was removed in NumPy 1.24
Example #27
class DocumentClustering:
    def __init__(self, k=5):
        self.name = 'k-means'
        self.k = k
        self.X = None
        self.clustering = None
        self.vectorizer = None
        self.dataset_size = 0
        self.doc2vec_matrix = False

    def make_matrix(self,
                    documents=None,
                    n_components=-1,
                    doc2vec_matrix=None):
        if not isinstance(doc2vec_matrix, np.ndarray):
            self.vectorizer = TfidfVectorizer()
            # self.vectorizer = CountVectorizer()
            self.X = self.vectorizer.fit_transform(documents)
            self.dataset_size = len(documents)
        else:
            self.X = doc2vec_matrix
            self.dataset_size = len(doc2vec_matrix)
            self.doc2vec_matrix = True

        if n_components != -1:
            if n_components > len(self.vectorizer.get_feature_names()):
                n_components = len(self.vectorizer.get_feature_names())
            print('n_components ' + str(n_components))
            # Vectorizer results are normalized, which makes KMeans behave as
            # spherical k-means for better results. Since LSA/SVD results are
            # not normalized, we have to redo the normalization.
            print("Performing dimensionality reduction using LSA")
            t0 = time()
            svd = TruncatedSVD(n_components)
            normalizer = Normalizer(copy=False)
            lsa = make_pipeline(svd, normalizer)

            self.X = lsa.fit_transform(self.X)

            print("done in %fs" % (time() - t0))

            explained_variance = svd.explained_variance_ratio_.sum()
            print("Explained variance of the SVD step: {}%".format(
                int(explained_variance * 100)))

            print()

    def cluster(self, cluster_name):
        self.name = cluster_name.strip()
        print('cluster_name ' + self.name)
        if self.name == 'k-means':
            print('cluster_name: ' + self.name)
            self.clustering = KMeans(n_clusters=self.k,
                                     init='k-means++',
                                     max_iter=500,
                                     n_init=1)
            print("Clustering sparse data with %s" % self.clustering)
            t0 = time()
            self.clustering.fit(self.X)
            print("done in %0.3fs" % (time() - t0))
            print()
        elif self.name == 'agglo':
            self.clustering = AgglomerativeClustering(n_clusters=self.k,
                                                      affinity='euclidean',
                                                      memory=None,
                                                      connectivity=None,
                                                      compute_full_tree='auto',
                                                      linkage='ward',
                                                      distance_threshold=None)

            print("Clustering sparse data with %s" % self.clustering)
            t0 = time()

            #to make dense matrix
            if not self.doc2vec_matrix:
                self.X = self.X.toarray()
            self.clustering.fit(self.X)
            print("done in %0.3fs" % (time() - t0))
            print()
        elif self.name == 'spectral_cocluster':
            self.clustering = SpectralCoclustering(n_clusters=self.k,
                                                   svd_method='arpack',
                                                   random_state=0)
            print("Clustering sparse data with %s" % self.clustering)
            t0 = time()

            self.clustering.fit(self.X)
            print("done in %0.3fs" % (time() - t0))
            print()

    def print_results(self):
        # print the clustering result
        print(self.name)
        if self.name == 'k-means':
            cluster_labels = self.clustering.labels_
            clustering_dict = self.clustering.__dict__
            clusters = {}
            for document_id, cluster_label in enumerate(cluster_labels):
                if cluster_label not in clusters:
                    clusters[cluster_label] = []
                clusters[cluster_label].append(document_id)
                print(str(cluster_label) + " -- " + str(document_id))
            order_centroids = self.clustering.cluster_centers_.argsort(
            )[:, ::-1]
            terms = self.vectorizer.get_feature_names()
            for i in range(self.k):
                print("Cluster %d:" % i, end='')
                for ind in order_centroids[i, :10]:
                    print(' %s' % terms[ind], end='')
                print()

        elif self.name == 'agglo':
            cluster_labels = self.clustering.labels_
            clustering_dict = self.clustering.__dict__
            clusters = {}

            for document_id, cluster_label in enumerate(cluster_labels):
                if cluster_label not in clusters:
                    clusters[cluster_label] = []
                clusters[cluster_label].append(document_id)
                #print(str(cluster_label) + " -- " + str(document_id))

            results = self.get_cluster_top_keywords(clusters)
            for _cluster in results:
                key_terms = results[_cluster]
                print("Cluster " + str(_cluster) + " : " +
                      str(len(clusters[_cluster])) + " documents")
                print(key_terms)
            print()

        elif self.name == 'spectral_cocluster':
            target_number = 10
            bicluster_ncuts = list(
                self.bicluster_ncut(i) for i in range(self.k))
            best_idx = np.argsort(bicluster_ncuts)[:target_number]

            feature_names = self.vectorizer.get_feature_names()
            print()
            print("Best biclusters:")
            print("----------------")
            for idx, cluster in enumerate(best_idx):
                n_rows, n_cols = self.clustering.get_shape(cluster)
                cluster_docs, cluster_words = self.clustering.get_indices(
                    cluster)
                if not len(cluster_docs) or not len(cluster_words):
                    continue

                # categories
                counter = defaultdict(int)
                for i in cluster_docs:
                    counter[str(i)] += 1
                cat_string = ", ".join(
                    "{:.0f}% {}".format(float(c) / n_rows * 100, name)
                    for name, c in self.most_common(counter)[:3])

                # words
                out_of_cluster_docs = self.clustering.row_labels_ != cluster
                out_of_cluster_docs = np.where(out_of_cluster_docs)[0]
                word_col = self.X[:, cluster_words]
                word_scores = np.array(word_col[cluster_docs, :].sum(
                    axis=0) - word_col[out_of_cluster_docs, :].sum(axis=0))
                word_scores = word_scores.ravel()
                important_words = list(feature_names[cluster_words[i]]
                                       for i in word_scores.argsort()[:-11:-1])

                print("bicluster {} : {} documents, {} words".format(
                    idx, n_rows, n_cols))
                print("categories   : {}".format(cat_string))
                print("words        : {}\n".format(', '.join(important_words)))

    def bicluster_ncut(self, i):
        rows, cols = self.clustering.get_indices(i)
        if not (np.any(rows) and np.any(cols)):
            import sys
            return sys.float_info.max

        row_complement = np.nonzero(np.logical_not(
            self.clustering.rows_[i]))[0]
        col_complement = np.nonzero(np.logical_not(
            self.clustering.columns_[i]))[0]
        # Note: the following is identical to X[rows[:, np.newaxis],
        # cols].sum() but much faster in scipy <= 0.16
        weight = self.X[rows][:, cols].sum()
        cut = (self.X[row_complement][:, cols].sum() +
               self.X[rows][:, col_complement].sum())

        return cut / weight

    def most_common(self, d):
        """Items of a defaultdict(int) with the highest values.
        """
        return sorted(d.items(), key=operator.itemgetter(1), reverse=True)

    def get_cluster_top_keywords(self, clusters, keywords_per_cluster=10):
        """Shows the top k words for each cluster
        Keyword Arguments:
            keywords_per_cluster {int} -- The k words to show for each cluster (default: {10})
        Returns:
            dict of lists -- Returns a dict of {cluster_id: ['top', 'k', 'words', 'for', 'cluster']}
        """
        terms = self.vectorizer.get_feature_names()
        out = {}
        docs_for_cluster = {}
        # self.clusters = 10 clusters,containing the index of the document_vectors document in that cluster, ex len(self.clusters[6]) == 508
        for cluster in clusters:
            # To flatten/combine all documents into one
            docs_for_cluster[cluster] = np.array(
                [self.X[i] for i in clusters[cluster]])
            # Cluster vectors to feature words
            out[cluster] = np.array(terms)[np.flip(
                np.argsort(docs_for_cluster[cluster]), -1)]
            cluster_shape = out[cluster].shape
            out[cluster] = out[cluster].reshape(
                cluster_shape[0] *
                cluster_shape[1])[:keywords_per_cluster].tolist()

        return out

    def visualize(self):
        # Project the documents to 2-D with PCA and color each point by its
        # assigned cluster label.
        if self.name == 'spectral_cocluster':
            pca_t = None
            if not self.doc2vec_matrix:
                pca_t = PCA().fit_transform(self.X.toarray())
            else:
                pca_t = PCA().fit_transform(self.X)
            #pca_t = PCA().fit_transform(self.X)
            # print(self.clustering.labels_)
            plt.scatter(pca_t[:, 0],
                        pca_t[:, 1],
                        c=self.clustering.row_labels_,
                        cmap='rainbow')
            plt.show()
        elif self.name == 'agglo':
            pca_t = PCA().fit_transform(self.X)
            plt.scatter(pca_t[:, 0],
                        pca_t[:, 1],
                        c=self.clustering.labels_,
                        cmap='rainbow')
            plt.show()
        elif self.name == 'k-means':
            if not self.doc2vec_matrix:
                self.X = self.X.toarray()

            pca_t = PCA().fit_transform(self.X)
            # print(self.clustering.labels_)
            plt.scatter(pca_t[:, 0],
                        pca_t[:, 1],
                        c=self.clustering.labels_,
                        cmap='rainbow')
            plt.show()
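For intuition about bicluster_ncut above, a toy computation of the same cut/weight ratio on a small dense matrix (illustrative values):

import numpy as np

X = np.arange(16, dtype=float).reshape(4, 4)
rows, cols = np.array([0, 1]), np.array([0, 1])
row_complement, col_complement = np.array([2, 3]), np.array([2, 3])
weight = X[rows][:, cols].sum()  # mass inside the bicluster
cut = X[row_complement][:, cols].sum() + X[rows][:, col_complement].sum()
print(cut / weight)  # lower means the bicluster is more self-contained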
Example #28
    def spectral_biclustering(self):
        self.model = SpectralBiclustering(n_clusters=self.n_clusters)
Example #29
'''
print(corr_whisky)

plt.figure(figsize = (10,10))
plt.pcolor(corr_whisky)
plt.axis("tight")
plt.colorbar()
plt.show()
'''


# Spectral co-clustering

from sklearn.cluster import SpectralCoclustering

model = SpectralCoclustering(n_clusters=6, random_state=0)
model.fit(corr_whisky)  # fit on the correlation matrix

# model.rows_ is boolean, with one row per cluster and one column per sample;
# summing across the columns gives the size of each cluster
print(np.sum(model.rows_, axis=1))

# summing across the rows shows how many clusters each sample belongs to (always 1 here)
print(np.sum(model.rows_, axis=0))

# the cluster label assigned to each sample
print(model.row_labels_)


# Comparing the correlation tables
Example #30
def type_consistent_cocluster(topic_word_dict0,
                              ename2embed_bert,
                              n_cluster_min,
                              print_cls=False,
                              save_file=None):
    topic_word_dict = {}
    all_words = []

    for topic in topic_word_dict0:
        topic_word_dict[topic] = []
        for ename in topic_word_dict0[topic]:
            if ename in ename2embed_bert:
                topic_word_dict[topic].append(ename)
                all_words.append(ename)

    topics = list(topic_word_dict0.keys())
    #     print("topics")
    #     print(topics)

    all_children = [x for x in all_words]
    #     all_words.extend([x for x in topics if x in ename2embed_bert])
    all_embed = [ename2embed_bert[x][0] for x in all_words]
    #     print(all_children)

    all_words_and_their_parents = []
    for word in all_words:
        for topic in topic_word_dict:
            if word in topic_word_dict[topic]:
                word0 = (topic, word)
                break
        all_words_and_their_parents.append(word0)


#     print(all_words_and_their_parents)

# Affinity Propagation: cluster the word embeddings into fine-grained groups
    clustering = AffinityPropagation().fit(all_embed)
    n_clusters = max(clustering.labels_) + 1
    clusters = {}
    col_vectors = np.zeros((len(topic_word_dict), n_clusters), dtype=float)
    for i in range(n_clusters):
        clusters[i] = [
            all_words_and_their_parents[x]
            for x in range(len(clustering.labels_))
            if clustering.labels_[x] == i
        ]
        for word0 in clusters[i]:
            word0_col = int(word0[0])
            col_vectors[word0_col, i] = 1
    col_vectors = np.array(col_vectors)
    col_vectors += 0.1 * np.ones((len(topic_word_dict), n_clusters), dtype=int)

    for n_cluster in range(n_cluster_min, n_cluster_min + 10):

        model = SpectralCoclustering(n_clusters=n_cluster, random_state=0)
        model.fit(col_vectors)

        new_topic_word_dict = {}
        coverage_list = []
        for ind in range(n_cluster):
            # print(ind)
            small_matrix = col_vectors[[
                x for x in range(len(model.row_labels_))
                if model.row_labels_[x] == ind
            ]]
            small_matrix = small_matrix[:, [
                x for x in range(len(model.column_labels_))
                if model.column_labels_[x] == ind
            ]]
            coverage_list.append(
                np.sum(small_matrix) / np.sum(np.ones_like(small_matrix)))
        if max(coverage_list) >= 0.7:
            break

    fit_data = col_vectors[np.argsort(model.row_labels_)]
    fit_data = fit_data[:, np.argsort(model.column_labels_)]

    cluster_count = [sum(model.row_labels_ == x) for x in range(n_cluster)]
    # print("row cluster count: ", cluster_count)

    cluster_count = [sum(model.column_labels_ == x) for x in range(n_cluster)]
    # print("column cluster count: ", cluster_count)

    coverage_thre = min(max(coverage_list), 0.4)
    # print('coverage: ',coverage_list)

    for ind in range(n_cluster):
        if coverage_list[ind] < coverage_thre:
            # print("del cluster ",ind)
            continue
        for topic in topic_word_dict:
            if model.row_labels_[int(topic)] == ind:
                new_topic_word_dict[topic] = [
                    x for x in topic_word_dict[topic]
                ]

    return new_topic_word_dict
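The per-bicluster coverage statistic computed in the loop above is simply the mean of the selected submatrix; a one-line check on a toy block:

import numpy as np

block = np.array([[1.1, 0.1],
                  [1.1, 1.1]])
print(np.sum(block) / np.sum(np.ones_like(block)))  # == block.mean() == 0.85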