Example #1
0
def test_spectral_biclustering():
    """Check the Kluger methods against a noisy checkerboard.

    Every parameter combination of the grid is fitted on a dense and on a
    CSR copy of the same 30x30 checkerboard.  The ``'log'`` method is
    expected to reject sparse input with ``ValueError``; every other
    combination must recover the generated biclusters exactly.
    """
    seed = 0
    grid = {
        'method': ['scale', 'bistochastic', 'log'],
        'svd_method': ['randomized', 'arpack'],
        'n_svd_vecs': [None, 20],
        'mini_batch': [False, True],
        'init': ['k-means++'],
        'n_init': [3],
        'n_jobs': [1],
    }
    S, rows, cols = make_checkerboard((30, 30), 3, noise=0.5,
                                      random_state=seed)
    # Each of the 30 rows/columns must belong to exactly 3 biclusters.
    membership = np.repeat(3, 30)

    for mat in (S, csr_matrix(S)):
        sparse_input = issparse(mat)
        for params in ParameterGrid(grid):
            model = SpectralBiclustering(n_clusters=3,
                                         random_state=seed,
                                         **params)

            if sparse_input and params['method'] == 'log':
                # cannot take log of sparse matrix
                assert_raises(ValueError, model.fit, mat)
                continue

            model.fit(mat)

            assert_equal(model.rows_.shape, (9, 30))
            assert_equal(model.columns_.shape, (9, 30))
            assert_array_equal(model.rows_.sum(axis=0), membership)
            assert_array_equal(model.columns_.sum(axis=0), membership)
            found = model.biclusters_
            assert_equal(consensus_score(found, (rows, cols)), 1)
Example #2
0
def test_spectral_biclustering():
    """Check the Kluger methods against a noisy checkerboard.

    The full parameter grid (with ``n_init=10``) is fitted on a dense and
    on a CSR copy of the same 30x30 checkerboard.  The ``'log'`` method is
    expected to reject sparse input with ``ValueError``; every other
    combination must recover the generated biclusters exactly.
    """
    seed = 0
    grid = {
        'method': ['scale', 'bistochastic', 'log'],
        'svd_method': ['randomized', 'arpack'],
        'n_svd_vecs': [None, 20],
        'mini_batch': [False, True],
        'init': ['k-means++'],
        'n_init': [10],
        'n_jobs': [1],
    }
    S, rows, cols = make_checkerboard((30, 30), 3, noise=0.5,
                                      random_state=seed)
    # Each of the 30 rows/columns must belong to exactly 3 biclusters.
    membership = np.repeat(3, 30)

    for mat in (S, csr_matrix(S)):
        sparse_input = issparse(mat)
        for params in ParameterGrid(grid):
            model = SpectralBiclustering(n_clusters=3,
                                         random_state=seed,
                                         **params)

            if sparse_input and params['method'] == 'log':
                # cannot take log of sparse matrix
                assert_raises(ValueError, model.fit, mat)
                continue

            model.fit(mat)

            assert_equal(model.rows_.shape, (9, 30))
            assert_equal(model.columns_.shape, (9, 30))
            assert_array_equal(model.rows_.sum(axis=0), membership)
            assert_array_equal(model.columns_.sum(axis=0), membership)
            found = model.biclusters_
            assert_equal(consensus_score(found, (rows, cols)), 1)
 def run(self, data):
     """Bicluster ``data`` and return the column (cell) cluster labels.

     The model is built with ``(self.n_gene_classes, self.n_classes)`` as
     its row/column cluster counts.  Only the column labels are returned;
     the row (gene) labels are not part of the result.
     """
     bc = SpectralBiclustering(n_clusters=(self.n_gene_classes,
                                           self.n_classes))
     bc.fit(data)
     return bc.column_labels_
Example #4
0
    def biclustering(matrix, distance, callback=None):
        """Return row and column orderings from the best-scoring biclustering.

        Models with ``n_clusters`` in ``range(2, limit)`` are fitted; for
        each one the matrix is reordered by cluster label and scored with
        ``distance``.  The model with the lowest score wins.

        Parameters
        ----------
        matrix : 2-D array
            Data to bicluster.
        distance : callable
            Called with the reordered matrix; lower return values are better.
        callback : callable, optional
            Called before each fit with a progress fraction that starts at
            0.2 and approaches 1.0.

        Returns
        -------
        tuple of two index arrays
            Row order and column order.  The identity order is returned
            when the smaller matrix dimension is 2 or less.
        """
        if min(matrix.shape) <= 2:
            return np.arange(matrix.shape[0]), np.arange(matrix.shape[1])

        # Start from +inf rather than an arbitrary cap: with a finite cap a
        # data set whose scores all exceed it would leave no model selected.
        best_score = np.inf
        best_model = None

        # find the best biclusters (needs revision)
        limit = max(3, min(matrix.shape) // 2 - 1)
        for i in range(2, limit):
            if callback is not None:
                # float() keeps this a true division on Python 2 as well
                callback(0.2 + (i - 2) / float(limit - 2) * 0.8)
            # perform biclustering
            model = SpectralBiclustering(
                n_clusters=i, method='log', random_state=0)
            model.fit(matrix)
            fit_data = matrix[np.argsort(model.row_labels_)]
            fit_data = fit_data[:, np.argsort(model.column_labels_)]

            # calculate score and save the lowest one; the first model is
            # always kept so a NaN score cannot leave best_model unset
            score = distance(fit_data)
            if best_model is None or score < best_score:
                best_score = score
                best_model = model

        return (np.argsort(best_model.row_labels_),
                np.argsort(best_model.column_labels_))
Example #5
0
    def fit_predict(self, D):
        """Run ConsensusClustering algorithm on data D.

        For every candidate ``k`` in ``self.num_clusters`` a consensus
        matrix is computed and scored with ``self.calc_auc``.  The ``k``
        with the highest score is used to compute the final consensus
        matrix, which is then partitioned with SpectralBiclustering.

        Return partition of input data and consensus matrix for best k.
        """
        # number of samples
        n = D.shape[0]

        # AUC score for each k
        AUC_scores = np.zeros(len(self.num_clusters))
        for i, k in enumerate(self.num_clusters):
            M = self.calc_consensus(n, D, k)
            AUC_scores[i] = self.calc_auc(M)

        # find best number of clusters (k_best); the candidates live in
        # self.num_clusters, the same sequence the scores were computed for
        idx_k_best = np.argmax(AUC_scores)
        k_best = self.num_clusters[idx_k_best]

        # uncomment to see the best k for given input data
        #print("Best number of clusters (k): ", k_best)

        M_k_best = self.calc_consensus(n, D, k_best)

        # partition D into K-best clusters based on M_k_best using
        # SpectralBiclustering
        model = SpectralBiclustering(n_clusters=k_best, method='bistochastic')
        model.fit(M_k_best)
        P = model.row_labels_

        return P, M_k_best
Example #6
0
def create_model(data, n_clusters, method, random):
    """Build a SpectralBiclustering estimator, fit it on ``data``, return it.

    ``random`` is forwarded as the estimator's ``random_state``.
    """
    estimator = SpectralBiclustering(
        random_state=random,
        method=method,
        n_clusters=n_clusters,
    )
    estimator.fit(data)
    return estimator
Example #7
0
def Spectral_BiClustering(M, args):
    '''Perform bipartite (spectral bi-)clustering on a sparse matrix.

    ``args.nClusters`` gives the cluster count and ``args.arpack`` selects
    the ARPACK SVD solver.  The model is fitted on ``M.tocsc()``.  The COO
    copy of ``M`` then has its row/column indices remapped so that members
    of the same cluster are adjacent, and the result is passed to
    ``save_clusters``.

    Returns the fitted model and the reordered COO matrix.
    '''
    # Create model
    model_kwargs = {'n_clusters': args.nClusters}
    if args.arpack:
        model_kwargs['svd_method'] = 'arpack'
    try:
        model = SpectralBiclustering(**model_kwargs)
    except Exception:
        # Keep the hint, but re-raise: continuing without a model would
        # only fail later with an unrelated NameError.
        print('-r 1 may cause problems when svd_method has been set to arpack')
        raise

    print('Running biclustering')
    model.fit(M.tocsc())
    print('Biclustering done')

    # Fit to data: relabel the COO coordinates instead of fancy-indexing
    # the matrix, sending each original index to its rank in the
    # label-sorted order.
    fit_data = M.tocoo()
    fit_data.row = invert_permutation(np.argsort(model.row_labels_))[fit_data.row]
    fit_data.col = invert_permutation(np.argsort(model.column_labels_))[fit_data.col]

    save_clusters(model, fit_data, args, '_BiClustering', 1)

    return model, fit_data
Example #8
0
def plotBicluster(df, n_clusters, col_labels=None):
    """Bicluster ``df``, plot it reordered by cluster, return the reordering.

    Rows and then columns are sorted by their cluster label; the result is
    drawn with ``plotCorrHeatmap`` (``col_labels`` is forwarded to it).
    """
    estimator = SpectralBiclustering(random_state=0, method='log',
                                     n_clusters=n_clusters)
    estimator.fit(df)

    row_order = np.argsort(estimator.row_labels_)
    col_order = np.argsort(estimator.column_labels_)
    fitDf = df.iloc[row_order, :].iloc[:, col_order]

    plotCorrHeatmap(dmat=fitDf, col_labels=col_labels)
    return fitDf
def plotBicluster(df, n_clusters):
    """Bicluster ``df``, plot it reordered by cluster, return the reordering.

    Rows and then columns are sorted by their cluster label; the result is
    drawn with ``plotCorrHeatmap``.
    """
    estimator = SpectralBiclustering(random_state=0, method='log',
                                     n_clusters=n_clusters)
    estimator.fit(df)

    row_order = np.argsort(estimator.row_labels_)
    col_order = np.argsort(estimator.column_labels_)
    fitDf = df.iloc[row_order, :].iloc[:, col_order]

    plotCorrHeatmap(dmat=fitDf)
    return fitDf
Example #10
0
def fi_selection_algo(metadata, settings, X, target_atts_list=None):
    """Cluster the rows of the feature-importance scores and return codes.

    The row cluster count is read from ``settings["selection"]["param"]``;
    the column cluster count is fixed at 2.  The resulting row labels are
    converted with ``labels_to_codes``.
    """
    scores = get_fi_scores(X, target_atts_list, metadata)
    n_row_clusters = int(settings["selection"]["param"])
    estimator = SpectralBiclustering(method="log",
                                     n_clusters=(n_row_clusters, 2))
    estimator.fit(scores)
    return labels_to_codes(estimator.row_labels_, target_atts_list)
Example #11
0
	def fit_data_to_model(self, shapey):
		"""Bicluster ``self.data`` and store the reordered data and labels.

		Sets ``self.fit_data`` (data sorted by row, then column label),
		``self.rowl``, ``self.coll`` and ``self.shapex`` (the ``shapey``
		value used as ``n_clusters``).
		"""
		estimator = SpectralBiclustering(random_state=0, method='log',
			n_clusters=shapey)
		estimator.fit(self.data)
		row_order = np.argsort(estimator.row_labels_)
		col_order = np.argsort(estimator.column_labels_)
		self.fit_data = self.data[row_order]
		self.fit_data = self.fit_data[:, col_order]
		self.rowl, self.coll = estimator.row_labels_, estimator.column_labels_
		self.shapex = shapey
Example #12
0
def spectral(dataset_name, full, preprocessing, mindf, k1, k2, ngram_min,
             ngram_max, start, end, n):
    """Run spectral biclustering on a dataset's TF-IDF matrix and save it.

    With ``full`` true the whole corpus is processed once; otherwise the
    corpus is split into time slices via ``split_data_in_time_slices`` and
    each slice is processed separately.  TF-IDF data and clustering
    results are only (re)computed when the corresponding ``*_exists``
    helper reports them missing.

    ``k1`` and ``k2`` are passed as the ``n_clusters`` pair of the model.
    Nothing is returned; results are written by the ``save_*`` helpers.
    """
    if not spectral_directory_exists(dataset_name):
        create_spectral_directory(dataset_name)
    h, c = obtain_file_name_from_dataset(dataset_name, preprocessing)
    corpus = obtain_full_corpus(h, c)
    if full:
        texts = corpus.text.values
        docnames = corpus.text.index.values
        # Build and store the TF-IDF matrix only if it is not cached yet.
        if not tfidf_exists(dataset_name, preprocessing):
            X, v = create_tfidf(texts, mindf, ngram_min, ngram_max)
            words = v.get_feature_names()
            store_data(dataset_name, preprocessing, X, docnames, words)
        tfidf, documents, terms = load_data(dataset_name, preprocessing)
        if not spectral_exists(get_directory_dataset(dataset_name),
                               dataset_name, preprocessing, mindf, k1, k2,
                               ngram_min, ngram_max):
            # NOTE(review): `start`/`end` shadow the parameters of the same
            # name here; they are only used for timing in this branch.
            start = time.time()
            model = SpectralBiclustering(n_clusters=(k1, k2), random_state=0)
            model.fit(tfidf)
            end = time.time()
            print("Biclustering process takes", int(round(end - start)),
                  "seconds")
            save_clasification(get_directory_dataset(dataset_name),
                               dataset_name, preprocessing, mindf, k1, k2,
                               ngram_min, ngram_max, model)
    else:
        time_corpus = split_data_in_time_slices(corpus, start, end, n)
        # Build and store one TF-IDF matrix per time slice if not cached.
        if not tfidf_periods_exists(dataset_name, preprocessing, start, end,
                                    n):
            os.makedirs(
                get_directory_dataset_periods(dataset_name, preprocessing,
                                              start, end, n))
            for (s, e), corp in time_corpus.items():
                texts = corp.text.values
                docnames = corp.text.index.values
                X, v = create_tfidf(texts, mindf, ngram_min, ngram_max)
                words = v.get_feature_names()
                store_data_periods(dataset_name, preprocessing, start, end, n,
                                   s, e, X, docnames, words)
        # Cluster every slice whose result has not been saved yet.
        for s, e in time_corpus:
            tfidf, documents, terms = load_data_periods(
                dataset_name, preprocessing, start, end, n, s, e)
            if not spectral_periods_exists(dataset_name, preprocessing, mindf,
                                           k1, k2, ngram_min, ngram_max, start,
                                           end, n, s, e):
                st = time.time()
                model = SpectralBiclustering(n_clusters=(k1, k2),
                                             random_state=0)
                model.fit(tfidf)
                ed = time.time()
                print("Biclustering process takes", int(round(ed - st)),
                      "seconds")
                save_clasification_periods(dataset_name, preprocessing, mindf,
                                           k1, k2, ngram_min, ngram_max, model,
                                           start, end, n, s, e)
Example #13
0
 def fit_data_to_model(self, shapey):
     """Bicluster ``self.data`` and store the reordered data and labels.

     Sets ``self.fit_data`` (data sorted by row, then column label),
     ``self.rowl``, ``self.coll`` and ``self.shapex`` (the ``shapey``
     value used as ``n_clusters``).
     """
     estimator = SpectralBiclustering(random_state=0, method='log',
                                      n_clusters=shapey)
     estimator.fit(self.data)
     row_order = np.argsort(estimator.row_labels_)
     col_order = np.argsort(estimator.column_labels_)
     self.fit_data = self.data[row_order]
     self.fit_data = self.fit_data[:, col_order]
     self.rowl, self.coll = estimator.row_labels_, estimator.column_labels_
     self.shapex = shapey
Example #14
0
def SpectralBiCluster(data, n_clusters=(4, 4)):
    """Bicluster ``data`` and show it reordered by cluster label.

    Parameters
    ----------
    data : array-like
        Converted with ``np.array`` before fitting.
    n_clusters : int or (int, int), default (4, 4)
        Forwarded to ``SpectralBiclustering``.  It was previously accepted
        but ignored, so the estimator's own default was used instead.

    The reordered matrix is drawn with ``plt.matshow``; nothing is returned.
    """
    from matplotlib import pyplot as plt
    try:
        from sklearn.cluster import SpectralBiclustering
    except ImportError:
        # fall back to the older module path this code originally used
        from sklearn.cluster.bicluster import SpectralBiclustering
    model = SpectralBiclustering(n_clusters=n_clusters, method='log',
                                 random_state=0)
    data = np.array(data)
    model.fit(data)
    fit_data = data[np.argsort(model.row_labels_)]
    fit_data = fit_data[:, np.argsort(model.column_labels_)]
    plt.matshow(fit_data, cmap=plt.cm.Blues)
Example #15
0
    def get_bicluster(self, data):
        """Return ``data`` with rows and columns reordered by bicluster label.

        The number of clusters equals the number of columns of ``data``.
        Column and row sums are printed first; missing values are replaced
        by 0 for fitting only, so the returned frame keeps them.
        """
        # Biclustering
        n_columns = data.shape[1]
        estimator = SpectralBiclustering(random_state=0, n_clusters=n_columns)
        for axis in (0, 1):
            print(data.sum(axis=axis))
        estimator.fit(data.fillna(0))

        row_order = np.argsort(estimator.row_labels_)
        col_order = np.argsort(estimator.column_labels_)
        return data.iloc[row_order].iloc[:, col_order]
Example #16
0
def plot_biclustering_with_pearson(time_ms, title):
    """Bicluster the z-scored channel data and save the plot as an SVG.

    The module-level ``matrix`` is sliced with ``slice_matrix(matrix,
    time_ms)``, reduced by ``calculate_n_columns``, z-scored, biclustered
    and drawn reordered by cluster label.

    Parameters
    ----------
    time_ms : int
        Slice length passed to ``slice_matrix``; also part of the file name.
    title : str
        Tag inserted into the output file name.
    """
    sliced_matrix = slice_matrix(matrix, time_ms)
    channels_data = calculate_n_columns(sliced_matrix)
    z_score = stats.zscore(channels_data)
    plt.title('Z Score Biclustering Over %i ms' % time_ms)
    spectral_model = SpectralBiclustering()
    spectral_model.fit(z_score)
    fit_data = z_score[np.argsort(spectral_model.row_labels_)]
    fit_data = fit_data[:, np.argsort(spectral_model.column_labels_)]
    plt.matshow(fit_data, cmap=plt.cm.Blues)
    # %s takes the title tag and %i the time span; the arguments used to be
    # passed the other way round, which fails for a string title.
    plt.savefig('z_score_%s_biclustering_all_ts_%i.svg' % (title, time_ms))
Example #17
0
def biclustering(filtered, checked) :
	"""Bicluster ``filtered['data']`` and draw two graphs of the result.

	The first graph shows the data reordered on both axes; the second
	keeps the original x-axis order and reorders the rows only.
	"""
	### over 2
	n_clusters = (2, 2) if len(filtered['data']) >= 2 else (1, 1)

	model = SpectralBiclustering(n_clusters=n_clusters, method='log', random_state=0)
	data = np.asarray(filtered['data'])
	model.fit(data)

	#biclustering
	y_fit_data = data[np.argsort(model.row_labels_)]
	fit_data = y_fit_data[:, np.argsort(model.column_labels_)]

	#set y label: y_label[i] is the position of original row i after sorting
	y = np.argsort(model.row_labels_)
	y_label = [0] * len(y)
	for position, original_index in enumerate(y) :
		y_label[original_index] = position

	#set x label: same inverse mapping for the columns
	x = np.argsort(model.column_labels_)
	x_label = [0] * len(x)
	for position, original_index in enumerate(x) :
		x_label[original_index] = position

	reordered_graph = bd.draw_graph(group1, group2, checked,
		x = x, x_label = x_label,
		y_label = y_label,
		fit_data = fit_data,
		genus_data = filtered['genus'],
		pvalue_label = filtered['pvalue'],
		title = "After biclustering")
	reordered_graph.draw()

	# biclustering of fixed x-axis domain
	fixed_x_graph = bd.draw_graph(group1, group2, checked,
		x_label = list(range(len(group1+group2))),
		y_label = y_label,
		x = list(range(len(group1+group2))),
		fit_data = y_fit_data,
		genus_data = filtered['genus'],
		pvalue_label = filtered['pvalue'],
		title = "After biclustering; fixed x domins")
	fixed_x_graph.draw()
Example #18
0
def spectral_bi_cluster(data, n_clusters, para_jobs=1, random_state=None):
    """Fit a bistochastic SpectralBiclustering model and return its labels.

    ``n_clusters`` must have length 2 (checked with an assertion).
    ``para_jobs`` is forwarded as ``n_jobs``.  Returns the pair
    ``(row_labels, column_labels)``.
    """
    from sklearn.cluster.bicluster import SpectralBiclustering
    n_dims = len(n_clusters)
    assert n_dims == 2, (
        "n_cluster should be a tuple or list that contains 2 integer!")
    estimator = SpectralBiclustering(n_clusters,
                                     method='bistochastic',
                                     n_components=40,
                                     n_best=20,
                                     n_jobs=para_jobs,
                                     random_state=random_state)
    estimator.fit(data)
    return estimator.row_labels_, estimator.column_labels_
def spectral_biclust(E, ngenes=3, nconditions=1,
                     spectral_method="bistochastic", n=6, n_best_ratio=0.5,
                     **kwargs):
    """Bicluster the standardized frame ``E`` into ``Bicluster`` objects.

    ``n`` is used as ``n_components`` and ``n_best`` is derived from it as
    ``int(n * n_best_ratio)``, with a minimum of 1.  The model is fitted
    with ``(nconditions, ngenes)`` as row/column cluster counts.  One
    ``Bicluster`` is built per bicluster from the selected columns and
    index entries of ``E``.  Extra keyword arguments are accepted and
    ignored.
    """
    n_best = max(int(n * n_best_ratio), 1)

    estimator = SpectralBiclustering(
        n_clusters=(nconditions, ngenes),
        method=spectral_method,
        n_components=n,
        n_best=n_best,
    )
    estimator.fit(standardize(E))

    return [
        Bicluster(E.columns[column_mask], E.index[row_mask])
        for column_mask, row_mask in zip(estimator.columns_, estimator.rows_)
    ]
Example #20
0
def plot_biclustering_raw_data(time_ms, t=False):
    """Bicluster the z-scored raw slice of ``matrix`` and show the plot.

    Parameters
    ----------
    time_ms : int
        Slice length passed to ``slice_matrix``.
    t : bool, default False
        When true the sliced matrix is transposed before clustering.
    """
    channels_data = slice_matrix(matrix, time_ms)
    if t:
        # take the transpose of sliced matrix
        channels_data = channels_data.T
    # Formatted explicitly so the output is the same "rows cols" line on
    # both Python 2 and Python 3 (this used to be a print statement).
    print('%d %d' % (len(channels_data), len(channels_data[1])))
    z_score = stats.zscore(channels_data)
    plt.title('Z Score Biclustering Over %i ms' % time_ms)
    spectral_model = SpectralBiclustering()
    spectral_model.fit(z_score)
    fit_data = z_score[np.argsort(spectral_model.row_labels_)]
    fit_data = fit_data[:, np.argsort(spectral_model.column_labels_)]
    plt.matshow(fit_data, cmap=plt.cm.Blues)
    # plt.savefig('z_score_raw_biclustering_all_ts_%i_T_%s.svg' % (time_ms, str(t)))
    plt.show()
Example #21
0
def plot_biclusters_n_intervals(n_intervals=30000):
    """Average ``matrix`` rows over fixed windows, bicluster and save a plot.

    Each of the first 64 rows of the module-level ``matrix`` is reduced to
    the means of consecutive windows of ``n_intervals`` samples.  The
    z-scored result is biclustered and drawn reordered by cluster label.
    """
    channels_data = []
    for row in range(64):
        samples = matrix[row]
        window_means = []
        lo, hi = 0, n_intervals
        # A window is only used while its end lies strictly before the end
        # of the row.
        while hi < len(samples):
            window = samples[lo:hi]
            window_means.append(float(sum(window)) / len(window))
            lo, hi = hi, hi + n_intervals
        channels_data.append(window_means)

    z_score = stats.zscore(np.array(channels_data))
    plt.title('Z Score Biclustering')
    spectral_model = SpectralBiclustering()
    spectral_model.fit(z_score)
    row_order = np.argsort(spectral_model.row_labels_)
    col_order = np.argsort(spectral_model.column_labels_)
    fit_data = z_score[row_order][:, col_order]
    plt.matshow(fit_data, cmap=plt.cm.Blues)
    plt.savefig('z_score_raw_biclustering_all_%is.svg' % (n_intervals / 1000))
Example #22
0
def spectral_cluster(dataframe, n_clusters=(30, 30), show_plots=False):
    """Fit SpectralBiclustering on ``dataframe`` (NaN -> 0.0), return the model.

    With ``show_plots`` true two figures are drawn: the data reordered by
    cluster label, and the outer product of the sorted labels.
    """
    data = dataframe.fillna(0.0).values
    model = SpectralBiclustering(random_state=0, method='log',
                                 n_clusters=n_clusters)
    model.fit(data)

    row_order = np.argsort(model.row_labels_)
    col_order = np.argsort(model.column_labels_)
    fit_data = data[row_order][:, col_order]

    if not show_plots:
        return model

    plt.matshow(fit_data, cmap=plt.cm.Blues)
    plt.title("After biclustering; rearranged to show biclusters")

    checkerboard = np.outer(np.sort(model.row_labels_) + 1,
                            np.sort(model.column_labels_) + 1)
    plt.matshow(checkerboard, cmap=plt.cm.Blues)
    plt.title("Checkerboard structure of rearranged data")

    return model
def test_spectral_biclustering():
    # Test Kluger methods on a checkerboard dataset: starting from a fixed
    # base configuration, one parameter at a time is switched to a
    # non-default value, on both dense and CSR input.
    S, rows, cols = make_checkerboard((30, 30), 3, noise=0.5,
                                      random_state=0)

    non_default_params = {'method': ['scale', 'log'],
                          'svd_method': ['arpack'],
                          'n_svd_vecs': [20],
                          'mini_batch': [True]}
    overrides = [(name, value)
                 for name, values in non_default_params.items()
                 for value in values]
    membership = np.repeat(3, 30)

    for mat in (S, csr_matrix(S)):
        for name, value in overrides:
            model = SpectralBiclustering(
                n_clusters=3,
                n_init=3,
                init='k-means++',
                random_state=0,
            )
            model.set_params(**{name: value})

            if issparse(mat) and model.get_params().get('method') == 'log':
                # cannot take log of sparse matrix
                pytest.raises(ValueError, model.fit, mat)
                continue

            model.fit(mat)

            assert model.rows_.shape == (9, 30)
            assert model.columns_.shape == (9, 30)
            assert_array_equal(model.rows_.sum(axis=0), membership)
            assert_array_equal(model.columns_.sum(axis=0), membership)
            assert consensus_score(model.biclusters_,
                                   (rows, cols)) == 1

            _test_shape_indices(model)
def test_spectral_biclustering():
    # Test Kluger methods on a checkerboard dataset: starting from a fixed
    # base configuration, one parameter at a time is switched to a
    # non-default value, on both dense and CSR input.
    S, rows, cols = make_checkerboard((30, 30), 3, noise=0.5,
                                      random_state=0)

    non_default_params = {'method': ['scale', 'log'],
                          'svd_method': ['arpack'],
                          'n_svd_vecs': [20],
                          'mini_batch': [True]}
    overrides = [(name, value)
                 for name, values in non_default_params.items()
                 for value in values]
    membership = np.repeat(3, 30)

    for mat in (S, csr_matrix(S)):
        for name, value in overrides:
            model = SpectralBiclustering(
                n_clusters=3,
                n_init=3,
                init='k-means++',
                random_state=0,
            )
            model.set_params(**{name: value})

            if issparse(mat) and model.get_params().get('method') == 'log':
                # cannot take log of sparse matrix
                assert_raises(ValueError, model.fit, mat)
                continue

            model.fit(mat)

            assert_equal(model.rows_.shape, (9, 30))
            assert_equal(model.columns_.shape, (9, 30))
            assert_array_equal(model.rows_.sum(axis=0), membership)
            assert_array_equal(model.columns_.sum(axis=0), membership)
            assert_equal(consensus_score(model.biclusters_,
                                         (rows, cols)), 1)

            _test_shape_indices(model)
Example #25
0
def test_perfect_checkerboard():
    """Noise-free checkerboards of three shapes must be recovered exactly.

    The same estimator instance is refitted for every shape.
    """
    model = SpectralBiclustering(3, svd_method="arpack", random_state=0)

    for shape in ((30, 30), (40, 30), (30, 40)):
        S, rows, cols = make_checkerboard(shape, 3, noise=0, random_state=0)
        model.fit(S)
        found = model.biclusters_
        assert_equal(consensus_score(found, (rows, cols)), 1)
def test_perfect_checkerboard():
    """Currently disabled: raises ``SkipTest`` before doing any work.

    The statements after the ``raise`` are unreachable and kept only so
    the check can be re-enabled by removing the skip.
    """
    raise SkipTest("This test is failing on the buildbot, but cannot"
                   " reproduce. Temporarily disabling it until it can be"
                   " reproduced and  fixed.")
    model = SpectralBiclustering(3, svd_method="arpack", random_state=0)

    for shape in ((30, 30), (40, 30), (30, 40)):
        S, rows, cols = make_checkerboard(shape, 3, noise=0, random_state=0)
        model.fit(S)
        found = model.biclusters_
        assert_equal(consensus_score(found, (rows, cols)), 1)
Example #27
0
def test_perfect_checkerboard():
    """Noise-free checkerboards of three shapes must be recovered exactly.

    The same estimator instance is refitted for every shape.
    """
    model = SpectralBiclustering(3, svd_method="arpack", random_state=0)

    for shape in ((30, 30), (40, 30), (30, 40)):
        S, rows, cols = make_checkerboard(shape, 3, noise=0, random_state=0)
        model.fit(S)
        found = model.biclusters_
        assert_equal(consensus_score(found, (rows, cols)), 1)
Example #28
0
def test_perfect_checkerboard():
    """Currently disabled: raises ``SkipTest`` before doing any work.

    The statements after the ``raise`` are unreachable and kept only so
    the check can be re-enabled by removing the skip.
    """
    raise SkipTest("This test is failing on the buildbot, but cannot"
                   " reproduce. Temporarily disabling it until it can be"
                   " reproduced and  fixed.")
    model = SpectralBiclustering(3, svd_method="arpack", random_state=0)

    for shape in ((30, 30), (40, 30), (30, 40)):
        S, rows, cols = make_checkerboard(shape, 3, noise=0, random_state=0)
        model.fit(S)
        found = model.biclusters_
        assert_equal(consensus_score(found, (rows, cols)), 1)
Example #29
0
def test_wrong_shape():
    """Fitting a 3-D array must raise ``ValueError``."""
    cube = np.arange(27).reshape((3, 3, 3))
    estimator = SpectralBiclustering()
    pytest.raises(ValueError, estimator.fit, cube)
Example #30
0
def test_errors(args):
    """Fitting with the constructor arguments in ``args`` must raise ValueError."""
    square = np.arange(25).reshape((5, 5))
    estimator = SpectralBiclustering(**args)
    pytest.raises(ValueError, estimator.fit, square)
Example #31
0
                                        n_clusters=n_clusters,
                                        noise=10,
                                        shuffle=False,
                                        random_state=0)

# Show the generated checkerboard before any shuffling.
plt.matshow(data, cmap=plt.cm.Blues)
plt.title("Original dataset")

# Shuffle rows and columns; the permutations are kept so the ground-truth
# indicators can be permuted the same way when scoring.
data, row_idx, col_idx = sg._shuffle(data, random_state=0)
plt.matshow(data, cmap=plt.cm.Blues)
plt.title("Shuffled dataset")

model = SpectralBiclustering(n_clusters=n_clusters,
                             method='log',
                             random_state=0)
model.fit(data)
score = consensus_score(model.biclusters_,
                        (rows[:, row_idx], columns[:, col_idx]))

# print() with a single argument behaves the same on Python 2 and 3.
print("consensus score: {:.1f}".format(score))

# Reorder rows, then columns, by cluster label.
fit_data = data[np.argsort(model.row_labels_)]
fit_data = fit_data[:, np.argsort(model.column_labels_)]

plt.matshow(fit_data, cmap=plt.cm.Blues)
plt.title("After biclustering; rearranged to show biclusters")

plt.matshow(np.outer(
    np.sort(model.row_labels_) + 1,
    np.sort(model.column_labels_) + 1),
            cmap=plt.cm.Blues)
def Checkerboard_structure(input_path,top_sd,n_clusters,output_path):
    """Bicluster the most variable rows of a tab-separated table.

    The table at ``input_path`` is read with its first column as index and
    missing values are replaced by 0.  Rows are ranked by standard
    deviation and the top ``top_sd`` fraction is kept.  The selection is
    biclustered and three outputs are written into ``output_path``:
    ``fit_data.csv`` (reordered data), ``bicluster.png`` (heat map) and
    ``output.xlsx`` (row and column module assignments).  Module sizes are
    printed; nothing is returned.
    """

    ###input data
    input_dat=pd.read_csv(input_path,index_col=0,sep='\t',comment='#')
    ### get index and sample name
    # get_index = input_dat.index.astype(str)+'_'+input_dat.ix[:,0].astype(str)+'_'+\
    # input_dat.ix[:,1].astype(str)+'_'+input_dat.ix[:,2].astype(str)
    # get_samp_name = input_dat.columns[3:]
    
    pro_dat = input_dat.fillna(0)
    # pro_dat = pro_dat.ix[:,3:]
    # pro_dat.index = get_index
    # pro_dat.columns = get_samp_name
#    pro_dat = 2**pro_dat-1
    
    # Rank rows by standard deviation and keep the most variable ones.
    # NOTE(review): `.ix` has been removed from current pandas; whether the
    # slice below is positional or label-based depends on the index dtype,
    # so confirm the intent before porting it to `.iloc` / `.loc`.
    df_sd = pro_dat.apply(np.std,axis=1)
    df_sd_sort = df_sd.sort_values(ascending = False)
    df_sd_sort_top = df_sd_sort.ix[:int(len(df_sd_sort)*top_sd)]
    pro_dat = pro_dat.ix[df_sd_sort_top.index,:]
    
    # Keep the labels; they are reattached after the NumPy round trip.
    sd_index = pro_dat.index
    sd_sample_names = pro_dat.columns
    
#    plt.matshow(common_data3, cmap=plt.cm.Blues)
#    plt.title("Original dataset")
    
    model = SpectralBiclustering(n_clusters=n_clusters, method='log',
                                 random_state=0)
    model.fit(pro_dat)
    
    # Reorder rows, then columns, by cluster label.
    pro_dat = np.array(pro_dat)
    fit_data = pro_dat[np.argsort(model.row_labels_)]
    fit_data = fit_data[:, np.argsort(model.column_labels_)]
    
    ### output image
#    plt.figure(figsize=(12,12))
#    plt.matshow(fit_data, cmap=plt.cm.Blues)
#    plt.title("After biclustering; rearranged to show biclusters")
#    out_img_path = os.path.join(os.path.split(input_path)[0],'bicluster.png')
#    plt.savefig(out_img_path)
    
    ### output the model fitting data
    fit_data = pd.DataFrame(fit_data)
    fit_data.index = sd_index[np.argsort(model.row_labels_)]
    fit_data.columns = sd_sample_names[np.argsort(model.column_labels_)]
    
    out_fit_data_path = os.path.join(output_path,'fit_data.csv')
    fit_data.to_csv(out_fit_data_path)
    
    ### output image
    fig = plt.figure(figsize=(20,40))
    ax = fig.add_subplot(111)
    ax.matshow(fit_data, cmap=plt.cm.Blues)
    ax.set_title("After biclustering; rearranged to show biclusters")
    out_img_path = os.path.join(output_path,'bicluster.png')
    fig.savefig(out_img_path)
    
    ### output module
    # a11/b11: labels per row/column; c11/c22: size of each module;
    # d11/d22: sorted labels indexed by the reordered row/column names.
    a11 = pd.Series(model.row_labels_)
    b11 = pd.Series(model.column_labels_)
    c11 = a11.groupby(a11).size()
    c22 = b11.groupby(b11).size()
    d11 = pd.DataFrame(a11.sort_values().values,fit_data.index.values)
    d22 = pd.DataFrame(b11.sort_values().values,fit_data.columns.values)
    d11.columns = ['cpg_module']
    d22.columns = ['sample_module']
    
    # NOTE(review): `ExcelWriter.save()` is not available in recent pandas
    # releases -- confirm the targeted pandas version.
    out_module_path = os.path.join(output_path,'output.xlsx')
    writer = pd.ExcelWriter(out_module_path)
    d11.to_excel(writer,'Sheet1')
    d22.to_excel(writer,'Sheet2')
    writer.save()
   # 
    print("\n")
    print("cpg module:")
    print(c11)
    print("\n")
    print("sample module:")
    print(c22)
Example #33
0
    fit_data = z_score[np.argsort(spectral_model.row_labels_)]
    fit_data = fit_data[:, np.argsort(spectral_model.column_labels_)]
    plt.matshow(fit_data, cmap=plt.cm.Blues)
    plt.savefig('z_score_raw_biclustering_all_%is.svg' % (n_intervals / 1000))


def dump_raw_z_scores():
    """Z-score the module-level ``matrix`` and dump it to 'raw_z_npdump.dump'."""
    z_scores = stats.zscore(np.array(matrix))
    np.array(z_scores).dump('raw_z_npdump.dump')


if __name__ == '__main__':
    # Merge conflict resolved in favour of the HEAD side.  The other side
    # only called dump_raw_z_scores() and then printed `z_score`, a name
    # that is not defined at module level.
    #
    # 'raw_z_npdump.dump' is the file written by dump_raw_z_scores(); run
    # that function once beforehand if the file does not exist yet.
    # NOTE(review): recent NumPy versions need allow_pickle=True in
    # np.load to read a file written by ndarray.dump -- confirm the
    # targeted NumPy version.
    z_scores = np.load('raw_z_npdump.dump')
    plt.title('Z Score Biclustering')
    spectral_model = SpectralBiclustering()
    spectral_model.fit(z_scores)
    # Reorder rows, then columns, by cluster label before plotting.
    fit_data = z_scores[np.argsort(spectral_model.row_labels_)]
    fit_data = fit_data[:, np.argsort(spectral_model.column_labels_)]
    plt.matshow(fit_data, cmap=plt.cm.Blues)
    plt.savefig('z_score_bicluster.svg')
Example #34
0
# Generate an 18x18 checkerboard with a 2x2 bicluster layout.
n_clusters = (2, 2)
data, rows, columns = make_checkerboard(
    shape=(18, 18), n_clusters=n_clusters, noise=10,
    shuffle=False, random_state=0)

plt.matshow(data, cmap=plt.cm.Blues)
plt.title("Original dataset")

# NOTE(review): `data4` is not defined in this snippet.  The shuffled copy
# is stored in `data`, but every later step (plot, fit, reordering) uses
# the unshuffled `data4`, while the score below compares against indicators
# permuted with row_idx/col_idx -- confirm which array is intended.
data, row_idx, col_idx = sg._shuffle(data4, random_state=0)
plt.matshow(data4, cmap=plt.cm.Blues)
plt.title("Shuffled dataset")

model = SpectralBiclustering(n_clusters=n_clusters, method='log',
                             random_state=0)
model.fit(data4)
score = consensus_score(model.biclusters_,(rows[:, row_idx], columns[:, col_idx]))

print("consensus score: {:.1f}".format(score))

# Reorder rows, then columns, by cluster label (only used by the
# commented-out plot below).
fit_data = data4[np.argsort(model.row_labels_)]
fit_data = fit_data[:, np.argsort(model.column_labels_)]

#plt.matshow(fit_data, cmap=plt.cm.Blues)
#plt.title("After biclustering; rearranged to show biclusters")

# Outer product of the sorted labels (+1 so label 0 stays visible).
plt.matshow(np.outer(np.sort(model.row_labels_) + 1,
                     np.sort(model.column_labels_) + 1),
            cmap=plt.cm.Blues)
plt.title("Co-clustering of Coordinated Attack Matrix")
Example #35
0
#         train_features[user]['average_set_score'] = sum_set_scores / float(num_sets)
#         # s average score
#         sum_s_scores = 0
#         for i in range(0, num_sets):
#             sum_s_scores += grades_rowdict[key]['s' + str(i)]
#         train_features[user]['average_s_score'] = sum_set_scores / float(num_sets)
#         # rest of the features
#         train_features[user]['course_score'] = grades_rowdict[key]['course']
#         train_features[user]['final_exam_score'] = grades_rowdict[key]['final']
#         train_features[user]['hw_score'] = grades_rowdict[key]['hw']
#         train_features[user]['letter'] = grades_rowdict[key]['letter']
#         train_features[user]['demerit'] = grades_rowdict[key]['demerit']
#     else:
#         pass

# MACHINE LEARNING CLUSTERING
# Fits a 5-cluster k-means model and a 5-cluster spectral biclustering
# model on `train_features`, which is built earlier in this script.
import numpy as np
from sklearn.cluster import KMeans, DBSCAN
kmeans = KMeans(init='k-means++', n_clusters=5, n_init=10)
kmeans.fit(train_features)

# NOTE(review): `sklearn.cluster.bicluster` is not importable in recent
# scikit-learn releases; the class is exposed from `sklearn.cluster`.
from sklearn.cluster.bicluster import SpectralBiclustering
model = SpectralBiclustering(n_clusters=5, method='log', random_state=0)
model.fit(train_features)

# Bare expressions: they only display a value in an interactive session.
train_features.loc['bf7aa87b-444a-4eff-9f81-b4078e6dccd3']

model.row_labels_


Example #36
0
# Spectral biclustering demo on a synthetic 300x300 checkerboard with
# 4 row clusters and 3 column clusters.
n_clusters = (4, 3)
data, rows, columns = make_checkerboard(shape=(300, 300),
                                        n_clusters=n_clusters,
                                        noise=10,
                                        shuffle=False,
                                        random_state=0)

blues = plt.cm.Blues

plt.matshow(data, cmap=blues)
plt.title("Original dataset")

# Shuffle the matrix and keep the row/column index arrays so the ground truth
# can be re-indexed the same way when scoring.
data, row_idx, col_idx = sg._shuffle(data, random_state=0)
plt.matshow(data, cmap=blues)
plt.title("Shuffled dataset")

model = SpectralBiclustering(n_clusters=n_clusters,
                             method='log',
                             random_state=0)
model.fit(data)

# Ground-truth bicluster indicators, expressed in the shuffled ordering.
shuffled_truth = (rows[:, row_idx], columns[:, col_idx])
score = consensus_score(model.biclusters_, shuffled_truth)

print("consensus score: {:.1f}".format(score))

# Sort rows, then columns, by cluster label so each bicluster is contiguous.
row_order = np.argsort(model.row_labels_)
col_order = np.argsort(model.column_labels_)
fit_data = data[row_order]
fit_data = fit_data[:, col_order]

plt.matshow(fit_data, cmap=blues)
plt.title("After biclustering; rearranged to show biclusters")

# Outer product of the sorted 1-based labels: one constant block per
# (row cluster, column cluster) pair.
sorted_row_labels = np.sort(model.row_labels_) + 1
sorted_col_labels = np.sort(model.column_labels_) + 1
plt.matshow(np.outer(sorted_row_labels, sorted_col_labels), cmap=blues)
plt.title("Checkerboard structure of rearranged data")
Example #37
0
    except:
        print('FAYOL!')
    media_id_num = picsdict[media_id]
    m[media_id_num, usersdict[db[media_id][3]]] = True
    for user in temp:
        try:
            m[media_id_num, usersdict[user.username]] = True
        except:
#            print(':3 ', user.username)
            other_users.add(user.username)


import pickle

# Round-trip the matrix through a protocol-2 pickle on disk. Context managers
# guarantee both file handles are closed (the bare open() calls leaked them).
with open("save.p", "wb") as pickle_file:
    pickle.dump(m, pickle_file, protocol=2)
with open("save.p", "rb") as pickle_file:
    # Safe only because the file was written two lines above; never unpickle
    # data from an untrusted source.
    m = pickle.load(pickle_file)

import numpy as np
from matplotlib import pyplot as plt
plt.matshow(m, cmap=plt.cm.Blues)

# Import from sklearn.cluster: the sklearn.cluster.bicluster module path was
# deprecated in scikit-learn 0.22 and removed in 0.24.
from sklearn.cluster import SpectralBiclustering
from sklearn.metrics import consensus_score
# NOTE(review): `n_jobs` was deprecated for SpectralBiclustering in
# scikit-learn 0.23 and removed in 1.0; drop it when targeting >= 1.0.
model = SpectralBiclustering(method='bistochastic', n_jobs = -1)
model.fit(m)

# Reorder rows, then columns, by cluster label so biclusters are contiguous.
fit_data = m[np.argsort(model.row_labels_)]
fit_data = fit_data[:, np.argsort(model.column_labels_)]

plt.matshow(fit_data, cmap=plt.cm.Blues)
plt.title("After biclustering; rearranged to show biclusters")
Example #38
0
def plotClusteringDistribution(lenconstraint,Folder_name,Lenrna):
    """Cluster the loaded conditions by pairwise distance and plot the result.

    Steps, in order:

    1. Load ``Dic`` (index -> condition name) and ``B`` with
       ``Load_Probabilities(Folder_name)`` and print every name.
    2. Build the distance matrix ``D = Eucledian_distance(B, Lenrna)``.
    3. Group the entries with ``MiniBatchKMeans`` (8 clusters) and print the
       groups.
    4. Bicluster the distance matrix with ``SpectralBiclustering``, draw the
       original and the reordered heat maps, and save the current figure to
       ``bi_clustering.pdf``.
    5. Print, for every entry, the mean and minimum distance to the others.
    6. Embed the entries in 2-D with MDS on the precomputed, max-normalised
       distances and save the annotated scatter plot to
       ``Euclidean_distance_dot_plot_Matrix.pdf``.

    Parameters
    ----------
    lenconstraint
        No longer used; kept so existing callers keep working. It only sized
        a placeholder array that was overwritten before being read.
    Folder_name
        Forwarded to ``Load_Probabilities``.
    Lenrna
        Forwarded to ``Eucledian_distance``.

    Returns
    -------
    None
        The function works through printed output and two PDF files written
        to the current working directory.
    """
    Dic, B = Load_Probabilities(Folder_name)
    # NOTE(review): `a` is not defined inside this function. It has to be a
    # module-level factory whose product supports .append() (most likely
    # `list`) -- confirm against the rest of the file.
    clusters = defaultdict(a)
    for element in Dic.keys():
        print(Dic[element])

    # Pairwise Euclidean distance between the entries of B.
    D = Eucledian_distance(B, Lenrna)
    data = np.array(D)
    rows = [Dic[element] for element in Dic.keys()]
    columns = [Dic[element] for element in Dic.keys()]

    # Eight clusters were chosen after looking at the spectral biclustering
    # result.
    algorithm = cluster.MiniBatchKMeans(n_clusters=8)
    algorithm.fit(data)
    if hasattr(algorithm, 'labels_'):
        # `np.int` was only an alias of the builtin and is gone from NumPy.
        y_pred = algorithm.labels_.astype(int)
    else:
        y_pred = algorithm.predict(data)
    for i, cluster_id in enumerate(y_pred):
        clusters[cluster_id].append(Dic[i])
    print(clusters)

    model = SpectralBiclustering(random_state=0)
    model.fit(data)
    # Orderings that make rows / columns of the same cluster adjacent.
    row_order = np.argsort(model.row_labels_)
    col_order = np.argsort(model.column_labels_)
    fit_data = data[row_order]
    fit_data = fit_data[:, col_order]

    # Figure 1: distance matrix in its original order (created but never
    # saved or shown here).
    fig = plt.figure()
    ax = fig.add_subplot(111)
    orig = ax.matshow(data, cmap=plt.cm.Blues)
    fig.colorbar(orig)
    ax.set_xticklabels(rows)
    ax.set_xticks(np.arange(len(rows)))  # show every label
    ax.set_yticklabels(columns)
    ax.set_yticks(np.arange(len(columns)))

    # Figure 2: distance matrix reordered by bicluster.
    fig = plt.figure()
    ax = fig.add_subplot(111)
    flatui = ["#66C2A5", "#457d6b", "#365d50", "#263e36", "#17221e"]
    mycmap = ListedColormap(sns.color_palette(flatui).as_hex())

    b2 = ax.matshow(fit_data, cmap=mycmap)
    cbar = fig.colorbar(b2, ticks=[0, 50])
    cbar.set_label(label='Euclidean distance', size=12)
    for font_objects in cbar.ax.yaxis.get_ticklabels():
        font_objects.set_size(20)

    # Names in bicluster order, with the first four characters dropped.
    rows = [Dic[label][4:] for label in row_order]
    columns = [Dic[label][4:] for label in col_order]
    # matshow draws matrix columns along x and matrix rows along y, so the x
    # ticks take the column ordering and the y ticks the row ordering.
    ax.set_xticklabels(columns, rotation=90)
    ax.set_xticks(np.arange(len(columns)))
    ax.set_yticklabels(rows, rotation_mode="anchor")
    ax.set_yticks(np.arange(len(rows)))
    ax.grid(False)
    plt.tick_params(axis='both', which='major', labelsize=13)
    plt.tight_layout()
    # `papertype` and `frameon` were passed with their default value (None)
    # and are rejected by newer Matplotlib, so they are simply omitted.
    plt.savefig("bi_clustering.pdf", format="pdf", dpi=None, facecolor='w',
                edgecolor='w', orientation='portrait', transparent=False,
                bbox_inches=None, pad_inches=0.1, metadata=None)

    # Per entry: index, name, mean and minimum distance to all other entries.
    for i in range(len(B)):
        others = [D[i][j] for j in range(len(B)) if i != j]
        summary = (i, Dic[i], np.mean(others), np.min(others))
        print(" ".join(str(value) for value in summary))
        print('\n')

    # 2-D MDS embedding of the distances, scaled so the largest one is 1.
    adist = np.array(D)
    amax = np.amax(adist)
    adist /= amax
    mds = manifold.MDS(n_components=2, dissimilarity="precomputed",
                       random_state=6)
    coords = mds.fit(adist).embedding_

    fig = plt.figure()
    plt.subplots_adjust(bottom=0.1)
    # s is the marker size, c the marker colour.
    plt.scatter(coords[:, 0], coords[:, 1], marker='o', s=100, c="#66C2A5")

    # NOTE(review): this hard-coded order is assumed to match the row order
    # of `coords`; zip() silently stops at the shorter sequence -- confirm.
    listoriented = ['NaiMg', 'NMIAMgCE', 'BzCNMg', 'Nai', '1M7ILUMg',
                    '1M7ILU3Mg', '1M7ILU3', '1M7', '1M7Mg', 'CMCTMg',
                    'NMIAMg', '1M7ILU', 'DMSMg', 'NMIA']
    for k, (label, x, y) in enumerate(
            zip(listoriented, coords[:, 0], coords[:, 1]), start=1):
        # Alternate label placement between consecutive points: even points
        # keep the default offset, odd points are flipped to the other side.
        if k % 2 == 0:
            state = "right"
            Pos = (4, 6)
            topbottom = 'bottom'
        else:
            state = "left"
            Pos = (4, -4)
            topbottom = 'top'
        # The names in `listoriented` carry no 4-character prefix, so they are
        # used verbatim; slicing with [4:] would cut real names short.
        plt.annotate(
            label,
            xy=(x, y), xytext=Pos,
            textcoords='offset points', ha=state, va=topbottom, fontsize=11,
            bbox=dict(boxstyle='round,pad=0.3', fc="#66C2A5", alpha=0.3))
    plt.xlim(-0.4, 0.4)
    plt.ylim(-0.7, 0.6)
    plt.tick_params(axis='both', which='major', labelsize=11)

    fig.savefig("Euclidean_distance_dot_plot_Matrix.pdf", format="pdf",
                dpi=None, facecolor='w', edgecolor='w',
                orientation='portrait', transparent=False, bbox_inches=None,
                pad_inches=0.1, metadata=None)
Example #39
0
# Scratch comparison of several clustering algorithms on a matrix derived
# from `Gram` (defined earlier in the original script).
n = len(Gram)
# Column vector holding the diagonal of Gram, then repeated across n columns:
# M[i, j] == Gram[i, i].
Di = np.reshape(np.diag(Gram),(n,1))
M = Di.dot(np.ones((1,n)))

# D[i, j] = Gram[i, i] + Gram[j, j] - 2 * Gram[i, j]
# (the squared distance induced by Gram if Gram is an inner-product matrix).
D = M + M.T - 2*Gram

# Estimators under comparison; none of them is given a random_state.
# NOTE(review): the 'precomputed' affinities are fed the distance-like D
# below, while AffinityPropagation documents a similarity input -- confirm
# that this is intended.
C2 = AffinityPropagation(affinity='precomputed')
C1 = KMeans(n_clusters = 5)
C3 = AgglomerativeClustering(n_clusters=5, affinity='precomputed',linkage='average')
C4 = SpectralClustering(n_clusters=5,affinity='precomputed')
C5 = SpectralBiclustering(n_clusters=(5,5))

R1 = C1.fit_predict(D)
R2 = C2.fit_predict(D)
R3 = C3.fit_predict(D)
# presumably the +11 shift keeps every affinity non-negative -- TODO confirm
R4 = C4.fit_predict(Gram +11)
# Unlike R1-R4, R5 holds whatever fit() returns (the fitted estimator in
# scikit-learn), not an array of labels.
R5 = C5.fit(D)

print(R4)

# 2-D t-SNE embedding using D as the precomputed metric.
modèle = TSNE(n_components=2,metric='precomputed')
Trans = modèle.fit_transform(D)

# ACP / trace_ACP are project helpers that are not visible in this fragment.
G_ACP = ACP(Gram,precomputed=True)

trace_ACP(G_ACP,[10]*5)
##

import propre_TSNE as pt

# Single call to the project's reduit_dim helper on two 2x2 matrices; the
# number in the trailing comment looks like a recorded result -- unverified.
r = pt.reduit_dim(np.array([[0,1],[1,0]]),np.array([[0,2],[2,0]]),1)
##0.780023781213

###############################################################################
## Draw  dendrogram
# Ward linkage of `data`, drawn as a dendrogram whose leaves are labelled
# with the entries of `authors`.
Z = linkage(data, method='ward')

plt.figure(figsize=(25, 10))
plt.title('Hierarchical Clustering Dendrogram')
plt.xlabel('sample index')
plt.ylabel('distance')

leaf_labels = np.array(authors)
# leaf_rotation turns the x-axis labels; leaf_font_size sets their size.
dendrogram(Z, labels=leaf_labels, leaf_rotation=90., leaf_font_size=8.)
plt.show()
 

###############################################################################
## Biclustering
# Bicluster `data` into n_authors row clusters and 5 column clusters, then
# show the matrix before and after grouping rows/columns by cluster label.
data = data.astype('float')
bc = SpectralBiclustering(n_clusters=(n_authors,5))
bc.fit(data)
## TODO : sort the rows and columns
row_order = np.argsort(bc.row_labels_)
col_order = np.argsort(bc.column_labels_)
bc_data = data[row_order]
bc_data = bc_data[:, col_order]
## How to annotate the words?
figures = (
    (data, "Original dataset"),
    (bc_data, "After biclustering; rearrange to show biclusters"),
)
for matrix, heading in figures:
    plt.matshow(matrix, cmap = plt.cm.Blues)
    plt.title(heading)
Example #41
0
        # Fragment of a larger routine: `mat_name`, `mat_filename`, `k` and
        # `l` are set above the visible part. Loads a matrix, biclusters it
        # into k row clusters and l column clusters, and shows three figures.
        output_mat_name = sys.argv[5]
        tfidf = load_sparse_mat(mat_name,mat_filename).astype(float32)
        # presumably tfidf is a scipy sparse matrix and .A is its dense
        # ndarray form -- verify against load_sparse_mat
        data = tfidf.A

        # Figure 1: the matrix as loaded. Its value range is reused as the
        # colour limits of the later figures.
        im = plt.matshow(data, aspect='auto', cmap='jet')
        vmax = amax(data)
        vmin = amin(data)
        plt.clim(vmin,vmax)
        plt.colorbar(im)
        m,n = tfidf.shape
        print("Matrix dimensions: ",m,"x",n)
        print("Row clusters:",k)
        print("Column clusters:",l)
        # Wall-clock time of the fit only. The model is fitted on `tfidf`,
        # while the reordering below indexes the dense copy `data`.
        start = time.time()
        model = SpectralBiclustering(n_clusters=(k,l),random_state=0)
        model.fit(tfidf)
        end = time.time()
        print("Biclustering process takes",int(round(end-start)),"seconds")
        # Figure 2: rows, then columns, sorted by cluster label.
        fit_data = data[np.argsort(model.row_labels_)]
        fit_data = fit_data[:, np.argsort(model.column_labels_)]
        im = plt.matshow(fit_data, aspect='auto', cmap='jet')
        plt.clim(vmin,vmax)
        plt.colorbar(im)
        # Figure 3: outer product of the sorted 1-based labels.
        # NOTE(review): the colour limits applied after this call are still
        # the data's vmin/vmax although the image holds label products, so
        # it may render saturated -- confirm this is wanted.
        im = plt.matshow(np.outer(np.sort(model.row_labels_) + 1,
                     np.sort(model.column_labels_) + 1),
            cmap='jet',aspect='auto')
        plt.clim(vmin,vmax)
        plt.colorbar(im)
        plt.title("Checkerboard structure of rearranged data")
        plt.show()
#        save(output_mat_name,model.row_labels_.astype(float32))
Example #42
0
    usr_grp = lda2.transform(ad_mat4)

    res_ar = np.concatenate((res_ar, usr_grp), axis=1)


# Imports come first: cosine_similarity was previously called before the line
# that imported it, and itertools had no visible import at all.
# SpectralBiclustering is imported from sklearn.cluster because the
# sklearn.cluster.bicluster module path was deprecated in scikit-learn 0.22
# and removed in 0.24.
import itertools

from sklearn.cluster import SpectralBiclustering
from sklearn.metrics.pairwise import cosine_similarity

# Cosine similarity between the rows of res_ar.T. The earlier
# np.corrcoef(res_ar.T) result was overwritten on the very next line and
# never read, so that dead computation is dropped.
cor_mat = cosine_similarity(res_ar.T)
plt.matshow(cor_mat)
plt.show()

# One value drives both the model and the label loop below.
n_biclusters = 4
clust_mdl = SpectralBiclustering(n_clusters = n_biclusters)
clust1 = clust_mdl.fit(cor_mat)

# Collect indices cluster by cluster (label 0 first, original order kept
# inside each cluster), then flatten into one permutation.
col_lbls = clust1.column_labels_
col_ord = [list(np.where(col_lbls == i)[0]) for i in range(n_biclusters)]
col_ord2 = list(itertools.chain.from_iterable(col_ord))

# Apply the same permutation to rows and columns so members of a cluster sit
# next to each other.
res_mat2 = cor_mat[col_ord2,:][:,col_ord2]
plt.matshow(res_mat2)
plt.show()

# res_mat_nol_iter = res_mat2

# oh well
# at least framework there
# 
# max_iter 50 not enough for reliability