Ejemplo n.º 1
0
def main():
    origin = open('10k.txt', 'r')

    lines = origin.readlines()

    x = []
    label = []

    for l in lines:
        l = l.split(',')
        ip1 = l[2].split('.')
        ip2 = l[3].split('.')
        d = [datetime.fromtimestamp(int(l[1][0:11])).hour, int("%02x%02x%02x%02x"%(int(ip1[0]),int(ip1[1]),int(ip1[2]),int(ip1[3])),16), int("%02x%02x%02x%02x" % (int(ip2[0]),int(ip2[1]),int(ip2[2]),int(ip2[3])),16)] + l[4:6] + l[7:10]
        x.append(d)

    data = np.array(x, dtype='float32')

    model = SpectralCoclustering(n_clusters=5)
    model.fit(data)

    print model.rows_

    for i in range(5):
        print "Cluster" + str(i) + ':'
        for j in range(10000):
            if model.rows_[i][j]:
                print j,
        print ' '
Ejemplo n.º 2
0
    def find_disjoint_biclusters(self, biclusters_number=50):
        """Co-cluster the matrix and prune sparse columns and rows from each block.

        The matrix is partitioned into ``biclusters_number`` co-clusters.  For
        each co-cluster that has both rows and columns, columns whose density
        is below a quarter of the block density are removed first; rows are
        then pruned the same way against the surviving columns.  Whatever is
        left is wrapped in a ``Bicluster`` with its recomputed density.

        :param biclusters_number: number of co-clusters requested from
            ``SpectralCoclustering``.
        :return: set of ``Bicluster`` objects, one per surviving block.
        """
        matrix = np.asarray_chkfinite(self.matrix)
        # Exact zeros are replaced by a tiny value before fitting.
        # NOTE(review): if self.matrix is already an ndarray this may write
        # into it in place — confirm that is intended.
        matrix[matrix == 0] = 0.000001
        model = SpectralCoclustering(n_clusters=biclusters_number, random_state=0)
        model.fit(matrix)

        found = set()
        for index in range(biclusters_number):
            row_idx, col_idx = model.get_indices(index)
            kept_rows = set(row_idx)
            kept_cols = set(col_idx)
            if not kept_rows or not kept_cols:
                continue

            # Both pruning passes compare against the density of the
            # unpruned block.
            density = self._calculate_box_cluster_density(kept_rows, kept_cols)

            kept_cols -= {
                col for col in kept_cols
                if self._calculate_column_density(col, kept_rows) < density / 4
            }
            if not kept_cols:
                continue

            kept_rows -= {
                row for row in kept_rows
                if self._calculate_row_density(row, kept_cols) < density / 4
            }
            if not kept_rows:
                continue

            density = self._calculate_box_cluster_density(kept_rows, kept_cols)
            found.add(Bicluster(kept_rows, kept_cols, density))

        return found
Ejemplo n.º 3
0
 def run(self, data):
     """Fit a spectral co-clustering model on ``data`` and return its column labels.

     The model is configured from ``self.n_gene_classes`` and
     ``self.n_classes``; only the column (cell) labels are returned.
     """
     # NOTE(review): SpectralCoclustering documents n_clusters as a single
     # integer; a (row, column) tuple is the SpectralBiclustering form —
     # confirm which estimator was intended.
     cluster_shape = (self.n_gene_classes, self.n_classes)
     model = SpectralCoclustering(n_clusters=cluster_shape)
     model.fit(data)
     gene_clusters, cell_clusters = model.row_labels_, model.column_labels_
     return cell_clusters
Ejemplo n.º 4
0
def test_spectral_coclustering():
    """Check Dhillon's Spectral CoClustering on a simple problem."""
    seed = 0
    grid = ParameterGrid({
        'svd_method': ['randomized', 'arpack'],
        'n_svd_vecs': [None, 20],
        'mini_batch': [False, True],
        'init': ['k-means++'],
        'n_init': [10],
        'n_jobs': [1],
    })

    S, rows, cols = make_biclusters((30, 30), 3, noise=0.5, random_state=seed)
    # The data must be nonnegative before it is made sparse.
    S -= S.min()
    # Threshold some values to zero.
    S = np.where(S < 1, 0, S)

    # Every row and every column must belong to exactly one bicluster.
    one_membership_each = np.ones(30)

    # Run the whole grid against both the dense and the sparse representation.
    for mat in (S, csr_matrix(S)):
        for kwargs in grid:
            model = SpectralCoclustering(n_clusters=3, random_state=seed, **kwargs)
            model.fit(mat)

            assert_equal(model.rows_.shape, (3, 30))
            assert_array_equal(model.rows_.sum(axis=0), one_membership_each)
            assert_array_equal(model.columns_.sum(axis=0), one_membership_each)
            assert_equal(consensus_score(model.biclusters_, (rows, cols)), 1)

            _test_shape_indices(model)
Ejemplo n.º 5
0
def test_spectral_coclustering():
    """Test Dhillon's Spectral CoClustering on a simple problem."""
    random_state = 0
    param_grid = dict(svd_method=['randomized', 'arpack'],
                      n_svd_vecs=[None, 20],
                      mini_batch=[False, True],
                      init=['k-means++'],
                      n_init=[10],
                      n_jobs=[1])

    S, rows, cols = make_biclusters((30, 30), 3, noise=0.5,
                                    random_state=random_state)
    S -= S.min()  # must be nonnegative before it is made sparse
    S = np.where(S < 1, 0, S)  # zero out small values

    def fit_and_check(mat, kwargs):
        # Fit one parameter combination and verify the recovered partition.
        model = SpectralCoclustering(n_clusters=3,
                                     random_state=random_state,
                                     **kwargs)
        model.fit(mat)

        assert_equal(model.rows_.shape, (3, 30))
        assert_array_equal(model.rows_.sum(axis=0), np.ones(30))
        assert_array_equal(model.columns_.sum(axis=0), np.ones(30))
        assert_equal(consensus_score(model.biclusters_, (rows, cols)), 1)

    # Dense input first, then the same data as a CSR matrix.
    for mat in (S, csr_matrix(S)):
        for kwargs in ParameterGrid(param_grid):
            fit_and_check(mat, kwargs)
Ejemplo n.º 6
0
def update_bicluster(batch_df, task_df, compound_df, mode='RobustMT', K=5):
    """Assign batch labels and colours to ``batch_df`` and ``compound_df``.

    For the multi-task modes the leading task columns of ``batch_df`` are
    standardised and co-clustered into ``K`` clusters; for ``'ST'`` the first
    column is ranked and cut into ``K`` quantile groups.  The resulting labels
    are mapped to palette colours and propagated to ``compound_df`` through
    its ``label`` column.  ``batch_df`` and ``compound_df`` are modified in
    place.

    :param batch_df: frame with the predictive columns and a ``Label_id``
        column.
    :param task_df: frame whose column count determines the number of tasks.
    :param compound_df: frame with a ``label`` column.
    :param mode: one of ``'RobustMT'``, ``'ST'`` or ``'MT'``.
    :param K: number of clusters / quantile groups.
    :return: ``(batch_df, task_df, compound_df)``.
    :raises ValueError: if ``mode`` is not one of the supported values.
    """
    if mode == 'RobustMT':
        n_tasks = task_df.shape[1] - 1
    elif mode == 'ST':
        n_tasks = 1
    elif mode == 'MT':
        n_tasks = task_df.shape[1]
    else:
        # Previously an unknown mode fell through and failed later with a
        # NameError on n_tasks; fail early with a clear message instead.
        raise ValueError(
            "mode must be 'RobustMT', 'ST' or 'MT', got %r" % (mode,))

    if mode != 'ST':
        # Co-cluster the minibatch predictive matrix.  A plain ndarray is
        # used because np.matrix is deprecated and rejected by recent
        # scikit-learn releases.
        X = preprocessing.scale(np.asarray(batch_df)[:, 0:n_tasks])
        cocluster = SpectralCoclustering(n_clusters=K, random_state=0)
        cocluster.fit(X)
        batch_df['batch_label'] = cocluster.row_labels_
    else:
        rank_x = batch_df[batch_df.columns[0]].rank().tolist()
        groups = pd.qcut(rank_x, K, duplicates='drop')
        batch_df['batch_label'] = groups.codes

    # Colour hex for each batch label.
    lut = dict(zip(batch_df['batch_label'].unique(), Category20_20))
    batch_df['batch_label_color'] = batch_df['batch_label'].map(lut)

    # Propagate batch colour and batch label to the compounds via Label_id.
    lut2 = dict(zip(batch_df['Label_id'], batch_df['batch_label_color']))
    compound_df['batch_label_color'] = compound_df['label'].map(lut2)
    lut22 = dict(zip(batch_df['Label_id'], batch_df['batch_label']))
    compound_df['batch_label'] = compound_df['label'].map(lut22)

    # Colour the compound labels by quantile bin of the label value.
    groups = pd.qcut(compound_df['label'].tolist(),
                     len(Category20b_20),
                     duplicates='drop')
    compound_df['label_color'] = [Category20b_20[xx] for xx in groups.codes]

    return batch_df, task_df, compound_df
Ejemplo n.º 7
0
def correlation_matrix(df):
    """Plot the co-clustered Pearson correlation matrix of ``df``.

    The ``asset`` and ``unixtime`` columns are dropped, the remaining columns
    are correlated, and the correlation matrix is reordered by the row and
    column labels of a 4-cluster ``SpectralCoclustering`` fit.  A narrow strip
    showing the sorted row labels is drawn next to the heatmap and the figure
    is written to ``reports/correlation_matrix.png``.
    """
    clusters = 4
    sns.set(style='white', font_scale=.9)

    corr = df.drop(['asset', 'unixtime'], axis=1).corr(method='pearson')
    model = SpectralCoclustering(n_clusters=clusters, random_state=0)
    model.fit(corr)

    # Rows are shown in descending label order, columns in ascending order.
    row_order = np.argsort(model.row_labels_)[::-1]
    col_order = np.argsort(model.column_labels_)
    corr = corr.iloc[row_order, col_order]

    fig, (label_ax, corr_ax) = plt.subplots(
        1, 2,
        figsize=(10, 8),
        gridspec_kw={'width_ratios': [1.5, corr.shape[1]]},
    )

    # Left panel: one cell per row, coloured by its cluster label.
    label_strip = np.sort(model.row_labels_)[::-1].reshape(-1, 1)
    sns.heatmap(data=label_strip,
                ax=label_ax,
                cbar=False,
                linewidths=.005,
                cmap=sns.color_palette('Spectral'))
    label_ax.set(xticks=(), yticks=())

    # Right panel: the reordered correlation matrix.
    sns.heatmap(data=corr,
                cmap=sns.diverging_palette(220, 10, n=11),
                linewidths=.005,
                cbar_kws={'shrink': .75},
                vmax=1,
                vmin=-1,
                ax=corr_ax)
    corr_ax.set_xticklabels(corr.columns, rotation='vertical')
    corr_ax.set_yticklabels(corr.index, rotation='horizontal')

    fig.suptitle(f'Variable Correlation Matrix in {clusters} Clusters',
                 fontsize=20)
    fig.tight_layout(w_pad=.5, rect=(.03, 0, 1, .95))
    fig.savefig('reports/correlation_matrix.png')
Ejemplo n.º 8
0
def print_similarity_matrix(sphns, model, model2=None):
    print "      ",
    for phn1 in sphns:
        print phn1, " ",
    print ""
    m = np.ndarray((len(sphns), len(sphns)), dtype=np.float32)
    for i, phn1 in enumerate(sphns):
        print phn1.ljust(4) + ":",
        for j, phn2 in enumerate(sphns):
            sim = model.similarity(phn1, phn2)
            if model2 != None:
                sim -= model2.similarity(phn1, phn2)
            print "%0.2f" % sim,
            m[i][j] = sim
        print ""
    phn_order = [phn for phn in sphns]

    if BICLUSTER:
        #model = SpectralBiclustering(n_clusters=4, method='log',
        model = SpectralCoclustering(n_clusters=n_clusters,
                                             random_state=0)
        model.fit(m)
        print "INDICES:",
        indices = [model.get_indices(i) for i in xrange(n_clusters)]
        print indices
        tmp = []
        for i in xrange(n_clusters):
            tmp.extend([phn_order[indices[i][0][j]] for j in xrange(len(indices[i][0]))])
        phn_order = tmp
        fit_data = m[np.argsort(model.row_labels_)]
        fit_data = fit_data[:, np.argsort(model.column_labels_)]
        m = fit_data

    return phn_order, m
def bi_clustering(data, args):
    """Co-cluster ``data``, save the row labels and optionally plot the result.

    Zeros are replaced by the matrix maximum and the matrix is scaled by that
    maximum before fitting.  The row labels are written to ``args.o``; when
    ``args.plot`` is set, a shuffled view and the cluster-ordered view of the
    scaled matrix are shown.

    :param data: 2-D array; it is no longer modified in place.
    :param args: namespace providing ``k`` (cluster count), ``o`` (output
        path for the row labels) and ``plot`` (whether to show the figures).
    """
    print('clustering...')

    # Work on a copy: the zero replacement below used to overwrite the
    # caller's array.
    data = data.copy()

    max_val = np.max(data)
    data[data == 0] = max_val
    data = data / max_val

    model = SpectralCoclustering(n_clusters=args.k, svd_method='arpack')
    model.fit(data)

    np.savetxt(args.o, model.row_labels_, fmt="%d", newline="\n")

    if not args.plot:
        return

    # The reordered matrix is only needed for the plot, so build it after the
    # early return.
    fit_data = data[np.argsort(model.row_labels_)]
    fit_data = fit_data[:, np.argsort(model.column_labels_)]

    plt.matshow(shuffle(data), cmap=plt.cm.Blues)
    plt.title("Org dataset")

    plt.matshow(fit_data, cmap=plt.cm.Blues)
    plt.title("After biclustering; rearranged to show biclusters")

    plt.show()
Ejemplo n.º 10
0
def bicluster(*cotables):
    """Reorder every table by the co-clustering of the first one.

    A ``SpectralCoclustering`` model with one cluster per column of the first
    table is fitted on that table; the resulting row and column orderings are
    then applied positionally to every table passed in.

    :param cotables: one or more DataFrames of identical shape.
    :return: list with the reordered tables, in input order.
    """
    table = cotables[0]
    model = SpectralCoclustering(n_clusters=table.shape[1], random_state=0)
    # DataFrame.as_matrix() was removed in pandas 1.0; .values exists in
    # every pandas release.
    model.fit(table.values)

    # Compute each permutation once instead of once per table.
    row_order = np.argsort(model.row_labels_)
    col_order = np.argsort(model.column_labels_)
    return [cotable.iloc[row_order, col_order] for cotable in cotables]
Ejemplo n.º 11
0
def spectral_co_cluster(data, n_clusters, para_jobs=1, random_state=None):
    """Co-cluster ``data`` and return its row and column labels.

    :param data: 2-D array-like or sparse matrix to cluster.
    :param n_clusters: number of co-clusters.
    :param para_jobs: forwarded as ``n_jobs`` on scikit-learn releases that
        still accept it; ignored otherwise.
    :param random_state: seed forwarded to ``SpectralCoclustering``.
    :return: ``(row_labels, col_labels)``.
    """
    # The private sklearn.cluster.bicluster module was removed from
    # scikit-learn; the public path works on old and new releases alike.
    from sklearn.cluster import SpectralCoclustering
    try:
        model = SpectralCoclustering(n_clusters=n_clusters,
                                     random_state=random_state,
                                     n_jobs=para_jobs)
    except TypeError:
        # Newer releases dropped the n_jobs argument.
        model = SpectralCoclustering(n_clusters=n_clusters,
                                     random_state=random_state)
    model.fit(data)
    return model.row_labels_, model.column_labels_
Ejemplo n.º 12
0
def biclustering(input,num_clusters):
	"""Co-cluster ``input`` and map each cluster to its row indices.

	The result is also stored in the module-level ``agent1_dict``.

	:param input: 2-D array-like (a 1-D sequence is treated as one row).
	:param num_clusters: number of co-clusters.
	:return: dict ``{cluster index: list of row indices}``.
	"""
	global agent1_dict
	# np.matrix is deprecated and rejected by recent scikit-learn releases;
	# atleast_2d keeps its "always two-dimensional" behaviour.
	data = np.atleast_2d(np.asarray(input))
	model = SpectralCoclustering(n_clusters=num_clusters,random_state=0)
	model.fit(data)
	# get_indices returns (row indices, column indices); only rows are kept.
	agent1_dict = {c: model.get_indices(c)[0].tolist() for c in range(num_clusters)}
	return agent1_dict
Ejemplo n.º 13
0
def get_clusters(data, n_clusters=5):
    """Co-cluster ``data`` and return the row and column indices per cluster.

    :param data: 2-D array-like to cluster.
    :param n_clusters: number of co-clusters; defaults to 5, the value that
        was previously hard-coded.
    :return: ``(word_clusters, hidden_clusters)`` — two lists with one entry
        per cluster, holding the row indices and the column indices of that
        cluster respectively.
    """
    coclusters = SpectralCoclustering(n_clusters=n_clusters, random_state=0)
    coclusters.fit(data)
    word_clusters = []
    hidden_clusters = []
    for i in range(n_clusters):
        # One get_indices call yields both the row and the column indices.
        wc, hc = coclusters.get_indices(i)
        word_clusters.append(wc.tolist())
        hidden_clusters.append(hc.tolist())
    return word_clusters, hidden_clusters
Ejemplo n.º 14
0
def biclustering(data,num_clusters):
	"""Co-cluster ``data`` and map each cluster to its row indices.

	:param data: 2-D array-like (a 1-D sequence is treated as one row).
	:param num_clusters: number of co-clusters.
	:return: dict ``{cluster index: list of row indices}``.
	"""
	# np.asmatrix yields the deprecated np.matrix type, which recent
	# scikit-learn releases reject; use a plain 2-D ndarray instead.
	data = np.atleast_2d(np.asarray(data))
	model = SpectralCoclustering(n_clusters=num_clusters,random_state=0)
	model.fit(data)
	# get_indices returns (row indices, column indices); only rows are kept.
	return {c: model.get_indices(c)[0].tolist() for c in range(num_clusters)}
Ejemplo n.º 15
0
def main():
    """Co-cluster ``kddcup.txt`` records and evaluate each cluster's purity.

    Fields 0, 4-18 and 21 up to (not including) the last one form the feature
    vector of a record; the last field is its label.  The records are fitted
    with a 5-cluster ``SpectralCoclustering`` model.  For every row cluster
    the share of records labelled ``normal.`` is computed and printed, and
    feature columns 27 and 30 of the member records are scattered: red for
    clusters dominated by normal records, green otherwise.
    """
    x = []
    label = []

    # A context manager closes the input file even when parsing fails.
    with open('kddcup.txt', 'r') as origin:
        for l in origin:
            l = l.split(',')
            x.append(l[0:1] + l[4:19] + l[21:-1])
            # Strip the line terminator so a final line without a newline
            # (or CRLF input) is still recognised.
            label.append(l[-1].strip())

    data = np.array(x, dtype='float32')

    model = SpectralCoclustering(n_clusters=5)
    model.fit(data)

    evaluation = []

    draw_n_x = []
    draw_n_y = []
    draw_a_x = []
    draw_a_y = []

    for cluster in model.rows_:
        normal = 0.0
        attack = 0.0
        graph_x = []
        graph_y = []
        for idx, is_member in enumerate(cluster):
            if not is_member:
                continue
            if label[idx] == 'normal.':
                normal += 1
            else:
                attack += 1
            # Plot the features of THIS record; the previous data[27] /
            # data[30] appended whole rows 27 and 30 for every member.
            graph_x.append(data[idx][27])
            graph_y.append(data[idx][30])

        total = normal + attack
        # An empty cluster has no defined purity; avoid dividing by zero.
        evaluation.append(normal / total if total else float('nan'))

        if normal > attack:
            draw_n_x += graph_x
            draw_n_y += graph_y
        else:
            draw_a_x += graph_x
            draw_a_y += graph_y

    pl.plot(draw_n_x, draw_n_y, 'ro')
    pl.plot(draw_a_x, draw_a_y, 'go')

    print(evaluation)
    pl.show()
Ejemplo n.º 16
0
def biclustering(input,num_clusters):
	"""Co-cluster ``input``, plot the reordered matrix and return the row clusters.

	The result is also stored in the module-level ``agent1_dict``.

	:param input: 2-D array-like (a 1-D sequence is treated as one row).
	:param num_clusters: number of co-clusters.
	:return: dict ``{cluster index: list of row indices}``.
	"""
	global agent1_dict
	# np.matrix is deprecated and rejected by recent scikit-learn releases;
	# atleast_2d keeps its "always two-dimensional" behaviour.
	data = np.atleast_2d(np.asarray(input))
	model = SpectralCoclustering(n_clusters=num_clusters,random_state=0)
	model.fit(data)
	# get_indices returns (row indices, column indices); only rows are kept.
	agent1_dict = {c: model.get_indices(c)[0].tolist() for c in range(num_clusters)}
	# Rearrange rows and columns so that each bicluster is contiguous.
	row_order = np.argsort(model.row_labels_)
	col_order = np.argsort(model.column_labels_)
	plot(data[row_order][:, col_order])
	return agent1_dict
Ejemplo n.º 17
0
def cluster_data(flavors, whisky):
    """Group whiskies by flavour correlation and plot the sorted correlations.

    The correlation between the rows of ``flavors`` is co-clustered into 6
    groups.  The group of each row is stored in a new ``Group`` column of
    ``whisky`` (the caller's frame is modified), the rows are sorted by
    group, and the correlation of columns 2-13 of the sorted frame is passed
    to ``plot_correlations``.

    :param flavors: DataFrame with one row per whisky.
    :param whisky: DataFrame with one row per whisky, aligned with
        ``flavors``.
    """
    corr_whisky = pd.DataFrame.corr(flavors.transpose())
    model = SpectralCoclustering(n_clusters=6, random_state=0)
    model.fit(corr_whisky)
    whisky['Group'] = pd.Series(model.row_labels_, index=whisky.index)
    # argsort yields positions, so index positionally; the .ix indexer used
    # before was removed in pandas 1.0.
    whisky = whisky.iloc[np.argsort(model.row_labels_)]
    whisky = whisky.reset_index(drop=True)
    correlation = pd.DataFrame.corr(whisky.iloc[:, 2:14].transpose())
    correlation = np.array(correlation)
    plot_correlations(correlation)
Ejemplo n.º 18
0
def plot_coclusters_raw_data(time_ms, t=False):
    # take the transpose of sliced matrix
    if t:
        channels_data = slice_matrix(matrix, time_ms)
    else:
        channels_data = slice_matrix(matrix, time_ms)
    print len(channels_data), len(channels_data[1])
    z_score = stats.zscore(channels_data)
    plt.title('Z Score Biclustering Over %i ms' % time_ms)
    spectral_model = SpectralCoclustering()
    spectral_model.fit(z_score)
    fit_data = z_score[np.argsort(spectral_model.row_labels_)]
    fit_data = fit_data[:, np.argsort(spectral_model.column_labels_)]
    plt.matshow(fit_data, cmap=plt.cm.Blues)
    plt.savefig('z_score_raw_coclustering_all_ts_%i_T_%s.svg' % (time_ms, str(t)))
Ejemplo n.º 19
0
 def cocluster(self, mx, blockdiag=False):
     """Reorder ``mx`` so that rows and columns of the same cluster are adjacent.

     With ``blockdiag`` a ``SpectralCoclustering`` model is used, otherwise a
     4x3 ``SpectralBiclustering`` (checkerboard) model.  ``self.prev`` is
     permuted like the rows and ``self.case`` like the columns.

     Returns the reordered matrix.
     """
     logging.info('Co-clustering Tade..')
     logging.info('blockdiag' if blockdiag else 'checkerboard')
     if blockdiag:
         clusser = SpectralCoclustering(n_jobs=-1)
     else:
         clusser = SpectralBiclustering(n_jobs=-1, n_clusters=(4, 3))
     clusser.fit(mx)

     logging.info('Argsorting mx rows..')
     row_order = np.argsort(clusser.row_labels_)
     mx = mx[row_order]
     self.prev = self.prev[row_order]

     logging.info('Argsorting mx cases..')
     col_order = np.argsort(clusser.column_labels_)
     mx = mx[:, col_order]
     self.case = self.case[col_order]
     return mx
Ejemplo n.º 20
0
def main():
    """Co-cluster every CSV table in ``DATA_DIR`` and save one PDF per file.

    Each table is split into two co-clusters, its rows and columns are
    rearranged so that the clusters are contiguous, and the result is saved
    in the current directory as ``<file name without extension>.pdf``.
    """
    names = [name for name in os.listdir(DATA_DIR)
             if fnmatch.fnmatch(name, '*.csv')]

    for name in names:
        # os.path.join works whether or not DATA_DIR ends with a separator.
        path = os.path.join(DATA_DIR, name)
        print('processing', path, '...')
        table = get_data(path)
        cl = SpectralCoclustering(n_clusters=2, random_state=0)
        cl.fit(table)

        # Reordering as in the scikit-learn spectral co-clustering example.
        fit_data = table[np.argsort(cl.row_labels_)]
        fit_data = fit_data[:, np.argsort(cl.column_labels_)]

        plt.matshow(fit_data, cmap=plt.cm.Reds)
        plt.title(name)
        plt.savefig(os.path.splitext(name)[0] + '.pdf')
        # matshow opens a new figure per file; close it so figures do not
        # pile up in memory across the loop.
        plt.close()
Ejemplo n.º 21
0
def main(model):
    """Render the thresholded transition matrix stored in an HDF model file.

    ``Psi_sz`` and ``count_z`` are read from the store, normalised and
    combined into a transition matrix.  Entries at or above ``1 / size`` are
    set to 1, all others to 0.  The binary matrix is written to ``tmat.pdf``
    and a co-clustered (3 clusters) reordering of it to ``tmat-cluster.pdf``.

    :param model: path of the HDF5 store to read.
    """
    # The context manager closes the store even if a key is missing; only the
    # entries that are actually used are read.
    with pd.HDFStore(model) as store:
        from_ = store['from_'][0][0]
        Psi_oz = store['Psi_sz'].values
        count_z = store['count_z'].values[:, 0]
    assert from_ == 0

    Psi_oz = Psi_oz / Psi_oz.sum(axis=0)
    Psi_zo = (Psi_oz * count_z).T
    Psi_zo = Psi_zo / Psi_zo.sum(axis=0)

    ZtZ = Psi_zo.dot(Psi_oz)
    ZtZ = ZtZ / ZtZ.sum(axis=0)
    # Binarise into a new array instead of overwriting ZtZ through an alias.
    L = np.where(ZtZ >= 1.0 / (len(ZtZ)), 1.0, 0.0)

    colormap = toyplot.color.brewer.map("Purples", domain_min=0, domain_max=1, reverse=True)
    print(colormap)
    canvas = toyplot.matrix((L.T, colormap), label="P[z' | z]", \
            colorshow=False, tlabel="To z'", llabel="From")[0]
    toyplot.pdf.render(canvas, 'tmat.pdf')

    # A distinct name keeps the path argument from being shadowed.
    coclustering = SpectralCoclustering(n_clusters=3)
    coclustering.fit(L)
    fit_data = L[np.argsort(coclustering.row_labels_)]
    fit_data = fit_data[:, np.argsort(coclustering.column_labels_)]
    canvas = toyplot.matrix((fit_data, colormap), label="P[z' | z']", \
            colorshow=False)[0]
    toyplot.pdf.render(canvas, 'tmat-cluster.pdf')
Ejemplo n.º 22
0
 def cocluster(self, mx, blockdiag=False):
     """Cluster ``mx`` and return it with rows and columns grouped by cluster.

     ``blockdiag`` selects ``SpectralCoclustering``; the default is a
     checkerboard ``SpectralBiclustering`` with a (4, 3) cluster grid.  As a
     side effect ``self.prev`` follows the row permutation and ``self.case``
     the column permutation.
     """
     logging.info('Co-clustering Tade..')
     if not blockdiag:
         # Checkerboard structure.
         logging.info('checkerboard')
         clusser = SpectralBiclustering(n_jobs=-1, n_clusters=(4, 3))
     else:
         # Block-diagonal structure.
         logging.info('blockdiag')
         clusser = SpectralCoclustering(n_jobs=-1)
     clusser.fit(mx)

     logging.info('Argsorting mx rows..')
     by_row_label = np.argsort(clusser.row_labels_)
     mx, self.prev = mx[by_row_label], self.prev[by_row_label]

     logging.info('Argsorting mx cases..')
     by_col_label = np.argsort(clusser.column_labels_)
     mx, self.case = mx[:, by_col_label], self.case[by_col_label]
     return mx
Ejemplo n.º 23
0
def biclustering(db):
    #mydata = genfromtxt('/home/fan/intern/process_db/analysis/viewtime_matrix_524.csv',dtype=None,delimiter=',',names=True,skip_header=1)
    df = pd.read_csv(
        '/home/fan/intern/process_db/analysis/viewtime_matrix_501_0.1.csv')
    dma = 501
    #print df.head()
    print df.shape
    dev_list = df.ix[:, 0].values
    prog_list = df.columns.values
    #print type(dev_list)
    #print type(prog_list)
    df.drop(df.columns[0], axis=1, inplace=True)
    #df[df==0] = 1
    df = df.apply(fraction, axis=1)
    #print df.head()
    #print df.values
    #print type(df.values)
    #mydata = df.values
    #mydata=np.delete(mydata, 0, axis=0)
    #mydata=np.delete(mydata, 0, axis=1)
    #mydata[mydata==0] = 0.01
    #print 'data format is:',mydata,type(mydata)
    # model=SpectralCoclustering(n_clusters=5, random_state=0)
    #n_clusters=(1000,20) # 4*3 = 12 clusters

    #model = SpectralBiclustering(random_state=None)
    model = SpectralCoclustering(n_clusters=10)
    model.fit(df)
    #fit_data=mydata[np.argsort(model.row_labels_)]
    #fit_data=fit_data[:,np.argsort(model.column_labels_)]
    #plt.matshow(fit_data[0:40],cmap=plt.cm.Blues)
    # plt.show()
    print model.get_params()
    for i in range(0, 5):
        print 'Size of one cluster:', model.get_shape(i)
        indices = model.get_indices(i)
        #print indices[1]
        print prog_list[indices[1]]
        print model.get_submatrix(i, df.values)
        dev_in_cluster = dev_list[indices[0]]
        #print type(dev_in_cluster)
        print 'number of devices within this cluster:', len(dev_in_cluster)
        get_income(db, dma, dev_in_cluster.tolist())
Ejemplo n.º 24
0
def biclustering(input_list,num_clusters):
	"""Co-cluster ``input_list``, plot the reordered matrix and return the row clusters.

	The result is also stored in the module-level ``agent1_dict``.

	:param input_list: 2-D array-like (a 1-D sequence is treated as one row).
	:param num_clusters: number of co-clusters.
	:return: dict ``{cluster index: list of row indices}``.
	"""
	global agent1_dict
	# np.matrix is deprecated and rejected by recent scikit-learn releases;
	# atleast_2d keeps its "always two-dimensional" behaviour.
	data = np.atleast_2d(np.asarray(input_list))

	# Co-clustering as in the scikit-learn plot_spectral_coclustering example.
	model = SpectralCoclustering(n_clusters=num_clusters,random_state=0)
	model.fit(data)

	# get_indices returns (row indices, column indices); only rows are kept.
	agent1_dict = {c: model.get_indices(c)[0].tolist() for c in range(num_clusters)}

	# Rearrange rows and columns so that each bicluster is contiguous.
	row_order = np.argsort(model.row_labels_)
	col_order = np.argsort(model.column_labels_)
	plot(data[row_order][:, col_order])
	return agent1_dict
Ejemplo n.º 25
0
    def spectral_coclustering(cls, *args, **kwargs):
        """
        Wrapper method for the spectral_coclustering algorithm

        :param args: positional arguments forwarded to the scikit-learn
            ``SpectralCoclustering`` constructor
        :param kwargs: keyword arguments forwarded to the same constructor;
            most of its options can only be passed by keyword
        :return: an instance of ``cls`` built from the configured model
        """

        model = SpectralCoclustering(*args, **kwargs)
        return cls(model)
Ejemplo n.º 26
0
def Spectral_CoClustering(args):
    '''Perform bipartite (co-)clustering of ``args.M`` and save the result.

    ``args`` provides ``nClusters`` (cluster count), ``arpack`` (use the
    ARPACK SVD solver when true) and ``M`` (a scipy sparse matrix).  The
    entries of ``M`` are relabelled so that rows and columns of the same
    cluster are adjacent, and ``save_clusters`` is called with the result.

    Returns ``(model, fit_data)`` where ``fit_data`` is the reordered matrix
    in COO format.
    '''
    params = {'n_clusters': args.nClusters}
    if args.arpack:
        params['svd_method'] = 'arpack'
    model = SpectralCoclustering(**params)

    print('Running coclustering')
    try:
        model.fit(args.M.tocsc())
    except Exception:
        # The hint used to sit in a bare ``except`` around the constructor,
        # which then fell through to a NameError on ``model``.  Show it where
        # a solver failure can actually occur and keep the original error.
        print('-r 1 may cause problems when svd_method has been set to arpack')
        raise
    print('Coclustering done')

    # Relabel the COO coordinates rather than fancy-indexing the sparse
    # matrix: every old row/column index is mapped to its new position.
    fit_data = args.M.tocoo()
    fit_data.row = invert_permutation(np.argsort(model.row_labels_))[fit_data.row]
    fit_data.col = invert_permutation(np.argsort(model.column_labels_))[fit_data.col]

    save_clusters(model, fit_data, args, '_CoClustering')

    return model, fit_data
Ejemplo n.º 27
0
def plot_biclusters():
    """Plot the bicluster map of the best configuration in ``bic_scores.csv``.

    The feature table is standardised, the configuration with the lowest
    ``tvr`` score is looked up in the parameter grid and handed to
    ``_plot_bicmaps``, which fits the model and draws the map.
    """
    co_grid = ParameterGrid(
        {'n_clusters': np.arange(2, 10, 1), 'n_init': [20]}
    )

    X_orig = pd.read_csv('./../../data_source/to_analysis/original_images/all_features_original_images.csv', index_col=0)
    X_orig_std = StandardScaler().fit_transform(X_orig.values)

    df_avg_co_scores = pd.read_csv('bic_scores.csv', index_col=0)
    tvr = df_avg_co_scores.loc[:, 'tvr'].values
    # The score rows are written in grid order starting at 0, so the argmin
    # position IS the grid index.  The previous ``argmin - 1`` selected the
    # neighbouring configuration (or the last one when the best was first).
    best_co_config = co_grid[int(np.argmin(tvr))]
    print(best_co_config, min(tvr))

    # _plot_bicmaps fits the model itself; the extra fit that used to happen
    # here was never used.
    plt.figure()
    _plot_bicmaps(X_orig_std, best_co_config)
def cluster_ex_by_feature_matrix(sub_ex_by_feat_mat, plot_file):
    """Co-cluster an example-by-feature table and save a heatmap to ``plot_file``.

    NOTE(review): apart from the size warning and the first subsetting step,
    this function works on ``sub_ex_by_feat_mat_1`` and ``sub_ex_by_feat_df2``,
    which are not parameters and are not defined in this block — presumably
    module-level leftovers; confirm before relying on it.  Every ``plot_df``
    computed below is overwritten by ``plot_df = sub_ex_by_feat_df2`` before
    the model is fitted.

    Returns the literal string ``"pretty picture"``.
    """
    # Only warns; execution continues for large matrices.
    if sub_ex_by_feat_mat.shape[0] > 50000:
        print "Matrix too large to be efficient, pleased reduce number of examples"

    # Subset down to motifs that are used
    # (keeps the columns whose maximum is non-zero).
    plot_df = sub_ex_by_feat_mat[:,
                                 np.apply_along_axis(
                                     np.max, 0, sub_ex_by_feat_mat.toarray()
                                 ) != 0]
    # for numpy array
    # Rows with more than 10 non-zero entries, then columns with more than 50
    # non-zero entries (column counts are taken over the unfiltered matrix).
    plot_df = sub_ex_by_feat_mat_1[np.apply_along_axis(
        lambda row:
        (row != 0).sum(), 1, sub_ex_by_feat_mat_1.toarray()) > 10, :]
    plot_df = plot_df[:,
                      np.apply_along_axis(lambda column:
                                          (column != 0).sum(), 0,
                                          sub_ex_by_feat_mat_1.toarray()) > 50]
    # for pandas
    # Same row/column filter expressed on a DataFrame.
    plot_df = sub_ex_by_feat_df2.ix[
        sub_ex_by_feat_df2.apply(lambda row: (row != 0).sum(), 1) > 10, :]
    plot_df = plot_df.ix[:,
                         plot_df.apply(lambda row: (row != 0).sum(), 0) > 50]
    # Discards all of the filtering above and uses the full frame.
    plot_df = sub_ex_by_feat_df2

    # The result of this call is not stored or used.
    np.apply_along_axis(lambda column: (column != 0).sum(), 0,
                        sub_ex_by_feat_mat_1.toarray())

    model = SpectralCoclustering(n_clusters=50)
    model.fit(plot_df)  # fits for 50K
    # Reorder rows, then columns, by their cluster label.
    fit_data = plot_df.ix[np.argsort(model.row_labels_)]
    fit_data = fit_data.ix[:, np.argsort(model.column_labels_)]

    plt.figure()
    # Only the slice 0:500 of the reordered rows is drawn.
    plt.matshow(fit_data.ix[0:500, ], cmap=plt.cm.YlGnBu, aspect='auto')
    plt.savefig(plot_file)

    print "DONE: biclustering plot here: {0}".format(plot_file)

    return "pretty picture"
Ejemplo n.º 29
0
def _plot_bicmaps(X_orig_std, best_co_config):
    """Fit the chosen co-clustering configuration and save its bicluster map.

    The standardised matrix is reordered by row and column label, drawn as a
    heatmap, overlaid with one rectangle per bicluster (coordinates from
    ``bic_coords``) and saved to
    ``../biclustering/bic_map_original_images.pdf`` with a colour bar on the
    right.

    :param X_orig_std: 2-D array to cluster.
    :param best_co_config: parameter dict applied to the model; its
        ``n_clusters`` entry is also passed to ``bic_coords``.
    """
    # Train the model with the selected configuration.
    model = SpectralCoclustering(random_state=0, svd_method='arpack')
    model.set_params(**best_co_config)
    model.fit(X_orig_std)

    row_order = np.argsort(model.row_labels_)
    col_order = np.argsort(model.column_labels_)
    sorted_data = X_orig_std[row_order, :][:, col_order]

    heatmap_options = dict(
        robust=True,
        cmap=plt.cm.viridis,
        fmt='f',
        vmin=np.min(sorted_data),
        vmax=np.max(sorted_data),
        cbar=False,
    )
    hmap = sns.heatmap(sorted_data, **heatmap_options)

    # Outline every bicluster as a closed rectangle.
    coords = bic_coords(model, best_co_config['n_clusters'])
    x_corners = ['x1', 'x2', 'x2', 'x1', 'x1']
    y_corners = ['y1', 'y1', 'y2', 'y2', 'y1']
    for num in coords.index:
        plt.plot(coords.loc[num, x_corners], coords.loc[num, y_corners],
                 c='darkred')

    plt.ylabel('Patients')
    plt.xlabel('Features')
    plt.yticks([], [])
    plt.xticks([], [])

    # Dedicated axes for the colour bar, to the right of the heatmap.
    cax = make_axes_locatable(hmap).append_axes('right', size='3%', pad='2%')
    colorbar.colorbar(hmap.get_children()[0], cax=cax, orientation='vertical')

    # Six evenly spaced ticks over the NaN-ignoring data range.
    cbar_ticks = np.linspace(np.nanmin(sorted_data), np.nanmax(sorted_data), 6)
    cax.yaxis.set_ticks(cbar_ticks)
    cax.yaxis.set_ticklabels([f'{num:.01f}' for num in cbar_ticks])

    plt.savefig(
        '../biclustering/bic_map_original_images.pdf',
        bbox_inches='tight',
        transparent=True,
        dpi=CONFIG.DPI,
    )
Ejemplo n.º 30
0
def _get_clusters_using_spectrals(corrarr, n_clusters=5, mode='co'):
    if mode=='co':
        model = SpectralCoclustering(n_clusters, random_state=0)
        model.fit(corrarr)
        indices = np.arange(corrarr.columns.size)
        clusters = [indices[x].tolist() for x in model.columns_]
        return clusters
    elif mode=='bi':
        model = SpectralBiclustering(n_clusters, random_state=0)
        model.fit(corrarr)
        indices = np.arange(corrarr.columns.size)
        clusters = [indices[x].tolist() for x in model.columns_]
        repetition_start = clusters[1:].index(clusters[0]) + 1
        return clusters[:repetition_start]
    else:
        raise("Mode wrong?")
Ejemplo n.º 31
0
def _run_experiment(co_grid, X_orig_std):
    """Score every grid configuration over 40 seeds and write ``bic_scores.csv``.

    For each configuration a fresh ARPACK ``SpectralCoclustering`` model is
    evaluated through ``biclusters`` once per seed; the per-seed metric
    values are NaN-averaged, then averaged again into a single ``tvr`` value
    per configuration.  The CSV is indexed by ``ConfigID`` in grid order.
    """
    np.random.seed(seed=0)
    seeds = np.random.choice(40, size=40)

    scores_by_config = {}
    for config_id, config in enumerate(co_grid):
        per_seed = []
        for seed in seeds:
            estimator = SpectralCoclustering(random_state=seed, svd_method='arpack')
            # NOTE: Outputs a TVE score.
            result = biclusters(estimator, X_orig_std, config)
            per_seed.append(result.external_metrics.values)
        scores_by_config[config_id] = np.nanmean(per_seed, axis=0)

    summary = pd.DataFrame(
        [np.mean(scores, axis=0) for scores in scores_by_config.values()],
        columns=['tvr'],
    )
    summary.index.name = 'ConfigID'
    summary.to_csv('bic_scores.csv')
def Block_diagonal(input_path,top_sd,n_clusters,output_path):
    """Co-cluster the most variable rows of a table and export the result.

    The tab-separated table at ``input_path`` is read (first column as index,
    ``#`` lines skipped, missing values set to 0).  The ``top_sd`` fraction of
    rows with the largest standard deviation is co-clustered into
    ``n_clusters`` clusters.  Three files are written to ``output_path``:
    ``fit_data.csv`` (the reordered table), ``bicluster.png`` (its image) and
    ``output.xlsx`` (row modules on ``Sheet1``, column modules on ``Sheet2``).
    The module sizes are printed.

    :param input_path: path of the input table.
    :param top_sd: fraction (0-1) of rows to keep, ranked by standard
        deviation.
    :param n_clusters: number of co-clusters.
    :param output_path: directory receiving the output files.
    """
    input_dat = pd.read_csv(input_path, index_col=0, sep='\t', comment='#')
    pro_dat = input_dat.fillna(0)

    # Keep the top fraction of rows by standard deviation.  The removed .ix
    # indexer is replaced by a positional slice (.iloc) for the cut and a
    # label lookup (.loc) for the selection.
    df_sd = pro_dat.apply(np.std, axis=1)
    df_sd_sort = df_sd.sort_values(ascending=False)
    df_sd_sort_top = df_sd_sort.iloc[:int(len(df_sd_sort) * top_sd)]
    pro_dat = pro_dat.loc[df_sd_sort_top.index, :]

    sd_index = pro_dat.index
    sd_sample_names = pro_dat.columns

    ### model
    model = SpectralCoclustering(n_clusters=n_clusters, random_state=0)
    model.fit(pro_dat)

    # Reorder rows and columns so that each bicluster is contiguous, keeping
    # the labels aligned with the data.
    row_order = np.argsort(model.row_labels_)
    col_order = np.argsort(model.column_labels_)
    fit_data = pd.DataFrame(np.array(pro_dat)[row_order][:, col_order])
    fit_data.index = sd_index[row_order]
    fit_data.columns = sd_sample_names[col_order]

    ### output the model fitting data
    out_fit_data_path = os.path.join(output_path, 'fit_data.csv')
    fit_data.to_csv(out_fit_data_path)

    ### output image
    fig = plt.figure(figsize=(20,40))
    ax = fig.add_subplot(111)
    ax.matshow(fit_data, cmap=plt.cm.Blues)
    ax.set_title("After biclustering; rearranged to show biclusters")
    out_img_path = os.path.join(output_path, 'bicluster.png')
    fig.savefig(out_img_path)
    # Release the (large) figure once it has been written.
    plt.close(fig)

    ### output module
    a11 = pd.Series(model.row_labels_)
    b11 = pd.Series(model.column_labels_)
    c11 = a11.groupby(a11).size()
    c22 = b11.groupby(b11).size()
    # Sorted labels line up with the label-sorted index/columns of fit_data.
    d11 = pd.DataFrame(a11.sort_values().values, fit_data.index.values)
    d22 = pd.DataFrame(b11.sort_values().values, fit_data.columns.values)
    d11.columns = ['cpg_module']
    d22.columns = ['sample_module']

    out_module_path = os.path.join(output_path, 'output.xlsx')
    # ExcelWriter.save() was removed from pandas; the context manager writes
    # and closes the workbook on every supported version.
    with pd.ExcelWriter(out_module_path) as writer:
        d11.to_excel(writer, sheet_name='Sheet1')
        d22.to_excel(writer, sheet_name='Sheet2')

    print("\n")
    print("cpg module:")
    print(c11)
    print("\n")
    print("sample module:")
    print(c22)
Ejemplo n.º 33
0
# Position of the first occurrence of every id.  Built once so that filling
# the matrix is a dictionary lookup per rating instead of a linear
# list.index() scan per rating.
_user_pos = {}
for _pos, _uid in enumerate(user_ids):
    _user_pos.setdefault(_uid, _pos)
_profile_pos = {}
for _pos, _pid in enumerate(profile_ids):
    _profile_pos.setdefault(_pid, _pos)

# Each rating row is (user id, profile id, rating).
for row in ratings:
    user_id = _user_pos[row[0]]
    profile_id = _profile_pos[row[1]]
    user_profile_matrix[user_id,profile_id] = row[2]

#initialize and carry out clustering
K=50


scc = SpectralCoclustering(n_clusters = K,svd_method='arpack')
scc.fit(user_profile_matrix)

#labels
row_labels = scc.row_labels_
column_labels = scc.column_labels_

#containers for the number and the list of users and profiles per bicluster
bicluster_num_users=np.zeros(K)
bicluster_num_profiles=np.zeros(K)

bicluster_list_users=[]

bicluster_list_profiles=[]

for i in range(K):
    bicluster_list_users.append([])
# exclude 'comp.os.ms-windows.misc'
categories = [
    'alt.atheism',
    'comp.graphics',
    'comp.sys.ibm.pc.hardware',
    'comp.sys.mac.hardware',
    'comp.windows.x',
    'misc.forsale',
    'rec.autos',
    'rec.motorcycles',
    'rec.sport.baseball',
    'rec.sport.hockey',
    'sci.crypt',
    'sci.electronics',
    'sci.med',
    'sci.space',
    'soc.religion.christian',
    'talk.politics.guns',
    'talk.politics.mideast',
    'talk.politics.misc',
    'talk.religion.misc',
]
newsgroups = fetch_20newsgroups(categories=categories)
y_true = newsgroups.target

# Both estimators are asked for one cluster per retained category.
n_groups = len(categories)
vectorizer = TfidfVectorizer(
    stop_words='english',
    min_df=5,
    tokenizer=number_aware_tokenizer,
)
cocluster = SpectralCoclustering(
    n_clusters=n_groups,
    svd_method='arpack',
    random_state=0,
)
kmeans = MiniBatchKMeans(
    n_clusters=n_groups,
    batch_size=20000,
    random_state=0,
)

print("Vectorizing...")
X = vectorizer.fit_transform(newsgroups.data)

print("Coclustering...")
start_time = time()
cocluster.fit(X)
y_cocluster = cocluster.row_labels_
# Elapsed time is taken before the score is computed, as in the single-call
# form, so the reported duration covers the fit only.
elapsed = time() - start_time
v_measure = v_measure_score(y_cocluster, y_true)
print("Done in {:.2f}s. V-measure: {:.4f}".format(elapsed, v_measure))

print("MiniBatchKMeans...")
Ejemplo n.º 35
0
import pandas as pd
from bokeh.models import HoverTool, ColumnDataSource
from bokeh.plotting import figure, output_file, show
from sklearn.cluster.bicluster import SpectralCoclustering

data = pd.read_csv('docs/whiskies.txt')
data['Region'] = pd.read_csv('docs/regions.txt')

# Row-to-row correlation: columns 2..13 are transposed so that .corr()
# correlates the rows of `data` with each other.
correlations = np.array(data.iloc[:, 2:14].transpose().corr())
plt.figure(figsize=(10, 4))
plt.subplot(121)
plt.title("Original")
plt.pcolor(correlations, cmap='inferno')
plt.colorbar()

model = SpectralCoclustering(n_clusters=6, random_state=0)
model.fit(correlations)
data['Group'] = model.row_labels_

# One permutation that groups rows by cluster label; it is reused for the
# table and for both axes of the (square) correlation matrix.
row_order = np.argsort(model.row_labels_)

# DataFrame.ix was removed in pandas 1.0.  The permutation is positional, so
# .iloc is the correct indexer (and identical to the old behaviour on the
# default RangeIndex produced by read_csv).
data = data.iloc[row_order]
data = data.reset_index(drop=True)
correlations = correlations[row_order, :]
correlations = correlations[:, row_order]
plt.subplot(122)
plt.title("Rearranged")
plt.pcolor(correlations, cmap='inferno')
plt.colorbar()
plt.savefig('plots/classifying_whiskies_1')

group_colors = ['red', 'yellow', 'green', 'blue', 'purple', 'orange']
correlation_colors = []
for i in range(len(correlations)):
Ejemplo n.º 36
0
from sklearn.cluster.bicluster import SpectralCoclustering
from sklearn.metrics import consensus_score

# Generate an unshuffled matrix with planted biclusters plus the boolean
# row/column membership indicators of each bicluster.
data, rows, columns = make_biclusters(
    shape=(300, 300), n_clusters=5, noise=5, shuffle=False, random_state=0)

blues = plt.cm.Blues

plt.matshow(data, cmap=blues)
plt.title("Original dataset")

# Shuffle rows and columns; row_idx / col_idx record the applied permutation.
data, row_idx, col_idx = sg._shuffle(data, random_state=0)
plt.matshow(data, cmap=blues)
plt.title("Shuffled dataset")

model = SpectralCoclustering(n_clusters=6, random_state=0)
model.fit(data)

# The reference biclusters must be permuted the same way as the data.
shuffled_truth = (rows[:, row_idx], columns[:, col_idx])
score = consensus_score(model.biclusters_, shuffled_truth)

print("consensus score: {:.3f}".format(score))

# Sort rows, then columns, by their assigned cluster label.
row_order = np.argsort(model.row_labels_)
col_order = np.argsort(model.column_labels_)
fit_data = data[row_order]
fit_data = fit_data[:, col_order]

plt.matshow(fit_data, cmap=blues)
plt.title("After biclustering; rearranged to show biclusters")

plt.show()
Ejemplo n.º 37
0
# exclude 'comp.os.ms-windows.misc'
categories = [
    'alt.atheism',
    'comp.graphics',
    'comp.sys.ibm.pc.hardware',
    'comp.sys.mac.hardware',
    'comp.windows.x',
    'misc.forsale',
    'rec.autos',
    'rec.motorcycles',
    'rec.sport.baseball',
    'rec.sport.hockey',
    'sci.crypt',
    'sci.electronics',
    'sci.med',
    'sci.space',
    'soc.religion.christian',
    'talk.politics.guns',
    'talk.politics.mideast',
    'talk.politics.misc',
    'talk.religion.misc',
]
newsgroups = fetch_20newsgroups(categories=categories)
y_true = newsgroups.target

# Both estimators are asked for one cluster per retained category.
n_groups = len(categories)
vectorizer = TfidfVectorizer(
    stop_words='english',
    min_df=5,
    tokenizer=number_aware_tokenizer,
)
cocluster = SpectralCoclustering(
    n_clusters=n_groups,
    svd_method='arpack',
    random_state=0,
)
kmeans = MiniBatchKMeans(
    n_clusters=n_groups,
    batch_size=20000,
    random_state=0,
)

print("Vectorizing...")
X = vectorizer.fit_transform(newsgroups.data)

print("Coclustering...")
start_time = time()
cocluster.fit(X)
y_cocluster = cocluster.row_labels_
# Elapsed time is taken before the score is computed, as in the single-call
# form, so the reported duration covers the fit only.
elapsed = time() - start_time
v_measure = v_measure_score(y_cocluster, y_true)
print("Done in {:.2f}s. V-measure: {:.4f}".format(elapsed, v_measure))

print("MiniBatchKMeans...")
Ejemplo n.º 38
0
# Heatmap of cluster_counts with every row divided by its own row sum, so
# each row shows fractions in [0, 1] (matching vmin/vmax below).
plt.figure(figsize=(15, 25))
sns.heatmap(cluster_counts / cluster_counts.sum(1)[:, np.newaxis],
            yticklabels=cluster_cell_types,
            vmin=0,
            vmax=1,
            linewidths=0.5)
plt.xlabel('UNCURL clusters')
plt.ylabel('Seurat clusters')
plt.title('SCH Cerebellum Clusters')
plt.savefig('uncurl_vs_seurat_clusters.png', dpi=200)

# do a biclustering

from sklearn.cluster.bicluster import SpectralCoclustering

# Request 18 co-clusters (n_clusters passed positionally).
spec = SpectralCoclustering(18)
# Copy of cluster_counts with row 31 dropped.
# NOTE(review): this subset is not used by any statement visible here; the
# fit and the reordering below all operate on the full cluster_counts.
cluster_counts_subset = np.vstack(
    [cluster_counts[:31, :], cluster_counts[32:, :]])
# A small constant is added to every cell before fitting.
# NOTE(review): presumably to avoid all-zero rows/columns -- confirm.
spec.fit(cluster_counts + 0.0001)
row_labels = spec.row_labels_
column_labels = spec.column_labels_

# Permutations that group rows / columns by their assigned cluster label.
row_order = np.argsort(row_labels)
col_order = np.argsort(column_labels)

#row_labels = row_labels[row_order]
#col_labels = column_labels[col_order]

# Apply the row permutation first, then the column permutation.
cluster_counts_reordered = cluster_counts[row_order, :]
cluster_counts_reordered = cluster_counts_reordered[:, col_order]
cluster_cell_types_2 = np.array(
Ejemplo n.º 39
0
def test_co_clustering():
    """Exploratory run of spectral co-clustering on real and simulated data.

    Three stages run in sequence, each ending in a ``plt.show()``:

    1. Build a correlation-based similarity matrix between the voxels
       selected by two ROI masks in one subject image and co-cluster it.
    2. Co-cluster a shuffled ``make_biclusters`` matrix and print the
       consensus score against the planted biclusters.
    3. Recompute the ROI similarity matrix and co-cluster it with a
       default-configured ``SpectralCoclustering``.

    The function takes no arguments and returns nothing.  It relies on the
    module-level names ``home`` and ``generate_simple_blobs`` and on
    hard-coded file paths.
    """
    # All imports are done up front.  Previously ``import scipy as sp`` sat
    # near the end of the function, which made ``sp`` a local name that was
    # still unbound at its first use (UnboundLocalError).
    import numpy as np
    import nibabel as nb
    import scipy as sp
    import scipy.spatial.distance  # ensures sp.spatial.distance is loaded
    from matplotlib import pyplot as plt
    import sklearn as sk
    import sklearn.preprocessing  # ensures sk.preprocessing is loaded
    from sklearn import cluster
    from sklearn.datasets import make_biclusters
    from sklearn.datasets import samples_generator as sg
    from sklearn.cluster.bicluster import SpectralCoclustering
    from sklearn.metrics import consensus_score

    # ------------------------------------------------------------------
    # REAL DATA
    # ------------------------------------------------------------------
    subject_file = '/Users/aki.nikolaidis/Desktop/NKI_SampleData/A00060280/3mm_bandpassed_demeaned_filtered_antswarp.nii.gz'
    roi_mask_file = home + '/git_repo/basc/masks/BG_3mm.nii.gz'
    roi2_mask_file = home + '/git_repo/basc/masks/yeo2_3mm.nii.gz'

    data = nb.load(subject_file).get_data().astype('float32')
    print('Data Loaded')

    print('Setting up NIS')
    roi_mask_file_nb = nb.load(roi_mask_file)
    roi2_mask_file_nb = nb.load(roi2_mask_file)

    # Boolean masks: any non-zero mask value selects the voxel.
    roi_mask_nparray = roi_mask_file_nb.get_data().astype(
        'float32').astype('bool')
    roi2_mask_nparray = roi2_mask_file_nb.get_data().astype(
        'float32').astype('bool')

    roi1data = data[roi_mask_nparray]
    roi2data = data[roi2_mask_nparray]

    # L2-normalise each selected row before computing correlations.
    roi1data = sk.preprocessing.normalize(roi1data, norm='l2')
    roi2data = sk.preprocessing.normalize(roi2data, norm='l2')

    # Similarity = 1 - correlation distance, with NaNs and negative values
    # clamped to 0.
    dist_btwn_data_1_2 = np.array(
        sp.spatial.distance.cdist(roi1data, roi2data, metric='correlation'))
    sim_btwn_data_1_2 = 1 - dist_btwn_data_1_2
    sim_btwn_data_1_2[np.isnan(sim_btwn_data_1_2)] = 0
    sim_btwn_data_1_2[sim_btwn_data_1_2 < 0] = 0

    # Add uniform noise in [0, 0.01) to every cell, then cap at 1.
    sim_btwn_data_1_2 = sim_btwn_data_1_2 + (
        np.random.rand(*sim_btwn_data_1_2.shape)) / 100
    sim_btwn_data_1_2[sim_btwn_data_1_2 > 1] = 1

    # Sanity counts.  The earlier form compared against np.nan with ``==``
    # (always False) and discarded both results.
    print('non-finite similarities: inf={}, nan={}'.format(
        int(np.isinf(sim_btwn_data_1_2).sum()),
        int(np.isnan(sim_btwn_data_1_2).sum())))

    model = SpectralCoclustering(n_clusters=5, random_state=0, n_init=100)
    model.fit(sim_btwn_data_1_2)

    # Group rows, then columns, by their assigned cluster label.
    fit_data = sim_btwn_data_1_2[np.argsort(model.row_labels_)]
    fit_data = fit_data[:, np.argsort(model.column_labels_)]

    plt.matshow(fit_data, cmap=plt.cm.Blues)
    plt.title("After biclustering; rearranged to show biclusters")

    plt.show()

    # ------------------------------------------------------------------
    # SIMULATION DATA
    # ------------------------------------------------------------------
    # Creating Simulated Data
    data, rows, columns = make_biclusters(shape=(300, 100),
                                          n_clusters=5,
                                          noise=5,
                                          shuffle=False,
                                          random_state=0)

    plt.matshow(data, cmap=plt.cm.Blues)
    plt.title("Original dataset")

    data, row_idx, col_idx = sg._shuffle(data, random_state=0)
    plt.matshow(data, cmap=plt.cm.Blues)
    plt.title("Shuffled dataset")

    # Creating Model
    model = SpectralCoclustering(n_clusters=5, random_state=0)
    model.fit(data)
    # The planted biclusters are permuted like the data before scoring.
    score = consensus_score(model.biclusters_,
                            (rows[:, row_idx], columns[:, col_idx]))

    print("consensus score: {:.3f}".format(score))

    fit_data = data[np.argsort(model.row_labels_)]
    fit_data = fit_data[:, np.argsort(model.column_labels_)]

    plt.matshow(fit_data, cmap=plt.cm.Blues)
    plt.title("After biclustering; rearranged to show biclusters")

    plt.show()

    ####################################################################
    ####################################################################
    # NOTE(review): data1 / data2 are generated but never used; the distance
    # below is still computed from roi1data / roi2data.  Confirm whether the
    # blobs were meant to be the input of this stage.
    data1 = generate_simple_blobs(27)
    data2 = generate_simple_blobs(27)
    data2 = data2[0:150, :]

    print("Calculating Cross-clustering")
    print("Calculating pairwise distances between areas")

    dist_btwn_data_1_2 = np.array(
        sp.spatial.distance.cdist(roi1data, roi2data, metric='correlation'))
    sim_btwn_data_1_2 = 1 - dist_btwn_data_1_2
    sim_btwn_data_1_2[sim_btwn_data_1_2 < 0] = 0
    co_cluster = cluster.SpectralCoclustering()
    co_cluster.fit(sim_btwn_data_1_2)
    # NOTE(review): the reference biclusters are those of the simulated
    # matrix from the previous stage, not of the similarity matrix fitted
    # here -- confirm this comparison is intended.
    score = consensus_score(co_cluster.biclusters_,
                            (rows[:, row_idx], columns[:, col_idx]))

    print("consensus score: {:.3f}".format(score))

    # Reorder the matrix the model was actually fitted on.  Indexing the
    # simulated ``data`` with these labels (as before) mixes two unrelated
    # matrices of different shape.
    fit_data = sim_btwn_data_1_2[np.argsort(co_cluster.row_labels_)]
    fit_data = fit_data[:, np.argsort(co_cluster.column_labels_)]

    plt.matshow(fit_data, cmap=plt.cm.Blues)
    plt.title("After biclustering; rearranged to show biclusters")

    plt.show()
Ejemplo n.º 40
0
# Collect the merged abstract text of every paper that has an abstract.
listOfAbstracts = []
for paper in papers:
    article = paper['MedlineCitation']['Article']
    if 'Abstract' in article:
        listOfAbstracts.append(
            mergeAbstract(article['Abstract']['AbstractText']))

# Create TF-IDF matrix
# NOTE(review): an integer max_df is an absolute document count, so
# max_df=1 discards every term that occurs in more than one abstract.
# If "no upper limit" was intended this should be the float 1.0 -- confirm.
vect = TfidfVectorizer(max_df = 1)
tfidf = vect.fit_transform(listOfAbstracts)


# Non-negative Matrix Factorization
num_topics = 2
num_top_words = 5
nmf = decomposition.NMF(n_components=num_topics, random_state=1)
doctopic = nmf.fit_transform(tfidf)
topic_words = []
# get_feature_names() was removed in scikit-learn 1.2; prefer the replacement
# when it exists and fall back to the old name on older releases.
_feature_names = (getattr(vect, 'get_feature_names_out', None)
                  or vect.get_feature_names)
vocab = np.array(_feature_names())

# For every topic keep the num_top_words terms with the largest weights.
for topic in nmf.components_:
    word_idx = np.argsort(topic)[::-1][0:num_top_words]
    topic_words.append([vocab[i] for i in word_idx])

# Coclustering
cocluster = SpectralCoclustering(n_clusters=5,svd_method='arpack', random_state=0)
cocluster.fit(tfidf)
y_cocluster = cocluster.row_labels_
x_cocluster = cocluster.column_labels_

# print(vocab[x_cocluster == 4])
Ejemplo n.º 41
0
#mM[np.where(np.logical_and(mM >= 0.25, mM < 1.25))] = 0.5
#mM[mM >= 1.25] = 1


#####

# Wrap the raw matrix in a DataFrame whose row labels come from `indx` and
# whose column labels come from `areas`.
matDF = pd.DataFrame(MatOut).set_index(np.array(indx))
matDF.columns = areas


# Original plot
plt.matshow(MatOut, cmap=plt.cm.Blues)
plt.title("Original dataset")
clusters = 8 #6 

# No random_state is passed, so results may differ between runs.
model = SpectralCoclustering(n_clusters=clusters)
#model = SpectralBiclustering(n_clusters=clusters)
model.fit(matDF)



# Reorder columns so that columns with the same cluster label are adjacent.
fitData_c = matDF.columns[np.argsort(model.column_labels_)]
matDF = matDF[fitData_c]
# Reorder rows the same way.  The reordering is label-based (reindex).
# NOTE(review): this presumably requires unique index labels -- confirm.
fitData_i = matDF.index[ np.argsort(model.row_labels_)]
matDF = matDF.reindex(fitData_i)

# Characters 13..15 of every reordered column label.
column_names =  np.array([i[13:16] for i in fitData_c])

# plot
fig = plt.figure()
ax = fig.add_subplot(111)
    km = KMeans(n_clusters=k)
    km = km.fit(df.iloc[:,2:14])
    SS.append(km.inertia_)
# pandas is needed before its earlier import position (pd is used just
# below), so both mid-script imports are grouped here.
import pandas as pd
from sklearn.cluster.bicluster import SpectralCoclustering

# Plot SS against NC, both collected before this point.
plt.plot(NC,SS)
plt.xlabel('k')
plt.ylabel('SS')
plt.show()

# Row-to-row correlation of columns 2..13 of df.
flavour=df.iloc[:,2:14]
corr_whisky=pd.DataFrame.corr(flavour.transpose())
print(corr_whisky)
plt.figure(figsize=(8,8))
plt.pcolor(corr_whisky)
plt.colorbar()
# NOTE(review): this model is constructed but never fitted or used by any
# statement visible here.
model=SpectralCoclustering(n_clusters=5,random_state=45)
x=df["Distillery"]
df["disteliries_group"]=pd.Series(x,index=df.index)

# Pair column 1 with column 13 of df and sort the pairs by the second item.
cluster=list(zip(df.iloc[:,1],df.iloc[:,13]))

cluster=sorted(cluster, key=lambda x: x[1])

print("the resultant grouped classified whiskey based on their flavour")
print("\n")

c=pd.DataFrame(cluster)
print(c)
# Write the sorted pairs through a context manager so the file is flushed
# and closed deterministically (the handle used to be leaked).
with open('model1.pkl','wb') as pickle_file:
    pickle.dump(cluster,pickle_file)
# pickle.dump() returns None; the binding is kept so the name still exists.
model1=None

       
Ejemplo n.º 43
0
        continue
    
    # TODO hack: skip very long lists
    if skip_thresh and len(sources) > skip_thresh:
        continue

    # All events have numbered tweets
    rowSelector = np.array([row_lookup[source] for source in sources])
    data[rowSelector, j] = 1    

blues = plt.cm.Blues

# Picture of the matrix as it was built.
plt.matshow(data, cmap=blues)
plt.title("Original dataset")

original_png = '%s_original.png' % (identifier)
plt.savefig(original_png, bbox_inches='tight')

model = SpectralCoclustering(n_clusters=n_clusters, random_state=0)
model.fit(data)

# Permute rows, then columns, so equal cluster labels become adjacent.
row_order = np.argsort(model.row_labels_)
col_order = np.argsort(model.column_labels_)
fit_data = data[row_order]
fit_data = fit_data[:, col_order]

# Picture of the rearranged matrix.
plt.matshow(fit_data, cmap=blues)
plt.title("After biclustering; rearranged")

clustered_png = '%s_clustered.png' % (identifier)
plt.savefig(clustered_png, bbox_inches='tight')

# Independent copy of the matrix for the code that follows.
avg_data = np.copy(data)

# Compute average value in each co-cluster for display purposes
for c in range(n_clusters):
    for d in range(n_clusters):                       
Ejemplo n.º 44
0
print(coOccurencesMatrix)
print(coOccurencesMatrix.shape)

# get_feature_names() was removed in scikit-learn 1.2; prefer the replacement
# when it exists and fall back to the old name on older releases.
_feature_names = (getattr(vectorizer, 'get_feature_names_out', None)
                  or vectorizer.get_feature_names)
hashtags = _feature_names()
hashtags = np.array(hashtags)

# NOTE(review): this replaces zeros with zeros, so the values are unchanged.
# It looks like a leftover from an experiment; kept as is.
coOccurencesMatrix = np.where(coOccurencesMatrix == 0, 0, coOccurencesMatrix)
#coOccurencesMatrix = StandardScaler().fit_transform(coOccurencesMatrix)

print(coOccurencesMatrix)
import copy
# Two independent row-correlation matrices of the same input.
coOccurencesMatrix2 = copy.deepcopy(coOccurencesMatrix)
coOccurencesMatrix2 = np.corrcoef(coOccurencesMatrix2)
coOccurencesMatrix = np.corrcoef(coOccurencesMatrix)
nbClusters = 40
model = SpectralCoclustering(n_clusters=nbClusters, random_state=1)
model.fit(coOccurencesMatrix)
print("fit")
print(coOccurencesMatrix)

# Each permutation is computed once and shared by the matrix and the
# hashtag labels, instead of calling argsort twice per axis.
row_order = np.argsort(model.row_labels_)
column_order = np.argsort(model.column_labels_)
fit_data = coOccurencesMatrix[row_order]
fit_data = fit_data[:, column_order]
hashtagsrow = hashtags[row_order]
hashtagscolumns = hashtags[column_order]
print("rowlavels")
print(model.row_labels_)
print("columnlzbels")
print(model.column_labels_)
print("hashtags")
print(hashtags)
print(fit_data.shape)
print(fit_data)
Ejemplo n.º 45
0
# Column-to-column correlation of `flavors`.
corr_flavors = pd.DataFrame.corr(flavors)
corr_flavors

plt.figure(figsize=(10,10))
plt.pcolor(corr_flavors)
plt.colorbar()
plt.savefig('./python_case_studies/whisky/corr_flavors.pdf')

# Row-to-row correlation (the table is transposed first).
corr_whisky = pd.DataFrame.corr(flavors.transpose())
plt.figure(figsize=(10,10))
plt.pcolor(corr_whisky)
plt.axis('tight')
plt.colorbar()
plt.savefig('./python_case_studies/whisky/corr_whisky.pdf')

model = SpectralCoclustering(n_clusters=6, random_state=0)
model.fit(corr_whisky)

# The bare expressions below only display values in an interactive session;
# they have no effect when the file is run as a script.
model.rows_

np.sum(model.rows_, axis=1)

np.sum(model.rows_, axis=0)

model.row_labels_

whisky['Group'] = pd.Series(model.row_labels_, index=whisky.index)
# DataFrame.ix was removed in pandas 1.0.  argsort yields positions, so the
# positional indexer .iloc is the correct replacement.
whisky = whisky.iloc[np.argsort(model.row_labels_)]
whisky = whisky.reset_index(drop=True)

# Recompute the row-to-row correlation on the regrouped table.
correlations = pd.DataFrame.corr(whisky.iloc[:,2:14].transpose())
correlations = np.array(correlations)
Ejemplo n.º 46
0
# Create the output directory only when it is missing, so the script can be
# re-run without failing on the existing directory.
if not os.path.isdir('solution'):
    os.mkdir('solution')

#n_clusters = (3, 2)
n_clusters = 20

# Parse the tab-separated file; the last field of every line is dropped.
# Explicit list comprehensions are used because on Python 3 map() returns an
# iterator, which would turn the result into an array of map objects.
with open('dados_v2.txt') as arq:
    dados = np.array([[float(valor) for valor in a.split('\t')[:-1]]
                      for a in arq])

# dados.T is the transpose that zip(*dados) produced on Python 2; on
# Python 3 zip() returns an iterator that matshow cannot draw.
plt.matshow(dados.T, cmap=cm.PiYG)
plt.title("Original dataset")
pl.savefig('solution/original.png', bbox_inches=0)

#model = SpectralBiclustering(n_clusters=n_clusters, method='log', random_state=0)
model = SpectralCoclustering(n_clusters=n_clusters,
                             svd_method='arpack',
                             random_state=0)
model.fit(dados)
# Group rows, then columns, by their assigned cluster label.
fit_data = dados[np.argsort(model.row_labels_)]
fit_data = fit_data[:, np.argsort(model.column_labels_)]

plt.matshow(fit_data.T, cmap=cm.PiYG)
# The title is set before saving; it used to be set afterwards and was
# therefore missing from the saved image.
plt.title("After biclustering; rearranged to show biclusters")
pl.savefig('solution/biclustered.png', bbox_inches=0)
# Outer product of the sorted (1-based) row and column labels.
plt.matshow(np.outer(
    np.sort(model.row_labels_) + 1,
    np.sort(model.column_labels_) + 1).T,
            cmap=plt.cm.PiYG)
plt.title("Checkerboard structure of rearranged data")
pl.savefig('solution/biclustered and rearranged.png', bbox_inches=0)
Ejemplo n.º 47
0
    this form of dimensionality reduction, some methods may perform better.
    """
    return ("#NUMBER" if token[0].isdigit() else token for token in tokens)


class NumberNormalizingVectorizer(TfidfVectorizer):
    """TfidfVectorizer whose tokens are passed through number_normalizer."""

    def build_tokenizer(self):
        """Return a callable mapping a document to its normalized token list.

        The parent class's tokenizer splits the document; its output is fed
        through ``number_normalizer`` and materialised as a list.
        """
        base_tokenizer = super(
            NumberNormalizingVectorizer, self).build_tokenizer()

        def tokenize_and_normalize(doc):
            raw_tokens = base_tokenizer(doc)
            return list(number_normalizer(raw_tokens))

        return tokenize_and_normalize


# NOTE(review): `dir` shadows the builtin of the same name; the name is kept
# because statements outside this fragment may rely on it.
dir = ROOT_DIR+'\\processed_data\\'
# Load the corpus through a context manager so the file handle is closed
# (it used to be leaked).
# NOTE: pickle.load must only be used on trusted files -- unpickling can
# execute arbitrary code.
with open(dir+'FT_raw_corpus_2013.p', 'rb') as corpus_file:
    data = pickle.load(corpus_file)

vectorizer = NumberNormalizingVectorizer(stop_words='english', min_df=5)
cocluster = SpectralCoclustering(n_clusters= 20,
                                 svd_method='arpack', random_state=0)
kmeans = MiniBatchKMeans(n_clusters=20, batch_size=20000,
                         random_state=0)

print("Vectorizing...")
X = vectorizer.fit_transform(data)

print("Coclustering...")
start_time = time()
cocluster.fit(X)
# Cluster label of every row of X.
y_cocluster = cocluster.row_labels_

print("MiniBatchKMeans...")
start_time = time()
y_kmeans = kmeans.fit_predict(X)
from sklearn.datasets import make_biclusters
from sklearn.datasets import samples_generator as sg
from sklearn.cluster.bicluster import SpectralCoclustering
from sklearn.metrics import consensus_score

# Generate an unshuffled matrix with planted biclusters plus the boolean
# row/column membership indicators of each bicluster.
data, rows, columns = make_biclusters(
    shape=(300, 300), n_clusters=5, noise=5,
    shuffle=False, random_state=0)

plt.matshow(data, cmap=plt.cm.Blues)
plt.title("Original dataset")

# Shuffle rows and columns; row_idx / col_idx record the applied permutation.
data, row_idx, col_idx = sg._shuffle(data, random_state=0)
plt.matshow(data, cmap=plt.cm.Blues)
plt.title("Shuffled dataset")

model = SpectralCoclustering(n_clusters=5, random_state=0)
model.fit(data)
# The planted biclusters are permuted like the data before scoring.
score = consensus_score(model.biclusters_,
                        (rows[:, row_idx], columns[:, col_idx]))

# Call form of print: valid on Python 2 and Python 3 (the statement form is
# a SyntaxError on Python 3).
print("consensus score: {:.3f}".format(score))

# Group rows, then columns, by their assigned cluster label.
fit_data = data[np.argsort(model.row_labels_)]
fit_data = fit_data[:, np.argsort(model.column_labels_)]

plt.matshow(fit_data, cmap=plt.cm.Blues)
plt.title("After biclustering; rearranged to show biclusters")

plt.show()
Ejemplo n.º 49
0
cluster_colors = ["red", "orange", "green", "blue", "purple", "gray"]
regions = [
    "Speyside", "Highlands", "Lowlands", "Islands", "Campbelltown", "Islay"
]
import numpy as np
# Pair every region with the colour at the same position.
region_colors = dict(zip(regions, cluster_colors))
print(region_colors)

#from lectures
import pandas as pd
import pylab as plt
whisky = pd.read_csv('whiskies.txt')
whisky['Region'] = pd.read_csv('regions.txt')
from sklearn.cluster.bicluster import SpectralCoclustering
model = SpectralCoclustering(n_clusters=6, random_state=0)
flavors = whisky.iloc[:, 2:14]
corr_flavors = pd.DataFrame.corr(flavors)
# Row-to-row correlation (the table is transposed first).
corr_whisky = pd.DataFrame.corr(flavors.transpose())
model.fit(corr_whisky)
whisky['Group'] = pd.Series(model.row_labels_, index=whisky.index)
# DataFrame.ix was removed in pandas 1.0.  argsort yields positions, so the
# positional indexer .iloc is the correct replacement (and identical on the
# default RangeIndex produced by read_csv).
whisky = whisky.iloc[np.argsort(model.row_labels_)]
whisky = whisky.reset_index(drop=True)
# Recompute the row-to-row correlation on the regrouped table.
correlations = pd.DataFrame.corr(whisky.iloc[:, 2:14].transpose())
correlations = np.array(correlations)

distilleries = list(whisky.Distillery)
correlation_colors = []
for i in range(len(distilleries)):
    for j in range(len(distilleries)):
        if correlations[i][
Ejemplo n.º 50
0
# Debug output: number of rows, length of the first row, and container type
# of the matrix that is co-clustered below.
print(len(user_movie_matrix))
print(len(user_movie_matrix[0]))
#print(user_movie_matrix)
print(type(user_movie_matrix))

# Disabled graph visualisation, kept as an inert string literal.
'''G = nx_graph_from_biadjacency_matrix(user_movie_matrix)
nx.draw(G)
plt.show()'''

# Initialise and carry out the co-clustering with K biclusters.
K=50

#km = KMeans(n_clusters = K)
#km.fit(user_movie_matrix)
# No random_state is passed, so results may differ between runs.
scc = SpectralCoclustering(n_clusters = K,svd_method='arpack')
scc.fit(user_movie_matrix)

# Cluster label of every matrix row and every matrix column.
row_labels = scc.row_labels_
column_labels = scc.column_labels_

# Per-bicluster counters, one slot per bicluster, initially zero.
bicluster_num_users=np.zeros(K)
bicluster_num_movies=np.zeros(K)
#maintain a list of users per bicluster
bicluster_list_users=[]
#maintain a list of movies per bicluster
bicluster_list_movies=[]

for i in range(K):
    bicluster_list_users.append([])