# Imports added so the snippets below are self-contained. Names used without
# definition here (fraction, get_income, cocluster, X, newsgroups,
# document_names, feature_names, data, data_list, events_list, datastream,
# events, args, identifier, n_clusters) are assumed to come from the
# surrounding scripts.
import operator
import sys
from collections import defaultdict

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.cluster import SpectralCoclustering


def biclustering(db):
    df = pd.read_csv(
        '/home/fan/intern/process_db/analysis/viewtime_matrix_501_0.1.csv')
    dma = 501  # DMA (market) code matching the input file
    print(df.shape)
    dev_list = df.iloc[:, 0].values   # first column holds device IDs
    df.drop(df.columns[0], axis=1, inplace=True)
    # Program names are taken after the drop so their indices line up with
    # the model's column indices (the original read them before the drop,
    # which shifted every program label by one).
    prog_list = df.columns.values
    # Normalize each device's row into viewing-time fractions.
    df = df.apply(fraction, axis=1)
    # Alternative considered in the original: SpectralBiclustering fits a
    # checkerboard structure instead of one-to-one co-clusters.
    # model = SpectralBiclustering(random_state=None)
    model = SpectralCoclustering(n_clusters=10)
    model.fit(df)
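    # After fitting, model.row_labels_ and model.column_labels_ hold the
    # bicluster assignment of every device row and program column;
    # get_shape(i) and get_indices(i) expose each bicluster directly.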
    print(model.get_params())
    # Inspect the first 5 of the 10 fitted biclusters.
    for i in range(0, 5):
        print('Size of one cluster:', model.get_shape(i))
        indices = model.get_indices(i)
        print(prog_list[indices[1]])          # programs in this bicluster
        print(model.get_submatrix(i, df.values))
        dev_in_cluster = dev_list[indices[0]]
        print('number of devices within this cluster:', len(dev_in_cluster))
        get_income(db, dma, dev_in_cluster.tolist())


def bicluster_ncut(i):
    # The head of this function was missing; reconstructed following the
    # scikit-learn newsgroups co-clustering example that these snippets
    # track. cocluster.rows_[i] / cocluster.columns_[i] are boolean
    # membership masks for bicluster i.
    rows, cols = cocluster.get_indices(i)
    if not (np.any(rows) and np.any(cols)):
        return sys.float_info.max
    row_complement = np.nonzero(np.logical_not(cocluster.rows_[i]))[0]
    col_complement = np.nonzero(np.logical_not(cocluster.columns_[i]))[0]
    # Normalized cut: weight inside the bicluster vs. weight crossing
    # its row/column boundaries.
    weight = X[rows[:, np.newaxis], cols].sum()
    cut = (X[row_complement[:, np.newaxis], cols].sum() +
           X[rows[:, np.newaxis], col_complement].sum())
    return cut / weight


    """Items of a defaultdict(int) with the highest values.

    Like Counter.most_common in Python >=2.7.
    """
    return sorted(iteritems(d), key=operator.itemgetter(1), reverse=True)


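# Rank every bicluster by normalized cut and keep the five lowest; a low
# cut means the bicluster is dense inside and weakly connected to the rest
# of the matrix.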
bicluster_ncuts = list(bicluster_ncut(i)
                       for i in range(len(newsgroups.target_names)))
best_idx = np.argsort(bicluster_ncuts)[:5]

print()
print("Best biclusters:")
print("----------------")
for idx, cluster in enumerate(best_idx):
    n_rows, n_cols = cocluster.get_shape(cluster)
    cluster_docs, cluster_words = cocluster.get_indices(cluster)
    if not len(cluster_docs) or not len(cluster_words):
        continue

    # categories
    counter = defaultdict(int)
    for i in cluster_docs:
        counter[document_names[i]] += 1
    cat_string = ", ".join("{:.0f}% {}".format(float(c) / n_rows * 100, name)
                           for name, c in most_common(counter)[:3])

    # words
    out_of_cluster_docs = cocluster.row_labels_ != cluster
    out_of_cluster_docs = np.where(out_of_cluster_docs)[0]
    word_col = X[:, cluster_words]
    # The snippet broke off here; completed following the scikit-learn
    # newsgroups co-clustering demo that this code tracks: score each word
    # by in-cluster minus out-of-cluster total weight. feature_names is
    # assumed to be the vectorizer vocabulary built alongside X.
    word_scores = np.array(word_col[cluster_docs, :].sum(axis=0) -
                           word_col[out_of_cluster_docs, :].sum(axis=0))
    word_scores = word_scores.ravel()
    important_words = list(feature_names[cluster_words[i]]
                           for i in word_scores.argsort()[:-11:-1])

    print("bicluster {} : {} documents, {} words".format(idx, n_rows, n_cols))
    print("categories   : {}".format(cat_string))
    print("words        : {}\n".format(', '.join(important_words)))
# Build a block-averaged version of the data matrix: every bicluster block
# is replaced by its mean value. The loop head was missing here and is
# reconstructed; np.ix_ is used so the boolean masks select the full
# (rows x columns) submatrix rather than pairwise elements.
avg_data = np.zeros(data.shape)
for r in range(n_clusters):
    for c in range(n_clusters):
        row_sel = model.row_labels_ == r
        col_sel = model.column_labels_ == c
        sel = np.ix_(row_sel, col_sel)
        avg_data[sel] = np.average(data[sel])

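# Reorder rows and columns by cluster label so members of each bicluster
# are contiguous and the block structure becomes visible.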
avg_data = avg_data[np.argsort(model.row_labels_)]
avg_data = avg_data[:, np.argsort(model.column_labels_)]

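# Heatmap of the block-averaged matrix; with the reordering above, the
# fitted biclusters show up as rectangular blocks.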
plt.matshow(avg_data, cmap=plt.cm.Blues)
plt.title("Average cluster intensity")

plt.savefig('%s_averaged.png' % (identifier), bbox_inches='tight')

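# Optional write-back: tag each datastream document and event with its
# bicluster id, nulling clusters judged too large to be informative.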
if args.write:
    print "Writing clusters to database."
    # No need to clean up here, just overwrite by _id.
    for c in range(n_clusters):
        (nr, nc) = model.get_shape(c)
        (row_ind, col_ind) = model.get_indices(c)

        cluster_val = None
        if nr > 25 or nc > 50:
            print "Nulling cluster %d: shape (%d, %d)" % (c, nr, nc)
        else:
            cluster_val = c

        for ri in row_ind:
            data_list[ri]['cluster'] = cluster_val
            datastream.save(data_list[ri])
        for ci in col_ind:
            events_list[ci]['cluster'] = cluster_val
            events.save(events_list[ci])