Example #1
0
def get_kmedoids(df, dist_matrix, num_clusters):
    """Attach k-medoids cluster assignments to *df*.

    Runs Bio.Cluster.kmedoids on the precomputed distance matrix and
    stores the resulting medoid ids in a new ``cluster_id`` column of a
    reset-index copy of the input frame, which is returned.
    """
    medoid_ids, _error, _nfound = Cluster.kmedoids(dist_matrix, num_clusters)
    result = df.reset_index()
    result['cluster_id'] = pd.DataFrame(data=medoid_ids)
    return result
Example #2
0
def main(target_id, in_file, api_key):
    """Cluster UniProt entries by their description terms and print the
    cluster containing *target_id*.

    Builds (or reuses) a shelve cache of description terms keyed by
    UniProt id, k-means clusters the term matrix into 10 clusters, and
    prints every member of the cluster that holds the target id.
    """
    cache_dir = os.path.join(os.getcwd(), "cache")
    uniprot_retriever = UniprotRestRetrieval(cache_dir)
    # persistent term cache named after the input file
    cur_db = shelve.open("%s.db" % os.path.splitext(in_file)[0])
    # load the database
    with open(in_file) as in_handle:
        in_handle.readline() # header
        for index, line in enumerate(in_handle):
            # first whitespace-separated token of each line is the id
            uniprot_id = line.split()[0].strip()
            # only fetch terms for ids not already cached
            if uniprot_id not in cur_db.keys():
                cur_terms = get_description_terms(uniprot_retriever,
                        uniprot_id, api_key)
                if len(cur_terms) > 0:
                    cur_db[uniprot_id] = cur_terms
    # cluster and print out cluster details
    term_matrix, uniprot_ids = organize_term_array(cur_db)
    # k-means (method='a' is arithmetic mean) with Euclidean distance
    cluster_ids, error, nfound = Cluster.kcluster(term_matrix,
            nclusters=10, npass=20, method='a', dist='e')
    cluster_dict = collections.defaultdict(lambda: [])
    for i, cluster_id in enumerate(cluster_ids):
        cluster_dict[cluster_id].append(uniprot_ids[i])
    # print only the group that contains the target id
    for cluster_group in cluster_dict.values():
        if target_id in cluster_group:
            for item in cluster_group:
                print item, cur_db[item]
    cur_db.close()
def cluster_kernels(kernels_array, k=kmeans_k, times=1):
    """K-means cluster kernel vectors, keeping the best of *times* runs.

    k defaults to the module-level constant kmeans_k.  Returns
    (clusterid, cdata): per-row cluster assignments of the best (lowest
    within-cluster error) run, and the corresponding cluster centroids.
    """
    print "start clustering"

    clusterid = []
    error_best = float('inf')
    for i in range(times):
        # dist='a' selects absolute-correlation distance (Bio.Cluster);
        # each call is an independent clustering attempt
        clusterid_single, error, nfound = Cluster.kcluster(kernels_array, nclusters=k, dist='a')
        if error < error_best:
            clusterid = clusterid_single
            error_best = error
    print 'error:', error_best

    # centroids computed from the winning assignment
    cdata, cmask = Cluster.clustercentroids(kernels_array, clusterid=clusterid, )

    print "end clustering"

    return clusterid, cdata
Example #4
0
def cluster(*args):
    """Hierarchically cluster an expression file on both axes.

    args[0] is the input file path (e.g. "cyano.txt"); args[1] is the
    job name handed to record.save.  Genes are clustered with single
    linkage ('s'); experiments with uncentered correlation ('u').
    """
    with open(args[0]) as handle:
        record = Cluster.read(handle)
        gene_tree = record.treecluster(method='s')
        exp_tree = record.treecluster(dist='u', transpose=1)
        record.save(args[1], gene_tree, exp_tree)
def main(ipr_number, num_clusters):
    """Cluster InterPro entries by physicochemical features and write
    one HTML summary per cluster.

    Reads the shelve database for *ipr_number*, builds a weighted
    feature vector per non-UniRef-child entry, k-means clusters the
    vectors, orders each cluster by taxonomic distance from mouse, and
    renders "<ipr_number>-cluster<i>.html" files.
    """
    db_dir = os.path.join(os.getcwd(), "db")
    cur_db = shelve.open(os.path.join(db_dir, ipr_number))
    tax_graph = build_tax_graph(cur_db)
    uniprot_ids = []
    info_array = []
    for db_domain in cur_db.keys():
        # skip UniRef children; cluster only representative entries
        if not cur_db[db_domain].get("is_uniref_child", ""):
            uniprot_ids.append(db_domain)
            db_item = cur_db[db_domain]
            # hand-tuned weights balance the feature magnitudes
            cur_cluster_info = [
                    float(db_item["charge"]),
                    float(db_item["charge_region"]) * 10.0,
                    len(db_item.get("db_refs", [])) * 5.0,
                    calc_domain_distance(db_item) * 100.0,
                    #max(len(db_item.get("string_interactors", [])) - 1, 0),
                    ]
            info_array.append(cur_cluster_info)
    info_array = numpy.array(info_array)
    print 'Num genes', len(info_array), num_clusters
    cluster_ids, error, nfound = Cluster.kcluster(info_array,
            nclusters=num_clusters, npass=50)#, method='a', dist='c')
    #tree = Cluster.treecluster(info_array, method='a', dist='c')
    #cluster_ids = tree.cut(num_clusters)
    cluster_dict = collections.defaultdict(lambda: [])
    for i, cluster_id in enumerate(cluster_ids):
        cluster_dict[cluster_id].append(uniprot_ids[i])
    for index, cluster_group in enumerate(cluster_dict.values()):
        print '***********', index
        org_dists = []
        for uniprot_id in cluster_group:
            org = cur_db[uniprot_id]["org_scientific_name"]
            # taxonomic distance from mouse orders cluster members
            distance = networkx.dijkstra_path_length(tax_graph, 'Mus musculus',
                    org)
            org_dists.append((distance, org, uniprot_id))
        org_dists.sort()
        members = []
        for d, o, u in org_dists:
            members.append(dict(organism=o,
                uniprot_id=get_uniprot_links([u]),
                alt_names=get_alt_names(cur_db[u]),
                alt_ids=get_uniprot_links(cur_db[u].get("uniref_children", [])),
                charge=cur_db[u]["charge"],
                charge_region="%0.2f" % cur_db[u]["charge_region"],
                domains=len(cur_db[u].get("db_refs", [])),
                interactions=get_string_link(u,
                    max(len(cur_db[u].get("string_interactors", [])) - 1, 0)),
                description=cur_db[u].get("function_descr", "&nbsp;"),
                c_distance="%0.2f" % calc_domain_distance(cur_db[u]),
            ))
        with open("%s-cluster%s.html" % (ipr_number, index), "w") as out_handle:
            tmpl = Template(cluster_template)
            out_handle.write(tmpl.render(cluster_members=members))
Example #6
0
def get_kmedoids(input_points, num_clusters, api_key=None):
    """K-medoids cluster points over a Google-Maps travel distance matrix.

    Parameters
    ----------
    input_points : sequence of coordinate pairs to cluster.
    num_clusters : number of medoids to find.
    api_key : optional Google Maps API key; when omitted, falls back to
        the embedded default (backward compatible).

    Returns
    -------
    (centroids_arr, df_dist_matr): the medoid coordinates as a numpy
    array, and the full pairwise distance matrix as a labelled
    DataFrame ('turf1'..'turfN' on both axes).
    """
    inputs = input_points

    # SECURITY: a hard-coded API key is a leaked credential -- rotate
    # this key and pass api_key explicitly instead of relying on it.
    google_key = api_key if api_key else "AIzaSyBIrorNpUJ_RyaxZ-8PxC0ZXZ818hRc5hM"

    maps = googlemaps.Client(key=google_key)

    dist_matrix = tmo.get_dist_matrix(inputs, maps)
    clusterid, error, nfound = Cluster.kmedoids(array(dist_matrix), nclusters=num_clusters)

    # each distinct medoid id is the index of a representative input
    # point; kmedoids labels clusters by their medoid's row index
    centroids = [inputs[medoid] for medoid in set(clusterid)]

    # build the array from a concrete list: np.array(map(...)) yields a
    # useless 0-d object array on Python 3 where map() is lazy
    centroids_arr = np.array([np.array(point) for point in centroids])

    # label the distance matrix rows/columns for readability
    labels = ['turf' + str(i) for i in range(1, len(inputs) + 1)]
    dist_mat = array(dist_matrix)
    df_dist_matr = pd.DataFrame(data=dist_mat[0:, 0:],
                                index=labels,
                                columns=labels)

    return centroids_arr, df_dist_matr
Example #7
0
def cluster_and_assign_all_cuts(file, dist):
    """Hierarchically cluster the rows of *file* and record every cut.

    Clusters rows with average linkage ('a') using the given distance
    metric, saves the scaled tree under the file's base name, then cuts
    the tree into n clusters for every 1 <= n <= n_elements.

    Returns a DataFrame indexed by gene id whose column n holds each
    row's cluster assignment when the tree is cut into n clusters.
    """
    with open(file) as handle:
        record = Cluster.read(handle)
    row_tree = record.treecluster(transpose=False, method='a', dist=dist)
    row_tree.scale()

    # BUG FIX: the original referenced an undefined name ``name``
    # (NameError); the job name is derived from the input file name.
    record.save(file.split('.')[0], geneclusters=row_tree)

    # cut tree into n clusters, for 1 <= n <= n_elements
    n_elements = len(record.geneid)
    mat = np.zeros((n_elements, n_elements))

    for i in range(n_elements):
        mat[:, i] = row_tree.cut(i + 1)  # column i holds the (i+1)-cluster cut

    return pd.DataFrame(data=mat,
                        dtype=int,
                        index=record.geneid,
                        columns=range(1, n_elements + 1))
Example #8
0
def cluster_dataset(filename,
                    name=None,
                    rows=True,
                    cols=True,
                    method='a',
                    dist='c',
                    row_order=None,
                    col_order=None):
    """Hierarchically cluster a dataset on rows and/or columns.

    Reads *filename* with Bio.Cluster, optionally imposes explicit
    row/column output orderings, clusters each requested axis with the
    given linkage *method* and *dist* metric, scales each tree to
    [0, 1] for viewing in Java TreeView, and saves the result under
    *name* (defaulting to the filename without its extension).

    Returns (record, row_tree, col_tree); a tree is None when its axis
    was not clustered.
    """
    with open(filename) as handle:
        record = Cluster.read(handle)

    if row_order:
        record.gorder = [row_order.index(gene) for gene in record.geneid]
    if col_order:
        record.eorder = [col_order.index(exp) for exp in record.expid]

    # trees stay None for any axis that is not clustered
    row_tree, col_tree = None, None

    if rows:  # cluster rows (mutants)
        row_tree = record.treecluster(transpose=False,
                                      method=method,
                                      dist=dist)
        row_tree.scale()

    if cols:  # cluster columns (library genes)
        col_tree = record.treecluster(transpose=True, method=method, dist=dist)
        col_tree.scale()

    job_name = name if name else filename.split('.')[0]
    record.save(job_name, row_tree, col_tree)

    return record, row_tree, col_tree
Example #9
0
from Bio import Cluster
# Demo script: exercise the main Bio.Cluster operations on one file.
with open("/home/koreanraichu/cyano.txt") as handle:
    record = Cluster.read(handle)
# record loaded
matrix = record.distancematrix()
# compute the pairwise distance matrix
cdata, cmask = record.clustercentroids()
# compute the cluster centroids
distance = record.clusterdistance()
# compute the distance between clusters
tree = record.treecluster()
# hierarchical clustering
# TODO: can this tree be plotted with matplotlib?
clusterid, error, nfound = record.kcluster()
# k-means clustering
# method='a': k-mean
# method='m': k-median
clusterid, celldata = record.somcluster()
# compute a self-organizing map (SOM)
jobname = "cyano_clustering"
record.save(jobname, record.treecluster(), record.treecluster(transpose=1))
# save the results to disk
# basic form: record.save(jobname, geneclusters, expclusters)
# geneclusters=record.treecluster()
# expclusters=record.treecluster(transpose=1)
Example #10
0
def clust(source):
    """Hierarchically cluster the rows of *source* and save the tree.

    The tree is scaled to [0, 1] and saved under the source path
    stripped of its extension.
    """
    with open(source, 'r') as handle:
        record = Cluster.read(handle)
    tree = record.treecluster()
    tree.scale()
    record.save(path.splitext(source)[0], tree)
Example #11
0
def main(ipr_number, num_clusters, out_dir):
    """Cluster InterPro entries by weighted features and write per-cluster
    HTML reports plus a FASTA file of member sequences into *out_dir*.

    Like the plain report variant, but additionally renders per-protein
    charge/disorder plot images and collects every member sequence into
    "<ipr_number>-seqs.fa".
    """
    # sliding window size used by calc_charge_plot
    charge_window = 75
    db_dir = os.path.join(os.getcwd(), "db")
    cur_db = shelve.open(os.path.join(db_dir, ipr_number))
    tax_graph = build_tax_graph(cur_db)
    uniprot_ids = []
    info_array = []
    for db_domain in cur_db.keys():
        # skip UniRef children; cluster only representative entries
        if not cur_db[db_domain].get("is_uniref_child", ""):
            uniprot_ids.append(db_domain)
            db_item = cur_db[db_domain]
            # hand-tuned weights balance the feature magnitudes
            cur_cluster_info = [
                float(db_item["charge"]),
                float(db_item["charge_region"]) * 10.0,
                len(db_item.get("db_refs", [])) * 5.0,
                calc_domain_distance(db_item) * 50.0,
                #max(len(db_item.get("string_interactors", [])) - 1, 0),
            ]
            info_array.append(cur_cluster_info)
    info_array = numpy.array(info_array)
    print 'Num genes', len(info_array), num_clusters
    cluster_ids, error, nfound = Cluster.kcluster(
        info_array, nclusters=num_clusters, npass=50)  #, method='a', dist='c')
    #tree = Cluster.treecluster(info_array, method='a', dist='c')
    #cluster_ids = tree.cut(num_clusters)
    cluster_dict = collections.defaultdict(lambda: [])
    for i, cluster_id in enumerate(cluster_ids):
        cluster_dict[cluster_id].append(uniprot_ids[i])
    out_seq_file = os.path.join(out_dir, "%s-seqs.fa" % (ipr_number))
    out_seq_handle = open(out_seq_file, "w")
    for index, cluster_group in enumerate(cluster_dict.values()):
        print '***********', index
        org_dists = []
        for uniprot_id in cluster_group:
            org = cur_db[uniprot_id]["org_scientific_name"]
            # taxonomic distance from mouse orders cluster members
            distance = networkx.dijkstra_path_length(tax_graph, 'Mus musculus',
                                                     org)
            org_dists.append((distance, org, uniprot_id))
        org_dists.sort()
        members = []
        for d, o, u in org_dists:
            charge_plot_img = calc_charge_plot(u, cur_db[u], charge_window,
                                               out_dir)
            # the disorder plot uses the charge plot's name plus "-idr"
            base, ext = os.path.splitext(charge_plot_img)
            disorder_plot_img = "%s-idr%s" % (base, ext)
            rec = Fasta.Record()
            rec.title = u
            rec.sequence = cur_db[u]["seq"]
            out_seq_handle.write(str(rec) + "\n")
            members.append(
                dict(
                    organism=o,
                    uniprot_id=get_uniprot_links([u]),
                    alt_names=get_alt_names(cur_db[u]),
                    alt_ids=get_uniprot_links(cur_db[u].get(
                        "uniref_children", [])),
                    charge=cur_db[u]["charge"],
                    charge_region="%0.2f" % cur_db[u]["charge_region"],
                    charge_plot_img=charge_plot_img,
                    disorder_plot_img=disorder_plot_img,
                    domains=len(cur_db[u].get("db_refs", [])),
                    interactions=get_string_link(
                        u,
                        max(
                            len(cur_db[u].get("string_interactors", [])) - 1,
                            0)),
                    description=cur_db[u].get("function_descr", "&nbsp;"),
                    c_distance="%0.2f" % calc_domain_distance(cur_db[u]),
                ))
        with open(
                os.path.join(out_dir,
                             "%s-cluster%s.html" % (ipr_number, index)),
                "w") as out_handle:
            tmpl = Template(cluster_template)
            out_handle.write(tmpl.render(cluster_members=members))
Example #12
0
# Paint k-means / k-medoid labels into 2-D images placed by real
# coordinates.  ys and xs hold the [min, max] bounds of the region.
kmeans1 = KMeans(n_clusters=5, random_state=0).fit(dist_dot_product)
kmeans2 = KMeans(n_clusters=5, random_state=0).fit(general_dot_product)
image_kmeans_dot = np.zeros((ys[1] - ys[0], xs[1] - xs[0]))
image_kmeans_pixel = np.zeros((ys[1] - ys[0], xs[1] - xs[0]))
print('Transforming in real coordinates...')
for i in ids:
    # BUG FIX: the x offset must subtract xs[0] (the minimum), matching
    # the ys[0] row offset; the original subtracted xs[1], producing
    # negative indices that wrapped around the image.
    image_kmeans_dot[parser.coordinates[i][1] -
                     ys[0]][parser.coordinates[i][0] -
                            xs[0]] = kmeans1.labels_[ids.index(i)]
    image_kmeans_pixel[parser.coordinates[i][1] -
                       ys[0]][parser.coordinates[i][0] -
                              xs[0]] = kmeans2.labels_[ids.index(i)]

print('Kmedoid clustering...')
kmedoid_dot = Cluster.kmedoids(dist_dot_product,
                               nclusters=5,
                               npass=5,
                               initialid=None)  #Distance Matrix!!!
kmedoid_pixel = Cluster.kmedoids(general_dot_product,
                                 nclusters=5,
                                 npass=5,
                                 initialid=None)  #Distance Matrix!!!
image_kmedoid_dot = np.zeros((ys[1] - ys[0], xs[1] - xs[0]))
image_kmedoid_pixel = np.zeros((ys[1] - ys[0], xs[1] - xs[0]))
for i in ids:
    # same xs[0] offset fix as above; kmedoids returns (clusterid,
    # error, nfound), so element [0] is the per-item assignment array
    image_kmedoid_dot[parser.coordinates[i][1] -
                      ys[0]][parser.coordinates[i][0] -
                             xs[0]] = kmedoid_dot[0][ids.index(i)]
    image_kmedoid_pixel[parser.coordinates[i][1] -
                        ys[0]][parser.coordinates[i][0] -
                               xs[0]] = kmedoid_pixel[0][ids.index(i)]
Example #13
0
# Hierarchical clustering of water positions with scipy, then emit one
# Jmol selection/isosurface command per resulting cluster.
Z = linkage(Y, 'average')
print Z
dendrogram(Z)

# flat clusters cut at distance threshold 2
fclust = fcluster(Z, 2, criterion='distance')


# group the water rows by their flat-cluster label
clust_dict = defaultdict( list )
for i, row in enumerate(water_list):
    #print fclust[i], str(fclust[i])
    clust_dict[ str(fclust[i]) ].append( row )

#print clust_dict

# one Jmol command per cluster: select its waters and draw a sphere at
# the selection center sized by the coordinate standard deviation
for c in clust_dict:
    print 'select water and (' + ' or '.join( [ '(~' + w[12] + ' and ' + str(w[5]) + ')' for w in clust_dict[c] ] ) + '); isosurface id "foo' + c + '" color lightblue center {selected} SPHERE @{ [ {selected}.x.stddev, {selected}.y.stddev, {selected}.z.stddev, 0.5  ].max *2 } translucent' + ';'


sys.exit(0)

# NOTE(review): everything below sys.exit(0) is dead code -- an
# alternative Bio.Cluster-based tree build apparently kept for reference.
from Bio import Cluster
lines = "Start\tX\tY\tZ\n" + "\n".join( [ "\t".join( [ row[12] + "|" + str(row[5]), str(row[6]), str(row[7]), str(row[8]) ] ) for row in water_list ] ) + "\n"

import StringIO
handle = StringIO.StringIO(lines)
record = Cluster.read(handle)
tree = record.treecluster( method="c", dist="e" )
record.save( '../data/test_tree', tree )


print tree
# NOTE(review): the triple quote below is unbalanced here -- it looks
# like a scraping artifact that orphaned part of the original file.
"""

from ep_bioinfo_load105 import *

from Bio import Cluster

from matplotlib import pyplot as plt
# Load the protein data only once per interactive session: referencing
# the names raises NameError on the first run, triggering the load.
try:
  mPD
  PD
  ProtNames
except NameError:
  # BUG FIX: the bare except also swallowed KeyboardInterrupt/SystemExit
  # and masked real errors inside the probe; only NameError means the
  # data has not been loaded yet.
  mPD = load_mPD()
  PD,ProtNames = load_PD()

# Bio.Cluster SOM on the transposed data: 40x40 grid, single iteration
R = Cluster.somcluster(mPD,transpose=1,nxgrid=40,nygrid=40,niter=1)

from minisom import MiniSom
### Initialization and training ###
som = MiniSom(40,40,15,sigma=1.0,learning_rate=0.5)
som.random_weights_init(transpose(mPD))
print("Training...")
som.train_random(transpose(mPD),100) # training with 100 iterations
print("\n...ready!")


# 2-D histogram of how many samples the Bio.Cluster SOM maps to each cell
timg = np.zeros(shape=(40,40))
for c in R[0]:
  timg[c[0],c[1]]=timg[c[0],c[1]]+1
Example #15
0
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import pandas as pd
import Bio.Cluster as bc

# Input summary CSV and output figure path, hard-coded for this analysis.
f = "/data/Lei_student/Hussain/ML/dm6/peakerror/summary.csv"
o = "/data/Lei_student/Hussain/ML/dm6/peakerror/cluster_out.png"

df = pd.read_csv(f)
# cluster on the MCC and q-value columns only
data = df[["mcc", "q_value"]]

# pairwise distance matrix, centroids, inter-cluster distance and a
# hierarchical tree, all with Bio.Cluster defaults
matrix = bc.distancematrix(data)
cdata, cmask = bc.clustercentroids(data)
distance = bc.clusterdistance(data)
tree = bc.treecluster(data)

print(matrix)

fig = plt.figure(figsize=(10, 8))
ax = fig.add_subplot(111)
ax.set_title("MCC distance matrix")
# NOTE(review): 45000 is presumably the row count of summary.csv -- TODO
# confirm; len(matrix) would be safer than a hard-coded count.
plt.scatter(range(45000), matrix)
plt.savefig(o)