def get_kmedoids(df, dist_matrix, num_clusters):
    """Run k-medoids on a precomputed distance matrix and attach the
    resulting cluster id of each row to the DataFrame.

    Returns the DataFrame (index reset) with a new 'cluster_id' column.
    """
    cluster_ids, _error, _nfound = Cluster.kmedoids(dist_matrix, num_clusters)
    labelled = df.reset_index()
    # Wrap in a DataFrame so assignment aligns on the fresh 0..n-1 index.
    labelled['cluster_id'] = pd.DataFrame(data=cluster_ids)
    return labelled
def main(target_id, in_file, api_key):
    """Fetch UniProt description terms for every id in `in_file`, k-means
    cluster them, and print the members of the cluster containing
    `target_id` along with their terms.

    Results are cached in a shelve database named after the input file so
    remote lookups are only made for unseen ids.
    """
    cache_dir = os.path.join(os.getcwd(), "cache")
    uniprot_retriever = UniprotRestRetrieval(cache_dir)
    cur_db = shelve.open("%s.db" % os.path.splitext(in_file)[0])
    # load the database
    with open(in_file) as in_handle:
        in_handle.readline() # header
        for index, line in enumerate(in_handle):
            # First whitespace-delimited token of each line is the UniProt id.
            uniprot_id = line.split()[0].strip()
            if uniprot_id not in cur_db.keys():
                cur_terms = get_description_terms(uniprot_retriever,
                        uniprot_id, api_key)
                # Only cache non-empty term lists.
                if len(cur_terms) > 0:
                    cur_db[uniprot_id] = cur_terms
    # cluster and print out cluster details
    term_matrix, uniprot_ids = organize_term_array(cur_db)
    # k-means: 10 clusters, 20 passes, arithmetic mean, Euclidean distance.
    cluster_ids, error, nfound = Cluster.kcluster(term_matrix,
            nclusters=10, npass=20, method='a', dist='e')
    cluster_dict = collections.defaultdict(lambda: [])
    for i, cluster_id in enumerate(cluster_ids):
        cluster_dict[cluster_id].append(uniprot_ids[i])
    # Report only the cluster that contains the id of interest.
    for cluster_group in cluster_dict.values():
        if target_id in cluster_group:
            for item in cluster_group:
                print item, cur_db[item]
    cur_db.close()
def cluster_kernels(kernels_array, k=kmeans_k, times=1): print "start clustering" clusterid = [] error_best = float('inf') for i in range(times): clusterid_single, error, nfound = Cluster.kcluster(kernels_array, nclusters=k, dist='a') if error < error_best: clusterid = clusterid_single error_best = error print 'error:', error_best cdata, cmask = Cluster.clustercentroids(kernels_array, clusterid=clusterid, ) print "end clustering" return clusterid, cdata
def cluster(*args):
    """Hierarchically cluster an expression file and save the result.

    args[0]: input data file (e.g. "cyano.txt"); args[1]: output job name.
    Rows use single-linkage; columns use uncentered-correlation distance.
    """
    with open(args[0]) as handle:
        record = Cluster.read(handle)
    genetree = record.treecluster(method='s')
    exptree = record.treecluster(dist='u', transpose=1)
    record.save(args[1], genetree, exptree)
def main(ipr_number, num_clusters):
    """Cluster the proteins stored for an InterPro domain and write one
    HTML report per cluster.

    Features per protein: charge, weighted charge-region score, weighted
    count of database references, and weighted domain distance. Cluster
    members are sorted by taxonomic distance from Mus musculus.
    """
    db_dir = os.path.join(os.getcwd(), "db")
    cur_db = shelve.open(os.path.join(db_dir, ipr_number))
    tax_graph = build_tax_graph(cur_db)
    uniprot_ids = []
    info_array = []
    for db_domain in cur_db.keys():
        # Skip UniRef child records; only cluster representative entries.
        if not cur_db[db_domain].get("is_uniref_child", ""):
            uniprot_ids.append(db_domain)
            db_item = cur_db[db_domain]
            # Hand-tuned feature weights (10x, 5x, 100x) balance the scales
            # of the different measurements -- TODO confirm intent.
            cur_cluster_info = [
                float(db_item["charge"]),
                float(db_item["charge_region"]) * 10.0,
                len(db_item.get("db_refs", [])) * 5.0,
                calc_domain_distance(db_item) * 100.0,
                #max(len(db_item.get("string_interactors", [])) - 1, 0),
                ]
            info_array.append(cur_cluster_info)
    info_array = numpy.array(info_array)
    print 'Num genes', len(info_array), num_clusters
    cluster_ids, error, nfound = Cluster.kcluster(info_array,
            nclusters=num_clusters, npass=50)#, method='a', dist='c')
    #tree = Cluster.treecluster(info_array, method='a', dist='c')
    #cluster_ids = tree.cut(num_clusters)
    cluster_dict = collections.defaultdict(lambda: [])
    for i, cluster_id in enumerate(cluster_ids):
        cluster_dict[cluster_id].append(uniprot_ids[i])
    for index, cluster_group in enumerate(cluster_dict.values()):
        print '***********', index
        # Sort members by taxonomic path length from mouse.
        org_dists = []
        for uniprot_id in cluster_group:
            org = cur_db[uniprot_id]["org_scientific_name"]
            distance = networkx.dijkstra_path_length(tax_graph,
                    'Mus musculus', org)
            org_dists.append((distance, org, uniprot_id))
        org_dists.sort()
        members = []
        for d, o, u in org_dists:
            members.append(dict(organism=o,
                uniprot_id=get_uniprot_links([u]),
                alt_names=get_alt_names(cur_db[u]),
                alt_ids=get_uniprot_links(cur_db[u].get("uniref_children",
                    [])),
                charge=cur_db[u]["charge"],
                charge_region="%0.2f" % cur_db[u]["charge_region"],
                domains=len(cur_db[u].get("db_refs", [])),
                interactions=get_string_link(u,
                    max(len(cur_db[u].get("string_interactors", [])) - 1,
                        0)),
                description=cur_db[u].get("function_descr", " "),
                c_distance="%0.2f" % calc_domain_distance(cur_db[u]),
                ))
        # Render one HTML page per cluster.
        with open("%s-cluster%s.html" % (ipr_number, index),
                "w") as out_handle:
            tmpl = Template(cluster_template)
            out_handle.write(tmpl.render(cluster_members=members))
def get_kmedoids(input_points, num_clusters, api_key=None):
    """Cluster points by pairwise travel distance using k-medoids.

    Builds a distance matrix for `input_points` via the Google Maps API,
    runs Bio.Cluster.kmedoids, and returns:
      * numpy array of the medoid point coordinates, and
      * the full distance matrix as a labelled DataFrame
        (rows/columns "turf1".."turfN").

    Args:
        input_points: sequence of coordinate pairs.
        num_clusters: number of medoids to find.
        api_key: Google Maps API key; defaults to the historical embedded
            key for backward compatibility.
    """
    inputs = input_points
    # SECURITY: a credential was hard-coded here. It is kept only as a
    # backward-compatible fallback -- pass api_key (e.g. from an
    # environment variable) and revoke the embedded key.
    google_key = api_key if api_key else "AIzaSyBIrorNpUJ_RyaxZ-8PxC0ZXZ818hRc5hM"
    maps = googlemaps.Client(key=google_key)
    dist_matrix = tmo.get_dist_matrix(inputs, maps)
    clusterid, error, nfound = Cluster.kmedoids(array(dist_matrix),
                                                nclusters=num_clusters)
    # kmedoids labels each item with the index of its medoid, so the set
    # of labels is exactly the set of medoid indices.
    centroids_arr = []
    for medoid_idx in set(clusterid):
        centroids_arr.append(inputs[medoid_idx])
    # List comprehension instead of map(): np.array(map(...)) silently
    # builds a useless 0-d object array on Python 3.
    centroids_arr = np.array([np.array(pt) for pt in centroids_arr])
    # Label the distance matrix rows/columns for readability.
    yard_index = ['turf' + str(i) for i in range(1, len(inputs) + 1)]
    turf_index = ['turf' + str(i) for i in range(1, len(inputs) + 1)]
    dist_mat = array(dist_matrix)
    df_dist_matr = pd.DataFrame(data=dist_mat[0:, 0:], index=yard_index,
                                columns=turf_index)
    return centroids_arr, df_dist_matr
def cluster_and_assign_all_cuts(file, dist):
    """Hierarchically cluster the rows of a Cluster-format data file and
    return every possible flat cut of the tree.

    Args:
        file: path to the input data file; output is saved with the same
            basename (extension stripped).
        dist: distance metric code for treecluster (e.g. 'e', 'c').

    Returns:
        DataFrame indexed by gene id; column n (1..n_elements) holds each
        gene's cluster assignment when the tree is cut into n clusters.
    """
    with open(file) as handle:
        record = Cluster.read(handle)
    row_tree = record.treecluster(transpose=False, method='a', dist=dist)
    row_tree.scale()  # scale node distances to [0, 1] for Java TreeView
    # BUG FIX: the original called name.split(...) on an undefined variable
    # `name`; the output basename must come from the `file` argument.
    record.save(file.split('.')[0], geneclusters=row_tree)
    # cut tree into n clusters, for 1 <= n <= n_elements
    n_elements = len(record.geneid)
    mat = np.zeros((n_elements, n_elements))
    for i in range(n_elements):
        mat[:, i] = row_tree.cut(i + 1)  # cut into 1 <= i+1 <= n_elements
    return pd.DataFrame(data=mat, dtype=int, index=record.geneid,
                        columns=range(1, n_elements + 1))
def cluster_dataset(filename, name=None, rows=True, cols=True, method='a',
                    dist='c', row_order=None, col_order=None):
    """Hierarchically cluster a Cluster-format dataset and save it.

    Rows and/or columns are clustered according to the `rows`/`cols`
    flags; optional `row_order`/`col_order` lists impose a display order
    on gene and experiment ids. Output files use `name`, defaulting to
    the input filename without its extension.

    Returns (record, row_tree, col_tree); a tree is None when that axis
    was not clustered.
    """
    with open(filename) as handle:
        record = Cluster.read(handle)
    if row_order:
        record.gorder = [row_order.index(gene) for gene in record.geneid]
    if col_order:
        record.eorder = [col_order.index(exp) for exp in record.expid]
    # Cluster each requested axis; scale to [0,1] for Java TreeView.
    row_tree = None
    if rows:
        row_tree = record.treecluster(transpose=False, method=method,
                                      dist=dist)
        row_tree.scale()
    col_tree = None
    if cols:
        col_tree = record.treecluster(transpose=True, method=method,
                                      dist=dist)
        col_tree.scale()
    record.save(name or filename.split('.')[0], row_tree, col_tree)
    return record, row_tree, col_tree
from Bio import Cluster

# Demo script: load an expression dataset and exercise the main
# Bio.Cluster record operations, then save a hierarchical clustering.
with open("/home/koreanraichu/cyano.txt") as handle:
    record = Cluster.read(handle)  # loaded the data
matrix = record.distancematrix()  # compute the distance matrix
cdata, cmask = record.clustercentroids()  # compute cluster centroids
distance = record.clusterdistance()  # compute distance between clusters
tree = record.treecluster()  # hierarchical clustering
# TODO: can this be plotted with matplotlib?
clusterid, error, nfound = record.kcluster()  # k-means clustering
# method='a': k-mean
# method='m': k-median
clusterid, celldata = record.somcluster()  # compute a self-organizing map
jobname = "cyano_clustering"
# Save gene and experiment trees to disk.
record.save(jobname, record.treecluster(), record.treecluster(transpose=1))
# basic form: record.save(jobname, geneclusters, expclusters)
# geneclusters=record.treecluster()
# expclusters=record.treecluster(transpose=1)
def clust(source):
    """Hierarchically cluster the dataset in `source` and save the tree
    under the same path with the extension stripped."""
    with open(source, 'r') as handle:
        record = Cluster.read(handle)
    gene_tree = record.treecluster()
    gene_tree.scale()
    record.save(path.splitext(source)[0], gene_tree)
def main(ipr_number, num_clusters, out_dir):
    """Cluster the proteins stored for an InterPro domain, writing per-
    cluster HTML reports (with charge/disorder plots) and a combined
    FASTA file of member sequences into `out_dir`.

    Features per protein: charge, weighted charge-region score, weighted
    count of database references, and weighted domain distance. Cluster
    members are sorted by taxonomic distance from Mus musculus.
    """
    charge_window = 75  # window size passed to the charge plot generator
    db_dir = os.path.join(os.getcwd(), "db")
    cur_db = shelve.open(os.path.join(db_dir, ipr_number))
    tax_graph = build_tax_graph(cur_db)
    uniprot_ids = []
    info_array = []
    for db_domain in cur_db.keys():
        # Skip UniRef child records; only cluster representative entries.
        if not cur_db[db_domain].get("is_uniref_child", ""):
            uniprot_ids.append(db_domain)
            db_item = cur_db[db_domain]
            # Hand-tuned feature weights (10x, 5x, 50x) balance the scales
            # of the different measurements -- TODO confirm intent.
            cur_cluster_info = [
                float(db_item["charge"]),
                float(db_item["charge_region"]) * 10.0,
                len(db_item.get("db_refs", [])) * 5.0,
                calc_domain_distance(db_item) * 50.0,
                #max(len(db_item.get("string_interactors", [])) - 1, 0),
                ]
            info_array.append(cur_cluster_info)
    info_array = numpy.array(info_array)
    print 'Num genes', len(info_array), num_clusters
    cluster_ids, error, nfound = Cluster.kcluster(
        info_array, nclusters=num_clusters, npass=50) #, method='a', dist='c')
    #tree = Cluster.treecluster(info_array, method='a', dist='c')
    #cluster_ids = tree.cut(num_clusters)
    cluster_dict = collections.defaultdict(lambda: [])
    for i, cluster_id in enumerate(cluster_ids):
        cluster_dict[cluster_id].append(uniprot_ids[i])
    # All member sequences across clusters go into one FASTA file.
    # NOTE(review): out_seq_handle is never closed -- consider `with`.
    out_seq_file = os.path.join(out_dir, "%s-seqs.fa" % (ipr_number))
    out_seq_handle = open(out_seq_file, "w")
    for index, cluster_group in enumerate(cluster_dict.values()):
        print '***********', index
        # Sort members by taxonomic path length from mouse.
        org_dists = []
        for uniprot_id in cluster_group:
            org = cur_db[uniprot_id]["org_scientific_name"]
            distance = networkx.dijkstra_path_length(tax_graph,
                    'Mus musculus', org)
            org_dists.append((distance, org, uniprot_id))
        org_dists.sort()
        members = []
        for d, o, u in org_dists:
            # Generate the charge plot; the disorder plot is assumed to be
            # written alongside it with an "-idr" suffix -- TODO confirm.
            charge_plot_img = calc_charge_plot(u, cur_db[u], charge_window,
                    out_dir)
            base, ext = os.path.splitext(charge_plot_img)
            disorder_plot_img = "%s-idr%s" % (base, ext)
            rec = Fasta.Record()
            rec.title = u
            rec.sequence = cur_db[u]["seq"]
            out_seq_handle.write(str(rec) + "\n")
            members.append(
                dict(
                    organism=o,
                    uniprot_id=get_uniprot_links([u]),
                    alt_names=get_alt_names(cur_db[u]),
                    alt_ids=get_uniprot_links(cur_db[u].get(
                        "uniref_children", [])),
                    charge=cur_db[u]["charge"],
                    charge_region="%0.2f" % cur_db[u]["charge_region"],
                    charge_plot_img=charge_plot_img,
                    disorder_plot_img=disorder_plot_img,
                    domains=len(cur_db[u].get("db_refs", [])),
                    interactions=get_string_link(
                        u,
                        max(
                            len(cur_db[u].get("string_interactors", [])) - 1,
                            0)),
                    description=cur_db[u].get("function_descr", " "),
                    c_distance="%0.2f" % calc_domain_distance(cur_db[u]),
                    ))
        # Render one HTML page per cluster.
        with open(
                os.path.join(out_dir,
                             "%s-cluster%s.html" % (ipr_number, index)),
                "w") as out_handle:
            tmpl = Template(cluster_template)
            out_handle.write(tmpl.render(cluster_members=members))
# K-means over both feature matrices (fixed seed for reproducibility),
# then paint each point's cluster label into an image at its coordinate.
kmeans1 = KMeans(n_clusters=5, random_state=0).fit(dist_dot_product)
kmeans2 = KMeans(n_clusters=5, random_state=0).fit(general_dot_product)
image_kmeans_dot = np.zeros((ys[1] - ys[0], xs[1] - xs[0]))
image_kmeans_pixel = np.zeros((ys[1] - ys[0], xs[1] - xs[0]))
print('Transforming in real coordinates...')
for idx, i in enumerate(ids):
    # BUG FIX: the column offset must subtract the minimum x coordinate
    # (xs[0]); the original subtracted xs[1], producing negative indices
    # that silently wrapped to the wrong pixels. The row index already
    # (correctly) subtracts ys[0]. enumerate() also replaces the O(n)
    # ids.index(i) lookup per element.
    row = parser.coordinates[i][1] - ys[0]
    col = parser.coordinates[i][0] - xs[0]
    image_kmeans_dot[row][col] = kmeans1.labels_[idx]
    image_kmeans_pixel[row][col] = kmeans2.labels_[idx]
print('Kmedoid clustering...')
# Bio.Cluster.kmedoids expects a distance matrix, not raw features!
kmedoid_dot = Cluster.kmedoids(dist_dot_product, nclusters=5, npass=5,
                               initialid=None)
kmedoid_pixel = Cluster.kmedoids(general_dot_product, nclusters=5, npass=5,
                                 initialid=None)
image_kmedoid_dot = np.zeros((ys[1] - ys[0], xs[1] - xs[0]))
image_kmedoid_pixel = np.zeros((ys[1] - ys[0], xs[1] - xs[0]))
for idx, i in enumerate(ids):
    row = parser.coordinates[i][1] - ys[0]
    col = parser.coordinates[i][0] - xs[0]  # same xs[0] fix as above
    # kmedoids returns (clusterid, error, nfound); [0] is the id array.
    image_kmedoid_dot[row][col] = kmedoid_dot[0][idx]
    image_kmedoid_pixel[row][col] = kmedoid_pixel[0][idx]
# Average-linkage hierarchical clustering of the precomputed condensed
# distance matrix Y (scipy.cluster.hierarchy).
Z = linkage(Y, 'average')
print Z
dendrogram(Z)
# Flat clusters by cutting the dendrogram at distance threshold 2.
fclust = fcluster(Z, 2, criterion='distance')
# Group the water records by their flat-cluster label.
clust_dict = defaultdict( list )
for i, row in enumerate(water_list):
    #print fclust[i], str(fclust[i])
    clust_dict[ str(fclust[i]) ].append( row )
#print clust_dict
# Emit one Jmol selection + isosurface command per cluster; the sphere
# radius is twice the largest positional stddev of the selected waters
# (0.5 minimum). Assumes row[12] is a chain id and row[5] a residue
# number -- TODO confirm against how water_list is built.
for c in clust_dict:
    print 'select water and (' + ' or '.join( [ '(~' + w[12] + ' and ' + str(w[5]) + ')' for w in clust_dict[c] ] ) + '); isosurface id "foo' + c + '" color lightblue center {selected} SPHERE @{ [ {selected}.x.stddev, {selected}.y.stddev, {selected}.z.stddev, 0.5 ].max *2 } translucent' + ';'
sys.exit(0)
# --- Everything below is unreachable (after sys.exit); kept as an
# alternative Bio.Cluster tree-clustering experiment on the same data. ---
from Bio import Cluster
# Build a tab-separated expression-style table from the water records
# (label, X, Y, Z columns) so Cluster.read can parse it.
lines = "Start\tX\tY\tZ\n" + "\n".join( [ "\t".join( [ row[12] + "|" + str(row[5]), str(row[6]), str(row[7]), str(row[8]) ] ) for row in water_list ] ) + "\n"
import StringIO
handle = StringIO.StringIO(lines)
record = Cluster.read(handle)
tree = record.treecluster( method="c", dist="e" )
record.save( '../data/test_tree', tree )
print tree
""" from ep_bioinfo_load105 import * from Bio import Cluster from matplotlib import pyplot as plt try: mPD PD ProtNames except: mPD = load_mPD() PD,ProtNames = load_PD() R = Cluster.somcluster(mPD,transpose=1,nxgrid=40,nygrid=40,niter=1) from minisom import MiniSom ### Initialization and training ### som = MiniSom(40,40,15,sigma=1.0,learning_rate=0.5) #som.random_weights_init(mPD) som.weights som.random_weights_init(transpose(mPD)) print("Training...") som.train_random(transpose(mPD),100) # training with 100 iterations print("\n...ready!") timg = np.zeros(shape=(40,40)) for c in R[0]: timg[c[0],c[1]]=timg[c[0],c[1]]+1
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import pandas as pd
import Bio.Cluster as bc

# Input summary table and destination for the rendered figure.
f = "/data/Lei_student/Hussain/ML/dm6/peakerror/summary.csv"
o = "/data/Lei_student/Hussain/ML/dm6/peakerror/cluster_out.png"

# Restrict the summary table to the two metrics we cluster on.
df = pd.read_csv(f)
data = df[["mcc", "q_value"]]

# Pairwise distances plus assorted clustering summaries from Bio.Cluster.
matrix = bc.distancematrix(data)
cdata, cmask = bc.clustercentroids(data)
distance = bc.clusterdistance(data)
tree = bc.treecluster(data)
print(matrix)

# Scatter the distance matrix values.
# NOTE(review): 45000 looks like the expected number of rows in
# summary.csv -- confirm it matches len(matrix), or derive it from the
# data instead of hard-coding.
fig = plt.figure(figsize=(10, 8))
ax = fig.add_subplot(111)
ax.set_title("MCC distance matrix")
plt.scatter(range(45000), matrix)
plt.savefig(o)