def main(target_id, in_file, api_key):
    """Cluster proteins by description terms and print the target's cluster.

    Description terms for every identifier listed in ``in_file`` are fetched
    (via ``get_description_terms``) and cached in a shelve database named
    after ``in_file`` with its extension replaced by ``.db``.  All cached
    entries are then k-means clustered and every member of the cluster that
    contains ``target_id`` is printed together with its stored terms.

    Args:
        target_id: identifier whose cluster members should be printed.
        in_file: path to a text file; the first line is skipped as a header
            and the first whitespace-separated token of each remaining line
            is used as the identifier.
        api_key: passed through unchanged to ``get_description_terms``.
    """
    cache_dir = os.path.join(os.getcwd(), "cache")
    uniprot_retriever = UniprotRestRetrieval(cache_dir)
    cur_db = shelve.open("%s.db" % os.path.splitext(in_file)[0])
    # try/finally so the shelve is flushed and closed even when retrieval or
    # clustering raises part-way through.
    try:
        # load the database
        with open(in_file) as in_handle:
            in_handle.readline()  # header
            for line in in_handle:
                fields = line.split()
                if not fields:
                    # Blank (or whitespace-only) line: nothing to look up.
                    continue
                uniprot_id = fields[0]
                # Membership test on the shelf itself avoids materialising
                # the full key list for every input line.
                if uniprot_id not in cur_db:
                    cur_terms = get_description_terms(
                        uniprot_retriever, uniprot_id, api_key)
                    # Entries without any terms are not stored, so they are
                    # looked up again on the next run.
                    if len(cur_terms) > 0:
                        cur_db[uniprot_id] = cur_terms

        # cluster and print out cluster details
        term_matrix, uniprot_ids = organize_term_array(cur_db)
        cluster_ids, error, nfound = Cluster.kcluster(
            term_matrix, nclusters=10, npass=20, method='a', dist='e')

        # Group identifiers by the cluster label assigned to their row.
        cluster_dict = collections.defaultdict(list)
        for i, cluster_id in enumerate(cluster_ids):
            cluster_dict[cluster_id].append(uniprot_ids[i])

        for cluster_group in cluster_dict.values():
            if target_id in cluster_group:
                for item in cluster_group:
                    # Single-argument form prints identically under the
                    # Python 2 print statement and the Python 3 function.
                    print("%s %s" % (item, cur_db[item]))
    finally:
        cur_db.close()
def main(ipr_number, num_clusters):
    """Cluster the entries of a shelve database and write HTML reports.

    Opens the shelve file ``./db/<ipr_number>``, builds a numeric feature
    vector for every record that is not flagged ``is_uniref_child``, runs
    k-means on those vectors and writes one ``<ipr_number>-cluster<N>.html``
    file per cluster into the current working directory.  Within a report,
    members are ordered by their shortest-path distance from
    ``'Mus musculus'`` in the graph returned by ``build_tax_graph``.

    Args:
        ipr_number: name of the shelve file under ``./db``; also used as the
            prefix of the output file names.
        num_clusters: number of clusters requested from ``Cluster.kcluster``.
    """
    db_dir = os.path.join(os.getcwd(), "db")
    cur_db = shelve.open(os.path.join(db_dir, ipr_number))
    # The original never closed the shelve; try/finally releases it on both
    # the success and the error path.
    try:
        tax_graph = build_tax_graph(cur_db)
        uniprot_ids = []
        info_array = []
        for db_domain in cur_db.keys():
            # Read (and unpickle) each record once instead of per field.
            db_item = cur_db[db_domain]
            if db_item.get("is_uniref_child", ""):
                continue
            uniprot_ids.append(db_domain)
            # Each feature is scaled by a fixed weight before clustering.
            # (STRING interactor count is currently not part of the vector.)
            info_array.append([
                float(db_item["charge"]),
                float(db_item["charge_region"]) * 10.0,
                len(db_item.get("db_refs", [])) * 5.0,
                calc_domain_distance(db_item) * 100.0,
            ])
        info_array = numpy.array(info_array)
        print("Num genes %s %s" % (len(info_array), num_clusters))

        # kcluster is called with its default method and distance settings.
        cluster_ids, error, nfound = Cluster.kcluster(
            info_array, nclusters=num_clusters, npass=50)

        # Row i of info_array corresponds to uniprot_ids[i].
        cluster_dict = collections.defaultdict(list)
        for i, cluster_id in enumerate(cluster_ids):
            cluster_dict[cluster_id].append(uniprot_ids[i])

        for index, cluster_group in enumerate(cluster_dict.values()):
            print("*********** %s" % index)
            org_dists = []
            for uniprot_id in cluster_group:
                org = cur_db[uniprot_id]["org_scientific_name"]
                distance = networkx.dijkstra_path_length(
                    tax_graph, 'Mus musculus', org)
                org_dists.append((distance, org, uniprot_id))
            # Tuples sort by distance first, then organism, then identifier.
            org_dists.sort()

            members = []
            for d, o, u in org_dists:
                item = cur_db[u]  # single shelve read per member
                members.append(dict(
                    organism=o,
                    uniprot_id=get_uniprot_links([u]),
                    alt_names=get_alt_names(item),
                    alt_ids=get_uniprot_links(
                        item.get("uniref_children", [])),
                    charge=item["charge"],
                    charge_region="%0.2f" % item["charge_region"],
                    domains=len(item.get("db_refs", [])),
                    interactions=get_string_link(
                        u,
                        max(len(item.get("string_interactors", [])) - 1, 0)),
                    description=item.get("function_descr", " "),
                    c_distance="%0.2f" % calc_domain_distance(item),
                ))

            with open("%s-cluster%s.html" % (ipr_number, index),
                      "w") as out_handle:
                tmpl = Template(cluster_template)
                out_handle.write(tmpl.render(cluster_members=members))
    finally:
        cur_db.close()
def cluster_kernels(kernels_array, k=kmeans_k, times=1):
    """Run k-means ``times`` times and return the lowest-error result.

    Args:
        kernels_array: data passed unchanged to ``Cluster.kcluster`` and
            ``Cluster.clustercentroids``.
        k: number of clusters (defaults to the module-level ``kmeans_k``).
        times: number of independent ``kcluster`` runs; must be >= 1.

    Returns:
        A ``(clusterid, cdata)`` tuple: the cluster assignment of the best
        run and the centroid data computed for that assignment.

    Raises:
        ValueError: if ``times`` is smaller than 1.  Previously this case
            fell through with an empty assignment and failed later inside
            ``clustercentroids``.
    """
    if times < 1:
        raise ValueError("times must be at least 1, got %r" % (times,))

    print("start clustering")
    clusterid = None
    error_best = float('inf')
    for _ in range(times):
        # NOTE(review): assuming Cluster is Bio.Cluster/Pycluster, dist='a'
        # selects the absolute-correlation distance -- confirm.
        clusterid_single, error, _nfound = Cluster.kcluster(
            kernels_array, nclusters=k, dist='a')
        # Always keep the first run so that a non-finite error value can
        # never leave us without an assignment.
        if clusterid is None or error < error_best:
            clusterid = clusterid_single
            error_best = error
    print('error: %s' % error_best)

    cdata, _cmask = Cluster.clustercentroids(kernels_array,
                                             clusterid=clusterid)
    print("end clustering")
    return clusterid, cdata
def main(ipr_number, num_clusters, out_dir):
    """Cluster shelve records and write HTML reports plus a FASTA file.

    Opens the shelve file ``./db/<ipr_number>``, builds a numeric feature
    vector for every record that is not flagged ``is_uniref_child`` and runs
    k-means on those vectors.  For each cluster an HTML report
    ``<ipr_number>-cluster<N>.html`` is written to ``out_dir``; the sequence
    of every reported member is appended to ``<ipr_number>-seqs.fa`` in the
    same directory.  Within a report, members are ordered by their
    shortest-path distance from ``'Mus musculus'`` in the graph returned by
    ``build_tax_graph``.

    Args:
        ipr_number: name of the shelve file under ``./db``; also used as the
            prefix of the output file names.
        num_clusters: number of clusters requested from ``Cluster.kcluster``.
        out_dir: directory receiving the HTML and FASTA output; also passed
            to ``calc_charge_plot``.
    """
    charge_window = 75  # window argument handed to calc_charge_plot
    db_dir = os.path.join(os.getcwd(), "db")
    cur_db = shelve.open(os.path.join(db_dir, ipr_number))
    # Neither the shelve nor the FASTA handle was closed originally; both are
    # now released on success and on error.
    try:
        tax_graph = build_tax_graph(cur_db)
        uniprot_ids = []
        info_array = []
        for db_domain in cur_db.keys():
            # Read (and unpickle) each record once instead of per field.
            db_item = cur_db[db_domain]
            if db_item.get("is_uniref_child", ""):
                continue
            uniprot_ids.append(db_domain)
            # Each feature is scaled by a fixed weight before clustering.
            # (STRING interactor count is currently not part of the vector.)
            info_array.append([
                float(db_item["charge"]),
                float(db_item["charge_region"]) * 10.0,
                len(db_item.get("db_refs", [])) * 5.0,
                calc_domain_distance(db_item) * 50.0,
            ])
        info_array = numpy.array(info_array)
        print("Num genes %s %s" % (len(info_array), num_clusters))

        # kcluster is called with its default method and distance settings.
        cluster_ids, error, nfound = Cluster.kcluster(
            info_array, nclusters=num_clusters, npass=50)

        # Row i of info_array corresponds to uniprot_ids[i].
        cluster_dict = collections.defaultdict(list)
        for i, cluster_id in enumerate(cluster_ids):
            cluster_dict[cluster_id].append(uniprot_ids[i])

        out_seq_file = os.path.join(out_dir, "%s-seqs.fa" % (ipr_number))
        with open(out_seq_file, "w") as out_seq_handle:
            for index, cluster_group in enumerate(cluster_dict.values()):
                print("*********** %s" % index)
                org_dists = []
                for uniprot_id in cluster_group:
                    org = cur_db[uniprot_id]["org_scientific_name"]
                    distance = networkx.dijkstra_path_length(
                        tax_graph, 'Mus musculus', org)
                    org_dists.append((distance, org, uniprot_id))
                # Tuples sort by distance, then organism, then identifier.
                org_dists.sort()

                members = []
                for d, o, u in org_dists:
                    item = cur_db[u]  # single shelve read per member
                    charge_plot_img = calc_charge_plot(
                        u, item, charge_window, out_dir)
                    # The disorder plot name is derived from the charge plot
                    # name by inserting "-idr" before the extension; this
                    # function only builds the name.
                    base, ext = os.path.splitext(charge_plot_img)
                    disorder_plot_img = "%s-idr%s" % (base, ext)

                    rec = Fasta.Record()
                    rec.title = u
                    rec.sequence = item["seq"]
                    out_seq_handle.write(str(rec) + "\n")

                    members.append(dict(
                        organism=o,
                        uniprot_id=get_uniprot_links([u]),
                        alt_names=get_alt_names(item),
                        alt_ids=get_uniprot_links(
                            item.get("uniref_children", [])),
                        charge=item["charge"],
                        charge_region="%0.2f" % item["charge_region"],
                        charge_plot_img=charge_plot_img,
                        disorder_plot_img=disorder_plot_img,
                        domains=len(item.get("db_refs", [])),
                        interactions=get_string_link(
                            u,
                            max(len(item.get("string_interactors", [])) - 1,
                                0)),
                        description=item.get("function_descr", " "),
                        c_distance="%0.2f" % calc_domain_distance(item),
                    ))

                html_path = os.path.join(
                    out_dir, "%s-cluster%s.html" % (ipr_number, index))
                with open(html_path, "w") as out_handle:
                    tmpl = Template(cluster_template)
                    out_handle.write(tmpl.render(cluster_members=members))
    finally:
        cur_db.close()