def test_record_basic(self):
    """Check the attribute types of a freshly created Fasta.Record.

    Both ``title`` and ``sequence`` must be exactly of type ``StringType``;
    the comparison is on type identity, so subclasses do not pass.
    """
    record = Fasta.Record()

    title_type = type(record.title)
    assert title_type is StringType

    sequence_type = type(record.sequence)
    assert sequence_type is StringType
def __str__(self):
    """Return a FASTA rendering of the records held in ``self._records``.

    Every item is copied into a ``Fasta.Record`` (``title`` taken from the
    item's ``description``, ``sequence`` from its ``seq.data``) and rendered
    with ``str()``.  Rendered records are separated by two ``os.linesep``
    and the returned text ends with exactly one ``os.linesep``; with no
    records the result is a single ``os.linesep``.
    """
    def as_fasta(item):
        # Build the intermediate Fasta.Record used purely for formatting.
        fasta_record = Fasta.Record()
        fasta_record.title = item.description
        fasta_record.sequence = item.seq.data
        return str(fasta_record)

    # Join once instead of growing a string inside the loop, which copies
    # the accumulated text on every iteration.
    separator = os.linesep * 2
    joined = separator.join([as_fasta(item) for item in self._records])

    # Strip trailing whitespace left by the last record so the output
    # always finishes with a single line separator.
    return joined.rstrip() + os.linesep
def main(blast_file):
    """Group the clusters found in ``blast_file`` and write one file per group.

    Databases are looked up in a ``db`` directory below the current working
    directory.  Clusters returned by ``get_blast_clusters`` (length cutoff
    0.2) are filtered with ``filter_by_organism`` using the global
    ``org_includes``, a length plot is requested via ``length_plot``, and the
    filtered clusters are grouped by a ``SimilarityClusterGrouper``.  Group
    number ``N`` is written to ``<blast_file without extension>-bclusterN.txt``
    with one ``Fasta.Record`` per member; each member and its organism name
    are also printed to stdout.

    Args:
        blast_file: Path of the input file.  It is handed to
            ``get_blast_clusters`` and ``length_plot`` and its stem is used
            for the output file names.
    """
    db_dir = os.path.join(os.getcwd(), "db")
    cur_dbs = get_available_dbs(db_dir)
    length_cutoff = 0.2
    blast_clusters, all_lengths = get_blast_clusters(blast_file, length_cutoff)
    filter_clusters = filter_by_organism(blast_clusters, org_includes, cur_dbs)
    length_plot(all_lengths, blast_file)
    cluster_grouper = SimilarityClusterGrouper(2, 200, [(0.9, 10)])
    all_groups = cluster_grouper.get_final_groups(filter_clusters)

    base = os.path.splitext(blast_file)[0]
    for gindex, group in enumerate(all_groups):
        # Single-argument print(...) yields the same output whether print is
        # the Python 2 statement or the Python 3 function.
        print('-----------')
        # Format stem and index together: using the stem as part of a format
        # template would misbehave when the path contains a literal '%'.
        cluster_file = "%s-bcluster%s.txt" % (base, gindex)
        with open(cluster_file, "w") as out_handle:
            for gitem in group:
                db_rec = get_db_rec(gitem, cur_dbs)
                print("%s %s" % (gitem, db_rec["org_scientific_name"]))
                rec = Fasta.Record()
                rec.title = gitem
                rec.sequence = db_rec["seq"]
                out_handle.write(str(rec) + "\n")
def main(ipr_number, num_clusters, out_dir):
    """Cluster the records of one shelve database and write per-cluster output.

    The shelve file ``db/<ipr_number>`` below the current working directory
    is opened.  Every record without a truthy ``is_uniref_child`` value is
    converted into a four-value feature row (charge, weighted charge region,
    weighted number of ``db_refs``, weighted ``calc_domain_distance``), and
    the rows are partitioned with ``Cluster.kcluster``.

    For each resulting cluster the members are ordered by their
    ``networkx.dijkstra_path_length`` from ``'Mus musculus'`` in the graph
    returned by ``build_tax_graph``, and an HTML page
    ``<ipr_number>-cluster<N>.html`` is rendered from ``cluster_template``.
    Each member's sequence is also written to ``<ipr_number>-seqs.fa``.
    Both kinds of file are created in ``out_dir``.

    Args:
        ipr_number: Name of the shelve database inside ``db``; also used as
            the prefix of all output file names.
        num_clusters: Number of clusters requested from ``Cluster.kcluster``.
        out_dir: Directory receiving the FASTA file and the HTML pages; it is
            also passed to ``calc_charge_plot``.
    """
    charge_window = 75
    db_dir = os.path.join(os.getcwd(), "db")
    cur_db = shelve.open(os.path.join(db_dir, ipr_number))
    # try/finally instead of ``with``: shelve objects are context managers
    # only from Python 3.4 on, and the database must be closed on every path.
    try:
        tax_graph = build_tax_graph(cur_db)
        uniprot_ids = []
        info_array = []
        for db_domain in cur_db.keys():
            # Fetch once per key; each shelve lookup unpickles the record.
            db_item = cur_db[db_domain]
            if db_item.get("is_uniref_child", ""):
                continue
            uniprot_ids.append(db_domain)
            info_array.append([
                float(db_item["charge"]),
                float(db_item["charge_region"]) * 10.0,
                len(db_item.get("db_refs", [])) * 5.0,
                calc_domain_distance(db_item) * 50.0,
                #max(len(db_item.get("string_interactors", [])) - 1, 0),
            ])
        info_array = numpy.array(info_array)
        # Single-argument print(...) yields the same output whether print is
        # the Python 2 statement or the Python 3 function.
        print("Num genes %s %s" % (len(info_array), num_clusters))
        cluster_ids, error, nfound = Cluster.kcluster(
            info_array, nclusters=num_clusters, npass=50)
        #, method='a', dist='c')
        #tree = Cluster.treecluster(info_array, method='a', dist='c')
        #cluster_ids = tree.cut(num_clusters)

        # Rows of info_array and uniprot_ids were appended in lockstep, so
        # position i in cluster_ids belongs to uniprot_ids[i].
        cluster_dict = collections.defaultdict(list)
        for i, cluster_id in enumerate(cluster_ids):
            cluster_dict[cluster_id].append(uniprot_ids[i])

        out_seq_file = os.path.join(out_dir, "%s-seqs.fa" % (ipr_number,))
        with open(out_seq_file, "w") as out_seq_handle:
            for index, cluster_group in enumerate(cluster_dict.values()):
                print("*********** %s" % (index,))
                org_dists = []
                for uniprot_id in cluster_group:
                    org = cur_db[uniprot_id]["org_scientific_name"]
                    distance = networkx.dijkstra_path_length(
                        tax_graph, 'Mus musculus', org)
                    org_dists.append((distance, org, uniprot_id))
                # Tuples sort by distance first, then organism, then id.
                org_dists.sort()

                members = []
                for _distance, o, u in org_dists:
                    # One load per member instead of one per field below.
                    # NOTE(review): assumes the helpers called here do not
                    # modify the record they are given -- confirm.
                    member = cur_db[u]
                    charge_plot_img = calc_charge_plot(
                        u, member, charge_window, out_dir)
                    base, ext = os.path.splitext(charge_plot_img)
                    disorder_plot_img = "%s-idr%s" % (base, ext)

                    rec = Fasta.Record()
                    rec.title = u
                    rec.sequence = member["seq"]
                    out_seq_handle.write(str(rec) + "\n")

                    members.append(dict(
                        organism=o,
                        uniprot_id=get_uniprot_links([u]),
                        alt_names=get_alt_names(member),
                        alt_ids=get_uniprot_links(
                            member.get("uniref_children", [])),
                        charge=member["charge"],
                        charge_region="%0.2f" % member["charge_region"],
                        charge_plot_img=charge_plot_img,
                        disorder_plot_img=disorder_plot_img,
                        domains=len(member.get("db_refs", [])),
                        interactions=get_string_link(
                            u,
                            max(len(member.get("string_interactors", [])) - 1,
                                0)),
                        description=member.get("function_descr", " "),
                        c_distance="%0.2f" % calc_domain_distance(member),
                    ))

                html_file = os.path.join(
                    out_dir, "%s-cluster%s.html" % (ipr_number, index))
                with open(html_file, "w") as out_handle:
                    tmpl = Template(cluster_template)
                    out_handle.write(tmpl.render(cluster_members=members))
    finally:
        cur_db.close()