Beispiel #1
0
    def test_record_basic(self):
        """Check the attribute types of a freshly created Fasta.Record.

        Both ``title`` and ``sequence`` of a new record must have the exact
        type ``StringType``.
        """
        record = Fasta.Record()
        # Exact type identity is checked on purpose (not isinstance).
        for attr_name in ("title", "sequence"):
            assert type(getattr(record, attr_name)) is StringType
Beispiel #2
0
    def __str__(self):
        """Return a FASTA version of the alignment info.

        Every item in ``self._records`` is copied into a ``Fasta.Record``
        (``description`` becomes the title, ``seq.data`` the sequence) and
        formatted with ``str()``.  Consecutive records are separated by two
        ``os.linesep`` and the result ends with exactly one ``os.linesep``.
        With no records the result is a single ``os.linesep``.
        """
        fasta_chunks = []
        for item in self._records:
            fasta_record = Fasta.Record()
            fasta_record.title = item.description
            fasta_record.sequence = item.seq.data
            fasta_chunks.append(str(fasta_record))

        # Join once instead of growing a string inside the loop.  Trailing
        # whitespace is stripped and a single line separator is added back, so
        # the output always ends with exactly one newline.
        record_separator = os.linesep + os.linesep
        return record_separator.join(fasta_chunks).rstrip() + os.linesep
Beispiel #3
0
def main(blast_file):
    db_dir = os.path.join(os.getcwd(), "db")
    cur_dbs = get_available_dbs(db_dir)
    length_cutoff = 0.2
    blast_clusters, all_lengths = get_blast_clusters(blast_file, length_cutoff)
    filter_clusters = filter_by_organism(blast_clusters, org_includes, cur_dbs)
    length_plot(all_lengths, blast_file)
    cluster_grouper = SimilarityClusterGrouper(2, 200, [(0.9, 10)])
    all_groups = cluster_grouper.get_final_groups(filter_clusters)
    base, ext = os.path.splitext(blast_file)
    cluster_file = base + "-bcluster%s.txt"
    for gindex, group in enumerate(all_groups):
        print '-----------'
        with open(cluster_file % gindex, "w") as out_handle:
            for gitem in group:
                db_rec = get_db_rec(gitem, cur_dbs)
                print gitem, db_rec["org_scientific_name"]
                rec = Fasta.Record()
                rec.title = gitem
                rec.sequence = db_rec["seq"]
                out_handle.write(str(rec) + "\n")
Beispiel #4
0
def main(ipr_number, num_clusters, out_dir):
    charge_window = 75
    db_dir = os.path.join(os.getcwd(), "db")
    cur_db = shelve.open(os.path.join(db_dir, ipr_number))
    tax_graph = build_tax_graph(cur_db)
    uniprot_ids = []
    info_array = []
    for db_domain in cur_db.keys():
        if not cur_db[db_domain].get("is_uniref_child", ""):
            uniprot_ids.append(db_domain)
            db_item = cur_db[db_domain]
            cur_cluster_info = [
                float(db_item["charge"]),
                float(db_item["charge_region"]) * 10.0,
                len(db_item.get("db_refs", [])) * 5.0,
                calc_domain_distance(db_item) * 50.0,
                #max(len(db_item.get("string_interactors", [])) - 1, 0),
            ]
            info_array.append(cur_cluster_info)
    info_array = numpy.array(info_array)
    print 'Num genes', len(info_array), num_clusters
    cluster_ids, error, nfound = Cluster.kcluster(
        info_array, nclusters=num_clusters, npass=50)  #, method='a', dist='c')
    #tree = Cluster.treecluster(info_array, method='a', dist='c')
    #cluster_ids = tree.cut(num_clusters)
    cluster_dict = collections.defaultdict(lambda: [])
    for i, cluster_id in enumerate(cluster_ids):
        cluster_dict[cluster_id].append(uniprot_ids[i])
    out_seq_file = os.path.join(out_dir, "%s-seqs.fa" % (ipr_number))
    out_seq_handle = open(out_seq_file, "w")
    for index, cluster_group in enumerate(cluster_dict.values()):
        print '***********', index
        org_dists = []
        for uniprot_id in cluster_group:
            org = cur_db[uniprot_id]["org_scientific_name"]
            distance = networkx.dijkstra_path_length(tax_graph, 'Mus musculus',
                                                     org)
            org_dists.append((distance, org, uniprot_id))
        org_dists.sort()
        members = []
        for d, o, u in org_dists:
            charge_plot_img = calc_charge_plot(u, cur_db[u], charge_window,
                                               out_dir)
            base, ext = os.path.splitext(charge_plot_img)
            disorder_plot_img = "%s-idr%s" % (base, ext)
            rec = Fasta.Record()
            rec.title = u
            rec.sequence = cur_db[u]["seq"]
            out_seq_handle.write(str(rec) + "\n")
            members.append(
                dict(
                    organism=o,
                    uniprot_id=get_uniprot_links([u]),
                    alt_names=get_alt_names(cur_db[u]),
                    alt_ids=get_uniprot_links(cur_db[u].get(
                        "uniref_children", [])),
                    charge=cur_db[u]["charge"],
                    charge_region="%0.2f" % cur_db[u]["charge_region"],
                    charge_plot_img=charge_plot_img,
                    disorder_plot_img=disorder_plot_img,
                    domains=len(cur_db[u].get("db_refs", [])),
                    interactions=get_string_link(
                        u,
                        max(
                            len(cur_db[u].get("string_interactors", [])) - 1,
                            0)),
                    description=cur_db[u].get("function_descr", " "),
                    c_distance="%0.2f" % calc_domain_distance(cur_db[u]),
                ))
        with open(
                os.path.join(out_dir,
                             "%s-cluster%s.html" % (ipr_number, index)),
                "w") as out_handle:
            tmpl = Template(cluster_template)
            out_handle.write(tmpl.render(cluster_members=members))