Exemple #1
0
def main(target_id, in_file, api_key):
    """Cluster UniProt entries by description terms and print the target's cluster.

    Identifiers are read from the first whitespace-separated column of
    ``in_file`` (the first line is skipped as a header).  Terms for
    identifiers not yet stored are fetched with ``get_description_terms``
    and kept in a shelve database named after ``in_file`` with a ``.db``
    extension.  All stored entries are then k-means clustered and every
    member of the cluster containing ``target_id`` is printed together
    with its stored terms.

    Args:
        target_id: identifier whose cluster members should be printed.
        in_file: path to the input file, one identifier per line.
        api_key: passed through unchanged to ``get_description_terms``.
    """
    cache_dir = os.path.join(os.getcwd(), "cache")
    uniprot_retriever = UniprotRestRetrieval(cache_dir)
    cur_db = shelve.open("%s.db" % os.path.splitext(in_file)[0])
    # try/finally so the shelve is closed (and flushed) even when a
    # retrieval or clustering step raises.
    try:
        # load the database
        with open(in_file) as in_handle:
            in_handle.readline()  # header
            for line in in_handle:
                fields = line.split()
                if not fields:
                    # Blank line: there is no identifier column to read.
                    continue
                uniprot_id = fields[0]
                # Membership test on the shelf itself avoids building the
                # full key list for every input line.
                if uniprot_id not in cur_db:
                    cur_terms = get_description_terms(uniprot_retriever,
                                                      uniprot_id, api_key)
                    # Entries without any terms are not stored.
                    if len(cur_terms) > 0:
                        cur_db[uniprot_id] = cur_terms
        # cluster and print out cluster details
        term_matrix, uniprot_ids = organize_term_array(cur_db)
        cluster_ids, error, nfound = Cluster.kcluster(
            term_matrix, nclusters=10, npass=20, method='a', dist='e')
        cluster_dict = collections.defaultdict(list)
        for i, cluster_id in enumerate(cluster_ids):
            cluster_dict[cluster_id].append(uniprot_ids[i])
        for cluster_group in cluster_dict.values():
            if target_id in cluster_group:
                for item in cluster_group:
                    # Single-argument print gives the same output on
                    # Python 2 and also parses on Python 3.
                    print("%s %s" % (item, cur_db[item]))
    finally:
        cur_db.close()
def main(ipr_number, num_clusters):
    """Cluster the records of one shelve database and write an HTML page per cluster.

    Opens the shelve ``db/<ipr_number>`` under the current working
    directory, builds a feature vector for every record that is not flagged
    ``is_uniref_child``, runs k-means on those vectors and renders
    ``cluster_template`` once per cluster into
    ``<ipr_number>-cluster<index>.html`` in the current directory.  Members
    of each cluster are ordered by their path length from 'Mus musculus' in
    the graph returned by ``build_tax_graph``.

    Args:
        ipr_number: name of the shelve file inside the ``db`` directory;
            also used as the prefix of the output file names.
        num_clusters: number of clusters requested from ``Cluster.kcluster``.
    """
    db_dir = os.path.join(os.getcwd(), "db")
    cur_db = shelve.open(os.path.join(db_dir, ipr_number))
    # try/finally so the shelve is always closed, including on errors.
    try:
        tax_graph = build_tax_graph(cur_db)
        uniprot_ids = []
        info_array = []
        for db_domain in cur_db.keys():
            # Every shelve lookup unpickles the record, so fetch it once.
            db_item = cur_db[db_domain]
            if db_item.get("is_uniref_child", ""):
                continue
            uniprot_ids.append(db_domain)
            # Each feature is scaled by a fixed weight before clustering.
            # NOTE: a string_interactors count feature was disabled in the
            # original code and is intentionally left out here.
            info_array.append([
                float(db_item["charge"]),
                float(db_item["charge_region"]) * 10.0,
                len(db_item.get("db_refs", [])) * 5.0,
                calc_domain_distance(db_item) * 100.0,
            ])
        info_array = numpy.array(info_array)
        # Single-argument print gives the same output on Python 2 and also
        # parses on Python 3.
        print("Num genes %s %s" % (len(info_array), num_clusters))
        # NOTE: method='a', dist='c' and Cluster.treecluster(...).cut(...)
        # were tried as alternatives in the original and left disabled.
        cluster_ids, error, nfound = Cluster.kcluster(
            info_array, nclusters=num_clusters, npass=50)
        cluster_dict = collections.defaultdict(list)
        for i, cluster_id in enumerate(cluster_ids):
            cluster_dict[cluster_id].append(uniprot_ids[i])
        for index, cluster_group in enumerate(cluster_dict.values()):
            print("*********** %s" % index)
            org_dists = []
            for uniprot_id in cluster_group:
                org = cur_db[uniprot_id]["org_scientific_name"]
                distance = networkx.dijkstra_path_length(
                    tax_graph, 'Mus musculus', org)
                org_dists.append((distance, org, uniprot_id))
            # Tuples sort by distance first, then organism, then identifier.
            org_dists.sort()
            members = []
            for _, org, uniprot_id in org_dists:
                db_item = cur_db[uniprot_id]
                members.append(dict(
                    organism=org,
                    uniprot_id=get_uniprot_links([uniprot_id]),
                    alt_names=get_alt_names(db_item),
                    alt_ids=get_uniprot_links(
                        db_item.get("uniref_children", [])),
                    charge=db_item["charge"],
                    charge_region="%0.2f" % db_item["charge_region"],
                    domains=len(db_item.get("db_refs", [])),
                    # One is subtracted from the interactor count, floored
                    # at zero, before it is handed to get_string_link.
                    interactions=get_string_link(
                        uniprot_id,
                        max(len(db_item.get("string_interactors", [])) - 1,
                            0)),
                    description=db_item.get("function_descr", " "),
                    c_distance="%0.2f" % calc_domain_distance(db_item),
                ))
            out_name = "%s-cluster%s.html" % (ipr_number, index)
            with open(out_name, "w") as out_handle:
                tmpl = Template(cluster_template)
                out_handle.write(tmpl.render(cluster_members=members))
    finally:
        cur_db.close()
def cluster_kernels(kernels_array, k=kmeans_k, times=1):
    """Run k-means ``times`` times and return the best assignment with its centroids.

    Each run calls ``Cluster.kcluster`` with ``nclusters=k`` and
    ``dist='a'``; the assignment with the smallest reported error is kept.

    Args:
        kernels_array: data handed unchanged to ``Cluster.kcluster`` and
            ``Cluster.clustercentroids``.
        k: number of clusters; defaults to the module-level ``kmeans_k``.
        times: how many independent clustering runs to compare.

    Returns:
        A ``(clusterid, cdata)`` pair: the winning cluster assignment and
        the first value returned by ``Cluster.clustercentroids`` for it.
    """
    print("start clustering")

    best_assignment = []
    lowest_error = float('inf')
    for _ in range(times):
        assignment, run_error, _nfound = Cluster.kcluster(
            kernels_array, nclusters=k, dist='a')
        # Strict comparison: on a tie the earlier run wins.
        if run_error < lowest_error:
            best_assignment, lowest_error = assignment, run_error
    print("error: %s" % lowest_error)

    # The second return value (cmask) is not used by this function.
    centroids, _mask = Cluster.clustercentroids(
        kernels_array, clusterid=best_assignment)

    print("end clustering")

    return best_assignment, centroids
Exemple #4
0
def main(ipr_number, num_clusters, out_dir):
    """Cluster one shelve database and write FASTA, plots and HTML into ``out_dir``.

    Opens the shelve ``db/<ipr_number>`` under the current working
    directory, builds a feature vector for every record that is not flagged
    ``is_uniref_child`` and runs k-means on those vectors.  For each
    cluster, members are ordered by their path length from 'Mus musculus'
    in the graph returned by ``build_tax_graph``; every member's sequence
    is appended to ``<out_dir>/<ipr_number>-seqs.fa`` and
    ``cluster_template`` is rendered to
    ``<out_dir>/<ipr_number>-cluster<index>.html``.

    Args:
        ipr_number: name of the shelve file inside the ``db`` directory;
            also used as the prefix of the output file names.
        num_clusters: number of clusters requested from ``Cluster.kcluster``.
        out_dir: directory that receives the FASTA and HTML files; it is
            also passed to ``calc_charge_plot``.
    """
    charge_window = 75
    db_dir = os.path.join(os.getcwd(), "db")
    cur_db = shelve.open(os.path.join(db_dir, ipr_number))
    # try/finally so the shelve is always closed, including on errors.
    try:
        tax_graph = build_tax_graph(cur_db)
        uniprot_ids = []
        info_array = []
        for db_domain in cur_db.keys():
            # Every shelve lookup unpickles the record, so fetch it once.
            db_item = cur_db[db_domain]
            if db_item.get("is_uniref_child", ""):
                continue
            uniprot_ids.append(db_domain)
            # Each feature is scaled by a fixed weight before clustering.
            # NOTE: a string_interactors count feature was disabled in the
            # original code and is intentionally left out here.
            info_array.append([
                float(db_item["charge"]),
                float(db_item["charge_region"]) * 10.0,
                len(db_item.get("db_refs", [])) * 5.0,
                calc_domain_distance(db_item) * 50.0,
            ])
        info_array = numpy.array(info_array)
        # Single-argument print gives the same output on Python 2 and also
        # parses on Python 3.
        print("Num genes %s %s" % (len(info_array), num_clusters))
        # NOTE: method='a', dist='c' and Cluster.treecluster(...).cut(...)
        # were tried as alternatives in the original and left disabled.
        cluster_ids, error, nfound = Cluster.kcluster(
            info_array, nclusters=num_clusters, npass=50)
        cluster_dict = collections.defaultdict(list)
        for i, cluster_id in enumerate(cluster_ids):
            cluster_dict[cluster_id].append(uniprot_ids[i])
        out_seq_file = os.path.join(out_dir, "%s-seqs.fa" % (ipr_number))
        # The context manager guarantees the FASTA file is flushed and
        # closed; the original left the handle open.
        with open(out_seq_file, "w") as out_seq_handle:
            for index, cluster_group in enumerate(cluster_dict.values()):
                print("*********** %s" % index)
                org_dists = []
                for uniprot_id in cluster_group:
                    org = cur_db[uniprot_id]["org_scientific_name"]
                    distance = networkx.dijkstra_path_length(
                        tax_graph, 'Mus musculus', org)
                    org_dists.append((distance, org, uniprot_id))
                # Tuples sort by distance first, then organism, then id.
                org_dists.sort()
                members = []
                for _, org, uniprot_id in org_dists:
                    db_item = cur_db[uniprot_id]
                    charge_plot_img = calc_charge_plot(
                        uniprot_id, db_item, charge_window, out_dir)
                    # The disorder plot name is the charge plot name with
                    # "-idr" inserted before the extension.
                    base, ext = os.path.splitext(charge_plot_img)
                    disorder_plot_img = "%s-idr%s" % (base, ext)
                    rec = Fasta.Record()
                    rec.title = uniprot_id
                    rec.sequence = db_item["seq"]
                    out_seq_handle.write(str(rec) + "\n")
                    members.append(dict(
                        organism=org,
                        uniprot_id=get_uniprot_links([uniprot_id]),
                        alt_names=get_alt_names(db_item),
                        alt_ids=get_uniprot_links(
                            db_item.get("uniref_children", [])),
                        charge=db_item["charge"],
                        charge_region="%0.2f" % db_item["charge_region"],
                        charge_plot_img=charge_plot_img,
                        disorder_plot_img=disorder_plot_img,
                        domains=len(db_item.get("db_refs", [])),
                        # One is subtracted from the interactor count,
                        # floored at zero, before get_string_link sees it.
                        interactions=get_string_link(
                            uniprot_id,
                            max(len(db_item.get("string_interactors", []))
                                - 1, 0)),
                        description=db_item.get("function_descr", "&nbsp;"),
                        c_distance="%0.2f" % calc_domain_distance(db_item),
                    ))
                out_html = os.path.join(
                    out_dir, "%s-cluster%s.html" % (ipr_number, index))
                with open(out_html, "w") as out_handle:
                    tmpl = Template(cluster_template)
                    out_handle.write(tmpl.render(cluster_members=members))
    finally:
        cur_db.close()