Esempio n. 1
0
def uvrd_cas4_links(G, graph_save_dir):

    uvrd_profiles = [
        "COG0210", "PRK11773", "TIGR01075", "cl22977", "cl26202", "cl28312",
        "pfam00580", "pfam13361", "pfam13538"
    ]
    cas4_profiles = [
        "cd09637", "cd09659", "cls000170", "COG1468", "COG4343", "pfam01930",
        "pfam06023"
    ]
    all_profiles = uvrd_profiles + cas4_profiles
    profile2gene = t.map_cdd_defense2gene_name()

    print "Extracting subgraphs"
    cas4_g = gt.subgraph(G, cas4_profiles, radius=1)
    uvrd_g = gt.subgraph(G, uvrd_profiles, radius=1)
    print "Joining subgraphs"
    joint_g = nx.compose(cas4_g, uvrd_g)
    print "Writing graph to files"

    nodes_to_write = set()

    with open(os.path.join(graph_save_dir, "edgelist.txt"), "w") as edge_outf:

        for edge in joint_g.edges(data=True):
            (p1, p2, data) = edge

            # if p1 in all_profiles and p2 in all_profiles:
            edge_outf.write("%s\t%s\t%f\n" % (p1, p2, data['weight']))
            nodes_to_write.update([p1, p2])

    with open(os.path.join(graph_save_dir, "nodes.txt"), "w") as node_outf:

        for p in nodes_to_write:
            if p in cas4_profiles:
                _type = 1
            elif p in uvrd_profiles:
                _type = 2
            else:
                _type = 0

            node_outf.write("%s\t%s\t%d\n" % (p, profile2gene[p], _type))

    return joint_g
Esempio n. 2
0
def uvrd_cas4_links(G, graph_save_dir):

    uvrd_profiles = ["COG0210", "PRK11773", "TIGR01075", "cl22977", "cl26202", "cl28312", "pfam00580", "pfam13361", "pfam13538"]
    cas4_profiles = ["cd09637", "cd09659", "cls000170", "COG1468", "COG4343", "pfam01930", "pfam06023"]
    all_profiles = uvrd_profiles + cas4_profiles
    profile2gene = t.map_cdd_defense2gene_name()

    print "Extracting subgraphs"
    cas4_g = gt.subgraph(G, cas4_profiles, radius=1)
    uvrd_g = gt.subgraph(G, uvrd_profiles, radius=1)
    print "Joining subgraphs"
    joint_g = nx.compose(cas4_g, uvrd_g)
    print "Writing graph to files"

    nodes_to_write= set()

    with open(os.path.join(graph_save_dir, "edgelist.txt"),"w") as edge_outf:

        for edge in joint_g.edges(data=True):
            (p1, p2, data) = edge

            # if p1 in all_profiles and p2 in all_profiles:
            edge_outf.write("%s\t%s\t%f\n" % (p1, p2, data['weight'])) 
            nodes_to_write.update([p1, p2])

    with open(os.path.join(graph_save_dir, "nodes.txt"),"w") as node_outf:

        for p in nodes_to_write:
            if p in cas4_profiles:
                _type = 1
            elif p in uvrd_profiles:
                _type = 2
            else:
                _type = 0

            node_outf.write("%s\t%s\t%d\n" % (p, profile2gene[p], _type))

    return joint_g
Esempio n. 3
0
def add_annotation_to_pty():

    pty_dir = os.path.join(prok1603_path, "pty")

    print "Loading dictionaries"
    gi2profiles = map_gi2profiles()
    gi2cluster = map_gi2cluster()
    profile2gene = t.map_cdd_defense2gene_name()

    print "Maps loaded. Starting annotations"
    cnt = 1
    for d in os.listdir(pty_dir):

        # if cnt < 1937:
        #     cnt += 1
        #     continue

        print cnt, d

        for f in glob.glob("%s/*.pty" % (os.path.join(pty_dir, d))):
            genes = t.parse_plain_pty_file(f)

            if not genes:
                continue

            for gene in genes:
                if gene.gid in gi2profiles:
                    gene.profiles = gi2profiles[gene.gid]
                    gene.gene_names = " ".join([profile2gene[p] for p in gene.profiles])
                elif gene.gid in gi2cluster:
                    gene.profiles = gi2cluster[gene.gid]

            new_f = f + "2"

            t.write_genes_to_pty(genes, new_f)

        cnt += 1
Esempio n. 4
0
def extract_cas_communities(tree_file, node_file, G=None):

    module2members = infomap.parse_tree_file(tree_file,
                                             node_names=True,
                                             node_desc_file=node_file)

    modules = sorted([(k, v) for (k, v) in module2members.items()],
                     key=lambda x: len(x[1]),
                     reverse=True)

    cl2name = cas_synonyms()
    profile2gene = t.map_cdd_defense2gene_name()

    def_profs_file = "/panfs/pan1/patternquest/Projects/NewSystems/data/profiles/defenseProfiles.tab"
    cas_profiles = [
        l.split()[0] for l in open(def_profs_file) if l.split()[1] == "CRISPR"
    ]

    # uvrd_profiles = ["COG0210", "PRK11773", "TIGR01075", "cl22977", "cl26202", "cl28312", "pfam00580", "pfam13361",
    #                  "pfam13538"]

    cas_profiles = cas_profiles + list(cl2name.keys())

    save_dir = os.path.join(work_dir, "cases/crispr_graph")

    for module in modules:
        nodes = set(module[1])
        cas_nodes = nodes.intersection(cas_profiles)

        if cas_nodes:

            # if "crispr" in nodes:
            #     print "CRISPR"

            cas_ratio = len(cas_nodes) / float(len(nodes))

            if cas_ratio < 0.3:
                continue

            print module[0], len(nodes), cas_ratio

            continue

            graph_file = os.path.join(save_dir,
                                      "%s_%d.net" % (module[0], len(nodes)))
            node_file = os.path.join(
                save_dir, "%s_%d.nodes.txt" % (module[0], len(nodes)))

            subgraph = gt.subgraph_crosslink(G, list(nodes))
            gt.write_edgelist(subgraph, graph_file)

            with open(node_file, "w") as outf:
                outf.write("Node\tgene_name\t")
                for node in subgraph:
                    if node in cl2name:
                        _name = cl2name[node]
                    elif node in profile2gene:
                        _name = profile2gene[node]
                    else:
                        _name = node

                    outf.write("%s\t%s\n" % (node, _name))
            break
Esempio n. 5
0
def extract_extract_cas_communities_from_oslom(G):

    oslom_file = "/panfs/pan1/patternquest/Projects/NewSystems/data/prok1603/graph/infomap/merged/consensus_clustering/multi_level/norm_min/3/oslom/tp"

    module2pvalue = {}
    module2members = {}
    with open(oslom_file) as inf:
        l = inf.readline()

        while l:
            assert l.startswith("#")

            parts = l.split()
            name = parts[1]
            size = int(parts[3])
            if size == 1:
                l = inf.readline()
                l = inf.readline()
                continue

            pvalue = float(parts[5])

            module2pvalue[name] = pvalue
            module2members[name] = inf.readline().strip().split()
            l = inf.readline()

    # modules = [l.strip().split() for l in open(oslom_file) if not l.startswith("#")]

    node_file = os.path.join(work_dir, "infomap/merged/nodes.txt")

    id2node = {l.split()[0]: l.split()[1] for l in open(node_file)}

    def_profs_file = "/panfs/pan1/patternquest/Projects/NewSystems/data/profiles/defenseProfiles.tab"
    cas_profiles = [
        l.split()[0] for l in open(def_profs_file) if l.split()[1] == "CRISPR"
    ]
    cl2name = cas_synonyms()
    cas_profiles = cas_profiles + list(cl2name.keys())
    save_dir = os.path.join(work_dir, "cases/crispr_graph_2/")

    profile2gene = t.map_cdd_defense2gene_name()

    for (name, members) in module2members.items():
        # nodes = set(module[1])
        nodes = set([id2node[id] for id in members])
        cas_nodes = nodes.intersection(cas_profiles)

        if cas_nodes:

            cas_ratio = len(cas_nodes) / float(len(nodes))

            if cas_ratio < 0.3:
                continue

            if "crispr" in nodes:
                print "CRISPR"

            # print len(nodes), cas_ratio, " ".join(nodes)
            # print len(nodes), cas_ratio, Counter([profile2gene[n] for n in cas_nodes])
            vals = [
                name,
                str(module2pvalue[name]),
                str(len(nodes)),
                str(cas_ratio), " ".join(nodes)
            ]
            # print "\t".join(vals)

            graph_file = os.path.join(save_dir,
                                      "%s_%d.net" % (name, len(nodes)))
            node_file_s = os.path.join(save_dir,
                                       "%s_%d.nodes.txt" % (name, len(nodes)))

            subgraph = gt.subgraph_crosslink(G, list(nodes))
            gt.write_edgelist(subgraph, graph_file)

            with open(node_file_s, "w") as outf:
                outf.write("Node\tgene_name\tcas\n")
                for node in subgraph:
                    if node in cl2name:
                        _name = cl2name[node]
                    elif node in profile2gene:
                        _name = profile2gene[node]
                    else:
                        _name = node

                    _type = 0
                    if node in cas_nodes:
                        _type = 1

                    outf.write("%s\t%s\t%d\n" % (node, _name, _type))
        os.path.join(os.path.expanduser('~'), 'Projects/SystemFiles/'))
elif sys.platform == 'linux2':
    sys.path.append(
        os.path.join(os.path.expanduser('~'), 'Projects/lib/BioPy/'))
    sys.path.append(
        os.path.join(os.path.expanduser('~'), 'Projects/SystemFiles/'))

import global_variables as gv
sys.path.append(gv.project_code_path)
from lib.utils import tools as t
import glob

# Directory whose entries are iterated by the loop that follows.
root_path = '/home/hudaiber/data/Prok1603/pty/'

print "Loading profile2genes"
profile2gene = t.map_cdd_defense2gene_name()

print "Loading ccp"
map_file = '/panfs/pan1.be-md.ncbi.nlm.nih.gov/prokdata/db_tmp/Prok1603/Annotation/Prok1603.ccp.csv'
gi2cdd = t.map_gid2cdd(map_file)

print "Loading CDD"
# NOTE(review): same call as for `profile2gene` above, so the map is loaded
# twice -- presumably one load could be shared; confirm neither is mutated.
cdd2gene = t.map_cdd_defense2gene_name()

# Counters initialised to zero; presumably updated by the loop below (its
# body is not visible here).
cnt = 0

cas4 = 0
cas4_uvrd = 0
uvrd = 0

for dir in os.listdir(root_path):