def uvrd_cas4_links(G, graph_save_dir):
    """Export the joint neighbourhood graph of the UvrD and Cas4 profiles.

    Two subgraphs are extracted from ``G`` with ``gt.subgraph(..., radius=1)``
    (presumably the seed profiles plus their direct neighbours -- confirm
    against ``gt.subgraph``), merged with ``nx.compose`` and written to
    ``graph_save_dir`` as two tab-separated files:

    * ``edgelist.txt`` -- ``node1 <TAB> node2 <TAB> weight`` for every edge
      of the joint graph (no filtering on the seed profiles).
    * ``nodes.txt``    -- ``node <TAB> gene_name <TAB> type`` for every node
      that appears in at least one edge; type is 1 for a Cas4 profile,
      2 for a UvrD profile and 0 for anything else.

    NOTE(review): a second definition with the same name follows this one
    in the file and overrides it at import time.

    :param G: graph passed through to ``gt.subgraph``.
    :param graph_save_dir: existing directory that receives both files.
    :return: the composed graph.
    :raises KeyError: if an edge carries no ``'weight'`` attribute.
    """
    uvrd_profiles = ["COG0210", "PRK11773", "TIGR01075", "cl22977",
                     "cl26202", "cl28312", "pfam00580", "pfam13361",
                     "pfam13538"]
    cas4_profiles = ["cd09637", "cd09659", "cls000170", "COG1468",
                     "COG4343", "pfam01930", "pfam06023"]
    # Sets are used for the per-node membership tests; the lists above are
    # still what is handed to gt.subgraph, exactly as before.
    uvrd_set = set(uvrd_profiles)
    cas4_set = set(cas4_profiles)

    profile2gene = t.map_cdd_defense2gene_name()

    print("Extracting subgraphs")
    cas4_g = gt.subgraph(G, cas4_profiles, radius=1)
    uvrd_g = gt.subgraph(G, uvrd_profiles, radius=1)

    print("Joining subgraphs")
    joint_g = nx.compose(cas4_g, uvrd_g)

    print("Writing graph to files")
    nodes_to_write = set()
    with open(os.path.join(graph_save_dir, "edgelist.txt"), "w") as edge_outf:
        for p1, p2, data in joint_g.edges(data=True):
            edge_outf.write("%s\t%s\t%f\n" % (p1, p2, data['weight']))
            nodes_to_write.update([p1, p2])

    with open(os.path.join(graph_save_dir, "nodes.txt"), "w") as node_outf:
        for p in nodes_to_write:
            if p in cas4_set:
                _type = 1
            elif p in uvrd_set:
                _type = 2
            else:
                _type = 0
            # Neighbour nodes are not guaranteed to be in the defense-profile
            # map; fall back to the profile id instead of raising KeyError
            # half-way through writing the file.
            gene_name = profile2gene[p] if p in profile2gene else p
            node_outf.write("%s\t%s\t%d\n" % (p, gene_name, _type))

    return joint_g
def uvrd_cas4_links(G, graph_save_dir):
    """Export the joint neighbourhood graph of the UvrD and Cas4 profiles.

    Two subgraphs are extracted from ``G`` with ``gt.subgraph(..., radius=1)``
    (presumably the seed profiles plus their direct neighbours -- confirm
    against ``gt.subgraph``), merged with ``nx.compose`` and written to
    ``graph_save_dir`` as two tab-separated files:

    * ``edgelist.txt`` -- ``node1 <TAB> node2 <TAB> weight`` for every edge
      of the joint graph (no filtering on the seed profiles).
    * ``nodes.txt``    -- ``node <TAB> gene_name <TAB> type`` for every node
      that appears in at least one edge; type is 1 for a Cas4 profile,
      2 for a UvrD profile and 0 for anything else.

    NOTE(review): this is the second definition of ``uvrd_cas4_links`` in
    the file; being the later one, it is the definition that is in effect
    after import.

    :param G: graph passed through to ``gt.subgraph``.
    :param graph_save_dir: existing directory that receives both files.
    :return: the composed graph.
    :raises KeyError: if an edge carries no ``'weight'`` attribute.
    """
    uvrd_profiles = ["COG0210", "PRK11773", "TIGR01075", "cl22977",
                     "cl26202", "cl28312", "pfam00580", "pfam13361",
                     "pfam13538"]
    cas4_profiles = ["cd09637", "cd09659", "cls000170", "COG1468",
                     "COG4343", "pfam01930", "pfam06023"]
    # Sets are used for the per-node membership tests; the lists above are
    # still what is handed to gt.subgraph, exactly as before.
    uvrd_set = set(uvrd_profiles)
    cas4_set = set(cas4_profiles)

    profile2gene = t.map_cdd_defense2gene_name()

    print("Extracting subgraphs")
    cas4_g = gt.subgraph(G, cas4_profiles, radius=1)
    uvrd_g = gt.subgraph(G, uvrd_profiles, radius=1)

    print("Joining subgraphs")
    joint_g = nx.compose(cas4_g, uvrd_g)

    print("Writing graph to files")
    nodes_to_write = set()
    with open(os.path.join(graph_save_dir, "edgelist.txt"), "w") as edge_outf:
        for p1, p2, data in joint_g.edges(data=True):
            edge_outf.write("%s\t%s\t%f\n" % (p1, p2, data['weight']))
            nodes_to_write.update([p1, p2])

    with open(os.path.join(graph_save_dir, "nodes.txt"), "w") as node_outf:
        for p in nodes_to_write:
            if p in cas4_set:
                _type = 1
            elif p in uvrd_set:
                _type = 2
            else:
                _type = 0
            # Neighbour nodes are not guaranteed to be in the defense-profile
            # map; fall back to the profile id instead of raising KeyError
            # half-way through writing the file.
            gene_name = profile2gene[p] if p in profile2gene else p
            node_outf.write("%s\t%s\t%d\n" % (p, gene_name, _type))

    return joint_g
def add_annotation_to_pty():
    """Write annotated copies of all ``.pty`` files under ``<prok1603_path>/pty``.

    Every sub-directory of the ``pty`` directory is scanned for ``*.pty``
    files. Each file is parsed with ``t.parse_plain_pty_file``; files that
    yield no genes are skipped. For every parsed gene:

    * if its ``gid`` is a key of the gi->profiles map, ``gene.profiles`` is
      set from that map and ``gene.gene_names`` becomes the space-joined
      gene names of those profiles;
    * otherwise, if its ``gid`` is a key of the gi->cluster map,
      ``gene.profiles`` is set from that map.

    The annotated genes are written with ``t.write_genes_to_pty`` to a new
    file whose name is the input name with ``"2"`` appended. A running
    1-based directory counter and the directory name are printed as
    progress output.
    """
    pty_dir = os.path.join(prok1603_path, "pty")

    print("Loading dictionaries")
    gi2profiles = map_gi2profiles()
    gi2cluster = map_gi2cluster()
    profile2gene = t.map_cdd_defense2gene_name()
    print("Maps loaded. Starting annotations")

    for cnt, dir_name in enumerate(os.listdir(pty_dir), 1):
        print("%s %s" % (cnt, dir_name))

        pattern = "%s/*.pty" % (os.path.join(pty_dir, dir_name))
        for pty_file in glob.glob(pattern):
            genes = t.parse_plain_pty_file(pty_file)
            if not genes:
                continue

            for gene in genes:
                if gene.gid in gi2profiles:
                    gene.profiles = gi2profiles[gene.gid]
                    names = [profile2gene[p] for p in gene.profiles]
                    gene.gene_names = " ".join(names)
                elif gene.gid in gi2cluster:
                    gene.profiles = gi2cluster[gene.gid]

            # The annotated copy lives next to the input: "<name>.pty2".
            t.write_genes_to_pty(genes, pty_file + "2")
def extract_cas_communities(tree_file, node_file, G=None):
    """Report the modules of an Infomap tree that are rich in CRISPR-Cas profiles.

    Modules are read with ``infomap.parse_tree_file`` and visited from the
    largest to the smallest. For every module in which at least 30% of the
    nodes are Cas profiles, the module key, its size and the Cas ratio are
    printed.

    NOTE(review): the source this block was recovered from had lost its
    indentation; the nesting below is a reconstruction. In this reading the
    bare ``continue`` right after the ``print`` makes everything that
    follows it inside the loop (graph/node file export and the ``break``)
    unreachable, so the function currently only prints. Confirm against the
    original file before relying on the position of ``break``.

    :param tree_file: tree file handed to ``infomap.parse_tree_file``.
    :param node_file: node description file handed to the same parser.
    :param G: graph used only by the (currently unreachable) export code.
    """
    module2members = infomap.parse_tree_file(tree_file, node_names=True, node_desc_file=node_file)
    # (module key, members) pairs, largest module first.
    modules = sorted([(k, v) for (k, v) in module2members.items()], key=lambda x: len(x[1]), reverse=True)
    cl2name = cas_synonyms()
    profile2gene = t.map_cdd_defense2gene_name()
    def_profs_file = "/panfs/pan1/patternquest/Projects/NewSystems/data/profiles/defenseProfiles.tab"
    # First column of every line whose second column is "CRISPR".
    # NOTE(review): the file handle opened here is never closed explicitly.
    cas_profiles = [l.split()[0] for l in open(def_profs_file) if l.split()[1] == "CRISPR"]
    # uvrd_profiles = ["COG0210", "PRK11773", "TIGR01075", "cl22977", "cl26202", "cl28312", "pfam00580", "pfam13361",
    #                  "pfam13538"]
    # The keys of the Cas synonym map count as Cas profiles as well.
    cas_profiles = cas_profiles + list(cl2name.keys())
    save_dir = os.path.join(work_dir, "cases/crispr_graph")
    for module in modules:
        nodes = set(module[1])
        cas_nodes = nodes.intersection(cas_profiles)
        if cas_nodes:
            # if "crispr" in nodes:
            #     print "CRISPR"
            cas_ratio = len(cas_nodes) / float(len(nodes))
            # Skip modules with fewer than 30% Cas nodes.
            if cas_ratio < 0.3:
                continue
            print module[0], len(nodes), cas_ratio
            # NOTE(review): this unconditional `continue` short-circuits the
            # export below; it looks like a debugging leftover.
            continue
            # ---- unreachable from here to the end of the `if` body ----
            graph_file = os.path.join(save_dir, "%s_%d.net" % (module[0], len(nodes)))
            # NOTE(review): rebinding `node_file` shadows the function parameter.
            node_file = os.path.join(save_dir, "%s_%d.nodes.txt" % (module[0], len(nodes)))
            subgraph = gt.subgraph_crosslink(G, list(nodes))
            gt.write_edgelist(subgraph, graph_file)
            with open(node_file, "w") as outf:
                # NOTE(review): this header has no terminating newline, so the
                # first data row would be appended to the header line.
                outf.write("Node\tgene_name\t")
                for node in subgraph:
                    # Name lookup order: Cas synonym, defense gene name, raw node id.
                    if node in cl2name:
                        _name = cl2name[node]
                    elif node in profile2gene:
                        _name = profile2gene[node]
                    else:
                        _name = node
                    outf.write("%s\t%s\n" % (node, _name))
            break
def _read_oslom_modules(tp_path):
    """Parse an OSLOM ``tp`` file into ``(module2pvalue, module2members)``.

    The file is read as pairs of lines: a header line starting with ``#``
    followed by a line of whitespace-separated member ids. From the
    whitespace-split header, token 1 is used as the module name, token 3 as
    its size and token 5 as its p-value. Modules of size 1 are skipped.

    :raises ValueError: if a line in header position does not start with
        ``#`` (this replaces a bare ``assert``, which is removed under
        ``python -O``).
    """
    module2pvalue = {}
    module2members = {}
    with open(tp_path) as inf:
        for header in inf:
            if not header.startswith("#"):
                raise ValueError(
                    "Expected a '#' module header in %s, got: %r"
                    % (tp_path, header))
            parts = header.split()
            name = parts[1]
            size = int(parts[3])
            # Always consume the member line so that the next iteration
            # starts on a header again; a missing last line reads as empty.
            members_line = next(inf, "")
            if size == 1:
                continue
            module2pvalue[name] = float(parts[5])
            module2members[name] = members_line.strip().split()
    return module2pvalue, module2members


def extract_extract_cas_communities_from_oslom(G):
    """Export the OSLOM modules that are rich in CRISPR-Cas profiles.

    OSLOM modules are read from a fixed ``tp`` file and their member ids are
    translated to node names through ``<work_dir>/infomap/merged/nodes.txt``
    (first column id, second column name). For every module in which at
    least 30% of the nodes are Cas profiles, two files are written to
    ``<work_dir>/cases/crispr_graph_2/``:

    * ``<module>_<size>.net``       -- edge list of
      ``gt.subgraph_crosslink(G, nodes)``, written by ``gt.write_edgelist``;
    * ``<module>_<size>.nodes.txt`` -- ``Node <TAB> gene_name <TAB> cas``
      with ``cas`` set to 1 for Cas nodes of the module and 0 otherwise.

    ``CRISPR`` is printed for every exported module that contains a node
    named ``"crispr"``.

    :param G: graph handed to ``gt.subgraph_crosslink``.
    :raises ValueError: if the OSLOM file is malformed (see
        ``_read_oslom_modules``).
    :raises KeyError: if a module member id is missing from the node file.
    """
    oslom_file = "/panfs/pan1/patternquest/Projects/NewSystems/data/prok1603/graph/infomap/merged/consensus_clustering/multi_level/norm_min/3/oslom/tp"
    # The p-values are parsed (and thereby validated) but not reported.
    _module2pvalue, module2members = _read_oslom_modules(oslom_file)

    node_file = os.path.join(work_dir, "infomap/merged/nodes.txt")
    id2node = {}
    with open(node_file) as inf:
        for line in inf:
            fields = line.split()
            id2node[fields[0]] = fields[1]

    def_profs_file = "/panfs/pan1/patternquest/Projects/NewSystems/data/profiles/defenseProfiles.tab"
    cas_profiles = set()
    with open(def_profs_file) as inf:
        for line in inf:
            fields = line.split()
            # Keep the first column of every line marked "CRISPR".
            if fields[1] == "CRISPR":
                cas_profiles.add(fields[0])

    cl2name = cas_synonyms()
    # The keys of the Cas synonym map count as Cas profiles as well.
    cas_profiles.update(cl2name.keys())

    save_dir = os.path.join(work_dir, "cases/crispr_graph_2/")
    profile2gene = t.map_cdd_defense2gene_name()

    for (name, members) in module2members.items():
        nodes = {id2node[member_id] for member_id in members}
        cas_nodes = nodes.intersection(cas_profiles)
        if not cas_nodes:
            continue

        cas_ratio = len(cas_nodes) / float(len(nodes))
        if cas_ratio < 0.3:
            continue

        if "crispr" in nodes:
            print("CRISPR")

        graph_file = os.path.join(save_dir, "%s_%d.net" % (name, len(nodes)))
        node_file_s = os.path.join(
            save_dir, "%s_%d.nodes.txt" % (name, len(nodes)))

        subgraph = gt.subgraph_crosslink(G, list(nodes))
        gt.write_edgelist(subgraph, graph_file)

        with open(node_file_s, "w") as outf:
            outf.write("Node\tgene_name\tcas\n")
            for node in subgraph:
                # Name lookup order: Cas synonym, defense gene name, raw id.
                if node in cl2name:
                    _name = cl2name[node]
                elif node in profile2gene:
                    _name = profile2gene[node]
                else:
                    _name = node
                _type = 1 if node in cas_nodes else 0
                outf.write("%s\t%s\t%d\n" % (node, _name, _type))
os.path.join(os.path.expanduser('~'), 'Projects/SystemFiles/')) elif sys.platform == 'linux2': sys.path.append( os.path.join(os.path.expanduser('~'), 'Projects/lib/BioPy/')) sys.path.append( os.path.join(os.path.expanduser('~'), 'Projects/SystemFiles/')) import global_variables as gv sys.path.append(gv.project_code_path) from lib.utils import tools as t import glob root_path = '/home/hudaiber/data/Prok1603/pty/' print "Loading profile2genes" profile2gene = t.map_cdd_defense2gene_name() print "Loading ccp" map_file = '/panfs/pan1.be-md.ncbi.nlm.nih.gov/prokdata/db_tmp/Prok1603/Annotation/Prok1603.ccp.csv' gi2cdd = t.map_gid2cdd(map_file) print "Loading CDD" cdd2gene = t.map_cdd_defense2gene_name() cnt = 0 cas4 = 0 cas4_uvrd = 0 uvrd = 0 for dir in os.listdir(root_path):