def get_GO_dag():
    """Return the GODag loaded from ``GO_PATH``, downloading the file on failure.

    The OBO file at ``GO_PATH`` is parsed first. If that raises anything,
    ``download_go_basic_obo`` is asked to write the file to the same path and
    the parse is attempted once more; a second failure propagates.
    """
    try:
        return GODag(obo_file=GO_PATH.as_posix())
    except Exception:
        # Any load error triggers exactly one download-and-retry.
        download_go_basic_obo(obo=GO_PATH.as_posix())
        return GODag(obo_file=GO_PATH.as_posix())
def build_hierarcy():
    """Write the GO:0005575 hierarchy to disk and merge PPI edges into Neo4j.

    Steps:
      1. Fetch the PPI-derived GO edges and the GO<->gene mappings.
      2. Load the GO DAG (with the optional 'relationship' attribute) and let
         ``extract_hier_all`` write the hierarchy under GO:0005575 to
         ``<BASE_PROFILE>/output/go_hierarcy.txt``.
      3. For every edge whose two end terms are both known leaf vertices and
         whose score exceeds 100000, MERGE the two GO nodes and the scored
         relationship into the Neo4j instance at bolt://localhost:7687.

    Returns:
        None. The number of merged edges is printed.
    """
    print("fetching ppi")
    go_edges = fetch_string_ppi_edges()
    go2geneids, geneids2go = fetch_go_hierarcy()

    dag_fin = os.path.join(constants.GO_DIR, constants.GO_FILE_NAME)
    godag = GODag(dag_fin, optional_attrs=['relationship'])
    gosubdag = GoSubDag(godag.keys(), godag)

    dict_result = {}
    out_path = os.path.join(constants.BASE_PROFILE, "output", "go_hierarcy.txt")
    # The context manager guarantees the hierarchy file is flushed and closed.
    with open(out_path, "w+") as out:
        for cur_term in ['GO:0005575']:
            vertices, edges = extract_hier_all(gosubdag, out, cur_term, go2geneids)
            dict_result[cur_term] = {"vertices": vertices, "edges": edges}

    def add_edge(tx, src, dst, score):
        """MERGE both GO nodes and the scored SCR relationship between them."""
        # Query parameters instead of str.format: no quoting/injection issues.
        # NOTE(review): assumes score is numeric, as the `> 100000` filter
        # below implies; a string score would now be stored as a string.
        tx.run("MERGE (n1:GO {term: $src}) "
               "MERGE (n2:GO {term: $dst}) "
               "MERGE (n1)-[r:SCR {score: $score}]->(n2)",
               src=src, dst=dst, score=score)

    # NOTE(review): credentials are hard-coded; move them to configuration.
    driver = GraphDatabase.driver("bolt://localhost:7687", auth=("neo4j", "Hh123456"))
    known_vertices = dict_result['GO:0005575']['vertices']
    count = 0
    try:
        with driver.session() as session:
            for cur_edges, score in go_edges.items():
                ends = cur_edges.split("=")
                if (ends[0] in known_vertices and ends[1] in known_vertices
                        and score > 100000
                        and known_vertices[ends[0]]['isleaf']
                        and known_vertices[ends[1]]['isleaf']):
                    count += 1
                    session.write_transaction(add_edge, ends[0], ends[1], score)
    finally:
        # Release the connection pool even if a transaction fails.
        driver.close()
    print("total edges: {}".format(count))
def build_hierarcy():
    """Write the GO:0005575 hierarchy and the filtered PPI GO edges to disk.

    The hierarchy under GO:0005575 is written by ``extract_hier_all`` to
    ``<BASE_PROFILE>/output/go_hierarcy.txt``. Every PPI edge whose two end
    terms are both known leaf vertices and whose score exceeds 1000 is then
    written as ``<edge>\\t<score>`` to
    ``<OUTPUT_GLOBAL_DIR>/GO_edges_ppi_filtered.txt``.

    Returns:
        None.
    """
    print("fetching ppi")
    go_edges = fetch_string_ppi_edges()
    go2geneids, geneids2go = fetch_go_hierarcy()

    dag_fin = os.path.join(constants.GO_DIR, constants.GO_FILE_NAME)
    godag = GODag(dag_fin, optional_attrs=['relationship'])
    gosubdag = GoSubDag(godag.keys(), godag)

    dict_result = {}
    out_path = os.path.join(constants.BASE_PROFILE, "output", "go_hierarcy.txt")
    # The context manager guarantees the hierarchy file is flushed and closed.
    with open(out_path, "w+") as out:
        for cur_term in ['GO:0005575']:
            vertices, edges = extract_hier_all(gosubdag, out, cur_term, go2geneids)
            dict_result[cur_term] = {"vertices": vertices, "edges": edges}

    known_vertices = dict_result['GO:0005575']['vertices']
    lines = []
    for cur_edges, score in go_edges.items():
        ends = cur_edges.split("=")
        if (ends[0] in known_vertices and ends[1] in known_vertices
                and score > 1000
                and known_vertices[ends[0]]['isleaf']
                and known_vertices[ends[1]]['isleaf']):
            lines.append("{}\t{}\n".format(cur_edges, score))

    print("about to write filtered ppi go edges to file ({} lines)".format(len(lines)))
    filtered_path = os.path.join(constants.OUTPUT_GLOBAL_DIR, "GO_edges_ppi_filtered.txt")
    with open(filtered_path, "w+") as f:
        f.writelines(lines)
def build_hierarcy(go_folder, roots=None, ev_exclude=None):
    """Extract the GO hierarchy below each root term.

    Args:
        go_folder: Passed through to ``fetch_go_hierarcy``.
        roots: GO IDs to start from. Defaults to ``['GO:0008150']``
            (other roots noted by the original author: 0005575, 0003674).
        ev_exclude: Passed through to ``fetch_go_hierarcy``. Defaults to an
            empty set.

    Returns:
        A 4-tuple ``(dict_result, go2geneids, geneids2go, entrez2ensembl)``
        where ``dict_result[root]`` is ``{"vertices": ..., "edges": ...}`` as
        returned by ``extract_hier_all`` and the last item is the result of
        ``get_entrez2ensembl_dictionary()``.
    """
    # Fresh objects per call: mutable defaults in the signature would be
    # shared between calls and could be mutated by a callee.
    if roots is None:
        roots = ['GO:0008150']
    if ev_exclude is None:
        ev_exclude = set()

    go2geneids, geneids2go = fetch_go_hierarcy(go_folder, ev_exclude)

    dag_fin = os.path.join(constants.GO_DIR, constants.GO_FILE_NAME)
    tic = timeit.default_timer()
    godag = GODag(dag_fin, optional_attrs=['relationship'])
    gosubdag = GoSubDag(godag.keys(), godag)
    toc = timeit.default_timer()

    dict_result = {}
    for cur_term in roots:
        vertices, edges = extract_hier_all(gosubdag, cur_term, go2geneids)
        # The elapsed time covers only the DAG load above; it is reported
        # once per root, as before.
        msg = "Elapsed HMS: {}\n\n".format(
            str(datetime.timedelta(seconds=(toc - tic))))
        sys.stdout.write(msg)
        dict_result[cur_term] = {"vertices": vertices, "edges": edges}
    return dict_result, go2geneids, geneids2go, get_entrez2ensembl_dictionary()
def gen_anno_small():
    """Generate a smaller annotation file using one tenth of each NAME2NUM count."""
    dag = GODag(os.path.join(REPO, '../goatools/tests/data/yangRWC/fig2a.obo'))
    # Map each term name in the DAG to its GO ID.
    go_by_name = {}
    for term in dag.values():
        go_by_name[term.name] = term.item_id
    anno_path = os.path.join(REPO, '../goatools/tests/data/yangRWC/fig2a_small.anno')
    scaled = {}
    for name, count in NAME2NUM.items():
        scaled[name] = count/10
    _get_id2gos(anno_path, dag, go_by_name, scaled)
    print(scaled)
def test_paths_to_top():
    """Compare paths_to_top("GO:0000010") on the mini OBO with the expected paths."""
    godag = GODag(ROOT + "mini_obo.obo")
    found = godag.paths_to_top("GO:0000010")
    wanted = [
        ['GO:0000001', 'GO:0000002', 'GO:0000005', 'GO:0000010'],
        ['GO:0000001', 'GO:0000003', 'GO:0000005', 'GO:0000010'],
        ['GO:0000001', 'GO:0000003', 'GO:0000006', 'GO:0000008', 'GO:0000010'],
    ]
    chk_results(found, wanted)
    print_paths(found)
def __init__(self, genes, resource_manager=None):
    """Store the genes, create an empty multigraph and load GO resources.

    Args:
        genes: Stored unchanged as ``self.genes``.
        resource_manager: Provider of the GO OBO file; a new
            ``ResourceManager`` is created when a falsy value is given.
    """
    self.genes = genes
    self.graph = nx.MultiGraph()
    self.resource_manager = resource_manager if resource_manager else ResourceManager()
    obo_source = self.resource_manager.get_go_obo()
    self.go_dag = GODag(obo_source)
    self.goa = self._load_goa_gaf()
def test_paths_to_top():
    """Compare paths_to_top("GO:0000010") on ./data/mini_obo.obo with the expected paths."""
    # Alternative location when run from the repository root:
    # "./tests/data/mini_obo.obo"
    godag = GODag("./data/mini_obo.obo")
    found = godag.paths_to_top("GO:0000010")
    wanted = [
        ['GO:0000001', 'GO:0000002', 'GO:0000005', 'GO:0000010'],
        ['GO:0000001', 'GO:0000003', 'GO:0000005', 'GO:0000010'],
        ['GO:0000001', 'GO:0000003', 'GO:0000006', 'GO:0000008', 'GO:0000010'],
    ]
    chk_results(found, wanted)
    prt_paths(found)
def get_highest_ic():
    """Return the contents of the highest-IC JSON file, computing it if absent.

    When ``HIGHEST_IC_FILE_PATH`` does not exist, the GO DAG is loaded (with
    its load report sent to the null device) and
    ``compute_highest_inc_parallel`` is run over all of its GO IDs.
    NOTE(review): that call is presumed to create ``HIGHEST_IC_FILE_PATH`` --
    confirm against its definition.

    Returns:
        The object decoded from the JSON file at ``HIGHEST_IC_FILE_PATH``.
    """
    if not os.path.isfile(HIGHEST_IC_FILE_PATH):
        # Close the devnull handle once the DAG has been parsed instead of
        # leaking it for the life of the process.
        with open(os.devnull, 'w') as devnull:
            go_dag = GODag(GO_DAG_FILE_PATH, prt=devnull)
        compute_highest_inc_parallel(list(go_dag.keys()))
    # `with` closes the file even when json.load raises.
    with open(HIGHEST_IC_FILE_PATH, 'r') as ic_file:
        return json.load(ic_file)
def __init__(self, fin_obo):
    """Load the GO DAG from ``fin_obo`` (relative to the parent of this file's directory).

    Attributes set:
        repo: ``<directory of this file>/../``.
        fin_obo: ``repo`` joined with the given relative path.
        dag: ``GODag`` loaded from ``fin_obo``.
        go2obj: GO ID -> term object for every non-obsolete entry of ``dag``.
        goids_all: list of the keys of ``go2obj``.
    """
    self.repo = os.path.join(os.path.dirname(os.path.abspath(__file__)), "../")
    self.fin_obo = os.path.join(self.repo, fin_obo)
    self.dag = GODag(self.fin_obo)
    self.go2obj = {go: o for go, o in self.dag.items() if not o.is_obsolete}
    # A real list, not a dict keys view: on Python 3 a view can be neither
    # indexed, sliced nor shuffled in place.
    self.goids_all = list(self.go2obj)
def test_semantic_similarity():
    """Check that TermCounts reports 50 for terms I, L, M and N of the fig1a example."""
    fin_obo = os.path.join(REPO, 'tests/data/yangRWC/fig1a.obo')
    godag = GODag(fin_obo)
    # Term name -> GO ID, so the assertions can use the figure's labels.
    name2go = {}
    for term in godag.values():
        name2go[term.name] = term.item_id
    fin_anno = os.path.join(REPO, 'tests/data/yangRWC/fig1a.anno')
    assoc = _get_id2gos(fin_anno, godag, name2go)
    tcntobj = TermCounts(godag, assoc)
    for label in ('I', 'L', 'M', 'N'):
        assert tcntobj.gocnts[name2go[label]] == 50
def test_paths_to_top():
    """Verify the paths returned by paths_to_top for GO:0000010 in ./data/mini_obo.obo."""
    # When run from the repository root the file is at
    # "./tests/data/mini_obo.obo" instead.
    obo_dag = GODag("./data/mini_obo.obo")
    paths_found = obo_dag.paths_to_top("GO:0000010")
    paths_wanted = [
        ['GO:0000001', 'GO:0000002', 'GO:0000005', 'GO:0000010'],
        ['GO:0000001', 'GO:0000003', 'GO:0000005', 'GO:0000010'],
        ['GO:0000001', 'GO:0000003', 'GO:0000006', 'GO:0000008', 'GO:0000010'],
    ]
    chk_results(paths_found, paths_wanted)
    prt_paths(paths_found)
def intialize_term_counts():
    """Write a JSON file mapping every GO ID in go-basic.obo to its term count.

    Counts come from ``TermCounts`` built over the associations read from
    ``UNIPROT_ASSOCIATIONS_FILE_PATH``; the resulting dict is dumped to
    ``JSON_INDEXED_FILE_PATH``.
    """
    obo_path = os.path.join(DATA_DIR, "go-basic.obo")
    go_dag = GODag(obo_path)
    reader = IdToGosReader(UNIPROT_ASSOCIATIONS_FILE_PATH, godag=go_dag)
    term_counts = TermCounts(go_dag, reader.get_id2gos('all'))
    go_freq_dict = {term.id: term_counts.get_count(term.id) for term in go_dag.values()}
    # Persist the frequency table as JSON.
    with open(JSON_INDEXED_FILE_PATH, 'w') as json_file:
        json.dump(go_freq_dict, json_file)
# init: load the ontology DAG and obtain the term graph.
#   - Logs a debug message (in Spanish) before and after building
#     self.go_dag = GODag(self.obo_file).
#   - If self.graph_file exists, self.graph is read from it with
#     nx.read_gpickle; otherwise self._build_graph() is called and self.graph
#     is written to self.graph_file with nx.write_gpickle.
#   - A final debug message reports that the term graph was generated.
# NOTE(review): statement nesting is not recoverable from this flattened line;
# whether the final debug call belongs to the else branch or runs in both
# cases must be confirmed against the original file.
def init(self): _log.debug("Cargando archivo de ontologias:" + self.obo_file) self.go_dag = GODag(self.obo_file) _log.debug("Se cargo el archivo:" + self.obo_file) if os.path.exists(self.graph_file): self.graph = nx.read_gpickle(self.graph_file) else: self._build_graph() nx.write_gpickle(self.graph, self.graph_file) _log.debug("Se genero el grafo de terminos")
def main():
    """Print a GO term's lower-level hierarchy.

    Parses the command line, loads the OBO file given by ``--obo`` and writes
    the hierarchy for each requested GO ID (or for all terms when none is
    given) to stdout or to the file named by ``--o``.
    """
    import argparse
    # ArgumentParser's first positional parameter is `prog`; the module
    # docstring belongs in `description`.
    prs = argparse.ArgumentParser(description=__doc__,
                                  formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    prs.add_argument('go_ids', type=str, nargs='*',
                     help='GO Term, e.g. GO:0070458')
    prs.add_argument('--o', default=None, type=str,
                     help="Specifies the name of the output file")
    prs.add_argument('--no_indent', default=False,
                     help="Do not indent GO terms", action='store_true')
    prs.add_argument('--obo', default="go-basic.obo", type=str,
                     help="Location and name of the obo file")
    prs.add_argument('--dash_len', default=1, type=int,
                     help="Printed width of the dashes column")
    prs.add_argument('--max_depth', default=None, type=int,
                     help="max depth for printing relative to GO Term")
    prs.add_argument('--num_child', default=None, action='store_true',
                     help="Print count of total number of children for each GO")
    prs.add_argument('--short', default=False, action='store_true',
                     help="If a branch has already been printed, do not re-print. "
                          "Print '===' instead of dashes to note the point of compression")
    args = prs.parse_args()

    obo_dag = GODag(obo_file=args.obo)
    lenprt = args.dash_len if not args.no_indent else None
    to_file = args.o is not None
    file_out = open(args.o, 'w') if to_file else sys.stdout
    try:
        if args.go_ids:
            for go_id in args.go_ids:
                obo_dag.write_hier(
                    go_id, file_out, len_dash=lenprt, max_depth=args.max_depth,
                    num_child=args.num_child, short_prt=args.short)
        else:
            obo_dag.write_hier_all(
                file_out, len_dash=lenprt, max_depth=args.max_depth,
                num_child=args.num_child, short_prt=args.short)
    finally:
        # Close only a file this function opened; never close sys.stdout.
        if to_file:
            file_out.close()
    if to_file:
        sys.stdout.write(" WROTE: {}\n".format(args.o))
def get_pathway_mapping(organism=9606, ontology='basic', exclude=None, force=False):
    """Return a ``{GO ID: term name}`` mapping filtered by namespace.

    Args:
        organism: Not used by this function.
        ontology: When it contains ``'slim'`` the file name
            ``goslim_generic.obo`` is used, otherwise ``go-basic.obo``.
        exclude: Passed to ``get_namespace_filter`` to build the predicate
            applied to each term's namespace.
        force: When truthy, an existing local OBO file is deleted first so it
            is downloaded again.

    Returns:
        dict of term ID to term name for terms accepted by the filter.
    """
    obo = 'goslim_generic.obo' if 'slim' in ontology else 'go-basic.obo'
    namespace_filter = get_namespace_filter(exclude)
    # Logical `and`, not bitwise `&`: `&` gives wrong answers for truthy
    # non-bool values (e.g. 2 & True == 0).
    if force and os.path.isfile(obo):
        os.remove(obo)
    # NOTE(review): the slim file name is handed to download_go_basic_obo --
    # confirm that this fetches the slim ontology rather than go-basic.
    obo_fname = goatools.base.download_go_basic_obo(obo)
    obodag = GODag(obo_fname)
    return {term_id: term.name for term_id, term in obodag.items()
            if namespace_filter(term.namespace)}
def test_gosubdag_relationships(prt=sys.stdout):
    """Compare the GO ID sets of GoSubDags built with different `relationships` settings."""
    goids = {
        "GO:0032501",
        "GO:0044707",  # alt_id: GO:0032501  # BP 1011 L01 D01 B multicellular organismal process
        "GO:0050874",
        "GO:0007608",  # sensory perception of smell
        "GO:0050911",  # detection of chemical stimulus involved in sensory perception of smell
    }

    # Load the GO DAG twice: without and with the optional 'relationship' attribute.
    fin_obo = os.path.join(REPO, "go-basic.obo")
    download_go_basic_obo(fin_obo, prt, loading_bar=None)
    go2obj_plain = GODag(fin_obo)
    go2obj_relat = GODag(fin_obo, optional_attrs=['relationship'])

    def _subdag_ids(godag, relationships):
        """Build a GoSubDag over the source GO IDs and return its GO ID set."""
        return set(GoSubDag(goids, godag, relationships=relationships, prt=prt).go2obj)

    print("\nCreate GoSubDag with GO DAG containing no relationships.")
    tic = timeit.default_timer()
    goids_plain = _subdag_ids(go2obj_plain, False)
    tic = _rpt_hms(tic, len(goids_plain))

    print("\nCreate GoSubDag while IGNORING relationships")
    goids_false = _subdag_ids(go2obj_relat, False)
    tic = _rpt_hms(tic, len(goids_false))
    assert goids_plain == goids_false

    print("\nCreate GoSubDag while loading only the 'part_of' relationship")
    goids_part_of = _subdag_ids(go2obj_relat, ['part_of'])
    tic = _rpt_hms(tic, len(goids_part_of))
    # Adding 'part_of' must keep every plain GO ID and add at least one more.
    assert goids_plain <= goids_part_of
    assert len(goids_part_of) > len(goids_plain)

    print("\nCreate GoSubDag while loading all relationships")
    goids_true = _subdag_ids(go2obj_relat, True)
    tic = _rpt_hms(tic, len(goids_true))
    assert goids_part_of <= goids_true
    assert len(goids_true) >= len(goids_part_of)
def test_all():
    """Load the configured GO data, then print the full hierarchy of the mini OBO."""
    # The full DAG and the gene2go associations are loaded but not used below.
    full_obo = os.path.join(constants.GO_DIR, constants.GO_FILE_NAME)
    obo_dag = GODag(full_obo)
    assoc_path = os.path.join(constants.GO_DIR, constants.GO_ASSOCIATION_FILE_NAME)
    assoc = read_ncbi_gene2go(assoc_path, no_top=True)

    here = os.path.dirname(os.path.abspath(__file__))
    mini_dag = GODag(os.path.join(here, "data/mini_obo.obo"))
    mini_subdag = GoSubDag(mini_dag.keys(), mini_dag)
    write_hier_all(mini_subdag, sys.stdout)
def test_semantic_i150():
    """Check that semantic_similarity of every GO term with itself is 1.0."""
    godag = GODag(os.path.join(REPO, 'tests/data/yangRWC/fig1a.obo'))
    # set() visits each term object once even if the DAG lists it under several keys.
    for goid in (term.item_id for term in set(godag.values())):
        assert semantic_similarity(goid, goid, godag) == 1.0
def test_i148b_semsim_lin(do_plt=False):
    """Test for issue 148, Lin Similarity if a term has no annotations.

    Computes Resnik and Lin similarity for every unordered pair of GO IDs
    (pairs of a term with itself included) in the fig2a example, prints both
    tables and checks the Lin values with ``_chk_lin``.

    Args:
        do_plt: When true, ``_do_plt`` is called with the term counts and DAG.
    """
    fin_gaf = os.path.join(REPO, 'tests/data/yangRWC/fig2a_nonleaf0.gaf')
    godag = GODag(os.path.join(REPO, "tests/data/yangRWC/fig2a.obo"))
    annoobj = GafReader(fin_gaf, godag=godag)
    associations = annoobj.get_id2gos('CC')
    tcntobj = TermCounts(godag, associations)

    if do_plt:
        _do_plt(tcntobj, godag)

    goids = list(godag.keys())

    # Calculate Resnik values
    p2r = {frozenset([a, b]): resnik_sim(a, b, godag, tcntobj)
           for a, b in combo_w_rplc(goids, 2)}
    _prt_values('Resnik', goids, p2r)

    # Calculate Lin values
    p2l = {frozenset([a, b]): lin_sim(a, b, godag, tcntobj)
           for a, b in combo_w_rplc(goids, 2)}
    _prt_values('Lin', goids, p2l)
    _chk_lin(p2l)
    # The duplicated Resnik/Lin section that used to follow an early `return`
    # was unreachable and has been removed.
def __init__(self, dir, params):
    """Set up logging and load diseases, network, model weights and the GO study.

    Args:
        dir: Forwarded to the parent initializer; ``self.dir`` is then used
            as the location of ``experiment.log``.
        params: Forwarded to the parent initializer; ``params["model_path"]``
            locates the pickled models.
    """
    super().__init__(dir, params)
    log_file = os.path.join(self.dir, 'experiment.log')
    set_logger(log_file, level=logging.INFO, console=True)

    logging.info("Loading disease associations...")
    self.diseases_dict = load_diseases(self.params["associations_path"],
                                       self.params["disease_subset"],
                                       exclude_splits=['none'])

    logging.info("Loading network...")
    self.network = Network(self.params["ppi_network"])
    degree_by_node = dict(self.network.nx.degree())
    self.degrees = np.array(list(degree_by_node.values()))

    logging.info("Loading weights...")
    models_file = os.path.join(params["model_path"], "models", "models.tar")
    with open(models_file, "rb") as f:
        split_to_model = pickle.load(f)
    # Average the [0, 0] entry of 'ci_weight' over all splits.
    per_split = [model['ci_weight'][0, 0].numpy() for model in split_to_model.values()]
    self.ci_weights = np.mean(per_split, axis=0)
    self.ci_weights_norm = self.ci_weights / np.sqrt(self.degrees)

    logging.info("Loading enrichment study...")
    geneid2go = read_ncbi_gene2go("data/go/gene2go.txt", taxids=[9606])
    obodag = GODag("data/go/go-basic.obo")
    self.go_study = GOEnrichmentStudy(self.network.get_names(),
                                      geneid2go,
                                      obodag,
                                      propagate_counts=True,
                                      alpha=0.05,
                                      methods=['fdr_bh'])
def test_all():
    """Run each write_hier test helper on ./data/mini_obo.obo, writing to stdout."""
    godag = GODag("./data/mini_obo.obo")
    stream = sys.stdout
    test_write_hier_all(godag, stream)
    test_write_hier_norep(godag, stream)
    test_write_hier_lim(godag, stream)
    test_write_hier_mrk(godag, stream)
def __init__(self, go_obo_path='data/go.obo'):
    """Load the ontology and ORF annotations and build the enrichment study.

    Args:
        go_obo_path: Path of the OBO file given to ``GODag``.
    """
    canonical_orfs = paper_orfs
    self.obodag = GODag(go_obo_path)

    # Keep only annotated ORFs whose index also appears in the canonical set.
    annotated = read_sgd_orfs()
    self.orfs_with_go = annotated.join(canonical_orfs[[]], how='inner')

    # Gene name -> set of GO annotations (comma-separated in `ontology`).
    assoc = defaultdict(set)
    for _, gene in self.orfs_with_go.iterrows():
        assoc[gene['name']] = set(gene.ontology.split(','))
    self.assoc = assoc

    self.methods = ['fdr_bh', 'bonferroni']
    # Handle passed as the study's `log` argument; it is kept on the
    # instance and not closed here.
    self.devnull = open('/dev/null', 'w')

    self.goeaobj = GOEnrichmentStudy(
        assoc.keys(),             # population: every annotated gene name
        assoc,                    # gene name -> GO IDs
        self.obodag,              # ontology
        propagate_counts=False,
        alpha=0.05,               # significance cut-off
        methods=self.methods,
        log=self.devnull)
def __GO_enrich__(self):
    """Build ``self.goeaobj``, a per-namespace GO enrichment study for human genes.

    ``go-basic.obo`` is downloaded when it is not in the working directory.
    NCBI gene2go associations are downloaded and read for taxid 9606.
    """
    go_file = "go-basic.obo"
    if not os.path.exists(go_file):
        download_go_basic_obo()
    obodag = GODag(go_file)

    # NCBI gene2go annotations restricted to human.
    fin_gene2go = download_ncbi_associations()
    objanno = Gene2GoReader(fin_gene2go, taxids=[9606])

    # namespace (BP / MF / CC) -> {NCBI GeneID: set of GO IDs}
    ns2assoc = objanno.get_ns2assc()

    self.goeaobj = GOEnrichmentStudyNS(
        GeneID2nt_hum.keys(),     # population: keys of GeneID2nt_hum
        ns2assoc,                 # geneID/GO associations per namespace
        obodag,                   # ontology
        propagate_counts=False,
        alpha=0.05,               # significance cut-off
        methods=['fdr_bh'])       # multiple-test correction method
def test_semantic_similarity():
    """Check TermCounts gene counts for terms A-G of the fig2a example."""
    godag = GODag(os.path.join(REPO, '../goatools/tests/data/yangRWC/fig2a.obo'))
    file_id2gos = os.path.join(REPO, '../goatools/tests/data/yangRWC/fig2a.anno')
    name2go = {}
    for term in godag.values():
        name2go[term.name] = term.item_id
    assoc = _get_id2gos(file_id2gos, godag, name2go, NAME2NUM)
    tcntobj = TermCounts(godag, assoc)
    # N_v: number of unique genes annotated to a GO term PLUS genes annotated
    # to a descendant (Python equivalent of Java getNumberOfAnnotations).
    expected = (('A', 100), ('B', 40), ('C', 50), ('D', 10),
                ('E', 10), ('F', 10), ('G', 30))
    for label, count in expected:
        assert tcntobj.gocnts[name2go[label]] == count, tcntobj.gocnts
def show_go_dag_for_terms(terms, add_relationships=True):
    """Plot the GO sub-DAG for ``terms`` and return it as an ``Image``.

    Args:
        terms: GO IDs; a pandas Series is converted to a list first.
        add_relationships: When true, the DAG is loaded with the
            'relationship' attribute and the sub-DAG follows relationships.

    Returns:
        ``Image('geneinfo_cache/plot.png')``, or ``None`` when ``terms`` is empty.
    """
    if type(terms) is pd.core.series.Series:
        terms = terms.tolist()
    if not terms:
        return

    plot_file = 'geneinfo_cache/plot.png'
    optional_attrs = ['relationship', 'def'] if add_relationships else ['def']

    # Everything printed while downloading, loading and plotting is discarded.
    with open(os.devnull, 'w') as null, redirect_stdout(null):
        obo_fname = download_and_move_go_basic_obo(prt=null)
        file_gene2go = download_ncbi_associations(prt=null)
        obodag = GODag("geneinfo_cache/go-basic.obo",
                       optional_attrs=optional_attrs, prt=null)
        gosubdag = GoSubDag(terms, obodag, relationships=add_relationships)
        GoSubDagPlot(gosubdag).plt_dag(plot_file)
    return Image(plot_file)
def check_group_enrichment(tested_gene_file_name, total_gene_file_name):
    """Run a GO enrichment study of the tested genes against the total gene list.

    Missing GO files are downloaded into ``constants.GO_DIR`` first (the
    association archive is gunzipped). Results with ``p_fdr_bh <= 0.05`` are
    written through ``print_to_excel`` as a single row.

    Args:
        tested_gene_file_name: File with the study gene list.
        total_gene_file_name: File with the background (population) gene list.
    """
    total_gene_list = load_gene_list(total_gene_file_name)
    tested_gene = load_gene_list(tested_gene_file_name)

    # Ontology file.
    if not os.path.exists(os.path.join(constants.GO_DIR, constants.GO_FILE_NAME)):
        download(constants.GO_OBO_URL, constants.GO_DIR)
    obo_dag = GODag(os.path.join(constants.GO_DIR, constants.GO_FILE_NAME))

    # Association file: download the gzip archive and decompress it.
    if not os.path.exists(os.path.join(constants.GO_DIR, constants.GO_ASSOCIATION_FILE_NAME)):
        download(constants.GO_ASSOCIATION_GENE2GEO_URL, constants.GO_DIR)
        archive = os.path.join(constants.GO_DIR,
                               os.path.basename(constants.GO_ASSOCIATION_GENE2GEO_URL))
        with gzip.open(archive, 'rb') as f_in:
            target = os.path.join(constants.GO_DIR, constants.GO_ASSOCIATION_FILE_NAME)
            with open(target, 'wb') as f_out:
                shutil.copyfileobj(f_in, f_out)
    assoc = read_ncbi_gene2go(
        os.path.join(constants.GO_DIR, constants.GO_ASSOCIATION_FILE_NAME), no_top=True)

    population = [int(cur) for cur in ensembl2entrez_convertor(total_gene_list)]
    g = GOEnrichmentStudy(population, assoc, obo_dag, methods=["bonferroni", "fdr_bh"])
    study = [int(cur) for cur in ensembl2entrez_convertor(tested_gene)]
    g_res = g.run_study(study)

    GO_results = [(cur.NS, cur.GO, cur.goterm.name, cur.p_uncorrected, cur.p_fdr_bh)
                  for cur in g_res if cur.p_fdr_bh <= 0.05]
    if GO_results:
        # Transpose rows into one sequence per column.
        go_ns, go_terms, go_names, uncorrectd_pvals, FDRs = zip(*GO_results)
    else:
        go_ns, go_terms, go_names, uncorrectd_pvals, FDRs = [], [], [], [], []

    # One row, with each column's values joined into a multi-line cell.
    cells = (
        "\r\n".join(e2g_convertor(tested_gene)),
        "\r\n".join(go_ns),
        "\r\n".join(go_terms),
        "\r\n".join(go_names),
        "\r\n".join(map(str, uncorrectd_pvals)),
        "\r\n".join(map(str, FDRs)),
    )
    output_rows = [cells]
    print_to_excel(output_rows, tested_gene_file_name, total_gene_file_name)
def __init__(self, uniprot_file_path, csv_file_path, save_dir, mode='EC'):
    """Configure the object and try to load a pickled EC-class dict.

    Args:
        uniprot_file_path: Stored as ``self.uniprot_file_path``.
        csv_file_path: Stored as ``self.uniprot_csv``.
        save_dir: Output directory; ``<save_dir>/csv_by_EC/class2id_<mode>.p``
            is loaded when it can be opened.
        mode: Used in the pickle file name (default ``'EC'``).
    """
    self.depth = 4  # depth of labels to consider
    self.mode = mode
    self.GODag = GODag('/net/data.isilon/igem/2017/data/gene_ontology/go.obo',
                       optional_attrs=['relationship'])
    self.max_depth = 4  # max depth in the GO DAG to consider
    self.max_write = 1000000  # max number of labels written for one class
    self.write_count = 0  # number of labels written so far
    self.save_dir = save_dir
    self.uniprot_csv = csv_file_path
    self.uniprot_file_path = uniprot_file_path
    self.class_to_id_EC = _recursively_default_dict()
    self.class_to_id_GO = {}
    self.filter_minlength = True
    self.minlength = 175
    self.filter_AA = True
    self.train_dataset_csv_path = '/net/data.isilon/igem/2017/data/uniprot_with_EC/SAfetyNEt/'
    print(save_dir)

    # Load the class dict from an earlier run if it is there.
    pickle_path = os.path.join(self.save_dir, 'csv_by_EC',
                               'class2id_{}.p'.format(self.mode))
    try:
        with open(pickle_path, "rb") as pickle_f:
            self.class_to_id_EC = pickle.load(pickle_f)
        # Freeze the default dict so unknown keys raise instead of being created.
        self.class_to_id_EC.default_factory = None
        print('Loaded EC-class dict.')
    except OSError:
        print('Failed to load EC-class dict. Generating EC-class dict.')
def test_all():
    """Write every hierarchy report for the mini OBO to stdout and print the load time."""
    here = os.path.dirname(os.path.abspath(__file__))
    dag_fin = os.path.join(here, "data/mini_obo.obo")

    # Time only the DAG / sub-DAG construction.
    started = timeit.default_timer()
    godag = GODag(dag_fin)
    gosubdag = GoSubDag(godag.keys(), godag)
    finished = timeit.default_timer()

    stream = sys.stdout
    write_hier_all(gosubdag, stream)
    write_hier_norep(gosubdag, stream)
    write_hier_lim(gosubdag, stream)
    write_hier_mrk_lst(gosubdag, stream)
    write_hier_mrk_dct(gosubdag, stream)
    write_hier_up(gosubdag, stream)

    elapsed = datetime.timedelta(seconds=(finished - started))
    sys.stdout.write("Elapsed HMS: {}\n\n".format(str(elapsed)))
def plotGO(clusterIDs, clusters, outdir, base):
    """Plot the GO terms of each cluster, one PNG per cluster and category.

    Each file is written to ``<outdir>/<base>_<cluster id>_<category>.png``.
    When ``plot_gos`` raises ``KeyError`` for a GO ID, that ID is removed
    from the category's ID collection and the plot is retried.

    Args:
        clusterIDs: Iterable of keys into ``clusters``.
        clusters: Mapping where ``clusters[id]['go']['terms']`` maps a
            category to its GO IDs. These ID collections are modified in
            place when unknown IDs are dropped.
        outdir: Output directory.
        base: File-name prefix.
    """
    obodag = GODag("../../obo/go.obo")
    for cluster_id in clusterIDs:
        goIDs = clusters[cluster_id]['go']['terms']
        for category in goIDs.keys():
            ids = goIDs[category]
            png = "{}/{}_{}_{}.png".format(outdir, base, cluster_id, category)
            while True:
                try:
                    plot_gos(png, ids, obodag)
                    break
                except KeyError as e:
                    value = str(e).replace("'", '')
                    if value not in ids:
                        # The KeyError did not come from one of our IDs;
                        # retrying would loop forever.
                        raise
                    # Drop the offending ID from this category's collection
                    # (the old code called .remove on the category dict).
                    ids.remove(value)
def get_goeaobj(methods=None):
    """Return a GOEnrichmentStudy built from the GO-slim test data under ROOT.

    Args:
        methods: Multiple-test correction methods, passed to
            ``GOEnrichmentStudy``.

    Returns:
        The configured ``GOEnrichmentStudy`` object.
    """
    obo_dag = GODag(ROOT + "goslim_generic.obo")
    assoc = read_associations(ROOT + "slim_association", no_top=True)
    # Read the population IDs and close the file promptly.
    with open(ROOT + "small_population") as ifstrm:
        popul_ids = [line.rstrip() for line in ifstrm]
    goeaobj = GOEnrichmentStudy(popul_ids, assoc, obo_dag, methods=methods)
    return goeaobj
def get_goeaobj(methods=None):
    """Return a GOEnrichmentStudy built from go-basic.obo and ../data test files.

    Args:
        methods: Multiple-test correction methods, passed to
            ``GOEnrichmentStudy``.

    Returns:
        The configured ``GOEnrichmentStudy`` object.
    """
    obo_dag = GODag("go-basic.obo")
    assoc = read_associations("../data/association", no_top=True)
    # Read the population IDs and close the file promptly.
    with open("../data/population") as ifstrm:
        popul_ids = [line.rstrip() for line in ifstrm]
    goeaobj = GOEnrichmentStudy(popul_ids, assoc, obo_dag, methods=methods)
    return goeaobj
class _Run(object):
    """Hold go-basic.obo loaded without and with relationships, plus GoSubDags of both."""

    obo = os.path.join(os.path.dirname(os.path.abspath(__file__)), "../../go-basic.obo")

    def __init__(self):
        download_go_basic_obo(self.obo, sys.stdout, loading_bar=None)
        self.godag_r0 = GODag(self.obo)
        self.godag_r1 = GODag(self.obo, optional_attrs=set(['relationship']))
        self.goids = list(set(o.id for o in self.godag_r0.values()))
        # Both reports below are measured from this single start time.
        tic = timeit.default_timer()
        # GoSubDag over all GO IDs, without relationships
        self.gosubdag_r0 = GoSubDag(self.goids, self.godag_r0, prt=None)
        self._report(tic, "r0", self.gosubdag_r0)
        # GoSubDag over all GO IDs, with relationships
        self.gosubdag_r1 = GoSubDag(self.goids, self.godag_r1, prt=None, relationships=True)
        self._report(tic, "r1", self.gosubdag_r1)

    @staticmethod
    def _report(tic, tag, gosubdag):
        """Print elapsed time with the GO and source counts of a GoSubDag."""
        template = "GoSubDag " + tag + " {N:4} GOs {S:3} srcs"
        prt_hms(tic, template.format(N=len(gosubdag.go2obj), S=len(gosubdag.go_sources)))

    def prt_cnts(self, cnts):
        """Print a dict with each value list of `cnts` replaced by its summary string."""
        summary = {}
        for key, vals in cnts.items():
            summary[key] = self.str_stats(vals)
        print(summary)

    @staticmethod
    def str_stats(vals):
        """Return "(min max) STD=<rounded std>" for a sequence of values."""
        described = stats.describe(vals)
        lowest, highest = described.minmax[0], described.minmax[1]
        std = int(round(np.sqrt(described.variance)))
        return "({m} {M}) STD={STD:,}".format(m=lowest, M=highest, STD=std)

    def get_gosubdag_r0(self, goids):
        """Return a GoSubDag for `goids` built on the DAG loaded without relationships."""
        tic = timeit.default_timer()
        subdag = GoSubDag(goids, self.godag_r0, relationships=None, prt=None)
        self._report(tic, "r0", subdag)
        return subdag

    def get_gosubdag_r1(self, goids):
        """Return a GoSubDag for `goids` built on the DAG loaded with relationships."""
        tic = timeit.default_timer()
        subdag = GoSubDag(goids, self.godag_r1, relationships=True, prt=None)
        self._report(tic, "r1", subdag)
        return subdag

    def get_goids_rand(self, qty):
        """Shuffle the stored GO IDs in place and return the first `qty` of them."""
        shuffle(self.goids)
        return self.goids[:qty]
def main():
    """Write ``microarray.assoc``: gene -> biological-process GO terms from TAIR.

    GO terms are selected from the default ``GODag()`` when their namespace is
    ``biological_process`` and their level is at least 1. Each non-comment row
    of ``gene_association.tair`` whose first column is ``TAIR`` and whose GO ID
    is selected contributes that GO ID to the gene named by the first
    ``|``-separated entry of column 11. Output lines are
    ``<gene>,<go1>;<go2>;...``, sorted by gene, after a ``#gene,go_terms`` header.
    """
    data = collections.defaultdict(set)
    g = GODag()

    selection = set()
    for name, rec in g.items():
        if rec.namespace != "biological_process" or rec.level < 1:
            continue
        selection.add(rec.id)

    # Example row (tab separated):
    # ['TAIR', 'locus:2185485', 'AT5G14850', '', 'GO:0000030',
    #  'TAIR:Communication:501714663', 'ISS', 'NCBI_gi:1552169|NCBI_gi:7634741',
    #  'F', 'AT5G14850', 'AT5G14850|T9L3.150|T9L3_150', 'protein', 'taxon:3702',
    #  '20021003', 'TIGR', '', 'TAIR:locus:2185485\n']
    with open("gene_association.tair") as fp:
        for row in fp:
            if row[0] == "!":
                continue  # header/comment line
            atoms = row.split("\t")
            domain, name, go = atoms[0], atoms[10], atoms[4]
            name = name.split("|", 1)[0]
            if go in selection and domain == "TAIR":
                data[name].add(go)

    # `with` closes (and flushes) the output file; write() works on both
    # Python 2 and 3, unlike the `print >>` statement.
    with open("microarray.assoc", "w") as fw:
        fw.write("#gene,go_terms\n")
        for key, val in sorted(data.items()):
            fw.write("%s,%s\n" % (key, ";".join(sorted(val))))
class Data(object):
    """Holds data used in test."""

    def __init__(self, fin_obo):
        """Load the GO DAG from ``fin_obo`` (relative to the parent of this file's directory).

        Attributes set:
            repo: ``<directory of this file>/../``.
            fin_obo: ``repo`` joined with the given relative path.
            dag: ``GODag`` loaded from ``fin_obo``.
            go2obj: GO ID -> term object for every non-obsolete entry.
            goids_all: list of the keys of ``go2obj``.
        """
        self.repo = os.path.join(os.path.dirname(os.path.abspath(__file__)), "../")
        self.fin_obo = os.path.join(self.repo, fin_obo)
        self.dag = GODag(self.fin_obo)
        self.go2obj = {go: o for go, o in self.dag.items() if not o.is_obsolete}
        # Must be a real list: get_goids shuffles it in place and slices it,
        # neither of which works on a Python 3 dict keys view.
        self.goids_all = list(self.go2obj)

    def get_goids(self, num):
        """Return a set of ``num`` randomly chosen GO IDs.

        ``self.goids_all`` is shuffled in place as a side effect. Fewer than
        ``num`` IDs are returned when fewer are available.
        """
        shuffle(self.goids_all)
        return set(self.goids_all[:num])
def __init__(self):
    """Download go-basic.obo if needed, load it twice and build a GoSubDag of each.

    ``godag_r0``/``gosubdag_r0`` use the plain DAG; ``godag_r1``/``gosubdag_r1``
    use the DAG loaded with the optional 'relationship' attribute.
    """
    download_go_basic_obo(self.obo, sys.stdout, loading_bar=None)
    self.godag_r0 = GODag(self.obo)
    self.godag_r1 = GODag(self.obo, optional_attrs=set(['relationship']))
    self.goids = list(set(o.id for o in self.godag_r0.values()))

    report = "GoSubDag {R} {N:4} GOs {S:3} srcs"
    # Both reports below are measured from this single start time.
    tic = timeit.default_timer()

    # GoSubDag (plain)
    self.gosubdag_r0 = GoSubDag(self.goids, self.godag_r0, prt=None)
    msg_r0 = report.format(R="r0",
                           N=len(self.gosubdag_r0.go2obj),
                           S=len(self.gosubdag_r0.go_sources))
    prt_hms(tic, msg_r0)

    # GoSubDag with relationships
    self.gosubdag_r1 = GoSubDag(self.goids, self.godag_r1, prt=None, relationships=True)
    msg_r1 = report.format(R="r1",
                           N=len(self.gosubdag_r1.go2obj),
                           S=len(self.gosubdag_r1.go_sources))
    prt_hms(tic, msg_r1)
help="Do not indent GO terms", action='store_true') p.add_argument('--obo', default="go-basic.obo", type=str, help="Location and name of the obo file") p.add_argument('--dash_len', default=1, type=int, help="Printed width of the dashes column") p.add_argument('--max_depth', default=None, type=int, help="max depth for printing relative to GO Term") p.add_argument('--num_child', default=None, action='store_true', help="Print count of total number of children for each GO") p.add_argument('--short', default=False, action='store_true', help="If a branch has already been printed, do not re-print." "Print '===' instead of dashes to note the point of compression") args = p.parse_args() obo_dag = GODag(obo_file=args.obo) file_out = sys.stdout if args.o is None else open(args.o, 'w') lenprt = args.dash_len if not args.no_indent else None if args.go_ids: for go_id in args.go_ids: obo_dag.write_hier( go_id, file_out, len_dash=lenprt, max_depth=args.max_depth, num_child=args.num_child, short_prt=args.short) else: obo_dag.write_hier_all(
dest='draw_parents', help="Do not draw parents of the query term") p.add_option("--disable-draw-children", action="store_false", dest='draw_children', help="Do not draw children of the query term") p.set_defaults(draw_parents=True) p.set_defaults(draw_children=True) opts, args = p.parse_args() if not len(args): obo_file = "go-basic.obo" else: obo_file = args[0] assert os.path.exists(obo_file), "file %s not found!" % obo_file g = GODag(obo_file) if opts.desc: g.write_dag() # run a test case if opts.term is not None: rec = g.query_term(opts.term, verbose=True) g.draw_lineage([rec], engine=opts.engine, gml=opts.gml, draw_parents=opts.draw_parents, draw_children=opts.draw_children)
class WrSubObo(object):
    """Read a large GO-DAG from an obo file. Write a subset GO-DAG into a small obo file."""

    def __init__(self, fin_obo=None, optional_attrs=None, load_obsolete=None):
        """Load the GO DAG from ``fin_obo`` when a file name is given.

        Args:
            fin_obo: Source obo file; when None no DAG is loaded.
            optional_attrs: Passed to ``GODag``; relationships are enabled
                when it contains ``'relationship'``.
            load_obsolete: Passed to ``GODag``.
        """
        self.fin_obo = fin_obo
        self.godag = GODag(fin_obo, optional_attrs, load_obsolete) if fin_obo is not None else None
        self.relationships = optional_attrs is not None and 'relationship' in optional_attrs

    def wrobo(self, fout_obo, goid_sources):
        """Write a subset obo file containing GO ID sources and their parents."""
        goids_all = self._get_goids_all(goid_sources)
        with open(fout_obo, 'w') as prt:
            self._prt_info(prt, goid_sources, goids_all)
            # Argument order must match prt_goterms(fin_obo, goids, prt).
            self.prt_goterms(self.fin_obo, goids_all, prt)
            print(" WROTE {N} GO TERMS: {OBO}\n".format(N=len(goids_all), OBO=fout_obo))

    @staticmethod
    def prt_goterms(fin_obo, goids, prt, b_prt=True):
        """Copy the header, the [Term] stanzas for `goids` and all [Typedef] stanzas.

        Args:
            fin_obo: obo file to read.
            goids: Container of GO IDs whose stanzas are copied.
            prt: Writable stream receiving the output.
            b_prt: Whether lines before the first stanza are copied.
        """
        b_trm = False  # True between a "[Term]" line and its "id:" line
        with open(fin_obo) as ifstrm:
            for line in ifstrm:
                if not b_trm:
                    if line[:6] == "[Term]":
                        # Hold the stanza header back until the id is known.
                        b_trm = True
                        b_prt = False
                    elif line[:9] == "[Typedef]":
                        # 9-character slice: a 6-character slice can never
                        # equal "[Typedef]", so typedefs were never re-enabled.
                        b_prt = True
                else:
                    if line[:6] == 'id: GO':
                        b_trm = False
                        b_prt = line[4:14] in goids
                        if b_prt:
                            prt.write("[Term]\n")
                if b_prt:
                    prt.write(line)

    @staticmethod
    def get_goids(fin_obo, name):
        """Get GO IDs whose name contains the given name.

        A [Term] stanza is evaluated when the first line without a ':' (such
        as a blank line) after it is read.
        """
        goids = set()
        # pylint: disable=unsubscriptable-object
        goterm = None  # fields of the [Term] stanza being read, else None
        with open(fin_obo) as ifstrm:
            for line in ifstrm:
                if goterm is not None:
                    semi = line.find(':')
                    if semi != -1:
                        # "key: value" -> store value without "key: " prefix
                        goterm[line[:semi]] = line[semi+2:].rstrip()
                    else:
                        if name in goterm['name']:
                            goids.add(goterm['id'])
                        goterm = None
                elif line[:6] == "[Term]":
                    goterm = {}
        return goids

    def _get_goids_all(self, go_sources):
        """Given GO ID sources and optionally the relationship attribute, return all GO IDs."""
        go2obj_user = {}
        objrel = CurNHigher(self.relationships, self.godag)
        objrel.get_id2obj_cur_n_high(go2obj_user, go_sources)
        goids = set(go2obj_user)
        # Include alternate IDs so their stanzas are matched as well.
        for goterm in go2obj_user.values():
            if goterm.alt_ids:
                goids.update(goterm.alt_ids)
        return goids

    def _prt_info(self, prt, goid_sources, goids_all):
        """Print information describing how this obo subset was created."""
        prt.write("! Contains {N} GO IDs. Created using {M} GO sources:\n".format(
            N=len(goids_all), M=len(goid_sources)))
        for goid in goid_sources:
            prt.write("! {GO}\n".format(GO=str(self.godag.get(goid, ""))))
        prt.write("\n")
def __init__(self, fin_obo):
    """Load the GO DAG from ``fin_obo`` (relative to the parent of this file's directory).

    Attributes set:
        repo: ``<directory of this file>/../``.
        fin_obo: ``repo`` joined with the given relative path.
        dag: ``GODag`` loaded from ``fin_obo``.
        go2obj: GO ID -> term object for every non-obsolete entry of ``dag``.
        goids_all: list of the keys of ``go2obj``.
    """
    self.repo = os.path.join(os.path.dirname(os.path.abspath(__file__)), "../")
    self.fin_obo = os.path.join(self.repo, fin_obo)
    self.dag = GODag(self.fin_obo)
    self.go2obj = {go: o for go, o in self.dag.items() if not o.is_obsolete}
    # A real list, not a dict keys view: on Python 3 a view can be neither
    # indexed, sliced nor shuffled in place.
    self.goids_all = list(self.go2obj)
def __init__(self, fin_obo=None, optional_attrs=None, load_obsolete=None):
    """Remember the source obo file and load its GO DAG when one is given.

    Args:
        fin_obo: Source obo file name; ``self.godag`` is None when omitted.
        optional_attrs: Passed to ``GODag``; ``self.relationships`` is true
            when it contains ``'relationship'``.
        load_obsolete: Passed to ``GODag``.
    """
    self.fin_obo = fin_obo
    if fin_obo is None:
        self.godag = None
    else:
        self.godag = GODag(fin_obo, optional_attrs, load_obsolete)
    if optional_attrs is None:
        self.relationships = False
    else:
        self.relationships = 'relationship' in optional_attrs
# Command-line driver: optionally describe the DAG and/or draw one term's lineage.
import optparse

p = optparse.OptionParser("%prog [obo_file]")
p.add_option("--description", dest="desc",
             help="write term descriptions to stdout"
                  " from the obo file specified in args", action="store_true")
# The two help fragments are concatenated; the leading space keeps
# "children" and "of" apart.
p.add_option("--term", dest="term",
             help="write the parents and children"
                  " of the query term", action="store", type="string",
             default=None)

(options, args) = p.parse_args()

if not len(args):
    obo_file = None
else:
    obo_file = args[0]
    assert os.path.exists(obo_file), "file %s not found!" % obo_file

# Without an argument GODag falls back to its own default obo file.
if obo_file is None:
    g = GODag()
else:
    g = GODag(obo_file)

if options.desc:
    g.write_dag()

# run a test case
if options.term is not None:
    rec = g.query_term(options.term, verbose=True)
    g.draw_lineage(rec, dpi=50, verbose=True)
if __name__ == '__main__':
    # Command-line driver: optionally describe the DAG and/or draw one term's lineage.
    import optparse

    p = optparse.OptionParser("%prog [obo_file]")
    p.add_option("--description", dest="desc",
                 help="write term descriptions to stdout"
                      " from the obo file specified in args", action="store_true")
    # The two help fragments are concatenated; the leading space keeps
    # "children" and "of" apart.
    p.add_option("--term", dest="term",
                 help="write the parents and children"
                      " of the query term", action="store", type="string",
                 default=None)
    p.add_option("--gml", action="store_true",
                 help="Write GML output (for Cytoscape) [default: %default]")

    opts, args = p.parse_args()

    if not len(args):
        obo_file = "gene_ontology.1_2.obo"
    else:
        obo_file = args[0]
        assert os.path.exists(obo_file), "file %s not found!" % obo_file

    g = GODag(obo_file)

    if opts.desc:
        g.write_dag()

    # run a test case
    if opts.term is not None:
        rec = g.query_term(opts.term, verbose=True)
        g.draw_lineage([rec], gml=opts.gml)