def test_obsolete_term_repair_withfrom():
    """An obsolete GO term in the with/from column is repaired to its replacement."""
    line_fields = [
        "ZFIN", "ZFIN:ZDB-GENE-980526-362", "acts_upstream_of_or_within",
        "GO:0007155", "PMID:15494018", "ECO:0000305",
        "GO:0005913|GO:1,GO:4|ZFIN:ZDB-MRPHLNO-010101-1,MGI:1232453",
        "", "20041026", "ZFIN", "",
        "contributor=GOC:zfin_curators|model-state=production|noctua-model-id=gomodel:ZFIN_ZDB-GENE-980526-362",
    ]
    ontology = OntologyFactory().create(ALT_ID_ONT)
    parser = GpadParser(config=assocparser.AssocParserConfig(ontology=ontology,
                                                             rule_set=assocparser.RuleSet.ALL))
    assoc = parser.parse_line("\t".join(line_fields)).associations[0]
    expected = [
        # Obsolete GO:0005913 is repaired to its replacement term GO:0005912.
        ConjunctiveSet(elements=[Curie(namespace='GO', identity='0005912')]),
        # Remaining sets after repair; non-GO elements are untouched,
        # obsolete or not.
        ConjunctiveSet(elements=[Curie(namespace='GO', identity='2'),
                                 Curie(namespace='GO', identity='3')]),
        ConjunctiveSet(elements=[Curie(namespace='ZFIN', identity='ZDB-MRPHLNO-010101-1'),
                                 Curie(namespace='MGI', identity='1232453')]),
    ]
    assert expected == assoc.evidence.with_support_from
def test_aspect_fill_for_obsolete_terms():
    """
    Aspect is filled in even when the annotated term is obsolete.

    GO:4 is obsolete and has no aspect (hasOBONamespace) in the obsolete.json
    ontology; GO:3 is its replacement term. GPAD lines carry no aspect data,
    so the parser must derive it after term repair.
    """
    line_fields = [
        "MGI", "MGI:105128", "involved_in", "GO:4", "PMID:25901318",
        "ECO:0000314", "", "", "20190517", "MGI", "",
        "contributor=http://orcid.org/0000-0002-9796-7693|model-state=production|noctua-model-id=gomodel:5c4605cc00004132",
    ]
    ontology = OntologyFactory().create(ALT_ID_ONT)
    parser = GpadParser(config=assocparser.AssocParserConfig(ontology=ontology,
                                                             rule_set=assocparser.RuleSet.ALL))
    result = parser.parse_line("\t".join(line_fields))
    assoc = result.associations[0]
    # GO:4 is repaired to its replacement term GO:3 ...
    assert assoc.object.id == Curie("GO", "3")
    # ... and the aspect is filled in rather than left empty.
    assert assoc.aspect == 'P'
def test_parse():
    """Parse the MGI GPAD fixture with group metadata and print the report.

    Fix: the original opened both the YAML metadata file and the GPAD file
    without ever closing them; context managers close the handles
    deterministically.
    """
    with open("tests/resources/mgi.dataset.yaml") as metadata_file:
        group_metadata = yaml.load(metadata_file, Loader=yaml.FullLoader)
    p = GpadParser(config=assocparser.AssocParserConfig(group_metadata=group_metadata))
    test_gpad_file = "tests/resources/mgi.test.gpad"
    with open(test_gpad_file, "r") as gpad_file:
        results = p.parse(gpad_file)
    print(p.report.to_markdown())
def test_skim_gpad():
    """Skim the PomBase GPAD and sanity-check every (subject, name, object) tuple.

    Fix: the original leaked the file handle; a context manager closes it
    deterministically.
    """
    p = GpadParser()
    p.config.ecomap = EcoMap()
    with open(POMBASE_GPAD, "r") as gpad_file:
        results = p.skim(gpad_file)
    assert len(results) == 1984
    for r in results:
        print(str(r))
        (s, sn, o) = r
        # Objects are GO terms; subjects are PomBase genes or PRO entities.
        assert o.startswith('GO:')
        assert s.startswith('PomBase:') or s.startswith('PR:')
def test_invalid_goid_in_gpad():
    """Parsing a GPAD against an ontology not tuned for it yields many report messages.

    Fix: the original leaked the file handle and bound the parse result to an
    unused variable; the handle is now closed via a context manager.
    """
    # Note: this ontology is a subset of GO extracted using the GAF, not GPAD
    p = GpadParser()
    p.config.ontology = OntologyFactory().create(ONT)
    with open(POMBASE_GPAD, "r") as gpad_file:
        p.parse(gpad_file, skipheader=True)
    # we expect errors since ONT is not tuned for the GPAD file
    assert len(p.report.messages) > 500
    print(p.report.to_markdown())
def filter_rule_validate_lines(annots, assoc_filter):
    """Return the subset of split GPAD lines that pass FilterRule validation.

    Each split line is joined back into a GPAD record and parsed into an
    ontobio association object so the standard FilterRule validation can be
    applied; lines that fail to parse are dropped. When the parsed association
    carries annotation properties, they are appended to the original line.
    """
    gpad_parser = GpadParser()
    passing = []
    for annot in annots:
        result = gpad_parser.parse_line("\t".join(annot))
        # GpadParser currently yields at most one association per line.
        if not result.associations:
            continue
        assoc = extract_properties(result.associations[0])
        if "annotation_properties" in assoc:
            annot.append(assoc["annotation_properties"])
        if assoc_filter.validate_line(assoc):
            passing.append(annot)
    return passing
def create_from_file(self, file=None, fmt='gaf', skim=True, **args):
    """
    Creates an association set from a file.

    Arguments
    ---------
    file : str or file
        input file or filename
    fmt : str
        name of format, one of 'gaf', 'gpad' or 'hpoa'
        (fix: previously documented under the wrong name ``format``)
    skim : bool
        if True, skim the file and build from tuples; otherwise fully
        parse it and build from association objects

    Raises
    ------
    ValueError
        if *fmt* is not a recognized format name
    """
    if fmt == 'gaf':
        p = GafParser()
    elif fmt == 'gpad':
        p = GpadParser()
    elif fmt == 'hpoa':
        p = HpoaParser()
    else:
        # Fail fast with a clear error instead of crashing later with
        # AttributeError on a None parser.
        logging.error("Format not recognized: {}".format(fmt))
        raise ValueError("Format not recognized: {}".format(fmt))
    logging.info("Parsing {} with {}/{}".format(file, fmt, p))
    if skim:
        results = p.skim(file)
        return self.create_from_tuples(results, **args)
    else:
        assocs = p.parse(file, skipheader=True)
        return self.create_from_assocs(assocs, **args)
def test_unmapped_eco_to_gaf_codes():
    """ECO codes must map to a GAF evidence code unless allow_unmapped_eco is set."""
    line_fields = [
        "MGI", "MGI:88276", "is_active_in", "GO:0098831", "PMID:8909549",
        "ECO:0000164", "", "", "20180711", "SynGO",
        "part_of(UBERON:0000956)", "",
    ]
    parser = GpadParser(config=assocparser.AssocParserConfig())

    # ECO:0000164 has no GAF mapping, so the line is rejected by default.
    result = parser.parse_line("\t".join(line_fields))
    assert len(result.associations) == 0
    messages = parser.report.messages
    assert messages[0]["type"] == parser.report.UNKNOWN_EVIDENCE_CLASS

    # Allowing unmapped ECO codes lets the same line through.
    parser.config.allow_unmapped_eco = True
    result = parser.parse_line("\t".join(line_fields))
    assert len(result.associations) == 1
    parser.config.allow_unmapped_eco = False

    # ECO:0000314 maps directly to IDA.
    line_fields[5] = "ECO:0000314"
    result = parser.parse_line("\t".join(line_fields))
    assert len(result.associations) == 1

    # ECO:0006003 maps to IDA indirectly via gaf-eco-mapping-derived.txt.
    line_fields[5] = "ECO:0006003"
    result = parser.parse_line("\t".join(line_fields))
    assert len(result.associations) == 1
def test_convert_gaf_to_gpad():
    """Round-trip: parse the PomBase GAF, write as GPAD, re-parse with GpadParser."""
    gaf_parser = GafParser()
    gaf_parser.config.ecomap = EcoMap()
    gpad_writer = GpadWriter()
    gpad_parser = GpadParser()
    convert(POMBASE, gaf_parser, gpad_writer, gpad_parser)
def test_skim():
    """Skim the PomBase file and print the resulting tuples.

    Fix: the original leaked the file handle; a context manager closes it
    deterministically.
    """
    p = GpadParser()
    with open(POMBASE, "r") as assoc_file:
        results = p.skim(assoc_file)
    print(str(results))
def test_parse():
    """Fully parse the PomBase file and print each association.

    Fix: the original leaked the file handle; a context manager closes it
    deterministically.
    """
    p = GpadParser()
    with open(POMBASE, "r") as assoc_file:
        results = p.parse(assoc_file)
    for r in results:
        print(str(r))
import logging
from typing import List

from ontobio.io import assocparser
from ontobio.io.gpadparser import GpadParser
from ontobio.model.association import GoAssociation, Date
from ontobio.rdfgen.gocamgen import errors
from ontobio import ecomap

logger = logging.getLogger(__name__)

# Shared ECO mapping used to resolve evidence-code references (e.g. "IPI")
# to ECO class IDs.
ecomapping = ecomap.EcoMap()
ipi_eco = ecomapping.coderef_to_ecoclass("IPI")
# Module-level parser instance, reused across calls.
GPAD_PARSER = GpadParser()
BINDING_ROOT = "GO:0005488"  # binding
IPI_ECO_CODE = ipi_eco


class GoAssocWithFrom:
    """
    Separate with/from column values into header vs line arrangement.
    Used for explicit placement in annotation assertions.
    """
    def __init__(self, header=None, line=None):
        # Normalize None to fresh lists to avoid mutable default arguments.
        if header is None:
            header = []
        if line is None:
            line = []
        # Sorted for deterministic ordering.
        self.header = sorted(header)
        # NOTE(review): `line` is normalized above but never assigned to an
        # attribute in the visible code — this chunk may be truncated here;
        # expected something like `self.line = sorted(line)`. Verify against
        # the full file.
    # --- fragment: tail of a GPAD association comparison/scoring helper ---
    # (the enclosing function's definition starts before this chunk; `source`
    # and `target` are its parameters)
    match_score = 4
    # Bump the score to 5 when the supporting-reference sets match
    # case-insensitively.
    if sorted(r.upper() for r in source['evidence']['has_supporting_reference']) == \
            sorted(r.upper() for r in target['evidence']['has_supporting_reference']):
        match_score = 5
    return match_score


if __name__ == '__main__':
    # NOTE(review): `f` is opened but never closed in the visible code.
    f = open("compare.txt", "w")
    print("Starting comparison ")
    parser = argparse.ArgumentParser()
    parser.add_argument('-g1', '--gpad_file1', help="Filepath of GPAD file 1", required=True)
    parser.add_argument('-g2', '--gpad_file2', help="Filepath of GPAD file 2", required=True)
    args = parser.parse_args()
    gpad_parser = GpadParser()
    assocs1 = gpad_parser.parse(args.gpad_file1, skipheader=True)
    assocs2 = gpad_parser.parse(args.gpad_file2, skipheader=True)
    # Classify every association from file 1 against file 2 and tally results.
    for a in assocs1:
        #gene_id_a = a["subject"]["id"]
        #ont_id_a = a["object"]["id"]
        #print("a" + gene_id_a + " "+ont_id_a)
        match = is_assoc_in_list(a, assocs2)
        # NOTE(review): `match.__eq__(...)` is an unidiomatic spelling of
        # `match == ...`; also the counters below are incremented without any
        # visible initialization — presumably defined earlier in the file;
        # verify against the full file.
        if match.__eq__("exact match"):
            exact_matches = exact_matches + 1
        elif match.__eq__("close match"):
            close_matches = close_matches + 1
        elif match.__eq__("no match"):
            no_matches = no_matches + 1
        f.write(match + "\t" + a["source_line"])
        input_lines = input_lines + 1
def __init__(self, gpad_file, filter_rule: FilterRule):
    """Parse *gpad_file* and set up association filtering for *filter_rule*."""
    parser = GpadParser()
    parsed = parser.parse(gpad_file, skipheader=True)
    # Associations with their annotation properties extracted.
    self.assocs = extract_properties_from_assocs(parsed)
    self.assoc_filter = AssocFilter(filter_rule)
def test_parse_gpad():
    """Smoke-test GPAD parsing of the PomBase fixture."""
    gpad_parser = GpadParser()
    parse_with(POMBASE_GPAD, gpad_parser)
def main():
    """
    Wrapper for Assoc Parsing
    """
    # --- CLI definition ---------------------------------------------------
    parser = argparse.ArgumentParser(
        description='Wrapper for obographs assocmodel library'
        """
        By default, ontologies and assocs are cached locally and synced from a remote sparql endpoint
        """,
        formatter_class=argparse.RawTextHelpFormatter)
    parser.add_argument('-r', '--resource', type=str, required=False,
                        help='Name of ontology')
    parser.add_argument('-f', '--file', type=str, required=False,
                        help='Name of input file for associations - currently GAF is assumed')
    parser.add_argument('-F', '--format', type=str, required=False,
                        help='Format of assoc file. One of GAF, GPAD or HPOA')
    parser.add_argument('-o', '--outfile', type=str, required=False,
                        help='Path to output file')
    parser.add_argument('-m', '--messagefile', type=str, required=False,
                        help='Path to messages (report) markdown file')
    parser.add_argument('-t', '--to', type=str, required=False,
                        help='Output to (tree, dot, ...)')
    parser.add_argument("--filter-out", nargs="+", required=False, default=[],
                        metavar="EVIDENCE",
                        help="List of any evidence codes to filter out of the GAF. E.G. --filter-out IEA IMP")
    parser.add_argument("--filtered-file", required=False, default=None,
                        metavar="FILTERED_FILE",
                        help="File to write the filtered out evidence GAF to")
    parser.add_argument('-T', '--taxon', nargs='*', required=False,
                        help='valid taxon (NCBITaxon ID) - validate against this')
    parser.add_argument('--subject_prefix', nargs='*', required=False,
                        help='E.g PomBase - validate against this')
    parser.add_argument('--object_prefix', nargs='*', required=False,
                        help='E.g GO - validate against this')
    parser.add_argument('-v', '--verbosity', default=0, action='count',
                        help='Increase output verbosity')

    # Sub-commands: each sets `function` to the handler invoked below.
    subparsers = parser.add_subparsers(dest='subcommand', help='sub-command help')
    parser_n = subparsers.add_parser('validate', help='Validate associations')
    parser_n.set_defaults(function=validate_assocs)
    parser_n = subparsers.add_parser('filter', help='Filter associations')
    parser_n.set_defaults(function=filter_assocs)
    parser_n = subparsers.add_parser('convert', help='Convert associations')
    parser_n.set_defaults(function=convert_assocs)
    parser_n.add_argument('-t', '--to', type=str, required=True,
                          help='Format to convert to')
    parser_n = subparsers.add_parser('map2slim', help='Map to a subset/slim')
    parser_n.set_defaults(function=map2slim)
    parser_n.add_argument('-p', '--properties', nargs='*', type=str, required=False,
                          help='Properties')
    parser_n.add_argument('-s', '--subset', type=str, required=True,
                          help='subset (e.g. map2slim)')

    args = parser.parse_args()

    # Logging verbosity: -vv -> DEBUG, -v -> INFO, default WARNING.
    if args.verbosity >= 2:
        logging.basicConfig(level=logging.DEBUG)
    elif args.verbosity == 1:
        logging.basicConfig(level=logging.INFO)
    else:
        logging.basicConfig(level=logging.WARNING)
    logging.info("Welcome!")

    handle = args.resource

    # Ontology Factory
    ofactory = OntologyFactory()
    logging.info("Creating ont object from: {} {}".format(handle, ofactory))
    ont = ofactory.create(handle)
    logging.info("ont: {}".format(ont))
    func = args.function

    # Upper case all evidence codes
    args.filter_out = [code.upper() for code in args.filter_out]

    # set configuration
    # Opened here so the parser can stream filtered-out lines to it;
    # closed at the end of main.
    filtered_evidence_file = open(args.filtered_file, "w") if args.filtered_file else None
    config = assocparser.AssocParserConfig(
        valid_taxa=args.taxon,
        ontology=ont,
        class_idspaces=args.object_prefix,
        entity_idspaces=args.subject_prefix,
        filter_out_evidence=args.filter_out,
        filtered_evidence_file=filtered_evidence_file)
    p = None
    fmt = None
    if args.format is None:
        fmt = 'gaf'
    else:
        fmt = args.format.lower()
    # TODO: use a factory
    if fmt == 'gaf':
        p = GafParser()
    elif fmt == 'gpad':
        p = GpadParser()
    elif fmt == 'hpoa':
        p = HpoaParser()
    elif fmt == "gpi":
        p = entityparser.GpiParser()
        # GPI files hold entities, not associations; swap in the entity validator.
        func = validate_entity
    p.config = config

    outfh = None
    if args.outfile is not None:
        two_mb = 2097152
        # Large write buffer to speed up bulk output.
        outfh = open(args.outfile, "w", buffering=two_mb)
    func(ont, args.file, outfh, p, args)
    if filtered_evidence_file:
        filtered_evidence_file.close()
    if outfh is not None:
        outfh.close()
    # Report goes to the message file when given, otherwise to stdout.
    if args.messagefile is not None:
        mfh = open(args.messagefile, "w")
        mfh.write(p.report.to_markdown())
        mfh.close()
    else:
        print(p.report.to_markdown())
def main():
    """
    Wrapper for Assoc Parsing
    """
    # --- CLI definition ---------------------------------------------------
    parser = argparse.ArgumentParser(
        description='Wrapper for obographs assocmodel library'
        """
        By default, ontologies and assocs are cached locally and synced from a remote sparql endpoint
        """,
        formatter_class=argparse.RawTextHelpFormatter)
    parser.add_argument('-r', '--resource', type=str, required=False,
                        help='Name of ontology')
    parser.add_argument('-f', '--file', type=str, required=False,
                        help='Name of input file for associations - currently GAF is assumed')
    parser.add_argument('-F', '--format', type=str, required=False,
                        help='Format of assoc file. One of GAF, GPAD or HPOA')
    parser.add_argument('-o', '--outfile', type=str, required=False,
                        help='Path to output file')
    parser.add_argument("--report-md", type=str, required=False, dest="report_md",
                        help="Path to report markdown file")
    parser.add_argument("--report-json", type=str, required=False, dest="report_json",
                        help="Path to report JSON file")
    parser.add_argument('-t', '--to', type=str, required=False,
                        help='Output to (tree, dot, ...)')
    parser.add_argument("--filter-out", nargs="+", required=False, default=[],
                        metavar="EVIDENCE",
                        help="List of any evidence codes to filter out of the GAF. E.G. --filter-out IEA IMP")
    parser.add_argument("--filtered-file", required=False, default=None,
                        metavar="FILTERED_FILE",
                        help="File to write the filtered out evidence GAF to")
    parser.add_argument('-T', '--taxon', nargs='*', required=False,
                        help='valid taxon (NCBITaxon ID) - validate against this')
    parser.add_argument('--subject_prefix', nargs='*', required=False,
                        help='E.g PomBase - validate against this')
    parser.add_argument('--object_prefix', nargs='*', required=False,
                        help='E.g GO - validate against this')
    parser.add_argument("-I", "--gaferencer-file", type=argparse.FileType('r'), required=False,
                        help="Output from Gaferencer run on a set of GAF annotations")
    parser.add_argument('-v', '--verbosity', default=0, action='count',
                        help='Increase output verbosity')
    parser.add_argument("--allow_paint", required=False, action="store_const", const=True,
                        help="Allow IBAs in parser")
    parser.add_argument("-g", "--gpi", type=str, required=False, default=None,
                        help="GPI file")
    # NOTE(review): required=None is falsy, so this behaves like
    # required=False; probably meant to be required=False explicitly.
    parser.add_argument("-l", "--rule", action="append", required=None, default=[],
                        dest="rule_set",
                        help="Set of rules to be run. Default is no rules to be run, with the exception \
                        of gorule-0000027 and gorule-0000020. See command line documentation in the \
                        ontobio project or readthedocs for more information")

    # Sub-commands: each sets `function` to the handler invoked below.
    subparsers = parser.add_subparsers(dest='subcommand', help='sub-command help')
    parser_n = subparsers.add_parser('validate', help='Validate associations')
    parser_n.set_defaults(function=validate_assocs)
    parser_n = subparsers.add_parser('filter', help='Filter associations')
    parser_n.set_defaults(function=filter_assocs)
    parser_n = subparsers.add_parser('convert', help='Convert associations')
    parser_n.set_defaults(function=convert_assocs)
    parser_n.add_argument('-t', '--to', type=str, required=True,
                          choices=["GAF", "GPAD", "gaf", "gpad"],
                          help='Format to convert to')
    parser_n.add_argument("-n", "--format-version", dest="version", type=str,
                          required=False, default=None,
                          help="Version for the file format. GAF default is 2.1, GPAD default is 1.2")
    parser_n = subparsers.add_parser('map2slim', help='Map to a subset/slim')
    parser_n.set_defaults(function=map2slim)
    parser_n.add_argument('-p', '--properties', nargs='*', type=str,
                          default=['subClassOf', 'BFO:0000050'],
                          help='Properties')
    parser_n.add_argument('-s', '--subset', type=str, required=True,
                          help='subset (e.g. map2slim)')

    args = parser.parse_args()

    # Logging verbosity: -vv -> DEBUG, -v -> INFO, default WARNING.
    if args.verbosity >= 2:
        logging.basicConfig(level=logging.DEBUG)
    elif args.verbosity == 1:
        logging.basicConfig(level=logging.INFO)
    else:
        logging.basicConfig(level=logging.WARNING)
    logging.info("Welcome!")

    # Ontology Factory
    ont = None
    if args.resource is not None:
        ofactory = OntologyFactory()
        logging.info("Creating ont object from: {} {}".format(args.resource, ofactory))
        ont = ofactory.create(args.resource)
        logging.info("ont: {}".format(ont))
    func = args.function

    # Upper case all evidence codes
    args.filter_out = [code.upper() for code in args.filter_out]

    # Pre-computed annotation inferences from a Gaferencer run, if supplied.
    gaferences = None
    if args.gaferencer_file:
        gaferences = gaference.build_annotation_inferences(json.load(args.gaferencer_file))

    # "-l all" expands to the full rule set.
    rule_set = args.rule_set
    if rule_set == ["all"]:
        rule_set = assocparser.RuleSet.ALL

    # set configuration
    # Opened here so the parser can stream filtered-out lines to it;
    # closed at the end of main.
    filtered_evidence_file = open(args.filtered_file, "w") if args.filtered_file else None
    config = assocparser.AssocParserConfig(
        valid_taxa=args.taxon,
        ontology=ont,
        class_idspaces=args.object_prefix,
        entity_idspaces=args.subject_prefix,
        filter_out_evidence=args.filter_out,
        filtered_evidence_file=filtered_evidence_file,
        annotation_inferences=gaferences,
        paint=args.allow_paint,
        gpi_authority_path=args.gpi,
        rule_set=rule_set
    )
    p = None
    fmt = None
    if args.format is None:
        fmt = 'gaf'
    else:
        fmt = args.format.lower()
    # TODO: use a factory
    if fmt == 'gaf':
        p = GafParser(config=config, dataset=args.file)
    elif fmt == 'gpad':
        p = GpadParser(config=config)
    elif fmt == 'hpoa':
        p = HpoaParser(config=config)
    elif fmt == "gpi":
        p = entityparser.GpiParser()
        # GPI files hold entities, not associations; swap in the entity validator.
        func = validate_entity

    outfh = None
    if args.outfile is not None:
        two_mb = 2097152
        # Large write buffer to speed up bulk output.
        outfh = open(args.outfile, "w", buffering=two_mb)
    func(ont, args.file, outfh, p, args)
    if filtered_evidence_file:
        filtered_evidence_file.close()
    if outfh is not None:
        outfh.close()
    # Reports go to the requested files; stdout only when neither is given.
    if args.report_md is not None:
        report_md = open(args.report_md, "w")
        report_md.write(p.report.to_markdown())
        report_md.close()
    if args.report_json is not None:
        report_json = open(args.report_json, "w")
        report_json.write(json.dumps(p.report.to_report_json(), indent=4))
        report_json.close()
    if not (args.report_md or args.report_json):
        print(p.report.to_markdown())
def create_parser(config, group, dataset, format="gaf"):
    """Build an association parser for the given group/dataset.

    Only 'gpad' and 'gaf' are currently supported; any other value falls
    back to GAF.
    """
    if format == "gpad":
        return GpadParser(config=config, group=group, dataset=dataset)
    # Fall back to GAF, the only other supported format.
    return GafParser(config=config, group=group, dataset=dataset)
# --- script fragment: builds the filename -> filter-rule map, then scans
# each file's GPAD lines. `args`, `d`, `filenames`, `get_filter_name`,
# `get_filter_rule`, `ExtensionsMapper` are defined elsewhere in the file.
for fname in os.listdir(args.dir):
    # print("Loading file:", fname)
    # Skip any file whose name contains a blocked substring from `d`.
    nono_in_fname = False
    for nono in d:
        if nono in fname:
            nono_in_fname = True
    if fname.endswith(".tsv") or nono_in_fname:
        continue
    # filenames.append(args.dir + fname)
    filter_name = get_filter_name(fname)
    # Map full path -> filter rule for the scan below.
    filenames[args.dir + fname] = get_filter_rule(filter_name)
    # data = data + GafParser().parse(fname, skipheader=True)

# all_dict = {}
extensions_mapper = ExtensionsMapper()
gpad_parser = GpadParser()
print("Creating extension dictionary...")
# One bucket per GO aspect: molecular Function, biological Process,
# cellular Component.
ext_dict = {}
ext_dict['F'] = {}
ext_dict['P'] = {}
ext_dict['C'] = {}
for fname in filenames:
    with open(fname) as f:
        data = []
        print("Loading file:", fname)
        # Collect tab-split GPAD lines, skipping '!' comment/header lines.
        for l in f.readlines():
            if not l.startswith("!"):
                parts = l.split("\t")
                # if parts[15] != "" and parts[6] in acceptable_evidence_codes:
                data.append(parts)
        print("# of GPAD lines in file:", len(data))