def test_gaf_2_1_simple_terms():
    """Check the relation chosen by ``upgrade_empty_qualifier`` for three GO terms.

    Each case parses the same SGD row (only the GO term column differs) with an
    empty qualifier column, runs the upgrade against the goslim_generic
    ontology, and compares the first qualifier with the expected relation.
    """
    # (GO term placed in column 5, relation expected as first qualifier)
    cases = [
        ("GO:0006259", ("RO", "0002264")),
        ("GO:0042393", ("RO", "0002327")),
        ("GO:0005773", ("RO", "0001025")),
    ]

    for go_term, (expected_ns, expected_id) in cases:
        columns = [
            "SGD",
            "S000000819",
            "AFG3",
            "",
            go_term,
            "PMID:8681382|SGD_REF:S000055187",
            "IMP",
            "",
            "P",
            "Mitochondrial inner membrane m-AAA protease component",
            "YER017C|AAA family ATPase AFG3|YTA10",
            "gene",
            "taxon:559292",
            "20170428",
            "SGD",
        ]
        # A fresh ontology and parser are built for every case, as the
        # individual stanzas of this test have always done.
        slim = OntologyFactory().create("tests/resources/goslim_generic.json")
        parser = GafParser(config=assocparser.AssocParserConfig(ontology=slim))
        parser.make_internal_cell_component_closure()

        result = gafparser.to_association(columns)
        upgraded = parser.upgrade_empty_qualifier(result.associations[0])

        assert upgraded.qualifiers[0] == association.Curie(namespace=expected_ns, identity=expected_id)
def test_object_extensions():
    """A ``part_of(X:1)`` annotation extension parses into one BFO:0000050 extension unit."""
    gaf_row = (
        "PomBase\tSPAC25B8.17\typf1\t\tGO:0000007\tGO_REF:0000024\tISO\tSGD:S000001583\tC\t"
        "intramembrane aspartyl protease of the perinuclear ER membrane Ypf1 (predicted)\t"
        "ppp81\tprotein\ttaxon:4896\t20181024\tPomBase\tpart_of(X:1)\tUniProtKB:P12345"
    )

    parser = GafParser()
    result = parser.parse_line(gaf_row)
    # Dump the parser report so a failure shows what the parser complained about.
    print(parser.report.to_markdown())

    assert len(result.associations[0].object_extensions) > 0

    part_of_x1 = association.ExtensionUnit(
        association.Curie("BFO", "0000050"),
        association.Curie("X", "1"),
    )
    expected = [association.ConjunctiveSet([part_of_x1])]
    assert result.associations[0].object_extensions == expected
def test_gorule39():
    """GoRule39 outcome for combinations of subject database and GO term.

    * ComplexPortal subject annotated to GO:0032991 -> ERROR
    * FB subject annotated to GO:0032991 -> PASS
    * ComplexPortal subject annotated to GO:0000023 -> PASS
    """
    annotation = make_annotation(db="ComplexPortal", goid="GO:0032991")
    assoc = annotation.associations[0]

    def outcome():
        # A new rule instance and a default config are used for every evaluation.
        return qc.GoRule39().test(assoc, assocparser.AssocParserConfig()).result_type

    assert outcome() == qc.ResultType.ERROR

    assoc.subject.id = association.Curie("FB", "1234")
    assert outcome() == qc.ResultType.PASS

    assoc.subject.id = association.Curie("ComplexPortal", "12345")
    assoc.object.id = association.Curie("GO", "0000023")
    assert outcome() == qc.ResultType.PASS
def test_errors_gaf():
    """Parse ``tests/resources/errors.gaf`` and check the report and surviving associations.

    Expects 13 report messages (exactly one of type ``INVALID_IDSPACE``) and two
    parsed associations. The associations are then written with ``GafWriter``
    and any remaining annotation extension must be the single unit
    ``BFO:0000050(X:1)``.
    """
    config = assocparser.AssocParserConfig(ecomap=EcoMap())
    p = GafParser(config=config)

    # Open the fixture in a context manager so the handle is closed even when
    # parsing raises. parse() is expected to have consumed the stream by the
    # time it returns -- its result is used with len() below.
    with open("tests/resources/errors.gaf", "r") as gaf_file:
        assocs = p.parse(gaf_file, skipheader=True)

    msgs = p.report.messages
    print(json.dumps(p.report.to_report_json(), indent=4))

    n_invalid_idspace = 0
    for m in msgs:
        print("MESSAGE: {}".format(m))
        if m['type'] == assocparser.Report.INVALID_IDSPACE:
            n_invalid_idspace += 1

    assert len(msgs) == 13
    assert n_invalid_idspace == 1
    assert len(assocs) == 2

    w = GafWriter()
    w.write(assocs)
    for a in assocs:
        if a.object_extensions != []:
            # our test file has no ORs, so in DNF this is always the first
            xs = a.object_extensions[0].elements
            print(xs)
            for x in xs:
                print('X: {}'.format(x))
                # ensure that invalid expressions have been eliminated
                assert x.relation == association.Curie("BFO", "0000050")
                assert x.term == association.Curie.from_str('X:1')
            assert len(xs) == 1
def protein_complex_sublcass_closure(ontology: Ontology) -> Set[str]:
    """Collect the ``subClassOf`` descendants of GO:0032991 as a set.

    The ontology is queried with ``reflexive=True``; whatever iterable it
    returns is materialised into a ``set``.

    :param ontology: ontology to query
    :return: set of the term ids returned by ``ontology.descendants``
    """
    complex_root = str(association.Curie(namespace="GO", identity="0032991"))
    descendants = ontology.descendants(complex_root, relations=["subClassOf"], reflexive=True)
    return set(descendants)
def test_gaf_2_1_upconvert_in_parse():
    """Parsing a GAF 2.1 stream with an empty qualifier yields relation BFO:0000050 for GO:0005840."""
    header = "!gaf-version: 2.1\n"
    row = (
        "SGD\tS000000819\tAFG3\t\tGO:0005840\tPMID:8681382|SGD_REF:S000055187\tIMP\t\tP\t"
        "Mitochondrial inner membrane m-AAA protease component\t"
        "YER017C|AAA family ATPase AFG3|YTA10\tgene\ttaxon:559292\t20170428\tSGD"
    )
    stream = io.StringIO(header + row)

    slim = OntologyFactory().create("tests/resources/goslim_generic.json")
    parser = GafParser(config=assocparser.AssocParserConfig(ontology=slim))

    # Version 2.1, blank qualifier column, ontology configured: the parser
    # should fill in the relation itself.
    parsed = parser.parse(stream, skipheader=True)
    expected_relation = association.Curie(namespace="BFO", identity="0000050")
    assert parsed[0].relation == expected_relation
def obo_uri_to_curie(uri: str):
    """
    Turn an OBO-style URI into a Curie.

    The URI is expected to look like ``<base>/<namespace>_<local_id>``: the
    text after the last ``/`` is split on its first ``_`` and the two halves
    become the Curie's namespace and identifier.

    A URI without ``/`` raises ``IndexError``; a final segment without ``_``
    raises ``ValueError``.
    """
    # Element 0 is the base, element 1 the "<namespace>_<local_id>" part.
    segments = uri.rsplit("/", maxsplit=1)
    local_part = segments[1]

    # Only the first underscore separates namespace from identifier.
    prefix, local_id = local_part.split("_", maxsplit=1)
    return association.Curie(prefix, local_id)
def upgrade_empty_qualifier(
        self, assoc: association.GoAssociation) -> association.GoAssociation:
    """
    Pick a default relation for an annotation read from GAF 2.1.

    Background: https://github.com/geneontology/go-site/issues/1558

    The first matching rule wins:

    * term is exactly ``GO:0008150``          -> ``RO:0002331`` (involved_in)
    * term is exactly ``GO:0008372``          -> ``RO:0002432`` (is_active_in)
    * namespace is ``molecular_function``     -> ``RO:0002327`` (enables)
    * namespace is ``biological_process``     -> ``RO:0002264`` (acts_upstream_of_or_within)
    * namespace is ``cellular_component`` and the term is in
      ``self.cell_component_descendants_closure`` -> ``BFO:0000050`` (part_of)
    * any other ``cellular_component`` term   -> ``RO:0001025`` (located_in)

    When a rule matches, both ``assoc.qualifiers`` and ``assoc.relation`` are
    overwritten in place. When none matches the association is left as it is.
    In either case a GORULE:0000059 warning naming ``assoc.relation`` is added
    to ``self.report``.

    :param assoc: GoAssociation
    :return: the same, possibly modified, GoAssociation
    """
    term = str(assoc.object.id)
    namespace = self.config.ontology.obo_namespace(term)

    # (namespace, identity) of the relation to apply; None means "no rule matched".
    chosen = None
    if term == "GO:0008150":
        chosen = ("RO", "0002331")
    elif term == "GO:0008372":
        chosen = ("RO", "0002432")
    elif namespace == "molecular_function":
        chosen = ("RO", "0002327")
    elif namespace == "biological_process":
        chosen = ("RO", "0002264")
    elif namespace == "cellular_component":
        if term in self.cell_component_descendants_closure:
            chosen = ("BFO", "0000050")
        else:
            chosen = ("RO", "0001025")

    if chosen is not None:
        relation = association.Curie(namespace=chosen[0], identity=chosen[1])
        assoc.qualifiers = [relation]
        assoc.relation = relation

    message = "GORULE:0000059 Upgrading qualifier/relation to {} when reading GAF 2.1".format(assoc.relation)
    self.report.warning(
        assoc.source_line,
        Report.INVALID_QUALIFIER,
        "EMPTY",
        message,
        taxon=str(assoc.subject.taxon),
        rule=59)
    return assoc
def test_relations_curie_contract():
    """``obo_uri_to_curie`` turns an OBO PURL for GO_1234567 into Curie GO:1234567."""
    purl = "http://purl.obolibrary.org/obo/GO_1234567"
    expected = association.Curie(namespace="GO", identity="1234567")
    assert relations.obo_uri_to_curie(purl) == expected
def test_gorule61():
    """Walk GoRule61 through pass, repair (warning) and error cases.

    Each case builds an annotation with ``make_annotation``, runs the rule on
    its first association and checks the result type and, for repairs, the
    relation on the repaired association.
    """
    config = all_rules_config(ontology=ontology)

    def gpad_annotation(goid, qualifier):
        # Shared shape of every non-GAF case below.
        return make_annotation(goid=goid, qualifier=qualifier, evidence="ECO:0000320", from_gaf=False, version="1.2")

    def apply_rule(parsed):
        return qc.GoRule61().test(parsed.associations[0], config)

    # GO:0005554 with `enables`: parses without gorule-0000001 messages and passes
    parsed = gpad_annotation("GO:0005554", "enables")
    assert parsed.report.reporter.messages.get("gorule-0000001", []) == []
    outcome = apply_rule(parsed)
    assert outcome.result_type == qc.ResultType.PASS

    # Using `contributes_to`, but should be repaired to RO:0002327 enables
    parsed = gpad_annotation("GO:0005554", "contributes_to")
    outcome = apply_rule(parsed)
    assert outcome.result.relation == association.Curie("RO", "0002327")
    assert outcome.result_type == qc.ResultType.WARNING

    # BP term, qualifier inside allowed BP set
    parsed = gpad_annotation("GO:0016192", "acts_upstream_of_or_within")
    outcome = apply_rule(parsed)
    assert outcome.result_type == qc.ResultType.PASS

    # BP term, unallowed relation, Repair
    parsed = gpad_annotation("GO:0016192", "enables")
    outcome = apply_rule(parsed)
    assert outcome.result_type == qc.ResultType.WARNING
    assert outcome.result.relation == association.Curie("RO", "0002264")

    # CC complex term, unallowed relation, unrepairable, causes error
    parsed = gpad_annotation("GO:0032991", "enables")
    outcome = apply_rule(parsed)
    assert outcome.result_type == qc.ResultType.ERROR

    # CC root repairs to is_active_in
    parsed = make_annotation(goid="GO:0005575", qualifier="located_in", evidence="ND", from_gaf=True, version="2.2")
    outcome = apply_rule(parsed)
    assert outcome.result_type == qc.ResultType.WARNING
    # Active in, rather than located_in
    assert outcome.result.relation == association.Curie(namespace="RO", identity="0002432")

    # protein complex + repairable relation repairs to part_of
    parsed = gpad_annotation("GO:0032991", "is_active_in")
    outcome = apply_rule(parsed)
    assert outcome.result_type == qc.ResultType.WARNING
    assert outcome.result.relation == association.Curie(namespace="BFO", identity="0000050")
def to_association(gaf_line: List[str], report=None, group="unknown", dataset="unknown", qualifier_parser=assocparser.Qualifier2_1(), bio_entities=None) -> assocparser.ParseResult:
    """
    Convert one GAF row, already split into columns, into a ``GoAssociation``.

    Rows with more than 17 columns are truncated (with a warning); rows with
    15 or 16 columns are padded with empty strings to 17. The caller's list is
    never modified. Every validation failure is recorded on ``report`` and
    ends parsing of the row.

    :param gaf_line: the column values of one GAF row
    :param report: report that collects errors/warnings; a new
        ``Report(group, dataset)`` is created when omitted
    :param group: group name used only when a new report is created
    :param dataset: dataset name used only when a new report is created
    :param qualifier_parser: validator applied to the qualifier column
    :param bio_entities: lookup of known subjects keyed by Curie; when it holds
        an entry for the row's subject that differs from the one built from
        the row, the stored entity is used instead
    :return: on success a ``ParseResult`` holding one association with its
        third field ``False``; on failure a ``ParseResult`` with no
        associations and its third field ``True``
    """
    report = Report(group=group, dataset=dataset) if report is None else report
    bio_entities = collections.BioEntities(dict()) if bio_entities is None else bio_entities
    source_line = "\t".join(gaf_line)

    def failed() -> assocparser.ParseResult:
        # Result used for every rejected row.
        return assocparser.ParseResult(source_line, [], True, report=report)

    if source_line == "":
        report.error(source_line, "Blank Line", "EMPTY", "Blank lines are not allowed", rule=1)
        return failed()

    if len(gaf_line) > 17:
        # If we see more than 17 columns, we will just cut off the columns after column 17
        report.warning(source_line, assocparser.Report.WRONG_NUMBER_OF_COLUMNS, "",
            msg="There were more than 17 columns in this line. Proceeding by cutting off extra columns after column 17.",
            rule=1)
        gaf_line = gaf_line[:17]

    if 17 > len(gaf_line) >= 15:
        # Rebind rather than `+=` so the list passed in by the caller is not extended in place.
        gaf_line = gaf_line + [""] * (17 - len(gaf_line))

    if len(gaf_line) != 17:
        report.error(source_line, assocparser.Report.WRONG_NUMBER_OF_COLUMNS, "",
            msg="There were {columns} columns found in this line, and there should be 15 (for GAF v1) or 17 (for GAF v2)".format(columns=len(gaf_line)),
            rule=1)
        return failed()

    ## check for missing columns
    ## We use indices here because we run GO RULES before we split the vals into individual variables
    DB_INDEX = 0
    DB_OBJECT_INDEX = 1
    TAXON_INDEX = 12
    REFERENCE_INDEX = 5

    if gaf_line[DB_INDEX] == "":
        report.error(source_line, Report.INVALID_IDSPACE, "EMPTY", "col1 is empty", taxon=gaf_line[TAXON_INDEX], rule=1)
        return failed()
    if gaf_line[DB_OBJECT_INDEX] == "":
        report.error(source_line, Report.INVALID_ID, "EMPTY", "col2 is empty", taxon=gaf_line[TAXON_INDEX], rule=1)
        return failed()
    if gaf_line[REFERENCE_INDEX] == "":
        report.error(source_line, Report.INVALID_ID, "EMPTY", "reference column 6 is empty", taxon=gaf_line[TAXON_INDEX], rule=1)
        return failed()

    # Taxon column: first parsed value is the subject taxon, an optional
    # second one is the interacting taxon.
    parsed_taxons_result = gaf_line_validators["taxon"].validate(gaf_line[TAXON_INDEX])
    if not parsed_taxons_result.valid:
        report.error(source_line, Report.INVALID_TAXON, parsed_taxons_result.original, parsed_taxons_result.message, taxon=parsed_taxons_result.original, rule=1)
        return failed()
    taxon = parsed_taxons_result.parsed[0]

    date = assocparser.parse_date(gaf_line[13], report, source_line)
    if date is None:
        return failed()

    interacting_taxon = parsed_taxons_result.parsed[1] if len(parsed_taxons_result.parsed) == 2 else None

    subject_curie = association.Curie(gaf_line[0], gaf_line[1])
    subject = association.Subject(
        subject_curie,
        gaf_line[2],
        [gaf_line[9]],
        gaf_line[10].split("|"),
        [association.map_gp_type_label_to_curie(gaf_line[11])],
        taxon)
    gpi_entity = bio_entities.get(subject_curie)
    if gpi_entity is not None and subject != gpi_entity:
        subject = gpi_entity

    # column 4 is qualifiers -> index 3
    # For allowed, see http://geneontology.org/docs/go-annotations/#annotation-qualifiers
    # The validator below only checks that the qualifier column is acceptable;
    # the values stored on the GoAssociation come from _parse_qualifier further down.
    parsed_qualifiers = qualifier_parser.validate(gaf_line[3])
    if not parsed_qualifiers.valid:
        report.error(source_line, Report.INVALID_QUALIFIER, parsed_qualifiers.original, parsed_qualifiers.message, taxon=gaf_line[TAXON_INDEX], rule=1)
        return failed()

    aspect = gaf_line[8]
    negated, relation_label, qualifiers = assocparser._parse_qualifier(gaf_line[3], aspect)
    # Note: Relation label is grabbed from qualifiers, if any exist in _parse_qualifier
    qualifiers = [
        association.Curie.from_str(curie_util.contract_uri(relations.lookup_label(q), strict=False)[0])
        for q in qualifiers
    ]

    # Check the parsed Curie itself: the Term wrapper built from it is never an
    # Error, so testing the wrapper (as this code used to) could not detect a
    # malformed GO id.
    go_term = association.Curie.from_str(gaf_line[4])
    if isinstance(go_term, association.Error):
        report.error(source_line, Report.INVALID_SYMBOL, gaf_line[4], "Problem parsing GO Term", taxon=gaf_line[TAXON_INDEX], rule=1)
        return failed()
    term = association.Term(go_term, taxon)

    # References
    references = [association.Curie.from_str(e) for e in gaf_line[5].split("|") if e]
    for r in references:
        if isinstance(r, association.Error):
            report.error(source_line, Report.INVALID_SYMBOL, gaf_line[5], "Problem parsing references", taxon=gaf_line[TAXON_INDEX], rule=1)
            return failed()

    # The first GO_REF reference (if any) is handed to the evidence-code lookup.
    first_goref = next((ref for ref in references if ref.namespace == "GO_REF"), None)
    eco_curie = ecomap.coderef_to_ecoclass(gaf_line[6], reference=first_goref)
    if eco_curie is None:
        report.error(source_line, Report.UNKNOWN_EVIDENCE_CLASS, gaf_line[6], msg="Expecting a known ECO GAF code, e.g ISS", rule=1)
        return failed()

    withfroms = association.ConjunctiveSet.str_to_conjunctions(gaf_line[7])
    if isinstance(withfroms, association.Error):
        report.error(source_line, Report.INVALID_SYMBOL, gaf_line[7], "Problem parsing with/from", taxon=gaf_line[TAXON_INDEX], rule=1)
        return failed()

    # Parse the evidence Curie once and reuse it; stop on a parse failure like
    # every other column check in this function.
    evidence_type = association.Curie.from_str(eco_curie)
    if isinstance(evidence_type, association.Error):
        report.error(source_line, Report.INVALID_SYMBOL, gaf_line[6], "Problem parsing evidence type", taxon=gaf_line[TAXON_INDEX], rule=1)
        return failed()

    evidence = association.Evidence(evidence_type, references, withfroms)
    reference_errors = [e for e in evidence.has_supporting_reference if isinstance(e, association.Error)]
    if reference_errors:
        report.error(source_line, Report.INVALID_SYMBOL, gaf_line[5], reference_errors[0].info, taxon=str(taxon), rule=1)
        return failed()

    subject_extensions = []
    if gaf_line[16]:
        subject_filler = association.Curie.from_str(gaf_line[16])
        if isinstance(subject_filler, association.Error):
            report.error(source_line, assocparser.Report.INVALID_ID, gaf_line[16], subject_filler.info, taxon=str(taxon), rule=1)
            return failed()
        # filler is not an Error, so keep moving
        subject_extensions.append(association.ExtensionUnit(association.Curie.from_str("rdfs:subClassOf"), subject_filler))

    conjunctions = []
    if gaf_line[15]:
        conjunctions = association.ConjunctiveSet.str_to_conjunctions(
            gaf_line[15],
            conjunct_element_builder=lambda el: association.ExtensionUnit.from_str(el))

        if isinstance(conjunctions, association.Error):
            report.error(source_line, Report.EXTENSION_SYNTAX_ERROR, conjunctions.info, "extensions should be relation(curie) and relation should have corresponding URI", taxon=str(taxon), rule=1)
            return failed()

    relation_uri = relations.lookup_label(relation_label)
    if relation_uri is None:
        report.error(source_line, assocparser.Report.INVALID_QUALIFIER, relation_label, "Could not find CURIE for relation `{}`".format(relation_label), taxon=str(taxon), rule=1)
        return failed()

    # We don't have to check that this is well formed because we're grabbing it from the known relations URI map.
    relation_curie = association.Curie.from_str(curie_util.contract_uri(relation_uri)[0])

    a = association.GoAssociation(
        # The stored line is the normalised (truncated/padded) row, not the raw input.
        source_line="\t".join(gaf_line),
        subject=subject,
        relation=relation_curie,
        object=term,
        negated=negated,
        qualifiers=qualifiers,
        aspect=aspect,
        interacting_taxon=interacting_taxon,
        evidence=evidence,
        subject_extensions=subject_extensions,
        object_extensions=conjunctions,
        provided_by=gaf_line[14],
        date=date,
        properties={})

    return assocparser.ParseResult(source_line, [a], False, report=report)
def from_2_0(gpad_line: List[str], report=None, group="unknown", dataset="unknown", bio_entities=None):
    """
    Convert one GPAD 2.0 row, already split into columns, into a ``GoAssociation``.

    Rows with more than 12 columns are truncated (with a warning); rows with
    10 or 11 columns are padded with empty strings to 12. The caller's list is
    never modified. Every validation failure is recorded on ``report`` and
    ends parsing of the row.

    :param gpad_line: the column values of one GPAD row
    :param report: report that collects errors/warnings; a new
        ``Report(group, dataset)`` is created when omitted
    :param group: group name used only when a new report is created
    :param dataset: dataset name used only when a new report is created
    :param bio_entities: lookup of known subjects keyed by Curie; when it holds
        the row's subject, that entity (and its taxon) is used
    :return: on success a ``ParseResult`` holding one association with its
        third field ``False``; on failure a ``ParseResult`` with no
        associations and its third field ``True``
    """
    # Both arguments default to None but are used unconditionally below, so
    # give them the same fallbacks the GAF parser uses.
    report = Report(group=group, dataset=dataset) if report is None else report
    bio_entities = collections.BioEntities(dict()) if bio_entities is None else bio_entities

    source_line = "\t".join(gpad_line)

    def failed():
        # Result used for every rejected row.
        return assocparser.ParseResult(source_line, [], True, report=report)

    if source_line == "":
        report.error(source_line, "Blank Line", "EMPTY", "Blank lines are not allowed", rule=1)
        return failed()

    if len(gpad_line) > 12:
        report.warning(
            source_line,
            assocparser.Report.WRONG_NUMBER_OF_COLUMNS,
            "",
            msg="There were more than 12 columns in this line. Proceeding by cutting off extra columns.",
            rule=1)
        gpad_line = gpad_line[:12]

    if 12 > len(gpad_line) >= 10:
        # Rebind rather than `+=` so the list passed in by the caller is not extended in place.
        gpad_line = gpad_line + [""] * (12 - len(gpad_line))

    if len(gpad_line) != 12:
        report.error(
            source_line,
            assocparser.Report.WRONG_NUMBER_OF_COLUMNS,
            "",
            msg="There were {columns} columns found in this line, and there should be between 10 and 12".format(columns=len(gpad_line)),
            rule=1)
        return failed()

    ## check for missing columns
    ## We use indices here because we run GO RULES before we split the vals into individual variables
    SUBJECT_CURIE = 0
    RELATION = 2
    ONTOLOGY_CLASS_INDEX = 3
    REFERENCE_INDEX = 4
    EVIDENCE_INDEX = 5
    DATE_INDEX = 8
    ASSIGNED_BY_INDEX = 9
    required = [
        SUBJECT_CURIE, RELATION, ONTOLOGY_CLASS_INDEX, REFERENCE_INDEX,
        EVIDENCE_INDEX, DATE_INDEX, ASSIGNED_BY_INDEX
    ]
    for req in required:
        if gpad_line[req] == "":
            # Column numbers in the message are 1-based.
            report.error(source_line, Report.INVALID_ID, "EMPTY", "Column {} is empty".format(req + 1), rule=1)
            return failed()

    # Placeholder taxon, replaced below when the subject is a known entity.
    taxon = association.Curie("NCBITaxon", "0")
    subject_curie = association.Curie.from_str(gpad_line[SUBJECT_CURIE])
    if subject_curie.is_error():
        report.error(source_line, Report.INVALID_SYMBOL, gpad_line[SUBJECT_CURIE], "Problem parsing DB Object", taxon=str(taxon), rule=1)
        return failed()

    # NOTE(review): the other parsers pass lists for the third and fifth
    # Subject arguments; here they are plain strings -- confirm this is intended.
    subject = association.Subject(subject_curie, "", "", [], "", taxon)
    entity = bio_entities.get(subject_curie)
    if entity is not None:
        # If we found a subject entity, then set `subject` to the found entity
        subject = entity
        taxon = subject.taxon

    negated = gpad_line[1] == "NOT"

    relation = association.Curie.from_str(gpad_line[RELATION])
    if relation.is_error():
        report.error(source_line, Report.INVALID_SYMBOL, gpad_line[RELATION], "Problem parsing Relation", taxon=str(taxon), rule=1)
        return failed()

    go_term = association.Curie.from_str(gpad_line[ONTOLOGY_CLASS_INDEX])
    if go_term.is_error():
        report.error(source_line, Report.INVALID_SYMBOL, gpad_line[ONTOLOGY_CLASS_INDEX], "Problem parsing GO Term", taxon=str(taxon), rule=1)
        return failed()
    term = association.Term(go_term, taxon)

    evidence_type = association.Curie.from_str(gpad_line[EVIDENCE_INDEX])
    if evidence_type.is_error():
        report.error(source_line, Report.INVALID_SYMBOL, gpad_line[EVIDENCE_INDEX], "Problem parsing Evidence ECO Curie", taxon=str(taxon), rule=1)
        return failed()

    references = [
        association.Curie.from_str(e)
        for e in gpad_line[REFERENCE_INDEX].split("|") if e
    ]
    for r in references:
        if r.is_error():
            report.error(source_line, Report.INVALID_SYMBOL, gpad_line[REFERENCE_INDEX], "Problem parsing references", taxon=str(taxon), rule=1)
            return failed()

    # Returns a list of ConjunctiveSets or Error
    withfroms = association.ConjunctiveSet.str_to_conjunctions(gpad_line[6])
    if isinstance(withfroms, association.Error):
        report.error(source_line, Report.INVALID_SYMBOL, gpad_line[6], "Problem parsing With/From column", taxon=str(taxon), rule=1)
        return failed()

    evidence = association.Evidence(evidence_type, references, withfroms)

    interacting_taxon = None
    if gpad_line[7] != "":
        interacting_taxon = association.Curie.from_str(gpad_line[7])
        if interacting_taxon.is_error():
            report.error(source_line, Report.INVALID_SYMBOL, gpad_line[7], "Problem parsing Interacting Taxon", taxon=str(taxon), rule=1)
            return failed()

    date = assocparser.parse_iso_date(gpad_line[DATE_INDEX], report, source_line)
    if date is None:
        return failed()

    conjunctions = []
    # The elements of the extension units are Curie(Curie)
    if gpad_line[10]:
        conjunctions = association.ConjunctiveSet.str_to_conjunctions(
            gpad_line[10],
            conjunct_element_builder=lambda el: association.ExtensionUnit.from_curie_str(el))

        if isinstance(conjunctions, association.Error):
            report.error(source_line, Report.EXTENSION_SYNTAX_ERROR, conjunctions.info, "extensions should be relation(curie)", taxon=str(taxon), rule=1)
            return failed()

    properties_list = association.parse_annotation_properties(gpad_line[11])

    a = association.GoAssociation(
        source_line=source_line,
        subject=subject,
        relation=relation,
        object=term,
        negated=negated,
        # The relation column is also stored as the single qualifier.
        qualifiers=[relation],
        aspect=None,
        interacting_taxon=interacting_taxon,
        evidence=evidence,
        subject_extensions=[],
        object_extensions=conjunctions,
        provided_by=gpad_line[9],
        date=date,
        properties=properties_list)

    return assocparser.ParseResult(source_line, [a], False, report=report)
def from_1_2(gpad_line: List[str], report=None, group="unknown", dataset="unknown", bio_entities=None):
    """
    Convert one GPAD 1.2 row, already split into columns, into a ``GoAssociation``.

    Rows with more than 12 columns are truncated (with a warning); rows with
    10 or 11 columns are padded with empty strings to 12. The caller's list is
    never modified. Every validation failure is recorded on ``report`` and
    ends parsing of the row.

    The qualifier column is split on ``|``; ``NOT`` sets ``negated`` and every
    other entry is looked up as a relation label. The first resulting relation
    becomes the association's ``relation``.

    :param gpad_line: the column values of one GPAD row
    :param report: report that collects errors/warnings; a new
        ``Report(group, dataset)`` is created when omitted
    :param group: group name used only when a new report is created
    :param dataset: dataset name used only when a new report is created
    :param bio_entities: lookup of known subjects keyed by Curie; when it holds
        the row's subject, that entity (and its taxon) is used
    :return: on success a ``ParseResult`` holding one association with its
        third field ``False``; on failure a ``ParseResult`` with no
        associations and its third field ``True``
    """
    # Both arguments default to None but are used unconditionally below, so
    # give them the same fallbacks the GAF parser uses.
    report = Report(group=group, dataset=dataset) if report is None else report
    bio_entities = collections.BioEntities(dict()) if bio_entities is None else bio_entities

    source_line = "\t".join(gpad_line)

    def failed():
        # Result used for every rejected row.
        return assocparser.ParseResult(source_line, [], True, report=report)

    if source_line == "":
        report.error(source_line, "Blank Line", "EMPTY", "Blank lines are not allowed", rule=1)
        return failed()

    if len(gpad_line) > 12:
        report.warning(
            source_line,
            assocparser.Report.WRONG_NUMBER_OF_COLUMNS,
            "",
            msg="There were more than 12 columns in this line. Proceeding by cutting off extra columns.",
            rule=1)
        gpad_line = gpad_line[:12]

    if 12 > len(gpad_line) >= 10:
        # Rebind rather than `+=` so the list passed in by the caller is not extended in place.
        gpad_line = gpad_line + [""] * (12 - len(gpad_line))

    if len(gpad_line) != 12:
        report.error(
            source_line,
            assocparser.Report.WRONG_NUMBER_OF_COLUMNS,
            "",
            msg="There were {columns} columns found in this line, and there should be between 10 and 12".format(columns=len(gpad_line)),
            rule=1)
        return failed()

    ## check for missing columns
    ## We use indices here because we run GO RULES before we split the vals into individual variables
    DB_INDEX = 0
    DB_OBJECT_INDEX = 1
    QUALIFIER = 2
    REFERENCE_INDEX = 4
    EVIDENCE_INDEX = 5

    if gpad_line[DB_INDEX] == "":
        report.error(source_line, Report.INVALID_IDSPACE, "EMPTY", "col1 is empty", rule=1)
        return failed()
    if gpad_line[DB_OBJECT_INDEX] == "":
        report.error(source_line, Report.INVALID_ID, "EMPTY", "col2 is empty", rule=1)
        return failed()
    if gpad_line[QUALIFIER] == "":
        # Reported as a qualifier problem (this used to be filed under INVALID_TAXON).
        report.error(source_line, Report.INVALID_QUALIFIER, "EMPTY", "qualifier column is empty", rule=1)
        return failed()
    if gpad_line[REFERENCE_INDEX] == "":
        report.error(source_line, Report.INVALID_ID, "EMPTY", "reference column is empty", rule=1)
        return failed()
    if gpad_line[EVIDENCE_INDEX] == "":
        report.error(source_line, Report.INVALID_ID, "EMPTY", "Evidence column is empty", rule=1)
        # Stop here like the other required-column checks; falling through
        # would report a second error for the same empty column.
        return failed()

    # Placeholder taxon, replaced below when the subject is a known entity.
    taxon = association.Curie("NCBITaxon", "0")
    subject_curie = association.Curie(gpad_line[0], gpad_line[1])
    subject = association.Subject(subject_curie, "", [""], [], [], taxon)
    entity = bio_entities.get(subject_curie)
    if entity is not None:
        subject = entity
        taxon = subject.taxon

    go_term = association.Curie.from_str(gpad_line[3])
    if go_term.is_error():
        report.error(source_line, Report.INVALID_SYMBOL, gpad_line[3], "Problem parsing GO Term", taxon=str(taxon), rule=1)
        return failed()
    term = association.Term(go_term, taxon)

    evidence_type = association.Curie.from_str(gpad_line[5])
    if evidence_type.is_error():
        report.error(source_line, Report.INVALID_SYMBOL, gpad_line[5], "Problem parsing Evidence ECO Curie", taxon=str(taxon), rule=1)
        return failed()

    references = [
        association.Curie.from_str(e) for e in gpad_line[4].split("|") if e
    ]
    for r in references:
        if r.is_error():
            report.error(source_line, Report.INVALID_SYMBOL, gpad_line[4], "Problem parsing references", taxon=str(taxon), rule=1)
            return failed()

    # Returns a list of ConjunctiveSets or Error
    withfroms = association.ConjunctiveSet.str_to_conjunctions(gpad_line[6])
    if isinstance(withfroms, association.Error):
        report.error(source_line, Report.INVALID_SYMBOL, gpad_line[6], "Problem parsing With/From column", taxon=str(taxon), rule=1)
        return failed()

    evidence = association.Evidence(evidence_type, references, withfroms)

    # The column is known to be non-empty, but it may still hold nothing except NOT.
    raw_qs = gpad_line[QUALIFIER].split("|")
    negated = "NOT" in raw_qs

    looked_up_qualifiers = [
        relations.lookup_label(q) for q in raw_qs if q != "NOT"
    ]
    if None in looked_up_qualifiers:
        report.error(source_line, Report.INVALID_QUALIFIER, raw_qs, "Could not find a URI for qualifier", taxon=str(taxon), rule=1)
        return failed()

    qualifiers = [
        association.Curie.from_str(curie_util.contract_uri(q)[0])
        for q in looked_up_qualifiers
    ]
    if not qualifiers:
        # A column of just "NOT" leaves no relation; report it instead of
        # failing with IndexError on qualifiers[0] when the association is built.
        report.error(source_line, Report.INVALID_QUALIFIER, gpad_line[QUALIFIER], "Qualifier column contains no relation", taxon=str(taxon), rule=1)
        return failed()

    date = assocparser.parse_date(gpad_line[8], report, source_line)
    if date is None:
        return failed()

    interacting_taxon = None
    if gpad_line[7]:
        taxon_result = gpad_line_validators["taxon"].validate(gpad_line[7])
        if not taxon_result.valid:
            report.error(source_line, Report.INVALID_TAXON, taxon_result.original, taxon_result.message, taxon=str(taxon_result.original), rule=1)
            return failed()
        interacting_taxon = taxon_result.parsed[0]

    conjunctions = []
    if gpad_line[10]:
        conjunctions = association.ConjunctiveSet.str_to_conjunctions(
            gpad_line[10],
            conjunct_element_builder=lambda el: association.ExtensionUnit.from_str(el))

        if isinstance(conjunctions, association.Error):
            report.error(source_line, Report.EXTENSION_SYNTAX_ERROR, conjunctions.info, "extensions should be relation(curie)", taxon=str(taxon), rule=1)
            return failed()

    properties_list = association.parse_annotation_properties(gpad_line[11])

    a = association.GoAssociation(
        source_line=source_line,
        subject=subject,
        # The first non-NOT qualifier doubles as the relation.
        relation=qualifiers[0],
        object=term,
        negated=negated,
        qualifiers=qualifiers,
        aspect=None,
        interacting_taxon=interacting_taxon,
        evidence=evidence,
        subject_extensions=[],
        object_extensions=conjunctions,
        provided_by=gpad_line[9],
        date=date,
        properties=properties_list)

    return assocparser.ParseResult(source_line, [a], False, report=report)
def test_rdfgen_includes_taxon_in_gp_class():
    """Translate a hand-built association to RDF and check the gene-product class rows.

    Every row returned by ``gene_product_class_query()`` on the resulting graph
    must have the PomBase SPAC25B8.17 class IRI and the NCBITaxon_4896 taxon IRI.
    """
    pombe = association.Curie("NCBITaxon", "4896")

    subject = association.Subject(
        id=association.Curie("PomBase", "SPAC25B8.17"),
        label="ypf1",
        type="protein",
        fullname="intramembrane aspartyl protease of the perinuclear ER membrane Ypf1 (predicted)",
        synonyms=["ppp81"],
        taxon=pombe)

    go_term = association.Term(
        id=association.Curie("GO", "0000006"),
        taxon=association.Curie("NCBITaxon", "4896"))

    evidence = association.Evidence(
        type=association.Curie("ECO", "0000266"),
        has_supporting_reference=[association.Curie("GO_REF", "0000024")],
        with_support_from=[
            association.ConjunctiveSet(elements=[association.Curie("SGD", "S000001583")])
        ])

    subject_extensions = [
        association.ExtensionUnit(
            relation=association.Curie("rdfs", "subClassOf"),
            term=association.Curie("UniProtKB", "P12345"))
    ]

    # Two conjunctive sets: the first with two units, the second with one.
    first_set = association.ConjunctiveSet(elements=[
        association.ExtensionUnit(
            relation=association.Curie("BFO", "0000050"),
            term=association.Curie("X", "1")),
        association.ExtensionUnit(
            relation=association.Curie("BFO", "0000066"),
            term=association.Curie("GO", "0016020")),
    ])
    second_set = association.ConjunctiveSet(elements=[
        association.ExtensionUnit(
            relation=association.Curie("RO", "0002233"),
            term=association.Curie("PomBase", "12345")),
    ])

    assoc = association.GoAssociation(
        source_line="PomBase\tSPAC25B8.17\typf1\t\tGO:1990578\tGO_REF:0000024\tISO\tSGD:S000001583\tC\tintramembrane aspartyl protease of the perinuclear ER membrane Ypf1 (predicted)\tppp81\tprotein\ttaxon:4896\t20150305\tPomBase\t\t",
        subject=subject,
        object=go_term,
        negated=False,
        qualifiers=[],
        aspect=association.Aspect("C"),
        relation=association.Curie("BFO", "0000050"),
        interacting_taxon=association.Curie("NCBITaxon", "555"),
        evidence=evidence,
        provided_by=association.Provider("PomBase"),
        date=association.Date("20150305"),
        subject_extensions=subject_extensions,
        object_extensions=[first_set, second_set],
        properties={})

    writer = TurtleRdfWriter(label="pombase_single.ttl")
    transform = CamRdfTransform(writer=writer)
    transform.translate(assoc)
    transform.provenance()

    rows = writer.graph.query(gene_product_class_query())
    for row in rows:
        assert str(row["cls"]) == "http://identifiers.org/pombase/SPAC25B8.17"
        assert str(row["taxon"]) == "http://purl.obolibrary.org/obo/NCBITaxon_4896"
def test(self, annotation: association.GoAssociation, config: assocparser.AssocParserConfig, group=None) -> TestResult:
    """
    Check the annotation's relation against those permitted for its GO term.

    When ``config.ontology`` is None the annotation passes unchanged. Otherwise
    the term ID and its OBO namespace pick exactly one of these cases, tried in
    this order:

    * Term ``GO:0005554``: relation must be RO:0002327 "enables"; anything
      else is repaired to it.
    * Other ``molecular_function`` terms: relation must be in
      ``self.allowed_mf``; otherwise it is repaired to RO:0002327 "enables".
    * Term ``GO:0008150``: relation must be RO:0002331 "involved_in";
      anything else is repaired to it.
    * Other ``biological_process`` terms: relation must be in
      ``self.allowed_bp``; otherwise it is repaired to RO:0002264
      "acts upstream of or within".
    * Term ``GO:0005575``: relation must be RO:0002432 "is_active_in";
      anything else is repaired to it.
    * Other ``cellular_component`` terms:
        * If the term is in the collection returned by
          ``make_protein_complex_descendents_if_not_present``: relation must be
          in ``self.allowed_cc_complex``. A relation found in
          ``self.repairable_cc_complex`` is repaired to BFO:0000050 "part of";
          any other relation yields ``RepairState.FAILED`` with the annotation
          left as is.
        * Otherwise: relation must be in ``self.allowed_cc_other``; if not, it
          is repaired to RO:0001025 "located in".
    * Any other namespace: reported as OKAY with the note
      "GO term has no namespace".

    A repair always works on a deep copy, setting both ``relation`` and
    ``qualifiers`` on the copy; the annotation passed in is not modified.
    ``group`` is accepted but not used here.

    :return: a TestResult holding the outcome from ``repair_result``, a message
        and either the original or the repaired annotation.
    """
    if config.ontology is None:
        return TestResult(ResultType.PASS, "", annotation)

    def retarget(new_relation):
        # Copy first so the caller's annotation keeps its original relation.
        fixed = copy.deepcopy(annotation)
        fixed.relation = new_relation
        fixed.qualifiers = [new_relation]
        return fixed

    go_id = str(annotation.object.id)
    aspect = config.ontology.obo_namespace(go_id)
    relation = annotation.relation

    state = RepairState.OKAY
    permitted = set()
    checked_annotation = annotation

    if go_id == "GO:0005554":
        enables = association.Curie(namespace="RO", identity="0002327")
        if relation != enables:
            checked_annotation = retarget(enables)
            permitted = {enables}
            state = RepairState.REPAIRED

    elif aspect == "molecular_function":
        if relation not in self.allowed_mf:
            checked_annotation = retarget(
                association.Curie(namespace="RO", identity="0002327"))
            permitted = self.allowed_mf
            state = RepairState.REPAIRED

    elif go_id == "GO:0008150":
        involved_in = association.Curie(namespace="RO", identity="0002331")
        if relation != involved_in:
            checked_annotation = retarget(involved_in)
            permitted = {involved_in}
            state = RepairState.REPAIRED

    elif aspect == "biological_process":
        if relation not in self.allowed_bp:
            checked_annotation = retarget(association.Curie("RO", "0002264"))
            permitted = self.allowed_bp
            state = RepairState.REPAIRED

    elif go_id == "GO:0005575":
        is_active_in = association.Curie(namespace="RO", identity="0002432")
        if relation != is_active_in:
            checked_annotation = retarget(is_active_in)
            permitted = {is_active_in}
            state = RepairState.REPAIRED

    elif aspect == "cellular_component":
        complex_terms = self.make_protein_complex_descendents_if_not_present(
            config.ontology)
        if go_id in complex_terms:
            if relation not in self.allowed_cc_complex:
                permitted = self.allowed_cc_complex
                if relation in self.repairable_cc_complex:
                    checked_annotation = retarget(
                        association.Curie(namespace="BFO", identity="0000050"))
                    state = RepairState.REPAIRED
                else:
                    # Not repairable to part_of, so the annotation is left
                    # untouched and the check fails.
                    checked_annotation = annotation
                    state = RepairState.FAILED
        elif relation not in self.allowed_cc_other:
            checked_annotation = retarget(
                association.Curie(namespace="RO", identity="0001025"))
            permitted = self.allowed_cc_other
            state = RepairState.REPAIRED

    else:
        # The term is in none of the three main GO branches, or has no
        # namespace defined. Pass it along as if the ontology were missing.
        note = "{}: {}".format(self.message(state), "GO term has no namespace")
        return TestResult(
            repair_result(RepairState.OKAY, self.fail_mode),
            note,
            checked_annotation)

    permitted_text = ", ".join(str(curie) for curie in permitted)
    note = "{}: {} should be one of {}".format(
        self.message(state), relation, permitted_text)
    return TestResult(
        repair_result(state, self.fail_mode), note, checked_annotation)
def __init__(self):
    """Register GORULE:0000061 with FailMode.HARD and build its relation sets."""
    super().__init__(
        "GORULE:0000061",
        "Only certain gene product to term relations are allowed for a given GO term",
        FailMode.HARD,
    )

    def ro(identity):
        # Fresh RO-namespace Curie on each call, so no instance is shared between sets.
        return association.Curie(namespace="RO", identity=identity)

    # Starts out empty; presumably filled in lazily elsewhere in the class -- confirm.
    self.protein_containing_complex_descendents = None

    # Relations accepted as-is for molecular function terms.
    self.allowed_mf = {ro("0002327"), ro("0002326")}

    # Relations accepted as-is for biological process terms.
    self.allowed_bp = {
        ro("0002331"),
        ro("0002264"),
        ro("0004032"),
        ro("0004033"),
        ro("0002263"),
        ro("0004034"),
        ro("0004035"),
    }

    # Protein-complex terms: only BFO:0000050 is accepted; the two RO
    # relations below are the ones eligible for repair.
    self.allowed_cc_complex = {association.Curie("BFO", "0000050")}
    self.repairable_cc_complex = {ro("0002432"), ro("0001025")}

    # Relations accepted as-is for all other cellular component terms.
    self.allowed_cc_other = {ro("0001025"), ro("0002432"), ro("0002325")}