Exemple #1
0
def test_gaf_2_1_simple_terms():
    """Check the relation chosen by `upgrade_empty_qualifier` for three GO terms.

    Each case parses the same SGD GAF line (empty qualifier column) with a
    different GO id and asserts the RO relation placed first in the upgraded
    association's qualifiers.
    """
    leading_columns = ["SGD", "S000000819", "AFG3", ""]
    trailing_columns = [
        "PMID:8681382|SGD_REF:S000055187", "IMP", "", "P",
        "Mitochondrial inner membrane m-AAA protease component",
        "YER017C|AAA family ATPase AFG3|YTA10", "gene",
        "taxon:559292", "20170428", "SGD",
    ]
    # (GO id in column 5, identity of the expected RO relation)
    cases = [
        ("GO:0006259", "0002264"),
        ("GO:0042393", "0002327"),
        ("GO:0005773", "0001025"),
    ]

    for go_id, expected_identity in cases:
        columns = leading_columns + [go_id] + trailing_columns
        # A fresh ontology and parser per case, so no state carries over.
        slim = OntologyFactory().create("tests/resources/goslim_generic.json")
        parser = GafParser(config=assocparser.AssocParserConfig(ontology=slim))
        parser.make_internal_cell_component_closure()

        result = gafparser.to_association(columns)
        upgraded = parser.upgrade_empty_qualifier(result.associations[0])
        assert upgraded.qualifiers[0] == association.Curie(namespace="RO", identity=expected_identity)
Exemple #2
0
def test_object_extensions():
    """A `part_of(X:1)` annotation extension parses into a single BFO:0000050 extension unit."""
    parser = GafParser()
    gaf_row = "PomBase\tSPAC25B8.17\typf1\t\tGO:0000007\tGO_REF:0000024\tISO\tSGD:S000001583\tC\tintramembrane aspartyl protease of the perinuclear ER membrane Ypf1 (predicted)\tppp81\tprotein\ttaxon:4896\t20181024\tPomBase\tpart_of(X:1)\tUniProtKB:P12345"
    result = parser.parse_line(gaf_row)
    # Dump the report so a failure shows why the line was rejected or repaired.
    print(parser.report.to_markdown())

    assert len(result.associations[0].object_extensions) > 0

    part_of_x1 = association.ExtensionUnit(association.Curie("BFO", "0000050"), association.Curie("X", "1"))
    expected = [association.ConjunctiveSet([part_of_x1])]
    assert result.associations[0].object_extensions == expected
Exemple #3
0
def test_gorule39():
    """GoRule39 errors only for a ComplexPortal subject annotated to GO:0032991."""
    annotation = make_annotation(db="ComplexPortal", goid="GO:0032991").associations[0]

    def outcome():
        # Fresh rule and default config on every evaluation, as the rule is re-run after each mutation.
        return qc.GoRule39().test(annotation, assocparser.AssocParserConfig()).result_type

    # ComplexPortal subject + GO:0032991 object is rejected.
    assert outcome() == qc.ResultType.ERROR

    # Same object, but a non-ComplexPortal subject passes.
    annotation.subject.id = association.Curie("FB", "1234")
    assert outcome() == qc.ResultType.PASS

    # ComplexPortal subject with a different GO term passes.
    annotation.subject.id = association.Curie("ComplexPortal", "12345")
    annotation.object.id = association.Curie("GO", "0000023")
    assert outcome() == qc.ResultType.PASS
Exemple #4
0
def test_errors_gaf():
    """Parse `errors.gaf` and check the reported messages and surviving associations.

    Expects 13 report messages (exactly one of type INVALID_IDSPACE) and two
    parsed associations. Any surviving object extension must be the single
    unit `BFO:0000050(X:1)`, i.e. invalid extension expressions were dropped.
    """
    config = assocparser.AssocParserConfig(ecomap=EcoMap())
    p = GafParser(config=config)
    # Use a context manager so the fixture file is closed even if parsing raises.
    # `len(assocs)` is asserted below, so the result is a sized container and is
    # fully built before the file is closed.
    with open("tests/resources/errors.gaf", "r") as gaf_file:
        assocs = p.parse(gaf_file, skipheader=True)
    msgs = p.report.messages
    print(json.dumps(p.report.to_report_json(), indent=4))
    n_invalid_idspace = 0
    for m in msgs:
        print("MESSAGE: {}".format(m))
        if m['type'] == assocparser.Report.INVALID_IDSPACE:
            n_invalid_idspace += 1
    assert len(msgs) == 13
    assert n_invalid_idspace == 1
    assert len(assocs) == 2

    w = GafWriter()
    w.write(assocs)
    for a in assocs:
        if a.object_extensions != []:
            # our test file has no ORs, so in DNF this is always the first
            xs = a.object_extensions[0].elements
            print(xs)
            for x in xs:
                print('X: {}'.format(x))
                # ensure that invalid expressions have been eliminated
                assert x.relation == association.Curie("BFO", "0000050")
                assert x.term == association.Curie.from_str('X:1')
            assert len(xs) == 1
Exemple #5
0
def protein_complex_sublcass_closure(ontology: Ontology) -> Set[str]:
    """Return the ids of GO:0032991 descendants reachable over `subClassOf`.

    The query is made with ``reflexive=True``, so the root term itself is
    presumably part of the result (depends on `Ontology.descendants`).

    :param ontology: ontology to query for descendants
    :return: set of term ids returned by the descendants query
    """
    complex_root = str(association.Curie(namespace="GO", identity="0032991"))
    subclass_ids = ontology.descendants(complex_root,
                                        relations=["subClassOf"],
                                        reflexive=True)
    return set(subclass_ids)
Exemple #6
0
def test_gaf_2_1_upconvert_in_parse():
    """Parsing a GAF 2.1 stream with an empty qualifier yields relation BFO:0000050."""
    gaf_text = "!gaf-version: 2.1\nSGD\tS000000819\tAFG3\t\tGO:0005840\tPMID:8681382|SGD_REF:S000055187\tIMP\t\tP\tMitochondrial inner membrane m-AAA protease component\tYER017C|AAA family ATPase AFG3|YTA10\tgene\ttaxon:559292\t20170428\tSGD"
    stream = io.StringIO(gaf_text)
    slim = OntologyFactory().create("tests/resources/goslim_generic.json")
    parser = GafParser(config=assocparser.AssocParserConfig(ontology=slim))

    # Version header says 2.1, the qualifier column is blank and an ontology is
    # configured, so the parser is expected to upgrade the relation itself.
    parsed = parser.parse(stream, skipheader=True)
    expected_relation = association.Curie(namespace="BFO", identity="0000050")
    assert parsed[0].relation == expected_relation
Exemple #7
0
def obo_uri_to_curie(uri: str):
    """
    Build a Curie from an OBO-style URI shaped like <base>/<namespace>_<local_id>.

    Only the text after the last "/" is used; it is split once on the first
    underscore into the namespace and the local identifier.

    Raises IndexError when `uri` contains no "/", and ValueError when the last
    path segment contains no "_".
    """
    base_and_tail = uri.rsplit("/", maxsplit=1)
    tail = base_and_tail[1]  # the base is discarded
    # maxsplit=1: any further underscores stay inside the local identifier.
    namespace, local_id = tail.split("_", maxsplit=1)
    return association.Curie(namespace, local_id)
Exemple #8
0
    def upgrade_empty_qualifier(
            self,
            assoc: association.GoAssociation) -> association.GoAssociation:
        """
        Pick a default relation for a GAF 2.1 association whose qualifier column was empty.

        Background: https://github.com/geneontology/go-site/issues/1558

        The first matching rule wins:
            1. term is exactly GO:0008150            -> `involved_in` (RO:0002331)
            2. term is exactly GO:0008372            -> `is_active_in` (RO:0002432)
            3. namespace is molecular_function       -> `enables` (RO:0002327)
            4. namespace is biological_process       -> `acts_upstream_of_or_within` (RO:0002264)
            5. namespace is cellular_component:
                 term in `self.cell_component_descendants_closure` -> `part_of` (BFO:0000050)
                 otherwise                                         -> `located_in` (RO:0001025)

        When no rule matches, the association is left untouched. In every case a
        GORULE:0000059 warning is written to `self.report`.

        :param assoc: GoAssociation; its `qualifiers` and `relation` are replaced in place
        :return: the same (possibly upgraded) GoAssociation
        """
        go_id = str(assoc.object.id)
        aspect = self.config.ontology.obo_namespace(go_id)

        # Exact-term rules are checked before the namespace-wide ones.
        if go_id == "GO:0008150":
            picked = ("RO", "0002331")  # involved_in
        elif go_id == "GO:0008372":
            picked = ("RO", "0002432")  # is_active_in
        elif aspect == "molecular_function":
            picked = ("RO", "0002327")  # enables
        elif aspect == "biological_process":
            picked = ("RO", "0002264")  # acts_upstream_of_or_within
        elif aspect == "cellular_component":
            if go_id in self.cell_component_descendants_closure:
                picked = ("BFO", "0000050")  # part_of
            else:
                picked = ("RO", "0001025")  # located_in
        else:
            picked = None

        if picked is not None:
            prefix, local_id = picked
            upgraded = association.Curie(namespace=prefix, identity=local_id)
            # The qualifier list and the relation share the same Curie object.
            assoc.qualifiers = [upgraded]
            assoc.relation = upgraded

        message = "GORULE:0000059 Upgrading qualifier/relation to {} when reading GAF 2.1".format(assoc.relation)
        self.report.warning(
            assoc.source_line,
            Report.INVALID_QUALIFIER,
            "EMPTY",
            message,
            taxon=str(assoc.subject.taxon),
            rule=59)
        return assoc
Exemple #9
0
def test_relations_curie_contract():
    """An OBO purl contracts to the Curie made of its namespace and local id."""
    purl = "http://purl.obolibrary.org/obo/GO_1234567"
    contracted = relations.obo_uri_to_curie(purl)
    expected = association.Curie(namespace="GO", identity="1234567")
    assert contracted == expected
Exemple #10
0
def test_gorule61():
    """Run GoRule61 over term/qualifier combinations and check pass, repair and error outcomes."""
    config = all_rules_config(ontology=ontology)

    def annotate(goid, qualifier, evidence="ECO:0000320", from_gaf=False, version="1.2"):
        # Most cases differ only in term and qualifier; the rest defaults to a 1.2 (non-GAF) line.
        return make_annotation(goid=goid,
                               qualifier=qualifier,
                               evidence=evidence,
                               from_gaf=from_gaf,
                               version=version)

    def apply_rule(parsed):
        return qc.GoRule61().test(parsed.associations[0], config)

    # GO:0005554 with `enables`: parses without gorule-0000001 messages and passes
    parsed = annotate("GO:0005554", "enables")
    assert parsed.report.reporter.messages.get("gorule-0000001", []) == []
    outcome = apply_rule(parsed)
    assert outcome.result_type == qc.ResultType.PASS

    # Using `contributes_to`, but should be repaired to RO:0002327 enables
    outcome = apply_rule(annotate("GO:0005554", "contributes_to"))
    assert outcome.result.relation == association.Curie("RO", "0002327")
    assert outcome.result_type == qc.ResultType.WARNING

    # BP term, qualifier inside allowed BP set
    outcome = apply_rule(annotate("GO:0016192", "acts_upstream_of_or_within"))
    assert outcome.result_type == qc.ResultType.PASS

    # BP term, unallowed relation, Repair
    outcome = apply_rule(annotate("GO:0016192", "enables"))
    assert outcome.result_type == qc.ResultType.WARNING
    assert outcome.result.relation == association.Curie("RO", "0002264")

    # CC complex term, unallowed relation, unrepairable, causes error
    outcome = apply_rule(annotate("GO:0032991", "enables"))
    assert outcome.result_type == qc.ResultType.ERROR

    # CC root repairs to is_active_in
    outcome = apply_rule(annotate("GO:0005575",
                                  "located_in",
                                  evidence="ND",
                                  from_gaf=True,
                                  version="2.2"))
    assert outcome.result_type == qc.ResultType.WARNING
    # Active in, rather than located_in
    assert outcome.result.relation == association.Curie(namespace="RO",
                                                        identity="0002432")

    # protein complex + repairable relation repairs to part_of
    outcome = apply_rule(annotate("GO:0032991", "is_active_in"))
    assert outcome.result_type == qc.ResultType.WARNING
    assert outcome.result.relation == association.Curie(namespace="BFO",
                                                        identity="0000050")
Exemple #11
0
def to_association(gaf_line: List[str], report=None, group="unknown", dataset="unknown", qualifier_parser=assocparser.Qualifier2_1(), bio_entities=None) -> assocparser.ParseResult:
    """
    Convert one GAF line, already split into columns, into a ParseResult.

    On success the result carries exactly one GoAssociation. On any validation
    failure the problem is recorded in `report` and the result carries no
    associations.

    :param gaf_line: column values of a single GAF line. 15 or 16 columns are
        padded to 17 and anything beyond 17 is dropped (with a warning). The
        caller's list is not modified.
    :param report: Report to log to; a new one is built from `group` and
        `dataset` when None
    :param group: group name for a newly created Report
    :param dataset: dataset name for a newly created Report
    :param qualifier_parser: validator applied to the qualifier column (column 4)
    :param bio_entities: lookup of known subjects by Curie; when an entry exists
        and differs from the subject built from the line, the entry replaces it
    :return: assocparser.ParseResult for this line
    """
    report = Report(group=group, dataset=dataset) if report is None else report
    bio_entities = collections.BioEntities(dict()) if bio_entities is None else bio_entities
    source_line = "\t".join(gaf_line)

    def rejected() -> assocparser.ParseResult:
        # Shared result for every failure path; the cause has already been written to `report`.
        return assocparser.ParseResult(source_line, [], True, report=report)

    if source_line == "":
        report.error(source_line, "Blank Line", "EMPTY", "Blank lines are not allowed", rule=1)
        return rejected()

    if len(gaf_line) > 17:
        # If we see more than 17 columns, we will just cut off the columns after column 17
        report.warning(source_line, assocparser.Report.WRONG_NUMBER_OF_COLUMNS, "",
            msg="There were more than 17 columns in this line. Proceeding by cutting off extra columns after column 17.",
            rule=1)
        gaf_line = gaf_line[:17]

    if 17 > len(gaf_line) >= 15:
        # Build a new list rather than `+=`, which would extend the caller's list in place.
        gaf_line = gaf_line + [""] * (17 - len(gaf_line))

    if len(gaf_line) != 17:
        report.error(source_line, assocparser.Report.WRONG_NUMBER_OF_COLUMNS, "",
            msg="There were {columns} columns found in this line, and there should be 15 (for GAF v1) or 17 (for GAF v2)".format(columns=len(gaf_line)), rule=1)
        return rejected()

    ## check for missing columns
    ## We use indices here because we run GO RULES before we split the vals into individual variables
    DB_INDEX = 0
    DB_OBJECT_INDEX = 1
    TAXON_INDEX = 12
    REFERENCE_INDEX = 5
    if gaf_line[DB_INDEX] == "":
        report.error(source_line, Report.INVALID_IDSPACE, "EMPTY", "col1 is empty", taxon=gaf_line[TAXON_INDEX], rule=1)
        return rejected()
    if gaf_line[DB_OBJECT_INDEX] == "":
        report.error(source_line, Report.INVALID_ID, "EMPTY", "col2 is empty", taxon=gaf_line[TAXON_INDEX], rule=1)
        return rejected()
    if gaf_line[REFERENCE_INDEX] == "":
        report.error(source_line, Report.INVALID_ID, "EMPTY", "reference column 6 is empty", taxon=gaf_line[TAXON_INDEX], rule=1)
        return rejected()

    parsed_taxons_result = gaf_line_validators["taxon"].validate(gaf_line[TAXON_INDEX])  # type: assocparser.ValidateResult
    if not parsed_taxons_result.valid:
        report.error(source_line, Report.INVALID_TAXON, parsed_taxons_result.original, parsed_taxons_result.message, taxon=parsed_taxons_result.original, rule=1)
        return rejected()

    taxon = parsed_taxons_result.parsed[0]

    date = assocparser.parse_date(gaf_line[13], report, source_line)
    if date is None:
        return rejected()

    # A second entry in the taxon column is the interacting taxon.
    interacting_taxon = parsed_taxons_result.parsed[1] if len(parsed_taxons_result.parsed) == 2 else None
    subject_curie = association.Curie(gaf_line[0], gaf_line[1])
    subject = association.Subject(subject_curie, gaf_line[2], [gaf_line[9]], gaf_line[10].split("|"), [association.map_gp_type_label_to_curie(gaf_line[11])], taxon)
    gpi_entity = bio_entities.get(subject_curie)
    if gpi_entity is not None and subject != gpi_entity:
        subject = gpi_entity

    # column 4 is qualifiers -> index 3
    # For allowed, see http://geneontology.org/docs/go-annotations/#annotation-qualifiers
    # The validator below only checks that the qualifiers are valid; the values stored on the
    # GoAssociation come from `_parse_qualifier` further down, which separates out `NOT`.
    # This is confusing, and can be fixed later by consolidating qualifier and relation in GoAssociation.
    parsed_qualifiers = qualifier_parser.validate(gaf_line[3])
    if not parsed_qualifiers.valid:
        report.error(source_line, Report.INVALID_QUALIFIER, parsed_qualifiers.original, parsed_qualifiers.message, taxon=gaf_line[TAXON_INDEX], rule=1)
        return rejected()

    aspect = gaf_line[8]
    negated, relation_label, qualifiers = assocparser._parse_qualifier(gaf_line[3], aspect)
    # Note: Relation label is grabbed from qualifiers, if any exist in _parse_qualifier
    qualifiers = [association.Curie.from_str(curie_util.contract_uri(relations.lookup_label(q), strict=False)[0]) for q in qualifiers]

    # Check the parsed CURIE itself: the Term wrapper is never an Error, so testing
    # the wrapper would let an unparseable GO id through.
    go_term = association.Curie.from_str(gaf_line[4])
    if isinstance(go_term, association.Error):
        report.error(source_line, Report.INVALID_SYMBOL, gaf_line[4], "Problem parsing GO Term", taxon=gaf_line[TAXON_INDEX], rule=1)
        return rejected()
    term = association.Term(go_term, taxon)

    # References
    references = [association.Curie.from_str(e) for e in gaf_line[5].split("|") if e]
    for r in references:
        if isinstance(r, association.Error):
            report.error(source_line, Report.INVALID_SYMBOL, gaf_line[5], "Problem parsing references", taxon=gaf_line[TAXON_INDEX], rule=1)
            return rejected()

    # The first GO_REF reference (or None when there is none) is passed along with the evidence code.
    gorefs = [ref for ref in references if ref.namespace == "GO_REF"] + [None]
    eco_curie = ecomap.coderef_to_ecoclass(gaf_line[6], reference=gorefs[0])
    if eco_curie is None:
        report.error(source_line, Report.UNKNOWN_EVIDENCE_CLASS, gaf_line[6], msg="Expecting a known ECO GAF code, e.g ISS", rule=1)
        return rejected()

    withfroms = association.ConjunctiveSet.str_to_conjunctions(gaf_line[7])
    if isinstance(withfroms, association.Error):
        report.error(source_line, Report.INVALID_SYMBOL, gaf_line[7], "Problem parsing with/from", taxon=gaf_line[TAXON_INDEX], rule=1)
        return rejected()

    evidence_type = association.Curie.from_str(eco_curie)
    if isinstance(evidence_type, association.Error):
        report.error(source_line, Report.INVALID_SYMBOL, gaf_line[6], "Problem parsing evidence type", taxon=gaf_line[TAXON_INDEX], rule=1)
        # Stop here: an Error must not end up as the evidence type of an association.
        return rejected()

    evidence = association.Evidence(evidence_type, references, withfroms)
    # Defensive re-check of the references as stored on the Evidence object.
    reference_errors = [e for e in evidence.has_supporting_reference if isinstance(e, association.Error)]
    if reference_errors:
        report.error(source_line, Report.INVALID_SYMBOL, gaf_line[5], reference_errors[0].info, taxon=str(taxon), rule=1)
        return rejected()

    subject_extensions = []
    if gaf_line[16]:
        subject_filler = association.Curie.from_str(gaf_line[16])
        if isinstance(subject_filler, association.Error):
            report.error(source_line, assocparser.Report.INVALID_ID, gaf_line[16], subject_filler.info, taxon=str(taxon), rule=1)
            return rejected()
        # filler is not an Error, so keep moving
        subject_extensions.append(association.ExtensionUnit(association.Curie.from_str("rdfs:subClassOf"), subject_filler))

    conjunctions = []
    if gaf_line[15]:
        conjunctions = association.ConjunctiveSet.str_to_conjunctions(
            gaf_line[15],
            conjunct_element_builder=lambda el: association.ExtensionUnit.from_str(el))

        if isinstance(conjunctions, association.Error):
            report.error(source_line, Report.EXTENSION_SYNTAX_ERROR, conjunctions.info, "extensions should be relation(curie) and relation should have corresponding URI", taxon=str(taxon), rule=1)
            return rejected()

    relation_uri = relations.lookup_label(relation_label)
    if relation_uri is None:
        report.error(source_line, assocparser.Report.INVALID_QUALIFIER, relation_label, "Could not find CURIE for relation `{}`".format(relation_label), taxon=str(taxon), rule=1)
        return rejected()

    # We don't have to check that this is well formed because we're grabbing it from the known relations URI map.
    relation_curie = association.Curie.from_str(curie_util.contract_uri(relation_uri)[0])

    a = association.GoAssociation(
        # Re-joined after truncation/padding, so this may differ from the raw `source_line`.
        source_line="\t".join(gaf_line),
        subject=subject,
        relation=relation_curie,
        object=term,
        negated=negated,
        qualifiers=qualifiers,
        aspect=aspect,
        interacting_taxon=interacting_taxon,
        evidence=evidence,
        subject_extensions=subject_extensions,
        object_extensions=conjunctions,
        provided_by=gaf_line[14],
        date=date,
        properties={})

    return assocparser.ParseResult(source_line, [a], False, report=report)
Exemple #12
0
def from_2_0(gpad_line: List[str],
             report=None,
             group="unknown",
             dataset="unknown",
             bio_entities=None):
    """
    Convert one GPAD 2.0 line, already split into columns, into a ParseResult.

    On success the result carries exactly one GoAssociation. On any validation
    failure the problem is recorded in `report` and the result carries no
    associations.

    :param gpad_line: column values of a single GPAD line. 10 or 11 columns are
        padded to 12 and anything beyond 12 is dropped (with a warning). The
        caller's list is not modified.
    :param report: Report to log to; a new one is built from `group` and
        `dataset` when None
    :param group: group name for a newly created Report
    :param dataset: dataset name for a newly created Report
    :param bio_entities: optional lookup of known subjects by Curie; when an
        entry is found it replaces the placeholder subject and supplies the taxon
    :return: assocparser.ParseResult for this line
    """
    # `report` defaults to None, so create one before any report.error/warning call.
    report = Report(group=group, dataset=dataset) if report is None else report
    source_line = "\t".join(gpad_line)

    def rejected():
        # Shared result for every failure path; the cause has already been written to `report`.
        return assocparser.ParseResult(source_line, [], True, report=report)

    if source_line == "":
        report.error(source_line,
                     "Blank Line",
                     "EMPTY",
                     "Blank lines are not allowed",
                     rule=1)
        return rejected()

    if len(gpad_line) > 12:
        report.warning(
            source_line,
            assocparser.Report.WRONG_NUMBER_OF_COLUMNS,
            "",
            msg=
            "There were more than 12 columns in this line. Proceeding by cutting off extra columns.",
            rule=1)

        gpad_line = gpad_line[:12]

    if 12 > len(gpad_line) >= 10:
        # Build a new list rather than `+=`, which would extend the caller's list in place.
        gpad_line = gpad_line + [""] * (12 - len(gpad_line))

    if len(gpad_line) != 12:
        report.error(
            source_line,
            assocparser.Report.WRONG_NUMBER_OF_COLUMNS,
            "",
            msg=
            "There were {columns} columns found in this line, and there should be between 10 and 12"
            .format(columns=len(gpad_line)),
            rule=1)
        return rejected()

    ## check for missing columns
    ## We use indices here because we run GO RULES before we split the vals into individual variables
    SUBJECT_CURIE = 0
    RELATION = 2
    ONTOLOGY_CLASS_INDEX = 3
    REFERENCE_INDEX = 4
    EVIDENCE_INDEX = 5
    DATE_INDEX = 8
    ASSIGNED_BY_INDEX = 9
    required = [
        SUBJECT_CURIE, RELATION, ONTOLOGY_CLASS_INDEX, REFERENCE_INDEX,
        EVIDENCE_INDEX, DATE_INDEX, ASSIGNED_BY_INDEX
    ]
    for req in required:
        if gpad_line[req] == "":
            report.error(source_line,
                         Report.INVALID_ID,
                         "EMPTY",
                         # Columns are reported 1-based.
                         "Column {} is empty".format(req + 1),
                         rule=1)
            return rejected()

    # Placeholder taxon, used unless a matching entity in `bio_entities` supplies one.
    taxon = association.Curie("NCBITaxon", "0")
    subject_curie = association.Curie.from_str(gpad_line[SUBJECT_CURIE])
    if subject_curie.is_error():
        report.error(source_line,
                     Report.INVALID_SYMBOL,
                     gpad_line[SUBJECT_CURIE],
                     "Problem parsing DB Object",
                     taxon=str(taxon),
                     rule=1)
        return rejected()

    subject = association.Subject(subject_curie, "", "", [], "", taxon)
    # `bio_entities` defaults to None; only look the subject up when a collection was given.
    entity = bio_entities.get(subject_curie) if bio_entities is not None else None
    if entity is not None:
        # If we found a subject entity, then set `subject` to the found entity
        subject = entity
        taxon = subject.taxon

    negated = gpad_line[1] == "NOT"

    relation = association.Curie.from_str(gpad_line[RELATION])
    if relation.is_error():
        report.error(source_line,
                     Report.INVALID_SYMBOL,
                     gpad_line[RELATION],
                     "Problem parsing Relation",
                     taxon=str(taxon),
                     rule=1)
        return rejected()

    go_term = association.Curie.from_str(gpad_line[ONTOLOGY_CLASS_INDEX])
    if go_term.is_error():
        report.error(source_line,
                     Report.INVALID_SYMBOL,
                     gpad_line[ONTOLOGY_CLASS_INDEX],
                     "Problem parsing GO Term",
                     taxon=str(taxon),
                     rule=1)
        return rejected()

    term = association.Term(go_term, taxon)

    evidence_type = association.Curie.from_str(gpad_line[EVIDENCE_INDEX])
    if evidence_type.is_error():
        report.error(source_line,
                     Report.INVALID_SYMBOL,
                     gpad_line[EVIDENCE_INDEX],
                     "Problem parsing Evidence ECO Curie",
                     taxon=str(taxon),
                     rule=1)
        return rejected()

    references = [
        association.Curie.from_str(e)
        for e in gpad_line[REFERENCE_INDEX].split("|") if e
    ]
    for r in references:
        if r.is_error():
            report.error(source_line,
                         Report.INVALID_SYMBOL,
                         gpad_line[REFERENCE_INDEX],
                         "Problem parsing references",
                         taxon=str(taxon),
                         rule=1)
            return rejected()

    withfroms = association.ConjunctiveSet.str_to_conjunctions(
        gpad_line[6])  # Returns a list of ConjunctiveSets or Error
    if isinstance(withfroms, association.Error):
        report.error(source_line,
                     Report.INVALID_SYMBOL,
                     gpad_line[6],
                     "Problem parsing With/From column",
                     taxon=str(taxon),
                     rule=1)
        return rejected()

    evidence = association.Evidence(evidence_type, references, withfroms)

    interacting_taxon = None
    if gpad_line[7] != "":
        interacting_taxon = association.Curie.from_str(gpad_line[7])
        if interacting_taxon.is_error():
            report.error(source_line,
                         Report.INVALID_SYMBOL,
                         gpad_line[7],
                         "Problem parsing Interacting Taxon",
                         taxon=str(taxon),
                         rule=1)
            return rejected()

    date = assocparser.parse_iso_date(gpad_line[DATE_INDEX], report,
                                      source_line)
    if date is None:
        return rejected()

    conjunctions = []
    # The elements of the extension units are Curie(Curie)
    if gpad_line[10]:
        conjunctions = association.ConjunctiveSet.str_to_conjunctions(
            gpad_line[10],
            conjunct_element_builder=lambda el: association.ExtensionUnit.
            from_curie_str(el))

        if isinstance(conjunctions, association.Error):
            report.error(source_line,
                         Report.EXTENSION_SYNTAX_ERROR,
                         conjunctions.info,
                         "extensions should be relation(curie)",
                         taxon=str(taxon),
                         rule=1)
            return rejected()

    properties_list = association.parse_annotation_properties(gpad_line[11])

    a = association.GoAssociation(source_line=source_line,
                                  subject=subject,
                                  relation=relation,
                                  object=term,
                                  negated=negated,
                                  # The relation column doubles as the only qualifier here.
                                  qualifiers=[relation],
                                  aspect=None,
                                  interacting_taxon=interacting_taxon,
                                  evidence=evidence,
                                  subject_extensions=[],
                                  object_extensions=conjunctions,
                                  provided_by=gpad_line[9],
                                  date=date,
                                  properties=properties_list)

    return assocparser.ParseResult(source_line, [a], False, report=report)
Exemple #13
0
def from_1_2(gpad_line: List[str],
             report=None,
             group="unknown",
             dataset="unknown",
             bio_entities=None):
    """Turn one GPAD 1.2 line, already split into columns, into an association.

    The line is validated column by column.  The first problem found is
    recorded through ``report.error`` and parsing stops; only a surplus of
    columns is tolerated (recorded through ``report.warning`` and truncated).

    Args:
        gpad_line: The column values of a single line.  Between 10 and 12
            columns are accepted; shorter-than-12 lines are padded with empty
            strings.  The list passed in is never modified.
        report: Object whose ``error``/``warning`` methods receive every
            problem.  It defaults to ``None``, but any malformed line calls
            ``report.error``, so callers should supply one.
        group: Not used by this function.
        dataset: Not used by this function.
        bio_entities: Optional lookup with a ``get(curie)`` method.  When it
            returns an entity for the subject curie, that entity replaces the
            placeholder subject and supplies the taxon.  ``None`` means no
            lookup is performed.

    Returns:
        ``assocparser.ParseResult``.  On success it carries a one-element
        list with the ``association.GoAssociation`` and ``False`` as its third
        argument; on any error it carries an empty list and ``True``.
    """
    source_line = "\t".join(gpad_line)

    def _fail():
        # Every rejected line yields the same result: no associations and
        # True as the third positional argument.
        return assocparser.ParseResult(source_line, [], True, report=report)

    if source_line == "":
        report.error(source_line,
                     "Blank Line",
                     "EMPTY",
                     "Blank lines are not allowed",
                     rule=1)
        return _fail()

    if len(gpad_line) > 12:
        report.warning(
            source_line,
            assocparser.Report.WRONG_NUMBER_OF_COLUMNS,
            "",
            msg=
            "There were more than 12 columns in this line. Proceeding by cutting off extra columns.",
            rule=1)

        gpad_line = gpad_line[:12]

    if 12 > len(gpad_line) >= 10:
        # Build a new list instead of ``+=`` so the caller's list is not
        # extended as a side effect.
        gpad_line = gpad_line + [""] * (12 - len(gpad_line))

    if len(gpad_line) != 12:
        report.error(
            source_line,
            assocparser.Report.WRONG_NUMBER_OF_COLUMNS,
            "",
            msg=
            "There were {columns} columns found in this line, and there should be between 10 and 12"
            .format(columns=len(gpad_line)),
            rule=1)
        return _fail()

    # 0-based column positions, named after how each column is used below.
    # We use indices here because we run GO RULES before we split the vals
    # into individual variables.
    DB_INDEX = 0
    DB_OBJECT_INDEX = 1
    QUALIFIER = 2
    GO_TERM_INDEX = 3
    REFERENCE_INDEX = 4
    EVIDENCE_INDEX = 5
    WITH_FROM_INDEX = 6
    INTERACTING_TAXON_INDEX = 7
    DATE_INDEX = 8
    PROVIDED_BY_INDEX = 9
    EXTENSION_INDEX = 10
    PROPERTIES_INDEX = 11

    # Columns that must not be empty, checked in this order.  Driving the
    # checks from one table guarantees that every failure also returns.
    required_columns = [
        (DB_INDEX, Report.INVALID_IDSPACE, "col1 is empty"),
        (DB_OBJECT_INDEX, Report.INVALID_ID, "col2 is empty"),
        (QUALIFIER, Report.INVALID_QUALIFIER, "qualifier column is empty"),
        (REFERENCE_INDEX, Report.INVALID_ID, "reference column is empty"),
        (EVIDENCE_INDEX, Report.INVALID_ID, "Evidence column is empty"),
    ]
    for column, error_type, message in required_columns:
        if gpad_line[column] == "":
            report.error(source_line, error_type, "EMPTY", message, rule=1)
            return _fail()

    # Placeholder taxon and subject, used unless bio_entities knows better.
    taxon = association.Curie("NCBITaxon", "0")
    subject_curie = association.Curie(gpad_line[DB_INDEX],
                                      gpad_line[DB_OBJECT_INDEX])
    subject = association.Subject(subject_curie, "", [""], [], [], taxon)

    entity = None
    if bio_entities is not None:
        entity = bio_entities.get(subject_curie)
    if entity is not None:
        subject = entity
        taxon = subject.taxon

    go_term = association.Curie.from_str(gpad_line[GO_TERM_INDEX])
    if go_term.is_error():
        report.error(source_line,
                     Report.INVALID_SYMBOL,
                     gpad_line[GO_TERM_INDEX],
                     "Problem parsing GO Term",
                     taxon=str(taxon),
                     rule=1)
        return _fail()

    object = association.Term(go_term, taxon)

    evidence_type = association.Curie.from_str(gpad_line[EVIDENCE_INDEX])
    if evidence_type.is_error():
        report.error(source_line,
                     Report.INVALID_SYMBOL,
                     gpad_line[EVIDENCE_INDEX],
                     "Problem parsing Evidence ECO Curie",
                     taxon=str(taxon),
                     rule=1)
        return _fail()

    # References are pipe-separated; empty pieces are ignored.
    references = [
        association.Curie.from_str(e)
        for e in gpad_line[REFERENCE_INDEX].split("|") if e
    ]
    if any(r.is_error() for r in references):
        report.error(source_line,
                     Report.INVALID_SYMBOL,
                     gpad_line[REFERENCE_INDEX],
                     "Problem parsing references",
                     taxon=str(taxon),
                     rule=1)
        return _fail()

    # Returns a list of ConjunctiveSets or an Error
    withfroms = association.ConjunctiveSet.str_to_conjunctions(
        gpad_line[WITH_FROM_INDEX])
    if isinstance(withfroms, association.Error):
        report.error(source_line,
                     Report.INVALID_SYMBOL,
                     gpad_line[WITH_FROM_INDEX],
                     "Problem parsing With/From column",
                     taxon=str(taxon),
                     rule=1)
        return _fail()

    evidence = association.Evidence(evidence_type, references, withfroms)

    # The column is non-empty (checked above), so there is at least one raw
    # qualifier -- but it may be "NOT" alone, which carries no relation.
    raw_qs = gpad_line[QUALIFIER].split("|")
    negated = "NOT" in raw_qs

    relation_labels = [q for q in raw_qs if q != "NOT"]
    if not relation_labels:
        # Without this check the relation lookup below would index into an
        # empty list and raise IndexError.
        report.error(source_line,
                     Report.INVALID_QUALIFIER,
                     gpad_line[QUALIFIER],
                     "Qualifier column must contain a relation in addition to NOT",
                     taxon=str(taxon),
                     rule=1)
        return _fail()

    looked_up_qualifiers = [relations.lookup_label(q) for q in relation_labels]
    if None in looked_up_qualifiers:
        report.error(source_line,
                     Report.INVALID_QUALIFIER,
                     raw_qs,
                     "Could not find a URI for qualifier",
                     taxon=str(taxon),
                     rule=1)
        return _fail()

    qualifiers = [
        association.Curie.from_str(curie_util.contract_uri(q)[0])
        for q in looked_up_qualifiers
    ]

    # parse_date is handed the report and signals failure by returning None.
    date = assocparser.parse_date(gpad_line[DATE_INDEX], report, source_line)
    if date is None:
        return _fail()

    interacting_taxon = None
    if gpad_line[INTERACTING_TAXON_INDEX]:
        taxon_result = gpad_line_validators["taxon"].validate(
            gpad_line[INTERACTING_TAXON_INDEX])
        if not taxon_result.valid:
            report.error(source_line,
                         Report.INVALID_TAXON,
                         taxon_result.original,
                         taxon_result.message,
                         taxon=str(taxon_result.original),
                         rule=1)
            return _fail()
        interacting_taxon = taxon_result.parsed[0]

    conjunctions = []
    if gpad_line[EXTENSION_INDEX]:
        conjunctions = association.ConjunctiveSet.str_to_conjunctions(
            gpad_line[EXTENSION_INDEX],
            conjunct_element_builder=lambda el: association.ExtensionUnit.
            from_str(el))

        if isinstance(conjunctions, association.Error):
            report.error(source_line,
                         Report.EXTENSION_SYNTAX_ERROR,
                         conjunctions.info,
                         "extensions should be relation(curie)",
                         taxon=str(taxon),
                         rule=1)
            return _fail()

    properties_list = association.parse_annotation_properties(
        gpad_line[PROPERTIES_INDEX])

    # The first (non-NOT) qualifier doubles as the association's relation.
    a = association.GoAssociation(source_line=source_line,
                                  subject=subject,
                                  relation=qualifiers[0],
                                  object=object,
                                  negated=negated,
                                  qualifiers=qualifiers,
                                  aspect=None,
                                  interacting_taxon=interacting_taxon,
                                  evidence=evidence,
                                  subject_extensions=[],
                                  object_extensions=conjunctions,
                                  provided_by=gpad_line[PROVIDED_BY_INDEX],
                                  date=date,
                                  properties=properties_list)

    return assocparser.ParseResult(source_line, [a], False, report=report)
def test_rdfgen_includes_taxon_in_gp_class():
    """The gene product class emitted for an association carries its taxon.

    A fully populated PomBase association is translated through
    ``CamRdfTransform`` into a ``TurtleRdfWriter`` graph; every row returned
    by ``gene_product_class_query()`` must name the PomBase gene product
    class and the NCBITaxon 4896 taxon.
    """
    curie = association.Curie

    gene_product = association.Subject(
        id=curie("PomBase", "SPAC25B8.17"),
        label="ypf1",
        type="protein",
        fullname=
        "intramembrane aspartyl protease of the perinuclear ER membrane Ypf1 (predicted)",
        synonyms=["ppp81"],
        taxon=curie("NCBITaxon", "4896"))

    go_term = association.Term(id=curie("GO", "0000006"),
                               taxon=curie("NCBITaxon", "4896"))

    supporting_evidence = association.Evidence(
        type=curie("ECO", "0000266"),
        has_supporting_reference=[curie("GO_REF", "0000024")],
        with_support_from=[
            association.ConjunctiveSet(
                elements=[curie("SGD", "S000001583")])
        ])

    subject_ext = [
        association.ExtensionUnit(relation=curie("rdfs", "subClassOf"),
                                  term=curie("UniProtKB", "P12345"))
    ]

    # Two conjunctive sets: the first with two extension units, the second
    # with a single one.
    first_conjunction = association.ConjunctiveSet(elements=[
        association.ExtensionUnit(relation=curie("BFO", "0000050"),
                                  term=curie("X", "1")),
        association.ExtensionUnit(relation=curie("BFO", "0000066"),
                                  term=curie("GO", "0016020")),
    ])
    second_conjunction = association.ConjunctiveSet(elements=[
        association.ExtensionUnit(relation=curie("RO", "0002233"),
                                  term=curie("PomBase", "12345")),
    ])

    assoc = association.GoAssociation(
        source_line=
        "PomBase\tSPAC25B8.17\typf1\t\tGO:1990578\tGO_REF:0000024\tISO\tSGD:S000001583\tC\tintramembrane aspartyl protease of the perinuclear ER membrane Ypf1 (predicted)\tppp81\tprotein\ttaxon:4896\t20150305\tPomBase\t\t",
        subject=gene_product,
        object=go_term,
        negated=False,
        qualifiers=[],
        aspect=association.Aspect("C"),
        relation=curie("BFO", "0000050"),
        interacting_taxon=curie("NCBITaxon", "555"),
        evidence=supporting_evidence,
        provided_by=association.Provider("PomBase"),
        date=association.Date("20150305"),
        subject_extensions=subject_ext,
        object_extensions=[first_conjunction, second_conjunction],
        properties={})

    writer = TurtleRdfWriter(label="pombase_single.ttl")
    transform = CamRdfTransform(writer=writer)
    transform.translate(assoc)
    transform.provenance()

    expected_cls = "http://identifiers.org/pombase/SPAC25B8.17"
    expected_taxon = "http://purl.obolibrary.org/obo/NCBITaxon_4896"
    for row in writer.graph.query(gene_product_class_query()):
        assert str(row["cls"]) == expected_cls
        assert str(row["taxon"]) == expected_taxon
Exemple #15
0
    def test(self,
             annotation: association.GoAssociation,
             config: assocparser.AssocParserConfig,
             group=None) -> TestResult:
        """
        Check the annotation's relation against the GO term it annotates and
        repair it where possible.

        Without an ontology in ``config`` the annotation passes untouched.
        Otherwise the branches are tried in this order:

        * Term is GO:0005554: relation must be RO:0002327 "enables",
          otherwise it is repaired to that.
        * Namespace "molecular_function": relation must be in
          ``self.allowed_mf``, otherwise repaired to RO:0002327 "enables".
        * Term is GO:0008150: relation must be RO:0002331 "involved_in",
          otherwise repaired to that.
        * Namespace "biological_process": relation must be in
          ``self.allowed_bp``, otherwise repaired to RO:0002264.
        * Term is GO:0005575: relation must be RO:0002432 "is_active_in",
          otherwise repaired to that.
        * Namespace "cellular_component":
            * term among the protein-containing complex descendants: relation
              must be in ``self.allowed_cc_complex``; a relation in
              ``self.repairable_cc_complex`` is repaired to BFO:0000050
              "part of", any other relation fails.
            * any other term: relation must be in ``self.allowed_cc_other``,
              otherwise repaired to RO:0001025 "located in".
        * Anything else (no recognised namespace): treated as okay.

        A repair works on a deep copy of the annotation and sets both its
        ``relation`` and its ``qualifiers``; the input is never modified.
        """
        ontology = config.ontology
        if ontology is None:
            return TestResult(ResultType.PASS, "", annotation)

        go_id = str(annotation.object.id)
        branch = ontology.obo_namespace(go_id)
        current = annotation.relation

        def curie(prefix, local_id):
            return association.Curie(namespace=prefix, identity=local_id)

        # Verdict of the branch analysis; the defaults mean "leave as is".
        target = None  # relation to rewrite the annotation to, if any
        permitted = set()  # relations quoted in the result message
        failed = False  # set when the relation is wrong and not repairable

        if go_id == "GO:0005554":
            enables = curie("RO", "0002327")
            if current != enables:
                target, permitted = enables, {enables}
        elif branch == "molecular_function":
            if current not in self.allowed_mf:
                target, permitted = curie("RO", "0002327"), self.allowed_mf
        elif go_id == "GO:0008150":
            involved_in = curie("RO", "0002331")
            if current != involved_in:
                target, permitted = involved_in, {involved_in}
        elif branch == "biological_process":
            if current not in self.allowed_bp:
                target, permitted = curie("RO", "0002264"), self.allowed_bp
        elif go_id == "GO:0005575":
            is_active_in = curie("RO", "0002432")
            if current != is_active_in:
                target, permitted = is_active_in, {is_active_in}
        elif branch == "cellular_component":
            complex_terms = self.make_protein_complex_descendents_if_not_present(
                ontology)
            if go_id in complex_terms:
                if current not in self.allowed_cc_complex:
                    permitted = self.allowed_cc_complex
                    if current in self.repairable_cc_complex:
                        target = curie("BFO", "0000050")
                    else:
                        # Not repairable to part_of, so filter
                        failed = True
            elif current not in self.allowed_cc_other:
                target, permitted = curie("RO", "0001025"), self.allowed_cc_other
        else:
            # The term is in none of the three main GO branches, or has no
            # namespace defined: pass it along as if the ontology were
            # missing.
            return TestResult(
                repair_result(RepairState.OKAY, self.fail_mode),
                "{}: {}".format(self.message(RepairState.OKAY),
                                "GO term has no namespace"),
                annotation)

        state = RepairState.OKAY
        outcome = annotation
        if target is not None:
            state = RepairState.REPAIRED
            outcome = copy.deepcopy(annotation)
            outcome.relation = target
            outcome.qualifiers = [target]
        elif failed:
            state = RepairState.FAILED

        permitted_text = ", ".join(str(item) for item in permitted)
        return TestResult(
            repair_result(state, self.fail_mode),
            "{}: {} should be one of {}".format(self.message(state), current,
                                                permitted_text),
            outcome)
Exemple #16
0
    def __init__(self):
        """Register the rule and set up the relation sets it checks against.

        Passes the rule id "GORULE:0000061", its title and ``FailMode.HARD``
        to the parent initializer, then defines one set of relation curies
        per kind of GO term.
        """
        super().__init__(
            "GORULE:0000061",
            "Only certain gene product to term relations are allowed for a given GO term",
            FailMode.HARD)
        # Starts out unset.
        self.protein_containing_complex_descendents = None

        def ro(local_id):
            return association.Curie(namespace="RO", identity=local_id)

        part_of = association.Curie(namespace="BFO", identity="0000050")
        enables = ro("0002327")
        contributes_to = ro("0002326")
        is_active_in = ro("0002432")
        located_in = ro("0001025")
        colocalizes_with = ro("0002325")

        # Molecular function terms.
        self.allowed_mf = {enables, contributes_to}

        # Biological process terms.
        self.allowed_bp = {
            ro(local_id)
            for local_id in ("0002331", "0002264", "0004032", "0004033",
                             "0002263", "0004034", "0004035")
        }

        # Cellular component terms, complex case: the one allowed relation,
        # and the relations that get rewritten to it.
        self.allowed_cc_complex = {part_of}
        self.repairable_cc_complex = {is_active_in, located_in}

        # Cellular component terms, all other cases.
        self.allowed_cc_other = {located_in, is_active_in, colocalizes_with}