Python ParseResult Examples

Programming Language: Python

Namespace/Package Name: ontobio.io.assocparser

Method/Function: ParseResult

Examples at hotexamples.com: 15

Python ParseResult - 15 examples found. These are the top-rated real-world Python examples of ontobio.io.assocparser.ParseResult, extracted from open source projects. You can rate the examples to help us improve their quality.

Example #1

Show file

File: gafparser.py Project: biolink/ontobio

def to_association(gaf_line: List[str], report=None, group="unknown", dataset="unknown", qualifier_parser=assocparser.Qualifier2_1(), bio_entities=None) -> assocparser.ParseResult:
    """
    Build a GoAssociation from the already-split columns of a single GAF line.

    Every validation failure is recorded on `report` and yields a
    `ParseResult` with an empty association list and its third positional
    flag set to True. A successful parse yields a `ParseResult` holding
    exactly one `association.GoAssociation`.

    Arguments
    ---------
    gaf_line : List[str]
        Column values of one GAF line. A list of 15 or 16 columns is padded
        in place with empty strings up to 17 columns; columns beyond the 17th
        are dropped with a warning.
    report : Report, optional
        Collector for errors and warnings. When omitted, a new one is created
        from `group` and `dataset`.
    group : str
        Only used to construct the default report.
    dataset : str
        Only used to construct the default report.
    qualifier_parser
        Validator applied to the qualifier column (column 4).
    bio_entities : optional
        Lookup keyed by subject CURIE. When it holds an entry for the subject
        that differs from the subject built from this line, that entry is
        used instead.

    Returns
    -------
    assocparser.ParseResult
    """
    report = Report(group=group, dataset=dataset) if report is None else report
    bio_entities = collections.BioEntities(dict()) if bio_entities is None else bio_entities
    source_line = "\t".join(gaf_line)

    if source_line == "":
        report.error(source_line, "Blank Line", "EMPTY", "Blank lines are not allowed", rule=1)
        return assocparser.ParseResult(source_line, [], True, report=report)

    if len(gaf_line) > 17:
        # If we see more than 17 columns, we will just cut off the columns after column 17
        report.warning(source_line, assocparser.Report.WRONG_NUMBER_OF_COLUMNS, "",
            msg="There were more than 17 columns in this line. Proceeding by cutting off extra columns after column 17.",
            rule=1)
        gaf_line = gaf_line[:17]

    # GAF v1 has 15 columns, GAF v2 has 17: pad short-but-plausible lines so
    # the rest of the function can index all 17 columns.
    if 17 > len(gaf_line) >= 15:
        gaf_line += [""] * (17 - len(gaf_line))

    if len(gaf_line) != 17:
        report.error(source_line, assocparser.Report.WRONG_NUMBER_OF_COLUMNS, "",
            msg="There were {columns} columns found in this line, and there should be 15 (for GAF v1) or 17 (for GAF v2)".format(columns=len(gaf_line)), rule=1)
        return assocparser.ParseResult(source_line, [], True, report=report)

    ## check for missing columns
    ## We use indices here because we run GO RULES before we split the vals into individual variables
    DB_INDEX = 0
    DB_OBJECT_INDEX = 1
    TAXON_INDEX = 12
    REFERENCE_INDEX = 5
    if gaf_line[DB_INDEX] == "":
        report.error(source_line, Report.INVALID_IDSPACE, "EMPTY", "col1 is empty", taxon=gaf_line[TAXON_INDEX], rule=1)
        return assocparser.ParseResult(source_line, [], True, report=report)
    if gaf_line[DB_OBJECT_INDEX] == "":
        report.error(source_line, Report.INVALID_ID, "EMPTY", "col2 is empty", taxon=gaf_line[TAXON_INDEX], rule=1)
        return assocparser.ParseResult(source_line, [], True, report=report)
    if gaf_line[REFERENCE_INDEX] == "":
        report.error(source_line, Report.INVALID_ID, "EMPTY", "reference column 6 is empty", taxon=gaf_line[TAXON_INDEX], rule=1)
        return assocparser.ParseResult(source_line, [], True, report=report)

    parsed_taxons_result = gaf_line_validators["taxon"].validate(gaf_line[TAXON_INDEX])  # type: assocparser.ValidateResult
    if not parsed_taxons_result.valid:
        report.error(source_line, Report.INVALID_TAXON, parsed_taxons_result.original, parsed_taxons_result.message, taxon=parsed_taxons_result.original, rule=1)
        return assocparser.ParseResult(source_line, [], True, report=report)

    taxon = parsed_taxons_result.parsed[0]

    date = assocparser.parse_date(gaf_line[13], report, source_line)
    if date is None:
        return assocparser.ParseResult(source_line, [], True, report=report)

    # A second taxon value, when present, is the interacting taxon.
    interacting_taxon = parsed_taxons_result.parsed[1] if len(parsed_taxons_result.parsed) == 2 else None
    subject_curie = association.Curie(gaf_line[0], gaf_line[1])
    subject = association.Subject(subject_curie, gaf_line[2], [gaf_line[9]], gaf_line[10].split("|"), [association.map_gp_type_label_to_curie(gaf_line[11])], taxon)
    # Prefer the entity registered in bio_entities over the one described by the line.
    gpi_entity = bio_entities.get(subject_curie)
    if gpi_entity is not None and subject != gpi_entity:
        subject = gpi_entity

    # column 4 is qualifiers -> index 3
    # For allowed, see http://geneontology.org/docs/go-annotations/#annotation-qualifiers
    # We use the below validate to check validity of qualifiers, not as much to *parse* them into the GoAssociation object.
    # For GoAssociation we will use the qualifiers list produced by _parse_qualifier below. This is fine because that list does not include `NOT`, etc
    # This is confusing, and we can fix later on by consolidating qualifier and relation in GoAssociation.
    parsed_qualifiers = qualifier_parser.validate(gaf_line[3])
    if not parsed_qualifiers.valid:
        report.error(source_line, Report.INVALID_QUALIFIER, parsed_qualifiers.original, parsed_qualifiers.message, taxon=gaf_line[TAXON_INDEX], rule=1)
        return assocparser.ParseResult(source_line, [], True, report=report)

    aspect = gaf_line[8]
    negated, relation_label, qualifiers = assocparser._parse_qualifier(gaf_line[3], aspect)
    # Note: Relation label is grabbed from qualifiers, if any exist in _parse_qualifier
    qualifiers = [association.Curie.from_str(curie_util.contract_uri(relations.lookup_label(q), strict=False)[0]) for q in qualifiers]

    # Curie.from_str signals failure by returning an association.Error, so the
    # check has to happen on the curie itself, before it is wrapped in a Term.
    go_term_curie = association.Curie.from_str(gaf_line[4])
    if isinstance(go_term_curie, association.Error):
        report.error(source_line, Report.INVALID_SYMBOL, gaf_line[4], "Problem parsing GO Term", taxon=gaf_line[TAXON_INDEX], rule=1)
        return assocparser.ParseResult(source_line, [], True, report=report)
    object = association.Term(go_term_curie, taxon)

    # References
    references = [association.Curie.from_str(e) for e in gaf_line[5].split("|") if e]
    for r in references:
        if isinstance(r, association.Error):
            report.error(source_line, Report.INVALID_SYMBOL, gaf_line[5], "Problem parsing references", taxon=gaf_line[TAXON_INDEX], rule=1)
            return assocparser.ParseResult(source_line, [], True, report=report)

    # The trailing None guarantees gorefs[0] exists even without any GO_REF reference.
    gorefs = [ref for ref in references if ref.namespace == "GO_REF"] + [None]
    eco_curie = ecomap.coderef_to_ecoclass(gaf_line[6], reference=gorefs[0])
    if eco_curie is None:
        report.error(source_line, Report.UNKNOWN_EVIDENCE_CLASS, gaf_line[6], msg="Expecting a known ECO GAF code, e.g ISS", rule=1)
        return assocparser.ParseResult(source_line, [], True, report=report)

    withfroms = association.ConjunctiveSet.str_to_conjunctions(gaf_line[7])
    if isinstance(withfroms, association.Error):
        report.error(source_line, Report.INVALID_SYMBOL, gaf_line[7], "Problem parsing with/from", taxon=gaf_line[TAXON_INDEX], rule=1)
        return assocparser.ParseResult(source_line, [], True, report=report)

    evidence_type = association.Curie.from_str(eco_curie)
    if isinstance(evidence_type, association.Error):
        report.error(source_line, Report.INVALID_SYMBOL, gaf_line[6], "Problem parsing evidence type", taxon=gaf_line[TAXON_INDEX], rule=1)
        # Skip the line like every other parse failure instead of building
        # an Evidence object around an Error.
        return assocparser.ParseResult(source_line, [], True, report=report)

    evidence = association.Evidence(evidence_type, references, withfroms)
    first_error = next((e for e in evidence.has_supporting_reference if isinstance(e, association.Error)), None)
    if first_error is not None:
        report.error(source_line, Report.INVALID_SYMBOL, gaf_line[5], first_error.info, taxon=str(taxon), rule=1)
        return assocparser.ParseResult(source_line, [], True, report=report)

    # Column 17 (gene product form) becomes a subject extension.
    subject_extensions = []
    if gaf_line[16]:
        subject_filler = association.Curie.from_str(gaf_line[16])
        if isinstance(subject_filler, association.Error):
            report.error(source_line, assocparser.Report.INVALID_ID, gaf_line[16], subject_filler.info, taxon=str(taxon), rule=1)
            return assocparser.ParseResult(source_line, [], True, report=report)
        # filler is not an Error, so keep moving
        subject_extensions.append(association.ExtensionUnit(association.Curie.from_str("rdfs:subClassOf"), subject_filler))

    # Column 16 (annotation extensions) becomes the object extensions.
    conjunctions = []
    if gaf_line[15]:
        conjunctions = association.ConjunctiveSet.str_to_conjunctions(
            gaf_line[15],
            conjunct_element_builder=lambda el: association.ExtensionUnit.from_str(el))

        if isinstance(conjunctions, association.Error):
            report.error(source_line, Report.EXTENSION_SYNTAX_ERROR, conjunctions.info, "extensions should be relation(curie) and relation should have corresponding URI", taxon=str(taxon), rule=1)
            return assocparser.ParseResult(source_line, [], True, report=report)

    relation_uri = relations.lookup_label(relation_label)
    if relation_uri is None:
        report.error(source_line, assocparser.Report.INVALID_QUALIFIER, relation_label, "Could not find CURIE for relation `{}`".format(relation_label), taxon=str(taxon), rule=1)
        return assocparser.ParseResult(source_line, [], True, report=report)

    # We don't have to check that this is well formed because we're grabbing it from the known relations URI map.
    relation_curie = association.Curie.from_str(curie_util.contract_uri(relation_uri)[0])

    a = association.GoAssociation(
        source_line="\t".join(gaf_line),
        subject=subject,
        relation=relation_curie,
        object=object,
        negated=negated,
        qualifiers=qualifiers,
        aspect=aspect,
        interacting_taxon=interacting_taxon,
        evidence=evidence,
        subject_extensions=subject_extensions,
        object_extensions=conjunctions,
        provided_by=gaf_line[14],
        date=date,
        properties={})

    return assocparser.ParseResult(source_line, [a], False, report=report)

Example #2

Show file

File: gafparser.py Project: diatomsRcool/ontobio

    def parse_line(self, line):
        """
        Parses a single line of a GAF

        Returns an `assocparser.ParseResult`. For a valid annotation line
        it carries a single association dict; for a header line it carries
        a single `{"header": True, "line": ...}` dict; for an invalid line
        the association list is empty and the third positional flag is True.

        Note: most applications will only need to call this directly if they require fine-grained control of parsing. For most purposes,
        :method:`parse_file` can be used over the whole file

        Arguments
        ---------
        line : str
            A single tab-separated line from a GAF file

        """

        # Returns assocparser.ParseResult
        parsed = super().validate_line(line)
        if parsed:
            return parsed

        if self.is_header(line):
            return assocparser.ParseResult(line, [{
                "header": True,
                "line": line.strip()
            }], False)

        vals = [el.strip() for el in line.split("\t")]

        # GAF v1 is defined as 15 cols, GAF v2 as 17.
        # We treat everything as GAF2 by adding two blank columns.
        # TODO: check header metadata to see if columns corresponds to declared dataformat version
        if 17 > len(vals) >= 15:
            vals += [""] * (17 - len(vals))

        if len(vals) > 17:
            # If we see more than 17 columns, we will just cut off the columns after column 17
            self.report.warning(
                line,
                assocparser.Report.WRONG_NUMBER_OF_COLUMNS,
                "",
                msg=
                "There were more than 17 columns in this line. Proceeding by cutting off extra columns after column 17.",
                rule=1)
            vals = vals[:17]

        if len(vals) != 17:
            self.report.error(
                line,
                assocparser.Report.WRONG_NUMBER_OF_COLUMNS,
                "",
                msg=
                "There were {columns} columns found in this line, and there should be 15 (for GAF v1) or 17 (for GAF v2)"
                .format(columns=len(vals)),
                rule=1)
            return assocparser.ParseResult(line, [], True)

        [
            db, db_object_id, db_object_symbol, qualifier, goid, reference,
            evidence, withfrom, aspect, db_object_name, db_object_synonym,
            db_object_type, taxon, date, assigned_by, annotation_xp,
            gene_product_isoform
        ] = vals

        # split_line keeps a reference to `vals`, so later in-place updates
        # of `vals` are visible through it.
        split_line = assocparser.SplitLine(line=line, values=vals, taxon=taxon)

        ## check for missing columns
        if db == "":
            self.report.error(line,
                              Report.INVALID_IDSPACE,
                              "EMPTY",
                              "col1 is empty",
                              taxon=taxon,
                              rule=1)
            return assocparser.ParseResult(line, [], True)
        if db_object_id == "":
            self.report.error(line,
                              Report.INVALID_ID,
                              "EMPTY",
                              "col2 is empty",
                              taxon=taxon,
                              rule=1)
            return assocparser.ParseResult(line, [], True)
        if taxon == "":
            self.report.error(line,
                              Report.INVALID_TAXON,
                              "EMPTY",
                              "taxon column is empty",
                              taxon=taxon,
                              rule=1)
            return assocparser.ParseResult(line, [], True)
        if reference == "":
            self.report.error(line,
                              Report.INVALID_ID,
                              "EMPTY",
                              "reference column 6 is empty",
                              taxon=taxon,
                              rule=1)
            return assocparser.ParseResult(line, [], True)

        # An unknown assigned_by only produces a warning; the line is kept.
        if self.config.group_idspace is not None and assigned_by not in self.config.group_idspace:
            self.report.warning(
                line,
                Report.INVALID_ID,
                assigned_by,
                "GORULE:0000027: assigned_by is not present in groups reference",
                taxon=taxon,
                rule=27)

        if self.config.entity_idspaces is not None and db not in self.config.entity_idspaces:
            # Are we a synonym?
            upgrade = self.config.entity_idspaces.reverse(db)
            if upgrade is not None:
                # If we found a synonym
                self.report.warning(
                    line,
                    Report.INVALID_ID_DBXREF,
                    db,
                    "GORULE:0000027: {} is a synonym for the correct ID {}, and has been updated"
                    .format(db, upgrade),
                    taxon=taxon,
                    rule=27)
                db = upgrade

        ## --
        ## db + db_object_id. CARD=1
        ## --
        id = self._pair_to_id(db, db_object_id)
        if not self._validate_id(
                id, split_line, allowed_ids=self.config.entity_idspaces):

            return assocparser.ParseResult(line, [], True)

        # Using a given gpi file to validate the gene object
        if self.gpi is not None:
            entity = self.gpi.get(id, None)
            if entity is not None:
                db_object_symbol = entity["symbol"]
                db_object_name = entity["name"]
                db_object_synonym = entity["synonyms"]
                db_object_type = entity["type"]

        if not self._validate_id(goid, split_line, context=ANNOTATION):
            print("skipping because {} not validated!".format(goid))
            return assocparser.ParseResult(line, [], True)

        valid_goid = self._validate_ontology_class_id(goid, split_line)
        if valid_goid is None:
            return assocparser.ParseResult(line, [], True)
        goid = valid_goid

        date = self._normalize_gaf_date(date, split_line)
        if date is None:
            return assocparser.ParseResult(line, [], True)

        # Write the normalized date back so the regenerated line carries it.
        vals[13] = date

        ecomap = self.config.ecomap
        if ecomap is not None:
            if ecomap.coderef_to_ecoclass(evidence, reference) is None:
                self.report.error(
                    line,
                    assocparser.Report.UNKNOWN_EVIDENCE_CLASS,
                    evidence,
                    msg="Expecting a known ECO GAF code, e.g ISS",
                    rule=1)
                return assocparser.ParseResult(line, [], True)

        # Throw out the line if it uses GO_REF:0000033, see https://github.com/geneontology/go-site/issues/563#event-1519351033
        if "GO_REF:0000033" in reference.split("|"):
            self.report.error(
                line,
                assocparser.Report.INVALID_ID,
                reference,
                msg=
                "Disallowing GO_REF:0000033 in reference field as of 03/13/2018",
                rule=30)
            return assocparser.ParseResult(line, [], True)

        references = self.validate_pipe_separated_ids(reference, split_line)
        if references is None:
            # Reporting occurs in above function call
            return assocparser.ParseResult(line, [], True)

        references = self.normalize_refs(references, split_line)

        # With/From
        withfroms = self.validate_pipe_separated_ids(withfrom,
                                                     split_line,
                                                     empty_allowed=True,
                                                     extra_delims=",")
        if withfroms is None:
            # Reporting occurs in above function call
            return assocparser.ParseResult(line, [], True)

        # validation
        self._validate_symbol(db_object_symbol, split_line)

        # Example use case: mapping from UniProtKB to MOD ID
        if self.config.entity_map is not None:
            id = self.map_id(id, self.config.entity_map)
            # Split on the first colon only: the local part of an ID may
            # itself contain colons, and it must stay a string because
            # `vals` is re-joined into a line further down.
            db, _, db_object_id = id.partition(":")
            vals[0] = db
            vals[1] = db_object_id

        if goid.startswith("GO:") and aspect.upper() not in ["C", "F", "P"]:
            self.report.error(line,
                              assocparser.Report.INVALID_ASPECT,
                              aspect,
                              rule=28)
            return assocparser.ParseResult(line, [], True)

        go_rule_results = qc.test_go_rules(vals, self.config)
        for rule_id, result in go_rule_results.items():
            if result.result_type == qc.ResultType.WARNING:
                self.report.warning(line,
                                    assocparser.Report.VIOLATES_GO_RULE,
                                    goid,
                                    msg="{id}: {message}".format(
                                        id=rule_id, message=result.message),
                                    rule=int(rule_id.split(":")[1]))

            if result.result_type == qc.ResultType.ERROR:
                self.report.error(line,
                                  assocparser.Report.VIOLATES_GO_RULE,
                                  goid,
                                  msg="{id}: {message}".format(
                                      id=rule_id, message=result.message),
                                  rule=int(rule_id.split(":")[1]))
                # Skip the annotation
                return assocparser.ParseResult(line, [], True)

        ## --
        ## end of line re-processing
        ## --
        # regenerate line post-mapping
        line = "\t".join(vals)

        ## --
        ## taxon CARD={1,2}
        ## --
        ## if a second value is specified, this is the interacting taxon
        taxons = taxon.split("|")
        normalized_taxon = self._taxon_id(taxons[0], split_line)
        if normalized_taxon is None:
            self.report.error(line,
                              assocparser.Report.INVALID_TAXON,
                              taxon,
                              msg="Taxon ID is invalid")
            return assocparser.ParseResult(line, [], True)

        self._validate_taxon(normalized_taxon, split_line)

        interacting_taxon = None
        if len(taxons) == 2:
            interacting_taxon = self._taxon_id(taxons[1], split_line)
            if interacting_taxon is None:
                self.report.error(line,
                                  assocparser.Report.INVALID_TAXON,
                                  taxon,
                                  msg="Taxon ID is invalid")
                return assocparser.ParseResult(line, [], True)

        ## --
        ## db_object_synonym CARD=0..*
        ## --
        synonyms = db_object_synonym.split("|")
        if db_object_synonym == "":
            synonyms = []

        ## --
        ## parse annotation extension
        ## See appendix in http://doi.org/10.1186/1471-2105-15-155
        ## --
        object_or_exprs = self._parse_full_extension_expression(
            annotation_xp, line=split_line)

        ## --
        ## qualifier
        ## --
        negated, relation, other_qualifiers = self._parse_qualifier(
            qualifier, aspect)

        ## --
        ## goid
        ## --
        # TODO We shouldn't overload builtin keywords/functions
        object = {'id': goid, 'taxon': normalized_taxon}

        # construct subject dict
        subject = {
            'id': id,
            'label': db_object_symbol,
            'type': db_object_type,
            'fullname': db_object_name,
            'synonyms': synonyms,
            'taxon': {
                'id': normalized_taxon
            }
        }

        ## --
        ## gene_product_isoform
        ## --
        ## This is mapped to a more generic concept of subject_extensions
        subject_extns = []
        if gene_product_isoform is not None and gene_product_isoform != '':
            subject_extns.append({
                'property': 'isoform',
                'filler': gene_product_isoform
            })

        object_extensions = {}
        if object_or_exprs is not None and len(object_or_exprs) > 0:
            object_extensions['union_of'] = object_or_exprs

        ## --
        ## evidence
        ## reference
        ## withfrom
        ## --
        evidence_obj = {
            'type': evidence,
            'has_supporting_reference': references,
            'with_support_from': withfroms
        }

        ## Construct main return dict
        assoc = {
            'source_line': line,
            'subject': subject,
            'object': object,
            'negated': negated,
            'qualifiers': other_qualifiers,
            'aspect': aspect,
            'relation': {
                'id': relation
            },
            'interacting_taxon': interacting_taxon,
            'evidence': evidence_obj,
            'provided_by': assigned_by,
            'date': date,
            'subject_extensions': subject_extns,
            'object_extensions': object_extensions
        }

        return assocparser.ParseResult(line, [assoc], False, evidence.upper())

Example #3

Show file

    def parse_line(self, line):
        """
        Parses a single line of a GAF

        Returns an `assocparser.ParseResult`. Typically it carries a
        single association, but in some cases there may be none (header
        or invalid line) or multiple (disjunctive clause in annotation
        extensions).

        Note: most applications will only need to call this directly if they require fine-grained control of parsing. For most purposes,
        :method:`parse_file` can be used over the whole file

        Arguments
        ---------
        line : str
            A single tab-separated line from a GAF file

        """

        # Returns assocparser.ParseResult
        parsed = super().validate_line(line)
        if parsed:
            return parsed

        if self.is_header(line):
            return assocparser.ParseResult(line, [], False)

        vals = [el.strip() for el in line.split("\t")]

        # GAF v1 is defined as 15 cols, GAF v2 as 17.
        # We treat everything as GAF2 by adding two blank columns.
        # TODO: check header metadata to see if columns corresponds to declared dataformat version
        if 17 > len(vals) >= 15:
            vals += [""] * (17 - len(vals))

        if len(vals) != 17:
            self.report.error(
                line,
                assocparser.Report.WRONG_NUMBER_OF_COLUMNS,
                "",
                msg=
                "There were {columns} columns found in this line, and there should be 15 (for GAF v1) or 17 (for GAF v2)"
                .format(columns=len(vals)))
            return assocparser.ParseResult(line, [], True)

        [
            db, db_object_id, db_object_symbol, qualifier, goid, reference,
            evidence, withfrom, aspect, db_object_name, db_object_synonym,
            db_object_type, taxon, date, assigned_by, annotation_xp,
            gene_product_isoform
        ] = vals

        ## --
        ## db + db_object_id. CARD=1
        ## --
        id = self._pair_to_id(db, db_object_id)
        if not self._validate_id(id, line, ENTITY):
            print("skipping cause {} not validated!".format(id))
            return assocparser.ParseResult(line, [], True)

        if not self._validate_id(goid, line, ANNOTATION):
            print("skipping cause {} not validated!".format(goid))
            return assocparser.ParseResult(line, [], True)

        date = self._normalize_gaf_date(date, line)

        ecomap = self.config.ecomap
        if ecomap is not None:
            if ecomap.coderef_to_ecoclass(evidence, reference) is None:
                self.report.error(
                    line,
                    assocparser.Report.UNKNOWN_EVIDENCE_CLASS,
                    evidence,
                    msg="Expecting a known ECO GAF code, e.g ISS")
                return assocparser.ParseResult(line, [], True)

        # validation
        self._validate_symbol(db_object_symbol, line)

        # Example use case: mapping from UniProtKB to MOD ID
        if self.config.entity_map is not None:
            id = self.map_id(id, self.config.entity_map)
            # Split on the first colon only: the local part of an ID may
            # itself contain colons, and it must stay a string because
            # `vals` is re-joined into a line just below.
            db, _, db_object_id = id.partition(":")
            vals[0] = db
            vals[1] = db_object_id

        ## --
        ## end of line re-processing
        ## --
        # regenerate line post-mapping
        line = "\t".join(vals)

        ## --
        ## taxon CARD={1,2}
        ## --
        ## if a second value is specified, this is the interacting taxon;
        ## only the first value is used below
        taxa = [self._taxon_id(x) for x in taxon.split("|")]
        taxon = taxa[0]
        self._validate_taxon(taxon, line)

        ## --
        ## db_object_synonym CARD=0..*
        ## --
        synonyms = db_object_synonym.split("|")
        if db_object_synonym == "":
            synonyms = []

        ## --
        ## process associations
        ## --
        ## note that any disjunctions in the annotation extension
        ## will result in the generation of multiple associations
        assocs = []
        xp_ors = annotation_xp.split("|")
        for xp_or in xp_ors:

            # gather conjunctive expressions in extensions field
            xp_ands = xp_or.split(",")
            extns = []
            for xp_and in xp_ands:
                if xp_and != "":
                    expr = self._parse_relationship_expression(xp_and,
                                                               line=line)
                    if expr is not None:
                        extns.append(expr)

            ## --
            ## qualifier
            ## --
            negated, relation, other_qualifiers = self._parse_qualifier(
                qualifier, aspect)

            ## --
            ## goid
            ## --
            # TODO We shouldn't overload builtin keywords/functions
            # Built inside the loop so each association gets its own dicts.
            object = {'id': goid, 'taxon': taxon}

            # construct subject dict
            subject = {
                'id': id,
                'label': db_object_symbol,
                'type': db_object_type,
                'fullname': db_object_name,
                'synonyms': synonyms,
                'taxon': {
                    'id': taxon
                }
            }

            ## --
            ## gene_product_isoform
            ## --
            ## This is mapped to a more generic concept of subject_extensions
            subject_extns = []
            if gene_product_isoform is not None and gene_product_isoform != '':
                subject_extns.append({
                    'property': 'isoform',
                    'filler': gene_product_isoform
                })

            ## --
            ## evidence
            ## reference
            ## withfrom
            ## --
            evidence_obj = {
                'type': evidence,
                'has_supporting_reference': self._split_pipe(reference)
            }
            evidence_obj['with_support_from'] = self._split_pipe(withfrom)

            ## Construct main return dict
            assoc = {
                'source_line': line,
                'subject': subject,
                'object': object,
                'negated': negated,
                'qualifiers': other_qualifiers,
                'aspect': aspect,
                'relation': {
                    'id': relation
                },
                'evidence': evidence_obj,
                'provided_by': assigned_by,
                'date': date,
            }
            # Extension keys are only present when there is something to report.
            if len(subject_extns) > 0:
                assoc['subject_extensions'] = subject_extns
            if len(extns) > 0:
                assoc['object_extensions'] = extns

            self._validate_assoc(assoc, line)

            assocs.append(assoc)

        return assocparser.ParseResult(line, assocs, False, evidence.upper())

Example #4

Show file

File: gafparser.py Project: biolink/ontobio

    def parse_line(self, line):
        """
        Parses a single line of a GAF

        Return a tuple `(processed_line, associations)`. Typically
        there will be a single association, but in some cases there
        may be none (invalid line) or multiple (disjunctive clause in
        annotation extensions)

        Note: most applications will only need to call this directly if they require fine-grained control of parsing. For most purposes,
        :method:`parse_file` can be used over the whole file

        Arguments
        ---------
        line : str
            A single tab-seperated line from a GAF file

        """

        # Returns assocparser.ParseResult
        parsed = super().validate_line(line)
        if parsed:
            return parsed

        if self.is_header(line):
            # Save off version info here
            if self.version is None:
                # We are still looking
                parsed = parser_version_regex.findall(line)
                if len(parsed) == 1:
                    filetype, version, _ = parsed[0]
                    if version == "2.2":
                        logger.info("Detected GAF version 2.2")
                        self.version = version
                    else:
                        logger.info("Detected GAF version {}, so using 2.1".format(version))
                        self.version = self.default_version
                        # Compute the cell component subclass closure
                        self.make_internal_cell_component_closure()

            return assocparser.ParseResult(line, [{ "header": True, "line": line.strip() }], False)

        # At this point, we should have gone through all the header, and a version number should be established
        if self.version is None:
            logger.warning("No version number found for this file so we will assume GAF version: {}".format(self.default_version))
            self.version = self.default_version
            self.make_internal_cell_component_closure()

        vals = [el.strip() for el in line.split("\t")]

        # GAF v1 is defined as 15 cols, GAF v2 as 17.
        # We treat everything as GAF2 by adding two blank columns.
        # TODO: check header metadata to see if columns corresponds to declared dataformat version

        parsed = to_association(list(vals), report=self.report, qualifier_parser=self.qualifier_parser(), bio_entities=self.bio_entities)
        if parsed.associations == []:
            return parsed

        assoc = parsed.associations[0]

        # Qualifier is index 3
        # If we are 2.1, and qualifier has no relation
        # Also must have an ontology
        # Then upgrade
        # For https://github.com/geneontology/go-site/issues/1558
        if self.gaf_version() == "2.1" and (vals[3] == "" or vals[3] == "NOT") and self.config.ontology:
            assoc = self.upgrade_empty_qualifier(assoc)

        ## Run GO Rules, save split values into individual variables
        # print("Config is {}".format(self.config.__dict__.keys()))
        go_rule_results = qc.test_go_rules(assoc, self.config, group=self.group)
        for rule, result in go_rule_results.all_results.items():
            if result.result_type == qc.ResultType.WARNING:
                self.report.warning(line, assocparser.Report.VIOLATES_GO_RULE, "",
                                    msg="{id}: {message}".format(id=rule.id, message=result.message), rule=int(rule.id.split(":")[1]))

            if result.result_type == qc.ResultType.ERROR:
                self.report.error(line, assocparser.Report.VIOLATES_GO_RULE, "",
                                    msg="{id}: {message}".format(id=rule.id, message=result.message), rule=int(rule.id.split(":")[1]))
                # Skip the annotation
                return assocparser.ParseResult(line, [], True)

            if result.result_type == qc.ResultType.PASS:
                self.report.message(assocparser.Report.INFO, line, Report.RULE_PASS, "",
                                    msg="Passing Rule", rule=int(rule.id.split(":")[1]))

        assoc = go_rule_results.annotation  # type: association.GoAssociation
        split_line = assocparser.SplitLine(line=line, values=vals, taxon=str(assoc.object.taxon))

        if self.config.group_idspace is not None and assoc.provided_by not in self.config.group_idspace:
            self.report.warning(line, Report.INVALID_ID, assoc.provided_by,
                "GORULE:0000027: assigned_by is not present in groups reference", taxon=str(assoc.object.taxon), rule=27)

        db = assoc.subject.id.namespace
        if self.config.entity_idspaces is not None and db not in self.config.entity_idspaces:
            # Are we a synonym?
            upgrade = self.config.entity_idspaces.reverse(db)
            if upgrade is not None:
                # If we found a synonym
                self.report.warning(line, Report.INVALID_ID_DBXREF, db, "GORULE:0000027: {} is a synonym for the correct ID {}, and has been updated".format(db, upgrade), taxon=str(assoc.object.taxon), rule=27)
                assoc.subject.id.namespace = upgrade

        ## --
        ## db + db_object_id. CARD=1
        ## --assigned_by
        if not self._validate_id(str(assoc.subject.id), split_line, allowed_ids=self.config.entity_idspaces):
            return assocparser.ParseResult(line, [], True)

        # Using a given gpi file to validate the gene object
        # if self.gpi is not None:
        #     entity = self.gpi.get(str(assoc.subject.id), None)
        #     if entity is not None:
        #         assoc.subject.label = entity["symbol"]
        #         assoc.subject.fullname = entity["name"]
        #         assoc.subject.synonyms = entity["synonyms"].split("|")
        #         assoc.subject.type = entity["type"]

        if not self._validate_id(str(assoc.object.id), split_line, context=ANNOTATION):
            print("skipping because {} not validated!".format(assoc.object.id))
            return assocparser.ParseResult(line, [], True)

        valid_goid = self._validate_ontology_class_id(str(assoc.object.id), split_line)
        if valid_goid is None:
            return assocparser.ParseResult(line, [], True)
        assoc.object.id = association.Curie.from_str(valid_goid)

        references = self.validate_curie_ids(assoc.evidence.has_supporting_reference, split_line)
        if references is None:
            # Reporting occurs in above function call
            return assocparser.ParseResult(line, [], True)

        # With/From
        for wf in assoc.evidence.with_support_from:
            validated = self.validate_curie_ids(wf.elements, split_line)
            if validated is None:
                return assocparser.ParseResult(line, [], True)
        with_support_from = self._unroll_withfrom_and_replair_obsoletes(split_line, 'gaf')
        if with_support_from is None:
            return assocparser.ParseResult(line, [], True)
        assoc.evidence.with_support_from = with_support_from
        # validation
        self._validate_symbol(assoc.subject.label, split_line)


        ## --
        ## taxon CARD={1,2}
        ## --
        ## if a second value is specified, this is the interacting taxon
        ## We do not use the second value
        valid_taxon = self._validate_taxon(str(assoc.object.taxon), split_line)
        valid_interacting = self._validate_taxon(str(assoc.interacting_taxon), split_line) if assoc.interacting_taxon else True
        if not valid_taxon:
            self.report.error(line, assocparser.Report.INVALID_TAXON, str(assoc.object.taxon), "Taxon ID is invalid", rule=27)
        if not valid_interacting:
            self.report.error(line, assocparser.Report.INVALID_TAXON, str(assoc.interacting_taxon), "Taxon ID is invalid", rule=27)
        if (not valid_taxon) or (not valid_interacting):
            return assocparser.ParseResult(line, [], True)

        return assocparser.ParseResult(line, [assoc], False, vals[6])

Example #5

Show file

File: gpadparser.py Project: kshefchek/ontobio

def from_2_0(gpad_line: List[str],
             report=None,
             group="unknown",
             dataset="unknown",
             bio_entities=None):
    """Build a ``GoAssociation`` from the column values of one GPAD 2.0 line.

    Arguments
    ---------
    gpad_line : List[str]
        Column values of a single line, already split on tabs. More than 12
        columns are truncated to 12 (with a warning); 10 or 11 columns are
        padded in place with empty strings up to 12.
    report : Report, optional
        Collector for warnings and errors. When omitted, a fresh
        ``Report(group=group, dataset=dataset)`` is created.
    group : str
        Only used to build the fallback report.
    dataset : str
        Only used to build the fallback report.
    bio_entities : optional
        Object queried with ``.get(subject_curie)``. A hit replaces the
        placeholder subject and supplies the taxon. The lookup is skipped
        when this is omitted.

    Returns
    -------
    assocparser.ParseResult
        On success, built from the source line, a one-element list holding the
        association, and ``False``. After any reported error, built from the
        source line, an empty list, and ``True``.
    """
    # Same fallback as `to_association`: the default of None must not be dereferenced.
    report = Report(group=group, dataset=dataset) if report is None else report
    source_line = "\t".join(gpad_line)

    if source_line == "":
        report.error(source_line,
                     "Blank Line",
                     "EMPTY",
                     "Blank lines are not allowed",
                     rule=1)
        return assocparser.ParseResult(source_line, [], True, report=report)

    if len(gpad_line) > 12:
        report.warning(
            source_line,
            assocparser.Report.WRONG_NUMBER_OF_COLUMNS,
            "",
            msg=
            "There were more than 12 columns in this line. Proceeding by cutting off extra columns.",
            rule=1)

        gpad_line = gpad_line[:12]

    if 12 > len(gpad_line) >= 10:
        gpad_line += [""] * (12 - len(gpad_line))

    if len(gpad_line) != 12:
        report.error(
            source_line,
            assocparser.Report.WRONG_NUMBER_OF_COLUMNS,
            "",
            msg=
            "There were {columns} columns found in this line, and there should be between 10 and 12"
            .format(columns=len(gpad_line)),
            rule=1)
        return assocparser.ParseResult(source_line, [], True, report=report)

    ## Zero-based column positions within the (now 12 element) line
    SUBJECT_CURIE = 0
    NEGATION_INDEX = 1
    RELATION = 2
    ONTOLOGY_CLASS_INDEX = 3
    REFERENCE_INDEX = 4
    EVIDENCE_INDEX = 5
    WITH_FROM_INDEX = 6
    INTERACTING_TAXON_INDEX = 7
    DATE_INDEX = 8
    ASSIGNED_BY_INDEX = 9
    EXTENSION_INDEX = 10
    PROPERTIES_INDEX = 11

    ## check for missing columns
    required = [
        SUBJECT_CURIE, RELATION, ONTOLOGY_CLASS_INDEX, REFERENCE_INDEX,
        EVIDENCE_INDEX, DATE_INDEX, ASSIGNED_BY_INDEX
    ]
    for req in required:
        if gpad_line[req] == "":
            # The message uses the 1-based column number
            report.error(source_line,
                         Report.INVALID_ID,
                         "EMPTY",
                         "Column {} is empty".format(req + 1),
                         rule=1)
            return assocparser.ParseResult(source_line, [],
                                           True,
                                           report=report)

    # Placeholder taxon, used unless a known entity supplies a real one below
    taxon = association.Curie("NCBITaxon", "0")
    subject_curie = association.Curie.from_str(gpad_line[SUBJECT_CURIE])
    if subject_curie.is_error():
        report.error(source_line,
                     Report.INVALID_SYMBOL,
                     gpad_line[SUBJECT_CURIE],
                     "Problem parsing DB Object",
                     taxon=str(taxon),
                     rule=1)
        return assocparser.ParseResult(source_line, [], True, report=report)

    subject = association.Subject(subject_curie, "", "", [], "", taxon)
    # `bio_entities` defaults to None; only consult it when one was supplied
    entity = bio_entities.get(subject_curie) if bio_entities is not None else None
    if entity is not None:
        # If we found a subject entity, then set `subject` to the found entity
        subject = entity
        taxon = subject.taxon

    negated = gpad_line[NEGATION_INDEX] == "NOT"

    relation = association.Curie.from_str(gpad_line[RELATION])
    if relation.is_error():
        report.error(source_line,
                     Report.INVALID_SYMBOL,
                     gpad_line[RELATION],
                     "Problem parsing Relation",
                     taxon=str(taxon),
                     rule=1)
        return assocparser.ParseResult(source_line, [], True, report=report)

    go_term = association.Curie.from_str(gpad_line[ONTOLOGY_CLASS_INDEX])
    if go_term.is_error():
        report.error(source_line,
                     Report.INVALID_SYMBOL,
                     gpad_line[ONTOLOGY_CLASS_INDEX],
                     "Problem parsing GO Term",
                     taxon=str(taxon),
                     rule=1)
        return assocparser.ParseResult(source_line, [], True, report=report)

    # Named `term` rather than `object` so the builtin is not shadowed
    term = association.Term(go_term, taxon)

    evidence_type = association.Curie.from_str(gpad_line[EVIDENCE_INDEX])
    if evidence_type.is_error():
        report.error(source_line,
                     Report.INVALID_SYMBOL,
                     gpad_line[EVIDENCE_INDEX],
                     "Problem parsing Evidence ECO Curie",
                     taxon=str(taxon),
                     rule=1)
        return assocparser.ParseResult(source_line, [], True, report=report)

    # Pipe-separated references; empty fragments are ignored
    references = [
        association.Curie.from_str(e)
        for e in gpad_line[REFERENCE_INDEX].split("|") if e
    ]
    for r in references:
        if r.is_error():
            report.error(source_line,
                         Report.INVALID_SYMBOL,
                         gpad_line[REFERENCE_INDEX],
                         "Problem parsing references",
                         taxon=str(taxon),
                         rule=1)
            return assocparser.ParseResult(source_line, [],
                                           True,
                                           report=report)

    withfroms = association.ConjunctiveSet.str_to_conjunctions(
        gpad_line[WITH_FROM_INDEX])  # Returns a list of ConjunctiveSets or Error
    if isinstance(withfroms, association.Error):
        report.error(source_line,
                     Report.INVALID_SYMBOL,
                     gpad_line[WITH_FROM_INDEX],
                     "Problem parsing With/From column",
                     taxon=str(taxon),
                     rule=1)
        return assocparser.ParseResult(source_line, [], True, report=report)

    evidence = association.Evidence(evidence_type, references, withfroms)

    interacting_taxon = None
    if gpad_line[INTERACTING_TAXON_INDEX] != "":
        interacting_taxon = association.Curie.from_str(
            gpad_line[INTERACTING_TAXON_INDEX])
        if interacting_taxon.is_error():
            report.error(source_line,
                         Report.INVALID_SYMBOL,
                         gpad_line[INTERACTING_TAXON_INDEX],
                         "Problem parsing Interacting Taxon",
                         taxon=str(taxon),
                         rule=1)
            return assocparser.ParseResult(source_line, [],
                                           True,
                                           report=report)

    # parse_iso_date is handed the report and source line; None means reject the line
    date = assocparser.parse_iso_date(gpad_line[DATE_INDEX], report,
                                      source_line)
    if date is None:
        return assocparser.ParseResult(source_line, [], True, report=report)

    conjunctions = []
    # The elements of the extension units are Curie(Curie)
    if gpad_line[EXTENSION_INDEX]:
        conjunctions = association.ConjunctiveSet.str_to_conjunctions(
            gpad_line[EXTENSION_INDEX],
            conjunct_element_builder=lambda el: association.ExtensionUnit.
            from_curie_str(el))

        if isinstance(conjunctions, association.Error):
            report.error(source_line,
                         Report.EXTENSION_SYNTAX_ERROR,
                         conjunctions.info,
                         "extensions should be relation(curie)",
                         taxon=str(taxon),
                         rule=1)
            return assocparser.ParseResult(source_line, [],
                                           True,
                                           report=report)

    properties_list = association.parse_annotation_properties(
        gpad_line[PROPERTIES_INDEX])

    a = association.GoAssociation(source_line=source_line,
                                  subject=subject,
                                  relation=relation,
                                  object=term,
                                  negated=negated,
                                  qualifiers=[relation],
                                  aspect=None,
                                  interacting_taxon=interacting_taxon,
                                  evidence=evidence,
                                  subject_extensions=[],
                                  object_extensions=conjunctions,
                                  provided_by=gpad_line[ASSIGNED_BY_INDEX],
                                  date=date,
                                  properties=properties_list)

    return assocparser.ParseResult(source_line, [a], False, report=report)

Example #6

Show file

    def parse_line(self, line):
        """Parses a single line of a GPAD.

        Returns an ``assocparser.ParseResult``. If the inherited
        ``validate_line`` yields a truthy result, that result is returned
        unchanged. A header line gives a result with no associations. A line
        that fails any check gives a result with an empty association list and
        ``True`` as its third argument. Otherwise the result carries one
        association, represented as a plain dict.

        Note: most applications will only need to call this directly if they require fine-grained control of parsing. For most purposes,
        :method:`parse_file` can be used over the whole file

        Arguments
        ---------
        line : str
            A single tab-separated line from a GPAD file

        """
        parsed = super().validate_line(line)
        if parsed:
            return parsed

        if self.is_header(line):
            return assocparser.ParseResult(line, [], False)

        vals = [el.strip() for el in line.split("\t")]
        if len(vals) < 10 or len(vals) > 12:
            self.report.error(
                line,
                assocparser.Report.WRONG_NUMBER_OF_COLUMNS,
                "",
                msg=
                "There were {columns} columns found in this line, and there should be between 10 and 12"
                .format(columns=len(vals)))
            return assocparser.ParseResult(line, [], True)

        # Columns 11 and 12 are optional; pad so the unpacking below always sees 12 values
        if len(vals) < 12:
            vals += [""] * (12 - len(vals))

        [
            db, db_object_id, qualifier, goid, reference, evidence, withfrom,
            interacting_taxon_id, date, assigned_by, annotation_xp,
            annotation_properties
        ] = vals

        split_line = assocparser.SplitLine(line=line, values=vals, taxon="")

        # Named `subject_id` rather than `id` so the builtin is not shadowed
        subject_id = self._pair_to_id(db, db_object_id)
        if not self._validate_id(subject_id, split_line, context=ENTITY):
            return assocparser.ParseResult(line, [], True)

        if not self._validate_id(goid, split_line, context=ANNOTATION):
            return assocparser.ParseResult(line, [], True)

        valid_goid = self._validate_ontology_class_id(goid, split_line)
        if valid_goid is None:
            return assocparser.ParseResult(line, [], True)
        goid = valid_goid

        date = self._normalize_gaf_date(date, split_line)

        if reference == "":
            # Reference is the 5th value unpacked from the line above
            self.report.error(line, Report.INVALID_ID, "EMPTY",
                              "reference column 5 is empty")
            return assocparser.ParseResult(line, [], True)

        # The outcome of the evidence check is not used to reject the line
        self._validate_id(evidence, split_line)

        interacting_taxon = None
        if interacting_taxon_id != "":
            interacting_taxon = self._taxon_id(interacting_taxon_id,
                                               split_line)
            if interacting_taxon is None:
                self.report.error(line,
                                  assocparser.Report.INVALID_TAXON,
                                  interacting_taxon_id,
                                  msg="Taxon ID is invalid")
                return assocparser.ParseResult(line, [], True)

        #TODO: ecomap is currently one-way only
        #ecomap = self.config.ecomap
        #if ecomap != None:
        #    if ecomap.ecoclass_to_coderef(evidence) == (None,None):
        #        self.report.error(line, Report.UNKNOWN_EVIDENCE_CLASS, evidence,
        #                          msg="Expecting a known ECO class ID")

        ## --
        ## qualifier
        ## --
        negated, relation, other_qualifiers = self._parse_qualifier(
            qualifier, None)

        # Reference Column
        references = self.validate_pipe_separated_ids(reference, split_line)
        if references is None:
            # Reporting occurs in above function call
            return assocparser.ParseResult(line, [], True)

        # With/From
        withfroms = self.validate_pipe_separated_ids(withfrom,
                                                     split_line,
                                                     empty_allowed=True,
                                                     extra_delims=",")
        if withfroms is None:
            # Reporting occurs in above function call
            return assocparser.ParseResult(line, [], True)

        ## --
        ## parse annotation extension
        ## See appendix in http://doi.org/10.1186/1471-2105-15-155
        ## --
        object_or_exprs = self._parse_full_extension_expression(
            annotation_xp, line=split_line)

        assoc = {
            'source_line': line,
            'subject': {
                'id': subject_id
            },
            'object': {
                'id': goid
            },
            'negated': negated,
            'relation': {
                'id': relation
            },
            'interacting_taxon': interacting_taxon,
            'evidence': {
                'type': evidence,
                'with_support_from': withfroms,
                'has_supporting_reference': references
            },
            'provided_by': assigned_by,
            'date': date,
        }
        # Optional keys are only added when there is something to put in them
        if len(other_qualifiers) > 0:
            assoc['qualifiers'] = other_qualifiers
        if object_or_exprs is not None and len(object_or_exprs) > 0:
            assoc['object']['extensions'] = {'union_of': object_or_exprs}

        return assocparser.ParseResult(line, [assoc], False)

Example #7

Show file

File: gpadparser.py Project: kshefchek/ontobio

    def parse_line(self, line):
        """Parse one line of a GPAD file into an ``assocparser.ParseResult``.

        The line goes through these stages, and the first one that fails ends
        processing:

        1. The inherited ``validate_line``; a truthy result is returned as is.
        2. Header handling: while no version is set, the header is searched for
           a version declaration. Header lines come back as a result whose
           single entry is a ``{"header": ..., "line": ...}`` dict.
        3. ``to_association`` turns the tab-split values into an association.
        4. GO rules are run and their outcomes are written to the report; an
           error outcome rejects the line.
        5. Subject ID, ontology class ID, evidence type, interacting taxon,
           references and with/from IDs are validated.

        A rejected line is returned as a result with an empty association list
        and ``True`` as the third argument.

        Note: most applications will only need to call this directly if they require fine-grained control of parsing. For most purposes,
        :method:`parse_file` can be used over the whole file

        Arguments
        ---------
        line : str
            A single tab-separated line from a GPAD file

        """

        def rejected():
            # Result used for every line that fails a check below
            return assocparser.ParseResult(line, [], True)

        early_result = super().validate_line(line)
        if early_result:
            return early_result

        if self.is_header(line):
            if self.version is None:
                # No version settled yet: look for a declaration in this header line
                matches = parser_version_regex.findall(line)
                if len(matches) == 1:
                    _filetype, declared, _rest = matches[0]
                    if declared == "2.0":
                        logger.info("Detected GPAD version 2.0")
                        self.version = declared
                    else:
                        logger.info(
                            "Detected GPAD version {}, so defaulting to 1.2".
                            format(declared))
                        self.version = self.default_version

            header_entry = {"header": True, "line": line.strip()}
            return assocparser.ParseResult(line, [header_entry], False)

        # First non-header line: if no header declared a version, use the default
        if self.version is None:
            logger.warning(
                "No version number found for this file so we will assume GPAD version: {}"
                .format(self.default_version))
            self.version = self.default_version

        columns = [cell.strip() for cell in line.split("\t")]

        # to_association gets its own copy of the column list
        converted = to_association(list(columns),
                                   report=self.report,
                                   version=self.gpad_version(),
                                   bio_entities=self.bio_entities)
        if converted.associations == []:
            return converted

        rule_outcomes = qc.test_go_rules(converted.associations[0],
                                         self.config)
        for rule, outcome in rule_outcomes.all_results.items():
            if outcome.result_type == qc.ResultType.WARNING:
                self.report.warning(line,
                                    assocparser.Report.VIOLATES_GO_RULE,
                                    "",
                                    msg="{id}: {message}".format(
                                        id=rule.id, message=outcome.message),
                                    rule=int(rule.id.split(":")[1]))
            elif outcome.result_type == qc.ResultType.ERROR:
                self.report.error(line,
                                  assocparser.Report.VIOLATES_GO_RULE,
                                  "",
                                  msg="{id}: {message}".format(
                                      id=rule.id, message=outcome.message),
                                  rule=int(rule.id.split(":")[1]))
                # A rule error means the annotation is dropped
                return rejected()
            elif outcome.result_type == qc.ResultType.PASS:
                self.report.message(assocparser.Report.INFO,
                                    line,
                                    Report.RULE_PASS,
                                    "",
                                    msg="Passing Rule",
                                    rule=int(rule.id.split(":")[1]))

        # Continue with the annotation handed back by the rule run
        assoc = rule_outcomes.annotation  # type: association.GoAssociation

        split_line = assocparser.SplitLine(line=line,
                                           values=columns,
                                           taxon="")

        subject_ok = self._validate_id(str(assoc.subject.id),
                                       split_line,
                                       context=ENTITY)
        if not subject_ok:
            return rejected()

        object_ok = self._validate_id(str(assoc.object.id),
                                      split_line,
                                      context=ANNOTATION)
        if not object_ok:
            return rejected()

        # The validated class ID replaces the one parsed from the line
        checked_goid = self._validate_ontology_class_id(
            str(assoc.object.id), split_line)
        if checked_goid is None:
            return rejected()
        assoc.object.id = association.Curie.from_str(checked_goid)

        if not self._validate_id(str(assoc.evidence.type), split_line):
            return rejected()

        if assoc.interacting_taxon and not self._validate_taxon(
                str(assoc.interacting_taxon), split_line):
            self.report.error(line,
                              assocparser.Report.INVALID_TAXON,
                              str(assoc.interacting_taxon),
                              "Taxon ID is invalid",
                              rule=27)
            return rejected()

        #TODO: ecomap is currently one-way only, so evidence classes are not checked against it

        # Reference Column
        checked_references = self.validate_curie_ids(
            assoc.evidence.has_supporting_reference, split_line)
        if checked_references is None:
            return rejected()

        # With/From: every conjunction must validate
        for conjunction in assoc.evidence.with_support_from:
            if self.validate_curie_ids(conjunction.elements,
                                       split_line) is None:
                return rejected()

        return assocparser.ParseResult(line, [assoc], False)

Example #8

Show file

File: gpadparser.py Project: kshefchek/ontobio

def from_1_2(gpad_line: List[str],
             report=None,
             group="unknown",
             dataset="unknown",
             bio_entities=None):
    """Build a ``GoAssociation`` from the column values of one GPAD 1.2 line.

    Arguments
    ---------
    gpad_line : List[str]
        Column values of a single line, already split on tabs. More than 12
        columns are truncated to 12 (with a warning); 10 or 11 columns are
        padded in place with empty strings up to 12.
    report : Report, optional
        Collector for warnings and errors. When omitted, a fresh
        ``Report(group=group, dataset=dataset)`` is created.
    group : str
        Only used to build the fallback report.
    dataset : str
        Only used to build the fallback report.
    bio_entities : optional
        Object queried with ``.get(subject_curie)``. A hit replaces the
        placeholder subject and supplies the taxon. The lookup is skipped
        when this is omitted.

    Returns
    -------
    assocparser.ParseResult
        On success, built from the source line, a one-element list holding the
        association, and ``False``. After any reported error, built from the
        source line, an empty list, and ``True``.
    """
    # Same fallback as `to_association`: the default of None must not be dereferenced.
    report = Report(group=group, dataset=dataset) if report is None else report
    source_line = "\t".join(gpad_line)

    if source_line == "":
        report.error(source_line,
                     "Blank Line",
                     "EMPTY",
                     "Blank lines are not allowed",
                     rule=1)
        return assocparser.ParseResult(source_line, [], True, report=report)

    if len(gpad_line) > 12:
        report.warning(
            source_line,
            assocparser.Report.WRONG_NUMBER_OF_COLUMNS,
            "",
            msg=
            "There were more than 12 columns in this line. Proceeding by cutting off extra columns.",
            rule=1)

        gpad_line = gpad_line[:12]

    if 12 > len(gpad_line) >= 10:
        gpad_line += [""] * (12 - len(gpad_line))

    if len(gpad_line) != 12:
        report.error(
            source_line,
            assocparser.Report.WRONG_NUMBER_OF_COLUMNS,
            "",
            msg=
            "There were {columns} columns found in this line, and there should be between 10 and 12"
            .format(columns=len(gpad_line)),
            rule=1)
        return assocparser.ParseResult(source_line, [], True, report=report)

    ## check for missing columns
    ## We use indices here because we run GO RULES before we split the vals into individual variables
    DB_INDEX = 0
    DB_OBJECT_INDEX = 1
    QUALIFIER = 2
    REFERENCE_INDEX = 4
    EVIDENCE_INDEX = 5
    if gpad_line[DB_INDEX] == "":
        report.error(source_line,
                     Report.INVALID_IDSPACE,
                     "EMPTY",
                     "col1 is empty",
                     rule=1)
        return assocparser.ParseResult(source_line, [], True, report=report)
    if gpad_line[DB_OBJECT_INDEX] == "":
        report.error(source_line,
                     Report.INVALID_ID,
                     "EMPTY",
                     "col2 is empty",
                     rule=1)
        return assocparser.ParseResult(source_line, [], True, report=report)
    if gpad_line[QUALIFIER] == "":
        # NOTE(review): reported as INVALID_TAXON although the qualifier column is
        # being checked; confirm whether INVALID_QUALIFIER was intended.
        report.error(source_line,
                     Report.INVALID_TAXON,
                     "EMPTY",
                     "qualifier column is empty",
                     rule=1)
        return assocparser.ParseResult(source_line, [], True, report=report)
    if gpad_line[REFERENCE_INDEX] == "":
        report.error(source_line,
                     Report.INVALID_ID,
                     "EMPTY",
                     "reference column is empty",
                     rule=1)
        return assocparser.ParseResult(source_line, [], True, report=report)
    if gpad_line[EVIDENCE_INDEX] == "":
        report.error(source_line,
                     Report.INVALID_ID,
                     "EMPTY",
                     "Evidence column is empty",
                     rule=1)
        # Reject here like every other empty-column check, so the line is not
        # reported a second time by the evidence parsing further down.
        return assocparser.ParseResult(source_line, [], True, report=report)

    # Placeholder taxon, used unless a known entity supplies a real one below
    taxon = association.Curie("NCBITaxon", "0")
    subject_curie = association.Curie(gpad_line[DB_INDEX],
                                      gpad_line[DB_OBJECT_INDEX])
    subject = association.Subject(subject_curie, "", [""], [], [], taxon)

    # `bio_entities` defaults to None; only consult it when one was supplied
    entity = bio_entities.get(subject_curie) if bio_entities is not None else None
    if entity is not None:
        subject = entity
        taxon = subject.taxon

    go_term = association.Curie.from_str(gpad_line[3])
    if go_term.is_error():
        report.error(source_line,
                     Report.INVALID_SYMBOL,
                     gpad_line[3],
                     "Problem parsing GO Term",
                     taxon=str(taxon),
                     rule=1)
        return assocparser.ParseResult(source_line, [], True, report=report)

    # Named `term` rather than `object` so the builtin is not shadowed
    term = association.Term(go_term, taxon)

    evidence_type = association.Curie.from_str(gpad_line[EVIDENCE_INDEX])
    if evidence_type.is_error():
        report.error(source_line,
                     Report.INVALID_SYMBOL,
                     gpad_line[EVIDENCE_INDEX],
                     "Problem parsing Evidence ECO Curie",
                     taxon=str(taxon),
                     rule=1)
        return assocparser.ParseResult(source_line, [], True, report=report)

    # Pipe-separated references; empty fragments are ignored
    references = [
        association.Curie.from_str(e)
        for e in gpad_line[REFERENCE_INDEX].split("|") if e
    ]
    for r in references:
        if r.is_error():
            report.error(source_line,
                         Report.INVALID_SYMBOL,
                         gpad_line[REFERENCE_INDEX],
                         "Problem parsing references",
                         taxon=str(taxon),
                         rule=1)
            return assocparser.ParseResult(source_line, [],
                                           True,
                                           report=report)

    withfroms = association.ConjunctiveSet.str_to_conjunctions(
        gpad_line[6])  # Returns a list of ConjunctiveSets or Error
    if isinstance(withfroms, association.Error):
        report.error(source_line,
                     Report.INVALID_SYMBOL,
                     gpad_line[6],
                     "Problem parsing With/From column",
                     taxon=str(taxon),
                     rule=1)
        return assocparser.ParseResult(source_line, [], True, report=report)

    evidence = association.Evidence(evidence_type, references, withfroms)

    # The column is known to be non-empty, but it may hold nothing except "NOT"
    raw_qs = gpad_line[QUALIFIER].split("|")
    negated = "NOT" in raw_qs

    looked_up_qualifiers = [
        relations.lookup_label(q) for q in raw_qs if q != "NOT"
    ]
    if None in looked_up_qualifiers:
        report.error(source_line,
                     Report.INVALID_QUALIFIER,
                     raw_qs,
                     "Could not find a URI for qualifier",
                     taxon=str(taxon),
                     rule=1)
        return assocparser.ParseResult(source_line, [], True, report=report)

    if not looked_up_qualifiers:
        # Only "NOT" was given: there is no relation to take from the qualifiers,
        # so report it instead of failing on `qualifiers[0]` below.
        report.error(source_line,
                     Report.INVALID_QUALIFIER,
                     gpad_line[QUALIFIER],
                     "Qualifier column has no relation besides NOT",
                     taxon=str(taxon),
                     rule=1)
        return assocparser.ParseResult(source_line, [], True, report=report)

    qualifiers = [
        association.Curie.from_str(curie_util.contract_uri(q)[0])
        for q in looked_up_qualifiers
    ]

    # parse_date is handed the report and source line; None means reject the line
    date = assocparser.parse_date(gpad_line[8], report, source_line)
    if date is None:
        return assocparser.ParseResult(source_line, [], True, report=report)

    interacting_taxon = None
    if gpad_line[7]:
        taxon_result = gpad_line_validators["taxon"].validate(gpad_line[7])
        if not taxon_result.valid:
            report.error(source_line,
                         Report.INVALID_TAXON,
                         taxon_result.original,
                         taxon_result.message,
                         taxon=str(taxon_result.original),
                         rule=1)
            return assocparser.ParseResult(source_line, [],
                                           True,
                                           report=report)
        else:
            # Only the first parsed value is kept
            interacting_taxon = taxon_result.parsed[0]

    conjunctions = []
    if gpad_line[10]:
        conjunctions = association.ConjunctiveSet.str_to_conjunctions(
            gpad_line[10],
            conjunct_element_builder=lambda el: association.ExtensionUnit.
            from_str(el))

        if isinstance(conjunctions, association.Error):
            report.error(source_line,
                         Report.EXTENSION_SYNTAX_ERROR,
                         conjunctions.info,
                         "extensions should be relation(curie)",
                         taxon=str(taxon),
                         rule=1)
            return assocparser.ParseResult(source_line, [],
                                           True,
                                           report=report)

    properties_list = association.parse_annotation_properties(gpad_line[11])

    # The first qualifier doubles as the association's relation
    a = association.GoAssociation(source_line=source_line,
                                  subject=subject,
                                  relation=qualifiers[0],
                                  object=term,
                                  negated=negated,
                                  qualifiers=qualifiers,
                                  aspect=None,
                                  interacting_taxon=interacting_taxon,
                                  evidence=evidence,
                                  subject_extensions=[],
                                  object_extensions=conjunctions,
                                  provided_by=gpad_line[9],
                                  date=date,
                                  properties=properties_list)

    return assocparser.ParseResult(source_line, [a], False, report=report)

Example #9

Show file

File: gafparser.py Project: robertdigital/ontobio

def to_association(gaf_line: List[str],
                   report=None,
                   group="unknown",
                   dataset="unknown") -> assocparser.ParseResult:
    """Build a ``GoAssociation`` from the columns of a single GAF line.

    The line is normalised to 17 columns (extra columns are dropped with a
    warning, 15- and 16-column lines are padded with empty strings), the
    mandatory columns are checked, and the qualifier, evidence and extension
    columns are parsed.

    Arguments
    ---------
    gaf_line : List[str]
        The tab-split column values of one GAF line. The list passed in is
        not modified.
    report : Report, optional
        Collects errors and warnings; a new one is created from `group` and
        `dataset` when omitted.
    group : str
    dataset : str
        Only used to build the default report.

    Returns
    -------
    assocparser.ParseResult
        Built with the joined source line, a one-element association list and
        ``False`` on success; built with an empty list and ``True`` whenever
        an error was reported.
    """
    report = Report(group=group, dataset=dataset) if report is None else report
    source_line = "\t".join(gaf_line)

    if source_line == "":
        report.error(source_line,
                     "Blank Line",
                     "EMPTY",
                     "Blank lines are not allowed",
                     rule=1)
        return assocparser.ParseResult(source_line, [], True, report=report)

    if len(gaf_line) > 17:
        # If we see more than 17 columns, we will just cut off the columns after column 17
        report.warning(
            source_line,
            assocparser.Report.WRONG_NUMBER_OF_COLUMNS,
            "",
            msg=
            "There were more than 17 columns in this line. Proceeding by cutting off extra columns after column 17.",
            rule=1)
        gaf_line = gaf_line[:17]

    if 17 > len(gaf_line) >= 15:
        # Pad into a new list: `+=` would silently extend the caller's list.
        gaf_line = gaf_line + [""] * (17 - len(gaf_line))

    if len(gaf_line) != 17:
        report.error(
            source_line,
            assocparser.Report.WRONG_NUMBER_OF_COLUMNS,
            "",
            msg=
            "There were {columns} columns found in this line, and there should be 15 (for GAF v1) or 17 (for GAF v2)"
            .format(columns=len(gaf_line)),
            rule=1)
        return assocparser.ParseResult(source_line, [], True, report=report)

    ## check for missing columns
    ## We use indices here because we run GO RULES before we split the vals into individual variables
    DB_INDEX = 0
    DB_OBJECT_INDEX = 1
    TAXON_INDEX = 12
    REFERENCE_INDEX = 5
    if gaf_line[DB_INDEX] == "":
        report.error(source_line,
                     Report.INVALID_IDSPACE,
                     "EMPTY",
                     "col1 is empty",
                     taxon=gaf_line[TAXON_INDEX],
                     rule=1)
        return assocparser.ParseResult(source_line, [], True, report=report)
    if gaf_line[DB_OBJECT_INDEX] == "":
        report.error(source_line,
                     Report.INVALID_ID,
                     "EMPTY",
                     "col2 is empty",
                     taxon=gaf_line[TAXON_INDEX],
                     rule=1)
        return assocparser.ParseResult(source_line, [], True, report=report)
    if gaf_line[TAXON_INDEX] == "":
        report.error(source_line,
                     Report.INVALID_TAXON,
                     "EMPTY",
                     "taxon column is empty",
                     taxon=gaf_line[TAXON_INDEX],
                     rule=1)
        return assocparser.ParseResult(source_line, [], True, report=report)
    if gaf_line[REFERENCE_INDEX] == "":
        report.error(source_line,
                     Report.INVALID_ID,
                     "EMPTY",
                     "reference column 6 is empty",
                     taxon=gaf_line[TAXON_INDEX],
                     rule=1)
        return assocparser.ParseResult(source_line, [], True, report=report)

    # The taxon column holds one id, or two separated by "|"; the second one
    # (when present) is the interacting taxon.
    taxon = gaf_line[TAXON_INDEX].split("|")
    taxon_curie = taxon[0].replace("taxon", "NCBITaxon")
    interacting_taxon = taxon[1].replace(
        "taxon", "NCBITaxon") if len(taxon) == 2 else None
    subject_curie = "{db}:{id}".format(db=gaf_line[0], id=gaf_line[1])
    subject = association.Subject(subject_curie, gaf_line[2], gaf_line[9],
                                  gaf_line[10].split("|"), gaf_line[11],
                                  taxon_curie)
    aspect = gaf_line[8]
    negated, relation, qualifiers = assocparser._parse_qualifier(
        gaf_line[3], aspect)

    # For allowed, see http://geneontology.org/docs/go-annotations/#annotation-qualifiers
    for q in qualifiers:

        if q not in allowed_qualifiers:
            report.error(
                source_line,
                Report.INVALID_QUALIFIER,
                q,
                "Qualifiers must be `contributes_to`, `colocalizes_with`, or `NOT`",
                taxon=gaf_line[TAXON_INDEX],
                rule=1)
            return assocparser.ParseResult(source_line, [],
                                           True,
                                           report=report)

    object = association.Term(gaf_line[4], taxon_curie)
    evidence = association.Evidence(ecomap.coderef_to_ecoclass(gaf_line[6]),
                                    [e for e in gaf_line[5].split("|") if e],
                                    [e for e in gaf_line[7].split("|") if e])
    subject_extensions = [
        association.ExtensionUnit("rdfs:subClassOf", gaf_line[16])
    ] if gaf_line[16] else []

    # Column 16 (index 15): "|" separates alternatives, "," separates the
    # units that make up one conjunction.
    conjunctions = []
    if gaf_line[15]:
        for conjuncts in gaf_line[15].split("|"):
            extension_units = []
            for u in conjuncts.split(","):
                parsed = relation_tuple.findall(u)
                if len(parsed) == 1:
                    rel, term = parsed[0]
                    extension_units.append(association.ExtensionUnit(
                        rel, term))
                else:
                    # Otherwise, something went bad with the regex, and it's a bad parse
                    # Report the raw taxon column (a string) like every other
                    # error in this function, not the split list.
                    report.error(source_line,
                                 Report.EXTENSION_SYNTAX_ERROR,
                                 u,
                                 "extensions should be relation(curie)",
                                 taxon=gaf_line[TAXON_INDEX],
                                 rule=1)
                    return assocparser.ParseResult(source_line, [],
                                                   True,
                                                   report=report)

            conjunction = association.ExtensionConjunctions(extension_units)
            conjunctions.append(conjunction)
    object_extensions = association.ExtensionExpression(conjunctions)
    looked_up_rel = relations.lookup_label(relation)
    if looked_up_rel is None:
        report.error(
            source_line,
            assocparser.Report.INVALID_QUALIFIER,
            relation,
            "Qualifer must be \"colocalizes_with\", \"contributes_to\", or \"NOT\"",
            taxon=gaf_line[TAXON_INDEX],
            rule=1)
        return assocparser.ParseResult(source_line, [], True, report=report)

    a = association.GoAssociation(
        # Re-joined from the normalised (truncated/padded) columns.
        source_line="\t".join(gaf_line),
        subject=subject,
        relation=curie_util.contract_uri(looked_up_rel)[0],
        object=object,
        negated=negated,
        qualifiers=qualifiers,
        aspect=aspect,
        interacting_taxon=interacting_taxon,
        evidence=evidence,
        subject_extensions=subject_extensions,
        object_extensions=object_extensions,
        provided_by=gaf_line[14],
        date=gaf_line[13],
        properties={})

    return assocparser.ParseResult(source_line, [a], False, report=report)

Example #10

Show file

    def parse_line(self, line):
        """
        Parses a single line of a HPOA file

        Returns an ``assocparser.ParseResult`` built from the (possibly
        regenerated) line, a list of association dicts and a flag that is
        ``True`` when the line was rejected. A valid line yields exactly one
        association; a header line or an invalid line yields none.

        Note: most applications will only need to call this directly if they require fine-grained control of parsing. For most purposes,
        :method:`parse_file` can be used over the whole file

        Arguments
        ---------
        line : str
            A single tab-separated line from a HPOA file

        """
        config = self.config

        # A truthy result from the base-class validation is returned as-is.
        parsed = super().validate_line(line)
        if parsed:
            return parsed

        if self.is_header(line):
            return assocparser.ParseResult(line, [], False)

        # http://human-phenotype-ontology.github.io/documentation.html#annot
        vals = line.split("\t")
        if len(vals) != 14:
            self.report.error(
                line,
                assocparser.Report.WRONG_NUMBER_OF_COLUMNS,
                "",
                msg=
                "There were {columns} columns found in this line, and there should be 14"
                .format(columns=len(vals)))
            return assocparser.ParseResult(line, [], True)

        [
            db, db_object_id, db_object_symbol, qualifier, hpoid, reference,
            evidence, onset, frequency, withfrom, aspect, db_object_synonym,
            date, assigned_by
        ] = vals

        # hardcode this, as HPOA is currently human-only
        taxon = 'NCBITaxon:9606'

        # hardcode this, as HPOA is currently disease-only
        db_object_type = 'disease'

        ## --
        ## db + db_object_id. CARD=1
        ## --
        id = self._pair_to_id(db, db_object_id)
        if not self._validate_id(id, line, ENTITY):
            return assocparser.ParseResult(line, [], True)

        if not self._validate_id(hpoid, line, ANNOTATION):
            return assocparser.ParseResult(line, [], True)

        # validation
        #self._validate_symbol(db_object_symbol, line)

        #TODO: HPOA has different date styles
        #date = self._normalize_gaf_date(date, line)

        # Example use case: mapping from OMIM to Orphanet
        if config.entity_map is not None:
            id = self.map_id(id, config.entity_map)
            # Split on the first ":" only, so the local part stays a single
            # string. Storing a list in `vals` would make the join below
            # raise TypeError.
            db, _, db_object_id = id.partition(":")
            # Write both halves back so the regenerated line matches the
            # mapped id.
            vals[0] = db
            vals[1] = db_object_id

        ## --
        ## end of line re-processing
        ## --
        # regenerate line post-mapping
        line = "\t".join(vals)

        ## --
        ## db_object_synonym CARD=0..*
        ## --
        synonyms = db_object_synonym.split("|") if db_object_synonym else []

        ## --
        ## qualifier
        ## --
        ## we generate both qualifier and relation field
        qualifiers = qualifier.split("|") if qualifier else []
        negated = 'NOT' in qualifiers
        other_qualifiers = [q for q in qualifiers if q != 'NOT']

        ## CURRENTLY NOT USED
        if other_qualifiers:
            relation = other_qualifiers[0]
        else:
            # No explicit qualifier: derive the relation from the aspect
            # code; unknown codes leave it as None.
            relation = {
                'O': 'has_phenotype',
                'I': 'has_inheritance',
                'M': 'mortality',
                'C': 'has_onset',
            }.get(aspect)

        ## --
        ## hpoid
        ## --
        object = {'id': hpoid, 'taxon': taxon}

        # construct subject dict
        subject = {
            'id': id,
            'label': db_object_symbol,
            'type': db_object_type,
            'synonyms': synonyms,
            'taxon': {
                'id': taxon
            }
        }

        ## --
        ## evidence
        ## reference
        ## withfrom
        ## --
        evidence = {
            'type': evidence,
            'has_supporting_reference': reference.split("; ")
        }
        evidence['with_support_from'] = self._split_pipe(withfrom)

        ## Construct main return dict
        assoc = {
            'source_line': line,
            'subject': subject,
            'object': object,
            'negated': negated,
            'qualifiers': qualifiers,
            'relation': {
                'id': relation
            },
            'evidence': evidence,
            'provided_by': assigned_by,
            'date': date,
        }

        self._validate_assoc(assoc, line)

        return assocparser.ParseResult(line, [assoc], False)

Example #11

Show file

    def parse_line(self, line):
        """Parses a single line of a GPAD.

        Returns an ``assocparser.ParseResult`` built from the line, a list of
        association dicts and a flag that is ``True`` when the line was
        rejected. One association is produced per "|"-separated alternative
        in the annotation extension column; a header line or an invalid line
        yields none.

        Note: most applications will only need to call this directly if they require fine-grained control of parsing. For most purposes,
        :method:`parse_file` can be used over the whole file

        Arguments
        ---------
        line : str
            A single tab-separated line from a GPAD file

        """

        # A truthy result from the base-class validation is returned as-is.
        parsed = super().validate_line(line)
        if parsed:
            return parsed

        if self.is_header(line):
            return assocparser.ParseResult(line, [], False)

        vals = line.split("\t")
        if len(vals) != 12:
            self.report.error(
                line,
                Report.WRONG_NUMBER_OF_COLUMNS,
                "",
                msg=
                "There were {columns} columns found in this line, and there should be 12"
                .format(columns=len(vals)))
            # Was misspelled `ParseReslt`, which raised AttributeError
            # instead of returning the rejected line.
            return assocparser.ParseResult(line, [], True)

        [
            db,
            db_object_id,
            qualifier,
            goid,
            reference,
            evidence,
            withfrom,
            interacting_taxon_id,  # TODO
            date,
            assigned_by,
            annotation_xp,
            annotation_properties
        ] = vals

        id = self._pair_to_id(db, db_object_id)
        if not self._validate_id(id, line, ENTITY):
            return assocparser.ParseResult(line, [], True)

        if not self._validate_id(goid, line, ANNOTATION):
            return assocparser.ParseResult(line, [], True)

        date = self._normalize_gaf_date(date, line)

        # The result is ignored here, so a failed evidence check does not
        # reject the line.
        self._validate_id(evidence, line, None)
        #TODO: ecomap is currently one-way only
        #ecomap = self.config.ecomap
        #if ecomap != None:
        #    if ecomap.ecoclass_to_coderef(evidence) == (None,None):
        #        self.report.error(line, Report.UNKNOWN_EVIDENCE_CLASS, evidence,
        #                          msg="Expecting a known ECO class ID")

        ## --
        ## qualifier
        ## --
        negated, relation, other_qualifiers = self._parse_qualifier(
            qualifier, None)

        assocs = []
        # "|" separates alternative extension sets (one association each);
        # "," separates the expressions that belong to the same set.
        for xp_or in annotation_xp.split("|"):
            extns = []
            for xp_and in xp_or.split(","):
                if xp_and != "":
                    expr = self._parse_class_expression(xp_and, line=line)
                    if expr is not None:
                        extns.append(expr)
            assoc = {
                'source_line': line,
                'subject': {
                    'id': id
                },
                'object': {
                    'id': goid,
                    'extensions': extns
                },
                'negated': negated,
                'relation': {
                    'id': relation
                },
                'evidence': {
                    'type': evidence,
                    'with_support_from': self._split_pipe(withfrom),
                    'has_supporting_reference': self._split_pipe(reference)
                },
                'provided_by': assigned_by,
                'date': date,
            }
            if len(other_qualifiers) > 0:
                assoc['qualifiers'] = other_qualifiers

            self._validate_assoc(assoc, line)

            assocs.append(assoc)
        return assocparser.ParseResult(line, assocs, False)

Example #12

Show file

File: gpadparser.py Project: meftaul/ontobio

    def parse_line(self, line):
        """Parses a single line of a GPAD.

        Returns an ``assocparser.ParseResult`` built from the line, a list of
        association dicts and a flag that is ``True`` when the line was
        rejected. A header line yields one ``{"header": True, ...}`` entry, a
        valid annotation line yields one association, and a rejected line
        yields none.

        Note: most applications will only need to call this directly if they require fine-grained control of parsing. For most purposes,
        :method:`parse_file` can be used over the whole file

        Arguments
        ---------
        line : str
            A single tab-separated line from a GPAD file

        """
        # A truthy result from the base-class validation is returned as-is.
        parsed = super().validate_line(line)
        if parsed:
            return parsed

        if self.is_header(line):
            return assocparser.ParseResult(line, [{
                "header": True,
                "line": line.strip()
            }], False)

        vals = [el.strip() for el in line.split("\t")]

        # Pass a copy so `to_association` cannot alter `vals`.
        parsed = to_association(list(vals), report=self.report)
        if parsed.associations == []:
            return parsed

        assoc = parsed.associations[0]

        go_rule_results = qc.test_go_rules(assoc, self.config)
        for rule, result in go_rule_results.all_results.items():
            # Rule ids are split on ":" and the second part is used as the
            # integer rule number.
            if result.result_type == qc.ResultType.WARNING:
                self.report.warning(line,
                                    assocparser.Report.VIOLATES_GO_RULE,
                                    "",
                                    msg="{id}: {message}".format(
                                        id=rule.id, message=result.message),
                                    rule=int(rule.id.split(":")[1]))

            if result.result_type == qc.ResultType.ERROR:
                self.report.error(line,
                                  assocparser.Report.VIOLATES_GO_RULE,
                                  "",
                                  msg="{id}: {message}".format(
                                      id=rule.id, message=result.message),
                                  rule=int(rule.id.split(":")[1]))
                # Skip the annotation
                return assocparser.ParseResult(line, [], True)

            if result.result_type == qc.ResultType.PASS:
                self.report.message(assocparser.Report.INFO,
                                    line,
                                    Report.RULE_PASS,
                                    "",
                                    msg="Passing Rule",
                                    rule=int(rule.id.split(":")[1]))

        # Continue with the annotation as returned by the rule run, re-split
        # into its GPAD columns.
        vals = list(go_rule_results.annotation.to_gpad_tsv())
        [
            db, db_object_id, qualifier, goid, reference, evidence, withfrom,
            interacting_taxon_id, date, assigned_by, annotation_xp,
            annotation_properties
        ] = vals

        split_line = assocparser.SplitLine(line=line, values=vals, taxon="")

        id = self._pair_to_id(db, db_object_id)
        if not self._validate_id(id, split_line, context=ENTITY):
            return assocparser.ParseResult(line, [], True)

        if not self._validate_id(goid, split_line, context=ANNOTATION):
            return assocparser.ParseResult(line, [], True)

        valid_goid = self._validate_ontology_class_id(goid, split_line)
        if valid_goid is None:
            return assocparser.ParseResult(line, [], True)
        goid = valid_goid

        date = self._normalize_gaf_date(date, split_line)

        if reference == "":
            self.report.error(line, Report.INVALID_ID, "EMPTY",
                              "reference column 6 is empty")
            return assocparser.ParseResult(line, [], True)

        # The result is ignored here, so a failed evidence check does not
        # reject the line.
        self._validate_id(evidence, split_line)

        interacting_taxon = None if interacting_taxon_id == "" else interacting_taxon_id
        if interacting_taxon is not None:
            interacting_taxon = self._taxon_id(interacting_taxon_id,
                                               split_line)
            if interacting_taxon is None:
                self.report.error(line,
                                  assocparser.Report.INVALID_TAXON,
                                  interacting_taxon_id,
                                  msg="Taxon ID is invalid")
                return assocparser.ParseResult(line, [], True)

        #TODO: ecomap is currently one-way only
        #ecomap = self.config.ecomap
        #if ecomap != None:
        #    if ecomap.ecoclass_to_coderef(evidence) == (None,None):
        #        self.report.error(line, Report.UNKNOWN_EVIDENCE_CLASS, evidence,
        #                          msg="Expecting a known ECO class ID")

        ## --
        ## qualifier
        ## --
        negated, relation, other_qualifiers = self._parse_qualifier(
            qualifier, None)

        # Reference Column
        references = self.validate_pipe_separated_ids(reference, split_line)
        if references is None:
            # Reporting occurs in above function call
            return assocparser.ParseResult(line, [], True)

        # With/From
        withfroms = self.validate_pipe_separated_ids(withfrom,
                                                     split_line,
                                                     empty_allowed=True,
                                                     extra_delims=",")
        if withfroms is None:
            # Reporting occurs in above function call
            return assocparser.ParseResult(line, [], True)

        ## --
        ## parse annotation extension
        ## See appending in http://doi.org/10.1186/1471-2105-15-155
        ## --
        object_or_exprs = self._parse_full_extension_expression(
            annotation_xp, line=split_line)

        # Fall back to the id itself when no GPI entry is available.
        subject_symbol = id
        subject_fullname = id
        subject_synonyms = []
        if self.gpi is not None:
            gp = self.gpi.get(id, {})
            # `gp is not {}` compared identity against a fresh dict and was
            # always true, so an id missing from the GPI raised KeyError
            # below. Test for a non-empty entry instead.
            if gp:
                subject_symbol = gp["symbol"]
                subject_fullname = gp["name"]
                subject_synonyms = gp["synonyms"].split("|")

        assoc = {
            'source_line': line,
            'subject': {
                'id': id,
                'label': subject_symbol,
                'fullname': subject_fullname,
                'synonyms': subject_synonyms,
                # NOTE(review): the subject taxon is filled from the
                # interacting-taxon column; confirm this is intended.
                'taxon': {
                    'id': interacting_taxon
                },
            },
            'object': {
                'id': goid
            },
            'negated': negated,
            'relation': {
                'id': relation
            },
            'interacting_taxon': interacting_taxon,
            'evidence': {
                'type': evidence,
                'with_support_from': withfroms,
                'has_supporting_reference': references
            },
            'subject_extensions': [],
            'object_extensions': {},
            'aspect': self.compute_aspect(goid),
            'provided_by': assigned_by,
            'date': date,
        }
        if len(other_qualifiers) > 0:
            assoc['qualifiers'] = other_qualifiers
        if object_or_exprs is not None and len(object_or_exprs) > 0:
            assoc['object_extensions'] = {'union_of': object_or_exprs}

        return assocparser.ParseResult(line, [assoc], False)

Example #13

Show file

File: gpadparser.py Project: meftaul/ontobio

def to_association(gpad_line: List[str],
                   report=None,
                   group="unknown",
                   dataset="unknown") -> assocparser.ParseResult:
    """Build a ``GoAssociation`` from the columns of a single GPAD line.

    The line is normalised to 12 columns (extra columns are dropped with a
    warning, 10- and 11-column lines are padded with empty strings), the
    mandatory columns are checked, and the qualifier, annotation extension
    and annotation property columns are parsed.

    Arguments
    ---------
    gpad_line : List[str]
        The tab-split column values of one GPAD line. The list passed in is
        not modified.
    report : Report, optional
        Collects errors and warnings; a new one is created from `group` and
        `dataset` when omitted.
    group : str
    dataset : str
        Only used to build the default report.

    Returns
    -------
    assocparser.ParseResult
        Built with the joined source line, a one-element association list and
        ``False`` on success; built with an empty list and ``True`` whenever
        an error was reported.
    """

    report = Report(group=group, dataset=dataset) if report is None else report

    source_line = "\t".join(gpad_line)

    if len(gpad_line) > 12:
        report.warning(
            source_line,
            assocparser.Report.WRONG_NUMBER_OF_COLUMNS,
            "",
            msg=
            "There were more than 12 columns in this line. Proceeding by cutting off extra columns.",
            rule=1)

        gpad_line = gpad_line[:12]

    if 12 > len(gpad_line) >= 10:
        # Pad into a new list: `+=` would silently extend the caller's list.
        gpad_line = gpad_line + [""] * (12 - len(gpad_line))

    if len(gpad_line) != 12:
        report.error(
            source_line,
            assocparser.Report.WRONG_NUMBER_OF_COLUMNS,
            "",
            msg=
            "There were {columns} columns found in this line, and there should be between 10 and 12"
            .format(columns=len(gpad_line)))
        return assocparser.ParseResult(source_line, [], True, report=report)

    ## check for missing columns
    ## We use indices here because we run GO RULES before we split the vals into individual variables
    DB_INDEX = 0
    DB_OBJECT_INDEX = 1
    QUALIFIER = 2
    REFERENCE_INDEX = 4
    EVIDENCE_INDEX = 5
    EXTENSION_INDEX = 10
    PROPERTIES_INDEX = 11
    if gpad_line[DB_INDEX] == "":
        report.error(source_line,
                     Report.INVALID_IDSPACE,
                     "EMPTY",
                     "col1 is empty",
                     rule=1)
        return assocparser.ParseResult(source_line, [], True, report=report)
    if gpad_line[DB_OBJECT_INDEX] == "":
        report.error(source_line,
                     Report.INVALID_ID,
                     "EMPTY",
                     "col2 is empty",
                     rule=1)
        return assocparser.ParseResult(source_line, [], True, report=report)
    if gpad_line[QUALIFIER] == "":
        # Was reported as INVALID_TAXON, a copy-paste slip: the message is
        # about the qualifier column.
        report.error(source_line,
                     Report.INVALID_QUALIFIER,
                     "EMPTY",
                     "qualifier column is empty",
                     rule=1)
        return assocparser.ParseResult(source_line, [], True, report=report)
    if gpad_line[REFERENCE_INDEX] == "":
        report.error(source_line,
                     Report.INVALID_ID,
                     "EMPTY",
                     "reference column is empty",
                     rule=1)
        return assocparser.ParseResult(source_line, [], True, report=report)
    if gpad_line[EVIDENCE_INDEX] == "":
        report.error(source_line,
                     Report.INVALID_ID,
                     "EMPTY",
                     "Evidence column is empty",
                     rule=1)
        # Reject the line like every other missing-column check; previously
        # parsing carried on with an empty evidence value.
        return assocparser.ParseResult(source_line, [], True, report=report)

    # This function never derives a taxon from the line; the empty string is
    # what gets passed to the report calls below.
    taxon = ""
    subject_curie = "{db}:{id}".format(db=gpad_line[0], id=gpad_line[1])
    subject = association.Subject(subject_curie, "", "", [], "", "")
    object = association.Term(gpad_line[3], "")
    evidence = association.Evidence(gpad_line[5],
                                    [e for e in gpad_line[4].split("|") if e],
                                    [e for e in gpad_line[6].split("|") if e])

    raw_qs = gpad_line[2].split("|")
    negated = "NOT" in raw_qs
    looked_up_qualifiers = [
        relations.lookup_label(q) for q in raw_qs if q != "NOT"
    ]
    if None in looked_up_qualifiers:
        report.error(source_line,
                     Report.INVALID_QUALIFIER,
                     raw_qs,
                     "Could not find a URI for qualifier",
                     taxon=taxon,
                     rule=1)
        return assocparser.ParseResult(source_line, [], True, report=report)

    qualifiers = [curie_util.contract_uri(q)[0] for q in looked_up_qualifiers]

    # Annotation extensions live in column 11 (index 10); index 11 is the
    # annotation-properties column and was wrongly parsed here before.
    # "|" separates alternatives, "," separates the units of one conjunction.
    conjunctions = []
    if gpad_line[EXTENSION_INDEX]:
        for conjuncts in gpad_line[EXTENSION_INDEX].split("|"):
            extension_units = []
            for u in conjuncts.split(","):
                parsed = relation_tuple.findall(u)
                if len(parsed) == 1:
                    rel, term = parsed[0]
                    extension_units.append(association.ExtensionUnit(
                        rel, term))
                else:
                    # Otherwise, something went bad with the regex, and it's a bad parse
                    report.error(source_line,
                                 Report.EXTENSION_SYNTAX_ERROR,
                                 u,
                                 "extensions should be relation(curie)",
                                 taxon=taxon,
                                 rule=1)
                    return assocparser.ParseResult(source_line, [],
                                                   True,
                                                   report=report)

            conjunction = association.ExtensionConjunctions(extension_units)
            conjunctions.append(conjunction)
    object_extensions = association.ExtensionExpression(conjunctions)

    # Properties are "key=value" pairs separated by "|". Split on the first
    # "=" only so a value that itself contains "=" is kept whole.
    properties_list = [
        prop.split("=", 1)
        for prop in gpad_line[PROPERTIES_INDEX].split("|") if prop
    ]
    a = association.GoAssociation(
        # Re-joined from the normalised (truncated/padded) columns.
        source_line="\t".join(gpad_line),
        subject=subject,
        relation="",
        object=object,
        negated=negated,
        qualifiers=qualifiers,
        aspect=None,
        interacting_taxon=gpad_line[7],
        evidence=evidence,
        subject_extensions=[],
        object_extensions=object_extensions,
        provided_by=gpad_line[9],
        date=gpad_line[8],
        properties={prop[0]: prop[1]
                    for prop in properties_list if prop})

    return assocparser.ParseResult(source_line, [a], False, report=report)

Example #14

Show file

File: gafparser.py Project: valearna/ontobio

def to_association(
    gaf_line: List[str],
    report=None,
    group="unknown",
    dataset="unknown",
    qualifier_parser=Qualifier2_1()) -> assocparser.ParseResult:
    """
    Convert the already-split columns of one GAF line into a GoAssociation.

    The column list is normalised to 17 entries (extra columns are cut off
    with a warning, 15- or 16-column lines are padded with empty strings),
    the required columns are checked for emptiness, and the remaining
    columns are turned into the `association` model objects.

    Arguments
    ---------
    gaf_line : List[str]
        The column values of a single GAF line. Note that a 15- or
        16-column list is padded in place.
    report : Report, optional
        Report that errors and warnings are written to. A fresh one is
        created from `group` and `dataset` when omitted.
    group : str
        Group name used only when a new Report has to be created.
    dataset : str
        Dataset name used only when a new Report has to be created.
    qualifier_parser
        Object whose `validate` method checks the qualifier column
        (column 4).

    Returns
    -------
    assocparser.ParseResult
        Holds the single parsed association on success. On any problem the
        association list is empty, the result is flagged as skipped, and
        the problem has been recorded on the report.
    """
    report = Report(group=group, dataset=dataset) if report is None else report
    source_line = "\t".join(gaf_line)

    if source_line == "":
        report.error(source_line,
                     "Blank Line",
                     "EMPTY",
                     "Blank lines are not allowed",
                     rule=1)
        return assocparser.ParseResult(source_line, [], True, report=report)

    if len(gaf_line) > 17:
        # If we see more than 17 columns, we will just cut off the columns after column 17
        report.warning(
            source_line,
            assocparser.Report.WRONG_NUMBER_OF_COLUMNS,
            "",
            msg=
            "There were more than 17 columns in this line. Proceeding by cutting off extra columns after column 17.",
            rule=1)
        gaf_line = gaf_line[:17]

    # Lines with 15 or 16 columns are padded so that everything below can
    # index the full 17-column layout.
    if 17 > len(gaf_line) >= 15:
        gaf_line += [""] * (17 - len(gaf_line))

    if len(gaf_line) != 17:
        report.error(
            source_line,
            assocparser.Report.WRONG_NUMBER_OF_COLUMNS,
            "",
            msg=
            "There were {columns} columns found in this line, and there should be 15 (for GAF v1) or 17 (for GAF v2)"
            .format(columns=len(gaf_line)),
            rule=1)
        return assocparser.ParseResult(source_line, [], True, report=report)

    ## check for missing columns
    ## We use indeces here because we run GO RULES before we split the vals into individual variables
    DB_INDEX = 0
    DB_OBJECT_INDEX = 1
    TAXON_INDEX = 12
    REFERENCE_INDEX = 5
    # (column index, report error type, message), checked in this order;
    # the first empty required column aborts the parse.
    required_columns = (
        (DB_INDEX, Report.INVALID_IDSPACE, "col1 is empty"),
        (DB_OBJECT_INDEX, Report.INVALID_ID, "col2 is empty"),
        (TAXON_INDEX, Report.INVALID_TAXON, "taxon column is empty"),
        (REFERENCE_INDEX, Report.INVALID_ID, "reference column 6 is empty"),
    )
    for column_index, error_type, message in required_columns:
        if gaf_line[column_index] == "":
            report.error(source_line,
                         error_type,
                         "EMPTY",
                         message,
                         taxon=gaf_line[TAXON_INDEX],
                         rule=1)
            return assocparser.ParseResult(source_line, [],
                                           True,
                                           report=report)

    # The taxon column may hold "taxon:A|taxon:B"; the first entry is the
    # subject's taxon, an optional second one is the interacting taxon.
    taxon_ids = gaf_line[TAXON_INDEX].split("|")
    taxon_curie = taxon_ids[0].replace("taxon", "NCBITaxon")
    date = assocparser._normalize_gaf_date(gaf_line[13], report, taxon_curie,
                                           source_line)
    if date is None:
        return assocparser.ParseResult(source_line, [], True, report=report)

    interacting_taxon = taxon_ids[1].replace(
        "taxon", "NCBITaxon") if len(taxon_ids) == 2 else None
    subject_curie = "{db}:{id}".format(db=gaf_line[0], id=gaf_line[1])
    subject = association.Subject(subject_curie, gaf_line[2], gaf_line[9],
                                  gaf_line[10].split("|"), gaf_line[11],
                                  taxon_curie)
    aspect = gaf_line[8]
    negated, relation, qualifiers = assocparser._parse_qualifier(
        gaf_line[3], aspect)

    # column 4 is qualifiers -> index 3
    # For allowed, see http://geneontology.org/docs/go-annotations/#annotation-qualifiers
    parsed_qualifiers = qualifier_parser.validate(gaf_line[3])
    if not parsed_qualifiers.valid:
        report.error(source_line,
                     Report.INVALID_QUALIFIER,
                     parsed_qualifiers.original,
                     parsed_qualifiers.message,
                     taxon=gaf_line[TAXON_INDEX],
                     rule=1)
        return assocparser.ParseResult(source_line, [], True, report=report)

    object = association.Term(gaf_line[4], taxon_curie)
    evidence = association.Evidence(
        ecomap.coderef_to_ecoclass(gaf_line[6]),
        [e for e in gaf_line[5].split("|") if e],
        association.ConjunctiveSet.str_to_conjunctions(gaf_line[7]))

    # Column 17 (gene product form) becomes a single subject extension.
    subject_extensions = [
        association.ExtensionUnit("rdfs:subClassOf", gaf_line[16])
    ] if gaf_line[16] else []

    conjunctions = []
    if gaf_line[15]:
        conjunctions = association.ConjunctiveSet.str_to_conjunctions(
            gaf_line[15],
            conjunct_element_builder=lambda el: association.ExtensionUnit.
            from_str(el))

        if isinstance(conjunctions, association.Error):
            # Report the raw taxon column, like every other error above,
            # rather than the split list of taxon ids.
            report.error(source_line,
                         Report.EXTENSION_SYNTAX_ERROR,
                         conjunctions.info,
                         "extensions should be relation(curie)",
                         taxon=gaf_line[TAXON_INDEX],
                         rule=1)
            return assocparser.ParseResult(source_line, [],
                                           True,
                                           report=report)

    looked_up_rel = relations.lookup_label(relation)
    if looked_up_rel is None:
        report.error(source_line,
                     assocparser.Report.INVALID_QUALIFIER,
                     relation,
                     "Could not find CURIE for relation `{}`".format(relation),
                     taxon=gaf_line[TAXON_INDEX],
                     rule=1)
        return assocparser.ParseResult(source_line, [], True, report=report)

    a = association.GoAssociation(
        source_line="\t".join(gaf_line),
        subject=subject,
        relation=curie_util.contract_uri(looked_up_rel)[0],
        object=object,
        negated=negated,
        qualifiers=qualifiers,
        aspect=aspect,
        interacting_taxon=interacting_taxon,
        evidence=evidence,
        subject_extensions=subject_extensions,
        object_extensions=conjunctions,
        provided_by=gaf_line[14],
        date=date,
        properties={})

    return assocparser.ParseResult(source_line, [a], False, report=report)

Example #15

Show file

File: gafparser.py Project: valearna/ontobio

    def parse_line(self, line):
        """
        Parses a single line of a GAF

        Returns an `assocparser.ParseResult`. For a header line its
        association list holds one `{"header": True, "line": ...}` dict.
        For an annotation line it holds the single association dict built
        from the line, or it is empty (and the result is flagged as
        skipped) when the line fails validation.

        Note: most applications will only need to call this directly if they require fine-grained control of parsing. For most purposes,
        :method:`parse_file` can be used over the whole file

        Arguments
        ---------
        line : str
            A single tab-separated line from a GAF file

        """

        # Returns assocparser.ParseResult
        parsed = super().validate_line(line)
        if parsed:
            return parsed

        if self.is_header(line):
            # Save off version info here
            if self.version is None:
                # We are still looking
                parsed = assocparser.parser_version_regex.findall(line)
                if len(parsed) == 1:
                    _filetype, version, _ = parsed[0]
                    if version == "2.2":
                        logger.info("Detected GAF version 2.2")
                        self.version = version
                    else:
                        logger.info(
                            "Detected GAF version {}, so using 2.1".format(
                                version))
                        self.version = self.default_version

            return assocparser.ParseResult(line, [{
                "header": True,
                "line": line.strip()
            }], False)

        # At this point, we should have gone through all the header, and a version number should be established
        if self.version is None:
            logger.warning(
                "No version number found for this file so we will assum GAF version: {}"
                .format(self.default_version))
            self.version = self.default_version

        vals = [el.strip() for el in line.split("\t")]

        # GAF v1 is defined as 15 cols, GAF v2 as 17.
        # We treat everything as GAF2 by adding two blank columns.
        # TODO: check header metadata to see if columns corresponds to declared dataformat version

        parsed = to_association(list(vals),
                                report=self.report,
                                qualifier_parser=self.qualifier_parser())
        if parsed.associations == []:
            return parsed

        assoc = parsed.associations[0]
        # self.report = parsed.report
        ## Run GO Rules, save split values into individual variables
        go_rule_results = qc.test_go_rules(assoc,
                                           self.config,
                                           group=self.group)
        for rule, result in go_rule_results.all_results.items():
            if result.result_type == qc.ResultType.WARNING:
                self.report.warning(line,
                                    assocparser.Report.VIOLATES_GO_RULE,
                                    "",
                                    msg="{id}: {message}".format(
                                        id=rule.id, message=result.message),
                                    rule=int(rule.id.split(":")[1]))

            if result.result_type == qc.ResultType.ERROR:
                self.report.error(line,
                                  assocparser.Report.VIOLATES_GO_RULE,
                                  "",
                                  msg="{id}: {message}".format(
                                      id=rule.id, message=result.message),
                                  rule=int(rule.id.split(":")[1]))
                # Skip the annotation
                return assocparser.ParseResult(line, [], True)

            if result.result_type == qc.ResultType.PASS:
                self.report.message(assocparser.Report.INFO,
                                    line,
                                    Report.RULE_PASS,
                                    "",
                                    msg="Passing Rule",
                                    rule=int(rule.id.split(":")[1]))

        # From here on we work with the columns of the annotation as
        # returned by the rule run, not with the raw input columns.
        vals = list(go_rule_results.annotation.to_gaf_tsv())
        [
            db, db_object_id, db_object_symbol, qualifier, goid, reference,
            evidence, withfrom, aspect, db_object_name, db_object_synonym,
            db_object_type, taxon, date, assigned_by, annotation_xp,
            gene_product_isoform
        ] = vals
        split_line = assocparser.SplitLine(line=line, values=vals, taxon=taxon)

        if self.config.group_idspace is not None and assigned_by not in self.config.group_idspace:
            self.report.warning(
                line,
                Report.INVALID_ID,
                assigned_by,
                "GORULE:0000027: assigned_by is not present in groups reference",
                taxon=taxon,
                rule=27)

        if self.config.entity_idspaces is not None and db not in self.config.entity_idspaces:
            # Are we a synonym?
            upgrade = self.config.entity_idspaces.reverse(db)
            if upgrade is not None:
                # If we found a synonym
                self.report.warning(
                    line,
                    Report.INVALID_ID_DBXREF,
                    db,
                    "GORULE:0000027: {} is a synonym for the correct ID {}, and has been updated"
                    .format(db, upgrade),
                    taxon=taxon,
                    rule=27)
                db = upgrade

        ## --
        ## db + db_object_id. CARD=1
        ## --assigned_by
        id = self._pair_to_id(db, db_object_id)
        if not self._validate_id(
                id, split_line, allowed_ids=self.config.entity_idspaces):

            return assocparser.ParseResult(line, [], True)

        # Using a given gpi file to validate the gene object
        if self.gpi is not None:
            entity = self.gpi.get(id, None)
            if entity is not None:
                db_object_symbol = entity["symbol"]
                db_object_name = entity["name"]
                db_object_synonym = entity["synonyms"]
                db_object_type = entity["type"]

        if not self._validate_id(goid, split_line, context=ANNOTATION):
            # Use the module logger rather than printing to stdout.
            logger.warning("skipping because {} not validated!".format(goid))
            return assocparser.ParseResult(line, [], True)

        valid_goid = self._validate_ontology_class_id(goid, split_line)
        if valid_goid is None:
            return assocparser.ParseResult(line, [], True)
        goid = valid_goid

        ecomap = self.config.ecomap
        if ecomap is not None:
            if ecomap.coderef_to_ecoclass(evidence, reference) is None:
                self.report.error(
                    line,
                    assocparser.Report.UNKNOWN_EVIDENCE_CLASS,
                    evidence,
                    msg="Expecting a known ECO GAF code, e.g ISS",
                    rule=1)
                return assocparser.ParseResult(line, [], True)

        references = self.validate_pipe_separated_ids(reference, split_line)
        if references is None:
            # Reporting occurs in above function call
            return assocparser.ParseResult(line, [], True)

        # With/From
        withfroms = self.validate_pipe_separated_ids(withfrom,
                                                     split_line,
                                                     empty_allowed=True,
                                                     extra_delims=",")
        if withfroms is None:
            # Reporting occurs in above function call
            return assocparser.ParseResult(line, [], True)

        # validation
        self._validate_symbol(db_object_symbol, split_line)

        # Example use case: mapping from UniProtKB to MOD ID
        if self.config.entity_map is not None:
            id = self.map_id(id, self.config.entity_map)
            # Split on the first ":" only, so the local part stays a string
            # (a list here would make the "\t".join below raise TypeError).
            db, _, db_object_id = id.partition(":")
            # NOTE(review): only column 2 is rewritten; column 1 keeps the
            # pre-mapping db value - confirm whether that is intended.
            vals[1] = db_object_id

        ## --
        ## end of line re-processing
        ## --
        # regenerate line post-mapping
        line = "\t".join(vals)

        ## --
        ## taxon CARD={1,2}
        ## --
        ## if a second value is specified, this is the interacting taxon
        ## We do not use the second value
        taxons = taxon.split("|")
        normalized_taxon = self._taxon_id(taxons[0], split_line)
        if normalized_taxon is None:
            self.report.error(line,
                              assocparser.Report.INVALID_TAXON,
                              taxon,
                              msg="Taxon ID is invalid")
            return assocparser.ParseResult(line, [], True)

        self._validate_taxon(normalized_taxon, split_line)

        interacting_taxon = None
        if len(taxons) == 2:
            interacting_taxon = self._taxon_id(taxons[1], split_line)
            if interacting_taxon is None:
                self.report.error(line,
                                  assocparser.Report.INVALID_TAXON,
                                  taxon,
                                  msg="Taxon ID is invalid")
                return assocparser.ParseResult(line, [], True)

        ## --
        ## db_object_synonym CARD=0..*
        ## --
        synonyms = db_object_synonym.split("|")
        if db_object_synonym == "":
            synonyms = []

        ## --
        ## parse annotation extension
        ## See appendix in http://doi.org/10.1186/1471-2105-15-155
        ## --
        object_or_exprs = self._parse_full_extension_expression(
            annotation_xp, line=split_line)

        ## --
        ## qualifier
        ## --
        negated, relation, other_qualifiers = self._parse_qualifier(
            qualifier, aspect)

        ## --
        ## goid
        ## --
        # TODO We shouldn't overload buildin keywords/functions
        object = {'id': goid, 'taxon': normalized_taxon}

        # construct subject dict
        subject = {
            'id': id,
            'label': db_object_symbol,
            'type': db_object_type,
            'fullname': db_object_name,
            'synonyms': synonyms,
            'taxon': {
                'id': normalized_taxon
            }
        }

        ## --
        ## gene_product_isoform
        ## --
        ## This is mapped to a more generic concept of subject_extensions
        subject_extns = []
        if gene_product_isoform is not None and gene_product_isoform != '':
            subject_extns.append({
                'property': 'isoform',
                'filler': gene_product_isoform
            })

        object_extensions = {}
        if object_or_exprs is not None and len(object_or_exprs) > 0:
            object_extensions['union_of'] = object_or_exprs

        ## --
        ## evidence
        ## reference
        ## withfrom
        ## --
        evidence_obj = {
            'type': evidence,
            'has_supporting_reference': references,
            'with_support_from': withfroms
        }

        ## Construct main return dict
        assoc = {
            'source_line': line,
            'subject': subject,
            'object': object,
            'negated': negated,
            'qualifiers': other_qualifiers,  # should be either 0 or 1 item
            'aspect': aspect,
            'relation': {
                'id': relation
            },
            'interacting_taxon': interacting_taxon,
            'evidence': evidence_obj,
            'provided_by': assigned_by,
            'date': date,
            'subject_extensions': subject_extns,
            'object_extensions': object_extensions
        }

        return assocparser.ParseResult(line, [assoc], False, evidence.upper())