def to_association(gaf_line: List[str], report=None, group="unknown", dataset="unknown", qualifier_parser=assocparser.Qualifier2_1(), bio_entities=None) -> assocparser.ParseResult:
    """
    Build a ``GoAssociation`` from the already-split columns of one GAF line.

    Arguments
    ---------
    gaf_line : List[str]
        The column values of a single GAF line. 15 or 16 columns are padded
        with blanks to 17; anything past column 17 is dropped with a warning.
        The caller's list is not modified.
    report
        Report that collects errors and warnings. When None, a new ``Report``
        is created from `group` and `dataset`.
    group : str
        Passed to the ``Report`` constructor when `report` is None.
    dataset : str
        Passed to the ``Report`` constructor when `report` is None.
    qualifier_parser
        Validator applied to the qualifier column (column 4).
    bio_entities
        Lookup of known subjects keyed by subject CURIE. When None, an empty
        ``collections.BioEntities`` is used.

    Returns
    -------
    assocparser.ParseResult
        Holds a single association on success. On any reported error the
        result has an empty association list and its skipped flag set.
    """
    report = Report(group=group, dataset=dataset) if report is None else report
    bio_entities = collections.BioEntities(dict()) if bio_entities is None else bio_entities
    source_line = "\t".join(gaf_line)

    def skip_line():
        # Every rejected line yields the same "skipped, no associations" result.
        return assocparser.ParseResult(source_line, [], True, report=report)

    if source_line == "":
        report.error(source_line, "Blank Line", "EMPTY", "Blank lines are not allowed", rule=1)
        return skip_line()

    if len(gaf_line) > 17:
        # If we see more than 17 columns, we will just cut off the columns after column 17
        report.warning(
            source_line, assocparser.Report.WRONG_NUMBER_OF_COLUMNS, "",
            msg="There were more than 17 columns in this line. Proceeding by cutting off extra columns after column 17.",
            rule=1)
        gaf_line = gaf_line[:17]

    if 17 > len(gaf_line) >= 15:
        # Rebind rather than `+=` so the caller's list is not padded in place.
        gaf_line = gaf_line + [""] * (17 - len(gaf_line))

    if len(gaf_line) != 17:
        report.error(
            source_line, assocparser.Report.WRONG_NUMBER_OF_COLUMNS, "",
            msg="There were {columns} columns found in this line, and there should be 15 (for GAF v1) or 17 (for GAF v2)".format(columns=len(gaf_line)),
            rule=1)
        return skip_line()

    ## check for missing columns
    ## We use indices here because we run GO RULES before we split the vals into individual variables
    DB_INDEX = 0
    DB_OBJECT_INDEX = 1
    TAXON_INDEX = 12
    REFERENCE_INDEX = 5
    if gaf_line[DB_INDEX] == "":
        report.error(source_line, Report.INVALID_IDSPACE, "EMPTY", "col1 is empty", taxon=gaf_line[TAXON_INDEX], rule=1)
        return skip_line()
    if gaf_line[DB_OBJECT_INDEX] == "":
        report.error(source_line, Report.INVALID_ID, "EMPTY", "col2 is empty", taxon=gaf_line[TAXON_INDEX], rule=1)
        return skip_line()
    if gaf_line[REFERENCE_INDEX] == "":
        report.error(source_line, Report.INVALID_ID, "EMPTY", "reference column 6 is empty", taxon=gaf_line[TAXON_INDEX], rule=1)
        return skip_line()

    parsed_taxons_result = gaf_line_validators["taxon"].validate(gaf_line[TAXON_INDEX])  # type: assocparser.ValidateResult
    if not parsed_taxons_result.valid:
        report.error(source_line, Report.INVALID_TAXON, parsed_taxons_result.original, parsed_taxons_result.message, taxon=parsed_taxons_result.original, rule=1)
        return skip_line()

    taxon = parsed_taxons_result.parsed[0]

    date = assocparser.parse_date(gaf_line[13], report, source_line)
    if date is None:
        return skip_line()

    # A second parsed taxon, when present, is the interacting taxon.
    interacting_taxon = parsed_taxons_result.parsed[1] if len(parsed_taxons_result.parsed) == 2 else None
    subject_curie = association.Curie(gaf_line[0], gaf_line[1])
    subject = association.Subject(
        subject_curie,
        gaf_line[2],
        [gaf_line[9]],
        gaf_line[10].split("|"),
        [association.map_gp_type_label_to_curie(gaf_line[11])],
        taxon)
    # Prefer the entity registered in `bio_entities` when it differs from the
    # subject assembled from this line.
    gpi_entity = bio_entities.get(subject_curie)
    if gpi_entity is not None and subject != gpi_entity:
        subject = gpi_entity

    # column 4 is qualifiers -> index 3
    # For allowed, see http://geneontology.org/docs/go-annotations/#annotation-qualifiers
    # The validate call below only checks the validity of the qualifiers; the
    # values stored on the GoAssociation come from _parse_qualifier further down.
    # This is confusing, and we can fix later on by consolidating qualifier and relation in GoAssociation.
    parsed_qualifiers = qualifier_parser.validate(gaf_line[3])
    if not parsed_qualifiers.valid:
        report.error(source_line, Report.INVALID_QUALIFIER, parsed_qualifiers.original, parsed_qualifiers.message, taxon=gaf_line[TAXON_INDEX], rule=1)
        return skip_line()

    aspect = gaf_line[8]
    negated, relation_label, qualifiers = assocparser._parse_qualifier(gaf_line[3], aspect)
    # Note: Relation label is grabbed from qualifiers, if any exist in _parse_qualifier
    qualifiers = [
        association.Curie.from_str(curie_util.contract_uri(relations.lookup_label(q), strict=False)[0])
        for q in qualifiers
    ]

    # The parse failure surfaces on the CURIE, not on the Term wrapping it, so
    # check the CURIE before building the Term and skip the line on failure.
    go_term = association.Curie.from_str(gaf_line[4])
    if isinstance(go_term, association.Error):
        report.error(source_line, Report.INVALID_SYMBOL, gaf_line[4], "Problem parsing GO Term", taxon=gaf_line[TAXON_INDEX], rule=1)
        return skip_line()
    term = association.Term(go_term, taxon)

    # References
    references = [association.Curie.from_str(e) for e in gaf_line[5].split("|") if e]
    for r in references:
        if isinstance(r, association.Error):
            report.error(source_line, Report.INVALID_SYMBOL, gaf_line[5], "Problem parsing references", taxon=gaf_line[TAXON_INDEX], rule=1)
            return skip_line()

    # The trailing None guarantees index 0 exists when there is no GO_REF.
    gorefs = [ref for ref in references if ref.namespace == "GO_REF"] + [None]
    eco_curie = ecomap.coderef_to_ecoclass(gaf_line[6], reference=gorefs[0])
    if eco_curie is None:
        report.error(source_line, Report.UNKNOWN_EVIDENCE_CLASS, gaf_line[6], msg="Expecting a known ECO GAF code, e.g ISS", rule=1)
        return skip_line()

    withfroms = association.ConjunctiveSet.str_to_conjunctions(gaf_line[7])
    if isinstance(withfroms, association.Error):
        report.error(source_line, Report.INVALID_SYMBOL, gaf_line[7], "Problem parsing with/from", taxon=gaf_line[TAXON_INDEX], rule=1)
        return skip_line()

    evidence_type = association.Curie.from_str(eco_curie)
    if isinstance(evidence_type, association.Error):
        report.error(source_line, Report.INVALID_SYMBOL, gaf_line[6], "Problem parsing evidence type", taxon=gaf_line[TAXON_INDEX], rule=1)
        # An unparseable evidence type cannot produce a usable association.
        return skip_line()

    evidence = association.Evidence(evidence_type, references, withfroms)
    reference_errors = [e for e in evidence.has_supporting_reference if isinstance(e, association.Error)]
    if reference_errors:
        report.error(source_line, Report.INVALID_SYMBOL, gaf_line[5], reference_errors[0].info, taxon=str(taxon), rule=1)
        return skip_line()

    # Column 17 (gene product form) becomes a subject extension.
    subject_extensions = []
    if gaf_line[16]:
        subject_filler = association.Curie.from_str(gaf_line[16])
        if isinstance(subject_filler, association.Error):
            report.error(source_line, assocparser.Report.INVALID_ID, gaf_line[16], subject_filler.info, taxon=str(taxon), rule=1)
            return skip_line()
        # filler is not an Error, so keep moving
        subject_extensions.append(association.ExtensionUnit(association.Curie.from_str("rdfs:subClassOf"), subject_filler))

    # Column 16 (annotation extensions) becomes the object extensions.
    conjunctions = []
    if gaf_line[15]:
        conjunctions = association.ConjunctiveSet.str_to_conjunctions(
            gaf_line[15],
            conjunct_element_builder=lambda el: association.ExtensionUnit.from_str(el))

        if isinstance(conjunctions, association.Error):
            report.error(source_line, Report.EXTENSION_SYNTAX_ERROR, conjunctions.info, "extensions should be relation(curie) and relation should have corresponding URI", taxon=str(taxon), rule=1)
            return skip_line()

    relation_uri = relations.lookup_label(relation_label)
    if relation_uri is None:
        report.error(source_line, assocparser.Report.INVALID_QUALIFIER, relation_label, "Could not find CURIE for relation `{}`".format(relation_label), taxon=str(taxon), rule=1)
        return skip_line()

    # We don't have to check that this is well formed because we're grabbing it from the known relations URI map.
    relation_curie = association.Curie.from_str(curie_util.contract_uri(relation_uri)[0])

    a = association.GoAssociation(
        # Re-joined from the padded/truncated columns, so it can differ from `source_line`.
        source_line="\t".join(gaf_line),
        subject=subject,
        relation=relation_curie,
        object=term,
        negated=negated,
        qualifiers=qualifiers,
        aspect=aspect,
        interacting_taxon=interacting_taxon,
        evidence=evidence,
        subject_extensions=subject_extensions,
        object_extensions=conjunctions,
        provided_by=gaf_line[14],
        date=date,
        properties={})

    return assocparser.ParseResult(source_line, [a], False, report=report)
def parse_line(self, line):
    """
    Parses a single line of a GAF.

    Returns an ``assocparser.ParseResult``. A header line yields a single
    header marker dict, an invalid line yields no associations with the
    skipped flag set, and a valid line yields one association dict.

    Note: most applications will only need to call this directly if they
    require fine-grained control of parsing. For most purposes,
    `parse_file` can be used over the whole file.

    Arguments
    ---------
    line : str
        A single tab-separated line from a GAF file
    """

    # Returns assocparser.ParseResult

    parsed = super().validate_line(line)
    if parsed:
        return parsed

    if self.is_header(line):
        return assocparser.ParseResult(line, [{"header": True, "line": line.strip()}], False)

    vals = [el.strip() for el in line.split("\t")]

    # GAF v1 is defined as 15 cols, GAF v2 as 17.
    # We treat everything as GAF2 by adding two blank columns.
    # TODO: check header metadata to see if columns corresponds to declared dataformat version
    if 17 > len(vals) >= 15:
        vals += [""] * (17 - len(vals))

    if len(vals) > 17:
        # If we see more than 17 columns, we will just cut off the columns after column 17
        self.report.warning(
            line, assocparser.Report.WRONG_NUMBER_OF_COLUMNS, "",
            msg="There were more than 17 columns in this line. Proceeding by cutting off extra columns after column 17.",
            rule=1)
        vals = vals[:17]

    if len(vals) != 17:
        self.report.error(
            line, assocparser.Report.WRONG_NUMBER_OF_COLUMNS, "",
            msg="There were {columns} columns found in this line, and there should be 15 (for GAF v1) or 17 (for GAF v2)".format(columns=len(vals)),
            rule=1)
        return assocparser.ParseResult(line, [], True)

    [
        db, db_object_id, db_object_symbol, qualifier, goid, reference,
        evidence, withfrom, aspect, db_object_name, db_object_synonym,
        db_object_type, taxon, date, assigned_by, annotation_xp,
        gene_product_isoform
    ] = vals

    split_line = assocparser.SplitLine(line=line, values=vals, taxon=taxon)

    ## check for missing columns
    if db == "":
        self.report.error(line, Report.INVALID_IDSPACE, "EMPTY", "col1 is empty", taxon=taxon, rule=1)
        return assocparser.ParseResult(line, [], True)
    if db_object_id == "":
        self.report.error(line, Report.INVALID_ID, "EMPTY", "col2 is empty", taxon=taxon, rule=1)
        return assocparser.ParseResult(line, [], True)
    if taxon == "":
        self.report.error(line, Report.INVALID_TAXON, "EMPTY", "taxon column is empty", taxon=taxon, rule=1)
        return assocparser.ParseResult(line, [], True)
    if reference == "":
        self.report.error(line, Report.INVALID_ID, "EMPTY", "reference column 6 is empty", taxon=taxon, rule=1)
        return assocparser.ParseResult(line, [], True)

    if self.config.group_idspace is not None and assigned_by not in self.config.group_idspace:
        self.report.warning(
            line, Report.INVALID_ID, assigned_by,
            "GORULE:0000027: assigned_by is not present in groups reference",
            taxon=taxon, rule=27)

    if self.config.entity_idspaces is not None and db not in self.config.entity_idspaces:
        # Are we a synonym?
        upgrade = self.config.entity_idspaces.reverse(db)
        if upgrade is not None:
            # If we found a synonym
            self.report.warning(
                line, Report.INVALID_ID_DBXREF, db,
                "GORULE:0000027: {} is a synonym for the correct ID {}, and has been updated".format(db, upgrade),
                taxon=taxon, rule=27)
            db = upgrade

    ## --
    ## db + db_object_id. CARD=1
    ## --
    id = self._pair_to_id(db, db_object_id)
    if not self._validate_id(id, split_line, allowed_ids=self.config.entity_idspaces):
        return assocparser.ParseResult(line, [], True)

    # Using a given gpi file to validate the gene object
    if self.gpi is not None:
        entity = self.gpi.get(id, None)
        if entity is not None:
            db_object_symbol = entity["symbol"]
            db_object_name = entity["name"]
            db_object_synonym = entity["synonyms"]
            db_object_type = entity["type"]

    if not self._validate_id(goid, split_line, context=ANNOTATION):
        print("skipping because {} not validated!".format(goid))
        return assocparser.ParseResult(line, [], True)

    valid_goid = self._validate_ontology_class_id(goid, split_line)
    if valid_goid is None:
        return assocparser.ParseResult(line, [], True)
    goid = valid_goid

    date = self._normalize_gaf_date(date, split_line)
    if date is None:
        return assocparser.ParseResult(line, [], True)
    # Write the normalized date back so the regenerated line carries it.
    vals[13] = date

    ecomap = self.config.ecomap
    if ecomap is not None:
        if ecomap.coderef_to_ecoclass(evidence, reference) is None:
            self.report.error(
                line, assocparser.Report.UNKNOWN_EVIDENCE_CLASS, evidence,
                msg="Expecting a known ECO GAF code, e.g ISS", rule=1)
            return assocparser.ParseResult(line, [], True)

    # Throw out the line if it uses GO_REF:0000033, see https://github.com/geneontology/go-site/issues/563#event-1519351033
    if "GO_REF:0000033" in reference.split("|"):
        self.report.error(
            line, assocparser.Report.INVALID_ID, reference,
            msg="Disallowing GO_REF:0000033 in reference field as of 03/13/2018",
            rule=30)
        return assocparser.ParseResult(line, [], True)

    references = self.validate_pipe_separated_ids(reference, split_line)
    if references is None:
        # Reporting occurs in above function call
        return assocparser.ParseResult(line, [], True)

    references = self.normalize_refs(references, split_line)

    # With/From
    withfroms = self.validate_pipe_separated_ids(withfrom, split_line, empty_allowed=True, extra_delims=",")
    if withfroms is None:
        # Reporting occurs in above function call
        return assocparser.ParseResult(line, [], True)

    # validation
    self._validate_symbol(db_object_symbol, split_line)

    # Example use case: mapping from UniProtKB to MOD ID
    if self.config.entity_map is not None:
        id = self.map_id(id, self.config.entity_map)
        # Split on the first ":" only: the local part may itself contain ":".
        # Both pieces must stay strings because `vals` is re-joined with tabs
        # below; a list in `vals` would make that join raise TypeError.
        db, _, db_object_id = id.partition(":")
        vals[0] = db
        vals[1] = db_object_id

    if goid.startswith("GO:") and aspect.upper() not in ["C", "F", "P"]:
        self.report.error(line, assocparser.Report.INVALID_ASPECT, aspect, rule=28)
        return assocparser.ParseResult(line, [], True)

    go_rule_results = qc.test_go_rules(vals, self.config)
    for rule_id, result in go_rule_results.items():
        if result.result_type == qc.ResultType.WARNING:
            self.report.warning(
                line, assocparser.Report.VIOLATES_GO_RULE, goid,
                msg="{id}: {message}".format(id=rule_id, message=result.message),
                rule=int(rule_id.split(":")[1]))

        if result.result_type == qc.ResultType.ERROR:
            self.report.error(
                line, assocparser.Report.VIOLATES_GO_RULE, goid,
                msg="{id}: {message}".format(id=rule_id, message=result.message),
                rule=int(rule_id.split(":")[1]))
            # Skip the annotation
            return assocparser.ParseResult(line, [], True)

    ## --
    ## end of line re-processing
    ## --
    # regenerate line post-mapping
    line = "\t".join(vals)

    ## --
    ## taxon CARD={1,2}
    ## --
    ## if a second value is specified, this is the interacting taxon
    taxons = taxon.split("|")
    normalized_taxon = self._taxon_id(taxons[0], split_line)
    if normalized_taxon is None:
        self.report.error(line, assocparser.Report.INVALID_TAXON, taxon, msg="Taxon ID is invalid")
        return assocparser.ParseResult(line, [], True)

    self._validate_taxon(normalized_taxon, split_line)

    interacting_taxon = None
    if len(taxons) == 2:
        interacting_taxon = self._taxon_id(taxons[1], split_line)
        if interacting_taxon is None:
            self.report.error(line, assocparser.Report.INVALID_TAXON, taxon, msg="Taxon ID is invalid")
            return assocparser.ParseResult(line, [], True)

    ## --
    ## db_object_synonym CARD=0..*
    ## --
    synonyms = db_object_synonym.split("|")
    if db_object_synonym == "":
        synonyms = []

    ## --
    ## parse annotation extension
    ## See appendix in http://doi.org/10.1186/1471-2105-15-155
    ## --
    object_or_exprs = self._parse_full_extension_expression(annotation_xp, line=split_line)

    ## --
    ## qualifier
    ## --
    negated, relation, other_qualifiers = self._parse_qualifier(qualifier, aspect)

    ## --
    ## goid
    ## --
    # TODO We shouldn't overload builtin keywords/functions
    object = {'id': goid, 'taxon': normalized_taxon}

    # construct subject dict
    subject = {
        'id': id,
        'label': db_object_symbol,
        'type': db_object_type,
        'fullname': db_object_name,
        'synonyms': synonyms,
        'taxon': {
            'id': normalized_taxon
        }
    }

    ## --
    ## gene_product_isoform
    ## --
    ## This is mapped to a more generic concept of subject_extensions
    subject_extns = []
    if gene_product_isoform is not None and gene_product_isoform != '':
        subject_extns.append({
            'property': 'isoform',
            'filler': gene_product_isoform
        })

    object_extensions = {}
    if object_or_exprs is not None and len(object_or_exprs) > 0:
        object_extensions['union_of'] = object_or_exprs

    ## --
    ## evidence
    ## reference
    ## withfrom
    ## --
    evidence_obj = {
        'type': evidence,
        'has_supporting_reference': references,
        'with_support_from': withfroms
    }

    ## Construct main return dict
    assoc = {
        'source_line': line,
        'subject': subject,
        'object': object,
        'negated': negated,
        'qualifiers': other_qualifiers,
        'aspect': aspect,
        'relation': {
            'id': relation
        },
        'interacting_taxon': interacting_taxon,
        'evidence': evidence_obj,
        'provided_by': assigned_by,
        'date': date,
        'subject_extensions': subject_extns,
        'object_extensions': object_extensions
    }

    return assocparser.ParseResult(line, [assoc], False, evidence.upper())
def parse_line(self, line):
    """
    Parses a single line of a GAF.

    Returns an ``assocparser.ParseResult``. A header line yields no
    associations, an invalid line yields no associations with the skipped
    flag set, and a valid line yields one association dict per `|`-separated
    disjunct of the annotation extension column.

    Note: most applications will only need to call this directly if they
    require fine-grained control of parsing. For most purposes,
    `parse_file` can be used over the whole file.

    Arguments
    ---------
    line : str
        A single tab-separated line from a GAF file
    """

    # Returns assocparser.ParseResult

    parsed = super().validate_line(line)
    if parsed:
        return parsed

    if self.is_header(line):
        return assocparser.ParseResult(line, [], False)

    vals = [el.strip() for el in line.split("\t")]

    # GAF v1 is defined as 15 cols, GAF v2 as 17.
    # We treat everything as GAF2 by adding two blank columns.
    # TODO: check header metadata to see if columns corresponds to declared dataformat version
    if 17 > len(vals) >= 15:
        vals += [""] * (17 - len(vals))

    if len(vals) != 17:
        self.report.error(
            line, assocparser.Report.WRONG_NUMBER_OF_COLUMNS, "",
            msg="There were {columns} columns found in this line, and there should be 15 (for GAF v1) or 17 (for GAF v2)".format(columns=len(vals)))
        return assocparser.ParseResult(line, [], True)

    [
        db, db_object_id, db_object_symbol, qualifier, goid, reference,
        evidence, withfrom, aspect, db_object_name, db_object_synonym,
        db_object_type, taxon, date, assigned_by, annotation_xp,
        gene_product_isoform
    ] = vals

    ## --
    ## db + db_object_id. CARD=1
    ## --
    id = self._pair_to_id(db, db_object_id)
    if not self._validate_id(id, line, ENTITY):
        print("skipping cause {} not validated!".format(id))
        return assocparser.ParseResult(line, [], True)

    if not self._validate_id(goid, line, ANNOTATION):
        print("skipping cause {} not validated!".format(goid))
        return assocparser.ParseResult(line, [], True)

    date = self._normalize_gaf_date(date, line)

    ecomap = self.config.ecomap
    if ecomap is not None:
        if ecomap.coderef_to_ecoclass(evidence, reference) is None:
            self.report.error(
                line, assocparser.Report.UNKNOWN_EVIDENCE_CLASS, evidence,
                msg="Expecting a known ECO GAF code, e.g ISS")
            return assocparser.ParseResult(line, [], True)

    # validation
    self._validate_symbol(db_object_symbol, line)

    # Example use case: mapping from UniProtKB to MOD ID
    if self.config.entity_map is not None:
        id = self.map_id(id, self.config.entity_map)
        # Split on the first ":" only: the local part may itself contain ":".
        # Both pieces must stay strings because `vals` is re-joined with tabs
        # below; a list in `vals` would make that join raise TypeError.
        db, _, db_object_id = id.partition(":")
        vals[0] = db
        vals[1] = db_object_id

    ## --
    ## end of line re-processing
    ## --
    # regenerate line post-mapping
    line = "\t".join(vals)

    ## --
    ## taxon CARD={1,2}
    ## --
    ## Only the first taxon is used here; any further values are normalized
    ## by _taxon_id but otherwise ignored.
    taxa = [self._taxon_id(x) for x in taxon.split("|")]
    taxon = taxa[0]
    self._validate_taxon(taxon, line)

    ## --
    ## db_object_synonym CARD=0..*
    ## --
    synonyms = db_object_synonym.split("|")
    if db_object_synonym == "":
        synonyms = []

    ## --
    ## process associations
    ## --
    ## note that any disjunctions in the annotation extension
    ## will result in the generation of multiple associations
    assocs = []
    xp_ors = annotation_xp.split("|")
    for xp_or in xp_ors:

        # gather conjunctive expressions in extensions field
        xp_ands = xp_or.split(",")
        extns = []
        for xp_and in xp_ands:
            if xp_and != "":
                expr = self._parse_relationship_expression(xp_and, line=line)
                if expr is not None:
                    extns.append(expr)

        ## --
        ## qualifier
        ## --
        negated, relation, other_qualifiers = self._parse_qualifier(qualifier, aspect)

        ## --
        ## goid
        ## --
        # TODO We shouldn't overload builtin keywords/functions
        object = {'id': goid, 'taxon': taxon}

        # construct subject dict (built per association so the dicts are not shared)
        subject = {
            'id': id,
            'label': db_object_symbol,
            'type': db_object_type,
            'fullname': db_object_name,
            'synonyms': synonyms,
            'taxon': {
                'id': taxon
            }
        }

        ## --
        ## gene_product_isoform
        ## --
        ## This is mapped to a more generic concept of subject_extensions
        subject_extns = []
        if gene_product_isoform is not None and gene_product_isoform != '':
            subject_extns.append({
                'property': 'isoform',
                'filler': gene_product_isoform
            })

        ## --
        ## evidence
        ## reference
        ## withfrom
        ## --
        evidence_obj = {
            'type': evidence,
            'has_supporting_reference': self._split_pipe(reference)
        }
        evidence_obj['with_support_from'] = self._split_pipe(withfrom)

        ## Construct main return dict
        assoc = {
            'source_line': line,
            'subject': subject,
            'object': object,
            'negated': negated,
            'qualifiers': other_qualifiers,
            'aspect': aspect,
            'relation': {
                'id': relation
            },
            'evidence': evidence_obj,
            'provided_by': assigned_by,
            'date': date,
        }
        # Extension keys are only present when there is something to report.
        if len(subject_extns) > 0:
            assoc['subject_extensions'] = subject_extns
        if len(extns) > 0:
            assoc['object_extensions'] = extns

        self._validate_assoc(assoc, line)

        assocs.append(assoc)

    return assocparser.ParseResult(line, assocs, False, evidence.upper())
def parse_line(self, line):
    """
    Parses a single line of a GAF.

    Header lines are scanned for the GAF version declaration and returned as
    a header marker. Annotation lines are converted with `to_association`,
    run through the GO rules, and then validated field by field.

    Returns an ``assocparser.ParseResult``: a header marker dict for header
    lines, an empty skipped result for rejected lines, or a single
    association for accepted lines.

    Note: most applications will only need to call this directly if they
    require fine-grained control of parsing. For most purposes,
    `parse_file` can be used over the whole file.

    Arguments
    ---------
    line : str
        A single tab-separated line from a GAF file
    """

    # Returns assocparser.ParseResult

    early_result = super().validate_line(line)
    if early_result:
        return early_result

    if self.is_header(line):
        # Save off version info here
        if self.version is None:
            # We are still looking
            version_matches = parser_version_regex.findall(line)
            if len(version_matches) == 1:
                _, version, _ = version_matches[0]
                if version == "2.2":
                    logger.info("Detected GAF version 2.2")
                    self.version = version
                else:
                    logger.info("Detected GAF version {}, so using 2.1".format(version))
                    self.version = self.default_version

                # Compute the cell component subclass closure
                self.make_internal_cell_component_closure()

        header_record = {"header": True, "line": line.strip()}
        return assocparser.ParseResult(line, [header_record], False)

    # At this point, we should have gone through all the header, and a version number should be established
    if self.version is None:
        logger.warning("No version number found for this file so we will assume GAF version: {}".format(self.default_version))
        self.version = self.default_version
        self.make_internal_cell_component_closure()

    vals = [column.strip() for column in line.split("\t")]

    # GAF v1 is defined as 15 cols, GAF v2 as 17.
    # We treat everything as GAF2 by adding two blank columns.
    # TODO: check header metadata to see if columns corresponds to declared dataformat version

    # to_association receives a copy of the columns; `vals` is indexed again below.
    converted = to_association(
        list(vals),
        report=self.report,
        qualifier_parser=self.qualifier_parser(),
        bio_entities=self.bio_entities)
    if converted.associations == []:
        return converted

    assoc = converted.associations[0]

    # Qualifier is index 3
    # If we are 2.1, and qualifier has no relation
    # Also must have an ontology
    # Then upgrade
    # For https://github.com/geneontology/go-site/issues/1558
    qualifier_column = vals[3]
    if self.gaf_version() == "2.1" and (qualifier_column == "" or qualifier_column == "NOT") and self.config.ontology:
        assoc = self.upgrade_empty_qualifier(assoc)

    ## Run GO Rules, save split values into individual variables
    go_rule_results = qc.test_go_rules(assoc, self.config, group=self.group)
    for rule, result in go_rule_results.all_results.items():
        outcome = result.result_type
        if outcome == qc.ResultType.WARNING:
            self.report.warning(
                line, assocparser.Report.VIOLATES_GO_RULE, "",
                msg="{id}: {message}".format(id=rule.id, message=result.message),
                rule=int(rule.id.split(":")[1]))
        elif outcome == qc.ResultType.ERROR:
            self.report.error(
                line, assocparser.Report.VIOLATES_GO_RULE, "",
                msg="{id}: {message}".format(id=rule.id, message=result.message),
                rule=int(rule.id.split(":")[1]))
            # Skip the annotation
            return assocparser.ParseResult(line, [], True)
        elif outcome == qc.ResultType.PASS:
            self.report.message(
                assocparser.Report.INFO, line, Report.RULE_PASS, "",
                msg="Passing Rule", rule=int(rule.id.split(":")[1]))

    # Continue with the annotation carried by the GO rule results.
    assoc = go_rule_results.annotation  # type: association.GoAssociation

    taxon_label = str(assoc.object.taxon)
    split_line = assocparser.SplitLine(line=line, values=vals, taxon=taxon_label)

    group_idspace = self.config.group_idspace
    if group_idspace is not None and assoc.provided_by not in group_idspace:
        self.report.warning(
            line, Report.INVALID_ID, assoc.provided_by,
            "GORULE:0000027: assigned_by is not present in groups reference",
            taxon=taxon_label, rule=27)

    db = assoc.subject.id.namespace
    if self.config.entity_idspaces is not None and db not in self.config.entity_idspaces:
        # Are we a synonym?
        upgrade = self.config.entity_idspaces.reverse(db)
        if upgrade is not None:
            # If we found a synonym
            self.report.warning(
                line, Report.INVALID_ID_DBXREF, db,
                "GORULE:0000027: {} is a synonym for the correct ID {}, and has been updated".format(db, upgrade),
                taxon=taxon_label, rule=27)
            assoc.subject.id.namespace = upgrade

    ## --
    ## db + db_object_id. CARD=1
    ## --
    subject_ok = self._validate_id(str(assoc.subject.id), split_line, allowed_ids=self.config.entity_idspaces)
    if not subject_ok:
        return assocparser.ParseResult(line, [], True)

    object_ok = self._validate_id(str(assoc.object.id), split_line, context=ANNOTATION)
    if not object_ok:
        print("skipping because {} not validated!".format(assoc.object.id))
        return assocparser.ParseResult(line, [], True)

    valid_goid = self._validate_ontology_class_id(str(assoc.object.id), split_line)
    if valid_goid is None:
        return assocparser.ParseResult(line, [], True)
    assoc.object.id = association.Curie.from_str(valid_goid)

    # Reporting occurs inside validate_curie_ids
    if self.validate_curie_ids(assoc.evidence.has_supporting_reference, split_line) is None:
        return assocparser.ParseResult(line, [], True)

    # With/From
    for conjunction in assoc.evidence.with_support_from:
        if self.validate_curie_ids(conjunction.elements, split_line) is None:
            return assocparser.ParseResult(line, [], True)

    unrolled_withfrom = self._unroll_withfrom_and_replair_obsoletes(split_line, 'gaf')
    if unrolled_withfrom is None:
        return assocparser.ParseResult(line, [], True)
    assoc.evidence.with_support_from = unrolled_withfrom

    # validation
    self._validate_symbol(assoc.subject.label, split_line)

    ## --
    ## taxon CARD={1,2}
    ## --
    ## if a second value is specified, this is the interacting taxon
    valid_taxon = self._validate_taxon(str(assoc.object.taxon), split_line)
    if assoc.interacting_taxon:
        valid_interacting = self._validate_taxon(str(assoc.interacting_taxon), split_line)
    else:
        valid_interacting = True

    if not valid_taxon:
        self.report.error(line, assocparser.Report.INVALID_TAXON, str(assoc.object.taxon), "Taxon ID is invalid", rule=27)
    if not valid_interacting:
        self.report.error(line, assocparser.Report.INVALID_TAXON, str(assoc.interacting_taxon), "Taxon ID is invalid", rule=27)
    if not (valid_taxon and valid_interacting):
        return assocparser.ParseResult(line, [], True)

    # vals[6] is the evidence code column.
    return assocparser.ParseResult(line, [assoc], False, vals[6])
def from_2_0(gpad_line: List[str], report=None, group="unknown", dataset="unknown", bio_entities=None):
    """
    Build a ``GoAssociation`` from the already-split columns of one GPAD 2.0 line.

    Arguments
    ---------
    gpad_line : List[str]
        The column values of a single GPAD line. 10 or 11 columns are padded
        with blanks to 12; anything past column 12 is dropped with a warning.
        The caller's list is not modified.
    report
        Report that collects errors and warnings. When None, a new ``Report``
        is created from `group` and `dataset`.
    group : str
        Passed to the ``Report`` constructor when `report` is None.
    dataset : str
        Passed to the ``Report`` constructor when `report` is None.
    bio_entities
        Optional lookup of known subjects keyed by subject CURIE. When a
        match is found it replaces the placeholder subject and supplies the
        taxon; when None, no lookup is attempted.

    Returns
    -------
    assocparser.ParseResult
        Holds a single association on success. On any reported error the
        result has an empty association list and its skipped flag set.
    """
    # Both optional arguments default to None, so neither may be dereferenced
    # before being given a usable value.
    report = Report(group=group, dataset=dataset) if report is None else report
    source_line = "\t".join(gpad_line)

    def skip_line():
        # Every rejected line yields the same "skipped, no associations" result.
        return assocparser.ParseResult(source_line, [], True, report=report)

    if source_line == "":
        report.error(source_line, "Blank Line", "EMPTY", "Blank lines are not allowed", rule=1)
        return skip_line()

    if len(gpad_line) > 12:
        report.warning(
            source_line, assocparser.Report.WRONG_NUMBER_OF_COLUMNS, "",
            msg="There were more than 12 columns in this line. Proceeding by cutting off extra columns.",
            rule=1)
        gpad_line = gpad_line[:12]

    if 12 > len(gpad_line) >= 10:
        # Rebind rather than `+=` so the caller's list is not padded in place.
        gpad_line = gpad_line + [""] * (12 - len(gpad_line))

    if len(gpad_line) != 12:
        report.error(
            source_line, assocparser.Report.WRONG_NUMBER_OF_COLUMNS, "",
            msg="There were {columns} columns found in this line, and there should be between 10 and 12".format(columns=len(gpad_line)),
            rule=1)
        return skip_line()

    ## check for missing columns
    ## We use indices here because we run GO RULES before we split the vals into individual variables
    SUBJECT_CURIE = 0
    RELATION = 2
    ONTOLOGY_CLASS_INDEX = 3
    REFERENCE_INDEX = 4
    EVIDENCE_INDEX = 5
    DATE_INDEX = 8
    ASSIGNED_BY_INDEX = 9
    required = [
        SUBJECT_CURIE, RELATION, ONTOLOGY_CLASS_INDEX, REFERENCE_INDEX,
        EVIDENCE_INDEX, DATE_INDEX, ASSIGNED_BY_INDEX
    ]
    for req in required:
        if gpad_line[req] == "":
            # Column numbers in the message are 1-based.
            report.error(source_line, Report.INVALID_ID, "EMPTY", "Column {} is empty".format(req + 1), rule=1)
            return skip_line()

    # Placeholder taxon, used unless a matching entity supplies one below.
    taxon = association.Curie("NCBITaxon", "0")
    subject_curie = association.Curie.from_str(gpad_line[SUBJECT_CURIE])
    if subject_curie.is_error():
        report.error(source_line, Report.INVALID_SYMBOL, gpad_line[SUBJECT_CURIE], "Problem parsing DB Object", taxon=str(taxon), rule=1)
        return skip_line()

    subject = association.Subject(subject_curie, "", "", [], "", taxon)
    entity = bio_entities.get(subject_curie) if bio_entities is not None else None
    if entity is not None:
        # If we found a subject entity, then set `subject` to the found entity
        subject = entity
        taxon = subject.taxon

    negated = gpad_line[1] == "NOT"

    relation = association.Curie.from_str(gpad_line[RELATION])
    if relation.is_error():
        report.error(source_line, Report.INVALID_SYMBOL, gpad_line[RELATION], "Problem parsing Relation", taxon=str(taxon), rule=1)
        return skip_line()

    go_term = association.Curie.from_str(gpad_line[ONTOLOGY_CLASS_INDEX])
    if go_term.is_error():
        report.error(source_line, Report.INVALID_SYMBOL, gpad_line[ONTOLOGY_CLASS_INDEX], "Problem parsing GO Term", taxon=str(taxon), rule=1)
        return skip_line()

    term = association.Term(go_term, taxon)

    evidence_type = association.Curie.from_str(gpad_line[EVIDENCE_INDEX])
    if evidence_type.is_error():
        report.error(source_line, Report.INVALID_SYMBOL, gpad_line[EVIDENCE_INDEX], "Problem parsing Evidence ECO Curie", taxon=str(taxon), rule=1)
        return skip_line()

    references = [
        association.Curie.from_str(e)
        for e in gpad_line[REFERENCE_INDEX].split("|") if e
    ]
    for r in references:
        if r.is_error():
            report.error(source_line, Report.INVALID_SYMBOL, gpad_line[REFERENCE_INDEX], "Problem parsing references", taxon=str(taxon), rule=1)
            return skip_line()

    # Returns a list of ConjunctiveSets or Error
    withfroms = association.ConjunctiveSet.str_to_conjunctions(gpad_line[6])
    if isinstance(withfroms, association.Error):
        report.error(source_line, Report.INVALID_SYMBOL, gpad_line[6], "Problem parsing With/From column", taxon=str(taxon), rule=1)
        return skip_line()

    evidence = association.Evidence(evidence_type, references, withfroms)

    interacting_taxon = None
    if gpad_line[7] != "":
        interacting_taxon = association.Curie.from_str(gpad_line[7])
        if interacting_taxon.is_error():
            report.error(source_line, Report.INVALID_SYMBOL, gpad_line[7], "Problem parsing Interacting Taxon", taxon=str(taxon), rule=1)
            return skip_line()

    date = assocparser.parse_iso_date(gpad_line[DATE_INDEX], report, source_line)
    if date is None:
        return skip_line()

    conjunctions = []
    # The elements of the extension units are Curie(Curie)
    if gpad_line[10]:
        conjunctions = association.ConjunctiveSet.str_to_conjunctions(
            gpad_line[10],
            conjunct_element_builder=lambda el: association.ExtensionUnit.from_curie_str(el))

        if isinstance(conjunctions, association.Error):
            report.error(source_line, Report.EXTENSION_SYNTAX_ERROR, conjunctions.info, "extensions should be relation(curie)", taxon=str(taxon), rule=1)
            return skip_line()

    properties_list = association.parse_annotation_properties(gpad_line[11])

    a = association.GoAssociation(
        source_line=source_line,
        subject=subject,
        relation=relation,
        object=term,
        negated=negated,
        # The relation is also stored as the only qualifier.
        qualifiers=[relation],
        aspect=None,
        interacting_taxon=interacting_taxon,
        evidence=evidence,
        subject_extensions=[],
        object_extensions=conjunctions,
        provided_by=gpad_line[ASSIGNED_BY_INDEX],
        date=date,
        properties=properties_list)

    return assocparser.ParseResult(source_line, [a], False, report=report)
def parse_line(self, line):
    """Parse one line of a GPAD file into an ``assocparser.ParseResult``.

    Most applications should use :method:`parse_file`; call this directly
    only when fine-grained control over parsing is required.

    Arguments
    ---------
    line : str
        A single tab-separated line from a GPAD file

    Returns
    -------
    assocparser.ParseResult
        Whatever the base-class ``validate_line`` returned when it is
        truthy; an empty, non-skipped result for header lines; an empty,
        skipped result when a check below fails; otherwise a result holding
        one association dict.
    """
    early_result = super().validate_line(line)
    if early_result:
        return early_result

    if self.is_header(line):
        return assocparser.ParseResult(line, [], False)

    def skipped():
        # Result used for every line that is dropped.
        return assocparser.ParseResult(line, [], True)

    columns = [cell.strip() for cell in line.split("\t")]
    column_count = len(columns)
    if not 10 <= column_count <= 12:
        self.report.error(
            line,
            assocparser.Report.WRONG_NUMBER_OF_COLUMNS,
            "",
            msg="There were {columns} columns found in this line, and there should be between 10 and 12".format(
                columns=column_count))
        return skipped()

    # The last two columns may be absent; pad so all 12 can be unpacked.
    columns.extend([""] * (12 - column_count))
    (db, db_object_id, qualifier, goid, reference, evidence, withfrom,
     interacting_taxon_id, date, assigned_by, annotation_xp,
     annotation_properties) = columns

    split_line = assocparser.SplitLine(line=line, values=columns, taxon="")

    subject_id = self._pair_to_id(db, db_object_id)
    if not self._validate_id(subject_id, split_line, context=ENTITY):
        return skipped()

    if not self._validate_id(goid, split_line, context=ANNOTATION):
        return skipped()

    checked_goid = self._validate_ontology_class_id(goid, split_line)
    if checked_goid is None:
        return skipped()
    goid = checked_goid

    date = self._normalize_gaf_date(date, split_line)

    if reference == "":
        self.report.error(line, Report.INVALID_ID, "EMPTY",
                          "reference column 6 is empty")
        return skipped()

    # The outcome of this check is not used to reject the line.
    self._validate_id(evidence, split_line)

    interacting_taxon = None
    if interacting_taxon_id != "":
        interacting_taxon = self._taxon_id(interacting_taxon_id, split_line)
        if interacting_taxon is None:
            self.report.error(line,
                              assocparser.Report.INVALID_TAXON,
                              interacting_taxon_id,
                              msg="Taxon ID is invalid")
            return skipped()

    # TODO: ecomap is currently one-way only, so the evidence class is not
    # checked against it here.

    # Qualifier column
    negated, relation, other_qualifiers = self._parse_qualifier(qualifier, None)

    # Reference column (problems are reported inside the validator)
    references = self.validate_pipe_separated_ids(reference, split_line)
    if references is None:
        return skipped()

    # With/From column (problems are reported inside the validator)
    withfroms = self.validate_pipe_separated_ids(withfrom,
                                                 split_line,
                                                 empty_allowed=True,
                                                 extra_delims=",")
    if withfroms is None:
        return skipped()

    # Annotation extension; see appendix in
    # http://doi.org/10.1186/1471-2105-15-155
    object_or_exprs = self._parse_full_extension_expression(annotation_xp,
                                                            line=split_line)

    evidence_info = {
        'type': evidence,
        'with_support_from': withfroms,
        'has_supporting_reference': references
    }
    assoc = {
        'source_line': line,
        'subject': {'id': subject_id},
        'object': {'id': goid},
        'negated': negated,
        'relation': {'id': relation},
        'interacting_taxon': interacting_taxon,
        'evidence': evidence_info,
        'provided_by': assigned_by,
        'date': date,
    }
    if len(other_qualifiers) > 0:
        assoc['qualifiers'] = other_qualifiers
    if object_or_exprs is not None and len(object_or_exprs) > 0:
        assoc['object']['extensions'] = {'union_of': object_or_exprs}

    return assocparser.ParseResult(line, [assoc], False)
def parse_line(self, line):
    """Parse one line of a GPAD file, tracking the declared GPAD version.

    Header lines are scanned for a version declaration until one is found.
    Data lines are converted with ``to_association``, run through the GO
    rules, and then validated column by column.

    Most applications should use :method:`parse_file`; call this directly
    only when fine-grained control over parsing is required.

    Arguments
    ---------
    line : str
        A single tab-separated line from a GPAD file

    Returns
    -------
    assocparser.ParseResult
        A header marker result for header lines, a skipped result when the
        line fails a rule or a validation step, otherwise a result holding
        the single parsed association.
    """
    early_result = super().validate_line(line)
    if early_result:
        return early_result

    if self.is_header(line):
        if self.version is None:
            # Still looking for a version declaration in the header.
            matches = parser_version_regex.findall(line)
            if len(matches) == 1:
                _, version, _ = matches[0]
                if version == "2.0":
                    logger.info("Detected GPAD version 2.0")
                    self.version = version
                else:
                    logger.info(
                        "Detected GPAD version {}, so defaulting to 1.2".format(version))
                    self.version = self.default_version

        header_entry = {"header": True, "line": line.strip()}
        return assocparser.ParseResult(line, [header_entry], False)

    # The header is over once a data line is seen, so a version must be
    # settled now.
    if self.version is None:
        logger.warning(
            "No version number found for this file so we will assume GPAD version: {}".format(
                self.default_version))
        self.version = self.default_version

    def skipped():
        # Result used for every line that is dropped.
        return assocparser.ParseResult(line, [], True)

    columns = [cell.strip() for cell in line.split("\t")]

    # A copy is handed over so `columns` stays as it was split.
    converted = to_association(list(columns),
                               report=self.report,
                               version=self.gpad_version(),
                               bio_entities=self.bio_entities)
    if converted.associations == []:
        return converted

    rule_outcome = qc.test_go_rules(converted.associations[0], self.config)
    reported_kinds = (qc.ResultType.WARNING, qc.ResultType.ERROR,
                      qc.ResultType.PASS)
    for rule, result in rule_outcome.all_results.items():
        kind = result.result_type
        if kind not in reported_kinds:
            continue

        rule_number = int(rule.id.split(":")[1])
        if kind == qc.ResultType.PASS:
            self.report.message(assocparser.Report.INFO,
                                line,
                                Report.RULE_PASS,
                                "",
                                msg="Passing Rule",
                                rule=rule_number)
            continue

        rule_message = "{id}: {message}".format(id=rule.id,
                                                message=result.message)
        if kind == qc.ResultType.WARNING:
            self.report.warning(line,
                                assocparser.Report.VIOLATES_GO_RULE,
                                "",
                                msg=rule_message,
                                rule=rule_number)
        else:
            self.report.error(line,
                              assocparser.Report.VIOLATES_GO_RULE,
                              "",
                              msg=rule_message,
                              rule=rule_number)
            # A rule error drops the annotation.
            return skipped()

    # The annotation carried by the rule results is used from here on.
    assoc = rule_outcome.annotation  # type: association.GoAssociation

    split_line = assocparser.SplitLine(line=line, values=columns, taxon="")

    if not self._validate_id(str(assoc.subject.id), split_line, context=ENTITY):
        return skipped()

    if not self._validate_id(str(assoc.object.id), split_line, context=ANNOTATION):
        return skipped()

    checked_goid = self._validate_ontology_class_id(str(assoc.object.id),
                                                    split_line)
    if checked_goid is None:
        return skipped()
    assoc.object.id = association.Curie.from_str(checked_goid)

    if not self._validate_id(str(assoc.evidence.type), split_line):
        return skipped()

    if assoc.interacting_taxon:
        interacting = str(assoc.interacting_taxon)
        if not self._validate_taxon(interacting, split_line):
            self.report.error(line,
                              assocparser.Report.INVALID_TAXON,
                              str(assoc.interacting_taxon),
                              "Taxon ID is invalid",
                              rule=27)
            return skipped()

    # TODO: ecomap is currently one-way only, so the evidence class is not
    # checked against it here.

    # Reference column
    references = self.validate_curie_ids(
        assoc.evidence.has_supporting_reference, split_line)
    if references is None:
        return skipped()

    # With/From column: every conjunctive set is validated on its own.
    for conjunctive_set in assoc.evidence.with_support_from:
        if self.validate_curie_ids(conjunctive_set.elements, split_line) is None:
            return skipped()

    return assocparser.ParseResult(line, [assoc], False)
def from_1_2(gpad_line: List[str], report=None, group="unknown", dataset="unknown", bio_entities=None):
    """Convert the columns of one GPAD 1.x line into a ``GoAssociation``.

    Arguments
    ---------
    gpad_line : List[str]
        The already-split columns of the line. 10 to 12 columns are
        accepted; extra columns are cut off with a warning. The list passed
        in is not modified.
    report : Report, optional
        Collects errors and warnings. A new ``Report`` built from `group`
        and `dataset` is used when omitted.
    group, dataset : str
        Only used to build the default report.
    bio_entities : optional
        Lookup (via ``.get(curie)``) of known subject entities. When omitted
        no lookup is done and a placeholder subject is used.

    Returns
    -------
    assocparser.ParseResult
        A skipped, empty result if the line is rejected, otherwise a result
        holding exactly one ``association.GoAssociation``.
    """
    # Both collaborators default to None in the signature, so they must be
    # usable when the function is called directly.
    report = Report(group=group, dataset=dataset) if report is None else report

    source_line = "\t".join(gpad_line)

    def rejected():
        # Result used for every line that is dropped.
        return assocparser.ParseResult(source_line, [], True, report=report)

    if source_line == "":
        report.error(source_line, "Blank Line", "EMPTY", "Blank lines are not allowed", rule=1)
        return rejected()

    if len(gpad_line) > 12:
        report.warning(
            source_line,
            assocparser.Report.WRONG_NUMBER_OF_COLUMNS,
            "",
            msg="There were more than 12 columns in this line. Proceeding by cutting off extra columns.",
            rule=1)
        gpad_line = gpad_line[:12]

    if 12 > len(gpad_line) >= 10:
        # Build a new list instead of `+=` so the caller's list is untouched.
        gpad_line = gpad_line + [""] * (12 - len(gpad_line))

    if len(gpad_line) != 12:
        report.error(
            source_line,
            assocparser.Report.WRONG_NUMBER_OF_COLUMNS,
            "",
            msg="There were {columns} columns found in this line, and there should be between 10 and 12".format(
                columns=len(gpad_line)),
            rule=1)
        return rejected()

    ## check for missing columns
    ## We use indices here because we run GO RULES before we split the vals into individual variables
    DB_INDEX = 0
    DB_OBJECT_INDEX = 1
    QUALIFIER = 2
    REFERENCE_INDEX = 4
    EVIDENCE_INDEX = 5

    if gpad_line[DB_INDEX] == "":
        report.error(source_line, Report.INVALID_IDSPACE, "EMPTY", "col1 is empty", rule=1)
        return rejected()
    if gpad_line[DB_OBJECT_INDEX] == "":
        report.error(source_line, Report.INVALID_ID, "EMPTY", "col2 is empty", rule=1)
        return rejected()
    if gpad_line[QUALIFIER] == "":
        # Reported as a qualifier problem; this column holds no taxon.
        report.error(source_line, Report.INVALID_QUALIFIER, "EMPTY", "qualifier column is empty", rule=1)
        return rejected()
    if gpad_line[REFERENCE_INDEX] == "":
        report.error(source_line, Report.INVALID_ID, "EMPTY", "reference column is empty", rule=1)
        return rejected()
    if gpad_line[EVIDENCE_INDEX] == "":
        report.error(source_line, Report.INVALID_ID, "EMPTY", "Evidence column is empty", rule=1)
        # Stop here like the other required-column checks do.
        return rejected()

    # Placeholder subject, replaced below when the entity is known.
    taxon = association.Curie("NCBITaxon", "0")
    subject_curie = association.Curie(gpad_line[DB_INDEX], gpad_line[DB_OBJECT_INDEX])
    subject = association.Subject(subject_curie, "", [""], [], [], taxon)
    entity = bio_entities.get(subject_curie) if bio_entities is not None else None
    if entity is not None:
        subject = entity
        taxon = subject.taxon

    go_term = association.Curie.from_str(gpad_line[3])
    if go_term.is_error():
        report.error(source_line, Report.INVALID_SYMBOL, gpad_line[3], "Problem parsing GO Term", taxon=str(taxon), rule=1)
        return rejected()

    object = association.Term(go_term, taxon)

    evidence_type = association.Curie.from_str(gpad_line[EVIDENCE_INDEX])
    if evidence_type.is_error():
        report.error(source_line, Report.INVALID_SYMBOL, gpad_line[EVIDENCE_INDEX], "Problem parsing Evidence ECO Curie", taxon=str(taxon), rule=1)
        return rejected()

    references = [association.Curie.from_str(e) for e in gpad_line[REFERENCE_INDEX].split("|") if e]
    if any(r.is_error() for r in references):
        report.error(source_line, Report.INVALID_SYMBOL, gpad_line[REFERENCE_INDEX], "Problem parsing references", taxon=str(taxon), rule=1)
        return rejected()

    # Returns a list of ConjunctiveSets or an Error
    withfroms = association.ConjunctiveSet.str_to_conjunctions(gpad_line[6])
    if isinstance(withfroms, association.Error):
        report.error(source_line, Report.INVALID_SYMBOL, gpad_line[6], "Problem parsing With/From column", taxon=str(taxon), rule=1)
        return rejected()

    evidence = association.Evidence(evidence_type, references, withfroms)

    # The column is non-empty (checked above), but it may hold only "NOT".
    raw_qs = gpad_line[QUALIFIER].split("|")
    negated = "NOT" in raw_qs

    looked_up_qualifiers = [relations.lookup_label(q) for q in raw_qs if q != "NOT"]
    if None in looked_up_qualifiers:
        report.error(source_line, Report.INVALID_QUALIFIER, raw_qs, "Could not find a URI for qualifier", taxon=str(taxon), rule=1)
        return rejected()
    if not looked_up_qualifiers:
        # Without this guard, `qualifiers[0]` below raises IndexError.
        report.error(source_line, Report.INVALID_QUALIFIER, gpad_line[QUALIFIER], "qualifier column has no relation besides NOT", taxon=str(taxon), rule=1)
        return rejected()

    qualifiers = [association.Curie.from_str(curie_util.contract_uri(q)[0]) for q in looked_up_qualifiers]

    date = assocparser.parse_date(gpad_line[8], report, source_line)
    if date is None:
        return rejected()

    interacting_taxon = None
    if gpad_line[7]:
        taxon_result = gpad_line_validators["taxon"].validate(gpad_line[7])
        if not taxon_result.valid:
            report.error(source_line, Report.INVALID_TAXON, taxon_result.original, taxon_result.message, taxon=str(taxon_result.original), rule=1)
            return rejected()
        interacting_taxon = taxon_result.parsed[0]

    conjunctions = []
    if gpad_line[10]:
        conjunctions = association.ConjunctiveSet.str_to_conjunctions(
            gpad_line[10],
            conjunct_element_builder=lambda el: association.ExtensionUnit.from_str(el))

        if isinstance(conjunctions, association.Error):
            report.error(source_line, Report.EXTENSION_SYNTAX_ERROR, conjunctions.info, "extensions should be relation(curie)", taxon=str(taxon), rule=1)
            return rejected()

    properties_list = association.parse_annotation_properties(gpad_line[11])

    a = association.GoAssociation(
        source_line=source_line,
        subject=subject,
        # The first non-NOT qualifier doubles as the relation.
        relation=qualifiers[0],
        object=object,
        negated=negated,
        qualifiers=qualifiers,
        aspect=None,
        interacting_taxon=interacting_taxon,
        evidence=evidence,
        subject_extensions=[],
        object_extensions=conjunctions,
        provided_by=gpad_line[9],
        date=date,
        properties=properties_list)

    return assocparser.ParseResult(source_line, [a], False, report=report)
def to_association(gaf_line: List[str], report=None, group="unknown", dataset="unknown") -> assocparser.ParseResult:
    """Convert the columns of one GAF line into a ``GoAssociation``.

    Arguments
    ---------
    gaf_line : List[str]
        The already-split columns of the line. 15 to 17 columns are
        accepted; extra columns are cut off with a warning. The list passed
        in is not modified.
    report : Report, optional
        Collects errors and warnings. A new ``Report`` built from `group`
        and `dataset` is used when omitted.
    group, dataset : str
        Only used to build the default report.

    Returns
    -------
    assocparser.ParseResult
        A skipped, empty result if the line is rejected, otherwise a result
        holding exactly one ``association.GoAssociation``.
    """
    report = Report(group=group, dataset=dataset) if report is None else report
    source_line = "\t".join(gaf_line)

    def rejected():
        # Result used for every line that is dropped.
        return assocparser.ParseResult(source_line, [], True, report=report)

    if source_line == "":
        report.error(source_line, "Blank Line", "EMPTY", "Blank lines are not allowed", rule=1)
        return rejected()

    if len(gaf_line) > 17:
        # If we see more than 17 columns, we will just cut off the columns after column 17
        report.warning(
            source_line,
            assocparser.Report.WRONG_NUMBER_OF_COLUMNS,
            "",
            msg="There were more than 17 columns in this line. Proceeding by cutting off extra columns after column 17.",
            rule=1)
        gaf_line = gaf_line[:17]

    if 17 > len(gaf_line) >= 15:
        # Build a new list instead of `+=` so the caller's list is untouched.
        gaf_line = gaf_line + [""] * (17 - len(gaf_line))

    if len(gaf_line) != 17:
        report.error(
            source_line,
            assocparser.Report.WRONG_NUMBER_OF_COLUMNS,
            "",
            msg="There were {columns} columns found in this line, and there should be 15 (for GAF v1) or 17 (for GAF v2)".format(
                columns=len(gaf_line)),
            rule=1)
        return rejected()

    ## check for missing columns
    ## We use indices here because we run GO RULES before we split the vals into individual variables
    DB_INDEX = 0
    DB_OBJECT_INDEX = 1
    TAXON_INDEX = 12
    REFERENCE_INDEX = 5

    # Reports always receive the raw taxon column as a string.
    taxon_column = gaf_line[TAXON_INDEX]

    if gaf_line[DB_INDEX] == "":
        report.error(source_line, Report.INVALID_IDSPACE, "EMPTY", "col1 is empty", taxon=taxon_column, rule=1)
        return rejected()
    if gaf_line[DB_OBJECT_INDEX] == "":
        report.error(source_line, Report.INVALID_ID, "EMPTY", "col2 is empty", taxon=taxon_column, rule=1)
        return rejected()
    if taxon_column == "":
        report.error(source_line, Report.INVALID_TAXON, "EMPTY", "taxon column is empty", taxon=taxon_column, rule=1)
        return rejected()
    if gaf_line[REFERENCE_INDEX] == "":
        report.error(source_line, Report.INVALID_ID, "EMPTY", "reference column 6 is empty", taxon=taxon_column, rule=1)
        return rejected()

    # The taxon column holds the subject taxon, optionally followed by
    # "|" and a second (interacting) taxon.
    taxon_parts = taxon_column.split("|")
    taxon_curie = taxon_parts[0].replace("taxon", "NCBITaxon")
    interacting_taxon = taxon_parts[1].replace("taxon", "NCBITaxon") if len(taxon_parts) == 2 else None

    subject_curie = "{db}:{id}".format(db=gaf_line[DB_INDEX], id=gaf_line[DB_OBJECT_INDEX])
    subject = association.Subject(subject_curie, gaf_line[2], gaf_line[9], gaf_line[10].split("|"), gaf_line[11], taxon_curie)
    aspect = gaf_line[8]
    negated, relation, qualifiers = assocparser._parse_qualifier(gaf_line[3], aspect)

    # For allowed, see http://geneontology.org/docs/go-annotations/#annotation-qualifiers
    for q in qualifiers:
        if q not in allowed_qualifiers:
            report.error(
                source_line,
                Report.INVALID_QUALIFIER,
                q,
                "Qualifiers must be `contributes_to`, `colocalizes_with`, or `NOT`",
                taxon=taxon_column,
                rule=1)
            return rejected()

    object = association.Term(gaf_line[4], taxon_curie)
    evidence = association.Evidence(
        ecomap.coderef_to_ecoclass(gaf_line[6]),
        [e for e in gaf_line[REFERENCE_INDEX].split("|") if e],
        [e for e in gaf_line[7].split("|") if e])

    subject_extensions = [association.ExtensionUnit("rdfs:subClassOf", gaf_line[16])] if gaf_line[16] else []

    # Column 16: "|" separates alternatives, "," separates the units inside
    # one alternative.
    conjunctions = []
    if gaf_line[15]:
        for conjuncts in gaf_line[15].split("|"):
            extension_units = []
            for u in conjuncts.split(","):
                parsed = relation_tuple.findall(u)
                if len(parsed) != 1:
                    # The unit did not match relation(curie): bad parse.
                    report.error(source_line, Report.EXTENSION_SYNTAX_ERROR, u, "extensions should be relation(curie)", taxon=taxon_column, rule=1)
                    return rejected()
                rel, term = parsed[0]
                extension_units.append(association.ExtensionUnit(rel, term))

            conjunctions.append(association.ExtensionConjunctions(extension_units))
    object_extensions = association.ExtensionExpression(conjunctions)

    looked_up_rel = relations.lookup_label(relation)
    if looked_up_rel is None:
        report.error(
            source_line,
            assocparser.Report.INVALID_QUALIFIER,
            relation,
            "Qualifer must be \"colocalizes_with\", \"contributes_to\", or \"NOT\"",
            taxon=taxon_column,
            rule=1)
        return rejected()

    a = association.GoAssociation(
        # Joined from the padded/truncated columns, so it can differ from
        # the `source_line` used in reports.
        source_line="\t".join(gaf_line),
        subject=subject,
        relation=curie_util.contract_uri(looked_up_rel)[0],
        object=object,
        negated=negated,
        qualifiers=qualifiers,
        aspect=aspect,
        interacting_taxon=interacting_taxon,
        evidence=evidence,
        subject_extensions=subject_extensions,
        object_extensions=object_extensions,
        provided_by=gaf_line[14],
        date=gaf_line[13],
        properties={})

    return assocparser.ParseResult(source_line, [a], False, report=report)
def parse_line(self, line):
    """Parse one line of an HPOA file into an ``assocparser.ParseResult``.

    Most applications should use :method:`parse_file`; call this directly
    only when fine-grained control over parsing is required.

    Arguments
    ---------
    line : str
        A single tab-separated line from an HPOA file (14 columns)

    Returns
    -------
    assocparser.ParseResult
        Whatever the base-class ``validate_line`` returned when it is
        truthy; an empty, non-skipped result for header lines; an empty,
        skipped result for a wrong column count or an invalid subject or
        HPO id; otherwise a result holding one association dict.
    """
    config = self.config

    parsed = super().validate_line(line)
    if parsed:
        return parsed

    if self.is_header(line):
        return assocparser.ParseResult(line, [], False)

    # http://human-phenotype-ontology.github.io/documentation.html#annot
    vals = line.split("\t")
    if len(vals) != 14:
        self.report.error(
            line,
            assocparser.Report.WRONG_NUMBER_OF_COLUMNS,
            "",
            msg="There were {columns} columns found in this line, and there should be 14".format(
                columns=len(vals)))
        return assocparser.ParseResult(line, [], True)

    [db,
     db_object_id,
     db_object_symbol,
     qualifier,
     hpoid,
     reference,
     evidence,
     onset,
     frequency,
     withfrom,
     aspect,
     db_object_synonym,
     date,
     assigned_by] = vals

    # hardcode this, as HPOA is currently human-only
    taxon = 'NCBITaxon:9606'

    # hardcode this, as HPOA is currently disease-only
    db_object_type = 'disease'

    ## --
    ## db + db_object_id. CARD=1
    ## --
    subject_id = self._pair_to_id(db, db_object_id)
    if not self._validate_id(subject_id, line, ENTITY):
        return assocparser.ParseResult(line, [], True)

    if not self._validate_id(hpoid, line, ANNOTATION):
        return assocparser.ParseResult(line, [], True)

    # TODO: HPOA has different date styles, so the date is not normalized.

    # Example use case: mapping from OMIM to Orphanet
    if config.entity_map is not None:
        subject_id = self.map_id(subject_id, config.entity_map)
        # Split on the first ":" only and store plain strings: a list in
        # `vals` would make the join below raise TypeError.
        db, _, db_object_id = subject_id.partition(":")
        vals[0] = db
        vals[1] = db_object_id

    ## --
    ## end of line re-processing
    ## --
    # regenerate line post-mapping
    line = "\t".join(vals)

    ## --
    ## db_object_synonym CARD=0..*
    ## --
    synonyms = db_object_synonym.split("|") if db_object_synonym != "" else []

    ## --
    ## qualifier
    ## --
    ## we generate both qualifier and relation field
    qualifiers = qualifier.split("|") if qualifier != '' else []
    negated = 'NOT' in qualifiers
    other_qualifiers = [q for q in qualifiers if q != 'NOT']

    ## CURRENTLY NOT USED
    if len(other_qualifiers) > 0:
        relation = other_qualifiers[0]
    else:
        # Relation implied by the aspect column; None for any other aspect.
        aspect_relations = {
            'O': 'has_phenotype',
            'I': 'has_inheritance',
            'M': 'mortality',
            'C': 'has_onset',
        }
        relation = aspect_relations.get(aspect)

    ## --
    ## hpoid
    ## --
    term = {'id': hpoid, 'taxon': taxon}

    # construct subject dict
    subject = {
        'id': subject_id,
        'label': db_object_symbol,
        'type': db_object_type,
        'synonyms': synonyms,
        'taxon': {
            'id': taxon
        }
    }

    ## --
    ## evidence
    ## reference
    ## withfrom
    ## --
    evidence = {
        'type': evidence,
        # References are split on "; ", not on "|".
        'has_supporting_reference': reference.split("; ")
    }
    evidence['with_support_from'] = self._split_pipe(withfrom)

    ## Construct main return dict
    assoc = {
        'source_line': line,
        'subject': subject,
        'object': term,
        'negated': negated,
        'qualifiers': qualifiers,
        'relation': {
            'id': relation
        },
        'evidence': evidence,
        'provided_by': assigned_by,
        'date': date,
    }

    # The outcome of this check is not used to reject the line.
    self._validate_assoc(assoc, line)

    return assocparser.ParseResult(line, [assoc], False)
def parse_line(self, line):
    """Parse one line of a GPAD file into an ``assocparser.ParseResult``.

    One association dict is produced per "|"-separated alternative in the
    annotation extension column, so a result can hold several associations.

    Most applications should use :method:`parse_file`; call this directly
    only when fine-grained control over parsing is required.

    Arguments
    ---------
    line : str
        A single tab-separated line from a GPAD file (exactly 12 columns)

    Returns
    -------
    assocparser.ParseResult
        Whatever the base-class ``validate_line`` returned when it is
        truthy; an empty, non-skipped result for header lines; an empty,
        skipped result for a wrong column count or an invalid subject or
        GO id; otherwise a result holding the association dicts.
    """
    parsed = super().validate_line(line)
    if parsed:
        return parsed

    if self.is_header(line):
        return assocparser.ParseResult(line, [], False)

    vals = line.split("\t")
    if len(vals) != 12:
        self.report.error(
            line,
            Report.WRONG_NUMBER_OF_COLUMNS,
            "",
            msg="There were {columns} columns found in this line, and there should be 12".format(
                columns=len(vals)))
        # `ParseResult` is spelled out correctly here so that a bad column
        # count yields a skipped line.
        return assocparser.ParseResult(line, [], True)

    [db,
     db_object_id,
     qualifier,
     goid,
     reference,
     evidence,
     withfrom,
     interacting_taxon_id,  # TODO
     date,
     assigned_by,
     annotation_xp,
     annotation_properties] = vals

    subject_id = self._pair_to_id(db, db_object_id)
    if not self._validate_id(subject_id, line, ENTITY):
        return assocparser.ParseResult(line, [], True)

    if not self._validate_id(goid, line, ANNOTATION):
        return assocparser.ParseResult(line, [], True)

    date = self._normalize_gaf_date(date, line)

    # The outcome of this check is not used to reject the line.
    self._validate_id(evidence, line, None)

    # TODO: ecomap is currently one-way only, so the evidence class is not
    # checked against it here.

    ## --
    ## qualifier
    ## --
    negated, relation, other_qualifiers = self._parse_qualifier(qualifier, None)

    assocs = []
    # "|" separates alternatives, "," separates the expressions inside one
    # alternative. An empty column still yields one association.
    for xp_or in annotation_xp.split("|"):
        extns = []
        for xp_and in xp_or.split(","):
            if xp_and != "":
                expr = self._parse_class_expression(xp_and, line=line)
                if expr is not None:
                    extns.append(expr)

        assoc = {
            'source_line': line,
            'subject': {
                'id': subject_id
            },
            'object': {
                'id': goid,
                'extensions': extns
            },
            'negated': negated,
            'relation': {
                'id': relation
            },
            'evidence': {
                'type': evidence,
                'with_support_from': self._split_pipe(withfrom),
                'has_supporting_reference': self._split_pipe(reference)
            },
            'provided_by': assigned_by,
            'date': date,
        }
        if len(other_qualifiers) > 0:
            assoc['qualifiers'] = other_qualifiers

        # The outcome of this check is not used to drop the association.
        self._validate_assoc(assoc, line)

        assocs.append(assoc)

    return assocparser.ParseResult(line, assocs, False)
def parse_line(self, line):
    """Parse one line of a GPAD file into an ``assocparser.ParseResult``.

    The line is converted with ``to_association``, run through the GO rules,
    re-serialised with ``to_gpad_tsv`` and then validated column by column.
    Subject label, full name and synonyms come from ``self.gpi`` when the
    subject is listed there.

    Most applications should use :method:`parse_file`; call this directly
    only when fine-grained control over parsing is required.

    Arguments
    ---------
    line : str
        A single tab-separated line from a GPAD file

    Returns
    -------
    assocparser.ParseResult
        A header marker result for header lines, a skipped result when the
        line fails a rule or a validation step, otherwise a result holding
        one association dict.
    """
    parsed = super().validate_line(line)
    if parsed:
        return parsed

    if self.is_header(line):
        return assocparser.ParseResult(line, [{"header": True, "line": line.strip()}], False)

    vals = [el.strip() for el in line.split("\t")]

    # A copy is handed over so `vals` stays as it was split.
    parsed = to_association(list(vals), report=self.report)
    if parsed.associations == []:
        return parsed

    assoc = parsed.associations[0]

    go_rule_results = qc.test_go_rules(assoc, self.config)
    for rule, result in go_rule_results.all_results.items():
        if result.result_type == qc.ResultType.WARNING:
            self.report.warning(line,
                                assocparser.Report.VIOLATES_GO_RULE,
                                "",
                                msg="{id}: {message}".format(id=rule.id, message=result.message),
                                rule=int(rule.id.split(":")[1]))

        if result.result_type == qc.ResultType.ERROR:
            self.report.error(line,
                              assocparser.Report.VIOLATES_GO_RULE,
                              "",
                              msg="{id}: {message}".format(id=rule.id, message=result.message),
                              rule=int(rule.id.split(":")[1]))
            # Skip the annotation
            return assocparser.ParseResult(line, [], True)

        if result.result_type == qc.ResultType.PASS:
            self.report.message(assocparser.Report.INFO,
                                line,
                                Report.RULE_PASS,
                                "",
                                msg="Passing Rule",
                                rule=int(rule.id.split(":")[1]))

    # Continue with the columns of the annotation carried by the rule results.
    vals = list(go_rule_results.annotation.to_gpad_tsv())
    [db,
     db_object_id,
     qualifier,
     goid,
     reference,
     evidence,
     withfrom,
     interacting_taxon_id,
     date,
     assigned_by,
     annotation_xp,
     annotation_properties] = vals

    split_line = assocparser.SplitLine(line=line, values=vals, taxon="")

    subject_id = self._pair_to_id(db, db_object_id)
    if not self._validate_id(subject_id, split_line, context=ENTITY):
        return assocparser.ParseResult(line, [], True)

    if not self._validate_id(goid, split_line, context=ANNOTATION):
        return assocparser.ParseResult(line, [], True)

    valid_goid = self._validate_ontology_class_id(goid, split_line)
    if valid_goid is None:
        return assocparser.ParseResult(line, [], True)
    goid = valid_goid

    date = self._normalize_gaf_date(date, split_line)

    if reference == "":
        # NOTE(review): the message says "column 6" although the reference
        # is the fifth value unpacked above - confirm before rewording.
        self.report.error(line, Report.INVALID_ID, "EMPTY", "reference column 6 is empty")
        return assocparser.ParseResult(line, [], True)

    # The outcome of this check is not used to reject the line.
    self._validate_id(evidence, split_line)

    interacting_taxon = None
    if interacting_taxon_id != "":
        interacting_taxon = self._taxon_id(interacting_taxon_id, split_line)
        if interacting_taxon is None:
            self.report.error(line,
                              assocparser.Report.INVALID_TAXON,
                              interacting_taxon_id,
                              msg="Taxon ID is invalid")
            return assocparser.ParseResult(line, [], True)

    # TODO: ecomap is currently one-way only, so the evidence class is not
    # checked against it here.

    ## --
    ## qualifier
    ## --
    negated, relation, other_qualifiers = self._parse_qualifier(qualifier, None)

    # Reference Column (reporting occurs inside the validator)
    references = self.validate_pipe_separated_ids(reference, split_line)
    if references is None:
        return assocparser.ParseResult(line, [], True)

    # With/From (reporting occurs inside the validator)
    withfroms = self.validate_pipe_separated_ids(withfrom, split_line, empty_allowed=True, extra_delims=",")
    if withfroms is None:
        return assocparser.ParseResult(line, [], True)

    ## --
    ## parse annotation extension
    ## See appendix in http://doi.org/10.1186/1471-2105-15-155
    ## --
    object_or_exprs = self._parse_full_extension_expression(annotation_xp, line=split_line)

    # Fall back to the id itself when no GPI entry describes the subject.
    subject_symbol = subject_id
    subject_fullname = subject_id
    subject_synonyms = []
    if self.gpi is not None:
        gp = self.gpi.get(subject_id, {})
        # Truthiness, not identity: `gp is not {}` is always true and made
        # a subject missing from the GPI raise KeyError.
        if gp:
            subject_symbol = gp["symbol"]
            subject_fullname = gp["name"]
            subject_synonyms = gp["synonyms"].split("|")

    assoc = {
        'source_line': line,
        'subject': {
            'id': subject_id,
            'label': subject_symbol,
            'fullname': subject_fullname,
            'synonyms': subject_synonyms,
            # NOTE(review): the subject taxon is filled with the interacting
            # taxon - confirm this is intended.
            'taxon': {
                'id': interacting_taxon
            },
        },
        'object': {
            'id': goid
        },
        'negated': negated,
        'relation': {
            'id': relation
        },
        'interacting_taxon': interacting_taxon,
        'evidence': {
            'type': evidence,
            'with_support_from': withfroms,
            'has_supporting_reference': references
        },
        'subject_extensions': [],
        'object_extensions': {},
        'aspect': self.compute_aspect(goid),
        'provided_by': assigned_by,
        'date': date,
    }
    if len(other_qualifiers) > 0:
        assoc['qualifiers'] = other_qualifiers
    if object_or_exprs is not None and len(object_or_exprs) > 0:
        assoc['object_extensions'] = {'union_of': object_or_exprs}

    return assocparser.ParseResult(line, [assoc], False)
def to_association(gpad_line: List[str], report=None, group="unknown", dataset="unknown") -> assocparser.ParseResult:
    """Convert the columns of one GPAD line into a ``GoAssociation``.

    Arguments
    ---------
    gpad_line : List[str]
        The already-split columns of the line. 10 to 12 columns are
        accepted; extra columns are cut off with a warning. The list passed
        in is not modified.
    report : Report, optional
        Collects errors and warnings. A new ``Report`` built from `group`
        and `dataset` is used when omitted.
    group, dataset : str
        Only used to build the default report.

    Returns
    -------
    assocparser.ParseResult
        A skipped, empty result if the line is rejected, otherwise a result
        holding exactly one ``association.GoAssociation``.
    """
    report = Report(group=group, dataset=dataset) if report is None else report
    source_line = "\t".join(gpad_line)

    def rejected():
        # Result used for every line that is dropped.
        return assocparser.ParseResult(source_line, [], True, report=report)

    if len(gpad_line) > 12:
        report.warning(
            source_line,
            assocparser.Report.WRONG_NUMBER_OF_COLUMNS,
            "",
            msg="There were more than 12 columns in this line. Proceeding by cutting off extra columns.",
            rule=1)
        gpad_line = gpad_line[:12]

    if 12 > len(gpad_line) >= 10:
        # Build a new list instead of `+=` so the caller's list is untouched.
        gpad_line = gpad_line + [""] * (12 - len(gpad_line))

    if len(gpad_line) != 12:
        report.error(
            source_line,
            assocparser.Report.WRONG_NUMBER_OF_COLUMNS,
            "",
            msg="There were {columns} columns found in this line, and there should be between 10 and 12".format(
                columns=len(gpad_line)))
        return rejected()

    ## check for missing columns
    ## We use indices here because we run GO RULES before we split the vals into individual variables
    DB_INDEX = 0
    DB_OBJECT_INDEX = 1
    QUALIFIER = 2
    REFERENCE_INDEX = 4
    EVIDENCE_INDEX = 5
    EXTENSION_INDEX = 10
    PROPERTIES_INDEX = 11

    if gpad_line[DB_INDEX] == "":
        report.error(source_line, Report.INVALID_IDSPACE, "EMPTY", "col1 is empty", rule=1)
        return rejected()
    if gpad_line[DB_OBJECT_INDEX] == "":
        report.error(source_line, Report.INVALID_ID, "EMPTY", "col2 is empty", rule=1)
        return rejected()
    if gpad_line[QUALIFIER] == "":
        # Reported as a qualifier problem; this column holds no taxon.
        report.error(source_line, Report.INVALID_QUALIFIER, "EMPTY", "qualifier column is empty", rule=1)
        return rejected()
    if gpad_line[REFERENCE_INDEX] == "":
        report.error(source_line, Report.INVALID_ID, "EMPTY", "reference column is empty", rule=1)
        return rejected()
    if gpad_line[EVIDENCE_INDEX] == "":
        report.error(source_line, Report.INVALID_ID, "EMPTY", "Evidence column is empty", rule=1)
        # Stop here like the other required-column checks do.
        return rejected()

    # No taxon is taken from the line; an empty string is reported instead.
    taxon = ""
    subject_curie = "{db}:{id}".format(db=gpad_line[DB_INDEX], id=gpad_line[DB_OBJECT_INDEX])
    subject = association.Subject(subject_curie, "", "", [], "", "")
    object = association.Term(gpad_line[3], "")
    evidence = association.Evidence(
        gpad_line[EVIDENCE_INDEX],
        [e for e in gpad_line[REFERENCE_INDEX].split("|") if e],
        [e for e in gpad_line[6].split("|") if e])

    raw_qs = gpad_line[QUALIFIER].split("|")
    negated = "NOT" in raw_qs

    looked_up_qualifiers = [relations.lookup_label(q) for q in raw_qs if q != "NOT"]
    if None in looked_up_qualifiers:
        report.error(source_line, Report.INVALID_QUALIFIER, raw_qs, "Could not find a URI for qualifier", taxon=taxon, rule=1)
        return rejected()

    qualifiers = [curie_util.contract_uri(q)[0] for q in looked_up_qualifiers]

    # Annotation extensions are column 11 (index 10); index 11 holds the
    # annotation properties parsed further down. "|" separates alternatives,
    # "," separates the units inside one alternative.
    conjunctions = []
    if gpad_line[EXTENSION_INDEX]:
        for conjuncts in gpad_line[EXTENSION_INDEX].split("|"):
            extension_units = []
            for u in conjuncts.split(","):
                parsed = relation_tuple.findall(u)
                if len(parsed) != 1:
                    # The unit did not match relation(curie): bad parse.
                    report.error(source_line, Report.EXTENSION_SYNTAX_ERROR, u, "extensions should be relation(curie)", taxon=taxon, rule=1)
                    return rejected()
                rel, term = parsed[0]
                extension_units.append(association.ExtensionUnit(rel, term))

            conjunctions.append(association.ExtensionConjunctions(extension_units))
    object_extensions = association.ExtensionExpression(conjunctions)

    # Properties are "key=value" entries separated by "|". Only the first
    # "=" splits, so values may contain "="; entries without "=" are ignored
    # instead of raising IndexError.
    properties = {}
    for prop in gpad_line[PROPERTIES_INDEX].split("|"):
        key, separator, value = prop.partition("=")
        if separator:
            properties[key] = value

    a = association.GoAssociation(
        # Joined from the padded/truncated columns, so it can differ from
        # the `source_line` used in reports.
        source_line="\t".join(gpad_line),
        subject=subject,
        relation="",
        object=object,
        negated=negated,
        qualifiers=qualifiers,
        aspect=None,
        interacting_taxon=gpad_line[7],
        evidence=evidence,
        subject_extensions=[],
        object_extensions=object_extensions,
        provided_by=gpad_line[9],
        date=gpad_line[8],
        properties=properties)

    return assocparser.ParseResult(source_line, [a], False, report=report)
def to_association(
        gaf_line: List[str],
        report=None,
        group="unknown",
        dataset="unknown",
        qualifier_parser=Qualifier2_1()) -> assocparser.ParseResult:
    """Build a ``GoAssociation`` from one already-split GAF line.

    Arguments
    ---------
    gaf_line : List[str]
        The tab-split column values of a single GAF line. Lines with 15 or
        16 columns are padded with empty strings to 17; lines with more than
        17 columns are truncated (with a warning). The list passed in is not
        modified.
    report : Report, optional
        Report that collects warnings and errors. A new one is created from
        ``group`` and ``dataset`` when omitted.
    group, dataset : str
        Only used to construct the default ``report``.
    qualifier_parser
        Object whose ``validate(qualifier_column)`` result exposes ``valid``,
        ``original`` and ``message``; used to check column 4.

    Returns
    -------
    assocparser.ParseResult
        On success, a result holding exactly one association and
        ``skipped=False``. On any reported error, a result with no
        associations and ``skipped=True``.
    """
    report = Report(group=group, dataset=dataset) if report is None else report
    source_line = "\t".join(gaf_line)

    def skipped():
        # Every failure path returns the same "skipped, nothing parsed" result.
        return assocparser.ParseResult(source_line, [], True, report=report)

    if source_line == "":
        report.error(source_line, "Blank Line", "EMPTY", "Blank lines are not allowed", rule=1)
        return skipped()

    if len(gaf_line) > 17:
        # If we see more than 17 columns, we will just cut off the columns after column 17
        report.warning(
            source_line,
            assocparser.Report.WRONG_NUMBER_OF_COLUMNS,
            "",
            msg="There were more than 17 columns in this line. Proceeding by cutting off extra columns after column 17.",
            rule=1)
        gaf_line = gaf_line[:17]

    if 17 > len(gaf_line) >= 15:
        # Concatenate into a new list (not ``+=``) so the caller's list is
        # left untouched.
        gaf_line = gaf_line + [""] * (17 - len(gaf_line))

    if len(gaf_line) != 17:
        report.error(
            source_line,
            assocparser.Report.WRONG_NUMBER_OF_COLUMNS,
            "",
            msg="There were {columns} columns found in this line, and there should be 15 (for GAF v1) or 17 (for GAF v2)"
            .format(columns=len(gaf_line)),
            rule=1)
        return skipped()

    ## check for missing columns
    ## We use indices here because we run GO RULES before we split the vals into individual variables
    DB_INDEX = 0
    DB_OBJECT_INDEX = 1
    TAXON_INDEX = 12
    REFERENCE_INDEX = 5
    raw_taxon = gaf_line[TAXON_INDEX]

    # (column index, report type, message) for every column that must be
    # non-empty. The first empty one is reported and the line is skipped.
    required_columns = (
        (DB_INDEX, Report.INVALID_IDSPACE, "col1 is empty"),
        (DB_OBJECT_INDEX, Report.INVALID_ID, "col2 is empty"),
        (TAXON_INDEX, Report.INVALID_TAXON, "taxon column is empty"),
        (REFERENCE_INDEX, Report.INVALID_ID, "reference column 6 is empty"),
    )
    for column_index, error_type, message in required_columns:
        if gaf_line[column_index] == "":
            report.error(source_line, error_type, "EMPTY", message, taxon=raw_taxon, rule=1)
            return skipped()

    # The taxon column holds one taxon, or two separated by "|"; the second
    # is the interacting taxon.
    taxon_parts = raw_taxon.split("|")
    taxon_curie = taxon_parts[0].replace("taxon", "NCBITaxon")
    date = assocparser._normalize_gaf_date(gaf_line[13], report, taxon_curie, source_line)
    if date is None:
        return skipped()

    if len(taxon_parts) == 2:
        interacting_taxon = taxon_parts[1].replace("taxon", "NCBITaxon")
    else:
        interacting_taxon = None

    subject_curie = "{db}:{id}".format(db=gaf_line[DB_INDEX], id=gaf_line[DB_OBJECT_INDEX])
    subject = association.Subject(subject_curie, gaf_line[2], gaf_line[9],
                                  gaf_line[10].split("|"), gaf_line[11], taxon_curie)
    aspect = gaf_line[8]
    negated, relation, qualifiers = assocparser._parse_qualifier(gaf_line[3], aspect)

    # column 4 is qualifiers -> index 3
    # For allowed, see http://geneontology.org/docs/go-annotations/#annotation-qualifiers
    parsed_qualifiers = qualifier_parser.validate(gaf_line[3])
    if not parsed_qualifiers.valid:
        report.error(source_line, Report.INVALID_QUALIFIER, parsed_qualifiers.original,
                     parsed_qualifiers.message, taxon=raw_taxon, rule=1)
        return skipped()

    term = association.Term(gaf_line[4], taxon_curie)
    evidence = association.Evidence(
        ecomap.coderef_to_ecoclass(gaf_line[6]),
        [e for e in gaf_line[REFERENCE_INDEX].split("|") if e],
        association.ConjunctiveSet.str_to_conjunctions(gaf_line[7]))

    if gaf_line[16]:
        subject_extensions = [association.ExtensionUnit("rdfs:subClassOf", gaf_line[16])]
    else:
        subject_extensions = []

    conjunctions = []
    if gaf_line[15]:
        conjunctions = association.ConjunctiveSet.str_to_conjunctions(
            gaf_line[15],
            conjunct_element_builder=lambda el: association.ExtensionUnit.from_str(el))

        if isinstance(conjunctions, association.Error):
            # Pass the raw taxon column (a string) like every other report in
            # this function, not the split list.
            report.error(source_line, Report.EXTENSION_SYNTAX_ERROR, conjunctions.info,
                         "extensions should be relation(curie)", taxon=raw_taxon, rule=1)
            return skipped()

    looked_up_rel = relations.lookup_label(relation)
    if looked_up_rel is None:
        report.error(source_line, assocparser.Report.INVALID_QUALIFIER, relation,
                     "Could not find CURIE for relation `{}`".format(relation),
                     taxon=raw_taxon, rule=1)
        return skipped()

    a = association.GoAssociation(
        source_line="\t".join(gaf_line),
        subject=subject,
        relation=curie_util.contract_uri(looked_up_rel)[0],
        object=term,
        negated=negated,
        qualifiers=qualifiers,
        aspect=aspect,
        interacting_taxon=interacting_taxon,
        evidence=evidence,
        subject_extensions=subject_extensions,
        object_extensions=conjunctions,
        provided_by=gaf_line[14],
        date=date,
        properties={})

    return assocparser.ParseResult(source_line, [a], False, report=report)
def parse_line(self, line):
    """
    Parses a single line of a GAF

    Returns an `assocparser.ParseResult`. Typically it holds a single
    association, but there may be none (invalid or skipped line), and header
    lines yield a single `{"header": True, "line": ...}` entry.

    Note: most applications will only need to call this directly if they require fine-grained control of parsing. For most purposes,
    :method:`parse_file` can be used over the whole file

    Arguments
    ---------
    line : str
        A single tab-separated line from a GAF file
    """

    def skip(parsed_line):
        # Result used for every line that is dropped after reporting.
        return assocparser.ParseResult(parsed_line, [], True)

    # Returns assocparser.ParseResult
    parsed = super().validate_line(line)
    if parsed:
        return parsed

    if self.is_header(line):
        # Save off version info here
        if self.version is None:
            # We are still looking
            parsed = assocparser.parser_version_regex.findall(line)
            if len(parsed) == 1:
                filetype, version, _ = parsed[0]
                if version == "2.2":
                    logger.info("Detected GAF version 2.2")
                    self.version = version
                else:
                    logger.info("Detected GAF version {}, so using 2.1".format(version))
                    self.version = self.default_version

        return assocparser.ParseResult(line, [{"header": True, "line": line.strip()}], False)

    # At this point, we should have gone through all the header, and a version number should be established
    if self.version is None:
        logger.warning(
            "No version number found for this file so we will assum GAF version: {}"
            .format(self.default_version))
        self.version = self.default_version

    vals = [el.strip() for el in line.split("\t")]

    # GAF v1 is defined as 15 cols, GAF v2 as 17.
    # We treat everything as GAF2 by adding two blank columns.
    # TODO: check header metadata to see if columns corresponds to declared dataformat version
    parsed = to_association(list(vals), report=self.report, qualifier_parser=self.qualifier_parser())
    if parsed.associations == []:
        return parsed

    assoc = parsed.associations[0]
    # self.report = parsed.report

    ## Run GO Rules, save split values into individual variables
    go_rule_results = qc.test_go_rules(assoc, self.config, group=self.group)
    for rule, result in go_rule_results.all_results.items():
        if result.result_type == qc.ResultType.WARNING:
            self.report.warning(line, assocparser.Report.VIOLATES_GO_RULE, "",
                                msg="{id}: {message}".format(id=rule.id, message=result.message),
                                rule=int(rule.id.split(":")[1]))

        if result.result_type == qc.ResultType.ERROR:
            self.report.error(line, assocparser.Report.VIOLATES_GO_RULE, "",
                              msg="{id}: {message}".format(id=rule.id, message=result.message),
                              rule=int(rule.id.split(":")[1]))
            # Skip the annotation
            return skip(line)

        if result.result_type == qc.ResultType.PASS:
            self.report.message(assocparser.Report.INFO, line, Report.RULE_PASS, "",
                                msg="Passing Rule", rule=int(rule.id.split(":")[1]))

    # From here on we work with the annotation as returned by the GO rules
    # run, serialized back to the 17 GAF columns.
    vals = list(go_rule_results.annotation.to_gaf_tsv())
    [
        db, db_object_id, db_object_symbol, qualifier, goid, reference,
        evidence, withfrom, aspect, db_object_name, db_object_synonym,
        db_object_type, taxon, date, assigned_by, annotation_xp,
        gene_product_isoform
    ] = vals
    split_line = assocparser.SplitLine(line=line, values=vals, taxon=taxon)

    if self.config.group_idspace is not None and assigned_by not in self.config.group_idspace:
        self.report.warning(
            line, Report.INVALID_ID, assigned_by,
            "GORULE:0000027: assigned_by is not present in groups reference",
            taxon=taxon, rule=27)

    if self.config.entity_idspaces is not None and db not in self.config.entity_idspaces:
        # Are we a synonym?
        upgrade = self.config.entity_idspaces.reverse(db)
        if upgrade is not None:
            # If we found a synonym
            self.report.warning(
                line, Report.INVALID_ID_DBXREF, db,
                "GORULE:0000027: {} is a synonym for the correct ID {}, and has been updated"
                .format(db, upgrade),
                taxon=taxon, rule=27)
            db = upgrade

    ## --
    ## db + db_object_id. CARD=1
    ## --
    subject_id = self._pair_to_id(db, db_object_id)
    if not self._validate_id(subject_id, split_line, allowed_ids=self.config.entity_idspaces):
        return skip(line)

    # Using a given gpi file to validate the gene object
    if self.gpi is not None:
        entity = self.gpi.get(subject_id, None)
        if entity is not None:
            db_object_symbol = entity["symbol"]
            db_object_name = entity["name"]
            db_object_synonym = entity["synonyms"]
            db_object_type = entity["type"]

    if not self._validate_id(goid, split_line, context=ANNOTATION):
        # Use the module logger rather than printing to stdout.
        logger.warning("skipping because {} not validated!".format(goid))
        return skip(line)

    valid_goid = self._validate_ontology_class_id(goid, split_line)
    if valid_goid is None:
        return skip(line)
    goid = valid_goid

    ecomap = self.config.ecomap
    if ecomap is not None:
        if ecomap.coderef_to_ecoclass(evidence, reference) is None:
            self.report.error(
                line, assocparser.Report.UNKNOWN_EVIDENCE_CLASS, evidence,
                msg="Expecting a known ECO GAF code, e.g ISS", rule=1)
            return skip(line)

    references = self.validate_pipe_separated_ids(reference, split_line)
    if references is None:
        # Reporting occurs in above function call
        return skip(line)

    # With/From
    withfroms = self.validate_pipe_separated_ids(withfrom, split_line,
                                                 empty_allowed=True, extra_delims=",")
    if withfroms is None:
        # Reporting occurs in above function call
        return skip(line)

    # validation
    self._validate_symbol(db_object_symbol, split_line)

    # Example use case: mapping from UniProtKB to MOD ID
    if self.config.entity_map is not None:
        subject_id = self.map_id(subject_id, self.config.entity_map)
        toks = subject_id.split(":")
        db = toks[0]
        # Re-join the local part into a string: storing the list slice in
        # ``vals`` would make the "\t".join(vals) below raise TypeError.
        db_object_id = ":".join(toks[1:])
        # NOTE(review): only column 2 is rewritten; column 1 keeps the
        # pre-mapping prefix in the regenerated line - confirm this is intended.
        vals[1] = db_object_id

    ## --
    ## end of line re-processing
    ## --
    # regenerate line post-mapping
    line = "\t".join(vals)

    ## --
    ## taxon CARD={1,2}
    ## --
    ## if a second value is specified, this is the interacting taxon
    taxons = taxon.split("|")
    normalized_taxon = self._taxon_id(taxons[0], split_line)
    if normalized_taxon is None:
        self.report.error(line, assocparser.Report.INVALID_TAXON, taxon, msg="Taxon ID is invalid")
        return skip(line)

    self._validate_taxon(normalized_taxon, split_line)

    interacting_taxon = None
    if len(taxons) == 2:
        interacting_taxon = self._taxon_id(taxons[1], split_line)
        if interacting_taxon is None:
            self.report.error(line, assocparser.Report.INVALID_TAXON, taxon, msg="Taxon ID is invalid")
            return skip(line)

    ## --
    ## db_object_synonym CARD=0..*
    ## --
    synonyms = [] if db_object_synonym == "" else db_object_synonym.split("|")

    ## --
    ## parse annotation extension
    ## See appendix in http://doi.org/10.1186/1471-2105-15-155
    ## --
    object_or_exprs = self._parse_full_extension_expression(annotation_xp, line=split_line)

    ## --
    ## qualifier
    ## --
    negated, relation, other_qualifiers = self._parse_qualifier(qualifier, aspect)

    ## --
    ## goid
    ## --
    term = {'id': goid, 'taxon': normalized_taxon}

    # construct subject dict
    subject = {
        'id': subject_id,
        'label': db_object_symbol,
        'type': db_object_type,
        'fullname': db_object_name,
        'synonyms': synonyms,
        'taxon': {
            'id': normalized_taxon
        }
    }

    ## --
    ## gene_product_isoform
    ## --
    ## This is mapped to a more generic concept of subject_extensions
    subject_extns = []
    if gene_product_isoform is not None and gene_product_isoform != '':
        subject_extns.append({
            'property': 'isoform',
            'filler': gene_product_isoform
        })

    object_extensions = {}
    if object_or_exprs is not None and len(object_or_exprs) > 0:
        object_extensions['union_of'] = object_or_exprs

    ## --
    ## evidence
    ## reference
    ## withfrom
    ## --
    evidence_obj = {
        'type': evidence,
        'has_supporting_reference': references,
        'with_support_from': withfroms
    }

    ## Construct main return dict
    assoc = {
        'source_line': line,
        'subject': subject,
        'object': term,
        'negated': negated,
        'qualifiers': other_qualifiers,  # should be either 0 or 1 item
        'aspect': aspect,
        'relation': {
            'id': relation
        },
        'interacting_taxon': interacting_taxon,
        'evidence': evidence_obj,
        'provided_by': assigned_by,
        'date': date,
        'subject_extensions': subject_extns,
        'object_extensions': object_extensions
    }

    return assocparser.ParseResult(line, [assoc], False, evidence.upper())