def add_association_to_graph(self):
    """
    Write this disease-to-phenotype association to the graph and decorate
    the association node with its onset and frequency.

    The basic association nodes are delegated to
    ``Assoc.add_association_to_graph``.  The onset and frequency triples
    are only added when the respective attribute is neither None nor
    the empty string.

    :return: None
    """
    # the basic association nodes come first
    Assoc.add_association_to_graph(self)

    # anticipating trouble with onset ranges that look like curies
    onset = self.onset
    if not (onset is None or onset == ''):
        self.graph.addTriple(self.assoc_id, self.globaltt['onset'], onset)

    frequency = self.frequency
    if not (frequency is None or frequency == ''):
        self.graph.addTriple(
            self.assoc_id, self.globaltt['frequency'], frequency)
def add_association_to_graph(self, association_category=None):
    """
    Write this disease-to-phenotype association to the graph and decorate
    the association node with its onset and frequency.

    The basic association nodes are delegated to
    ``Assoc.add_association_to_graph``.  Each of onset and frequency is
    added as a triple on the association node, tagged with
    ``self.object_category``, but only when the attribute is neither None
    nor the empty string.

    :param association_category: accepted for interface compatibility;
        it is not read anywhere in this method
    :return: None
    """
    # the basic association nodes come first
    Assoc.add_association_to_graph(self)

    # the attribute name doubles as the translation-table key;
    # onset is handled first (onset ranges can look like curies)
    for term in ('onset', 'frequency'):
        value = getattr(self, term)
        if value is None or value == '':
            continue
        self.graph.addTriple(
            self.assoc_id, self.globaltt[term], value,
            object_category=self.object_category)
def make_association(self, record):
    """
    Build the association described by one record and add it to the graph
    together with its references, creation date and evidence.

    Side effect: ``record['relation']['id']`` is overwritten with the
    resolved "has phenotype" term.

    :param record: mapping read here via ``['subject']['id']``,
        ``['relation']['id']``, ``['object']['id']``,
        ``['evidence']['has_supporting_reference']``,
        ``['evidence']['type']`` and ``['date']``
    :return: None
    """
    model = Model(self.graph)

    # the predicate is always "has phenotype"; it is stored on the record
    record['relation']['id'] = self.resolve("has phenotype")

    # the triple itself
    subject_id = record['subject']['id']
    predicate_id = record['relation']['id']
    object_id = record['object']['id']

    g2p_assoc = Assoc(
        self.graph, self.name, sub=subject_id, obj=object_id, pred=predicate_id)

    # An RGDRef prefix exists in the curie map to route to the proper
    # reference URL in RGD; anything mentioning PMID is left untouched.
    references = []
    for ref in record['evidence']['has_supporting_reference']:
        if 'PMID' in ref:
            references.append(ref)
        else:
            references.append(ref.replace('RGD', 'RGDRef'))

    if references:
        # the first reference in the list is the source
        primary = references[0]
        g2p_assoc.add_source(identifier=primary)
        Reference(
            self.graph, primary, self.globaltt['publication']).addRefToGraph()

        # Any further references are declared equivalent to the first.
        # This seems to be specific to this source and there could be
        # non-equivalent references in this list.
        for other in references[1:]:
            model.addSameIndividual(sub=primary, obj=other)

    # creation date and evidence, then write the association out
    g2p_assoc.add_date(date=record['date'])
    evidence_id = self.resolve(record['evidence']['type'])
    g2p_assoc.add_evidence(evidence_id)
    g2p_assoc.add_association_to_graph()
def add_association_to_graph(self, entity_category=None, phenotype_category=None):
    """
    Overrides Association by including bnode support.

    After the basic association is written by
    ``Assoc.add_association_to_graph``, a blank-node "stage process" is
    created when a start and/or end stage is set, and attached to the
    association as a qualifier.  An environment, when set, is attached as
    a qualifier as well.

    The stage test used to read ``start or end is not None``, which Python
    parses as ``start or (end is not None)``: the start id was tested for
    truthiness while the end id was only tested against None.  Both ids are
    now tested the same way, and a bound that is missing is no longer
    handed to ``addTriple`` as the object of a triple.

    :param entity_category: accepted for interface compatibility;
        not read in this method
    :param phenotype_category: accepted for interface compatibility;
        not read in this method
    :return: None
    """
    Assoc.add_association_to_graph(self)

    has_start = bool(self.start_stage_id)
    has_end = bool(self.end_stage_id)

    # make a blank stage
    if has_start or has_end:
        # the label (and therefore the digest) keeps the original
        # "<start>-<end>" form, including "None" for a missing bound,
        # so that existing blank node ids do not change
        stage_process_str = '-'.join(
            (str(self.start_stage_id), str(self.end_stage_id)))
        stage_process_id = ':'.join(  # bnode
            ('_', self.gut.digest_id(stage_process_str)))

        self.model.addIndividualToGraph(
            stage_process_id, None, self.globaltt['developmental_process'])
        self.graph.addTriple(
            stage_process_id, self.globaltt['label'], stage_process_str)

        if has_start:
            self.graph.addTriple(
                stage_process_id, self.globaltt['starts during'],
                self.start_stage_id)
        if has_end:
            self.graph.addTriple(
                stage_process_id, self.globaltt['ends during'],
                self.end_stage_id)

        self.stage_process_id = stage_process_id
        self.graph.addTriple(
            self.assoc_id, self.globaltt['has_qualifier'],
            self.stage_process_id)

    if self.environment_id is not None:
        self.graph.addTriple(
            self.assoc_id, self.globaltt['has_qualifier'],
            self.environment_id)
def _add_component_pathway_association(
        self, eco_map, component, component_prefix,
        pathway_id, pathway_prefix, pathway_label, go_ecode):
    """
    Add a pathway, one of its components, and an evidence-backed
    association between the two to the graph.

    :param eco_map: mapping from evidence code to evidence curie
    :param component: component identifier without prefix; surrounding
        whitespace is stripped
    :param component_prefix: curie prefix for the component
    :param pathway_id: pathway identifier without prefix
    :param pathway_prefix: curie prefix for the pathway
    :param pathway_label: label given to the pathway
    :param go_ecode: key into eco_map; a missing key raises KeyError
        before anything is written to the graph
    :return: None
    """
    pathway = Pathway(self.graph)

    pathway_curie = '{0}:{1}'.format(pathway_prefix, pathway_id)
    component_curie = '{0}:{1}'.format(component_prefix, component.strip())
    evidence_curie = eco_map[go_ecode]

    # the pathway itself and its membership
    pathway.addPathway(pathway_curie, pathway_label)
    pathway.addComponentToPathway(component_curie, pathway_curie)

    # the reified "involved in" association carrying the evidence
    assoc = Assoc(self.graph, self.name)
    assoc.sub = component_curie
    assoc.rel = pathway.object_properties['involved_in']
    assoc.obj = pathway_curie
    assoc.set_association_id()
    assoc.add_evidence(evidence_curie)
    assoc.add_association_to_graph()
def _parse_aeolus_data(self, document, or_limit=None):
    """
    Model one AEOLUS drug document: the drug identifiers and, for each
    retained outcome, a drug-to-outcome association.

    The RXCUI and UNII curies are both labelled with the drug name and
    declared the same individual; the InChI key is attached to the RXCUI
    curie as a literal.  Every retained outcome yields an association
    from the RXCUI curie to a MEDDRA curie, followed by the outcome's
    evidence and provenance.

    Fixes relative to the previous behaviour:
      * the cut-off description is only attached when a cut-off was
        actually applied (it used to state a cut-off of "None");
      * the 'ror' quantifier is only attached when the outcome has one
        (unfiltered outcomes without 'ror' used to raise KeyError).

    :param document: mapping read here via ``['aeolus']`` ('rxcui',
        'unii', 'drug_name', 'outcomes') and ``['unii']['inchikey']``
    :param or_limit: when not None, only outcomes that have a 'ror'
        greater than or equal to this value are kept
    :return: None
    """
    model = Model(self.graph)
    aeolus = document['aeolus']

    rxcui_curie = "RXCUI:{}".format(aeolus['rxcui'])
    uni_curie = "UNII:{}".format(aeolus['unii'])

    model.addLabel(rxcui_curie, aeolus['drug_name'])
    model.addLabel(uni_curie, aeolus['drug_name'])
    model.addSameIndividual(rxcui_curie, uni_curie)
    self.graph.addTriple(
        rxcui_curie, model.annotation_properties['inchi_key'],
        document['unii']['inchikey'], object_is_literal=True)

    for outcome in aeolus['outcomes']:
        has_ror = 'ror' in outcome
        if or_limit is not None and not (
                has_ror and outcome['ror'] >= or_limit):
            continue

        drug2outcome_assoc = Assoc(self.graph, self.name)
        meddra_curie = "MEDDRA:{}".format(outcome['code'])
        model.addLabel(meddra_curie, outcome['name'])

        drug2outcome_assoc.sub = rxcui_curie
        drug2outcome_assoc.obj = meddra_curie
        drug2outcome_assoc.rel = Assoc.object_properties[
            'causes_or_contributes']

        # only claim a significance cut-off when one was used
        if or_limit is not None:
            drug2outcome_assoc.description = (
                "A proportional reporting ratio or odds "
                "ratio greater than or equal to {} in the "
                "AEOLUS data was the significance cut-off "
                "used for creating drug-outcome associations"
            ).format(or_limit)

        drug2outcome_assoc.add_association_to_graph()

        if has_ror:
            drug2outcome_assoc.add_predicate_object(
                Assoc.annotation_properties['probabalistic_quantifier'],
                outcome['ror'], 'Literal')

        self._add_outcome_evidence(drug2outcome_assoc.assoc_id, outcome)
        self._add_outcome_provenance(drug2outcome_assoc.assoc_id, outcome)
def make_association(self, record):
    """
    Construct the association for one record and write it, with its
    references, creation date and evidence, to the graph.

    Side effect: ``record['relation']['id']`` is replaced by the resolved
    "has phenotype" term.

    :param record: mapping read here via ``['subject']['id']``,
        ``['relation']['id']``, ``['object']['id']``,
        ``['evidence']['has_supporting_reference']``,
        ``['evidence']['type']`` and ``['date']``
    :return: None
    """

    def _route(ref):
        # An RGDRef prefix exists in the curie map to route to the proper
        # reference URL in RGD; anything mentioning PMID is left as is.
        if 'PMID' in ref:
            return ref
        return ref.replace('RGD', 'RGDRef')

    model = Model(self.graph)

    record['relation']['id'] = self.resolve("has phenotype")

    # define the triple
    gene = record['subject']['id']
    relation = record['relation']['id']
    phenotype = record['object']['id']

    # instantiate the association
    g2p_assoc = Assoc(
        self.graph, self.name, sub=gene, obj=phenotype, pred=relation)

    # add the references
    references = [
        _route(ref) for ref in record['evidence']['has_supporting_reference']]

    if references:
        first_ref, other_refs = references[0], references[1:]

        # the first reference in the list is the source
        g2p_assoc.add_source(identifier=first_ref)
        ref_model = Reference(
            self.graph, first_ref, self.globaltt['publication'])
        ref_model.addRefToGraph()

        # Equivalent sources for any other references in the list.
        # This seems to be specific to this source and there could be
        # non-equivalent references in this list.
        for ref in other_refs:
            model.addSameIndividual(sub=first_ref, obj=ref)

    # add the date created on
    g2p_assoc.add_date(date=record['date'])
    g2p_assoc.add_evidence(self.resolve(record['evidence']['type']))
    g2p_assoc.add_association_to_graph()
def add_association_to_graph(self):
    """
    Overrides Association by including bnode support.

    After the basic association is written by
    ``Assoc.add_association_to_graph``, a blank-node "stage process" is
    created when a start and/or end stage is set, and attached to the
    association as a qualifier.  An environment, when set, is attached as
    a qualifier as well.

    The stage test used to read ``start or end is not None``, which Python
    parses as ``start or (end is not None)``: the start id was tested for
    truthiness while the end id was only tested against None.  Both ids are
    now tested the same way, and a bound that is missing is no longer
    handed to ``addTriple`` as the object of a triple.

    :return: None
    """
    Assoc.add_association_to_graph(self)

    has_start = bool(self.start_stage_id)
    has_end = bool(self.end_stage_id)

    # make a blank stage
    if has_start or has_end:
        # the id keeps the original "<start>-<end>" form with colons
        # removed (including "None" for a missing bound) so that existing
        # blank node ids do not change
        stage_process_id = '-'.join(
            (str(self.start_stage_id), str(self.end_stage_id)))
        stage_process_id = '_:' + re.sub(r':', '', stage_process_id)

        self.model.addIndividualToGraph(
            stage_process_id, None, self.globaltt['developmental_process'])

        if has_start:
            self.graph.addTriple(
                stage_process_id, self.globaltt['starts_during'],
                self.start_stage_id)
        if has_end:
            self.graph.addTriple(
                stage_process_id, self.globaltt['ends_during'],
                self.end_stage_id)

        self.stage_process_id = stage_process_id
        self.graph.addTriple(
            self.assoc_id, self.globaltt['has_qualifier'],
            self.stage_process_id)

    if self.environment_id is not None:
        self.graph.addTriple(
            self.assoc_id, self.globaltt['has_qualifier'],
            self.environment_id)
def add_association_to_graph(self):
    """
    Overrides Association by including bnode support.

    After the basic association is written by
    ``Assoc.add_association_to_graph``, a blank-node "stage process" is
    created when a start and/or end stage is set, and attached to the
    association as a qualifier.  An environment, when set, is attached as
    a qualifier as well.

    The stage test used to read ``start or end is not None``, which Python
    parses as ``start or (end is not None)``: the start id was tested for
    truthiness while the end id was only tested against None.  Both ids are
    now tested the same way, and a bound that is missing is no longer
    handed to ``addTriple`` as the object of a triple.

    :return: None
    """
    Assoc.add_association_to_graph(self)

    has_start = bool(self.start_stage_id)
    has_end = bool(self.end_stage_id)

    # make a blank stage
    if has_start or has_end:
        # the id keeps the original "<start>-<end>" form with colons
        # removed (including "None" for a missing bound) so that existing
        # blank node ids do not change
        stage_process_id = '-'.join(
            (str(self.start_stage_id), str(self.end_stage_id)))
        stage_process_id = '_:' + re.sub(r':', '', stage_process_id)

        self.model.addIndividualToGraph(
            stage_process_id, None, self.globaltt['developmental_process'])

        if has_start:
            self.graph.addTriple(
                stage_process_id, self.globaltt['starts during'],
                self.start_stage_id)
        if has_end:
            self.graph.addTriple(
                stage_process_id, self.globaltt['ends during'],
                self.end_stage_id)

        self.stage_process_id = stage_process_id
        self.graph.addTriple(
            self.assoc_id, self.globaltt['has_qualifier'],
            self.stage_process_id)

    if self.environment_id is not None:
        self.graph.addTriple(
            self.assoc_id, self.globaltt['has_qualifier'],
            self.environment_id)
def _add_gene_anatomy_association(self, gene_id, anatomy_curie, rank):
    """
    Add a gene "expressed in" anatomy association to the graph and
    attach the rank to it as an xsd:float quantifier.

    :param gene_id: str Non curified ID; prefixed with "ENSEMBL:" here
    :param anatomy_curie: str curified anatomy term
    :param rank: rank; converted to str, stripped of commas,
        then converted to float
    :return: None
    """
    g2a_association = Assoc(self.graph, self.name)
    model = Model(self.graph)

    gene_curie = "ENSEMBL:{}".format(gene_id)
    # rank may arrive as a float, so it is stringified before the
    # thousands separators are dropped
    rank_text = str(rank).replace(',', '')

    model.addIndividualToGraph(gene_curie, None)

    g2a_association.sub = gene_curie
    g2a_association.obj = anatomy_curie
    g2a_association.rel = self.globaltt['expressed in']
    g2a_association.add_association_to_graph()

    g2a_association.add_predicate_object(
        self.globaltt['has_quantifier'], float(rank_text),
        'Literal', 'xsd:float')
def _parse_aeolus_data(self, document, or_limit=None):
    """
    Model one AEOLUS drug document: the drug identifiers and, for each
    retained outcome, a drug-to-outcome association.

    The RXCUI and UNII curies are both labelled with the drug name and
    declared the same individual; the InChI key is attached to the RXCUI
    curie as a literal.  Every retained outcome yields an association
    from the RXCUI curie to a MEDDRA curie, followed by the outcome's
    evidence and provenance.

    Fixes relative to the previous behaviour:
      * the cut-off description is only attached when a cut-off was
        actually applied (it used to state a cut-off of "None");
      * the 'ror' quantifier is only attached when the outcome has one
        (unfiltered outcomes without 'ror' used to raise KeyError).

    :param document: mapping read here via ``['aeolus']`` ('rxcui',
        'unii', 'drug_name', 'outcomes') and ``['unii']['inchikey']``
    :param or_limit: when not None, only outcomes that have a 'ror'
        greater than or equal to this value are kept
    :return: None
    """
    model = Model(self.graph)
    aeolus = document['aeolus']

    rxcui_curie = "RXCUI:{}".format(aeolus['rxcui'])
    uni_curie = "UNII:{}".format(aeolus['unii'])

    model.addLabel(rxcui_curie, aeolus['drug_name'])
    model.addLabel(uni_curie, aeolus['drug_name'])
    model.addSameIndividual(rxcui_curie, uni_curie)
    self.graph.addTriple(
        rxcui_curie, self.globaltt['inchi_key'],
        document['unii']['inchikey'], object_is_literal=True)

    for outcome in aeolus['outcomes']:
        has_ror = 'ror' in outcome
        if or_limit is not None and not (
                has_ror and outcome['ror'] >= or_limit):
            continue

        drug2outcome_assoc = Assoc(self.graph, self.name)
        meddra_curie = "MEDDRA:{}".format(outcome['code'])
        model.addLabel(meddra_curie, outcome['name'])

        drug2outcome_assoc.sub = rxcui_curie
        drug2outcome_assoc.obj = meddra_curie
        drug2outcome_assoc.rel = self.globaltt['causes_or_contributes']

        # only claim a significance cut-off when one was used
        if or_limit is not None:
            drug2outcome_assoc.description = (
                "A proportional reporting ratio or odds "
                "ratio greater than or equal to {} in the "
                "AEOLUS data was the significance cut-off "
                "used for creating drug-outcome associations"
            ).format(or_limit)

        drug2outcome_assoc.add_association_to_graph()

        if has_ror:
            drug2outcome_assoc.add_predicate_object(
                self.globaltt['probabalistic_quantifier'],
                outcome['ror'], 'Literal')

        self._add_outcome_evidence(drug2outcome_assoc.assoc_id, outcome)
        self._add_outcome_provenance(drug2outcome_assoc.assoc_id, outcome)
def _add_gene_anatomy_association(self, gene_id, anatomy_curie, rank):
    """
    Add a gene "expressed in" anatomy association to the graph and
    attach the rank to it as an xsd:float quantifier.

    The rank is stringified before its commas are removed: ``re.sub``
    raises TypeError when handed a float or int, which is what a caller
    gets when the rank column has already been parsed as a number.

    :param gene_id: str Non curified ID; prefixed with "ENSEMBL:" here
    :param anatomy_curie: str curified anatomy term
    :param rank: rank as str or number; commas are stripped
        before conversion to float
    :return: None
    """
    g2a_association = Assoc(self.graph, self.name)
    genotype = Genotype(self.graph)
    model = Model(self.graph)

    gene_curie = "ENSEMBL:{}".format(gene_id)
    rank = re.sub(r',', '', str(rank))

    model.addIndividualToGraph(
        ind_id=gene_curie, label=None, ind_type=genotype.genoparts['gene'])

    g2a_association.sub = gene_curie
    g2a_association.obj = anatomy_curie
    g2a_association.rel = Assoc.object_properties['expressed_in']
    g2a_association.add_association_to_graph()

    g2a_association.add_predicate_object(
        Assoc.datatype_properties['has_quantifier'],
        float(rank), 'Literal', 'xsd:float')
def _add_gene_anatomy_association(self, gene_id, anatomy_curie, rank):
    """
    Record that a gene is expressed in an anatomical entity, with the
    rank attached to the association as an xsd:float quantifier.

    :param gene_id: str Non curified ID; prefixed with "ENSEMBL:" here
    :param anatomy_curie: str curified anatomy term
    :param rank: rank; converted to str, stripped of commas,
        then converted to float
    :return: None
    """
    assoc = Assoc(self.graph, self.name)
    model = Model(self.graph)

    ensembl_curie = "ENSEMBL:{}".format(gene_id)

    # a regular expression cannot be applied to a float, hence str()
    rank_no_commas = re.sub(r',', '', str(rank))

    model.addIndividualToGraph(ensembl_curie, None)

    assoc.sub = ensembl_curie
    assoc.obj = anatomy_curie
    assoc.rel = self.globaltt['expressed in']
    assoc.add_association_to_graph()

    quantifier = self.globaltt['has_quantifier']
    assoc.add_predicate_object(
        quantifier, float(rank_no_commas), 'Literal', 'xsd:float')
def process_gaf(self, file, limit, id_map=None, eco_map=None):
    """
    Read a gzipped, tab-separated gene association file and add gene-to-GO
    associations (and, for IMP evidence, derived genotype-to-phenotype
    associations) to the graph.

    Rows are skipped when they are comments, have fewer than 15 or more
    than 17 columns, lack a required field, carry a NOT qualifier, or are
    UniProtKB rows without an entry in id_map.

    Fixes relative to the previous behaviour:
      * the "missing required part" message no longer splices row text
        into the logging format string (a '%' in the data broke it);
      * the ZFIN / WormBase helpers are created on first use when they
        were not created up front, instead of being unbound names.

    :param file: path of the gzipped association file
    :param limit: when not None (and not in test mode) reading stops once
        the line counter exceeds it
    :param id_map: optional mapping from UniProtKB accession to a gene
        curie of the form "prefix:id"
    :param eco_map: mapping from evidence code symbol to evidence id
    :return: None
    """
    if self.test_mode:
        graph = self.testgraph
    else:
        graph = self.graph

    model = Model(graph)
    geno = Genotype(graph)
    LOG.info("Processing Gene Associations from %s", file)
    line_counter = 0
    uniprot_hit = 0
    uniprot_miss = 0

    # helpers for reagent-targeted genes; built up front for the taxa
    # they belong to, otherwise lazily where they are needed
    zfin = None
    wbase = None
    if '7955' in self.tax_ids:
        zfin = ZFIN(self.graph_type, self.are_bnodes_skized)
    if '6239' in self.tax_ids:
        wbase = WormBase(self.graph_type, self.are_bnodes_skized)

    with gzip.open(file, 'rb') as csvfile:
        filereader = csv.reader(
            io.TextIOWrapper(csvfile, newline=""),
            delimiter='\t', quotechar='\"')
        for row in filereader:
            line_counter += 1
            # comments start with exclamation
            if re.match(r'!', ''.join(row)):
                continue

            if len(row) > 17 or len(row) < 15:
                LOG.warning(
                    "Wrong number of columns %i, expected 15 or 17\n%s",
                    len(row), row)
                continue

            # pad short rows out to the full 17 columns
            if 17 > len(row) >= 15:
                row += [""] * (17 - len(row))

            (dbase, gene_num, gene_symbol, qualifier, go_id, ref,
             eco_symbol, with_or_from, aspect, gene_name, gene_synonym,
             object_type, taxon, date, assigned_by,
             annotation_extension, gene_product_form_id) = row

            # test for required fields
            required = (
                dbase, gene_num, gene_symbol, go_id, ref, eco_symbol,
                aspect, object_type, taxon, date, assigned_by)
            if '' in required:
                LOG.error(
                    "Missing required part of annotation on row %d:\n%s",
                    line_counter, '\t'.join(row))
                continue

            # deal with qualifier NOT, contributes_to, colocalizes_with
            if re.search(r'NOT', qualifier):
                continue

            if dbase in self.localtt:
                dbase = self.localtt[dbase]
            uniprotid = None
            gene_id = None
            if dbase == 'UniProtKB':
                if id_map is not None and gene_num in id_map:
                    gene_id = id_map[gene_num]
                    uniprotid = ':'.join((dbase, gene_num))
                    (dbase, gene_num) = gene_id.split(':')
                    uniprot_hit += 1
                else:
                    # UniProt id without a 1:1 mapping to entrez/ensembl
                    uniprot_miss += 1
                    continue
            else:
                gene_num = gene_num.split(':')[-1]  # last
                gene_id = ':'.join((dbase, gene_num))

            if self.test_mode and not (
                    re.match(r'NCBIGene', gene_id) and
                    int(gene_num) in self.test_ids):
                continue

            model.addClassToGraph(gene_id, gene_symbol)
            if gene_name != '':
                model.addDescription(gene_id, gene_name)
            if gene_synonym != '':
                for syn in re.split(r'\|', gene_synonym):
                    model.addSynonym(gene_id, syn.strip())
            if re.search(r'\|', taxon):
                # TODO add annotations with >1 taxon
                LOG.info(
                    ">1 taxon (%s) on line %d. skipping", taxon, line_counter)
            else:
                tax_id = re.sub(r'taxon:', 'NCBITaxon:', taxon)
                geno.addTaxon(tax_id, gene_id)

            assoc = Assoc(graph, self.name)
            assoc.set_subject(gene_id)
            assoc.set_object(go_id)

            try:
                eco_id = eco_map[eco_symbol]
                assoc.add_evidence(eco_id)
            except KeyError:
                LOG.error("Evidence code (%s) not mapped", eco_symbol)

            refs = re.split(r'\|', ref)
            for ref in refs:
                ref = ref.strip()
                if ref != '':
                    prefix = ref.split(':')[0]  # sidestep 'MGI:MGI:'
                    if prefix in self.localtt:
                        prefix = self.localtt[prefix]
                    ref = ':'.join((prefix, ref.split(':')[-1]))
                    refg = Reference(graph, ref)
                    if prefix == 'PMID':
                        ref_type = self.globaltt['journal article']
                        refg.setType(ref_type)
                    refg.addRefToGraph()
                    assoc.add_source(ref)

            # TODO add the source of the annotations from assigned by?

            rel = self.resolve(aspect, mandatory=False)
            if rel is not None and aspect == rel:
                # resolve() handed the aspect back unchanged
                if aspect == 'F' and re.search(r'contributes_to', qualifier):
                    assoc.set_relationship(self.globaltt['contributes to'])
                else:
                    LOG.error(
                        "Aspect: %s with qualifier: %s is not recognized",
                        aspect, qualifier)
            elif rel is not None:
                assoc.set_relationship(rel)
                assoc.add_association_to_graph()
            else:
                LOG.warning("No predicate for association \n%s\n", str(assoc))

            # NOTE(review): this runs after the association was written
            # above; confirm the description is meant to be set this late
            if uniprotid is not None:
                assoc.set_description('Mapped from ' + uniprotid)
            # object_type should be one of:
            # protein_complex; protein; transcript; ncRNA; rRNA; tRNA;
            # snRNA; snoRNA; any subtype of ncRNA in the Sequence Ontology.
            # If the precise product type is unknown,
            # gene_product should be used
            ###############################################################
            # Derive G2P Associations from IMP annotations
            # in version 2.1 Pipe will indicate 'OR'
            # and Comma will indicate 'AND'.
            # in version 2.0, multiple values are separated by pipes
            # where the pipe has been used to mean 'AND'
            if eco_symbol == 'IMP' and with_or_from != '':
                withitems = re.split(r'\|', with_or_from)
                phenotypeid = go_id + 'PHENOTYPE'
                # create phenotype associations
                for itm in withitems:
                    if itm == '' or re.match(
                            r'(UniProtKB|WBPhenotype|InterPro|HGNC)', itm):
                        LOG.warning(
                            "Don't know what having a uniprot id "
                            "in the 'with' column means of %s", uniprotid)
                        continue
                    itm = re.sub(r'MGI\:MGI\:', 'MGI:', itm)
                    itm = re.sub(r'WB:', 'WormBase:', itm)

                    # for worms and fish, they might give a RNAi or MORPH
                    # in these cases make a reagent-targeted gene
                    if re.search('MRPHLNO|CRISPR|TALEN', itm):
                        if zfin is None:
                            zfin = ZFIN(
                                self.graph_type, self.are_bnodes_skized)
                        targeted_gene_id = zfin.make_targeted_gene_id(
                            gene_id, itm)
                        geno.addReagentTargetedGene(
                            itm, gene_id, targeted_gene_id)
                        # assoc is rebound from Assoc to G2PAssoc here
                        assoc = G2PAssoc(
                            graph, self.name, targeted_gene_id, phenotypeid)
                    elif re.search(r'WBRNAi', itm):
                        if wbase is None:
                            wbase = WormBase(
                                self.graph_type, self.are_bnodes_skized)
                        targeted_gene_id = \
                            wbase.make_reagent_targeted_gene_id(gene_id, itm)
                        geno.addReagentTargetedGene(
                            itm, gene_id, targeted_gene_id)
                        assoc = G2PAssoc(
                            graph, self.name, targeted_gene_id, phenotypeid)
                    else:
                        assoc = G2PAssoc(graph, self.name, itm, phenotypeid)

                    for ref in refs:
                        ref = ref.strip()
                        if ref != '':
                            prefix = ref.split(':')[0]
                            if prefix in self.localtt:
                                prefix = self.localtt[prefix]
                            ref = ':'.join((prefix, ref.split(':')[-1]))
                            assoc.add_source(ref)
                            # experimental phenotypic evidence
                            assoc.add_evidence(self.globaltt[
                                'experimental phenotypic evidence'])
                    assoc.add_association_to_graph()
                    # TODO should the G2PAssoc be
                    # the evidence for the GO assoc?

            if not self.test_mode and limit is not None and \
                    line_counter > limit:
                break

    uniprot_tot = (uniprot_hit + uniprot_miss)
    uniprot_per = 0.0
    if uniprot_tot != 0:
        uniprot_per = 100.0 * uniprot_hit / uniprot_tot
    LOG.info(
        "Uniprot: %.2f%% of %i benefited from the 1/4 day id mapping download",
        uniprot_per, uniprot_tot)
def make_association(self, record):
    """
    Construct the gene-to-phenotype association for one record and add
    it, with labels, references and evidence, to the graph.

    Side effects on ``record``: the keys 'experiment_type' and
    'pheno_obj' are added.

    Fix relative to the previous behaviour: ``''.split('|')`` returns
    ``['']``, so the "are there any references" test could never fail and
    a record without references got an empty identifier as its source.
    Empty reference tokens are now dropped before that test.

    :param record: mapping read here via 'Experiment Type', 'Phenotype',
        'SGDID', 'Gene Name' and 'Reference'
    :return: None
    """
    # prep record
    # drop the parenthesised description and map each experiment type
    # to its apo term
    experiment_types = record['Experiment Type'].split('(')[0].split(',')
    record['experiment_type'] = []
    for exp_type in experiment_types:
        exp_type = exp_type.strip()
        record['experiment_type'].append({
            'id': self.apo_term_id[exp_type],
            'term': exp_type,
        })

    phenotype = record['Phenotype']
    pheno_obj = {
        'entity': {'term': None, 'apo_id': None},
        'quality': {'term': None, 'apo_id': None},
        # False = phenotype was descriptive;
        # don't bother looking for a quality
        'has_quality': False,
    }

    if ':' in phenotype:
        # "entity: quality"
        pheno_obj['has_quality'] = True
        ent_qual = phenotype.split(': ')
        entity = ent_qual[0]
        quality = ent_qual[1]
        pheno_obj['entity']['term'] = entity
        pheno_obj['entity']['apo_id'] = self.apo_term_id[entity]
        pheno_obj['quality']['term'] = quality
        pheno_obj['quality']['apo_id'] = self.apo_term_id[quality]
    else:
        pheno_obj['entity']['term'] = phenotype
        pheno_obj['entity']['apo_id'] = self.apo_term_id[phenotype]

    record['pheno_obj'] = pheno_obj

    # begin modeling
    model = Model(self.graph)

    # define the triple
    gene = 'SGD:{}'.format(record['SGDID'])
    relation = Model.object_properties['has_phenotype']  # has phenotype

    if pheno_obj['has_quality']:
        # an entity/quality pair gets a composed label and a MONARCH id
        pheno_label = '{0}:{1}'.format(
            pheno_obj['entity']['term'], pheno_obj['quality']['term'])
        pheno_id = 'MONARCH:{0}{1}'.format(
            pheno_obj['entity']['apo_id'].replace(':', '_'),
            pheno_obj['quality']['apo_id'].replace(':', '_'))
    else:
        pheno_label = pheno_obj['entity']['term']
        pheno_id = pheno_obj['entity']['apo_id']

    g2p_assoc = Assoc(
        self.graph, self.name, sub=gene, obj=pheno_id, pred=relation)

    assoc_id = g2p_assoc.make_association_id(
        definedby='yeastgenome.org', subject=gene,
        predicate=relation, object=pheno_id)
    g2p_assoc.set_association_id(assoc_id=assoc_id)

    # add to graph to mint assoc id
    g2p_assoc.add_association_to_graph()

    model.addLabel(subject_id=gene, label=record['Gene Name'])

    # add the association triple
    model.addTriple(subject_id=gene, predicate_id=relation, obj=pheno_id)

    # make pheno subclass of UPHENO:0001001
    model.addTriple(
        subject_id=pheno_id,
        predicate_id=Model.object_properties['subclass_of'],
        obj='UPHENO:0001001')

    # pheno label
    model.addLabel(subject_id=pheno_id, label=pheno_label)

    g2p_assoc.description = self._make_description(record)

    # add the references; blanks are removed and empty tokens dropped
    references = [
        ref for ref in record['Reference'].replace(' ', '').split('|')
        if ref]

    if references:
        # make first ref in list the source
        g2p_assoc.add_source(identifier=references[0])
        ref_model = Reference(
            self.graph, references[0], Reference.ref_types['publication'])
        ref_model.addRefToGraph()

        # create equivalent source for any other refs in list
        for ref in references[1:]:
            model.addSameIndividual(sub=references[0], obj=ref)

    # add experiment type as evidence
    for exp_type in record['experiment_type']:
        g2p_assoc.add_evidence(exp_type['id'])
        model.addLabel(subject_id=exp_type['id'], label=exp_type['term'])

    # best effort: a failure here is printed and otherwise ignored
    try:
        g2p_assoc.add_association_to_graph()
    except Exception as e:
        print(e)
def process_gaf(self, gaffile, limit, id_map=None, eco_map=None):
    """
    Read a gzipped, tab-separated gene association file and add gene-to-GO
    associations (and, for IMP evidence, derived genotype-to-phenotype
    associations) to the graph.

    Columns are located by name through ``self.gaf_columns``.  Rows are
    skipped when they are blank or comments, lack a required field, carry
    a NOT qualifier, or are UniProtKB rows without an entry in id_map.
    A row with the wrong number of columns terminates the program.

    Fixes relative to the previous behaviour:
      * the required-field test was ``'' in [row[:10], row[12]]``, which
        compares a list against '' and so only ever looked at column 12;
        the extracted required fields are now tested by name;
      * a blank line or an empty first field raised IndexError on
        ``row[0][0]``;
      * the column-count error reported the expected count as the
        wrong one.

    :param gaffile: path of the gzipped association file
    :param limit: when not None (and not in test mode) reading stops once
        the reader's line number exceeds it
    :param id_map: optional mapping from UniProtKB accession to a gene
        curie of the form "prefix:id"
    :param eco_map: mapping from evidence code symbol to evidence id
    :return: None
    """
    if self.test_mode:
        graph = self.testgraph
    else:
        graph = self.graph

    model = Model(graph)
    geno = Genotype(graph)
    LOG.info("Processing Gene Associations from %s", gaffile)
    uniprot_hit = 0
    uniprot_miss = 0

    col = self.gaf_columns

    with gzip.open(gaffile, 'rb') as csvfile:
        reader = csv.reader(
            io.TextIOWrapper(csvfile, newline=""),
            delimiter='\t', quotechar='\"')
        for row in reader:
            # blank lines carry nothing; comments start with exclamation
            if not row or row[0].startswith('!'):
                continue

            if len(row) != len(col):
                LOG.error(
                    "Wrong number of columns %i, expected %i got:\n\t%s",
                    len(row), len(col), row)
                exit(1)

            dbase = row[col.index('DB')].strip()
            gene_num = row[col.index('DB_Object_ID')].strip()
            gene_symbol = row[col.index('DB_Object_Symbol')].strip()
            qualifier = row[col.index('Qualifier')]
            go_id = row[col.index('GO_ID')].strip()
            ref = row[col.index('DB:Reference')].strip()
            eco_symbol = row[col.index('Evidence Code')].strip()
            with_or_from = row[col.index('With (or) From')]
            aspect = row[col.index('Aspect')].strip()
            gene_name = row[col.index('DB_Object_Name')]
            gene_synonym = row[col.index('DB_Object_Synonym')]
            taxon = row[col.index('Taxon and Interacting taxon')].strip()
            # DB_Object_Type, Date, Assigned_By, Annotation_Extension and
            # Gene_Product_Form_ID are not used below

            # test for required fields (qualifier, with/from, name and
            # synonym are read as optional throughout this method)
            required = (
                dbase, gene_num, gene_symbol, go_id, ref, eco_symbol,
                aspect, taxon)
            if '' in required:
                LOG.error(
                    "Missing required part of annotation on row %i:\n%s",
                    reader.line_num, str(row[:-4]))
                continue

            # (Don't) deal with qualifier NOT, contributes_to,
            # colocalizes_with
            if re.search(r'NOT', qualifier):
                continue

            if dbase in self.localtt:
                dbase = self.localtt[dbase]
            uniprotid = None
            gene_id = None
            if dbase == 'UniProtKB':
                if id_map is not None and gene_num in id_map:
                    gene_id = id_map[gene_num]
                    uniprotid = ':'.join((dbase, gene_num))
                    (dbase, gene_num) = gene_id.split(':')
                    uniprot_hit += 1
                else:
                    # UniProt id without a 1:1 mapping to entrez/ensembl
                    uniprot_miss += 1
                    continue
            else:
                gene_num = gene_num.split(':')[-1]  # last
                gene_id = ':'.join((dbase, gene_num))

            # NOTE(review): this only skips rows that are both
            # non-NCBIGene and absent from test_ids; confirm that an
            # NCBIGene id missing from test_ids is meant to pass
            if self.test_mode and gene_id[:9] != 'NCBIGene:' and \
                    gene_num not in self.test_ids:
                continue

            model.addClassToGraph(gene_id, gene_symbol)
            if gene_name != '':
                model.addDescription(gene_id, gene_name)
            if gene_synonym != '':
                for syn in re.split(r'\|', gene_synonym):
                    syn = syn.strip()
                    if syn[:10] == 'UniProtKB:':
                        model.addTriple(
                            gene_id, self.globaltt['has gene product'], syn)
                    elif re.fullmatch(graph.curie_regexp, syn) is not None:
                        LOG.warning(
                            'possible curie "%s" as a literal synomym for %s',
                            syn, gene_id)
                        model.addSynonym(gene_id, syn)
                    else:
                        model.addSynonym(gene_id, syn)

            for txid in taxon.split('|'):
                tax_curie = re.sub(r'taxon:', 'NCBITaxon:', txid)
                geno.addTaxon(tax_curie, gene_id)

            assoc = Assoc(graph, self.name)
            assoc.set_subject(gene_id)
            assoc.set_object(go_id)

            try:
                eco_id = eco_map[eco_symbol]
                assoc.add_evidence(eco_id)
            except KeyError:
                LOG.error("Evidence code (%s) not mapped", eco_symbol)

            refs = re.split(r'\|', ref)
            for ref in refs:
                ref = ref.strip()
                if ref != '':
                    prefix = ref.split(':')[0]  # sidestep 'MGI:MGI:'
                    if prefix in self.localtt:
                        prefix = self.localtt[prefix]
                    ref = ':'.join((prefix, ref.split(':')[-1]))
                    refg = Reference(graph, ref)
                    if prefix == 'PMID':
                        ref_type = self.globaltt['journal article']
                        refg.setType(ref_type)
                    refg.addRefToGraph()
                    assoc.add_source(ref)

            # TODO add the source of the annotations from assigned by?

            rel = self.resolve(aspect, mandatory=False)
            if rel is not None and aspect == rel:
                # resolve() handed the aspect back unchanged
                if aspect == 'F' and re.search(r'contributes_to', qualifier):
                    assoc.set_relationship(self.globaltt['contributes to'])
                else:
                    LOG.error(
                        "Aspect: %s with qualifier: %s is not recognized",
                        aspect, qualifier)
            elif rel is not None:
                assoc.set_relationship(rel)
                assoc.add_association_to_graph()
            else:
                LOG.warning("No predicate for association \n%s\n", str(assoc))

            # NOTE(review): this runs after the association was written
            # above; confirm the description is meant to be set this late
            if uniprotid is not None:
                assoc.set_description('Mapped from ' + uniprotid)
            # object_type should be one of:
            # protein_complex; protein; transcript; ncRNA; rRNA; tRNA;
            # snRNA; snoRNA; any subtype of ncRNA in the Sequence Ontology.
            # If the precise product type is unknown,
            # gene_product should be used
            ###############################################################
            # Derive G2P Associations from IMP annotations
            # in version 2.1 Pipe will indicate 'OR'
            # and Comma will indicate 'AND'.
            # in version 2.0, multiple values are separated by pipes
            # where the pipe has been used to mean 'AND'
            if eco_symbol == 'IMP' and with_or_from != '':
                withitems = with_or_from.split('|')
                phenotypeid = go_id + 'PHENOTYPE'
                # create phenotype associations
                for itm in withitems:
                    if itm == '' or re.match(
                            r'(UniProtKB|WBPhenotype|InterPro|HGNC)', itm):
                        LOG.warning(
                            "Skipping %s from or with %s", uniprotid, itm)
                        continue
                    itm = re.sub(r'MGI\:MGI\:', 'MGI:', itm)
                    itm = re.sub(r'WB:', 'WormBase:', itm)

                    # for worms and fish, they might give a RNAi or MORPH
                    # in these cases make a reagent-targeted gene
                    if re.search('MRPHLNO|CRISPR|TALEN', itm):
                        targeted_gene_id = self.zfin.make_targeted_gene_id(
                            gene_id, itm)
                        geno.addReagentTargetedGene(
                            itm, gene_id, targeted_gene_id)
                        # assoc is rebound from Assoc to G2PAssoc here
                        assoc = G2PAssoc(
                            graph, self.name, targeted_gene_id, phenotypeid)
                    elif re.search(r'WBRNAi', itm):
                        targeted_gene_id = \
                            self.wbase.make_reagent_targeted_gene_id(
                                gene_id, itm)
                        geno.addReagentTargetedGene(
                            itm, gene_id, targeted_gene_id)
                        assoc = G2PAssoc(
                            graph, self.name, targeted_gene_id, phenotypeid)
                    else:
                        assoc = G2PAssoc(graph, self.name, itm, phenotypeid)

                    for ref in refs:
                        ref = ref.strip()
                        if ref != '':
                            prefix = ref.split(':')[0]
                            if prefix in self.localtt:
                                prefix = self.localtt[prefix]
                            ref = ':'.join((prefix, ref.split(':')[-1]))
                            assoc.add_source(ref)
                            # experimental phenotypic evidence
                            assoc.add_evidence(
                                self.globaltt[
                                    'experimental phenotypic evidence'])
                    assoc.add_association_to_graph()
                    # TODO should the G2PAssoc be the evidence for
                    # the GO assoc?

            if not self.test_mode and limit is not None and \
                    reader.line_num > limit:
                break

    uniprot_tot = (uniprot_hit + uniprot_miss)
    uniprot_per = 0.0
    if uniprot_tot != 0:
        uniprot_per = 100.0 * uniprot_hit / uniprot_tot
    LOG.info(
        "Uniprot: %.2f%% of %i benefited from the 1/4 day id mapping download",
        uniprot_per, uniprot_tot)
def make_association(self, record):
    """
    Construct a gene-to-phenotype association from one SGD phenotype record.

    The record is annotated in place with two derived keys,
    ``experiment_type`` and ``pheno_obj``, before the association is modeled.

    :param record: dict for one row; the keys read directly here are
        'Experiment Type', 'Phenotype', 'SGDID', 'Gene Name' and 'Reference'
    :return: None
    :raises KeyError: if an experiment type, entity or quality term is not
        present in self.apo_term_id
    """
    # prep record
    # remove description and map Experiment Type to apo term
    experiment_terms = record['Experiment Type'].split('(')[0].split(',')
    record['experiment_type'] = []
    for exp_type in experiment_terms:
        exp_type = exp_type.strip()
        record['experiment_type'].append({
            'id': self.apo_term_id[exp_type],
            'term': exp_type,
        })

    phenotype = record['Phenotype']
    pheno_obj = {
        'entity': {'term': None, 'apo_id': None},
        'quality': {'term': None, 'apo_id': None},
        # False = phenotype was descriptive; don't bother looking for a quality
        'has_quality': False,
    }
    if ':' in phenotype:
        # Split on the same bare colon that was tested for above and strip
        # the parts, so "entity:quality" (no space after the colon) no
        # longer raises IndexError.
        entity, quality = [
            part.strip() for part in phenotype.split(':')[:2]]
        pheno_obj['has_quality'] = True
        pheno_obj['entity']['term'] = entity
        pheno_obj['entity']['apo_id'] = self.apo_term_id[entity]
        pheno_obj['quality']['term'] = quality
        pheno_obj['quality']['apo_id'] = self.apo_term_id[quality]
    else:
        pheno_obj['entity']['term'] = phenotype
        pheno_obj['entity']['apo_id'] = self.apo_term_id[phenotype]
    record['pheno_obj'] = pheno_obj

    # begin modeling
    model = Model(self.graph)

    # define the triple
    gene = 'SGD:{}'.format(record['SGDID'])
    relation = self.globaltt['has phenotype']

    if pheno_obj['has_quality']:
        pheno_label = '{0}:{1}'.format(
            pheno_obj['entity']['term'],
            pheno_obj['quality']['term'])
        pheno_id = 'MONARCH:{0}{1}'.format(
            pheno_obj['entity']['apo_id'].replace(':', '_'),
            pheno_obj['quality']['apo_id'].replace(':', '_'))
    else:
        pheno_label = pheno_obj['entity']['term']
        pheno_id = pheno_obj['entity']['apo_id']
    g2p_assoc = Assoc(
        self.graph, self.name, sub=gene, obj=pheno_id, pred=relation)

    assoc_id = g2p_assoc.make_association_id(
        'yeastgenome.org', gene, relation, pheno_id)
    g2p_assoc.set_association_id(assoc_id=assoc_id)

    # add to graph to mint assoc id
    g2p_assoc.add_association_to_graph()

    model.addLabel(subject_id=gene, label=record['Gene Name'])

    # add the association triple
    model.addTriple(subject_id=gene, predicate_id=relation, obj=pheno_id)
    model.addTriple(
        subject_id=pheno_id,
        predicate_id=self.globaltt['subclass_of'],
        obj=self.globaltt['Phenotype'])

    # label nodes
    # pheno label
    model.addLabel(subject_id=pheno_id, label=pheno_label)

    g2p_assoc.description = self._make_description(record)

    # add the references
    # str.split always yields at least one element (possibly ''), so empty
    # identifiers are dropped here instead of being added as sources.
    references = [
        ref for ref in record['Reference'].replace(' ', '').split('|') if ref]
    # created Ref prefix in curie map to route to proper reference URL in SGD
    if references:
        # make first ref in list the source
        primary_ref = references[0]
        g2p_assoc.add_source(identifier=primary_ref)
        ref_model = Reference(
            self.graph, primary_ref, self.globaltt['publication'])
        ref_model.addRefToGraph()
        # create equivalent source for any other refs in list
        for ref in references[1:]:
            model.addSameIndividual(sub=primary_ref, obj=ref)

    # add experiment type as evidence
    for exp_type in record['experiment_type']:
        g2p_assoc.add_evidence(exp_type['id'])
        model.addLabel(subject_id=exp_type['id'], label=exp_type['term'])

    # Second write picks up the description, sources and evidence added
    # since the first call; kept best-effort as in the original.
    try:
        g2p_assoc.add_association_to_graph()
    except Exception as e:
        print(e)
    return
def _add_therapy_drug_association(self, drug_id, disease_id, therapy_status_id):
    """
    Link a drug to a disease with the 'substance_that_treats' object
    property, reify that link as an association, and attach the therapy
    status to the association.

    :param drug_id: Id as curie of the drug
    :param disease_id: Id as curie of the disease
    :param therapy_status_id: object attached to the association through
        the placeholder approval-status relationship
    :return: None
    """
    graph_utils = GraphUtils(curie_map.get())
    treats = graph_utils.object_properties['substance_that_treats']

    # Placeholder relationship, note this does not exist in RO
    approval_status_rel = "RO:has_approval_status"

    # direct drug -> disease triple
    graph_utils.addTriple(self.graph, drug_id, treats, disease_id)

    # Make association
    annotation_id = self.make_cgd_id(
        "assoc{0}{1}".format(drug_id, disease_id))
    association = Assoc(self.name)
    association.set_subject(drug_id)
    association.set_relationship(treats)
    association.set_object(disease_id)
    association.set_association_id(annotation_id)
    association.add_association_to_graph(self.graph)

    # hang the approval status off the association node
    graph_utils.addTriple(
        self.graph, annotation_id, approval_status_rel, therapy_status_id)
def make_association(self, record):
    """
    Construct a gene-to-phenotype association from one SGD phenotype record.

    The record is annotated in place with two derived keys,
    ``experiment_type`` and ``pheno_obj``, before the association is modeled.

    :param record: dict for one row; the keys read here are
        'Experiment Type', 'Phenotype', 'SGDID', 'Gene Name', 'Reference',
        'Strain Background', 'Allele', 'Chemical', 'Condition', 'Details',
        'Feature Name', 'Mutant Type' and 'Reporter'
    :return: None
    :raises KeyError: if an experiment type, entity or quality term is not
        present in self.apo_term_id
    """
    # prep record
    # remove description and map Experiment Type to apo term
    experiment_terms = record['Experiment Type'].split('(')[0].split(',')
    record['experiment_type'] = []
    for exp_type in experiment_terms:
        exp_type = exp_type.strip()
        record['experiment_type'].append({
            'id': self.apo_term_id[exp_type],
            'term': exp_type,
        })

    phenotype = record['Phenotype']
    pheno_obj = {
        'entity': {'term': None, 'apo_id': None},
        'quality': {'term': None, 'apo_id': None},
        # False = phenotype was descriptive and don't bother
        # looking for a quality
        'has_quality': False,
    }
    if ':' in phenotype:
        # Split on the same bare colon that was tested for above and strip
        # the parts, so "entity:quality" (no space after the colon) no
        # longer raises IndexError.
        entity, quality = [
            part.strip() for part in phenotype.split(':')[:2]]
        pheno_obj['has_quality'] = True
        pheno_obj['entity']['term'] = entity
        pheno_obj['entity']['apo_id'] = self.apo_term_id[entity]
        pheno_obj['quality']['term'] = quality
        pheno_obj['quality']['apo_id'] = self.apo_term_id[quality]
    else:
        pheno_obj['entity']['term'] = phenotype
        pheno_obj['entity']['apo_id'] = self.apo_term_id[phenotype]
    record['pheno_obj'] = pheno_obj

    # begin modeling
    model = Model(self.graph)

    # define the triple
    gene = 'SGD:{}'.format(record['SGDID'])
    relation = Model.object_properties['has_phenotype']  # has phenotype

    if pheno_obj['has_quality']:
        pheno_label = '{0}:{1}'.format(
            pheno_obj['entity']['term'],
            pheno_obj['quality']['term'])
        pheno_id = 'MONARCH:{0}{1}'.format(
            pheno_obj['entity']['apo_id'].replace(':', '_'),
            pheno_obj['quality']['apo_id'].replace(':', '_'))
    else:
        pheno_label = pheno_obj['entity']['term']
        pheno_id = pheno_obj['entity']['apo_id']
    g2p_assoc = Assoc(
        self.graph, self.name, sub=gene, obj=pheno_id, pred=relation)

    assoc_id = g2p_assoc.make_association_id(
        definedby='yeastgenome.org', subject=gene, predicate=relation,
        object=pheno_id)
    g2p_assoc.set_association_id(assoc_id=assoc_id)

    # add to graph to mint assoc id
    g2p_assoc.add_association_to_graph()

    model.addLabel(subject_id=gene, label=record['Gene Name'])

    # add the association triple
    model.addTriple(subject_id=gene, predicate_id=relation, obj=pheno_id)

    # make pheno subclass of UPHENO:0001001
    model.addTriple(
        subject_id=pheno_id,
        predicate_id=Model.object_properties['subclass_of'],
        obj='UPHENO:0001001')

    # label nodes
    # pheno label
    model.addLabel(subject_id=pheno_id, label=pheno_label)

    # add the description: all the unmodeled data in a '|' delimited list
    description = [
        'genomic_background: {}'.format(record['Strain Background']),
        'allele: {}'.format(record['Allele']),
        'chemical: {}'.format(record['Chemical']),
        'condition: {}'.format(record['Condition']),
        'details: {}'.format(record['Details']),
        'feature_name: {}'.format(record['Feature Name']),
        'gene_name: {}'.format(record['Gene Name']),
        'mutant_type: {}'.format(record['Mutant Type']),
        'reporter: {}'.format(record['Reporter']),
    ]
    g2p_assoc.description = " | ".join(description)

    # add the references
    # str.split always yields at least one element (possibly ''), so empty
    # identifiers are dropped here instead of being added as sources.
    references = [
        ref for ref in record['Reference'].replace(' ', '').split('|') if ref]
    if references:
        # make first ref in list the source
        primary_ref = references[0]
        g2p_assoc.add_source(identifier=primary_ref)
        ref_model = Reference(
            self.graph, primary_ref, Reference.ref_types['publication'])
        ref_model.addRefToGraph()
        # create equivalent source for any other refs in list
        for ref in references[1:]:
            model.addSameIndividual(sub=primary_ref, obj=ref)

    # add experiment type as evidence
    for exp_type in record['experiment_type']:
        g2p_assoc.add_evidence(exp_type['id'])
        model.addLabel(subject_id=exp_type['id'], label=exp_type['term'])

    # Second write picks up the description, sources and evidence added
    # since the first call; kept best-effort as in the original.
    try:
        g2p_assoc.add_association_to_graph()
    except Exception as e:
        print(e)
    return
def process_gaf(self, file, limit, id_map=None, eco_map=None):
    """
    Read a gzip-compressed, tab-delimited gene association file and add
    gene-to-GO associations (and, for IMP evidence with a populated
    with/from column, derived gene-to-phenotype associations) to the graph.

    :param file: path of the gzipped association file
    :param limit: stop once this many lines have been read
        (ignored in test mode; None means no limit)
    :param id_map: optional mapping of UniProtKB accession to a single
        'prefix:local_id' gene curie; UniProtKB rows without a mapping
        are skipped
    :param eco_map: optional mapping of evidence code symbol to the
        evidence identifier passed to add_evidence
    :return: None
    """
    graph = self.testgraph if self.test_mode else self.graph
    model = Model(graph)
    geno = Genotype(graph)
    LOG.info("Processing Gene Associations from %s", file)
    line_counter = 0
    uniprot_hit = 0
    uniprot_miss = 0

    # NOTE(review): zfin / wbase are only bound when the matching taxon was
    # requested, yet they are used below whenever a reagent id shows up in
    # the with/from column -- confirm that cannot happen for other taxa.
    if 7955 in self.tax_ids:
        zfin = ZFIN(self.graph_type, self.are_bnodes_skized)
    if 6239 in self.tax_ids:
        wbase = WormBase(self.graph_type, self.are_bnodes_skized)

    with gzip.open(file, 'rb') as csvfile:
        filereader = csv.reader(
            io.TextIOWrapper(csvfile, newline=""),
            delimiter='\t', quotechar='\"')
        for row in filereader:
            line_counter += 1
            # comments start with exclamation
            if ''.join(row).startswith('!'):
                continue

            if len(row) > 17 or len(row) < 15:
                LOG.warning(
                    "Wrong number of columns %i, expected 15 or 17\n%s",
                    len(row), row)
                continue
            if len(row) < 17:
                # pad the optional trailing columns
                row += [""] * (17 - len(row))

            (dbase, gene_num, gene_symbol, qualifier, go_id, ref,
             eco_symbol, with_or_from, aspect, gene_name, gene_synonym,
             object_type, taxon, date, assigned_by, annotation_extension,
             gene_product_form_id) = row

            # test for required fields
            if '' in (dbase, gene_num, gene_symbol, go_id, ref, eco_symbol,
                      aspect, object_type, taxon, date, assigned_by):
                # The row text is passed as an argument, not spliced into
                # the format string, so a '%' in the data cannot break
                # log formatting.
                LOG.error(
                    "Missing required part of annotation on row %d:\n%s",
                    line_counter, '\t'.join(row))
                continue

            # deal with qualifier NOT, contributes_to, colocalizes_with
            if 'NOT' in qualifier:
                continue

            if dbase in self.localtt:
                dbase = self.localtt[dbase]
            uniprotid = None
            gene_id = None
            if dbase == 'UniProtKB':
                if id_map is not None and gene_num in id_map:
                    gene_id = id_map[gene_num]
                    uniprotid = ':'.join((dbase, gene_num))
                    (dbase, gene_num) = gene_id.split(':')
                    uniprot_hit += 1
                else:
                    # LOG.warning(
                    #   "UniProt id %s is without a 1:1 mapping to entrez/ensembl",
                    #    gene_num)
                    uniprot_miss += 1
                    continue
            else:
                gene_num = gene_num.split(':')[-1]  # last
                gene_id = ':'.join((dbase, gene_num))

            if self.test_mode and not (
                    re.match(r'NCBIGene', gene_id) and
                    int(gene_num) in self.test_ids):
                continue

            model.addClassToGraph(gene_id, gene_symbol)
            if gene_name != '':
                model.addDescription(gene_id, gene_name)
            if gene_synonym != '':
                for syn in gene_synonym.split('|'):
                    model.addSynonym(gene_id, syn.strip())
            if '|' in taxon:
                # TODO add annotations with >1 taxon
                LOG.info(
                    ">1 taxon (%s) on line %d. skipping", taxon, line_counter)
            else:
                tax_id = re.sub(r'taxon:', 'NCBITaxon:', taxon)
                geno.addTaxon(tax_id, gene_id)

            assoc = Assoc(graph, self.name)
            assoc.set_subject(gene_id)
            assoc.set_object(go_id)

            # eco_map defaults to None; treat that like an unmapped code
            # instead of letting a TypeError escape.
            if eco_map is not None and eco_symbol in eco_map:
                assoc.add_evidence(eco_map[eco_symbol])
            else:
                LOG.error("Evidence code (%s) not mapped", eco_symbol)

            refs = ref.split('|')
            for ref in refs:
                ref = ref.strip()
                if ref != '':
                    prefix = ref.split(':')[0]  # sidestep 'MGI:MGI:'
                    if prefix in self.localtt:
                        prefix = self.localtt[prefix]
                    ref = ':'.join((prefix, ref.split(':')[-1]))
                    refg = Reference(graph, ref)
                    if prefix == 'PMID':
                        ref_type = self.globaltt['journal article']
                        refg.setType(ref_type)
                    refg.addRefToGraph()
                    assoc.add_source(ref)

            # TODO add the source of the annotations from assigned by?

            rel = self.resolve(aspect, mandatory=False)
            if rel is not None and aspect == rel:
                if aspect == 'F' and re.search(r'contributes_to', qualifier):
                    assoc.set_relationship(self.globaltt['contributes to'])
                else:
                    LOG.error(
                        "Aspect: %s with qualifier: %s is not recognized",
                        aspect, qualifier)
            elif rel is not None:
                assoc.set_relationship(rel)
                assoc.add_association_to_graph()
            else:
                LOG.warning("No predicate for association \n%s\n", str(assoc))

            # NOTE(review): the description is set after
            # add_association_to_graph() has already been called for this
            # association -- confirm that ordering is intended.
            if uniprotid is not None:
                assoc.set_description('Mapped from ' + uniprotid)
            # object_type should be one of:
            # protein_complex; protein; transcript; ncRNA; rRNA; tRNA;
            # snRNA; snoRNA; any subtype of ncRNA in the Sequence Ontology.
            # If the precise product type is unknown,
            # gene_product should be used
            ###############################################################

            # Derive G2P Associations from IMP annotations
            # in version 2.1 Pipe will indicate 'OR'
            # and Comma will indicate 'AND'.
            # in version 2.0, multiple values are separated by pipes
            # where the pipe has been used to mean 'AND'
            if eco_symbol == 'IMP' and with_or_from != '':
                withitems = with_or_from.split('|')
                phenotypeid = go_id + 'PHENOTYPE'
                # create phenotype associations
                for itm in withitems:
                    if itm == '' or re.match(
                            r'(UniProtKB|WBPhenotype|InterPro|HGNC)', itm):
                        LOG.warning(
                            "Don't know what having a uniprot id "
                            "in the 'with' column means of %s", uniprotid)
                        continue
                    itm = re.sub(r'MGI\:MGI\:', 'MGI:', itm)
                    itm = re.sub(r'WB:', 'WormBase:', itm)

                    # for worms and fish, they might give a RNAi or MORPH
                    # in these cases make a reagent-targeted gene
                    if re.search('MRPHLNO|CRISPR|TALEN', itm):
                        targeted_gene_id = zfin.make_targeted_gene_id(
                            gene_id, itm)
                        geno.addReagentTargetedGene(
                            itm, gene_id, targeted_gene_id)
                        # TODO PYLINT why is this needed?
                        # Redefinition of assoc type from
                        # dipper.models.assoc.Association.Assoc to
                        # dipper.models.assoc.G2PAssoc.G2PAssoc
                        assoc = G2PAssoc(
                            graph, self.name, targeted_gene_id, phenotypeid)
                    elif re.search(r'WBRNAi', itm):
                        targeted_gene_id = \
                            wbase.make_reagent_targeted_gene_id(gene_id, itm)
                        geno.addReagentTargetedGene(
                            itm, gene_id, targeted_gene_id)
                        assoc = G2PAssoc(
                            graph, self.name, targeted_gene_id, phenotypeid)
                    else:
                        assoc = G2PAssoc(graph, self.name, itm, phenotypeid)

                    for ref in refs:
                        ref = ref.strip()
                        if ref != '':
                            prefix = ref.split(':')[0]
                            if prefix in self.localtt:
                                prefix = self.localtt[prefix]
                            ref = ':'.join((prefix, ref.split(':')[-1]))
                            assoc.add_source(ref)
                            # experimental phenotypic evidence
                            assoc.add_evidence(
                                self.globaltt['experimental phenotypic evidence'])
                    assoc.add_association_to_graph()
                    # TODO should the G2PAssoc be
                    # the evidence for the GO assoc?

            if not self.test_mode and limit is not None and \
                    line_counter > limit:
                break

        uniprot_tot = (uniprot_hit + uniprot_miss)
        uniprot_per = 0.0
        if uniprot_tot != 0:
            uniprot_per = 100.0 * uniprot_hit / uniprot_tot
        # '%.2f' (not '%f.2') is the two-decimal float conversion
        LOG.info(
            "Uniprot: %.2f%% of %i benefited from the 1/4 day id mapping download",
            uniprot_per, uniprot_tot)
    return
def _add_component_pathway_association(self, gene_curie, pathway_curie, pathway_label, eco_curie):
    """
    Register a pathway, add the gene to it as a component, and record an
    'involved in' association between the gene and the pathway.

    :param gene_curie: curie of the component gene (association subject)
    :param pathway_curie: curie of the pathway (association object)
    :param pathway_label: label passed along when the pathway is added
    :param eco_curie: evidence identifier attached to the association
    :return: None
    """
    # pathway node plus its component link
    pathway_model = Pathway(self.graph)
    pathway_model.addPathway(pathway_curie, pathway_label)
    pathway_model.addComponentToPathway(gene_curie, pathway_curie)

    # gene --involved in--> pathway, backed by the given evidence
    assoc = Assoc(self.graph, self.name)
    assoc.sub = gene_curie
    involved_in = self.globaltt['involved in']
    assoc.rel = involved_in
    assoc.obj = pathway_curie
    assoc.set_association_id()
    assoc.add_evidence(eco_curie)
    assoc.add_association_to_graph()
def process_gaf(self, file, limit, id_map=None):
    """
    Read a gzip-compressed, tab-delimited gene association file and add
    gene-to-GO associations (and, for IMP evidence with a populated
    with/from column, derived gene-to-phenotype associations) to the graph.

    :param file: path of the gzipped association file
    :param limit: stop once this many lines have been read
        (ignored in test mode; None means no limit)
    :param id_map: optional mapping of UniProtKB accession to a list of
        gene curies; UniProtKB rows are kept only when exactly one gene
        is mapped
    :return: None
    """
    g = self.testgraph if self.testMode else self.graph
    model = Model(g)
    geno = Genotype(g)
    logger.info("Processing Gene Associations from %s", file)
    line_counter = 0

    # Two independent checks (not if/elif) so that both helpers exist when
    # both taxa are requested.
    # NOTE(review): zfin / wbase stay unbound when their taxon was not
    # requested, yet they are used below whenever a reagent id shows up in
    # the with/from column -- confirm that cannot happen for other taxa.
    if 7955 in self.tax_ids:
        zfin = ZFIN(self.graph_type, self.are_bnodes_skized)
    if 6239 in self.tax_ids:
        wbase = WormBase(self.graph_type, self.are_bnodes_skized)

    # loop-invariant: GO aspect code -> relationship
    aspect_rel_map = {
        'P': model.object_properties['involved_in'],  # involved in
        'F': model.object_properties['enables'],  # enables
        'C': model.object_properties['part_of']  # part of
    }

    with gzip.open(file, 'rb') as csvfile:
        filereader = csv.reader(
            io.TextIOWrapper(csvfile, newline=""),
            delimiter='\t', quotechar='\"')
        for row in filereader:
            line_counter += 1
            # comments start with exclamation
            if ''.join(row).startswith('!'):
                continue

            (db, gene_num, gene_symbol, qualifier, go_id, ref, eco_symbol,
             with_or_from, aspect, gene_name, gene_synonym, object_type,
             taxon, date, assigned_by, annotation_extension,
             gene_product_form_id) = row

            # test for required fields
            if '' in (db, gene_num, gene_symbol, go_id, ref, eco_symbol,
                      aspect, object_type, taxon, date, assigned_by):
                # The row text is passed as an argument, not spliced into
                # the format string, so a '%' in the data cannot break
                # log formatting.
                logger.error(
                    "Missing required part of annotation on row %d:\n%s",
                    line_counter, '\t'.join(row))
                continue

            # deal with qualifier NOT, contributes_to, colocalizes_with
            if 'NOT' in qualifier:
                continue

            db = self.clean_db_prefix(db)
            uniprotid = None
            gene_id = None
            if db == 'UniProtKB':
                # id_map defaults to None, so guard before calling .get()
                mapped_ids = \
                    id_map.get(gene_num) if id_map is not None else None
                # skip unmapped ids, empty mappings and ids mapped to
                # more than one gene
                if not mapped_ids or len(mapped_ids) != 1:
                    continue
                gene_id = mapped_ids[0]
                uniprotid = ':'.join((db, gene_num))
                gene_num = re.sub(r'\w+\:', '', gene_id)
            elif db == 'MGI':
                gene_num = re.sub(r'MGI:', '', gene_num)
                gene_id = ':'.join((db, gene_num))
                gene_id = re.sub(r'MGI\:MGI\:', 'MGI:', gene_id)
            else:
                gene_id = ':'.join((db, gene_num))

            if self.testMode and not (
                    re.match(r'NCBIGene', gene_id) and
                    int(gene_num) in self.test_ids):
                continue

            model.addClassToGraph(gene_id, gene_symbol)
            if gene_name != '':
                model.addDescription(gene_id, gene_name)
            if gene_synonym != '':
                for syn in gene_synonym.split('|'):
                    model.addSynonym(gene_id, syn.strip())
            if '|' in taxon:
                # TODO add annotations with >1 taxon
                logger.info(
                    ">1 taxon (%s) on line %d. skipping", taxon, line_counter)
            else:
                tax_id = re.sub(r'taxon:', 'NCBITaxon:', taxon)
                geno.addTaxon(tax_id, gene_id)

            assoc = Assoc(g, self.name)
            assoc.set_subject(gene_id)
            assoc.set_object(go_id)

            eco_id = self.map_go_evidence_code_to_eco(eco_symbol)
            if eco_id is not None:
                assoc.add_evidence(eco_id)

            refs = ref.split('|')
            for r in refs:
                r = r.strip()
                if r != '':
                    prefix = re.split(r':', r)[0]
                    r = re.sub(prefix, self.clean_db_prefix(prefix), r)
                    r = re.sub(r'MGI\:MGI\:', 'MGI:', r)
                    reference = Reference(g, r)
                    if re.match(r'PMID', r):
                        ref_type = Reference.ref_types['journal_article']
                        reference.setType(ref_type)
                    reference.addRefToGraph()
                    assoc.add_source(r)

            # TODO add the source of the annotations from assigned by?

            # NOTE(review): an unrecognized aspect is only logged; the
            # association is still written with rel = None -- confirm.
            if aspect not in aspect_rel_map:
                logger.error("Aspect not recognized: %s", aspect)
            rel = aspect_rel_map.get(aspect)
            if aspect == 'F' and re.search(r'contributes_to', qualifier):
                rel = model.object_properties['contributes_to']
            assoc.set_relationship(rel)
            if uniprotid is not None:
                assoc.set_description('Mapped from ' + uniprotid)
            # object_type should be one of:
            # protein_complex; protein; transcript; ncRNA; rRNA; tRNA;
            # snRNA; snoRNA; any subtype of ncRNA in the Sequence Ontology.
            # If the precise product type is unknown,
            # gene_product should be used

            assoc.add_association_to_graph()

            # Derive G2P Associations from IMP annotations
            # in version 2.1 Pipe will indicate 'OR'
            # and Comma will indicate 'AND'.
            # in version 2.0, multiple values are separated by pipes
            # where the pipe has been used to mean 'AND'
            if eco_symbol == 'IMP' and with_or_from != '':
                withitems = with_or_from.split('|')
                phenotypeid = go_id + 'PHENOTYPE'
                # create phenotype associations
                for itm in withitems:
                    if itm == '' or re.match(
                            r'(UniProtKB|WBPhenotype|InterPro|HGNC)', itm):
                        logger.warning(
                            "Don't know what having a uniprot id "
                            "in the 'with' column means of %s", uniprotid)
                        continue
                    itm = re.sub(r'MGI\:MGI\:', 'MGI:', itm)
                    itm = re.sub(r'WB:', 'WormBase:', itm)

                    # for worms and fish, they might give a RNAi or MORPH
                    # in these cases make a reagent-targeted gene
                    if re.search('MRPHLNO|CRISPR|TALEN', itm):
                        targeted_gene_id = zfin.make_targeted_gene_id(
                            gene_id, itm)
                        geno.addReagentTargetedGene(
                            itm, gene_id, targeted_gene_id)
                        # TODO PYLINT why is this:
                        # Redefinition of assoc type from
                        # dipper.models.assoc.Association.Assoc to
                        # dipper.models.assoc.G2PAssoc.G2PAssoc
                        assoc = G2PAssoc(
                            g, self.name, targeted_gene_id, phenotypeid)
                    elif re.search(r'WBRNAi', itm):
                        targeted_gene_id = \
                            wbase.make_reagent_targeted_gene_id(gene_id, itm)
                        geno.addReagentTargetedGene(
                            itm, gene_id, targeted_gene_id)
                        assoc = G2PAssoc(
                            g, self.name, targeted_gene_id, phenotypeid)
                    else:
                        assoc = G2PAssoc(g, self.name, itm, phenotypeid)

                    for r in refs:
                        r = r.strip()
                        if r != '':
                            prefix = re.split(r':', r)[0]
                            r = re.sub(
                                prefix, self.clean_db_prefix(prefix), r)
                            r = re.sub(r'MGI\:MGI\:', 'MGI:', r)
                            assoc.add_source(r)
                            # experimental phenotypic evidence
                            assoc.add_evidence("ECO:0000059")
                    assoc.add_association_to_graph()
                    # TODO should the G2PAssoc be
                    # the evidence for the GO assoc?

            if not self.testMode and \
                    limit is not None and line_counter > limit:
                break

    return
def _add_component_pathway_association(self, eco_map, component, component_prefix, pathway_id, pathway_prefix, pathway_label, go_ecode):
    """
    Register a pathway, add a component gene to it, and record an
    'involved in' association between the gene and the pathway.

    :param eco_map: mapping indexed by go_ecode to get the evidence id
    :param component: local id of the component; surrounding whitespace
        is stripped before the curie is built
    :param component_prefix: curie prefix for the component
    :param pathway_id: local id of the pathway
    :param pathway_prefix: curie prefix for the pathway
    :param pathway_label: label passed along when the pathway is added
    :param go_ecode: evidence code looked up in eco_map
    :return: None
    :raises KeyError: if go_ecode is not a key of eco_map
    """
    pathway_model = Pathway(self.graph)

    # build the curies and resolve the evidence before touching the graph
    pathway_curie = "{0}:{1}".format(pathway_prefix, pathway_id)
    component_id = component.strip()
    gene_curie = "{0}:{1}".format(component_prefix, component_id)
    eco_curie = eco_map[go_ecode]

    # pathway node plus its component link
    pathway_model.addPathway(pathway_curie, pathway_label)
    pathway_model.addComponentToPathway(gene_curie, pathway_curie)

    # gene --involved in--> pathway, backed by the mapped evidence
    assoc = Assoc(self.graph, self.name)
    assoc.sub = gene_curie
    involved_in = self.globaltt['involved in']
    assoc.rel = involved_in
    assoc.obj = pathway_curie
    assoc.set_association_id()
    assoc.add_evidence(eco_curie)
    assoc.add_association_to_graph()
    return
def process_gaf(self, file, limit, id_map=None):
    """
    Read a gzip-compressed, tab-delimited gene association file and add
    gene-to-GO associations (and, for IMP evidence with a populated
    with/from column, derived gene-to-phenotype associations) to the graph.

    :param file: path of the gzipped association file
    :param limit: stop once this many lines have been read
        (ignored in test mode; None means no limit)
    :param id_map: optional mapping of UniProtKB accession to a list of
        gene curies; UniProtKB rows are kept only when exactly one gene
        is mapped
    :return: None
    """
    g = self.testgraph if self.testMode else self.graph
    model = Model(g)
    geno = Genotype(g)
    logger.info("Processing Gene Associations from %s", file)
    line_counter = 0

    # Two independent checks (not if/elif) so that both helpers exist when
    # both taxa are requested.
    # NOTE(review): zfin / wbase stay unbound when their taxon was not
    # requested, yet they are used below whenever a reagent id shows up in
    # the with/from column -- confirm that cannot happen for other taxa.
    if 7955 in self.tax_ids:
        zfin = ZFIN(self.graph_type, self.are_bnodes_skized)
    if 6239 in self.tax_ids:
        wbase = WormBase(self.graph_type, self.are_bnodes_skized)

    # loop-invariant: GO aspect code -> relationship
    aspect_rel_map = {
        'P': model.object_properties['involved_in'],  # involved in
        'F': model.object_properties['enables'],  # enables
        'C': model.object_properties['part_of']  # part of
    }

    with gzip.open(file, 'rb') as csvfile:
        filereader = csv.reader(
            io.TextIOWrapper(csvfile, newline=""),
            delimiter='\t', quotechar='\"')
        for row in filereader:
            line_counter += 1
            # comments start with exclamation
            if ''.join(row).startswith('!'):
                continue

            (db, gene_num, gene_symbol, qualifier, go_id, ref, eco_symbol,
             with_or_from, aspect, gene_name, gene_synonym, object_type,
             taxon, date, assigned_by, annotation_extension,
             gene_product_form_id) = row

            # test for required fields
            if '' in (db, gene_num, gene_symbol, go_id, ref, eco_symbol,
                      aspect, object_type, taxon, date, assigned_by):
                # The row text is passed as an argument, not spliced into
                # the format string, so a '%' in the data cannot break
                # log formatting.
                logger.error(
                    "Missing required part of annotation on row %d:\n%s",
                    line_counter, '\t'.join(row))
                continue

            # deal with qualifier NOT, contributes_to, colocalizes_with
            if 'NOT' in qualifier:
                continue

            db = self.clean_db_prefix(db)
            uniprotid = None
            gene_id = None
            if db == 'UniProtKB':
                # id_map defaults to None, so guard before calling .get()
                mapped_ids = \
                    id_map.get(gene_num) if id_map is not None else None
                # skip unmapped ids, empty mappings and ids mapped to
                # more than one gene
                if not mapped_ids or len(mapped_ids) != 1:
                    continue
                gene_id = mapped_ids[0]
                uniprotid = ':'.join((db, gene_num))
                gene_num = re.sub(r'\w+\:', '', gene_id)
            elif db == 'MGI':
                gene_num = re.sub(r'MGI:', '', gene_num)
                gene_id = ':'.join((db, gene_num))
                gene_id = re.sub(r'MGI\:MGI\:', 'MGI:', gene_id)
            else:
                gene_id = ':'.join((db, gene_num))

            if self.testMode and not (
                    re.match(r'NCBIGene', gene_id) and
                    int(gene_num) in self.test_ids):
                continue

            model.addClassToGraph(gene_id, gene_symbol)
            if gene_name != '':
                model.addDescription(gene_id, gene_name)
            if gene_synonym != '':
                for syn in gene_synonym.split('|'):
                    model.addSynonym(gene_id, syn.strip())
            if '|' in taxon:
                # TODO add annotations with >1 taxon
                logger.info(
                    ">1 taxon (%s) on line %d. skipping", taxon, line_counter)
            else:
                tax_id = re.sub(r'taxon:', 'NCBITaxon:', taxon)
                geno.addTaxon(tax_id, gene_id)

            assoc = Assoc(g, self.name)
            assoc.set_subject(gene_id)
            assoc.set_object(go_id)

            eco_id = self.map_go_evidence_code_to_eco(eco_symbol)
            if eco_id is not None:
                assoc.add_evidence(eco_id)

            refs = ref.split('|')
            for r in refs:
                r = r.strip()
                if r != '':
                    prefix = re.split(r':', r)[0]
                    r = re.sub(prefix, self.clean_db_prefix(prefix), r)
                    r = re.sub(r'MGI\:MGI\:', 'MGI:', r)
                    reference = Reference(g, r)
                    if re.match(r'PMID', r):
                        ref_type = Reference.ref_types['journal_article']
                        reference.setType(ref_type)
                    reference.addRefToGraph()
                    assoc.add_source(r)

            # TODO add the source of the annotations from assigned by?

            # NOTE(review): an unrecognized aspect is only logged; the
            # association is still written with rel = None -- confirm.
            if aspect not in aspect_rel_map:
                logger.error("Aspect not recognized: %s", aspect)
            rel = aspect_rel_map.get(aspect)
            if aspect == 'F' and re.search(r'contributes_to', qualifier):
                rel = model.object_properties['contributes_to']
            assoc.set_relationship(rel)
            if uniprotid is not None:
                assoc.set_description('Mapped from ' + uniprotid)
            # object_type should be one of:
            # protein_complex; protein; transcript; ncRNA; rRNA; tRNA;
            # snRNA; snoRNA; any subtype of ncRNA in the Sequence Ontology.
            # If the precise product type is unknown,
            # gene_product should be used

            assoc.add_association_to_graph()

            # Derive G2P Associations from IMP annotations
            # in version 2.1 Pipe will indicate 'OR'
            # and Comma will indicate 'AND'.
            # in version 2.0, multiple values are separated by pipes
            # where the pipe has been used to mean 'AND'
            if eco_symbol == 'IMP' and with_or_from != '':
                withitems = with_or_from.split('|')
                phenotypeid = go_id + 'PHENOTYPE'
                # create phenotype associations
                for itm in withitems:
                    if itm == '' or re.match(
                            r'(UniProtKB|WBPhenotype|InterPro|HGNC)', itm):
                        logger.warning(
                            "Don't know what having a uniprot id "
                            "in the 'with' column means of %s", uniprotid)
                        continue
                    itm = re.sub(r'MGI\:MGI\:', 'MGI:', itm)
                    itm = re.sub(r'WB:', 'WormBase:', itm)

                    # for worms and fish, they might give a RNAi or MORPH
                    # in these cases make a reagent-targeted gene
                    if re.search('MRPHLNO|CRISPR|TALEN', itm):
                        targeted_gene_id = zfin.make_targeted_gene_id(
                            gene_id, itm)
                        geno.addReagentTargetedGene(
                            itm, gene_id, targeted_gene_id)
                        # TODO PYLINT why is this:
                        # Redefinition of assoc type from
                        # dipper.models.assoc.Association.Assoc to
                        # dipper.models.assoc.G2PAssoc.G2PAssoc
                        assoc = G2PAssoc(
                            g, self.name, targeted_gene_id, phenotypeid)
                    elif re.search(r'WBRNAi', itm):
                        targeted_gene_id = \
                            wbase.make_reagent_targeted_gene_id(gene_id, itm)
                        geno.addReagentTargetedGene(
                            itm, gene_id, targeted_gene_id)
                        assoc = G2PAssoc(
                            g, self.name, targeted_gene_id, phenotypeid)
                    else:
                        assoc = G2PAssoc(g, self.name, itm, phenotypeid)

                    for r in refs:
                        r = r.strip()
                        if r != '':
                            prefix = re.split(r':', r)[0]
                            r = re.sub(
                                prefix, self.clean_db_prefix(prefix), r)
                            r = re.sub(r'MGI\:MGI\:', 'MGI:', r)
                            assoc.add_source(r)
                            # experimental phenotypic evidence
                            assoc.add_evidence("ECO:0000059")
                    assoc.add_association_to_graph()
                    # TODO should the G2PAssoc be
                    # the evidence for the GO assoc?

            if not self.testMode and \
                    limit is not None and line_counter > limit:
                break

    return