def _process_phene_gene_row(self, row):
    """
    Add a "some variant of gene" to phene association for one table row.

    The raw gene and phene keys found in ``row`` are translated through
    ``self.id_hash``.  An anonymous variant locus is minted for the gene,
    registered through the genotype/model helpers, and used as the subject
    of a ``G2PAssoc`` whose object is the phene.  The gene id is finally
    added to ``self.annotated_genes`` for later lookup by orthology.

    The row is skipped when the phene or the gene cannot be resolved, or
    when running in test mode and the row is not part of the test id sets.

    :param row: mapping providing the 'gene_id' and 'phene_id' keys
    :return: None
    """
    geno = Genotype(self.g)
    model = Model(self.g)

    gene_id = self.id_hash['gene'].get(row['gene_id'])
    phene_id = self.id_hash['phene'].get(row['phene_id'])

    # occasionally some phenes are missing!  (ex: 406)
    # This must be tested before the other filters; otherwise the warning
    # below can never be reached.
    if phene_id is None:
        logger.warning("Phene id %s is missing", str(row['phene_id']))
        return

    if gene_id is None:
        return

    omia_id = self._get_omia_id_from_phene_id(phene_id)
    # in test mode keep only rows whose disease AND gene are both test ids
    if self.testMode and not (
            omia_id in self.test_ids['disease']
            and row['gene_id'] in self.test_ids['gene']):
        return

    gene_label = self.label_hash[gene_id]

    # some variant of gene_id has phenotype d
    vl = '_:' + re.sub(r'NCBIGene:', '', str(gene_id)) + 'VL'
    geno.addAllele(vl, 'some variant of ' + gene_label)
    geno.addAlleleOfGene(vl, gene_id)
    geno.addAffectedLocus(vl, gene_id)
    model.addBlankNodeAnnotation(vl)

    assoc = G2PAssoc(self.g, self.name, vl, phene_id)
    assoc.add_association_to_graph()

    # add the gene id to the set of annotated genes
    # for later lookup by orthology
    self.annotated_genes.add(gene_id)
def _process_phene_gene_row(self, row):
    """
    Add a "some variant of gene" to phene association for one table row.

    The raw gene and phene keys found in ``row`` are translated through
    ``self.id_hash``.  A blank-node id for an anonymous variant locus is
    built with ``self.make_id``, registered through the genotype/model
    helpers, and used as the subject of a ``G2PAssoc`` whose object is the
    phene.  The gene id is finally added to ``self.annotated_genes`` for
    later lookup by orthology.

    The row is skipped when the phene or the gene cannot be resolved, or
    when running in test mode and the row is not part of the test id sets.

    :param row: mapping providing the 'gene_id' and 'phene_id' keys
    :return: None
    """
    geno = Genotype(self.graph)
    model = Model(self.graph)

    gene_id = self.id_hash['gene'].get(row['gene_id'])
    phene_id = self.id_hash['phene'].get(row['phene_id'])

    # occasionally some phenes are missing!  (ex: 406)
    # This must be tested before the other filters; otherwise the warning
    # below can never be reached.
    if phene_id is None:
        LOG.warning("Phene id %s is missing", str(row['phene_id']))
        return

    if gene_id is None:
        return

    omia_id = self._get_omia_id_from_phene_id(phene_id)
    # in test mode keep only rows whose disease AND gene are both test ids
    if self.test_mode and not (
            omia_id in self.test_ids['disease']
            and row['gene_id'] in self.test_ids['gene']):
        return

    gene_label = self.label_hash[gene_id]

    # some variant of gene_id has phenotype d
    var = self.make_id(gene_id.split(':')[-1] + 'VL', '_')
    geno.addAllele(var, 'some variant of ' + gene_label)
    geno.addAlleleOfGene(var, gene_id)
    geno.addAffectedLocus(var, gene_id)
    model.addBlankNodeAnnotation(var)

    assoc = G2PAssoc(self.graph, self.name, var, phene_id)
    assoc.add_association_to_graph()

    # add the gene id to the set of annotated genes
    # for later lookup by orthology
    self.annotated_genes.add(gene_id)
def _build_gene_disease_model(
        self, gene_id, relation_id, disease_id, variant_label,
        consequence_predicate=None, consequence_id=None,
        allelic_requirement=None, pmids=None):
    """
    Build the gene (or gene-variant) to disease association.

    When both ``consequence_predicate`` and ``consequence_id`` are given, a
    variant blank node is described and used as the association subject;
    otherwise the gene itself is the subject.  An allelic requirement is
    only attached to the association in the gene (non-variant) case.

    :param gene_id: identifier of the gene
    :param relation_id: relation used for the association
    :param disease_id: identifier of the disease
    :param variant_label: label of the variant; also the seed of its bnode id
    :param consequence_predicate: predicate linking variant and consequence
    :param consequence_id: identifier of the molecular consequence
    :param allelic_requirement: identifier of the allelic requirement
    :param pmids: publication ids stored as the association source
    :return: None
    """
    model = Model(self.graph)
    geno = Genotype(self.graph)
    if pmids is None:
        pmids = []

    variant_bnode = self.make_id(variant_label, "_")
    is_variant = (
        consequence_predicate is not None and consequence_id is not None)
    subject_id = variant_bnode if is_variant else gene_id

    if is_variant:
        model.addTriple(variant_bnode, consequence_predicate, consequence_id)
        # Hack to add labels to terms that
        # don't exist in an ontology
        if consequence_id.startswith(':'):
            consequence_label = consequence_id.strip(':').replace('_', ' ')
            model.addLabel(consequence_id, consequence_label)

        # Typically we would type the variant using the
        # molecular consequence, but these are not specific
        # enough for us to make mappings (see translation table)
        model.addIndividualToGraph(
            variant_bnode, variant_label, self.globaltt['variant_locus'])
        geno.addAffectedLocus(variant_bnode, gene_id)
        model.addBlankNodeAnnotation(variant_bnode)

    assoc = G2PAssoc(
        self.graph, self.name, subject_id, disease_id, relation_id)
    assoc.source = pmids
    assoc.add_association_to_graph()

    # the allelic requirement is only recorded on gene-level associations
    if allelic_requirement is None or is_variant:
        return

    model.addTriple(
        assoc.assoc_id, self.globaltt['has_allelic_requirement'],
        allelic_requirement)
    if allelic_requirement.startswith(':'):
        requirement_label = allelic_requirement.strip(':').replace('_', ' ')
        model.addLabel(allelic_requirement, requirement_label)
def process_disease_association(self, limit):
    """
    Read the tab-delimited disease association file and add worm models.

    Rows beginning with '!' are treated as header lines.  For every
    remaining row that is not flagged 'NOT', an unspecified variant of the
    gene is created, wrapped in an experimental worm model, and associated
    with the disease through the 'model_of' relation.  A reference, when
    present, is added as the source, and the 'IEA' evidence symbol is
    translated to ECO:0000501.

    :param limit: outside of test mode, the maximum number of data rows to
        read; ``None`` reads the whole file
    :return: None
    """
    raw = '/'.join((self.rawdir, self.files['disease_assoc']['file']))

    g = self.testgraph if self.testMode else self.graph
    model = Model(g)
    logger.info("Processing disease models")
    geno = Genotype(g)
    line_counter = 0
    worm_taxon = 'NCBITaxon:6239'

    with open(raw, 'r') as csvfile:
        filereader = csv.reader(csvfile, delimiter='\t', quotechar='\"')
        for row in filereader:
            if re.match(r'!', ''.join(row)):  # header
                continue
            line_counter += 1

            # honour the caller's row limit (it was previously ignored);
            # as elsewhere, the limit does not apply in test mode
            if not self.testMode and limit is not None \
                    and line_counter > limit:
                break

            # unpacking also validates that the row has all 17 columns
            (db, gene_num, gene_symbol, is_not, disease_id, ref,
             eco_symbol, with_or_from, aspect, gene_name, gene_synonym,
             gene_class, taxon, date, assigned_by, blank, blank2) = row

            if self.testMode and gene_num not in self.test_ids['gene']:
                continue

            # TODO add NOT phenotypes
            if is_not == 'NOT':
                continue

            # WB WBGene00000001 aap-1 DOID:2583 PMID:19029536 IEA
            # ENSEMBL:ENSG00000145675|OMIM:615214 D Y110A7A.10 gene
            # taxon:6239 20150612 WB
            gene_id = 'WormBase:' + gene_num

            # make a variant of the gene
            vl = '_:' + '-'.join((gene_num, 'unspecified'))
            vl_label = 'some variant of ' + gene_symbol
            geno.addAffectedLocus(vl, gene_id)
            model.addBlankNodeAnnotation(vl)
            animal_id = geno.make_experimental_model_with_genotype(
                vl, vl_label, worm_taxon, 'worm')

            assoc = G2PAssoc(
                g, self.name, animal_id, disease_id,
                model.object_properties['model_of'])

            ref = re.sub(r'WB_REF:', 'WormBase:', ref)
            if ref != '':
                assoc.add_source(ref)

            eco_id = None
            if eco_symbol == 'IEA':
                eco_id = 'ECO:0000501'  # IEA is this now
            if eco_id is not None:
                assoc.add_evidence(eco_id)

            assoc.add_association_to_graph()
def process_disease_association(self, limit):
    """
    Read the tab-delimited disease association file and add worm models.

    Rows beginning with '!' are treated as header lines.  For every
    remaining row that is not flagged 'NOT', an unspecified variant of the
    gene is created, wrapped in an experimental worm model, and associated
    with the disease through the 'model_of' relation.  A reference, when
    present, is added as the source, and the 'IEA' evidence symbol is
    translated to ECO:0000501.

    :param limit: outside of test mode, the maximum number of data rows to
        read; ``None`` reads the whole file
    :return: None
    """
    raw = '/'.join((self.rawdir, self.files['disease_assoc']['file']))

    g = self.testgraph if self.testMode else self.graph
    model = Model(g)
    logger.info("Processing disease models")
    geno = Genotype(g)
    line_counter = 0
    worm_taxon = 'NCBITaxon:6239'

    with open(raw, 'r') as csvfile:
        filereader = csv.reader(csvfile, delimiter='\t', quotechar='\"')
        for row in filereader:
            if re.match(r'!', ''.join(row)):  # header
                continue
            line_counter += 1

            # honour the caller's row limit (it was previously ignored);
            # as elsewhere, the limit does not apply in test mode
            if not self.testMode and limit is not None \
                    and line_counter > limit:
                break

            # unpacking also validates that the row has all 17 columns
            (db, gene_num, gene_symbol, is_not, disease_id, ref,
             eco_symbol, with_or_from, aspect, gene_name, gene_synonym,
             gene_class, taxon, date, assigned_by, blank, blank2) = row

            if self.testMode and gene_num not in self.test_ids['gene']:
                continue

            # TODO add NOT phenotypes
            if is_not == 'NOT':
                continue

            # WB WBGene00000001 aap-1 DOID:2583 PMID:19029536 IEA
            # ENSEMBL:ENSG00000145675|OMIM:615214 D Y110A7A.10 gene
            # taxon:6239 20150612 WB
            gene_id = 'WormBase:' + gene_num

            # make a variant of the gene
            vl = '_:' + '-'.join((gene_num, 'unspecified'))
            vl_label = 'some variant of ' + gene_symbol
            geno.addAffectedLocus(vl, gene_id)
            model.addBlankNodeAnnotation(vl)
            animal_id = geno.make_experimental_model_with_genotype(
                vl, vl_label, worm_taxon, 'worm')

            assoc = G2PAssoc(
                g, self.name, animal_id, disease_id,
                model.object_properties['model_of'])

            ref = re.sub(r'WB_REF:', 'WormBase:', ref)
            if ref != '':
                assoc.add_source(ref)

            eco_id = None
            if eco_symbol == 'IEA':
                eco_id = 'ECO:0000501'  # IEA is this now
            if eco_id is not None:
                assoc.add_evidence(eco_id)

            assoc.add_association_to_graph()
def _make_pheno_assoc(self, g, gene_id, gene_symbol, disorder_num,
                      disorder_label, phene_key):
    """
    Add an association between a gene (or a variant of it) and a disorder.

    The relation is chosen from the first character of ``disorder_label``:
    '[' gives 'is_marker_for', '{' and '?' give 'contributes_to', anything
    else the default 'has_phenotype'.  For OMIM gene ids an anonymous
    alternate locus is created and used as the association subject; any
    other gene id is used directly.

    :param g: graph the triples are added to
    :param gene_id: gene curie; a leading 'OMIM:' triggers the alt locus
    :param gene_symbol: gene symbol used in the alt locus label; may be
        ``None`` or blank, in which case no label is written
    :param disorder_num: OMIM number of the disorder (string)
    :param disorder_label: disorder label, possibly prefixed with [, { or ?
    :param phene_key: phene mapping code translated to an ECO evidence id
    :return: None
    """
    geno = Genotype(g)
    model = Model(g)
    disorder_id = ':'.join(('OMIM', disorder_num))

    rel_id = model.object_properties['has_phenotype']  # default
    rel_label = 'causes'
    if re.match(r'\[', disorder_label):
        rel_id = model.object_properties['is_marker_for']
        rel_label = 'is a marker for'
    elif re.match(r'\{', disorder_label):
        rel_id = model.object_properties['contributes_to']
        rel_label = 'contributes to'
    elif re.match(r'\?', disorder_label):
        # this is a questionable mapping!  skip?
        rel_id = model.object_properties['contributes_to']
        rel_label = 'contributes to'

    evidence = self._map_phene_mapping_code_to_eco(phene_key)

    # we actually want the association between the gene and the disease
    # to be via an alternate locus not the "wildtype" gene itself.
    # so we make an anonymous alternate locus,
    # and put that in the association.
    # but we only need to do that in the cases when it's not an NCBIGene
    # (as that is a sequence feature itself)
    if re.match(r'OMIM:', gene_id):
        alt_locus = \
            '_:' + re.sub(r':', '', gene_id) + '-' + disorder_num + 'VL'
        # guard BEFORE stripping: a missing symbol must give "no label",
        # not an AttributeError
        symbol = gene_symbol.strip() if gene_symbol is not None else ''
        if symbol != '':
            alt_label = ' '.join(
                ('some variant of', symbol, 'that', rel_label,
                 disorder_label))
        else:
            alt_label = None

        model.addIndividualToGraph(
            alt_locus, alt_label, Genotype.genoparts['variant_locus'])
        geno.addAffectedLocus(alt_locus, gene_id)
        model.addBlankNodeAnnotation(alt_locus)
    else:
        # assume it's already been added
        alt_locus = gene_id

    assoc = G2PAssoc(g, self.name, alt_locus, disorder_id, rel_id)
    assoc.add_evidence(evidence)
    assoc.add_association_to_graph()
def _make_pheno_assoc(self, g, gene_id, gene_symbol, disorder_num,
                      disorder_label, phene_key):
    """
    Add an association between a gene (or a variant of it) and a disorder.

    The relation is chosen from the first character of ``disorder_label``:
    '[' gives 'is_marker_for', '{' and '?' give 'contributes_to', anything
    else the default 'has_phenotype'.  For OMIM gene ids an anonymous
    alternate locus is created and used as the association subject; any
    other gene id is used directly.

    :param g: graph the triples are added to
    :param gene_id: gene curie; a leading 'OMIM:' triggers the alt locus
    :param gene_symbol: gene symbol used in the alt locus label; may be
        ``None`` or blank, in which case no label is written
    :param disorder_num: OMIM number of the disorder (string)
    :param disorder_label: disorder label, possibly prefixed with [, { or ?
    :param phene_key: phene mapping code translated to an ECO evidence id
    :return: None
    """
    geno = Genotype(g)
    model = Model(g)
    disorder_id = ':'.join(('OMIM', disorder_num))

    rel_id = model.object_properties['has_phenotype']  # default
    rel_label = 'causes'
    if re.match(r'\[', disorder_label):
        rel_id = model.object_properties['is_marker_for']
        rel_label = 'is a marker for'
    elif re.match(r'\{', disorder_label):
        rel_id = model.object_properties['contributes_to']
        rel_label = 'contributes to'
    elif re.match(r'\?', disorder_label):
        # this is a questionable mapping!  skip?
        rel_id = model.object_properties['contributes_to']
        rel_label = 'contributes to'

    evidence = self._map_phene_mapping_code_to_eco(phene_key)

    # we actually want the association between the gene and the disease
    # to be via an alternate locus not the "wildtype" gene itself.
    # so we make an anonymous alternate locus,
    # and put that in the association.
    # but we only need to do that in the cases when it's not an NCBIGene
    # (as that is a sequence feature itself)
    if re.match(r'OMIM:', gene_id):
        alt_locus = \
            '_:' + re.sub(r':', '', gene_id) + '-' + disorder_num + 'VL'
        # guard BEFORE stripping: a missing symbol must give "no label",
        # not an AttributeError
        symbol = gene_symbol.strip() if gene_symbol is not None else ''
        if symbol != '':
            alt_label = ' '.join(
                ('some variant of', symbol, 'that', rel_label,
                 disorder_label))
        else:
            alt_label = None

        model.addIndividualToGraph(
            alt_locus, alt_label, Genotype.genoparts['variant_locus'])
        geno.addAffectedLocus(alt_locus, gene_id)
        model.addBlankNodeAnnotation(alt_locus)
    else:
        # assume it's already been added
        alt_locus = gene_id

    assoc = G2PAssoc(g, self.name, alt_locus, disorder_id, rel_id)
    assoc.add_evidence(evidence)
    assoc.add_association_to_graph()
def _process_diseasegene(self, limit):
    """
    Parse the disease-gene XML file and add variant-to-disorder associations.

    For every ``Disorder`` element the disorder is added as a class, each
    associated gene is added as a class together with its synonyms, and an
    anonymous "some variant of <gene>" locus is created and associated with
    the disorder through a ``G2PAssoc`` carrying an ECO evidence code.
    External references to Ensembl, HGNC and OMIM are added as equivalent
    classes of the gene.

    :param limit: outside of test mode, stop once more than this many
        Disorder elements have been processed; ``None`` means no limit
    :return: None
    """
    g = self.testgraph if self.testMode else self.graph
    line_counter = 0
    geno = Genotype(g)
    model = Model(g)

    myfile = '/'.join((self.rawdir, self.files['disease-gene']['file']))

    # curie prefixes for the external gene references that are handled
    xref_prefixes = {'Ensembl': 'ENSEMBL:', 'HGNC': 'HGNC:', 'OMIM': 'OMIM:'}

    # PYLINT complains iterparse deprecated,
    # but as of py 3.4 only the optional & unsupplied parse arg is.
    for _, elem in ET.iterparse(myfile):
        if elem.tag != 'Disorder':
            continue

        # get the element name and id, ignoreS element name
        # id = elem.get('id') # some internal identifier
        disorder_num = elem.find('OrphaNumber').text
        disorder_id = 'Orphanet:' + str(disorder_num)

        if self.testMode and disorder_id not in \
                config.get_config()['test_ids']['disease']:
            continue

        # count processed disorders so that ``limit`` can take effect
        # (the counter was previously never advanced)
        line_counter += 1

        disorder_label = elem.find('Name').text

        # make a hash of internal gene id to type for later lookup
        gene_iid_to_type = {}
        gene_list = elem.find('GeneList')
        for gene in gene_list.findall('Gene'):
            gene_iid = gene.get('id')
            gene_type = gene.find('GeneType').get('id')
            gene_iid_to_type[gene_iid] = gene_type

        # assuming that these are in the ontology
        model.addClassToGraph(disorder_id, disorder_label)

        assoc_list = elem.find('DisorderGeneAssociationList')
        for a in assoc_list.findall('DisorderGeneAssociation'):
            gene_iid = a.find('.//Gene').get('id')
            gene_name = a.find('.//Gene/Name').text
            gene_symbol = a.find('.//Gene/Symbol').text
            gene_num = a.find('./Gene/OrphaNumber').text
            gene_id = 'Orphanet:' + str(gene_num)
            gene_type_id = \
                self._map_gene_type_id(gene_iid_to_type[gene_iid])
            model.addClassToGraph(
                gene_id, gene_symbol, gene_type_id, gene_name)

            syn_list = a.find('./Gene/SynonymList')
            if int(syn_list.get('count')) > 0:
                for s in syn_list.findall('./Synonym'):
                    model.addSynonym(gene_id, s.text)

            dgtype = a.find('DisorderGeneAssociationType').get('id')
            rel_id = self._map_rel_id(dgtype)
            dg_label = a.find('./DisorderGeneAssociationType/Name').text
            if rel_id is None:
                logger.warning(
                    "Cannot map association type (%s) to RO " +
                    "for association (%s | %s).  Skipping.",
                    dg_label, disorder_label, gene_symbol)
                continue

            alt_locus_id = '_:' + gene_num + '-' + disorder_num + 'VL'
            alt_label = ' '.join(
                ('some variant of', gene_symbol.strip(),
                 'that is a', dg_label.lower(), disorder_label))

            model.addIndividualToGraph(
                alt_locus_id, alt_label, geno.genoparts['variant_locus'])
            geno.addAffectedLocus(alt_locus_id, gene_id)
            model.addBlankNodeAnnotation(alt_locus_id)

            # consider typing the gain/loss-of-function variants like:
            # http://sequenceontology.org/browser/current_svn/term/SO:0002054
            # http://sequenceontology.org/browser/current_svn/term/SO:0002053

            # use "assessed" status to issue an evidence code
            # FIXME I think that these codes are sub-optimal
            status_code = a.find('DisorderGeneAssociationStatus').get('id')
            # imported automatically asserted information
            # used in automatic assertion
            eco_id = 'ECO:0000323'
            # Assessed
            # TODO are these internal ids stable between releases?
            if status_code == '17991':
                # imported manually asserted information
                # used in automatic assertion
                eco_id = 'ECO:0000322'
            # Non-traceable author statement ECO_0000034
            # imported information in automatic assertion ECO_0000313

            assoc = G2PAssoc(g, self.name, alt_locus_id, disorder_id, rel_id)
            assoc.add_evidence(eco_id)
            assoc.add_association_to_graph()

            # NOTE(review): the equivalence is taken to be added once per
            # external reference (inside the loop) - confirm against the
            # un-flattened original.
            rlist = a.find('./Gene/ExternalReferenceList')
            for r in rlist.findall('ExternalReference'):
                prefix = xref_prefixes.get(r.find('Source').text)
                if prefix is None:
                    # skip the others for now; nothing is carried over
                    # from a previously seen reference
                    continue
                eqid = prefix + r.find('Reference').text
                model.addClassToGraph(eqid, None)
                model.addEquivalentClass(gene_id, eqid)

        elem.clear()  # empty the element

        # as in the other parsers, the limit only applies outside test mode
        if not self.testMode and limit is not None \
                and line_counter > limit:
            return
def _build_gene_disease_model(
        self, gene_id, relation_id, disease_id, variant_label,
        consequence_predicate=None, consequence_id=None,
        allelic_requirement=None, pmids=None):
    """
    Build the gene (or gene-variant) to disease association.

    When both ``consequence_predicate`` and ``consequence_id`` are given, a
    variant blank node is described and used as the association subject;
    otherwise the gene itself is the subject.  An allelic requirement is
    only attached to the association in the gene (non-variant) case.

    :param gene_id: identifier of the gene
    :param relation_id: relation used for the association
    :param disease_id: identifier of the disease
    :param variant_label: label of the variant; also the seed of its bnode id
    :param consequence_predicate: predicate linking variant and consequence
    :param consequence_id: identifier of the molecular consequence
    :param allelic_requirement: identifier of the allelic requirement
    :param pmids: publication ids stored as the association source
    :return: None
    """
    model = Model(self.graph)
    geno = Genotype(self.graph)
    sources = pmids if pmids is not None else []

    variant_bnode = self.make_id(variant_label, "_")
    has_consequence = not (
        consequence_predicate is None or consequence_id is None)

    if has_consequence:
        subject_id = variant_bnode
        model.addTriple(variant_bnode, consequence_predicate, consequence_id)
        # Hack to add labels to terms that
        # don't exist in an ontology
        if consequence_id.startswith(':'):
            model.addLabel(
                consequence_id,
                consequence_id.strip(':').replace('_', ' '))

        # Typically we would type the variant using the
        # molecular consequence, but these are not specific
        # enough for us to make mappings (see translation table)
        model.addIndividualToGraph(
            variant_bnode, variant_label, self.globaltt['variant_locus'])
        geno.addAffectedLocus(variant_bnode, gene_id)
        model.addBlankNodeAnnotation(variant_bnode)
    else:
        subject_id = gene_id

    assoc = G2PAssoc(
        self.graph, self.name, subject_id, disease_id, relation_id)
    assoc.source = sources
    assoc.add_association_to_graph()

    # the allelic requirement is only recorded on gene-level associations
    if has_consequence or allelic_requirement is None:
        return

    model.addTriple(
        assoc.assoc_id, self.globaltt['has_allelic_requirement'],
        allelic_requirement)
    if allelic_requirement.startswith(':'):
        model.addLabel(
            allelic_requirement,
            allelic_requirement.strip(':').replace('_', ' '))
def _process_disease2gene(self, row):
    """
    Here, we process the disease-to-gene associations.
    Note that we ONLY process direct associations
    (not inferred through chemicals).
    Furthermore, we also ONLY process "marker/mechanism" associations.

    We preferentially utilize OMIM identifiers over MESH identifiers
    for disease/phenotype.
    Therefore, if a single OMIM id is listed under the "omim_ids" list,
    we will choose this over any MeSH id that might be listed as
    the disease_id. If multiple OMIM ids are listed in the omim_ids column,
    we toss this for now.
    (Mostly, we are not sure what to do with this information.)

    We associate "some variant of gene X" with the phenotype,
    rather than the gene directly.

    We also pull in the MeSH labels here (but not OMIM) to ensure that
    we have them (as they may not be brought in separately).

    :param row: sequence of nine columns: gene symbol, gene id,
        disease name, disease id, direct evidence, inference chemical name,
        inference score, '|'-separated OMIM ids, pubmed ids
    :return: None
    """
    model = Model(self.g)
    (gene_symbol, gene_id, disease_name, disease_id, direct_evidence,
     inference_chemical_name, inference_score, omim_ids, pubmed_ids) = row

    # we only want the direct associations; skipping inferred for now
    # there are three kinds of direct evidence:
    # (marker/mechanism | marker/mechanism|therapeutic | therapeutic)
    # we are only using the "marker/mechanism" for now
    # TODO what does it mean for a gene to be therapeutic for disease?
    # a therapeutic target?
    if direct_evidence != 'marker/mechanism':
        return

    # scrub some of the associations...
    # it seems odd to link human genes to the following "diseases"
    diseases_to_scrub = {
        'MESH:D004283',  # dog diseases
        'MESH:D004195',  # disease models, animal
        'MESH:D030342',  # genetic diseases, inborn
        'MESH:D040181',  # genetic diseases, x-linked
        'MESH:D020022',  # genetic predisposition to a disease
    }
    if disease_id in diseases_to_scrub:
        logger.info(
            "Skipping association between NCBIGene:%s and %s",
            str(gene_id), disease_id)
        return

    # The test-set filter is only needed (and only built) in test mode.
    if self.testMode:
        candidate_ids = {'OMIM:' + str(i) for i in omim_ids.split('|')}
        candidate_ids.add(disease_id)
        if int(gene_id) not in self.test_geneids \
                or candidate_ids.isdisjoint(self.test_diseaseids):
            return

    gene_id = 'NCBIGene:' + gene_id

    preferred_disease_id = disease_id
    if omim_ids is not None and omim_ids != '':
        omim_id_list = re.split(r'\|', omim_ids)
        # If there is only one OMIM ID for the Disease ID
        # or in the omim_ids list,
        # use the OMIM ID preferentially over any MeSH ID.
        if re.match(r'OMIM:.*', disease_id):
            if len(omim_id_list) > 1:
                # the disease ID is an OMIM ID and
                # there is more than one OMIM entry in omim_ids.
                # Currently no entries satisfy this condition
                pass
            elif disease_id != ('OMIM:' + omim_ids):
                # the disease ID is an OMIM ID and
                # there is only one non-equiv OMIM entry in omim_ids
                # we preferentially use the disease_id here
                logger.warning(
                    "There may be alternate identifier for %s: %s",
                    disease_id, omim_ids)
                # TODO: What should be done with the alternate disease IDs?
        elif len(omim_id_list) == 1:
            # the disease ID is not an OMIM ID
            # and there is only one OMIM entry in omim_ids.
            preferred_disease_id = 'OMIM:' + omim_ids
        # otherwise the disease ID is not an OMIM ID and there is more
        # than one OMIM entry in omim_ids: keep the given disease ID

    # we actually want the association between the gene and the disease
    # to be via an alternate locus not the "wildtype" gene itself. So we
    # make an anonymous alternate locus, and put that in the association.
    alt_id = gene_id + '-' + preferred_disease_id + 'VL'
    # can't have colons in the bnodes
    alt_locus = "_:" + re.sub(r':', '', alt_id)
    alt_label = 'some variant of ' + gene_symbol + ' that is ' \
        + direct_evidence + ' for ' + disease_name
    model.addIndividualToGraph(
        alt_locus, alt_label, self.geno.genoparts['variant_locus'])
    # assume that the label gets added elsewhere
    model.addClassToGraph(gene_id, None)
    self.geno.addAffectedLocus(alt_locus, gene_id)
    model.addBlankNodeAnnotation(alt_locus)

    # not sure if MESH is getting added separately.
    # adding labels here for good measure
    dlabel = None
    if re.match(r'MESH', preferred_disease_id):
        dlabel = disease_name
    model.addClassToGraph(preferred_disease_id, dlabel)

    # Add the disease to gene relationship.
    rel_id = self._get_relationship_id(direct_evidence)
    refs = self._process_pubmed_ids(pubmed_ids)

    self._make_association(alt_locus, preferred_disease_id, rel_id, refs)
def _process_omim2gene(self, limit=None):
    """
    Read the OMIM-to-KEGG-gene link file and act on each row's link type.

    * ``equivalent``: the OMIM id is added as a class and the KEGG id as a
      gene; unless ``DipperUtil.is_omim_disease`` reports the OMIM id as a
      disease, the two are also declared equivalent classes.
    * ``reverse``: a variant locus of the KEGG gene is created and
      associated with the OMIM id through the 'is marker for' relation.
    * ``original``: only logged (sometimes a gene, sometimes a disease).
    * anything else: logged as a warning.

    :param limit: outside of test mode, stop once the number of rows read
        exceeds this value; ``None`` reads the whole file
    :return: None
    """
    LOG.info("Processing OMIM to KEGG gene")
    graph = self.testgraph if self.test_mode else self.graph
    model = Model(graph)
    geno = Genotype(graph)
    raw = '/'.join((self.rawdir, self.files['omim2gene']['file']))

    with open(raw, 'r', encoding="iso-8859-1") as csvfile:
        filereader = csv.reader(csvfile, delimiter='\t', quotechar='\"')
        for rows_read, row in enumerate(filereader, start=1):
            kegg_gene_id, omim_id, link_type = row

            if self.test_mode and kegg_gene_id not in self.test_ids['genes']:
                continue

            kegg_gene_id = 'KEGG-' + kegg_gene_id.strip()
            omim_id = re.sub(r'omim', 'OMIM', omim_id)

            if link_type == 'equivalent':
                # these are genes!
                # so add them as a class then make equivalence
                model.addClassToGraph(omim_id, None)
                geno.addGene(kegg_gene_id, None)
                if not DipperUtil.is_omim_disease(omim_id):
                    model.addEquivalentClass(kegg_gene_id, omim_id)
            elif link_type == 'reverse':
                # make an association between an OMIM ID & the KEGG gene ID
                # we do this with omim ids because
                # they are more atomic than KEGG ids
                variant_id = self._make_variant_locus_id(
                    kegg_gene_id, omim_id)
                variant_label = self.label_hash[variant_id]
                model.addIndividualToGraph(
                    variant_id, variant_label,
                    self.globaltt['variant_locus'])
                geno.addAffectedLocus(variant_id, kegg_gene_id)
                model.addBlankNodeAnnotation(variant_id)

                # Add the disease to gene relationship.
                marker_rel = self.globaltt['is marker for']
                G2PAssoc(
                    graph, self.name, variant_id, omim_id, marker_rel
                ).add_association_to_graph()
            elif link_type == 'original':
                # these are sometimes a gene, and sometimes a disease
                LOG.info(
                    'Unable to handle original link for %s-%s',
                    kegg_gene_id, omim_id)
            else:
                # don't know what these are
                LOG.warning(
                    'Unhandled link type for %s-%s: %s',
                    kegg_gene_id, omim_id, link_type)

            # the row limit is ignored in test mode
            if not self.test_mode and limit is not None \
                    and rows_read > limit:
                break

    LOG.info("Done with OMIM to KEGG gene")
def _process_kegg_disease2gene(self, limit=None):
    """
    Associate KEGG diseases with variants of their KEGG genes.

    Being conservative, only diseases that are absent from
    ``self.kegg_disease_hash`` are processed, and those whose label
    contains 'includ' (grouping classes) are skipped.  Each remaining
    disease is added as a class typed as a disease, and a variant locus of
    the gene is created and associated with the disease through the
    'is marker for' relation.

    :param limit: outside of test mode, stop once the number of rows read
        exceeds this value; ``None`` reads the whole file
    :return: None
    """
    LOG.info("Processing KEGG disease to gene")
    graph = self.testgraph if self.test_mode else self.graph
    model = Model(graph)
    geno = Genotype(graph)
    rel = self.globaltt['is marker for']
    diseases_without_omim = set()
    raw = '/'.join((self.rawdir, self.files['disease_gene']['file']))

    with open(raw, 'r', encoding="iso-8859-1") as csvfile:
        filereader = csv.reader(csvfile, delimiter='\t', quotechar='\"')
        for rows_read, row in enumerate(filereader, start=1):
            gene_id, disease_id = row

            if self.test_mode and gene_id not in self.test_ids['genes']:
                continue

            gene_id = 'KEGG-' + gene_id.strip()
            disease_id = 'KEGG-' + disease_id.strip()

            # only add diseases for which
            # there is no omim id and not a grouping class
            if disease_id not in self.kegg_disease_hash:
                # add as a class
                disease_label = self.label_hash.get(disease_id)
                if re.search(r'includ', str(disease_label)):
                    # they use 'including' when it's a grouping class
                    LOG.info(
                        "Skipping this association because " +
                        "it's a grouping class: %s",
                        disease_label)
                    continue

                # type this disease_id as a disease
                model.addClassToGraph(
                    disease_id, disease_label, self.globaltt['disease'])
                diseases_without_omim.add(disease_id)

                variant_id = self._make_variant_locus_id(gene_id, disease_id)
                variant_label = self.label_hash[variant_id]
                model.addIndividualToGraph(
                    variant_id, variant_label,
                    self.globaltt['variant_locus'])
                geno.addAffectedLocus(variant_id, gene_id)
                model.addBlankNodeAnnotation(variant_id)

                # Add the disease to gene relationship.
                G2PAssoc(
                    graph, self.name, variant_id, disease_id, rel
                ).add_association_to_graph()

            # the row limit is ignored in test mode
            if not self.test_mode and limit is not None \
                    and rows_read > limit:
                break

    LOG.info("Done with KEGG disease to gene")
    LOG.info("Found %d diseases with no omim id", len(diseases_without_omim))
def _process_diseasegene(self, limit):
    """
    Parse the disease-gene XML file and add variant-to-disorder associations.

    For every ``Disorder`` element the disorder is added as a class, each
    associated gene is added as a class together with its synonyms, and an
    anonymous "some variant of <gene>" locus is created and associated with
    the disorder through a ``G2PAssoc`` carrying an ECO evidence code.
    External references to Ensembl, HGNC and OMIM are added as equivalent
    classes of the gene.

    :param limit: outside of test mode, stop once more than this many
        Disorder elements have been processed; ``None`` means no limit
    :return: None
    """
    g = self.testgraph if self.testMode else self.graph
    line_counter = 0
    geno = Genotype(g)
    model = Model(g)

    myfile = '/'.join((self.rawdir, self.files['disease-gene']['file']))

    # curie prefixes for the external gene references that are handled
    xref_prefixes = {'Ensembl': 'ENSEMBL:', 'HGNC': 'HGNC:', 'OMIM': 'OMIM:'}

    # PYLINT complains iterparse deprecated,
    # but as of py 3.4 only the optional & unsupplied parse arg is.
    for _, elem in ET.iterparse(myfile):
        if elem.tag != 'Disorder':
            continue

        # get the element name and id, ignoreS element name
        # id = elem.get('id') # some internal identifier
        disorder_num = elem.find('OrphaNumber').text
        disorder_id = 'Orphanet:' + str(disorder_num)

        if self.testMode and disorder_id not in \
                config.get_config()['test_ids']['disease']:
            continue

        # count processed disorders so that ``limit`` can take effect
        # (the counter was previously never advanced)
        line_counter += 1

        disorder_label = elem.find('Name').text

        # make a hash of internal gene id to type for later lookup
        gene_iid_to_type = {}
        gene_list = elem.find('GeneList')
        for gene in gene_list.findall('Gene'):
            gene_iid = gene.get('id')
            gene_type = gene.find('GeneType').get('id')
            gene_iid_to_type[gene_iid] = gene_type

        # assuming that these are in the ontology
        model.addClassToGraph(disorder_id, disorder_label)

        assoc_list = elem.find('DisorderGeneAssociationList')
        for a in assoc_list.findall('DisorderGeneAssociation'):
            gene_iid = a.find('.//Gene').get('id')
            gene_name = a.find('.//Gene/Name').text
            gene_symbol = a.find('.//Gene/Symbol').text
            gene_num = a.find('./Gene/OrphaNumber').text
            gene_id = 'Orphanet:' + str(gene_num)
            gene_type_id = \
                self._map_gene_type_id(gene_iid_to_type[gene_iid])
            model.addClassToGraph(
                gene_id, gene_symbol, gene_type_id, gene_name)

            syn_list = a.find('./Gene/SynonymList')
            if int(syn_list.get('count')) > 0:
                for s in syn_list.findall('./Synonym'):
                    model.addSynonym(gene_id, s.text)

            dgtype = a.find('DisorderGeneAssociationType').get('id')
            rel_id = self._map_rel_id(dgtype)
            dg_label = a.find('./DisorderGeneAssociationType/Name').text
            if rel_id is None:
                logger.warning(
                    "Cannot map association type (%s) to RO " +
                    "for association (%s | %s).  Skipping.",
                    dg_label, disorder_label, gene_symbol)
                continue

            alt_locus_id = '_:' + gene_num + '-' + disorder_num + 'VL'
            alt_label = ' '.join(
                ('some variant of', gene_symbol.strip(),
                 'that is a', dg_label.lower(), disorder_label))

            model.addIndividualToGraph(
                alt_locus_id, alt_label, geno.genoparts['variant_locus'])
            geno.addAffectedLocus(alt_locus_id, gene_id)
            model.addBlankNodeAnnotation(alt_locus_id)

            # consider typing the gain/loss-of-function variants like:
            # http://sequenceontology.org/browser/current_svn/term/SO:0002054
            # http://sequenceontology.org/browser/current_svn/term/SO:0002053

            # use "assessed" status to issue an evidence code
            # FIXME I think that these codes are sub-optimal
            status_code = a.find('DisorderGeneAssociationStatus').get('id')
            # imported automatically asserted information
            # used in automatic assertion
            eco_id = 'ECO:0000323'
            # Assessed
            # TODO are these internal ids stable between releases?
            if status_code == '17991':
                # imported manually asserted information
                # used in automatic assertion
                eco_id = 'ECO:0000322'
            # Non-traceable author statement ECO_0000034
            # imported information in automatic assertion ECO_0000313

            assoc = G2PAssoc(g, self.name, alt_locus_id, disorder_id, rel_id)
            assoc.add_evidence(eco_id)
            assoc.add_association_to_graph()

            # NOTE(review): the equivalence is taken to be added once per
            # external reference (inside the loop) - confirm against the
            # un-flattened original.
            rlist = a.find('./Gene/ExternalReferenceList')
            for r in rlist.findall('ExternalReference'):
                prefix = xref_prefixes.get(r.find('Source').text)
                if prefix is None:
                    # skip the others for now; nothing is carried over
                    # from a previously seen reference
                    continue
                eqid = prefix + r.find('Reference').text
                model.addClassToGraph(eqid, None)
                model.addEquivalentClass(gene_id, eqid)

        elem.clear()  # empty the element

        # as in the other parsers, the limit only applies outside test mode
        if not self.testMode and limit is not None \
                and line_counter > limit:
            return
def _process_omim2gene(self, limit=None):
    """
    Read the OMIM-to-KEGG-gene link file and act on each row's link type.

    * ``equivalent``: the OMIM id is added as a class and the KEGG id as a
      gene.  If the OMIM id has replacements in ``self.omim_replaced``, the
      last replacement typed as a gene in ``self.omim_type`` takes its
      place; the equivalence is only asserted when the resulting OMIM id
      is typed as a gene.
    * ``reverse``: a variant locus of the KEGG gene is created and
      associated with the OMIM id through the 'is marker for' relation.
    * ``original``: only logged (sometimes a gene, sometimes a disease).
    * anything else: logged as a warning.

    :param limit: outside of test mode, stop once the reader's line number
        exceeds this value; ``None`` reads the whole file
    :return: None
    """
    LOG.info("Processing OMIM to KEGG gene")
    graph = self.testgraph if self.test_mode else self.graph
    model = Model(graph)
    geno = Genotype(graph)
    raw = '/'.join((self.rawdir, self.files['omim2gene']['file']))

    with open(raw, 'r', encoding="iso-8859-1") as csvfile:
        reader = csv.reader(csvfile, delimiter='\t', quotechar='\"')
        for row in reader:
            kegg_gene_id, omim_id, link_type = row

            if self.test_mode and kegg_gene_id not in self.test_ids['genes']:
                continue

            kegg_gene_id = 'KEGG-' + kegg_gene_id.strip()
            omim_id = re.sub(r'omim', 'OMIM', omim_id)

            if link_type == 'equivalent':
                # these are genes!
                # so add them as a class then make equivalence
                model.addClassToGraph(omim_id, None)
                geno.addGene(kegg_gene_id, None)

                # previous: if omim type is not disease-ish then use
                # now is:   if omim type is gene then use
                if omim_id in self.omim_replaced:
                    for candidate in self.omim_replaced[omim_id]:
                        if candidate in self.omim_type and \
                                self.omim_type[candidate] == \
                                self.globaltt['gene']:
                            omim_id = candidate

                if omim_id in self.omim_type and \
                        self.omim_type[omim_id] == self.globaltt['gene']:
                    model.addEquivalentClass(kegg_gene_id, omim_id)
            elif link_type == 'reverse':
                # make an association between an OMIM ID & the KEGG gene ID
                # we do this with omim ids because
                # they are more atomic than KEGG ids
                variant_id = self._make_variant_locus_id(
                    kegg_gene_id, omim_id)
                variant_label = self.label_hash[variant_id]
                model.addIndividualToGraph(
                    variant_id, variant_label,
                    self.globaltt['variant_locus'])
                geno.addAffectedLocus(variant_id, kegg_gene_id)
                model.addBlankNodeAnnotation(variant_id)

                # Add the disease to gene relationship.
                marker_rel = self.globaltt['is marker for']
                G2PAssoc(
                    graph, self.name, variant_id, omim_id, marker_rel
                ).add_association_to_graph()
            elif link_type == 'original':
                # these are sometimes a gene, and sometimes a disease
                LOG.info(
                    'Unable to handle original link for %s-%s',
                    kegg_gene_id, omim_id)
            else:
                # don't know what these are
                LOG.warning(
                    'Unhandled link type for %s-%s: %s',
                    kegg_gene_id, omim_id, link_type)

            # the row limit is ignored in test mode
            if not self.test_mode and limit is not None \
                    and reader.line_num > limit:
                break

    LOG.info("Done with OMIM to KEGG gene")
def _process_kegg_disease2gene(self, limit=None):
    """
    Read the KEGG disease-to-gene file and associate diseases with genes
    through a variant locus.

    Each row is expected to hold exactly two tab-separated fields:
    gene id and disease id; both get a 'KEGG-' prefix.

    Deliberately conservative: a row is only processed when its disease
    id is absent from ``self.kegg_disease_hash`` (reported at the end as
    "diseases with no omim id").  Diseases whose label contains 'includ'
    are treated as grouping classes and skipped.

    Triples created:
    <disease_id> is a Class
    <alternate_locus> is an Individual
    <alternate_locus> has type <variant_locus>
    <alternate_locus> has affected locus <gene_id>
    <assoc_id> has subject <alternate_locus>
    <assoc_id> has object <disease_id>

    :param limit: outside of test mode, stop once the csv reader's line
        number exceeds this value; None means read the whole file
    :return: None
    """
    LOG.info("Processing KEGG disease to gene")

    graph = self.testgraph if self.test_mode else self.graph
    model = Model(graph)
    geno = Genotype(graph)
    marker_rel = self.globaltt['is marker for']
    diseases_without_omim = set()

    src_path = '/'.join((self.rawdir, self.files['disease_gene']['file']))
    with open(src_path, 'r', encoding="iso-8859-1") as tsv:
        reader = csv.reader(tsv, delimiter='\t', quotechar='\"')
        for record in reader:
            raw_gene, raw_disease = record

            # the test id list is matched against the raw, unprefixed id
            if self.test_mode and raw_gene not in self.test_ids['genes']:
                continue

            gene_id = 'KEGG-' + raw_gene.strip()
            disease_id = 'KEGG-' + raw_disease.strip()

            # restrict to diseases that are not in the disease hash and
            # that are not grouping classes
            if disease_id not in self.kegg_disease_hash:
                disease_label = (
                    self.label_hash[disease_id]
                    if disease_id in self.label_hash else None)

                # KEGG words grouping classes with 'including'
                if re.search(r'includ', str(disease_label)):
                    LOG.info(
                        "Skipping association because it's a grouping class: %s",
                        disease_label)
                    # note: this also bypasses the limit check below
                    continue

                # declare the disease (no explicit disease class type)
                model.addClassToGraph(disease_id, disease_label)
                diseases_without_omim.add(disease_id)

                locus_id = self._make_variant_locus_id(gene_id, disease_id)
                locus_label = self.label_hash[locus_id]
                model.addIndividualToGraph(
                    locus_id, locus_label, self.globaltt['variant_locus'])
                geno.addAffectedLocus(locus_id, gene_id)
                model.addBlankNodeAnnotation(locus_id)

                # the disease-to-gene relationship itself
                G2PAssoc(
                    graph, self.name, locus_id, disease_id, marker_rel
                ).add_association_to_graph()

            if (not self.test_mode
                    and limit is not None
                    and reader.line_num > limit):
                break

    LOG.info("Done with KEGG disease to gene")
    LOG.info(
        "Found %d diseases with no omim id", len(diseases_without_omim))
def add_gene_to_disease(self, association_type, gene_id, gene_symbol,
                        disease_id, eco_id):
    """
    Add a gene (or gene variant) to disease association, driven by the
    Orphanet DisorderGeneAssociationType name.

    The association type is combined with three suffixes to form keys
    into ``self.localtt``:
        - "<type>|gene phenotype"        (required) the relation to use
        - "<type>|function consequence"  (optional) e.g. loss / gain
        - "<type>|cell origin"           (optional) e.g. germline / somatic

    When either optional key is present, the association subject is a
    variant-locus blank node of the gene carrying those attributes;
    otherwise the subject is the gene itself.

    Association type names seen in en_product6.xml (listed with
    xmlstarlet sel -t -v on
    /JDBOR/DisorderList/Disorder/DisorderGeneAssociationList/
    DisorderGeneAssociation/DisorderGeneAssociationType/Name, piped
    through sort -u):
        Biomarker tested in
        Candidate gene tested in
        Disease-causing germline mutation(s) (gain of function) in
        Disease-causing germline mutation(s) in
        Disease-causing germline mutation(s) (loss of function) in
        Disease-causing somatic mutation(s) in
        Major susceptibility factor in
        Modifying germline mutation in
        Part of a fusion gene in
        Role in the phenotype of

    The id-label pairs have looked stable for years, but the mapping is
    keyed on the label rather than the id in case Orphanet renumbers.

    :param association_type: {str} DisorderGeneAssociationType/Name,
        eg Role in the phenotype of
    :param gene_id: {str} gene id as curie
    :param gene_symbol: {str} HGVS gene symbol
    :param disease_id: {str} disease id as curie
    :param eco_id: {str} eco code as curie
    :raises ValueError: if "<association_type>|gene phenotype" has no
        entry in the local translation table
    :return: None
    """
    model = Model(self.graph)
    geno = Genotype(self.graph)

    # seed for the variant blank-node id; attribute labels get appended
    variant_id_string = "{}{}".format(gene_id, disease_id)

    # fail loudly on unmapped or new terms so they do not slip by unseen
    if "{}|gene phenotype".format(association_type) not in self.localtt:
        raise ValueError(
            'Disease-gene association type {} not mapped'.format(
                association_type))
    g2p_relation = self.resolve(
        "|".join([association_type, "gene phenotype"]))

    # optional variant attributes
    functional_consequence = None
    cell_origin = None

    consequence_key = "|".join([association_type, "function consequence"])
    has_consequence = consequence_key in self.localtt
    if has_consequence:
        functional_consequence = self.resolve(consequence_key)
        consequence_label = self.localtt[consequence_key]

    origin_key = "|".join([association_type, "cell origin"])
    has_origin = origin_key in self.localtt
    if has_origin:
        cell_origin = self.resolve(origin_key)
        origin_label = self.localtt[origin_key]

    if not (has_consequence or has_origin):
        # nothing known about the variant: associate the gene directly
        subject_id = gene_id
    else:
        # something is known about the variant, so describe it with a
        # blank node and hang the attributes off that
        variant_label = "of {}".format(gene_symbol)
        if functional_consequence:
            variant_label = "{} {}".format(
                consequence_label.replace('_', ' '), variant_label)
            variant_id_string += consequence_label
        else:
            variant_label = "variant {}".format(variant_label)

        if cell_origin:
            variant_label = "{} {}".format(origin_label, variant_label)
            variant_id_string += origin_label

        subject_id = self.make_id(variant_id_string, "_")
        model.addIndividualToGraph(
            subject_id, variant_label, self.globaltt['variant_locus'])
        geno.addAffectedLocus(subject_id, gene_id)
        model.addBlankNodeAnnotation(subject_id)
        self._add_variant_attributes(
            subject_id, functional_consequence, cell_origin)

    assoc = G2PAssoc(
        self.graph, self.name, subject_id, disease_id, g2p_relation)
    assoc.add_evidence(eco_id)
    assoc.add_association_to_graph()