def fetch(self, is_dl_forced=False):
    """
    Retrieve this source's files, then the HGNC files as well.

    :param is_dl_forced: handed unchanged to ``get_files`` and to the
        HGNC source's ``fetch``
    :return: None
    """
    self.get_files(is_dl_forced)

    # there is a dependency on the HGNC files, so fetch those too;
    # the HGNC source is built with this source's graph settings
    hgnc_source = HGNC(self.graph_type, self.are_bnodes_skolemized)
    hgnc_source.fetch(is_dl_forced)
def fetch(self, is_dl_forced=False):
    """
    Retrieve this source's files, then the HGNC files as well.

    :param is_dl_forced: handed unchanged to ``get_files`` and to the
        HGNC source's ``fetch``
    :return: None
    """
    self.get_files(is_dl_forced)

    # there is a dependency on the HGNC files, so fetch those too
    hgnc_source = HGNC()
    hgnc_source.fetch(is_dl_forced)
class HGNCTestCase(SourceTestCase):
    """Test case whose fixture is an HGNC source restricted to test ids."""

    def setUp(self):
        """Create the HGNC source and configure it for a test-only run."""
        self.source = HGNC('rdf_graph', True)
        # only the 'gene' entry of the test ids is relevant here
        gene_test_ids = self._get_testids()['gene']
        self.source.test_ids = gene_test_ids
        self.source.settestonly(True)
        self._setDirToSource()

    def tearDown(self):
        """Release the source created in setUp."""
        self.source = None
class HGNCTestCase(SourceTestCase):
    """Test case whose fixture is an HGNC source restricted to test ids."""

    def setUp(self):
        """Create the HGNC source and configure it for a test-only run."""
        self.source = HGNC()
        # only the 'gene' entry of the configured test ids is relevant here
        gene_test_ids = self._get_conf()['test_ids']['gene']
        self.source.test_ids = gene_test_ids
        self.source.settestonly(True)
        self._setDirToSource()

    def tearDown(self):
        """Release the source created in setUp."""
        self.source = None
def setUp(self):
    """Create the HGNC source and configure it for a test-only run."""
    self.source = HGNC('rdf_graph', True)
    # only the 'gene' entry of the test ids is relevant here
    gene_test_ids = self._get_testids()['gene']
    self.source.test_ids = gene_test_ids
    self.source.settestonly(True)
    self._setDirToSource()
def _process_ddg2p_annotations(self, limit):
    """
    The ddg2p annotations associate a gene symbol to an omim disease,
    along with some HPO ids and pubs. The gene symbols come from gencode,
    which in turn come from HGNC official gene symbols.  Therefore,
    we use the HGNC source class to get the id/symbol mapping for use
    in our annotations here.

    According to http://www.gencodegenes.org/faq.html,
    "Gene names are usually HGNC or MGI-approved gene symbols mapped
    to the GENCODE genes by the Ensembl xref pipeline. Sometimes,
    when there is no official gene symbol, the Havana clone-based
    name is used."

    The kind of variation that is linked to a disease is indicated
    (LOF, GOF, CNV, etc) in the source data.
    Here, we create an anonymous variant of the specified gene of
    the indicated type (mapped to the sequence ontology (SO)).

    :param limit: when not in test mode and not None, reading stops
        once more than ``limit`` lines have been read
    :return: None
    :raises ValueError: if a non-comment, non-blank row does not have
        exactly nine tab-separated columns
    """
    line_counter = 0
    g = self.g if self.g is not None else self.graph
    gu = GraphUtils(curie_map.get())

    # in order for this to work, we need to map the HGNC id-symbol;
    hgnc = HGNC()
    hgnc_symbol_id_map = hgnc.get_symbol_id_map()

    zip_path = '/'.join((self.rawdir, self.files['annot']['file']))

    # use the ddg2p.txt file
    fname = 'ddg2p.txt'

    unmapped_omim_counter = 0
    unmapped_gene_count = 0

    # Both the archive and the member are context-managed so that they
    # are closed even when a row fails to parse or a graph call raises.
    with ZipFile(zip_path, 'r') as myzip, myzip.open(fname, 'r') as raw:
        reader = csv.reader(
            io.TextIOWrapper(raw), delimiter='\t', quotechar='"')
        for row in reader:
            line_counter += 1
            # csv.reader yields an empty list for a blank line;
            # skip those along with comment lines
            if not row or row[0].startswith('#'):
                continue

            (gencode_gene_name, mode, category, consequence, disease,
             omim, ddg2p_id, pubmed_ids, hpo_codes) = row

            hgnc_id = hgnc_symbol_id_map.get(gencode_gene_name.strip())
            if hgnc_id is None:
                logger.error(
                    "Couldn't map the gene symbol %s to HGNC.",
                    gencode_gene_name)
                unmapped_gene_count += 1
                continue

            # add the gene
            gu.addClassToGraph(g, hgnc_id, gencode_gene_name)

            # TODO make VSLC with the variation
            #   to associate with the disorder
            # TODO use the Inheritance and Mutation consequence
            #   to classify the VSLCs
            allele_id = self.make_allele_by_consequence(
                consequence, hgnc_id, gencode_gene_name)

            omim = omim.strip()
            if omim != '':
                omim_id = 'OMIM:' + omim
                # assume this is declared elsewhere in ontology
                gu.addClassToGraph(g, omim_id, None)

                # TODO choose a relation from the category
                #   (Confirmed/Probable -> has_phenotype,
                #   Possible -> contributes_to); it was previously
                #   looked up but never passed to the association
                if category.strip() == 'Not DD gene':
                    # TODO negative annotation
                    continue

                assoc = G2PAssoc(self.name, allele_id, omim_id)
                for pub in pubmed_ids.split(';'):
                    pub = pub.strip()
                    if pub != '':
                        pmid = 'PMID:' + pub
                        ref = Reference(
                            pmid, Reference.ref_types['journal_article'])
                        ref.addRefToGraph(g)
                        assoc.add_source(pmid)

                assoc.add_association_to_graph(g)
            else:
                # these are unmapped to a disease id.
                # note that some match OMIM disease labels
                # but the identifiers are just not included.
                # TODO consider mapping to OMIM or DOIDs in other ways
                logger.warning(
                    "No omim id on line %d\n%s", line_counter, str(row))
                unmapped_omim_counter += 1

            # TODO hpo phenotypes
            # since the DDG2P file is not documented,
            # I don't know what the HPO annotations are actually about
            # are they about the gene?  the omim disease?  something else?
            # So, we wont create associations until this is clarified

            if not self.testMode and limit is not None \
                    and line_counter > limit:
                break

    logger.warning(
        "gene-disorder associations with no omim id: %d",
        unmapped_omim_counter)
    logger.warning("unmapped gene count: %d", unmapped_gene_count)

    gu.loadProperties(g, G2PAssoc.object_properties, gu.OBJPROP)
    gu.loadProperties(g, G2PAssoc.datatype_properties, gu.DATAPROP)
    gu.loadProperties(g, G2PAssoc.annotation_properties, gu.ANNOTPROP)

    return
def _process_ddg2p_annotations(self, limit):
    """
    The ddg2p annotations associate a gene symbol to an omim disease,
    along with some HPO ids and pubs. The gene symbols come from gencode,
    which in turn come from HGNC official gene symbols.  Therefore,
    we use the HGNC source class to get the id/symbol mapping for use
    in our annotations here.

    According to http://www.gencodegenes.org/faq.html,
    "Gene names are usually HGNC or MGI-approved gene symbols mapped
    to the GENCODE genes by the Ensembl xref pipeline. Sometimes,
    when there is no official gene symbol, the Havana clone-based
    name is used."

    The kind of variation that is linked to a disease is indicated
    (LOF, GOF, CNV, etc) in the source data.
    Here, we create an anonymous variant of the specified gene of
    the indicated type (mapped to the sequence ontology (SO)).

    :param limit: when not in test mode and not None, reading stops
        once more than ``limit`` lines have been read
    :return: None
    :raises ValueError: if a non-comment, non-blank row does not have
        exactly nine tab-separated columns
    """
    line_counter = 0
    # the previous if/else assigned self.graph on both branches
    graph = self.graph

    # in order for this to work, we need to map the HGNC id-symbol;
    hgnc = HGNC(self.graph_type, self.are_bnodes_skolemized)
    hgnc_symbol_id_map = hgnc.get_symbol_id_map()

    zip_path = '/'.join((self.rawdir, self.files['annot']['file']))

    # use the ddg2p.txt file
    fname = 'ddg2p.txt'

    unmapped_omim_counter = 0
    unmapped_gene_count = 0

    # Both the archive and the member are context-managed so that they
    # are closed even when a row fails to parse or a graph call raises.
    with ZipFile(zip_path, 'r') as myzip, myzip.open(fname, 'r') as raw:
        reader = csv.reader(
            io.TextIOWrapper(raw), delimiter='\t', quotechar='"')
        for row in reader:
            line_counter += 1
            # csv.reader yields an empty list for a blank line;
            # skip those along with comment lines
            if not row or row[0].startswith('#'):
                continue

            (gencode_gene_name, mode, category, consequence, disease,
             omim, ddg2p_id, pubmed_ids, hpo_codes) = row

            hgnc_id = hgnc_symbol_id_map.get(gencode_gene_name.strip())
            if hgnc_id is None:
                LOG.error(
                    "Couldn't map the gene symbol %s to HGNC.",
                    gencode_gene_name)
                unmapped_gene_count += 1
                continue

            # add the gene
            self.model.addClassToGraph(hgnc_id, gencode_gene_name)

            # TODO make VSLC with the variation
            #   to associate with the disorder
            # TODO use the Inheritance and Mutation consequence
            #   to classify the VSLCs
            allele_id = self.make_allele_by_consequence(
                consequence, hgnc_id, gencode_gene_name)

            omim = omim.strip()
            if omim != '':
                omim_id = 'OMIM:' + omim
                # assume this is declared elsewhere in ontology
                self.model.addClassToGraph(omim_id, None)

                # TODO the category column (Confirmed / Probable /
                #   Possible / Not DD gene) is not used yet: no relation
                #   is chosen from it and 'Not DD gene' rows are not
                #   skipped or treated as negative annotations
                assoc = G2PAssoc(graph, self.name, allele_id, omim_id)
                for pub in pubmed_ids.split(';'):
                    pub = pub.strip()
                    if pub != '':
                        pmid = 'PMID:' + pub
                        ref = Reference(
                            graph, pmid, self.globaltt['journal article'])
                        ref.addRefToGraph()
                        assoc.add_source(pmid)

                assoc.add_association_to_graph()
            else:
                # these are unmapped to a disease id.
                # note that some match OMIM disease labels
                # but the identifiers are just not included.
                # TODO consider mapping to OMIM or DOIDs in other ways
                LOG.warning(
                    "No omim id on line %d\n%s", line_counter, str(row))
                unmapped_omim_counter += 1

            # TODO hpo phenotypes
            # since the DDG2P file is not documented,
            # I don't know what the HPO annotations are actually about
            # are they about the gene?  the omim disease?  something else?
            # So, we wont create associations until this is clarified

            if not self.test_mode and limit is not None \
                    and line_counter > limit:
                break

    LOG.warning(
        "gene-disorder associations with no omim id: %d",
        unmapped_omim_counter)
    LOG.warning("unmapped gene count: %d", unmapped_gene_count)

    return
def setUp(self):
    """Create the HGNC source and configure it for a test-only run."""
    self.source = HGNC()
    # only the 'gene' entry of the configured test ids is relevant here
    gene_test_ids = self._get_conf()['test_ids']['gene']
    self.source.test_ids = gene_test_ids
    self.source.settestonly(True)
    self._setDirToSource()