def _add_deprecated_snp(
        self, snp_id, snp_id_current, merged, chrom_num, chrom_pos):
    """
    Add deprecation information for a SNP, or make the SNP the leader.

    When ``merged`` is ``'1'`` and ``snp_id_current`` is not blank, the
    current identifier is built as ``dbSNP:rs<number>`` (``rs`` is only
    prepended when it is missing).  That identifier is recorded in
    ``self.id_location_map`` under the location returned by
    ``self._make_location_curie`` (when that is not None), ``snp_id`` is
    deprecated in favour of it, and it is passed to ``model.makeLeader``.
    In every other case ``snp_id`` itself is passed to ``makeLeader``.

    :param snp_id: identifier of the SNP being described
    :param snp_id_current: current rs identifier, with or without the
        leading ``rs``; surrounding whitespace is ignored
    :param merged: ``'1'`` when the SNP was merged into ``snp_id_current``
    :param chrom_num: forwarded to ``self._make_location_curie``
    :param chrom_pos: forwarded to ``self._make_location_curie``
    :return: None
    """
    graph = self.testgraph if self.test_mode else self.graph
    model = Model(graph)
    location = self._make_location_curie(chrom_num, chrom_pos)

    # Only touch snp_id_current for merged records, and strip it once so
    # the blank check and the identifier are built from the same value.
    current_num = snp_id_current.strip() if merged == '1' else ''
    if current_num == '':
        model.makeLeader(snp_id)
        return

    if current_num.startswith('rs'):
        current_rs_id = 'dbSNP:' + current_num
    else:
        current_rs_id = 'dbSNP:rs' + current_num

    if location is not None:
        # Store the identifier as one element; set(<str>) would instead
        # create a set of the identifier's individual characters.
        self.id_location_map.setdefault(location, set()).add(current_rs_id)

    model.addDeprecatedIndividual(snp_id, current_rs_id)
    # TODO check on this
    # should we add the annotations to the current or orig?
    model.makeLeader(current_rs_id)
def _add_deprecated_snp(
        self, snp_id, snp_id_current, merged, chrom_num, chrom_pos):
    """
    Record that a SNP id was merged into a current id, if it was.

    For a merged record (``merged == '1'``) with a non-blank
    ``snp_id_current``, the current id is rendered as
    ``dbSNP:rs<number>``, remembered in ``self.id_location_map`` for the
    location produced by ``self._make_location_curie`` (skipped when that
    returns None), ``snp_id`` is deprecated in its favour and the current
    id is handed to ``model.makeLeader``.  Otherwise ``snp_id`` is handed
    to ``model.makeLeader`` unchanged.

    :param snp_id: identifier of the SNP being described
    :param snp_id_current: current rs identifier; the ``rs`` prefix is
        optional and surrounding whitespace is ignored
    :param merged: ``'1'`` when the SNP was merged into ``snp_id_current``
    :param chrom_num: forwarded to ``self._make_location_curie``
    :param chrom_pos: forwarded to ``self._make_location_curie``
    :return: None
    """
    if self.testMode:
        g = self.testgraph
    else:
        g = self.graph
    model = Model(g)
    location = self._make_location_curie(chrom_num, chrom_pos)

    is_merged = merged == '1'
    # snp_id_current is only inspected for merged records; the stripped
    # value is used for both the blank test and the identifier.
    replacement = snp_id_current.strip() if is_merged else ''

    if replacement != '':
        prefix = 'dbSNP:' if replacement.startswith('rs') else 'dbSNP:rs'
        current_rs_id = prefix + replacement

        if location is not None:
            # Add the whole identifier as a single member.  Building the
            # set with set(current_rs_id) would split it into characters.
            ids_here = self.id_location_map.setdefault(location, set())
            ids_here.add(current_rs_id)

        model.addDeprecatedIndividual(snp_id, current_rs_id)
        # TODO check on this
        # should we add the annotations to the current or orig?
        model.makeLeader(current_rs_id)
    else:
        model.makeLeader(snp_id)
def _add_deprecated_snp(
        self, snp_id, snp_id_current, merged, chrom_num, chrom_pos):
    """
    Add deprecation information for a merged SNP, or make it the leader.

    When ``merged`` is ``'1'`` and ``snp_id_current`` is not the empty
    string, the current identifier ``'dbSNP:rs' + snp_id_current`` is
    recorded in ``self.id_location_map`` for the variant's location (if
    ``self._make_location_curie`` returned one), ``snp_id`` is deprecated
    in favour of it with the ``SequenceVariant`` category, and the current
    identifier is passed to ``model.makeLeader``.  Otherwise ``snp_id`` is
    passed to ``model.makeLeader``.

    :param snp_id: identifier of the SNP being described
    :param snp_id_current: current rs number, without the ``rs`` prefix
        (the prefix is always prepended here)
    :param merged: ``'1'`` when the SNP was merged into ``snp_id_current``
    :param chrom_num: forwarded to ``self._make_location_curie``
    :param chrom_pos: forwarded to ``self._make_location_curie``
    :return: None
    """
    graph = self.testgraph if self.test_mode else self.graph
    model = Model(graph)
    location = self._make_location_curie(chrom_num, chrom_pos)

    if merged == '1' and snp_id_current != '':
        current_rs_id = 'dbSNP:rs' + snp_id_current

        if location is not None:
            # Keep the identifier intact as one set member; passing the
            # string to set() would store its characters instead.
            self.id_location_map.setdefault(
                location, set()).add(current_rs_id)

        model.addDeprecatedIndividual(
            snp_id, current_rs_id,
            old_id_category=blv.terms['SequenceVariant'])
        # TODO check on this
        # should we add the annotations to the current or orig?
        model.makeLeader(current_rs_id)
    else:
        model.makeLeader(snp_id)
def _get_gene_history(self, limit):
    """
    Read the gzipped gene_history file and deprecate discontinued genes.

    Every kept row contributes the replacing gene and the discontinued
    gene to the graph (as classes when ``self.class_or_indiv`` says 'C'
    for the replacing gene, as individuals otherwise), marks the
    discontinued gene as deprecated in favour of the replacing gene, and
    adds the discontinued symbol as a synonym of the replacing gene.

    Rows are dropped when either gene id is '-', and when the gene id (in
    test mode) or the taxon id (otherwise) is not in the configured list.

    :param limit: outside test mode, stop once more than this many rows
        have been kept; None means no limit
    :return: None
    """
    graph = self.testgraph if self.testMode else self.graph
    model = Model(graph)
    logger.info("Processing Gene records")
    kept = 0
    history_path = '/'.join(
        (self.rawdir, self.files['gene_history']['file']))
    logger.info("FILE: %s", history_path)
    prefix = 'NCBIGene'

    with gzip.open(history_path, 'rb') as reader:
        for raw in reader:
            record = raw.decode().strip()
            # comment lines begin with '#'
            if record.startswith('#'):
                continue

            # exactly five tab separated fields; the last one is the
            # discontinue date, which is not used
            (tax_num, gene_num, discontinued_num, discontinued_symbol,
             _) = record.split('\t')

            if '-' in (gene_num, discontinued_num):
                continue
            if self.testMode:
                if int(gene_num) not in self.gene_ids:
                    continue
            elif int(tax_num) not in self.tax_ids:
                continue

            kept += 1
            gene_id = ':'.join((prefix, gene_num))
            discontinued_gene_id = ':'.join((prefix, discontinued_num))

            # add both genes, then point the old one at the new one
            as_class = self.class_or_indiv.get(gene_id) == 'C'
            if as_class:
                model.addClassToGraph(gene_id, None)
                model.addClassToGraph(
                    discontinued_gene_id, discontinued_symbol)
                model.addDeprecatedClass(discontinued_gene_id, [gene_id])
            else:
                model.addIndividualToGraph(gene_id, None)
                model.addIndividualToGraph(
                    discontinued_gene_id, discontinued_symbol)
                model.addDeprecatedIndividual(
                    discontinued_gene_id, [gene_id])

            # the old symbol stays reachable as a synonym of the new gene
            model.addSynonym(gene_id, discontinued_symbol)

            if self.testMode or limit is None:
                continue
            if kept > limit:
                break
def _get_process_allelic_variants(self, entry, g):
    """
    Add the allelic variants listed in one OMIM entry to the graph.

    ``entry`` is read as a mapping with a ``'mimNumber'`` key and an
    optional ``'allelicVariantList'``; each list item holds an
    ``'allelicVariant'`` mapping with ``'number'`` and ``'status'`` and,
    optionally, ``'mutations'``, ``'text'``, ``'dbSnps'``,
    ``'clinvarAccessions'`` and ``'movedTo'``.

    * status ``'live'``: the allele is added and linked to its gene, each
      reference number cited in the text as ``{<number>:`` is linked to
      the allele through ``ref_to_pmid``, dbSNP ids are added as the same
      individual, ClinVar RCV accessions are added as xrefs, and the OMIM
      page is attached.
    * status containing ``'moved'`` (covers 'moved' and 'removed'): the
      allele is deprecated, replaced by ``movedTo`` when present.
    * anything else is logged as an error.

    :param entry: the OMIM entry mapping, or None (nothing is done)
    :param g: graph the triples are written to
    :return: None
    :raises KeyError: if a cited reference number is not in the mapping
        returned by ``self._get_pubs``
    """
    model = Model(g)
    reference = Reference(g)
    geno = Genotype(g)
    if entry is None:
        return

    entry_num = entry['mimNumber']
    gene_id = 'OMIM:' + str(entry_num)
    # process the ref list just to get the pmids
    ref_to_pmid = self._get_pubs(entry, g)

    if 'allelicVariantList' not in entry:
        return

    for al in entry['allelicVariantList']:
        variant = al['allelicVariant']
        al_num = variant['number']
        al_suffix = str(al_num).zfill(4)
        al_id = gene_id + '.' + al_suffix
        status = variant['status']

        if status == 'live':
            al_label = None
            al_description = None
            # reference numbers this variant's text cites
            cited = set()
            if 'mutations' in variant:
                al_label = variant['mutations']
            if 'text' in variant:
                al_description = variant['text']
                cited = set(re.findall(r'\{(\d+)\:', al_description))

            geno.addAllele(
                al_id, al_label, geno.genoparts['variant_locus'],
                al_description)
            geno.addAlleleOfGene(
                al_id, gene_id,
                geno.object_properties['is_sequence_variant_instance_of'])

            # look up the publication id in the list of references
            for ref_num in cited:
                pmid = ref_to_pmid[int(ref_num)]
                g.addTriple(
                    pmid, model.object_properties['is_about'], al_id)

            if 'dbSnps' in variant:
                for dnum in variant['dbSnps'].split(','):
                    did = 'dbSNP:' + dnum.strip()
                    model.addIndividualToGraph(did, None)
                    model.addSameIndividual(al_id, did)

            if 'clinvarAccessions' in variant:
                # accessions are delimited by three semicolons
                for piece in variant['clinvarAccessions'].split(';;;'):
                    found = re.match(r'(RCV\d+);*', piece)
                    if found is None:
                        # e.g. the empty piece left by a trailing
                        # delimiter; calling .group() on it would raise
                        continue
                    model.addXref(al_id, 'ClinVar:' + found.group(1))

            reference.addPage(
                al_id,
                "http://omim.org/entry/" + str(entry_num) + "#" + al_suffix)

        elif 'moved' in status:
            # for both 'moved' and 'removed'
            moved_ids = None
            if 'movedTo' in variant:
                moved_ids = ['OMIM:' + variant['movedTo']]
            model.addDeprecatedIndividual(al_id, moved_ids)

        else:
            logger.error(
                'Uncaught alleleic variant status %s', status)
def _get_gene_history(self, limit):
    """
    Loop through the gene_history file and deprecate the old gene ids.

    For each kept row the replacing gene and the discontinued gene are
    added to the graph (as classes when ``self.class_or_indiv`` holds 'C'
    for the replacing gene, otherwise as individuals), the discontinued
    gene is deprecated in favour of the replacing gene with the ``Gene``
    category, and the discontinued symbol is added as a synonym of the
    replacing gene.

    Rows are skipped when they are blank or comments, when either gene id
    is '-', and when the gene id (test mode) or taxon id (otherwise) is
    not in ``self.gene_ids`` / ``self.tax_ids``.  Ids are compared as
    strings.

    :param limit: outside test mode, stop once more than this many rows
        have been kept; None means no limit
    :return: None
    :raises ValueError: if a required column name is missing from the
        configured column list
    """
    src_key = 'gene_history'
    graph = self.testgraph if self.test_mode else self.graph
    model = Model(graph)
    LOG.info("Processing Gene records")
    line_counter = 0
    myfile = '/'.join((self.rawdir, self.files[src_key]['file']))
    LOG.info("FILE: %s", myfile)

    col = self.files[src_key]['columns']
    # Column positions do not change between rows, so resolve them once
    # instead of calling col.index() four times per row.
    tax_idx = col.index('tax_id')
    gene_idx = col.index('GeneID')
    old_gene_idx = col.index('Discontinued_GeneID')
    old_symbol_idx = col.index('Discontinued_Symbol')
    gene_category = blv.terms['Gene']

    with gzip.open(myfile, 'rb') as tsv:
        row = tsv.readline().decode().strip().split('\t')
        row[0] = row[0][1:]  # strip comment
        # The outcome of the header check is not acted upon here.
        self.check_fileheader(col, row)

        for line in tsv:
            row = line.decode().strip().split('\t')
            # A blank line splits to [''], which has no first character
            # to inspect, so skip it along with comment lines.
            if not row[0] or row[0].startswith('#'):
                continue

            tax_num = row[tax_idx].strip()
            gene_num = row[gene_idx].strip()
            discontinued_num = row[old_gene_idx].strip()
            discontinued_symbol = row[old_symbol_idx].strip()

            if gene_num == '-' or discontinued_num == '-':
                continue
            if self.test_mode and gene_num not in self.gene_ids:
                continue
            if not self.test_mode and tax_num not in self.tax_ids:
                continue

            line_counter += 1
            gene_id = ':'.join(('NCBIGene', gene_num))
            discontinued_gene_id = ':'.join(('NCBIGene', discontinued_num))

            # add the two genes
            if self.class_or_indiv.get(gene_id) == 'C':
                model.addClassToGraph(gene_id, None)
                model.addClassToGraph(
                    discontinued_gene_id, discontinued_symbol,
                    class_category=gene_category)
                # add the new gene id to replace the old gene id
                model.addDeprecatedClass(
                    discontinued_gene_id, [gene_id],
                    old_id_category=gene_category)
            else:
                model.addIndividualToGraph(gene_id, None)
                model.addIndividualToGraph(
                    discontinued_gene_id, discontinued_symbol,
                    ind_category=gene_category)
                model.addDeprecatedIndividual(
                    discontinued_gene_id, [gene_id],
                    old_id_category=gene_category)

            # also add the old symbol as a synonym of the new gene
            model.addSynonym(gene_id, discontinued_symbol)

            if not self.test_mode and (
                    limit is not None and line_counter > limit):
                break
def _get_gene_history(self, limit):
    """
    Walk the gene_history file, deprecating each discontinued gene id.

    The replacing gene id becomes the replacement of the discontinued
    one, and the discontinued symbol becomes a synonym of the replacing
    gene.  Whether classes or individuals are written depends on
    ``self.class_or_indiv`` for the replacing gene ('C' means class).

    :param limit: outside test mode, processing stops once more than
        ``limit`` rows have been kept; None disables the limit
    :return: None
    """
    if self.testMode:
        target_graph = self.testgraph
    else:
        target_graph = self.graph
    model = Model(target_graph)

    logger.info("Processing Gene records")
    n_rows = 0
    source_file = '/'.join(
        (self.rawdir, self.files['gene_history']['file']))
    logger.info("FILE: %s", source_file)

    with gzip.open(source_file, 'rb') as handle:
        for encoded in handle:
            text = encoded.decode().strip()
            if text.startswith('#'):
                # comment line
                continue

            fields = text.split('\t')
            # the fifth field (discontinue date) is read but not used
            (tax_num, gene_num, discontinued_num,
             discontinued_symbol, _unused_date) = fields

            # rows without a usable pair of ids
            if gene_num == '-' or discontinued_num == '-':
                continue

            # test mode filters on gene ids, normal mode on taxon ids
            if self.testMode and int(gene_num) not in self.gene_ids:
                continue
            if not self.testMode and int(tax_num) not in self.tax_ids:
                continue

            n_rows += 1
            new_id = ':'.join(('NCBIGene', gene_num))
            old_id = ':'.join(('NCBIGene', discontinued_num))

            if self.class_or_indiv.get(new_id) != 'C':
                model.addIndividualToGraph(new_id, None)
                model.addIndividualToGraph(old_id, discontinued_symbol)
                model.addDeprecatedIndividual(old_id, [new_id])
            else:
                model.addClassToGraph(new_id, None)
                model.addClassToGraph(old_id, discontinued_symbol)
                # the new gene id replaces the old gene id
                model.addDeprecatedClass(old_id, [new_id])

            # keep the old symbol as a synonym of the new gene
            model.addSynonym(new_id, discontinued_symbol)

            over_limit = limit is not None and n_rows > limit
            if over_limit and not self.testMode:
                break
    return None
def _get_process_allelic_variants(self, entry, graph):
    """
    Add the allelic variants listed in one OMIM entry to the graph.

    ``entry`` is read as a mapping with a ``'mimNumber'`` key and an
    optional ``'allelicVariantList'``; each list item holds an
    ``'allelicVariant'`` mapping with ``'number'`` and ``'status'`` and,
    optionally, ``'mutations'``, ``'text'``, ``'dbSnps'``,
    ``'clinvarAccessions'`` and ``'movedTo'``.

    * status ``'live'``: the allele is added and linked to its gene, each
      reference number cited in the text as ``{<number>:`` is linked to
      the allele through ``ref_to_pmid``, dbSNP ids are added as the same
      individual, ClinVar RCV accessions are added as xrefs, and the OMIM
      page is attached.
    * status containing ``'moved'`` (covers 'moved' and 'removed'): the
      allele is deprecated, replaced by ``movedTo`` when present.
    * anything else is logged as an error.

    :param entry: the OMIM entry mapping, or None (nothing is done)
    :param graph: graph the triples are written to
    :return: None
    :raises KeyError: if a cited reference number is not in the mapping
        returned by ``self._get_pubs``
    """
    model = Model(graph)
    reference = Reference(graph)
    geno = Genotype(graph)
    if entry is None:
        return

    entry_num = entry['mimNumber']
    gene_id = 'OMIM:' + str(entry_num)
    # process the ref list just to get the pmids
    ref_to_pmid = self._get_pubs(entry, graph)

    if 'allelicVariantList' not in entry:
        return

    for alv in entry['allelicVariantList']:
        variant = alv['allelicVariant']
        al_num = variant['number']
        al_suffix = str(al_num).zfill(4)
        al_id = gene_id + '.' + al_suffix
        status = variant['status']

        if status == 'live':
            al_label = None
            al_description = None
            # reference numbers this variant's text cites
            cited = set()
            if 'mutations' in variant:
                al_label = variant['mutations']
            if 'text' in variant:
                al_description = variant['text']
                cited = set(re.findall(r'\{(\d+)\:', al_description))

            geno.addAllele(
                al_id, al_label, self.globaltt['variant_locus'],
                al_description)
            geno.addAlleleOfGene(
                al_id, gene_id, self.globaltt['is_allele_of'])

            # look up the publication id in the list of references
            for ref in cited:
                pmid = ref_to_pmid[int(ref)]
                graph.addTriple(pmid, self.globaltt['is_about'], al_id)

            if 'dbSnps' in variant:
                for dnum in variant['dbSnps'].split(','):
                    did = 'dbSNP:' + dnum.strip()
                    model.addIndividualToGraph(did, None)
                    model.addSameIndividual(al_id, did)

            # Note that RCVs are variant to disease associations
            # in ClinVar, rather than variant entries
            # so we make these xrefs instead of equivalents
            if 'clinvarAccessions' in variant:
                # accessions are delimited by three semicolons
                for rcv in variant['clinvarAccessions'].split(';;;'):
                    if not rcv:
                        # An empty piece (e.g. after a trailing
                        # delimiter) would yield the bare xref
                        # 'ClinVar:', so leave it out.
                        continue
                    # keep the first 12 characters in case of more cruft
                    model.addXref(al_id, 'ClinVar:' + rcv[:12])

            reference.addPage(
                al_id,
                "http://omim.org/entry/" + '#'.join(
                    (str(entry_num), al_suffix)))

        elif 'moved' in status:
            # for both 'moved' and 'removed'
            moved_ids = None
            if 'movedTo' in variant:
                moved_ids = ['OMIM:' + variant['movedTo']]
            model.addDeprecatedIndividual(al_id, moved_ids)

        else:
            LOG.error('Uncaught alleleic variant status %s', status)
def _get_gene_history(self, limit):
    """
    Loop through the gene_history file and deprecate the old gene ids.

    For each kept row the replacing gene and the discontinued gene are
    added to the graph (as classes when ``self.class_or_indiv`` holds 'C'
    for the replacing gene, otherwise as individuals), the discontinued
    gene is deprecated in favour of the replacing gene, and the
    discontinued symbol is added as a synonym of the replacing gene.

    Rows are skipped when they are blank or comments, when either gene id
    is '-', and when the gene id (test mode) or taxon id (otherwise) is
    not in ``self.gene_ids`` / ``self.tax_ids``.  Ids are compared as
    strings.

    :param limit: outside test mode, stop once more than this many rows
        have been kept; None means no limit
    :return: None
    :raises ValueError: if a required column name is missing from the
        configured column list
    """
    src_key = 'gene_history'
    graph = self.testgraph if self.test_mode else self.graph
    model = Model(graph)
    LOG.info("Processing Gene records")
    line_counter = 0
    myfile = '/'.join((self.rawdir, self.files[src_key]['file']))
    LOG.info("FILE: %s", myfile)

    col = self.files[src_key]['columns']
    # Look the column positions up once; they are the same for every row.
    tax_idx = col.index('tax_id')
    gene_idx = col.index('GeneID')
    old_gene_idx = col.index('Discontinued_GeneID')
    old_symbol_idx = col.index('Discontinued_Symbol')

    with gzip.open(myfile, 'rb') as tsv:
        row = tsv.readline().decode().strip().split('\t')
        row[0] = row[0][1:]  # strip comment
        # The outcome of the header check is not acted upon here.
        self.check_fileheader(col, row)

        for line in tsv:
            row = line.decode().strip().split('\t')
            # Indexing the first character of an empty first field would
            # raise on a blank line, so test for emptiness first.
            if not row[0] or row[0].startswith('#'):
                continue

            tax_num = row[tax_idx].strip()
            gene_num = row[gene_idx].strip()
            discontinued_num = row[old_gene_idx].strip()
            discontinued_symbol = row[old_symbol_idx].strip()

            if gene_num == '-' or discontinued_num == '-':
                continue
            if self.test_mode and gene_num not in self.gene_ids:
                continue
            if not self.test_mode and tax_num not in self.tax_ids:
                continue

            line_counter += 1
            gene_id = ':'.join(('NCBIGene', gene_num))
            discontinued_gene_id = ':'.join(('NCBIGene', discontinued_num))

            # add the two genes
            if self.class_or_indiv.get(gene_id) == 'C':
                model.addClassToGraph(gene_id, None)
                model.addClassToGraph(
                    discontinued_gene_id, discontinued_symbol)
                # add the new gene id to replace the old gene id
                model.addDeprecatedClass(discontinued_gene_id, [gene_id])
            else:
                model.addIndividualToGraph(gene_id, None)
                model.addIndividualToGraph(
                    discontinued_gene_id, discontinued_symbol)
                model.addDeprecatedIndividual(
                    discontinued_gene_id, [gene_id])

            # also add the old symbol as a synonym of the new gene
            model.addSynonym(gene_id, discontinued_symbol)

            if not self.test_mode and (
                    limit is not None and line_counter > limit):
                break