def _process_all(self, limit):
    """
    Fetch and transform every non-replaced OMIM entry.

    This takes the list of omim identifiers from the omimTitles file,
    excludes those designated as obsolete and iteratively queries the
    omim api in batches of 20 for the json-formatted data.
    This will create OMIM classes, with the label & definition.
    If an entry is "removed", it is added as a deprecated class.
    If an entry is "moved", it is deprecated and consider annotations
    are added.

    Additionally, we extract:
    *phenotypicSeries ids as superclasses
    *equivalent ids for Orphanet and UMLS

    If set to testMode, it will write only those items in the test_ids
    to the testgraph.

    :param limit: passed through unchanged to ``self.process_entries``
    """
    # identifiers that have a type but have not been replaced
    omimids = list(self.omim_type.keys() - self.omim_replaced.keys())

    LOG.info(
        'Have %i omim numbers to fetch records from their API', len(omimids))
    LOG.info('Have %i omim types ', len(self.omim_type))

    graph = self.testgraph if self.test_mode else self.graph
    geno = Genotype(graph)
    model = Model(graph)

    # The label is used as the lookup key into the global translation
    # table, so it must be spelled exactly.
    tax_label = 'Homo sapiens'
    tax_id = self.globaltt[tax_label]

    # add genome and taxon
    geno.addGenome(tax_id, tax_label)
    model.addClassToGraph(tax_id, tax_label)

    includes = {'all'}

    self.process_entries(
        omimids, self._transform_entry, includes, graph, limit)

    # since we are not fetching obsolete records any more
    # add them all in here
    for omim_id, replacements in self.omim_replaced.items():
        model.addDeprecatedClass(
            'OMIM:' + omim_id, ['OMIM:' + repl for repl in replacements])
def process_gene_ids(self, limit):
    """
    Load the gzipped, comma-separated gene id file into the graph.

    Every row becomes a gene class with its taxon; rows flagged 'Dead'
    are additionally deprecated and a non-empty synonym is attached.
    In test mode only genes listed in ``self.test_ids['gene']`` are kept
    and the limit is ignored.

    :param limit: stop once more than this many lines were read
        (None means no limit)
    """
    raw = '/'.join((self.rawdir, self.files['gene_ids']['file']))
    g = self.testgraph if self.testMode else self.graph
    model = Model(g)
    logger.info("Processing: %s", self.files['gene_ids']['file'])
    geno = Genotype(g)

    with gzip.open(raw, 'rb') as csvfile:
        text_stream = io.TextIOWrapper(csvfile, newline="")
        filereader = csv.reader(text_stream, delimiter=',', quotechar='\"')
        for line_counter, row in enumerate(filereader, 1):
            # 6239,WBGene00000001,aap-1,Y110A7A.10,Live,protein_coding_gene
            (taxon_num, gene_num, gene_symbol,
             gene_synonym, live, _gene_type) = row

            if self.testMode and gene_num not in self.test_ids['gene']:
                continue

            gene_id = 'WormBase:' + gene_num
            # label falls back to the synonym, then to no label at all
            label = gene_symbol or gene_synonym or None

            model.addClassToGraph(gene_id, label, Genotype.genoparts['gene'])
            if live == 'Dead':
                model.addDeprecatedClass(gene_id)
            geno.addTaxon('NCBITaxon:' + taxon_num, gene_id)
            if gene_synonym:
                model.addSynonym(gene_id, gene_synonym)

            if limit is not None and not self.testMode \
                    and line_counter > limit:
                break
def process_gene_ids(self, limit):
    """
    Load the gzipped, comma-separated gene id file into the graph.

    Each well-formed row becomes a gene class linked to its taxon.
    Rows whose ``live`` column is 'Dead' are also deprecated, and a
    non-empty synonym is attached to the gene.

    Rows whose column count differs from the configured column list are
    logged and skipped, rather than indexed anyway.

    :param limit: stop once the reader's line number exceeds this value
        (None means no limit)
    """
    src_key = 'gene_ids'
    raw = '/'.join((self.rawdir, self.files[src_key]['file']))
    graph = self.graph
    model = Model(graph)
    geno = Genotype(graph)
    col = self.files[src_key]['columns']
    LOG.info("Processing: %s", self.files[src_key]['file'])

    # column positions do not change per row; look them up once
    taxon_idx = col.index('taxon_num')
    gene_idx = col.index('gene_num')
    symbol_idx = col.index('gene_symbol')
    synonym_idx = col.index('gene_synonym')
    live_idx = col.index('live')
    collen = len(col)

    with gzip.open(raw, 'rb') as csvfile:
        reader = csv.reader(
            io.TextIOWrapper(csvfile, newline=""), delimiter=',',
            quotechar='\"')
        # no header row to check
        for row in reader:
            if len(row) != collen:
                LOG.error(
                    'In %s line %i expected %i colums but got %s.',
                    self.files[src_key]['file'], reader.line_num,
                    collen, row)
                # a malformed row cannot be indexed reliably
                continue

            # 6239,WBGene00000001,aap-1,Y110A7A.10,Live,protein_coding_gene
            taxon_num = row[taxon_idx]
            gene_num = row[gene_idx]
            gene_symbol = row[symbol_idx]
            gene_synonym = row[synonym_idx]
            live = row[live_idx]

            taxon_curie = 'NCBITaxon:' + taxon_num
            gene_curie = 'WormBase:' + gene_num

            if gene_symbol == '':
                # these are not the same in my book tec.
                gene_symbol = gene_synonym
            if gene_symbol == '':
                gene_symbol = None

            model.addClassToGraph(
                gene_curie, gene_symbol, self.globaltt['gene'])
            if live == 'Dead':
                model.addDeprecatedClass(
                    gene_curie, old_id_category=blv.terms['Gene'])
            geno.addTaxon(taxon_curie, gene_curie)
            if gene_synonym:
                model.addSynonym(gene_curie, gene_synonym)

            if limit is not None and reader.line_num > limit:
                break
def process_gene_ids(self, limit):
    """
    Read the gzipped gene id CSV and add each gene to the graph.

    A gene is typed with ``self.globaltt['gene']``, tied to its taxon,
    deprecated when marked 'Dead', and given its synonym when present.
    Test mode restricts rows to ``self.test_ids['gene']`` and disables
    the line limit.

    :param limit: maximum line count before stopping (None for no limit)
    """
    file_name = self.files['gene_ids']['file']
    raw = '/'.join((self.rawdir, file_name))

    graph = self.testgraph if self.testMode else self.graph
    model = Model(graph)
    logger.info("Processing: %s", self.files['gene_ids']['file'])
    geno = Genotype(graph)

    rows_seen = 0
    with gzip.open(raw, 'rb') as csvfile:
        filereader = csv.reader(
            io.TextIOWrapper(csvfile, newline=""),
            delimiter=',', quotechar='\"')
        for row in filereader:
            rows_seen += 1
            # 6239,WBGene00000001,aap-1,Y110A7A.10,Live,protein_coding_gene
            taxon_num, gene_num, gene_symbol, gene_synonym, live, _ = row

            if self.testMode and gene_num not in self.test_ids['gene']:
                continue

            taxon_id = 'NCBITaxon:' + taxon_num
            gene_id = 'WormBase:' + gene_num

            # prefer the symbol, then the synonym, otherwise no label
            if gene_symbol != '':
                gene_label = gene_symbol
            elif gene_synonym != '':
                gene_label = gene_synonym
            else:
                gene_label = None

            model.addClassToGraph(gene_id, gene_label, self.globaltt['gene'])
            if live == 'Dead':
                model.addDeprecatedClass(gene_id)
            geno.addTaxon(taxon_id, gene_id)
            if gene_synonym is not None and gene_synonym != '':
                model.addSynonym(gene_id, gene_synonym)

            limit_reached = limit is not None and rows_seen > limit
            if limit_reached and not self.testMode:
                break
def _process_genes(self, limit=None):
    """
    Add the genes of the tab-separated HGNC complete set to the graph.

    For every data row this adds the gene class (when its locus type
    resolves to something other than itself), marks withdrawn entries
    as deprecated, adds NCBIGene / ENSEMBL / non-disease OMIM
    equivalents, the taxon, "is about" publication triples and, when
    the cytogenetic location can be parsed, the chromosome band (or
    chromosome) the gene is a subsequence of.

    Genes whose location does not start with a recognizable chromosome
    get no location. Previously this either raised (``None + band``)
    or reused the chromosome of an earlier row.

    :param limit: stop once more than this many lines were read
        (ignored in test mode; None means no limit)
    """
    graph = self.testgraph if self.testMode else self.graph
    geno = Genotype(graph)
    model = Model(graph)

    raw = '/'.join((self.rawdir, self.files['genes']['file']))
    line_counter = 0
    logger.info("Processing HGNC genes")

    # compiled once instead of per row
    # NOTE(review): '$' inside the character class is a literal dollar
    # sign, not an end-of-string anchor -- confirm this is intended.
    chr_pattern = re.compile(r'(\d+|X|Y|Z|W|MT)[pq$]')
    band_pattern = re.compile(r'([pq][A-H\d]?\d?(?:\.\d+)?)')

    with open(raw, 'r', encoding="utf8") as csvfile:
        filereader = csv.reader(csvfile, delimiter='\t', quotechar='\"')
        # curl -s ftp://ftp.ebi.ac.uk/pub/databases/genenames/new/tsv/hgnc_complete_set.txt | head -1 | tr '\t' '\n' | grep -n .
        for row in filereader:
            # the full unpack doubles as a check on the column count
            (hgnc_id, symbol, name, locus_group, locus_type, status,
             location, location_sortable, alias_symbol, alias_name,
             prev_symbol, prev_name, gene_family, gene_family_id,
             date_approved_reserved, date_symbol_changed,
             date_name_changed, date_modified, entrez_id,
             ensembl_gene_id, vega_id, ucsc_id, ena, refseq_accession,
             ccds_id, uniprot_ids, pubmed_id, mgd_id, rgd_id, lsdb,
             cosmic, omim_id, mirbase, homeodb, snornabase,
             bioparadigms_slc, orphanet, pseudogene_org, horde_id,
             merops, imgt, iuphar, kznf_gene_catalog, mamit_trnadb, cd,
             lncrnadb, enzyme_id, intermediate_filament_db,
             rna_central_ids) = row

            line_counter += 1

            # skip header
            if line_counter <= 1:
                continue

            if self.testMode and entrez_id != '' and \
                    int(entrez_id) not in self.gene_ids:
                continue

            if name == '':
                name = None
            gene_type_id = self.resolve(locus_type, False)  # withdrawn -> None?
            if gene_type_id != locus_type:
                model.addClassToGraph(hgnc_id, symbol, gene_type_id, name)
            if locus_type == 'withdrawn':
                model.addDeprecatedClass(hgnc_id)
            else:
                model.makeLeader(hgnc_id)
            if entrez_id != '':
                model.addEquivalentClass(hgnc_id, 'NCBIGene:' + entrez_id)
            if ensembl_gene_id != '':
                model.addEquivalentClass(
                    hgnc_id, 'ENSEMBL:' + ensembl_gene_id)
            if omim_id != '' and "|" not in omim_id:
                omim_curie = 'OMIM:' + omim_id
                if not DipperUtil.is_omim_disease(omim_curie):
                    model.addEquivalentClass(hgnc_id, omim_curie)

            geno.addTaxon(self.hs_txid, hgnc_id)

            # add pubs as "is about"; the column is pipe separated
            for pmid in pubmed_id.strip().split('|'):
                pmid = pmid.strip()
                if pmid != '':
                    graph.addTriple(
                        'PMID:' + pmid, self.globaltt['is_about'], hgnc_id)

            # add chr location
            # sometimes two are listed, like: 10p11.2 or 17q25
            # -- there are only 2 of these FRA10A and MPFD
            # sometimes listed like "1 not on reference assembly"
            # sometimes listed like 10q24.1-q24.3
            # sometimes like 11q11 alternate reference locus
            chr_match = chr_pattern.match(location)
            if chr_match is not None:
                chrom = chr_match.group(1)
                chrom_id = makeChromID(chrom, self.hs_txid, 'CHR')
                band_match = band_pattern.search(location)
                feat = Feature(graph, hgnc_id, None, None)
                if band_match is not None:
                    band = chrom + band_match.group(1)
                    # add the chr band as the parent to this gene
                    # as a feature but assume that the band is created
                    # as a class with properties elsewhere in Monochrom
                    band_id = makeChromID(band, self.hs_txid, 'CHR')
                    model.addClassToGraph(band_id, None)
                    feat.addSubsequenceOfFeature(band_id)
                else:
                    model.addClassToGraph(chrom_id, None)
                    feat.addSubsequenceOfFeature(chrom_id)

            if not self.testMode and limit is not None \
                    and line_counter > limit:
                break
        # end loop through file
def _get_gene_history(self, limit):
    """
    Walk the gene_history file and deprecate discontinued gene ids.

    Each discontinued id is added as a deprecated class (or individual,
    depending on ``self.class_or_indiv``) that is replaced by the current
    gene id; the discontinued symbol becomes a synonym of the current
    gene.

    :param limit: stop once more than this many rows were kept
        (ignored in test mode; None means no limit)
    :return: None
    """
    g = self.testgraph if self.testMode else self.graph
    model = Model(g)
    logger.info("Processing Gene records")
    myfile = '/'.join((self.rawdir, self.files['gene_history']['file']))
    logger.info("FILE: %s", myfile)

    line_counter = 0
    with gzip.open(myfile, 'rb') as f:
        for raw_line in f:
            line = raw_line.decode().strip()
            # skip comments
            if line.startswith('#'):
                continue

            (tax_num, gene_num, discontinued_num, discontinued_symbol,
             discontinued_date) = line.split('\t')

            # '-' marks a missing identifier on either side
            if '-' in (gene_num, discontinued_num):
                continue
            # test mode filters on gene ids, normal mode on taxon ids
            if self.testMode:
                if int(gene_num) not in self.gene_ids:
                    continue
            elif int(tax_num) not in self.tax_ids:
                continue

            line_counter += 1
            gene_id = 'NCBIGene' + ':' + gene_num
            discontinued_gene_id = 'NCBIGene' + ':' + discontinued_num

            # choose class- or individual-flavoured model calls
            if self.class_or_indiv.get(gene_id) == 'C':
                add_node = model.addClassToGraph
                deprecate = model.addDeprecatedClass
            else:
                add_node = model.addIndividualToGraph
                deprecate = model.addDeprecatedIndividual

            # add the two genes
            add_node(gene_id, None)
            add_node(discontinued_gene_id, discontinued_symbol)
            # the new gene id replaces the old gene id
            deprecate(discontinued_gene_id, [gene_id])

            # also add the old symbol as a synonym of the new gene
            model.addSynonym(gene_id, discontinued_symbol)

            if limit is not None and not self.testMode \
                    and line_counter > limit:
                break
def _process_genes(self, limit=None):
    """
    Add the genes of the tab-separated HGNC complete set to the graph.

    The header row is validated with ``self.check_fileheader``; on a
    mismatch the process exits with status -1. For each data row this
    deprecates withdrawn genes, otherwise adds the gene class (when its
    locus type resolves) and makes it the clique leader. It then adds
    NCBIGene / ENSEMBL / OMIM-gene equivalents, the taxon, "is about"
    publication triples and, when the location can be parsed, the
    chromosome band (or chromosome) the gene is a subsequence of.

    Only the columns actually used are read; all others are ignored.

    :param limit: stop once the reader's line number exceeds this value
        (ignored in test mode; None means no limit)
    :raises SystemExit: when the file header does not match
    """
    graph = self.testgraph if self.test_mode else self.graph
    geno = Genotype(graph)
    model = Model(graph)

    raw = '/'.join((self.rawdir, self.files['genes']['file']))
    col = self.files['genes']['columns']
    LOG.info("Processing HGNC genes")

    # NOTE(review): '$' inside the character class is a literal dollar
    # sign, not an end-of-string anchor -- confirm this is intended.
    chr_pattern = re.compile(r'(\d+|X|Y|Z|W|MT)[pq$]')
    band_pattern = re.compile(r'([pq][A-H\d]?\d?(?:\.\d+)?)')

    # column positions are fixed for the whole file; look them up once
    hgnc_idx = col.index('hgnc_id')
    symbol_idx = col.index('symbol')
    name_idx = col.index('name')
    locus_type_idx = col.index('locus_type')
    location_idx = col.index('location')
    entrez_idx = col.index('entrez_id')
    ensembl_idx = col.index('ensembl_gene_id')
    pubmed_idx = col.index('pubmed_id')
    omim_idx = col.index('omim_id')

    with open(raw, 'r', encoding="utf8") as csvfile:
        filereader = csv.reader(csvfile, delimiter='\t', quotechar='\"')

        row = next(filereader)
        if not self.check_fileheader(col, row):
            # same effect as the site-provided exit(-1), without relying
            # on the interactive helper being installed
            raise SystemExit(-1)

        for row in filereader:
            hgnc_id = row[hgnc_idx].strip()
            symbol = row[symbol_idx].strip()
            name = row[name_idx].strip()
            locus_type = row[locus_type_idx].strip()
            location = row[location_idx].strip()
            entrez_id = row[entrez_idx].strip()
            ensembl_gene_id = row[ensembl_idx].strip()
            pubmed_ids = row[pubmed_idx].strip()  # pipe separated!
            omim_ids = row[omim_idx].strip()  # pipe separated!

            if self.test_mode and entrez_id != '' and \
                    entrez_id not in self.gene_ids:
                continue

            if name == '':
                name = None

            if locus_type == 'withdrawn':
                model.addDeprecatedClass(hgnc_id)
            else:
                gene_type_id = self.resolve(locus_type, False)  # withdrawn -> None?
                if gene_type_id != locus_type:
                    model.addClassToGraph(
                        hgnc_id, symbol, gene_type_id, name)
                model.makeLeader(hgnc_id)

            if entrez_id != '':
                model.addEquivalentClass(hgnc_id, 'NCBIGene:' + entrez_id)

            if ensembl_gene_id != '':
                model.addEquivalentClass(
                    hgnc_id, 'ENSEMBL:' + ensembl_gene_id)

            for omim_id in omim_ids.split('|'):
                if omim_id in self.omim_replaced:
                    repl = self.omim_replaced[omim_id]
                    LOG.warning('%s is replaced with %s', omim_id, repl)
                    for omim in repl:
                        # a replacement without a known type is ignored
                        if omim in self.omim_type and \
                                self.omim_type[omim] == self.globaltt['gene']:
                            omim_id = omim

                if omim_id in self.omim_type and \
                        self.omim_type[omim_id] == self.globaltt['gene']:
                    model.addEquivalentClass(hgnc_id, 'OMIM:' + omim_id)

            geno.addTaxon(self.hs_txid, hgnc_id)

            # add pubs as "is about"
            # ''.split('|') yields [''], so blank ids must be skipped
            for pubmed_id in pubmed_ids.split('|'):
                pubmed_id = pubmed_id.strip()
                if pubmed_id != '':
                    graph.addTriple(
                        'PMID:' + pubmed_id, self.globaltt['is_about'],
                        hgnc_id)

            # add chr location
            # sometimes two are listed, like: 10p11.2 or 17q25
            # -- there are only 2 of these FRA10A and MPFD
            # sometimes listed like "1 not on reference assembly"
            # sometimes listed like 10q24.1-q24.3
            # sometimes like 11q11 alternate reference locus
            # Without a chromosome match there is nothing to attach;
            # previously this raised or reused an earlier row's chromosome.
            chr_match = chr_pattern.match(location)
            if chr_match is not None:
                chrom = chr_match.group(1)
                chrom_id = makeChromID(chrom, self.hs_txid, 'CHR')
                band_match = band_pattern.search(location)
                feat = Feature(graph, hgnc_id, None, None)
                if band_match is not None:
                    band = chrom + band_match.group(1)
                    # add the chr band as the parent to this gene
                    # as a feature but assume that the band is created
                    # as a class with properties elsewhere in Monochrom
                    band_id = makeChromID(band, self.hs_txid, 'CHR')
                    model.addClassToGraph(band_id, None)
                    feat.addSubsequenceOfFeature(band_id)
                else:
                    model.addClassToGraph(chrom_id, None)
                    feat.addSubsequenceOfFeature(chrom_id)

            if not self.test_mode and limit is not None and \
                    filereader.line_num > limit:
                break
def _transform_entry(self, e, graph):
    """
    Add one OMIM API entry to the graph.

    Reads ``e['entry']`` (mimNumber, titles, status and, when present,
    geneMap / movedTo) and adds the OMIM class with its label, synonyms
    and definition. 'removed' entries are only deprecated. For other
    entries, an existing gene map is used to add a feature with
    chromosome, position and cytogenetic location, 'moved' entries are
    deprecated in favour of their replacement ids, and the remaining
    ``self._get_*`` helpers are invoked for the entry.

    :param e: dict holding the entry under the key 'entry'
    :param graph: graph passed to Model, Genotype, Feature and helpers
    :return: None
    """
    g = graph
    model = Model(g)
    geno = Genotype(graph)

    # taxon and genome build are hard-coded in this method
    tax_num = '9606'
    tax_id = 'NCBITaxon:9606'
    tax_label = 'Human'
    build_num = "GRCh38"
    build_id = "NCBIGenome:"+build_num

    # get the numbers, labels, and descriptions
    omimnum = e['entry']['mimNumber']
    titles = e['entry']['titles']
    label = titles['preferredTitle']

    other_labels = []
    if 'alternativeTitles' in titles:
        other_labels += self._get_alt_labels(titles['alternativeTitles'])
    if 'includedTitles' in titles:
        other_labels += self._get_alt_labels(titles['includedTitles'])

    # add synonyms of alternate labels
    # preferredTitle": "PFEIFFER SYNDROME",
    # "alternativeTitles":
    #   "ACROCEPHALOSYNDACTYLY, TYPE V; ACS5;;\nACS V;;\nNOACK SYNDROME",
    # "includedTitles":
    #   "CRANIOFACIAL-SKELETAL-DERMATOLOGIC DYSPLASIA, INCLUDED"

    # remove the abbreviation (comes after the ;) from the preferredTitle,
    # and add it as a synonym
    abbrev = None
    if len(re.split(r';', label)) > 1:
        abbrev = (re.split(r';', label)[1].strip())
    newlabel = self._cleanup_label(label)

    description = self._get_description(e['entry'])
    omimid = 'OMIM:'+str(omimnum)

    if e['entry']['status'] == 'removed':
        # removed entries get nothing but the deprecation
        model.addDeprecatedClass(omimid)
    else:
        # NOTE(review): everything below is assumed to belong to this
        # branch, since omimtype is only assigned here -- confirm.
        omimtype = self._get_omimtype(e['entry'])
        nodelabel = newlabel
        # this uses our cleaned-up label
        if omimtype == Genotype.genoparts['heritable_phenotypic_marker']:
            if abbrev is not None:
                nodelabel = abbrev
            # in this special case,
            # make it a disease by not declaring it as a gene/marker
            model.addClassToGraph(omimid, nodelabel, None, newlabel)
        elif omimtype == Genotype.genoparts['gene']:
            if abbrev is not None:
                nodelabel = abbrev
            model.addClassToGraph(omimid, nodelabel, omimtype, newlabel)
        else:
            model.addClassToGraph(omimid, newlabel, omimtype)

        # add the original screaming-caps OMIM label as a synonym
        model.addSynonym(omimid, label)

        # add the alternate labels and includes as synonyms
        for l in other_labels:
            model.addSynonym(omimid, l, 'OIO:hasRelatedSynonym')

        # for OMIM, we're adding the description as a definition
        model.addDefinition(omimid, description)
        if abbrev is not None:
            model.addSynonym(omimid, abbrev, 'OIO:hasRelatedSynonym')

        # if this is a genetic locus (but not sequenced)
        #   then add the chrom loc info
        # but add it to the ncbi gene identifier,
        # not to the omim id (we reserve the omim id to be the phenotype)
        feature_id = None
        feature_label = None
        if 'geneMapExists' in e['entry'] and e['entry']['geneMapExists']:
            genemap = e['entry']['geneMap']
            is_gene = False

            if omimtype == \
                    Genotype.genoparts['heritable_phenotypic_marker']:
                # get the ncbigene ids
                ncbifeature = self._get_mapped_gene_ids(e['entry'], g)
                if len(ncbifeature) == 1:
                    feature_id = 'NCBIGene:'+str(ncbifeature[0])
                    # add this feature as a cause for the omim disease
                    # TODO SHOULD I EVEN DO THIS HERE?
                    assoc = G2PAssoc(g, self.name, feature_id, omimid)
                    assoc.add_association_to_graph()
                elif len(ncbifeature) > 1:
                    # feature_id stays None, so no feature is added below
                    logger.info(
                        "Its ambiguous when %s maps to >1 gene id: %s",
                        omimid, str(ncbifeature))
                else:  # no ncbi feature, make an anonymous one
                    feature_id = self._make_anonymous_feature(str(omimnum))
                    feature_label = abbrev

            elif omimtype == Genotype.genoparts['gene']:
                feature_id = omimid
                is_gene = True
            else:
                # 158900 falls into this category
                feature_id = self._make_anonymous_feature(str(omimnum))
                if abbrev is not None:
                    feature_label = abbrev
                # from here on the entry is treated as a phenotypic marker
                omimtype = \
                    Genotype.genoparts[
                        'heritable_phenotypic_marker']

            if feature_id is not None:
                if 'comments' in genemap:
                    # add a comment to this feature
                    comment = genemap['comments']
                    if comment.strip() != '':
                        model.addDescription(feature_id, comment)
                if 'cytoLocation' in genemap:
                    cytoloc = genemap['cytoLocation']
                    # parse the cytoloc.
                    # add this omim thing as
                    # a subsequence of the cytofeature
                    # 18p11.3-p11.2
                    # FIXME
                    # add the other end of the range,
                    # but not sure how to do that
                    # not sure if saying subsequence of feature
                    # is the right relationship

                    f = Feature(g, feature_id, feature_label, omimtype)
                    if 'chromosomeSymbol' in genemap:
                        chrom_num = str(genemap['chromosomeSymbol'])
                        chrom = makeChromID(chrom_num, tax_num, 'CHR')
                        geno.addChromosomeClass(
                            chrom_num, tax_id, tax_label)

                        # add the positional information, if available
                        # -1 marks "no coordinate given"
                        fstart = fend = -1
                        if 'chromosomeLocationStart' in genemap:
                            fstart = genemap['chromosomeLocationStart']
                        if 'chromosomeLocationEnd' in genemap:
                            fend = genemap['chromosomeLocationEnd']
                        if fstart >= 0:
                            # make the build-specific chromosome
                            chrom_in_build = makeChromID(chrom_num,
                                                         build_num,
                                                         'MONARCH')
                            # then, add the chromosome instance
                            # (from the given build)
                            geno.addChromosomeInstance(
                                chrom_num, build_id, build_num, chrom)
                            if omimtype == \
                                    Genotype.genoparts[
                                        'heritable_phenotypic_marker']:
                                postypes = [Feature.types['FuzzyPosition']]
                            else:
                                postypes = None
                            # NOTE that no strand information
                            # is available in the API
                            f.addFeatureStartLocation(
                                fstart, chrom_in_build, None, postypes)
                            if fend >= 0:
                                f.addFeatureEndLocation(
                                    fend, chrom_in_build, None, postypes)
                            # only logged; the locations are added as given
                            if fstart > fend:
                                logger.info(
                                    "start>end (%d>%d) for %s",
                                    fstart, fend, omimid)
                    # add the cytogenic location too
                    # for now, just take the first one
                    cytoloc = cytoloc.split('-')[0]
                    loc = makeChromID(cytoloc, tax_num, 'CHR')
                    model.addClassToGraph(loc, None)
                    f.addSubsequenceOfFeature(loc)
                    f.addFeatureToGraph(True, None, is_gene)

            # end adding causative genes/features

        # check if moved, if so,
        # make it deprecated and
        # replaced consider class to the other thing(s)
        # some entries have been moved to multiple other entries and
        # use the joining raw word "and"
        # 612479 is movedto:  "603075 and 603029"  OR
        # others use a comma-delimited list, like:
        # 610402 is movedto: "609122,300870"
        if e['entry']['status'] == 'moved':
            if re.search(r'and', str(e['entry']['movedTo'])):
                # split the movedTo entry on 'and'
                newids = re.split(r'and', str(e['entry']['movedTo']))
            elif len(str(e['entry']['movedTo']).split(',')) > 0:
                # split on the comma
                # NOTE(review): str.split always returns at least one
                # element, so this branch also handles a single id and
                # the else below is never reached.
                newids = str(e['entry']['movedTo']).split(',')
            else:
                # make a list of one
                newids = [str(e['entry']['movedTo'])]
            # cleanup whitespace and add OMIM prefix to numeric portion
            fixedids = []
            for i in newids:
                fixedids.append('OMIM:'+i.strip())
            model.addDeprecatedClass(omimid, fixedids)

        self._get_phenotypicseries_parents(e['entry'], g)
        self._get_mappedids(e['entry'], g)
        # return value is not used here
        self._get_mapped_gene_ids(e['entry'], g)

        self._get_pubs(e['entry'], g)

        self._get_process_allelic_variants(e['entry'], g)  # temp gag

    return
def _get_gene_history(self, limit):
    """
    Loops through the gene_history file and adds the old gene ids
    as deprecated classes, where the new gene id is the replacement for
    it. The old gene symbol is added as a synonym to the gene.

    The first line is treated as a commented header and checked with
    ``self.check_fileheader``; the result is not acted upon. Blank lines
    and comment lines in the body are skipped.

    :param limit: stop once more than this many rows were kept
        (ignored in test mode; None means no limit)
    :return: None
    """
    src_key = 'gene_history'
    graph = self.testgraph if self.test_mode else self.graph
    model = Model(graph)

    LOG.info("Processing Gene records")
    line_counter = 0
    myfile = '/'.join((self.rawdir, self.files[src_key]['file']))
    LOG.info("FILE: %s", myfile)
    col = self.files[src_key]['columns']

    # column positions are fixed for the whole file; look them up once
    tax_idx = col.index('tax_id')
    gene_idx = col.index('GeneID')
    disc_gene_idx = col.index('Discontinued_GeneID')
    disc_symbol_idx = col.index('Discontinued_Symbol')

    with gzip.open(myfile, 'rb') as tsv:
        row = tsv.readline().decode().strip().split('\t')
        row[0] = row[0][1:]  # strip comment
        # a header mismatch does not stop processing (the original
        # branch on the result was a no-op)
        self.check_fileheader(col, row)

        for line in tsv:
            text = line.decode().strip()
            # skip blank lines and comments; indexing the first character
            # of an empty line used to raise IndexError
            if not text or text.startswith('#'):
                continue
            row = text.split('\t')

            tax_num = row[tax_idx].strip()
            gene_num = row[gene_idx].strip()
            discontinued_num = row[disc_gene_idx].strip()
            discontinued_symbol = row[disc_symbol_idx].strip()

            # '-' marks a missing identifier on either side
            if gene_num == '-' or discontinued_num == '-':
                continue
            if self.test_mode and gene_num not in self.gene_ids:
                continue
            if not self.test_mode and tax_num not in self.tax_ids:
                continue

            line_counter += 1
            gene_id = ':'.join(('NCBIGene', gene_num))
            discontinued_gene_id = ':'.join(('NCBIGene', discontinued_num))

            # add the two genes
            if self.class_or_indiv.get(gene_id) == 'C':
                model.addClassToGraph(gene_id, None)
                model.addClassToGraph(
                    discontinued_gene_id, discontinued_symbol,
                    class_category=blv.terms['Gene'])
                # add the new gene id to replace the old gene id
                model.addDeprecatedClass(
                    discontinued_gene_id, [gene_id],
                    old_id_category=blv.terms['Gene'])
            else:
                model.addIndividualToGraph(gene_id, None)
                model.addIndividualToGraph(
                    discontinued_gene_id, discontinued_symbol,
                    ind_category=blv.terms['Gene'])
                model.addDeprecatedIndividual(
                    discontinued_gene_id, [gene_id],
                    old_id_category=blv.terms['Gene'])

            # also add the old symbol as a synonym of the new gene
            model.addSynonym(gene_id, discontinued_symbol)

            if not self.test_mode and \
                    (limit is not None and line_counter > limit):
                break
def _get_gene_history(self, limit):
    """
    Deprecate discontinued gene ids listed in the gene_history file.

    For each kept row the current and the discontinued gene are added
    (as classes or individuals, per ``self.class_or_indiv``), the
    discontinued one is deprecated in favour of the current one, and the
    discontinued symbol is recorded as a synonym of the current gene.

    :param limit: stop once more than this many rows were kept
        (ignored in test mode; None means no limit)
    :return: None
    """
    if self.testMode:
        g = self.testgraph
    else:
        g = self.graph
    model = Model(g)
    logger.info("Processing Gene records")

    history_file = '/'.join(
        (self.rawdir, self.files['gene_history']['file']))
    logger.info("FILE: %s", history_file)

    kept_rows = 0
    with gzip.open(history_file, 'rb') as stream:
        for encoded in stream:
            record = encoded.decode().strip()
            if record.startswith('#'):
                # skip comments
                continue

            fields = record.split('\t')
            (tax_num, gene_num, discontinued_num,
             discontinued_symbol, _discontinued_date) = fields

            missing_id = gene_num == '-' or discontinued_num == '-'
            if missing_id:
                continue
            if self.testMode and int(gene_num) not in self.gene_ids:
                continue
            if not self.testMode and int(tax_num) not in self.tax_ids:
                continue

            kept_rows += 1
            current_id = 'NCBIGene:' + gene_num
            retired_id = 'NCBIGene:' + discontinued_num
            is_class = self.class_or_indiv.get(current_id) == 'C'

            # add the two genes, then let the new id replace the old one
            if is_class:
                model.addClassToGraph(current_id, None)
                model.addClassToGraph(retired_id, discontinued_symbol)
                model.addDeprecatedClass(retired_id, [current_id])
            else:
                model.addIndividualToGraph(current_id, None)
                model.addIndividualToGraph(retired_id, discontinued_symbol)
                model.addDeprecatedIndividual(retired_id, [current_id])

            # also add the old symbol as a synonym of the new gene
            model.addSynonym(current_id, discontinued_symbol)

            over_limit = limit is not None and kept_rows > limit
            if over_limit and not self.testMode:
                break
def _process_genes(self, limit=None):
    """
    Add the genes of the tab-separated HGNC complete set to the graph.

    The header row is checked with ``self.check_fileheader``; the result
    is not acted upon. For each data row this deprecates withdrawn
    genes, skips rows whose symbol ends in '@', and otherwise adds the
    gene class (when its locus type resolves) and makes it the clique
    leader. It then adds NCBIGene / ENSEMBL / OMIM-gene equivalents, the
    taxon, "is about" publication triples and, when the location can be
    parsed, the chromosome band (or chromosome) the gene is a
    subsequence of.

    Only the columns actually used are read; all others are ignored.

    :param limit: stop once the reader's line number exceeds this value
        (ignored in test mode; None means no limit)
    """
    graph = self.testgraph if self.test_mode else self.graph
    geno = Genotype(graph)
    model = Model(graph)

    raw = '/'.join((self.rawdir, self.files['genes']['file']))
    col = self.files['genes']['columns']
    LOG.info("Processing HGNC genes")

    # NOTE(review): '$' inside the character class is a literal dollar
    # sign, not an end-of-string anchor -- confirm this is intended.
    chr_pattern = re.compile(r'(\d+|X|Y|Z|W|MT)[pq$]')
    band_pattern = re.compile(r'([pq][A-H\d]?\d?(?:\.\d+)?)')

    # column positions are fixed for the whole file; look them up once
    hgnc_idx = col.index('hgnc_id')
    symbol_idx = col.index('symbol')
    name_idx = col.index('name')
    locus_type_idx = col.index('locus_type')
    location_idx = col.index('location')
    entrez_idx = col.index('entrez_id')
    ensembl_idx = col.index('ensembl_gene_id')
    pubmed_idx = col.index('pubmed_id')
    omim_idx = col.index('omim_id')

    with open(raw, 'r', encoding="utf8") as csvfile:
        filereader = csv.reader(csvfile, delimiter='\t', quotechar='\"')

        row = next(filereader)
        # a header mismatch does not stop processing (the original
        # branch on the result was a no-op)
        self.check_fileheader(col, row)

        for row in filereader:
            hgnc_id = row[hgnc_idx].strip()
            symbol = row[symbol_idx].strip()
            name = row[name_idx].strip()
            locus_type = row[locus_type_idx].strip()
            location = row[location_idx].strip()
            entrez_id = row[entrez_idx].strip()
            ensembl_gene_id = row[ensembl_idx].strip()
            pubmed_ids = row[pubmed_idx].strip()  # pipe separated!
            omim_ids = row[omim_idx].strip()  # pipe separated!

            if self.test_mode and entrez_id != '' and \
                    entrez_id not in self.gene_ids:
                continue

            if name == '':
                name = None

            if locus_type == 'withdrawn':
                model.addDeprecatedClass(hgnc_id)
            elif symbol.endswith('@'):
                # 10) region (HOX), RNA cluster, gene (PCDH)
                # endswith() also copes with an empty symbol
                continue
            else:
                gene_type_id = self.resolve(locus_type, mandatory=False)
                if gene_type_id != locus_type:
                    model.addClassToGraph(
                        hgnc_id, symbol, gene_type_id, name)
                model.makeLeader(hgnc_id)

            if entrez_id != '':
                model.addEquivalentClass(hgnc_id, 'NCBIGene:' + entrez_id)

            if ensembl_gene_id != '':
                model.addEquivalentClass(
                    hgnc_id, 'ENSEMBL:' + ensembl_gene_id)

            for omim_id in omim_ids.split('|'):
                if omim_id in self.omim_replaced:
                    repl = self.omim_replaced[omim_id]
                    LOG.warning('%s is replaced with %s', omim_id, repl)
                    for omim in repl:
                        # a replacement without a known type is ignored
                        if omim in self.omim_type and \
                                self.omim_type[omim] == self.globaltt['gene']:
                            omim_id = omim

                if omim_id in self.omim_type and \
                        self.omim_type[omim_id] == self.globaltt['gene']:
                    model.addEquivalentClass(hgnc_id, 'OMIM:' + omim_id)

            geno.addTaxon(self.hs_txid, hgnc_id)

            # add pubs as "is about"
            # ''.split('|') yields [''], so blank ids must be skipped
            for pubmed_id in pubmed_ids.split('|'):
                pubmed_id = pubmed_id.strip()
                if pubmed_id != '':
                    graph.addTriple(
                        'PMID:' + pubmed_id, self.globaltt['is_about'],
                        hgnc_id)

            # add chr location
            # sometimes two are listed, like: 10p11.2 or 17q25
            # -- there are only 2 of these FRA10A and MPFD
            # sometimes listed like "1 not on reference assembly"
            # sometimes listed like 10q24.1-q24.3
            # sometimes like 11q11 alternate reference locus
            # Without a chromosome match there is nothing to attach;
            # previously this raised or reused an earlier row's chromosome.
            chr_match = chr_pattern.match(location)
            if chr_match is not None:
                chrom = chr_match.group(1)
                chrom_id = makeChromID(chrom, self.hs_txid, 'CHR')
                band_match = band_pattern.search(location)
                feat = Feature(graph, hgnc_id, None, None)
                if band_match is not None:
                    band = chrom + band_match.group(1)
                    # add the chr band as the parent to this gene
                    # as a feature but assume that the band is created
                    # as a class with properties elsewhere in Monochrom
                    band_id = makeChromID(band, self.hs_txid, 'CHR')
                    model.addClassToGraph(band_id, None)
                    feat.addSubsequenceOfFeature(band_id)
                else:
                    model.addClassToGraph(chrom_id, None)
                    feat.addSubsequenceOfFeature(chrom_id)

            if not self.test_mode and limit is not None and \
                    filereader.line_num > limit:
                break
def _process_genes(self, limit=None):
    """
    Read the HGNC genes file and add each gene row to the graph.

    For every data row (the first row is treated as a header and skipped)
    this adds the gene as a class (deprecated when its locus type is
    'withdrawn', otherwise marked as a leader), equivalences to
    NCBIGene / ENSEMBL / a single non-disease OMIM id, the human taxon,
    "is about" triples for each listed PubMed id, and a
    subsequence-of-feature link to the chromosome band (or to the whole
    chromosome when the location carries no band).

    In test mode, rows whose Entrez id is present but not in
    ``self.gene_ids`` are skipped and output goes to ``self.testgraph``.

    :param limit: when not in test mode, stop once the number of rows read
        (header included) exceeds this value; ``None`` means no limit.
    :return: None
    """
    if self.testMode:
        g = self.testgraph
    else:
        g = self.graph

    geno = Genotype(g)
    model = Model(g)
    raw = '/'.join((self.rawdir, self.files['genes']['file']))
    line_counter = 0
    logger.info("Processing HGNC genes")

    # Compiled once, outside the row loop.
    # A chromosome name must be followed by an arm letter or be the whole
    # location.  (The previous pattern used '[pq$]', where '$' is a literal
    # dollar sign, so bare chromosome locations could never match.)
    chr_pattern = re.compile(r'(\d+|X|Y|Z|W|MT)(?:[pq]|$)')
    band_pattern = re.compile(r'([pq][A-H\d]?\d?(?:\.\d+)?)')

    with open(raw, 'r', encoding="utf8") as csvfile:
        filereader = csv.reader(csvfile, delimiter='\t', quotechar='\"')

        # To list the column names with their positions:
        # curl -s ftp://ftp.ebi.ac.uk/pub/databases/genenames/new/tsv/hgnc_complete_set.txt | head -1 | tr '\t' '\n' | grep -n .

        for row in filereader:
            # The unpack doubles as a column-count check: a row that does
            # not have exactly these fields raises ValueError.
            (hgnc_id, symbol, name, locus_group, locus_type, status,
             location, location_sortable, alias_symbol, alias_name,
             prev_symbol, prev_name, gene_family, gene_family_id,
             date_approved_reserved, date_symbol_changed,
             date_name_changed, date_modified, entrez_id, ensembl_gene_id,
             vega_id, ucsc_id, ena, refseq_accession, ccds_id, uniprot_ids,
             pubmed_id, mgd_id, rgd_id, lsdb, cosmic, omim_id, mirbase,
             homeodb, snornabase, bioparadigms_slc, orphanet,
             pseudogene_org, horde_id, merops, imgt, iuphar,
             kznf_gene_catalog, mamit_trnadb, cd, lncrnadb, enzyme_id,
             intermediate_filament_db, rna_central_ids) = row

            line_counter += 1

            # skip header
            if line_counter <= 1:
                continue

            if self.testMode and entrez_id != '' \
                    and int(entrez_id) not in self.gene_ids:
                continue

            if name == '':
                name = None
            gene_type_id = self._get_gene_type(locus_type)
            model.addClassToGraph(hgnc_id, symbol, gene_type_id, name)
            if locus_type == 'withdrawn':
                model.addDeprecatedClass(hgnc_id)
            else:
                model.makeLeader(hgnc_id)

            if entrez_id != '':
                model.addEquivalentClass(hgnc_id, 'NCBIGene:' + entrez_id)
            if ensembl_gene_id != '':
                model.addEquivalentClass(
                    hgnc_id, 'ENSEMBL:' + ensembl_gene_id)
            # Only a single OMIM id is linked; pipe-separated lists are
            # skipped, as are ids that DipperUtil reports as diseases.
            if omim_id != '' and "|" not in omim_id:
                omim_curie = 'OMIM:' + omim_id
                if not DipperUtil.is_omim_disease(omim_curie):
                    model.addEquivalentClass(hgnc_id, omim_curie)

            geno.addTaxon('NCBITaxon:9606', hgnc_id)

            # add pubs as "is about"; ids are pipe separated, and empty or
            # whitespace-only entries are ignored
            for pmid in pubmed_id.split('|'):
                pmid = pmid.strip()
                if pmid:
                    g.addTriple(
                        'PMID:' + pmid,
                        model.object_properties['is_about'], hgnc_id)

            # add chr location
            # sometimes two are listed, like: 10p11.2 or 17q25
            # -- there are only 2 of these FRA10A and MPFD
            # sometimes listed like "1 not on reference assembly"
            # sometimes listed like 10q24.1-q24.3
            # sometimes like 11q11 alternate reference locus
            chr_match = chr_pattern.match(location)
            if chr_match is not None:
                chrom = chr_match.group(1)
                f = Feature(g, hgnc_id, None, None)
                band_match = band_pattern.search(location)
                if band_match is not None:
                    # add the chr band as the parent to this gene
                    # as a feature but assume that the band is created
                    # as a class with properties elsewhere in Monochrom
                    band_id = makeChromID(
                        chrom + band_match.group(1),
                        'NCBITaxon:9606', 'CHR')
                    model.addClassToGraph(band_id, None)
                    f.addSubsequenceOfFeature(band_id)
                else:
                    # no band given: attach the gene to the chromosome
                    chrom_id = makeChromID(chrom, 'NCBITaxon:9606', 'CHR')
                    model.addClassToGraph(chrom_id, None)
                    f.addSubsequenceOfFeature(chrom_id)

            if not self.testMode \
                    and limit is not None and line_counter > limit:
                break

    return
def _get_gene_history(self, limit):
    """
    Loops through the gene_history file and adds the old gene ids
    as deprecated classes, where the new gene id is the replacement for it.
    The old gene symbol is added as a synonym to the gene.

    Rows are skipped when either gene id is '-', when (in test mode) the
    current gene id is not in ``self.gene_ids``, or when (outside test
    mode) the taxon is not in ``self.tax_ids``.  Whether the pair is added
    as classes or as individuals depends on ``self.class_or_indiv``.

    :param limit: when not in test mode, stop once the number of accepted
        rows exceeds this value; ``None`` means no limit.
    :return: None
    """
    src_key = 'gene_history'
    if self.test_mode:
        graph = self.testgraph
    else:
        graph = self.graph
    model = Model(graph)
    LOG.info("Processing Gene records")
    line_counter = 0
    myfile = '/'.join((self.rawdir, self.files[src_key]['file']))
    LOG.info("FILE: %s", myfile)
    col = self.files[src_key]['columns']

    # Resolve column positions once instead of on every line.
    tax_idx = col.index('tax_id')
    gene_idx = col.index('GeneID')
    disc_gene_idx = col.index('Discontinued_GeneID')
    disc_symbol_idx = col.index('Discontinued_Symbol')

    with gzip.open(myfile, 'rb') as tsv:
        row = tsv.readline().decode().strip().split('\t')
        row[0] = row[0][1:]  # strip the leading comment character
        # The result is deliberately not acted on: a header mismatch
        # does not stop processing here.
        self.check_fileheader(col, row)

        for line in tsv:
            row = line.decode().strip().split('\t')
            # skip blank lines and comments
            if not row[0] or row[0].startswith('#'):
                continue

            tax_num = row[tax_idx].strip()
            gene_num = row[gene_idx].strip()
            discontinued_num = row[disc_gene_idx].strip()
            discontinued_symbol = row[disc_symbol_idx].strip()

            # '-' marks a missing id; there is nothing to link in that case
            if gene_num == '-' or discontinued_num == '-':
                continue
            if self.test_mode and gene_num not in self.gene_ids:
                continue
            if not self.test_mode and tax_num not in self.tax_ids:
                continue

            line_counter += 1
            gene_id = ':'.join(('NCBIGene', gene_num))
            discontinued_gene_id = ':'.join(('NCBIGene', discontinued_num))

            # add the two genes, with the new gene id replacing the old one
            if self.class_or_indiv.get(gene_id) == 'C':
                model.addClassToGraph(gene_id, None)
                model.addClassToGraph(
                    discontinued_gene_id, discontinued_symbol)
                model.addDeprecatedClass(discontinued_gene_id, [gene_id])
            else:
                model.addIndividualToGraph(gene_id, None)
                model.addIndividualToGraph(
                    discontinued_gene_id, discontinued_symbol)
                model.addDeprecatedIndividual(
                    discontinued_gene_id, [gene_id])

            # also add the old symbol as a synonym of the new gene
            model.addSynonym(gene_id, discontinued_symbol)

            if not self.test_mode and (
                    limit is not None and line_counter > limit):
                break