def load_ensembl2acc(self):
    """
    Load Ensembl gene -> transcript/protein accession data.

    Reads ``gene_ensembl__translation__main.txt`` and
    ``gene_ensembl__gene__main.txt`` from ``self.data_folder``, wraps every
    entry as ``{'ensembl': {...}}`` (gene id, optional ``transcript`` and
    ``protein`` values, paired ``translation`` entries and, when the gene
    file has the key, ``type_of_gene``) and returns the result of
    ``self.convert2entrez`` on that mapping.
    """
    translation_file = os.path.join(self.data_folder,
                                    'gene_ensembl__translation__main.txt')
    gene_file = os.path.join(self.data_folder,
                             'gene_ensembl__gene__main.txt')

    def _is_set(value):
        # '\\N' is treated as "no value", exactly like an empty field
        return bool(value) and value != '\\N'

    def _build(rows, gene_id):
        doc = {'gene': gene_id, 'translation': []}

        def _pair(rna, protein):
            # only record a translation when both ends are present
            if _is_set(rna) and _is_set(protein):
                doc['translation'].append({"rna": rna, "protein": protein})

        if not isinstance(rows, list):
            # single (transcript, protein) row: values are stored as-is
            rna, protein = rows[0], rows[1]
            if _is_set(rna):
                doc['transcript'] = rna
            if _is_set(protein):
                doc['protein'] = protein
            _pair(rna, protein)
            return doc

        # several rows for the same gene
        for row in rows:
            _pair(row[0], row[1])
        transcripts = [row[0] for row in rows if _is_set(row[0])]
        proteins = [row[1] for row in rows if _is_set(row[1])]
        if transcripts:
            doc['transcript'] = normalized_value(transcripts)
        if proteins:
            doc['protein'] = normalized_value(proteins)
        return doc

    ensembl2acc = tab2dict(translation_file, (1, 2, 3), 0, includefn=_not_LRG)
    typeofgene = tab2dict(gene_file, (1, 8), 0, includefn=_not_LRG)

    # values are replaced in place; the key set does not change while iterating
    for gene_id, rows in ensembl2acc.items():
        entry = _build(rows, gene_id)
        if gene_id in typeofgene:
            entry['type_of_gene'] = typeofgene[gene_id]
        ensembl2acc[gene_id] = {'ensembl': entry}

    return self.convert2entrez(ensembl2acc)
def loaddata(data_folder):
    """
    Load the GNF1H and GNF1M annotation tables found under
    ``<data_folder>/gnf``.

    Returns a dict ``{'GNF1H': ..., 'GNF1M': ...}`` whose values are the
    mappings produced by ``tab2dict`` for each file.
    """
    def _has_value(ld):
        # keep only rows that have a non-empty field at index 5
        return len(ld) > 5 and ld[5] != ''

    sources = (
        ('GNF1H', 'GNF1H.ANNO7.LOAD_20130402.tab'),
        ('GNF1M', 'gnf1m.NEW_ANNO6.LOAD_20130402.tab'),
    )
    loaded = {}
    for label, filename in sources:
        datafile = os.path.join(data_folder, 'gnf', filename)
        loaded[label] = tab2dict(datafile, (0, 5), 1, header=0,
                                 includefn=_has_value)
    return loaded
def load_data(self, data_folder):
    """
    Yield the mutual Ensembl <-> Entrez gene mappings, then a ``_meta`` doc.

    An (ensembl_id, entrez_id) pair is kept only when it appears in both
    ``gene_ensembl__xref_entrezgene__dm.txt`` (Ensembl => Entrez) and
    ``gene2ensembl.gz`` (Entrez => Ensembl), both read from ``data_folder``.

    Yields one document per Ensembl id::

        {"_id": "<ensembl>-<entrez,entrez,...>", "multiplicity": <int>,
         "ensembl": <ensembl>, "entrez": [<sorted entrez ids>]}

    followed by a final ``{"_id": "_meta", ...}`` document that records the
    two file paths and the versions parsed from the source's release string.

    Raises:
        KeyError: if the ``src_dump`` document for ``self.main_source`` is
            missing or has no ``download.release`` entry.
        ValueError: if the release string does not split into exactly two
            parts on ``":"``.
    """
    def _not_LRG(ld):
        # skip lines with LRG records
        return not ld[1].startswith("LRG_")

    # load mapping ensembl => entrez from Ensembl
    ens2ent_file = os.path.join(data_folder,
                                'gene_ensembl__xref_entrezgene__dm.txt')
    self.logger.info("Loading Ensembl-to-Entrez mapping file: %s" % ens2ent_file)
    ens2ent = tab2dict(ens2ent_file, (1, 2), 0,
                       includefn=_not_LRG, alwayslist=True)
    self.logger.info("# mapping Ensembl => Entrez: %s" % len(ens2ent))

    # load mapping entrez => ensembl from Entrez
    ent2ens_file = os.path.join(data_folder, 'gene2ensembl.gz')
    self.logger.info("Loading Entrez-to-Ensembl mapping file: %s" % ent2ens_file)
    ent2ens = tab2dict(ent2ens_file, (1, 2), 0, alwayslist=True)
    self.logger.info("# mapping Entrez => Ensembl: %s" % len(ent2ens))

    # mutual mapping: keep a pair only when both sources agree on it
    mapping = {}
    for ensembl_id, entrez_ids in ens2ent.items():
        for entrez_id in entrez_ids:
            if ensembl_id in ent2ens.get(entrez_id, []):
                mapping.setdefault(ensembl_id, set()).add(entrez_id)
    self.logger.info("%d mutual mappings found" % len(mapping))

    for ens, ents in mapping.items():
        sents = sorted(ents)
        yield {
            # build the id from the sorted list: joining the raw set would
            # give an order that can change from one run to the next
            "_id": "%s-%s" % (ens, ",".join(sents)),
            "multiplicity": len(sents),
            "ensembl": ens,
            "entrez": sents,
        }

    # last doc, sort of metadata
    src_doc = self.src_dump.find_one({"_id": self.main_source}) or {}
    release = src_doc["download"]["release"]
    ens_version, ent_version = release.split(":")
    yield {
        "_id": "_meta",
        "ensembl": {
            "file": ens2ent_file,
            "version": ens_version
        },
        "entrez": {
            "file": ent2ens_file,
            "version": ent_version
        },
    }
def load(self, aslist=False):
    '''
    Load the ncbi "homologene.data" file (``self.datafile``) and build the
    "homologene" field for gene docs.

    Each produced doc looks like
    ``{'_id': <geneid as str>, 'taxid': <int>,
       'homologene': {'id': <int>, 'genes': ...}}``
    where ``genes`` is what ``self._sorted_homologenes`` returns for the set
    of ``(tax_id, geneid)`` pairs sharing the same homologene id.

    Returns the list of docs when ``aslist`` is true, otherwise a dict of
    the docs keyed by ``_id``.
    '''
    from biothings.utils.hub_db import get_src_dump

    homo_d = tab2dict(self.datafile, (2, 1), 0, header=0)

    entrez_doc = get_src_dump().find_one({"_id": "entrez"}) or {}
    entrez_dir = entrez_doc.get("data_folder")
    assert entrez_dir, "Can't find Entez data directory"
    history_file = os.path.join(entrez_dir, 'gene_history.gz')
    assert os.path.exists(history_file), \
        "gene_history.gz is missing (entrez_dir: %s)" % entrez_dir

    retired2gene = tab2dict(history_file, (1, 2), 1, alwayslist=0,
                            includefn=lambda ld: ld[1] != '-')
    # also register each entry under the id that gene_history maps it to
    for old_id in list(homo_d):
        homo_d[retired2gene.get(old_id, old_id)] = homo_d[old_id]

    homologene_d = {}
    doc_li = []
    with open(self.datafile) as df:
        print()
        geneid_d = get_geneid_d(entrez_dir, self.species_li,
                                load_cache=False, save_cache=False,
                                only_for=homo_d)
        for line in df:
            fields = line.strip().split('\t')
            hm_id, tax_id, geneid = [int(value) for value in fields[:3]]
            # for selected species only
            if self.taxid_set is not None and tax_id not in self.taxid_set:
                continue
            # and also ignore those geneid does not match any existing
            # gene doc
            if geneid not in geneid_d:
                continue
            # in case of original geneid is retired, replaced with the new
            # one, if available.
            geneid = geneid_d[geneid]
            homologene_d.setdefault(hm_id, []).append((tax_id, geneid))
            doc_li.append({
                '_id': str(geneid),
                'taxid': tax_id,
                'homologene': {'id': hm_id},
            })

    for gdoc in doc_li:
        group = homologene_d[gdoc['homologene']['id']]
        gdoc['homologene']['genes'] = self._sorted_homologenes(set(group))

    if aslist:
        return doc_li
    return {gdoc['_id']: gdoc for gdoc in doc_li}
def load_broadinstitute_exac_any(one_file, key):
    """
    Load one ExAC table into a dict of ``{"exac": {...}}`` documents.

    The outer dict is keyed by the transcript id with its version suffix
    (everything after the first ``.``) removed. Each document holds the
    full transcript id, four integer fields, and a sub-dict stored under
    ``key`` that holds fifteen float fields.
    """
    logging.info("Loading file %s (%s)" % (one_file, key))

    # column 0 plus columns 3..21
    columns = (0,) + tuple(range(3, 22))
    data = tab2dict(one_file, columns, 0)

    int_fields = ("n_exons", "cds_start", "cds_end", "bp")
    float_fields = (
        "mu_syn", "mu_mis", "mu_lof",
        "n_syn", "n_mis", "n_lof",
        "exp_syn", "exp_mis", "exp_lof",
        "syn_z", "mis_z", "lof_z",
        "p_li", "p_rec", "p_null",
    )
    float_offset = len(int_fields)

    exacs = {}
    for transcript, row in data.items():
        # but keep version here
        record = {"transcript": transcript}
        for pos, name in enumerate(int_fields):
            record[name] = int(row[pos])
        stats = {}
        for pos, name in enumerate(float_fields, float_offset):
            stats[name] = float(row[pos])
        record[key] = stats
        # remove version in key so we can search the dict easily later
        exacs[transcript.split(".")[0]] = {"exac": record}
    return exacs
def _load_affy(df):
    """
    Load an Affymetrix annotation CSV and return a gene id -> value mapping.

    ``df`` is the path of the annotation file; for a ``.zip`` file the
    helper is given ``(path, name_without_extension)`` instead. Rows are
    kept when field 7 exists and is neither ``'---'`` nor the header text
    ``'gene_assignment'``.
    """
    filename = os.path.basename(df)
    rawfile, ext = os.path.splitext(filename)
    if ext.lower() == '.zip':
        df = (df, rawfile)

    def _keep(ld):
        return len(ld) > 7 and ld[7] not in ('---', 'gene_assignment')

    dd = tab2dict(df, (0, 7), 1, sep=',', header=1, includefn=_keep)

    # fix for keys like "472 /// 4863" for multiple geneids
    gene2affy = {}
    for raw_key, value in dd.items():
        assignments = raw_key.split('///')
        lone = len(assignments) == 1
        for assignment in assignments:
            parts = assignment.split('//')
            # a lone assignment without any '//' separator is ignored
            if lone and len(parts) < 2:
                continue
            gene_id = parts[-1].strip()
            if gene_id != '---':
                dict_apply(gene2affy, gene_id, value)
    return gene2affy
def _load_ensembl_2taxid(self):
    """ensembl2taxid: mapping read from gene_ensembl__translation__main.txt,
    with the values converted to int."""
    datafile = os.path.join(self.data_folder,
                            'gene_ensembl__translation__main.txt')
    raw = tab2dict(datafile, (0, 1), 1, includefn=_not_LRG)
    deduped = dict_nodup(raw)
    # need to convert taxid to integer here
    return value_convert(deduped, int)
def load(self, aslist=False):
    """
    Generator yielding ``self.format(doc)`` for each item streamed from
    ``self.datafile``, restricted to rows whose first field (as int) is a
    key of the gene id mapping returned by ``get_geneid_d``.

    ``aslist`` is accepted but not used by this implementation.
    """
    uni_d = tab2dict(self.datafile, (0, 1), 0, alwayslist=0)

    history_file = os.path.join(self.data_folder, 'gene_history.gz')
    retired2gene = tab2dict(history_file, (1, 2), 1, alwayslist=0,
                            includefn=lambda ld: ld[1] != '-')
    # also register each entry under the id that gene_history maps it to
    for gene_id in list(uni_d):
        uni_d[retired2gene.get(gene_id, gene_id)] = uni_d[gene_id]

    geneid_d = get_geneid_d(self.data_folder, self.species_li,
                            load_cache=False, save_cache=False,
                            only_for=uni_d)

    def _known_gene(ld):
        return int(ld[0]) in geneid_d

    for doc in tab2dict_iter(self.datafile, (0, 1), 0, alwayslist=0,
                             includefn=_known_gene):
        yield self.format(doc)
def _load_ensembl2entrez_li(self):
    """gene_ensembl__xref_entrezgene__dm

    Populate ``self.ensembl2entrez_li`` with ``(ensembl_gid, entrez_gid)``
    tuples. Entries from the custom mapping file
    ``gene_ensembl__gene__extra.txt`` replace those read from the Ensembl
    xref file; the custom file is generated first when it does not exist.
    """
    custom_file = os.path.join(self.data_folder,
                               'gene_ensembl__gene__extra.txt')
    if not os.path.exists(custom_file):
        print("Missing extra mapping file, now generating")
        from . import ensembl_ncbi_mapping
        ensembl_ncbi_mapping.main(confirm=False)
    extra = tab2dict(custom_file, (0, 1), 0, alwayslist=True)

    xref_file = os.path.join(self.data_folder,
                             'gene_ensembl__xref_entrezgene__dm.txt')
    ensembl2entrez = tab2dict(xref_file, (1, 2), 0,
                              includefn=_not_LRG, alwayslist=True)

    # replace with our custom mapping
    ensembl2entrez.update(extra)

    # back to list of tuples: [(ensembl_gid, entrez_gid),...]
    self.ensembl2entrez_li = [
        (ensembl_id, entrez_id)
        for ensembl_id, entrez_ids in ensembl2entrez.items()
        for entrez_id in entrez_ids
    ]
def _load_ensembl2entrez_li(self, src_name):
    """gene_ensembl__xref_entrezgene__dm

    Populate ``self.ensembl2entrez_li`` with ``(ensembl_gid, entrez_gid)``
    tuples. Entries from the custom mapping file
    ``gene_ensembl__gene__extra.txt`` replace those read from the Ensembl
    xref file.

    The custom file is generated with
    ``ensembl_ncbi_mapping.main(src_name, confirm=False)`` when it is
    missing or empty. The check and the generation both run while holding
    the module-level ``extra_mapping_lock``.
    """
    CUSTOM_MAPPING_FILE = os.path.join(self.data_folder,
                                       'gene_ensembl__gene__extra.txt')

    print("Trying to acquire extra mapping lock")
    # Acquire *before* entering the try block: if acquire() itself fails,
    # the finally clause must not release a lock that was never taken.
    extra_mapping_lock.acquire()
    try:
        print("Lock acquired")
        if not os.path.exists(CUSTOM_MAPPING_FILE) or \
                os.stat(CUSTOM_MAPPING_FILE).st_size == 0:
            print("Missing extra mapping file, now generating")
            from . import ensembl_ncbi_mapping
            ensembl_ncbi_mapping.main(src_name, confirm=False)
    finally:
        print("Releasing lock")
        extra_mapping_lock.release()

    extra = tab2dict(CUSTOM_MAPPING_FILE, (0, 1), 0, alwayslist=True)
    datafile = os.path.join(self.data_folder,
                            'gene_ensembl__xref_entrezgene__dm.txt')
    ensembl2entrez = tab2dict(datafile, (1, 2), 0,
                              includefn=_not_LRG, alwayslist=True)

    # replace with our custom mapping
    ensembl2entrez.update(extra)

    # back to list of tuples: [(ensembl_gid, entrez_gid),...]
    self.ensembl2entrez_li = [
        (ensembl_id, entrez_id)
        for ensembl_id, entrez_ids in ensembl2entrez.items()
        for entrez_id in entrez_ids
    ]
def load_exons_for_species(data_folder, species, exons_key='exons'):
    """
    Build a ``geneid -> {exons_key: [transcript records]}`` mapping.

    Transcript records are read from
    ``<data_folder>/<species>/database/refFlat.txt.gz``; each one carries
    the transcript id, chromosome (a leading ``chr`` is removed), strand
    (``-1`` for ``'-'``, else ``1``), tx/cds bounds and a list of
    ``(start, end)`` exon positions. Transcripts are attached to genes via
    ``<data_folder>/../hgFixed/database/refLink.txt.gz``; transcripts with
    no gene id, or gene id ``'0'``, are dropped.

    Raises AssertionError when the number of exon pairs on a row differs
    from the count given in field 8.
    """
    refflat_file = os.path.join(data_folder, species,
                                'database/refFlat.txt.gz')
    ref2exons = {}
    for ld in tabfile_feeder(refflat_file, header=0):
        refseq = ld[1]
        chrom = ld[2]
        if chrom.startswith('chr'):
            chrom = chrom[3:]
        # empty pieces (e.g. from a trailing comma) are skipped
        starts = [int(pos) for pos in ld[9].split(',') if pos]
        ends = [int(pos) for pos in ld[10].split(',') if pos]
        exons = list(zip(starts, ends))
        assert len(exons) == int(ld[8]), (len(exons), int(ld[8]))
        record = {
            'transcript': refseq,
            'chr': chrom,
            'strand': -1 if ld[3] == '-' else 1,
            'txstart': int(ld[4]),
            'txend': int(ld[5]),
            'cdsstart': int(ld[6]),
            'cdsend': int(ld[7]),
            'position': exons,
        }
        ref2exons.setdefault(refseq, []).append(record)

    reflink_file = os.path.join(data_folder,
                                '../hgFixed/database/refLink.txt.gz')
    refseq2gene = tab2dict(reflink_file, (2, 6), 0, alwayslist=False)

    gene2exons = {}
    for refseq in sorted(ref2exons):
        geneid = refseq2gene.get(refseq, None)
        if not geneid or geneid == '0':
            continue
        bucket = gene2exons.get(geneid)
        if bucket is None:
            gene2exons[geneid] = {exons_key: ref2exons[refseq]}
        else:
            bucket[exons_key].extend(ref2exons[refseq])
    return gene2exons
def _load_ensembl2name(self):
    """loading ensembl gene to symbol+name mapping

    Values are dicts with an optional ``symbol`` and an optional ``name``;
    the name is what ``SubStr`` extracts in front of ``' [Source:'``.
    """
    datafile = os.path.join(self.data_folder,
                            'gene_ensembl__gene__main.txt')
    missing = ('', '\\N')  # field values treated as "no value"

    def _to_doc(row):
        doc = {}
        symbol = row[0].strip()
        if symbol not in missing:
            doc['symbol'] = symbol
        description = row[1].strip()
        if description not in missing:
            name = SubStr(description, '', ' [Source:').strip()
            if name:
                doc['name'] = name
        return doc

    raw = tab2dict(datafile, (1, 2, 7), 0, includefn=_not_LRG)
    return value_convert(raw, _to_doc)
def load_ensembl2pos(self):
    """
    Generator yielding genomic position docs from
    ``gene_ensembl__gene__main.txt``.

    The file is streamed in batches with ``tab2dict_iter``. Each value is
    turned into
    ``{'genomic_pos': {'ensemblgene', 'chr', 'start', 'end', 'strand'},
       '__aslistofdict__': 'genomic_pos'}``
    and the batch is passed through ``map_id`` with
    ``self.ensembl2entrez``; the resulting docs are yielded one by one.

    The previous version also loaded and converted the whole file into an
    ``ensembl2pos`` dict that was never used; that redundant pass is gone.
    """
    datafile = os.path.join(self.data_folder,
                            'gene_ensembl__gene__main.txt')

    def _to_pos(x):
        return {
            'ensemblgene': x[0],
            'chr': x[3],
            'start': int(x[1]),
            'end': int(x[2]),
            'strand': int(x[4]),
        }

    def _wrap(pos):
        return {'genomic_pos': pos, '__aslistofdict__': 'genomic_pos'}

    # Twice 1 because first is the dict key, the second because we need
    # gene id within genomic_pos
    for datadict in tab2dict_iter(datafile, (1, 1, 3, 4, 5, 6), 0,
                                  includefn=_not_LRG):
        datadict = dict_nodup(datadict)
        datadict = value_convert(datadict, _to_pos)
        datadict = value_convert(datadict, _wrap, traverse_list=False)
        for doc in map_id(datadict, self.ensembl2entrez):
            yield doc
def _load_affy(df):
    """
    Load an Affymetrix annotation CSV and return a gene id -> value mapping.

    ``df`` is the path of the annotation file; for a ``.zip`` file the
    helper is given ``(path, name_without_extension)`` instead. Rows are
    kept when field 18 exists and is neither ``'---'`` nor the header text
    ``'Entrez Gene'``.
    """
    filename = os.path.basename(df)
    rawfile, ext = os.path.splitext(filename)
    if ext.lower() == '.zip':
        df = (df, rawfile)

    def _keep(ld):
        return len(ld) > 18 and ld[18] not in ('---', 'Entrez Gene')

    dd = tab2dict(df, (0, 18), 1, sep=',', header=1, includefn=_keep)

    # fix for keys like "472 /// 4863" for multiple geneids
    # (a key without the separator splits into just itself)
    gene2affy = {}
    for raw_key, value in dd.items():
        for gene_id in raw_key.split(' /// '):
            dict_apply(gene2affy, gene_id.strip(), value)
    return gene2affy
def load(self, aslist=False):
    """
    Build ``{gene_id: {'retired': ...}}`` from ``self.datafile``.

    Rows whose field 1 is ``'-'`` are skipped; when ``self.species_li`` is
    set, rows are additionally restricted to tax ids in ``self.taxid_set``.
    Retired ids are converted to int and passed through
    ``normalized_value``.

    Returns the dict itself, or ``dict_to_list`` of it when ``aslist`` is
    true.
    """
    if self.species_li:
        def _includefn(ld):
            return int(ld[0]) in self.taxid_set and ld[1] != '-'
    else:
        def _includefn(ld):
            return ld[1] != '-'

    def _as_ints(values):
        return normalized_value([int(value) for value in values])

    gene2retired = tab2dict(self.datafile, (1, 2), 0, alwayslist=1,
                            includefn=_includefn)
    gene2retired = dict_convert(gene2retired, valuefn=_as_ints)

    gene_d = {gid: {'retired': retired}
              for gid, retired in gene2retired.items()}
    return dict_to_list(gene_d) if aslist else gene_d
def get_geneid_d(data_folder, species_li=None, load_cache=True,
                 save_cache=True, only_for=None):
    '''return a dictionary of current/retired geneid to current geneid mapping.
    This is useful, when other annotations were mapped to geneids may
    contain retired gene ids.

    if species_li is None, genes from all species are loaded.
    if only_for is given and non-empty, only gene ids contained in it are
    loaded from gene_info.gz; an empty/None value means "no restriction".

    Note that all ids are int type.

    Files are read from (and the cache is written to) data_folder:
    gene_info.gz, gene_history.gz and the cache file geneid_d.pyobj.

    Raises AssertionError when a loaded cache was built for a different
    set of tax ids than the one requested.
    '''
    if only_for is None:
        only_for = {}
    if species_li:
        taxid_set = {TAXONOMY[species]["tax_id"] for species in species_li}
    else:
        taxid_set = None

    # Use explicit paths instead of os.chdir(): changing the process-wide
    # working directory was never undone on error, and it broke the
    # os.path.join(data_folder, ...) paths when data_folder was relative.
    cache_file = os.path.join(data_folder, 'geneid_d.pyobj')
    gene_info_file = os.path.join(data_folder, 'gene_info.gz')
    gene_history_file = os.path.join(data_folder, 'gene_history.gz')

    # check cache file
    # NOTE(review): the cache is validated against taxid_set only; only_for
    # is not taken into account when a cache is loaded.
    if load_cache and os.path.exists(cache_file) and \
       file_newer(cache_file, gene_info_file) and \
       file_newer(cache_file, gene_history_file):
        _taxid_set, out_d = loadobj(cache_file)
        assert _taxid_set == taxid_set
        return out_d

    if species_li:
        # an empty only_for must not filter everything out
        species_filter = lambda ld: int(ld[0]) in taxid_set and (
            not only_for or ld[1] in only_for)
    elif only_for:
        species_filter = lambda ld: ld[1] in only_for
    else:
        species_filter = None
    geneid_li = set(tab2list(gene_info_file, 1, includefn=species_filter))

    if species_li:
        _includefn = lambda ld: int(ld[0]) in taxid_set and ld[1] in geneid_li
    else:
        _includefn = lambda ld: ld[1] in geneid_li    # include all species
    retired2gene = tab2dict(gene_history_file, (1, 2), 1, alwayslist=0,
                            includefn=_includefn)
    # includefn above makes sure taxid is for species_li and filters out those
    # mapped_to geneid exists in gene_info list

    # convert key/value to int
    out_d = dict_convert(retired2gene, keyfn=int, valuefn=int)
    # TODO: this fills memory with key==value ...
    for g in geneid_li:
        _g = int(g)
        out_d[_g] = _g

    if save_cache:
        # taxid_set is None when no species restriction was requested
        dump((taxid_set, out_d), cache_file)

    return out_d
def load_pharmgkb(data_folder):
    """
    Load ``genes.tsv`` from ``<data_folder>/genes.zip`` and return a
    mapping whose values are wrapped as ``{'pharmgkb': value}``.

    Rows with an empty field 1 are skipped.
    """
    archive = os.path.join(data_folder, 'genes.zip')

    def _has_gene(ld):
        return ld[1] != ''

    def _wrap(value):
        return {'pharmgkb': value}

    gene2pharmgkb = tab2dict((archive, 'genes.tsv'), (0, 1), 1,
                             header=1, includefn=_has_gene)
    return value_convert(gene2pharmgkb, _wrap, traverse_list=False)
def loaddata(data_folder):
    """
    Load the Snowball array annotation from
    ``<data_folder>/pigatlas/snowball_array_annotation.txt``.

    Returns ``{'snowball': <mapping produced by tab2dict>}``.
    """
    annotation_file = os.path.join(data_folder, 'pigatlas',
                                   'snowball_array_annotation.txt')
    return {'snowball': tab2dict(annotation_file, (0, 1), 1, header=0)}