def load_annotations(data_folder):
    """Yield gene-centric documents built from PharmGKB's var_drug_ann.tsv.

    Each yielded doc has the shape
    ``{"_id": <gene accession>, "annotations": [<cleaned records>]}``.
    Records without gene information, or whose Gene field cannot be
    parsed, are skipped with a warning.
    """
    infile = os.path.join(data_folder, "var_drug_ann.tsv")
    assert os.path.exists(infile)
    dat = pandas.read_csv(infile, sep="\t", squeeze=True,
                          quoting=csv.QUOTE_NONE).to_dict(orient='records')
    results = {}
    for rec in dat:
        if not rec["Gene"] or pandas.isna(rec["Gene"]):
            logging.warning("No gene information for annotation ID '%s'",
                            rec["Annotation ID"])
            continue
        # Gene looks like "SYMBOL (ACCESSION)"; use a raw string so "\(" is a
        # valid escape, and skip (instead of crashing with AttributeError on
        # None) when the field doesn't match the expected layout.
        match = re.match(r".* \((.*?)\)", rec["Gene"])
        if not match:
            logging.warning("Can't parse gene field '%s' for annotation ID '%s'",
                            rec["Gene"], rec["Annotation ID"])
            continue
        _id = match.groups()[0]
        # we'll remove spaces in keys to make queries easier. Also, lowercase
        # is preferred for a BioThings API. We use a helper function from the
        # BioThings SDK.
        process_key = lambda k: k.replace(" ", "_").lower()
        rec = dict_convert(rec, keyfn=process_key)
        # remove NaN values, not indexable
        rec = dict_sweep(rec, vals=[np.nan])
        results.setdefault(_id, []).append(rec)
    for _id, docs in results.items():
        doc = {"_id": _id, "annotations": docs}
        yield doc
def load(self):
    """Yield one generif document per gene id from ``self.datafile``.

    Column 1 of the tab file is the gene id; columns 2 and 4 are the pubmed
    id and the generif text. Pubmed ids are normalized through
    ``self._cvt_pubmed``. (The original kept an unused ``cnt`` counter; it
    was dead code and has been removed.)
    """
    for datadict in tab2dict_iter(self.datafile, (1, 2, 4), 0, alwayslist=1):
        datadict = dict_convert(
            datadict,
            valuefn=lambda v: {
                'generif': [dict(pubmed=self._cvt_pubmed(x[0]), text=x[1])
                            for x in v]})
        for id, doc in datadict.items():
            doc['_id'] = id
            yield doc
def load_GnomadConstraints(data_folder):
    """Yield ``{"_id": <symbol>, "GnomadConstraints": [...]}`` documents
    from the gnomAD v2.1.1 constraints table.

    NOTE(review): ``data_folder`` is ignored; the input path is hard-coded.
    """
    infile = os.path.abspath(
        "/opt/biothings/GRCh37/gnomAD_constraints/v2.1.1/GnomadConstraints.tsv")
    assert os.path.exists(infile)
    frame = pandas.read_csv(infile, sep="\t", squeeze=True,
                            quoting=csv.QUOTE_NONE)
    records = frame.drop(columns=["brain_expression"]).to_dict(orient='records')
    # lowercase, underscore-separated keys are preferred for a BioThings API
    normalize = lambda key: key.replace(" ", "_").lower()
    by_symbol = {}
    for record in records:
        symbol = record["symbol"]
        # normalize keys, then drop NaN values (not indexable)
        record = dict_sweep(dict_convert(record, keyfn=normalize),
                            vals=[np.nan])
        by_symbol.setdefault(symbol, []).append(record)
    for symbol, docs in by_symbol.items():
        yield {"_id": symbol, "GnomadConstraints": docs}
def load_drugs(data_folder):
    """Yield ``{"_id": <PharmGKB accession>, "drugs": [...]}`` documents
    built from PharmGKB's drugs.tsv.
    """
    infile = os.path.join(data_folder, "drugs.tsv")
    assert os.path.exists(infile)
    records = pandas.read_csv(infile, sep="\t", squeeze=True,
                              quoting=csv.QUOTE_NONE).to_dict(orient='records')
    grouped = {}
    for record in records:
        accession = record["PharmGKB Accession Id"]
        grouped.setdefault(accession, []).append(
            dict_convert(record, keyfn=process_key2))
    for accession, docs in grouped.items():
        yield {"_id": accession, "drugs": docs}
def load_HGNC(data_folder):
    """Yield ``{"_id": <hgnc_id>, "HGNC": [...]}`` documents from the HGNC
    table, dropping a fixed set of external-database columns.

    NOTE(review): ``data_folder`` is ignored; the input path is hard-coded.
    """
    infile = os.path.abspath("/opt/biothings/GRCh37/hgnc/latest/Hgnc.tsv")
    assert os.path.exists(infile)
    frame = pandas.read_csv(infile, sep="\t", squeeze=True,
                            quoting=csv.QUOTE_NONE)
    # external-db cross-reference columns we don't index
    dropped = ["snornabase", "mirbase", "homeodb", "orphanet", "horde_id",
               "kznf_gene_catalog", "mamit-trnadb", "rna_central_ids",
               "imgt", "gtrnadb"]
    records = frame.drop(columns=dropped).to_dict(orient='records')
    normalize = lambda key: key.replace(" ", "_").lower()
    grouped = {}
    for record in records:
        hgnc_id = record["hgnc_id"]
        # normalize keys, then drop NaN values (not indexable)
        record = dict_sweep(dict_convert(record, keyfn=normalize),
                            vals=[np.nan])
        grouped.setdefault(hgnc_id, []).append(record)
    for hgnc_id, docs in grouped.items():
        yield {"_id": hgnc_id, "HGNC": docs}
def load(self, aslist=False):
    """Yield ``{"_id": <gene id>, "go": {...}}`` documents from gene2go.

    The ``go`` value maps GO aspect short names (MF/BP/CC) to annotation
    records (a single dict when only one record exists, a list otherwise).
    ``aslist`` is accepted for signature compatibility but unused here.
    """
    gene2go = tab2dict_iter(self.datafile, (1, 2, 3, 4, 5, 6, 7), 0,
                            alwayslist=1, includefn=self.species_filter)
    # map NCBI's long aspect names to the short MF/BP/CC codes
    category_d = {'Function': 'MF', 'Process': 'BP', 'Component': 'CC'}

    def _ff(d):
        out = {}
        for goid, evidence, qualifier, goterm, pubmed, gocategory in d:
            _gocategory = category_d[gocategory]
            _d = out.get(_gocategory, [])
            _rec = dict(id=goid, term=goterm)
            # BUGFIX: the 'Function' branch used to store this under the key
            # 'category' while 'Process'/'Component' used 'gocategory';
            # use the 'gocategory' key consistently for all three aspects
            # (the if/elif chain duplicated category_d and is gone).
            _rec['gocategory'] = _gocategory
            if evidence != '-':
                _rec['evidence'] = evidence
            if qualifier != '-':
                # here I also fixing some inconsistency issues in NCBI data
                # Colocalizes_with -> colocalizes_with
                # Contributes_with -> contributes_with
                # Not -> NOT
                _rec['qualifier'] = qualifier.replace('Co', 'co').replace(
                    'Not', 'NOT')
            if pubmed != '-':
                # multiple pubmed ids are '|'-separated
                if pubmed.find('|') != -1:
                    pubmed = [int(pid) for pid in pubmed.split('|')]
                else:
                    pubmed = int(pubmed)
                _rec['pubmed'] = pubmed
            _d.append(_rec)
            out[_gocategory] = _d
        # single-record categories are unlisted to a plain dict
        for k in out:
            if len(out[k]) == 1:
                out[k] = out[k][0]
        return out

    for gd in gene2go:
        convd = dict_convert(gd, valuefn=_ff)
        assert len(list(
            convd.items())) == 1, "nope: %s" % list(convd.items())
        gid, go = list(convd.items())[0]
        gene_d = {"_id": gid, "go": go}
        yield gene_d
def load_occurrences(data_folder):
    """Yield ``{"_id": <gene object id>, "occurrences": [...]}`` documents
    from occurrences.tsv, keeping only rows whose "Object Type" is "Gene".
    """
    infile = os.path.join(data_folder, "occurrences.tsv")
    assert os.path.exists(infile)
    records = pandas.read_csv(infile, sep="\t", squeeze=True,
                              quoting=csv.QUOTE_NONE).to_dict(orient='records')
    per_gene = {}
    for record in records:
        if record["Object Type"] != "Gene":
            continue
        gene_id = record["Object ID"]
        per_gene.setdefault(gene_id, []).append(
            dict_convert(record, keyfn=process_key))
    for gene_id, docs in per_gene.items():
        yield {"_id": gene_id, "occurrences": docs}
def load_druglabels(data_folder):
    """Yield ``{"_id": <gene id>, "drug_labels": [{"id", "name"}, ...]}``
    documents from PharmGKB's drugLabels.byGene.tsv.

    "Label IDs" and "Label Names" are parallel ';'-separated lists; an
    assertion guards against length mismatch.
    """
    infile = os.path.join(data_folder, "drugLabels.byGene.tsv")
    assert os.path.exists(infile)
    # squeeze=True was dropped: it was a no-op on this multi-column file and
    # the parameter was removed in pandas 2.0
    dat = pandas.read_csv(infile, sep="\t",
                          quoting=csv.QUOTE_NONE).to_dict(orient='records')
    for rec in dat:
        label_ids = rec.pop("Label IDs").split(";")
        label_names = rec.pop("Label Names").split(";")
        assert len(label_ids) == len(label_names)
        labels = [{"id": lid, "name": lname}
                  for lid, lname in zip(label_ids, label_names)]
        # NOTE: the original also ran rec through dict_convert() here, but
        # the converted dict was never used; the dead statement was removed.
        _id = rec["Gene ID"]
        doc = {"_id": _id, "drug_labels": labels}
        yield doc
def load_drug_annotations(data_folder):
    """Yield ``{"_id": <drug identifier>, "drug_annotations": {...}}``
    documents from GDSC_Drug_anno.csv.

    Each annotation carries the drug name and its targeted process/pathway.
    """
    infile = os.path.join(data_folder, 'GDSC_Drug_anno.csv')
    assert os.path.exists(infile)
    # squeeze=True was dropped: it was a no-op on this multi-column file and
    # the parameter was removed in pandas 2.0
    dat = pandas.read_csv(infile, sep=',',
                          quoting=csv.QUOTE_NONE).to_dict(orient='records')
    for rec in dat:
        drug_name = rec.pop('Drug_Name')
        drug_target_pathway = rec.pop('Drug_Targeted_process_or_pathway')
        drug_annotations = {
            'drug_name': drug_name,
            'target_pathway': drug_target_pathway
        }
        _id = rec['Drug_identifier']
        # NOTE: the original also ran rec through dict_convert() here, but
        # the converted dict was never used; the dead statement was removed.
        doc = {'_id': _id, 'drug_annotations': drug_annotations}
        yield doc
def load_KEGG(data_folder):
    """Yield ``{"_id": <gene id>, "KEGG": [...]}`` documents.

    Joins the EnsemblToKegg id table with the KeggInfo annotation table on
    kegg_id/kegginfo_id.
    NOTE(review): ``data_folder`` is ignored; the input paths are hard-coded.
    """
    infileInfo = os.path.abspath(
        "/opt/biothings/GRCh37/kegg/april2011/KeggInfo.tsv")
    infileID = os.path.abspath(
        "/opt/biothings/GRCh37/kegg/april2011/EnsemblToKegg.tsv")
    assert os.path.exists(infileInfo)
    assert os.path.exists(infileID)
    info_frame = pandas.read_csv(infileInfo, sep="\t", squeeze=True,
                                 quoting=csv.QUOTE_NONE)
    id_frame = pandas.read_csv(infileID, sep="\t", squeeze=True,
                               quoting=csv.QUOTE_NONE)
    joined = id_frame.join(info_frame.set_index('kegg_id'),
                           on='kegginfo_id').to_dict(orient='records')
    normalize = lambda key: key.replace(" ", "_").lower()
    per_gene = {}
    for record in joined:
        gene_id = record["gene_id"]
        # normalize keys, then drop NaN values (not indexable)
        record = dict_sweep(dict_convert(record, keyfn=normalize),
                            vals=[np.nan])
        per_gene.setdefault(gene_id, []).append(record)
    for gene_id, docs in per_gene.items():
        yield {"_id": gene_id, "KEGG": docs}
def load_Exac(data_folder):
    """Yield ``{"_id": <release_chr_pos_ref_alt>, "Exac": [...]}`` documents
    from the ExAC r1 table.

    NOTE(review): ``data_folder`` is ignored; the input path is hard-coded.
    """
    infile = os.path.abspath("/opt/biothings/GRCh37/ExAC/r1/Exac.tsv")
    assert os.path.exists(infile)
    records = pandas.read_csv(infile, sep="\t", squeeze=True,
                              quoting=csv.QUOTE_NONE).to_dict(orient='records')
    normalize = lambda key: key.replace(" ", "_").lower()
    grouped = {}
    for record in records:
        # document id encodes the variant coordinates
        variant_id = "_".join([
            record["release"],
            str(record["chromosome"]),
            str(record["position"]),
            record["reference"],
            record["alternative"],
        ])
        # normalize keys, then drop NaN values (not indexable)
        record = dict_sweep(dict_convert(record, keyfn=normalize),
                            vals=[np.nan])
        grouped.setdefault(variant_id, []).append(record)
    for variant_id, docs in grouped.items():
        yield {"_id": variant_id, "Exac": docs}
def load(self, aslist=False):
    """Yield per-gene accession documents built from gene2accession data.

    Columns (1, 3, 5, 7) of ``self.datafile`` are the gene id plus the RNA,
    protein and genomic accessions; ``'-'`` marks a missing value. Each
    yielded document is ``self.format(...)`` applied to
    ``{gene_id: {self.fieldname: {...}}}``.

    NOTE(review): because this function contains ``yield`` it is a
    generator — the trailing ``return`` statements only set the (normally
    ignored) ``StopIteration`` value, so ``aslist`` has no practical effect,
    and ``cnt`` is never read. Left byte-identical to avoid behavior risk.
    """
    gene2acc = tab2dict_iter(self.datafile, (1, 3, 5, 7), 0, alwayslist=1,
                             includefn=self.species_filter)

    def _ff(d):
        # Bucket the accessions of one gene by type, dropping '-' placeholders.
        out = {'rna': [], 'protein': [], 'genomic': [], 'translation': []}
        for rna, prot, dna in d:
            if rna == '-':
                rna = None
            if prot == '-':
                prot = None
            if dna == '-':
                dna = None
            if rna is not None:
                out['rna'].append(rna)
            if prot is not None:
                out['protein'].append(prot)
            if dna is not None:
                out['genomic'].append(dna)
            if rna and prot:
                # rna/protein pair observed together -> a translation record
                out['translation'].append({'rna': rna, 'protein': prot})
        # remove dup
        for k in out:
            out[k] = normalized_value(out[k])
        # remove empty rna/protein/genomic field
        _out = {}
        for k, v in out.items():
            if v:
                _out[k] = v
        if _out:
            # nest under the configured field name (e.g. "accession")
            _out = {self.fieldname: _out}
        return _out

    #gene/2acc = dict_convert(gene2acc, valuefn=_ff)
    cnt = 0
    for gd in gene2acc:
        convd = self.format(dict_convert(gd, valuefn=_ff))
        yield convd
        cnt += 1

    # dead code in a generator: see the NOTE in the docstring
    if aslist:
        return dict_to_list(gene2acc)
    else:
        return gene2acc
def load_cpdb(data_folder, pathways):
    """Return ``{gene: {"pathway": {source: <record or [records]>}}}`` built
    from the ConsensusPathDB per-species pathway files.

    Only pathways whose lowercased source appears in ``pathways`` are kept.
    Each pathway record is ``{"name": ...}`` plus ``"id"`` when the file's id
    column is not the literal string 'None'.
    """
    # only import pathways from these sources
    PATHWAY_SOURCES_INCLUDED = pathways
    VALID_COLUMN_NO = 4
    DATA_FILES = [
        os.path.join(data_folder, 'CPDB_pathways_genes_mouse.tab'),
        os.path.join(data_folder, 'CPDB_pathways_genes_yeast.tab'),
        os.path.join(data_folder, 'CPDB_pathways_genes_human.tab'),
    ]
    _out = []
    for DATA_FILE in DATA_FILES:
        for ld in tabfile_feeder(DATA_FILE, header=1,
                                 assert_column_no=VALID_COLUMN_NO):
            p_name, p_id, p_source = ld[:3]
            p_source = p_source.lower()
            # KEGG ids come prefixed with "path:"; strip it
            if p_source == 'kegg' and p_id.startswith('path:'):
                p_id = p_id[5:]
            if p_source in PATHWAY_SOURCES_INCLUDED:
                genes = ld[-1].split(",")
                for gene in genes:
                    _out.append((gene, p_name, p_id, p_source))
    _out = list2dict(_out, 0, alwayslist=True)

    def _inner_cvt(p):
        # (name, id) tuple -> pathway record; 'None' means no usable id
        p_name, p_id = p
        _d = {'name': p_name}
        if p_id != 'None':
            _d['id'] = p_id
        return _d

    def _cvt(pli):
        _d = list2dict(pli, 2)
        _d = value_convert(_d, _inner_cvt)
        for p_source in _d:
            if isinstance(_d[p_source], list):
                # BUGFIX: records whose id was the literal 'None' carry no
                # "id" key (see _inner_cvt); e.get() sorts them first
                # instead of raising KeyError.
                _d[p_source].sort(key=lambda e: e.get("id", ""))
        return {'pathway': _d}

    _out = dict_convert(_out, valuefn=_cvt)
    return _out
def load_clinvar(data_folder):
    """Yield ``{"_id": <sha224 hex>, "clinvar": [...]}`` documents.

    The _id is the sha224 hex digest of "release_chr_start_ref_alt".
    NOTE(review): ``data_folder`` is ignored; the input path is hard-coded.
    """
    infile = os.path.abspath(
        "/opt/biothings/GRCh37/clinvar/latest/Clinvar.tsv")
    assert os.path.exists(infile)
    records = pandas.read_csv(infile, sep="\t", squeeze=True,
                              quoting=csv.QUOTE_NONE).to_dict(orient='records')
    normalize = lambda key: key.replace(" ", "_").lower()
    grouped = {}
    for record in records:
        variant = "_".join([
            record["release"],
            str(record["chromosome"]),
            str(record["start"]),
            record["reference"],
            record["alternative"],
        ])
        # hash the coordinate string to get a fixed-length document id
        doc_id = hashlib.sha224(variant.encode('ascii')).hexdigest()
        # normalize keys, then drop NaN values (not indexable)
        record = dict_sweep(dict_convert(record, keyfn=normalize),
                            vals=[np.nan])
        grouped.setdefault(doc_id, []).append(record)
    for doc_id, docs in grouped.items():
        yield {"_id": doc_id, "clinvar": docs}
def load_denovodb(data_folder):
    """Yield ``{"_id": <chr_pos_ref_alt>, "denovodb": [...]}`` documents
    from the denovo-db non-SSC samples variant table.

    NOTE(review): ``data_folder`` is ignored; the input path is hard-coded.
    """
    infile = os.path.abspath(
        "/opt/biothings/GRCh37/denovodb/denovo-db.non-ssc-samples.variants.tsv"
    )
    assert os.path.exists(infile)
    records = pandas.read_csv(infile, sep="\t", squeeze=True,
                              quoting=csv.QUOTE_NONE).to_dict(orient='records')
    normalize = lambda key: key.replace(" ", "_").lower()
    grouped = {}
    for record in records:
        variant_id = "_".join([
            str(record["Chr"]),
            str(record["Position"]),
            record["REF"],
            record["ALT"],
        ])
        # normalize keys, then drop NaN values (not indexable)
        record = dict_sweep(dict_convert(record, keyfn=normalize),
                            vals=[np.nan])
        grouped.setdefault(variant_id, []).append(record)
    for variant_id, docs in grouped.items():
        yield {"_id": variant_id, "denovodb": docs}
def load_gene_drug_associations(data_folder):
    """Yield ``{"_id": <drug id>, "gene_drug_associations": [...]}``
    documents from mutation_drug_pairs.csv.

    Rows without a Drug ID are skipped with a warning.
    """
    infile = os.path.join(data_folder, 'mutation_drug_pairs.csv')
    assert os.path.exists(infile)
    records = pandas.read_csv(infile, sep=',', squeeze=True,
                              quoting=csv.QUOTE_NONE).to_dict(orient='records')
    by_drug = {}
    for record in records:
        if not record['Drug ID'] or pandas.isna(record['Drug ID']):
            logging.warning('No drug information found in current record.')
            continue
        drug_id = record['Drug ID']
        record = dict_convert(record, keyfn=process_key)
        # Remove NaN values, not indexable
        record = dict_sweep(record, vals=[np.nan])
        by_drug.setdefault(drug_id, []).append(record)
    for drug_id, docs in by_drug.items():
        yield {'_id': drug_id, 'gene_drug_associations': docs}
def load_annotations(data_folder):
    """Yield gene-centric documents built from PharmGKB's var_drug_ann.tsv.

    Each yielded doc has the shape
    ``{"_id": <gene accession>, "annotations": [<cleaned records>]}``.
    Records without gene information, or whose Gene field cannot be
    parsed, are skipped with a warning.
    """
    infile = os.path.join(data_folder, "var_drug_ann.tsv")
    assert os.path.exists(infile)
    dat = pandas.read_csv(infile, sep="\t", squeeze=True,
                          quoting=csv.QUOTE_NONE).to_dict(orient='records')
    results = {}
    for rec in dat:
        if not rec["Gene"] or pandas.isna(rec["Gene"]):
            logging.warning("No gene information for annotation ID '%s'",
                            rec["Annotation ID"])
            continue
        # Gene looks like "SYMBOL (ACCESSION)"; use a raw string so "\(" is a
        # valid escape, and skip (instead of crashing with AttributeError on
        # None) when the field doesn't match the expected layout.
        match = re.match(r".* \((.*?)\)", rec["Gene"])
        if not match:
            logging.warning("Can't parse gene field '%s' for annotation ID '%s'",
                            rec["Gene"], rec["Annotation ID"])
            continue
        _id = match.groups()[0]
        rec = dict_convert(rec, keyfn=process_key)
        # remove NaN values, not indexable
        rec = dict_sweep(rec, vals=[np.nan])
        results.setdefault(_id, []).append(rec)
    for _id, docs in results.items():
        doc = {"_id": _id, "annotations": docs}
        yield doc
def load(self, aslist=False):
    """Build ``{gene_id: {"retired": <normalized int ids>}}`` from the
    gene_history file.

    When ``self.species_li`` is set, rows are restricted to those taxids;
    rows whose current gene id is '-' (no replacement) are always skipped.
    Returns a list of documents when ``aslist`` is true, a dict otherwise.
    """
    if self.species_li:
        keep_row = lambda ld: int(ld[0]) in self.taxid_set and ld[1] != '-'
    else:
        keep_row = lambda ld: ld[1] != '-'
    mapping = tab2dict(self.datafile, (1, 2), 0, alwayslist=1,
                       includefn=keep_row)
    # retired ids are ints; normalized_value unlists single-element lists
    mapping = dict_convert(
        mapping,
        valuefn=lambda ids: normalized_value([int(i) for i in ids]))
    gene_d = {gid: {'retired': retired}
              for gid, retired in mapping.items()}
    return dict_to_list(gene_d) if aslist else gene_d
def get_geneid_d(data_folder, species_li=None, load_cache=True,
                 save_cache=True, only_for=None):
    '''return a dictionary of current/retired geneid to current geneid
       mapping. This is useful, when other annotations were mapped to
       geneids may contain retired gene ids.
       if species_li is None, genes from all species are loaded.
       Note that all ids are int type.
    '''
    # avoid the mutable-default-argument pitfall; an empty dict keeps the
    # original filtering semantics (only_for is only read, never mutated)
    if only_for is None:
        only_for = {}
    if species_li:
        taxid_set = set(
            [TAXONOMY[species]["tax_id"] for species in species_li])
    else:
        taxid_set = None

    orig_cwd = os.getcwd()
    os.chdir(data_folder)
    # try/finally guarantees the working directory is restored even when the
    # cache assertion fails or a data file is missing (the original leaked
    # the chdir on any exception)
    try:
        # check cache file
        _cache_file = 'geneid_d.pyobj'
        if load_cache and os.path.exists(_cache_file) and \
           file_newer(_cache_file, 'gene_info.gz') and \
           file_newer(_cache_file, 'gene_history.gz'):
            _taxid_set, out_d = loadobj(_cache_file)
            assert _taxid_set == taxid_set
            return out_d

        DATAFILE = os.path.join(data_folder, 'gene_info.gz')
        if species_li:
            species_filter = lambda ld: int(ld[0]) in taxid_set and (
                only_for and ld[1] in only_for)
        elif only_for:
            species_filter = lambda ld: only_for and ld[1] in only_for
        else:
            species_filter = None
        geneid_li = set(tab2list(DATAFILE, 1, includefn=species_filter))

        DATAFILE = os.path.join(data_folder, 'gene_history.gz')
        if species_li:
            _includefn = lambda ld: int(ld[0]) in taxid_set and \
                ld[1] in geneid_li
        else:
            _includefn = lambda ld: ld[1] in geneid_li  # include all species
        retired2gene = tab2dict(DATAFILE, (1, 2), 1, alwayslist=0,
                                includefn=_includefn)
        # includefn above makes sure taxid is for species_li and filters out
        # those mapped_to geneid exists in gene_info list
        # convert key/value to int
        out_d = dict_convert(retired2gene, keyfn=int, valuefn=int)
        # TODO: this fills memory with key==value ...
        for g in geneid_li:
            _g = int(g)
            out_d[_g] = _g

        if save_cache:
            if species_li:
                dump((taxid_set, out_d), _cache_file)
            else:
                dump((None, out_d), _cache_file)
        return out_d
    finally:
        os.chdir(orig_cwd)