def load_data(data_folder):
    """Yield MSigDB geneset documents built from entrez-id .gmt files.

    Each ``msigdb.*.entrez.gmt`` file is read twice: once to collect the
    union of all gene ids (for a single batched id lookup), then again to
    emit one document per geneset line.
    """
    for gmt_path in glob.glob(os.path.join(data_folder, "msigdb.*.entrez.gmt")):
        # First pass: union of every gene id appearing in the file.
        all_genes = set()
        for row in tabfile_feeder(gmt_path, header=0):
            all_genes.update(row[2:])
        # Resolve ids via mygene for human (taxid 9606).
        lookup = IDLookup(9606)
        lookup.query_mygene(all_genes, 'entrezgene')
        # Second pass: build one document per geneset line.
        for row in tabfile_feeder(gmt_path, header=0):
            name = row[0]
            url = row[1]
            # Keep only genes that the lookup successfully resolved.
            genes = [lookup.query_cache[g] for g in row[2:]
                     if lookup.query_cache.get(g)]
            yield {
                '_id': name,
                'is_public': True,
                'taxid': 9606,
                'genes': genes,
                'source': 'msigdb',
                'msigdb': {
                    'id': name,
                    'geneset_name': name,
                    'url': url
                }
            }
def load_data(data_folder):
    """Yield Reactome geneset documents from ``ReactomePathways.gmt``.

    The file is read twice: first to gather all gene symbols for one
    batched lookup, then to emit one document per pathway line.
    """
    gmt_path = os.path.join(data_folder, "ReactomePathways.gmt")
    # First pass: union of all gene symbols in the file.
    all_genes = set()
    for row in tabfile_feeder(gmt_path, header=0):
        all_genes.update(row[2:])
    # Resolve symbols via mygene for human (taxid 9606).
    lookup = IDLookup(9606)
    lookup.query_mygene(all_genes, 'symbol')
    # Second pass: one document per pathway.
    for row in tabfile_feeder(gmt_path, header=0):
        name = row[0]
        pathway_id = row[1]
        # Keep only genes the lookup successfully resolved.
        genes = [lookup.query_cache[g] for g in row[2:]
                 if lookup.query_cache.get(g)]
        yield {
            '_id': pathway_id,
            'name': name,
            'is_public': True,
            'taxid': 9606,
            'genes': genes,
            'source': 'reactome',
            'reactome': {
                'id': pathway_id,
                'geneset_name': name,
            }
        }
def find_ncbi_symbols(gene_info_file, ensembl_dict):
    """Look up gene symbols for the NCBI gene ids referenced by *ensembl_dict*.

    Scans the (large) Entrez ``gene_info`` file once and keeps only the
    id -> symbol pairs that are actually needed.

    :param gene_info_file: path to the NCBI gene_info tab file
        (column 1 = GeneID, column 2 = Symbol, 0-indexed).
    :param ensembl_dict: mapping whose values carry a
        ``['data']['ncbi_list']`` list of NCBI gene ids.
    :return: dict mapping NCBI gene id -> symbol.
    """
    print("step 4 start: read NCBI gene symbol")
    # Collect every NCBI gene id referenced by any ensembl record.
    # BUG FIX: the original kept these in a dict and then re-iterated the
    # dict keys with a nested flatten ("for item in sublist"), which split
    # each id *string* into single characters and inserted those characters
    # as ids — spuriously matching any single-digit GeneID below. A plain
    # set of the ids is what was intended.
    ncbi_ids_to_find = set()
    for key in ensembl_dict:
        ncbi_ids_to_find.update(ensembl_dict[key]['data']['ncbi_list'])
    ncbi_id_symbols = {}
    for ld in tabfile_feeder(gene_info_file):
        if ld[1] in ncbi_ids_to_find:
            ncbi_id_symbols[ld[1]] = ld[2]
    print(
        "number of unique NCBI gene IDs to be queried using Entrez gene_info file: ",
        len(ncbi_ids_to_find))
    print("number symbols found in NCBI file: ", len(ncbi_id_symbols))
    print("step 4 end")
    return ncbi_id_symbols
def parse_gene_annotations(f):
    """Parse a gene annotation (.gaf.gz) file into GO-term genesets.

    :param f: path to a GAF file; lines starting with "!" are headers.
    :return: dict keyed by GO id (":" replaced by "_"), each value a dict
        with ``_id`` (``<go>_<taxid>``), ``is_public``, ``taxid`` and up to
        four sets of ``(uniprot, symbol)`` tuples: ``excluded_genes``,
        ``contributing_genes``, ``colocalized_genes`` and ``genes``.
    """
    data = tabfile_feeder(f, header=0)
    genesets = {}
    for rec in data:
        if rec[0].startswith("!"):
            # Comment/header line.
            continue
        _id = rec[4].replace(":", "_")
        if genesets.get(_id) is None:
            # First time we see this GO term: seed the document skeleton.
            taxid = rec[12].split("|")[0].replace("taxon:", "")
            genesets[_id] = {
                "_id": _id + "_" + taxid,
                "is_public": True,
                "taxid": taxid
            }
        uniprot = rec[1]
        symbol = rec[2]
        qualifiers = rec[3].split("|")
        gene = (uniprot, symbol)
        # A row may carry several qualifiers, so a gene can land in
        # several of the special sets.
        if "NOT" in qualifiers:
            # Genes similar to genes in go term, but should be excluded
            genesets[_id].setdefault("excluded_genes", set()).add(gene)
        if "contributes_to" in qualifiers:
            # Genes that contribute to the specified go term
            genesets[_id].setdefault("contributing_genes", set()).add(gene)
        if "colocalizes_with" in qualifiers:
            # Genes colocalized with specified go term
            genesets[_id].setdefault("colocalized_genes", set()).add(gene)
        if not ({"NOT", "contributes_to", "colocalizes_with"} & set(qualifiers)):
            # Default set: genes that belong to go term.
            # BUG FIX: the original "else" bound only to the
            # colocalizes_with test, so NOT- or contributes_to-qualified
            # rows were *also* added to the default set.
            genesets[_id].setdefault("genes", set()).add(gene)
    return genesets
def load_broadinstitute_exac(data_folder):
    """Merge the three Broad ExAC dumps (all / non-TCGA / non-psych) into one
    dict, then re-key it from Ensembl transcript ids to Entrez gene ids.

    Returns a dict of {gene_id: exac_doc}; transcripts with no Entrez
    mapping are re-keyed under their Ensembl gene id instead.
    """
    t0 = time.time()
    # Start from the full dataset, then graft the "nontcga"/"nonpsych"
    # sub-documents onto matching records — or add the whole record when
    # it only exists in that subset (EAFP on the missing key).
    exacs = load_broadinstitute_exac_all(data_folder)
    for k, v in load_broadinstitute_exac_nontcga(data_folder).items():
        try:
            exacs[k]["exac"]["nontcga"] = v["exac"]["nontcga"]
        except KeyError:
            exacs[k] = v
    for k, v in load_broadinstitute_exac_nonpsych(data_folder).items():
        try:
            exacs[k]["exac"]["nonpsych"] = v["exac"]["nonpsych"]
        except KeyError:
            exacs[k] = v
    logging.info("Convert transcript ID to EntrezID")
    # Hub-only dependencies, imported locally to avoid import-time cost.
    from ..ensembl.parser import EnsemblParser
    from biothings.utils.hub_db import get_src_dump
    # Locate the already-downloaded Ensembl source folder via the hub db.
    ensembl_doc = get_src_dump().find_one({"_id": "ensembl"}) or {}
    ensembl_dir = ensembl_doc.get("data_folder")
    assert ensembl_dir, "Can't find Ensembl data directory (used for id conversion)"
    ensembl_parser = EnsemblParser(ensembl_dir)
    ensembl_parser._load_ensembl2entrez_li()
    # Ensembl gene id -> [entrez ids]
    ensembl2entrez = list2dict(ensembl_parser.ensembl2entrez_li, 0, alwayslist=True)
    # Walk the gene<->translation table; for every transcript we hold ExAC
    # data for, re-file that data under the gene's Entrez id(s).
    # NOTE(review): column layout (_, gene id, transcript id, _) is assumed
    # from the unpacking below — confirm against the Ensembl dump format.
    for line in tabfile_feeder(os.path.join(ensembl_dir, "gene_ensembl__translation__main.txt")):
        _, ensid, transid, _ = line
        if transid in exacs:
            data = exacs.pop(transid)  # pop so no-match means no data in the end
            for entrezid in ensembl2entrez.get(ensid, [ensid]):
                exacs[entrezid] = data
    return exacs
def load_exons_for_species(data_folder, species, exons_key='exons'):
    """Build a gene -> exon-structure mapping from a UCSC refFlat dump.

    :param data_folder: root folder containing ``<species>/database/refFlat.txt.gz``
        and ``../hgFixed/database/refLink.txt.gz`` (relative to it).
    :param species: UCSC species sub-folder name.
    :param exons_key: key under which the exon list is stored per gene.
    :return: dict mapping gene id -> {exons_key: [per-transcript dicts]}.
    :raises AssertionError: if a refFlat row's exon count disagrees with
        its start/end lists.
    """
    refflat_file = os.path.join(data_folder, species, 'database/refFlat.txt.gz')
    ref2exons = {}
    for ld in tabfile_feeder(refflat_file, header=0):
        refseq = ld[1]
        chrom = ld[2]  # renamed from `chr` to avoid shadowing the builtin
        if chrom.startswith('chr'):
            chrom = chrom[3:]
        # Pair up exon starts (col 9) and ends (col 10); both are
        # comma-separated with a trailing empty element, hence `if x`.
        exons = list(
            zip([int(x) for x in ld[9].split(',') if x],
                [int(x) for x in ld[10].split(',') if x]))
        assert len(exons) == int(ld[8]), (len(exons), int(ld[8]))
        ref2exons.setdefault(refseq, []).append({
            'transcript': refseq,
            'chr': chrom,
            'strand': -1 if ld[3] == '-' else 1,
            'txstart': int(ld[4]),
            'txend': int(ld[5]),
            'cdsstart': int(ld[6]),
            'cdsend': int(ld[7]),
            'position': exons
        })
    # Map each refseq transcript to its gene id via refLink, and group
    # transcript exon records per gene.
    gene2exons = {}
    reflink_file = os.path.join(data_folder, '../hgFixed/database/refLink.txt.gz')
    refseq2gene = tab2dict(reflink_file, (2, 6), 0, alwayslist=False)
    for refseq in sorted(ref2exons.keys()):
        geneid = refseq2gene.get(refseq, None)
        # '0' marks "no gene" in refLink; skip those.
        if geneid and geneid != '0':
            if geneid not in gene2exons:
                gene2exons[geneid] = {exons_key: ref2exons[refseq]}
            else:
                gene2exons[geneid][exons_key].extend(ref2exons[refseq])
    return gene2exons
def load_cpdb(data_folder, pathways):
    """Build a gene -> pathway mapping from ConsensusPathDB (CPDB) dumps.

    Reads the mouse, yeast and human ``CPDB_pathways_genes_*.tab`` files
    and keeps only pathways from the requested sources.

    :param data_folder: folder containing the three CPDB .tab files.
    :param pathways: iterable of lower-cased source names to include
        (e.g. {'kegg', 'reactome'}).
    :return: dict mapping gene id -> {'pathway': {source: pathway-or-list}}.
    """
    # only import pathways from these sources
    PATHWAY_SOURCES_INCLUDED = pathways
    VALID_COLUMN_NO = 4
    DATA_FILES = []
    DATA_FILES.append(os.path.join(data_folder, 'CPDB_pathways_genes_mouse.tab'))
    DATA_FILES.append(os.path.join(data_folder, 'CPDB_pathways_genes_yeast.tab'))
    DATA_FILES.append(os.path.join(data_folder, 'CPDB_pathways_genes_human.tab'))
    _out = []
    for DATA_FILE in DATA_FILES:
        for ld in tabfile_feeder(DATA_FILE, header=1,
                                 assert_column_no=VALID_COLUMN_NO):
            p_name, p_id, p_source = ld[:3]
            p_source = p_source.lower()
            if p_source == 'kegg' and p_id.startswith('path:'):
                p_id = p_id[5:]  # strip KEGG's "path:" prefix
            if p_source in PATHWAY_SOURCES_INCLUDED:
                genes = ld[-1].split(",")
                for gene in genes:
                    _out.append((gene, p_name, p_id, p_source))
    # Group the (gene, name, id, source) tuples by gene.
    _out = list2dict(_out, 0, alwayslist=True)

    def _inner_cvt(p):
        # (name, id) pair -> {'name': ..., 'id': ...}; 'id' omitted when the
        # source file carries the literal string 'None'.
        p_name, p_id = p
        _d = {'name': p_name}
        if p_id != 'None':
            _d['id'] = p_id
        return _d

    def _cvt(pli):
        # Per-gene: group pathways by source and sort multi-entry sources
        # by pathway id for deterministic output.
        # NOTE(review): the sort key assumes every entry has an 'id';
        # entries whose id was 'None' would raise KeyError here — confirm
        # such pathways never co-occur with others under one source.
        _d = list2dict(pli, 2)
        _d = value_convert(_d, _inner_cvt)
        for p_source in _d:
            if isinstance(_d[p_source], list):
                _d[p_source].sort(key=lambda e: e["id"])
        return {'pathway': _d}

    _out = dict_convert(_out, valuefn=_cvt)
    return _out
def load_data(data_folder):
    """Yield WikiPathways geneset documents from per-species .gmt files.

    The species is parsed from each filename (``...-<Genus_species>.gmt``)
    and mapped to an NCBI taxid; gene ids are resolved in one batched
    lookup per file.
    """

    def get_taxid(species):
        # Map a species name (underscores already replaced by spaces) to
        # its NCBI taxid; raises KeyError for unknown species.
        taxids = {
            "Mus musculus": 10090,
            "Bos taurus": 9913,
            # BUG FIX: this key was corrupted to "H**o sapiens", which made
            # the human file raise KeyError.
            "Homo sapiens": 9606,
            "Anopheles gambiae": 180454,
            "Arabidopsis thaliana": 3702,
            "Caenorhabditis elegans": 6239,
            "Canis familiaris": 9615,
            "Danio rerio": 7955,
            "Drosophila melanogaster": 7227,
            "Equus caballus": 9796,
            "Gallus gallus": 9031,
            "Oryza sativa": 39947,
            "Pan troglodytes": 9598,
            "Rattus norvegicus": 10116,
            "Saccharomyces cerevisiae": 559292,
            "Populus trichocarpa": 3694,
            "Sus scrofa": 9823
        }
        return taxids[species]

    # Load .gmt (Gene Matrix Transposed) files
    for f in glob.glob(os.path.join(data_folder, "*.gmt")):
        # Get species name from the filename and convert to taxid
        species = f.replace(".gmt", "").split("-")[-1].replace("_", " ")
        taxid = get_taxid(species)
        print("Parsing data for {} ({})".format(species, taxid))
        # Read entire file and fetch data for joint set of all genes
        data = tabfile_feeder(f, header=0)
        all_genes = []
        for rec in data:
            all_genes += rec[2:]
        all_genes = set(all_genes)
        lookup = IDLookup(taxid)
        lookup.query_mygene(all_genes, 'entrezgene')
        # Parse each individual document
        data = tabfile_feeder(f, header=0)
        for rec in data:
            header = rec[0].split("%")
            # Header layout: name%source%wikipathways-id%species
            pathway_name = header[0]
            wikipathways_id = header[2]
            assert species == header[3], "Species does not match."
            # Get URL and gene list
            url = rec[1]
            ncbigenes = rec[2:]
            # Keep only genes the lookup successfully resolved.
            genes = []
            for g in ncbigenes:
                if lookup.query_cache.get(g):
                    genes.append(lookup.query_cache[g])
            # Format schema
            doc = {
                '_id': wikipathways_id,
                'is_public': True,
                'taxid': taxid,
                'genes': genes,
                'wikipathways': {
                    'id': wikipathways_id,
                    'pathway_name': pathway_name,
                    'url': url
                }
            }
            yield doc
def load_all(data_folder):
    '''Load "uniprot" using yield, while building "PDB" and "PIR" data dict
    while reading data file. These dict are then dumped (pickled) and
    stored later.

    Yields one document per resolvable Entrez gene id; as a side effect,
    writes gene2pdb.pyobj and gene2pir.pyobj into data_folder.
    '''

    def cvt_fn(pdb_id):
        # Keep only the PDB id part, dropping the ":chain" suffix.
        return pdb_id.split(':')[0]

    def merge(xli, transcode=False):
        # Resolve one (uniprot_acc, section, entrez_id, ensembl_id) row into
        # (uniprot_acc, section, gene_id) tuples. With transcode=False an
        # Ensembl-only row raises KeyError so the caller can defer it.
        xli2 = []
        uniprot_acc, section, entrez_id, ensembl_id = xli
        if entrez_id:
            xli2.append((uniprot_acc, section, entrez_id))
        elif ensembl_id:
            if not transcode:
                raise KeyError(ensembl_id)
            try:
                entrez_id = ensembl2geneid[ensembl_id]
                # if ensembl_id can be mapped to entrez_id
                for _eid in entrez_id:
                    xli2.append((uniprot_acc, section, _eid))
            except KeyError:
                # No mapping: keep the Ensembl id as the gene id.
                xli2.append((uniprot_acc, section, ensembl_id))
        return xli2

    def transform(xli2):
        # Turn resolved tuples into {"_id": gene_id, ...uniprot fields...} docs.
        gene2uniprot = list2dict(list_nondup(xli2), 2, alwayslist=True)
        gene2uniprot = value_convert(gene2uniprot, _dict_convert,
                                     traverse_list=False)
        # NOTE(review): this first-item unpack is immediately overwritten by
        # the loop below — appears vestigial.
        gid, uniprot = list(gene2uniprot.items())[0]
        docs = []
        for gid, uniprot in gene2uniprot.items():
            doc = {"_id": gid}
            doc.update(uniprot)
            docs.append(doc)
        return docs

    def merge_x(xli, gene2x, transcode=False, cvt_fn=None, k=None):
        # Same deferred-resolution scheme as merge(), but for the auxiliary
        # (entrez_id, ensembl_id, x_value) rows; accumulates into gene2x.
        xli2 = []
        entrez_id, ensembl_id, x_value = xli
        if not x_value:
            return
        if cvt_fn:
            x_value = cvt_fn(x_value)
        if entrez_id:
            xli2.append((entrez_id, x_value))
        elif ensembl_id:
            if not transcode:
                raise KeyError(ensembl_id)
            try:
                entrez_id = x_ensembl2geneid[ensembl_id]
                # if ensembl_id can be mapped to entrez_id
                for _eid in entrez_id:
                    xli2.append((_eid, x_value))
            except KeyError:
                xli2.append((ensembl_id, x_value))
        for x in xli2:
            gene2x.setdefault(x[0], []).append(x[1])

    uniprot_datafile = os.path.join(data_folder, 'idmapping_selected.tab.gz')
    t0 = time.time()
    # cache for uniprot
    ensembl2geneid = {}
    # cache for PDB and PIR
    x_ensembl2geneid = {}
    # Rows deferred until the ensembl->entrez caches are complete.
    remains = []
    pdb_remains = []
    pir_remains = []
    # once filled, will be dumped for later storage
    gene2pdb = {}
    gene2pir = {}
    # store all PDB & PIR data while looping, the whole will be stored later
    for ld in tabfile_feeder(uniprot_datafile, header=1,
                             assert_column_no=VALID_COLUMN_NO):
        # Uniprot data will be stored as we read line by line
        xlis = []
        pdbxlis = []
        pirxlis = []
        # raw lines for each sources
        # NOTE(review): column indices assume the idmapping_selected.tab
        # layout (2=GeneID, 5=PDB, 11=PIR, 18/19=Ensembl ids) — confirm
        # against the UniProt README.
        uniprotld = [ld[0], ld[1], ld[2], ld[18]]
        pdbld = [ld[2], ld[19], ld[5]]
        pirld = [ld[2], ld[19], ld[11]]
        # UniProt
        # GeneID and EnsemblID columns may have duplicates
        for value in dupline_seperator(dupline=uniprotld,
                                       dup_idx=[2, 3],
                                       dup_sep='; '):
            value = list(value)
            value[1] = get_uniprot_section(value[1])
            value = tuple(value)
            xlis.append(value)
        # PDB
        for value in dupline_seperator(dupline=pdbld, dup_sep='; '):
            pdbxlis.append(value)
        # PIR
        for value in dupline_seperator(dupline=pirld, dup_sep='; '):
            pirxlis.append(value)
        for xli in xlis:
            # feed mapping
            if xli[2] != '' and xli[3] != '':
                ensembl2geneid.setdefault(xli[3], []).append(xli[2])
            try:
                # postpone ensemblid->entrezid resolution while parsing
                # uniprot as the full transcodification dict is only correct
                # at the end.
                # ex:
                # 1. UniprotID-A  EntrezID-A  EnsemblID
                # 2. UniprotID-B              EnsemblID
                # 3. UniprotID-C  EntrezID-B  EnsemblID
                #
                # UniprotID-B should associated to both EntrezID-A and
                # EntrezID-B but we need to read up to line 3 to do so
                xli2 = merge(xli, transcode=False)
                if not xli2:
                    continue
                docs = transform(xli2)
                for doc in docs:
                    yield doc
            except KeyError:
                remains.append(xli)
        for xli in pdbxlis:
            if xli[0] != '' and xli[1] != '':
                x_ensembl2geneid.setdefault(xli[1], []).append(xli[0])
            try:
                merge_x(xli, gene2pdb, transcode=False, cvt_fn=cvt_fn, k="pdb")
            except KeyError:
                pdb_remains.append(xli)
        for xli in pirxlis:
            if xli[0] != '' and xli[1] != '':
                x_ensembl2geneid.setdefault(xli[1], []).append(xli[0])
            try:
                merge_x(xli, gene2pir, transcode=False)
            except KeyError:
                pir_remains.append(xli)
    # now transcode with what we have
    for remain in remains:
        try:
            xli2 = merge(remain, transcode=True)
            if not xli2:
                continue
            docs = transform(xli2)
            for doc in docs:
                yield doc
        except KeyError:
            pass
    for remain in pdb_remains:
        try:
            merge_x(remain, gene2pdb, transcode=True, cvt_fn=cvt_fn)
        except KeyError:
            pass
    for remain in pir_remains:
        try:
            merge_x(remain, gene2pir, transcode=True)
        except KeyError:
            pass

    def normalize(value, keyname):
        # Deduplicate and sort; collapse singletons to a scalar value.
        res = None
        uniq = sorted(set(value))
        if len(uniq) > 1:
            res = {keyname: uniq}
        else:
            res = {keyname: uniq[0]}
        return res

    def normalize_pdb(value):
        return normalize(value, "pdb")

    def normalize_pir(value):
        return normalize(value, "pir")

    # PDB
    gene2pdb = value_convert(gene2pdb, normalize_pdb, traverse_list=False)
    pdb_dumpfile = os.path.join(data_folder, 'gene2pdb.pyobj')
    dump(gene2pdb, pdb_dumpfile)
    # PIR
    gene2pir = value_convert(gene2pir, normalize_pir, traverse_list=False)
    pir_dumpfile = os.path.join(data_folder, 'gene2pir.pyobj')
    dump(gene2pir, pir_dumpfile)
def load_annotations(data_folder):
    """Yield DGIdb gene-drug interaction documents from ``interactions.tsv``.

    Each record becomes a BioLink-style document with ``subject`` (gene),
    ``object`` (drug) and ``association`` sections; ``_id`` is a blake2b
    hash of the raw row. Missing ids are resolved via the mygene.info /
    mychem.info web APIs.
    """
    data_file = os.path.join(data_folder, "interactions.tsv")
    data = tabfile_feeder(data_file, header=0)
    header = next(data)
    # Predicate remapping table, keyed by "DGIdb:<relation>".
    # Also fixes a file-handle leak: the original open() was never closed.
    with open(os.path.join(data_folder, "predicate-remap.yaml"), 'r') as mapfile:
        remappingdata = yaml.safe_load(mapfile)

    def get_predicate(relation):
        # Remap a DGIdb relation name to its canonical predicate.
        if relation != "":
            key = ':'.join(("DGIdb", relation))
            return remappingdata[key]['rename'][0]
        return ""

    def get_gene_id(gene_name):
        # Resolve a gene symbol to an Entrez id via mygene.info, or None.
        query = ("http://mygene.info/v3/query?q=symbol:{}"
                 "&fields=entrezgene&species=human".format(gene_name))
        response = requests.get(query)
        # BUG FIX: status_code is an int; the original compared it to the
        # string "200", which never matched, so this always returned None.
        if response.status_code == 200:
            result = response.json()
            entrez_id = result['hits'][0]['entrezgene']
            if entrez_id != "":
                return entrez_id
        return None

    def get_chem_id(drug_name):
        # Resolve a drug name to a ChEMBL id via mychem.info, or None.
        query = ("http://mychem.info/v1/query?q=chembl.pref_name:{}"
                 "&fields=chembl.molecule_chembl_id".format(drug_name))
        response = requests.get(query)
        if response.status_code == 200:  # same int-vs-"200" fix as above
            result = response.json()
            chembl_id = result['hits'][0]['chembl']['molecule_chembl_id']
            if chembl_id != "":
                return chembl_id
        return None

    # Hoist column lookups out of the per-record loop (loop-invariant).
    idx_entrez = header.index("entrez_id")
    idx_gene_name = header.index("gene_name")
    idx_drug_name = header.index("drug_name")
    idx_drug_concept = header.index("drug_concept_id")
    idx_int_types = header.index("interaction_types")
    idx_pmids = header.index("PMIDs")
    idx_claim_source = header.index("interaction_claim_source")
    idx_group_score = header.index("interaction_group_score")

    for rec in data:
        # Create a hash for _id
        bytestr = bytearray("-".join(rec), 'utf-8')
        hashstr = hashlib.blake2b(bytestr, digest_size=8).hexdigest()
        # Document framework
        doc = {"_id": hashstr, "subject": {}, "object": {}, "association": {}}
        # Subject (gene)
        entrez_id = rec[idx_entrez]
        gene_name = rec[idx_gene_name]
        if entrez_id == "":
            if gene_name == "":
                continue  # Skip the record
            resp = get_gene_id(gene_name)
            if resp is None:
                subject_id = 'name:' + gene_name
            else:
                entrez_id = resp
                subject_id = 'NCBIGene:' + resp
        else:
            subject_id = 'NCBIGene:' + entrez_id
        doc['subject']['NCBIGene'] = entrez_id
        doc['subject']['SYMBOL'] = gene_name
        doc['subject']['id'] = subject_id
        # Object (drug)
        drug_name = rec[idx_drug_name]
        drug_chembl_id = rec[idx_drug_concept]
        if drug_chembl_id == "":
            if drug_name == "":
                continue  # Skip the record
            resp = get_chem_id(drug_name)
            if resp is None:
                object_id = 'name:' + drug_name
            else:
                drug_chembl_id = resp
                object_id = 'CHEMBL.COMPOUND:' + resp
        elif drug_chembl_id.startswith("chembl:"):
            object_id = 'CHEMBL.COMPOUND:' + drug_chembl_id.split(':')[-1]
            drug_chembl_id = 'CHEMBL.COMPOUND:' + drug_chembl_id.split(':')[-1]
        # NOTE(review): if drug_concept_id is non-empty but lacks the
        # "chembl:" prefix, object_id is left over from the previous record
        # (or unbound on the first) — confirm such rows never occur.
        doc['object']['name'] = drug_name
        doc['object']['CHEMBL_COMPOUND'] = drug_chembl_id
        doc['object']['id'] = object_id
        # Association
        interaction_types = rec[idx_int_types].replace(" ", "_").split(",")
        pmids = rec[idx_pmids].split(",")
        interaction_claim_source = rec[idx_claim_source]
        edge_labels = []
        for interaction in interaction_types:
            edge_labels.append(get_predicate(interaction))
        if edge_labels == ['']:
            # No relation given: fall back to the generic predicate.
            edge_labels = 'physically_interacts_with'
        doc['association']['edge_label'] = edge_labels
        doc['association']['relation_name'] = interaction_types
        doc['association']['pubmed'] = pmids
        doc['association']['provided_by'] = interaction_claim_source
        doc['association']['interaction_group_score'] = rec[idx_group_score]
        # Cleanup
        doc = dict_sweep(doc)
        doc = unlist(doc)
        yield doc