def load_broadinstitute_exac():
    """Assemble the merged ExAC documents and re-key them by gene ID.

    The base documents come from ``load_broadinstitute_exac_all()``; the
    ``"nontcga"`` and ``"nonpsych"`` sub-documents from their own loaders are
    then folded into the matching entries (or inserted whole when the base
    set has no usable entry for that key).

    Finally, every key found in the third column of Ensembl's
    ``gene_ensembl__translation__main.txt`` is replaced by the Entrez gene
    ID(s) mapped to the gene in the second column, falling back to that
    second-column ID itself when no mapping exists. Keys that never appear
    in that file are left untouched.

    Returns:
        dict: the merged documents.
    """
    print('DATA_FOLDER: ' + DATA_FOLDER)
    started = time.time()
    exacs = load_broadinstitute_exac_all()

    # Each subset is loaded and merged in turn: nontcga first, then nonpsych.
    subset_loaders = (
        ("nontcga", load_broadinstitute_exac_nontcga),
        ("nonpsych", load_broadinstitute_exac_nonpsych),
    )
    for subset, loader in subset_loaders:
        for transcript_id, doc in loader().items():
            try:
                exacs[transcript_id]["exac"][subset] = doc["exac"][subset]
            except KeyError:
                # Nothing to merge into: take the subset document as-is.
                exacs[transcript_id] = doc

    logging.info("Convert transcript ID to EntrezID")
    import dataload.sources.ensembl.ensembl_base as ensembl_base
    parser = ensembl_base.EnsemblParser()
    parser._load_ensembl2entrez_li()
    ensembl2entrez = list2dict(parser.ensembl2entrez_li, 0, alwayslist=True)

    translation_file = os.path.join(get_data_folder("ensembl"),
                                    "gene_ensembl__translation__main.txt")
    for row in tabfile_feeder(translation_file):
        _, gene_id, transcript_id, _ = row
        if transcript_id not in exacs:
            continue
        # Drop the transcript-keyed entry; it is re-inserted below under
        # every gene ID it maps to.
        doc = exacs.pop(transcript_id)
        for entrez_id in ensembl2entrez.get(gene_id, [gene_id]):
            exacs[entrez_id] = doc

    load_done('[%d, %s]' % (len(exacs), timesofar(started)))
    return exacs
def load_broadinstitute_exac():
    """Load all ExAC subsets, merge them, and convert keys to gene IDs.

    Steps, in order:

    1. load the base documents via ``load_broadinstitute_exac_all()``;
    2. merge in the ``"nontcga"`` then the ``"nonpsych"`` sub-documents;
    3. walk Ensembl's ``gene_ensembl__translation__main.txt`` and, for each
       row whose third column is a key of the merged dict, move that entry
       under the Entrez ID(s) mapped to the row's second column (or under
       the second-column ID itself if it has no Entrez mapping).

    Entries whose key is never seen in the translation file keep their
    original key.

    Returns:
        dict: merged ExAC documents.
    """
    print('DATA_FOLDER: ' + DATA_FOLDER)
    t_start = time.time()
    exacs = load_broadinstitute_exac_all()

    def _merge_subset(subset_docs, subset_name):
        # Copy one subset's sub-document into the matching base entry; when
        # that is not possible (KeyError), store the subset document whole.
        for key, subset_doc in subset_docs.items():
            try:
                exacs[key]["exac"][subset_name] = subset_doc["exac"][subset_name]
            except KeyError:
                exacs[key] = subset_doc

    _merge_subset(load_broadinstitute_exac_nontcga(), "nontcga")
    _merge_subset(load_broadinstitute_exac_nonpsych(), "nonpsych")

    logging.info("Convert transcript ID to EntrezID")
    import dataload.sources.ensembl.ensembl_base as ensembl_base
    ens_parser = ensembl_base.EnsemblParser()
    ens_parser._load_ensembl2entrez_li()
    gene2entrez = list2dict(ens_parser.ensembl2entrez_li, 0, alwayslist=True)

    ensembl_folder = get_data_folder("ensembl")
    mapping_path = os.path.join(ensembl_folder,
                                "gene_ensembl__translation__main.txt")
    for fields in tabfile_feeder(mapping_path):
        _, ensembl_gene, transcript, _ = fields
        if transcript in exacs:
            # Remove the transcript key before re-inserting under gene IDs.
            moved = exacs.pop(transcript)
            target_ids = gene2entrez.get(ensembl_gene, [ensembl_gene])
            for target_id in target_ids:
                exacs[target_id] = moved

    load_done('[%d, %s]' % (len(exacs), timesofar(t_start)))
    return exacs
from __future__ import print_function import os.path from collections import defaultdict from dataload import get_data_folder from utils.dataload import anyfile, tabfile_feeder from biothings.utils.common import safewfile ENSEMBL_DATA_FOLDER = get_data_folder('ensembl') print('Ensembl DATA_FOLDER: ' + ENSEMBL_DATA_FOLDER) Entrez_DATA_FOLDER = get_data_folder('entrez') print('Ensembl DATA_FOLDER: ' + Entrez_DATA_FOLDER) gene_ensembl_1_xref_dm_file = os.path.join(ENSEMBL_DATA_FOLDER, "gene_ensembl__xref_entrezgene__dm.txt") gene_ensembl_2_main_file = os.path.join(ENSEMBL_DATA_FOLDER, "gene_ensembl__gene__main.txt") gene2ensembl_file = os.path.join(Entrez_DATA_FOLDER, "gene/gene2ensembl.gz") gene_main_file = os.path.join(Entrez_DATA_FOLDER, "gene/gene_info.gz") outfile = os.path.join(ENSEMBL_DATA_FOLDER, "gene_ensembl__gene__extra.txt") def find_multiple_mappings_from_entrezgene_file(gene_ensembl_entrezgene_dm_file): """Input gene_ensembl_entrezgene_dm_file, and identify how many NCBI gene IDs there are for each ensembl gene ID. Lines in input file are: 'gene_ensembl__xref_entrezgene__dm.txt' (useful columns in input_file): col1: Ensembl gene ID col2: NCBI gene ID
import os.path import copy #from config import DATA_ARCHIVE_ROOT from dataload import get_data_folder from utils.common import SubStr from utils.dataload import (load_start, load_done, tab2dict, tab2list, value_convert, normalized_value, list2dict, dict_nodup, dict_attrmerge ) #DATA_FOLDER = os.path.join(DATA_ARCHIVE_ROOT, 'by_resources/ensembl/69') DATA_FOLDER = get_data_folder('ensembl') print('DATA_FOLDER: ' + DATA_FOLDER) #fn to skip lines with LRG records.''' _not_LRG = lambda ld: not ld[1].startswith("LRG_") class EnsemblParser: def __init__(self): self.ensembl2entrez_li = None self.ensembl_main = None def _load_ensembl_2taxid(self): """ensembl2taxid""" DATAFILE = os.path.join(DATA_FOLDER, 'gene_ensembl__translation__main.txt') load_start(DATAFILE) ensembl2taxid = dict_nodup(tab2dict(DATAFILE, (0, 1), 1, includefn=_not_LRG)) # need to convert taxid to integer here ensembl2taxid = value_convert(ensembl2taxid, lambda x: int(x))
import os.path import time from biothings.utils.common import timesofar from utils.dataload import (load_start, load_done, tab2dict, tabfile_feeder, list2dict) from dataload import get_data_folder import logging # DATA_FOLDER = os.path.join(DATA_ARCHIVE_ROOT, 'by_resources/uniprot') DATA_FOLDER = get_data_folder('exac') def load_broadinstitute_exac_any(one_file,key): print("Loading file %s (%s)" % (one_file,key)) data = tab2dict(os.path.join(DATA_FOLDER, one_file), (0,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21), 0) exacs = {} for transcript in data: tupleexac = data[transcript] # remove version in key so we can search the dict easily later exacs[transcript.split(".")[0]] = {"exac" : { "transcript" : transcript, # but keep version here "n_exons" : int(tupleexac[0]), "cds_start" : int(tupleexac[1]), "cds_end" : int(tupleexac[2]), "bp" : int(tupleexac[3]), key : { "mu_syn" : float(tupleexac[4]), "mu_mis" : float(tupleexac[5]), "mu_lof" : float(tupleexac[6]),
import os.path import copy #from config import DATA_ARCHIVE_ROOT from dataload import get_data_folder from utils.common import SubStr from utils.dataload import (load_start, load_done, tab2dict, tab2list, value_convert, normalized_value, list2dict, dict_nodup, dict_attrmerge ) #DATA_FOLDER = os.path.join(DATA_ARCHIVE_ROOT, 'by_resources/ensembl/69') DATA_FOLDER = get_data_folder('ensembl') print('DATA_FOLDER: ' + DATA_FOLDER) #fn to skip lines with LRG records.''' def _not_LRG(ld): return not ld[1].startswith("LRG_") class EnsemblParser: def __init__(self): self.ensembl2entrez_li = None self.ensembl_main = None def _load_ensembl_2taxid(self): """ensembl2taxid""" DATAFILE = os.path.join(DATA_FOLDER, 'gene_ensembl__translation__main.txt') load_start(DATAFILE) ensembl2taxid = dict_nodup(tab2dict(DATAFILE, (0, 1), 1, includefn=_not_LRG)) # need to convert taxid to integer here
from __future__ import print_function import os.path from collections import defaultdict from dataload import get_data_folder from utils.dataload import anyfile, tabfile_feeder from biothings.utils.common import safewfile ENSEMBL_DATA_FOLDER = get_data_folder('ensembl') print('Ensembl DATA_FOLDER: ' + ENSEMBL_DATA_FOLDER) Entrez_DATA_FOLDER = get_data_folder('entrez') print('Ensembl DATA_FOLDER: ' + Entrez_DATA_FOLDER) gene_ensembl_1_xref_dm_file = os.path.join( ENSEMBL_DATA_FOLDER, "gene_ensembl__xref_entrezgene__dm.txt") gene_ensembl_2_main_file = os.path.join(ENSEMBL_DATA_FOLDER, "gene_ensembl__gene__main.txt") gene2ensembl_file = os.path.join(Entrez_DATA_FOLDER, "gene/gene2ensembl.gz") gene_main_file = os.path.join(Entrez_DATA_FOLDER, "gene/gene_info.gz") outfile = os.path.join(ENSEMBL_DATA_FOLDER, "gene_ensembl__gene__extra.txt") def find_multiple_mappings_from_entrezgene_file( gene_ensembl_entrezgene_dm_file): """Input gene_ensembl_entrezgene_dm_file, and identify how many NCBI gene IDs there are for each ensembl gene ID. Lines in input file are: 'gene_ensembl__xref_entrezgene__dm.txt' (useful columns in input_file): col1: Ensembl gene ID
import os.path
import types
import time
from utils.common import timesofar
from utils.dataload import (load_start, load_done,
                            listitems, dupline_seperator,
                            tabfile_feeder, list2dict, list_nondup,
                            value_convert)
from config import DATA_ARCHIVE_ROOT
from dataload import get_data_folder

#DATA_FOLDER = os.path.join(DATA_ARCHIVE_ROOT, 'by_resources/uniprot')
DATA_FOLDER = get_data_folder('uniprot')

#REF:
#ftp://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/idmapping/README


def get_uniprot_section(uniprotkb_id):
    '''Return the UniProtKB section, "Swiss-Prot" or "TrEMBL", for an entry name.

    The rule is (http://www.uniprot.org/manual/entry_name):
        Swiss-Prot entries have a maximum of 5 characters before the "_",
        TrEMBL entries have 6 or 10 characters before the "_" (the
        accession number).

    Some examples:
        TrEMBL: O61847_CAEEL, F0YED1_AURAN, A0A024RB10_HUMAN
        Swiss-Prot: CDK2_HUMAN, CDK2_MOUSE

    Raises ValueError if uniprotkb_id does not contain exactly one "_".
    '''
    v = uniprotkb_id.split('_')
    if len(v) != 2:
        raise ValueError('Invalid UniprotKB ID')
    # UniProt accessions are 6 or 10 characters long; checking only for 6
    # mislabels entries such as A0A024RB10_HUMAN as Swiss-Prot.
    return 'TrEMBL' if len(v[0]) in (6, 10) else "Swiss-Prot"
import os.path import time from biothings.utils.common import get_timestamp, timesofar from utils.dataload import (load_start, load_done, tabfile_feeder, list2dict, value_convert, dict_convert) from dataload import get_data_folder DATA_FOLDER = get_data_folder('cpdb') def _download(__metadata__): from utils.dataload import download as _download output_folder = os.path.join(os.path.split(DATA_FOLDER)[0], get_timestamp()) for species in ['human', 'mouse', 'yeast']: url = __metadata__['__url_{}__'.format(species)] output_file = 'CPDB_pathways_genes_{}.tab'.format(species) _download(url, output_folder, output_file) def load_cpdb(__metadata__): # only import pathways from these sources PATHWAY_SOURCES_INCLUDED = __metadata__['pathway_sources_included'] VALID_COLUMN_NO = 4 t0 = time.time() print('DATA_FOLDER: ' + DATA_FOLDER) DATA_FILES = [] DATA_FILES.append(os.path.join(DATA_FOLDER, 'CPDB_pathways_genes_mouse.tab')) DATA_FILES.append(os.path.join(DATA_FOLDER, 'CPDB_pathways_genes_yeast.tab'))
import os.path
from utils.dataload import (load_start, load_done,
                            tab2dict, value_convert)
from dataload import get_data_folder

#from config import DATA_ARCHIVE_ROOT
#timestamp = '20121005'
#DATA_FOLDER = os.path.join(DATA_ARCHIVE_ROOT, 'by_resources/pharmgkb', timestamp)
DATA_FOLDER = get_data_folder('pharmgkb')


def load_pharmgkb():
    """Load the PharmGKB mapping from ``genes.tsv`` inside ``genes.zip``.

    Rows whose second column is empty are skipped. Each value of the
    resulting dict is wrapped as ``{'pharmgkb': value}``.

    Returns:
        dict: the wrapped mapping produced by ``tab2dict`` and
        ``value_convert``.
    """
    print('DATA_FOLDER: ' + DATA_FOLDER)
    zip_path = os.path.join(DATA_FOLDER, 'genes.zip')
    load_start(zip_path)

    def _has_second_column(ld):
        # Keep only rows with a non-empty second column.
        return ld[1] != ''

    def _wrap(value):
        return {'pharmgkb': value}

    raw_mapping = tab2dict((zip_path, 'genes.tsv'), (0, 1), 1,
                           header=1, includefn=_has_second_column)
    gene2pharmgkb = value_convert(raw_mapping, _wrap, traverse_list=False)
    load_done('[%d]' % len(gene2pharmgkb))
    return gene2pharmgkb
import os.path import time from biothings.utils.common import timesofar from utils.dataload import (load_start, load_done, listitems, dupline_seperator, tabfile_feeder, list2dict, list_nondup, value_convert) #from config import DATA_ARCHIVE_ROOT from dataload import get_data_folder #DATA_FOLDER = os.path.join(DATA_ARCHIVE_ROOT, 'by_resources/uniprot') DATA_FOLDER = get_data_folder('uniprot') #REF: #ftp://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/idmapping/README VALID_COLUMN_NO = 22 def get_uniprot_section(uniprotkb_id): '''return either "Swiss-Prot" or "TrEMBL", two sections of UniProtKB, based on input uniprotkb_id, or entry name. The rule is (http://www.uniprot.org/manual/entry_name): Swiss-Prot entries have a maximum of 5 characters before the "_", TrEMBL entries have 6 or 10 characters before the "_" (the accession some examples: TrEMBL: O61847_CAEEL, F0YED1_AURAN, A0A024RB10_HUMAN Swiss-Prot: CDK2_HUMAN, CDK2_MOUSE ''' v = uniprotkb_id.split('_') if len(v) != 2: raise ValueError('Invalid UniprotKB ID') #return 'TrEMBL' if len(v[0])==6 else "Swiss-Prot"
import os.path
from utils.dataload import (load_start, load_done, tab2dict, value_convert)
from dataload import get_data_folder

#from config import DATA_ARCHIVE_ROOT
#timestamp = '20121005'
#DATA_FOLDER = os.path.join(DATA_ARCHIVE_ROOT, 'by_resources/pharmgkb', timestamp)
DATA_FOLDER = get_data_folder('pharmgkb')


def load_pharmgkb():
    """Read ``genes.tsv`` from the PharmGKB ``genes.zip`` archive.

    Only rows with a non-empty second column are kept, and every value in
    the resulting dict is nested under a ``'pharmgkb'`` key.

    Returns:
        dict: mapping whose values have the form ``{'pharmgkb': value}``.
    """
    print('DATA_FOLDER: ' + DATA_FOLDER)
    archive = os.path.join(DATA_FOLDER, 'genes.zip')
    load_start(archive)

    def _row_is_usable(ld):
        # Rows without a value in the second column are skipped.
        return ld[1] != ''

    def _as_pharmgkb_doc(value):
        return {'pharmgkb': value}

    gene2pharmgkb = value_convert(
        tab2dict((archive, 'genes.tsv'), (0, 1), 1,
                 header=1, includefn=_row_is_usable),
        _as_pharmgkb_doc,
        traverse_list=False,
    )
    load_done('[%d]' % len(gene2pharmgkb))
    return gene2pharmgkb
import os.path import time from biothings.utils.common import timesofar from utils.dataload import (load_start, load_done, tab2dict, tabfile_feeder, list2dict) from dataload import get_data_folder import logging # DATA_FOLDER = os.path.join(DATA_ARCHIVE_ROOT, 'by_resources/uniprot') DATA_FOLDER = get_data_folder('exac') def load_broadinstitute_exac_any(one_file, key): print("Loading file %s (%s)" % (one_file, key)) data = tab2dict(os.path.join(DATA_FOLDER, one_file), (0, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21), 0) exacs = {} for transcript in data: tupleexac = data[transcript] # remove version in key so we can search the dict easily later exacs[transcript.split(".")[0]] = { "exac": { "transcript": transcript, # but keep version here "n_exons": int(tupleexac[0]), "cds_start": int(tupleexac[1]), "cds_end": int(tupleexac[2]), "bp": int(tupleexac[3]), key: { "mu_syn": float(tupleexac[4]),
''' Populates MICROBE gene entries with genomic position data Currently updates the 120 microbial taxids that are NCBI Reference Sequences run get_ref_microbe_taxids function to get an updated file for TAXIDS_FILE when it's necessary. ''' import os.path from biothings.utils.common import (dump, loadobj, get_timestamp) from utils.dataload import (tab2list, load_start, load_done) from dataload import get_data_folder DATA_FOLDER = get_data_folder('entrez') print('DATA_FOLDER: ' + DATA_FOLDER) __metadata__ = { '__collection__': 'entrez_genomic_pos', } TAXIDS_FILE = os.path.join(DATA_FOLDER, "../ref_microbe_taxids.pyobj") DATAFILE = os.path.join(DATA_FOLDER, 'gene/gene2refseq.gz') def load_genedoc(self): """ Loads gene data from NCBI's refseq2gene.gz file. Parses it based on genomic position data and refseq status provided by the list of taxids from get_ref_microbe_taxids() as lookup table :return: """
import os.path import time from utils.common import timesofar from utils.dataload import (load_start, load_done, tab2dict, tabfile_feeder, list2dict) from dataload import get_data_folder # DATA_FOLDER = os.path.join(DATA_ARCHIVE_ROOT, 'by_resources/uniprot') DATA_FOLDER = os.path.join(get_data_folder('ucsc'), 'goldenPath/currentGenomes') def load_exons_for_species(species, exons_key='exons'): refflat_file = os.path.join(DATA_FOLDER, species, 'database/refFlat.txt.gz') reflink_file = os.path.join(DATA_FOLDER, species, 'database/refLink.txt.gz') load_start(refflat_file) t0 = time.time() refseq2gene = tab2dict(reflink_file, (2, 6), 0, alwayslist=False) ref2exons = [] for ld in tabfile_feeder(refflat_file, header=0): refseq = ld[1] chr = ld[2] if chr.startswith('chr'): chr = chr[3:] exons = zip([int(x) for x in ld[9].split(',') if x], [int(x) for x in ld[10].split(',') if x]) assert len(exons) == int(ld[8]), (len(exons), int(ld[8])) ref2exons.append((refseq, { 'chr': chr,
import os.path import time from biothings.utils.common import timesofar from utils.dataload import (load_start, load_done, tab2dict, tabfile_feeder, list2dict) from dataload import get_data_folder # DATA_FOLDER = os.path.join(DATA_ARCHIVE_ROOT, 'by_resources/uniprot') DATA_FOLDER = os.path.join(get_data_folder('ucsc'), 'goldenPath/currentGenomes') REFLINK_FILE = os.path.join(get_data_folder('ucsc'), 'goldenPath/hgFixed/database/refLink.txt.gz') refseq2gene = tab2dict(REFLINK_FILE, (2, 6), 0, alwayslist=False) def load_exons_for_species(species, exons_key='exons'): refflat_file = os.path.join(DATA_FOLDER, species, 'database/refFlat.txt.gz') load_start(refflat_file) t0 = time.time() ref2exons = {} for ld in tabfile_feeder(refflat_file, header=0): refseq = ld[1] chr = ld[2] if chr.startswith('chr'): chr = chr[3:] exons = list(zip([int(x) for x in ld[9].split(',') if x], [int(x) for x in ld[10].split(',') if x])) assert len(exons) == int(ld[8]), (len(exons), int(ld[8])) ref2exons.setdefault(refseq,[]).append({ 'transcript' : refseq,
''' Populates MICROBE gene entries with genomic position data Currently updates the 120 microbial taxids that are NCBI Reference Sequences run get_ref_microbe_taxids function to get an updated file for TAXIDS_FILE when it's necessary. ''' import os.path from biothings.utils.common import (dump, loadobj, get_timestamp) from utils.dataload import (tab2list, load_start, load_done) from dataload import get_data_folder DATA_FOLDER = get_data_folder('entrez') print('DATA_FOLDER: ' + DATA_FOLDER) __metadata__ = { '__collection__': 'entrez_genomic_pos', } TAXIDS_FILE = os.path.join(DATA_FOLDER, "../ref_microbe_taxids_20151014.pyobj") DATAFILE = os.path.join(DATA_FOLDER, 'gene/gene2refseq.gz') def load_genedoc(self): """ Loads gene data from NCBI's refseq2gene.gz file. Parses it based on genomic position data and refseq status provided by the list of taxids from get_ref_microbe_taxids() as lookup table :return: """ taxids = loadobj(TAXIDS_FILE)
import os.path import time from biothings.utils.common import timesofar from utils.dataload import (load_start, load_done, tab2dict, tabfile_feeder, list2dict) from dataload import get_data_folder # DATA_FOLDER = os.path.join(DATA_ARCHIVE_ROOT, 'by_resources/uniprot') DATA_FOLDER = os.path.join(get_data_folder('ucsc'), 'goldenPath/currentGenomes') REFLINK_FILE = os.path.join(get_data_folder('ucsc'), 'goldenPath/hgFixed/database/refLink.txt.gz') refseq2gene = tab2dict(REFLINK_FILE, (2, 6), 0, alwayslist=False) def load_exons_for_species(species, exons_key='exons'): refflat_file = os.path.join(DATA_FOLDER, species, 'database/refFlat.txt.gz') load_start(refflat_file) t0 = time.time() ref2exons = {} for ld in tabfile_feeder(refflat_file, header=0): refseq = ld[1] chr = ld[2] if chr.startswith('chr'): chr = chr[3:] exons = list( zip([int(x) for x in ld[9].split(',') if x],