Esempio n. 1
0
def load_broadinstitute_exac():
    """Load the ExAC documents, merge the subset data and re-key the result.

    The documents returned by ``load_broadinstitute_exac_all()`` are the
    starting point; the "nontcga" and "nonpsych" sub-documents produced by
    the two other loaders are merged into them. Afterwards every key that
    appears as ``transid`` (third column) in the Ensembl translation file is
    replaced by the Entrez ID(s) mapped to that row's ``ensid`` (second
    column), or by ``ensid`` itself when no mapping exists.

    Returns the resulting dict.
    """
    print('DATA_FOLDER: ' + DATA_FOLDER)
    t0 = time.time()
    exacs = load_broadinstitute_exac_all()

    # Merge the subset statistics; a KeyError anywhere along the lookup path
    # makes the subset document take over (or create) the whole entry.
    subset_loaders = (
        ("nontcga", load_broadinstitute_exac_nontcga),
        ("nonpsych", load_broadinstitute_exac_nonpsych),
    )
    for subset, loader in subset_loaders:
        for key, doc in loader().items():
            try:
                exacs[key]["exac"][subset] = doc["exac"][subset]
            except KeyError:
                exacs[key] = doc

    logging.info("Convert transcript ID to EntrezID")
    import dataload.sources.ensembl.ensembl_base as ensembl_base
    parser = ensembl_base.EnsemblParser()
    parser._load_ensembl2entrez_li()
    ensembl2entrez = list2dict(parser.ensembl2entrez_li, 0, alwayslist=True)
    translation_file = os.path.join(get_data_folder("ensembl"),
                                    "gene_ensembl__translation__main.txt")
    for _, ensid, transid, _ in tabfile_feeder(translation_file):
        if transid not in exacs:
            continue
        # pop so the transcript-keyed entry is gone once it has been re-keyed
        doc = exacs.pop(transid)
        for entrezid in ensembl2entrez.get(ensid, [ensid]):
            exacs[entrezid] = doc

    load_done('[%d, %s]' % (len(exacs), timesofar(t0)))

    return exacs
Esempio n. 2
0
def load_broadinstitute_exac():
    """Build the ExAC dict from the three loaders and re-key it.

    Documents from ``load_broadinstitute_exac_all()`` receive the "nontcga"
    and "nonpsych" sub-documents from the two subset loaders. Keys that match
    a ``transid`` (third column) of the Ensembl translation file are then
    swapped for the Entrez ID(s) associated with the row's ``ensid`` (second
    column); ``ensid`` itself is used when it has no Entrez mapping.

    Returns the merged, re-keyed dict.
    """
    def _merge_subset(target, subset_docs, subset):
        # Copy the per-subset stats into an existing document; on KeyError
        # adopt the subset document as the whole entry instead.
        for tid, doc in subset_docs.items():
            try:
                target[tid]["exac"][subset] = doc["exac"][subset]
            except KeyError:
                target[tid] = doc

    print('DATA_FOLDER: ' + DATA_FOLDER)
    t0 = time.time()
    exacs = load_broadinstitute_exac_all()
    _merge_subset(exacs, load_broadinstitute_exac_nontcga(), "nontcga")
    _merge_subset(exacs, load_broadinstitute_exac_nonpsych(), "nonpsych")

    logging.info("Convert transcript ID to EntrezID")
    import dataload.sources.ensembl.ensembl_base as ensembl_base
    ensembl_parser = ensembl_base.EnsemblParser()
    ensembl_parser._load_ensembl2entrez_li()
    ensembl2entrez = list2dict(ensembl_parser.ensembl2entrez_li, 0, alwayslist=True)
    translation_path = os.path.join(get_data_folder("ensembl"),
                                    "gene_ensembl__translation__main.txt")
    for fields in tabfile_feeder(translation_path):
        _, ensid, transid, _ = fields
        try:
            # pop so the transcript-keyed entry disappears once re-keyed
            data = exacs.pop(transid)
        except KeyError:
            continue
        for entrezid in ensembl2entrez.get(ensid, [ensid]):
            exacs[entrezid] = data

    load_done('[%d, %s]' % (len(exacs), timesofar(t0)))

    return exacs
Esempio n. 3
0
from __future__ import print_function
import os.path
from collections import defaultdict

from dataload import get_data_folder
from utils.dataload import anyfile, tabfile_feeder
from biothings.utils.common import safewfile


# Data folders for the Ensembl and Entrez resources, resolved by get_data_folder().
ENSEMBL_DATA_FOLDER = get_data_folder('ensembl')
print('Ensembl DATA_FOLDER: ' + ENSEMBL_DATA_FOLDER)
Entrez_DATA_FOLDER = get_data_folder('entrez')
# This line reports the Entrez folder, so it is labelled as such.
print('Entrez DATA_FOLDER: ' + Entrez_DATA_FOLDER)


# Paths of the data files located under the Ensembl and Entrez folders.
gene_ensembl_1_xref_dm_file = os.path.join(ENSEMBL_DATA_FOLDER, "gene_ensembl__xref_entrezgene__dm.txt")
gene_ensembl_2_main_file = os.path.join(ENSEMBL_DATA_FOLDER, "gene_ensembl__gene__main.txt")
gene2ensembl_file = os.path.join(Entrez_DATA_FOLDER, "gene/gene2ensembl.gz")
gene_main_file = os.path.join(Entrez_DATA_FOLDER, "gene/gene_info.gz")

# Path inside the Ensembl folder (presumably the generated output file, going by its name).
outfile = os.path.join(ENSEMBL_DATA_FOLDER, "gene_ensembl__gene__extra.txt")


def find_multiple_mappings_from_entrezgene_file(gene_ensembl_entrezgene_dm_file):
    """Input gene_ensembl_entrezgene_dm_file, and identify how many NCBI gene IDs there are for
    each ensembl gene ID. Lines in input file are:

    'gene_ensembl__xref_entrezgene__dm.txt' (useful columns in input_file):

    col1: Ensembl gene ID
    col2: NCBI gene ID
Esempio n. 4
0
import os.path
import copy
#from config import DATA_ARCHIVE_ROOT
from dataload import get_data_folder
from utils.common import SubStr
from utils.dataload import (load_start, load_done,
                            tab2dict, tab2list, value_convert, normalized_value,
                            list2dict, dict_nodup, dict_attrmerge
                            )

#DATA_FOLDER = os.path.join(DATA_ARCHIVE_ROOT, 'by_resources/ensembl/69')
DATA_FOLDER = get_data_folder('ensembl')
print('DATA_FOLDER: ' + DATA_FOLDER)


#fn to skip lines with LRG records.'''
_not_LRG = lambda ld: not ld[1].startswith("LRG_")


class EnsemblParser:
    def __init__(self):
        self.ensembl2entrez_li = None
        self.ensembl_main = None

    def _load_ensembl_2taxid(self):
        """ensembl2taxid"""
        DATAFILE = os.path.join(DATA_FOLDER, 'gene_ensembl__translation__main.txt')
        load_start(DATAFILE)
        ensembl2taxid = dict_nodup(tab2dict(DATAFILE, (0, 1), 1, includefn=_not_LRG))
        # need to convert taxid to integer here
        ensembl2taxid = value_convert(ensembl2taxid, lambda x: int(x))
Esempio n. 5
0
import os.path
import time
from biothings.utils.common import timesofar
from utils.dataload import (load_start, load_done, tab2dict,
                            tabfile_feeder, list2dict)

from dataload import get_data_folder
import logging

# Folder containing the ExAC data files, resolved through get_data_folder().
# Legacy hard-coded location kept for reference (note that it points at
# 'uniprot', not 'exac' -- it looks like a copy-paste leftover):
# DATA_FOLDER = os.path.join(DATA_ARCHIVE_ROOT, 'by_resources/uniprot')
DATA_FOLDER = get_data_folder('exac')


def load_broadinstitute_exac_any(one_file,key):
    print("Loading file %s (%s)" % (one_file,key))
    data = tab2dict(os.path.join(DATA_FOLDER, one_file), (0,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21), 0)
    exacs = {}
    for transcript in data:
        tupleexac = data[transcript]
        # remove version in key so we can search the dict easily later
        exacs[transcript.split(".")[0]] = {"exac" : 
                {
                    "transcript" : transcript,  # but keep version here
                    "n_exons" : int(tupleexac[0]),
                    "cds_start" : int(tupleexac[1]),
                    "cds_end" : int(tupleexac[2]),
                    "bp" : int(tupleexac[3]),
                    key : {
                        "mu_syn" : float(tupleexac[4]),
                        "mu_mis" : float(tupleexac[5]),
                        "mu_lof" : float(tupleexac[6]),
Esempio n. 6
0
import os.path
import copy
#from config import DATA_ARCHIVE_ROOT
from dataload import get_data_folder
from utils.common import SubStr
from utils.dataload import (load_start, load_done,
                            tab2dict, tab2list, value_convert, normalized_value,
                            list2dict, dict_nodup, dict_attrmerge
                            )

#DATA_FOLDER = os.path.join(DATA_ARCHIVE_ROOT, 'by_resources/ensembl/69')
DATA_FOLDER = get_data_folder('ensembl')
print('DATA_FOLDER: ' + DATA_FOLDER)


#fn to skip lines with LRG records.'''
def _not_LRG(ld):
    return not ld[1].startswith("LRG_")


class EnsemblParser:
    def __init__(self):
        self.ensembl2entrez_li = None
        self.ensembl_main = None

    def _load_ensembl_2taxid(self):
        """ensembl2taxid"""
        DATAFILE = os.path.join(DATA_FOLDER, 'gene_ensembl__translation__main.txt')
        load_start(DATAFILE)
        ensembl2taxid = dict_nodup(tab2dict(DATAFILE, (0, 1), 1, includefn=_not_LRG))
        # need to convert taxid to integer here
from __future__ import print_function
import os.path
from collections import defaultdict

from dataload import get_data_folder
from utils.dataload import anyfile, tabfile_feeder
from biothings.utils.common import safewfile

# Data folders for the Ensembl and Entrez resources, resolved by get_data_folder().
ENSEMBL_DATA_FOLDER = get_data_folder('ensembl')
print('Ensembl DATA_FOLDER: ' + ENSEMBL_DATA_FOLDER)
Entrez_DATA_FOLDER = get_data_folder('entrez')
# This line reports the Entrez folder, so it is labelled as such.
print('Entrez DATA_FOLDER: ' + Entrez_DATA_FOLDER)

# Paths of the data files located under the Ensembl and Entrez folders.
gene_ensembl_1_xref_dm_file = os.path.join(
    ENSEMBL_DATA_FOLDER, "gene_ensembl__xref_entrezgene__dm.txt")
gene_ensembl_2_main_file = os.path.join(ENSEMBL_DATA_FOLDER,
                                        "gene_ensembl__gene__main.txt")
gene2ensembl_file = os.path.join(Entrez_DATA_FOLDER, "gene/gene2ensembl.gz")
gene_main_file = os.path.join(Entrez_DATA_FOLDER, "gene/gene_info.gz")

# Path inside the Ensembl folder (presumably the generated output file, going by its name).
outfile = os.path.join(ENSEMBL_DATA_FOLDER, "gene_ensembl__gene__extra.txt")


def find_multiple_mappings_from_entrezgene_file(
        gene_ensembl_entrezgene_dm_file):
    """Input gene_ensembl_entrezgene_dm_file, and identify how many NCBI gene IDs there are for
    each ensembl gene ID. Lines in input file are:

    'gene_ensembl__xref_entrezgene__dm.txt' (useful columns in input_file):

    col1: Ensembl gene ID
Esempio n. 8
0
import os.path
import types
import time
from utils.common import timesofar
from utils.dataload import (load_start, load_done,
                            listitems, dupline_seperator,
                            tabfile_feeder, list2dict, list_nondup,
                            value_convert)
from config import DATA_ARCHIVE_ROOT
from dataload import get_data_folder

#DATA_FOLDER = os.path.join(DATA_ARCHIVE_ROOT, 'by_resources/uniprot')
DATA_FOLDER = get_data_folder('uniprot')

#REF:
#ftp://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/idmapping/README

def get_uniprot_section(uniprotkb_id):
    '''Return either "Swiss-Prot" or "TrEMBL", the two sections of UniProtKB,
       based on the input uniprotkb_id (the entry name).

       The rule is (http://www.uniprot.org/manual/entry_name):
           Swiss-Prot entries have a maximum of 5 characters before
           the "_"; TrEMBL entries have 6 or 10 characters before the "_"
           (the accession number).
        some examples:
            TrEMBL: O61847_CAEEL, F0YED1_AURAN, A0A024RB10_HUMAN
            Swiss-Prot: CDK2_HUMAN, CDK2_MOUSE

       Raises ValueError when the ID does not consist of exactly two
       "_"-separated parts.
    '''
    v = uniprotkb_id.split('_')
    if len(v) != 2:
        raise ValueError('Invalid UniprotKB ID')
    # Accession-based prefixes are 6 characters long, or 10 for the newer
    # extended accessions; anything else is treated as Swiss-Prot.
    return 'TrEMBL' if len(v[0]) in (6, 10) else "Swiss-Prot"
Esempio n. 9
0
import os.path
import time

from biothings.utils.common import get_timestamp, timesofar
from utils.dataload import (load_start, load_done, tabfile_feeder,
                            list2dict, value_convert, dict_convert)
from dataload import get_data_folder

DATA_FOLDER = get_data_folder('cpdb')


def _download(__metadata__):
    """Download the CPDB pathway-gene files for human, mouse and yeast.

    For each species the URL is read from ``__metadata__['__url_<species>__']``
    and passed to ``utils.dataload.download`` together with a timestamped
    folder (a sibling of DATA_FOLDER) and the file name
    ``CPDB_pathways_genes_<species>.tab``.
    """
    # Imported locally under a different name so it cannot be confused with
    # this function, which is also called ``_download``.
    from utils.dataload import download as fetch

    parent_dir = os.path.dirname(DATA_FOLDER)
    target_dir = os.path.join(parent_dir, get_timestamp())
    for organism in ('human', 'mouse', 'yeast'):
        fetch(
            __metadata__['__url_{}__'.format(organism)],
            target_dir,
            'CPDB_pathways_genes_{}.tab'.format(organism)
        )


def load_cpdb(__metadata__):
    # only import pathways from these sources
    PATHWAY_SOURCES_INCLUDED = __metadata__['pathway_sources_included']
    VALID_COLUMN_NO = 4

    t0 = time.time()
    print('DATA_FOLDER: ' + DATA_FOLDER)
    DATA_FILES = []
    DATA_FILES.append(os.path.join(DATA_FOLDER, 'CPDB_pathways_genes_mouse.tab'))
    DATA_FILES.append(os.path.join(DATA_FOLDER, 'CPDB_pathways_genes_yeast.tab'))
Esempio n. 10
0
import os.path
from utils.dataload import (load_start, load_done, tab2dict, value_convert)
from dataload import get_data_folder
#from config import DATA_ARCHIVE_ROOT

#timestamp = '20121005'
#DATA_FOLDER = os.path.join(DATA_ARCHIVE_ROOT, 'by_resources/pharmgkb', timestamp)
DATA_FOLDER = get_data_folder('pharmgkb')

def load_pharmgkb():
    """Load the PharmGKB mapping from ``genes.tsv`` inside ``genes.zip``.

    The file is read with ``tab2dict`` (columns 0 and 1, keyed on index 1,
    one header line skipped), keeping only lines whose second field is not
    empty. Each value is then wrapped as ``{'pharmgkb': value}``.

    Returns the converted dict.
    """
    def _second_field_present(ld):
        return ld[1] != ''

    def _wrap(value):
        return {'pharmgkb': value}

    print('DATA_FOLDER: ' + DATA_FOLDER)
    zip_path = os.path.join(DATA_FOLDER, 'genes.zip')
    load_start(zip_path)
    mapping = tab2dict((zip_path, 'genes.tsv'), (0, 1), 1, header=1,
                       includefn=_second_field_present)
    mapping = value_convert(mapping, _wrap, traverse_list=False)
    load_done('[%d]' % len(mapping))
    return mapping
Esempio n. 11
0
import os.path
import time
from biothings.utils.common import timesofar
from utils.dataload import (load_start, load_done, listitems,
                            dupline_seperator, tabfile_feeder, list2dict,
                            list_nondup, value_convert)
#from config import DATA_ARCHIVE_ROOT
from dataload import get_data_folder

#DATA_FOLDER = os.path.join(DATA_ARCHIVE_ROOT, 'by_resources/uniprot')
DATA_FOLDER = get_data_folder('uniprot')

#REF:
#ftp://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/idmapping/README
VALID_COLUMN_NO = 22


def get_uniprot_section(uniprotkb_id):
    '''return either "Swiss-Prot" or "TrEMBL", two sections of UniProtKB,
       based on input uniprotkb_id, or entry name.
       The rule is (http://www.uniprot.org/manual/entry_name):
           Swiss-Prot entries have a maximum of 5 characters before
           the "_", TrEMBL entries have 6 or 10 characters before the "_" (the accession
        some examples:
            TrEMBL: O61847_CAEEL, F0YED1_AURAN, A0A024RB10_HUMAN
            Swiss-Prot: CDK2_HUMAN, CDK2_MOUSE
    '''
    v = uniprotkb_id.split('_')
    if len(v) != 2:
        raise ValueError('Invalid UniprotKB ID')
    #return 'TrEMBL' if len(v[0])==6 else "Swiss-Prot"
Esempio n. 12
0
import os.path
from utils.dataload import (load_start, load_done, tab2dict, value_convert)
from dataload import get_data_folder
#from config import DATA_ARCHIVE_ROOT

#timestamp = '20121005'
#DATA_FOLDER = os.path.join(DATA_ARCHIVE_ROOT, 'by_resources/pharmgkb', timestamp)
DATA_FOLDER = get_data_folder('pharmgkb')


def load_pharmgkb():
    """Read ``genes.tsv`` from ``genes.zip`` and wrap each value for PharmGKB.

    ``tab2dict`` is called on columns 0 and 1 (keyed on index 1, one header
    line skipped) and only lines with a non-empty second field are kept.
    Every value of the resulting dict is converted to ``{'pharmgkb': value}``.

    Returns the converted dict.
    """
    print('DATA_FOLDER: ' + DATA_FOLDER)
    archive = os.path.join(DATA_FOLDER, 'genes.zip')
    load_start(archive)

    raw = tab2dict(
        (archive, 'genes.tsv'),
        (0, 1),
        1,
        header=1,
        includefn=lambda fields: fields[1] != ''
    )
    result = value_convert(raw,
                           lambda pgkb_value: {'pharmgkb': pgkb_value},
                           traverse_list=False)

    load_done('[%d]' % len(result))
    return result
Esempio n. 13
0
import os.path
import time
from biothings.utils.common import timesofar
from utils.dataload import (load_start, load_done, tab2dict, tabfile_feeder,
                            list2dict)

from dataload import get_data_folder
import logging

# Folder containing the ExAC data files, resolved through get_data_folder().
# Legacy hard-coded location kept for reference (note that it points at
# 'uniprot', not 'exac' -- it looks like a copy-paste leftover):
# DATA_FOLDER = os.path.join(DATA_ARCHIVE_ROOT, 'by_resources/uniprot')
DATA_FOLDER = get_data_folder('exac')


def load_broadinstitute_exac_any(one_file, key):
    print("Loading file %s (%s)" % (one_file, key))
    data = tab2dict(os.path.join(DATA_FOLDER, one_file),
                    (0, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17,
                     18, 19, 20, 21), 0)
    exacs = {}
    for transcript in data:
        tupleexac = data[transcript]
        # remove version in key so we can search the dict easily later
        exacs[transcript.split(".")[0]] = {
            "exac": {
                "transcript": transcript,  # but keep version here
                "n_exons": int(tupleexac[0]),
                "cds_start": int(tupleexac[1]),
                "cds_end": int(tupleexac[2]),
                "bp": int(tupleexac[3]),
                key: {
                    "mu_syn": float(tupleexac[4]),
Esempio n. 14
0
'''
Populates MICROBE gene entries with genomic position data
Currently updates the 120 microbial taxids that are NCBI Reference Sequences

run get_ref_microbe_taxids function to get an updated file for TAXIDS_FILE
when it's necessary.
'''
import os.path
from biothings.utils.common import (dump, loadobj, get_timestamp)
from utils.dataload import (tab2list, load_start, load_done)
from dataload import get_data_folder


DATA_FOLDER = get_data_folder('entrez')
print('DATA_FOLDER: ' + DATA_FOLDER)

__metadata__ = {
    '__collection__': 'entrez_genomic_pos',
}

TAXIDS_FILE = os.path.join(DATA_FOLDER, "../ref_microbe_taxids.pyobj")
DATAFILE = os.path.join(DATA_FOLDER, 'gene/gene2refseq.gz')


def load_genedoc(self):
    """
    Loads gene data from NCBI's refseq2gene.gz file.
    Parses it based on genomic position data and refseq status provided by the
    list of taxids from get_ref_microbe_taxids() as lookup table
    :return:
    """
Esempio n. 15
0
import os.path
import time

from biothings.utils.common import get_timestamp, timesofar
from utils.dataload import (load_start, load_done, tabfile_feeder,
                            list2dict, value_convert, dict_convert)
from dataload import get_data_folder

DATA_FOLDER = get_data_folder('cpdb')


def _download(__metadata__):
    """Fetch the per-species CPDB pathway-gene files.

    Loops over human, mouse and yeast; for each one the URL stored under
    ``__metadata__['__url_<species>__']`` is handed to
    ``utils.dataload.download`` along with a timestamped folder next to
    DATA_FOLDER and the file name ``CPDB_pathways_genes_<species>.tab``.
    """
    from utils.dataload import download as _fetch

    # Bound ``format`` methods act as small name builders for each species.
    url_key_for = '__url_{}__'.format
    file_name_for = 'CPDB_pathways_genes_{}.tab'.format

    base_dir, _unused_leaf = os.path.split(DATA_FOLDER)
    out_dir = os.path.join(base_dir, get_timestamp())
    for sp in ['human', 'mouse', 'yeast']:
        _fetch(__metadata__[url_key_for(sp)], out_dir, file_name_for(sp))


def load_cpdb(__metadata__):
    # only import pathways from these sources
    PATHWAY_SOURCES_INCLUDED = __metadata__['pathway_sources_included']
    VALID_COLUMN_NO = 4

    t0 = time.time()
    print('DATA_FOLDER: ' + DATA_FOLDER)
    DATA_FILES = []
    DATA_FILES.append(os.path.join(DATA_FOLDER, 'CPDB_pathways_genes_mouse.tab'))
    DATA_FILES.append(os.path.join(DATA_FOLDER, 'CPDB_pathways_genes_yeast.tab'))
Esempio n. 16
0
import os.path
import time
from utils.common import timesofar
from utils.dataload import (load_start, load_done, tab2dict,
                            tabfile_feeder, list2dict)

from dataload import get_data_folder

# Base folder of the per-species UCSC data: 'goldenPath/currentGenomes'
# under the 'ucsc' data folder.
# Legacy hard-coded location kept for reference (it names 'uniprot', which
# looks like a copy-paste leftover):
# DATA_FOLDER = os.path.join(DATA_ARCHIVE_ROOT, 'by_resources/uniprot')
DATA_FOLDER = os.path.join(get_data_folder('ucsc'), 'goldenPath/currentGenomes')


def load_exons_for_species(species, exons_key='exons'):
    refflat_file = os.path.join(DATA_FOLDER, species, 'database/refFlat.txt.gz')
    reflink_file = os.path.join(DATA_FOLDER, species, 'database/refLink.txt.gz')

    load_start(refflat_file)
    t0 = time.time()

    refseq2gene = tab2dict(reflink_file, (2, 6), 0, alwayslist=False)
    ref2exons = []
    for ld in tabfile_feeder(refflat_file, header=0):
        refseq = ld[1]
        chr = ld[2]
        if chr.startswith('chr'):
            chr = chr[3:]
        exons = zip([int(x) for x in ld[9].split(',') if x],
                    [int(x) for x in ld[10].split(',') if x])
        assert len(exons) == int(ld[8]), (len(exons), int(ld[8]))
        ref2exons.append((refseq, {
            'chr': chr,
Esempio n. 17
0
import os.path
import time
from biothings.utils.common import timesofar
from utils.dataload import (load_start, load_done, tab2dict,
                            tabfile_feeder, list2dict)

from dataload import get_data_folder

# Base folder of the per-species UCSC data: 'goldenPath/currentGenomes'
# under the 'ucsc' data folder.
# Legacy hard-coded location kept for reference (it names 'uniprot', which
# looks like a copy-paste leftover):
# DATA_FOLDER = os.path.join(DATA_ARCHIVE_ROOT, 'by_resources/uniprot')
DATA_FOLDER = os.path.join(get_data_folder('ucsc'), 'goldenPath/currentGenomes')
# Single refLink file under hgFixed (not per species).
REFLINK_FILE = os.path.join(get_data_folder('ucsc'), 'goldenPath/hgFixed/database/refLink.txt.gz')
# Built once, when this module is imported. NOTE(review): presumably maps a
# RefSeq accession (column 2) to a gene ID (column 6) -- confirm against the
# refLink schema and tab2dict's argument semantics.
refseq2gene = tab2dict(REFLINK_FILE, (2, 6), 0, alwayslist=False)


def load_exons_for_species(species, exons_key='exons'):
    refflat_file = os.path.join(DATA_FOLDER, species, 'database/refFlat.txt.gz')

    load_start(refflat_file)
    t0 = time.time()

    ref2exons = {}
    for ld in tabfile_feeder(refflat_file, header=0):
        refseq = ld[1]
        chr = ld[2]
        if chr.startswith('chr'):
            chr = chr[3:]
        exons = list(zip([int(x) for x in ld[9].split(',') if x],
                     [int(x) for x in ld[10].split(',') if x]))
        assert len(exons) == int(ld[8]), (len(exons), int(ld[8]))
        ref2exons.setdefault(refseq,[]).append({
            'transcript' : refseq,
Esempio n. 18
0
'''
Populates MICROBE gene entries with genomic position data
Currently updates the 120 microbial taxids that are NCBI Reference Sequences

run get_ref_microbe_taxids function to get an updated file for TAXIDS_FILE
when it's necessary.
'''
import os.path
from biothings.utils.common import (dump, loadobj, get_timestamp)
from utils.dataload import (tab2list, load_start, load_done)
from dataload import get_data_folder

DATA_FOLDER = get_data_folder('entrez')
print('DATA_FOLDER: ' + DATA_FOLDER)

__metadata__ = {
    '__collection__': 'entrez_genomic_pos',
}

TAXIDS_FILE = os.path.join(DATA_FOLDER, "../ref_microbe_taxids_20151014.pyobj")
DATAFILE = os.path.join(DATA_FOLDER, 'gene/gene2refseq.gz')


def load_genedoc(self):
    """
    Loads gene data from NCBI's refseq2gene.gz file.
    Parses it based on genomic position data and refseq status provided by the
    list of taxids from get_ref_microbe_taxids() as lookup table
    :return:
    """
    taxids = loadobj(TAXIDS_FILE)
Esempio n. 19
0
import os.path
import time
from biothings.utils.common import timesofar
from utils.dataload import (load_start, load_done, tab2dict, tabfile_feeder,
                            list2dict)

from dataload import get_data_folder

# Base folder of the per-species UCSC data: 'goldenPath/currentGenomes'
# under the 'ucsc' data folder.
# Legacy hard-coded location kept for reference (it names 'uniprot', which
# looks like a copy-paste leftover):
# DATA_FOLDER = os.path.join(DATA_ARCHIVE_ROOT, 'by_resources/uniprot')
DATA_FOLDER = os.path.join(get_data_folder('ucsc'),
                           'goldenPath/currentGenomes')
# Single refLink file under hgFixed (not per species).
REFLINK_FILE = os.path.join(get_data_folder('ucsc'),
                            'goldenPath/hgFixed/database/refLink.txt.gz')
# Built once, when this module is imported. NOTE(review): presumably maps a
# RefSeq accession (column 2) to a gene ID (column 6) -- confirm against the
# refLink schema and tab2dict's argument semantics.
refseq2gene = tab2dict(REFLINK_FILE, (2, 6), 0, alwayslist=False)


def load_exons_for_species(species, exons_key='exons'):
    refflat_file = os.path.join(DATA_FOLDER, species,
                                'database/refFlat.txt.gz')

    load_start(refflat_file)
    t0 = time.time()

    ref2exons = {}
    for ld in tabfile_feeder(refflat_file, header=0):
        refseq = ld[1]
        chr = ld[2]
        if chr.startswith('chr'):
            chr = chr[3:]
        exons = list(
            zip([int(x) for x in ld[9].split(',') if x],