Example 1
0
def load_data(data_folder):
    """Yield geneset documents parsed from MSigDB ``.gmt`` files (entrez ids)."""
    for path in glob.glob(os.path.join(data_folder, "msigdb.*.entrez.gmt")):
        # Pass 1: collect the union of every gene id appearing in the file
        all_genes = set()
        for row in tabfile_feeder(path, header=0):
            all_genes.update(row[2:])
        # Resolve the collected entrez ids via mygene (human)
        lookup = IDLookup(9606)  # Human genes
        lookup.query_mygene(all_genes, 'entrezgene')

        # Pass 2: emit one document per geneset line
        for row in tabfile_feeder(path, header=0):
            set_name, set_url = row[0], row[1]
            # Keep only genes that resolved to something truthy in the cache
            genes = [lookup.query_cache[g] for g in row[2:]
                     if lookup.query_cache.get(g)]
            yield {
                '_id': set_name,
                'is_public': True,
                'taxid': 9606,
                'genes': genes,
                'source': 'msigdb',
                'msigdb': {
                    'id': set_name,
                    'geneset_name': set_name,
                    'url': set_url
                }
            }
Example 2
0
def load_data(data_folder):
    """Yield geneset documents parsed from the Reactome ``.gmt`` file."""
    path = os.path.join(data_folder, "ReactomePathways.gmt")
    # Pass 1: union of every gene symbol appearing in the file
    all_genes = set()
    for row in tabfile_feeder(path, header=0):
        all_genes.update(row[2:])
    # Resolve symbols via mygene (human)
    lookup = IDLookup(9606)  # Human genes
    lookup.query_mygene(all_genes, 'symbol')

    # Pass 2: emit one document per pathway line
    for row in tabfile_feeder(path, header=0):
        pathway_name, pathway_id = row[0], row[1]
        # Keep only symbols that resolved to something truthy in the cache
        genes = [lookup.query_cache[g] for g in row[2:]
                 if lookup.query_cache.get(g)]
        yield {
            '_id': pathway_id,
            'name': pathway_name,
            'is_public': True,
            'taxid': 9606,
            'genes': genes,
            'source': 'reactome',
            'reactome': {
                'id': pathway_id,
                'geneset_name': pathway_name,
            }
        }
Example 3
0
def find_ncbi_symbols(gene_info_file, ensembl_dict):
    """Resolve the NCBI gene ids referenced by *ensembl_dict* to gene symbols.

    gene_info_file: NCBI Entrez ``gene_info`` dump, tab-separated; column 1
    holds the GeneID and column 2 the Symbol.
    Returns {ncbi_gene_id: symbol} for every id found in the file.
    """
    print("step 4 start: read NCBI gene symbol")
    # Dict used as a set: keys are the NCBI ids we need to look up.
    ncbi_list_to_find = {}
    for key in ensembl_dict:
        for ncbi_id in ensembl_dict[key]['data']['ncbi_list']:
            ncbi_list_to_find[ncbi_id] = True
    # BUGFIX: removed a leftover loop that treated the dict's *keys* as
    # nested sublists ("for item in sublist" over each id string), which
    # split every id into single characters and polluted the lookup set
    # with bogus 1-char keys (inflating the count below and potentially
    # matching single-digit gene ids by accident).

    ncbi_id_symbols = {}
    for ld in tabfile_feeder(gene_info_file):
        if ld[1] in ncbi_list_to_find:
            ncbi_id_symbols[ld[1]] = ld[2]

    print(
        "number of unique NCBI gene IDs to be queried using Entrez gene_info file: ",
        len(ncbi_list_to_find))
    print("number symbols found in NCBI file: ", len(ncbi_id_symbols))
    print("step 4 end")
    return ncbi_id_symbols
Example 4
0
File: parser.py Project: ravila4/go
def parse_gene_annotations(f):
    """Parse a gene annotation (.gaf.gz) file.

    Returns a dict keyed by GO id (":" replaced with "_"); each value holds
    ``_id``/``is_public``/``taxid`` plus sets of (uniprot, symbol) tuples
    bucketed by the record's qualifiers.
    """
    data = tabfile_feeder(f, header=0)
    genesets = {}
    for rec in data:
        if rec[0].startswith("!"):
            continue  # GAF comment/header line
        _id = rec[4].replace(":", "_")
        if genesets.get(_id) is None:
            # The taxon column may list several taxa separated by "|"; keep the first.
            taxid = rec[12].split("|")[0].replace("taxon:", "")
            genesets[_id] = {
                "_id": _id + "_" + taxid,
                "is_public": True,
                "taxid": taxid
            }
        uniprot = rec[1]
        symbol = rec[2]
        qualifiers = rec[3].split("|")
        # The gene can belong to several sets:
        if "NOT" in qualifiers:
            # Genes similar to genes in go term, but should be excluded
            genesets[_id].setdefault("excluded_genes", set()).add(
                (uniprot, symbol))
        if "contributes_to" in qualifiers:
            # Genes that contribute to the specified go term
            genesets[_id].setdefault("contributing_genes", set()).add(
                (uniprot, symbol))
        if "colocalizes_with" in qualifiers:
            # Genes colocalized with specified go term
            genesets[_id].setdefault("colocalized_genes", set()).add(
                (uniprot, symbol))
        if not {"NOT", "contributes_to", "colocalizes_with"} & set(qualifiers):
            # Default set: genes that belong to go term.
            # BUGFIX: the original `else` paired only with the last `if`, so
            # "NOT"-qualified (explicitly excluded) genes were *also* added
            # to this default set.
            genesets[_id].setdefault("genes", set()).add((uniprot, symbol))
    return genesets
Example 5
0
def load_broadinstitute_exac(data_folder):
    """Merge ExAC "all"/"nonTCGA"/"nonPsych" datasets and re-key by Entrez id.

    Starts from the full ExAC dataset, grafts the nontcga/nonpsych sub-trees
    onto matching transcript documents (standalone entry when unmatched),
    then converts Ensembl transcript ids to Entrez gene ids using the
    previously dumped Ensembl source files.
    """
    # NOTE: removed an unused `t0 = time.time()` local (never read).
    exacs = load_broadinstitute_exac_all(data_folder)
    for k, v in load_broadinstitute_exac_nontcga(data_folder).items():
        try:
            exacs[k]["exac"]["nontcga"] = v["exac"]["nontcga"]
        except KeyError:
            # transcript only present in the nontcga subset
            exacs[k] = v
    for k, v in load_broadinstitute_exac_nonpsych(data_folder).items():
        try:
            exacs[k]["exac"]["nonpsych"] = v["exac"]["nonpsych"]
        except KeyError:
            # transcript only present in the nonpsych subset
            exacs[k] = v

    logging.info("Convert transcript ID to EntrezID")
    from ..ensembl.parser import EnsemblParser
    from biothings.utils.hub_db import get_src_dump
    # Locate the ensembl source dump registered in the hub database.
    ensembl_doc = get_src_dump().find_one({"_id": "ensembl"}) or {}
    ensembl_dir = ensembl_doc.get("data_folder")
    assert ensembl_dir, "Can't find Ensembl data directory (used for id conversion)"
    ensembl_parser = EnsemblParser(ensembl_dir)
    ensembl_parser._load_ensembl2entrez_li()
    ensembl2entrez = list2dict(ensembl_parser.ensembl2entrez_li, 0, alwayslist=True)
    for line in tabfile_feeder(os.path.join(ensembl_dir, "gene_ensembl__translation__main.txt")):
        _, ensid, transid, _ = line
        if transid in exacs:
            data = exacs.pop(transid)  # pop so no-match means no data in the end
            # Fall back to the ensembl gene id itself when no entrez mapping exists.
            for entrezid in ensembl2entrez.get(ensid, [ensid]):
                exacs[entrezid] = data

    return exacs
Example 6
0
def load_exons_for_species(data_folder, species, exons_key='exons'):
    """Build a gene-id -> exon-structure mapping from UCSC refFlat/refLink files.

    Reads ``<species>/database/refFlat.txt.gz`` for per-transcript exon
    coordinates and ``../hgFixed/database/refLink.txt.gz`` to map refseq
    accessions to gene ids. Returns {geneid: {exons_key: [transcript dicts]}}.
    """
    # NOTE: removed an unused `t0 = time.time()` local (never read).
    refflat_file = os.path.join(data_folder, species,
                                'database/refFlat.txt.gz')
    ref2exons = {}
    for ld in tabfile_feeder(refflat_file, header=0):
        refseq = ld[1]
        chrom = ld[2]  # renamed from `chr` to avoid shadowing the builtin
        if chrom.startswith('chr'):
            chrom = chrom[3:]
        # exonStarts/exonEnds are comma-separated with a trailing comma,
        # hence the `if x` filter against empty items.
        exons = list(
            zip([int(x) for x in ld[9].split(',') if x],
                [int(x) for x in ld[10].split(',') if x]))
        # ld[8] is the declared exon count; fail loudly on malformed rows.
        assert len(exons) == int(ld[8]), (len(exons), int(ld[8]))
        ref2exons.setdefault(refseq, []).append({
            'transcript': refseq,
            'chr': chrom,
            'strand': -1 if ld[3] == '-' else 1,
            'txstart': int(ld[4]),
            'txend': int(ld[5]),
            'cdsstart': int(ld[6]),
            'cdsend': int(ld[7]),
            'position': exons
        })

    gene2exons = {}
    reflink_file = os.path.join(data_folder,
                                '../hgFixed/database/refLink.txt.gz')
    # refLink: column 2 = refseq accession, column 6 = gene id
    refseq2gene = tab2dict(reflink_file, (2, 6), 0, alwayslist=False)
    for refseq in sorted(ref2exons.keys()):
        geneid = refseq2gene.get(refseq, None)
        # '0' marks "no gene id" in refLink — skip those too.
        if geneid and geneid != '0':
            if geneid not in gene2exons:
                gene2exons[geneid] = {exons_key: ref2exons[refseq]}
            else:
                gene2exons[geneid][exons_key].extend(ref2exons[refseq])

    return gene2exons
Example 7
0
def load_cpdb(data_folder, pathways):
    """Parse ConsensusPathDB per-species pathway files into gene->pathway docs.

    pathways: iterable of lower-case source names to keep (e.g. 'kegg').
    Returns {gene: {'pathway': {source: pathway_dict_or_list}}}.
    """
    # only import pathways from these sources
    PATHWAY_SOURCES_INCLUDED = pathways
    VALID_COLUMN_NO = 4

    # NOTE: removed an unused `t0 = time.time()` local (never read).
    DATA_FILES = [
        os.path.join(data_folder, 'CPDB_pathways_genes_mouse.tab'),
        os.path.join(data_folder, 'CPDB_pathways_genes_yeast.tab'),
        os.path.join(data_folder, 'CPDB_pathways_genes_human.tab'),
    ]

    _out = []
    for DATA_FILE in DATA_FILES:
        for ld in tabfile_feeder(DATA_FILE, header=1, assert_column_no=VALID_COLUMN_NO):
            p_name, p_id, p_source = ld[:3]
            p_source = p_source.lower()
            # KEGG ids arrive prefixed with "path:"; strip the prefix
            if p_source == 'kegg' and p_id.startswith('path:'):
                p_id = p_id[5:]
            if p_source in PATHWAY_SOURCES_INCLUDED:
                # last column is the comma-separated gene list
                genes = ld[-1].split(",")
                for gene in genes:
                    _out.append((gene, p_name, p_id, p_source))
    _out = list2dict(_out, 0, alwayslist=True)

    def _inner_cvt(p):
        # (name, id) tuple -> pathway dict; drop the literal string 'None' id
        p_name, p_id = p
        _d = {'name': p_name}
        if p_id != 'None':
            _d['id'] = p_id
        return _d

    def _cvt(pli):
        # Group a gene's pathways by source; sort multi-entry sources by id
        _d = list2dict(pli, 2)
        _d = value_convert(_d, _inner_cvt)
        for p_source in _d:
            if isinstance(_d[p_source], list):
                _d[p_source].sort(key=lambda e: e["id"])
        return {'pathway': _d}

    _out = dict_convert(_out, valuefn=_cvt)

    return _out
Example 8
0
def load_data(data_folder):
    """Yield WikiPathways geneset documents from per-species ``.gmt`` files."""
    def get_taxid(species):
        # Map a species name (as embedded in the filename) to its NCBI taxid.
        taxids = {
            "Mus musculus": 10090,
            "Bos taurus": 9913,
            # BUGFIX: this key was garbled as "H**o sapiens", which made every
            # human file raise KeyError in the lookup below.
            "Homo sapiens": 9606,
            "Anopheles gambiae": 180454,
            "Arabidopsis thaliana": 3702,
            "Caenorhabditis elegans": 6239,
            "Canis familiaris": 9615,
            "Danio rerio": 7955,
            "Drosophila melanogaster": 7227,
            "Equus caballus": 9796,
            "Gallus gallus": 9031,
            "Oryza sativa": 39947,
            "Pan troglodytes": 9598,
            "Rattus norvegicus": 10116,
            "Saccharomyces cerevisiae": 559292,
            "Populus trichocarpa": 3694,
            "Sus scrofa": 9823
        }
        return taxids[species]

    # Load .gmt (Gene Matrix Transposed) files
    for f in glob.glob(os.path.join(data_folder, "*.gmt")):
        # Get species name from the filename and convert to taxid
        species = f.replace(".gmt", "").split("-")[-1].replace("_", " ")
        taxid = get_taxid(species)
        print("Parsing data for {} ({})".format(species, taxid))
        # Read entire file and fetch data for joint set of all genes
        data = tabfile_feeder(f, header=0)
        all_genes = []
        for rec in data:
            all_genes += rec[2:]
        all_genes = set(all_genes)
        lookup = IDLookup(taxid)
        lookup.query_mygene(all_genes, 'entrezgene')

        # Parse each individual document
        data = tabfile_feeder(f, header=0)
        for rec in data:
            # Header format: name%source%wikipathways_id%species
            header = rec[0].split("%")
            # Get fields from header
            pathway_name = header[0]
            wikipathways_id = header[2]
            assert species == header[3], "Species does not match."
            # Get URL and gene list
            url = rec[1]
            ncbigenes = rec[2:]
            # Keep only genes that resolved to something truthy in the cache
            genes = []
            for g in ncbigenes:
                if lookup.query_cache.get(g):
                    genes.append(lookup.query_cache[g])

            # Format schema
            doc = {
                '_id': wikipathways_id,
                'is_public': True,
                'taxid': taxid,
                'genes': genes,
                'wikipathways': {
                    'id': wikipathways_id,
                    'pathway_name': pathway_name,
                    'url': url
                }
            }
            yield doc
Example 9
0
def load_all(data_folder):
    '''Load "uniprot" using yield, while building "PDB" and "PIR"
    data dict while reading data file. These dict are then dumped
    (pickled) and stored later'''
    def cvt_fn(pdb_id):
        # Keep only the PDB entry id, dropping the chain suffix ("1ABC:A" -> "1ABC")
        return pdb_id.split(':')[0]

    def merge(xli, transcode=False):
        # Turn one (acc, section, entrez, ensembl) tuple into
        # (acc, section, gene_id) tuples. When only an ensembl id is present
        # and transcode is False, raise KeyError so the caller can retry later
        # with the complete ensembl2geneid mapping.
        xli2 = []
        uniprot_acc, section, entrez_id, ensembl_id = xli
        if entrez_id:
            xli2.append((uniprot_acc, section, entrez_id))
        elif ensembl_id:
            if not transcode:
                raise KeyError(ensembl_id)
            try:
                entrez_id = ensembl2geneid[ensembl_id]
                #if ensembl_id can be mapped to entrez_id
                for _eid in entrez_id:
                    xli2.append((uniprot_acc, section, _eid))
            except KeyError:
                xli2.append((uniprot_acc, section, ensembl_id))
        return xli2

    def transform(xli2):
        # Group (acc, section, gene_id) tuples by gene id and emit one
        # document per gene. NOTE: removed a dead pre-loop statement
        # (`gid, uniprot = list(gene2uniprot.items())[0]`) whose bindings were
        # immediately overwritten by the loop (and which would raise
        # IndexError on an empty dict).
        gene2uniprot = list2dict(list_nondup(xli2), 2, alwayslist=True)
        gene2uniprot = value_convert(gene2uniprot,
                                     _dict_convert,
                                     traverse_list=False)
        docs = []
        for gid, uniprot in gene2uniprot.items():
            doc = {"_id": gid}
            doc.update(uniprot)
            docs.append(doc)
        return docs

    def merge_x(xli, gene2x, transcode=False, cvt_fn=None, k=None):
        # Accumulate one (entrez, ensembl, value) tuple into gene2x,
        # resolving ensembl->entrez like merge() does.
        xli2 = []
        entrez_id, ensembl_id, x_value = xli

        if not x_value:
            return

        if cvt_fn:
            x_value = cvt_fn(x_value)

        if entrez_id:
            xli2.append((entrez_id, x_value))
        elif ensembl_id:
            if not transcode:
                raise KeyError(ensembl_id)
            try:
                entrez_id = x_ensembl2geneid[ensembl_id]
                #if ensembl_id can be mapped to entrez_id
                for _eid in entrez_id:
                    xli2.append((_eid, x_value))
            except KeyError:
                xli2.append((ensembl_id, x_value))
        for x in xli2:
            gene2x.setdefault(x[0], []).append(x[1])

    uniprot_datafile = os.path.join(data_folder, 'idmapping_selected.tab.gz')
    t0 = time.time()

    # cache for uniprot
    ensembl2geneid = {}
    # cache for PDB and PIR
    x_ensembl2geneid = {}

    # tuples that could not be resolved on the first pass
    remains = []
    pdb_remains = []
    pir_remains = []

    # once filled, will be dumped for later storage
    gene2pdb = {}
    gene2pir = {}

    # store all PDB & PIR data while looping, the whole will be stored later
    for ld in tabfile_feeder(uniprot_datafile,
                             header=1,
                             assert_column_no=VALID_COLUMN_NO):
        # Uniprot data will be stored as we read line by line
        xlis = []
        pdbxlis = []
        pirxlis = []

        # raw lines for each sources
        uniprotld = [ld[0], ld[1], ld[2], ld[18]]
        pdbld = [ld[2], ld[19], ld[5]]
        pirld = [ld[2], ld[19], ld[11]]

        # UniProt
        # GeneID and EnsemblID columns may have duplicates
        for value in dupline_seperator(dupline=uniprotld,
                                       dup_idx=[2, 3],
                                       dup_sep='; '):
            value = list(value)
            value[1] = get_uniprot_section(value[1])
            value = tuple(value)
            xlis.append(value)
        # PDB
        for value in dupline_seperator(dupline=pdbld, dup_sep='; '):
            pdbxlis.append(value)

        # PIR
        for value in dupline_seperator(dupline=pirld, dup_sep='; '):
            pirxlis.append(value)

        for xli in xlis:
            # feed mapping
            if xli[2] != '' and xli[3] != '':
                ensembl2geneid.setdefault(xli[3], []).append(xli[2])
            try:
                # postpone ensemblid->entrezid resolution while parsing uniprot as the
                # full transcodification dict is only correct at the end.
                # ex:
                #     1. UniprotID-A    EntrezID-A  EnsemblID
                #     2. UniprotID-B                EnsemblID
                #     3. UniprotID-C    EntrezID-B  EnsemblID
                #
                #     UniprotID-B should associated to both EntrezID-A and EntrezID-B
                #     but we need to read up to line 3 to do so
                xli2 = merge(xli, transcode=False)
                if not xli2:
                    continue
                docs = transform(xli2)
                for doc in docs:
                    yield doc
            except KeyError:
                remains.append(xli)

        for xli in pdbxlis:
            if xli[0] != '' and xli[1] != '':
                x_ensembl2geneid.setdefault(xli[1], []).append(xli[0])
            try:
                merge_x(xli, gene2pdb, transcode=False, cvt_fn=cvt_fn, k="pdb")
            except KeyError:
                pdb_remains.append(xli)

        for xli in pirxlis:
            if xli[0] != '' and xli[1] != '':
                x_ensembl2geneid.setdefault(xli[1], []).append(xli[0])
            try:
                merge_x(xli, gene2pir, transcode=False)
            except KeyError:
                pir_remains.append(xli)

    # now transcode with what we have
    for remain in remains:
        try:
            xli2 = merge(remain, transcode=True)
            if not xli2:
                continue
            docs = transform(xli2)
            for doc in docs:
                yield doc
        except KeyError:
            pass

    for remain in pdb_remains:
        try:
            merge_x(remain, gene2pdb, transcode=True, cvt_fn=cvt_fn)
        except KeyError:
            pass

    for remain in pir_remains:
        try:
            merge_x(remain, gene2pir, transcode=True)
        except KeyError:
            pass

    # PDB
    def normalize(value, keyname):
        # Deduplicate and sort; single values are stored bare, not in a list.
        res = None
        uniq = sorted(set(value))
        if len(uniq) > 1:
            res = {keyname: uniq}
        else:
            res = {keyname: uniq[0]}
        return res

    def normalize_pdb(value):
        return normalize(value, "pdb")

    def normalize_pir(value):
        return normalize(value, "pir")

    # PDB
    gene2pdb = value_convert(gene2pdb, normalize_pdb, traverse_list=False)
    pdb_dumpfile = os.path.join(data_folder, 'gene2pdb.pyobj')
    dump(gene2pdb, pdb_dumpfile)

    # PIR
    gene2pir = value_convert(gene2pir, normalize_pir, traverse_list=False)
    pir_dumpfile = os.path.join(data_folder, 'gene2pir.pyobj')
    dump(gene2pir, pir_dumpfile)
Example 10
0
def load_annotations(data_folder):
    """Yield DGIdb drug-gene interaction documents from ``interactions.tsv``.

    Each record becomes a subject (gene) / object (drug) / association
    document keyed by a blake2b hash of the raw line.
    """
    data_file = os.path.join(data_folder, "interactions.tsv")
    data = tabfile_feeder(data_file, header=0)
    header = next(data)

    # Close the mapping file deterministically instead of relying on GC.
    with open(os.path.join(data_folder, "predicate-remap.yaml"), 'r') as mapfh:
        remappingdata = yaml.safe_load(mapfh)

    def get_predicate(relation):
        # Map a DGIdb relation name to its renamed predicate via the remap table.
        if relation != "":
            key = ':'.join(("DGIdb", relation))
            return remappingdata[key]['rename'][0]
        return ""

    def get_gene_id(gene_name):
        # Resolve a gene symbol to an entrez id via the mygene.info API.
        query = ("http://mygene.info/v3/query?q=symbol:{}"
                 "&fields=entrezgene&species=human".format(gene_name))
        response = requests.get(query)
        # BUGFIX: status_code is an int; the original compared it to the
        # string "200", which was always False, so lookups never succeeded.
        if response.status_code == 200:
            hits = response.json().get('hits')
            if hits:
                entrez_id = hits[0].get('entrezgene', "")
                if entrez_id != "":
                    return entrez_id
        return None

    def get_chem_id(drug_name):
        # Resolve a drug name to a ChEMBL compound id via the mychem.info API.
        query = ("http://mychem.info/v1/query?q=chembl.pref_name:{}"
                 "&fields=chembl.molecule_chembl_id".format(drug_name))
        response = requests.get(query)
        # BUGFIX: same int-vs-str status_code comparison as get_gene_id.
        if response.status_code == 200:
            hits = response.json().get('hits')
            if hits:
                chembl_id = hits[0].get('chembl', {}).get('molecule_chembl_id', "")
                if chembl_id != "":
                    return chembl_id
        return None

    for rec in data:
        # Create a hash for _id
        bytestr = bytearray("-".join(rec), 'utf-8')
        hashstr = hashlib.blake2b(bytestr, digest_size=8).hexdigest()

        # Document framework
        doc = {"_id": hashstr, "subject": {}, "object": {}, "association": {}}

        # Subject (the gene)
        entrez_id = rec[header.index("entrez_id")]
        gene_name = rec[header.index("gene_name")]
        if entrez_id == "":
            if gene_name == "":
                continue  # Skip the record
            resp = get_gene_id(gene_name)
            if resp is None:
                subject_id = 'name:' + gene_name
            else:
                entrez_id = resp
                # str() guards against mygene returning the id as an int
                subject_id = 'NCBIGene:' + str(resp)
        else:
            subject_id = 'NCBIGene:' + entrez_id
        doc['subject']['NCBIGene'] = entrez_id
        doc['subject']['SYMBOL'] = gene_name
        doc['subject']['id'] = subject_id

        # Object (the drug)
        drug_name = rec[header.index("drug_name")]
        drug_chembl_id = rec[header.index("drug_concept_id")]
        if drug_chembl_id == "":
            if drug_name == "":
                continue  # Skip the record
            resp = get_chem_id(drug_name)
            if resp is None:
                object_id = 'name:' + drug_name
            else:
                drug_chembl_id = resp
                object_id = 'CHEMBL.COMPOUND:' + resp
        elif drug_chembl_id.startswith("chembl:"):
            object_id = 'CHEMBL.COMPOUND:' + drug_chembl_id.split(':')[-1]
            drug_chembl_id = 'CHEMBL.COMPOUND:' + drug_chembl_id.split(':')[-1]
        else:
            # BUGFIX: ids with an unexpected prefix left object_id unbound
            # (NameError); keep them as-is so the record is still emitted.
            object_id = drug_chembl_id
        doc['object']['name'] = drug_name
        doc['object']['CHEMBL_COMPOUND'] = drug_chembl_id
        doc['object']['id'] = object_id

        # Association
        interaction_types = rec[header.index("interaction_types")].replace(
            " ", "_").split(",")
        pmids = rec[header.index("PMIDs")].split(",")
        interaction_claim_source = rec[header.index(
            "interaction_claim_source")]
        edge_labels = []
        for interaction in interaction_types:
            edge_labels.append(get_predicate(interaction))
        if edge_labels == ['']:
            # No mapped predicate -> fall back to the generic biolink label
            edge_labels = 'physically_interacts_with'
        doc['association']['edge_label'] = edge_labels
        doc['association']['relation_name'] = interaction_types
        doc['association']['pubmed'] = pmids
        doc['association']['provided_by'] = interaction_claim_source
        doc['association']['interaction_group_score'] = rec[header.index(
            "interaction_group_score")]
        # Cleanup: drop empty values and unwrap single-element lists
        doc = dict_sweep(doc)
        doc = unlist(doc)
        yield doc