Exemple #1
0
 def _cvt(pli):
     '''Group pathway records and wrap the result under the 'pathway' key.

     `pli` is grouped with ``list2dict(pli, 2)`` (i.e. keyed on the element at
     index 2 of each record -- presumably the pathway source; confirm against
     the caller) and every grouped value is converted with `_inner_cvt` via
     `value_convert`.  Any grouped value that is a list is then sorted in
     place by its entries' "id".

     Returns a dict of the form ``{'pathway': <grouped dict>}``.
     '''
     by_source = value_convert(list2dict(pli, 2), _inner_cvt)
     for entries in by_source.values():
         if isinstance(entries, list):
             # Entries may lack an "id" key (presumably when `_inner_cvt`
             # omits it for a missing pathway id); sort those first instead
             # of raising KeyError.
             entries.sort(key=lambda e: e.get("id", ""))
     return {'pathway': by_source}
Exemple #2
0
def load_broadinstitute_exac(data_folder):
    '''Load ExAC records, merge in the "nontcga"/"nonpsych" subsets and re-key them.

    Steps, in order:
      1. Load the main records with `load_broadinstitute_exac_all`.
      2. For each subset loader, copy ``v["exac"][<subset>]`` into the matching
         main record; when that raises KeyError (e.g. the key is not in the
         main records) the subset record replaces/creates the entry as a whole.
      3. Read "gene_ensembl__translation__main.txt" from the Ensembl data
         folder registered in the hub's src_dump and, for every row whose
         third column is a key of the records, move the record to the
         matching Entrez id(s).  If the Ensembl gene id has no Entrez mapping,
         the Ensembl gene id itself is used as the key.

    Records whose key never appears in the translation file keep their
    original key.  When one gene maps to several ids, the same record object
    is stored under each of them (it is not copied).

    Raises:
        AssertionError: if no Ensembl data folder is registered in src_dump.
    '''
    exacs = load_broadinstitute_exac_all(data_folder)
    subset_loaders = (
        (load_broadinstitute_exac_nontcga, "nontcga"),
        (load_broadinstitute_exac_nonpsych, "nonpsych"),
    )
    for loader, subset in subset_loaders:
        for k, v in loader(data_folder).items():
            try:
                exacs[k]["exac"][subset] = v["exac"][subset]
            except KeyError:
                exacs[k] = v

    logging.info("Convert transcript ID to EntrezID")
    from ..ensembl.parser import EnsemblParser
    from biothings.utils.hub_db import get_src_dump
    ensembl_doc = get_src_dump().find_one({"_id": "ensembl"}) or {}
    ensembl_dir = ensembl_doc.get("data_folder")
    if not ensembl_dir:
        # Explicit raise (same exception type as the former `assert`) so the
        # check is not stripped when Python runs with -O.
        raise AssertionError("Can't find Ensembl data directory (used for id conversion)")
    ensembl_parser = EnsemblParser(ensembl_dir)
    ensembl_parser._load_ensembl2entrez_li()
    ensembl2entrez = list2dict(ensembl_parser.ensembl2entrez_li, 0, alwayslist=True)
    translation_file = os.path.join(ensembl_dir, "gene_ensembl__translation__main.txt")
    for line in tabfile_feeder(translation_file):
        _, ensid, transid, _ = line
        if transid in exacs:
            # Pop first so the record no longer lives under its original key
            # once it has been re-keyed.
            data = exacs.pop(transid)
            for entrezid in ensembl2entrez.get(ensid, [ensid]):
                exacs[entrezid] = data

    return exacs
Exemple #3
0
    def convert2entrez(self, ensembl2x):
        '''Re-key a dict indexed by ensembl gene ids so it is indexed by entrezgene ids.

        Ensembl ids without an entrez mapping keep the ensembl id as their
        key.  Three summary counts are printed to stdout.  Every value of the
        returned dict is passed through `dict_nodup` (dict values) or
        `dict_attrmerge` (anything else, e.g. when one entrez gene matches
        multiple ensembl genes).
        '''
        if not self.ensembl2entrez_li:
            self._load_ensembl2entrez_li()

        # Loaded lazily here; not otherwise read in this method.
        if not self.ensembl_main:
            self.ensembl_main = self.load_ensembl_main()

        ensembl2entrez = list2dict(self.ensembl2entrez_li, 0)
        entrez2ensembl = list2dict(self.ensembl2entrez_li, 1)

        source_ids = set(ensembl2x)
        mapped_ids = set(ensembl2entrez)
        unmapped_ids = source_ids - mapped_ids

        print('# of ensembl IDs in total: %d' %
              len(source_ids | mapped_ids))
        print('# of ensembl IDs match entrez Gene IDs: %d' %
              len(source_ids & mapped_ids))
        print('# of ensembl IDs DO NOT match entrez Gene IDs: %d' %
              len(unmapped_ids))

        def _copy_value(eid, taxid=None):
            # A copy is required: when multiple entrezgene ids match the same
            # ensembl gene, e.g. ENSMUSG00000027104 --> (11909, 100047997),
            # they must not end up sharing one value object.
            return copy.copy(ensembl2x.get(eid, {}))

        # All genes with a matched entrez id, now keyed by entrez id.
        data = value_convert(entrez2ensembl, _copy_value)

        # Genes without a matched entrez id stay keyed by their ensembl id.
        for eid in unmapped_ids:
            data[eid] = ensembl2x[eid]

        for key in data:
            entry = data[key]
            if isinstance(entry, dict):
                data[key] = dict_nodup(entry, sort=True)
            else:
                # one entrez gene matched multiple ensembl genes
                data[key] = dict_attrmerge(entry, removedup=True, sort=True)

        return data
Exemple #4
0
 def __init__(self, src_name, data_folder, load_ensembl2entrez=True):
     '''Store the data folder and, optionally, load the ensembl-to-entrez mapping.

     When `load_ensembl2entrez` is true, `_load_ensembl2entrez_li(src_name)`
     is called and `self.ensembl2entrez` is built from
     `self.ensembl2entrez_li` with ``list2dict(..., 0, alwayslist=True)``.
     When it is false, `self.ensembl2entrez` is not set at all.
     '''
     self.data_folder = data_folder
     self.ensembl2entrez_li = None
     self.ensembl_main = None
     if not load_ensembl2entrez:
         return
     self._load_ensembl2entrez_li(src_name)
     mapping = list2dict(self.ensembl2entrez_li, 0, alwayslist=True)
     self.ensembl2entrez = mapping
 def transform(xli2):
     '''Turn a list of records into one document per gene id.

     The de-duplicated records (`list_nondup`) are grouped with
     ``list2dict(..., 2, alwayslist=True)`` (keyed on the element at index 2),
     each group is converted with `_dict_convert`, and every
     ``(gid, converted)`` pair becomes ``{"_id": gid, **converted}``.

     Returns a list of documents; an empty input yields an empty list.
     '''
     gene2uniprot = list2dict(list_nondup(xli2), 2, alwayslist=True)
     gene2uniprot = value_convert(gene2uniprot,
                                  _dict_convert,
                                  traverse_list=False)
     # NOTE: the former "peek at the first item" statement was dead code and
     # raised IndexError when there was nothing to transform.
     docs = []
     for gid, uniprot in gene2uniprot.items():
         doc = {"_id": gid}
         doc.update(uniprot)
         docs.append(doc)
     return docs
def _dict_convert(uniprot_li):
    '''
    Group `uniprot_li` with ``list2dict(uniprot_li, 1)``, sort every grouped
    value that is a list, and wrap the result under the 'uniprot' key.

    For example,
        [('E7ESI2', 'TrEMBL'), ('P24941', 'Swiss-Prot'),
         ('G3V5T9', 'TrEMBL'), ('G3V317', 'TrEMBL')]
    is expected to become (grouping itself is done by `list2dict`)
        {'uniprot': {'Swiss-Prot': 'P24941',
                     'TrEMBL': ['E7ESI2', 'G3V317', 'G3V5T9']}}
    '''
    grouped = list2dict(uniprot_li, 1)
    # Snapshot the keys, then replace list values with sorted copies.
    for db_name in list(grouped):
        accessions = grouped[db_name]
        if isinstance(accessions, list):
            grouped[db_name] = sorted(accessions)
    return {'uniprot': grouped}
Exemple #7
0
def load_cpdb(data_folder, pathways):
    '''Load CPDB pathway/gene files and return per-gene pathway annotations.

    Reads the mouse, yeast and human "CPDB_pathways_genes_*.tab" files from
    `data_folder` (one header line skipped, 4 columns asserted).  The first
    three columns are the pathway name, pathway id and pathway source; the
    last column is a comma-separated list of genes.  The source is
    lower-cased, a leading "path:" is stripped from KEGG ids, and only rows
    whose source is in `pathways` are kept.

    Args:
        data_folder: directory containing the CPDB tab files.
        pathways: container of lower-case pathway source names to include.

    Returns:
        The result of ``dict_convert`` over the per-gene grouping, where each
        value is ``{'pathway': {<source>: <entry or list of entries>}}`` and
        each entry is ``{'name': ...}`` plus ``'id'`` when the id is not the
        string 'None'.
    '''
    VALID_COLUMN_NO = 4

    data_files = [
        os.path.join(data_folder, 'CPDB_pathways_genes_mouse.tab'),
        os.path.join(data_folder, 'CPDB_pathways_genes_yeast.tab'),
        os.path.join(data_folder, 'CPDB_pathways_genes_human.tab'),
    ]

    rows = []
    for data_file in data_files:
        for ld in tabfile_feeder(data_file, header=1, assert_column_no=VALID_COLUMN_NO):
            p_name, p_id, p_source = ld[:3]
            p_source = p_source.lower()
            if p_source == 'kegg' and p_id.startswith('path:'):
                p_id = p_id[5:]
            # only import pathways from the requested sources
            if p_source in pathways:
                for gene in ld[-1].split(","):
                    rows.append((gene, p_name, p_id, p_source))
    by_gene = list2dict(rows, 0, alwayslist=True)

    def _inner_cvt(p):
        # (name, id) pair -> entry dict; the literal 'None' marks a missing id.
        p_name, p_id = p
        _d = {'name': p_name}
        if p_id != 'None':
            _d['id'] = p_id
        return _d

    def _cvt(pli):
        # Group one gene's pathways by source (index 2) and convert each entry.
        _d = list2dict(pli, 2)
        _d = value_convert(_d, _inner_cvt)
        for p_source in _d:
            if isinstance(_d[p_source], list):
                # `_inner_cvt` omits 'id' for missing ids, so use .get() to
                # sort those entries first instead of raising KeyError.
                _d[p_source].sort(key=lambda e: e.get("id", ""))
        return {'pathway': _d}

    return dict_convert(by_gene, valuefn=_cvt)
Exemple #8
0
def src_clean_archives(keep_last=1, src=None, verbose=True, noconfirm=False):
    '''Clean up archive collections in src db, only keep last <keep_last>
       number of archives per source collection.

       Archive collections are those whose name contains "_archive_"; the
       part before it is taken as the name of the current collection.
       Archives of a collection are ordered by name, and all but the last
       <keep_last> are dropped.  Archives whose current collection is
       empty or missing are skipped.

       keep_last: number of archives to keep per collection (>= 0; 0 drops
                  all archives of a collection).
       src:       database object; defaults to get_src_db().
       verbose:   if true, list every collection about to be dropped.
       noconfirm: if true, drop without asking for confirmation.

       Raises ValueError if keep_last is negative.
    '''
    # Validate before doing any work (and before the project-level imports).
    if keep_last < 0:
        raise ValueError("keep_last must be >= 0, got %r" % (keep_last,))

    from biothings.utils.dataload import list2dict
    from biothings.utils.common import ask

    src = src or get_src_db()

    # Require the full "_archive_" separator so that a collection merely
    # containing "archive" in its name is never treated as its own archive.
    archive_li = sorted((coll.split('_archive_')[0], coll)
                        for coll in src.collection_names()
                        if '_archive_' in coll)
    archive_d = list2dict(archive_li, 0, alwayslist=1)
    coll_to_remove = []
    for k, v in archive_d.items():
        print(k, end='')
        # check current collection exists
        if src[k].count() > 0:
            # Explicit count instead of `[:-keep_last]`: a slice ending at -0
            # is empty, which made keep_last=0 remove nothing.
            n_remove = max(len(v) - keep_last, 0)
            outdated = sorted(v)[:n_remove]
            coll_to_remove.extend(outdated)
            print("\t\t%s archived collections marked to remove." % len(outdated))
        else:
            print('skipped. Missing current "%s" collection!' % k)

    if not coll_to_remove:
        print("Nothing needs to be removed.")
        return

    print("%d archived collections will be removed." % len(coll_to_remove))
    if verbose:
        for coll in coll_to_remove:
            print('\t', coll)
    if noconfirm or ask("Continue?") == 'Y':
        for coll in coll_to_remove:
            src[coll].drop()
        print("Done.[%s collections removed]" % len(coll_to_remove))
    else:
        print("Aborted.")