Example #1
def load_annotations(data_folder):
    infile = os.path.join(data_folder, "var_drug_ann.tsv")
    assert os.path.exists(infile)
    dat = pandas.read_csv(infile,
                          sep="\t",
                          squeeze=True,
                          quoting=csv.QUOTE_NONE).to_dict(orient='records')
    results = {}
    for rec in dat:

        if not rec["Gene"] or pandas.isna(rec["Gene"]):
            logging.warning("No gene information for annotation ID '%s'",
                            rec["Annotation ID"])
            continue
        _id = re.match(r".* \((.*?)\)", rec["Gene"]).groups()[0]
        # We'll remove spaces in keys to make queries easier. Also, lowercase is
        # preferred for a BioThings API. We'll use a helper function from the BioThings SDK.
        process_key = lambda k: k.replace(" ", "_").lower()
        rec = dict_convert(rec, keyfn=process_key)
        # remove NaN values, not indexable
        rec = dict_sweep(rec, vals=[np.nan])
        results.setdefault(_id, []).append(rec)

    for _id, docs in results.items():
        doc = {"_id": _id, "annotations": docs}
        yield doc
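Most of the examples that follow lean on the same two BioThings SDK helpers. As a reference point, here is a minimal, self-contained sketch of what the dict_convert/dict_sweep pair does to one record, with stand-in implementations that match how the helpers are called above (the real functions live in biothings.utils.dataload; the record values are made up). Note also that the squeeze=True argument to pandas.read_csv was deprecated in pandas 1.4 and removed in 2.0, so these snippets as written target older pandas releases.

import math

# Stand-ins for the BioThings SDK helpers, matching how they are called
# above (the real ones live in biothings.utils.dataload).
def dict_convert(d, keyfn=None, valuefn=None):
    # Rewrite every key with keyfn and/or every value with valuefn.
    return {(keyfn(k) if keyfn else k): (valuefn(v) if valuefn else v)
            for k, v in d.items()}

def dict_sweep(d, vals):
    # Drop keys whose value matches an entry in vals; NaN needs its own
    # check because NaN != NaN.
    def _unwanted(v):
        for x in vals:
            if isinstance(x, float) and math.isnan(x):
                if isinstance(v, float) and math.isnan(v):
                    return True
            elif v == x:
                return True
        return False
    return {k: v for k, v in d.items() if not _unwanted(v)}

rec = {"Annotation ID": "12345", "Gene": "ABCB1 (PA267)", "Notes": float("nan")}
rec = dict_convert(rec, keyfn=lambda k: k.replace(" ", "_").lower())
rec = dict_sweep(rec, vals=[float("nan")])
print(rec)  # {'annotation_id': '12345', 'gene': 'ABCB1 (PA267)'}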
Example #2
    def load(self):
        cnt = 0
        for datadict in tab2dict_iter(self.datafile, (1, 2, 4), 0, alwayslist=1):
            datadict = dict_convert(datadict, valuefn=lambda v: {
                            'generif': [dict(pubmed=self._cvt_pubmed(x[0]), text=x[1]) for x in v]})

            for _id, doc in datadict.items():
                cnt += 1
                doc['_id'] = _id
                yield doc
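Based on how the result is consumed here (and on the (datafile, cols, key) argument order tab2dict_iter is given throughout these examples), each yielded datadict maps one gene id to a list of (pubmed, text) tuples. A made-up chunk run through the same valuefn, with int() standing in for self._cvt_pubmed:

# Hypothetical chunk, as if yielded by tab2dict_iter(datafile, (1, 2, 4), 0, alwayslist=1)
datadict = {"1017": [("12345", "Regulates the cell cycle.")]}
converted = {gid: {'generif': [dict(pubmed=int(x[0]), text=x[1]) for x in v]}
             for gid, v in datadict.items()}
print(converted)
# {'1017': {'generif': [{'pubmed': 12345, 'text': 'Regulates the cell cycle.'}]}}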
Example #3
def load_GnomadConstraints(data_folder):
    infile = os.path.abspath("/opt/biothings/GRCh37/gnomAD_constraints/v2.1.1/GnomadConstraints.tsv")
    assert os.path.exists(infile)
    dat = pandas.read_csv(infile,
                          sep="\t",
                          squeeze=True,
                          quoting=csv.QUOTE_NONE)
    dat = dat.drop(columns=["brain_expression"]).to_dict(orient='records')
    results = {}
    for rec in dat:
        _id = rec["symbol"]
        process_key = lambda k: k.replace(" ", "_").lower()
        rec = dict_convert(rec, keyfn=process_key)
        rec = dict_sweep(rec, vals=[np.nan])
        results.setdefault(_id, []).append(rec)
    for _id, docs in results.items():
        doc = {"_id": _id, "GnomadConstraints": docs}
        yield doc
Example #4
def load_drugs(data_folder):
    infile = os.path.join(data_folder, "drugs.tsv")
    assert os.path.exists(infile)
    dat = pandas.read_csv(infile,
                          sep="\t",
                          squeeze=True,
                          quoting=csv.QUOTE_NONE).to_dict(orient='records')
    results = {}
    for rec in dat:
        _id = rec["PharmGKB Accession Id"]
        rec = dict_convert(rec, keyfn=process_key2)
        results.setdefault(_id, []).append(rec)
    for _id, docs in results.items():
        doc = {"_id": _id, "drugs": docs}
        yield doc
Example #5
def load_HGNC(data_folder):
    infile = os.path.abspath("/opt/biothings/GRCh37/hgnc/latest/Hgnc.tsv")
    assert os.path.exists(infile)
    dat = pandas.read_csv(infile,
                          sep="\t",
                          squeeze=True,
                          quoting=csv.QUOTE_NONE)
    dat = dat.drop(columns=["snornabase", "mirbase", "homeodb", "orphanet",
                            "horde_id", "kznf_gene_catalog", "mamit-trnadb",
                            "rna_central_ids", "imgt", "gtrnadb"]).to_dict(orient='records')
    results = {}
    for rec in dat:
        _id = rec["hgnc_id"]
        process_key = lambda k: k.replace(" ", "_").lower()
        rec = dict_convert(rec, keyfn=process_key)
        rec = dict_sweep(rec, vals=[np.nan])
        results.setdefault(_id, []).append(rec)
    for _id, docs in results.items():
        doc = {"_id": _id, "HGNC": docs}
        yield doc
Example #6
    def load(self, aslist=False):
        gene2go = tab2dict_iter(self.datafile, (1, 2, 3, 4, 5, 6, 7),
                                0,
                                alwayslist=1,
                                includefn=self.species_filter)
        category_d = {'Function': 'MF', 'Process': 'BP', 'Component': 'CC'}

        def _ff(d):
            out = {}
            for goid, evidence, qualifier, goterm, pubmed, gocategory in d:
                _gocategory = category_d[gocategory]
                _d = out.get(_gocategory, [])
                _rec = dict(id=goid, term=goterm)
                # category_d already maps Function/Process/Component to
                # MF/BP/CC; use the same 'gocategory' key for all three.
                _rec['gocategory'] = _gocategory
                if evidence != '-':
                    _rec['evidence'] = evidence
                if qualifier != '-':
                    # here we also fix some inconsistency issues in NCBI data:
                    # Colocalizes_with -> colocalizes_with
                    # Contributes_to -> contributes_to
                    # Not -> NOT
                    _rec['qualifier'] = qualifier.replace('Co', 'co').replace(
                        'Not', 'NOT')
                if pubmed != '-':
                    if pubmed.find('|') != -1:
                        pubmed = [int(pid) for pid in pubmed.split('|')]
                    else:
                        pubmed = int(pubmed)
                    _rec['pubmed'] = pubmed
                _d.append(_rec)
                out[_gocategory] = _d
            for k in out:
                if len(out[k]) == 1:
                    out[k] = out[k][0]
            return out

        for gd in gene2go:
            convd = dict_convert(gd, valuefn=_ff)
            assert len(convd) == 1, \
                "expected exactly one gene per chunk: %s" % list(convd.items())
            gid, go = list(convd.items())[0]
            gene_d = {"_id": gid, "go": go}
            yield gene_d
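A consequence of the singleton-collapsing loop at the end of _ff is that each GO category holds either a bare dict or a list of dicts, so consumers must handle both shapes. Illustrative output (the GO ids and terms are real, the combination is made up):

# One molecular-function annotation and two biological-process annotations:
go = {
    "MF": {"id": "GO:0005524", "term": "ATP binding", "evidence": "IEA"},
    "BP": [{"id": "GO:0006468", "term": "protein phosphorylation"},
           {"id": "GO:0007165", "term": "signal transduction"}],
}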
Example #7
def load_occurrences(data_folder):
    infile = os.path.join(data_folder, "occurrences.tsv")
    assert os.path.exists(infile)
    dat = pandas.read_csv(infile,
                          sep="\t",
                          squeeze=True,
                          quoting=csv.QUOTE_NONE).to_dict(orient='records')
    results = {}
    for rec in dat:
        if rec["Object Type"] != "Gene":
            continue
        _id = rec["Object ID"]
        rec = dict_convert(rec, keyfn=process_key)
        results.setdefault(_id, []).append(rec)
    for _id, docs in results.items():
        doc = {"_id": _id, "occurrences": docs}
        yield doc
Example #8
def load_druglabels(data_folder):
    infile = os.path.join(data_folder, "drugLabels.byGene.tsv")
    assert os.path.exists(infile)
    dat = pandas.read_csv(infile,
                          sep="\t",
                          squeeze=True,
                          quoting=csv.QUOTE_NONE).to_dict(orient='records')
    for rec in dat:
        label_ids = rec.pop("Label IDs").split(";")
        label_names = rec.pop("Label Names").split(";")
        assert len(label_ids) == len(label_names)
        labels = []
        for label_id, label_name in zip(label_ids, label_names):
            labels.append({"id": label_id, "name": label_name})
        _id = rec["Gene ID"]
        rec = dict_convert(rec, keyfn=process_key)
        doc = {"_id": _id, "drug_labels": labels}
        yield doc
Example #9
def load_drug_annotations(data_folder):
    infile = os.path.join(data_folder, 'GDSC_Drug_anno.csv')
    assert os.path.exists(infile)
    dat = pandas.read_csv(infile,
                          sep=',',
                          squeeze=True,
                          quoting=csv.QUOTE_NONE).to_dict(orient='records')
    for rec in dat:
        drug_name = rec.pop('Drug_Name')
        drug_target_pathway = rec.pop('Drug_Targeted_process_or_pathway')
        drug_annotations = {
            'drug_name': drug_name,
            'target_pathway': drug_target_pathway
        }
        _id = rec['Drug_identifier']
        rec = dict_convert(rec, keyfn=process_key)
        doc = {'_id': _id, 'drug_annotations': drug_annotations}
        yield doc
Example #10
def load_KEGG(data_folder):
    infileInfo = os.path.abspath("/opt/biothings/GRCh37/kegg/april2011/KeggInfo.tsv")
    infileID = os.path.abspath("/opt/biothings/GRCh37/kegg/april2011/EnsemblToKegg.tsv")
    assert os.path.exists(infileInfo)
    assert os.path.exists(infileID)
    datInfo = pandas.read_csv(infileInfo,
                              sep="\t",
                              squeeze=True,
                              quoting=csv.QUOTE_NONE)
    datID = pandas.read_csv(infileID,
                            sep="\t",
                            squeeze=True,
                            quoting=csv.QUOTE_NONE)
    dat = datID.join(datInfo.set_index('kegg_id'), on='kegginfo_id').to_dict(orient='records')
    results = {}
    for rec in dat:
        _id = rec["gene_id"]
        process_key = lambda k: k.replace(" ", "_").lower()
        rec = dict_convert(rec, keyfn=process_key)
        rec = dict_sweep(rec, vals=[np.nan])
        results.setdefault(_id, []).append(rec)
    for _id, docs in results.items():
        doc = {"_id": _id, "KEGG": docs}
        yield doc
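The KEGG loader differs from the others in that it stitches two files together before the usual key-normalization pass. A minimal sketch of that join with made-up frames (the column names follow the snippet; the gene and pathway values are hypothetical):

import pandas

datID = pandas.DataFrame({"gene_id": ["ENSG00000123374"],
                          "kegginfo_id": ["hsa:1017"]})
datInfo = pandas.DataFrame({"kegg_id": ["hsa:1017"],
                            "pathway": ["hsa04110"]})
# Left join: each row's kegginfo_id is looked up in datInfo's kegg_id index.
dat = datID.join(datInfo.set_index('kegg_id'), on='kegginfo_id').to_dict(orient='records')
print(dat)
# [{'gene_id': 'ENSG00000123374', 'kegginfo_id': 'hsa:1017', 'pathway': 'hsa04110'}]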
Example #11
def load_Exac(data_folder):
    infile = os.path.abspath("/opt/biothings/GRCh37/ExAC/r1/Exac.tsv")
    assert os.path.exists(infile)
    dat = pandas.read_csv(infile,
                          sep="\t",
                          squeeze=True,
                          quoting=csv.QUOTE_NONE).to_dict(orient='records')
    results = {}
    for rec in dat:
        _id = rec["release"] + "_" + str(rec["chromosome"]) + "_" + str(
            rec["position"]
        ) + "_" + rec["reference"] + "_" + rec["alternative"]
        process_key = lambda k: k.replace(" ", "_").lower()
        rec = dict_convert(rec, keyfn=process_key)
        rec = dict_sweep(rec, vals=[np.nan])
        results.setdefault(_id, []).append(rec)
    for _id, docs in results.items():
        doc = {"_id": _id, "Exac": docs}
        yield doc
Example #12
    def load(self, aslist=False):
        gene2acc = tab2dict_iter(self.datafile, (1, 3, 5, 7),
                                 0,
                                 alwayslist=1,
                                 includefn=self.species_filter)

        def _ff(d):
            out = {'rna': [], 'protein': [], 'genomic': [], 'translation': []}
            for rna, prot, dna in d:
                if rna == '-': rna = None
                if prot == '-': prot = None
                if dna == '-': dna = None
                if rna is not None:
                    out['rna'].append(rna)
                if prot is not None:
                    out['protein'].append(prot)
                if dna is not None:
                    out['genomic'].append(dna)
                if rna and prot:
                    out['translation'].append({'rna': rna, 'protein': prot})
            # remove dup
            for k in out:
                out[k] = normalized_value(out[k])
            # remove empty rna/protein/genomic field
            _out = {}
            for k, v in out.items():
                if v:
                    _out[k] = v
            if _out:
                _out = {self.fieldname: _out}
            return _out

        # gene2acc = dict_convert(gene2acc, valuefn=_ff)
        cnt = 0
        for gd in gene2acc:
            convd = self.format(dict_convert(gd, valuefn=_ff))
            yield convd
            cnt += 1

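normalized_value is only described by the "# remove dup" comment above. A stand-in consistent with that usage, under the assumption that it deduplicates a list (including unhashable items such as the translation dicts) and unwraps single-element lists:

def normalized_value(values):
    # Deduplicate while preserving order; the membership test also works
    # for unhashable items like {'rna': ..., 'protein': ...} dicts.
    out = []
    for v in values:
        if v not in out:
            out.append(v)
    return out[0] if len(out) == 1 else out

print(normalized_value(["NM_001", "NM_001", "NM_002"]))  # ['NM_001', 'NM_002']
print(normalized_value(["NP_001"]))                      # 'NP_001'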
Example #13
def load_cpdb(data_folder, pathways):
    # only import pathways from these sources
    PATHWAY_SOURCES_INCLUDED = pathways
    VALID_COLUMN_NO = 4

    t0 = time.time()
    DATA_FILES = []
    DATA_FILES.append(os.path.join(data_folder, 'CPDB_pathways_genes_mouse.tab'))
    DATA_FILES.append(os.path.join(data_folder, 'CPDB_pathways_genes_yeast.tab'))
    DATA_FILES.append(os.path.join(data_folder, 'CPDB_pathways_genes_human.tab'))

    _out = []
    for DATA_FILE in DATA_FILES:
        for ld in tabfile_feeder(DATA_FILE, header=1, assert_column_no=VALID_COLUMN_NO):
            p_name, p_id, p_source = ld[:3]
            p_source = p_source.lower()
            if p_source == 'kegg' and p_id.startswith('path:'):
                p_id = p_id[5:]
            if p_source in PATHWAY_SOURCES_INCLUDED:
                genes = ld[-1].split(",")
                for gene in genes:
                    _out.append((gene, p_name, p_id, p_source))
    _out = list2dict(_out, 0, alwayslist=True)

    def _inner_cvt(p):
        p_name, p_id = p
        _d = {'name': p_name}
        if p_id != 'None':
            _d['id'] = p_id
        return _d

    def _cvt(pli):
        _d = list2dict(pli, 2)
        _d = value_convert(_d, _inner_cvt)
        for p_source in _d:
            if isinstance(_d[p_source], list):
                # entries whose id was 'None' carry no "id" key; sort them first
                _d[p_source].sort(key=lambda e: e.get("id", ""))
        return {'pathway': _d}

    _out = dict_convert(_out, valuefn=_cvt)

    return _out
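To make the two list2dict passes concrete, here is the shape of the data at each step for a couple of hypothetical rows (one gene with two KEGG pathways):

# After the file scan, _out is a flat list of (gene, name, id, source) tuples:
_out = [("1017", "Cell cycle", "hsa04110", "kegg"),
        ("1017", "p53 signaling pathway", "hsa04115", "kegg")]
# list2dict(_out, 0, alwayslist=True) groups the tuples by gene id:
#   {'1017': [('Cell cycle', 'hsa04110', 'kegg'),
#             ('p53 signaling pathway', 'hsa04115', 'kegg')]}
# _cvt then regroups each gene's tuples by source (index 2), applies
# _inner_cvt to every (name, id) pair, and wraps the result:
#   {'pathway': {'kegg': [{'name': 'Cell cycle', 'id': 'hsa04110'},
#                         {'name': 'p53 signaling pathway', 'id': 'hsa04115'}]}}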
Example #14
def load_clinvar(data_folder):
    infile = os.path.abspath(
        "/opt/biothings/GRCh37/clinvar/latest/Clinvar.tsv")
    assert os.path.exists(infile)
    dat = pandas.read_csv(infile,
                          sep="\t",
                          squeeze=True,
                          quoting=csv.QUOTE_NONE).to_dict(orient='records')
    results = {}
    for rec in dat:
        var = "_".join(str(rec[k]) for k in
                       ("release", "chromosome", "start", "reference", "alternative"))
        _id = hashlib.sha224(var.encode('ascii')).hexdigest()
        process_key = lambda k: k.replace(" ", "_").lower()
        rec = dict_convert(rec, keyfn=process_key)
        rec = dict_sweep(rec, vals=[np.nan])
        results.setdefault(_id, []).append(rec)
    for _id, docs in results.items():
        doc = {"_id": _id, "clinvar": docs}
        yield doc
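Unlike the ExAC loader above, which uses the raw variant string as _id, this one hashes it, which keeps _id at a fixed length regardless of allele size. A quick check of the pattern (the variant string is hypothetical):

import hashlib

var = "GRCh37_1_12345_A_G"  # release_chromosome_start_reference_alternative
_id = hashlib.sha224(var.encode('ascii')).hexdigest()
print(len(_id))  # 56: fixed-length hex digest, stable for a given variant string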
Example #15
def load_denovodb(data_folder):
    infile = os.path.abspath(
        "/opt/biothings/GRCh37/denovodb/denovo-db.non-ssc-samples.variants.tsv"
    )
    assert os.path.exists(infile)
    dat = pandas.read_csv(infile,
                          sep="\t",
                          squeeze=True,
                          quoting=csv.QUOTE_NONE).to_dict(orient='records')
    results = {}
    for rec in dat:
        _id = "_".join(str(rec[k]) for k in ("Chr", "Position", "REF", "ALT"))
        process_key = lambda k: k.replace(" ", "_").lower()
        rec = dict_convert(rec, keyfn=process_key)
        rec = dict_sweep(rec, vals=[np.nan])
        results.setdefault(_id, []).append(rec)
    for _id, docs in results.items():
        doc = {"_id": _id, "denovodb": docs}
        yield doc
Example #16
def load_gene_drug_associations(data_folder):
    infile = os.path.join(data_folder, 'mutation_drug_pairs.csv')
    assert os.path.exists(infile)
    dat = pandas.read_csv(infile,
                          sep=',',
                          squeeze=True,
                          quoting=csv.QUOTE_NONE).to_dict(orient='records')
    results = {}
    for rec in dat:
        if not rec['Drug ID'] or pandas.isna(rec['Drug ID']):
            logging.warning('No drug information found in current record.')
            continue
        _id = rec['Drug ID']
        rec = dict_convert(rec, keyfn=process_key)
        # Remove NaN values, not indexable
        rec = dict_sweep(rec, vals=[np.nan])
        results.setdefault(_id, []).append(rec)
    for _id, docs in results.items():
        doc = {'_id': _id, 'gene_drug_associations': docs}
        yield doc
Example #17
def load_annotations(data_folder):
    infile = os.path.join(data_folder, "var_drug_ann.tsv")
    assert os.path.exists(infile)
    dat = pandas.read_csv(infile,
                          sep="\t",
                          squeeze=True,
                          quoting=csv.QUOTE_NONE).to_dict(orient='records')
    results = {}
    for rec in dat:
        if not rec["Gene"] or pandas.isna(rec["Gene"]):
            logging.warning("No gene information for annotation ID '%s'",
                            rec["Annotation ID"])
            continue
        _id = re.match(r".* \((.*?)\)", rec["Gene"]).groups()[0]
        rec = dict_convert(rec, keyfn=process_key)
        # remove NaN values, not indexable
        rec = dict_sweep(rec, vals=[np.nan])
        results.setdefault(_id, []).append(rec)
    for _id, docs in results.items():
        doc = {"_id": _id, "annotations": docs}
        yield doc
Example #18
    def load(self, aslist=False):
        if self.species_li:
            _includefn = lambda ld: int(ld[0]) in self.taxid_set and ld[1] != '-'
        else:
            _includefn = lambda ld: ld[1] != '-'
        gene2retired = tab2dict(self.datafile, (1, 2),
                                0,
                                alwayslist=1,
                                includefn=_includefn)
        gene2retired = dict_convert(
            gene2retired,
            valuefn=lambda x: normalized_value([int(xx) for xx in x]))

        gene_d = {}
        for gid, retired in gene2retired.items():
            gene_d[gid] = {'retired': retired}

        if aslist:
            return dict_to_list(gene_d)
        else:
            return gene_d
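Because normalized_value collapses a one-element list to a bare value (see the stand-in sketch following Example #12), the resulting documents take one of two shapes. The ids below are made up:

# A gene with several retired ids keeps a list; a gene with one gets a bare int:
gene_d = {"1017": {'retired': [100001234, 100005678]},
          "1018": {'retired': 100009999}}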
Example #19
def get_geneid_d(data_folder,
                 species_li=None,
                 load_cache=True,
                 save_cache=True,
                 only_for={}):
    '''Return a dictionary mapping current/retired gene ids to current gene ids.
       This is useful because other annotations mapped to gene ids may contain
       retired gene ids.

       If species_li is None, genes from all species are loaded.

       Note that all ids are of int type.
    '''
    if species_li:
        taxid_set = set(
            [TAXONOMY[species]["tax_id"] for species in species_li])
    else:
        taxid_set = None

    orig_cwd = os.getcwd()
    os.chdir(data_folder)

    # check cache file
    _cache_file = 'geneid_d.pyobj'
    if load_cache and os.path.exists(_cache_file) and \
       file_newer(_cache_file, 'gene_info.gz') and \
       file_newer(_cache_file, 'gene_history.gz'):
        _taxid_set, out_d = loadobj(_cache_file)
        assert _taxid_set == taxid_set
        os.chdir(orig_cwd)
        return out_d

    DATAFILE = os.path.join(data_folder, 'gene_info.gz')
    if species_li:
        species_filter = lambda ld: int(ld[0]) in taxid_set and (
            not only_for or ld[1] in only_for)
    elif only_for:
        species_filter = lambda ld: ld[1] in only_for
    else:
        species_filter = None
    geneid_li = set(tab2list(DATAFILE, 1, includefn=species_filter))

    DATAFILE = os.path.join(data_folder, 'gene_history.gz')

    if species_li:
        _includefn = lambda ld: int(ld[0]) in taxid_set and ld[1] in geneid_li
    else:
        _includefn = lambda ld: ld[1] in geneid_li  # include all species
    retired2gene = tab2dict(DATAFILE, (1, 2),
                            1,
                            alwayslist=0,
                            includefn=_includefn)
    # includefn above makes sure taxid is for species_li and filters out those
    # mapped_to geneid exists in gene_info list

    # convert key/value to int
    out_d = dict_convert(retired2gene, keyfn=int, valuefn=int)
    # TODO: this fills memory with key==value ...
    for g in geneid_li:
        _g = int(g)
        out_d[_g] = _g

    if save_cache:
        if species_li:
            dump((taxid_set, out_d), _cache_file)
        else:
            dump((None, out_d), _cache_file)

    os.chdir(orig_cwd)
    return out_d
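The cache guard at the top of get_geneid_d only trusts geneid_d.pyobj when it is newer than both source dumps. file_newer, loadobj and dump are BioThings helpers; the stand-ins below are assumptions consistent with how they are called here (mtime comparison and plain pickling):

import os
import pickle

def file_newer(target, source):
    # True when target was modified more recently than source.
    return os.path.getmtime(target) > os.path.getmtime(source)

def loadobj(path):
    with open(path, 'rb') as f:
        return pickle.load(f)

def dump(obj, path):
    with open(path, 'wb') as f:
        pickle.dump(obj, f)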