Example #1
def pyobj_compare_worker(file_range, dir1, dir2):
    print("Starting worker on: " + str(file_range))
    fail_list = []
    for _file_num in file_range:  #First doc starts at '1'
        _obj1 = loadobj(dir1 + '/' + str(_file_num) + '.pyobj')
        _obj2 = loadobj(dir2 + '/' + str(_file_num) + '.pyobj')
        if (_obj1['source'] != _obj2['source'] or _obj1['add'] != _obj2['add']
                or _obj1['delete'] != _obj2['delete']
                or _obj1['update'] != _obj2['update']):
            fail_list.append(_file_num)
    print("Finished worker on: " + str(file_range))
    return (file_range, fail_list)
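The worker above is designed to be fanned out over ranges of file numbers. A minimal driver sketch follows, assuming .pyobj files are plain pickles (the usual convention behind loadobj); the chunking scheme and the pickle-based stand-in are illustrative, not part of the original code.

import pickle
from concurrent.futures import ProcessPoolExecutor

def loadobj(path):
    # simplified pickle-based stand-in for the project's own loadobj
    with open(path, 'rb') as f:
        return pickle.load(f)

def compare_dirs(total_files, dir1, dir2, workers=4):
    # split 1..total_files into one contiguous range per worker
    chunk = (total_files + workers - 1) // workers
    ranges = [range(start, min(start + chunk, total_files + 1))
              for start in range(1, total_files + 1, chunk)]
    failed = []
    with ProcessPoolExecutor(max_workers=workers) as pool:
        results = pool.map(pyobj_compare_worker, ranges,
                           [dir1] * len(ranges), [dir2] * len(ranges))
        for _range, fails in results:
            failed.extend(fails)
    return sorted(failed)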
Example #2
 def main(self, diff_filepath, merge_collection, field):
     diff = loadobj(diff_filepath)
     source_collection = diff['source']
     add_ids = diff['add']
     delete_ids = diff['delete']
     update_ids = [_doc['_id'] for _doc in diff['update']]
     self.add_update(source_collection, merge_collection, add_ids)
     self.add_update(source_collection, merge_collection, update_ids)
     self.delete(merge_collection, field, delete_ids)
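For reference, the diff object consumed by main appears to have the shape below, inferred from the fields accessed across these examples; the concrete values are made up, and plain pickle stands in for whatever dump helper produced the file.

import pickle

diff = {
    'source': 'mygene_20170101',      # name of the source collection
    'add': ['1017', '1018'],          # _ids of new docs to insert
    'delete': ['999'],                # _ids of docs to remove
    'update': [{'_id': '1019'}],      # updated docs, each carrying its _id
}
with open('diff.pyobj', 'wb') as f:
    pickle.dump(diff, f)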
Example #4
 def _load_ensembl2entrez_li(self):
     ensembl2entrez_li = loadobj(("ensembl_gene__2entrezgene_list.pyobj", self.src), mode='gridfs')
     #filter out those deprecated entrez gene ids
     logging.info(len(ensembl2entrez_li))
     ensembl2entrez_li = [(ensembl_id, self._entrez_geneid_d[int(entrez_id)]) for (ensembl_id, entrez_id) in ensembl2entrez_li
                          if int(entrez_id) in self._entrez_geneid_d]
     logging.info(len(ensembl2entrez_li))
     ensembl2entrez = list2dict(ensembl2entrez_li, 0)
     self._idmapping_d_cache['ensembl_gene'] = ensembl2entrez
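list2dict is not shown in these examples. A simplified stand-in consistent with its use here (keying 2-tuples by the element at the given index) might look like the sketch below; the collapse-to-scalar and accumulate-to-list behavior is an assumption.

def list2dict(li, keyindex):
    # group tuple values under the element at keyindex; single values stay
    # scalar, repeated keys accumulate into a list (assumed behavior)
    d = {}
    for item in li:
        key = item[keyindex]
        rest = tuple(v for i, v in enumerate(item) if i != keyindex)
        value = rest[0] if len(rest) == 1 else rest
        if key not in d:
            d[key] = value
        elif isinstance(d[key], list):
            d[key].append(value)
        else:
            d[key] = [d[key], value]
    return d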
Example #5
def load_genedoc(self=None):
    genedoc_d = loadobj(os.path.join(DATA_ARCHIVE_ROOT, 'by_resources/gnfreagents/gnfreagents_20110817.pyobj'))
    #Fixing invalid key "GNF_hs-ORFeome1.1_reads" (replacing "." with "_")
    for k in genedoc_d:
        doc = genedoc_d[k]
        if "GNF_hs-ORFeome1.1_reads" in doc['reagent']:
            doc['reagent']['GNF_hs-ORFeome1_1_reads'] = doc['reagent']['GNF_hs-ORFeome1.1_reads']
            del doc['reagent']['GNF_hs-ORFeome1.1_reads']
            genedoc_d[k] = doc
    return genedoc_d
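The fix above targets one known bad key. A generalized sketch would rename any reagent key containing '.', which MongoDB forbids in field names; the helper name is hypothetical.

def fix_dotted_reagent_keys(genedoc_d):
    # MongoDB field names cannot contain '.', so rename offending keys in place
    for doc in genedoc_d.values():
        for key in [k for k in doc.get('reagent', {}) if '.' in k]:
            doc['reagent'][key.replace('.', '_')] = doc['reagent'].pop(key)
    return genedoc_d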
Example #6
 def _load_ensembl2entrez_li(self):
     ensembl2entrez_li = loadobj(
         ("ensembl_gene__2entrezgene_list.pyobj", self.src), mode='gridfs')
     #filter out those deprecated entrez gene ids
     logging.info(len(ensembl2entrez_li))
     ensembl2entrez_li = [(ensembl_id,
                           self._entrez_geneid_d[int(entrez_id)])
                          for (ensembl_id, entrez_id) in ensembl2entrez_li
                          if int(entrez_id) in self._entrez_geneid_d]
     logging.info(len(ensembl2entrez_li))
     ensembl2entrez = list2dict(ensembl2entrez_li, 0)
     self._idmapping_d_cache['ensembl_gene'] = ensembl2entrez
Example #7
def load_genedoc(self=None):
    genedoc_d = loadobj(
        os.path.join(DATA_ARCHIVE_ROOT,
                     'by_resources/gnfreagents/gnfreagents_20110817.pyobj'))
    #Fixing invalid key "GNF_hs-ORFeome1.1_reads" (replacing "." with "_")
    for k in genedoc_d:
        doc = genedoc_d[k]
        if "GNF_hs-ORFeome1.1_reads" in doc['reagent']:
            doc['reagent']['GNF_hs-ORFeome1_1_reads'] = doc['reagent'][
                'GNF_hs-ORFeome1.1_reads']
            del doc['reagent']['GNF_hs-ORFeome1.1_reads']
            genedoc_d[k] = doc
    return genedoc_d
Example #8
 def main(self, index, collection, diff_filepath, validate=False, wait=60):
     self._index = index
     self._esi._index = index
     diff = loadobj(diff_filepath)
     source_collection = diff['source']
     add_list = self.add(source_collection, diff['add'])
     delete_list = self.delete(collection, diff['delete'])
     update_list = self.update(diff['update'])
     t00 = time()
     print('Adding {} new docs...'.format(len(diff['add'])))
     t0 = time()
     bulk(self._es, add_list)
     print("Done. [{}]".format(timesofar(t0)))
     print('Deleting {} docs'.format(len(diff['delete'])))
     t0 = time()
     bulk(self._es, delete_list)
     print("Done. [{}]".format(timesofar(t0)))
     print('Updating {} docs'.format(len(diff['update'])))
     t0 = time()
     bulk(self._es, update_list)
     print("Done. [{}]".format(timesofar(t0)))
     print("=" * 20)
     print("Finished! [{}]".format(timesofar(t00)))
     if validate:
         print('Waiting {}s to let ES finish...'.format(wait), end="")
         sleep(wait)
         print("Done.")
         print("Validating...")
         t0 = time()
         q = {
             "query": {
                 "constant_score": {
                     "filter": {
                         "exists": {
                             "field": collection
                         }
                     }
                 }
             }
         }
         data = self._esi.doc_feeder(query=q, _source=collection)
         temp_collection = collection + '_temp_' + get_random_string()
         self._src[temp_collection].drop()
         load_source(temp_collection, src_data=data)
         c1 = get_backend(source_collection, 'mongodb')
         c2 = get_backend(temp_collection, 'mongodb')
         diff_result = diff_collections(c1, c2, use_parallel=False)
         self._src[temp_collection].drop()
         print("Done. [{}]".format(t0))
         return diff_result
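The self.add, self.delete, and self.update methods are expected to yield actions in the standard elasticsearch.helpers.bulk format, roughly as below; the index and type names are illustrative only.

add_action = {
    '_op_type': 'index', '_index': 'mygene_current', '_type': 'gene',
    '_id': '1017', '_source': {'symbol': 'CDK2'},
}
delete_action = {
    '_op_type': 'delete', '_index': 'mygene_current', '_type': 'gene',
    '_id': '999',
}
update_action = {
    '_op_type': 'update', '_index': 'mygene_current', '_type': 'gene',
    '_id': '1019', 'doc': {'symbol': 'CDK3'},
}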
Example #9
    def main(self, index, collection, diff_filepath, validate=False, wait=60):
        self._index = index
        self._esi._index = index
        diff = loadobj(diff_filepath)
        source_collection = diff['source']
        add_list = self.add(source_collection, diff['add'])
        delete_list = self.delete(collection, diff['delete'])
        update_list = self.update(diff['update'])
        t00 = time()
        print('Adding {} new docs...'.format(len(diff['add'])))
        t0 = time()
        bulk(self._es, add_list)
        print("Done. [{}]".format(timesofar(t0)))
        print('Deleting {} docs'.format(len(diff['delete'])))
        t0 = time()
        bulk(self._es, delete_list)
        print("Done. [{}]".format(timesofar(t0)))
        print('Updating {} docs'.format(len(diff['update'])))
        t0 = time()
        bulk(self._es, update_list)
        print("Done. [{}]".format(timesofar(t0)))
        print("="*20)
        print("Finished! [{}]".format(timesofar(t00)))
        if validate:
            print('Waiting {}s to let ES finish...'.format(wait), end="")
            sleep(wait)
            print("Done.")
            print("Validating...")
            t0 = time()
            q = {
                "query": {
                    "constant_score": {
                        "filter": {
                            "exists": {
                                "field": collection
                            }
                        }
                    }
                }
            }
            data = self._esi.doc_feeder(query=q, _source=collection)
            temp_collection = collection + '_temp_' + get_random_string()
            self._src[temp_collection].drop()
            load_source(temp_collection, src_data=data)
            c1 = get_backend(source_collection, 'mongodb')
            c2 = get_backend(temp_collection, 'mongodb')
            diff_result = diff_collections(c1, c2, use_parallel=False)
            self._src[temp_collection].drop()
            print("Done. [{}]".format(t0))
            return diff_result
Example #10
def main():
    if len(sys.argv) > 1:
        config = sys.argv[1]
    else:
        config = 'mygene'
        #config = 'mygene_allspecies'
    if not config.startswith('genedoc_'):
        config = 'genedoc_' + config
    assert config in ['genedoc_mygene', 'genedoc_mygene_allspecies']
    noconfirm = '-b' in sys.argv

    _changes_fn = _get_current_changes_fn(config)
    if _changes_fn:
        print("Changes file: " + _changes_fn)
    else:
        print("No changes file found. Aborted.")
        return -1
    if noconfirm or ask("Continue to load?") == 'Y':
        changes = loadobj(_changes_fn)
    else:
        print("Aborted.")
        return -2

    _es_host = 'localhost:' + str(es_local_tunnel_port)
    _es_index = config + TARGET_ES_INDEX_SUFFIX    # '_current_1'

    # for test
    #_es_host = 'localhost:9200'
    #_es_index = config + TARGET_ES_INDEX_SUFFIX    # '_current_1'

    with open_tunnel() as tunnel:
        if tunnel.ok:
            esi = ESIndexer2(_es_index, es_host=_es_host)
            meta = esi.get_mapping_meta(changes)
            print('\033[34;06m{}\033[0m:'.format('[Metadata]'))
            pprint(meta)
            code = esi.apply_changes(changes, noconfirm=noconfirm)
            if code != -1:
                # aborted when code == -1
                _meta = {'_meta': meta}
                # somehow when only "_meta" is updated, "_timestamp" becomes
                # empty, so add "_timestamp" explicitly here. This is an ES bug.
                _meta['_timestamp'] = {
                    "enabled": True,
                    "path": "_timestamp"
                }
                #esi.update_mapping_meta(_meta)
                print(esi.conn.indices.put_mapping(esi.ES_INDEX_TYPE, _meta, [esi.ES_INDEX_NAME]))
                esi.post_verify_changes(changes)
Example #11
def main():
    if len(sys.argv) > 1:
        config = sys.argv[1]
    else:
        config = 'mygene_allspecies'
    if not config.startswith('genedoc_'):
        config = 'genedoc_' + config
    assert config in ['genedoc_mygene', 'genedoc_mygene_allspecies']
    noconfirm = '-b' in sys.argv

    _changes_fn = _get_current_changes_fn(config)
    if _changes_fn:
        print("Changes file: " + _changes_fn)
    else:
        print("No changes file found. Aborted.")
        return -1
    if noconfirm or ask("Continue to load?") == 'Y':
        changes = loadobj(_changes_fn)
    else:
        print("Aborted.")
        return -2

    _es_index = config + TARGET_ES_INDEX_SUFFIX
    # ES host will be set depending on whether a tunnel is used or not
    with open_tunnel() as tunnel:
        if tunnel.ok:
            _es_host = 'localhost:' + str(es_local_tunnel_port)
        else:
            _es_host = ES_HOST

        esi = ESIndexer2(_es_index, es_host=_es_host)

        meta = esi.get_mapping_meta(changes)
        print('\033[34;06m{}\033[0m:'.format('[Metadata]'))
        pprint(meta)
        code = esi.apply_changes(changes, noconfirm=noconfirm)
        if code != -1:
            # aborted when code == -1
            _meta = {'_meta': meta}
            print(
                esi.conn.indices.put_mapping(esi.ES_INDEX_TYPE, _meta,
                                             [esi.ES_INDEX_NAME]))
            esi.post_verify_changes(changes)
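open_tunnel() is assumed to be a context manager yielding an object whose .ok flag reports whether the SSH tunnel came up, which is what makes the host fallback above work. A minimal stand-in for local testing:

from contextlib import contextmanager

@contextmanager
def open_tunnel():
    # a real implementation would establish an SSH tunnel to the ES host;
    # this stand-in simply reports that no tunnel is available
    class _Tunnel:
        ok = False
    yield _Tunnel()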
Example #13
def get_geneid_d(species_li=None, load_cache=True, save_cache=True):
    '''return a dictionary of current/retired geneid to current geneid mapping.
       This is useful when other annotations mapped to geneids may
       contain retired gene ids.

       if species_li is None, genes from all species are loaded.

       Note that all ids are int type.
    '''
    if species_li:
        taxid_set = set([TAXONOMY[species] for species in species_li])
    else:
        taxid_set = None

    orig_cwd = os.getcwd()
    os.chdir(DATA_FOLDER)

    # check cache file
    _cache_file = 'gene/geneid_d.pyobj'
    if load_cache and os.path.exists(_cache_file) and \
       file_newer(_cache_file, 'gene/gene_info.gz') and \
       file_newer(_cache_file, 'gene/gene_history.gz'):

        print('Loading "geneid_d" from cache file...', end='')
        _taxid_set, out_d = loadobj(_cache_file)
        assert _taxid_set == taxid_set
        print('Done.')
        os.chdir(orig_cwd)
        return out_d

    DATAFILE = os.path.join(DATA_FOLDER, 'gene/gene_info.gz')
    load_start(DATAFILE)
    if species_li:
        species_filter = lambda ld: int(ld[0]) in taxid_set
    else:
        species_filter = None
    geneid_li = set(tab2list(DATAFILE, 1, includefn=species_filter))
    load_done('[%d]' % len(geneid_li))

    DATAFILE = os.path.join(DATA_FOLDER, 'gene/gene_history.gz')
    load_start(DATAFILE)

    if species_li:
        _includefn = lambda ld: int(ld[0]) in taxid_set and ld[1] in geneid_li
    else:
        _includefn = lambda ld: ld[1] in geneid_li  # include all species
    retired2gene = tab2dict(DATAFILE, (1, 2),
                            1,
                            alwayslist=0,
                            includefn=_includefn)
    # includefn above makes sure the taxid is in species_li and keeps only
    # rows whose mapped-to geneid exists in the gene_info list

    load_done('[%d]' % len(retired2gene))
    # convert key/value to int
    out_d = dict_convert(retired2gene, keyfn=int, valuefn=int)
    for g in geneid_li:
        _g = int(g)
        out_d[_g] = _g

    if save_cache:
        if species_li:
            dump((taxid_set, out_d), _cache_file)
        else:
            dump((None, out_d), _cache_file)

    os.chdir(orig_cwd)
    return out_d
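Hypothetical usage of get_geneid_d; the species names and gene id are made up.

geneid_d = get_geneid_d(species_li=['human', 'mouse'])
print(geneid_d.get(245794))    # current geneid for a possibly-retired id, or None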
Example #14
def get_geneid_d(species_li=None, load_cache=True, save_cache=True):
    '''return a dictionary of current/retired geneid to current geneid mapping.
       This is useful when other annotations mapped to geneids may contain
       retired gene ids.

       if species_li is None, genes from all species are loaded.

       Note that all ids are int type.
    '''
    if species_li:
        taxid_set = set([taxid_d[species] for species in species_li])
    else:
        taxid_set = None

    orig_cwd = os.getcwd()
    os.chdir(DATA_FOLDER)

    # check cache file
    _cache_file = 'gene/geneid_d.pyobj'
    if load_cache and os.path.exists(_cache_file) and \
       file_newer(_cache_file, 'gene/gene_info.gz') and \
       file_newer(_cache_file, 'gene/gene_history.gz'):

        print('Loading "geneid_d" from cache file...', end='')
        _taxid_set, out_d = loadobj(_cache_file)
        assert _taxid_set == taxid_set
        print('Done.')
        os.chdir(orig_cwd)
        return out_d

    DATAFILE = os.path.join(DATA_FOLDER, 'gene/gene_info.gz')
    load_start(DATAFILE)
    if species_li:
        species_filter = lambda ld: int(ld[0]) in taxid_set
    else:
        species_filter = None
    geneid_li = set(tab2list(DATAFILE, 1, includefn=species_filter))
    load_done('[%d]' % len(geneid_li))

    DATAFILE = os.path.join(DATA_FOLDER, 'gene/gene_history.gz')
    load_start(DATAFILE)

    if species_li:
        _includefn = lambda ld: int(ld[0]) in taxid_set and ld[1] in geneid_li
    else:
        _includefn = lambda ld: ld[1] in geneid_li    # include all species
    retired2gene = tab2dict(DATAFILE, (1, 2), 1, alwayslist=0, includefn=_includefn)
    # includefn above makes sure the taxid is in species_li and keeps only rows whose mapped-to geneid exists in the gene_info list

    load_done('[%d]' % len(retired2gene))

    out_d = dict_convert(retired2gene, keyfn=int, valuefn=int)    # convert key/value to int
    for g in geneid_li:
        _g = int(g)
        out_d[_g] = _g

    if save_cache:
        if species_li:
            dump((taxid_set, out_d), _cache_file)
        else:
            dump((None, out_d), _cache_file)

    os.chdir(orig_cwd)
    return out_d
Example #15
def load_data(step=1000, offset=0, gwas_data_local=None):
    if gwas_data_local:
        gwas_data = loadobj('gwasdata.pyobj')
        for item in gwas_data:
            snp = item
            chrom = snp[1]
            chrom = chrom[3:]
            rsid = snp[4]
            pubMedID = snp[5]
            title = snp[9]
            trait = snp[10]
            region = snp[13]
            gene_name = snp[14]
            riskAllele = snp[15]
            riskAlleleFreq = snp[16]
            if not is_float(riskAlleleFreq):
                riskAlleleFreq = None
            pValue = snp[17]
            pValue_desc = snp[18]
            if not is_float(pValue):
                pValue = None
                pValue_desc = None
            # query myvariant.info to get hgvs_id,
            # ref, alt information based on rsid
            url = 'http://localhost:8000/v1/query?q=dbsnp.rsid:'\
                + rsid + '&fields=_id,dbsnp.ref,dbsnp.alt,dbsnp.chrom,dbsnp.hg19'
            r = requests.get(url)
            for hits in r.json()['hits']:
                HGVS = hits['_id']

                one_snp_json = {
                    "_id": HGVS,
                    "gwassnp":
                        {
                            "rsid": rsid,
                            "pubmed": pubMedID,
                            "title": title,
                            "trait": trait,
                            "region": region,
                            "genename": gene_name,
                            "risk_allele": riskAllele,
                            "risk_allele_freq": riskAlleleFreq,
                            "pvalue": pValue,
                            "pvalue_desc": pValue_desc
                        }
                }
                yield one_snp_json
    else:
        MySQLHG19 = MySQLdb.connect('genome-mysql.cse.ucsc.edu',
                                    db='hg19', user='******', passwd='password')
        Cursor = MySQLHG19.cursor()

        # get the row number of gwasCatalog
        sql = "SELECT COUNT(*) FROM gwasCatalog"
        Cursor.execute(sql)
        numrows = Cursor.fetchone()[0]
        print(numrows)

        sql = "SELECT * FROM gwasCatalog"
        Cursor.execute(sql)

        for i in range(numrows):
            snp = Cursor.fetchone()
            if i and i % step == 0:
                print(i)

            chrom = snp[1]
            chrom = chrom[3:]
            rsid = snp[4]
            pubMedID = snp[5]
            title = snp[9]
            trait = snp[10]
            region = snp[13]
            gene_name = snp[14]
            riskAllele = snp[15]
            riskAlleleFreq = snp[16]
            if not is_float(riskAlleleFreq):
                riskAlleleFreq = None
            pValue = snp[17]
            pValue_desc = snp[18]
            if not is_float(pValue):
                pValue = None
                pValue_desc = None
            # query myvariant.info to get hgvs_id, ref, alt information based on rsid
            url = 'http://localhost:8000/v1/query?q=dbsnp.rsid:'\
                + rsid + '&fields=_id,dbsnp.ref,dbsnp.alt,dbsnp.chrom,dbsnp.hg19'
            r = requests.get(url)
            for hits in r.json()['hits']:
                HGVS = hits['_id']
                one_snp_json = {
                    "_id": HGVS,
                    "gwassnp":
                        {
                            "rsid": rsid,
                            "pubmed": pubMedID,
                            "title": title,
                            "trait": trait,
                            "region": region,
                            "genename": gene_name,
                            "risk_allele": riskAllele,
                            "risk_allele_freq": riskAlleleFreq,
                            "pvalue": pValue,
                            "pvalue_desc": pValue_desc
                        }
                }
                yield one_snp_json
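One plausible driver for the load_data generator streams its documents into MongoDB in batches; pymongo and the database/collection names here are assumptions.

from pymongo import MongoClient

coll = MongoClient()['gwas']['gwassnps']
batch = []
for doc in load_data(gwas_data_local=True):
    batch.append(doc)
    if len(batch) >= 1000:
        coll.insert_many(batch)
        batch = []
if batch:
    coll.insert_many(batch)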
Example #17
 def _load_entrez_geneid_d(self):
     self._entrez_geneid_d = loadobj(
         ("entrez_gene__geneid_d.pyobj", self.src), mode='gridfs')
Example #18
 def load_chr_data(self):
     print("\tLoading chromosome data...", end='')
     self._chr_data = loadobj(HG19_DATAFILE)
     print("Done.")
Example #19
def sync_from_one_diff(index, collection, diff_filepath, validate=False,
                       wait=60, dryrun=False, returncnt=False, save2file=None):
    sync = ESSyncer(index=index)
    #sync._index = index
    #sync._esi._index = index
    diff = loadobj(diff_filepath)
    source_collection = diff['source']
    add_iter = sync.add(source_collection, diff['add'])
    delete_iter = sync.delete(collection, diff['delete'])
    update_iter = sync.update2(diff['update'], collection, source_collection)
    t00 = time()
    if save2file:
        from itertools import chain
        import json
        for op in chain(add_iter, delete_iter, update_iter):
            json.dump(op, save2file)
        print("="*20)
        print("Finished! [{}]".format(timesofar(t00)))
        return

    print('Adding {} new docs...'.format(len(diff['add'])))
    t0 = time()
    if not dryrun:
        try:
            bulk(sync._es, add_iter)
        except:
            pass
    print("Done. [{}]".format(timesofar(t0)))

    print('Deleting {} docs'.format(len(diff['delete'])))
    t0 = time()
    if not dryrun:
        bulk(sync._es, delete_iter)
    print("Done. [{}]".format(timesofar(t0)))

    print('Updating {} docs'.format(len(diff['update'])))
    t0 = time()
    if not dryrun:
        bulk(sync._es, update_iter)
    print("Done. [{}]".format(timesofar(t0)))

    # add flush and refresh
    try:
        res = sync._es.indices.flush()
        print("Flushing...", res)
        res = sync._es.indices.refresh()
        print("Refreshing...", res)
    except:
        pass

    print("="*20)
    print("Finished! [{}]".format(timesofar(t00)))

    if returncnt:
        cnt = {
            'add': len(diff['add']),
            'delete': len(diff['delete']),
            'update': len(diff['update'])
        }
        return cnt

    if validate:
        print('Waiting {}s to let ES finish...'.format(wait), end="")
        sleep(wait)
        print("Done.")
        print("Validating...")
        t0 = time()
        q = {
            "query": {
                "constant_score": {
                    "filter": {
                        "exists": {
                            "field": 'clinvar'
                        }
                    }
                }
            }
        }
        data = sync._esi.doc_feeder(query=q, _source=collection)
        temp_collection = collection + '_temp_' + get_random_string()
        sync._src[temp_collection].drop()
        load_source(temp_collection, src_data=data)
        c1 = get_backend(source_collection, 'mongodb')
        c2 = get_backend(temp_collection, 'mongodb')
        diff_result = diff_collections(c1, c2, use_parallel=False)
        sync._src[temp_collection].drop()
        print("Done. [{}]".format(t0))
        return diff_result
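A hypothetical invocation, dry-running first to check counts before touching ES; the index, collection, and diff path are made up.

cnt = sync_from_one_diff('myvariant_current', 'clinvar',
                         '/data/diffs/1.pyobj', dryrun=True, returncnt=True)
print(cnt)    # e.g. {'add': 120, 'delete': 3, 'update': 4500}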
Example #20
 def _load_entrez_geneid_d(self):
     self._entrez_geneid_d = loadobj(("entrez_gene__geneid_d.pyobj", self.src), mode='gridfs')