Esempio n. 1
0
def clone_index(createidx=False, test=True):
    """Copy every 'variant' doc of index 'myvariant_all_1' into 'myvariant_current_3'.

    The default ``test=True`` turns the call into a no-op: the function
    returns immediately without touching Elasticsearch. Pass ``test=False``
    to actually run the copy.

    :param createidx: when true, create the target index (10 shards) and
        install the mapping returned by ``mapping.get_mapping()`` first.
    :param test: safety switch; nothing is done while it is true.
    """
    if test:
        return
    from utils.es import ESIndexer
    from utils.common import iter_n

    target_index = 'myvariant_current_3'
    batch_size = 10000
    if createidx:
        from mapping import get_mapping
        variant_mapping = get_mapping()
        index_settings = {'settings': {'number_of_shards': 10}}  # ###
        # NOTE(review): ``es`` is not defined in this function; presumably a
        # module-level Elasticsearch client -- confirm.
        es.indices.create(target_index, body=index_settings)
        es.indices.put_mapping(index=target_index,
                               doc_type='variant',
                               body=variant_mapping)
    # Alternative left disabled by the original author:
    # helpers.reindex(es, source_index='myvariant_all',
    #                 target_index= new_idx, chunk_size=10000)
    indexer = ESIndexer()
    source_docs = indexer.doc_feeder(index='myvariant_all_1',
                                     doc_type='variant',
                                     step=batch_size)

    # The same options are passed for every batch.
    index_options = dict(index_name=target_index,
                         doc_type='variant',
                         step=batch_size,
                         verbose=False,
                         update=True)
    for doc_batch in iter_n(source_docs, batch_size):
        do_index(doc_batch, **index_options)
Esempio n. 2
0
def sync_index(config, use_parallel=True, noconfirm=False):
    """Diff a merged MongoDB target collection against its Elasticsearch index.

    The build configuration ``config`` is loaded into a ``DataBuilder``; the
    collection it picks is the diff source and the ES index named
    ``'genedoc_' + <build config name>`` is the diff target. A one-line
    summary of each side is printed before asking for confirmation.

    :param config: build configuration passed to ``load_build_config``.
    :param use_parallel: stored on the ES indexer as ``use_parallel``.
    :param noconfirm: when true, skip the interactive "Continue?" prompt.
    :returns: the result of ``diff.diff_collections(sync_src, sync_target)``,
        or ``None`` when the user does not answer "Y".
    """
    bdr = DataBuilder(backend='mongodb')
    bdr.load_build_config(config)
    target_collection = bdr.pick_target_collection()
    target_es_index = 'genedoc_' + bdr._build_config['name']

    sync_src = backend.GeneDocMongoDBBackend(target_collection)

    es_idxer = ESIndexer(bdr.get_mapping())
    es_idxer.ES_INDEX_NAME = target_es_index
    es_idxer.step = 10000
    es_idxer.use_parallel = use_parallel
    sync_target = backend.GeneDocESBackend(es_idxer)

    # Single-argument print(...) produces the same output whether ``print``
    # is the Python 2 statement or the Python 3 function.
    print('\tsync_src:\t{:<40}{}\t{}'.format(target_collection.name,
                                             sync_src.name,
                                             sync_src.count()))
    print('\tsync_target\t{:<40}{}\t{}'.format(target_es_index,
                                               sync_target.name,
                                               sync_target.count()))
    if noconfirm or ask("Continue?") == "Y":
        changes = diff.diff_collections(sync_src, sync_target)
        return changes
    return None
Esempio n. 3
0
 def __init__(self, index=None, doc_type=None, es_host=None, step=5000):
     """Create the ES client, the ESIndexer helper and the source DB handle.

     ``index`` and ``doc_type`` fall back to ``config.ES_INDEX_NAME`` and
     ``config.ES_DOC_TYPE`` when falsy. ``es_host`` is handed to both
     ``get_es`` and ``ESIndexer``; ``step`` is stored unchanged.
     """
     self.step = step
     self._es = get_es(es_host)
     self._index = index if index else config.ES_INDEX_NAME
     self._doc_type = doc_type if doc_type else config.ES_DOC_TYPE
     # Build the indexer and point it at the index chosen above.
     indexer = ESIndexer(es_host=es_host)
     indexer._index = self._index
     self._esi = indexer
     self._src = get_src_db()
Esempio n. 4
0
    def sync_index(self, use_parallel=True):
        """Diff the target collection against the ES index of the same name.

        :param use_parallel: stored on the ES indexer as ``use_parallel``.
        :returns: whatever ``utils.diff.diff_collections`` returns.
        """
        from utils import diff

        source_backend = self.get_target_collection()

        # The ES index is named after the underlying target collection.
        indexer = ESIndexer(self.get_mapping())
        indexer.ES_INDEX_NAME = source_backend.target_collection.name
        indexer.step = 10000
        indexer.use_parallel = use_parallel
        es_backend = databuild.backend.GeneDocESBackend(indexer)

        return diff.diff_collections(source_backend, es_backend)
Esempio n. 5
0
    def sync_index(self, use_parallel=True):
        """Compare the target collection with its same-named ES index.

        Builds an ``ESIndexer`` from ``self.get_mapping()``, wraps it in a
        ``GeneDocESBackend`` and returns the result of
        ``diff.diff_collections(source, es_backend)``.
        """
        from utils import diff

        src_backend = self.get_target_collection()

        idxer = ESIndexer(self.get_mapping())
        # Index name mirrors the name of the collection behind the source.
        idxer.ES_INDEX_NAME = src_backend.target_collection.name
        idxer.step = 10000
        idxer.use_parallel = use_parallel
        tgt_backend = databuild.backend.GeneDocESBackend(idxer)

        result = diff.diff_collections(src_backend, tgt_backend)
        return result
Esempio n. 6
0
 def _get_ids_worker(args):
     """Return the ``_id`` of every hit in one page of a sorted match-all query.

     ``args`` is a 3-tuple ``(es_kwargs, start, step)``: ``es_kwargs`` are the
     keyword arguments for ``ESIndexer``, ``start`` is the query offset and
     ``step`` the page size. The hit total reported by the search must equal
     the indexer's document count, otherwise an ``AssertionError`` is raised.
     """
     from utils.es import ESIndexer
     from pyes import MatchAllQuery
     es_kwargs, start, step = args
     search = MatchAllQuery().search()
     # Fixed sort order on both keys; no stored fields are requested.
     search.sort = [{'entrezgene': 'asc'}, {'ensembl.gene': 'asc'}]
     search.fields = []
     search.start = start
     search.size = step
     indexer = ESIndexer(**es_kwargs)
     expected_total = indexer.count()['count']
     response = indexer.conn.search_raw(search)
     hits = response['hits']
     assert hits['total'] == expected_total
     return [hit['_id'] for hit in hits['hits']]
Esempio n. 7
0
    def __init__(self, build_config=None, backend='mongodb'):
        """Set builder defaults and choose the target backend.

        :param build_config: stored unchanged in ``self._build_config``.
        :param backend: one of ``'mongodb'``, ``'es'``, ``'couchdb'`` or
            ``'memory'``; selects the object assigned to ``self.target``.
        :raises ValueError: for any other ``backend`` value.
        """
        self.src = get_src_db()
        self.step = 10000
        self.use_parallel = False
        # save output into a logging file when merge is called.
        self.merge_logging = True
        # max no. of records kept in "build" field of src_build collection.
        self.max_build_status = 10

        self.using_ipython_cluster = False
        self.shutdown_ipengines_after_done = False
        self.log_folder = LOG_FOLDER

        self._build_config = build_config
        self._entrez_geneid_d = None
        self._idmapping_d_cache = {}

        self.get_src_master()

        # Backend selection happens last, after get_src_master() has run.
        if backend == 'mongodb':
            target = databuild.backend.GeneDocMongoDBBackend()
        elif backend == 'es':
            target = databuild.backend.GeneDocESBackend(ESIndexer())
        elif backend == 'couchdb':
            # couchdb and its URL are imported only when this backend is used.
            from config import COUCHDB_URL
            import couchdb
            server = couchdb.Server(COUCHDB_URL)
            target = databuild.backend.GeneDocCouchDBBackend(server)
        elif backend == 'memory':
            target = databuild.backend.GeneDocMemeoryBackend()
        else:
            raise ValueError('Invalid backend "%s".' % backend)
        self.target = target
Esempio n. 8
0
def test():
    """Return a ``(MongoDB backend, ES backend)`` pair for manual testing.

    The MongoDB side wraps the hard-coded collection
    'genedoc_mygene_allspecies_20130402_uiu7bkyi' of the target database;
    the ES side wraps a default-constructed ``ESIndexer``.
    """
    target_db = get_target_db()
    collection_name = 'genedoc_mygene_allspecies_20130402_uiu7bkyi'
    mongo_backend = backend.GeneDocMongoDBBackend(target_db[collection_name])
    es_backend = backend.GeneDocESBackend(ESIndexer())
    return mongo_backend, es_backend
Esempio n. 9
0
def get_backend(target_name, bk_type, **kwargs):
    '''Return a backend instance for the given target_name and backend type.

    Supported ``bk_type`` values are 'mongodb' and 'es'. For 'es' the extra
    keyword arguments are forwarded to ``ESIndexer``. Any other ``bk_type``
    yields ``None``.
    '''
    if bk_type == 'es':
        return GeneDocESBackend(ESIndexer(target_name, **kwargs))
    if bk_type == 'mongodb':
        return GeneDocMongoDBBackend(target_name)
    # Unknown backend types are not an error here; callers get None.
    return None
Esempio n. 10
0
 def build_index2(self, build_config='mygene_allspecies', last_build_idx=-1, use_parallel=False, es_host=None, es_index_name=None):
     """Build ES index from last successfully-merged mongodb collection.

     optional "es_host" argument can be used to specified another ES host, otherwise default ES_HOST.
     optional "es_index_name" argument can be used to pass an alternative index name, otherwise same as mongodb collection name

     The build record at ``last_build_idx`` must have status 'success', a
     'mongodb' target backend, non-empty 'stats' and a non-empty 'target';
     otherwise ``AssertionError`` is raised. The checks are explicit raises
     (not ``assert`` statements) so they still run under ``python -O``,
     since a later step may delete an existing index.

     The documents are read from the "genedoc_<build_config>_current"
     collection. The user is asked to confirm before building and again
     before deleting an already-existing index.
     """
     from pprint import pprint
     self.load_build_config(build_config)
     last_build = self._build_config['build'][last_build_idx]
     # Single-argument print(...) behaves the same as the Python 2 print
     # statement and as the Python 3 print function.
     print("Last build record:")
     pprint(last_build)
     if last_build['status'] != 'success':
         raise AssertionError("Abort. Last build did not success.")
     if last_build['target_backend'] != "mongodb":
         raise AssertionError(
             'Abort. Last build need to be built using "mongodb" backend.')
     if not last_build.get('stats', None):
         raise AssertionError('Abort. Last build stats are not available.')
     self._stats = last_build['stats']
     if not last_build.get('target', None):
         raise AssertionError(
             'Abort. Last build target_collection is not available.')
     #target_collection = last_build['target']
     target_collection = "genedoc_{}_current".format(build_config)  ######
     _db = get_target_db()
     target_collection = _db[target_collection]
     print("")
     print("Source:  %s" % target_collection.name)
     # Attach source version, build stats and timestamp as mapping metadata.
     _mapping = self.get_mapping()
     _meta = {}
     src_version = self.get_src_version()
     if src_version:
         _meta['src_version'] = src_version
     if getattr(self, '_stats', None):
         _meta['stats'] = self._stats
     if 'timestamp' in last_build:
         _meta['timestamp'] = last_build['timestamp']
     if _meta:
         _mapping['_meta'] = _meta
     es_index_name = es_index_name or target_collection.name
     es_idxer = ESIndexer(mapping=_mapping,
                          es_index_name=es_index_name,
                          es_host=es_host,
                          step=5000)
     if build_config == 'mygene_allspecies':
         es_idxer.number_of_shards = 10   # default 5
     print("ES host: %s" % es_idxer.conn.servers[0].geturl())
     print("ES index: %s" % es_index_name)
     if ask("Continue to build ES index?") == 'Y':
         es_idxer.use_parallel = use_parallel
         #es_idxer.s = 609000
         if es_idxer.conn.indices.exists_index(es_idxer.ES_INDEX_NAME):
             if ask('Index "{}" exists. Delete?'.format(es_idxer.ES_INDEX_NAME)) == 'Y':
                 es_idxer.conn.indices.delete_index(es_idxer.ES_INDEX_NAME)
             else:
                 print("Abort.")
                 return
         es_idxer.create_index()
         #es_idxer.delete_index_type(es_idxer.ES_INDEX_es.pTYPE, noconfirm=True)
         es_idxer.build_index(target_collection, verbose=False)
Esempio n. 11
0
def clone_index(createidx=False, test=True):
    """Re-index all 'variant' docs from 'myvariant_all_1' into 'myvariant_current_3'.

    Nothing is done unless ``test`` is false: with the default ``test=True``
    the function returns right away.

    :param createidx: create the destination index (10 shards) and put the
        mapping from ``mapping.get_mapping()`` before copying.
    :param test: guard flag; the copy only runs when it is false.
    """
    if test:
        return
    from utils.es import ESIndexer
    from utils.common import iter_n

    dest_index = 'myvariant_current_3'
    chunk = 10000
    if createidx:
        from mapping import get_mapping
        mapping_body = get_mapping()
        settings_body = {'settings': {'number_of_shards': 10}}    # ###
        # NOTE(review): ``es`` comes from outside this function (presumably a
        # module-level client) -- confirm.
        es.indices.create(dest_index, body=settings_body)
        es.indices.put_mapping(index=dest_index, doc_type='variant', body=mapping_body)
    # Disabled alternative from the original:
    # helpers.reindex(es, source_index='myvariant_all',
    #                 target_index= new_idx, chunk_size=10000)
    feeder = ESIndexer().doc_feeder(index='myvariant_all_1', doc_type='variant', step=chunk)

    for batch in iter_n(feeder, chunk):
        do_index(batch,
                 index_name=dest_index,
                 doc_type='variant',
                 step=chunk,
                 verbose=False,
                 update=True)
Esempio n. 12
0
def make_test_index():
    """Build a small 'gene_sample' doc type inside index 'genedoc_2'.

    Sample documents are the search hits for the queries 'CDK2', 'BTK' and
    'insulin'. The existing 'gene_sample' type is deleted on a best-effort
    basis, the mapping of the 'gene' type is copied over, and every sample
    document is indexed under its own ``_id``. Progress is printed to stdout.
    """

    def get_sample_gene(gene):
        """Return the ``_source`` of every hit (up to 1000) for ``gene``."""
        qbdr = ESQueryBuilder(fields=['_source'], size=1000)
        _query = qbdr.dis_max_query(gene)
        _query = qbdr.add_species_custom_filters_score(_query)
        _q = {'query': _query}
        if qbdr.options:
            _q.update(qbdr.options)

        esq = ESQuery()
        res = esq._search(_q)
        return [h['_source'] for h in res['hits']['hits']]

    gli = get_sample_gene('CDK2') + \
          get_sample_gene('BTK')  + \
          get_sample_gene('insulin')

    from utils.es import ESIndexer
    index_name = 'genedoc_2'
    index_type = 'gene_sample'
    esidxer = ESIndexer(None, None)
    conn = esidxer.conn
    try:
        esidxer.delete_index_type(index_type)
    except Exception:
        # Best effort: the type may simply not exist yet. Only ``Exception``
        # is swallowed so Ctrl-C / SystemExit still propagate.
        pass
    mapping = dict(conn.get_mapping('gene', index_name)['gene'])
    # Single-argument print(...) gives the same output with the Python 2
    # print statement and the Python 3 print function.
    print(conn.put_mapping(index_type, mapping, [index_name]))

    print("Building index...")
    cnt = 0
    for cnt, doc in enumerate(gli, 1):
        conn.index(doc, index_name, index_type, doc['_id'])
        print('%s : %s' % (cnt, doc['_id']))
    print(conn.flush())
    print(conn.refresh())
    print('Done! - {} docs indexed.'.format(cnt))
Esempio n. 13
0
def sync_index(config, use_parallel=True, noconfirm=False):
    """Diff the picked MongoDB target collection against its ES index.

    The ES index name is ``'genedoc_' + <build config name>``. A summary
    line is printed for each side; unless ``noconfirm`` is true the user
    must answer "Y" to "Continue?" for the diff to run.

    :returns: the ``diff.diff_collections`` result, or ``None`` when the
        user declines.
    """
    builder = DataBuilder(backend='mongodb')
    builder.load_build_config(config)
    source_collection = builder.pick_target_collection()
    es_index_name = 'genedoc_' + builder._build_config['name']

    mongo_backend = backend.GeneDocMongoDBBackend(source_collection)

    indexer = ESIndexer(builder.get_mapping())
    indexer.ES_INDEX_NAME = es_index_name
    indexer.step = 10000
    indexer.use_parallel = use_parallel
    es_backend = backend.GeneDocESBackend(indexer)

    # Each summary is formatted and printed before the next one is computed.
    src_summary = '\tsync_src:\t{:<40}{}\t{}'.format(
        source_collection.name, mongo_backend.name, mongo_backend.count())
    print(src_summary)
    target_summary = '\tsync_target\t{:<40}{}\t{}'.format(
        es_index_name, es_backend.name, es_backend.count())
    print(target_summary)

    if not noconfirm and ask("Continue?") != "Y":
        return None
    return diff.diff_collections(mongo_backend, es_backend)
Esempio n. 14
0
def diff2src(use_parallel=True, noconfirm=False):
    """Interactively pick two 'genedoc*' sources and diff them.

    Candidate sources are the MongoDB collections of the target database and
    the ES indices whose names start with 'genedoc'; each is listed as a
    ``(name, count, kind)`` tuple with ``kind`` being 'mongodb' or 'es'.

    :param use_parallel: forwarded to ``diff.diff_collections``.
    :param noconfirm: when true, skip the "Continue?" prompt.
    :returns: the ``diff.diff_collections`` result, or ``None`` when the
        user declines.
    """
    src_li = []

    target_db = get_target_db()
    src_li.extend([(name, target_db[name].count(), 'mongodb')
                   for name in sorted(target_db.collection_names())
                   if name.startswith('genedoc')])

    es_idxer = ESIndexer()
    es_idxer.conn.default_indices = []
    for es_idx in es_idxer.conn.indices.get_indices():
        if es_idx.startswith('genedoc'):
            es_idxer.ES_INDEX_NAME = es_idx
            src_li.append((es_idx, es_idxer.count()['count'], 'es'))

    print("Found {} sources:".format(len(src_li)))
    src_1 = _pick_one(src_li, "Pick first source above: ")
    # The first pick is removed so it cannot be chosen twice.
    src_li.remove(src_1)
    # A bare ``print`` is a no-op expression when print is a function;
    # print("") emits the blank separator line under both Python 2 and 3.
    print("")
    src_2 = _pick_one(src_li, "Pick second source above: ")

    sync_li = []
    for src in (src_1, src_2):
        if src[2] == 'mongodb':
            b = backend.GeneDocMongoDBBackend(target_db[src[0]])
        elif src[2] == 'es':
            # A fresh indexer per ES source, bound to the picked index.
            es_idxer = ESIndexer()
            es_idxer.ES_INDEX_NAME = src[0]
            es_idxer.step = 10000
            b = backend.GeneDocESBackend(es_idxer)
        sync_li.append(b)

    sync_src, sync_target = sync_li
    print('\tsync_src:\t{:<45}{}\t{}'.format(*src_1))
    print('\tsync_target\t{:<45}{}\t{}'.format(*src_2))
    if noconfirm or ask("Continue?") == "Y":
        changes = diff.diff_collections(sync_src,
                                        sync_target,
                                        use_parallel=use_parallel)
        return changes
    return None
Esempio n. 15
0
def validate(build_config=None):
    """Diff the last build's MongoDB target against the same-named ES index.

    The build record is the last entry of the ``build`` list of the
    ``src_build`` document whose ``_id`` equals ``build_config``. The ES
    side is reached at ``127.0.0.1:<es_local_tunnel_port>``.

    :returns: the result of ``diff_collections`` (parallel, step 10000).
    """
    from pprint import pprint
    from utils.diff import diff_collections
    from databuild.backend import GeneDocMongoDBBackend, GeneDocESBackend
    from biothings.utils.mongo import get_src_build, get_target_db
    from utils.es import ESIndexer

    build_doc = get_src_build().find_one({'_id': build_config})
    latest_build = build_doc['build'][-1]
    print("Last build record:")
    pprint(latest_build)
    target_name = latest_build['target']

    mongo_backend = GeneDocMongoDBBackend(get_target_db()[target_name])
    # NOTE(review): ``es_local_tunnel_port`` is not defined in this function;
    # presumably a module-level setting -- confirm.
    tunnel_host = '127.0.0.1:' + str(es_local_tunnel_port)
    es_backend = GeneDocESBackend(
        ESIndexer(es_index_name=target_name, es_host=tunnel_host))
    return diff_collections(mongo_backend, es_backend,
                            use_parallel=True, step=10000)
Esempio n. 16
0
def diff2src(use_parallel=True, noconfirm=False):
    """Interactively pick two 'genedoc*' sources and diff them.

    Candidate sources are the MongoDB collections of the target database and
    the ES indices whose names start with 'genedoc'; each is listed as a
    ``(name, count, kind)`` tuple with ``kind`` being 'mongodb' or 'es'.

    :param use_parallel: forwarded to ``diff.diff_collections``.
    :param noconfirm: when true, skip the "Continue?" prompt.
    :returns: the ``diff.diff_collections`` result, or ``None`` when the
        user declines.
    """
    src_li = []

    target_db = get_target_db()
    src_li.extend([(name, target_db[name].count(), 'mongodb')
                   for name in sorted(target_db.collection_names())
                   if name.startswith('genedoc')])

    es_idxer = ESIndexer()
    es_idxer.conn.default_indices = []
    for es_idx in es_idxer.conn.indices.get_indices():
        if es_idx.startswith('genedoc'):
            es_idxer.ES_INDEX_NAME = es_idx
            src_li.append((es_idx, es_idxer.count()['count'], 'es'))

    # Single-argument print(...) calls give the same output with the
    # Python 2 print statement and the Python 3 print function.
    print("Found {} sources:".format(len(src_li)))
    src_1 = _pick_one(src_li, "Pick first source above: ")
    # The first pick is removed so it cannot be chosen twice.
    src_li.remove(src_1)
    print("")
    src_2 = _pick_one(src_li, "Pick second source above: ")

    sync_li = []
    for src in (src_1, src_2):
        if src[2] == 'mongodb':
            b = backend.GeneDocMongoDBBackend(target_db[src[0]])
        elif src[2] == 'es':
            # A fresh indexer per ES source, bound to the picked index.
            es_idxer = ESIndexer()
            es_idxer.ES_INDEX_NAME = src[0]
            es_idxer.step = 10000
            b = backend.GeneDocESBackend(es_idxer)
        sync_li.append(b)

    sync_src, sync_target = sync_li
    print('\tsync_src:\t{:<45}{}\t{}'.format(*src_1))
    print('\tsync_target\t{:<45}{}\t{}'.format(*src_2))
    if noconfirm or ask("Continue?") == "Y":
        changes = diff.diff_collections(sync_src, sync_target, use_parallel=use_parallel)
        return changes
    return None
Esempio n. 17
0
 def build_index(self, use_parallel=True):
     """Build the ES index 'genedoc_<build config name>' from the target collection.

     When ``self.get_target_collection()`` returns a falsy value a message
     is logged at INFO level and nothing else happens. Otherwise the index
     is created, the existing doc type is deleted without confirmation,
     the documents are indexed and the index is optimized.
     """
     target_collection = self.get_target_collection()
     if not target_collection:
         logging.info("Error: target collection is not ready yet or failed to build.")
         return
     indexer = ESIndexer(mapping=self.get_mapping())
     indexer.ES_INDEX_NAME = 'genedoc_' + self._build_config['name']
     indexer.step = 10000
     indexer.use_parallel = use_parallel
     #es_idxer.s = 609000
     #es_idxer.conn.indices.delete_index(es_idxer.ES_INDEX_NAME)
     indexer.create_index()
     indexer.delete_index_type(indexer.ES_INDEX_TYPE, noconfirm=True)
     indexer.build_index(target_collection, verbose=False)
     indexer.optimize()
Esempio n. 18
0
 def build_index(self, use_parallel=True):
     """Index the target collection into ES index 'genedoc_<build config name>'.

     A falsy result from ``self.get_target_collection()`` only produces an
     INFO log message. Otherwise: create the index, delete the existing doc
     type (no confirmation), index the collection, then optimize.
     """
     source = self.get_target_collection()
     if source:
         idxer = ESIndexer(mapping=self.get_mapping())
         index_name = 'genedoc_' + self._build_config['name']
         idxer.ES_INDEX_NAME = index_name
         idxer.step = 10000
         idxer.use_parallel = use_parallel
         #es_idxer.s = 609000
         #es_idxer.conn.indices.delete_index(es_idxer.ES_INDEX_NAME)
         idxer.create_index()
         idxer.delete_index_type(idxer.ES_INDEX_TYPE, noconfirm=True)
         idxer.build_index(source, verbose=False)
         idxer.optimize()
         return
     not_ready_msg = "Error: target collection is not ready yet or failed to build."
     logging.info(not_ready_msg)
Esempio n. 19
0
class ESSyncer():
    def __init__(self, index=None, doc_type=None, es_host=None, step=5000):
        """Create the ES client, the ESIndexer helper and the source DB handle.

        ``index`` / ``doc_type`` default to ``config.ES_INDEX_NAME`` /
        ``config.ES_DOC_TYPE`` when falsy; ``es_host`` is passed to both
        ``get_es`` and ``ESIndexer``.
        """
        self.step = step
        self._es = get_es(es_host)
        self._index = index if index else config.ES_INDEX_NAME
        self._doc_type = doc_type if doc_type else config.ES_DOC_TYPE
        # The indexer works on the same index as this syncer.
        indexer = ESIndexer(es_host=es_host)
        indexer._index = self._index
        self._esi = indexer
        self._src = get_src_db()

    def add(self, collection, ids):
        """Yield one bulk action per id: 'update' if it is indexed, else 'create'.

        Ids are checked against the index in chunks of 100 via
        ``self._esi.mexists``. For every id the document is fetched from
        ``self._src[collection]`` and its ``_id`` key is removed before it is
        embedded in the action. The update / create counts are printed once
        the generator is exhausted.

        :param collection: name of the source collection to read docs from.
        :param ids: iterable of document ids.
        """
        # compare id_list with current index, get list of ids with true/false indicator
        cnt_update = 0
        cnt_create = 0
        for ids_chunk in iter_n(ids, 100):
            id_list_all = self._esi.mexists(ids_chunk, verbose=False)
            for _id, _exists in id_list_all:
                # NOTE(review): find_one presumably returns None for an id
                # missing from the source, which would make pop() fail --
                # confirm that ids always exist in the collection.
                _doc = self._src[collection].find_one({'_id': _id})
                _doc.pop('_id')
                # case one: this id exists in current index, then just update
                if _exists:
                    es_info = {
                        '_op_type': 'update',
                        '_index': self._index,
                        '_type': self._doc_type,
                        '_id': _id,
                        'doc': _doc
                    }
                    cnt_update += 1
                # case two: this id not exists in current index, then create a new one
                else:
                    es_info = {
                        '_op_type': 'create',
                        '_index': self._index,
                        '_type': self._doc_type,
                        "_id": _id,
                        '_source': _doc
                    }
                    cnt_create += 1
                yield es_info
        print('items updated: ', cnt_update)
        print('items newly created: ', cnt_create)

	    def delete(self, field, ids):
		cnt_update = 0
		cnt_delete = 0
		for _id in ids:
		    # get doc from index based on id
		    if self._esi.exists(_id):
			doc = self._esi.get_variant(_id)['_source']
                        doc.pop('_id', None)
                # case one: only exist target field, or target field/snpeff/vcf, then we need to delete this item
                if len(set(doc) - set([field, 'snpeff', 'vcf', 'hg19', 'hg38', 'chrom'])) == 0:
                    es_info = {
                        '_op_type': 'delete',
                        '_index': self._index,
                        '_type': self._doc_type,
                        "_id": _id,
                    }
                    cnt_delete += 1
                # case two: exists fields other than snpeff, vcf and target field
                else:
                    # get rid of the target field, delete original doc, update the new doc
                    # plus count
                    # this requires enabling ElasticSearch dynamic scripting
                    es_info = {
                        '_op_type': 'update',
                        '_index': self._index,
                        '_type': self._doc_type,
                        '_id': _id,
                        "script": 'ctx._source.remove("{}");ctx._source.remove("_id")'.format(field)
                    }
                    cnt_update += 1
                yield es_info
            else:
Esempio n. 20
0
    def build_index2(self,
                     build_config='mygene_allspecies',
                     last_build_idx=-1,
                     use_parallel=False,
                     es_host=None,
                     es_index_name=None,
                     noconfirm=False):
        """Build ES index from last successfully-merged mongodb collection.

        optional "es_host" argument can be used to specified another ES host, otherwise default ES_HOST.
        optional "es_index_name" argument can be used to pass an alternative index name, otherwise same as mongodb collection name

        The build record at ``last_build_idx`` must have status 'success', a
        'mongodb' target backend, non-empty 'stats' and a non-empty 'target';
        otherwise ``AssertionError`` is raised. The checks are explicit
        raises (not ``assert`` statements) so they are still enforced under
        ``python -O``, since a later step may delete an existing index.

        With ``noconfirm`` true both interactive prompts are skipped and an
        existing index of the same name is deleted without asking.
        """
        self.load_build_config(build_config)
        if "build" not in self._build_config:
            raise AssertionError(
                "Abort. No such build records for config %s" % build_config)
        last_build = self._build_config['build'][last_build_idx]
        logging.info("Last build record:")
        logging.info(pformat(last_build))
        if last_build['status'] != 'success':
            raise AssertionError("Abort. Last build did not success.")
        if last_build['target_backend'] != "mongodb":
            raise AssertionError(
                'Abort. Last build need to be built using "mongodb" backend.')
        if not last_build.get('stats', None):
            raise AssertionError('Abort. Last build stats are not available.')
        self._stats = last_build['stats']
        if not last_build.get('target', None):
            raise AssertionError(
                'Abort. Last build target_collection is not available.')

        # Get the source collection to build the ES index
        # IMPORTANT: the collection in last_build['target'] does not contain _timestamp field,
        #            only the "genedoc_*_current" collection does. When "timestamp" is enabled
        #            in mappings, last_build['target'] collection won't be indexed by ES correctly,
        #            therefore, we use "genedoc_*_current" collection as the source here:
        #target_collection = last_build['target']
        target_collection = "genedoc_{}_current".format(build_config)
        _db = get_target_db()
        target_collection = _db[target_collection]
        logging.info("")
        logging.info('Source: %s' % target_collection.name)
        # Attach source version, build stats and timestamp as mapping metadata.
        _mapping = self.get_mapping()
        _meta = {}
        src_version = self.get_src_version()
        if src_version:
            _meta['src_version'] = src_version
        if getattr(self, '_stats', None):
            _meta['stats'] = self._stats
        if 'timestamp' in last_build:
            _meta['timestamp'] = last_build['timestamp']
        if _meta:
            _mapping['_meta'] = _meta
        es_index_name = es_index_name or target_collection.name
        es_idxer = ESIndexer(mapping=_mapping,
                             es_index_name=es_index_name,
                             es_host=es_host,
                             step=5000)
        if build_config == 'mygene_allspecies':
            es_idxer.number_of_shards = 10  # default 5
        es_idxer.check()
        if noconfirm or ask("Continue to build ES index?") == 'Y':
            es_idxer.use_parallel = use_parallel
            #es_idxer.s = 609000
            if es_idxer.exists_index(es_idxer.ES_INDEX_NAME):
                if noconfirm or ask('Index "{}" exists. Delete?'.format(
                        es_idxer.ES_INDEX_NAME)) == 'Y':
                    es_idxer.conn.indices.delete(es_idxer.ES_INDEX_NAME)
                else:
                    logging.info("Abort.")
                    return
            es_idxer.create_index()
            #es_idxer.delete_index_type(es_idxer.ES_INDEX_TYPE, noconfirm=True)
            es_idxer.build_index(target_collection, verbose=False)
Esempio n. 21
0
def build_index(config, use_parallel=True, noconfirm=False):
    """Build the ES index 'genedoc_<build config name>' from a picked collection.

    The ES target is printed first; unless ``noconfirm`` is true the user
    must answer "Y" to "Continue?". The index is then created, the existing
    doc type deleted (``noconfirm`` is forwarded), the collection indexed
    and the index optimized.

    :param config: build configuration passed to ``load_build_config``.
    :param use_parallel: stored on the ES indexer as ``use_parallel``.
    :param noconfirm: skip interactive confirmations when true.
    """
    builder = DataBuilder(backend='mongodb')
    builder.load_build_config(config)
    source_collection = builder.pick_target_collection()
    index_name = 'genedoc_' + builder._build_config['name']

    if not source_collection:
        print("Error: target collection is not ready yet or failed to build.")
        return

    indexer = ESIndexer(mapping=builder.get_mapping())
    indexer.ES_INDEX_NAME = index_name
    indexer.step = 10000
    indexer.use_parallel = use_parallel
    server_url = indexer.conn.servers[0].geturl()
    print("ES target: {}/{}/{}".format(server_url, indexer.ES_INDEX_NAME,
                                       indexer.ES_INDEX_TYPE))
    if not noconfirm and ask("Continue?") != 'Y':
        print("Aborted.")
        return

    #es_idxer.s = 609000
    #es_idxer.conn.indices.delete_index(es_idxer.ES_INDEX_NAME)
    indexer.create_index()
    indexer.delete_index_type(indexer.ES_INDEX_TYPE, noconfirm=noconfirm)
    indexer.build_index(source_collection, verbose=False)
    indexer.optimize()
Esempio n. 22
0
    def build_index2(self, build_config='mygene_allspecies', last_build_idx=-1, use_parallel=False, es_host=None, es_index_name=None, noconfirm=False):
        """Build ES index from last successfully-merged mongodb collection.

        optional "es_host" argument can be used to specified another ES host, otherwise default ES_HOST.
        optional "es_index_name" argument can be used to pass an alternative index name, otherwise same as mongodb collection name

        The build record at ``last_build_idx`` must have status 'success', a
        'mongodb' target backend, non-empty 'stats' and a non-empty 'target';
        otherwise ``AssertionError`` is raised. The checks are explicit
        raises (not ``assert`` statements) so they are still enforced under
        ``python -O``, since a later step may delete an existing index.

        With ``noconfirm`` true both interactive prompts are skipped and an
        existing index of the same name is deleted without asking.
        """
        self.load_build_config(build_config)
        if "build" not in self._build_config:
            raise AssertionError("Abort. No such build records for config %s" % build_config)
        last_build = self._build_config['build'][last_build_idx]
        logging.info("Last build record:")
        logging.info(pformat(last_build))
        if last_build['status'] != 'success':
            raise AssertionError("Abort. Last build did not success.")
        if last_build['target_backend'] != "mongodb":
            raise AssertionError('Abort. Last build need to be built using "mongodb" backend.')
        if not last_build.get('stats', None):
            raise AssertionError('Abort. Last build stats are not available.')
        self._stats = last_build['stats']
        if not last_build.get('target', None):
            raise AssertionError('Abort. Last build target_collection is not available.')

        # Get the source collection to build the ES index
        # IMPORTANT: the collection in last_build['target'] does not contain _timestamp field,
        #            only the "genedoc_*_current" collection does. When "timestamp" is enabled
        #            in mappings, last_build['target'] collection won't be indexed by ES correctly,
        #            therefore, we use "genedoc_*_current" collection as the source here:
        #target_collection = last_build['target']
        target_collection = "genedoc_{}_current".format(build_config)
        _db = get_target_db()
        target_collection = _db[target_collection]
        logging.info("")
        logging.info('Source: %s' % target_collection.name)
        # Attach source version, build stats and timestamp as mapping metadata.
        _mapping = self.get_mapping()
        _meta = {}
        src_version = self.get_src_version()
        if src_version:
            _meta['src_version'] = src_version
        if getattr(self, '_stats', None):
            _meta['stats'] = self._stats
        if 'timestamp' in last_build:
            _meta['timestamp'] = last_build['timestamp']
        if _meta:
            _mapping['_meta'] = _meta
        es_index_name = es_index_name or target_collection.name
        es_idxer = ESIndexer(mapping=_mapping,
                             es_index_name=es_index_name,
                             es_host=es_host,
                             step=5000)
        if build_config == 'mygene_allspecies':
            es_idxer.number_of_shards = 10   # default 5
        es_idxer.check()
        if noconfirm or ask("Continue to build ES index?") == 'Y':
            es_idxer.use_parallel = use_parallel
            #es_idxer.s = 609000
            if es_idxer.exists_index(es_idxer.ES_INDEX_NAME):
                if noconfirm or ask('Index "{}" exists. Delete?'.format(es_idxer.ES_INDEX_NAME)) == 'Y':
                    es_idxer.conn.indices.delete(es_idxer.ES_INDEX_NAME)
                else:
                    logging.info("Abort.")
                    return
            es_idxer.create_index()
            #es_idxer.delete_index_type(es_idxer.ES_INDEX_TYPE, noconfirm=True)
            es_idxer.build_index(target_collection, verbose=False)
Esempio n. 23
0
class ESSyncer():
    """Apply a precomputed diff (add / delete / update) to an ES index.

    ``add``, ``delete``, ``update`` and ``update1`` are generators yielding
    action dicts (with ``_op_type``, ``_index``, ``_type``, ``_id`` keys);
    ``main`` feeds them to the ``bulk`` helper.
    """

    def __init__(self, index=None, doc_type=None, es_host=None, step=5000):
        """Create the ES client, the ESIndexer helper and the source DB handle.

        :param index: index name; falls back to ``config.ES_INDEX_NAME``.
        :param doc_type: doc type; falls back to ``config.ES_DOC_TYPE``.
        :param es_host: host passed to ``get_es`` and to ``ESIndexer``.
        :param step: stored as ``self.step``.
        """
        self._es = get_es(es_host)
        self._index = index or config.ES_INDEX_NAME
        self._doc_type = doc_type or config.ES_DOC_TYPE
        # Use the same host for the indexer as for the bulk client, so that
        # existence checks and writes hit the same cluster.
        self._esi = ESIndexer(es_host=es_host)
        self._esi._index = self._index
        self._src = get_src_db()
        self.step = step

    def add(self, collection, ids):
        """Yield one action per id: 'update' if it is indexed, else 'create'.

        Existence is checked in chunks of 100 ids via ``self._esi.mexists``;
        the document body comes from ``self._src[collection].find_one``.
        The update / create counts are printed when the generator finishes.
        """
        # compare id_list with current index, get list of ids with true/false indicator
        id_list = []
        id_list_all = []
        for _id in ids:
            id_list.append(_id)
            if len(id_list) == 100:
                id_list_all += self._esi.mexists(id_list, verbose=False)
                id_list = []
        # Flush the last, possibly shorter, chunk.
        if id_list:
            id_list_all += self._esi.mexists(id_list, verbose=False)
        cnt_update = 0
        cnt_create = 0
        for _id, _exists in id_list_all:
            _doc = self._src[collection].find_one({'_id': _id})
            # case one: this id exists in current index, then just update
            if _exists:
                es_info = {
                    '_op_type': 'update',
                    '_index': self._index,
                    '_type': self._doc_type,
                    "_id": _id,
                    'doc': _doc
                }
                cnt_update += 1
            # case two: this id not exists in current index, then create a new one
            else:
                es_info = {
                    '_op_type': 'create',
                    '_index': self._index,
                    '_type': self._doc_type,
                    "_id": _id,
                    '_source': _doc
                }
                cnt_create += 1
            yield es_info
        print('items updated: ', cnt_update)
        print('items newly created: ', cnt_create)

    def delete(self, field, ids):
        """Yield an action per indexed id that removes ``field`` from it.

        A doc whose keys are exactly ``{field}`` or
        ``{field, 'snpeff', 'vcf'}`` is deleted; any other doc gets a
        scripted update removing ``field``. Ids missing from the index are
        reported on stdout and skipped.
        """
        cnt_update = 0
        cnt_delete = 0
        for _id in ids:
            # get doc from index based on id
            if self._esi.exists(_id):
                doc = self._esi.get_variant(_id)['_source']
                doc_fields = set(doc)
                # case one: only exist target field, or target field/snpeff/vcf, then we need to delete this item
                if doc_fields == set([field]) or doc_fields == set(
                        [field, 'snpeff', 'vcf']):
                    es_info = {
                        '_op_type': 'delete',
                        '_index': self._index,
                        '_type': self._doc_type,
                        "_id": _id,
                    }
                    cnt_delete += 1
                # case two: exists fields other than snpeff, vcf and target field
                else:
                    # get rid of the target field, delete original doc, update the new doc
                    # plus count
                    es_info = {
                        '_op_type': 'update',
                        '_index': self._index,
                        '_type': self._doc_type,
                        "_id": _id,
                        "script": 'ctx._source.remove("{}")'.format(field)
                    }
                    cnt_update += 1
                yield es_info
            else:
                print('id not exists: ', _id)
        print('items updated: ', cnt_update)
        print('items deleted: ', cnt_delete)

    def _update_one(self, _id, _patch):
        """Return an 'index' action holding the stored doc with ``_patch`` applied."""
        doc = self._esi.get_variant(_id)['_source']
        doc = apply_patch(doc, _patch)
        es_info = {
            '_op_type': 'index',
            '_index': self._index,
            '_type': self._doc_type,
            "_id": _id,
            '_source': doc
        }
        return es_info

    def update(self, id_patchs):
        """Yield a re-index action for every ``{'_id', 'patch'}`` entry that is indexed.

        Entries whose id is not in the index are reported on stdout.
        """
        for _id_patch in id_patchs:
            _id = _id_patch['_id']
            _patch = _id_patch['patch']
            if self._esi.exists(_id):
                _es_info = self._update_one(_id, _patch)
                yield _es_info
            else:
                print('id not exists:', _id)

    def update1(self, id_patchs):
        """Like :meth:`update`, but delete the stored doc before yielding its replacement."""
        for _id_patch in id_patchs:
            _id = _id_patch['_id']
            _patch = _id_patch['patch']
            if self._esi.exists(_id):
                _es_info = self._update_one(_id, _patch)
                self._esi.delete_doc(_id)
                yield _es_info
            else:
                print('id not exists:', _id)

    def main(self, index, collection, diff_filepath, validate=False, wait=60):
        """Apply the diff stored at ``diff_filepath`` to ``index``.

        The loaded diff object is read for its 'source', 'add', 'delete' and
        'update' entries. Adds, deletes and updates are sent with ``bulk``
        in that order, each step timed on stdout.

        :param index: ES index to modify (replaces the one set in __init__).
        :param collection: field name removed for 'delete' ids; also the
            field checked during validation.
        :param validate: when true, wait ``wait`` seconds, dump the indexed
            ``collection`` field into a temporary source collection and diff
            it against the diff's source collection.
        :returns: the validation diff result when ``validate`` is true,
            otherwise ``None``.
        """
        self._index = index
        self._esi._index = index
        diff = loadobj(diff_filepath)
        source_collection = diff['source']
        add_list = self.add(source_collection, diff['add'])
        delete_list = self.delete(collection, diff['delete'])
        update_list = self.update(diff['update'])
        t00 = time()
        print('Adding new {} docs...'.format(len(diff['add'])))
        t0 = time()
        bulk(self._es, add_list)
        print("Done. [{}]".format(timesofar(t0)))
        print('Deleting {} docs'.format(len(diff['delete'])))
        t0 = time()
        bulk(self._es, delete_list)
        print("Done. [{}]".format(timesofar(t0)))
        print('Updating {} docs'.format(len(diff['update'])))
        t0 = time()
        bulk(self._es, update_list)
        print("Done. [{}]".format(timesofar(t0)))
        print("=" * 20)
        print("Finished! [{}]".format(timesofar(t00)))
        if validate:
            print('Waiting {}s to let ES to finish...'.format(wait), end="")
            sleep(wait)
            print("Done.")
            print("Validating...")
            t0 = time()
            # Select every indexed doc that has the ``collection`` field.
            q = {
                "query": {
                    "constant_score": {
                        "filter": {
                            "exists": {
                                "field": collection
                            }
                        }
                    }
                }
            }
            data = self._esi.doc_feeder(query=q, _source=collection)
            temp_collection = collection + '_temp_' + get_random_string()
            self._src[temp_collection].drop()
            load_source(temp_collection, src_data=data)
            c1 = get_backend(source_collection, 'mongodb')
            c2 = get_backend(temp_collection, 'mongodb')
            diff_result = diff_collections(c1, c2, use_parallel=False)
            self._src[temp_collection].drop()
            # Report elapsed time like the other steps, not the raw start
            # timestamp.
            print("Done. [{}]".format(timesofar(t0)))
            return diff_result
Esempio n. 24
0
def build_index(config, use_parallel=True, noconfirm=False):
    """Build the ``genedoc_<name>`` ES index from the picked target collection.

    A ``DataBuilder`` is loaded with ``config`` and asked for its target
    collection.  If one is available, an ``ESIndexer`` is configured for the
    index ``'genedoc_' + <build config name>`` and, once confirmed, the index
    is created, the existing index type is deleted, the collection is indexed
    and the index is optimized.

    Args:
        config: build configuration handed to ``DataBuilder.load_build_config``.
        use_parallel: stored on the indexer as ``use_parallel``.
        noconfirm: when true, skip the interactive "Continue?" prompt; the
            flag is also forwarded to ``delete_index_type``.

    Returns:
        None.  Progress and error messages are printed.
    """
    bdr = DataBuilder(backend='mongodb')
    bdr.load_build_config(config)
    target_collection = bdr.pick_target_collection()
    target_es_index = 'genedoc_' + bdr._build_config['name']

    if not target_collection:
        print("Error: target collection is not ready yet or failed to build.")
        return

    es_idxer = ESIndexer(mapping=bdr.get_mapping())
    es_idxer.ES_INDEX_NAME = target_es_index
    es_idxer.step = 10000
    es_idxer.use_parallel = use_parallel
    es_server = es_idxer.conn.servers[0].geturl()
    # Single-argument print(...) emits the same text on Python 2 and 3.
    print("ES target: {}/{}/{}".format(es_server,
                                       es_idxer.ES_INDEX_NAME,
                                       es_idxer.ES_INDEX_TYPE))

    # Honour noconfirm for the prompt too, the same way sync_index does;
    # otherwise unattended runs would still block on user input.
    if not (noconfirm or ask("Continue?") == 'Y'):
        print("Aborted.")
        return

    es_idxer.create_index()
    es_idxer.delete_index_type(es_idxer.ES_INDEX_TYPE, noconfirm=noconfirm)
    es_idxer.build_index(target_collection, verbose=False)
    es_idxer.optimize()
Esempio n. 25
0
class ESSyncer:
    """Push a pre-computed diff of a source collection into an ES index.

    The diff is loaded with ``loadobj`` and is expected to provide the keys
    ``'source'``, ``'add'``, ``'delete'`` and ``'update'``.  Each part is
    turned into a lazy stream of bulk action dicts (``_op_type``, ``_index``,
    ``_type``, ``_id`` plus a payload) which ``main`` hands to ``bulk``.
    NOTE(review): ``bulk`` is presumably ``elasticsearch.helpers.bulk`` --
    confirm against the file's imports.
    """

    # Number of ids sent to ``mexists`` per call while classifying additions.
    _EXISTS_BATCH_SIZE = 100

    def __init__(self, index=None, doc_type=None, es_host=None, step=5000):
        """Set up the ES client, the indexer helper and the source database.

        Args:
            index: target index name; falls back to ``config.ES_INDEX_NAME``.
            doc_type: document type; falls back to ``config.ES_DOC_TYPE``.
            es_host: passed to ``get_es`` to obtain the ES client.
            step: stored as ``self.step`` (not used inside this class).
        """
        self._es = get_es(es_host)
        self._index = index or config.ES_INDEX_NAME
        self._doc_type = doc_type or config.ES_DOC_TYPE
        self._esi = ESIndexer()
        self._esi._index = self._index
        self._src = get_src_db()
        self.step = step

    def _action(self, op_type, _id, **payload):
        """Return a bulk action dict for ``_id`` with the shared header keys."""
        action = {
            '_op_type': op_type,
            '_index': self._index,
            '_type': self._doc_type,
            "_id": _id,
        }
        action.update(payload)
        return action

    def add(self, collection, ids):
        """Yield bulk actions that add documents from ``collection``.

        All ids are first checked against the current index in batches via
        ``self._esi.mexists``, which must return ``(_id, exists)`` pairs.
        Ids already indexed yield an ``update`` action carrying the source
        document under ``'doc'``; the others yield a ``create`` action
        carrying it under ``'_source'``.  Counters are printed once the
        generator is exhausted.

        Args:
            collection: name of the source collection in ``self._src``.
            ids: iterable of document ids to add.
        """
        id_exists = []
        batch = []
        for _id in ids:
            batch.append(_id)
            if len(batch) == self._EXISTS_BATCH_SIZE:
                id_exists += self._esi.mexists(batch, verbose=False)
                batch = []
        if batch:
            # Flush the last, partially filled batch.
            id_exists += self._esi.mexists(batch, verbose=False)

        cnt_update = 0
        cnt_create = 0
        for _id, _exists in id_exists:
            src_doc = self._src[collection].find_one({'_id': _id})
            if _exists:
                # Already in the index: partial update with the source doc.
                cnt_update += 1
                yield self._action('update', _id, doc=src_doc)
            else:
                # Not in the index yet: create a new document.
                cnt_create += 1
                yield self._action('create', _id, _source=src_doc)
        print('items updated: ', cnt_update)
        print('items newly created: ', cnt_create)

    def delete(self, field, ids):
        """Yield bulk actions that remove ``field`` from the given documents.

        A document whose top-level keys are exactly ``field``, or exactly
        ``field`` plus ``'snpeff'`` and ``'vcf'``, is deleted as a whole.
        Any other document gets a scripted update that removes only
        ``field``.  Ids missing from the index are reported and skipped.
        Counters are printed once the generator is exhausted.

        Args:
            field: top-level field name to remove.
            ids: iterable of document ids to process.
        """
        cnt_update = 0
        cnt_delete = 0
        # Key sets for which nothing worth keeping remains after removal.
        disposable = ({field}, {field, 'snpeff', 'vcf'})
        for _id in ids:
            if not self._esi.exists(_id):
                print('id not exists: ', _id)
                continue
            doc = self._esi.get_variant(_id)['_source']
            if set(doc) in disposable:
                cnt_delete += 1
                yield self._action('delete', _id)
            else:
                cnt_update += 1
                yield self._action(
                    'update', _id,
                    script='ctx._source.remove("{}")'.format(field))
        print('items updated: ', cnt_update)
        print('items deleted: ', cnt_delete)

    def _update_one(self, _id, _patch):
        """Return an ``index`` action holding the indexed doc with ``_patch`` applied."""
        doc = self._esi.get_variant(_id)['_source']
        doc = apply_patch(doc, _patch)
        return self._action('index', _id, _source=doc)

    def _patched_actions(self, id_patchs, delete_first=False):
        """Yield one ``index`` action per patch entry whose id is indexed.

        Each entry must provide ``'_id'`` and ``'patch'``.  When
        ``delete_first`` is true the existing document is removed through
        ``self._esi.delete_doc`` after the patched action has been built and
        before it is yielded.  Unknown ids are reported and skipped.
        """
        for entry in id_patchs:
            _id = entry['_id']
            _patch = entry['patch']
            if not self._esi.exists(_id):
                print('id not exists:', _id)
                continue
            action = self._update_one(_id, _patch)
            if delete_first:
                self._esi.delete_doc(_id)
            yield action

    def update(self, id_patchs):
        """Return a generator of ``index`` actions for the patched documents.

        Args:
            id_patchs: iterable of dicts with ``'_id'`` and ``'patch'`` keys.
        """
        return self._patched_actions(id_patchs)

    def update1(self, id_patchs):
        """Like ``update``, but delete each existing document before yielding.

        Args:
            id_patchs: iterable of dicts with ``'_id'`` and ``'patch'`` keys.
        """
        return self._patched_actions(id_patchs, delete_first=True)

    def _validate(self, collection, source_collection, wait):
        """Compare the index content for ``collection`` with its source.

        Sleeps ``wait`` seconds, dumps every indexed document that has the
        ``collection`` field into a temporary collection, diffs it against
        ``source_collection`` and returns the ``diff_collections`` result.
        """
        print('Waiting {}s to let ES to finish...'.format(wait), end="")
        sleep(wait)
        print("Done.")
        print("Validating...")
        t0 = time()
        q = {
            "query": {
                "constant_score": {
                    "filter": {
                        "exists": {
                            "field": collection
                        }
                    }
                }
            }
        }
        data = self._esi.doc_feeder(query=q, _source=collection)
        temp_collection = collection + '_temp_' + get_random_string()
        self._src[temp_collection].drop()
        try:
            load_source(temp_collection, src_data=data)
            c1 = get_backend(source_collection, 'mongodb')
            c2 = get_backend(temp_collection, 'mongodb')
            diff_result = diff_collections(c1, c2, use_parallel=False)
        finally:
            # Never leave the scratch collection behind, even on failure.
            self._src[temp_collection].drop()
        print("Done. [{}]".format(timesofar(t0)))
        return diff_result

    def main(self, index, collection, diff_filepath, validate=False, wait=60):
        """Apply the diff stored at ``diff_filepath`` to ``index``.

        The add, delete and update parts of the diff are sent to ES in that
        order, each with its own ``bulk`` call, and timing is printed.

        Args:
            index: index name to operate on; replaces the instance's index.
            collection: field name used for the delete step and for
                validation.
            diff_filepath: path handed to ``loadobj`` to load the diff.
            validate: when true, re-read the index afterwards and diff it
                against the diff's ``'source'`` collection.
            wait: seconds to sleep before validating.

        Returns:
            The ``diff_collections`` result when ``validate`` is true,
            otherwise ``None``.
        """
        self._index = index
        self._esi._index = index
        diff = loadobj(diff_filepath)
        source_collection = diff['source']
        # The action streams are lazy generators; nothing touches ES until
        # ``bulk`` consumes them below.
        phases = (
            ('Adding new {} docs...', diff['add'],
             self.add(source_collection, diff['add'])),
            ('Deleting {} docs', diff['delete'],
             self.delete(collection, diff['delete'])),
            ('Updating {} docs', diff['update'],
             self.update(diff['update'])),
        )
        t00 = time()
        for banner, items, actions in phases:
            print(banner.format(len(items)))
            t0 = time()
            bulk(self._es, actions)
            print("Done. [{}]".format(timesofar(t0)))
        print("="*20)
        print("Finished! [{}]".format(timesofar(t00)))
        if validate:
            return self._validate(collection, source_collection, wait)