def clone_index(createidx=False, test=True):
    """Copy all 'variant' docs from 'myvariant_all_1' into 'myvariant_current_3'.

    The function is a no-op unless ``test`` is explicitly set to a false
    value; the default (``test=True``) returns immediately.

    :param createidx: when true, first create the target index (10 shards)
                      and install the mapping returned by
                      ``mapping.get_mapping()``.
    :param test: safety switch; when true nothing is done.

    Relies on ``es`` and ``do_index`` being available in the enclosing scope.
    """
    if test:
        return

    # Project-local imports are deferred so the guarded path stays free of them.
    from utils.es import ESIndexer
    from utils.common import iter_n

    target_index = 'myvariant_current_3'
    batch_size = 10000

    if createidx:
        from mapping import get_mapping
        variant_mapping = get_mapping()
        index_settings = {'settings': {'number_of_shards': 10}}    # ###
        es.indices.create(target_index, body=index_settings)
        es.indices.put_mapping(index=target_index, doc_type='variant',
                               body=variant_mapping)

    # Disabled alternative, kept from the original:
    # helpers.reindex(es, source_index='myvariant_all',
    #                 target_index= new_idx, chunk_size=10000)
    indexer = ESIndexer()
    source_docs = indexer.doc_feeder(index='myvariant_all_1',
                                     doc_type='variant',
                                     step=batch_size)
    for batch in iter_n(source_docs, batch_size):
        do_index(batch,
                 index_name=target_index,
                 doc_type='variant',
                 step=batch_size,
                 verbose=False,
                 update=True)
def doc_iterator(self, genedoc_d, batch=True, step=10000, validate=True):
    """Iterate over gene documents, one at a time or in lists of ``step``.

    :param genedoc_d: either a generator of documents or a dict-like object
                      mapping ``_id`` -> document dict.
    :param batch: when true, yield lists of documents; otherwise yield
                  documents one by one.
    :param step: maximum number of documents per yielded list.
    :param validate: when true, call ``validate()`` on every document built
                     from a dict-like input.

    Generator input (batch mode only) is chunked with ``iter_n`` and passed
    through untouched: no copy, no ``_id`` injection, no validation.

    For dict-like input, each value gets its key stored under ``'_id'``
    (the input document is modified in place), and the yielded object is a
    shallow copy of ``self`` that has been cleared and filled with the
    document's content.

    An empty trailing batch is never yielded, so an empty input, or an input
    whose size is an exact multiple of ``step``, does not produce a
    spurious ``[]``.

    NOTE: a generator passed with ``batch=False`` is not supported; it falls
    through to the dict-like path and fails on ``.items()``.
    """
    if batch and isinstance(genedoc_d, types.GeneratorType):
        for doc_li in iter_n(genedoc_d, n=step):
            yield doc_li
        return

    doc_li = []
    for count, (_id, doc) in enumerate(genedoc_d.items(), 1):
        doc['_id'] = _id
        # Reuse self's type (and any instance attributes) for the new doc,
        # but start from empty content.
        _doc = copy.copy(self)
        _doc.clear()
        _doc.update(doc)
        if validate:
            _doc.validate()

        if not batch:
            yield _doc
            continue

        doc_li.append(_doc)
        if count % step == 0:
            yield doc_li
            doc_li = []

    # Flush the remainder only if there is one.
    if doc_li:
        yield doc_li
def _add_docs(ids):
    """Copy the documents with the given ids from ``src`` into ``target``.

    Ids are processed in chunks of ``step``. Every fetched document gets
    its ``'_timestamp'`` field set to ``_timestamp`` before the chunk is
    inserted. After each chunk, the running document count and the time
    spent on that chunk are printed.

    ``step``, ``src``, ``target`` and ``_timestamp`` come from the
    enclosing scope.
    """
    processed = 0
    for id_chunk in iter_n(ids, step):
        chunk_started = time.time()
        fetched = src.mget_from_ids(id_chunk)
        # enumerate() continues the running count across chunks; an empty
        # chunk leaves it unchanged.
        for processed, doc in enumerate(fetched, processed + 1):
            doc['_timestamp'] = _timestamp
        target.insert(fetched)
        print('\t{}\t{}'.format(processed, timesofar(chunk_started)))
def verify_ids(doc_iter, step=100000, index=None, doc_type=None):
    '''Check how many docs from the input iterator/list already exist.

    The input is consumed in batches of ``step``; for each batch an ``ids``
    query is sent to the index and a progress line is printed with
    (hits in this batch, hits so far, ids checked so far).

    :param doc_iter: iterable of docs, each exposing an ``'_id'`` key.
    :param step: number of docs per query.
    :param index: index name; falls back to ``config.ES_INDEX_NAME``.
    :param doc_type: doc type; falls back to ``config.ES_DOC_TYPE``.
    :return: list of ``_id`` values collected from the returned hits.

    Uses the ``es`` client from the enclosing scope.

    NOTE(review): the search is issued without an explicit ``size``, so the
    hits actually returned (and therefore the result list) may be capped by
    the server's default page size while the printed totals are not -
    confirm against the Elasticsearch version in use.
    '''
    if not index:
        index = config.ES_INDEX_NAME
    if not doc_type:
        doc_type = config.ES_DOC_TYPE

    matched_ids = []
    checked_total = 0
    matched_total = 0
    for batch in iter_n(doc_iter, n=step):
        batch_ids = [doc['_id'] for doc in batch]
        # id_li = [doc['_id'].replace('chr', '') for doc in doc_batch]
        query = {'query': {'ids': {"values": batch_ids}}}
        response = es.search(index=index, doc_type=doc_type, body=query,
                             _source=False)
        hits = response['hits']
        matched_total += hits['total']
        checked_total += len(batch_ids)
        print(hits['total'], matched_total, checked_total)
        matched_ids.extend(hit['_id'] for hit in hits['hits'])
    return matched_ids
def verify_ids(doc_iter, step=100000, index=None, doc_type=None):
    '''Check how many docs from the input iterator/list already exist.

    The input is consumed in batches of ``step``; each batch is looked up
    with a single ``ids`` query and a progress line is printed with
    (hits in this batch, hits so far, ids checked so far).

    :param doc_iter: iterable of docs, each exposing an ``'_id'`` key.
    :param step: number of docs per query.
    :param index: index name; falls back to ``config.ES_INDEX_NAME``.
    :param doc_type: doc type; falls back to ``config.ES_DOC_TYPE``.
    :return: list of ``_id`` values collected from the returned hits.

    NOTE(review): the search is issued without an explicit ``size``, so the
    hits actually returned (and therefore the result list) may be capped by
    the server's default page size while the printed totals are not -
    confirm against the Elasticsearch version in use.
    '''
    index = config.ES_INDEX_NAME if not index else index
    doc_type = config.ES_DOC_TYPE if not doc_type else doc_type
    client = get_es()

    overlapping = []
    n_checked = 0
    n_found = 0
    for chunk in iter_n(doc_iter, n=step):
        chunk_ids = [doc['_id'] for doc in chunk]
        # id_li = [doc['_id'].replace('chr', '') for doc in doc_batch]
        body = {'query': {'ids': {"values": chunk_ids}}}
        result = client.search(index=index, doc_type=doc_type, body=body,
                               _source=False)
        n_found += result['hits']['total']
        n_checked += len(chunk_ids)
        print(result['hits']['total'], n_found, n_checked)
        for hit in result['hits']['hits']:
            overlapping.append(hit['_id'])
    return overlapping
def apply_changes(self, changes):
    """Apply a set of add/delete/update changes to the target collection.

    :param changes: mapping with the keys
        ``'source'``    - name of the source collection in ``self._db``;
        ``'timestamp'`` - value written to ``'_timestamp'`` on every added
                          or updated document;
        ``'add'``       - ids to fetch from the source and insert;
        ``'delete'``    - ids to remove from the target;
        ``'update'``    - diffs handed to ``update_diff`` one by one.

    Each non-empty section is processed in that order (add, delete, update)
    and its duration is logged. Ids are handled in chunks of ``self.step``.
    """
    chunk_size = self.step
    source_backend = GeneDocMongoDBBackend(self._db[changes['source']])
    target_backend = GeneDocMongoDBBackend(self._target_col)
    stamp = changes['timestamp']

    started = time.time()

    new_ids = changes['add']
    if new_ids:
        logging.info("Adding {} new docs...".format(len(new_ids)))
        section_started = time.time()
        for id_chunk in iter_n(new_ids, chunk_size):
            docs = source_backend.mget_from_ids(id_chunk)
            for doc in docs:
                doc['_timestamp'] = stamp
            target_backend.insert(docs)
        logging.info("done. [{}]".format(timesofar(section_started)))

    gone_ids = changes['delete']
    if gone_ids:
        logging.info("Deleting {} discontinued docs...".format(len(gone_ids)))
        section_started = time.time()
        target_backend.remove_from_ids(gone_ids, step=chunk_size)
        logging.info("done. [{}]".format(timesofar(section_started)))

    diffs = changes['update']
    if diffs:
        logging.info("Updating {} existing docs...".format(len(diffs)))
        section_started = time.time()
        lap_started = time.time()
        for done, diff in enumerate(diffs, 1):
            target_backend.update_diff(diff, extra={'_timestamp': stamp})
            # Progress line every `chunk_size` updates (never after the first).
            if done > 1 and done % chunk_size == 0:
                logging.info('\t{}\t{}'.format(done, timesofar(lap_started)))
                lap_started = time.time()
        logging.info("done. [{}]".format(timesofar(section_started)))

    logging.info("\n")
    logging.info("Finished. %s" % timesofar(started))
def apply_changes(self, changes):
    """Bring the target collection in line with a computed change set.

    :param changes: mapping providing
        ``'source'``    - key of the source collection inside ``self._db``;
        ``'timestamp'`` - stamp stored as ``'_timestamp'`` on added and
                          updated documents;
        ``'add'``       - ids whose documents are copied from the source;
        ``'delete'``    - ids removed from the target;
        ``'update'``    - diffs applied through ``update_diff``.

    Empty sections are skipped. Sections run in the order add, delete,
    update; per-section and overall timings are written to the log.
    """
    step = self.step
    target = GeneDocMongoDBBackend(self._target_col)
    src = GeneDocMongoDBBackend(self._db[changes['source']])
    ts = changes['timestamp']
    overall_t0 = time.time()

    # --- additions: fetch from source, stamp, insert into target ---
    additions = changes['add']
    if additions:
        logging.info("Adding {} new docs...".format(len(additions)))
        phase_t0 = time.time()
        for ids_batch in iter_n(additions, step):
            fetched_docs = src.mget_from_ids(ids_batch)
            for fetched in fetched_docs:
                fetched['_timestamp'] = ts
            target.insert(fetched_docs)
        logging.info("done. [{}]".format(timesofar(phase_t0)))

    # --- deletions: the backend handles chunking itself ---
    deletions = changes['delete']
    if deletions:
        logging.info("Deleting {} discontinued docs...".format(len(deletions)))
        phase_t0 = time.time()
        target.remove_from_ids(deletions, step=step)
        logging.info("done. [{}]".format(timesofar(phase_t0)))

    # --- updates: apply diffs one by one with periodic progress logging ---
    updates = changes['update']
    if updates:
        logging.info("Updating {} existing docs...".format(len(updates)))
        phase_t0 = time.time()
        applied = 0
        window_t0 = time.time()
        for diff in updates:
            target.update_diff(diff, extra={'_timestamp': ts})
            applied += 1
            if applied <= 1 or applied % step != 0:
                continue
            logging.info('\t{}\t{}'.format(applied, timesofar(window_t0)))
            window_t0 = time.time()
        logging.info("done. [{}]".format(timesofar(phase_t0)))

    logging.info("\n")
    logging.info("Finished. %s" % timesofar(overall_t0))