def sync_index(config, use_parallel=True, noconfirm=False):
    """Diff a MongoDB target collection against its ES index for one build config.

    Loads *config* into a DataBuilder, lets the user pick the target
    collection, wires up a MongoDB source backend and an ES target backend,
    and (after confirmation unless *noconfirm*) returns the changes found by
    ``diff.diff_collections``.
    """
    bdr = DataBuilder(backend='mongodb')
    bdr.load_build_config(config)
    target_collection = bdr.pick_target_collection()
    target_es_index = 'genedoc_' + bdr._build_config['name']
    sync_src = backend.GeneDocMongoDBBackend(target_collection)
    es_idxer = ESIndexer(bdr.get_mapping())
    es_idxer.ES_INDEX_NAME = target_es_index
    es_idxer.step = 10000
    es_idxer.use_parallel = use_parallel
    sync_target = backend.GeneDocESBackend(es_idxer)
    # print() calls (not py2 print statements) for consistency with the rest
    # of the module, which uses the print function.
    print('\tsync_src:\t{:<40}{}\t{}'.format(target_collection.name,
                                             sync_src.name,
                                             sync_src.count()))
    print('\tsync_target\t{:<40}{}\t{}'.format(target_es_index,
                                               sync_target.name,
                                               sync_target.count()))
    if noconfirm or ask("Continue?") == "Y":
        changes = diff.diff_collections(sync_src, sync_target)
        return changes
def get_changes(self, source_col, use_parallel=True):
    """Diff *source_col* against this object's target collection.

    *source_col* may be a collection name (looked up in ``self._db``) or a
    collection object.  Returns the ``diff_collections`` result, annotated
    with the source collection name and its timestamp when non-empty.
    """
    # Resolve a string name into the actual collection object.
    if is_str(source_col):
        source_col = self._db[source_col]
    src_backend = GeneDocMongoDBBackend(source_col)
    tgt_backend = GeneDocMongoDBBackend(self._target_col)
    result = diff_collections(tgt_backend, src_backend,
                              use_parallel=use_parallel, step=self.step)
    if result:
        result['source'] = source_col.name
        result['timestamp'] = _get_timestamp(source_col.name)
    return result
def main(s1, s2, p=False):
    """Diff two MongoDB collections.

    Each of *s1*/*s2* is a ``(connection_uri, db_name, collection_name)``
    triple; *p* toggles parallel diffing.  Returns the changes dict.
    """
    uri_a, db_a, name_a = s1
    uri_b, db_b, name_b = s2
    backend_a = GeneDocMongoDBBackend(MongoClient(uri_a)[db_a][name_a])
    backend_b = GeneDocMongoDBBackend(MongoClient(uri_b)[db_b][name_b])
    return diff.diff_collections(backend_a, backend_b, use_parallel=p, step=1000)
def sync_index(self, use_parallel=True):
    """Diff this builder's target collection against its ES index.

    Builds an ES indexer from the current mapping (index named after the
    target collection) and returns ``diff.diff_collections`` between the
    MongoDB source and the ES backend.
    """
    from utils import diff
    source = self.get_target_collection()
    indexer = ESIndexer(self.get_mapping())
    indexer.ES_INDEX_NAME = source.target_collection.name
    indexer.step = 10000
    indexer.use_parallel = use_parallel
    target = databuild.backend.GeneDocESBackend(indexer)
    return diff.diff_collections(source, target)
def main(self, index, collection, diff_filepath, validate=False, wait=60):
    """Apply a saved diff file to the ES *index*, then optionally validate.

    Loads the diff from *diff_filepath*, bulk-applies its ``add``/``delete``/
    ``update`` operations to ES, and — if *validate* — waits *wait* seconds
    for ES to settle, dumps the indexed docs into a temporary MongoDB
    collection, and re-diffs it against the original source collection.
    Returns the validation diff result (or ``None`` when not validating).
    """
    self._index = index
    self._esi._index = index
    diff = loadobj(diff_filepath)
    source_collection = diff['source']
    add_list = self.add(source_collection, diff['add'])
    delete_list = self.delete(collection, diff['delete'])
    update_list = self.update(diff['update'])
    t00 = time()
    print('Adding new {} docs...'.format(len(diff['add'])))
    t0 = time()
    bulk(self._es, add_list)
    print("Done. [{}]".format(timesofar(t0)))
    print('Deleting {} docs'.format(len(diff['delete'])))
    t0 = time()
    bulk(self._es, delete_list)
    print("Done. [{}]".format(timesofar(t0)))
    print('Updating {} docs'.format(len(diff['update'])))
    t0 = time()
    bulk(self._es, update_list)
    print("Done. [{}]".format(timesofar(t0)))
    print("=" * 20)
    print("Finished! [{}]".format(timesofar(t00)))
    if validate:
        print('Waiting {}s to let ES to finish...'.format(wait), end="")
        sleep(wait)
        print("Done.")
        print("Validating...")
        t0 = time()
        # Only docs that actually carry the source field are compared.
        q = {
            "query": {
                "constant_score": {
                    "filter": {
                        "exists": {
                            "field": collection
                        }
                    }
                }
            }
        }
        data = self._esi.doc_feeder(query=q, _source=collection)
        temp_collection = collection + '_temp_' + get_random_string()
        self._src[temp_collection].drop()
        load_source(temp_collection, src_data=data)
        c1 = get_backend(source_collection, 'mongodb')
        c2 = get_backend(temp_collection, 'mongodb')
        diff_result = diff_collections(c1, c2, use_parallel=False)
        self._src[temp_collection].drop()
        # BUGFIX: was "format(t0)", which printed the raw start timestamp
        # instead of the elapsed time like every other timing line here.
        print("Done. [{}]".format(timesofar(t0)))
        return diff_result
def main(self, index, collection, diff_filepath, validate=False, wait=60):
    """Apply a saved diff file to the ES *index*, then optionally validate.

    Loads the diff from *diff_filepath*, bulk-applies its ``add``/``delete``/
    ``update`` operations to ES, and — if *validate* — waits *wait* seconds
    for ES to settle, dumps the indexed docs into a temporary MongoDB
    collection, and re-diffs it against the original source collection.
    Returns the validation diff result (or ``None`` when not validating).
    """
    self._index = index
    self._esi._index = index
    diff = loadobj(diff_filepath)
    source_collection = diff['source']
    add_list = self.add(source_collection, diff['add'])
    delete_list = self.delete(collection, diff['delete'])
    update_list = self.update(diff['update'])
    t00 = time()
    print('Adding new {} docs...'.format(len(diff['add'])))
    t0 = time()
    bulk(self._es, add_list)
    print("Done. [{}]".format(timesofar(t0)))
    print('Deleting {} docs'.format(len(diff['delete'])))
    t0 = time()
    bulk(self._es, delete_list)
    print("Done. [{}]".format(timesofar(t0)))
    print('Updating {} docs'.format(len(diff['update'])))
    t0 = time()
    bulk(self._es, update_list)
    print("Done. [{}]".format(timesofar(t0)))
    print("=" * 20)
    print("Finished! [{}]".format(timesofar(t00)))
    if validate:
        print('Waiting {}s to let ES to finish...'.format(wait), end="")
        sleep(wait)
        print("Done.")
        print("Validating...")
        t0 = time()
        # Only docs that actually carry the source field are compared.
        q = {
            "query": {
                "constant_score": {
                    "filter": {
                        "exists": {
                            "field": collection
                        }
                    }
                }
            }
        }
        data = self._esi.doc_feeder(query=q, _source=collection)
        temp_collection = collection + '_temp_' + get_random_string()
        self._src[temp_collection].drop()
        load_source(temp_collection, src_data=data)
        c1 = get_backend(source_collection, 'mongodb')
        c2 = get_backend(temp_collection, 'mongodb')
        diff_result = diff_collections(c1, c2, use_parallel=False)
        self._src[temp_collection].drop()
        # BUGFIX: was "format(t0)", which printed the raw start timestamp
        # instead of the elapsed time like every other timing line here.
        print("Done. [{}]".format(timesofar(t0)))
        return diff_result
def validate(build_config=None):
    """Diff the last build's MongoDB target collection against its ES index.

    Looks up the most recent build record for *build_config* in src_build,
    then diffs the MongoDB target collection against the ES index of the
    same name (reached through a local tunnel).  Returns the changes dict.
    """
    from pprint import pprint
    from utils.diff import diff_collections
    from databuild.backend import GeneDocMongoDBBackend, GeneDocESBackend
    from biothings.utils.mongo import get_src_build, get_target_db
    from utils.es import ESIndexer
    cfg = get_src_build().find_one({'_id': build_config})
    last_build = cfg['build'][-1]
    print("Last build record:")
    pprint(last_build)
    target_name = last_build['target']
    mongo_backend = GeneDocMongoDBBackend(get_target_db()[target_name])
    es_backend = GeneDocESBackend(
        ESIndexer(es_index_name=target_name,
                  es_host='127.0.0.1:' + str(es_local_tunnel_port)))
    return diff_collections(mongo_backend, es_backend,
                            use_parallel=True, step=10000)
def validate(build_config=None):
    """Diff the last build's MongoDB target collection against its ES index.

    Fetches the newest build record for *build_config*, prints it, then
    returns the diff between the MongoDB target collection and the
    identically-named ES index served over the local tunnel port.
    """
    from pprint import pprint
    from utils.diff import diff_collections
    from databuild.backend import GeneDocMongoDBBackend, GeneDocESBackend
    from biothings.utils.mongo import get_src_build, get_target_db
    from utils.es import ESIndexer
    record = get_src_build().find_one({'_id': build_config})['build'][-1]
    print("Last build record:")
    pprint(record)
    name = record['target']
    es_host = '127.0.0.1:' + str(es_local_tunnel_port)
    b1 = GeneDocMongoDBBackend(get_target_db()[name])
    b2 = GeneDocESBackend(ESIndexer(es_index_name=name, es_host=es_host))
    return diff_collections(b1, b2, use_parallel=True, step=10000)
def diff2src(use_parallel=True, noconfirm=False):
    """Interactively pick two genedoc sources (MongoDB or ES) and diff them.

    Lists every ``genedoc*`` MongoDB target collection and ES index, asks the
    user to pick two, then (after confirmation unless *noconfirm*) returns
    the changes from ``diff.diff_collections``.
    """
    src_li = []
    target_db = get_target_db()
    src_li.extend([(name, target_db[name].count(), 'mongodb')
                   for name in sorted(target_db.collection_names())
                   if name.startswith('genedoc')])
    es_idxer = ESIndexer()
    es_idxer.conn.default_indices = []
    for es_idx in es_idxer.conn.indices.get_indices():
        if es_idx.startswith('genedoc'):
            es_idxer.ES_INDEX_NAME = es_idx
            src_li.append((es_idx, es_idxer.count()['count'], 'es'))
    print("Found {} sources:".format(len(src_li)))
    src_1 = _pick_one(src_li, "Pick first source above: ")
    src_li.remove(src_1)
    # BUGFIX: was a bare py2 "print" statement — a SyntaxError under py3,
    # where the rest of this function already uses the print function.
    print()
    src_2 = _pick_one(src_li, "Pick second source above: ")
    sync_li = []
    for src in (src_1, src_2):
        if src[2] == 'mongodb':
            b = backend.GeneDocMongoDBBackend(target_db[src[0]])
        elif src[2] == 'es':
            es_idxer = ESIndexer()
            es_idxer.ES_INDEX_NAME = src[0]
            es_idxer.step = 10000
            b = backend.GeneDocESBackend(es_idxer)
        sync_li.append(b)
    sync_src, sync_target = sync_li
    print('\tsync_src:\t{:<45}{}\t{}'.format(*src_1))
    print('\tsync_target\t{:<45}{}\t{}'.format(*src_2))
    if noconfirm or ask("Continue?") == "Y":
        changes = diff.diff_collections(sync_src, sync_target,
                                        use_parallel=use_parallel)
        return changes
def sync_index(config, use_parallel=True, noconfirm=False):
    """Diff a MongoDB target collection against its ES index for one build config.

    Loads *config*, lets the user pick the target collection, sets up the
    MongoDB source / ES target backends, and — after confirmation unless
    *noconfirm* — returns the ``diff.diff_collections`` result.
    """
    builder = DataBuilder(backend='mongodb')
    builder.load_build_config(config)
    src_collection = builder.pick_target_collection()
    es_index_name = 'genedoc_' + builder._build_config['name']
    mongo_backend = backend.GeneDocMongoDBBackend(src_collection)
    indexer = ESIndexer(builder.get_mapping())
    indexer.ES_INDEX_NAME = es_index_name
    indexer.step = 10000
    indexer.use_parallel = use_parallel
    es_backend = backend.GeneDocESBackend(indexer)
    print('\tsync_src:\t{:<40}{}\t{}'.format(
        src_collection.name, mongo_backend.name, mongo_backend.count()))
    print('\tsync_target\t{:<40}{}\t{}'.format(
        es_index_name, es_backend.name, es_backend.count()))
    if not (noconfirm or ask("Continue?") == "Y"):
        return
    return diff.diff_collections(mongo_backend, es_backend)
def diff2src(use_parallel=True, noconfirm=False):
    """Interactively pick two genedoc sources (MongoDB or ES) and diff them.

    Lists every ``genedoc*`` MongoDB target collection and ES index, asks the
    user to pick two, then (after confirmation unless *noconfirm*) returns
    the changes from ``diff.diff_collections``.
    """
    src_li = []
    target_db = get_target_db()
    src_li.extend([(name, target_db[name].count(), 'mongodb')
                   for name in sorted(target_db.collection_names())
                   if name.startswith('genedoc')])
    es_idxer = ESIndexer()
    es_idxer.conn.default_indices = []
    for es_idx in es_idxer.conn.indices.get_indices():
        if es_idx.startswith('genedoc'):
            es_idxer.ES_INDEX_NAME = es_idx
            src_li.append((es_idx, es_idxer.count()['count'], 'es'))
    # Converted py2 print statements (incl. a bare "print") to the print
    # function used by the rest of the module.
    print("Found {} sources:".format(len(src_li)))
    src_1 = _pick_one(src_li, "Pick first source above: ")
    src_li.remove(src_1)
    print()
    src_2 = _pick_one(src_li, "Pick second source above: ")
    sync_li = []
    for src in (src_1, src_2):
        if src[2] == 'mongodb':
            b = backend.GeneDocMongoDBBackend(target_db[src[0]])
        elif src[2] == 'es':
            es_idxer = ESIndexer()
            es_idxer.ES_INDEX_NAME = src[0]
            es_idxer.step = 10000
            b = backend.GeneDocESBackend(es_idxer)
        sync_li.append(b)
    sync_src, sync_target = sync_li
    print('\tsync_src:\t{:<45}{}\t{}'.format(*src_1))
    print('\tsync_target\t{:<45}{}\t{}'.format(*src_2))
    if noconfirm or ask("Continue?") == "Y":
        changes = diff.diff_collections(sync_src, sync_target,
                                        use_parallel=use_parallel)
        return changes
def diff_two(col_1, col_2, use_parallel=True):
    """Diff two collections (by name) in the target database."""
    db = get_target_db()
    return diff_collections(
        GeneDocMongoDBBackend(db[col_1]),
        GeneDocMongoDBBackend(db[col_2]),
        use_parallel=use_parallel,
    )
def sync_from_one_diff(index, collection, diff_filepath, validate=False,
                       wait=60, dryrun=False, returncnt=False, save2file=None):
    """Apply one saved diff file to an ES index via ESSyncer.

    Loads the diff from *diff_filepath* and bulk-applies its ``add``/
    ``delete``/``update`` operations to *index* (unless *dryrun*).  With
    *save2file*, the operations are serialized to that file handle as JSON
    instead of being sent to ES.  With *returncnt*, returns a dict of
    operation counts; with *validate*, waits *wait* seconds, dumps the
    indexed docs into a temporary MongoDB collection and re-diffs it
    against the source collection, returning that diff result.
    """
    sync = ESSyncer(index=index)
    diff = loadobj(diff_filepath)
    source_collection = diff['source']
    add_iter = sync.add(source_collection, diff['add'])
    delete_iter = sync.delete(collection, diff['delete'])
    update_iter = sync.update2(diff['update'], collection, source_collection)
    t00 = time()
    if save2file:
        from itertools import chain
        import json
        for op in chain(add_iter, delete_iter, update_iter):
            json.dump(op, save2file)
        print("=" * 20)
        print("Finished! [{}]".format(timesofar(t00)))
        return
    print('Adding new {} docs...'.format(len(diff['add'])))
    t0 = time()
    if not dryrun:
        try:
            bulk(sync._es, add_iter)
        except Exception as e:
            # Best-effort: adds may partially fail; log and continue.
            # (Was a silent bare "except: pass".)
            print("Warning: bulk add failed: {!r}".format(e))
    print("Done. [{}]".format(timesofar(t0)))
    print('Deleting {} docs'.format(len(diff['delete'])))
    t0 = time()
    if not dryrun:
        bulk(sync._es, delete_iter)
    print("Done. [{}]".format(timesofar(t0)))
    print('Updating {} docs'.format(len(diff['update'])))
    t0 = time()
    if not dryrun:
        bulk(sync._es, update_iter)
    print("Done. [{}]".format(timesofar(t0)))
    # Flush and refresh so the changes are visible; failures here are
    # non-fatal (was a silent bare "except: pass").
    try:
        res = sync._es.indices.flush()
        print("Flushing...", res)
        res = sync._es.indices.refresh()
        print("Refreshing...", res)
    except Exception as e:
        print("Warning: flush/refresh failed: {!r}".format(e))
    print("=" * 20)
    # Normalized the broken "Finished! \n[{}]" literal (stray embedded
    # newline) to match the save2file branch above.
    print("Finished! [{}]".format(timesofar(t00)))
    if returncnt:
        cnt = {
            'add': len(diff['add']),
            'delete': len(diff['delete']),
            'update': len(diff['update'])
        }
        return cnt
    if validate:
        print('Waiting {}s to let ES to finish...'.format(wait), end="")
        sleep(wait)
        print("Done.")
        print("Validating...")
        t0 = time()
        # Only docs that actually carry the 'clinvar' field are compared.
        q = {
            "query": {
                "constant_score": {
                    "filter": {
                        "exists": {
                            "field": 'clinvar'
                        }
                    }
                }
            }
        }
        data = sync._esi.doc_feeder(query=q, _source=collection)
        temp_collection = collection + '_temp_' + get_random_string()
        sync._src[temp_collection].drop()
        load_source(temp_collection, src_data=data)
        c1 = get_backend(source_collection, 'mongodb')
        c2 = get_backend(temp_collection, 'mongodb')
        diff_result = diff_collections(c1, c2, use_parallel=False)
        sync._src[temp_collection].drop()
        # BUGFIX: was "format(t0)", which printed the raw start timestamp
        # instead of the elapsed time like every other timing line here.
        print("Done. [{}]".format(timesofar(t0)))
        return diff_result