def _diff_doc_worker(args):
    """Diff one batch of document ids between two backends.

    Takes a single packed argument so it can be handed to map-style
    dispatchers that pass exactly one value per task.

    Args:
        args: a 4-item sequence ``(_b1, _b2, ids, _path)`` where

            * ``_b1`` / ``_b2`` are argument sequences that are unpacked into
              ``get_backend(*_b1)`` / ``get_backend(*_b2)`` (presumably the
              target collection and the ES index -- confirm with callers),
            * ``ids`` is the batch of document ids to compare,
            * ``_path`` is a directory appended to ``sys.path`` (if missing)
              so that the ``utils.diff`` module can be imported here.

    Returns:
        Whatever ``utils.diff._diff_doc_inner_worker(b1, b2, ids)`` returns.
    """
    _b1, _b2, ids, _path = args

    import sys
    if _path not in sys.path:
        sys.path.append(_path)

    import utils.diff

    # ``reload`` is a builtin only on Python 2; on Python 3 it lives in
    # importlib. Resolve it here so the worker runs on either interpreter.
    try:
        reload_module = reload  # noqa: F821 -- builtin on Python 2
    except NameError:
        from importlib import reload as reload_module

    # Re-execute the module so this process uses the current source on disk.
    reload_module(utils.diff)
    from utils.diff import _diff_doc_inner_worker, get_backend

    b1 = get_backend(*_b1)
    b2 = get_backend(*_b2)
    return _diff_doc_inner_worker(b1, b2, ids)
def _diff_parallel_worker(old_collection_name, new_collection_name, common_ids):
    """Compute JSON patches for ids present in both collections.

    Both collection names are opened through ``get_backend(name, 'mongodb')``
    and the documents for ``common_ids`` are walked pairwise with
    ``two_docs_iterator``.

    Returns:
        A list of ``{'patch': <patch>, '_id': <id>}`` dicts, one for every
        pair whose ``jsondiff.make(old, new)`` result is truthy. Pairs with a
        falsy patch are omitted.

    Raises:
        AssertionError: if a yielded pair does not share the same ``_id``.
    """
    old_backend = get_backend(old_collection_name, 'mongodb')
    new_backend = get_backend(new_collection_name, 'mongodb')

    changes = []
    for old_doc, new_doc in two_docs_iterator(old_backend, new_backend, common_ids):
        # The iterator is expected to yield matching documents; include the
        # id batch in the failure message to help locate the mismatch.
        assert old_doc['_id'] == new_doc['_id'], repr((common_ids, len(common_ids)))
        patch = jsondiff.make(old_doc, new_doc)
        if not patch:
            continue
        changes.append({'patch': patch, '_id': old_doc['_id']})
    return changes
def _diff_doc_worker(args):
    """Run the document diff for one id batch inside a worker process.

    The single ``args`` parameter packs everything the worker needs, so the
    function can be used with dispatchers that pass one value per task.

    Args:
        args: ``(_b1, _b2, ids, _path)``:

            * ``_b1``, ``_b2`` -- argument sequences unpacked into
              ``get_backend(*_b1)`` and ``get_backend(*_b2)`` (presumably the
              target collection and the ES index -- TODO confirm),
            * ``ids`` -- the document ids to compare,
            * ``_path`` -- directory added to ``sys.path`` when absent, so
              ``utils.diff`` is importable in this process.

    Returns:
        The result of ``utils.diff._diff_doc_inner_worker(b1, b2, ids)``.
    """
    _b1, _b2, ids, _path = args

    import sys
    if _path not in sys.path:
        sys.path.append(_path)

    import utils.diff

    # Python 3 removed the ``reload`` builtin; fall back to importlib so the
    # worker does not die with NameError there.
    try:
        reload_module = reload  # noqa: F821 -- builtin on Python 2
    except NameError:
        from importlib import reload as reload_module

    # Pick up the current on-disk version of the module before using it.
    reload_module(utils.diff)
    from utils.diff import _diff_doc_inner_worker, get_backend

    backend_1 = get_backend(*_b1)
    backend_2 = get_backend(*_b2)
    return _diff_doc_inner_worker(backend_1, backend_2, ids)
def main(self, index, collection, diff_filepath, validate=False, wait=60):
    """Apply a saved diff to the index and optionally validate the result.

    The diff is loaded with ``loadobj(diff_filepath)`` and must provide the
    keys ``'source'``, ``'add'``, ``'delete'`` and ``'update'``. The three
    operation sets are sent with ``bulk`` in that order: add, delete, update.

    Args:
        index: index name stored on ``self._index`` and ``self._esi._index``.
        collection: name passed to ``self.delete`` and, during validation,
            used as the field that must exist and as the ``_source`` filter.
        diff_filepath: location of the saved diff object.
        validate: when true, wait ``wait`` seconds, copy the matching
            documents into a temporary collection and diff it against the
            diff's source collection.
        wait: seconds to sleep before validating.

    Returns:
        ``None`` when ``validate`` is false, otherwise the value returned by
        ``diff_collections(c1, c2, use_parallel=False)``.
    """
    self._index = index
    self._esi._index = index
    diff = loadobj(diff_filepath)
    source_collection = diff['source']
    add_list = self.add(source_collection, diff['add'])
    delete_list = self.delete(collection, diff['delete'])
    update_list = self.update(diff['update'])

    # (banner, key in the diff used for the count, actions to send)
    phases = (
        ('Adding new {} docs...', 'add', add_list),
        ('Deleting {} docs', 'delete', delete_list),
        ('Updating {} docs', 'update', update_list),
    )

    t00 = time()
    for banner, key, actions in phases:
        print(banner.format(len(diff[key])))
        t0 = time()
        bulk(self._es, actions)
        print("Done. [{}]".format(timesofar(t0)))
    print("=" * 20)
    print("Finished! [{}]".format(timesofar(t00)))

    if not validate:
        return None

    print('Waiting {}s to let ES to finish...'.format(wait), end="")
    sleep(wait)
    print("Done.")
    print("Validating...")
    t0 = time()
    q = {
        "query": {
            "constant_score": {
                "filter": {
                    "exists": {
                        "field": collection
                    }
                }
            }
        }
    }
    data = self._esi.doc_feeder(query=q, _source=collection)
    temp_collection = collection + '_temp_' + get_random_string()
    # Start from a clean slate in case a collection with this name exists.
    self._src[temp_collection].drop()
    try:
        load_source(temp_collection, src_data=data)
        c1 = get_backend(source_collection, 'mongodb')
        c2 = get_backend(temp_collection, 'mongodb')
        diff_result = diff_collections(c1, c2, use_parallel=False)
    finally:
        # Never leave the scratch collection behind, even if the diff fails.
        self._src[temp_collection].drop()
    # Report elapsed time (not the raw start timestamp).
    print("Done. [{}]".format(timesofar(t0)))
    return diff_result
def main(self, index, collection, diff_filepath, validate=False, wait=60):
    """Sync the index from a saved diff, then optionally verify the outcome.

    ``loadobj(diff_filepath)`` must return a mapping with ``'source'``,
    ``'add'``, ``'delete'`` and ``'update'`` entries. Additions, deletions
    and updates are submitted through ``bulk`` in that order, with timing
    printed for each phase.

    Args:
        index: index name assigned to ``self._index`` and ``self._esi._index``.
        collection: name handed to ``self.delete``; also the field checked
            for existence and the ``_source`` filter during validation.
        diff_filepath: path of the saved diff object.
        validate: if true, sleep ``wait`` seconds and diff the synced
            documents (loaded into a temporary collection) against the
            source collection named in the diff.
        wait: seconds to sleep before validation starts.

    Returns:
        ``None`` if ``validate`` is false; otherwise the result of
        ``diff_collections(c1, c2, use_parallel=False)``.
    """
    self._index = index
    self._esi._index = index
    diff = loadobj(diff_filepath)
    source_collection = diff['source']
    add_list = self.add(source_collection, diff['add'])
    delete_list = self.delete(collection, diff['delete'])
    update_list = self.update(diff['update'])

    t00 = time()
    steps = (
        ('Adding new {} docs...', diff['add'], add_list),
        ('Deleting {} docs', diff['delete'], delete_list),
        ('Updating {} docs', diff['update'], update_list),
    )
    for message, entries, actions in steps:
        print(message.format(len(entries)))
        t0 = time()
        bulk(self._es, actions)
        print("Done. [{}]".format(timesofar(t0)))
    print("=" * 20)
    print("Finished! [{}]".format(timesofar(t00)))

    if not validate:
        return None

    print('Waiting {}s to let ES to finish...'.format(wait), end="")
    sleep(wait)
    print("Done.")
    print("Validating...")
    t0 = time()
    q = {
        "query": {
            "constant_score": {
                "filter": {
                    "exists": {
                        "field": collection
                    }
                }
            }
        }
    }
    data = self._esi.doc_feeder(query=q, _source=collection)
    temp_collection = collection + '_temp_' + get_random_string()
    # Remove any stale collection of the same name before loading into it.
    self._src[temp_collection].drop()
    try:
        load_source(temp_collection, src_data=data)
        c1 = get_backend(source_collection, 'mongodb')
        c2 = get_backend(temp_collection, 'mongodb')
        diff_result = diff_collections(c1, c2, use_parallel=False)
    finally:
        # Clean up the scratch collection whether or not the diff succeeded.
        self._src[temp_collection].drop()
    # Print the elapsed time; the start timestamp itself is not informative.
    print("Done. [{}]".format(timesofar(t0)))
    return diff_result
def sync_from_one_diff(index, collection, diff_filepath, validate=False, wait=60,
                       dryrun=False, returncnt=False, save2file=None):
    """Apply one saved diff to an index through a fresh ``ESSyncer``.

    The diff loaded by ``loadobj(diff_filepath)`` must provide ``'source'``,
    ``'add'``, ``'delete'`` and ``'update'``. Operations are generated by
    ``sync.add``, ``sync.delete`` and ``sync.update2``.

    Args:
        index: index name passed to ``ESSyncer(index=index)``.
        collection: name passed to ``sync.delete`` / ``sync.update2``; during
            validation it is also the field that must exist and the
            ``_source`` filter.
        diff_filepath: path of the saved diff object.
        validate: when true (and ``returncnt`` is false), sleep ``wait``
            seconds and diff the synced documents against the diff's source
            collection.
        wait: seconds to sleep before validation.
        dryrun: when true, skip the three ``bulk`` calls. The flush/refresh
            step still runs.
        returncnt: when true, return the add/delete/update counts right
            after syncing; validation is then skipped.
        save2file: optional writable file object. When truthy, every
            operation is written to it with ``json.dump`` (no separator is
            written between operations) and nothing is sent to the index.

    Returns:
        ``None`` when ``save2file`` is used or when neither ``returncnt`` nor
        ``validate`` is set; a ``{'add', 'delete', 'update'}`` count dict
        when ``returncnt`` is true; otherwise the result of
        ``diff_collections(c1, c2, use_parallel=False)``.
    """
    sync = ESSyncer(index=index)
    diff = loadobj(diff_filepath)
    source_collection = diff['source']
    add_iter = sync.add(source_collection, diff['add'])
    delete_iter = sync.delete(collection, diff['delete'])
    update_iter = sync.update2(diff['update'], collection, source_collection)

    t00 = time()
    if save2file:
        from itertools import chain
        import json
        for op in chain(add_iter, delete_iter, update_iter):
            json.dump(op, save2file)
        print("=" * 20)
        print("Finished! [{}]".format(timesofar(t00)))
        return None

    print('Adding new {} docs...'.format(len(diff['add'])))
    t0 = time()
    if not dryrun:
        # Adding is best-effort: a failure here must not prevent the delete
        # and update phases, but it should not vanish silently either.
        try:
            bulk(sync._es, add_iter)
        except Exception as exc:
            print("Warning: bulk add failed: {!r}".format(exc))
    print("Done. [{}]".format(timesofar(t0)))

    print('Deleting {} docs'.format(len(diff['delete'])))
    t0 = time()
    if not dryrun:
        bulk(sync._es, delete_iter)
    print("Done. [{}]".format(timesofar(t0)))

    print('Updating {} docs'.format(len(diff['update'])))
    t0 = time()
    if not dryrun:
        bulk(sync._es, update_iter)
    print("Done. [{}]".format(timesofar(t0)))

    # add flush and refresh (best-effort; report instead of hiding failures)
    try:
        res = sync._es.indices.flush()
        print("Flushing...", res)
        res = sync._es.indices.refresh()
        print("Refreshing...", res)
    except Exception as exc:
        print("Warning: flush/refresh failed: {!r}".format(exc))

    print("=" * 20)
    print("Finished! [{}]".format(timesofar(t00)))

    if returncnt:
        cnt = {
            'add': len(diff['add']),
            'delete': len(diff['delete']),
            'update': len(diff['update'])
        }
        return cnt

    if not validate:
        return None

    print('Waiting {}s to let ES to finish...'.format(wait), end="")
    sleep(wait)
    print("Done.")
    print("Validating...")
    t0 = time()
    # Select documents carrying the synced field -- the same name that is
    # requested via ``_source`` below -- rather than one fixed field name.
    q = {
        "query": {
            "constant_score": {
                "filter": {
                    "exists": {
                        "field": collection
                    }
                }
            }
        }
    }
    data = sync._esi.doc_feeder(query=q, _source=collection)
    temp_collection = collection + '_temp_' + get_random_string()
    # Make sure the scratch collection starts empty.
    sync._src[temp_collection].drop()
    try:
        load_source(temp_collection, src_data=data)
        c1 = get_backend(source_collection, 'mongodb')
        c2 = get_backend(temp_collection, 'mongodb')
        diff_result = diff_collections(c1, c2, use_parallel=False)
    finally:
        # Always remove the scratch collection, even when the diff raises.
        sync._src[temp_collection].drop()
    # Report elapsed time rather than the raw start timestamp.
    print("Done. [{}]".format(timesofar(t0)))
    return diff_result