Example #1
0
def _diff_doc_worker(args):
    """Build two backends and diff a batch of ids between them.

    ``args`` is one 4-tuple ``(_b1, _b2, ids, _path)``:

    * ``_b1`` / ``_b2`` -- argument sequences unpacked into ``get_backend``
      (the original note names them "b1_target_collection" and
      "b2_es_index").
    * ``ids`` -- passed through unchanged to ``_diff_doc_inner_worker``.
    * ``_path`` -- appended to ``sys.path`` when not already present, so
      that ``utils.diff`` can be imported from here.

    Returns the value produced by ``_diff_doc_inner_worker(b1, b2, ids)``.

    NOTE(review): the single-tuple signature and the in-function imports
    suggest this is dispatched to separate worker processes -- confirm
    against the caller.
    """
    _b1, _b2, ids, _path = args
    import sys
    if _path not in sys.path:
        sys.path.append(_path)

    # ``reload`` stopped being a builtin in Python 3; resolve it explicitly
    # and only fall back to the builtin on interpreters that lack
    # ``importlib.reload`` (Python 2).
    try:
        from importlib import reload as _reload
    except ImportError:
        _reload = reload  # noqa: F821 -- Python 2 builtin

    import utils.diff
    # Re-execute the module so this call uses its current source.
    _reload(utils.diff)
    from utils.diff import _diff_doc_inner_worker, get_backend

    b1 = get_backend(*_b1)
    b2 = get_backend(*_b2)
    _updates = _diff_doc_inner_worker(b1, b2, ids)
    return _updates
Example #2
0
def _diff_parallel_worker(old_collection_name, new_collection_name,
                          common_ids):
    """Collect per-document patches for ids shared by two collections.

    Both collection names are opened through ``get_backend(..., 'mongodb')``.
    Document pairs come from ``two_docs_iterator`` and each pair is passed
    to ``jsondiff.make``; pairs whose patch is falsy are skipped.

    Returns a list of ``{'patch': ..., '_id': ...}`` dicts, one per pair
    that produced a non-empty patch.
    """
    old_backend = get_backend(old_collection_name, 'mongodb')
    new_backend = get_backend(new_collection_name, 'mongodb')
    changes = []
    pairs = two_docs_iterator(old_backend, new_backend, common_ids)
    for old_doc, new_doc in pairs:
        # Sanity check: the iterator must hand back matching documents.
        assert old_doc['_id'] == new_doc['_id'], repr((common_ids, len(common_ids)))
        delta = jsondiff.make(old_doc, new_doc)
        if not delta:
            continue
        changes.append({'patch': delta, '_id': old_doc['_id']})
    return changes
Example #3
0
def _diff_doc_worker(args):
    """Build two backends and diff a batch of ids between them.

    ``args`` is one 4-tuple ``(_b1, _b2, ids, _path)``:

    * ``_b1`` / ``_b2`` -- argument sequences unpacked into ``get_backend``
      (the original note names them "b1_target_collection" and
      "b2_es_index").
    * ``ids`` -- passed through unchanged to ``_diff_doc_inner_worker``.
    * ``_path`` -- appended to ``sys.path`` when not already present, so
      that ``utils.diff`` can be imported from here.

    Returns the value produced by ``_diff_doc_inner_worker(b1, b2, ids)``.

    NOTE(review): the single-tuple signature and the in-function imports
    suggest this is dispatched to separate worker processes -- confirm
    against the caller.
    """
    _b1, _b2, ids, _path = args
    import sys
    if _path not in sys.path:
        sys.path.append(_path)

    # ``reload`` stopped being a builtin in Python 3; resolve it explicitly
    # and only fall back to the builtin on interpreters that lack
    # ``importlib.reload`` (Python 2).
    try:
        from importlib import reload as _reload
    except ImportError:
        _reload = reload  # noqa: F821 -- Python 2 builtin

    import utils.diff
    # Re-execute the module so this call uses its current source.
    _reload(utils.diff)
    from utils.diff import _diff_doc_inner_worker, get_backend

    b1 = get_backend(*_b1)
    b2 = get_backend(*_b2)
    _updates = _diff_doc_inner_worker(b1, b2, ids)
    return _updates
Example #4
0
 def main(self, index, collection, diff_filepath, validate=False, wait=60):
     """Apply a stored diff to the ES index, optionally validating after.

     The diff object is read with ``loadobj(diff_filepath)`` and must
     provide the keys ``'source'``, ``'add'``, ``'delete'`` and
     ``'update'``. The three action lists are built with ``self.add``,
     ``self.delete`` and ``self.update`` and sent with ``bulk`` in that
     order (add, delete, update).

     Args:
         index: stored on ``self._index`` and ``self._esi._index``.
         collection: passed to ``self.delete``; during validation it is
             also the field used in the ``exists`` filter, the ``_source``
             selection, and the prefix of the temporary collection name.
         diff_filepath: path handed to ``loadobj``.
         validate: when true, wait, pull the data back from ES into a
             temporary collection and diff it against the source.
         wait: seconds to sleep before validating.

     Returns:
         The result of ``diff_collections`` when ``validate`` is true,
         otherwise ``None``.
     """
     self._index = index
     self._esi._index = index
     diff = loadobj(diff_filepath)
     source_collection = diff['source']
     add_list = self.add(source_collection, diff['add'])
     delete_list = self.delete(collection, diff['delete'])
     update_list = self.update(diff['update'])
     t00 = time()
     print('Adding new {} docs...'.format(len(diff['add'])))
     t0 = time()
     bulk(self._es, add_list)
     print("Done. [{}]".format(timesofar(t0)))
     print('Deleting {} docs'.format(len(diff['delete'])))
     t0 = time()
     bulk(self._es, delete_list)
     print("Done. [{}]".format(timesofar(t0)))
     print('Updating {} docs'.format(len(diff['update'])))
     t0 = time()
     bulk(self._es, update_list)
     print("Done. [{}]".format(timesofar(t0)))
     print("=" * 20)
     print("Finished! [{}]".format(timesofar(t00)))
     if validate:
         print('Waiting {}s to let ES to finish...'.format(wait), end="")
         sleep(wait)
         print("Done.")
         print("Validating...")
         t0 = time()
         q = {
             "query": {
                 "constant_score": {
                     "filter": {
                         "exists": {
                             "field": collection
                         }
                     }
                 }
             }
         }
         data = self._esi.doc_feeder(query=q, _source=collection)
         temp_collection = collection + '_temp_' + get_random_string()
         # Start from an empty temporary collection.
         self._src[temp_collection].drop()
         try:
             load_source(temp_collection, src_data=data)
             c1 = get_backend(source_collection, 'mongodb')
             c2 = get_backend(temp_collection, 'mongodb')
             diff_result = diff_collections(c1, c2, use_parallel=False)
         finally:
             # Drop the temporary collection even if loading/diffing fails.
             self._src[temp_collection].drop()
         # Report elapsed time, not the raw start timestamp.
         print("Done. [{}]".format(timesofar(t0)))
         return diff_result
Example #5
0
    def main(self, index, collection, diff_filepath, validate=False, wait=60):
        """Apply a stored diff to the ES index, optionally validating after.

        The diff object is read with ``loadobj(diff_filepath)`` and must
        provide the keys ``'source'``, ``'add'``, ``'delete'`` and
        ``'update'``. The three action lists are built with ``self.add``,
        ``self.delete`` and ``self.update`` and sent with ``bulk`` in that
        order (add, delete, update).

        Args:
            index: stored on ``self._index`` and ``self._esi._index``.
            collection: passed to ``self.delete``; during validation it is
                also the field used in the ``exists`` filter, the
                ``_source`` selection, and the prefix of the temporary
                collection name.
            diff_filepath: path handed to ``loadobj``.
            validate: when true, wait, pull the data back from ES into a
                temporary collection and diff it against the source.
            wait: seconds to sleep before validating.

        Returns:
            The result of ``diff_collections`` when ``validate`` is true,
            otherwise ``None``.
        """
        self._index = index
        self._esi._index = index
        diff = loadobj(diff_filepath)
        source_collection = diff['source']
        add_list = self.add(source_collection, diff['add'])
        delete_list = self.delete(collection, diff['delete'])
        update_list = self.update(diff['update'])
        t00 = time()
        print('Adding new {} docs...'.format(len(diff['add'])))
        t0 = time()
        bulk(self._es, add_list)
        print("Done. [{}]".format(timesofar(t0)))
        print('Deleting {} docs'.format(len(diff['delete'])))
        t0 = time()
        bulk(self._es, delete_list)
        print("Done. [{}]".format(timesofar(t0)))
        print('Updating {} docs'.format(len(diff['update'])))
        t0 = time()
        bulk(self._es, update_list)
        print("Done. [{}]".format(timesofar(t0)))
        print("="*20)
        print("Finished! [{}]".format(timesofar(t00)))
        if validate:
            # Indented with spaces only: a tab here is a TabError on Python 3.
            print('Waiting {}s to let ES to finish...'.format(wait), end="")
            sleep(wait)
            print("Done.")
            print("Validating...")
            t0 = time()
            q = {
                "query": {
                    "constant_score": {
                        "filter": {
                            "exists": {
                                "field": collection
                            }
                        }
                    }
                }
            }
            data = self._esi.doc_feeder(query=q, _source=collection)
            temp_collection = collection + '_temp_' + get_random_string()
            # Start from an empty temporary collection.
            self._src[temp_collection].drop()
            try:
                load_source(temp_collection, src_data=data)
                c1 = get_backend(source_collection, 'mongodb')
                c2 = get_backend(temp_collection, 'mongodb')
                diff_result = diff_collections(c1, c2, use_parallel=False)
            finally:
                # Drop the temporary collection even if loading/diffing fails.
                self._src[temp_collection].drop()
            # Report elapsed time, not the raw start timestamp.
            print("Done. [{}]".format(timesofar(t0)))
            return diff_result
Example #6
0
def sync_from_one_diff(index, collection, diff_filepath, validate=False, wait=60, dryrun=False, returncnt=False, save2file=None):
    """Apply one stored diff to an ES index through ``ESSyncer``.

    The diff object is read with ``loadobj(diff_filepath)`` and must provide
    the keys ``'source'``, ``'add'``, ``'delete'`` and ``'update'``. Actions
    are produced by ``sync.add``, ``sync.delete`` and ``sync.update2``.

    Args:
        index: passed to ``ESSyncer(index=index)``.
        collection: passed to ``sync.delete`` / ``sync.update2``; during
            validation it is the field of the ``exists`` filter, the
            ``_source`` selection and the prefix of the temporary
            collection name.
        diff_filepath: path handed to ``loadobj``.
        validate: when true (and ``returncnt`` is false), wait, pull the
            data back from ES into a temporary collection and diff it
            against the source collection.
        wait: seconds to sleep before validating.
        dryrun: when true, the three ``bulk`` calls are skipped; the
            flush/refresh attempt still runs.
        returncnt: when true, return the add/delete/update counts right
            after syncing; validation is then not performed.
        save2file: when truthy, every action is written to it with
            ``json.dump`` and nothing is sent to ES. It is used as the
            ``fp`` argument of ``json.dump``; actions are written back to
            back with no separator.

    Returns:
        ``None`` in ``save2file`` mode or when neither ``returncnt`` nor
        ``validate`` is set; the counts dict when ``returncnt`` is true;
        otherwise the result of ``diff_collections``.
    """
    sync = ESSyncer(index=index)
    diff = loadobj(diff_filepath)
    source_collection = diff['source']
    add_iter = sync.add(source_collection, diff['add'])
    delete_iter = sync.delete(collection, diff['delete'])
    update_iter = sync.update2(diff['update'], collection, source_collection)
    t00 = time()
    if save2file:
        from itertools import chain
        import json
        for op in chain(add_iter, delete_iter, update_iter):
            json.dump(op, save2file)
        print("="*20)
        print("Finished! [{}]".format(timesofar(t00)))
        return

    print('Adding new {} docs...'.format(len(diff['add'])))
    t0 = time()
    if not dryrun:
        try:
            bulk(sync._es, add_iter)
        except Exception as exc:
            # Adding stays best-effort (delete/update still run), but the
            # failure is reported instead of being silently discarded.
            print("Warning: bulk add failed: {!r}".format(exc))
    print("Done. [{}]".format(timesofar(t0)))

    print('Deleting {} docs'.format(len(diff['delete'])))
    t0 = time()
    if not dryrun:
        bulk(sync._es, delete_iter)
    print("Done. [{}]".format(timesofar(t0)))

    print('Updating {} docs'.format(len(diff['update'])))
    t0 = time()
    if not dryrun:
        bulk(sync._es, update_iter)
    print("Done. [{}]".format(timesofar(t0)))

    # Flush and refresh; a failure here is reported but does not abort.
    try:
        res = sync._es.indices.flush()
        print("Flushing...", res)
        res = sync._es.indices.refresh()
        print("Refreshing...", res)
    except Exception as exc:
        print("Warning: flush/refresh failed: {!r}".format(exc))

    print("="*20)
    print("Finished! [{}]".format(timesofar(t00)))

    if returncnt:
        # Returning here means validation is skipped when both are set.
        cnt = {
            'add': len(diff['add']),
            'delete': len(diff['delete']),
            'update': len(diff['update'])
        }
        return cnt

    if validate:
        print('Waiting {}s to let ES to finish...'.format(wait), end="")
        sleep(wait)
        print("Done.")
        print("Validating...")
        t0 = time()
        q = {
            "query": {
                "constant_score": {
                    "filter": {
                        "exists": {
                            # Filter on the collection being synced (was
                            # hard-coded to 'clinvar'), matching the
                            # ``_source`` selection below.
                            "field": collection
                        }
                    }
                }
            }
        }
        data = sync._esi.doc_feeder(query=q, _source=collection)
        temp_collection = collection + '_temp_' + get_random_string()
        # Start from an empty temporary collection.
        sync._src[temp_collection].drop()
        try:
            load_source(temp_collection, src_data=data)
            c1 = get_backend(source_collection, 'mongodb')
            c2 = get_backend(temp_collection, 'mongodb')
            diff_result = diff_collections(c1, c2, use_parallel=False)
        finally:
            # Drop the temporary collection even if loading/diffing fails.
            sync._src[temp_collection].drop()
        # Report elapsed time, not the raw start timestamp.
        print("Done. [{}]".format(timesofar(t0)))
        return diff_result