Example #1
0
    def make_temp_collection(self):
        '''Reserve an unused temp collection for dataloading.

        The name is the regular collection name plus '_temp_' and a random
        suffix, e.g., entrez_geneinfo_INEMO. The collection handle is stored
        on self.temp_collection and the chosen name is returned.
        '''
        while True:
            candidate = self.__collection__ + '_temp_' + get_random_string()
            if candidate in self.db.collection_names():
                # name already taken in this database; draw another suffix
                continue
            self.temp_collection = self.db[candidate]
            return candidate
Example #2
0
    def make_temp_collection(self):
        '''Reserve an unused temp collection for dataloading.

        The name is self.src_name plus '_temp_' and a random suffix. The
        collection handle is stored on self.temp_collection and the chosen
        name is returned.
        '''
        while True:
            candidate = self.src_name + '_temp_' + get_random_string()
            if candidate in self.src_db.collection_names():
                # name already taken in the source database; draw another suffix
                continue
            self.temp_collection = self.src_db[candidate]
            return candidate
Example #3
0
 def switch_collection(self):
     '''Promote the loaded temp collection to the regular collection name.

        A non-empty existing collection is first renamed to an archive name
        built from the source name, a timestamp and a random suffix.
        Nothing is done when there is no temp collection or it is empty.
     '''
     temp = self.temp_collection
     if not (temp and temp.count() > 0):
         return None

     current = self.src_db[self.src_name]
     if current.count() > 0:
         # keep the previous data around under an archive name
         archive_parts = [self.src_name, 'archive', get_timestamp(), get_random_string()]
         current.rename('_'.join(archive_parts), dropTarget=True)
     temp.rename(self.src_name)
Example #4
0
    def main(self, index, collection, diff_filepath, validate=False, wait=60):
        '''Apply a saved diff to an index and optionally validate the result.

        The object loaded from diff_filepath is read for its 'source', 'add',
        'delete' and 'update' entries. The add, delete and update actions are
        sent, in that order, through bulk() on self._es.

        :param index: index name; stored on both self and self._esi.
        :param collection: passed to self.delete(); when validating it is also
            the field checked for existence, the _source requested from the
            index, and the prefix of the temporary collection name.
        :param diff_filepath: path handed to loadobj().
        :param validate: when true, wait, dump the indexed docs into a
            temporary collection and diff it against the source collection.
        :param wait: seconds to sleep before validation starts.
        :return: the diff_collections() result when validate is true,
            otherwise None.
        '''
        self._index = index
        self._esi._index = index
        diff = loadobj(diff_filepath)
        source_collection = diff['source']

        # All three action sets are prepared before the overall timer starts.
        steps = (
            ('Adding new {} docs...', 'add',
             self.add(source_collection, diff['add'])),
            ('Deleting {} docs', 'delete',
             self.delete(collection, diff['delete'])),
            ('Updating {} docs', 'update',
             self.update(diff['update'])),
        )

        t00 = time()
        for message, key, actions in steps:
            print(message.format(len(diff[key])))
            t0 = time()
            bulk(self._es, actions)
            print("Done. [{}]".format(timesofar(t0)))
        print("="*20)
        print("Finished! [{}]".format(timesofar(t00)))

        if not validate:
            return None

        print('Waiting {}s to let ES to finish...'.format(wait), end="")
        sleep(wait)
        print("Done.")
        print("Validating...")
        t0 = time()
        q = {
            "query": {
                "constant_score": {
                    "filter": {
                        "exists": {
                            "field": collection
                        }
                    }
                }
            }
        }
        data = self._esi.doc_feeder(query=q, _source=collection)
        temp_collection = collection + '_temp_' + get_random_string()
        self._src[temp_collection].drop()
        try:
            load_source(temp_collection, src_data=data)
            c1 = get_backend(source_collection, 'mongodb')
            c2 = get_backend(temp_collection, 'mongodb')
            diff_result = diff_collections(c1, c2, use_parallel=False)
        finally:
            # never leave the scratch collection behind, even if the
            # load or the diff fails
            self._src[temp_collection].drop()
        # report elapsed time, not the raw start timestamp
        print("Done. [{}]".format(timesofar(t0)))
        return diff_result
Example #5
0
def sync_from_one_diff(index, collection, diff_filepath, validate=False, wait=60, dryrun=False, returncnt=False, save2file=None):
    '''Apply one saved diff to an index through an ESSyncer.

    The object loaded from diff_filepath is read for its 'source', 'add',
    'delete' and 'update' entries.

    :param index: index name passed to ESSyncer.
    :param collection: passed to sync.delete() and sync.update2(); when
        validating it is also the field checked for existence, the _source
        requested from the index, and the prefix of the temporary collection.
    :param diff_filepath: path handed to loadobj().
    :param validate: when true (and returncnt is false), wait, dump the
        indexed docs into a temporary collection and diff it against the
        source collection.
    :param wait: seconds to sleep before validation starts.
    :param dryrun: when true, the three bulk() calls are skipped; the
        flush/refresh and everything after it still run.
    :param returncnt: when true, return the add/delete/update counts right
        after syncing; this takes precedence over validate.
    :param save2file: writable file-like object; when given, every action is
        written to it with json.dump() and nothing is sent to the index.
    :return: None after a save2file run or when neither returncnt nor
        validate is set; a dict of counts when returncnt is true; otherwise
        the diff_collections() result.
    '''
    sync = ESSyncer(index=index)
    diff = loadobj(diff_filepath)
    source_collection = diff['source']
    add_iter = sync.add(source_collection, diff['add'])
    delete_iter = sync.delete(collection, diff['delete'])
    update_iter = sync.update2(diff['update'], collection, source_collection)
    t00 = time()
    if save2file:
        from itertools import chain
        import json
        # NOTE(review): json.dump() adds no separator, so the actions are
        # written back-to-back -- confirm the consumer expects that.
        for op in chain(add_iter, delete_iter, update_iter):
            json.dump(op, save2file)
        print("="*20)
        print("Finished! [{}]".format(timesofar(t00)))
        return

    print('Adding new {} docs...'.format(len(diff['add'])))
    t0 = time()
    if not dryrun:
        try:
            bulk(sync._es, add_iter)
        except Exception as exc:
            # Adding stays best-effort, but the failure is reported
            # instead of being silently discarded.
            print("Warning: bulk add failed, continuing: {!r}".format(exc))
    print("Done. [{}]".format(timesofar(t0)))

    print('Deleting {} docs'.format(len(diff['delete'])))
    t0 = time()
    if not dryrun:
        bulk(sync._es, delete_iter)
    print("Done. [{}]".format(timesofar(t0)))

    print('Updating {} docs'.format(len(diff['update'])))
    t0 = time()
    if not dryrun:
        bulk(sync._es, update_iter)
    print("Done. [{}]".format(timesofar(t0)))

    # Flush and refresh; failures here are reported but not fatal.
    try:
        res = sync._es.indices.flush()
        print("Flushing...", res)
        res = sync._es.indices.refresh()
        print("Refreshing...", res)
    except Exception as exc:
        print("Warning: flush/refresh failed: {!r}".format(exc))

    print("="*20)
    print("Finished! [{}]".format(timesofar(t00)))

    if returncnt:
        cnt = {
            'add': len(diff['add']),
            'delete': len(diff['delete']),
            'update': len(diff['update'])
        }
        return cnt

    if validate:
        print('Waiting {}s to let ES to finish...'.format(wait), end="")
        sleep(wait)
        print("Done.")
        print("Validating...")
        t0 = time()
        q = {
            "query": {
                "constant_score": {
                    "filter": {
                        "exists": {
                            # select docs carrying the collection being synced
                            "field": collection
                        }
                    }
                }
            }
        }
        data = sync._esi.doc_feeder(query=q, _source=collection)
        temp_collection = collection + '_temp_' + get_random_string()
        sync._src[temp_collection].drop()
        try:
            load_source(temp_collection, src_data=data)
            c1 = get_backend(source_collection, 'mongodb')
            c2 = get_backend(temp_collection, 'mongodb')
            diff_result = diff_collections(c1, c2, use_parallel=False)
        finally:
            # never leave the scratch collection behind, even if the
            # load or the diff fails
            sync._src[temp_collection].drop()
        # report elapsed time, not the raw start timestamp
        print("Done. [{}]".format(timesofar(t0)))
        return diff_result
Example #6
0
 def switch_collection(self):
     '''Promote the loaded temp collection to the regular collection name.

        A non-empty existing collection is first renamed to an archive name
        built from the collection name, a timestamp and a random suffix.
        An error message is printed when there is no temp collection or it
        is empty.
     '''
     temp = self.temp_collection
     if not (temp and temp.count() > 0):
         print("Error: load data first.")
         return

     if self.collection.count() > 0:
         # keep the previous data around under an archive name
         archive_parts = [self.__collection__, 'archive', get_timestamp(), get_random_string()]
         self.collection.rename('_'.join(archive_parts), dropTarget=True)
     temp.rename(self.__collection__)
Example #7
0
 def _get_target_name(self):
     '''Return a lower-cased target name of the form
        genedoc_<build name>_<timestamp>_<random string>.
     '''
     build_name = self._build_config['name']
     stamp = get_timestamp()
     suffix = get_random_string()
     target = 'genedoc_{}_{}_{}'.format(build_name, stamp, suffix)
     return target.lower()