Example #1
    def validate(self, build_config='mygene_allspecies', n=10):
        '''Validate merged genedoc, currently for ES backend only.'''
        import random
        import itertools
        import pyes

        self.load_build_config(build_config)
        last_build = self._build_config['build'][-1]
        logging.info("Last build record:")
        logging.info(pformat(last_build))
        #assert last_build['target_backend'] == 'es', '"validate" currently works for "es" backend only'

        target_name = last_build['target']
        self.validate_src_collections()
        self.prepare_target(target_name=target_name)
        logging.info("Validating...")
        target_cnt = self.target.count()
        stats_cnt = last_build['stats']['total_genes']
        if target_cnt == stats_cnt:
            logging.info("OK [total count={}]".format(target_cnt))
        else:
            logging.info(
                "Warning: total count of gene documents does not match [{}, should be {}]"
                .format(target_cnt, stats_cnt))

        if n > 0:
            for src in self._build_config['sources']:
                logging.info("\nSrc: %s" % src)
                # if 'id_type' in self.src_master[src] and self.src_master[src]['id_type'] != 'entrez_gene':
                #     print "skipped."
                #     continue
                cnt = self.src[src].count()
                fdr1 = doc_feeder(self.src[src], step=10000, s=cnt - n)
                rand_s = random.randint(0, cnt - n)
                fdr2 = doc_feeder(self.src[src],
                                  step=n,
                                  s=rand_s,
                                  e=rand_s + n)
                _first_exception = True
                for doc in itertools.chain(fdr1, fdr2):
                    _id = doc['_id']
                    try:
                        es_doc = self.target.get_from_id(_id)
                    except pyes.exceptions.NotFoundException:
                        if _first_exception:
                            logging.info("")  # blank line before listing missing ids
                            _first_exception = False
                        logging.info("%s not found." % _id)
                        continue
                    for k in doc:
                        if src == 'entrez_homologene' and k == 'taxid':
                            # there are occasional known errors for taxid in homologene data.
                            continue
                        assert es_doc.get(k, None) == doc[k], (_id, k, es_doc.get(k, None), doc[k])
Example #2
def diff_collections2(b1, b2, result_dir, step=10000):
    '''
    b2 is the new collection, b1 is the old collection
    '''
    DIFFFILE_PATH = '/home/kevinxin/diff_result/'
    DATA_FOLDER = os.path.join(DIFFFILE_PATH, result_dir)
    if not os.path.exists(DATA_FOLDER):
        os.mkdir(DATA_FOLDER)
    data_new = doc_feeder(b2.target_collection, step=step, inbatch=True, fields=[])
    data_old = doc_feeder(b1.target_collection, step=step, inbatch=True, fields=[])
    cnt = 0
    cnt_update = 0
    cnt_add = 0
    cnt_delete = 0

    for _batch in data_new:
        cnt += 1
        id_list_new = [_doc['_id'] for _doc in _batch]
        docs_common = b1.target_collection.find({'_id': {'$in': id_list_new}}, projection=[])
        ids_common = [_doc['_id'] for _doc in docs_common]
        id_in_new = list(set(id_list_new) - set(ids_common))
        _updates = []
        if len(ids_common) > 0:
            _updates = _diff_doc_inner_worker2(b1, b2, list(ids_common), fastdiff=True)
        file_name = DATA_FOLDER + '/' + str(cnt) + '.pyobj'
        _result = {'add': id_in_new,
                   'update': _updates,
                   'delete': [],
                   'source': b2.target_collection.name,
                   'timestamp': get_timestamp()}
        if len(_updates) != 0 or len(id_in_new) != 0:
            dump(_result, file_name)
            print("(Updated: {}, Added: {})".format(len(_updates), len(id_in_new)), end='')
            cnt_update += len(_updates)
            cnt_add += len(id_in_new)
    print("Finished calculating diff for the new collection. Total number of docs updated: {}, added: {}".format(cnt_update, cnt_add))
    print("="*100)
    for _batch in data_old:
        cnt += 1
        id_list_old = [_doc['_id'] for _doc in _batch]
        docs_common = b2.target_collection.find({'_id': {'$in': id_list_old}}, projection=[])
        ids_common = [_doc['_id'] for _doc in docs_common]
        id_in_old = list(set(id_list_old)-set(ids_common))
        file_name = DATA_FOLDER + '/' + str(cnt) + '.pyobj'
        _result = {'delete': id_in_old,
                   'add': [],
                   'update': [],
                   'source': b2.target_collection.name,
                   'timestamp': get_timestamp()}
        if len(id_in_old) != 0:
            dump(_result, file_name)
            print("(Deleted: {})".format(len(id_in_old)), end='')
            cnt_delete += len(id_in_old)
    print("Finished calculating diff for the old collection. Total number of docs deleted: {}".format(cnt_delete))
    print("="*100)
    print("Summary: (Updated: {}, Added: {}, Deleted: {})".format(cnt_update, cnt_add, cnt_delete))
Example #3
def update_index(changes, sync_src, sync_target, noconfirm=False):
    # changes['_add'] = changes['delete']
    # changes['_delete'] = changes['add']
    # changes['delete'] = changes['_delete']
    # changes['add'] = changes['_add']
    # del changes['_add']
    # del changes['_delete']

    print("\t{}\trecords will be added.".format(len(changes['add'])))
    print("\t{}\trecords will be deleted.".format(len(changes['delete'])))
    print("\t{}\trecords will be updated.".format(len(changes['update'])))

    print()
    print('\tsync_src:\t{:<45}{}'.format(sync_src.target_collection.name,
                                         sync_src.name))
    print('\tsync_target\t{:<45}{}'.format(
        sync_target.target_esidxer.ES_INDEX_NAME, sync_target.name))

    if noconfirm or ask("Continue?") == 'Y':
        t00 = time.time()
        es_idxer = sync_target.target_esidxer

        if len(changes['add']) > 0:
            print("Adding {} new records...".format(len(changes['add'])))
            t0 = time.time()
            _q = {'_id': {'$in': changes['add']}}
            for docs in doc_feeder(sync_src.target_collection,
                                   step=1000,
                                   inbatch=True,
                                   query=_q):
                es_idxer.add_docs(docs)
            print("Done. [{}]".format(timesofar(t0)))

        if len(changes['delete']) > 0:
            print("Deleting {} old records...".format(len(changes['delete'])))
            t0 = time.time()
            es_idxer.delete_docs(changes['delete'])
            print("Done. [{}]".format(timesofar(t0)))

        if len(changes['update']) > 0:
            print("Updating {} existing records...".format(
                len(changes['update'])))
            t0 = time.time()
            ids = [d['_id'] for d in changes['update']]
            _q = {'_id': {'$in': ids}}
            for docs in doc_feeder(sync_src.target_collection,
                                   step=1000,
                                   inbatch=True,
                                   query=_q):
                es_idxer.add_docs(docs)
            print("Done. [{}]".format(timesofar(t0)))
        print('=' * 20)
        print('Finished. [{}]'.format(timesofar(t00)))
Example #4
    def validate(self, build_config='mygene_allspecies', n=10):
        '''Validate merged genedoc, currently for ES backend only.'''
        import random
        import itertools
        import pyes

        self.load_build_config(build_config)
        last_build = self._build_config['build'][-1]
        logging.info("Last build record:")
        logging.info(pformat(last_build))
        #assert last_build['target_backend'] == 'es', '"validate" currently works for "es" backend only'

        target_name = last_build['target']
        self.validate_src_collections()
        self.prepare_target(target_name=target_name)
        logging.info("Validating...")
        target_cnt = self.target.count()
        stats_cnt = last_build['stats']['total_genes']
        if target_cnt == stats_cnt:
            logging.info("OK [total count={}]".format(target_cnt))
        else:
            logging.info("Warning: total count of gene documents does not match [{}, should be {}]".format(target_cnt, stats_cnt))

        if n > 0:
            for src in self._build_config['sources']:
                logging.info("\nSrc: %s" % src)
                # if 'id_type' in self.src_master[src] and self.src_master[src]['id_type'] != 'entrez_gene':
                #     print "skipped."
                #     continue
                cnt = self.src[src].count()
                fdr1 = doc_feeder(self.src[src], step=10000, s=cnt - n)
                rand_s = random.randint(0, cnt - n)
                fdr2 = doc_feeder(self.src[src], step=n, s=rand_s, e=rand_s + n)
                _first_exception = True
                for doc in itertools.chain(fdr1, fdr2):
                    _id = doc['_id']
                    try:
                        es_doc = self.target.get_from_id(_id)
                    except pyes.exceptions.NotFoundException:
                        if _first_exception:
                            logging.info("")  # blank line before listing missing ids
                            _first_exception = False
                        logging.info("%s not found." % _id)
                        continue
                    for k in doc:
                        if src == 'entrez_homologene' and k == 'taxid':
                            # there are occasional known errors for taxid in homologene data.
                            continue
                        assert es_doc.get(k, None) == doc[k], (_id, k, es_doc.get(k, None), doc[k])
Example #5
    def _build_index_sequential(self, collection, verbose=False, query=None, bulk=True, update=False, allow_upsert=True):

        def rate_control(cnt, t):
            delay = 0
            if t > 90:
                delay = 30
            elif t > 60:
                delay = 10
            if delay:
                time.sleep(delay)

        from biothings.utils.mongo import doc_feeder
        src_docs = doc_feeder(collection, step=self.step, s=self.s, batch_callback=rate_control, query=query)
        if bulk:
            if update:
                # input doc will update existing one
                # if allow_upsert, create new one if not exist
                res = self.update_docs(src_docs, upsert=allow_upsert)
            else:
                # input doc will overwrite existing one
                res = self.index_bulk(src_docs)
            # if len(res[1]) > 0:
            if res[1]:
                raise IndexerException("Error: {} docs failed indexing.".format(len(res[1])))
            return res[0]

        else:
            cnt = 0
            for doc in src_docs:
                self.index(doc)
                cnt += 1
            return cnt
Example #6
def merge_index_worker(col_name, ids, pindexer, batch_num):
    col = create_backend(col_name).target_collection
    idxer = pindexer()
    upd_cnt = 0
    new_cnt = 0
    cur = doc_feeder(col,
                     step=len(ids),
                     inbatch=False,
                     query={'_id': {
                         '$in': ids
                     }})
    docs = [d for d in cur]
    [d.pop("_timestamp", None) for d in docs]
    dids = dict([(d["_id"], d) for d in docs])
    dexistings = dict([(d["_id"], d)
                       for d in idxer.get_docs([k for k in dids.keys()])])
    for _id in dexistings:
        d = dexistings[_id]
        # update in-place
        d.update(dids[_id])
        # mark as processed/updated
        dids.pop(_id)
    # updated docs (those existing in col *and* index)
    upd_cnt = idxer.index_bulk(dexistings.values(), len(dexistings))
    logging.debug("%s documents updated in index" % repr(upd_cnt))
    # new docs (only in col, *not* in index)
    new_cnt = idxer.index_bulk(dids.values(), len(dids))
    logging.debug("%s new documents in index" % repr(new_cnt))
    # need to return one: tuple(cnt,list)
    ret = (upd_cnt[0] + new_cnt[0], upd_cnt[1] + new_cnt[1])
    return ret
Example #7
def merger_worker(col_name, dest_name, ids, mapper, upsert, batch_num):
    try:
        src = mongo.get_src_db()
        tgt = mongo.get_target_db()
        col = src[col_name]
        #if batch_num == 2:
        #    raise ValueError("oula pa bon")
        dest = DocMongoBackend(tgt, tgt[dest_name])
        cur = doc_feeder(col,
                         step=len(ids),
                         inbatch=False,
                         query={'_id': {
                             '$in': ids
                         }})
        mapper.load()
        docs = mapper.process(cur)
        cnt = dest.update(docs, upsert=upsert)
        return cnt
    except Exception as e:
        logger_name = "build_%s_%s_batch_%s" % (dest_name, col_name, batch_num)
        logger = get_logger(logger_name, btconfig.LOG_FOLDER)
        logger.exception(e)
        exc_fn = os.path.join(btconfig.LOG_FOLDER, "%s.pick" % logger_name)
        pickle.dump(e, open(exc_fn, "wb"))
        logger.info("Exception was dumped in pickle file '%s'" % exc_fn)
        raise
Example #8
def update_index(changes, sync_src, sync_target, noconfirm=False):
    # changes['_add'] = changes['delete']
    # changes['_delete'] = changes['add']
    # changes['delete'] = changes['_delete']
    # changes['add'] = changes['_add']
    # del changes['_add']
    # del changes['_delete']

    print("\t{}\trecords will be added.".format(len(changes['add'])))
    print("\t{}\trecords will be deleted.".format(len(changes['delete'])))
    print("\t{}\trecords will be updated.".format(len(changes['update'])))

    print()
    print('\tsync_src:\t{:<45}{}'.format(sync_src.target_collection.name,
                                         sync_src.name))
    print('\tsync_target\t{:<45}{}'.format(sync_target.target_esidxer.ES_INDEX_NAME,
                                           sync_target.name))

    if noconfirm or ask("Continue?") == 'Y':
        t00 = time.time()
        es_idxer = sync_target.target_esidxer

        if len(changes['add']) > 0:
            print("Adding {} new records...".format(len(changes['add'])))
            t0 = time.time()
            _q = {'_id': {'$in': changes['add']}}
            for docs in doc_feeder(sync_src.target_collection, step=1000, inbatch=True, query=_q):
                es_idxer.add_docs(docs)
            print("Done. [{}]".format(timesofar(t0)))

        if len(changes['delete']) > 0:
            print("Deleting {} old records...".format(len(changes['delete'])))
            t0 = time.time()
            es_idxer.delete_docs(changes['delete'])
            print("Done. [{}]".format(timesofar(t0)))

        if len(changes['update']) > 0:
            print("Updating {} existing records...".format(len(changes['update'])))
            t0 = time.time()
            ids = [d['_id'] for d in changes['update']]
            _q = {'_id': {'$in': ids}}
            for docs in doc_feeder(sync_src.target_collection, step=1000, inbatch=True, query=_q):
                es_idxer.add_docs(docs)
            print("Done. [{}]".format(timesofar(t0)))
        print('=' * 20)
        print('Finished. [{}]'.format(timesofar(t00)))
Example #9
File: sync.py Project: SuLab/mygene.info
def mark_timestamp(timestamp):
    #.update({'_id': {'$in': xli1}}, {'$set': {'_timestamp': ts}}, multi=True)
    target = get_target_db()
    #genedoc_col = target.genedoc_mygene_allspecies_current
    genedoc_col = target.genedoc_mygene_xxxxx
    for doc in doc_feeder(genedoc_col):
        genedoc_col.update({'_id': doc['_id']},
                           {'$set': {'_timestamp': timestamp}},
                           manipulate=False, check_keys=False,
                           upsert=False, w=0)
Example #10
def new_index_worker(col_name, ids, pindexer, batch_num):
    col = create_backend(col_name).target_collection
    idxer = pindexer()
    cur = doc_feeder(col,
                     step=len(ids),
                     inbatch=False,
                     query={'_id': {
                         '$in': ids
                     }})
    cnt = idxer.index_bulk(cur)
    return cnt
Example #11
    def validate_src(self,
                     collection,
                     return_false=False,
                     return_none=False,
                     return_true=False,
                     verbose=False,
                     flag_invalid=False,
                     generator=False):
        '''Validate hgvs ids from a src collection.'''

        return_dict = {
            False: return_false,
            True: return_true,
            None: return_none
        }

        # read in the collection from mongodb
        if is_str(collection):
            src = get_src_db()
            _coll = src[collection]
        else:
            _coll = collection
        cursor = doc_feeder(_coll, step=10000)

        out = {}
        print_only = not (return_false or return_none or return_true)
        if not print_only:
            # output dictionary, three keys: 'false','true','none'
            for k in return_dict:
                if return_dict[k]:
                    out[k] = []

        # initialize the count
        cnt_d = {True: 0, False: 0, None: 0}  # cnt_d
        # validate each item in the cursor
        for item in cursor:
            _id = item['_id']
            valid = self.validate_hgvs(_id, verbose=verbose)
            if valid is False and flag_invalid:
                _coll.update({"_id": _id}, {'$set': {"unmatched_ref": "True"}})
            cnt_d[valid] += 1
            if return_dict[valid]:
                out[valid].append(_id)

        # print out counts
        print("\n# of VALID HGVS IDs:\t{0}".format(cnt_d[True]))
        print("# of INVALID HGVS IDs:\t{0}".format(cnt_d[False]))
        print("# of HGVS IDs skipped:\t {0}".format(cnt_d[None]))

        out['summary'] = cnt_d
        return out
Example #12
 def index(self):
     clients = self._get_clients()
     docs = doc_feeder(
         clients.mongo,
         step=len(self.ids),
         inbatch=False,
         query={'_id': {
             '$in': self.ids
         }})
     self.logger.info("%s: %d documents.", self.name, len(self.ids))
     return clients.es.mindex(docs)
Example #13
    def _merge_parallel_ipython(self,
                                collection,
                                geneid_set,
                                step=100000,
                                idmapping_d=None):
        from IPython.parallel import Client, require

        rc = Client()
        dview = rc[:]
        #dview = rc.load_balanced_view()
        dview.block = False
        target_collection = self.target.target_collection
        dview['server'], dview['port'] = target_collection.database.client.address
        dview['database'] = target_collection.database.name
        dview['collection_name'] = target_collection.name

        def partition(lst, n):
            q, r = divmod(len(lst), n)
            indices = [q * i + min(i, r) for i in range(n + 1)]
            return [lst[indices[i]:indices[i + 1]] for i in range(n)]

        @require('pymongo', 'time')
        def worker(doc_li):
            conn = pymongo.MongoClient(server, port)
            target_collection = conn[database][collection_name]
            t0 = time.time()
            for doc in doc_li:
                __id = doc.pop('_id')
                doc.pop('taxid', None)
                target_collection.update({'_id': __id}, {'$set': doc},
                                         manipulate=False,
                                         upsert=False)  # ,safe=True)
            logging.info('Done. [%.1fs]' % (time.time() - t0))

        for doc in doc_feeder(self.src[collection], step=step):
            _id = doc['_id']
            if idmapping_d:
                _id = idmapping_d.get(_id, None) or _id
            for __id in alwayslist(_id):    # there could be cases that idmapping returns multiple entrez_gene ids.
                __id = str(__id)
                if __id in geneid_set:
                    doc['_id'] = __id
                    self.doc_queue.append(doc)

                    if len(self.doc_queue) >= step:
                        #dview.scatter('doc_li', self.doc_queue)
                        #dview.apply_async(worker)
                        dview.map_async(worker,
                                        partition(self.doc_queue, len(rc.ids)))
                        self.doc_queue = []
                        logging.info("!")
Example #14
def do_index_worker(col_name, ids, pindexer, batch_num):
    tgt = mongo.get_target_db()
    col = tgt[col_name]
    idxer = pindexer()
    cur = doc_feeder(col,
                     step=len(ids),
                     inbatch=False,
                     query={'_id': {
                         '$in': ids
                     }})
    cnt = idxer.index_bulk(cur)
    return cnt
Example #15
def inspect_data(backend_provider, ids, mode, pre_mapping, **kwargs):
    col = create_backend(backend_provider).target_collection
    cur = doc_feeder(col,
                     step=len(ids),
                     inbatch=False,
                     query={'_id': {
                         '$in': ids
                     }})
    return btinspect.inspect_docs(cur,
                                  mode=mode,
                                  pre_mapping=pre_mapping,
                                  metadata=False,
                                  **kwargs)
Example #16
 def _merge_sequential(self, collection, geneid_set, step=100000, idmapping_d=None):
     for doc in doc_feeder(self.src[collection], step=step):
         _id = doc['_id']
         if idmapping_d:
             _id = idmapping_d.get(_id, None) or _id
         for __id in alwayslist(_id):    # there could be cases that idmapping returns multiple entrez_gene ids.
             __id = str(__id)
             if __id in geneid_set:
                 doc.pop('_id', None)
                 doc.pop('taxid', None)
                 # target_collection.update({'_id': __id}, {'$set': doc},
                 #                           manipulate=False,
                 #                           upsert=False) #,safe=True)
                 self.target.update(__id, doc)
Example #17
def mark_timestamp(timestamp):
    #.update({'_id': {'$in': xli1}}, {'$set': {'_timestamp': ts}}, multi=True)
    target = get_target_db()
    #genedoc_col = target.genedoc_mygene_allspecies_current
    genedoc_col = target.genedoc_mygene_xxxxx
    for doc in doc_feeder(genedoc_col):
        genedoc_col.update({'_id': doc['_id']},
                           {'$set': {
                               '_timestamp': timestamp
                           }},
                           manipulate=False,
                           check_keys=False,
                           upsert=False,
                           w=0)
Example #18
File: sync.py Project: SuLab/mygene.info
 def backup_timestamp(self, outfile=None, compress=True):
     '''backup "_id" and "_timestamp" fields into a output file.'''
     ts = time.strftime('%Y%m%d')
     outfile = outfile or self._target_col.name + '_tsbk_' + ts + '.txt'
     if compress:
         outfile += '.bz'
         import bz2
     logging.info('Backing up timestamps into "{}"...'.format(outfile))
     t0 = time.time()
     file_handler = bz2.BZ2File if compress else open
     with file_handler(outfile, 'wb') as out_f:
         for doc in doc_feeder(self._target_col, step=100000, fields=['_timestamp']):
             data = '%s\t%s\n' % (doc['_id'], doc['_timestamp'].strftime('%Y%m%d'))
             out_f.write(data.encode())
     logging.info("Done. %s" % timesofar(t0))
     return outfile
Example #19
    def _merge_parallel_ipython(self, collection, geneid_set, step=100000, idmapping_d=None):
        from IPython.parallel import Client, require

        rc = Client()
        dview = rc[:]
        #dview = rc.load_balanced_view()
        dview.block = False
        target_collection = self.target.target_collection
        dview['server'], dview['port'] = target_collection.database.client.address
        dview['database'] = target_collection.database.name
        dview['collection_name'] = target_collection.name

        def partition(lst, n):
            q, r = divmod(len(lst), n)
            indices = [q * i + min(i, r) for i in range(n + 1)]
            return [lst[indices[i]:indices[i + 1]] for i in range(n)]

        @require('pymongo', 'time')
        def worker(doc_li):
            conn = pymongo.MongoClient(server, port)
            target_collection = conn[database][collection_name]
            t0 = time.time()
            for doc in doc_li:
                __id = doc.pop('_id')
                doc.pop('taxid', None)
                target_collection.update({'_id': __id}, {'$set': doc},
                                         manipulate=False,
                                         upsert=False)  # ,safe=True)
            logging.info('Done. [%.1fs]' % (time.time() - t0))

        for doc in doc_feeder(self.src[collection], step=step):
            _id = doc['_id']
            if idmapping_d:
                _id = idmapping_d.get(_id, None) or _id
            for __id in alwayslist(_id):    # there could be cases that idmapping returns multiple entrez_gene ids.
                __id = str(__id)
                if __id in geneid_set:
                    doc['_id'] = __id
                    self.doc_queue.append(doc)

                    if len(self.doc_queue) >= step:
                        #dview.scatter('doc_li', self.doc_queue)
                        #dview.apply_async(worker)
                        dview.map_async(worker, partition(self.doc_queue, len(rc.ids)))
                        self.doc_queue = []
                        logging.info("!")
Example #20
    def validate_src(self, collection, return_false=False,
                     return_none=False, return_true=False, verbose=False, flag_invalid=False, generator=False):
        '''Validate hgvs ids from a src collection.'''

        return_dict = {
            False: return_false,
            True: return_true,
            None: return_none
        }

        # read in the collection from mongodb
        if is_str(collection):
            src = get_src_db()
            _coll = src[collection]
        else:
            _coll = collection
        cursor = doc_feeder(_coll, step=10000)

        out = {}
        print_only = not (return_false or return_none or return_true)
        if not print_only:
            # output dictionary, three keys: 'false','true','none'
            for k in return_dict:
                if return_dict[k]:
                    out[k] = []

        # initialize the count
        cnt_d = {True: 0, False: 0, None: 0}    # cnt_d
        # validate each item in the cursor
        for item in cursor:
            _id = item['_id']
            valid = self.validate_hgvs(_id, verbose=verbose)
            if valid is False and flag_invalid:
                _coll.update({"_id": _id}, {'$set': {"unmatched_ref": "True"}})
            cnt_d[valid] += 1
            if return_dict[valid]:
                out[valid].append(_id)

        # print out counts
        print("\n# of VALID HGVS IDs:\t{0}".format(cnt_d[True]))
        print("# of INVALID HGVS IDs:\t{0}".format(cnt_d[False]))
        print("# of HGVS IDs skipped:\t {0}".format(cnt_d[None]))

        out['summary'] = cnt_d
        return out
Example #21
    def _merge_parallel(self,
                        collection,
                        geneid_set,
                        step=100000,
                        idmapping_d=None):
        from multiprocessing import Process, Queue
        NUMBER_OF_PROCESSES = 8

        input_queue = Queue()
        input_queue.conn_pool = []

        def worker(q, target):
            while True:
                doc = q.get()
                if doc == 'STOP':
                    break
                __id = doc.pop('_id')
                doc.pop('taxid', None)
                target.update(__id, doc)
                # target_collection.update({'_id': __id}, {'$set': doc},
                #                           manipulate=False,
                #                           upsert=False) #,safe=True)

        # Start worker processes
        for i in range(NUMBER_OF_PROCESSES):
            Process(target=worker, args=(input_queue, self.target)).start()

        for doc in doc_feeder(self.src[collection], step=step):
            _id = doc['_id']
            if idmapping_d:
                _id = idmapping_d.get(_id, None) or _id
            for __id in alwayslist(_id):    # there could be cases that idmapping returns multiple entrez_gene ids.
                __id = str(__id)
                if __id in geneid_set:
                    doc['_id'] = __id
                    input_queue.put(doc)

        # Tell child processes to stop
        for i in range(NUMBER_OF_PROCESSES):
            input_queue.put('STOP')
Example #22
 def _merge_sequential(self,
                       collection,
                       geneid_set,
                       step=100000,
                       idmapping_d=None):
     for doc in doc_feeder(self.src[collection], step=step):
         _id = doc['_id']
         if idmapping_d:
             _id = idmapping_d.get(_id, None) or _id
         for __id in alwayslist(_id):    # there could be cases that idmapping returns multiple entrez_gene ids.
             __id = str(__id)
             if __id in geneid_set:
                 doc.pop('_id', None)
                 doc.pop('taxid', None)
                 # target_collection.update({'_id': __id}, {'$set': doc},
                 #                           manipulate=False,
                 #                           upsert=False) #,safe=True)
                 self.target.update(__id, doc)
Example #23
    def post_merge(self, source_names, batch_size, job_manager):
        # get the lineage mapper
        mapper = LineageMapper(name="lineage")
        # load the cache explicitly (it's not loaded automatically,
        # as this is not part of an upload process)
        mapper.load()

        # create a storage to save docs back to merged collection
        db = get_target_db()
        col_name = self.target_backend.target_collection.name
        storage = UpsertStorage(db,col_name)

        for docs in doc_feeder(self.target_backend.target_collection, step=batch_size, inbatch=True):
            docs = mapper.process(docs)
            storage.process(docs,batch_size)

        # add indices used to create metadata stats
        keys = ["rank","taxid"]
        self.logger.info("Creating indices on %s" % repr(keys))
        for k in keys:
            self.target_backend.target_collection.ensure_index(k)
Example #24
 def backup_timestamp(self, outfile=None, compress=True):
     '''backup "_id" and "_timestamp" fields into a output file.'''
     ts = time.strftime('%Y%m%d')
     outfile = outfile or self._target_col.name + '_tsbk_' + ts + '.txt'
     if compress:
         outfile += '.bz'
         import bz2
     logging.info('Backing up timestamps into "{}"...'.format(outfile))
     t0 = time.time()
     file_handler = bz2.BZ2File if compress else open
     with file_handler(outfile, 'wb') as out_f:
         for doc in doc_feeder(self._target_col,
                               step=100000,
                               fields=['_timestamp']):
             data = '%s\t%s\n' % (doc['_id'],
                                  doc['_timestamp'].strftime('%Y%m%d'))
             out_f.write(data.encode())
     logging.info("Done. %s" % timesofar(t0))
     return outfile
Example #25
    def merge(self):
        clients = self._get_clients()

        upd_cnt, docs_old = 0, {}
        new_cnt, docs_new = 0, {}

        # populate docs_old
        for doc in clients.es.mget(self.ids):
            docs_old[doc['_id']] = doc

        # populate docs_new
        for doc in doc_feeder(
                clients.mongo,
                step=len(self.ids),
                inbatch=False,
                query={'_id': {
                    '$in': self.ids
                }}):
            docs_new[doc['_id']] = doc
            doc.pop("_timestamp", None)

        # merge existing ids
        for key in list(docs_new):
            if key in docs_old:
                docs_old[key].update(docs_new[key])
                del docs_new[key]

        # updated docs (those existing in col *and* index)
        upd_cnt = clients.es.mindex(docs_old.values())
        self.logger.info("%s: %d documents updated.", self.name, upd_cnt)

        # new docs (only in col, *not* in index)
        new_cnt = clients.es.mindex(docs_new.values())
        self.logger.info("%s: %d new documents.", self.name, new_cnt)

        return upd_cnt + new_cnt
Example #26
    def _merge_parallel(self, collection, geneid_set, step=100000, idmapping_d=None):
        from multiprocessing import Process, Queue
        NUMBER_OF_PROCESSES = 8

        input_queue = Queue()
        input_queue.conn_pool = []

        def worker(q, target):
            while True:
                doc = q.get()
                if doc == 'STOP':
                    break
                __id = doc.pop('_id')
                doc.pop('taxid', None)
                target.update(__id, doc)
                # target_collection.update({'_id': __id}, {'$set': doc},
                #                           manipulate=False,
                #                           upsert=False) #,safe=True)

        # Start worker processes
        for i in range(NUMBER_OF_PROCESSES):
            Process(target=worker, args=(input_queue, self.target)).start()

        for doc in doc_feeder(self.src[collection], step=step):
            _id = doc['_id']
            if idmapping_d:
                _id = idmapping_d.get(_id, None) or _id
            for __id in alwayslist(_id):    # there could be cases that idmapping returns multiple entrez_gene ids.
                __id = str(__id)
                if __id in geneid_set:
                    doc['_id'] = __id
                    input_queue.put(doc)

        # Tell child processes to stop
        for i in range(NUMBER_OF_PROCESSES):
            input_queue.put('STOP')
Example #27
def sync_es_coldhot_jsondiff_worker(diff_file,
                                    es_config,
                                    new_db_col_names,
                                    batch_size,
                                    cnt,
                                    force=False,
                                    selfcontained=False,
                                    metadata={},
                                    debug=False):
    res = {"added": 0, "updated": 0, "deleted": 0, "skipped": 0}
    # check if the diff file was already synced
    synced_file = "%s.synced" % diff_file
    if os.path.exists(synced_file):
        logging.info("Diff file '%s' already synced, skip it" %
                     os.path.basename(diff_file))
        diff = loadobj(synced_file)
        res["skipped"] += len(diff["add"]) + len(diff["delete"]) + len(
            diff["update"])
        return res
    eskwargs = {}
    # pass optional ES Indexer args
    if hasattr(btconfig, "ES_TIMEOUT"):
        eskwargs["timeout"] = btconfig.ES_TIMEOUT
    if hasattr(btconfig, "ES_MAX_RETRY"):
        eskwargs["max_retries"] = btconfig.ES_MAX_RETRY
    if hasattr(btconfig, "ES_RETRY"):
        eskwargs["retry_on_timeout"] = btconfig.ES_RETRY
    logging.debug("Create ES backend with args: (%s,%s)" %
                  (es_config, eskwargs))
    bckend = create_backend(es_config, **eskwargs)
    indexer = bckend.target_esidxer
    diff = loadobj(diff_file)

    # add: diff between hot collections showed we have new documents but it's
    # possible some of those docs already exist in premerge/cold collection.
    # if so, they should be treated as dict.update() where the hot document content
    # has precedence over the cold content for fields in common
    if selfcontained:
        # diff["add"] contains all documents, no mongo needed
        cur = diff["add"]
    else:
        new = create_backend(new_db_col_names)  # mongo collection to sync from
        assert new.target_collection.name == diff["source"], \
            "Source is different in diff file '%s': %s" % (diff_file, diff["source"])
        cur = doc_feeder(new.target_collection,
                         step=batch_size,
                         inbatch=False,
                         query={'_id': {
                             '$in': diff["add"]
                         }})
    for docs in iter_n(cur, batch_size):
        # remove potential existing _timestamp from document
        # (not allowed within an ES document (_source))
        [d.pop("_timestamp", None) for d in docs]
        # check which docs already exist in existing index (meaning they exist in cold collection)
        dids = dict([(d["_id"], d) for d in docs])
        dexistings = dict([
            (d["_id"], d) for d in indexer.get_docs([k for k in dids.keys()])
        ])
        logging.debug("From current batch, %d already exist" % len(dexistings))
        # remove existing docs from "add" so the rest of the dict will be treated
        # as "real" added documents while update existing ones with new content
        toremove = []
        for _id, d in dexistings.items():
            # update in-place
            if d == dids[d["_id"]]:
                logging.debug("%s was already added, skip it" % d["_id"])
                toremove.append(d["_id"])
                res["skipped"] += 1
            else:
                newd = copy.deepcopy(d)
                d.update(dids[d["_id"]])
                if d == newd:
                    logging.debug("%s was already updated, skip it" % d["_id"])
                    toremove.append(d["_id"])
                    res["skipped"] += 1
            dids.pop(d["_id"])
        for _id in toremove:
            dexistings.pop(_id)
        logging.info("Syncing 'add' documents (%s in total) from cold/hot merge: " % len(docs)
                     + "%d documents will be updated as they already exist in the index, " % len(dexistings)
                     + "%d documents will be added (%d skipped as already processed)" % (len(dids), len(toremove)))
        # treat real "added" documents
        # Note: no need to check for "already exists" errors, as we already checked that before
        # in order to know what to do
        try:
            res["added"] += indexer.index_bulk(dids.values(),
                                               batch_size,
                                               action="create")[0]
        except BulkIndexError:
            logging.error("Error while adding documents %s" %
                          [k for k in dids.keys()])
        # update already existing docs in cold collection
        try:
            res["updated"] += indexer.index_bulk(dexistings.values(),
                                                 batch_size)[0]
        except BulkIndexError as e:
            logging.error(
                "Error while updating (via new hot detected docs) documents: %s"
                % e)

    # update: get doc from indexer and apply diff
    # note: it's the same process as for non-coldhot
    sync_es_for_update(diff_file, indexer, diff["update"], batch_size, res,
                       debug)

    # delete: remove from "old"
    for ids in iter_n(diff["delete"], batch_size):
        del_skip = indexer.delete_docs(ids)
        res["deleted"] += del_skip[0]
        res["skipped"] += del_skip[1]

    logging.info("Done applying diff from file '%s': %s" % (diff_file, res))
    # mark as synced
    os.rename(diff_file, synced_file)
    return res
Example #28
def sync_es_jsondiff_worker(diff_file,
                            es_config,
                            new_db_col_names,
                            batch_size,
                            cnt,
                            force=False,
                            selfcontained=False,
                            metadata={},
                            debug=False):
    """Worker to sync data between a new mongo collection and an elasticsearch index"""
    res = {"added": 0, "updated": 0, "deleted": 0, "skipped": 0}
    # check if the diff file was already synced
    synced_file = "%s.synced" % diff_file
    if os.path.exists(synced_file):
        logging.info("Diff file '%s' already synced, skip it" %
                     os.path.basename(diff_file))
        diff = loadobj(synced_file)
        res["skipped"] += len(diff["add"]) + len(diff["delete"]) + len(
            diff["update"])
        return res
    eskwargs = {}
    # pass optional ES Indexer args
    if hasattr(btconfig, "ES_TIMEOUT"):
        eskwargs["timeout"] = btconfig.ES_TIMEOUT
    if hasattr(btconfig, "ES_MAX_RETRY"):
        eskwargs["max_retries"] = btconfig.ES_MAX_RETRY
    if hasattr(btconfig, "ES_RETRY"):
        eskwargs["retry_on_timeout"] = btconfig.ES_RETRY
    logging.debug("Create ES backend with args: (%s,%s)" %
                  (es_config, eskwargs))
    bckend = create_backend(es_config, **eskwargs)
    indexer = bckend.target_esidxer
    diff = loadobj(diff_file)
    errors = []
    # add: get ids from "new"
    if selfcontained:
        # diff["add"] contains all documents, no mongo needed
        cur = diff["add"]
    else:
        new = create_backend(new_db_col_names)  # mongo collection to sync from
        assert new.target_collection.name == diff["source"], \
            "Source is different in diff file '%s': %s" % (diff_file, diff["source"])
        cur = doc_feeder(new.target_collection,
                         step=batch_size,
                         inbatch=False,
                         query={'_id': {
                             '$in': diff["add"]
                         }})
    for docs in iter_n(cur, batch_size):
        # remove potential existing _timestamp from document
        # (not allowed within an ES document (_source))
        [d.pop("_timestamp", None) for d in docs]
        try:
            res["added"] += indexer.index_bulk(docs,
                                               batch_size,
                                               action="create")[0]
        except BulkIndexError:
            for doc in docs:
                _id = doc.pop("_id")
                try:
                    # force action=create to spot docs already added
                    indexer.index(doc, _id, action="create")
                    res["added"] += 1
                except ConflictError:
                    # already added
                    logging.warning("_id '%s' already added" % _id)
                    res["skipped"] += 1
                    continue
                except Exception as e:
                    errors.append({"_id": _id, "file": diff_file, "error": e})
                    import pickle
                    pickle.dump(errors, open("errors", "wb"))
                    raise
        except Exception as e:
            if debug:
                logging.error(
                    "From diff file '%s', following IDs couldn't be synced because: %s\n%s"
                    % (diff_file, e, [d.get("_id") for d in docs]))
                pickfile = "batch_%s_%s.pickle" % (cnt,
                                                   os.path.basename(diff_file))
                logging.error("Documents pickled in '%s'" % pickfile)
                pickle.dump(docs, open(pickfile, "wb"))
            raise

    # update: get doc from indexer and apply diff
    sync_es_for_update(diff_file, indexer, diff["update"], batch_size, res,
                       debug)

    # delete: remove from "old"
    for ids in iter_n(diff["delete"], batch_size):
        del_skip = indexer.delete_docs(ids)
        res["deleted"] += del_skip[0]
        res["skipped"] += del_skip[1]

    logging.info("Done applying diff from file '%s': %s" % (diff_file, res))
    # mark as synced
    os.rename(diff_file, synced_file)
    return res
Example #29
def sync_mongo_jsondiff_worker(diff_file,
                               old_db_col_names,
                               new_db_col_names,
                               batch_size,
                               cnt,
                               force=False,
                               selfcontained=False,
                               metadata={},
                               debug=False):
    """Worker to sync data between a new and an old mongo collection"""
    # check if the diff file was already synced
    res = {"added": 0, "updated": 0, "deleted": 0, "skipped": 0}
    synced_file = "%s.synced" % diff_file
    if os.path.exists(synced_file):
        logging.info("Diff file '%s' already synced, skip it" %
                     os.path.basename(diff_file))
        diff = loadobj(synced_file)
        res["skipped"] += len(diff["add"]) + len(diff["delete"]) + len(
            diff["update"])
        return res
    new = create_backend(new_db_col_names)
    old = create_backend(old_db_col_names)
    storage = UpsertStorage(get_target_db(), old.target_collection.name,
                            logging)
    diff = loadobj(diff_file)
    assert new.target_collection.name == diff["source"], \
        "Source is different in diff file '%s': %s" % (diff_file, diff["source"])

    # add: get ids from "new"
    if selfcontained:
        # diff["add"] contains all documents, not mongo needed
        for docs in iter_n(diff["add"], batch_size):
            res["added"] += storage.process((d for d in docs), batch_size)
    else:
        cur = doc_feeder(new.target_collection,
                         step=batch_size,
                         inbatch=False,
                         query={'_id': {
                             '$in': diff["add"]
                         }})
        for docs in iter_n(cur, batch_size):
            # use generator otherwise process/doc_iterator will require a dict (that's bad...)
            res["added"] += storage.process((d for d in docs), batch_size)

    # update: get doc from "old" and apply diff
    batch = []
    for patch_info in diff["update"]:
        doc = old.get_from_id(patch_info["_id"])
        try:
            doc = jsonpatch.apply_patch(doc, patch_info["patch"])
            batch.append(doc)
        except jsonpatch.JsonPatchConflict:
            # assuming already applied
            res["skipped"] += 1
            continue
        if len(batch) >= batch_size:
            res["updated"] += storage.process((d for d in batch), batch_size)
            batch = []
    if batch:
        res["updated"] += storage.process((d for d in batch), batch_size)

    # delete: remove from "old"
    for ids in iter_n(diff["delete"], batch_size):
        res["deleted"] += old.remove_from_ids(ids)

    # we potentially modified the "old" collection so invalidate cache just to make sure
    invalidate_cache(old.target_collection.name, "target")
    logging.info("Done applying diff from file '%s': %s" % (diff_file, res))
    # mark as synced
    os.rename(diff_file, synced_file)
    return res
Example #30
 def get_all(self, batch_size=100000):
     for doc_ids in doc_feeder(self.col, step=batch_size, inbatch=True):
         for d in doc_ids:
             yield d
Example #31
    def make_genedoc_root(self):
        if not self._entrez_geneid_d:
            self._load_entrez_geneid_d()

        if 'ensembl_gene' in self._build_config['gene_root']:
            self._load_ensembl2entrez_li()
            ensembl2entrez = self._idmapping_d_cache['ensembl_gene']

        if "species" in self._build_config:
            _query = {'taxid': {'$in': self._build_config['species']}}
        elif "species_to_exclude" in self._build_config:
            _query = {'taxid': {'$nin': self._build_config['species_to_exclude']}}
        else:
            _query = None

        geneid_set = []
        species_set = set()
        if "entrez_gene" in self._build_config['gene_root']:
            for doc_li in doc_feeder(self.src['entrez_gene'], inbatch=True, step=self.step, query=_query):
                #target_collection.insert(doc_li, manipulate=False, check_keys=False)
                self.target.insert(doc_li)
                geneid_set.extend([doc['_id'] for doc in doc_li])
                species_set |= set([doc['taxid'] for doc in doc_li])
            cnt_total_entrez_genes = len(geneid_set)
            cnt_total_species = len(species_set)
            logging.info('# of entrez Gene IDs in total: %d' % cnt_total_entrez_genes)
            logging.info('# of species in total: %d' % cnt_total_species)

        if "ensembl_gene" in self._build_config['gene_root']:
            cnt_ensembl_only_genes = 0
            cnt_total_ensembl_genes = 0
            for doc_li in doc_feeder(self.src['ensembl_gene'], inbatch=True, step=self.step, query=_query):
                _doc_li = []
                for _doc in doc_li:
                    cnt_total_ensembl_genes += 1
                    ensembl_id = _doc['_id']
                    entrez_gene = ensembl2entrez.get(ensembl_id, None)
                    if entrez_gene is None:
                        #this is an Ensembl only gene
                        _doc_li.append(_doc)
                        cnt_ensembl_only_genes += 1
                        geneid_set.append(_doc['_id'])
                if _doc_li:
                    #target_collection.insert(_doc_li, manipulate=False, check_keys=False)
                    self.target.insert(_doc_li)
            cnt_matching_ensembl_genes = cnt_total_ensembl_genes - cnt_ensembl_only_genes
            logging.info('# of ensembl Gene IDs in total: %d' % cnt_total_ensembl_genes)
            logging.info('# of ensembl Gene IDs match entrez Gene IDs: %d' % cnt_matching_ensembl_genes)
            logging.info('# of ensembl Gene IDs DO NOT match entrez Gene IDs: %d' % cnt_ensembl_only_genes)

            geneid_set = set(geneid_set)
            logging.info('# of total Root Gene IDs: %d' % len(geneid_set))
            _stats = {'total_entrez_genes': cnt_total_entrez_genes,
                      'total_species': cnt_total_species,
                      'total_ensembl_genes': cnt_total_ensembl_genes,
                      'total_ensembl_genes_mapped_to_entrez': cnt_matching_ensembl_genes,
                      'total_ensembl_only_genes': cnt_ensembl_only_genes,
                      'total_genes': len(geneid_set)}
            self._stats = _stats
            self._src_version = self.get_src_version()
            self.log_src_build({'stats': _stats, 'src_version': self._src_version})
            return geneid_set
Example #32
def diff_collections2(b1, b2, result_dir, use_parallel=True, step=10000):
    '''
    b2 is the new collection, b1 is the old collection
    '''
    if use_parallel:
        import multiprocessing
        from functools import partial
    DATA_FOLDER = result_dir
    data_new = doc_feeder(b2.target_collection,
                          step=step,
                          inbatch=True,
                          fields={'_id': 1})
    data_old = doc_feeder(b1.target_collection,
                          step=step,
                          inbatch=True,
                          fields={'_id': 1})
    cnt = 0
    cnt_update = 0
    cnt_add = 0
    cnt_delete = 0
    _timestamp = get_timestamp()
    if not os.path.exists(DATA_FOLDER):
        os.mkdir(DATA_FOLDER)
    for batch in data_new:
        cnt += 1
        id_list_new = [doc['_id'] for doc in batch]
        ids_common = [
            doc['_id']
            for doc in b1.target_collection.find({'_id': {
                '$in': id_list_new
            }}, {'_id': 1})
        ]
        id_in_new = list(set(id_list_new) - set(ids_common))
        _updates = []
        if len(ids_common) > 0:
            if use_parallel:
                # ensure step is at least 1 when the batch is smaller than the CPU count
                step = max(1, len(ids_common) // multiprocessing.cpu_count())
                task_list = [
                    ids_common[i:i + step]
                    for i in range(0, len(ids_common), step)
                ]
                pool = multiprocessing.Pool()
                partial_worker = partial(_diff_parallel_worker,
                                         b1.target_collection.name,
                                         b2.target_collection.name)
                results = pool.map(partial_worker, task_list)
                pool.close()
                pool.join()
                for result in results:
                    _updates += result
            else:
                _updates = _diff_doc_inner_worker2(b1, b2, list(ids_common))
        file_name = DATA_FOLDER + '/' + str(cnt) + '.pyobj'
        _result = {
            'add': id_in_new,
            'update': _updates,
            'delete': [],
            'source': b2.target_collection.name,
            'timestamp': _timestamp
        }
        if len(_updates) != 0 or len(id_in_new) != 0:
            dump(_result, file_name)
            print("(Updated: {}, Added: {})".format(len(_updates),
                                                    len(id_in_new)),
                  end='')
            cnt_update += len(_updates)
            cnt_add += len(id_in_new)
    print(
        "Finished calculating diff for the new collection. Total number of docs updated: {}, added: {}"
        .format(cnt_update, cnt_add))
    print("=" * 100)
    for _batch in data_old:
        cnt += 1
        id_list_old = [_doc['_id'] for _doc in _batch]
        ids_common = [
            doc['_id']
            for doc in b2.target_collection.find({'_id': {
                '$in': id_list_old
            }}, {'_id': 1})
        ]
        id_in_old = list(set(id_list_old) - set(ids_common))
        _result = {
            'delete': id_in_old,
            'add': [],
            'update': [],
            'source': b2.target_collection.name,
            'timestamp': _timestamp
        }
        file_name = DATA_FOLDER + '/' + str(cnt) + '.pyobj'
        if len(id_in_old) != 0:
            dump(_result, file_name)
            print("(Deleted: {})".format(len(id_in_old)), end='')
            cnt_delete += len(id_in_old)
    print(
        "Finished calculating diff for the old collection. Total number of docs deleted: {}"
        .format(cnt_delete))
    print("=" * 100)
    print("Summary: (Updated: {}, Added: {}, Deleted: {})".format(
        cnt_update, cnt_add, cnt_delete))
Example #33
def sync_es_jsondiff_worker(diff_file,
                            es_config,
                            new_db_col_names,
                            batch_size,
                            cnt,
                            force=False,
                            selfcontained=False,
                            metadata={}):
    """Worker to sync data between a new mongo collection and an elasticsearch index"""
    new = create_backend(new_db_col_names)  # mongo collection to sync from
    indexer = create_backend(es_config).target_esidxer
    diff = loadobj(diff_file)
    res = {"added": 0, "updated": 0, "deleted": 0, "skipped": 0}
    # check if diff files was already synced
    if not force and diff.get("synced", {}).get("es") == True:
        logging.info("Diff file '%s' already synced, skip it" % diff_file)
        res["skipped"] += len(diff["add"]) + len(diff["delete"]) + len(
            diff["update"])
        return res
    assert new.target_collection.name == diff["source"], \
        "Source is different in diff file '%s': %s" % (diff_file, diff["source"])

    errors = []
    # add: get ids from "new"
    if selfcontained:
        # diff["add"] contains all documents, no mongo needed
        cur = diff["add"]
    else:
        cur = doc_feeder(new.target_collection,
                         step=batch_size,
                         inbatch=False,
                         query={'_id': {
                             '$in': diff["add"]
                         }})
    for docs in iter_n(cur, batch_size):
        try:
            res["added"] += indexer.index_bulk(docs,
                                               batch_size,
                                               action="create")[0]
        except BulkIndexError:
            for doc in docs:
                try:
                    # force action=create to spot docs already added
                    indexer.index(doc, doc["_id"], action="create")
                    res["added"] += 1
                except ConflictError:
                    # already added
                    res["skipped"] += 1
                    continue
                except Exception as e:
                    errors.append({
                        "_id": doc["_id"],
                        "file": diff_file,
                        "error": e
                    })
                    import pickle
                    pickle.dump(errors, open("errors", "wb"))
                    raise

    # update: get doc from indexer and apply diff
    batch = []
    ids = [p["_id"] for p in diff["update"]]
    for i, doc in enumerate(indexer.get_docs(ids)):
        try:
            patch_info = diff["update"][i]  # same order as what's returned by get_docs()...
            assert patch_info["_id"] == doc["_id"]  # ... but just make sure
            newdoc = jsonpatch.apply_patch(doc, patch_info["patch"])
            if newdoc == doc:
                # already applied
                res["skipped"] += 1
                continue
            batch.append(newdoc)
        except jsonpatch.JsonPatchConflict:
            # assuming already applied
            res["skipped"] += 1
            continue
        if len(batch) >= batch_size:
            res["updated"] += indexer.index_bulk(batch, batch_size)[0]
            batch = []
    if batch:
        res["updated"] += indexer.index_bulk(batch, batch_size)[0]

    # delete: remove from "old"
    for ids in iter_n(diff["delete"], batch_size):
        del_skip = indexer.delete_docs(ids)
        res["deleted"] += del_skip[0]
        res["skipped"] += del_skip[1]

    logging.info("Done applying diff from file '%s': %s" % (diff_file, res))
    diff.setdefault("synced", {}).setdefault("es", True)
    dump(diff, diff_file)
    return res
Example #34
    def make_genedoc_root(self):
        if not self._entrez_geneid_d:
            self._load_entrez_geneid_d()

        if 'ensembl_gene' in self._build_config['gene_root']:
            self._load_ensembl2entrez_li()
            ensembl2entrez = self._idmapping_d_cache['ensembl_gene']

        if "species" in self._build_config:
            _query = {'taxid': {'$in': self._build_config['species']}}
        elif "species_to_exclude" in self._build_config:
            _query = {
                'taxid': {
                    '$nin': self._build_config['species_to_exclude']
                }
            }
        else:
            _query = None

        geneid_set = []
        species_set = set()
        if "entrez_gene" in self._build_config['gene_root']:
            for doc_li in doc_feeder(self.src['entrez_gene'],
                                     inbatch=True,
                                     step=self.step,
                                     query=_query):
                #target_collection.insert(doc_li, manipulate=False, check_keys=False)
                self.target.insert(doc_li)
                geneid_set.extend([doc['_id'] for doc in doc_li])
                species_set |= set([doc['taxid'] for doc in doc_li])
            cnt_total_entrez_genes = len(geneid_set)
            cnt_total_species = len(species_set)
            logging.info('# of entrez Gene IDs in total: %d' %
                         cnt_total_entrez_genes)
            logging.info('# of species in total: %d' % cnt_total_species)

        if "ensembl_gene" in self._build_config['gene_root']:
            cnt_ensembl_only_genes = 0
            cnt_total_ensembl_genes = 0
            for doc_li in doc_feeder(self.src['ensembl_gene'],
                                     inbatch=True,
                                     step=self.step,
                                     query=_query):
                _doc_li = []
                for _doc in doc_li:
                    cnt_total_ensembl_genes += 1
                    ensembl_id = _doc['_id']
                    entrez_gene = ensembl2entrez.get(ensembl_id, None)
                    if entrez_gene is None:
                        #this is an Ensembl only gene
                        _doc_li.append(_doc)
                        cnt_ensembl_only_genes += 1
                        geneid_set.append(_doc['_id'])
                if _doc_li:
                    #target_collection.insert(_doc_li, manipulate=False, check_keys=False)
                    self.target.insert(_doc_li)
            cnt_matching_ensembl_genes = cnt_total_ensembl_genes - cnt_ensembl_only_genes
            logging.info('# of ensembl Gene IDs in total: %d' %
                         cnt_total_ensembl_genes)
            logging.info('# of ensembl Gene IDs match entrez Gene IDs: %d' %
                         cnt_matching_ensembl_genes)
            logging.info(
                '# of ensembl Gene IDs DO NOT match entrez Gene IDs: %d' %
                cnt_ensembl_only_genes)

            geneid_set = set(geneid_set)
            logging.info('# of total Root Gene IDs: %d' % len(geneid_set))
            _stats = {
                'total_entrez_genes': cnt_total_entrez_genes,
                'total_species': cnt_total_species,
                'total_ensembl_genes': cnt_total_ensembl_genes,
                'total_ensembl_genes_mapped_to_entrez':
                cnt_matching_ensembl_genes,
                'total_ensembl_only_genes': cnt_ensembl_only_genes,
                'total_genes': len(geneid_set)
            }
            self._stats = _stats
            self._src_version = self.get_src_version()
            self.log_src_build({
                'stats': _stats,
                'src_version': self._src_version
            })
            return geneid_set