Example #1
    def validate(self, build_config='mygene_allspecies', n=10):
        '''Validate merged genedoc, currently for ES backend only.'''
        import random
        import itertools
        import pyes
        from pprint import pprint
        from utils.mongo import doc_feeder

        self.load_build_config(build_config)
        last_build = self._build_config['build'][-1]
        print "Last build record:"
        pprint(last_build)
        #assert last_build['target_backend'] == 'es', '"validate" currently works for "es" backend only'

        target_name = last_build['target']
        self.validate_src_collections()
        self.prepare_target(target_name=target_name)
        print "Validating..."
        target_cnt = self.target.count()
        stats_cnt = last_build['stats']['total_genes']
        if target_cnt == stats_cnt:
            print "OK [total count={}]".format(target_cnt)
        else:
            print "Warning: total count of gene documents does not match [{}, should be {}]".format(target_cnt, stats_cnt)

        if n > 0:
            for src in self._build_config['sources']:
                print "\nSrc:", src
                # if 'id_type' in self.src_master[src] and self.src_master[src]['id_type'] != 'entrez_gene':
                #     print "skipped."
                #     continue
                cnt = self.src[src].count()
                fdr1 = doc_feeder(self.src[src], step=10000, s=cnt-n)
                rand_s = random.randint(0, cnt-n)
                fdr2 = doc_feeder(self.src[src], step=n, s=rand_s, e=rand_s+n)
                _first_exception = True
                for doc in itertools.chain(fdr1, fdr2):
                    _id = doc['_id']
                    try:
                        es_doc = self.target.get_from_id(_id)
                    except pyes.exceptions.NotFoundException:
                        if _first_exception:
                            print()
                            _first_exception = False
                        print(_id, 'not found.')
                        continue
                    for k in doc:
                        if src == 'entrez_homologene' and k == 'taxid':
                            # there are occasional known errors for taxid in the homologene data.
                            continue
                        assert es_doc.get(k, None) == doc[k], (_id, k, es_doc.get(k, None), doc[k])
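
All of these examples revolve around doc_feeder, whose implementation is never shown. As a reading aid, here is a minimal sketch of the interface the calls in this listing imply (step/s/e offsets, inbatch, query, and a batch_callback that receives a running count and the elapsed batch time); the real utils.mongo.doc_feeder will certainly differ in detail.

import time

def doc_feeder(collection, step=1000, s=0, e=None, inbatch=False,
               query=None, batch_callback=None, verbose=True):
    '''Sketch only: yield docs (or whole batches, if inbatch=True) from a
    pymongo collection, `step` docs at a time, from offset `s` up to `e`.
    `verbose` is accepted for parity with Examples #4/#14 but unused here.'''
    total = collection.count_documents(query or {})
    end = min(e, total) if e is not None else total
    offset = s
    while offset < end:
        t0 = time.time()
        batch = list(collection.find(query or {})
                     .skip(offset).limit(min(step, end - offset)))
        if not batch:
            break
        if inbatch:
            yield batch
        else:
            for doc in batch:
                yield doc
        offset += len(batch)
        if batch_callback:
            # e.g. the rate_control throttler used in Examples #2/#3/#10
            batch_callback(offset, time.time() - t0)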
Example #2
    def _build_index_sequential(self, collection, verbose=False, query=None, bulk=True, update=False, allow_upsert=True):
        import time
        from utils.mongo import doc_feeder

        def rate_control(cnt, t):
            delay = 0
            if t > 90:
                delay = 30
            elif t > 60:
                delay = 10
            if delay:
                print("\tPausing for {}s...".format(delay), end='')
                time.sleep(delay)
                print("done.")

        src_docs = doc_feeder(collection, step=self.step, s=self.s, batch_callback=rate_control, query=query)
        if bulk:
            if update:
                # input doc will update existing one
                # if allow_upsert, create new one if not exist
                res = self.update_docs(src_docs, upsert=allow_upsert)
            else:
                # input doc will overwrite existing one
                res = self.index_bulk(src_docs)
            if len(res[1]) > 0:
                print("Error: {} docs failed indexing.".format(len(res[1])))
            return res[0]
        else:
            cnt = 0
            for doc in src_docs:
                self.index(doc)
                cnt += 1
                if verbose:
                    print(cnt, ':', doc['_id'])
            return cnt
Example #3
    def _build_index_sequential(self, collection, verbose=False, query=None, bulk=True):
        import time
        from utils.mongo import doc_feeder

        def rate_control(cnt, t):
            delay = 0
            if t > 90:
                delay = 30
            elif t > 60:
                delay = 10
            if delay:
                print("\tPausing for {}s...".format(delay), end='')
                time.sleep(delay)
                print("done.")

        src_docs = doc_feeder(collection, step=self.step, s=self.s, batch_callback=rate_control, query=query)
        if bulk:
            res = self.index_bulk(src_docs)
            if len(res[1]) > 0:
                print("Error: {} docs failed indexing.".format(len(res[1])))
            return res[0]
        else:
            cnt = 0
            for doc in src_docs:
                self.index(doc)
                cnt += 1
                if verbose:
                    print(cnt, ':', doc['_id'])
            return cnt
Example #4
def do_index(doc_li,
             index_name,
             doc_type,
             step=1000,
             update=True,
             verbose=True):
    for doc_batch in doc_feeder(doc_li, step=step, verbose=verbose):
        _index_doc_batch(doc_batch, index_name, doc_type, update=update)
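
_index_doc_batch is called in several examples but never shown. A plausible sketch using the elasticsearch-py bulk helper follows; the action format (doc_as_upsert, the client argument) is an assumption, not the original implementation.

from elasticsearch import Elasticsearch
from elasticsearch.helpers import bulk

def _index_doc_batch(doc_batch, index_name, doc_type, update=True,
                     client=None):
    # Sketch: push one batch of docs into Elasticsearch via the bulk API.
    # doc_type is kept for parity with the callers above; mapping types
    # were removed in Elasticsearch 7, so newer clusters ignore it.
    client = client or Elasticsearch()
    actions = []
    for doc in doc_batch:
        action = {'_index': index_name, '_id': doc['_id']}
        if update:
            # partial update, creating the doc if it does not exist yet
            action.update({'_op_type': 'update',
                           'doc': doc, 'doc_as_upsert': True})
        else:
            # full overwrite
            action.update({'_op_type': 'index', '_source': doc})
        actions.append(action)
    return bulk(client, actions)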
Example #5
def update_index(changes, sync_src, sync_target, noconfirm=False):
    # changes['_add'] = changes['delete']
    # changes['_delete'] = changes['add']
    # changes['delete'] = changes['_delete']
    # changes['add'] = changes['_add']
    # del changes['_add']
    # del changes['_delete']

    print "\t{}\trecords will be added.".format(len(changes['add']))
    print "\t{}\trecords will be deleted.".format(len(changes['delete']))
    print "\t{}\trecords will be updated.".format(len(changes['update']))

    print
    print '\tsync_src:\t{:<45}{}'.format(sync_src.target_collection.name,
                                             sync_src.name)
    print '\tsync_target\t{:<45}{}'.format(sync_target.target_esidxer.ES_INDEX_NAME,
                                               sync_target.name)

    if noconfirm or ask("Continue?") == 'Y':
        t00 = time.time()
        es_idxer = sync_target.target_esidxer

        if len(changes['add']) > 0:
            print "Adding {} new records...".format(len(changes['add']))
            t0 = time.time()
            _q = {'_id': {'$in': changes['add']}}
            for docs in doc_feeder(sync_src.target_collection, step=1000, inbatch=True, query=_q):
                es_idxer.add_docs(docs)
            print "Done. [{}]".format(timesofar(t0))

        if len(changes['delete']) > 0:
            print "Deleting {} old records...".format(len(changes['delete']))
            t0 = time.time()
            es_idxer.delete_docs(changes['delete'])
            print "Done. [{}]".format(timesofar(t0))

        if len(changes['update']) > 0:
            print "Updating {} existing records...".format(len(changes['update']))
            t0 = time.time()
            ids = [d['_id'] for d in changes['update']]
            _q = {'_id': {'$in': ids}}
            for docs in doc_feeder(sync_src.target_collection, step=1000, inbatch=True, query=_q):
                es_idxer.add_docs(docs)
            print "Done. [{}]".format(timesofar(t0))
        print '='*20
        print 'Finished. [{}]'.format(timesofar(t00))
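
Example #5 also leans on two helpers that never appear in these excerpts, timesofar and ask. Minimal stand-ins consistent with how they are called might look like this; the real utility functions almost certainly format and prompt differently.

import time

def timesofar(t0):
    # assumed helper: human-readable elapsed time since t0
    # (the real version likely formats minutes/hours too)
    return '%.1fs' % (time.time() - t0)

def ask(prompt):
    # assumed helper: prompt on stdin, normalize the answer to 'Y'/'N'
    return input('%s (y/n) ' % prompt).strip().upper()[:1]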
Example #6
def do_index_from_collection(collection,
                             index_name,
                             doc_type,
                             skip,
                             step=10000,
                             update=True):
    from utils.mongo import doc_feeder
    for doc_batch in doc_feeder(collection, step=step, s=skip, inbatch=True):
        _index_doc_batch(doc_batch, index_name, doc_type, update=update)
Example #7
    def validate_src(self,
                     collection,
                     return_false=False,
                     return_none=False,
                     return_true=False,
                     verbose=False,
                     flag_invalid=False):
        '''Validate hgvs ids from a src collection.'''

        return_dict = {
            False: return_false,
            True: return_true,
            None: return_none
        }

        # read in the collection from mongodb
        if is_str(collection):
            src = get_src_db()
            _coll = src[collection]
        else:
            _coll = collection
        cursor = doc_feeder(_coll, step=10000)

        out = {}
        print_only = not (return_false or return_none or return_true)
        if not print_only:
            # output dictionary, three keys: 'false','true','none'
            for k in return_dict:
                if return_dict[k]:
                    out[k] = []

        # initialize the count
        cnt_d = {True: 0, False: 0, None: 0}  # counts per validation outcome
        # validate each item in the cursor
        for item in cursor:
            _id = item['_id']
            valid = self.validate_hgvs(_id, verbose=verbose)
            if valid is False and flag_invalid:
                _coll.update({"_id": _id},
                             {'$set': {
                                 "unmatched_ref": "True"
                             }})
            cnt_d[valid] += 1
            if return_dict[valid]:
                out[valid].append(_id)

        # print out counts
        print("\n# of VALID HGVS IDs:\t{0}".format(cnt_d[True]))
        print("# of INVALID HGVS IDs:\t{0}".format(cnt_d[False]))
        print("# of HGVS IDs skipped:\t {0}".format(cnt_d[None]))

        out['summary'] = cnt_d
        return out
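
Examples #7 and #11 depend on two unshown utilities, is_str and get_src_db. The latter needs real database settings, but is_str admits a trivial Python 3 stand-in (assumption: the original, written for Python 2, presumably also accepted unicode and bytes).

def is_str(s):
    # assumed utility: True for string input
    return isinstance(s, str)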
Example #8
    def _merge_sequential(self, collection, geneid_set, step=100000, idmapping_d=None):
        for doc in doc_feeder(self.src[collection], step=step):
            _id = doc['_id']
            if idmapping_d:
                _id = idmapping_d.get(_id, None) or _id
            for __id in alwayslist(_id):    # idmapping can return multiple entrez_gene ids
                __id = str(__id)
                if __id in geneid_set:
                    doc.pop('_id', None)
                    doc.pop('taxid', None)
                    # target_collection.update({'_id': __id}, {'$set': doc},
                    #                          manipulate=False,
                    #                          upsert=False)  # ,safe=True)
                    self.target.update(__id, doc)
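
alwayslist (used here and in Examples #9 and #12) is another unshown utility. Judging from the comment at its call sites, its job is to wrap scalars so the caller can always iterate; a minimal stand-in:

def alwayslist(value):
    # assumed utility: pass lists/tuples through, wrap anything else
    if isinstance(value, (list, tuple)):
        return value
    return [value]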
Example #9
    def _merge_parallel_ipython(self, collection, geneid_set, step=100000, idmapping_d=None):
        from IPython.parallel import Client, require

        rc = Client()
        dview = rc[:]
        #dview = rc.load_balanced_view()
        dview.block = False
        target_collection = self.target.target_collection
        dview['server'] = target_collection.database.connection.host
        dview['port'] = target_collection.database.connection.port
        dview['database'] = target_collection.database.name
        dview['collection_name'] = target_collection.name

        def partition(lst, n):
            q, r = divmod(len(lst), n)
            indices = [q * i + min(i, r) for i in range(n + 1)]
            return [lst[indices[i]:indices[i + 1]] for i in range(n)]

        @require('mongokit', 'time')
        def worker(doc_li):
            conn = mongokit.Connection(server, port)
            target_collection = conn[database][collection_name]
            print "len(doc_li): {}".format(len(doc_li))
            t0 = time.time()
            for doc in doc_li:
                __id = doc.pop('_id')
                doc.pop('taxid', None)
                target_collection.update({'_id': __id}, {'$set': doc},
                                         manipulate=False,
                                         upsert=False)  # ,safe=True)
            print('Done. [%.1fs]' % (time.time() - t0))

        for doc in doc_feeder(self.src[collection], step=step):
            _id = doc['_id']
            if idmapping_d:
                _id = idmapping_d.get(_id, None) or _id
            for __id in alwayslist(_id):    # idmapping can return multiple entrez_gene ids
                __id = str(__id)
                if __id in geneid_set:
                    doc['_id'] = __id
                    self.doc_queue.append(doc)

                    if len(self.doc_queue) >= step:
                        #dview.scatter('doc_li', self.doc_queue)
                        #dview.apply_async(worker)
                        dview.map_async(worker, partition(self.doc_queue, len(rc.ids)))
                        self.doc_queue = []
                        print "!",
Example #10
    def _build_index_sequential(self,
                                collection,
                                verbose=False,
                                query=None,
                                bulk=True,
                                update=False,
                                allow_upsert=True):
        import time
        from utils.mongo import doc_feeder

        def rate_control(cnt, t):
            delay = 0
            if t > 90:
                delay = 30
            elif t > 60:
                delay = 10
            if delay:
                print("\tPausing for {}s...".format(delay), end='')
                time.sleep(delay)
                print("done.")

        src_docs = doc_feeder(collection,
                              step=self.step,
                              s=self.s,
                              batch_callback=rate_control,
                              query=query)
        if bulk:
            if update:
                # input doc will update existing one
                # if allow_upsert, create new one if not exist
                res = self.update_docs(src_docs, upsert=allow_upsert)
            else:
                # input doc will overwrite existing one
                res = self.index_bulk(src_docs)
            if len(res[1]) > 0:
                print("Error: {} docs failed indexing.".format(len(res[1])))
                file_name = collection + '_es_error.pyobj'
                dump(res, file_name)
            return res[0]
        else:
            cnt = 0
            for doc in src_docs:
                self.index(doc)
                cnt += 1
                if verbose:
                    print(cnt, ':', doc['_id'])
            return cnt
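
Example #10 additionally calls dump(res, file_name) to save failed bulk results for post-mortem review. The .pyobj extension suggests a pickle-based helper; a plausible stand-in (the actual utility may compress or choose a different protocol):

import pickle

def dump(obj, filename):
    # assumed utility: serialize an object to disk
    with open(filename, 'wb') as out_f:
        pickle.dump(obj, out_f, protocol=pickle.HIGHEST_PROTOCOL)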
Example #11
    def validate_src(self, collection, return_false=False,
                     return_none=False, return_true=False, verbose=False, flag_invalid=False, generator=False):
        '''Validate hgvs ids from a src collection.'''

        return_dict = {
            False: return_false,
            True: return_true,
            None: return_none
        }

        # read in the collection from mongodb
        if is_str(collection):
            src = get_src_db()
            _coll = src[collection]
        else:
            _coll = collection
        cursor = doc_feeder(_coll, step=10000)

        out = {}
        print_only = not (return_false or return_none or return_true)
        if not print_only:
            # output dictionary, three keys: 'false','true','none'
            for k in return_dict:
                if return_dict[k]:
                    out[k] = []

        # initialize the count
        cnt_d = {True: 0, False: 0, None: 0}    # counts per validation outcome
        # validate each item in the cursor
        for item in cursor:
            _id = item['_id']
            valid = self.validate_hgvs(_id, verbose=verbose)
            if valid is False and flag_invalid:
                _coll.update({"_id": _id}, {'$set': {"unmatched_ref": "True"}})
            cnt_d[valid] += 1
            if return_dict[valid]:
                out[valid].append(_id)

        # print out counts
        print("\n# of VALID HGVS IDs:\t{0}".format(cnt_d[True]))
        print("# of INVALID HGVS IDs:\t{0}".format(cnt_d[False]))
        print("# of HGVS IDs skipped:\t {0}".format(cnt_d[None]))

        out['summary'] = cnt_d
        return out
Example #12
    def _merge_parallel(self, collection, geneid_set, step=100000, idmapping_d=None):
        from multiprocessing import Process, Queue
        NUMBER_OF_PROCESSES = 8

        input_queue = Queue()
        input_queue.conn_pool = []

        def worker(q, target):
            while True:
                doc = q.get()
                if doc == 'STOP':
                    break
                __id = doc.pop('_id')
                doc.pop('taxid', None)
                target.update(__id, doc)
                # target_collection.update({'_id': __id}, {'$set': doc},
                #                           manipulate=False,
                #                           upsert=False) #,safe=True)

        # Start worker processes
        for i in range(NUMBER_OF_PROCESSES):
            Process(target=worker, args=(input_queue, self.target)).start()

        for doc in doc_feeder(self.src[collection], step=step):
            _id = doc['_id']
            if idmapping_d:
                _id = idmapping_d.get(_id, None) or _id
            for __id in alwayslist(_id):    # idmapping can return multiple entrez_gene ids
                __id = str(__id)
                if __id in geneid_set:
                    doc['_id'] = __id
                    input_queue.put(doc)

        # Tell child processes to stop
        for i in range(NUMBER_OF_PROCESSES):
            input_queue.put('STOP')
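
The STOP-sentinel pattern in Example #12 is worth isolating; note that the original never joins its workers, so the parent can exit while updates are still in flight. A self-contained toy version of the same pattern, with the join added:

from multiprocessing import Process, Queue

def _worker(q):
    # drain the queue until the sentinel arrives
    while True:
        item = q.get()
        if item == 'STOP':
            break
        print('processed:', item)

if __name__ == '__main__':
    q = Queue()
    workers = [Process(target=_worker, args=(q,)) for _ in range(2)]
    for w in workers:
        w.start()
    for item in ['a', 'b', 'c', 'd']:
        q.put(item)
    for _ in workers:
        q.put('STOP')   # one sentinel per worker
    for w in workers:
        w.join()        # unlike Example #12, wait for completion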
Example #13
def do_index_from_collection_0(collection, index_name, doc_type, skip, step=10000, update=True):
    from utils.mongo import doc_feeder

    for doc_batch in doc_feeder(collection, step=step, s=skip, inbatch=True):
        _index_doc_batch(doc_batch, index_name, doc_type, update=update)
Example #14
def do_index(doc_li, index_name, doc_type, step=1000, update=True, verbose=True):
    for doc_batch in doc_feeder(doc_li, step=step, verbose=verbose):
        _index_doc_batch(doc_batch, index_name, doc_type, update=update)
Example #15
    def make_genedoc_root(self):
        if not self._entrez_geneid_d:
            self._load_entrez_geneid_d()

        if 'ensembl_gene' in self._build_config['gene_root']:
            self._load_ensembl2entrez_li()
            ensembl2entrez = self._idmapping_d_cache['ensembl_gene']

        if "species" in self._build_config:
            _query = {'taxid': {'$in': self._build_config['species']}}
        elif "species_to_exclude" in self._build_config:
            _query = {'taxid': {'$nin': self._build_config['species_to_exclude']}}
        else:
            _query = None

        geneid_set = []
        species_set = set()
        if "entrez_gene" in self._build_config['gene_root']:
            for doc_li in doc_feeder(self.src['entrez_gene'], inbatch=True, step=self.step, query=_query):
                #target_collection.insert(doc_li, manipulate=False, check_keys=False)
                self.target.insert(doc_li)
                geneid_set.extend([doc['_id'] for doc in doc_li])
                species_set |= set([doc['taxid'] for doc in doc_li])
            cnt_total_entrez_genes = len(geneid_set)
            cnt_total_species = len(species_set)
            print('# of entrez Gene IDs in total: %d' % cnt_total_entrez_genes)
            print('# of species in total: %d' % cnt_total_species)

        if "ensembl_gene" in self._build_config['gene_root']:
            cnt_ensembl_only_genes = 0
            cnt_total_ensembl_genes = 0
            for doc_li in doc_feeder(self.src['ensembl_gene'], inbatch=True, step=self.step, query=_query):
                _doc_li = []
                for _doc in doc_li:
                    cnt_total_ensembl_genes += 1
                    ensembl_id = _doc['_id']
                    entrez_gene = ensembl2entrez.get(ensembl_id, None)
                    if entrez_gene is None:
                        # this is an Ensembl-only gene
                        _doc_li.append(_doc)
                        cnt_ensembl_only_genes += 1
                        geneid_set.append(_doc['_id'])
                if _doc_li:
                    #target_collection.insert(_doc_li, manipulate=False, check_keys=False)
                    self.target.insert(_doc_li)
            cnt_matching_ensembl_genes = cnt_total_ensembl_genes - cnt_ensembl_only_genes
            print('# of ensembl Gene IDs in total: %d' % cnt_total_ensembl_genes)
            print('# of ensembl Gene IDs match entrez Gene IDs: %d' % cnt_matching_ensembl_genes)
            print('# of ensembl Gene IDs DO NOT match entrez Gene IDs: %d' % cnt_ensembl_only_genes)

            geneid_set = set(geneid_set)
            print('# of total Root Gene IDs: %d' % len(geneid_set))
            _stats = {'total_entrez_genes': cnt_total_entrez_genes,
                      'total_species': cnt_total_species,
                      'total_ensembl_genes': cnt_total_ensembl_genes,
                      'total_ensembl_genes_mapped_to_entrez': cnt_matching_ensembl_genes,
                      'total_ensembl_only_genes': cnt_ensembl_only_genes,
                      'total_genes': len(geneid_set)}
            self._stats = _stats
            self._src_version = self.get_src_version()
            self.log_src_build({'stats': _stats, 'src_version': self._src_version})
            return geneid_set