Ejemplo n.º 1
0
    def _merge_parallel(self,
                        collection,
                        geneid_set,
                        step=100000,
                        idmapping_d=None):
        from multiprocessing import Process, Queue
        NUMBER_OF_PROCESSES = 8

        input_queue = Queue()
        input_queue.conn_pool = []

        def worker(q, target):
            while True:
                doc = q.get()
                if doc == 'STOP':
                    break
                __id = doc.pop('_id')
                doc.pop('taxid', None)
                target.update(__id, doc)
                # target_collection.update({'_id': __id}, {'$set': doc},
                #                           manipulate=False,
                #                           upsert=False) #,safe=True)

        # Start worker processes
        for i in range(NUMBER_OF_PROCESSES):
            Process(target=worker, args=(input_queue, self.target)).start()

        for doc in doc_feeder(self.src[collection], step=step):
            _id = doc['_id']
            if idmapping_d:
                _id = idmapping_d.get(_id, None) or _id
            for __id in alwayslist(
                    _id
            ):  # there could be cases that idmapping returns multiple entrez_gene ids.
                __id = str(__id)
                if __id in geneid_set:
                    doc['_id'] = __id
                    input_queue.put(doc)

        # Tell child processes to stop
        for i in range(NUMBER_OF_PROCESSES):
            input_queue.put('STOP')
Ejemplo n.º 2
0
    def _merge_parallel(self, collection, geneid_set, step=100000, idmapping_d=None):
        from multiprocessing import Process, Queue
        NUMBER_OF_PROCESSES = 8

        input_queue = Queue()
        input_queue.conn_pool = []

        def worker(q, target):
            while True:
                doc = q.get()
                if doc == 'STOP':
                    break
                __id = doc.pop('_id')
                doc.pop('taxid', None)
                target.update(__id, doc)
                # target_collection.update({'_id': __id}, {'$set': doc},
                #                           manipulate=False,
                #                           upsert=False) #,safe=True)

        # Start worker processes
        for i in range(NUMBER_OF_PROCESSES):
            Process(target=worker, args=(input_queue, self.target)).start()

        for doc in doc_feeder(self.src[collection], step=step):
            _id = doc['_id']
            if idmapping_d:
                _id = idmapping_d.get(_id, None) or _id
            for __id in alwayslist(_id):    # there could be cases that idmapping returns multiple entrez_gene ids.
                __id = str(__id)
                if __id in geneid_set:
                    doc['_id'] = __id
                    input_queue.put(doc)

        # Tell child processes to stop
        for i in range(NUMBER_OF_PROCESSES):
            input_queue.put('STOP')