Example #1
    def update2(self, id_patchs, collection, source_collection):
        from utils import backend

        _es = backend.GeneDocESBackend(self._esi)
        _db = backend.GeneDocMongoDBBackend(source_collection)

        for _id_chunk in iter_n(id_patchs, 100):
            es_docs = _es.mget_from_ids(_id_chunk, step=100)
            db_docs = _db.mget_from_ids(_id_chunk)
            es_docs = {doc['_id']: doc for doc in es_docs}
            db_docs = {doc['_id']: doc for doc in db_docs}
            for _id in es_docs:
                doc_es = es_docs[_id]
                doc_mongo = db_docs.get(_id, None)
                if doc_mongo:
                    doc_es[collection] = doc_mongo[collection]
                else:
                    print('id does not exist in mongodb collection:', _id)
                doc_es.pop('_id', None)
                es_info = {
                    '_op_type': 'index',
                    '_index': self._index,
                    "_id": _id,
                    '_type': self._doc_type,
                    '_source': doc_es
                }
                yield es_info
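
All of these examples rely on iter_n() to split an iterable into fixed-size chunks. For reference, here is a minimal sketch of such a chunking helper; the real iter_n comes from the project's utility module, and only the chunking behaviour and the optional with_cnt counter (used in Example #6) are assumed here:

import itertools

def iter_n(iterable, n, with_cnt=False):
    """Yield successive chunks of up to n items from iterable.
    If with_cnt is True, yield (chunk, running_total) pairs instead.
    Sketch only; the examples import the project's own iter_n."""
    it = iter(iterable)
    cnt = 0
    while True:
        chunk = tuple(itertools.islice(it, n))
        if not chunk:
            return
        cnt += len(chunk)
        yield (chunk, cnt) if with_cnt else chunk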
Example #2
    def add(self, collection, ids):
        # compare id_list with current index, get list of ids with true/false indicator
        cnt_update = 0
        cnt_create = 0
        for ids_chunk in iter_n(ids, 100):
            id_list_all = self._esi.mexists(ids_chunk, verbose=False)
            for _id, _exists in id_list_all:
                _doc = self._src[collection].find_one({'_id': _id})
                _doc.pop('_id')
                # case one: this id already exists in the current index, so just update it
                if _exists:
                    es_info = {
                        '_op_type': 'update',
                        '_index': self._index,
                        '_type': self._doc_type,
                        '_id': _id,
                        'doc': _doc
                    }
                    cnt_update += 1
                # case two: this id does not exist in the current index, so create a new doc
                else:
                    es_info = {
                        '_op_type': 'create',
                        '_index': self._index,
                        '_type': self._doc_type,
                        '_id': _id,
                        '_source': _doc
                    }
                    cnt_create += 1
                yield es_info
        print('items updated: ', cnt_update)
        print('items newly created: ', cnt_create)
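
The dictionaries yielded by Examples #1 and #2 follow the action format expected by the elasticsearch-py bulk helper ('_op_type', '_index', '_type', '_id', plus '_source' or 'doc'). Below is a hedged sketch of how such a generator is typically consumed; es_client, indexer, and id_list are assumed names, not part of the original code:

from elasticsearch import Elasticsearch
from elasticsearch.helpers import bulk

# hypothetical setup: 'indexer' is an instance of the class defining add()/update2()
es_client = Elasticsearch("http://localhost:9200")

# bulk() consumes the generator lazily, so actions are built and sent in batches
ok, errors = bulk(es_client, indexer.add("my_collection", id_list), raise_on_error=False)
print("successful actions:", ok, "errors:", len(errors))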
Example #3
    def doc_iterator(self, genedoc_d, batch=True, step=10000):
        if isinstance(genedoc_d, types.GeneratorType) and batch:
            for doc_li in iter_n(genedoc_d, n=step):
                yield doc_li
        else:
            if batch:
                doc_li = []
                i = 0
            for _id, doc in genedoc_d.items():
                doc['_id'] = _id
                _doc = copy.copy(self)
                _doc.clear()
                _doc.update(doc)
                #if validate:
                #    _doc.validate()
                if batch:
                    doc_li.append(_doc)
                    i += 1
                    if i % step == 0:
                        yield doc_li
                        doc_li = []
                else:
                    yield _doc

            if batch:
                yield doc_li
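
doc_iterator() yields either lists of documents (batch=True, up to `step` documents per list) or individual documents; since it builds each emitted doc with copy.copy(self), it presumably lives on a dict-like document class. A rough consumption sketch, with gene_doc and the sample data purely illustrative:

# hypothetical dict of documents keyed by _id
genedoc_d = {"1017": {"symbol": "CDK2"}, "1018": {"symbol": "CDK3"}}

# batch mode: each iteration yields a list of up to `step` documents
for doc_batch in gene_doc.doc_iterator(genedoc_d, batch=True, step=2):
    print(len(doc_batch), "docs in this batch")

# non-batch mode: one document at a time
for doc in gene_doc.doc_iterator(genedoc_d, batch=False):
    print(doc["_id"])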
Example #4
    def set_chrom(self, batch_size, job_manager):
        # divide & conquer... build batches
        jobs = []
        total = self.target_backend.count()
        btotal = math.ceil(total/batch_size) 
        bnum = 1
        cnt = 0
        results = {"missing" : [], "disagreed" : []}
        root_keys = {}
        # grab ids only, so we can get more and fill queue for each step
        # each round, fill the queue to make sure every cpu slots are always working
        id_batch_size = batch_size * job_manager.process_queue._max_workers * 2
        self.logger.info("Fetch _ids from '%s' with batch_size=%d, and create post-merger job with batch_size=%d" % \
                (self.target_backend.target_collection.name, id_batch_size, batch_size))
        for big_doc_ids in id_feeder(self.target_backend.target_collection, batch_size=id_batch_size, logger=self.logger):
            for doc_ids in iter_n(big_doc_ids,batch_size):
                yield from asyncio.sleep(0.1)
                cnt += len(doc_ids)
                pinfo = self.get_pinfo()
                pinfo["step"] = "post-merge (chrom)"
                pinfo["description"] = "#%d/%d (%.1f%%)" % (bnum,btotal,(cnt/total*100.))
                self.logger.info("Creating post-merge job #%d/%d to process chrom %d/%d (%.1f%%)" % \
                        (bnum,btotal,cnt,total,(cnt/total*100.)))
                job = yield from job_manager.defer_to_process(pinfo,
                        partial(chrom_worker, self.target_backend.target_name, doc_ids))
                def processed(f,results, batch_num):
                    try:
                        fres = f.result()
                        results["missing"].extend(fres["missing"])
                        results["disagreed"].extend(fres["disagreed"])
                        # merge root key counts
                        rk = fres["root_keys"]
                        for k in rk:
                            root_keys.setdefault(k,0)
                            root_keys[k] += rk[k]
                        self.logger.info("chrom batch #%d, done" % batch_num)
                    except Exception as e:
                        import traceback
                        self.logger.error("chrom batch #%d, error in processed (set_chrom): %s:\n%s" % \
                                (batch_num, e, traceback.format_exc()))
                        raise
                job.add_done_callback(partial(processed, results=results, batch_num=bnum))
                jobs.append(job)
                bnum += 1
        self.logger.info("%d jobs created for merging step" % len(jobs))
        if jobs:
            yield from asyncio.gather(*jobs)
            self.logger.info("Found %d missing 'chrom' and %d where resources disagreed" % (len(results["missing"]), len(results["disagreed"])))
            if results["missing"] or results["disagreed"]:
                fn = "chrom_%s_%s.pickle" % (self.target_backend.target_name,datetime.datetime.now().strftime("%Y%m%d_%H%M%S"))
                self.logger.info("Pickling 'chrom' discrepancies into %s" % fn)
                pickle.dump(results,open(fn,"wb"))
            # now store metadata
            root_keys["total"] = root_keys.pop("_id")
            self.logger.info("Root keys: %s" % root_keys)
            src_build = self.source_backend.build
            src_build.update({'_id': self.target_backend.target_name},{"$set":{"_meta.stats":root_keys}})


        return results
Example #5
 def _add_docs(ids):
     i = 0
     for _ids in iter_n(ids, step):
         t1 = time.time()
         _doc_li = src.mget_from_ids(_ids)
         for _doc in _doc_li:
             _doc['_timestamp'] = _timestamp
             i += 1
         target.insert(_doc_li)
         print('\t{}\t{}'.format(i, timesofar(t1)))
Example #6
 def post_dump(self, *args, **kwargs):
     self.logger.info("Merging JSON documents in '%s'" % self.new_data_folder)
     # we'll merge 100 files together (that's 100,000 documents), so we don't end up with one huge
     # file, nor with thousands of small ones. We'll also remove metadata (useless now)
     parts = glob.iglob(os.path.join(self.new_data_folder,"molecule.part*"))
     for chunk,cnt in iter_n(parts,self.__class__.CHUNK_MERGE_SIZE,with_cnt=True):
         outfile = os.path.join(self.new_data_folder,"molecule.%s.json" % cnt)
         merged_data = {"molecules" : []}
         for f in chunk:
             data = json.load(open(f))
             merged_data["molecules"].extend(data["molecules"])
         json.dump(merged_data,open(outfile,"w"))
         self.logger.info("Merged %s files" % cnt)
     # now we can delete the parts
     self.logger.info("Deleting part files")
     parts = glob.iglob(os.path.join(self.new_data_folder,"molecule.part*"))
     for f in parts:
         os.remove(f)
     self.logger.info("Post-dump merge done")
Example #7
def verify_ids(doc_iter, index, doc_type, step=100000):
    '''Verify how many docs from the input iterator/list overlap with existing docs.'''

    es = get_es()
    q = {'query': {'ids': {"values": []}}}
    total_cnt = 0
    found_cnt = 0
    out = []
    for doc_batch in iter_n(doc_iter, n=step):
        id_li = [doc['_id'] for doc in doc_batch]
        # id_li = [doc['_id'].replace('chr', '') for doc in doc_batch]
        q['query']['ids']['values'] = id_li
        xres = es.search(index=index, doc_type=doc_type, body=q, _source=False)
        found_cnt += xres['hits']['total']
        total_cnt += len(id_li)
        print(xres['hits']['total'], found_cnt, total_cnt)
        out.extend([x['_id'] for x in xres['hits']['hits']])
    return out
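
verify_ids() returns the subset of input _ids that already exist in the given index. A brief hedged usage sketch; the document list, index, and doc_type below are placeholders:

# hypothetical check: how many freshly parsed docs are already indexed?
docs = [{"_id": "chr1:g.12345A>T"}, {"_id": "chr2:g.67890C>G"}]
existing = verify_ids(docs, index="myvariant_current", doc_type="variant", step=1000)
print("%d of %d ids already in the index" % (len(existing), len(docs)))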
Example #8
    def do_snpeff(self, batch_size=SNPEFF_BATCH_SIZE, force=False, force_use_cache=False):
        self.logger.info("Updating snpeff information from source '%s' (collection:%s)" % (self.fullname,self.collection_name))
        # select Snpeff uploader to get collection name and src_dump _id
        version = self.__class__.__metadata__["assembly"]
        snpeff_class = getattr(snpeff_upload,"Snpeff%sUploader" % version.capitalize())
        snpeff_main_source = snpeff_class.main_source
        snpeff_doc = self.src_dump.find_one({"_id" : snpeff_main_source})
        assert snpeff_doc, "No snpeff information found, has it been dumped & uploaded?"
        snpeff_dir = snpeff_doc["data_folder"]
        # -q: when there's an update, there's a message on stderr....
        cmd = "java -Xmx4g -jar %s/snpEff/snpEff.jar -t -noStats -noExpandIUB %s" % (snpeff_dir,version)
        # genome files are in "data_folder"/../data
        genomes = glob.glob(os.path.join(snpeff_dir,"..","data","%s_genome.*" % version))
        assert len(genomes) == 1, "Expected only one genome file for '%s', got: %s" % (version,genomes)
        genome = genomes[0]
        annotator = snpeff_parser.SnpeffAnnotator(cmd,logger=self.logger)
        vcf_builder = snpeff_parser.VCFConstruct(genome,logger=self.logger)
        storage = UpsertStorage(None,snpeff_class.name,self.logger)
        col = self.db[self.collection_name]
        total = math.ceil(col.count()/batch_size)
        cnt = 0
        to_process = []

        def process(ids):
            self.logger.info("%d documents to annotate" % len(ids))
            hgvs_vcfs = vcf_builder.build_vcfs(ids)
            # merge "vcf" and snpeff annotation keys when possible
            # (if no snpeff data, we keep 'vcf' data)
            for annot in annotator.annotate(hgvs_vcfs):
                hgvs_vcfs[annot["_id"]].update(annot)
            # trim if the sequence is too big
            for _id in hgvs_vcfs:
                vcf = hgvs_vcfs[_id]
                for k in ["alt","ref"]:
                    if len(vcf["vcf"][k]) > MAX_REF_ALT_LEN:
                        msg = "...(trimmed)"
                        vcf["vcf"][k] = vcf["vcf"][k][:MAX_REF_ALT_LEN - len(msg)] + msg
                hgvs_vcfs[_id] = vcf

            data = annotate_start_end(hgvs_vcfs,version)
            howmany = storage.process(data, batch_size)
            if howmany:
                # we need to update some metadata info about snpeff b/c data has changed
                # so cache could be invalid
                self.logger.debug("Invalidating cache for '%s'" % snpeff_class.name)
                mongo.invalidate_cache(snpeff_class.name)

        for ids in id_feeder(col, batch_size=batch_size, logger=self.logger, force_use=force_use_cache):
            cnt += 1
            self.logger.debug("Processing batch %s/%s [%.1f]" % (cnt,total,(cnt/total*100)))
            # don't re-compute annotations if already there
            if not force:
                for subids in iter_n(ids,10000):
                    cur = storage.temp_collection.find({'_id' : {'$in' : subids}},{'_id':1})
                    already_ids = [d["_id"] for d in list(cur)]
                    newids = list(set(subids).difference(set(already_ids)))
                    if len(subids) != len(newids):
                        self.logger.debug("%d documents already have snpeff annotations, skip them" % \
                                (len(subids) - len(newids)))
                    to_process.extend(newids)
                    self.logger.debug("Batch filled %d out of %d" % (len(to_process),batch_size))
                    if not (len(to_process) >= batch_size):
                        # can fill more...
                        continue
                    process(to_process)
                    to_process = []
            else:
                to_process = ids
        # for potential remainings
        if to_process:
            process(to_process)
Example #9
def id_feeder(col,
              batch_size=1000,
              build_cache=True,
              logger=logging,
              force_use=False,
              force_build=False):
    """Return an iterator over all _ids in collection "col".
       Search for a valid cache file if available; if not,
       return a doc_feeder for that collection. A valid cache is
       a cache file that is newer than the collection.
       "build_cache" True will build a cache file as _ids are fetched,
       if no cache file was found.
       "force_use" True will use any existing cache file and won't check
       whether it's valid or not.
       "force_build" True will build a new cache even if the current one
       exists and is valid.
    """
    src_db = get_src_db()
    ts = None
    found_meta = True

    if isinstance(col, DocMongoBackend):
        col = col.target_collection

    try:
        if col.database.name == config.DATA_TARGET_DATABASE:
            info = src_db["src_build"].find_one({"_id": col.name})
            if not info:
                logger.warning(
                    "Can't find information for target collection '%s'" %
                    col.name)
            else:
                ts = info["started_at"].timestamp()
        elif col.database.name == config.DATA_SRC_DATABASE:
            info = src_db["src_dump"].find_one({
                "$where":
                "function() {if(this.upload) {for(var index in this.upload.jobs) {if(this.upload.jobs[index].step == \"%s\") return this;}}}"
                % col.name
            })
            if not info:
                logger.warning(
                    "Can't find information for source collection '%s'" %
                    col.name)
            else:
                ts = info["upload"]["jobs"][col.name]["started_at"].timestamp()
        else:
            logger.warning(
                "Can't find metadata for collection '%s' (not a target, not a source collection)"
                % col)
            found_meta = False
            build_cache = False
    except KeyError:
        logger.warning("Couldn't find timestamp in database for '%s'" %
                       col.name)
    except Exception as e:
        logger.info(
            "%s is not a mongo collection, _id cache won't be built (error: %s)"
            % (col, e))
        build_cache = False

    # try to find a cache file
    use_cache = False
    cache_file = None
    cache_format = getattr(config, "CACHE_FORMAT", None)
    if found_meta and getattr(config, "CACHE_FOLDER", None):
        cache_file = get_cache_filename(col.name)
        try:
            # size of empty file differs depending on compression
            empty_size = {None: 0, "xz": 32, "gzip": 25, "bz2": 14}
            if force_build:
                logger.warning("Force building cache file")
                use_cache = False
            # check size, delete if invalid
            elif os.path.getsize(cache_file) <= empty_size.get(
                    cache_format, 32):
                logger.warning("Cache file exists but is empty, delete it")
                os.remove(cache_file)
            elif force_use:
                use_cache = True
                logger.info("Force using cache file")
            else:
                mt = os.path.getmtime(cache_file)
                if ts and mt >= ts:
                    use_cache = True
                else:
                    logger.info("Cache is too old, discard it")
        except FileNotFoundError:
            pass
    if use_cache:
        logger.debug("Found valid cache file for '%s': %s" %
                     (col.name, cache_file))
        with open_compressed_file(cache_file) as cache_in:
            if cache_format:
                iocache = io.TextIOWrapper(cache_in)
            else:
                iocache = cache_in
            for ids in iter_n(iocache, batch_size):
                yield [_id.strip() for _id in ids if _id.strip()]
    else:
        logger.debug(
            "No cache file found (or invalid) for '%s', use doc_feeder" %
            col.name)
        cache_out = None
        cache_temp = None
        if getattr(config, "CACHE_FOLDER",
                   None) and config.CACHE_FOLDER and build_cache:
            if not os.path.exists(config.CACHE_FOLDER):
                os.makedirs(config.CACHE_FOLDER)
            cache_temp = "%s._tmp_" % cache_file
            # clean aborted cache file generation
            for tmpcache in glob.glob(
                    os.path.join(config.CACHE_FOLDER, "%s*" % cache_temp)):
                logger.info("Removing aborted cache file '%s'" % tmpcache)
                os.remove(tmpcache)
            # use temp file and rename once done
            cache_temp = "%s%s" % (cache_temp, get_random_string())
            cache_out = get_compressed_outfile(cache_temp,
                                               compress=cache_format)
            logger.info("Building cache file '%s'" % cache_temp)
        else:
            logger.info(
                "Can't build cache, cache not allowed or no cache folder")
            build_cache = False
        if isinstance(col, Collection):
            doc_feeder_func = partial(doc_feeder,
                                      col,
                                      step=batch_size,
                                      inbatch=True,
                                      fields={"_id": 1})
        elif isinstance(col, DocMongoBackend):
            doc_feeder_func = partial(doc_feeder,
                                      col.target_collection,
                                      step=batch_size,
                                      inbatch=True,
                                      fields={"_id": 1})
        elif isinstance(col, DocESBackend):
            # get_id_list directly returns the _ids; wrap them to match the other
            # doc_feeder_func return values, and yield them in batches
            def wrap_id():
                ids = []
                for _id in col.get_id_list(step=batch_size):
                    ids.append({"_id": _id})
                    if len(ids) >= batch_size:
                        yield ids
                        ids = []
                if ids:
                    yield ids

            doc_feeder_func = partial(wrap_id)
        else:
            raise Exception("Unknown backend %s" % col)
        for doc_ids in doc_feeder_func():
            doc_ids = [_doc["_id"] for _doc in doc_ids]
            if build_cache:
                strout = "\n".join(doc_ids) + "\n"
                if cache_format:
                    # assuming binary format (b/c compressed)
                    cache_out.write(strout.encode())
                else:
                    cache_out.write(strout)
            yield doc_ids
        if build_cache:
            cache_out.close()
            cache_final = os.path.splitext(cache_temp)[0]
            os.rename(cache_temp, cache_final)
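
id_feeder() yields batches of _ids, reading from a valid _id cache file when one exists and otherwise falling back to doc_feeder while optionally (re)building the cache. A hedged consumption sketch; the collection name is an assumption:

import logging

# hypothetical consumer: walk all _ids of a source collection in batches of 5000,
# letting id_feeder build the cache file on the first full pass
src_db = get_src_db()
col = src_db["my_source_collection"]
total = 0
for id_batch in id_feeder(col, batch_size=5000, build_cache=True, logger=logging):
    total += len(id_batch)
    # ... hand id_batch over to a worker here ...
print("fed %d _ids" % total)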