Example #1
    def set_chrom(self, batch_size, job_manager):
        # divide & conquer... build batches
        jobs = []
        total = self.target_backend.count()
        btotal = math.ceil(total/batch_size) 
        bnum = 1
        cnt = 0
        results = {"missing" : [], "disagreed" : []}
        root_keys = {}
        # grab ids only, so we can get more and fill the queue for each step
        # each round, fill the queue to make sure every CPU slot is always working
        id_batch_size = batch_size * job_manager.process_queue._max_workers * 2
        self.logger.info("Fetch _ids from '%s' with batch_size=%d, and create post-merger job with batch_size=%d" % \
                (self.target_backend.target_collection.name, id_batch_size, batch_size))
        for big_doc_ids in id_feeder(self.target_backend.target_collection, batch_size=id_batch_size, logger=self.logger):
            for doc_ids in iter_n(big_doc_ids,batch_size):
                yield from asyncio.sleep(0.1)
                cnt += len(doc_ids)
                pinfo = self.get_pinfo()
                pinfo["step"] = "post-merge (chrom)"
                pinfo["description"] = "#%d/%d (%.1f%%)" % (bnum,btotal,(cnt/total*100.))
                self.logger.info("Creating post-merge job #%d/%d to process chrom %d/%d (%.1f%%)" % \
                        (bnum,btotal,cnt,total,(cnt/total*100.)))
                job = yield from job_manager.defer_to_process(pinfo,
                        partial(chrom_worker, self.target_backend.target_name, doc_ids))
                def processed(f,results, batch_num):
                    try:
                        fres = f.result()
                        results["missing"].extend(fres["missing"])
                        results["disagreed"].extend(fres["disagreed"])
                        # merge root key counts
                        rk = fres["root_keys"]
                        for k in rk:
                            root_keys.setdefault(k,0)
                            root_keys[k] += rk[k]
                        self.logger.info("chrom batch #%d, done" % batch_num)
                    except Exception as e:
                        import traceback
                        self.logger.error("chrom batch #%d, error in processed (set_chrom): %s:\n%s" % \
                                (batch_num, e, traceback.format_exc()))
                        raise
                job.add_done_callback(partial(processed, results=results, batch_num=bnum))
                jobs.append(job)
                bnum += 1
        self.logger.info("%d jobs created for merging step" % len(jobs))
        if jobs:
            yield from asyncio.gather(*jobs)
            self.logger.info("Found %d missing 'chrom' and %d where resources disagreed" % (len(results["missing"]), len(results["disagreed"])))
            if results["missing"] or results["disagreed"]:
                fn = "chrom_%s_%s.pickle" % (self.target_backend.target_name,datetime.datetime.now().strftime("%Y%m%d_%H%M%S"))
                self.logger.info("Pickling 'chrom' discrepancies into %s" % fn)
                with open(fn, "wb") as fobj:
                    pickle.dump(results, fobj)
            # now store metadata
            root_keys["total"] = root_keys.pop("_id")
            self.logger.info("Root keys: %s" % root_keys)
            src_build = self.source_backend.build
            src_build.update({'_id': self.target_backend.target_name},{"$set":{"_meta.stats":root_keys}})


        return results
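
A note on running these snippets: set_chrom is a legacy generator-based coroutine (yield from style, pre async/await), so it must be wrapped before an event loop can drive it. A minimal sketch of a driver, where builder and job_manager are placeholders for the real hub objects:

import asyncio

def run_set_chrom(builder, job_manager, batch_size=10000):
    # asyncio.coroutine is deprecated (removed in Python 3.11) but matches the
    # yield-from style used throughout these examples.
    coro = asyncio.coroutine(builder.set_chrom)(batch_size, job_manager)
    loop = asyncio.get_event_loop()
    return loop.run_until_complete(coro)
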
Example #2
    def set_chrom(self, batch_size, job_manager):
        # divide & conquer... build batches
        jobs = []
        total = self.target_backend.count()
        btotal = math.ceil(total/batch_size) 
        bnum = 1
        cnt = 0
        results = {"missing" : [], "disagreed" : []}
        # grab ids only, so we can get more and fill the queue for each step
        # each round, fill the queue to make sure every CPU slot is always working
        id_batch_size = batch_size * job_manager.process_queue._max_workers * 2
        self.logger.info("Fetch _ids from '%s' with batch_size=%d, and create post-merger job with batch_size=%d" % \
                (self.target_backend.target_collection.name, id_batch_size, batch_size))
        for big_doc_ids in id_feeder(self.target_backend.target_collection, batch_size=id_batch_size, logger=self.logger):
            for doc_ids in iter_n(big_doc_ids,batch_size):
                yield from asyncio.sleep(0.1)
                cnt += len(doc_ids)
                pinfo = self.get_pinfo()
                pinfo["step"] = "post-merge (chrom)"
                pinfo["description"] = "#%d/%d (%.1f%%)" % (bnum,btotal,(cnt/total*100.))
                self.logger.info("Creating post-merge job #%d/%d to process chrom %d/%d (%.1f%%)" % \
                        (bnum,btotal,cnt,total,(cnt/total*100.)))
                ids = [doc["_id"] for doc in doc_ids]
                job = yield from job_manager.defer_to_process(pinfo,
                        partial(chrom_worker, self.target_backend.target_name, ids))
                def processed(f,results, batch_num):
                    try:
                        fres = f.result()
                        results["missing"].extend(fres["missing"])
                        results["disagreed"].extend(fres["disagreed"])
                        self.logger.info("chrom batch #%d, done" % batch_num)
                    except Exception as e:
                        import traceback
                        self.logger.error("chrom batch #%d, error in processed (set_chrom): %s:\n%s" % \
                                (batch_num, e, traceback.format_exc()))
                        raise
                job.add_done_callback(partial(processed, results=results, batch_num=bnum))
                jobs.append(job)
                bnum += 1
        self.logger.info("%d jobs created for merging step" % len(jobs))
        if jobs:
            yield from asyncio.gather(*jobs)
            self.logger.info("Found %d missing 'chrom' and %d where resources disagreed" % (len(results["missing"]), len(results["disagreed"])))
            if results["missing"] or results["disagreed"]:
                fn = "chrom_%s_%s.pickle" % (self.target_backend.target_name,datetime.datetime.now().strftime("%Y%m%d_%H%M%S"))
                self.logger.info("Pickling 'chrom' discrepancies into %s" % fn)
                with open(fn, "wb") as fobj:
                    pickle.dump(results, fobj)

        return results
Example #3
    def do_snpeff(self,
                  batch_size=SNPEFF_BATCH_SIZE,
                  force=False,
                  force_use_cache=False):
        self.logger.info(
            "Updating snpeff information from source '%s' (collection:%s)" %
            (self.fullname, self.collection_name))
        # select Snpeff uploader to get collection name and src_dump _id
        version = self.__class__.__metadata__["assembly"]
        snpeff_class = getattr(snpeff_upload,
                               "Snpeff%sUploader" % version.capitalize())
        snpeff_main_source = snpeff_class.main_source
        snpeff_doc = self.src_dump.find_one({"_id": snpeff_main_source})
        assert snpeff_doc, "No snpeff information found, has it been dumped & uploaded ?"
        snpeff_dir = snpeff_doc["download"]["data_folder"]
        # -q: when there's an update, there's a message on stderr....
        cmd = "java -Xmx4g -jar %s/snpEff/snpEff.jar -t -noStats -noExpandIUB %s" % (
            snpeff_dir, version)
        # genome files are in "data_folder"/../data
        genomes = glob.glob(
            os.path.join(snpeff_dir, "..", "data", "%s_genome.*" % version))
        assert len(
            genomes
        ) == 1, "Expected only one genome files for '%s', got: %s" % (version,
                                                                      genomes)
        genome = genomes[0]
        annotator = snpeff_parser.SnpeffAnnotator(cmd, logger=self.logger)
        vcf_builder = snpeff_parser.VCFConstruct(genome, logger=self.logger)
        storage = UpsertStorage(None, snpeff_class.name, self.logger)
        col = self.db[self.collection_name]
        total = math.ceil(col.count() / batch_size)
        cnt = 0
        to_process = []

        def process(ids):
            self.logger.info("%d documents to annotate" % len(ids))
            hgvs_vcfs = vcf_builder.build_vcfs(ids)
            # merge "vcf" and snpeff annotations keys when possible
            # (it no snpeff data, we keep 'vcf' data)
            for annot in annotator.annotate(hgvs_vcfs):
                hgvs_vcfs[annot["_id"]].update(annot)
            # trim if sequence is too big
            for _id in hgvs_vcfs:
                vcf = hgvs_vcfs[_id]
                for k in ["alt", "ref"]:
                    if len(vcf["vcf"][k]) > MAX_REF_ALT_LEN:
                        msg = "...(trimmed)"
                        vcf["vcf"][k] = vcf["vcf"][k][:MAX_REF_ALT_LEN -
                                                      len(msg)] + msg
                hgvs_vcfs[_id] = vcf

            data = annotate_start_end(hgvs_vcfs, version)
            howmany = storage.process(data, batch_size)
            if howmany:
                # we need to update some metadata info about snpeff b/c data has changed
                # so cache could be invalid
                self.logger.debug("Invalidating cache for '%s'" %
                                  snpeff_class.name)
                mongo.invalidate_cache(snpeff_class.name)

        for ids in id_feeder(col,
                             batch_size=batch_size,
                             logger=self.logger,
                             force_use=force_use_cache):
            cnt += 1
            self.logger.debug("Processing batch %s/%s [%.1f]" %
                              (cnt, total, (cnt / total * 100)))
            # don't re-compute annotations if already there
            if not force:
                for subids in iter_n(ids, 10000):
                    cur = storage.temp_collection.find(
                        {'_id': {
                            '$in': subids
                        }}, {'_id': 1})
                    already_ids = [d["_id"] for d in list(cur)]
                    newids = list(set(subids).difference(set(already_ids)))
                    if len(subids) != len(newids):
                        self.logger.debug("%d documents already have snpeff annotations, skip them" % \
                                (len(subids) - len(newids)))
                    to_process.extend(newids)
                    self.logger.debug("Batch filled %d out of %d" %
                                      (len(to_process), batch_size))
                    if len(to_process) < batch_size:
                        # can fill more...
                        continue
                    process(to_process)
                    to_process = []
            else:
                # force: annotate the whole batch, previous annotations are recomputed
                process(ids)
        # process any remaining documents
        if to_process:
            process(to_process)
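
The batching helper iter_n used above chunks an iterable into fixed-size lists. If it isn't at hand, an equivalent chunker is easy to sketch (a stand-in, not necessarily the library's exact implementation):

from itertools import islice

def iter_n(iterable, n):
    # Yield successive lists of at most n items from any iterable.
    it = iter(iterable)
    while True:
        chunk = list(islice(it, n))
        if not chunk:
            return
        yield chunk
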
Example #4
 def rebuild(col):
     cur = mongo.id_feeder(col,
                           batch_size=10000,
                           logger=config.logger,
                           force_build=force_build)
     [i for i in cur]  # just iterate
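
A hedged usage sketch for the helper above: the collection name and the import path are assumptions, and rebuild() also closes over force_build and config from its enclosing module, as in the snippet.

from biothings.utils import mongo   # assumed import path for id_feeder's module

col = mongo.get_src_db()["my_source_collection"]   # hypothetical collection
rebuild(col)   # iterating id_feeder regenerates the on-disk _id cache for col
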
Example #5
    def merge_source(self,
                     src_name,
                     batch_size=100000,
                     ids=None,
                     job_manager=None):
        # it's actually not optional
        assert job_manager
        _query = self.generate_document_query(src_name)
        # Note: no need to check if there's an existing document with _id (we want to merge only with an existing document)
        # if the document doesn't exist then the update() call will silently fail.
        # That being said... if no root documents, then there won't be any previously inserted
        # documents, and this update() would just do nothing. So if no root docs, then upsert
        # (update or insert, but do something)
        defined_root_sources = self.get_root_document_sources()
        upsert = not defined_root_sources or src_name in defined_root_sources
        if not upsert:
            self.logger.debug(
                "Documents from source '%s' will be stored only if a previous document exists with same _id"
                % src_name)
        jobs = []
        total = self.source_backend[src_name].count()
        btotal = math.ceil(total / batch_size)
        bnum = 1
        cnt = 0
        got_error = False
        # grab ids only, so we can get more, let's say 10 times more
        id_batch_size = batch_size * 10
        if ids:
            self.logger.info(
                "Merging '%s' specific list of _ids, create merger job with batch_size=%d"
                % (src_name, batch_size))
            id_provider = [ids]
        else:
            self.logger.info(
                "Fetch _ids from '%s' with batch_size=%d, and create merger job with batch_size=%d"
                % (src_name, id_batch_size, batch_size))
            id_provider = id_feeder(self.source_backend[src_name],
                                    batch_size=id_batch_size,
                                    logger=self.logger)
        for big_doc_ids in id_provider:
            for doc_ids in iter_n(big_doc_ids, batch_size):
                # try to put some async here to give control back
                # (but everybody knows it's a blocking call: doc_feeder)
                yield from asyncio.sleep(0.1)
                cnt += len(doc_ids)
                pinfo = self.get_pinfo()
                pinfo["step"] = src_name
                pinfo["description"] = "#%d/%d (%.1f%%)" % (bnum, btotal,
                                                            (cnt / total *
                                                             100))
                self.logger.info("Creating merger job #%d/%d, to process '%s' %d/%d (%.1f%%)" % \
                        (bnum,btotal,src_name,cnt,total,(cnt/total*100.)))
                job = yield from job_manager.defer_to_process(
                    pinfo,
                    partial(merger_worker, self.source_backend[src_name].name,
                            self.target_backend.target_name, doc_ids,
                            self.get_mapper_for_source(src_name, init=False),
                            upsert, bnum))

                def batch_merged(f, batch_num):
                    nonlocal got_error
                    if type(f.result()) != int:
                        got_error = Exception(
                            "Batch #%s failed while merging source '%s' [%s]" %
                            (batch_num, src_name, f.result()))

                job.add_done_callback(partial(batch_merged, batch_num=bnum))
                jobs.append(job)
                bnum += 1
                # raise error as soon as we know
                if got_error:
                    raise got_error
        self.logger.info("%d jobs created for merging step" % len(jobs))
        tasks = asyncio.gather(*jobs)

        def done(f):
            nonlocal got_error, cnt  # cnt: report the merged count, not just the scheduled ids
            if None in f.result():
                got_error = Exception("Some batches failed")
                return
            # compute overall inserted/updated records
            cnt = sum(f.result())

        tasks.add_done_callback(done)
        yield from tasks
        if got_error:
            raise got_error
        else:
            return {"%s" % src_name: cnt}
Example #6
    def index(self,
              target_name,
              index_name,
              job_manager,
              steps=["index", "post"],
              batch_size=10000,
              ids=None,
              mode="index"):
        """
        Build an index named "index_name" with data from collection
        "target_collection". "ids" can be passed to selectively index documents. "mode" can have the following
        values:
        - 'purge': will delete index if it exists
        - 'resume': will use existing index and add documents. "ids" can be passed as a list of missing IDs, 
                 or, if not passed, ES will be queried to identify which IDs are missing for each batch in
                 order to complete the index.
        - None (default): will create a new index, assuming it doesn't already exist
        """
        assert job_manager
        # check what to do
        if type(steps) == str:
            steps = [steps]
        self.target_name = target_name
        self.index_name = index_name
        self.setup_log()
        self.load_build()

        got_error = False
        cnt = 0

        if "index" in steps:
            _db = mongo.get_target_db()
            target_collection = _db[target_name]
            _mapping = self.get_mapping()
            _extra = self.get_index_creation_settings()
            _meta = {}
            # partially instantiated indexer instance for process workers
            partial_idxer = partial(ESIndexer,
                                    doc_type=self.doc_type,
                                    index=index_name,
                                    es_host=self.host,
                                    step=batch_size,
                                    number_of_shards=self.num_shards,
                                    number_of_replicas=self.num_replicas)
            # instantiate one here for index creation
            es_idxer = partial_idxer()
            if es_idxer.exists_index():
                if mode == "purge":
                    es_idxer.delete_index()
                elif mode != "resume":
                    raise IndexerException(
                        "Index already '%s' exists, (use mode='purge' to auto-delete it or mode='resume' to add more documents)"
                        % index_name)

            if mode != "resume":
                es_idxer.create_index({self.doc_type: _mapping}, _extra)

            jobs = []
            total = target_collection.count()
            btotal = math.ceil(total / batch_size)
            bnum = 1
            if ids:
                self.logger.info(
                    "Indexing from '%s' with specific list of _ids, create indexer job with batch_size=%d"
                    % (target_name, batch_size))
                id_provider = [ids]
            else:
                self.logger.info(
                    "Fetch _ids from '%s', and create indexer job with batch_size=%d"
                    % (target_name, batch_size))
                id_provider = id_feeder(target_collection,
                                        batch_size=batch_size,
                                        logger=self.logger)
            for ids in id_provider:
                yield from asyncio.sleep(0.0)
                cnt += len(ids)
                pinfo = self.get_pinfo()
                pinfo["step"] = self.target_name
                pinfo["description"] = "#%d/%d (%.1f%%)" % (bnum, btotal,
                                                            (cnt / total *
                                                             100))
                self.logger.info("Creating indexer job #%d/%d, to index '%s' %d/%d (%.1f%%)" % \
                        (bnum,btotal,target_name,cnt,total,(cnt/total*100.)))
                job = yield from job_manager.defer_to_process(
                    pinfo,
                    partial(indexer_worker, self.target_name, ids,
                            partial_idxer, bnum, mode))

                def batch_indexed(f, batch_num):
                    nonlocal got_error
                    res = f.result()
                    if type(res) != tuple or type(res[0]) != int:
                        got_error = Exception(
                            "Batch #%s failed while indexing collection '%s' [result:%s]"
                            % (batch_num, self.target_name, repr(f.result())))

                job.add_done_callback(partial(batch_indexed, batch_num=bnum))
                jobs.append(job)
                bnum += 1
                # raise error as soon as we know
                if got_error:
                    raise got_error
            self.logger.info("%d jobs created for indexing step" % len(jobs))
            tasks = asyncio.gather(*jobs)

            def done(f):
                nonlocal got_error
                if None in f.result():
                    got_error = Exception("Some batches failed")
                    return
                # compute overall inserted/updated records
                # returned values looks like [(num,[]),(num,[]),...]
                cnt = sum([val[0] for val in f.result()])
                self.logger.info("Index '%s' successfully created" %
                                 index_name,
                                 extra={"notify": True})

            tasks.add_done_callback(done)
            yield from tasks

        if "post" in steps:
            self.logger.info("Running post-index process for index '%s'" %
                             index_name)
            pinfo = self.get_pinfo()
            pinfo["step"] = "post_index"
            # for some reason (like maintaining the object's state between picklings),
            # we can't use a process here. Need to use a thread to maintain that state without
            # building an unmaintainable monster
            job = yield from job_manager.defer_to_thread(
                pinfo,
                partial(self.post_index,
                        target_name,
                        index_name,
                        job_manager,
                        steps=steps,
                        batch_size=batch_size,
                        ids=ids,
                        mode=mode))

            def posted(f):
                nonlocal got_error
                try:
                    res = f.result()
                    self.logger.info(
                        "Post-index process done for index '%s': %s" %
                        (index_name, res))
                except Exception as e:
                    got_error = e
                    self.logger.error(
                        "Post-index process failed for index '%s': %s" %
                        (index_name, e),
                        extra={"notify": True})
                    raise

            job.add_done_callback(posted)
            yield from asyncio.gather(job)  # consume future

        if got_error:
            raise got_error
        else:
            return {"%s" % self.target_name: cnt}
Example #7
            def do():
                yield from asyncio.sleep(0.0)
                nonlocal mode

                pinfo = {
                    "category": INSPECTOR_CATEGORY,
                    "source": "%s" % repr(data_provider),
                    "step": "",
                    "description": ""
                }
                # register begin of inspection (differs slightly depending on type)
                if data_provider_type == "source":
                    registerer_obj.register_status("inspecting",
                                                   subkey="inspect")
                elif data_provider_type == "build":
                    registerer_obj.register_status("inspecting",
                                                   transient=True,
                                                   init=True,
                                                   job={"step": "inspect"})

                self.logger.info(
                    "Running inspector on %s (type:%s,data_provider:%s)" %
                    (repr(data_provider), data_provider_type,
                     backend_provider))
                if sample is not None:
                    self.logger.info(
                        "Sample set to %s, inspect only a subset of data",
                        sample)
                if limit is None:
                    self.logger.info("Inspecting all the documents")
                else:
                    nonlocal batch_size
                    # adjust batch_size so we inspect only "limit" docs if batch is smaller than the limit
                    if batch_size > limit:
                        batch_size = limit
                    self.logger.info("Inspecting only %s documents", limit)
                # make it pickleable
                if data_provider_type == "source":
                    # because register_obj is also used to fetch data, it has to be unprepare() for pickling
                    registerer_obj.unprepare()
                else:
                    # NOTE: do not unprepare() the builder, we'll lose the target name
                    # (it'd be randomly generated again) and we won't be able to register results
                    pass

                cnt = 0
                doccnt = 0
                jobs = []
                # normalize mode param and prepare global results
                if type(mode) == str:
                    mode = [mode]

                converters, mode = btinspect.get_converters(mode)

                inspected = {}
                for m in mode:
                    inspected.setdefault(m, {})

                backend = create_backend(backend_provider).target_collection
                for ids in id_feeder(backend, batch_size=batch_size):
                    if sample is not None:
                        if random.random() > sample:
                            continue
                    cnt += 1
                    doccnt += batch_size
                    if limit and doccnt > limit:
                        break
                    pinfo["description"] = "batch #%s" % cnt

                    def batch_inspected(bnum, i, f):
                        nonlocal inspected
                        nonlocal got_error
                        nonlocal mode
                        try:
                            res = f.result()
                            for m in mode:
                                inspected[m] = btinspect.merge_record(
                                    inspected[m], res[m], m)
                        except Exception as e:
                            got_error = e
                            self.logger.error(
                                "Error while inspecting data from batch #%s: %s"
                                % (bnum, e))
                            raise

                    pre_mapping = "mapping" in mode  # we want to generate intermediate mapping so we can merge
                    # all maps later and then generate the ES mapping from there
                    self.logger.info("Creating inspect worker for batch #%s" %
                                     cnt)
                    job = yield from self.job_manager.defer_to_process(
                        pinfo,
                        partial(inspect_data,
                                backend_provider,
                                ids,
                                mode=mode,
                                pre_mapping=pre_mapping,
                                **kwargs))
                    job.add_done_callback(partial(batch_inspected, cnt, ids))
                    jobs.append(job)

                yield from asyncio.gather(*jobs)

                # compute metadata (they were skipped before)
                for m in mode:
                    if m == "mapping":
                        try:
                            inspected["mapping"] = es.generate_es_mapping(
                                inspected["mapping"])
                            # metadata for mapping only once generated
                            inspected = btinspect.compute_metadata(
                                inspected, m)
                        except es.MappingError as e:
                            inspected["mapping"] = {
                                "pre-mapping": inspected["mapping"],
                                "errors": e.args[1]
                            }
                    else:
                        inspected = btinspect.compute_metadata(inspected, m)

                # just potential converters
                btinspect.run_converters(inspected, converters)

                def fully_inspected(res):
                    nonlocal got_error
                    try:
                        res = btinspect.stringify_inspect_doc(res)
                        _map = {"results": res}
                        _map["data_provider"] = repr(data_provider)
                        _map["started_at"] = started_at
                        _map["duration"] = timesofar(t0)

                        # when inspecting with "stats" mode, we can get huge numbers but mongo
                        # can't store more than 2^64; make sure to get rid of big nums there
                        def clean_big_nums(k, v):
                            # TODO: same with float/double? seems mongo handles more there ?
                            if isinstance(v, int) and v > 2**64:
                                return k, math.nan
                            else:
                                return k, v

                        dict_traverse(_map, clean_big_nums)
                        # register end of inspection (differs slightly depending on type)
                        if "mapping" in mode and "errors" in res[
                                "mapping"] and "pre-mapping" in res["mapping"]:
                            registerer_obj.register_status("failed",
                                                           subkey="inspect",
                                                           inspect=_map)
                            got_error = InspectorError(
                                res["mapping"]["errors"])
                        else:
                            if data_provider_type == "source":
                                registerer_obj.register_status(
                                    "success", subkey="inspect", inspect=_map)
                            elif data_provider_type == "build":
                                registerer_obj.register_status(
                                    "success",
                                    job={"step": "inspect"},
                                    build={"inspect": _map})
                    except Exception as e:
                        self.logger.exception(
                            "Error while inspecting data: %s" % e)
                        got_error = e
                        if data_provider_type == "source":
                            registerer_obj.register_status("failed",
                                                           subkey="inspect",
                                                           err=repr(e))
                        elif data_provider_type == "build":
                            registerer_obj.register_status(
                                "failed", job={"err": repr(e)})

                fully_inspected(inspected)
                if data_provider_type is None:
                    return
                if got_error:
                    raise got_error
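
clean_big_nums above is applied through dict_traverse, which walks a nested dict and lets the callback rewrite each key/value pair in place. If that helper isn't available, an equivalent walker can be sketched (a stand-in, not the library's implementation; nested lists are left untouched here):

def dict_traverse(d, func):
    # Recursively apply func(key, value) -> (new_key, new_value) to every
    # non-dict value of a nested dictionary, modifying it in place.
    for k in list(d):
        v = d[k]
        if isinstance(v, dict):
            dict_traverse(v, func)
        else:
            new_k, new_v = func(k, v)
            d.pop(k)
            d[new_k] = new_v
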
Example #8
            def do():
                yield from asyncio.sleep(0.0)
                nonlocal mode

                pinfo = {
                    "category": INSPECTOR_CATEGORY,
                    "source": "%s" % repr(data_provider),
                    "step": "",
                    "description": ""
                }
                # register begin of inspection (differs slightly depending on type)
                if data_provider_type == "source":
                    registerer_obj.register_status("inspecting",
                                                   subkey="inspect")
                elif data_provider_type == "build":
                    registerer_obj.register_status("inspecting",
                                                   transient=True,
                                                   init=True,
                                                   job={"step": "inspect"})

                self.logger.info("Running inspector on %s (type:%s,data_provider:%s)" % \
                        (repr(data_provider),data_provider_type,backend_provider))
                # make it pickleable
                if data_provider_type == "source":
                    # because register_obj is also used to fetch data, it has to be unprepare() for pickling
                    registerer_obj.unprepare()
                else:
                    # NOTE: do not unprepare() the builder, we'll lose the target name
                    # (it'd be randomly generated again) and we won't be able to register results
                    pass

                cnt = 0
                jobs = []
                # normalize mode param and prepare global results
                if type(mode) == str:
                    mode = [mode]
                inspected = {}
                for m in mode:
                    inspected.setdefault(m, {})

                backend = create_backend(backend_provider).target_collection
                for ids in id_feeder(backend, batch_size=batch_size):
                    cnt += 1
                    pinfo["description"] = "batch #%s" % cnt

                    def batch_inspected(bnum, i, f):
                        nonlocal inspected
                        nonlocal got_error
                        nonlocal mode
                        try:
                            res = f.result()
                            for m in mode:
                                inspected[m] = btinspect.merge_record(
                                    inspected[m], res[m], m)
                        except Exception as e:
                            got_error = e
                            self.logger.error(
                                "Error while inspecting data from batch #%s: %s"
                                % (bnum, e))
                            raise

                    pre_mapping = "mapping" in mode  # we want to generate intermediate mapping so we can merge
                    # all maps later and then generate the ES mapping from there
                    self.logger.info("Creating inspect worker for batch #%s" %
                                     cnt)
                    job = yield from self.job_manager.defer_to_process(
                        pinfo,
                        partial(inspect_data,
                                backend_provider,
                                ids,
                                mode=mode,
                                pre_mapping=pre_mapping,
                                **kwargs))
                    job.add_done_callback(partial(batch_inspected, cnt, ids))
                    jobs.append(job)

                yield from asyncio.gather(*jobs)

                # compute metadata (they were skipped before)
                for m in mode:
                    if m == "mapping":
                        try:
                            inspected["mapping"] = es.generate_es_mapping(
                                inspected["mapping"])
                            # metadata for mapping only once generated
                            inspected = btinspect.compute_metadata(
                                inspected, m)
                        except es.MappingError as e:
                            inspected["mapping"] = {
                                "pre-mapping": inspected["mapping"],
                                "errors": e.args[1]
                            }
                    else:
                        inspected = btinspect.compute_metadata(inspected, m)

                def fully_inspected(res):
                    nonlocal got_error
                    try:
                        res = btinspect.stringify_inspect_doc(res)
                        _map = {"results": res}
                        _map["data_provider"] = repr(data_provider)
                        _map["started_at"] = started_at
                        _map["duration"] = timesofar(t0)
                        # register end of inspection (differs slightly depending on type)
                        if "mapping" in mode and "errors" in res[
                                "mapping"] and "pre-mapping" in res["mapping"]:
                            registerer_obj.register_status("failed",
                                                           subkey="inspect",
                                                           inspect=_map)
                            got_error = InspectorError(
                                res["mapping"]["errors"])
                        else:
                            if data_provider_type == "source":
                                registerer_obj.register_status(
                                    "success", subkey="inspect", inspect=_map)
                            elif data_provider_type == "build":
                                registerer_obj.register_status(
                                    "success",
                                    job={"step": "inspect"},
                                    build={"inspect": _map})
                    except Exception as e:
                        self.logger.exception(
                            "Error while inspecting data: %s" % e)
                        got_error = e
                        if data_provider_type == "source":
                            registerer_obj.register_status("failed",
                                                           subkey="inspect",
                                                           err=repr(e))
                        elif data_provider_type == "build":
                            registerer_obj.register_status(
                                "failed", job={"err": repr(e)})

                fully_inspected(inspected)
                if data_provider_type is None:
                    return
                if got_error:
                    raise got_error
Example #9
    def index(self,
              target_name,
              index_name,
              job_manager,
              steps=["index", "post"],
              batch_size=10000,
              ids=None,
              mode="index",
              worker=None):
        """
        Build an index named "index_name" with data from collection "target_collection".

        "ids" can be passed to selectively index documents.

        "mode" can have the following values:
            - 'purge': will delete index if it exists
            - 'resume': will use existing index and add documents. "ids" can be passed as a list of missing IDs,
                    or, if not passed, ES will be queried to identify which IDs are missing for each batch in
                    order to complete the index.
            - 'merge': will merge data with the existing index's documents, used when the index is populated in several distinct passes (cold/hot merge for instance)
            - None (default): will create a new index, assuming it doesn't already exist
        """
        assert job_manager
        # check what to do
        if type(steps) == str:
            steps = [steps]
        self.target_name = target_name
        self.index_name = index_name
        self.load_build()
        self.setup_log()
        # select proper index worker according to mode:
        if worker is None:  # none specified, choose correct one
            if mode == "merge":
                worker = merge_index_worker
            else:
                worker = new_index_worker

        got_error = False
        cnt = 0

        if "index" in steps:
            self.register_status("indexing",
                                 transient=True,
                                 init=True,
                                 job={"step": "index"})
            assert self.build_doc.get("backend_url")
            target_collection = create_backend(
                self.build_doc["backend_url"]).target_collection
            backend_url = self.build_doc["backend_url"]
            _mapping = self.get_mapping()
            _extra = self.get_index_creation_settings()
            # partially instantiated indexer instance for process workers
            partial_idxer = partial(ESIndexer,
                                    doc_type=self.doc_type,
                                    index=index_name,
                                    es_host=self.host,
                                    step=batch_size,
                                    number_of_shards=self.num_shards,
                                    number_of_replicas=self.num_replicas,
                                    **self.kwargs)
            # instantiate one here for index creation
            es_idxer = partial_idxer()
            if es_idxer.exists_index():
                if mode == "purge":
                    es_idxer.delete_index()
                elif mode not in ["resume", "merge"]:
                    msg = "Index already '%s' exists, (use mode='purge' to auto-delete it or mode='resume' to add more documents)" % index_name
                    self.register_status("failed", job={"err": msg})
                    raise IndexerException(msg)

            if mode not in ["resume", "merge"]:
                try:
                    es_idxer.create_index({self.doc_type: _mapping}, _extra)
                except Exception as e:
                    self.logger.exception("Failed to create index")
                    self.register_status("failed", job={"err": repr(e)})
                    raise

            def clean_ids(ids):
                # can't use a generator, it's going to be pickled
                cleaned = []
                for _id in ids:
                    if type(_id) != str:
                        self.logger.warning(
                            "_id '%s' has invalid type (!str), skipped",
                            repr(_id))
                        continue
                    if len(_id) > 512:  # this is an ES6 limitation
                        self.logger.warning("_id is too long: '%s'", _id)
                        continue
                    cleaned.append(_id)
                return cleaned

            jobs = []
            total = target_collection.count()
            btotal = math.ceil(total / batch_size)
            bnum = 1
            if ids:
                self.logger.info(
                    "Indexing from '%s' with specific list of _ids, create indexer job with batch_size=%d",
                    target_name, batch_size)
                id_provider = iter_n(ids, batch_size)
            else:
                self.logger.info(
                    "Fetch _ids from '%s', and create indexer job with batch_size=%d",
                    target_name, batch_size)
                id_provider = id_feeder(target_collection,
                                        batch_size=batch_size,
                                        logger=self.logger)
            for ids in id_provider:
                yield from asyncio.sleep(0.0)
                origcnt = len(ids)
                ids = clean_ids(ids)
                newcnt = len(ids)
                if origcnt != newcnt:
                    self.logger.warning(
                        "%d document(s) can't be indexed and will be skipped (invalid _id)",
                        origcnt - newcnt)
                # progress count
                cnt += len(ids)
                pinfo = self.get_pinfo()
                pinfo["step"] = self.target_name
                try:
                    descprogress = cnt / total * 100
                except ZeroDivisionError:
                    descprogress = 0.0
                pinfo["description"] = "#%d/%d (%.1f%%)" % (bnum, btotal,
                                                            descprogress)
                self.logger.info(
                    "Creating indexer job #%d/%d, to index '%s' %d/%d (%.1f%%)",
                    bnum, btotal, backend_url, cnt, total, descprogress)
                job = yield from job_manager.defer_to_process(
                    pinfo,
                    partial(indexer_worker, backend_url, ids, partial_idxer,
                            bnum, mode, worker))

                def batch_indexed(f, batch_num):
                    nonlocal got_error
                    try:
                        res = f.result()
                        if type(res) != tuple or type(res[0]) != int:
                            got_error = Exception(
                                "Batch #%s failed while indexing collection '%s' [result:%s]"
                                % (batch_num, self.target_name, repr(res)))
                    except Exception as e:
                        got_error = e
                        self.logger.exception("Batch indexed error")
                        return

                job.add_done_callback(partial(batch_indexed, batch_num=bnum))
                jobs.append(job)
                bnum += 1
                # raise error as soon as we know
                if got_error:
                    self.register_status("failed",
                                         job={"err": repr(got_error)})
                    raise got_error
            self.logger.info("%d jobs created for indexing step", len(jobs))
            tasks = asyncio.gather(*jobs)

            def done(f):
                nonlocal got_error
                if None in f.result():
                    got_error = Exception("Some batches failed")
                    return
                # compute overall inserted/updated records
                # returned values looks like [(num,[]),(num,[]),...]
                cnt = sum((val[0] for val in f.result()))
                self.register_status("success",
                                     job={"step": "index"},
                                     index={"count": cnt})
                if total != cnt:
                    # raise error if counts don't match, but index is still created,
                    # fully registered in case we want to use it anyways
                    err = "Merged collection has %d documents but %d have been indexed (check logs for more)" % (
                        total, cnt)
                    raise IndexerException(err)
                self.logger.info(
                    "Index '%s' successfully created using merged collection %s",
                    index_name,
                    target_name,
                    extra={"notify": True})

            tasks.add_done_callback(done)
            yield from tasks

        if "post" in steps:
            self.logger.info("Running post-index process for index '%s'",
                             index_name)
            self.register_status("indexing",
                                 transient=True,
                                 init=True,
                                 job={"step": "post-index"})
            pinfo = self.get_pinfo()
            pinfo["step"] = "post_index"
            # for some reason (like maintaining the object's state between picklings),
            # we can't use a process here. Need to use a thread to maintain that state without
            # building an unmaintainable monster
            job = yield from job_manager.defer_to_thread(
                pinfo,
                partial(self.post_index,
                        target_name,
                        index_name,
                        job_manager,
                        steps=steps,
                        batch_size=batch_size,
                        ids=ids,
                        mode=mode))

            def posted(f):
                nonlocal got_error
                try:
                    res = f.result()
                    self.logger.info(
                        "Post-index process done for index '%s': %s",
                        index_name, res)
                    self.register_status("indexing",
                                         job={"step": "post-index"})
                except Exception as e:
                    got_error = e
                    self.logger.exception(
                        "Post-index process failed for index '%s':",
                        index_name,
                        extra={"notify": True})
                    return

            job.add_done_callback(posted)
            yield from job  # consume future

        if got_error:
            self.register_status("failed", job={"err": repr(got_error)})
            raise got_error
        else:
            self.register_status("success")
            return {"%s" % self.index_name: cnt}
Example #10
    def set_chrom(self, batch_size, job_manager):
        # divide & conquer... build batches
        jobs = []
        total = self.target_backend.count()
        btotal = math.ceil(total/batch_size)
        bnum = 1
        cnt = 0
        results = {
            "missing" : {
                "count": 0,
                "examples": []
            },
            "disagreed" : {
                "count": 0,
                "examples": []
            }
        }
        root_keys = {}
        # grab ids only, so we can get more and fill the queue for each step
        # each round, fill the queue to make sure every CPU slot is always working
        id_batch_size = batch_size * job_manager.process_queue._max_workers * 2
        self.logger.info("Fetch _ids from '%s' with batch_size=%d, and create post-merger job with batch_size=%d" % \
                (self.target_backend.target_collection.name, id_batch_size, batch_size))
        for big_doc_ids in id_feeder(self.target_backend.target_collection, batch_size=id_batch_size, logger=self.logger):
            for doc_ids in iter_n(big_doc_ids,batch_size):
                # yield from asyncio.sleep(0.1)   # commented out: this raised an error in py3.8
                cnt += len(doc_ids)
                pinfo = self.get_pinfo()
                pinfo["step"] = "post-merge (chrom)"
                pinfo["description"] = "#%d/%d (%.1f%%)" % (bnum,btotal,(cnt/total*100.))
                self.logger.info("Creating post-merge job #%d/%d to process chrom %d/%d (%.1f%%)" % \
                        (bnum,btotal,cnt,total,(cnt/total*100.)))
                job = yield from job_manager.defer_to_process(pinfo,
                        partial(chrom_worker, self.target_backend.target_name, doc_ids))
                def processed(f,results, batch_num):
                    try:
                        fres = f.result()
                        for errtype in ("missing","disagreed"):
                            if fres[errtype]:
                                results[errtype]["count"] += len(fres[errtype])
                                if len(results[errtype]["examples"]) < self.__class__.MAX_CHROM_EX:
                                    results[errtype]["examples"].extend(fres[errtype])
                        # merge root key counts
                        rk = fres["root_keys"]
                        for k in rk:
                            root_keys.setdefault(k,0)
                            root_keys[k] += rk[k]
                        self.logger.info("chrom batch #%d, done" % batch_num)
                    except Exception as e:
                        import traceback
                        self.logger.error("chrom batch #%d, error in processed (set_chrom): %s:\n%s" % \
                                (batch_num, e, traceback.format_exc()))
                        raise
                job.add_done_callback(partial(processed, results=results, batch_num=bnum))
                jobs.append(job)
                bnum += 1
        self.logger.info("%d jobs created for merging step" % len(jobs))
        if jobs:
            yield from asyncio.gather(*jobs)
            self.logger.info("Found %d missing 'chrom' and %d where resources disagreed" % (results["missing"]["count"], results["disagreed"]["count"]))
            if results["missing"]["count"] or results["disagreed"]["count"]:
                fn = "chrom_%s_%s.pickle" % (self.target_backend.target_name,datetime.datetime.now().strftime("%Y%m%d_%H%M%S"))
                self.logger.info("Pickling 'chrom' discrepancies into %s" % fn)
                with open(fn, "wb") as fobj:
                    pickle.dump(results, fobj)
            # now store metadata
            root_keys["total"] = root_keys.pop("_id")
            self.logger.info("Root keys: %s" % root_keys)
            src_build = self.source_backend.build
            src_build.update({'_id': self.target_backend.target_name},{"$set":{"_meta.stats":root_keys}})

        return results
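
The root-key bookkeeping in set_chrom (setdefault plus increment for every batch result) is equivalent to summing Counters. A minimal, self-contained sketch of just that aggregation step, with hypothetical batch results:

from collections import Counter

# Stand-ins for the per-batch dicts returned by chrom_worker (keys are made up).
batch_results = [
    {"root_keys": {"_id": 3, "dbsnp": 2}},
    {"root_keys": {"_id": 5, "clinvar": 1}},
]
root_keys = Counter()
for fres in batch_results:
    root_keys.update(fres["root_keys"])
stats = dict(root_keys)
stats["total"] = stats.pop("_id")   # as in set_chrom: the _id count doubles as the total
print(stats)                        # {'dbsnp': 2, 'clinvar': 1, 'total': 8}
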
Example #11
 def rebuild(col):
     cur = mongo.id_feeder(col,batch_size=10000,logger=config.logger,force_build=force_build)
     [i for i in cur] # just iterate
Example #12
    def do_index(self, job_manager, batch_size, ids, mode, **kwargs):

        client = MongoClient(**self.mongo_client_args)
        database = client[self.mongo_database_name]
        collection = database[self.mongo_collection_name]

        if ids:
            self.logger.info(("Indexing from '%s' with specific list of _ids, "
                              "create indexer job with batch_size=%d."),
                             self.mongo_collection_name, batch_size)
            # use user provided ids in batch
            id_provider = iter_n(ids, batch_size)
        else:
            self.logger.info(("Fetch _ids from '%s', and create "
                              "indexer job with batch_size=%d."),
                             self.mongo_collection_name, batch_size)
            # use ids from the target mongodb collection in batch
            id_provider = id_feeder(collection, batch_size, logger=self.logger)

        jobs = []  # asyncio.Future(s)
        error = None  # the first Exception

        total = len(ids) if ids else collection.count()
        schedule = Schedule(total, batch_size)

        def batch_finished(future):
            nonlocal error
            try:
                schedule.finished += future.result()
            except Exception as exc:
                self.logger.warning(exc)
                error = exc

        for batch_num, ids in zip(schedule, id_provider):
            yield from asyncio.sleep(0.0)

            # when one batch fails and job scheduling has not completed,
            # stop scheduling and cancel all on-going jobs, to fail quickly.

            if error:
                for job in jobs:
                    if not job.done():
                        job.cancel()
                raise error

            self.logger.info(schedule)

            pinfo = self.pinfo.get_pinfo(
                schedule.suffix(self.mongo_collection_name))

            job = yield from job_manager.defer_to_process(
                pinfo, dispatch, self.mongo_client_args,
                self.mongo_database_name, self.mongo_collection_name,
                self.es_client_args, self.es_blkidx_args, self.es_index_name,
                ids, mode, batch_num)
            job.add_done_callback(batch_finished)
            jobs.append(job)

        self.logger.info(schedule)
        yield from asyncio.gather(*jobs)

        schedule.completed()
        self.logger.notify(schedule)
        return {"count": total, "created_at": datetime.now().astimezone()}
Example #13
def export_ids(col_name):
    """
    Export all _ids from collection named col_name.
    If col_name refers to a build where a cold_collection is defined,
    will also extract _ids and sort/uniq them to have the full list of _ids
    of the actual merged (cold+hot) collection.
    Output file is stored in DATA_EXPORT_FOLDER/ids,
    defaulting to <DATA_ARCHIVE_ROOT>/export/ids. Output filename is
    returned at the end, if successful.
    """
    # prepare output directory
    DATA_EXPORT_FOLDER = getattr(btconfig,"DATA_EXPORT_FOLDER",None)
    if not DATA_EXPORT_FOLDER:
        DATA_EXPORT_FOLDER = os.path.join(btconfig.DATA_ARCHIVE_ROOT,"export")
    ids_export_folder = os.path.join(DATA_EXPORT_FOLDER,"ids")
    if not os.path.exists(ids_export_folder):
        logging.debug("Creating export/ids folder: %s" % ids_export_folder)
        os.makedirs(ids_export_folder)
    build = get_src_build().find_one({"_id":col_name})
    cold = None
    if build:
        col = get_target_db()[col_name]
        if build.get("build_config",{}).get("cold_collection"):
            cold_name = build["build_config"]["cold_collection"]
            cold = get_target_db()[cold_name]
            logging.info("Found a cold collection '%s' associated to '%s'" % (cold_name,col_name))
    else:
        # it's a src
        col = get_src_db()[col_name]
    
    # first iterate over all _ids. This will potentially update the underlying _id cache if it's not valid anymore,
    # so we're sure to work with the latest data. If the cache is valid, this will be pretty fast
    logging.info("Screening _ids in collection '%s'" % col.name)
    for _id in id_feeder(col,validate_only=True):
        pass
    # now accessing cache
    col_ids_cache = get_cache_filename(col.name)
    assert os.path.exists(col_ids_cache)
    logging.info("Now using cache file %s" % col_ids_cache)
    if cold:
        logging.info("Screening _ids in cold collection '%s'" % cold.name)
        for _id in id_feeder(cold,validate_only=True):
            pass
        # now accessing cache
        cold_ids_cache = get_cache_filename(cold.name)
        assert os.path.exists(cold_ids_cache)
        logging.info("Now using cache file %s" % cold_ids_cache)
    outfn = os.path.join(ids_export_folder,"%s_ids.xz" % col_name)
    # NOTE: we can't use anyfile to open the cache files and send _ids through pipes
    # because it would load the _ids in memory (unless using hacks), so use cat (or the
    # matching decompressing variants, like gzcat/xzcat/...) to run the whole pipe
    # on the shell
    if cold:
        fout = anyfile(outfn,"wb")
        colext = os.path.splitext(col_ids_cache)[1]
        coldext = os.path.splitext(cold_ids_cache)[1]
        assert colext == coldext, "Hot and cold _id caches are compressed differently (%s and %s), they should be the same" % (colext,coldext)
        comp = colext.replace(".","")
        supportedcomps = ["xz","gz",""] # no compression allowed as well
        assert comp in supportedcomps, "Compression '%s' isn't supported (%s)" % (comp,supportedcomps)
        # _ids are sent to the pipe's input (sort -u), then the result is compressed (xz)
        pcat = subprocess.Popen(["%scat" % comp, col_ids_cache, cold_ids_cache],stdout=subprocess.PIPE)
        psort = subprocess.Popen(["sort","-u"],stdin=pcat.stdout,stdout=subprocess.PIPE,universal_newlines=True)
        pcat.stdout.close() # so pcat receives SIGPIPE if sort exits before reading everything
        if comp:
            pcomp = subprocess.Popen(["xz","-c"],stdin=psort.stdout,stdout=fout)
        else:
            # just print stdin to stdout
            pcomp = subprocess.Popen(["tee"],stdin=psort.stdout,stdout=fout)
        psort.stdout.close()
        try:
            logging.info("Running pipe to compute list of unique _ids")
            (out,err) = pcomp.communicate() # run the pipe! (blocking)
            if err:
                raise Exception(err)
        except Exception as e:
            logging.error("Error while running pipe to export _ids: %s" % e)
            # make sure to clean up empty or half-processed files
            try:
                os.unlink(outfn)
            except OSError:
                pass
            raise
    else:
        logging.info("Copying cache _id file")
        try:
            shutil.copyfile(col_ids_cache,outfn)
        except Exception as e:
            logging.error("Error while exporting _ids: %s" % e)
            # make sure to clean up empty or half-processed files
            try:
                os.unlink(outfn)
            except OSError:
                pass
            raise

    logging.info("Done exporting _ids to '%s'" % outfn)
    return outfn
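
The hot/cold branch above is essentially running `xzcat hot cold | sort -u | xz -c > out` through subprocess, so the _ids are streamed by shell tools and never loaded into Python memory. Here is a standalone sketch of that pipe, assuming the xz tools are available on the host; merge_sorted_unique and the file names are placeholders.

import subprocess

def merge_sorted_unique(cache_files, outfn):
    # equivalent of: xzcat f1 f2 | sort -u | xz -c > outfn
    with open(outfn, "wb") as fout:
        pcat = subprocess.Popen(["xzcat"] + list(cache_files),
                                stdout=subprocess.PIPE)
        psort = subprocess.Popen(["sort", "-u"],
                                 stdin=pcat.stdout, stdout=subprocess.PIPE)
        pcat.stdout.close()  # so pcat gets SIGPIPE if sort exits early
        pcomp = subprocess.Popen(["xz", "-c"], stdin=psort.stdout, stdout=fout)
        psort.stdout.close()
        pcomp.communicate()  # blocks until the whole pipe has run
        if pcomp.returncode != 0:
            raise RuntimeError("compression step failed (%s)" % pcomp.returncode)

# usage sketch:
# merge_sorted_unique(["hot_ids.xz", "cold_ids.xz"], "all_ids.xz")
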
Beispiel #14
0
    def diff_cols(self,
                  old_db_col_names,
                  new_db_col_names,
                  batch_size=100000,
                  steps=["count", "content", "mapping"],
                  mode=None,
                  exclude=[]):
        """
        Compare new with old collections and produce diff files. Root keys can be excluded from
        the comparison with the "exclude" parameter.
        *_db_col_names can be:
         1. a collection name (as a string), assuming it lives
            in the target database.
         2. a tuple with 2 elements: the first one is either "source" or "target"
            to respectively specify the src or target database, and the second element is
            the collection name.
         3. a tuple with 3 elements (URI, db, collection), looking like:
            ("mongodb://*****:*****@host","dbname","collection"), allowing to specify
            any connection on any server.
        steps: 'count' will count the root keys for every document in the new collection
               (to check the number of docs from datasources).
               'content' will perform a diff on the actual content.
               'mapping' will perform a diff on the ES mappings (if a target collection is involved).
        mode: 'purge' will remove any existing files for this comparison.
        """
        new = create_backend(new_db_col_names)
        old = create_backend(old_db_col_names)
        # check what to do
        if isinstance(steps, str):
            steps = [steps]

        diff_folder = generate_diff_folder(old_db_col_names, new_db_col_names)

        if os.path.exists(diff_folder):
            if mode == "purge" and os.path.exists(diff_folder):
                rmdashfr(diff_folder)
            else:
                raise FileExistsError(
                    "Found existing files in '%s', use mode='purge'" %
                    diff_folder)
        if not os.path.exists(diff_folder):
            os.makedirs(diff_folder)

        # create metadata file storing info about how we created the diff
        # and some summary data
        diff_stats = {
            "update": 0,
            "add": 0,
            "delete": 0,
            "mapping_changed": False
        }
        metadata = {
            "diff": {
                "type": self.diff_type,
                "func": self.diff_func.__name__,
                "version": "%s.%s" % (old.version, new.version),
                "stats": diff_stats,  # ref to diff_stats
                "files": [],
                # when "new" is a target collection:
                "mapping_file": None,
                "info": {
                    "generated_on": str(datetime.now()),
                    "exclude": exclude,
                    "steps": steps,
                    "mode": mode,
                    "batch_size": batch_size
                }
            },
            "old": {
                "backend": old_db_col_names,
                "version": old.version
            },
            "new": {
                "backend": new_db_col_names,
                "version": new.version
            },
            # when "new" is a target collection:
            "_meta": {},
            "build_config": {},
        }
        if isinstance(
                new, DocMongoBackend
        ) and new.target_collection.database.name == btconfig.DATA_TARGET_DATABASE:
            build_doc = get_src_build().find_one(
                {"_id": new.target_collection.name})
            if not build_doc:
                raise DifferException("Collection '%s' has no corresponding build document" % \
                        new.target_collection.name)
            metadata["_meta"] = build_doc.get("_meta", {})
            metadata["build_config"] = build_doc.get("build_config")

        # dump it here so we at least have minimal information, in case we don't go further
        json.dump(metadata,
                  open(os.path.join(diff_folder, "metadata.json"), "w"),
                  indent=True)

        got_error = False
        if "mapping" in steps:

            def diff_mapping(old, new, diff_folder):
                summary = {}
                old_build = get_src_build().find_one(
                    {"_id": old.target_collection.name})
                new_build = get_src_build().find_one(
                    {"_id": new.target_collection.name})
                if old_build and new_build:
                    # mapping diff always in jsondiff
                    mapping_diff = jsondiff(old_build["mapping"],
                                            new_build["mapping"])
                    if mapping_diff:
                        file_name = os.path.join(diff_folder, "mapping.pyobj")
                        dump(mapping_diff, file_name)
                        md5 = md5sum(file_name)
                        summary["mapping_file"] = {
                            "name": os.path.basename(file_name),
                            "md5sum": md5
                        }
                else:
                    self.logger.info("Neither '%s' nor '%s' have mappings associated to them, skip" % \
                            (old.target_collection.name,new.target_collection.name))
                return summary

            def mapping_diffed(f):
                res = f.result()
                if res.get("mapping_file"):
                    nonlocal got_error
                    # check mapping differences: only "add" ops are allowed, as any other actions would be
                    # ignored by ES once applied (you can't update/delete elements of an existing mapping)
                    mf = os.path.join(diff_folder, res["mapping_file"]["name"])
                    ops = loadobj(mf)
                    for op in ops:
                        if op["op"] != "add":
                            err = DifferException("Found diff operation '%s' in mapping file, " % op["op"] + \
                                "only 'add' operations are allowed. You can still produce the " + \
                                "diff by removing 'mapping' from the 'steps' argument. " + \
                                "Ex: steps=['count','content']. Diff operation was: %s" % op)
                            got_error = err
                    metadata["diff"]["mapping_file"] = mf
                    diff_stats["mapping_changed"] = True
                self.logger.info(
                    "Diff file containing mapping differences generated: %s" %
                    res.get("mapping_file"))

            pinfo = {
                "category": "diff",
                "source": "%s vs %s" % (new.target_name, old.target_name),
                "step": "mapping: old vs new",
                "description": ""
            }
            job = yield from self.job_manager.defer_to_thread(
                pinfo, partial(diff_mapping, old, new, diff_folder))
            job.add_done_callback(mapping_diffed)
            yield from job
            if got_error:
                raise got_error

        if "count" in steps:
            cnt = 0
            pinfo = {
                "category": "diff",
                "step": "count",
                "source": "%s vs %s" % (new.target_name, old.target_name),
                "description": ""
            }

            self.logger.info("Counting root keys in '%s'" % new.target_name)
            diff_stats["root_keys"] = {}
            jobs = []
            data_new = id_feeder(new, batch_size=batch_size)
            for id_list in data_new:
                cnt += 1
                pinfo["description"] = "batch #%s" % cnt
                self.logger.info("Creating diff worker for batch #%s" % cnt)
                job = yield from self.job_manager.defer_to_process(
                    pinfo,
                    partial(diff_worker_count, id_list, new_db_col_names, cnt))
                jobs.append(job)

            def counted(f):
                root_keys = {}
                # merge the counts
                for d in f.result():
                    for k in d:
                        root_keys.setdefault(k, 0)
                        root_keys[k] += d[k]
                self.logger.info("root keys count: %s" % root_keys)
                diff_stats["root_keys"] = root_keys

            tasks = asyncio.gather(*jobs)
            tasks.add_done_callback(counted)
            yield from tasks
            self.logger.info(
                "Finished counting keys in the new collection: %s" %
                diff_stats["root_keys"])

        if "content" in steps:
            skip = 0
            cnt = 0
            jobs = []
            pinfo = {
                "category": "diff",
                "source": "%s vs %s" % (new.target_name, old.target_name),
                "step": "content: new vs old",
                "description": ""
            }
            data_new = id_feeder(new, batch_size=batch_size)
            selfcontained = "selfcontained" in self.diff_type
            for id_list_new in data_new:
                cnt += 1
                pinfo["description"] = "batch #%s" % cnt

                def diffed(f):
                    res = f.result()
                    diff_stats["update"] += res["update"]
                    diff_stats["add"] += res["add"]
                    if res.get("diff_file"):
                        metadata["diff"]["files"].append(res["diff_file"])
                    self.logger.info("(Updated: {}, Added: {})".format(
                        res["update"], res["add"]))

                self.logger.info("Creating diff worker for batch #%s" % cnt)
                job = yield from self.job_manager.defer_to_process(
                    pinfo,
                    partial(diff_worker_new_vs_old, id_list_new,
                            old_db_col_names, new_db_col_names, cnt,
                            diff_folder, self.diff_func, exclude,
                            selfcontained))
                job.add_done_callback(diffed)
                jobs.append(job)
            yield from asyncio.gather(*jobs)
            self.logger.info(
                "Finished calculating diff for the new collection. Total number of docs updated: {}, added: {}"
                .format(diff_stats["update"], diff_stats["add"]))

            data_old = id_feeder(old, batch_size=batch_size)
            jobs = []
            pinfo["step"] = "content: old vs new"
            for id_list_old in data_old:
                cnt += 1
                pinfo["description"] = "batch #%s" % cnt

                def diffed(f):
                    res = f.result()
                    diff_stats["delete"] += res["delete"]
                    if res.get("diff_file"):
                        metadata["diff"]["files"].append(res["diff_file"])
                    self.logger.info("(Deleted: {})".format(res["delete"]))

                self.logger.info("Creating diff worker for batch #%s" % cnt)
                job = yield from self.job_manager.defer_to_process(
                    pinfo,
                    partial(diff_worker_old_vs_new, id_list_old,
                            new_db_col_names, cnt, diff_folder))
                job.add_done_callback(diffed)
                jobs.append(job)
            yield from asyncio.gather(*jobs)
            self.logger.info(
                "Finished calculating diff for the old collection. Total number of docs deleted: {}"
                .format(diff_stats["delete"]))

        self.logger.info(
            "Summary: (Updated: {}, Added: {}, Deleted: {}, Mapping changed: {})"
            .format(diff_stats["update"], diff_stats["add"],
                    diff_stats["delete"], diff_stats["mapping_changed"]))

        # dump the metadata again, now with potentially more information (diff_stats)
        json.dump(metadata,
                  open(os.path.join(diff_folder, "metadata.json"), "w"),
                  indent=True)
        strargs = "[old=%s,new=%s,steps=%s,diff_stats=%s]" % (
            old_db_col_names, new_db_col_names, steps, diff_stats)
        self.logger.info("success %s" % strargs, extra={"notify": True})
        return diff_stats
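
The 'count' step sends each batch of _ids to a diff_worker_count process and merges the per-batch results in the counted() callback. Below is a hypothetical minimal version of that worker and of the merge; diff_worker_count's real implementation is not shown in this example, so the names and shapes here are illustrative only.

from collections import Counter

def count_root_keys(docs):
    # per-batch worker: count how many documents carry each root key
    counts = Counter()
    for doc in docs:
        counts.update(doc.keys())
    return counts

def merge_root_key_counts(batch_results):
    # equivalent of the counted() callback: fold per-batch counts together
    total = Counter()
    for counts in batch_results:
        total.update(counts)
    return dict(total)

# usage sketch:
# merge_root_key_counts([count_root_keys([{"_id": "1", "dbsnp": {}}, {"_id": "2"}])])
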
Beispiel #15
0
    def do_snpeff(self, batch_size=SNPEFF_BATCH_SIZE, force=False, force_use_cache=False):
        self.logger.info("Updating snpeff information from source '%s' (collection:%s)" % (self.fullname,self.collection_name))
        # select Snpeff uploader to get collection name and src_dump _id
        version = self.__class__.__metadata__["assembly"]
        snpeff_class = getattr(snpeff_upload,"Snpeff%sUploader" % version.capitalize())
        snpeff_main_source = snpeff_class.main_source
        snpeff_doc = self.src_dump.find_one({"_id" : snpeff_main_source})
        assert snpeff_doc, "No snpeff information found, has it been dumped & uploaded?"
        snpeff_dir = snpeff_doc["data_folder"]
        # -q: when there's an update, there's a message on stderr....
        cmd = "java -Xmx4g -jar %s/snpEff/snpEff.jar -t -noStats -noExpandIUB %s" % (snpeff_dir,version)
        # genome files are in "data_folder"/../data
        genomes = glob.glob(os.path.join(snpeff_dir,"..","data","%s_genome.*" % version))
        assert len(genomes) == 1, "Expected only one genome file for '%s', got: %s" % (version,genomes)
        genome = genomes[0]
        annotator = snpeff_parser.SnpeffAnnotator(cmd,logger=self.logger)
        vcf_builder = snpeff_parser.VCFConstruct(genome,logger=self.logger)
        storage = UpsertStorage(None,snpeff_class.name,self.logger)
        col = self.db[self.collection_name]
        total = math.ceil(col.count()/batch_size)
        cnt = 0
        to_process = []

        def process(ids):
            self.logger.info("%d documents to annotate" % len(ids))
            hgvs_vcfs = vcf_builder.build_vcfs(ids)
            # merge "vcf" and snpeff annotations keys when possible
            # (it no snpeff data, we keep 'vcf' data)
            for annot in annotator.annotate(hgvs_vcfs):
                hgvs_vcfs[annot["_id"]].update(annot)
            # trim if the sequence is too big
            for _id in hgvs_vcfs:
                vcf = hgvs_vcfs[_id]
                for k in ["alt","ref"]:
                    if len(vcf["vcf"][k]) > MAX_REF_ALT_LEN:
                        msg = "...(trimmed)"
                        vcf["vcf"][k] = vcf["vcf"][k][:MAX_REF_ALT_LEN - len(msg)] + msg
                hgvs_vcfs[_id] = vcf

            data = annotate_start_end(hgvs_vcfs,version)
            howmany = storage.process(data, batch_size)
            if howmany:
                # we need to update some metadata info about snpeff b/c data has changed
                # so cache could be invalid
                self.logger.debug("Invalidating cache for '%s'" % snpeff_class.name)
                mongo.invalidate_cache(snpeff_class.name)

        for ids in id_feeder(col, batch_size=batch_size, logger=self.logger, force_use=force_use_cache):
            cnt += 1
            self.logger.debug("Processing batch %s/%s [%.1f]" % (cnt,total,(cnt/total*100)))
            # don't re-compute annotations if already there
            if not force:
                for subids in iter_n(ids,10000):
                    cur = storage.temp_collection.find({'_id' : {'$in' : subids}},{'_id':1})
                    already_ids = [d["_id"] for d in list(cur)]
                    newids = list(set(subids).difference(set(already_ids)))
                    if len(subids) != len(newids):
                        self.logger.debug("%d documents already have snpeff annotations, skip them" % \
                                (len(subids) - len(newids)))
                    to_process.extend(newids)
                    self.logger.debug("Batch filled %d out of %d" % (len(to_process),batch_size))
                    if len(to_process) < batch_size:
                        # can fill more...
                        continue
                    process(to_process)
                    to_process = []
            else:
                # force=True: annotate the whole batch, whether or not it was already processed
                process(ids)
        # process any remaining ids (only filled in non-force mode)
        if to_process:
            process(to_process)
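
The loop above keeps filling to_process across id_feeder batches and only calls process() once at least batch_size new ids have accumulated, flushing the remainder at the end. Below is a standalone sketch of that buffering pattern; the id source, the already-done filter and the worker are placeholders chosen for illustration.

def process_in_filled_batches(id_batches, already_done, process, batch_size):
    # accumulate ids not processed yet until batch_size is reached,
    # then flush; whatever is left is flushed once at the end
    buffer = []
    for ids in id_batches:
        buffer.extend(i for i in ids if i not in already_done)
        if len(buffer) < batch_size:
            continue  # can fill more
        process(buffer)
        buffer = []
    if buffer:
        process(buffer)

# usage sketch:
# process_in_filled_batches([[1, 2, 3], [4, 5]], {2}, print, batch_size=3)
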