Example #1
        def process(ids):
            self.logger.info("%d documents to annotate" % len(ids))
            hgvs_vcfs = vcf_builder.build_vcfs(ids)
            # merge "vcf" and snpeff annotations keys when possible
            # (if no snpeff data, we keep 'vcf' data)
            for annot in annotator.annotate(hgvs_vcfs):
                hgvs_vcfs[annot["_id"]].update(annot)
            # trim if sequence is too big
            for _id in hgvs_vcfs:
                vcf = hgvs_vcfs[_id]
                for k in ["alt", "ref"]:
                    if len(vcf["vcf"][k]) > MAX_REF_ALT_LEN:
                        msg = "...(trimmed)"
                        vcf["vcf"][k] = vcf["vcf"][k][:MAX_REF_ALT_LEN -
                                                      len(msg)] + msg
                hgvs_vcfs[_id] = vcf

            data = annotate_start_end(hgvs_vcfs, version)
            howmany = storage.process(data, batch_size)
            if howmany:
                # we need to update some metadata info about snpeff b/c data has changed
                # so cache could be invalid
                self.logger.debug("Invalidating cache for '%s'" %
                                  snpeff_class.name)
                mongo.invalidate_cache(snpeff_class.name)
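
The trimming step above caps the "ref" and "alt" strings so oversized sequences don't bloat the stored documents, replacing the tail with a visible marker. Below is a minimal, self-contained sketch of that pattern; the MAX_REF_ALT_LEN value and the sample record are made up for illustration.

    MAX_REF_ALT_LEN = 10000

    def trim_vcf_fields(vcf_doc, max_len=MAX_REF_ALT_LEN):
        """Truncate overly long ref/alt sequences, keeping a trailing marker."""
        msg = "...(trimmed)"
        for k in ["alt", "ref"]:
            seq = vcf_doc["vcf"][k]
            if len(seq) > max_len:
                # leave room for the marker so the result stays within max_len
                vcf_doc["vcf"][k] = seq[:max_len - len(msg)] + msg
        return vcf_doc

    doc = {"_id": "chr1:g.12345A>T", "vcf": {"ref": "A" * 20000, "alt": "T"}}
    doc = trim_vcf_fields(doc)
    assert len(doc["vcf"]["ref"]) == MAX_REF_ALT_LEN
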
Example #2
        def process(ids, bnum):
            self.logger.info("%d documents to annotate" % len(ids))
            hgvs_vcfs = vcf_builder.build_vcfs(ids)
            # merge "vcf" and snpeff annotations keys when possible
            # (if no snpeff data, we keep 'vcf' data)
            for annot in annotator.annotate(hgvs_vcfs):
                hgvs_vcfs[annot["_id"]].update(annot)
            # trim if sequence is too big
            for _id in hgvs_vcfs:
                vcf = hgvs_vcfs[_id]
                for k in ["alt", "ref"]:
                    if len(vcf["vcf"][k]) > MAX_REF_ALT_LEN:
                        msg = "...(trimmed)"
                        vcf["vcf"][k] = vcf["vcf"][k][:MAX_REF_ALT_LEN -
                                                      len(msg)] + msg
                hgvs_vcfs[_id] = vcf

            data = annotate_start_end(hgvs_vcfs, version)
            try:
                howmany = storage.process(data, batch_size)
            except Exception as e:
                # batch failed, rebuild it (it was a generator, we don't know where it is now)
                # and retry one by one
                data = annotate_start_end(hgvs_vcfs, version)
                howmany = 0
                for doc in data:
                    try:
                        howmany += storage.process([doc], batch_size=1)
                    except Exception as e:
                        self.logger.exception(
                            "Couldn't annotate document _id '%s', skip it: %s"
                            % (doc["_id"], e))
            if howmany:
                # we need to update some metadata info about snpeff b/c data has changed
                # so cache could be invalid
                self.logger.debug("Invalidating cache for '%s'" %
                                  snpeff_class.name)
                mongo.invalidate_cache(snpeff_class.name)
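
Example #2 differs from Example #1 in how it calls storage.process(): if the whole batch fails, the data generator is rebuilt (it may have been partially consumed) and the documents are retried one by one, so a single bad document doesn't sink the entire batch. Here is a generic sketch of that fallback pattern; build_docs() and store() are hypothetical stand-ins, not part of the code above.

    import logging

    logger = logging.getLogger(__name__)

    def store_with_fallback(build_docs, store, batch_size):
        """Try to store a whole batch; on failure, rebuild the generator and
        retry document by document, skipping those that still fail."""
        try:
            return store(build_docs(), batch_size)
        except Exception:
            # the generator may be partially consumed, so rebuild it from scratch
            stored = 0
            for doc in build_docs():
                try:
                    stored += store([doc], 1)
                except Exception:
                    logger.exception("Couldn't store document _id '%s', skip it",
                                     doc.get("_id"))
            return stored
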
Example #3
        def process(ids):
            self.logger.info("%d documents to annotate" % len(ids))
            hgvs_vcfs = vcf_builder.build_vcfs(ids)
            # merge "vcf" and snpeff annotations keys when possible
            # (if no snpeff data, we keep 'vcf' data)
            for annot in annotator.annotate(hgvs_vcfs):
                hgvs_vcfs[annot["_id"]].update(annot)
            # trim if sequence is too big
            for _id in hgvs_vcfs:
                vcf = hgvs_vcfs[_id]
                for k in ["alt","ref"]:
                    if len(vcf["vcf"][k]) > MAX_REF_ALT_LEN:
                        msg = "...(trimmed)"
                        vcf["vcf"][k] = vcf["vcf"][k][:MAX_REF_ALT_LEN - len(msg)] + msg
                hgvs_vcfs[_id] = vcf

            data = annotate_start_end(hgvs_vcfs, version)
            howmany = storage.process(data, batch_size)
            if howmany:
                # we need to update some metadata info about snpeff b/c data has changed
                # so cache could be invalid
                self.logger.debug("Invalidating cache for '%s'" % snpeff_class.name)
                mongo.invalidate_cache(snpeff_class.name)
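
The merge step shared by these examples joins the annotator's output back onto the VCF documents by "_id" using dict.update(), so documents with no snpeff annotation simply keep their original "vcf" data. A minimal sketch with made-up documents:

    hgvs_vcfs = {
        "chr1:g.100A>T": {"_id": "chr1:g.100A>T", "vcf": {"ref": "A", "alt": "T"}},
        "chr1:g.200G>C": {"_id": "chr1:g.200G>C", "vcf": {"ref": "G", "alt": "C"}},
    }
    # pretend the annotator only returned data for the first id
    annotations = [{"_id": "chr1:g.100A>T",
                    "snpeff": {"ann": [{"effect": "missense_variant"}]}}]

    for annot in annotations:
        # keys present in the annotation overwrite/extend the existing document;
        # ids with no annotation are left untouched
        hgvs_vcfs[annot["_id"]].update(annot)

    assert "snpeff" in hgvs_vcfs["chr1:g.100A>T"]
    assert "snpeff" not in hgvs_vcfs["chr1:g.200G>C"]
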
Example #4
def sync_mongo_jsondiff_worker(diff_file,
                               old_db_col_names,
                               new_db_col_names,
                               batch_size,
                               cnt,
                               force=False,
                               selfcontained=False,
                               metadata={},
                               debug=False):
    """Worker to sync data between a new and an old mongo collection"""
    # check if diff file was already synced
    res = {"added": 0, "updated": 0, "deleted": 0, "skipped": 0}
    synced_file = "%s.synced" % diff_file
    if os.path.exists(synced_file):
        logging.info("Diff file '%s' already synced, skip it" %
                     os.path.basename(diff_file))
        diff = loadobj(synced_file)
        res["skipped"] += len(diff["add"]) + len(diff["delete"]) + len(
            diff["update"])
        return res
    new = create_backend(new_db_col_names)
    old = create_backend(old_db_col_names)
    storage = UpsertStorage(get_target_db(), old.target_collection.name,
                            logging)
    diff = loadobj(diff_file)
    assert new.target_collection.name == diff[
        "source"], "Source is different in diff file '%s': %s" % (
            diff_file, diff["source"])

    # add: get ids from "new"
    if selfcontained:
        # diff["add"] contains all documents, not mongo needed
        for docs in iter_n(diff["add"], batch_size):
            res["added"] += storage.process((d for d in docs), batch_size)
    else:
        cur = doc_feeder(new.target_collection,
                         step=batch_size,
                         inbatch=False,
                         query={'_id': {
                             '$in': diff["add"]
                         }})
        for docs in iter_n(cur, batch_size):
            # use generator otherwise process/doc_iterator will require a dict (that's bad...)
            res["added"] += storage.process((d for d in docs), batch_size)

    # update: get doc from "old" and apply diff
    batch = []
    for patch_info in diff["update"]:
        doc = old.get_from_id(patch_info["_id"])
        try:
            doc = jsonpatch.apply_patch(doc, patch_info["patch"])
            batch.append(doc)
        except jsonpatch.JsonPatchConflict:
            # assuming already applied
            res["skipped"] += 1
            continue
        if len(batch) >= batch_size:
            res["updated"] += storage.process((d for d in batch), batch_size)
            batch = []
    if batch:
        res["updated"] += storage.process((d for d in batch), batch_size)

    # delete: remove from "old"
    for ids in iter_n(diff["delete"], batch_size):
        res["deleted"] += old.remove_from_ids(ids)

    # we potentially modified the "old" collection so invalidate cache just to make sure
    invalidate_cache(old.target_collection.name, "target")
    logging.info("Done applying diff from file '%s': %s" % (diff_file, res))
    # mark as synced
    os.rename(diff_file, synced_file)
    return res
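
The "update" branch above fetches each old document and applies a JSON Patch taken from the diff file, treating jsonpatch.JsonPatchConflict as "already applied" and skipping the document. Below is a standalone sketch of that pattern using the jsonpatch package; the document and patch are made up for illustration.

    import jsonpatch

    old_doc = {"_id": "doc1", "status": "draft", "tags": ["a"]}
    patch_info = {
        "_id": "doc1",
        "patch": [
            {"op": "replace", "path": "/status", "value": "published"},
            {"op": "add", "path": "/tags/1", "value": "b"},
        ],
    }

    try:
        # apply_patch returns a new document, leaving old_doc untouched
        new_doc = jsonpatch.apply_patch(old_doc, patch_info["patch"])
    except jsonpatch.JsonPatchConflict:
        # patch doesn't fit the current document: assume it was already applied
        new_doc = old_doc

    assert new_doc["status"] == "published"
    assert new_doc["tags"] == ["a", "b"]
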