Example #1
import json
import os

# readySolutions() and defineFilterSchema() are project-level helpers from the
# surrounding Anfisa codebase and are assumed to be imported here.
def pushDruidDataset(ds_dir, druid_adm, ds_name):
    """Push an existing dataset's filter data to Druid, using its dsinfo.json."""
    readySolutions()
    with open(ds_dir + "/dsinfo.json", "r", encoding="utf-8") as inp:
        ds_info = json.loads(inp.read())
    filter_set = defineFilterSchema(ds_info["meta"])

    return druid_adm.uploadDataset(ds_name, ds_info["flt_schema"],
                                   os.path.abspath(ds_dir + "/fdata.json.gz"),
                                   filter_set.getZygosityNames(),
                                   os.path.abspath(ds_dir + "/druid_rq.json"))
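
A minimal usage sketch for the helper above; the vault path, dataset name, and the druid_adm object are hypothetical placeholders, assumed only to expose the uploadDataset() call used in the function.

# Hypothetical invocation: druid_adm must already be constructed by the caller.
ok = pushDruidDataset("/data/vault/my_dataset", druid_adm, "my_dataset")
print("Druid push", "ok" if ok else "failed")
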
Example #2
import gzip
import json
import os

# readySolutions() and defineFilterSchema() are project-level helpers from the
# surrounding Anfisa codebase and are assumed to be imported here.
def portionFavorDruidPush(ds_dir, druid_adm, favor_storage, portion_no):
    """Build one portion of FAVOR filter data and push it to Druid."""
    readySolutions()
    filter_set = defineFilterSchema(favor_storage.getMetaData())
    fdata_path = os.path.abspath(ds_dir + "/__fdata.json.gz")

    with gzip.open(fdata_path, "wt", encoding="utf-8") as outp:
        for rec_no, record in favor_storage.loadRecords(portion_no):
            flt_data = filter_set.process(rec_no, record)
            flt_data.update(favor_storage.internalFltData(rec_no))
            print(json.dumps(flt_data, ensure_ascii=False), file=outp)

    flt_schema_data = filter_set.dump()

    report_fname = (os.path.abspath(ds_dir + "/druid_rq.json")
                    if portion_no == 0 else None)

    druid_adm.uploadDataset("xl_FAVOR",
                            flt_schema_data,
                            fdata_path,
                            filter_set.getZygosityNames(),
                            report_fname=report_fname,
                            portion_mode=True)
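
A usage sketch for the portioned FAVOR push above; the portion count and the druid_adm / favor_storage objects are hypothetical, assuming favor_storage exposes the getMetaData(), loadRecords(), and internalFltData() calls already used in the function.

# Hypothetical loop over data portions; only portion 0 writes the
# druid_rq.json report, as implemented above.
for portion_no in range(total_portions):
    portionFavorDruidPush("/data/vault/xl_FAVOR", druid_adm,
                          favor_storage, portion_no)
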
Example #3
import json
import logging
import os
import sys
from datetime import datetime
from io import StringIO

# Project-level helpers (readySolutions, defineViewSchema, defineFilterSchema,
# ViewDataChecker, JsonLineReader, DataDiskStorageWriter, TransformPreparator_WS,
# TransformPreparator_XL, AnfisaConfig, prepareDocDir, logException, reportDS)
# come from the surrounding Anfisa codebase and are assumed to be imported here.
def createDS(ds_dir,
             mongo_conn,
             druid_adm,
             ds_name,
             ds_source,
             ds_kind,
             ds_inv=None,
             report_lines=False,
             favor_storage=None,
             no_druid_push=False):
    """Create a dataset (ws or xl) on disk: validate view and filter data,
    optionally push it to Druid, and write the dataset metadata files."""
    readySolutions()
    assert (ds_kind == "xl") == (druid_adm is not None)

    time_start = datetime.now()
    logging.info("Dataset %s creation started at %s\tVersion: %s" %
                 (ds_name, str(time_start), AnfisaConfig.getAnfisaVersion()))
    date_loaded = time_start.isoformat()

    if ds_source is not None:
        input_reader = JsonLineReader(ds_source)
        metadata_record = input_reader.readOne()
    else:
        metadata_record = favor_storage.getMetaData()
        input_reader = None

    if metadata_record.get("record_type") != "metadata":
        logging.critical("No metadata line in %s" % ds_source)
        assert False

    if "versions" in metadata_record:
        annotation_version = metadata_record["versions"].get("annotations")
        if annotation_version:
            ver = list(map(int, annotation_version.split('.')))
            if ver < [0, 6]:
                logging.critical(
                    "Annotation version not supported (0.6.* expected): %s" %
                    annotation_version)
                assert False
        metadata_record["versions"][
            "Anfisa load"] = AnfisaConfig.getAnfisaVersion()

    view_aspects = defineViewSchema(metadata_record)
    view_checker = ViewDataChecker(view_aspects)
    filter_set = defineFilterSchema(metadata_record)

    if input_reader:
        if report_lines:
            print("Processing...", file=sys.stderr)

        if ds_kind == "ws":
            trans_prep = TransformPreparator_WS(
                filter_set.getTranscriptDescrSeq(), filter_set, True)
        else:
            trans_prep = TransformPreparator_XL(druid_adm)

        with DataDiskStorageWriter(True, ds_dir, filter_set, trans_prep,
                                   view_checker, report_lines) as ds_out:
            for record in input_reader:
                ds_out.saveRecord(record)
                if report_lines and ds_out.getTotal() % report_lines == 0:
                    sys.stderr.write("\r%d lines..." % ds_out.getTotal())
            total = ds_out.getTotal()
        input_reader.close()
        if report_lines:
            print("\nTotal lines: %d" % total, file=sys.stderr)
        trans_prep.finishUp()
    else:
        record = favor_storage.getRecordData(0)
        view_checker.regValue(0, record)
        total = metadata_record["variants"]

    rep_out = StringIO()
    is_ok = view_checker.finishUp(rep_out, no_mode=input_reader is None)
    is_ok &= filter_set.reportProblems(rep_out)

    flt_schema_data = filter_set.dump()
    if ds_kind == "xl" and input_reader:
        is_ok &= druid_adm.uploadDataset(
            ds_name, flt_schema_data,
            os.path.abspath(ds_dir + "/fdata.json.gz"),
            filter_set.getZygosityNames(),
            os.path.abspath(ds_dir + "/druid_rq.json"), no_druid_push)

    if is_ok:
        try:
            ds_doc_dir = prepareDocDir(ds_dir + "/doc", ds_inv)
        except Exception:
            logException("Exception on documentation build\n"
                         "Ignored in create process\n"
                         "Use mode doc-push to repair documentation")
            ds_doc_dir = []
        ds_info = {
            "date_loaded": date_loaded,
            "doc": ds_doc_dir,
            "flt_schema": flt_schema_data,
            "kind": ds_kind,
            "meta": metadata_record,
            "modes": [],
            "mongo": ds_name,
            "name": ds_name,
            "root": ds_name,
            "zygosity_var": filter_set.getZygosityVarName(),
            "total": total,
            "view_schema": view_aspects.dump()
        }

        with open(ds_dir + "/dsinfo.json", "w", encoding="utf-8") as outp:
            print(json.dumps(ds_info, sort_keys=True, indent=4), file=outp)

        with open(ds_dir + "/stat.json", "w", encoding="utf-8") as outp:
            print(json.dumps(view_checker.dump(), sort_keys=True, indent=4),
                  file=outp)

        mongo_agent = mongo_conn.getDSAgent(ds_name, ds_kind)
        mongo_agent.updateCreationDate(date_loaded, ds_source)

        with open(ds_dir + "/doc/info.html", "w", encoding="utf-8") as outp:
            reportDS(outp, ds_info, mongo_agent)

        with open(ds_dir + "/active", "w", encoding="utf-8") as outp:
            print("", file=outp)
        print("Dataset %s kind=%s succesively created" % (ds_name, ds_kind),
              file=rep_out)
    else:
        print("Process terminated", file=rep_out)

    with open(ds_dir + "/create.log", "w", encoding="utf-8") as outp:
        print(rep_out.getvalue(), file=outp)

    print(rep_out.getvalue())
    time_done = datetime.now()
    logging.info("Dataset %s creation finished at %s for %s" %
                 (ds_name, str(time_done), str(time_done - time_start)))
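
A sketch of how createDS() might be invoked for an XL dataset; the paths, dataset name, and the mongo_conn / druid_adm objects are placeholders, assumed to be prepared by the calling application.

# Hypothetical call: druid_adm is required when ds_kind is "xl" and must be
# None for "ws" datasets (enforced by the assert at the top of createDS).
createDS("/data/vault/my_xl_ds", mongo_conn, druid_adm,
         "my_xl_ds", "/data/import/my_xl_ds.json.gz", "xl",
         report_lines=10000)
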
Example #4
import gzip
import json
import os
import sys
from io import StringIO, TextIOWrapper
from subprocess import PIPE, Popen

# Project-level helpers (prepareSolutions, checkDSName, defineViewSchema,
# defineFilterSchema, ViewDataChecker, TranscriptPreparator, JsonLineReader,
# PresentationData, prepareDocDir, MongoConnector, reportDS, AnfisaConfig) and
# the global DRUID_ADM come from the surrounding application code.
def createDataSet(app_config, name, kind, mongo,
        source, ds_inventory, report_lines, date_loaded):
    """Older dataset creation routine: convert the annotated source into vault
    files, optionally push to Druid, and register the dataset."""
    global DRUID_ADM
    vault_dir = app_config["data-vault"]
    if not os.path.isdir(vault_dir):
        print("No vault directory:", vault_dir, file = sys.stderr)
        assert False
    checkDSName(name, kind)
    ds_dir = os.path.abspath(vault_dir + "/" + name)
    if not mongo:
        mongo = name
    if os.path.exists(ds_dir):
        print("Dataset exists:", ds_dir, file = sys.stderr)
        assert False
    assert (kind == "xl") == (DRUID_ADM is not None)
    os.mkdir(ds_dir)

    anfisa_version = AnfisaConfig.getAnfisaVersion()
    prepareSolutions()
    post_proc = None
    view_aspects = defineViewSchema()
    view_checker = ViewDataChecker(view_aspects)
    filter_set = defineFilterSchema()

    if kind == "ws":
        trans_prep = TranscriptPreparator(
            filter_set.getTranscriptDescrSeq(), True)
    else:
        trans_prep = None

    if report_lines:
        print("Processing...", file = sys.stderr)

    data_rec_no = 0
    metadata_record = None

    vdata_out = Popen(sys.executable + " -m utils.ixbz2 --calm -o " +
        ds_dir + "/vdata.ixbz2 /dev/stdin", shell = True,
        stdin = PIPE, stderr = PIPE,
        bufsize = 1, universal_newlines = False,
        close_fds = True)  # line buffering comes from the TextIOWrapper below

    vdata_stdin = TextIOWrapper(vdata_out.stdin, encoding = "utf-8",
        line_buffering = True)

    with gzip.open(ds_dir + "/fdata.json.gz", 'wb') as fdata_stream, \
            gzip.open(ds_dir + "/pdata.json.gz", 'wb') as pdata_stream, \
            JsonLineReader(source) as input_reader:
        fdata_out = TextIOWrapper(fdata_stream,
            encoding = "utf-8", line_buffering = True)
        pdata_out = TextIOWrapper(pdata_stream,
            encoding = "utf-8", line_buffering = True)
        for inp_rec_no, record in enumerate(input_reader):
            if post_proc is not None:
                post_proc.transform(inp_rec_no, record)
            if record.get("record_type") == "metadata":
                assert inp_rec_no == 0
                metadata_record = record
                if "versions" in metadata_record:
                    metadata_record["versions"]["Anfisa load"] = anfisa_version
                filter_set.setMeta(metadata_record)
                continue
            flt_data = filter_set.process(data_rec_no, record)
            view_checker.regValue(data_rec_no, record)
            print(json.dumps(record, ensure_ascii = False), file = vdata_stdin)
            pre_data = PresentationData.make(record)
            if DRUID_ADM is not None:
                DRUID_ADM.addFieldsToRec(flt_data, pre_data, data_rec_no)
            if trans_prep is not None:
                trans_prep.doRec(record, flt_data)
            print(json.dumps(flt_data, ensure_ascii = False), file = fdata_out)
            print(json.dumps(pre_data, ensure_ascii = False), file = pdata_out)
            data_rec_no += 1
            if report_lines and data_rec_no % report_lines == 0:
                sys.stderr.write("\r%d lines..." % data_rec_no)
    if report_lines:
        print("\nTotal lines: %d" % data_rec_no, file = sys.stderr)

    _, vreport_data = vdata_out.communicate()
    for line in str(vreport_data, encoding="utf-8").splitlines():
        print(line, file = sys.stderr)
    vdata_out.wait()

    rep_out = StringIO()
    is_ok = view_checker.finishUp(rep_out)
    is_ok &= filter_set.reportProblems(rep_out)

    if trans_prep is not None:
        total_item_count = trans_prep.finishUp()
    else:
        total_item_count = None

    flt_schema_data = filter_set.dump()
    if kind == "xl":
        is_ok &= DRUID_ADM.uploadDataset(name, flt_schema_data,
            os.path.abspath(ds_dir + "/fdata.json.gz"),
            os.path.abspath(ds_dir + "/druid_rq.json"))

    if is_ok:
        ds_doc_dir = ds_dir + "/doc"
        ds_info = {
            "name": name,
            "kind": kind,
            "view_schema": view_aspects.dump(),
            "flt_schema": flt_schema_data,
            "total": data_rec_no,
            "mongo": mongo,
            "modes": [],
            "meta": metadata_record,
            "doc": prepareDocDir(ds_doc_dir, ds_inventory),
            "date_loaded": date_loaded}

        if total_item_count is not None:
            ds_info["total_items"] = total_item_count

        with open(ds_dir + "/dsinfo.json", "w", encoding = "utf-8") as outp:
            print(json.dumps(ds_info, sort_keys = True, indent = 4),
                file = outp)

        with open(ds_dir + "/stat.json", "w", encoding = "utf-8") as outp:
            print(json.dumps(view_checker.dump(), sort_keys = True,
                indent = 4), file = outp)

        mongo_conn = MongoConnector(app_config["mongo-db"],
            app_config.get("mongo-host"), app_config.get("mongo-port"))
        mongo_agent = mongo_conn.getDSAgent(name, kind)
        mongo_agent.checkCreationDate(date_loaded, source)

        with open(ds_dir + "/doc/info.html", "w", encoding = "utf-8") as outp:
            reportDS(outp, ds_info, mongo_agent)

        with open(ds_dir + "/active", "w", encoding = "utf-8") as outp:
            print("", file = outp)
        print("Dataset %s kind=%s succesively created" % (
            name, kind), file = rep_out)
    else:
        print("Process terminated", file = rep_out)

    with open(ds_dir + "/create.log",
            "w", encoding = "utf-8") as outp:
        print(rep_out.getvalue(), file = outp)

    print(rep_out.getvalue())
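
A usage sketch for the older createDataSet() variant; app_config, the dataset name, the source path, and the line-report interval are hypothetical placeholders, and the date format mirrors the isoformat() convention used in createDS() above.

from datetime import datetime

# Hypothetical call for a workspace ("ws") dataset: the global DRUID_ADM stays
# None for this kind, and mongo=None makes the function reuse the dataset name.
createDataSet(app_config, "my_ws_ds", "ws", None,
              "/data/import/my_ws_ds.json.gz", None, 1000,
              datetime.now().isoformat())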