def pushDruidDataset(ds_dir, druid_adm, ds_name):
    readySolutions()
    with open(ds_dir + "/dsinfo.json", "r", encoding="utf-8") as inp:
        ds_info = json.loads(inp.read())
    filter_set = defineFilterSchema(ds_info["meta"])
    return druid_adm.uploadDataset(ds_name, ds_info["flt_schema"],
        os.path.abspath(ds_dir + "/fdata.json.gz"),
        filter_set.getZygosityNames(),
        os.path.abspath(ds_dir + "/druid_rq.json"))
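
# --- Usage sketch (illustrative, not part of the original module) ----------
# Re-pushing an already created XL dataset to Druid, e.g. after the Druid
# storage has been rebuilt. The vault path and dataset name are placeholders;
# `druid_adm` is assumed to be the Druid administration object used
# throughout this module.
def _example_druid_repush(druid_adm):
    ds_name = "demo_xl"                      # hypothetical dataset name
    ds_dir = "/data/vault/" + ds_name        # hypothetical vault location
    # The directory must already contain dsinfo.json and fdata.json.gz
    # produced by a previous dataset creation run.
    return pushDruidDataset(ds_dir, druid_adm, ds_name)
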
def portionFavorDruidPush(ds_dir, druid_adm, favor_storage, portion_no):
    readySolutions()
    filter_set = defineFilterSchema(favor_storage.getMetaData())
    fdata_path = os.path.abspath(ds_dir + "/__fdata.json.gz")
    with gzip.open(fdata_path, "wt", encoding="utf-8") as outp:
        for rec_no, record in favor_storage.loadRecords(portion_no):
            flt_data = filter_set.process(rec_no, record)
            flt_data.update(favor_storage.internalFltData(rec_no))
            print(json.dumps(flt_data, ensure_ascii=False), file=outp)
    flt_schema_data = filter_set.dump()
    report_fname = (os.path.abspath(ds_dir + "/druid_rq.json")
        if portion_no == 0 else None)
    druid_adm.uploadDataset("xl_FAVOR", flt_schema_data, fdata_path,
        filter_set.getZygosityNames(),
        report_fname=report_fname, portion_mode=True)
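
# --- Usage sketch (illustrative, not part of the original module) ----------
# Driving the portion-wise FAVOR push: portion 0 also writes the Druid
# request report (see report_fname above). `favor_storage` is assumed to
# expose getMetaData()/loadRecords()/internalFltData() as used above;
# `portion_count` is a hypothetical caller-side value.
def _example_favor_push(ds_dir, druid_adm, favor_storage, portion_count):
    for portion_no in range(portion_count):
        portionFavorDruidPush(ds_dir, druid_adm, favor_storage, portion_no)
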
def createDS(ds_dir, mongo_conn, druid_adm, ds_name, ds_source, ds_kind,
        ds_inv=None, report_lines=False, favor_storage=None,
        no_druid_push=False):
    readySolutions()
    assert (ds_kind == "xl") == (druid_adm is not None)
    time_start = datetime.now()
    logging.info("Dataset %s creation started at %s\tVersion: %s"
        % (ds_name, str(time_start), AnfisaConfig.getAnfisaVersion()))
    date_loaded = time_start.isoformat()

    # Read the metadata record: either the first line of the annotated
    # JSON-lines source or, for FAVOR, the storage metadata.
    if ds_source is not None:
        input_reader = JsonLineReader(ds_source)
        metadata_record = input_reader.readOne()
    else:
        metadata_record = favor_storage.getMetaData()
        input_reader = None
    if metadata_record.get("record_type") != "metadata":
        logging.critical("No metadata line in %s" % ds_source)
        assert False
    if "versions" in metadata_record:
        annotation_version = metadata_record["versions"].get("annotations")
        if annotation_version:
            ver = list(map(int, annotation_version.split('.')))
            if ver < [0, 6]:
                logging.critical(
                    "Annotation version not supported (0.6.* expected): %s"
                    % annotation_version)
                assert False
        metadata_record["versions"][
            "Anfisa load"] = AnfisaConfig.getAnfisaVersion()

    view_aspects = defineViewSchema(metadata_record)
    view_checker = ViewDataChecker(view_aspects)
    filter_set = defineFilterSchema(metadata_record)

    if input_reader:
        # Write the dataset to disk, collecting view and filter statistics.
        if report_lines:
            print("Processing...", file=sys.stderr)
        if ds_kind == "ws":
            trans_prep = TransformPreparator_WS(
                filter_set.getTranscriptDescrSeq(), filter_set, True)
        else:
            trans_prep = TransformPreparator_XL(druid_adm)
        with DataDiskStorageWriter(True, ds_dir, filter_set, trans_prep,
                view_checker, report_lines) as ds_out:
            for record in input_reader:
                ds_out.saveRecord(record)
                if report_lines and ds_out.getTotal() % report_lines == 0:
                    sys.stderr.write("\r%d lines..." % ds_out.getTotal())
            total = ds_out.getTotal()
        input_reader.close()
        if report_lines:
            print("\nTotal lines: %d" % total, file=sys.stderr)
        trans_prep.finishUp()
    else:
        # FAVOR mode: only the first record is checked, the total count
        # comes from the metadata.
        record = favor_storage.getRecordData(0)
        view_checker.regValue(0, record)
        total = metadata_record["variants"]

    rep_out = StringIO()
    is_ok = view_checker.finishUp(rep_out, no_mode=input_reader is None)
    is_ok &= filter_set.reportProblems(rep_out)

    flt_schema_data = filter_set.dump()
    if ds_kind == "xl" and input_reader:
        is_ok &= druid_adm.uploadDataset(ds_name, flt_schema_data,
            os.path.abspath(ds_dir + "/fdata.json.gz"),
            filter_set.getZygosityNames(),
            os.path.abspath(ds_dir + "/druid_rq.json"),
            no_druid_push)

    if is_ok:
        # Finalize the dataset: documentation, dsinfo.json, stat.json,
        # Mongo registration and the "active" marker file.
        try:
            ds_doc_dir = prepareDocDir(ds_dir + "/doc", ds_inv)
        except Exception:
            logException("Exception on documentation build\n"
                "Ignored in create process\n"
                "Use mode doc-push to repair documentation")
            ds_doc_dir = []
        ds_info = {
            "date_loaded": date_loaded,
            "doc": ds_doc_dir,
            "flt_schema": flt_schema_data,
            "kind": ds_kind,
            "meta": metadata_record,
            "modes": [],
            "mongo": ds_name,
            "name": ds_name,
            "root": ds_name,
            "zygosity_var": filter_set.getZygosityVarName(),
            "total": total,
            "view_schema": view_aspects.dump()
        }
        with open(ds_dir + "/dsinfo.json", "w", encoding="utf-8") as outp:
            print(json.dumps(ds_info, sort_keys=True, indent=4), file=outp)
        with open(ds_dir + "/stat.json", "w", encoding="utf-8") as outp:
            print(json.dumps(view_checker.dump(), sort_keys=True, indent=4),
                file=outp)
        mongo_agent = mongo_conn.getDSAgent(ds_name, ds_kind)
        mongo_agent.updateCreationDate(date_loaded, ds_source)
        with open(ds_dir + "/doc/info.html", "w", encoding="utf-8") as outp:
            reportDS(outp, ds_info, mongo_agent)
        with open(ds_dir + "/active", "w", encoding="utf-8") as outp:
            print("", file=outp)
        print("Dataset %s kind=%s successfully created"
            % (ds_name, ds_kind), file=rep_out)
    else:
        print("Process terminated", file=rep_out)

    with open(ds_dir + "/create.log", "w", encoding="utf-8") as outp:
        print(rep_out.getvalue(), file=outp)
    print(rep_out.getvalue())

    time_done = datetime.now()
    logging.info("Dataset %s creation finished at %s for %s"
        % (ds_name, str(time_done), str(time_done - time_start)))
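
# --- Usage sketch (illustrative, not part of the original module) ----------
# A minimal driver for createDS() building a workspace ("ws") dataset, which
# needs no Druid backend (druid_adm=None). All paths and names are
# placeholders; `mongo_conn` is assumed to be an already prepared
# MongoConnector-like object as used inside createDS().
def _example_create_ws(mongo_conn):
    createDS(
        ds_dir="/data/vault/demo_ws",                  # target directory
        mongo_conn=mongo_conn,
        druid_adm=None,                                # "ws" kind: no Druid
        ds_name="demo_ws",
        ds_source="/data/annotated/demo.json.gz",      # JSON-lines annotations
        ds_kind="ws",
        report_lines=1000)                             # progress every 1000 lines
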
def createDataSet(app_config, name, kind, mongo, source, ds_inventory,
        report_lines, date_loaded):
    global DRUID_ADM
    vault_dir = app_config["data-vault"]
    if not os.path.isdir(vault_dir):
        print("No vault directory:", vault_dir, file=sys.stderr)
        assert False
    checkDSName(name, kind)
    ds_dir = os.path.abspath(vault_dir + "/" + name)
    if not mongo:
        mongo = name
    if os.path.exists(ds_dir):
        print("Dataset exists:", ds_dir, file=sys.stderr)
        assert False
    assert (kind == "xl") == (DRUID_ADM is not None)
    os.mkdir(ds_dir)

    anfisa_version = AnfisaConfig.getAnfisaVersion()
    prepareSolutions()

    post_proc = None
    view_aspects = defineViewSchema()
    view_checker = ViewDataChecker(view_aspects)
    filter_set = defineFilterSchema()
    if kind == "ws":
        trans_prep = TranscriptPreparator(
            filter_set.getTranscriptDescrSeq(), True)
    else:
        trans_prep = None
    if report_lines:
        print("Processing...", file=sys.stderr)

    data_rec_no = 0
    metadata_record = None

    # View data is compressed by an external utils.ixbz2 process fed
    # through its stdin.
    vdata_out = Popen(sys.executable + " -m utils.ixbz2 --calm -o "
        + ds_dir + "/vdata.ixbz2 /dev/stdin", shell=True,
        stdin=PIPE, stderr=PIPE,
        bufsize=1, universal_newlines=False,  # line buffer
        close_fds=True)
    vdata_stdin = TextIOWrapper(vdata_out.stdin, encoding="utf-8",
        line_buffering=True)

    with gzip.open(ds_dir + "/fdata.json.gz", 'wb') as fdata_stream, \
            gzip.open(ds_dir + "/pdata.json.gz", 'wb') as pdata_stream, \
            JsonLineReader(source) as input:
        fdata_out = TextIOWrapper(fdata_stream, encoding="utf-8",
            line_buffering=True)
        pdata_out = TextIOWrapper(pdata_stream, encoding="utf-8",
            line_buffering=True)
        for inp_rec_no, record in enumerate(input):
            if post_proc is not None:
                post_proc.transform(inp_rec_no, record)
            if record.get("record_type") == "metadata":
                # The metadata record must be the first line of the source.
                assert inp_rec_no == 0
                metadata_record = record
                if "versions" in metadata_record:
                    metadata_record["versions"]["Anfisa load"] = anfisa_version
                filter_set.setMeta(metadata_record)
                continue
            # Per data record: filter data, view check, view data (vdata)
            # and presentation data (pdata).
            flt_data = filter_set.process(data_rec_no, record)
            view_checker.regValue(data_rec_no, record)
            print(json.dumps(record, ensure_ascii=False), file=vdata_stdin)
            pre_data = PresentationData.make(record)
            if DRUID_ADM is not None:
                DRUID_ADM.addFieldsToRec(flt_data, pre_data, data_rec_no)
            if trans_prep is not None:
                trans_prep.doRec(record, flt_data)
            print(json.dumps(flt_data, ensure_ascii=False), file=fdata_out)
            print(json.dumps(pre_data, ensure_ascii=False), file=pdata_out)
            data_rec_no += 1
            if report_lines and data_rec_no % report_lines == 0:
                sys.stderr.write("\r%d lines..." % data_rec_no)
    if report_lines:
        print("\nTotal lines: %d" % data_rec_no, file=sys.stderr)

    _, vreport_data = vdata_out.communicate()
    for line in str(vreport_data, encoding="utf-8").splitlines():
        print(line, file=sys.stderr)
    vdata_out.wait()

    rep_out = StringIO()
    is_ok = view_checker.finishUp(rep_out)
    is_ok &= filter_set.reportProblems(rep_out)

    if trans_prep is not None:
        total_item_count = trans_prep.finishUp()
    else:
        total_item_count = None

    flt_schema_data = filter_set.dump()
    if kind == "xl":
        is_ok &= DRUID_ADM.uploadDataset(name, flt_schema_data,
            os.path.abspath(ds_dir + "/fdata.json.gz"),
            os.path.abspath(ds_dir + "/druid_rq.json"))

    if is_ok:
        ds_doc_dir = ds_dir + "/doc"
        ds_info = {
            "name": name,
            "kind": kind,
            "view_schema": view_aspects.dump(),
            "flt_schema": flt_schema_data,
            "total": data_rec_no,
            "mongo": mongo,
            "modes": [],
            "meta": metadata_record,
            "doc": prepareDocDir(ds_doc_dir, ds_inventory),
            "date_loaded": date_loaded}
        if total_item_count is not None:
            ds_info["total_items"] = total_item_count
        with open(ds_dir + "/dsinfo.json", "w", encoding="utf-8") as outp:
            print(json.dumps(ds_info, sort_keys=True, indent=4), file=outp)
        with open(ds_dir + "/stat.json", "w", encoding="utf-8") as outp:
            print(json.dumps(view_checker.dump(), sort_keys=True, indent=4),
                file=outp)
        mongo_conn = MongoConnector(app_config["mongo-db"],
            app_config.get("mongo-host"), app_config.get("mongo-port"))
        mongo_agent = mongo_conn.getDSAgent(name, kind)
        mongo_agent.checkCreationDate(date_loaded, source)
        with open(ds_dir + "/doc/info.html", "w", encoding="utf-8") as outp:
            reportDS(outp, ds_info, mongo_agent)
        with open(ds_dir + "/active", "w", encoding="utf-8") as outp:
            print("", file=outp)
        print("Dataset %s kind=%s successfully created"
            % (name, kind), file=rep_out)
    else:
        print("Process terminated", file=rep_out)
    with open(ds_dir + "/create.log", "w", encoding="utf-8") as outp:
        print(rep_out.getvalue(), file=outp)
    print(rep_out.getvalue())
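
# --- Usage sketch (illustrative, not part of the original module) ----------
# Calling the legacy createDataSet() entry point. The app_config keys shown
# ("data-vault", "mongo-db", and optionally "mongo-host"/"mongo-port") are
# the ones read inside the function; the concrete values are placeholders.
# DRUID_ADM must be None for kind="ws" (see the assertion above).
def _example_create_legacy():
    app_config = {
        "data-vault": "/data/vault",   # must be an existing directory
        "mongo-db": "anfisa-demo",
    }
    createDataSet(app_config, name="demo_ws", kind="ws",
        mongo=None,                                    # defaults to the dataset name
        source="/data/annotated/demo.json.gz",         # JSON-lines annotations
        ds_inventory=None,
        report_lines=1000,
        date_loaded=datetime.now().isoformat())
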