def task_graphviz(args):
    #logging.info("called task_graphviz")
    filename = args["input_file"]
    the_schema = CnsSchema()
    the_schema.preloaded_schema_list = preload_schema(args)
    the_schema.jsonld2mem4file(filename)

    #validate if we can reproduce the same jsonld based on input
    jsonld_input = file2json(filename)

    name = os.path.basename(args["input_file"]).replace(".jsonld", "")
    graph_name = re.sub(ur"[-\.]", "_", name)
    ret = run_graphviz(the_schema, graph_name)
    for key, lines in ret.items():
        xdebug_file = os.path.join(args["debug_dir"], name + "_" + key + u".dot")
        lines2file([lines], xdebug_file)
def task_convert(args):
    logging.info("called task_convert")
    filename = "../schema/cns_top_v2.0.jsonld"
    filename = file2abspath(filename, __file__)
    loaded_schema = CnsSchema()
    loaded_schema.import_jsonld(filename)

    filename = args["input_file"]
    jsondata = file2json(filename)
    report = CnsBugReport()
    for idx, item in enumerate(jsondata):
        types = [item["mainType"], "Thing"]
        primary_keys = [idx]
        cns_item = run_convert(loaded_schema, item, types, primary_keys, report)
        logging.info(json4debug(cns_item))
        #loaded_schema.run_validate(cns_item, report)
    logging.info(json4debug(report.data))
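# A minimal usage sketch for task_convert, not part of the original module.
# The path below is hypothetical, and the input is assumed to be a JSON list
# whose items each carry a "mainType" field, as the loop above expects.
def _demo_task_convert():
    args = {
        "input_file": "local/convert_input.json",  # hypothetical path
    }
    task_convert(args)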
def task_graphviz(args):
    #logging.info("called task_graphviz")
    filename = args["input_file"]
    loaded_schema = CnsSchema()
    preloaded_schema_list = preload_schema(args)
    loaded_schema.import_jsonld(filename, preloaded_schema_list)

    #validate if we can reproduce the same jsonld based on input
    jsonld_input = file2json(filename)

    name = os.path.basename(args["input_file"]).split(u".")[0]
    name = re.sub(ur"-", "_", name)
    ret = run_graphviz(loaded_schema, name)
    for key, lines in ret.items():
        xdebug_file = os.path.join(args["debug_dir"], name + "_" + key + u".dot")
        lines2file([lines], xdebug_file)
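# A minimal usage sketch for task_graphviz, not part of the original module.
# It assumes an args dict with the two keys the function actually reads
# ("input_file" and "debug_dir"); the paths themselves are hypothetical, and
# preload_schema(args) may read additional keys not shown here. One .dot file
# is written per graph returned by run_graphviz, named
# <input-basename>_<key>.dot under debug_dir.
def _demo_task_graphviz():
    args = {
        "input_file": "schema/cns_top.jsonld",  # hypothetical schema path
        "debug_dir": "local/debug",             # hypothetical output dir
    }
    task_graphviz(args)
    # A generated graph can then be rendered with the graphviz CLI, e.g.:
    #   dot -Tpng local/debug/cns_top_<key>.dot -o cns_top.png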
def task_validate(args):
    logging.info("called task_validate")
    schema_filename = args.get("input_schema")
    if not schema_filename:
        schema_filename = "schema/cns_top.jsonld"
    preloaded_schema_list = preload_schema(args)
    loaded_schema = CnsSchema()
    loaded_schema.import_jsonld(schema_filename, preloaded_schema_list)

    filepath = args["input_file"]
    filename_list = glob.glob(filepath)

    report = init_report()

    # init xtemplate
    report[XTEMPLATE] = collections.Counter()
    for template in loaded_schema.metadata["template"]:
        d = template["refClass"]
        p = template["refProperty"]
        key_cp = u"cp_{}_{}_{}".format(d, d, p)
        report[XTEMPLATE][key_cp] += 0
    logging.info(json4debug(report[XTEMPLATE]))

    # init class path dependency
    for template in loaded_schema.metadata["template"]:
        d = template["refClass"]
        key_cp = u"parent_{}".format(d)
        report[XTEMPLATE][key_cp] = loaded_schema.index_inheritance["rdfs:subClassOf"].get(d)
    for definition in loaded_schema.definition.values():
        if "rdfs:Class" in definition["@type"]:
            d = definition["name"]
            key_cp = u"parent_{}".format(d)
            report[XTEMPLATE][key_cp] = loaded_schema.index_inheritance["rdfs:subClassOf"].get(d)

    #validate
    lines = []
    for filename in filename_list:
        logging.info(filename)
        if not os.path.exists(filename):
            continue

        if args.get("option") == "jsons":
            for idx, line in enumerate(file2iter(filename)):
                if idx % 10000 == 0:
                    logging.info(idx)
                    logging.info(json4debug(report))
                json_data = json.loads(line)
                run_validate(loaded_schema, json_data, report)
                stat_kg_report_per_item(json_data, None, report["stats"])

                # collect entity listing
                if "CnsLink" not in json_data["@type"]:
                    entity_simple = [
                        json_data["@type"][0],
                        json_data.get("name", ""),
                        "\"" + u",".join(json_data.get("alternateName", [])) + "\"",
                    ]
                    lines.append(u",".join(entity_simple))
        else:
            jsondata = file2json(filename)
            run_validate(loaded_schema, jsondata, report)

    #out
    filename = args["output_validate_entity"]
    logging.info(filename)
    lines = sorted(lines)
    fields = ["main_type", "name", "alternateName"]
    lines.insert(0, u",".join(fields))
    lines2file(lines, filename)

    #display report
    logging.info(json4debug(report))

    #write report csv
    write_csv_report(args, report, loaded_schema)

    filename = args["output_validate_report"].replace("csv", "json")
    logging.info(filename)
    json2file(report, filename)
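# A minimal usage sketch for task_validate, not part of the original module.
# The keys mirror the ones the function reads directly; all paths are
# hypothetical, and preload_schema/write_csv_report may consume further keys
# not shown here. With option="jsons" the input is treated as one JSON object
# per line; otherwise it is parsed as a single JSON document.
def _demo_task_validate():
    args = {
        "input_schema": "schema/cns_top.jsonld",        # optional; falls back to schema/cns_top.jsonld
        "input_file": "local/kg/*.jsons",               # glob patterns are allowed
        "option": "jsons",                              # line-oriented JSON input
        "output_validate_entity": "local/entities.csv",
        "output_validate_report": "local/report.csv",   # a .json twin is derived from this
    }
    task_validate(args)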
class CnsModelTable():
    def __init__(self):
        # cnSchema storage
        self.schema = CnsSchema()
        self.report = self.schema.report

    def table2mem(self, excel_data):
        #logging.info(json4debug(excel_data))
        self._run_validate_excel_data(excel_data)
        if self.report.has_bug():
            return False

        self._run_load_excel_data(excel_data)
        if self.report.has_bug():
            return False

        self.schema.build()
        if self.report.has_bug():
            return False

        #validate4jsonld
        json_data = self.schema.mem2jsonld()
        run_validate_recursive(self.schema, json_data, self.report)
        if self.report.has_bug():
            return False

        return self.schema

    def _run_load_excel_data(self, excel_data):
        excel_data_index = dict([[x["sheetname"], x] for x in excel_data])
        for schema_sheet in CNS_SCHEMA_SHEET:
            sheet_name = schema_sheet["sheetname"]
            sheet = excel_data_index.get(sheet_name)
            if not sheet:
                # this sheet is missing from the workbook; already reported
                continue

            #process rows
            visited_name = set()
            for row_index, row in enumerate(sheet["rows"]):
                if not self._validate_one_row(row, row_index, sheet_name, schema_sheet):
                    continue

                cns_item = self._convert_one_row(row, schema_sheet)

                #check for duplicated name
                if cns_item is None:
                    pass
                elif cns_item["name"] in visited_name:
                    bug = {
                        "category": "error_excel_row_duplicated_name",
                        "description": "duplicated name sheet={}, row={}, name={}".format(
                            sheet_name, row_index - 1, cns_item["name"]),
                    }
                    self.report.report_bug(bug)
                else:
                    visited_name.add(cns_item["name"])

    def _convert_one_row(self, row, schema_sheet):
        sheet_name = schema_sheet["sheetname"]
        if sheet_name == "metadata":
            self.schema.add_metadata(row["property"], row["value"])
            return None

        #logging.info(json4debug(row))
        cns_item = {
            "@id": schema_sheet["config"]["id_pattern"].format(**row),
            "@type": copy.deepcopy(schema_sheet["config"]["type_predefined"]),
            "name": schema_sheet["config"]["name_pattern"].format(**row),
        }
        cns_item = _excel2jsonld_item(cns_item, row)

        if sheet_name == "definition":
            self.schema.set_definition(cns_item)
        elif sheet_name == "template":
            self.schema.add_metadata(sheet_name, cns_item)
        elif sheet_name == "changelog":
            self.schema.add_metadata(sheet_name, cns_item)
        else:
            assert False

        return cns_item

    def _run_validate_excel_data(self, excel_data):
        excel_data_index = dict([[x["sheetname"], x] for x in excel_data])

        #check extra sheets
        sheet_name_diff = set(excel_data_index) - set(CNS_SCHEMA_SHEET_INDEX)
        if sheet_name_diff:
            bug = {
                "keywords": ["info", "table2mem"],
                "category": "info_excel_sheet_skip_unsupport",
                "description": "process excel, skip unsupported sheets [{}]".format(
                    ", ".join(sheet_name_diff)),
            }
            #self.report.report_bug(bug)

        #check missing sheets
        sheet_name_diff = set(CNS_SCHEMA_SHEET_INDEX) - set(excel_data_index)
        if sheet_name_diff:
            bug = {
                "keywords": ["error", "table2mem"],
                "category": "error_excel_sheet_missing",
                "description": "process excel, found missing sheets [{}]".format(
                    ", ".join(sheet_name_diff)),
            }
            self.report.report_bug(bug)

        #validate sheet header
        for sheet in excel_data:
            sheet_name = sheet["sheetname"]
            schema_sheet = CNS_SCHEMA_SHEET_INDEX.get(sheet_name)
            if not schema_sheet:
                continue
            self._validate_one_sheet(sheet_name, sheet, schema_sheet)

    def _validate_one_sheet(self, sheet_name, sheet, schema_sheet):
        #validate sheet column
        header_diff = set(schema_sheet["columns"]) - set(sheet["columns"])
        if header_diff:
            bug = {
                "category": "warn_excel_column_missing",
                u"description": "excel (sheet={}) missing columns [{}]".format(
                    sheet_name, u",".join(header_diff)),
            }
            self.report.report_bug(bug)

    def _validate_one_row(self, row, row_index, sheet_name, schema_sheet):
        ret = True
        #logging.info(idx)
        if not _is_valid_row(row):
            #just skip, no need to report error
            return False

        for p in schema_sheet["config"]["non_empty_columns"]:
            if is_empty_string(row.get(p)):
                bug = {
                    "category": "error_excel_cell_empty_value",
                    u"description": "excel cell expects a non-empty value. sheet={} row={} column={}, found empty value"
                        .format(sheet_name, row_index, p),
                    "value": row,
                }
                self.report.report_bug(bug)
                ret = False
        return ret
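# A minimal usage sketch for CnsModelTable.table2mem, not part of the original
# module. The excel_data layout (a list of sheet dicts, each with "sheetname",
# "columns", and "rows") is inferred from how the methods above index it; the
# expected sheet names come from CNS_SCHEMA_SHEET. The caller is assumed to
# supply excel_data, e.g. from an excel-reading helper elsewhere in the project.
def _demo_table2mem(excel_data):
    table = CnsModelTable()
    schema = table.table2mem(excel_data)
    if schema is False:
        # conversion failed at some stage; inspect the accumulated bug report
        logging.info(json4debug(table.report.data))
    else:
        # schema is a fully built CnsSchema, ready for mem2jsonld()
        logging.info(json4debug(schema.mem2jsonld()))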