コード例 #1
0
ファイル: cns_graphviz.py プロジェクト: dingmaotu/kgtool
def task_graphviz(args):
    """Export the schema found in ``args["input_file"]`` as Graphviz files.

    The JSON-LD file is loaded into a ``CnsSchema`` (after attaching the
    result of ``preload_schema(args)``), handed to ``run_graphviz`` and every
    ``key -> lines`` entry of the result is written to
    ``<debug_dir>/<name>_<key>.dot``.

    Args:
        args: mapping of task options. This function reads ``"input_file"``
            and ``"debug_dir"``; ``preload_schema`` receives the whole
            mapping.
    """
    #logging.info( "called task_graphviz" )

    filename = args["input_file"]
    the_schema = CnsSchema()
    the_schema.preloaded_schema_list = preload_schema(args)
    the_schema.jsonld2mem4file(filename)

    #validate if we can reproduce the same jsonld based on input
    # NOTE(review): the parsed document is never used below; the call is kept
    # only because it still fails early on an unreadable input file.
    jsonld_input = file2json(filename)

    name = os.path.basename(args["input_file"]).replace(".jsonld", "")
    # Plain raw string instead of ur"...": the ``ur`` prefix is a SyntaxError
    # on Python 3, while r"..." matches the same characters on Python 2 and 3.
    graph_name = re.sub(r"[-\.]", "_", name)
    ret = run_graphviz(the_schema, graph_name)
    for key, lines in ret.items():
        xdebug_file = os.path.join(args["debug_dir"], name + "_" + key + u".dot")
        # ``lines`` is passed as a single element, exactly as before.
        lines2file([lines], xdebug_file)
コード例 #2
0
ファイル: cns_convert.py プロジェクト: dingmaotu/kgtool
def task_convert(args):
    """Convert every item of ``args["input_file"]`` and log the outcome.

    The top-level schema ``../schema/cns_top_v2.0.jsonld`` is imported first
    (its path goes through ``file2abspath`` together with ``__file__``), then
    each item of the JSON input is passed to ``run_convert`` with the types
    ``[item["mainType"], "Thing"]`` and its position as the only primary key.
    Each converted item and, at the end, ``report.data`` are logged.

    Args:
        args: mapping of task options; only ``"input_file"`` is read here.
    """
    logging.info("called task_convert")

    schema_path = file2abspath("../schema/cns_top_v2.0.jsonld", __file__)
    loaded_schema = CnsSchema()
    loaded_schema.import_jsonld(schema_path)

    records = file2json(args["input_file"])
    report = CnsBugReport()
    for position, record in enumerate(records):
        converted = run_convert(
            loaded_schema,
            record,
            [record["mainType"], "Thing"],
            [position],
            report,
        )
        logging.info(json4debug(converted))

    logging.info(json4debug(report.data))
コード例 #3
0
def task_graphviz(args):
    """Export the schema found in ``args["input_file"]`` as Graphviz files.

    The JSON-LD file is imported into a ``CnsSchema`` together with the list
    returned by ``preload_schema(args)``. ``run_graphviz`` is then called with
    the schema and a graph name derived from the input file name, and every
    ``key -> lines`` entry of its result is written to
    ``<debug_dir>/<name>_<key>.dot``.

    Args:
        args: mapping of task options. This function reads ``"input_file"``
            and ``"debug_dir"``; ``preload_schema`` receives the whole
            mapping.
    """
    #logging.info( "called task_graphviz" )

    filename = args["input_file"]
    loaded_schema = CnsSchema()
    preloaded_schema_list = preload_schema(args)
    loaded_schema.import_jsonld(filename, preloaded_schema_list)

    #validate if we can reproduce the same jsonld based on input
    # NOTE(review): the parsed document is never used below; the call is kept
    # only because it still fails early on an unreadable input file.
    jsonld_input = file2json(filename)

    # Graph name: base name up to the first dot, with "-" mapped to "_".
    name = os.path.basename(args["input_file"]).split(u".")[0]
    # str.replace instead of re.sub(ur"-", ...): the ``ur`` prefix is a
    # SyntaxError on Python 3 and no regex is needed for a single character.
    name = name.replace("-", "_")
    ret = run_graphviz(loaded_schema, name)
    for key, lines in ret.items():
        xdebug_file = os.path.join(args["debug_dir"],
                                   name + "_" + key + u".dot")
        # ``lines`` is passed as a single element, exactly as before.
        lines2file([lines], xdebug_file)
コード例 #4
0
def task_validate(args):
    """Validate data files against a schema and write the resulting reports.

    Steps, in order:

    1. Import the schema named by ``args["input_schema"]`` (default
       ``"schema/cns_top.jsonld"``) together with ``preload_schema(args)``.
    2. Seed ``report[XTEMPLATE]`` with a zero counter per schema template and
       a ``parent_<class>`` entry per template class and per ``rdfs:Class``
       definition.
    3. Validate every existing file matched by the glob pattern
       ``args["input_file"]``. With ``args["option"] == "jsons"`` each line
       is parsed as one JSON document; otherwise the whole file is loaded
       with ``file2json`` and validated in one call.
    4. Write the sorted entity listing to ``args["output_validate_entity"]``,
       the CSV report via ``write_csv_report`` and a JSON dump of the report
       next to ``args["output_validate_report"]``.

    Args:
        args: mapping of task options. Keys read here: ``"input_schema"``
            (optional), ``"input_file"``, ``"option"`` (optional),
            ``"output_validate_entity"`` and ``"output_validate_report"``.
    """
    logging.info( "called task_validate" )
    schema_filename = args.get("input_schema")
    if not schema_filename:
        schema_filename = "schema/cns_top.jsonld"

    preloadSchemaList = preload_schema(args)
    loaded_schema = CnsSchema()
    loaded_schema.import_jsonld(schema_filename, preloadSchemaList)


    filepath = args["input_file"]
    filename_list = glob.glob(filepath)
    report = init_report()

    # init xtemplate
    report[XTEMPLATE] = collections.Counter()
    for template in loaded_schema.metadata["template"]:
        d = template["refClass"]
        p = template["refProperty"]
        key_cp = u"cp_{}_{}_{}".format(d, d, p)
        # "+= 0" creates the key so it shows up in the report with count 0
        report[XTEMPLATE][key_cp] += 0
    logging.info(json4debug(report[XTEMPLATE]))

    # init class path dependency
    for template in loaded_schema.metadata["template"]:
        d = template["refClass"]
        key_cp = u"parent_{}".format(d)
        report[XTEMPLATE][key_cp] = loaded_schema.index_inheritance["rdfs:subClassOf"].get(d)

    for definition in loaded_schema.definition.values():
        if "rdfs:Class" in definition["@type"]:
            d = definition["name"]
            key_cp = u"parent_{}".format(d)
            report[XTEMPLATE][key_cp] = loaded_schema.index_inheritance["rdfs:subClassOf"].get(d)

    #validate
    lines = []

    for filename in filename_list:
        logging.info(filename)
        if not os.path.exists(filename):
            continue

        if args.get("option") == "jsons":
            for idx, line in enumerate(file2iter(filename)):
                # periodic progress output, including a full report dump
                if idx % 10000 ==0:
                    logging.info(idx)
                    logging.info(json4debug(report))
                json_data = json.loads(line)
                run_validate(loaded_schema, json_data, report)
                stat_kg_report_per_item(json_data, None, report["stats"])

                # collection entity listing
                if "CnsLink" not in json_data["@type"]:
                    entity_simple = [
                        json_data["@type"][0],
                        json_data.get("name",""),
                        "\""+u",".join(json_data.get("alternateName",[]))+"\""
                    ]
                    lines.append(u",".join(entity_simple))

        else:
            jsondata = file2json(filename)
            run_validate(loaded_schema, jsondata, report)

    #out
    filename = args["output_validate_entity"]
    logging.info(filename)
    lines = sorted(lines)

    fields = ["main_type","name","alternateName"]
    lines.insert(0, u",".join(fields))
    lines2file(lines, filename)

    #display report
    logging.info(json4debug(report))

    #write report csv
    write_csv_report(args, report, loaded_schema)

    # Derive the JSON report path by swapping only the file extension.
    # A blanket str.replace("csv", "json") would also rewrite "csv" inside
    # directory names, and would return the unchanged CSV path when the name
    # contains no "csv" at all, so the JSON dump would be written there.
    report_csv = args["output_validate_report"]
    report_root, report_ext = os.path.splitext(report_csv)
    if report_ext.lower() == ".csv":
        filename = report_root + ".json"
    else:
        filename = report_csv + ".json"
    logging.info(filename)
    json2file(report, filename)
コード例 #5
0
ファイル: cns_model_table.py プロジェクト: dingmaotu/kgtool
 def __init__(self):
     """Create the backing ``CnsSchema`` and expose its report object."""
     # cnSchema storage
     schema = CnsSchema()
     self.schema = schema
     # same report object as the schema's, not a copy
     self.report = schema.report
コード例 #6
0
ファイル: cns_model_table.py プロジェクト: dingmaotu/kgtool
class CnsModelTable():
    """Load table (excel-style) data into a ``CnsSchema``.

    ``excel_data`` is a list of sheets; each sheet is a dict that provides at
    least ``"sheetname"``, ``"columns"`` and ``"rows"``. The sheets that are
    understood, and how their rows are converted, are described by the
    module-level ``CNS_SCHEMA_SHEET`` / ``CNS_SCHEMA_SHEET_INDEX`` tables.
    All problems are recorded through ``self.report.report_bug``.
    """

    def __init__(self):
        # cnSchema storage
        self.schema = CnsSchema()
        # shares the schema's report object
        self.report = self.schema.report

    def table2mem(self, excel_data):
        """Validate and load ``excel_data`` into ``self.schema``.

        Runs sheet validation, row loading, ``schema.build()`` and a final
        validation of the generated JSON-LD, stopping after the first stage
        that leaves the report with a bug.

        Returns:
            ``self.schema`` on success, ``False`` as soon as
            ``self.report.has_bug()`` is true after a stage.
        """
        #logging.info(json4debug(excel_data))
        self._run_validate_excel_data(excel_data)
        if self.report.has_bug():
            return False

        self._run_load_excel_data(excel_data)
        if self.report.has_bug():
            return False

        self.schema.build()
        if self.report.has_bug():
            return False

        #validate4jsonld
        json_data = self.schema.mem2jsonld()
        run_validate_recursive(self.schema, json_data, self.report)
        if self.report.has_bug():
            return False

        return self.schema

    def _run_load_excel_data(self, excel_data):
        """Convert the rows of every supported sheet and store them.

        Sheets are processed in ``CNS_SCHEMA_SHEET`` order. A supported sheet
        that is absent from ``excel_data`` is skipped. Within one sheet, a
        repeated item name is reported as
        ``error_excel_row_duplicated_name``.
        """
        excel_data_index = {x["sheetname"]: x for x in excel_data}
        for schema_sheet in CNS_SCHEMA_SHEET:
            sheet_name = schema_sheet["sheetname"]
            sheet = excel_data_index.get(sheet_name)
            # Guard on the looked-up sheet: the schema entry itself is always
            # present, so testing it never skipped a missing sheet and
            # ``sheet["rows"]`` below failed on None.
            if sheet is None:
                continue

            #process rows
            visited_name = set()
            for row_index, row in enumerate(sheet["rows"]):
                if not self._validate_one_row(row, row_index, sheet_name,
                                              schema_sheet):
                    continue

                cns_item = self._convert_one_row(row, schema_sheet)

                #check for duplicated name
                if cns_item is None:
                    # metadata rows produce no item
                    pass
                elif cns_item["name"] in visited_name:
                    bug = {
                        "category":
                        "error_excel_row_duplicated_name",
                        # unicode template: on Python 2 a byte-string
                        # template raises UnicodeEncodeError when formatted
                        # with a non-ASCII unicode name
                        "description":
                        u"duplicated name sheet={}, row={}, name={}".format(
                            sheet_name, row_index - 1, cns_item["name"]),
                    }
                    self.report.report_bug(bug)
                else:
                    visited_name.add(cns_item["name"])

    def _convert_one_row(self, row, schema_sheet):
        """Convert one row and store it in ``self.schema``.

        Rows of the ``metadata`` sheet are stored with ``add_metadata`` and
        yield ``None``. Rows of ``definition``, ``template`` and
        ``changelog`` are turned into an item whose ``@id``, ``@type`` and
        ``name`` come from the sheet's ``config``, completed by
        ``_excel2jsonld_item``, stored, and returned.

        Raises:
            AssertionError: for any other sheet name.
        """
        sheet_name = schema_sheet["sheetname"]

        if sheet_name == "metadata":
            self.schema.add_metadata(row["property"], row["value"])
            return None

        #logging.info(json4debug(row))
        config = schema_sheet["config"]
        cns_item = {
            "@id": config["id_pattern"].format(**row),
            # deep copy so items never share the predefined type list
            "@type": copy.deepcopy(config["type_predefined"]),
            "name": config["name_pattern"].format(**row)
        }

        cns_item = _excel2jsonld_item(cns_item, row)

        if sheet_name == "definition":
            self.schema.set_definition(cns_item)
        elif sheet_name in ("template", "changelog"):
            self.schema.add_metadata(sheet_name, cns_item)
        else:
            # explicit raise (same exception type as the former
            # ``assert False``) so the check survives ``python -O``
            raise AssertionError(
                u"unsupported sheet name: {!r}".format(sheet_name))
        return cns_item

    def _run_validate_excel_data(self, excel_data):
        """Check that all required sheets exist and validate their headers.

        Sheets that are not listed in ``CNS_SCHEMA_SHEET_INDEX`` are ignored
        without a report entry; missing ones are reported as
        ``error_excel_sheet_missing``.
        """
        excel_data_index = {x["sheetname"]: x for x in excel_data}

        #check missing sheets
        sheet_name_diff = set(CNS_SCHEMA_SHEET_INDEX) - set(excel_data_index)
        if sheet_name_diff:
            bug = {
                "keywords": ["error", "table2mem"],
                "category":
                "error_excel_sheet_missing",
                # sorted so the message does not depend on set ordering
                "description":
                u"process excel, found missing sheets [{}]".format(
                    u", ".join(sorted(sheet_name_diff))),
            }
            self.report.report_bug(bug)

        #validate sheet header
        for sheet in excel_data:
            sheet_name = sheet["sheetname"]
            schema_sheet = CNS_SCHEMA_SHEET_INDEX.get(sheet_name)
            if not schema_sheet:
                # unsupported sheet, nothing to validate
                continue

            self._validate_one_sheet(sheet_name, sheet, schema_sheet)

    def _validate_one_sheet(self, sheet_name, sheet, schema_sheet):
        """Report ``warn_excel_column_missing`` for absent expected columns.

        Args:
            sheet_name: name used in the report message.
            sheet: sheet data; its ``"columns"`` are the columns found.
            schema_sheet: sheet description; its ``"columns"`` are expected.
        """
        #validate sheet column
        header_diff = set(schema_sheet["columns"]) - set(sheet["columns"])
        if header_diff:
            bug = {
                "category":
                "warn_excel_column_missing",
                # sorted so the message does not depend on set ordering
                u"description":
                u"excel (sheet={}) missing columns [{}]".format(
                    sheet_name, u",".join(sorted(header_diff)))
            }
            self.report.report_bug(bug)

    def _validate_one_row(self, row, row_index, sheet_name, schema_sheet):
        """Return True when ``row`` should be converted.

        Rows rejected by ``_is_valid_row`` are skipped without a report.
        Otherwise every column listed in
        ``schema_sheet["config"]["non_empty_columns"]`` must hold a non-empty
        value; each violation is reported as ``error_excel_cell_empty_value``
        and makes the result False.
        """
        ret = True
        #logging.info(idx)
        if not _is_valid_row(row):
            #just skip, no need to report error
            return False

        for p in schema_sheet["config"]["non_empty_columns"]:
            if is_empty_string(row.get(p)):
                bug = {
                    "category":
                    "error_excel_cell_empty_value",
                    u"description":
                    u"excel cell expect non-empty value. sheet={} row={} column={}, found empty value"
                    .format(sheet_name, row_index, p),
                    "value":
                    row,
                }
                self.report.report_bug(bug)
                # keep checking so every empty cell of the row is reported
                ret = False

        return ret