Example #1
    def __init__(self, dataset: str, dir: Path) -> None:
        self.ont = get_ontology(dataset)
        self.sms = get_semantic_models(dataset)
        self.sm_prefix_index = {sm.id[:3]: sm for sm in self.sms}
        self.sm_attr2stypes: Dict[str, Dict[str, List[SemanticType]]] = {}
        assert len(self.sm_prefix_index) == len(
            self.sms), "source id prefixes must be unique"

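        # collect every class URI and predicate that appears in the gold semantic models;
        # these are used below to resolve the domains/types read from the Serene output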
        class_uris = set()
        predicates = set()
        for sm in self.sms:
            for n in sm.graph.iter_data_nodes():
                e = n.get_first_incoming_link()
                class_uri = e.get_source_node().label.decode()
                predicate = e.label.decode()

                class_uris.add(class_uri)
                predicates.add(predicate)

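        # read Serene's predicted semantic types from the *.df.csv files and map
        # their domains/types back to the class URIs and predicates collected above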
        for file in dir.iterdir():
            if file.name.endswith(".df.csv"):
                prefix = file.name[:3]
                self.sm_attr2stypes[prefix] = self.read_serene_stypes(file)
                for attr_lbl, stypes in self.sm_attr2stypes[prefix].items():
                    for stype in stypes:
                        stype.domain = self.recover_class_uris(
                            stype.domain, class_uris)
                        stype.type = self.recover_predicates(
                            stype.type, predicates)
Example #2
def compute_mohsen_stypes(dataset: str, train_sms: List[SemanticModel]):
    sms = get_semantic_models(dataset)
    train_sm_ids = [sm.id for sm in train_sms]

    exec_dir = Path(config.fsys.debug.as_path()) / "tmp" / f"mohsen-styper-{get_short_train_name(train_sms)}"
    if exec_dir.exists():
        shutil.rmtree(exec_dir)
    exec_dir.mkdir(exist_ok=True, parents=True)

    semantic_types = {}

    # run the per-source typing jobs in parallel to save time
    # with ThreadPool(os.cpu_count() // 2) as pool:
    with ThreadPool(6) as pool:
        results = {}
        # because Karma re-learns semantic types for every data source, we parallelize over data sources
        for sm in sms:
            if sm.id in train_sm_ids:
                local_train_sms = [s for s in train_sms if s.id != sm.id]
            else:
                local_train_sms = train_sms

            local_exec_dir = exec_dir / sm.id
            local_exec_dir.mkdir(exist_ok=True)

            results[sm.id] = pool.apply_async(worker_get_stype, (dataset, local_train_sms, sm, local_exec_dir))

        for sid, result in results.items():
            semantic_types[sid] = result.get()

    output_dir = Path(config.datasets[dataset].karma_version.as_path()) / "semantic-types"
    output_dir.mkdir(exist_ok=True)
    serializeJSON(semantic_types, output_dir / f"{get_short_train_name(train_sms)}.json", indent=4)
    return semantic_types
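A minimal usage sketch for compute_mohsen_stypes, assuming the same module-level imports as the snippet above; the dataset name and the 14-source training split are illustrative, borrowed from other examples on this page:

# Sketch only: compute Karma/Mohsen semantic types for one leave-out split.
dataset = "museum_edm"  # assumption: dataset name taken from other examples
train_sms = get_semantic_models(dataset)[:14]
semantic_types = compute_mohsen_stypes(dataset, train_sms)
print("predicted semantic types for", len(semantic_types), "sources")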
Example #3
    def __init__(self,
                 dataset: str,
                 train_source_ids: List[str],
                 load_circular_dependency: bool = True,
                 training_examples: Optional[List[Example]] = None):
        """
        :param dataset:
        :param train_source_ids:
        :param load_circular_dependency:
        :param training_examples: list of training examples used to build the weak models; not needed at testing time (i.e. None) because the weak models have already been built
        """
        self.dataset = dataset
        self.source_models = {sm.id: sm for sm in get_semantic_models(dataset)}
        self.train_source_ids = set(train_source_ids)
        self.top_k_semantic_types = Settings.get_instance(
        ).semantic_labeling_top_n_stypes

        self.training_models = [
            self.source_models[sid] for sid in train_source_ids
        ]
        self.typer: SemanticTyper = create_semantic_typer(
            dataset, self.training_models)

        self.testing_models = [
            self.source_models[sid] for sid in set(
                self.source_models.keys()).difference(train_source_ids)
        ]
        self.training_examples = training_examples

        # local models
        self.multival_predicate = MultiValuePredicate.get_instance(
            self.training_models)
        self.statistic = Statistic.get_instance(self.training_models)
        # self.data_constraint = get_data_constraint_model(dataset, self.training_models)
        self.stype_assistant = get_stype_assistant_model(
            dataset, self.training_models)
        self.local_structure = LocalStructure.get_instance(
            self.training_models)
        self.attribute_same_scope = AttributeScope.get_instance(self.dataset)
        self.duplication_tensors = DuplicationTensors.get_instance(
            self.training_models)

        self.primary_key: PrimaryKey = PrimaryKey.get_instance(
            dataset, self.training_models)
        self.cardinality = CardinalityFeatures.get_instance(dataset)

        # STEP 1: add semantic types
        self.typer.semantic_labeling(self.training_models,
                                     self.testing_models,
                                     self.top_k_semantic_types,
                                     eval_train=True)

        # STEP 2: load circular dependency like node_prob
        if load_circular_dependency:
            self.node_prob = NodeProb(self, load_classifier=True)
Example #4
def create_semantic_typer(dataset: str,
                          train_sms: List[SemanticModel]) -> SemanticTyper:
    settings = Settings.get_instance()
    if settings.semantic_labeling_method == Settings.MohsenJWS:
        # noinspection PyTypeChecker
        return MohsenSemanticTyper.get_instance(dataset, train_sms)

    if settings.semantic_labeling_method == Settings.ReImplMinhISWC:
        return SemanticTyper.get_instance(dataset, train_sms)

    if settings.semantic_labeling_method == Settings.MohsenJWS + "-Oracle":
        # noinspection PyTypeChecker
        return SemiOracleSemanticLabeling(
            MohsenSemanticTyper.get_instance(dataset, train_sms))

    if settings.semantic_labeling_method == Settings.ReImplMinhISWC + "-Oracle":
        # noinspection PyTypeChecker
        return SemiOracleSemanticLabeling(
            SemanticTyper.get_instance(dataset, train_sms))

    if settings.semantic_labeling_method == Settings.OracleSL:
        # noinspection PyTypeChecker
        return OracleSemanticLabeling()

    if settings.semantic_labeling_method == "OracleSL-Constraint":
        # noinspection PyTypeChecker
        return ConstraintOracleSemanticLabeling()

    if settings.semantic_labeling_method == "SereneSemanticType":
        sms = get_semantic_models(dataset)
        if dataset == "museum_edm" and train_sms == sms[:14]:
            serene_dir = "/workspace/tmp/serene-python-client/datasets/GOLD/museum_edm_stypes/kfold-s01-s14"
        elif dataset == "museum_edm" and train_sms == sms[14:]:
            serene_dir = "/workspace/tmp/serene-python-client/datasets/GOLD/museum_edm_stypes/kfold-s15-s28"
        elif dataset == "museum_edm" and train_sms == sms[7:21]:
            serene_dir = "/workspace/tmp/serene-python-client/datasets/GOLD/museum_edm_stypes/kfold-s08-s21"
        elif dataset == "museum_crm" and train_sms == sms[:14]:
            serene_dir = "/workspace/tmp/serene-python-client/datasets/GOLD/museum_crm_stypes/kfold-s01-s14"
        elif dataset == "museum_crm" and train_sms == sms[14:]:
            serene_dir = "/workspace/tmp/serene-python-client/datasets/GOLD/museum_crm_stypes/kfold-s15-s28"
        elif dataset == "museum_crm" and train_sms == sms[7:21]:
            serene_dir = "/workspace/tmp/serene-python-client/datasets/GOLD/museum_crm_stypes/kfold-s08-s21"
        else:
            raise Exception("Invalid configuration of serene semantic types")

        # noinspection PyTypeChecker
        return SereneSemanticTypes(dataset, Path(serene_dir))

    raise Exception(
        f"Invalid semantic typer: {settings.semantic_labeling_method}")
Example #5
    def __init__(self, dataset: str, model: TemplateLogLinearModel,
                 tf_domain: GrowableBinaryVectorDomain,
                 pairwise_domain: GrowableBinaryVectorDomain) -> None:
        self.dataset = dataset
        self.source_models: Dict[str, SemanticModel] = {
            s.id: s
            for s in get_semantic_models(dataset)
        }
        self.inference = BeliefPropagation.get_constructor(InferProb.MARGINAL)
        self.map_inference = BeliefPropagation.get_constructor(InferProb.MAP)
        self.model: TemplateLogLinearModel = model
        for template in model.templates:
            if isinstance(template, CachedTemplateFactorConstructor):
                template.disable_cache()
        self.tf_domain: GrowableBinaryVectorDomain = tf_domain
        self.pairwise_domain = pairwise_domain
        self.example_annotator: ExampleAnnotator = None
        self.max_n_tasks = Settings.get_instance().max_n_tasks
Example #6
    def __init__(self, dataset: str) -> None:
        self.dataset = dataset
        self.attribute_same_scope_matrix: Dict[
            str, Dict[Tuple[bytes, bytes], bool]] = {}
        for sm in get_semantic_models(dataset):
            self.attribute_same_scope_matrix[sm.id] = {}
            attr_paths = [
                attr.label.split(Schema.PATH_DELIMITER) for attr in sm.attrs
            ]
            for i in range(len(sm.attrs)):
                for j in range(i + 1, len(sm.attrs)):
                    is_same_scope = attr_paths[i][:-1] == attr_paths[j][:-1]
                    self.attribute_same_scope_matrix[sm.id][
                        (sm.attrs[i].label.encode('utf-8'),
                         sm.attrs[j].label.encode('utf-8'))] = is_same_scope
                    self.attribute_same_scope_matrix[sm.id][
                        (sm.attrs[j].label.encode('utf-8'),
                         sm.attrs[i].label.encode('utf-8'))] = is_same_scope
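This constructor appears to back the AttributeScope.get_instance(self.dataset) call in Example #3; a small sketch of querying the scope matrix, assuming that class name (the attribute indices are illustrative):

# Sketch only: check whether two attributes of the first source share a scope.
scope = AttributeScope.get_instance("museum_edm")
sm = get_semantic_models("museum_edm")[0]
a, b = sm.attrs[0].label, sm.attrs[1].label
print(scope.attribute_same_scope_matrix[sm.id][(a.encode('utf-8'), b.encode('utf-8'))])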
Example #7
    def _init(self):
        self.source_mappings: Dict[str, SemanticModel] = {
            s.id: s
            for s in get_semantic_models(self.dataset)
        }
        self.train_columns = [
            col for tbl in self.train_tables for col in tbl.columns
        ]
        self.train_column_stypes: List[str] = []
        for tbl in self.train_tables:
            sm = self.source_mappings[tbl.id]
            for col in tbl.columns:
                dnode = sm.graph.get_node_by_id(
                    sm.get_attr_by_label(col.name).id)
                dlink = dnode.get_first_incoming_link()
                self.train_column_stypes.append(dlink.label.decode("utf-8"))

        self.test_columns = [
            col for tbl in self.test_tables for col in tbl.columns
        ]
        self.name2table: Dict[str, ColumnBasedTable] = {
            tbl.id: tbl
            for tbl in chain(self.train_tables, self.test_tables)
        }
        self.col2idx: Dict[str, int] = {
            col.id: i
            for i, col in enumerate(
                chain(self.train_columns, self.test_columns))
        }
        self.col2types: Dict[str, Tuple[str, str]] = {}
        self.col2dnodes: Dict[str, GraphNode] = {}

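        # record, for every train/test column, its (class URI, predicate) semantic
        # type and the data node it is mapped to in the gold semantic model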
        col: Column
        for col in chain(self.train_columns, self.test_columns):
            sm = self.source_mappings[col.table_name]
            attr = sm.get_attr_by_label(col.name)
            dnode = sm.graph.get_node_by_id(attr.id)
            link = dnode.get_first_incoming_link()
            self.col2types[col.id] = (link.get_source_node().label, link.label)
            self.col2dnodes[col.id] = dnode

        assert len(self.col2types) == len(self.train_columns) + len(
            self.test_columns), "column name must be unique"
Example #8
def get_stype_assistant_model(dataset: str, train_sms: List[SemanticModel]):
    global _instance
    if _instance is None:
        cache_file = get_cache_dir(dataset, train_sms) / "weak_models" / "stype_assistant.pkl"
        cache_file.parent.mkdir(exist_ok=True, parents=True)
        need_rebuilt = True

        if cache_file.exists():
            SemanticTypeAssistant.logger.debug("Try to load previous run...")
            model, cache_dataset, cache_train_sm_ids = deserialize(cache_file)
            if cache_dataset == dataset and cache_train_sm_ids == {sm.id for sm in train_sms}:
                need_rebuilt = False

            ont_graph = get_ont_graph(dataset)
            ont = get_ontology(dataset)
            stat = Statistic.get_instance(train_sms)
            ota = EmpiricalTripleAdviser(ont_graph, ont, stat.p_triple, 15)
            model.triple_adviser = ota

        if need_rebuilt:
            ont_graph = get_ont_graph(dataset)
            ont = get_ontology(dataset)
            stat = Statistic.get_instance(train_sms)
            ota = EmpiricalTripleAdviser(ont_graph, ont, stat.p_triple, 15)

            typer = SemanticTyper.get_instance(dataset, train_sms)
            try:
                typer.load_model()
            except Exception:
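                # could not load a previously trained typer; run semantic labeling
                # from scratch on the held-out (non-training) sources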
                sms = get_semantic_models(dataset)
                train_ids = {sm.id for sm in train_sms}
                typer.semantic_labeling(train_sms, [sm for sm in sms if sm.id not in train_ids], 4)

            model = SemanticTypeAssistant(train_sms, typer, ota)
            model.triple_adviser = None
            serialize((model, dataset, {sm.id for sm in train_sms}), cache_file)
            model.triple_adviser = ota

        _instance = model

    return _instance
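A minimal usage sketch, matching how Example #3 calls get_stype_assistant_model(dataset, self.training_models); the split below is illustrative:

# Sketch only: build (or load from cache) the semantic-type assistant for a split.
train_sms = get_semantic_models("museum_edm")[:14]
stype_assistant = get_stype_assistant_model("museum_edm", train_sms)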
Example #9
def serialize_rust_input(dataset: str, workdir: str,
                         train_sms: List[SemanticModel],
                         test_sms: List[SemanticModel], foutput: Path):
    primary_key = PrimaryKey.get_instance(dataset, train_sms)

    sms = get_semantic_models(dataset)
    sm_index = {sm.id: i for i, sm in enumerate(sms)}
    train_sm_idxs = [sm_index[sm.id] for sm in train_sms]
    test_sm_idxs = [sm_index[sm.id] for sm in test_sms]

    predicted_parent_stypes = serialize_stype_assistant(
        dataset, sms, train_sms, test_sms)
    cardinality = CardinalityFeatures.get_instance(dataset)
    semantic_labeling(dataset, train_sms, test_sms)

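    # bundle the models, predicted parent stypes, primary keys, cardinality
    # features, and the ontology graph into one JSON document for the Rust side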
    data = {
        "dataset": dataset,
        "workdir": str(workdir),
        "semantic_models": [sm.to_dict() for sm in sms],
        "predicted_parent_stypes": {
            "stype_details": predicted_parent_stypes
        },
        "train_sm_idxs": train_sm_idxs,
        "test_sm_idxs": test_sm_idxs,
        "feature_primary_keys": primary_key.to_dict(),
        "feature_cardinality_features": {
            sm_id: {
                "columns": matrix.columns,
                "matrix": matrix.matrix
            }
            for sm_id, matrix in cardinality.cardinality_matrices.items()
        },
        "ont_graph": serialize_ont_graph(dataset)
    }

    serializeJSON(data, foutput, indent=4)
Example #10
            print("Saved!!")

        if len(self.exec_buffer) > 0:
            self.proceed_cmd()
        self.worksheet.save(self.model_file)
        run_in_terminal(noti)

    def app_bottom_toolbar(self):
        return HTML(
            'Tips: ctrl-p (render page), ctrl-u (undo), ctrl-r (redo), ctrl-s (save), ctrl-c (abort current prompt)'
        )


if __name__ == '__main__':
    dataset = "museum_crm"
    sm_names = [sm.id for sm in get_semantic_models("museum_edm")]
    ont = get_ontology(dataset)
    train_sms = get_semantic_models(dataset)
    R2RML.load_python_scripts(
        Path(config.datasets[dataset].python_code.as_path()))

    dataset_dir = Path("/workspace/semantic-modeling/data/museum-jws-crm")
    data_files = []
    # data_files = [file for file in (dataset_dir / "tmp").iterdir() if file.name.startswith("s")]
    for file in (dataset_dir / "sources").iterdir():
        if file.name.startswith("s"):
            data_files.append(file)

    for sm_name in sm_names:
        # if int(sm_name[1:3]) <= 18:
        #     continue
Example #11
            eval_results[chuffed_idx] = {'precision': 0, 'recall': 0, 'f1': 0}
        else:
            ssd = ssds[0]
            # ssd.graph.render()
            result = smodel_eval.f1_precision_recall(gold_graph, ssd.graph,
                                                     DataNodeMode.NO_TOUCH)
            eval_results[chuffed_idx]['precision'] = result['precision']
            eval_results[chuffed_idx]['recall'] = result['recall']
            eval_results[chuffed_idx]['f1'] = result['f1']

    return eval_results


if __name__ == '__main__':
    dataset = "museum_crm"
    sms = get_semantic_models(dataset)
    sms_index = {sm.id[:3]: sm for sm in sms}
    ont = get_ontology(dataset)
    ont.register_namespace("serene", "http://au.csiro.data61/serene/dev#")

    # get serene output by sms
    kfold_results = []
    stype = "ReImplMinhISWC_False_pat"
    for kfold in ["kfold-s01-s14", "kfold-s15-s28", "kfold-s08-s21"]:
        kfold_sms_prefix = {
            sm[:3]
            for sm in get_sm_ids_by_name_range(
                *kfold.replace("kfold-", "").split("-"), [sm.id for sm in sms])
        }

        print("==== KFOLD:", kfold, "====")
Example #12
                                       eid, train_sm_ids)
                test_examples.append(example)

    serializeJSON(test_examples, workdir / "examples" / "test.json")
    return test_examples


if __name__ == '__main__':
    dataset = "museum_edm"
    Settings.get_instance(False).parallel_n_process = 6
    Settings.get_instance().max_n_tasks = 160
    Settings.get_instance().semantic_labeling_top_n_stypes = 4
    Settings.get_instance().searching_beam_width = 5
    Settings.get_instance().log_current_settings()

    source_models = get_semantic_models(dataset)
    train_sms = source_models[:6]
    test_sms = [sm for sm in source_models if sm not in train_sms]

    workdir = (Path(config.fsys.debug.as_path()) / dataset /
               "main_experiments" / get_short_train_name(train_sms))
    workdir.mkdir(exist_ok=True, parents=True)

    create_semantic_typer(dataset, train_sms).semantic_labeling(
        train_sms,
        test_sms,
        top_n=Settings.get_instance().semantic_labeling_top_n_stypes,
        eval_train=True)

    model_dir = workdir / "models" / "exp_no_3"
    model = Model.from_file(dataset, model_dir)
Example #13
                        help='Experiment directory; it must already exist')

    args = parser.parse_args()
    try:
        assert args.dataset is not None
        args.kfold = parse_kfold(args.dataset, args.kfold)
    except AssertionError:
        parser.print_help()
        raise

    return args


if __name__ == '__main__':
    args = get_shell_args()
    source_models: List[SemanticModel] = get_semantic_models(args.dataset)
    train_sms = [
        sm for sm in source_models if sm.id in args.kfold['train_sm_ids']
    ]
    test_sms = [
        sm for sm in source_models if sm.id in args.kfold['test_sm_ids']
    ]

    Settings.get_instance(False).semantic_labeling_method = args.semantic_typer
    Settings.get_instance().log_current_settings()

    typer = create_semantic_typer(args.dataset, train_sms)
    typer.semantic_labeling(train_sms, test_sms, 4, eval_train=True)

    exp_dir = Path(args.exp_dir)
    eval_sources(
Example #14
        # dump result into test_sources
        for source in chain(train_sources, test_sources):
            for col in source.attrs:
                try:
                    if col.label not in result[source.id].columns:
                        # this column is ignored
                        stypes = []
                    else:
                        stypes = result[source.id].columns[col.label]

                    col.semantic_types = [
                        KarmaSemanticType(col.id, stype.domain, stype.type,
                                          "Minhptx-ISWC2016-SemanticLabeling",
                                          stype.weight) for stype in stypes
                    ][:top_n]
                except Exception:
                    self.logger.exception(
                        "Hit exception for source: %s, col: %s",
                        source.get_id(), col.id)
                    raise


if __name__ == '__main__':
    dataset = "museum_crm"
    sources: List[SemanticModel] = get_semantic_models(dataset)[:5]

    train_size = 3
    typer = MinhptxSemanticLabeling(dataset, 200)
    typer.semantic_labeling(sources[:train_size], sources[train_size:], 4)
Example #15
                                    key=lambda x: space['children'][x],
                                    reverse=True)
            self.node_structure_space[n] = NodeStructureSpace(
                n, {x: i
                    for i, x in enumerate(space['parents'].keys())},
                {x: i
                 for i, x in enumerate(children_attrs)},
                [x[1] != b'DATA_NODE' for x in children_attrs],
                [space['children'][x] for x in children_attrs])

    @staticmethod
    def get_instance(train_sms: List[SemanticModel]) -> 'LocalStructure':
        sm_ids = {sm.id for sm in train_sms}

        if LocalStructure.instance is None:
            LocalStructure.instance = LocalStructure(train_sms)
            return LocalStructure.instance

        assert LocalStructure.instance.train_sm_ids == sm_ids
        return LocalStructure.instance


if __name__ == '__main__':
    import ujson

    dataset = "museum_edm"
    train_size = 14
    source_models = get_semantic_models(dataset)[:train_size]

    local_structure = LocalStructure.get_instance(source_models)
    print(ujson.dumps(local_structure.node_structure_space, indent=4))
Example #16
    node_id: crm:E12_Production1
    domain: crm:E12_Production
    type: karma:dummy
    input_attr_path: %s""" % attr_path)


if __name__ == '__main__':
    dataset = "museum_crm"
    ont = get_ontology(dataset)

    dataset_dir = Path(config.datasets[dataset].as_path())
    R2RML.load_python_scripts(
        Path(config.datasets[dataset].python_code.as_path()))

    # train the model first
    train_sms = get_semantic_models(dataset)[:-1]
    styper = SemanticTyper.get_instance(dataset, train_sms)

    # doing interactive modeling
    for tbl in get_raw_data_tables(dataset):
        if tbl.id in [sm.id for sm in train_sms]:
            continue

        print("Processing table:", tbl.id)
        print(tbl.head(10).to_string("double"))

        r2rml = R2RML.load_from_file(dataset_dir / "models-y2rml" /
                                     f"{tbl.id}-model.yml")
        sm = r2rml.apply_cmds(tbl)

        # gen_dummy_sm(sm, tbl)
Example #17
        with Pool() as p:
            tf_cols = p.map(TfidfDatabase._compute_tf,
                            [(self.tokenizer, col) for col in cols])

        for col, tf_col in zip(cols, tf_cols):
            tfidf = numpy.zeros(len(self.vocab))
            for w, tf in tf_col.items():
                if w in self.vocab:
                    tfidf[self.vocab[w]] = tf * numpy.log(
                        self.n_docs / (1 + self.invert_token_idx[w]))
            self.cache_col2tfidf[col.id] = tfidf

    @staticmethod
    def _compute_tf(args):
        tokenizer, col = args
        counter = Counter()
        sents = (subsent for sent in col.get_textual_data()
                 for subsent in sent.decode('utf-8').split("/"))
        for doc in tokenizer.pipe(sents, batch_size=50, n_threads=4):
            counter.update((str(w) for w in doc))

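        # normalize raw token counts into term frequencies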
        number_of_token = sum(counter.values())
        for token, val in counter.items():
            counter[token] = val / number_of_token
        return counter


if __name__ == '__main__':
    stype_db = SemanticTypeDB.create(
        "museum_edm", [sm.id for sm in get_semantic_models("museum_edm")[:14]])
    stype_db._build_db()
Example #18
def run_evaluation_workflow(dataset: str, scenario: Scenario, train_sms,
                            test_sms):
    ont: Ontology = get_ontology(dataset)
    karma_models: List[KarmaModel] = get_karma_models(dataset)
    semantic_models: List[SemanticModel] = get_semantic_models(dataset)
    train_sm_ids = [sm.id for sm in train_sms]

    sdesc_args = dict(
        dataset=dataset,
        train_sm_ids=train_sm_ids,
        # we always put predicted semantic types into learnedSemanticTypes, even for userSetSemanticTypes
        use_correct_type=False,
        use_old_semantic_typer=False,
        exec_dir=get_cache_dir(dataset, train_sms) / "mohsen_jws2015",
        sm_type_dir=Path(config.fsys.debug.as_path()) / "tmp" /
        "models-json-temp")
    # STEP 1: run semantic typing to generate semantic typing and put result to a temporal folder
    if sdesc_args['sm_type_dir'].exists():
        shutil.rmtree(sdesc_args['sm_type_dir'])
    sdesc_args['sm_type_dir'].mkdir(exist_ok=True, parents=True)

    top_k_types = Settings.get_instance().semantic_labeling_top_n_stypes
    typer = create_semantic_typer(dataset, train_sms)
    typer.semantic_labeling(train_sms, test_sms, top_k_types, eval_train=True)

    for sm, ksm in zip(semantic_models, karma_models):
        # assign semantic types to learnedSemanticTypes
        sm_alignment = SemanticModelAlignment(sm, ksm)
        for col in ksm.source_columns:
            attr = sm.get_attr_by_label(
                sm.graph.get_node_by_id(
                    sm_alignment.alignment[col.id]).label.decode('utf-8'))
            node = ksm.karma_graph.get_node_by_id(col.id)
            link = node.get_first_incoming_link()

            node.learned_semantic_types = [
                KarmaSemanticType(node.id, stype.domain, stype.type,
                                  typer.__class__.__name__,
                                  stype.confidence_score)
                for stype in attr.semantic_types
            ]
            node.user_semantic_types = [
                KarmaSemanticType(node.id,
                                  link.get_source_node().label.decode(),
                                  link.label.decode(), "User", 1.0)
            ]

        serializeJSON(ksm.to_normalized_json_model(ont),
                      sdesc_args['sm_type_dir'] / f"{ksm.id}-model.json",
                      indent=4)

    # STEP 2: invoking semantic modeling
    modeler = MohsenSemanticModeling(**sdesc_args)
    pred_sms = modeler.sm_prediction(train_sms, test_sms)

    # STEP 3: prediction semantic mapping result
    eval_hist = [["source", "precision", "recall", "f1", "stype-acc"]]
    if scenario == Scenario.SCENARIO_1:
        data_node_mode = DataNodeMode.IGNORE_DATA_NODE
    else:
        data_node_mode = DataNodeMode.NO_TOUCH

    for sm, pred_sm in zip(test_sms, pred_sms):
        eval_result = smodel_eval.f1_precision_recall(sm.graph, pred_sm.graph,
                                                      data_node_mode)
        eval_hist.append([
            sm.id, eval_result["precision"], eval_result["recall"],
            eval_result["f1"],
            smodel_eval.stype_acc(sm.graph, pred_sm.graph)
        ])

    eval_hist.append([
        'average',
        np.average([float(x[1]) for x in eval_hist[1:]]),
        np.average([float(x[2]) for x in eval_hist[1:]]),
        np.average([float(x[3]) for x in eval_hist[1:]]),
        np.average([float(x[4]) for x in eval_hist[1:]])
    ])
    serializeCSV(
        eval_hist,
        sdesc_args["exec_dir"] / f"evaluation_result_{scenario.value}.csv")

    # STEP 4: prediction semantic labeling result
    pred_stypes = modeler.semantic_labeling(train_sms, test_sms)
    for pred_stype, sm in zip(pred_stypes, test_sms):
        for attr in sm.attrs:
            if attr.label not in pred_stype:
                attr.semantic_types = []
            else:
                attr.semantic_types = pred_stype[attr.label]
    eval_sources(
        test_sms, sdesc_args["exec_dir"] /
        f"evaluation_result_{scenario.value}_stype.csv")

    # STEP 5: visualize the prediction
    (sdesc_args['exec_dir'] / "prediction-viz").mkdir(exist_ok=True)
    need_render_graphs = [
        (colorize_prediction(
            pred_sm.graph,
            AutoLabel.auto_label_max_f1(sm.graph, pred_sm.graph, False)[0]),
         sdesc_args['exec_dir'] / "prediction-viz" / f"{sm.id}.png")
        for sm, pred_sm in zip(test_sms, pred_sms)
    ]
    with ThreadPool(32) as p:
        p.map(render_graph, need_render_graphs)

    return eval_hist
Example #19

if __name__ == '__main__':
    # HYPER-ARGS
    args = get_shell_args()

    Settings.get_instance(
        False
    ).semantic_labeling_top_n_stypes = args.semantic_labeling_top_n_stypes
    Settings.get_instance().semantic_labeling_method = args.semantic_typer
    Settings.get_instance().log_current_settings()

    exp_dir = Path(args.exp_dir)
    assert exp_dir.exists()

    source_models = {sm.id: sm for sm in get_semantic_models(args.dataset)}
    train_sms = [source_models[sid] for sid in args.kfold['train_sm_ids']]
    test_sms = [source_models[sid] for sid in args.kfold['test_sm_ids']]

    eval_hist = run_evaluation_workflow(args.dataset, Scenario.SCENARIO_2,
                                        train_sms, test_sms)
    serializeCSV(eval_hist,
                 exp_dir / f"kfold-{get_short_train_name(train_sms)}.test.csv")
    serializeJSON(args,
                  exp_dir /
                  f"kfold-{get_short_train_name(train_sms)}.meta.json",
                  indent=4)
    shutil.move(
        get_cache_dir(args.dataset, train_sms) / "mohsen_jws2015",
        exp_dir / f"kfold-{get_short_train_name(train_sms)}")