Example 1
    def __init__(self, dataset: str, use_correct_type: bool, use_old_semantic_typer: bool, train_sm_ids: List[str],
                 exec_dir: Optional[Union[str, Path]] = None, sm_type_dir: Optional[Union[str, Path]] = None):
        self.dataset: str = dataset
        self.train_sm_ids = train_sm_ids
        self.ont = get_ontology(dataset)
        self.karma_models: Dict[str, KarmaModel] = {km.id: km for km in get_karma_models(dataset)}

        # can only be run once; re-invoking it will raise an error
        self.__has_run_modeling = False
        if exec_dir is None:
            exec_dir = get_cache_dir(dataset, train_sm_ids) / "mohsen_jws2015"
        self.exec_dir: Path = Path(exec_dir)
        self.sm_type_dir = sm_type_dir

        # parameters for Mohsen's algorithm
        self.use_old_semantic_typer = use_old_semantic_typer
        self.use_correct_type = use_correct_type
        assert Settings.get_instance().semantic_labeling_top_n_stypes <= 4
        self.num_candidate_semantic_type = 4
        self.multiple_same_property_per_node = True

        self.coherence = 1.0
        self.confidence = 1.0
        self.size_reduction = 0.5

        self.num_candidate_mappings = 50
        self.mapping_branching_factor = 50
        self.topk_steiner_tree = 10

        # take all candidates, i.e. effectively no cut-off
        self.cut_off = int(1e6)
        self.our_and_karma_sm_alignments = {}
Example 2
    def get_instance(dataset: str, train_sms: List[SemanticModel]):
        if PrimaryKey.instance is None:
            cache_file = get_cache_dir(
                dataset, train_sms) / "weak_models" / "primary_keys.json"
            if not cache_file.exists():
                train_sm_ids = {sm.id for sm in train_sms}
                train_tbls = {
                    tbl.id: tbl
                    for tbl in get_data_tables(dataset)
                    if tbl.id in train_sm_ids
                }
                predictions: Dict[str, List[dict]] = defaultdict(lambda: [])
                pesudo_primary_keys = {}

                for sm in train_sms:
                    jsonld_objects = jsonld_generator(sm, train_tbls[sm.id])
                    for n in sm.graph.iter_class_nodes():
                        fields = [
                            e.label.decode("utf-8")
                            for e in n.iter_outgoing_links()
                            if e.get_target_node().is_data_node()
                        ]
                        if len(fields) == 0:
                            continue
                        if 'karma:classLink' in fields:
                            pesudo_primary_keys[n.label] = 'karma:classLink'
                            continue

                        results = extract_node_data(n, jsonld_objects)
                        views = create_unique_views(results, fields)
                        predictions[n.label].append(
                            predict_pesudo_keys(fields, views))

                for class_lbl, preds in predictions.items():
                    # accumulate the predicted key scores across all training sources
                    total = defaultdict(int)
                    for pred in preds:
                        for link_lbl, score in pred.items():
                            total[link_lbl] += score
                    # the property with the highest total score becomes the pseudo primary key
                    pesudo_primary_keys[class_lbl] = max(total.items(),
                                                         key=lambda x: x[1])[0]

                PrimaryKey.instance = PrimaryKey({
                    k: v.encode('utf-8')
                    for k, v in pesudo_primary_keys.items()
                })
                cache_file.parent.mkdir(exist_ok=True, parents=True)
                serializeJSON(PrimaryKey.instance, cache_file, indent=4)
            else:
                PrimaryKey.instance: PrimaryKey = deserializeJSON(
                    cache_file, Class=PrimaryKey)

        return PrimaryKey.instance
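
The selection step in the loop above is a simple vote: every per-source prediction contributes a score to each candidate key property of a class, and the property with the highest accumulated score is chosen as the pseudo primary key. A minimal, standard-library-only sketch of that aggregation in isolation (the class and property labels below are made up for illustration):

from collections import defaultdict

predictions = {
    "crm:E22_Man-Made_Object": [      # hypothetical class label
        {"karma:uri": 3, "rdfs:label": 1},
        {"karma:uri": 2, "rdfs:label": 2},
    ]
}

pseudo_primary_keys = {}
for class_lbl, preds in predictions.items():
    total = defaultdict(int)
    for pred in preds:
        for link_lbl, score in pred.items():
            total[link_lbl] += score
    # pick the property with the highest accumulated score
    pseudo_primary_keys[class_lbl] = max(total.items(), key=lambda x: x[1])[0]

print(pseudo_primary_keys)  # {'crm:E22_Man-Made_Object': 'karma:uri'}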
Example 3
    def __init__(self,
                 dataset: str,
                 train_sms: List[SemanticModel],
                 exec_dir: Optional[Path] = None) -> None:
        self.dataset = dataset
        self.train_source_ids = {sm.id for sm in train_sms}
        if exec_dir is None:
            exec_dir = get_cache_dir(dataset, train_sms) / "semantic-labeling"
        self.exec_dir = Path(exec_dir)
        self.exec_dir.mkdir(exist_ok=True, parents=True)

        self.model = None
        self.stype_db = SemanticTypeDB.get_stype_db(
            dataset, [sm.id for sm in train_sms], self.exec_dir)
Example 4
def get_data_constraint_model(
    dataset: str,
    train_sms: List[SemanticModel],
) -> DataConstraint:
    global _instance
    if _instance is None:
        cache_file = get_cache_dir(
            dataset, train_sms) / "weak_models" / "data_constraint.pkl"
        cache_file.parent.mkdir(exist_ok=True, parents=True)
        need_rebuilt = True

        settings = Settings.get_instance()
        valid_threshold = settings.data_constraint_valid_threshold
        guess_datetime_threshold = settings.data_constraint_guess_datetime_threshold
        n_comparison_samples = settings.data_constraint_n_comparison_samples
        random_seed = settings.random_seed
        n_sample = settings.n_samples

        if cache_file.exists():
            DataConstraint.logger.debug("Try to load previous run...")
            model, cached_dataset, cached_train_sm_ids, extra_args = deserialize(
                cache_file)
            if cached_dataset == dataset \
                    and cached_train_sm_ids == {sm.id for sm in train_sms} \
                    and extra_args == (
                        valid_threshold, guess_datetime_threshold, n_comparison_samples,
                        random_seed, n_sample):
                need_rebuilt = False

        if need_rebuilt:
            DataConstraint.logger.debug("Re-build data-constraint model...")
            data_tables = [
                ColumnBasedTable.from_table(tbl)
                for tbl in get_sampled_data_tables(dataset)
            ]
            model = DataConstraint(train_sms, data_tables, valid_threshold,
                                   guess_datetime_threshold,
                                   n_comparison_samples)
            serialize((model, dataset, {sm.id
                                        for sm in train_sms},
                       (valid_threshold, guess_datetime_threshold,
                        n_comparison_samples, random_seed, n_sample)),
                      cache_file)

        _instance = model
    return _instance
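
Note how the cache payload above bundles the model with the dataset name, the training-source ids, and every hyper-parameter used to build it, so a stale cache is detected whenever any of them changes. A minimal sketch of the same cache-validation pattern, reduced to the standard library and a hypothetical build_fn:

import pickle
from pathlib import Path


def cached_build(cache_file: Path, key, build_fn):
    """Return the cached model if it was built with the same key; otherwise rebuild."""
    if cache_file.exists():
        model, cached_key = pickle.loads(cache_file.read_bytes())
        if cached_key == key:
            return model
    model = build_fn()
    cache_file.parent.mkdir(exist_ok=True, parents=True)
    cache_file.write_bytes(pickle.dumps((model, key)))
    return model


# hypothetical usage: the key mirrors (dataset, train ids, hyper-parameters)
# model = cached_build(cache_dir / "data_constraint.pkl",
#                      (dataset, frozenset(sm.id for sm in train_sms),
#                       (valid_threshold, guess_datetime_threshold, n_comparison_samples)),
#                      lambda: DataConstraint(train_sms, data_tables, ...))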
Example 5
def get_stype_assistant_model(dataset: str, train_sms: List[SemanticModel]):
    global _instance
    if _instance is None:
        cache_file = get_cache_dir(dataset, train_sms) / "weak_models" / "stype_assistant.pkl"
        cache_file.parent.mkdir(exist_ok=True, parents=True)
        need_rebuilt = True

        if cache_file.exists():
            SemanticTypeAssistant.logger.debug("Try to load previous run...")
            model, cache_dataset, cache_train_sm_ids = deserialize(cache_file)
            if cache_dataset == dataset and cache_train_sm_ids == {sm.id for sm in train_sms}:
                need_rebuilt = False
                # the triple adviser is stripped before serialization, so re-attach it here
                ont_graph = get_ont_graph(dataset)
                ont = get_ontology(dataset)
                stat = Statistic.get_instance(train_sms)
                ota = EmpiricalTripleAdviser(ont_graph, ont, stat.p_triple, 15)
                model.triple_adviser = ota

        if need_rebuilt:
            ont_graph = get_ont_graph(dataset)
            ont = get_ontology(dataset)
            stat = Statistic.get_instance(train_sms)
            ota = EmpiricalTripleAdviser(ont_graph, ont, stat.p_triple, 15)

            typer = SemanticTyper.get_instance(dataset, train_sms)
            try:
                typer.load_model()
            except Exception:
                # no trained typer is available yet; run semantic labeling to train it
                sms = get_semantic_models(dataset)
                train_ids = {sm.id for sm in train_sms}
                typer.semantic_labeling(train_sms, [sm for sm in sms if sm.id not in train_ids], 4)

            model = SemanticTypeAssistant(train_sms, typer, ota)
            # detach the adviser so it is not written into the cache, then re-attach it
            model.triple_adviser = None
            serialize((model, dataset, {sm.id for sm in train_sms}), cache_file)
            model.triple_adviser = ota

        _instance = model

    return _instance
Example 6
def create_rust_input(dataset: str, scenario: Scenario, train_sms, test_sms):
    train_sm_ids = [sm.id for sm in train_sms]
    exec_dir = get_cache_dir(dataset, train_sms) / "mohsen_jws2015"
    modeler = MohsenSemanticModeling(
        dataset,
        use_correct_type=False,
        use_old_semantic_typer=False,
        train_sm_ids=train_sm_ids,
        exec_dir=exec_dir,
        sm_type_dir=Path(config.fsys.debug.as_path()) / "tmp" / "models-json-temp")

    candidate_smss = modeler.sm_candidate_generation(train_sms, test_sms)
    if scenario == Scenario.SCENARIO_1:
        data_node_mode = DataNodeMode.IGNORE_DATA_NODE
    else:
        data_node_mode = DataNodeMode.NO_TOUCH

    train_sm_ids = {sm.id for sm in train_sms}
    real_test_sm_ids = {sm.id for sm in test_sms if sm.id not in train_sm_ids}

    train_eval_hist = get_eval_hist(train_sm_ids, test_sms, candidate_smss,
                                    data_node_mode)
    test_eval_hist = get_eval_hist(real_test_sm_ids, test_sms, candidate_smss,
                                   data_node_mode)

    serializeCSV(
        train_eval_hist,
        exec_dir / f"evaluation_result_{scenario.value}.train.oracle.csv")
    serializeCSV(
        test_eval_hist,
        exec_dir / f"evaluation_result_{scenario.value}.test.oracle.csv")

    # now create the input for the Rust bridge
    obj = {}
    for gold_sm, candidate_sms in zip(test_sms, candidate_smss):
        obj[gold_sm.id] = [c.graph.to_dict() for c in candidate_sms]
    serializeJSON(obj, exec_dir / "rust-karma-pred-input.json")
Example 7
    def get_classifier(self, retrain: bool, train_examples: List[Example]):
        # TODO: implement this properly; currently we have to train and save the model manually
        cached_file = get_cache_dir(
            self.example_annotator.dataset,
            list(self.example_annotator.train_source_ids)
        ) / "weak_models" / "node_prob_classifier.pkl"
        if not cached_file.exists() or retrain:
            self.logger.debug("Retrain new model")
            raw_X_train = make_data(self, train_examples)
            classifier = LogisticRegression(fit_intercept=True)

            X_train = numpy.asarray(
                [list(features.values())[1:] for features in raw_X_train])
            X_train, y_train = X_train[:, :-1], [
                int(x) for x in X_train[:, -1]
            ]

            scaler = StandardScaler().fit(X_train)
            # transform returns a new array; keep the scaled features for training
            X_train = scaler.transform(X_train)

            try:
                classifier.fit(X_train, y_train)
            except ValueError as e:
                assert str(e).startswith(
                    "This solver needs samples of at least 2 classes in the data"
                )
                # this should only happen at the bootstrap phase, when we have no data yet and use the ground truth to build the model
                X_train = numpy.vstack([X_train, [0, 0]])
                y_train.append(0)
                classifier.fit(X_train, y_train)

            cached_file.parent.mkdir(exist_ok=True, parents=True)
            serialize((scaler, classifier), cached_file)
            return scaler, classifier

        return deserialize(cached_file)
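
One detail worth keeping in mind with the code above: scikit-learn's StandardScaler.transform does not modify its argument in place but returns a new array, so the scaled features must be assigned back and the same fitted scaler applied to any features given to the classifier later. A minimal illustration:

import numpy
from sklearn.preprocessing import StandardScaler

X = numpy.array([[1.0, 10.0], [2.0, 20.0], [3.0, 30.0]])
scaler = StandardScaler().fit(X)   # learn per-column mean and standard deviation
X_scaled = scaler.transform(X)     # returns a new, standardized array; X itself is unchanged
# at prediction time, reuse the same scaler: classifier.predict(scaler.transform(X_new))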
Example 8
def clear_cache(dataset: str) -> None:
    # only clear the caches generated for particular sets of training models (stored as sub-directories)
    cache_dir = get_cache_dir(dataset)
    for item in cache_dir.iterdir():
        if item.is_dir():
            shutil.rmtree(item)
Example 9
def run_evaluation_workflow(dataset: str, scenario: Scenario, train_sms,
                            test_sms):
    ont: Ontology = get_ontology(dataset)
    karma_models: List[KarmaModel] = get_karma_models(dataset)
    semantic_models: List[SemanticModel] = get_semantic_models(dataset)
    train_sm_ids = [sm.id for sm in train_sms]

    sdesc_args = dict(
        dataset=dataset,
        train_sm_ids=train_sm_ids,
        # we always put semantic types into learnedSemanticTypes, even for userSetSemanticTypes
        use_correct_type=False,
        use_old_semantic_typer=False,
        exec_dir=get_cache_dir(dataset, train_sms) / "mohsen_jws2015",
        sm_type_dir=Path(config.fsys.debug.as_path()) / "tmp" / "models-json-temp")
    # STEP 1: run semantic typing and put the results into a temporary folder
    if sdesc_args['sm_type_dir'].exists():
        shutil.rmtree(sdesc_args['sm_type_dir'])
    sdesc_args['sm_type_dir'].mkdir(exist_ok=True, parents=True)

    top_k_types = Settings.get_instance().semantic_labeling_top_n_stypes
    typer = create_semantic_typer(dataset, train_sms)
    typer.semantic_labeling(train_sms, test_sms, top_k_types, eval_train=True)

    for sm, ksm in zip(semantic_models, karma_models):
        # assign semantic types to learnedSemanticTypes
        sm_alignment = SemanticModelAlignment(sm, ksm)
        for col in ksm.source_columns:
            attr = sm.get_attr_by_label(
                sm.graph.get_node_by_id(
                    sm_alignment.alignment[col.id]).label.decode('utf-8'))
            node = ksm.karma_graph.get_node_by_id(col.id)
            link = node.get_first_incoming_link()

            node.learned_semantic_types = [
                KarmaSemanticType(node.id, stype.domain, stype.type,
                                  typer.__class__.__name__,
                                  stype.confidence_score)
                for stype in attr.semantic_types
            ]
            node.user_semantic_types = [
                KarmaSemanticType(node.id,
                                  link.get_source_node().label.decode(),
                                  link.label.decode(), "User", 1.0)
            ]

        serializeJSON(ksm.to_normalized_json_model(ont),
                      sdesc_args['sm_type_dir'] / f"{ksm.id}-model.json",
                      indent=4)

    # STEP 2: invoke semantic modeling
    modeler = MohsenSemanticModeling(**sdesc_args)
    pred_sms = modeler.sm_prediction(train_sms, test_sms)

    # STEP 3: evaluate the predicted semantic mappings
    eval_hist = [["source", "precision", "recall", "f1", "stype-acc"]]
    if scenario == Scenario.SCENARIO_1:
        data_node_mode = DataNodeMode.IGNORE_DATA_NODE
    else:
        data_node_mode = DataNodeMode.NO_TOUCH

    for sm, pred_sm in zip(test_sms, pred_sms):
        eval_result = smodel_eval.f1_precision_recall(sm.graph, pred_sm.graph,
                                                      data_node_mode)
        eval_hist.append([
            sm.id, eval_result["precision"], eval_result["recall"],
            eval_result["f1"],
            smodel_eval.stype_acc(sm.graph, pred_sm.graph)
        ])

    eval_hist.append([
        'average',
        np.average([float(x[1]) for x in eval_hist[1:]]),
        np.average([float(x[2]) for x in eval_hist[1:]]),
        np.average([float(x[3]) for x in eval_hist[1:]]),
        np.average([float(x[4]) for x in eval_hist[1:]])
    ])
    serializeCSV(
        eval_hist,
        sdesc_args["exec_dir"] / f"evaluation_result_{scenario.value}.csv")

    # STEP 4: evaluate the predicted semantic labels
    pred_stypes = modeler.semantic_labeling(train_sms, test_sms)
    for pred_stype, sm in zip(pred_stypes, test_sms):
        for attr in sm.attrs:
            if attr.label not in pred_stype:
                attr.semantic_types = []
            else:
                attr.semantic_types = pred_stype[attr.label]
    eval_sources(
        test_sms, sdesc_args["exec_dir"] /
        f"evaluation_result_{scenario.value}_stype.csv")

    # STEP 5: visualize the predictions
    (sdesc_args['exec_dir'] / "prediction-viz").mkdir(exist_ok=True)
    need_render_graphs = [
        (colorize_prediction(
            pred_sm.graph,
            AutoLabel.auto_label_max_f1(sm.graph, pred_sm.graph, False)[0]),
         sdesc_args['exec_dir'] / "prediction-viz" / f"{sm.id}.png")
        for sm, pred_sm in zip(test_sms, pred_sms)
    ]
    with ThreadPool(32) as p:
        p.map(render_graph, need_render_graphs)

    return eval_hist
Example 10

if __name__ == '__main__':
    # HYPER-ARGS
    args = get_shell_args()

    settings = Settings.get_instance(False)
    settings.semantic_labeling_top_n_stypes = args.semantic_labeling_top_n_stypes
    settings.semantic_labeling_method = args.semantic_typer
    settings.log_current_settings()

    exp_dir = Path(args.exp_dir)
    assert exp_dir.exists()

    source_models = {sm.id: sm for sm in get_semantic_models(args.dataset)}
    train_sms = [source_models[sid] for sid in args.kfold['train_sm_ids']]
    test_sms = [source_models[sid] for sid in args.kfold['test_sm_ids']]

    eval_hist = run_evaluation_workflow(args.dataset, Scenario.SCENARIO_2,
                                        train_sms, test_sms)
    serializeCSV(eval_hist,
                 exp_dir / f"kfold-{get_short_train_name(train_sms)}.test.csv")
    serializeJSON(args,
                  exp_dir /
                  f"kfold-{get_short_train_name(train_sms)}.meta.json",
                  indent=4)
    shutil.move(
        get_cache_dir(args.dataset, train_sms) / "mohsen_jws2015",
        exp_dir / f"kfold-{get_short_train_name(train_sms)}")