Example no. 1
    def __init__(self, dataset: str, train_sms: List[SemanticModel]) -> None:
        input_file = Path(config.datasets[dataset].karma_version.as_path()) / "semantic-types" / f"{get_short_train_name(train_sms)}.json"
        if not input_file.exists():
            compute_mohsen_stypes(dataset, train_sms)

        self.stypes = deserializeJSON(input_file)
        self.train_source_ids = {sm.id for sm in train_sms}
Example no. 2
def get_semantic_models(dataset: str) -> List[SemanticModel]:
    """Get list of semantic models of a given dataset"""
    global _data_io_vars

    if dataset not in _data_io_vars["semantic_models"]:
        # check whether the semantic models have been cached on disk
        cache_file = get_cache_dir(dataset) / 'semantic_models.json'
        if cache_file.exists():
            semantic_models = deserializeJSON(cache_file, Class=SemanticModel)
        else:
            mapping_dir = Path(config.datasets[dataset].models_y2rml.as_path())
            R2RML.load_python_scripts(Path(config.datasets[dataset].python_code.as_path()))
            raw_tables = get_raw_data_tables(dataset)
            semantic_models = []
            tables = []
            for i, raw_tbl in enumerate(raw_tables):
                r2rml_file = mapping_dir / f"{raw_tbl.id}-model.yml"
                tbl, sm = R2RML.load_from_file(r2rml_file).apply_build(raw_tbl)
                semantic_models.append(sm)
                tables.append(tbl)

            serializeJSON(semantic_models, cache_file)
            _data_io_vars["data_tables"][dataset] = tables

        _data_io_vars["semantic_models"][dataset] = semantic_models

    return _data_io_vars["semantic_models"][dataset]
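A minimal usage sketch for get_semantic_models (assumption: the "museum_edm" dataset from Example no. 15 is configured): the first call builds the models from the R2RML mappings and caches them, later calls reuse the in-memory copy.

# hypothetical usage; the dataset name is borrowed from Example no. 15
semantic_models = get_semantic_models("museum_edm")
print(len(semantic_models), semantic_models[0].id)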
Example no. 3
def load_data(dataset):
    data_dir = Path(config.fsys.debug.as_path()) / dataset / "training_workflow" / "examples_generator" / "i0"
    # train_examples: List[Example] = deserializeJSON(data_dir / "train.small.json", Class=Example)
    # test_examples: List[Example] = deserializeJSON(data_dir / "test.small.json", Class=Example)

    train_examples: List[Example] = deserializeJSON(data_dir / "train.json",
                                                    Class=Example)
    test_examples: List[Example] = deserializeJSON(data_dir / "test.json",
                                                   Class=Example)

    # TODO: uncomment below to create small dataset to debug
    # train_examples = [e for e in train_examples if e.model_id.startswith('s03')]
    # train_examples = train_examples[:100]
    # test_examples = test_examples[:100]
    # serializeJSON(train_examples, data_dir / "train.small.json")
    # serializeJSON(test_examples, data_dir / "test.small.json")

    return train_examples, test_examples
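A hedged usage sketch for load_data, assuming the examples_generator output (train.json and test.json) already exists under the debug directory for the chosen dataset; the dataset name below is illustrative.

# hypothetical usage
train_examples, test_examples = load_data("museum_edm")
print(len(train_examples), len(test_examples))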
Example no. 4
    def get_instance(dataset: str, train_sms: List[SemanticModel]):
        if PrimaryKey.instance is None:
            cache_file = get_cache_dir(
                dataset, train_sms) / "weak_models" / "primary_keys.json"
            if not cache_file.exists():
                train_sm_ids = {sm.id for sm in train_sms}
                train_tbls = {
                    tbl.id: tbl
                    for tbl in get_data_tables(dataset)
                    if tbl.id in train_sm_ids
                }
                predictions: Dict[str, List[dict]] = defaultdict(lambda: [])
                pesudo_primary_keys = {}

                for sm in train_sms:
                    jsonld_objects = jsonld_generator(sm, train_tbls[sm.id])
                    for n in sm.graph.iter_class_nodes():
                        fields = [
                            e.label.decode("utf-8")
                            for e in n.iter_outgoing_links()
                            if e.get_target_node().is_data_node()
                        ]
                        if len(fields) == 0:
                            continue
                        if 'karma:classLink' in fields:
                            pesudo_primary_keys[n.label] = 'karma:classLink'
                            continue

                        results = extract_node_data(n, jsonld_objects)
                        views = create_unique_views(results, fields)
                        predictions[n.label].append(
                            predict_pesudo_keys(fields, views))

                for class_lbl, preds in predictions.items():
                    total = defaultdict(lambda: 0)
                    for pred in preds:
                        for link_lbl in pred:
                            total[link_lbl] += pred[link_lbl]
                    # choose the field with the highest aggregated vote as the pseudo primary key
                    pesudo_primary_keys[class_lbl] = max(total.items(),
                                                         key=lambda x: x[1])[0]

                PrimaryKey.instance = PrimaryKey({
                    k: v.encode('utf-8')
                    for k, v in pesudo_primary_keys.items()
                })
                cache_file.parent.mkdir(exist_ok=True, parents=True)
                serializeJSON(PrimaryKey.instance, cache_file, indent=4)
            else:
                PrimaryKey.instance: PrimaryKey = deserializeJSON(
                    cache_file, Class=PrimaryKey)

        return PrimaryKey.instance
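A minimal sketch of calling the accessor, assuming train_sms comes from get_semantic_models (Example no. 2); the first call computes and caches the pseudo primary keys, later calls return the cached singleton.

# hypothetical usage; the dataset name is illustrative
dataset = "museum_edm"
train_sms = get_semantic_models(dataset)
primary_key = PrimaryKey.get_instance(dataset, train_sms)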
Example no. 5
def get_ont_graph(dataset: str) -> OntGraph:
    global _ont_graph_vars

    if dataset not in _ont_graph_vars:
        # if it hasn't been cached
        cache_file = Path(config.fsys.debug.as_path() +
                          f'/{dataset}/cached/ont_graph.json')
        cache_file.parent.mkdir(exist_ok=True, parents=True)
        if cache_file.exists():
            ont_graph = deserializeJSON(cache_file, Class=OntGraph)
        else:
            ont_graph: OntGraph = build_ont_graph(dataset)
            serializeJSON(ont_graph, cache_file)

        _ont_graph_vars[dataset] = ont_graph

    return _ont_graph_vars[dataset]
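A short usage sketch for get_ont_graph (the dataset name is illustrative); the ontology graph is built once, serialized to the cache file, and afterwards served from memory.

# hypothetical usage
ont_graph = get_ont_graph("museum_edm")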
Example no. 6
def make_test_from_prediction(train_sms: List[SemanticModel],
                              evaluate_sms: List[SemanticModel], workdir: Path,
                              model_dir: Path):
    search_history: Dict[str, List[List[dict]]] = deserializeJSON(
        model_dir / "search_history.json")
    evaluate_sms = {sm.id: sm for sm in evaluate_sms}
    train_sm_ids = [sm.id for sm in train_sms]

    test_examples = []
    for sid in search_history:
        for i, gs in enumerate(search_history[sid]):
            for j, g in enumerate(gs):
                eid = Example.generate_example_id(sid, j, i)
                example = make_example(evaluate_sms[sid], Graph.from_dict(g),
                                       eid, train_sm_ids)
                test_examples.append(example)

    serializeJSON(test_examples, workdir / "examples" / "test.json")
    return test_examples
Example no. 7
def evaluate_serene_outputs(
        files: List[Path],
        ont: Ontology,
        gold_sm: Optional[SemanticModel] = None) -> Union[dict, None]:
    try:
        cor_ssd_file = [
            file for file in files if file.name.endswith(".cor_ssd.json")
        ][0]
        ssd_file = [file for file in files
                    if file.name.endswith(".ssd.json")][0]
    except Exception as e:
        raise Exception("Invalid SERENE output files: %s" % files[0]) from e

    cor_ssd = SSD.from_file(cor_ssd_file, ont).clear_serene_footprint()
    ssd = SSD.from_file(ssd_file, ont)
    chuffed_ssds = []
    for file in files:
        if file.name.find(".chuffed") != -1:
            objs = deserializeJSON(file)
            chuffed_ssds.append([SSD.from_json(obj, ont) for obj in objs])

    if gold_sm is None:
        # SERENE can filter the cor_ssd graph to remove new-semantic types
        gold_graph = cor_ssd.graph
    else:
        gold_graph = gold_sm.graph

    eval_results = {}
    for chuffed_idx, ssds in enumerate(chuffed_ssds):
        eval_results[chuffed_idx] = {}

        if len(ssds) == 0:
            eval_results[chuffed_idx] = {'precision': 0, 'recall': 0, 'f1': 0}
        else:
            ssd = ssds[0]
            # ssd.graph.render()
            result = smodel_eval.f1_precision_recall(gold_graph, ssd.graph,
                                                     DataNodeMode.NO_TOUCH)
            eval_results[chuffed_idx]['precision'] = result['precision']
            eval_results[chuffed_idx]['recall'] = result['recall']
            eval_results[chuffed_idx]['f1'] = result['f1']

    return eval_results
Example no. 8
def get_karma_models(dataset: str) -> List[KarmaModel]:
    """Get list of json models of a given dataset"""
    global _data_io_vars

    if dataset not in _data_io_vars["karma_models"]:
        # check whether the karma models have been cached on disk
        cache_file = get_cache_dir(dataset) / 'karma_models.json'
        if cache_file.exists():
            karma_models = deserializeJSON(cache_file, Class=KarmaModel)
        else:
            karma_models = []
            model_dir = Path(config.datasets[dataset].karma_version.as_path()) / "models-json"
            ont = get_ontology(dataset)
            for file in sorted(model_dir.iterdir()):
                if file.name.endswith(".json"):
                    karma_models.append(KarmaModel.load_from_file(ont, file))
            serializeJSON(karma_models, cache_file)
        _data_io_vars["karma_models"][dataset] = karma_models

    return _data_io_vars["karma_models"][dataset]
Example no. 9
def draw_graph(same_dir: bool):
    finput = Path("/tmp/sm_debugging/draw_graphs.json")
    input = deserializeJSON(finput)
    new_id = -1
    for item in finput.parent.iterdir():
        match = re.match(r"draw_graph_(\d+)$", item.name)
        if match is not None:
            if int(match.groups()[0]) > new_id:
                new_id = int(match.groups()[0])

    if not same_dir:
        new_id += 1

    output = finput.parent / f"draw_graph_{new_id}"
    output.mkdir(exist_ok=True)

    n_graphs = len(list(output.iterdir()))
    graphs = [Graph.from_dict(o) for o in input["graphs"]]
    with ThreadPool() as p:
        p.map(lambda ig: ig[1].render2img(output / f"graph_{ig[0]}.png"),
              enumerate(graphs, start=n_graphs))
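A hedged usage sketch, assuming the caller has already written a {"graphs": [...]} payload to /tmp/sm_debugging/draw_graphs.json.

# hypothetical usage: render every graph in the payload into a fresh draw_graph_<n> directory
draw_graph(same_dir=False)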
Example no. 10
    with ThreadPool() as p:
        p.map(
            lambda igs: igs[1].render2pdf(
                output_dir / igs[2] / f"example_no_{igs[0]}.pdf"),
            render_graphs)


if __name__ == "__main__":
    # Load all input
    dataset = sys.argv[1]
    workdir = Path(sys.argv[2])
    train_or_test_file = workdir / sys.argv[3]

    assert workdir.exists()
    assert train_or_test_file.exists()

    semantic_models = {sm.id: sm for sm in get_semantic_models(dataset)}
    timer = pyutils.progress.Timer().start()

    examples = deserializeJSON(train_or_test_file)
    # for example, map_example in zip(train_examples, train_map_examples):
    #     example["map_link2label"] = map_example

    if 'train' in str(train_or_test_file).lower():
        render_examples(examples, train_or_test_file.parent / "train_viz")
    elif 'test' in str(train_or_test_file).lower():
        render_examples(examples, train_or_test_file.parent / "test_viz")
    else:
        print("Cannot detect type is train or test. Exit!!")
        exit(0)
    print("Render examples: %s" % timer.lap().get_total_time(), flush=True)
Example no. 11
    args = get_shell_args()
    dataset = args.dataset

    settings = Settings.get_instance(False)
    settings.n_samples = args.n_samples
    settings.random_seed = args.seed
    settings.log_current_settings()

    ont = get_ontology(dataset)
    source_dir = Path(
        config.datasets[dataset].as_path()) / "karma-version" / "sources"
    source_dir.mkdir(exist_ok=True, parents=True)
    meta_file = source_dir / ".meta"

    if meta_file.exists():
        meta = deserializeJSON(meta_file)

        if meta['n_samples'] == settings.n_samples and meta[
                'random_seed'] == settings.random_seed:
            print(
                "Karma sources were already generated with the same configuration. Terminating..."
            )
            exit(0)

    print(f"Generate karma sources for dataset: {dataset}")
    serializeJSON(
        {
            'n_samples': settings.n_samples,
            'random_seed': settings.random_seed
        },
        meta_file,
Example no. 12
    def from_file(file: Union[str, Path], ont: Ontology) -> 'SSD':
        content = deserializeJSON(file)
        return SSD.from_json(content, ont)
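A minimal sketch of loading an SSD, assuming ont was obtained via get_ontology(dataset) as in Example no. 8 and the *.ssd.json naming seen in Example no. 7; the file path is illustrative.

# hypothetical usage
ssd = SSD.from_file(Path("/tmp/serene/s01.ssd.json"), ont)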
Example no. 13
    #     if frame.f_code.co_filename.startswith("/Users/rook/workspace/DataIntegration/SourceModeling/"):
    #         print("%s, %s:%d" % (event, frame.f_code.co_filename, frame.f_lineno))
    #     else:
    #         print(".", end="")
    #     return trace
    #
    # args = {'children_uris': set(),
    #     'parents_uris': set(),
    #     'uri': 'http://www.w3.org/2000/01/rdf-schema#Resource'}
    # # OntGraphNode("haha", set(), set())
    # a = OntGraphNode(**args)
    # print("==========")
    # # print(a.uri)
    # # sys.settrace(trace)
    ont_graph: OntGraph = deserializeJSON(
        config.fsys.debug.as_path() + '/%s/cached/ont_graph.json' % dataset,
        OntGraph)
    ont: Ontology = deserialize(config.fsys.debug.as_path() +
                                '/%s/cached/ont.pkl' % dataset)
    # print(a.uri)
    # print("========SIGSEGV IN DEBUG MODE==")

    # ont = Ontology.from_data_source(data_source)
    # ont_graph = build_ont_graph(data_source)
    #
    # # %%
    #
    # # ont_graph.render2txt(config.fsys.debug.as_path() + '/%s/ont_graph.txt' % data_source)
    #
    # # %%
    s1 = ont.full_uri('crm:E63_Beginning_of_Existence')
Example no. 14
    def _semantic_labeling(
            self, train_source_ids: Set[str], test_source_ids: Set[str]
    ) -> Dict[str, MinhptxSemanticLabelingResult]:
        """Generate semantic labeling for test_sources using train_sources"""
        need_reexec = True

        if Path(self.meta_file).exists():
            # read meta and compare if previous run is compatible with current run
            self.logger.debug("Load information from previous run...")

            meta = deserializeJSON(self.meta_file)
            meta["training_sources"] = set(meta["training_sources"])
            meta["testing_sources"] = set(meta["testing_sources"])
            meta["source_ids"] = set(meta['source_ids'])

            new_meta = self.get_meta(train_source_ids, test_source_ids)
            if len(
                    new_meta.pop("testing_sources").difference(
                        meta.pop("testing_sources"))) == 0:
                if new_meta == meta:
                    need_reexec = False

        if need_reexec:
            self.logger.debug("Re-execute semantic labeling...")

            try:
                # prepare the data; we want to compute semantic models for all sources in the dataset
                data_dir = Path(config.datasets[self.dataset].data.as_path())
                model_dir = Path(
                    config.datasets[self.dataset].models_json.as_path())

                shutil.rmtree(str(self.input_dir))
                for fpath in self.output_dir.iterdir():
                    os.remove(fpath)
                # create the input directory layout for train/test data and models
                for x in ["%s_train" % self.dataset, "%s_test" % self.dataset]:
                    for y in ["data", "model"]:
                        (self.input_dir / x / y).mkdir(parents=True, exist_ok=True)

                input_train_dir = self.input_dir / ("%s_train" % self.dataset)
                input_test_dir = self.input_dir / ("%s_test" % self.dataset)

                for fpath in sorted(data_dir.iterdir()):
                    model_fname = fpath.stem + "-model.json"
                    if fpath.stem in train_source_ids:
                        self._copy_data(fpath,
                                        input_train_dir / "data" / fpath.name)
                        # serialize the model instead of copying it because we want to convert the URI
                        # to a simplified URI (e.g. karma:classLink); the full URI doesn't work in this app
                        serializeJSON(KarmaModel.load_from_file(
                            self.ont, model_dir /
                            model_fname).to_normalized_json_model(),
                                      input_train_dir / "model" /
                                      f"{fpath.name}.model.json",
                                      indent=4)

                    if fpath.stem in test_source_ids:
                        self._copy_data(fpath,
                                        input_test_dir / "data" / fpath.name)
                        # same reason as above
                        serializeJSON(KarmaModel.load_from_file(
                            self.ont, model_dir /
                            model_fname).to_normalized_json_model(),
                                      input_test_dir / "model" /
                                      f"{fpath.name}.model.json",
                                      indent=4)

                invoke_command(" ".join([
                    config.previous_works.minhptx_iswc2016.cli.as_path(),
                    str(self.input_dir),
                    str(self.output_dir), "--train_dataset",
                    "%s_train" % self.dataset, "--test_dataset",
                    "%s_test" % self.dataset, "--evaluate_train_set", "True",
                    "--reuse_rf_model", "False"
                ]),
                               output2file=self.exec_dir / "execution.log")
            except Exception:
                sys.stdout.flush()
                self.logger.exception(
                    "Error while preparing and invoking semantic labeling api..."
                )
                raise

            serializeJSON(self.get_meta(train_source_ids, test_source_ids),
                          self.meta_file,
                          indent=4)

        # load result
        self.logger.debug("Load previous result...")
        output_files = [
            fpath for fpath in self.output_dir.iterdir()
            if fpath.suffix == ".json"
        ]
        assert len(output_files) == 2
        app_result: Dict[str, MinhptxSemanticLabelingResult] = deserializeJSON(
            output_files[0], Class=MinhptxSemanticLabelingResult)
        app_result.update(
            deserializeJSON(output_files[1],
                            Class=MinhptxSemanticLabelingResult))

        return {
            source_id: app_result[source_id]
            for source_id in chain(test_source_ids, train_source_ids)
        }
Example no. 15
#!/usr/bin/python
# -*- coding: utf-8 -*-
import os
from pathlib import Path
from typing import Dict, Tuple, List, Set, Union, Optional, Any

from semantic_modeling.config import config
from semantic_modeling.utilities.serializable import deserializeJSON, serializeJSON
"""Usually run after generate r2rml and copied from KARMA_HOME"""

dataset = "museum_edm"
model_dir = Path(
    config.datasets[dataset].karma_version.as_path()) / "models-json"
for file in sorted(model_dir.iterdir()):
    sm = deserializeJSON(file)
    sm['id'] = Path(sm['id']).stem
    sm['name'] = sm['id']

    serializeJSON(sm, model_dir / f"{sm['id']}-model.json", indent=4)
    os.remove(file)
Example no. 16
    Settings.get_instance().parallel_gmtk_n_threads = 6
    Settings.get_instance().parallel_n_process = 2
    training_args = TrainingArgs.parse_shell_args()

    # model = create_default_model(dataset, train_sms, training_args, workdir / "models")
    # model = Model.from_file(dataset, workdir / "models" / "exp_no_0")
    # model = online_learning(model, dataset, train_sms, train_sms, workdir, training_args, iter_range=(1, 3))

    # model = Model.from_file(dataset, workdir / "models" / "exp_no_2")
    # build_test_data(model, dataset, train_sms, test_sms, workdir, 2)

    # predictions = predict_sm(model, dataset, [sm.id for sm in train_sms], test_sms, model_dir)
    # evaluate(test_sms, predictions, model_dir)
    #

    train_examples = deserializeJSON(workdir / "examples" / "train.2.json",
                                     Class=Example)
    test_examples = deserializeJSON(workdir / "examples" / "test.json",
                                    Class=Example)
    # test_examples = train_examples
    args = TrainingArgs.parse_shell_args()
    args.parallel_training = True
    args.n_switch = 19
    args.n_epoch = 22
    args.mini_batch_size = 200
    args.shuffle_mini_batch = True
    # args.n_iter_eval = 50
    # args.optparams = {"lr": 0.005, "amsgrad": True}
    # args.optimizer = 'LBFGS'
    # args.optparams = {"lr": 0.1}
    args.optparams = {"lr": 0.1, "amsgrad": True}
    model_bundle = train_model(dataset, [sm.id for sm in train_sms], 120,
Example no. 17
            numpy.average([x['f1'] for x in average_result])))


if __name__ == "__main__":
    if len(sys.argv) > 1:
        workdir = Path(sys.argv[1])
    else:
        workdir = Path("/workspace/semantic-modeling/debug/museum_crm/run3/")

    kfold_dirs = [
        dpath for dpath in workdir.iterdir() if dpath.name.startswith("kfold")
    ]
    for kfold_dir in kfold_dirs:
        if not kfold_dir.is_dir():
            continue

        rust_input = deserializeJSON(kfold_dir / "rust-input.json")
        dataset = rust_input['dataset']
        semantic_models = get_semantic_models(dataset)
        train_sms = [semantic_models[i] for i in rust_input['train_sm_idxs']]
        test_sms = [semantic_models[i] for i in rust_input['test_sm_idxs']]

        ranker = Ranking(train_sms, test_sms)
        predictions = [
            Prediction(obj)
            for obj in deserializeJSON(kfold_dir / "rust" / "prediction.json")
        ]

        print(kfold_dir.name)
        ranker.rank(predictions)
Example no. 18
def print_cooccurrence(features_file_content: dict, output_file: Path):
    serializeJSON(features_file_content['cooccurrence'], output_file, indent=4)


if __name__ == "__main__":
    if len(sys.argv) > 1:
        workdir = Path(sys.argv[1])
    else:
        workdir = Path("/workspace/semantic-modeling/debug/museum_crm/run")

    for kfold_dir in workdir.iterdir():
        if kfold_dir.name.startswith("kfold") and kfold_dir.is_dir():
            input = kfold_dir / "rust-input.json"
            output_dir = kfold_dir / "features"
            output_dir.mkdir(exist_ok=True)

            with open(input, "r") as f:
                input = ujson.load(f)

            print_primary_keys(input, output_dir / "pk.txt")
            print_stypes(input, output_dir / "stypes.txt")

            if (kfold_dir / "rust" / "examples.debug.features.json").exists():
                features = deserializeJSON(kfold_dir / "rust" /
                                           "examples.debug.features.json")
                print_triple_features(features,
                                      output_dir / "triple_features.train.csv",
                                      output_dir / "triple_features.test.csv")
                print_cooccurrence(features, output_dir / "cooccurrence.json")