Example #1
    def create(dataset: str, train_source_ids: List[str]) -> 'SemanticTypeDB':
        """Split the sampled tables of `dataset` into train/test column-based tables
        and build a SemanticTypeDB from them."""
        tables = get_sampled_data_tables(dataset)
        train_source_ids = set(train_source_ids)

        train_tables = [
            ColumnBasedTable.from_table(tbl) for tbl in tables
            if tbl.id in train_source_ids
        ]
        test_tables = [
            ColumnBasedTable.from_table(tbl) for tbl in tables
            if tbl.id not in train_source_ids
        ]

        return SemanticTypeDB(dataset, train_tables, test_tables)
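For reference, a minimal call sketch of the factory above. The dataset name, the train/test split, and the import of get_semantic_models (taken from the semantic_modeling.data_io layout shown in Example #4) are assumptions; SemanticTypeDB itself is assumed to be importable from the surrounding module.

# Hypothetical usage sketch; dataset name and split are illustrative only.
from semantic_modeling.data_io import get_semantic_models

dataset = "museum_edm"
sms = get_semantic_models(dataset)
train_source_ids = [sm.id for sm in sms[:-1]]  # illustrative split: hold out the last source
stype_db = SemanticTypeDB.create(dataset, train_source_ids)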
Example #2
def get_data_constraint_model(
    dataset: str,
    train_sms: List[SemanticModel],
) -> DataConstraint:
    # Build the model once and cache it as a module-level singleton.
    global _instance
    if _instance is None:
        cache_file = get_cache_dir(
            dataset, train_sms) / "weak_models" / "data_constraint.pkl"
        cache_file.parent.mkdir(exist_ok=True, parents=True)
        need_rebuilt = True

        settings = Settings.get_instance()
        valid_threshold = settings.data_constraint_valid_threshold
        guess_datetime_threshold = settings.data_constraint_guess_datetime_threshold
        n_comparison_samples = settings.data_constraint_n_comparison_samples
        random_seed = settings.random_seed
        n_sample = settings.n_samples

        if cache_file.exists():
            DataConstraint.logger.debug("Try to load previous run...")
            model, cached_dataset, cached_train_sm_ids, extra_args = deserialize(
                cache_file)
            if cached_dataset == dataset \
                    and cached_train_sm_ids == {sm.id for sm in train_sms} \
                    and extra_args == (
                        valid_threshold, guess_datetime_threshold, n_comparison_samples,
                        random_seed, n_sample):
                need_rebuilt = False

        if need_rebuilt:
            DataConstraint.logger.debug("Re-build data-constraint model...")
            data_tables = [
                ColumnBasedTable.from_table(tbl)
                for tbl in get_sampled_data_tables(dataset)
            ]
            model = DataConstraint(train_sms, data_tables, valid_threshold,
                                   guess_datetime_threshold,
                                   n_comparison_samples)
            serialize((model, dataset, {sm.id for sm in train_sms},
                       (valid_threshold, guess_datetime_threshold,
                        n_comparison_samples, random_seed, n_sample)),
                      cache_file)

        _instance = model
    return _instance
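A minimal call sketch for the cached accessor above. The import path follows the semantic_modeling.data_io helpers from Example #4, and the dataset name is a placeholder.

# Hypothetical usage sketch; module path and dataset name are assumptions.
from semantic_modeling.data_io import get_semantic_models

dataset = "museum_edm"
train_sms = get_semantic_models(dataset)
data_constraint = get_data_constraint_model(dataset, train_sms)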
Example #3
    # print(DataTable.load_from_rows("", flatten_rows).to_string())
    # Flatten the records into a header row plus one row of values per record.
    keys = list(flatten_rows[0].keys())
    values = [[r[k] for k in keys] for r in flatten_rows]

    serializeCSV([keys] + values, serene_data_dir / f"{sm.id}.csv")
    # create ssds
    ssd = make_ssd(sm, set(keys), ont)
    serializeJSON(ssd.to_dict(), serene_sm_dir / f"{sm.id}.ssd", indent=4)
    # ssd.graph.render(80)


if __name__ == '__main__':
    dataset = "museum_edm"
    ont = get_ontology(dataset)
    # serene_dir = Path(config.as_path()) / "debug" / dataset / "serene"
    serene_dir = Path(
        "/workspace/tmp/serene-python-client/datasets/") / dataset
    serene_data_dir = serene_dir / "dataset"
    serene_sm_dir = serene_dir / "ssd"

    serene_data_dir.mkdir(exist_ok=True, parents=True)
    serene_sm_dir.mkdir(exist_ok=True)

    sms = get_semantic_models(dataset)
    tables = get_sampled_data_tables(dataset)

    for sm, tbl in zip(sms, tables):
        # if not sm.id.startswith("s07"):
        #     continue
        make_dataset(sm, tbl, ont, serene_data_dir, serene_sm_dir)
        # break
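For context, a minimal sketch of how the header/values structure built above maps onto a CSV file. It uses the standard csv module and is only an assumption about what the project's serializeCSV helper roughly does; the function name is hypothetical.

# Illustrative only: an assumed, simplified stand-in for serializeCSV.
import csv
from pathlib import Path

def write_rows_as_csv(rows, outfile: Path) -> None:
    # rows is [keys] + values: a header row followed by one list of values per record
    with outfile.open("w", newline="") as f:
        csv.writer(f).writerows(rows)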
Example #4
#!/usr/bin/python
# -*- coding: utf-8 -*-
import ujson
from pathlib import Path
from typing import Dict, Tuple, List, Set, Union, Optional, Any

from semantic_modeling.config import config
from semantic_modeling.data_io import get_data_tables, get_raw_data_tables, get_semantic_models, get_ontology, \
    get_sampled_data_tables
from semantic_modeling.utilities.serializable import serializeJSON
from transformation.r2rml.commands.modeling import SetInternalLinkCmd, SetSemanticTypeCmd
from transformation.r2rml.r2rml import R2RML

dataset = "museum_crm"
ont = get_ontology(dataset)

source_dir = Path(
    config.datasets[dataset].as_path()) / "karma-version" / "sources"
source_dir.mkdir(exist_ok=True, parents=True)
# Dump each sampled table as a JSON source file for Karma.
for tbl in get_sampled_data_tables(dataset):
    serializeJSON(tbl.rows, source_dir / f"{tbl.id}.json", indent=4)
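The script imports ujson but writes files through the project's serializeJSON helper. As an assumption only, that helper behaves roughly like the following sketch; the function name here is hypothetical.

# Assumed, simplified stand-in for serializeJSON (illustrative only).
import ujson
from pathlib import Path
from typing import Any

def dump_json(obj: Any, outfile: Path, indent: int = 4) -> None:
    with outfile.open("w") as f:
        ujson.dump(obj, f, indent=indent)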
Example #5
    # `settings` and `dataset` are defined earlier in the script (omitted from this excerpt).
    source_dir = Path(
        config.datasets[dataset].as_path()) / "karma-version" / "sources"
    source_dir.mkdir(exist_ok=True, parents=True)
    meta_file = source_dir / ".meta"

    if meta_file.exists():
        meta = deserializeJSON(meta_file)

        if meta['n_samples'] == settings.n_samples \
                and meta['random_seed'] == settings.random_seed:
            print(
                "Karma sources were already generated with the same configuration; skipping. Terminating..."
            )
            exit(0)

    print(f"Generate karma sources for dataset: {dataset}")
    serializeJSON(
        {
            'n_samples': settings.n_samples,
            'random_seed': settings.random_seed
        },
        meta_file,
        indent=4)

    model_dir = Path(config.datasets[dataset].models_y2rml.as_path())
    # clear cache file
    clear_sampled_data_tables(dataset)

    for tbl, sm in zip(get_sampled_data_tables(dataset),
                       get_semantic_models(dataset)):
        serializeJSON(tbl.rows, source_dir / f"{tbl.id}.json", indent=4)
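The .meta marker above implements a simple "skip if already generated with the same configuration" guard. A minimal standalone sketch of the same pattern, with hypothetical names and the stdlib json module in place of the project's deserializeJSON helper, looks like this.

# Minimal sketch of the marker-file guard (all names are hypothetical).
import json
from pathlib import Path

def needs_regeneration(meta_file: Path, n_samples: int, random_seed: int) -> bool:
    # Regenerate unless a marker recording the same sampling configuration exists.
    if not meta_file.exists():
        return True
    meta = json.loads(meta_file.read_text())
    return meta.get("n_samples") != n_samples or meta.get("random_seed") != random_seed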