Example #1
def compute_mohsen_stypes(dataset: str, train_sms: List[SemanticModel]):
    sms = get_semantic_models(dataset)
    train_sm_ids = [sm.id for sm in train_sms]

    exec_dir = Path(config.fsys.debug.as_path()) / "tmp" / f"mohsen-styper-{get_short_train_name(train_sms)}"
    if exec_dir.exists():
        shutil.rmtree(exec_dir)
    exec_dir.mkdir(exist_ok=True, parents=True)

    semantic_types = {}

    # parallelize across data sources to save time
    # with ThreadPool(os.cpu_count() // 2) as pool:
    with ThreadPool(6) as pool:
        results = {}
        # because Karma re-learns semantic types for every data source, we spawn one task per source
        for sm in sms:
            if sm.id in train_sm_ids:
                local_train_sms = [s for s in train_sms if s.id != sm.id]
            else:
                local_train_sms = train_sms

            local_exec_dir = exec_dir / sm.id
            local_exec_dir.mkdir(exist_ok=True)

            results[sm.id] = pool.apply_async(worker_get_stype, (dataset, local_train_sms, sm, local_exec_dir))

        for sid, result in results.items():
            semantic_types[sid] = result.get()

    output_dir = Path(config.datasets[dataset].karma_version.as_path()) / "semantic-types"
    output_dir.mkdir(exist_ok=True)
    serializeJSON(semantic_types, output_dir / f"{get_short_train_name(train_sms)}.json", indent=4)
    return semantic_types
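
A minimal usage sketch (hedged: the dataset name and the train split are illustrative, and it assumes get_semantic_models and this module's helpers are importable):

sms = get_semantic_models("museum_edm")
stypes = compute_mohsen_stypes("museum_edm", sms[:14])
# stypes maps each source id to its learned semantic types; sources that are
# themselves in the train set are handled leave-one-out, as the loop above shows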
Example #2
def save_evaluation_result(
        map_and_nll_examples: Iterable[Tuple[MAPAssignmentExample,
                                             NegativeLogLikelihoodExample]],
        fpath: Union[Path, str]) -> None:
    outputs = []
    confusion_matrices = []  # collected for inspection; not serialized below

    for map_example, nll_example in map_and_nll_examples:
        real_example: Example = nll_example.variables[0].triple.example
        map_assignment: Dict[
            TripleLabel,
            BinaryVectorValue[bool]] = map_example.get_map_assignment()

        link2labels = {}
        for var, val in map_assignment.items():
            link2labels[var.triple.link.id] = val.val

        desired_assignment = {
            var: var.domain.encode_value(True)
            for var in nll_example.variables
        }
        log_prob = sum(
            f.score_assignment(desired_assignment)
            for f in nll_example.factors) - nll_example.inference.logZ()

        output = OutputExample(real_example.example_id, link2labels, log_prob)
        outputs.append(output)
        confusion_matrices.append(
            Evaluation.get_confusion_matrix(map_assignment,
                                            nll_example.target_assignment))

    serializeJSON(outputs, fpath)
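
Reading the file back should follow the project's usual Class= round-trip (a hedged sketch: it assumes OutputExample deserializes the way SemanticModel does elsewhere in this codebase, and that its attributes mirror the constructor arguments):

outputs = deserializeJSON(fpath, Class=OutputExample)
for out in outputs:
    print(out.example_id, out.log_prob)  # attribute names assumed, not verified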
Example #3
def build_test_data(model: Model, dataset: str, train_sms: List[SemanticModel],
                    discover_sources: List[SemanticModel], output_dir: Path,
                    n_iter):
    data: Dict[str, Dict[bytes, Example]] = {sm.id: {} for sm in discover_sources}
    discover_sids = {sm.id for sm in discover_sources}
    (output_dir / "examples").mkdir(exist_ok=True, parents=True)

    # by default, each source starts with its ground-truth model as an example
    for sm in discover_sources:
        data[sm.id][graph_to_hashable_string(sm.graph)] = make_example(
            sm, sm.graph, Example.generate_example_id(sm.id, 0, 0),
            [s.id for s in train_sms])

    new_data = generate_data(model, dataset, train_sms, discover_sources, 1)
    for sm in discover_sources:
        new_candidate_sms = [
            key for key in new_data[sm.id] if key not in data[sm.id]
        ]
        for key in new_candidate_sms:
            data[sm.id][key] = new_data[sm.id][key]

    test_examples = [
        example for sid in discover_sids for example in data[sid].values()
    ]
    test_examples.sort(key=lambda e: e.example_id)

    serializeJSON(test_examples,
                  output_dir / "examples" / f"test.{n_iter}.json")
Example #4
def make_dataset(sm: SemanticModel, tbl: DataTable, ont: Ontology,
                 serene_data_dir: Path, serene_sm_dir: Path):
    def cross_products(row: dict) -> Union[List, Dict]:
        single_fields = {}
        multi_fields = {}
        for key, val in row.items():
            if isinstance(val, dict):
                result = cross_products(val)
                if isinstance(result, dict):
                    single_fields[key] = result
                elif isinstance(result, list):
                    multi_fields[key] = result
                else:
                    raise Exception("Invalid result type: %s" % type(result))
            elif isinstance(val, list):
                multi_fields[key] = val
            else:
                single_fields[key] = val

        if len(multi_fields) == 0:
            return single_fields

        rows = []
        keys, field_values = list(zip(*multi_fields.items()))
        for values in itertools.product(*field_values):
            row = copy(single_fields)
            for i, val in enumerate(values):
                row[keys[i]] = val
            rows.append(row)

        return rows

    def flatten_row(row: dict) -> dict:
        new_row = {}
        for key, val in row.items():
            if isinstance(val, dict):
                for k2, v2 in flatten_row(val).items():
                    new_row[f"{key}{Schema.PATH_DELIMITER}{k2}"] = v2
            else:
                new_row[key] = val
        return new_row

    # flatten a data table
    flatten_rows = []
    for row in tbl.rows:
        new_rows = cross_products(row)
        if isinstance(new_rows, dict):
            new_rows = [new_rows]

        for r in new_rows:
            flatten_rows.append(flatten_row(r))

    # print(DataTable.load_from_rows("", flatten_rows).to_string())
    keys = list(flatten_rows[0].keys())
    values = [[r[k] for k in keys] for r in flatten_rows]

    serializeCSV([keys] + values, serene_data_dir / f"{sm.id}.csv")
    # create ssds
    ssd = make_ssd(sm, set(keys), ont)
    serializeJSON(ssd.to_dict(), serene_sm_dir / f"{sm.id}.ssd", indent=4)
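
To make the recursion in cross_products concrete, here is a standalone sketch of the expansion it performs on list-valued fields (toy data; the nested-dict case is omitted for brevity):

import itertools
from copy import copy

def expand_row(row: dict) -> list:
    # split scalar fields from list-valued fields
    single = {k: v for k, v in row.items() if not isinstance(v, list)}
    multi = {k: v for k, v in row.items() if isinstance(v, list)}
    if not multi:
        return [single]
    keys, field_values = zip(*multi.items())
    rows = []
    # one output row per combination of list values
    for values in itertools.product(*field_values):
        new_row = copy(single)
        new_row.update(zip(keys, values))
        rows.append(new_row)
    return rows

print(expand_row({"museum": "met", "artist": ["a1", "a2"]}))
# [{'museum': 'met', 'artist': 'a1'}, {'museum': 'met', 'artist': 'a2'}]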
Example #5
def get_semantic_models(dataset: str) -> List[SemanticModel]:
    """Get list of semantic models of a given dataset"""
    global _data_io_vars

    if dataset not in _data_io_vars["semantic_models"]:
        # not in the in-memory cache; try the on-disk cache first
        cache_file = get_cache_dir(dataset) / 'semantic_models.json'
        if cache_file.exists():
            semantic_models = deserializeJSON(cache_file, Class=SemanticModel)
        else:
            mapping_dir = Path(config.datasets[dataset].models_y2rml.as_path())
            R2RML.load_python_scripts(Path(config.datasets[dataset].python_code.as_path()))
            raw_tables = get_raw_data_tables(dataset)
            semantic_models = []
            tables = []
            for raw_tbl in raw_tables:
                r2rml_file = mapping_dir / f"{raw_tbl.id}-model.yml"
                tbl, sm = R2RML.load_from_file(r2rml_file).apply_build(raw_tbl)
                semantic_models.append(sm)
                tables.append(tbl)

            serializeJSON(semantic_models, cache_file)
            _data_io_vars["data_tables"][dataset] = tables

        _data_io_vars["semantic_models"][dataset] = semantic_models

    return _data_io_vars["semantic_models"][dataset]
Example #6
def online_learning(model: Model,
                    dataset: str,
                    train_sms: List[SemanticModel],
                    discover_sources: List[SemanticModel],
                    output_dir: Path,
                    training_args,
                    iter_range=(1, 3)):
    data: Dict[str, Dict[bytes, Example]] = {sm.id: {} for sm in discover_sources}
    discover_sids = {sm.id for sm in discover_sources}
    # sources that stop yielding new candidates are excluded from further discovery
    ignore_sids = set()
    logger = get_logger("app")
    (output_dir / "examples").mkdir(exist_ok=True, parents=True)

    # by default, each source starts with its ground-truth model as an example
    for sm in discover_sources:
        data[sm.id][graph_to_hashable_string(sm.graph)] = make_example(
            sm, sm.graph, Example.generate_example_id(sm.id, 0, 0),
            [s.id for s in train_sms])

    for n_iter in range(*iter_range):
        logger.info("==================================> Iter: %s", n_iter)
        new_data = generate_data(model, dataset, train_sms, discover_sources,
                                 n_iter)
        for sm in discover_sources:
            if sm.id in ignore_sids:
                continue

            new_candidate_sms = [
                key for key in new_data[sm.id] if key not in data[sm.id]
            ]
            if len(new_candidate_sms) == 0:
                # no new candidate sms
                logger.info("No new candidate for source: %s", sm.id)
                ignore_sids.add(sm.id)
            else:
                for key in new_candidate_sms:
                    data[sm.id][key] = new_data[sm.id][key]

        train_examples = [
            example for sm in train_sms if sm.id in discover_sids
            for example in data[sm.id].values()
        ]
        train_examples.sort(key=lambda e: e.example_id)

        serializeJSON(train_examples,
                      output_dir / "examples" / f"train.{n_iter}.json")
        shutil.copyfile(output_dir / "examples" / f"train.{n_iter}.json",
                        output_dir / "examples" / f"train.json")

        raw_model, tf_domain, pairwise_domain, __ = train_model(
            dataset, [sm.id for sm in train_sms], 120, train_examples, [],
            training_args, output_dir / "models")
        model = Model(dataset, raw_model, tf_domain, pairwise_domain)

    return model
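
Note that iter_range is splatted into range(), so the default (1, 3) runs exactly two iterations, n_iter = 1 and n_iter = 2 (the upper bound is exclusive):

for n_iter in range(*(1, 3)):
    print(n_iter)  # prints 1, then 2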
Example #7
    def get_instance(dataset: str, train_sms: List[SemanticModel]):
        if PrimaryKey.instance is None:
            cache_file = get_cache_dir(
                dataset, train_sms) / "weak_models" / "primary_keys.json"
            if not cache_file.exists():
                train_sm_ids = {sm.id for sm in train_sms}
                train_tbls = {
                    tbl.id: tbl
                    for tbl in get_data_tables(dataset)
                    if tbl.id in train_sm_ids
                }
                predictions: Dict[str, List[dict]] = defaultdict(list)
                pseudo_primary_keys = {}

                for sm in train_sms:
                    jsonld_objects = jsonld_generator(sm, train_tbls[sm.id])
                    for n in sm.graph.iter_class_nodes():
                        fields = [
                            e.label.decode("utf-8")
                            for e in n.iter_outgoing_links()
                            if e.get_target_node().is_data_node()
                        ]
                        if len(fields) == 0:
                            continue
                        if 'karma:classLink' in fields:
                            pseudo_primary_keys[n.label] = 'karma:classLink'
                            continue

                        results = extract_node_data(n, jsonld_objects)
                        views = create_unique_views(results, fields)
                        predictions[n.label].append(
                            predict_pesudo_keys(fields, views))

                for class_lbl, preds in predictions.items():
                    total = defaultdict(float)
                    for pred in preds:
                        for link_lbl in pred:
                            total[link_lbl] += pred[link_lbl]
                    # pick the link with the highest accumulated score
                    pseudo_primary_keys[class_lbl] = max(total.items(),
                                                         key=lambda x: x[1])[0]

                PrimaryKey.instance = PrimaryKey({
                    k: v.encode('utf-8')
                    for k, v in pseudo_primary_keys.items()
                })
                cache_file.parent.mkdir(exist_ok=True, parents=True)
                serializeJSON(PrimaryKey.instance, cache_file, indent=4)
            else:
                PrimaryKey.instance: PrimaryKey = deserializeJSON(
                    cache_file, Class=PrimaryKey)

        return PrimaryKey.instance
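
The reduction over per-source predictions is a score-weighted vote: sum the scores per link label, then take the argmax. A self-contained toy version (scores are made up):

from collections import defaultdict

preds = [{"name": 0.9, "id": 0.2}, {"id": 0.8}, {"name": 0.7}]
total = defaultdict(float)
for pred in preds:
    for link_lbl, score in pred.items():
        total[link_lbl] += score
primary_key = max(total.items(), key=lambda x: x[1])[0]
print(primary_key)  # "name" (1.6 beats 1.0)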
Example #8
def get_ont_graph(dataset: str) -> OntGraph:
    global _ont_graph_vars

    if dataset not in _ont_graph_vars:
        # if it hasn't been cached
        cache_file = Path(
            config.fsys.debug.as_path()) / dataset / "cached" / "ont_graph.json"
        cache_file.parent.mkdir(exist_ok=True, parents=True)
        if cache_file.exists():
            ont_graph = deserializeJSON(cache_file, Class=OntGraph)
        else:
            ont_graph: OntGraph = build_ont_graph(dataset)
            serializeJSON(ont_graph, cache_file)

        _ont_graph_vars[dataset] = ont_graph

    return _ont_graph_vars[dataset]
Example #9
def make_test_from_prediction(train_sms: List[SemanticModel],
                              evaluate_sms: List[SemanticModel], workdir: Path,
                              model_dir: Path):
    search_history: Dict[str, List[List[dict]]] = deserializeJSON(
        model_dir / "search_history.json")
    evaluate_sms_by_id = {sm.id: sm for sm in evaluate_sms}
    train_sm_ids = [sm.id for sm in train_sms]

    test_examples = []
    for sid in search_history:
        for i, gs in enumerate(search_history[sid]):
            for j, g in enumerate(gs):
                eid = Example.generate_example_id(sid, j, i)
                example = make_example(evaluate_sms_by_id[sid],
                                       Graph.from_dict(g), eid, train_sm_ids)
                test_examples.append(example)

    serializeJSON(test_examples, workdir / "examples" / "test.json")
    return test_examples
Example #10
def get_karma_models(dataset: str) -> List[KarmaModel]:
    """Get list of json models of a given dataset"""
    global _data_io_vars

    if dataset not in _data_io_vars["karma_models"]:
        # not in the in-memory cache; try the on-disk cache first
        cache_file = get_cache_dir(dataset) / 'karma_models.json'
        if cache_file.exists():
            karma_models = deserializeJSON(cache_file, Class=KarmaModel)
        else:
            karma_models = []
            model_dir = Path(config.datasets[dataset].karma_version.as_path()) / "models-json"
            ont = get_ontology(dataset)
            for file in sorted(model_dir.iterdir()):
                if file.name.endswith(".json"):
                    karma_models.append(KarmaModel.load_from_file(ont, file))
            serializeJSON(karma_models, cache_file)
        _data_io_vars["karma_models"][dataset] = karma_models

    return _data_io_vars["karma_models"][dataset]
Example #11
def create_rust_input(dataset: str, scenario: Scenario, train_sms, test_sms):
    train_sm_ids = [sm.id for sm in train_sms]
    exec_dir = get_cache_dir(dataset, train_sms) / "mohsen_jws2015"
    modeler = MohsenSemanticModeling(
        dataset,
        False,
        False,
        train_sm_ids,
        exec_dir=exec_dir,
        sm_type_dir=Path(config.fsys.debug.as_path()) / "tmp" /
        "models-json-temp")

    candidate_smss = modeler.sm_candidate_generation(train_sms, test_sms)
    if scenario == Scenario.SCENARIO_1:
        data_node_mode = DataNodeMode.IGNORE_DATA_NODE
    else:
        data_node_mode = DataNodeMode.NO_TOUCH

    train_sm_ids = {sm.id for sm in train_sms}
    real_test_sm_ids = {sm.id for sm in test_sms if sm.id not in train_sm_ids}

    train_eval_hist = get_eval_hist(train_sm_ids, test_sms, candidate_smss,
                                    data_node_mode)
    test_eval_hist = get_eval_hist(real_test_sm_ids, test_sms, candidate_smss,
                                   data_node_mode)

    serializeCSV(
        train_eval_hist,
        exec_dir / f"evaluation_result_{scenario.value}.train.oracle.csv")
    serializeCSV(
        test_eval_hist,
        exec_dir / f"evaluation_result_{scenario.value}.test.oracle.csv")

    # now create the input for the Rust bridge
    obj = {}
    for gold_sm, candidate_sms in zip(test_sms, candidate_smss):
        obj[gold_sm.id] = [c.graph.to_dict() for c in candidate_sms]
    serializeJSON(obj, exec_dir / "rust-karma-pred-input.json")
Example #12
def serialize_rust_input(dataset: str, workdir: str,
                         train_sms: List[SemanticModel],
                         test_sms: List[SemanticModel], foutput: Path):
    primary_key = PrimaryKey.get_instance(dataset, train_sms)

    sms = get_semantic_models(dataset)
    sm_index = {sm.id: i for i, sm in enumerate(sms)}
    train_sm_idxs = [sm_index[sm.id] for sm in train_sms]
    test_sm_idxs = [sm_index[sm.id] for sm in test_sms]

    predicted_parent_stypes = serialize_stype_assistant(
        dataset, sms, train_sms, test_sms)
    cardinality = CardinalityFeatures.get_instance(dataset)
    semantic_labeling(dataset, train_sms, test_sms)  # invoked for its side effects; the result is unused here

    data = {
        "dataset": dataset,
        "workdir": str(workdir),
        "semantic_models": [sm.to_dict() for sm in sms],
        "predicted_parent_stypes": {
            "stype_details": predicted_parent_stypes
        },
        "train_sm_idxs": train_sm_idxs,
        "test_sm_idxs": test_sm_idxs,
        "feature_primary_keys": primary_key.to_dict(),
        "feature_cardinality_features": {
            sm_id: {
                "columns": matrix.columns,
                "matrix": matrix.matrix
            }
            for sm_id, matrix in cardinality.cardinality_matrices.items()
        },
        "ont_graph": serialize_ont_graph(dataset)
    }

    serializeJSON(data, foutput, indent=4)
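
A hedged invocation sketch (the dataset name, workdir, and split are hypothetical; it assumes the same data-io helpers used throughout this module):

sms = get_semantic_models("museum_edm")
serialize_rust_input("museum_edm", "/tmp/rust-workdir", sms[:14], sms[14:],
                     Path("/tmp/rust-workdir/input.json"))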
Example #13
def predict_sm(model: Model, dataset: str, train_sms: List[SemanticModel],
               evaluate_sms: List[SemanticModel], workdir):
    train_sids = [sm.id for sm in train_sms]
    predictions: Dict[str, Graph] = {}
    stat = Statistic.get_instance(train_sms)

    model_bundle = (model.dataset, model.model, model.tf_domain,
                    model.pairwise_domain)
    search_performance_history = {}
    search_history = {}

    with get_pool(Settings.get_instance().parallel_n_process) as pool:
        results = []
        for sm in evaluate_sms:
            result = pool.apply_async(
                generate_candidate_sm,
                (dataset, sm, stat, model_bundle, train_sids))
            results.append(result)

        # each result is (ranked (score, graph) candidates, search-performance rows, search history)
        pred_sms: Tuple[List[Tuple[float, Graph]],
                        List[Tuple[int, float, float, float, float]],
                        List[List[Graph]]]
        for sm, result in zip(evaluate_sms, results):
            pred_sms = result.get()
            predictions[sm.id] = pred_sms[0][0][1]
            search_performance_history[sm.id] = pred_sms[1]
            search_history[sm.id] = pred_sms[2]

    serializeJSON({sid: o.to_dict()
                   for sid, o in predictions.items()},
                  workdir / "predicted_sms.json")
    serializeJSON(search_performance_history,
                  workdir / "search_performance_history.json",
                  indent=4)
    serializeJSON(
        {
            sid: [[o.to_dict() for o in os] for os in oss]
            for sid, oss in search_history.items()
        }, workdir / "search_history.json")
    return predictions
Example #14
from semantic_modeling.utilities.serializable import serializeJSON
from transformation.r2rml.commands.modeling import SetInternalLinkCmd, SetSemanticTypeCmd
from transformation.r2rml.r2rml import R2RML

dataset = "museum_crm"
ont = get_ontology(dataset)

r2rml_dir = Path(
    config.datasets[dataset].as_path()) / "karma-version" / "models-r2rml"
r2rml_dir.mkdir(exist_ok=True, parents=True)
model_dir = Path(config.datasets[dataset].models_y2rml.as_path())

model_json_dir = Path(
    config.datasets[dataset].as_path()) / "karma-version" / "models-json"
model_json_dir.mkdir(exist_ok=True, parents=True)

for tbl in get_data_tables(dataset):
    r2rml_file = r2rml_dir / f"{tbl.id}-model.ttl"
    r2rml = R2RML.load_from_file(model_dir / f"{tbl.id}-model.yml")
    # note that we use a cleaned data table: any columns that needed creating/transforming already exist.
    # therefore, we remove all commands that aren't SetSemanticTypeCmd or SetInternalLinkCmd
    r2rml.commands = [
        cmd for cmd in r2rml.commands
        if isinstance(cmd, (SetSemanticTypeCmd, SetInternalLinkCmd))
    ]

    sm = r2rml.apply_cmds(tbl)
    r2rml.to_kr2rml(ont, tbl, r2rml_file)
    serializeJSON(sm.to_karma_json_model(ont),
                  model_json_dir / f"{tbl.id}-model.json",
                  indent=4)
Example #15
def train_model(dataset: str, train_sids: List[str], manual_seed: int,
                train_examples: List[Example], test_examples: List[Example],
                args: TrainingArgs, basedir: Path):
    DenseTensorFunc.manual_seed(manual_seed)

    tf_domain = GrowableBinaryVectorDomain()

    timer = pyutils.progress.Timer().start()
    input_train_examples = train_examples
    input_test_examples = test_examples

    # BUILDING VARIABLES NEEDED FOR THE TRAINING
    example_annotator = ExampleAnnotator(dataset,
                                         train_sids,
                                         training_examples=train_examples)
    train_examples = sequential_map(example_annotator.annotate, train_examples)
    train_examples = _(train_examples) \
        .imap(example_annotator.example2vars) \
        .submap(partial(example_annotator.build_triple_features, domain=tf_domain))

    pairwise_domain = example_annotator.build_pairwise_domain()
    # Freeze domain now, we've added all feature values observed in training data
    tf_domain.freeze()

    test_examples = sequential_map(example_annotator.annotate, test_examples)
    test_examples = _(test_examples) \
        .imap(example_annotator.example2vars) \
        .submap(partial(example_annotator.build_triple_features, domain=tf_domain))

    logger.info("Preprocessing took %s" % timer.lap().get_total_time())
    # build random variables
    train_graphs = _(train_examples).submap(lambda t: t.label)
    test_graphs = _(test_examples).submap(lambda t: t.label)

    # build models, select inference method
    model = TemplateLogLinearModel([
        TripleFactorTemplate(
            *TripleFactorTemplate.get_default_args(tf_domain)),
        SubstructureFactorTemplate(
            *SubstructureFactorTemplate.get_default_args(
                pairwise_domain, example_annotator.get_obj_props())),
        # ExternalModelFactorTemplate(*ExternalModelFactorTemplate.get_default_weights())
    ])
    # or load previous training
    # model_dir = config.fsys.debug.as_path() + "/%s/models/exp_no_2" % dataset
    # model, ___, state_dict = deserialize(model_dir + '/gmtk_model.bin')

    inference = BeliefPropagation.get_constructor(InferProb.MARGINAL)
    map_inference = BeliefPropagation.get_constructor(InferProb.MAP)

    train_nll_examples = _(
        train_graphs).map(lambda vars: NegativeLogLikelihoodExample(
            vars, model.get_factors(vars), inference))
    train_map_examples = _(train_nll_examples).map(
        lambda example: MAPAssignmentExample.from_nll_example(
            example, map_inference))
    test_nll_examples = _(
        test_graphs).map(lambda vars: NegativeLogLikelihoodExample(
            vars, model.get_factors(vars), inference))
    test_map_examples = _(test_nll_examples).map(
        lambda example: MAPAssignmentExample.from_nll_example(
            example, map_inference))

    # select training method/parameters, and evaluation
    n_epoch = args.n_epoch
    params = args.optparams
    mini_batch_size = args.mini_batch_size
    n_switch = args.n_switch

    global_step = 0
    require_closure = False
    if args.optimizer == 'SGD':
        optimizer = PyTorchOptimizer.SGD(parameters=model.get_parameters(),
                                         **params)
    elif args.optimizer == 'ADAM':
        optimizer = PyTorchOptimizer.Adam(parameters=model.get_parameters(),
                                          **params)
    elif args.optimizer == 'LBFGS':
        optimizer = PyTorchOptimizer.LBFGS(parameters=model.get_parameters(),
                                           **params)
        require_closure = True
    else:
        raise ValueError("Unknown optimizer: %s" % args.optimizer)
    # optimizer.optimizer.load_state_dict(state_dict)

    for template in model.templates:
        if hasattr(template, 'after_update_weights'):
            optimizer.register_on_step(template.after_update_weights)

    logger.info(args.to_string())
    logger.info("Template info: \n%s" %
                ("\n" %
                 (["\t" + template.get_info()
                   for template in model.templates])))
    logger.info("Train size: %s, Test size: %s", len(train_nll_examples),
                len(test_nll_examples))

    reporter = TensorBoard(log_dir=basedir)
    # cast to list to keep train_map_examples & train_nll_examples aligned with each other (batch example may shuffle)
    if args.parallel_training:
        batch_nll_example = ParallelBatchExample(list(train_nll_examples), 0)
    else:
        batch_nll_example = BatchExample(list(train_nll_examples), 0)

    # *********************************************** DEBUG CODE
    # for i, triples in enumerate(train_examples):
    #     example = triples[0].example
    #     if example.model_id.startswith("s03") and example.no_sample == 29:
    #         example.pred_sm.render()
    #         render_factor_graph(model.get_factors(train_graphs[i]), train_graphs[i],
    #                     config.fsys.debug.as_path() + "/tmp/factor_graph.pdf")
    #         exit(0)
    #
    # render_factor_graph(train_nll_examples[0].factors, train_nll_examples[0].variables,
    #                     config.fsys.debug.as_path() + "/tmp/factor_graph.pdf")
    #
    # loss_val_accum = ValueAccumulator()
    # gradient_accum = Tensor1AccumulatorDict()
    # for weights in model.get_parameters():
    #     gradient_accum.track_obj(weights, DenseTensorFunc.zeros_like(weights.val))
    # **********************************************************

    progress = pyutils.progress.Progress(n_epoch)
    progress.start()

    if n_switch > 0:
        examples = list(batch_nll_example.split_random(mini_batch_size))
    else:
        examples = [batch_nll_example]

    cm_train, cm_test = None, None
    loss_history = []
    param_hists = []

    for i in range(n_epoch):
        logger.info("Iter %s" % i)

        if i >= n_switch:
            examples = [batch_nll_example]

        if args.shuffle_mini_batch and 0 < i < n_switch:
            examples = batch_nll_example.split_random(mini_batch_size)

        average_loss_val = []
        if not require_closure:
            for example in examples:
                optimizer.zero_grad()
                example.accumulate_value_and_gradient(
                    optimizer.get_value_accumulator(),
                    optimizer.get_gradient_accumulator())
                optimizer.average(example.size())

                logger.info("Accum loss: %.10f" %
                            optimizer.get_value_accumulator().get_value())
                average_loss_val.append(
                    optimizer.get_value_accumulator().get_value())

                # *********************************************** DEBUG GRADIENT
                # numerical_gradient = NumericalGradient(1e-5)
                # for j, e in enumerate(example.examples):
                #     print(f"\rExample {j}/{len(example.examples)}", end="", flush=True)
                #     gradient_accum.clear()
                #     loss_val_accum.clear()
                #     e.accumulate_value_and_gradient(loss_val_accum, gradient_accum)
                #     for template in model.templates:
                #         for weights in template.get_weights():
                #             gradient = gradient_accum.get_value(weights)
                #             approx_gradients = numerical_gradient.compute_gradient(weights, lambda: nll_func(e))
                #             try:
                #                 np.testing.assert_almost_equal(gradient.numpy(), approx_gradients.numpy(), 6)
                #             except Exception:
                #                 logger.exception("Incorrect gradient...")
                #                 print(template,  weights.val.tolist())
                #                 print(["%11.8f" % x for x in gradient.tolist()])
                #                 print(["%11.8f" % x for x in approx_gradients.tolist()])
                #                 print(["%11d" % int(np.isclose(x, y, rtol=0, atol=1e-6)) for x, y in zip(gradient, approx_gradients)])
                #
                #                 raise
                # print("\n")
                # **************************************************************

                optimizer.step()
                reporter.loss_val(
                    optimizer.get_value_accumulator().get_value(), global_step)
                global_step += 1
        else:
            for example in examples:

                def closure():
                    optimizer.zero_grad()
                    example.accumulate_value_and_gradient(
                        optimizer.get_value_accumulator(),
                        optimizer.get_gradient_accumulator())
                    optimizer.average(example.size())
                    optimizer.copy_grad()
                    return optimizer.get_value_accumulator().get_value()

                optimizer.step(closure)
                logger.info("Accum loss: %.10f" %
                            optimizer.get_value_accumulator().get_value())
                average_loss_val.append(
                    optimizer.get_value_accumulator().get_value())
                reporter.loss_val(
                    optimizer.get_value_accumulator().get_value(), global_step)
                global_step += 1

        if len(average_loss_val) > 1:
            logger.info("Average accum loss: %.10f" %
                        np.average(average_loss_val))

        if optimizer.get_value_accumulator().get_value() < 0:
            break

        if i % args.n_iter_eval == 0 or i == n_epoch - 1:
            cm_train = evaluate(train_map_examples)
            cm_test = evaluate(test_map_examples) or cm_train
            logger.info('train (class_idx=0): %s',
                        cm_train.precision_recall_fbeta(class_idx=0))
            logger.info('train (class_idx=1): %s',
                        cm_train.precision_recall_fbeta(class_idx=1))
            logger.info('test  (class_idx=0): %s',
                        cm_test.precision_recall_fbeta(class_idx=0))
            logger.info('test  (class_idx=1): %s',
                        cm_test.precision_recall_fbeta(class_idx=1))

            reporter.precision_recall_fbeta(cm_train,
                                            global_step,
                                            group='train')
            reporter.precision_recall_fbeta(cm_test, global_step, group='test')

        loss_history.append(np.average(average_loss_val))
        param_hists.append(model.clone_parameters())
        if len(param_hists) > 3:
            param_hists.pop(0)

        if args.optimizer == "ADAM" and len(loss_history) > 4 and all(
                x - y > 0
                for x, y in zip(loss_history[-3:], loss_history[-4:-1])):
            logger.info("Loss increase after 3 epoches. Stop training!")
            break

        progress.finish_one()

    if args.report_final_loss:
        loss_val_accum = ValueAccumulator()
        batch_nll_example.accumulate_value_and_gradient(loss_val_accum, None)
        logger.info("Average accum loss: %.10f" %
                    (loss_val_accum.get_value() / batch_nll_example.size()))

    logger.info("\n\r%s" % progress.summary())
    cm_train.pretty_print("** TRAIN **",
                          precision_recall_fbeta=True,
                          output_stream=logger.info)
    cm_test.pretty_print("** TEST **",
                         precision_recall_fbeta=True,
                         output_stream=logger.info)

    # save model and move everything into another folder for storage
    reporter.close()
    reporter.export(basedir / 'tensorboard_raw.json')

    # clear all cache
    for template in model.templates:
        if isinstance(template, CachedTemplateFactorConstructor):
            template.clear_cache()

    assert len(param_hists) == len(loss_history[-3:])
    min_loss, min_params, min_idx = min(zip(loss_history[-3:], param_hists,
                                            [-3, -2, -1]),
                                        key=lambda x: x[0])
    logger.info("Select parameters at index: %d. Loss = %s", min_idx, min_loss)
    model.update_parameters(min_params)

    serialize(
        (model, tf_domain, pairwise_domain, optimizer.optimizer.state_dict()),
        basedir / 'gmtk_model.bin')
    save_evaluation_result(zip(train_map_examples, train_nll_examples),
                           basedir / 'train.output.json')
    save_evaluation_result(zip(test_map_examples, test_nll_examples),
                           basedir / 'test.output.json')
    serializeJSON(input_train_examples, basedir / "train.json")
    serializeJSON(input_test_examples, basedir / "test.json")

    # attempt to copy log file
    try:
        logger.handlers[1].flush()
        shutil.copy(logger.handlers[1].file_handler.baseFilename,
                    str(basedir / "train.log"))
    except Exception:
        logger.exception("Cannot backup log...")

    model_id = get_latest_model_id(basedir) + 1
    move_current_files(basedir, model_id)
    logger.info("Save model %s", model_id)
    return model, tf_domain, pairwise_domain, optimizer.optimizer.state_dict()
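
train_model only reads a handful of attributes from args; a hedged stand-in assembled from how args is used above (field values are illustrative, not tuned):

from types import SimpleNamespace

args = SimpleNamespace(
    optimizer="ADAM",          # one of SGD / ADAM / LBFGS, per the branches above
    optparams={"lr": 0.1},     # forwarded to the PyTorch optimizer constructor
    n_epoch=60,
    mini_batch_size=200,
    n_switch=10,               # epochs of mini-batching before switching to full batch
    shuffle_mini_batch=True,
    n_iter_eval=5,
    parallel_training=False,
    report_final_loss=True,
    to_string=lambda: "TrainingArgs(...)")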
Example #16
    def _semantic_labeling(
            self, train_source_ids: Set[str], test_source_ids: Set[str]
    ) -> Dict[str, MinhptxSemanticLabelingResult]:
        """Generate semantic labeling for test_sources using train_sources"""
        need_reexec = True

        if Path(self.meta_file).exists():
            # read meta and compare if previous run is compatible with current run
            self.logger.debug("Load information from previous run...")

            meta = deserializeJSON(self.meta_file)
            meta["training_sources"] = set(meta["training_sources"])
            meta["testing_sources"] = set(meta["testing_sources"])
            meta["source_ids"] = set(meta['source_ids'])

            new_meta = self.get_meta(train_source_ids, test_source_ids)
            # re-execute only if there are new test sources or other settings changed
            if len(new_meta.pop("testing_sources").difference(
                    meta.pop("testing_sources"))) == 0 and new_meta == meta:
                need_reexec = False

        if need_reexec:
            self.logger.debug("Re-execute semantic labeling...")

            try:
                # preparing data, want to compute semantic models for all sources in dataset
                data_dir = Path(config.datasets[self.dataset].data.as_path())
                model_dir = Path(
                    config.datasets[self.dataset].models_json.as_path())

                shutil.rmtree(str(self.input_dir))
                for fpath in self.output_dir.iterdir():
                    os.remove(fpath)
                for x in ["%s_train" % self.dataset, "%s_test" % self.dataset]:
                    for y in ["data", "model"]:
                        (self.input_dir / x / y).mkdir(parents=True, exist_ok=True)

                input_train_dir = self.input_dir / ("%s_train" % self.dataset)
                input_test_dir = self.input_dir / ("%s_test" % self.dataset)

                for fpath in sorted(data_dir.iterdir()):
                    model_fname = fpath.stem + "-model.json"
                    if fpath.stem in train_source_ids:
                        self._copy_data(fpath,
                                        input_train_dir / "data" / fpath.name)
                        # serialize the model instead of copying it because we want to convert URIs
                        # to simplified URIs (e.g. karma:classLink); full URIs don't work in this app
                        serializeJSON(KarmaModel.load_from_file(
                            self.ont, model_dir /
                            model_fname).to_normalized_json_model(),
                                      input_train_dir / "model" /
                                      f"{fpath.name}.model.json",
                                      indent=4)

                    if fpath.stem in test_source_ids:
                        self._copy_data(fpath,
                                        input_test_dir / "data" / fpath.name)
                        # same reason as above
                        serializeJSON(KarmaModel.load_from_file(
                            self.ont, model_dir /
                            model_fname).to_normalized_json_model(),
                                      input_test_dir / "model" /
                                      f"{fpath.name}.model.json",
                                      indent=4)

                invoke_command(" ".join([
                    config.previous_works.minhptx_iswc2016.cli.as_path(),
                    str(self.input_dir),
                    str(self.output_dir), "--train_dataset",
                    "%s_train" % self.dataset, "--test_dataset",
                    "%s_test" % self.dataset, "--evaluate_train_set", "True",
                    "--reuse_rf_model", "False"
                ]),
                               output2file=self.exec_dir / "execution.log")
            except Exception:
                sys.stdout.flush()
                self.logger.exception(
                    "Error while preparing and invoking semantic labeling api..."
                )
                raise

            serializeJSON(self.get_meta(train_source_ids, test_source_ids),
                          self.meta_file,
                          indent=4)

        # load result
        self.logger.debug("Load previous result...")
        output_files = [
            fpath for fpath in self.output_dir.iterdir()
            if fpath.suffix == ".json"
        ]
        assert len(output_files) == 2
        app_result: Dict[str, MinhptxSemanticLabelingResult] = deserializeJSON(
            output_files[0], Class=MinhptxSemanticLabelingResult)
        app_result.update(
            deserializeJSON(output_files[1],
                            Class=MinhptxSemanticLabelingResult))

        return {
            source_id: app_result[source_id]
            for source_id in chain(test_source_ids, train_source_ids)
        }
Example #17
#!/usr/bin/python
# -*- coding: utf-8 -*-
import os
from pathlib import Path
from typing import Dict, Tuple, List, Set, Union, Optional, Any

from semantic_modeling.config import config
from semantic_modeling.utilities.serializable import deserializeJSON, serializeJSON
"""Usually run after generate r2rml and copied from KARMA_HOME"""

dataset = "museum_edm"
model_dir = Path(
    config.datasets[dataset].karma_version.as_path()) / "models-json"
for file in sorted(model_dir.iterdir()):
    sm = deserializeJSON(file)
    sm['id'] = Path(sm['id']).stem
    sm['name'] = sm['id']

    serializeJSON(sm, model_dir / f"{sm['id']}-model.json", indent=4)
    os.remove(file)
Example #18
#!/usr/bin/python
# -*- coding: utf-8 -*-
import ujson
from pathlib import Path
from typing import Dict, Tuple, List, Set, Union, Optional, Any

from semantic_modeling.config import config
from semantic_modeling.data_io import get_data_tables, get_raw_data_tables, get_semantic_models, get_ontology, \
    get_sampled_data_tables
from semantic_modeling.utilities.serializable import serializeJSON
from transformation.r2rml.commands.modeling import SetInternalLinkCmd, SetSemanticTypeCmd
from transformation.r2rml.r2rml import R2RML

dataset = "museum_crm"
ont = get_ontology(dataset)

source_dir = Path(
    config.datasets[dataset].as_path()) / "karma-version" / "sources"
source_dir.mkdir(exist_ok=True, parents=True)
for tbl in get_sampled_data_tables(dataset):
    serializeJSON(tbl.rows, source_dir / f"{tbl.id}.json", indent=4)
Example #19
    source_dir = Path(
        config.datasets[dataset].as_path()) / "karma-version" / "sources"
    source_dir.mkdir(exist_ok=True, parents=True)
    meta_file = source_dir / ".meta"

    if meta_file.exists():
        meta = deserializeJSON(meta_file)

        if meta['n_samples'] == settings.n_samples \
                and meta['random_seed'] == settings.random_seed:
            print("No need to prepare Karma sources: they were already generated "
                  "with the same configuration before. Terminating...")
            exit(0)

    print(f"Generate karma sources for dataset: {dataset}")
    serializeJSON(
        {
            'n_samples': settings.n_samples,
            'random_seed': settings.random_seed
        },
        meta_file,
        indent=4)

    model_dir = Path(config.datasets[dataset].models_y2rml.as_path())
    # clear cache file
    clear_sampled_data_tables(dataset)

    for tbl, sm in zip(get_sampled_data_tables(dataset),
                       get_semantic_models(dataset)):
        serializeJSON(tbl.rows, source_dir / f"{tbl.id}.json", indent=4)
Example #20
def print_cooccurrence(features_file_content: dict, output_file: Path):
    serializeJSON(features_file_content['cooccurrence'], output_file, indent=4)
Example #21
def run_evaluation_workflow(dataset: str, scenario: Scenario, train_sms,
                            test_sms):
    ont: Ontology = get_ontology(dataset)
    karma_models: List[KarmaModel] = get_karma_models(dataset)
    semantic_models: List[SemanticModel] = get_semantic_models(dataset)
    train_sm_ids = [sm.id for sm in train_sms]

    sdesc_args = dict(
        dataset=dataset,
        train_sm_ids=train_sm_ids,
        use_correct_type=False,  # we always put semantic types into learnedSemanticTypes, even for userSetSemanticTypes
        use_old_semantic_typer=False,
        exec_dir=get_cache_dir(dataset, train_sms) / "mohsen_jws2015",
        sm_type_dir=Path(config.fsys.debug.as_path()) / "tmp" / "models-json-temp")
    # STEP 1: run semantic typing and put the results in a temporary folder
    if sdesc_args['sm_type_dir'].exists():
        shutil.rmtree(sdesc_args['sm_type_dir'])
    sdesc_args['sm_type_dir'].mkdir(exist_ok=True, parents=True)

    top_k_types = Settings.get_instance().semantic_labeling_top_n_stypes
    typer = create_semantic_typer(dataset, train_sms)
    typer.semantic_labeling(train_sms, test_sms, top_k_types, eval_train=True)

    for sm, ksm in zip(semantic_models, karma_models):
        # assign semantic types to learnedSemanticTypes
        sm_alignment = SemanticModelAlignment(sm, ksm)
        for col in ksm.source_columns:
            attr = sm.get_attr_by_label(
                sm.graph.get_node_by_id(
                    sm_alignment.alignment[col.id]).label.decode('utf-8'))
            node = ksm.karma_graph.get_node_by_id(col.id)
            link = node.get_first_incoming_link()

            node.learned_semantic_types = [
                KarmaSemanticType(node.id, stype.domain, stype.type,
                                  typer.__class__.__name__,
                                  stype.confidence_score)
                for stype in attr.semantic_types
            ]
            node.user_semantic_types = [
                KarmaSemanticType(node.id,
                                  link.get_source_node().label.decode(),
                                  link.label.decode(), "User", 1.0)
            ]

        serializeJSON(ksm.to_normalized_json_model(ont),
                      sdesc_args['sm_type_dir'] / f"{ksm.id}-model.json",
                      indent=4)

    # STEP 2: invoke semantic modeling
    modeler = MohsenSemanticModeling(**sdesc_args)
    pred_sms = modeler.sm_prediction(train_sms, test_sms)

    # STEP 3: evaluate the predicted semantic mappings
    eval_hist = [["source", "precision", "recall", "f1", "stype-acc"]]
    if scenario == Scenario.SCENARIO_1:
        data_node_mode = DataNodeMode.IGNORE_DATA_NODE
    else:
        data_node_mode = DataNodeMode.NO_TOUCH

    for sm, pred_sm in zip(test_sms, pred_sms):
        eval_result = smodel_eval.f1_precision_recall(sm.graph, pred_sm.graph,
                                                      data_node_mode)
        eval_hist.append([
            sm.id, eval_result["precision"], eval_result["recall"],
            eval_result["f1"],
            smodel_eval.stype_acc(sm.graph, pred_sm.graph)
        ])

    eval_hist.append([
        'average',
        np.average([float(x[1]) for x in eval_hist[1:]]),
        np.average([float(x[2]) for x in eval_hist[1:]]),
        np.average([float(x[3]) for x in eval_hist[1:]]),
        np.average([float(x[4]) for x in eval_hist[1:]])
    ])
    serializeCSV(
        eval_hist,
        sdesc_args["exec_dir"] / f"evaluation_result_{scenario.value}.csv")

    # STEP 4: evaluate the semantic labeling predictions
    pred_stypes = modeler.semantic_labeling(train_sms, test_sms)
    for pred_stype, sm in zip(pred_stypes, test_sms):
        for attr in sm.attrs:
            if attr.label not in pred_stype:
                attr.semantic_types = []
            else:
                attr.semantic_types = pred_stype[attr.label]
    eval_sources(
        test_sms, sdesc_args["exec_dir"] /
        f"evaluation_result_{scenario.value}_stype.csv")

    # STEP 5: visualize the prediction
    (sdesc_args['exec_dir'] / "prediction-viz").mkdir(exist_ok=True)
    need_render_graphs = [
        (colorize_prediction(
            pred_sm.graph,
            AutoLabel.auto_label_max_f1(sm.graph, pred_sm.graph, False)[0]),
         sdesc_args['exec_dir'] / "prediction-viz" / f"{sm.id}.png")
        for sm, pred_sm in zip(test_sms, pred_sms)
    ]
    with ThreadPool(32) as p:
        p.map(render_graph, need_render_graphs)

    return eval_hist
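
The returned eval_hist is a plain rows-as-lists table ready for serializeCSV; schematically (numbers illustrative):

# [["source", "precision", "recall", "f1", "stype-acc"],
#  ["s01-source", 0.91, 0.88, 0.89, 0.95],
#  ...
#  ["average", 0.90, 0.87, 0.88, 0.94]]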
Example #22

if __name__ == '__main__':
    # HYPER-ARGS
    args = get_shell_args()

    Settings.get_instance(False).semantic_labeling_top_n_stypes = \
        args.semantic_labeling_top_n_stypes
    Settings.get_instance().semantic_labeling_method = args.semantic_typer
    Settings.get_instance().log_current_settings()

    exp_dir = Path(args.exp_dir)
    assert exp_dir.exists()

    source_models = {sm.id: sm for sm in get_semantic_models(args.dataset)}
    train_sms = [source_models[sid] for sid in args.kfold['train_sm_ids']]
    test_sms = [source_models[sid] for sid in args.kfold['test_sm_ids']]

    eval_hist = run_evaluation_workflow(args.dataset, Scenario.SCENARIO_2,
                                        train_sms, test_sms)
    serializeCSV(eval_hist,
                 exp_dir / f"kfold-{get_short_train_name(train_sms)}.test.csv")
    serializeJSON(args,
                  exp_dir /
                  f"kfold-{get_short_train_name(train_sms)}.meta.json",
                  indent=4)
    shutil.move(
        get_cache_dir(args.dataset, train_sms) / "mohsen_jws2015",
        exp_dir / f"kfold-{get_short_train_name(train_sms)}")