Esempio n. 1
0
def generate_candidate_sm(dataset: str, test_sm: SemanticModel,
                          stat: Statistic, model_bundle, train_source_ids):
    """Beam-search candidate semantic models for ``test_sm``.

    Returns a dict mapping each candidate graph's hashable string to the
    graph itself, so structurally identical candidates are deduplicated.
    """
    ontology = get_ontology(dataset)
    ontology_graph = get_ont_graph(dataset)
    cfg = Settings.get_instance()

    # attribute label (utf-8 bytes) -> candidate semantic types for it
    attr_types: Dict[bytes, List[KarmaSemanticType]] = {}
    for attr in test_sm.attrs:
        attr_types[attr.label.encode('utf-8')] = attr.semantic_types

    triple_adviser = EmpiricalTripleAdviser(
        ontology_graph, ontology, stat.p_triple,
        cfg.searching_triple_adviser_max_candidate)
    explorer_builder = GraphExplorerBuilder(
        triple_adviser,
        max_data_node_hop=cfg.searching_max_data_node_hop,
        max_class_node_hop=cfg.searching_max_class_node_hop)

    for lbl, stypes in attr_types.items():
        triple_adviser.add_data_node(lbl, stypes)

    scoring_model = Model(*model_bundle)
    search_args = PGMBeamSearchArgs(
        test_sm.id,
        custom_search_discovery,
        Tracker(track_search_nodes=False),
        partial(scoring_model.predict_sm_probs, test_sm.id, train_source_ids),
        explorer_builder,
        early_terminate_func=None,
        beam_width=cfg.training_beam_width,
        gold_sm=test_sm.graph,
        source_attributes=test_sm.attrs,
        pre_filter_func=filter_unlikely_graph,
    )
    start_node = PGMStartSearchNode(
        search_args.get_and_increment_id(), search_args,
        [a.label.encode('utf-8') for a in test_sm.attrs])

    # fixed seed so candidate generation is reproducible
    search_args._tmp_random_state = numpy.random.RandomState(
        Settings.get_instance().random_seed)

    search_results: List[PGMSearchNode] = beam_search(
        [start_node],
        beam_width=cfg.training_beam_width,
        n_results=cfg.searching_n_explore_result,
        args=search_args)

    # collect graphs discovered during search plus the final results,
    # keyed by their hashable string to remove duplicates
    candidates = {}
    for node in search_args._tmp_tracker_for_storing_search_discovery_nodes:
        graph = node.get_value().graph
        candidates[graph_to_hashable_string(graph)] = graph
    for node in search_results:
        graph = node.get_value().graph
        candidates[graph_to_hashable_string(graph)] = graph

    return candidates
Esempio n. 2
0
def filter_unlikely_graph(g: MergeGraph) -> bool:
    """Return True if ``g`` passes structural plausibility checks, or False if
    it should be pruned from the search.

    Two per-class-node filters are applied:
      1. a "middle" node with exactly one incoming and one outgoing link whose
         single outgoing link targets another class node is rejected;
      2. duplicated outgoing predicates are bounded: a predicate may appear at
         most ``mrf_max_n_duplications`` times, and at most
         ``mrf_max_n_duplication_types`` predicates per node may be duplicated.
    """
    settings = Settings.get_instance()
    max_n_duplications = settings.mrf_max_n_duplications
    max_n_duplication_types = settings.mrf_max_n_duplication_types

    for n in g.iter_class_nodes():
        # FILTER middle nodes: class -> class chains through a degree-(1,1) node
        if n.n_incoming_links == 1 and n.n_outgoing_links == 1:
            link = next(iter(n.iter_outgoing_links()))
            if link.get_target_node().is_class_node():
                return False

        # FILTER: max_size_duplication_group <= 7 and max_n_duplications <= 4
        n_duplication_types = 0
        # group outgoing links by predicate label; `es` is the group of links
        # sharing one label (`_` is the project's pipe/stream helper —
        # NOTE(review): assumed to yield (label, [links]) pairs; confirm)
        for e_lbl, es in _(n.iter_outgoing_links()).imap(lambda e: (e.label, e)).group_by_key().get_value():
            if len(es) > max_n_duplications:
                return False

            if len(es) > 1:
                n_duplication_types += 1

        if n_duplication_types > max_n_duplication_types:
            return False

    return True
Esempio n. 3
0
    def __init__(self, dataset: str, use_correct_type: bool, use_old_semantic_typer: bool, train_sm_ids: List[str],
                 exec_dir: Optional[Union[str, Path]] = None, sm_type_dir: Optional[Union[str, Path]] = None):
        """Configure a wrapper around Mohsen's (JWS 2015) modeling algorithm.

        :param dataset: dataset name
        :param use_correct_type: use ground-truth semantic types instead of predicted ones
        :param use_old_semantic_typer: use the original Karma semantic typer
        :param train_sm_ids: ids of the semantic models used for training
        :param exec_dir: working directory (defaults to the dataset cache dir / "mohsen_jws2015")
        :param sm_type_dir: directory holding serialized semantic-type predictions
        """
        self.dataset: str = dataset
        self.train_sm_ids = train_sm_ids
        self.ont = get_ontology(dataset)
        self.karma_models: Dict[str, KarmaModel] = {km.id: km for km in get_karma_models(dataset)}

        # modeling can only run one time; re-invoking will raise an error
        self.__has_run_modeling = False
        if exec_dir is None:
            exec_dir = get_cache_dir(dataset, train_sm_ids) / "mohsen_jws2015"
        self.exec_dir: Path = Path(exec_dir)
        self.sm_type_dir = sm_type_dir

        # parameters for mohsen's algorithm
        self.use_old_semantic_typer = use_old_semantic_typer
        self.use_correct_type = use_correct_type
        # the algorithm is hard-wired to 4 candidate types per column, so the
        # globally-configured top-n must not exceed it
        assert Settings.get_instance().semantic_labeling_top_n_stypes <= 4
        self.num_candidate_semantic_type = 4
        self.multiple_same_property_per_node = True

        # scoring coefficients (coherence/confidence/size-reduction trade-off)
        self.coherence = 1.0
        self.confidence = 1.0
        self.size_reduction = 0.5

        self.num_candidate_mappings = 50
        self.mapping_branching_factor = 50
        self.topk_steiner_tree = 10

        # effectively no cut-off: keep everything
        self.cut_off = int(1e6)
        self.our_and_karma_sm_alignments = {}
Esempio n. 4
0
    def predict_log_probs(self, examples: List[Example]):
        """Compute the log-probability of each example, in input order.

        Examples are processed in chunks; for each chunk this builds the
        variables and factors of the example's factor graph, computes the log
        partition function logZ via parallel marginal inference, and scores
        the all-True assignment: log_prob = score(assignment) - logZ.
        """
        log_probs = []

        # `_(...).isplit(k)` — project pipe helper; presumably splits the
        # examples into chunks of at most `max_n_tasks` items — TODO confirm
        for es in _(examples).isplit(self.max_n_tasks):
            varss = [self.get_variables(e) for e in es]
            # varss = self.parallel_get_variables(es)
            factorss = [self.model.get_factors(vars) for vars in varss]
            inferences = [
                self.inference(f, v) for f, v in zip(factorss, varss)
            ]

            # desired assignment: every variable encoded as True
            desired_assignments = [{
                var: var.domain.encode_value(True)
                for var in vars
            } for vars in varss]

            # log partition function of each factor graph, computed in parallel
            logZs = parallel_marginal_inference(
                inferences,
                n_threads=Settings.get_instance().parallel_gmtk_n_threads)
            # unnormalized score of the desired assignment minus logZ
            log_probs += [
                sum(
                    f.score_assignment(desired_assignments[i])
                    for f in factorss[i]) - logZs[i] for i in range(len(es))
            ]

        return log_probs
Esempio n. 5
0
def generate_data(model: Model, dataset: str, train_sms: List[SemanticModel],
                  discover_sources: List[SemanticModel], n_iter):
    """Generate labeled training examples from beam-search candidates.

    Candidate semantic models are produced in parallel for every source in
    ``discover_sources``; each candidate graph is then converted into an
    ``Example``. Returns {source_id: {graph_key: Example}}.
    """
    stat = Statistic.get_instance(train_sms)
    train_sids = [sm.id for sm in train_sms]
    # model is re-created inside workers from this picklable bundle
    bundle = (model.dataset, model.model, model.tf_domain,
              model.pairwise_domain)
    data = {}

    with get_pool(Settings.get_instance().parallel_n_process) as pool:
        async_results = [
            pool.apply_async(generate_candidate_sm,
                             (dataset, src, stat, bundle, train_sids))
            for src in discover_sources
        ]

        for src, async_result in zip(discover_sources, async_results):
            candidate_sms = async_result.get()
            # replace each candidate graph with its labeled Example in place
            for idx, key in enumerate(candidate_sms):
                candidate_sms[key] = make_example(
                    src, candidate_sms[key],
                    Example.generate_example_id(src.id, idx, n_iter),
                    train_sids)
            data[src.id] = candidate_sms

    return data
Esempio n. 6
0
    def __init__(self, multi_val_predicate: MultiValuePredicate, structure: LocalStructure):
        """Pre-compute duplication feature tensors for every (label, child) pair.

        For each number of duplications ``n_dup`` in 2..mrf_max_n_duplications,
        a tensor of shape ``[2] * n_dup + [n_features]`` is built whose entries
        hold the multi-value-predicate features for each combination of the
        duplicated children.
        """
        # label -> child index -> list of tensors indexed by n_dup
        # (slots 0 and 1 are None placeholders: duplication needs >= 2 children,
        # so appending from n_dup=2 keeps list index == n_dup)
        self.tensors: Dict[bytes, Dict[int, List[DenseTensor]]] = {}
        features = DenseTensorFunc.from_array([0, 0])
        n_features = features.size()[0]

        max_n_dups = Settings.get_instance().mrf_max_n_duplications

        for lbl, space in structure.node_structure_space.items():
            self.tensors[lbl] = {}
            for ctriple, child_idx in space.children.items():
                self.tensors[lbl][child_idx] = [None, None]
                for n_dup in range(2, max_n_dups + 1):
                    tensor = DenseTensorFunc.zeros((2 ** n_dup, n_features))
                    dims = [2] * n_dup
                    dims.append(n_features)
                    # iter_values presumably enumerates the value combinations of
                    # the n_dup slots together with their flat row index — confirm
                    for count, current_val_index, values in iter_values(n_dup, 0):
                        if len(values) <= 1:
                            # zero or one active duplicate: no multi-value evidence
                            features[0] = 0
                            features[1] = 0
                        else:
                            multi_val_prob = multi_val_predicate.compute_prob(ctriple[0], len(values))
                            # floor probabilities at 0.01 to avoid zero features
                            features[0] = max(multi_val_prob, 0.01)
                            features[1] = max(1 - multi_val_prob, 0.01)

                        tensor[count, :] = features

                    # reshape flat rows into one binary axis per duplicate
                    self.tensors[lbl][child_idx].append(tensor.view_shape(dims))
Esempio n. 7
0
    def __init__(self,
                 dataset: str,
                 train_source_ids: List[str],
                 load_circular_dependency: bool = True,
                 training_examples: Optional[List[Example]] = None):
        """Load the semantic typer and all weak models needed for annotation.

        :param dataset: dataset name
        :param train_source_ids: ids of the sources used for training
        :param load_circular_dependency: whether to load components that depend
            circularly on this object (e.g. node_prob)
        :param training_examples: list of training examples used to build weak
            models; not needed at testing time (i.e. None) because the weak
            models have been built before
        """
        self.dataset = dataset
        self.source_models = {sm.id: sm for sm in get_semantic_models(dataset)}
        self.train_source_ids = set(train_source_ids)
        self.top_k_semantic_types = Settings.get_instance(
        ).semantic_labeling_top_n_stypes

        self.training_models = [
            self.source_models[sid] for sid in train_source_ids
        ]
        self.typer: SemanticTyper = create_semantic_typer(
            dataset, self.training_models)

        # every source that is not in the training set is a testing source
        self.testing_models = [
            self.source_models[sid] for sid in set(
                self.source_models.keys()).difference(train_source_ids)
        ]
        self.training_examples = training_examples

        # local (weak) models, all built from the training sources only
        self.multival_predicate = MultiValuePredicate.get_instance(
            self.training_models)
        self.statistic = Statistic.get_instance(self.training_models)
        # self.data_constraint = get_data_constraint_model(dataset, self.training_models)
        self.stype_assistant = get_stype_assistant_model(
            dataset, self.training_models)
        self.local_structure = LocalStructure.get_instance(
            self.training_models)
        self.attribute_same_scope = AttributeScope.get_instance(self.dataset)
        self.duplication_tensors = DuplicationTensors.get_instance(
            self.training_models)

        self.primary_key: PrimaryKey = PrimaryKey.get_instance(
            dataset, self.training_models)
        self.cardinality = CardinalityFeatures.get_instance(dataset)

        # STEP 1: add semantic types
        self.typer.semantic_labeling(self.training_models,
                                     self.testing_models,
                                     self.top_k_semantic_types,
                                     eval_train=True)

        # STEP 2: load circular dependency like node_prob
        if load_circular_dependency:
            self.node_prob = NodeProb(self, load_classifier=True)
Esempio n. 8
0
def make_example(sm: SemanticModel,
                 g: Graph,
                 example_id,
                 train_sids: List[str] = None) -> Example:
    """Build a labeled Example by auto-labeling candidate graph ``g`` against
    the gold graph of ``sm``.

    :param sm: gold semantic model of the source
    :param g: candidate graph to label
    :param example_id: identifier stored in the example's metadata
    :param train_sids: ids of the training sources (stored as metadata)
    :return: the labeled Example
    :raises ValueError: if the configured auto-labeling method is unsupported
    """
    settings = Settings.get_instance()
    if settings.auto_labeling_method == Settings.ALGO_AUTO_LBL_MAX_F1:
        link2label, prime2x = AutoLabel.auto_label_max_f1(sm.graph, g,
                                                          False)[:2]
        example = Example(sm.graph, g, link2label, prime2x)
        example.set_meta(example_id, train_sids)

        return example

    # was `assert False`, which is silently stripped under `python -O`;
    # raise explicitly so a bad configuration always fails loudly
    raise ValueError(
        f"Unsupported auto labeling method: {settings.auto_labeling_method}")
Esempio n. 9
0
def semantic_labeling(dataset: str, train_sms: List[SemanticModel],
                      test_sms: List[SemanticModel]):
    """Run semantic labeling on training and testing sources.

    When ``semantic_labeling_simulate_testing`` is enabled, each training
    source is labeled by a typer trained on the *other* training sources
    (leave-one-out), so training predictions mimic testing conditions; the
    test sources are then labeled by a typer trained on all training sources.
    Otherwise a single typer labels both sets directly.
    """
    if not Settings.get_instance().semantic_labeling_simulate_testing:
        create_semantic_typer(dataset, train_sms).semantic_labeling(
            train_sms,
            test_sms,
            top_n=Settings.get_instance().semantic_labeling_top_n_stypes,
            eval_train=True)
        return

    # leave-one-out labeling of each training source
    for held_out in train_sms:
        others = [s for s in train_sms if s.id != held_out.id]
        create_semantic_typer(dataset, others).semantic_labeling(
            others, [held_out],
            top_n=Settings.get_instance().semantic_labeling_top_n_stypes,
            eval_train=False)
        # clear singleton caches so the next fold builds a fresh typer
        SemanticTyper.instance = None  # clear cache
        SemanticTypeDB.instance = None

    # label the test sources with a typer trained on all training sources
    create_semantic_typer(dataset, train_sms).semantic_labeling(
        train_sms,
        test_sms,
        top_n=Settings.get_instance().semantic_labeling_top_n_stypes,
        eval_train=False)
Esempio n. 10
0
def create_semantic_typer(dataset: str,
                          train_sms: List[SemanticModel]) -> SemanticTyper:
    """Instantiate the semantic typer selected by the global settings.

    :raises Exception: if the configured method (or, for serene types, the
        dataset/fold combination) is not recognized.
    """
    settings = Settings.get_instance()
    method = settings.semantic_labeling_method

    if method == Settings.MohsenJWS:
        # noinspection PyTypeChecker
        return MohsenSemanticTyper.get_instance(dataset, train_sms)

    if method == Settings.ReImplMinhISWC:
        return SemanticTyper.get_instance(dataset, train_sms)

    if method == Settings.MohsenJWS + "-Oracle":
        # noinspection PyTypeChecker
        return SemiOracleSemanticLabeling(
            MohsenSemanticTyper.get_instance(dataset, train_sms))

    if method == Settings.ReImplMinhISWC + "-Oracle":
        # noinspection PyTypeChecker
        return SemiOracleSemanticLabeling(
            SemanticTyper.get_instance(dataset, train_sms))

    if method == Settings.OracleSL:
        # noinspection PyTypeChecker
        return OracleSemanticLabeling()

    if method == "OracleSL-Constraint":
        # noinspection PyTypeChecker
        return ConstraintOracleSemanticLabeling()

    if method == "SereneSemanticType":
        sms = get_semantic_models(dataset)
        # (dataset, exact train slice) -> pre-computed serene k-fold directory
        known_folds = [
            ("museum_edm", sms[:14], "kfold-s01-s14"),
            ("museum_edm", sms[14:], "kfold-s15-s28"),
            ("museum_edm", sms[7:21], "kfold-s08-s21"),
            ("museum_crm", sms[:14], "kfold-s01-s14"),
            ("museum_crm", sms[14:], "kfold-s15-s28"),
            ("museum_crm", sms[7:21], "kfold-s08-s21"),
        ]
        for fold_dataset, fold_sms, fold_name in known_folds:
            if dataset == fold_dataset and train_sms == fold_sms:
                serene_dir = (
                    "/workspace/tmp/serene-python-client/datasets"
                    f"/GOLD/{fold_dataset}_stypes/{fold_name}")
                # noinspection PyTypeChecker
                return SereneSemanticTypes(dataset, Path(serene_dir))
        raise Exception("Invalid configuration of serene semantic types")

    raise Exception(
        f"Invalid semantic typer: {settings.semantic_labeling_method}")
Esempio n. 11
0
def get_sampled_data_tables(dataset: str) -> List[DataTable]:
    """Return the sampled data tables for ``dataset``, memoized in memory and
    persisted on disk so sampling only ever happens once."""
    global _data_io_vars
    memo = _data_io_vars["sampled_data_tables"]
    if dataset in memo:
        return memo[dataset]

    cache_file = get_cache_dir(dataset) / "sampled_tables.pkl"
    if cache_file.exists():
        # a previous run already sampled and serialized the tables
        tables = deserialize(cache_file)
    else:
        cfg = Settings.get_instance()
        tables = [
            tbl.sample(cfg.n_samples, cfg.random_seed)
            for tbl in get_data_tables(dataset)
        ]
        serialize(tables, cache_file)

    memo[dataset] = tables
    return memo[dataset]
Esempio n. 12
0
def get_data_constraint_model(
    dataset: str,
    train_sms: List[SemanticModel],
) -> DataConstraint:
    """Return the process-wide DataConstraint model, building it if needed.

    The model is cached twice: in the module-level ``_instance`` singleton and
    on disk. The disk cache is reused only when the dataset, the set of train
    source ids, and every relevant setting match the previous run.

    NOTE(review): once ``_instance`` is set, later calls return it regardless
    of ``dataset``/``train_sms`` — callers appear expected to use one
    configuration per process; confirm.
    """
    global _instance
    if _instance is None:
        cache_file = get_cache_dir(
            dataset, train_sms) / "weak_models" / "data_constraint.pkl"
        cache_file.parent.mkdir(exist_ok=True, parents=True)
        need_rebuilt = True

        # settings that invalidate the disk cache when changed
        settings = Settings.get_instance()
        valid_threshold = settings.data_constraint_valid_threshold
        guess_datetime_threshold = settings.data_constraint_guess_datetime_threshold
        n_comparison_samples = settings.data_constraint_n_comparison_samples
        random_seed = settings.random_seed
        n_sample = settings.n_samples

        if cache_file.exists():
            DataConstraint.logger.debug("Try to load previous run...")
            model, cached_dataset, cached_train_sm_ids, extra_args = deserialize(
                cache_file)
            # reuse only if dataset, train ids, and all settings are unchanged
            if cached_dataset == dataset \
                    and cached_train_sm_ids == {sm.id for sm in train_sms} \
                    and extra_args == (
                        valid_threshold, guess_datetime_threshold, n_comparison_samples,
                        random_seed, n_sample):
                need_rebuilt = False

        if need_rebuilt:
            DataConstraint.logger.debug("Re-build data-constraint model...")
            data_tables = [
                ColumnBasedTable.from_table(tbl)
                for tbl in get_sampled_data_tables(dataset)
            ]
            model = DataConstraint(train_sms, data_tables, valid_threshold,
                                   guess_datetime_threshold,
                                   n_comparison_samples)
            # persist the model together with everything needed to validate it
            serialize((model, dataset, {sm.id
                                        for sm in train_sms},
                       (valid_threshold, guess_datetime_threshold,
                        n_comparison_samples, random_seed, n_sample)),
                      cache_file)

        _instance = model
    return _instance
Esempio n. 13
0
 def __init__(self, dataset: str, model: TemplateLogLinearModel,
              tf_domain: GrowableBinaryVectorDomain,
              pairwise_domain: GrowableBinaryVectorDomain) -> None:
     """Wrap a trained log-linear model together with its feature domains.

     :param dataset: dataset name used to load the source semantic models
     :param model: the trained template log-linear model
     :param tf_domain: feature domain of the template factors
     :param pairwise_domain: feature domain of the pairwise factors
     """
     self.dataset = dataset
     self.source_models: Dict[str, SemanticModel] = {
         s.id: s
         for s in get_semantic_models(dataset)
     }
     # constructors for marginal and MAP inference over a factor graph
     self.inference = BeliefPropagation.get_constructor(InferProb.MARGINAL)
     self.map_inference = BeliefPropagation.get_constructor(InferProb.MAP)
     self.model: TemplateLogLinearModel = model
     # caching is disabled on cached factor templates here; reason not
     # visible from this block — presumably inputs vary per prediction
     for template in model.templates:
         if isinstance(template, CachedTemplateFactorConstructor):
             template.disable_cache()
     self.tf_domain: GrowableBinaryVectorDomain = tf_domain
     self.pairwise_domain = pairwise_domain
     # populated lazily elsewhere; None until then
     self.example_annotator: ExampleAnnotator = None
     self.max_n_tasks = Settings.get_instance().max_n_tasks
Esempio n. 14
0
def predict_sm(model: Model, dataset: str, train_sms: List[SemanticModel],
               evaluate_sms: List[SemanticModel], workdir):
    """Predict a semantic model for every source in ``evaluate_sms``.

    Candidates are generated in parallel with ``generate_candidate_sm``; the
    top-ranked graph per source becomes its prediction. Predictions, the
    per-iteration search performance, and the full search history are written
    as JSON files into ``workdir``.

    :return: dict of source id -> predicted Graph
    """
    train_sids = [sm.id for sm in train_sms]
    predictions: Dict[str, Graph] = {}
    stat = Statistic.get_instance(train_sms)

    # picklable bundle so workers can reconstruct the model
    model_bundle = (model.dataset, model.model, model.tf_domain,
                    model.pairwise_domain)
    search_performance_history = {}
    search_history = {}

    with get_pool(Settings.get_instance().parallel_n_process) as pool:
        results = []
        for sm in evaluate_sms:
            result = pool.apply_async(
                generate_candidate_sm,
                (dataset, sm, stat, model_bundle, train_sids))
            results.append(result)

        # each worker returns (ranked candidates, performance rows, history)
        pred_sms: Tuple[List[Tuple[float, Graph]],
                        List[Tuple[int, float, float, float,
                                   float]], List[List[Graph]]]
        for sm, result in zip(evaluate_sms, results):
            pred_sms = result.get()
            # best-scored candidate graph is the prediction
            predictions[sm.id] = pred_sms[0][0][1]
            search_performance_history[sm.id] = pred_sms[1]
            search_history[sm.id] = pred_sms[2]

    serializeJSON({sid: o.to_dict()
                   for sid, o in predictions.items()},
                  workdir / "predicted_sms.json")
    serializeJSON(search_performance_history,
                  workdir / "search_performance_history.json",
                  indent=4)
    serializeJSON(
        {
            sid: [[o.to_dict() for o in os] for os in oss]
            for sid, oss in search_history.items()
        }, workdir / "search_history.json")
    return predictions
Esempio n. 15
0
    def __init__(self, all_children_weights: DenseTensor,
                 pairwise_pk_weights: DenseTensor,
                 pairwise_scope_weights: DenseTensor,
                 duplication_weights: Dict[str, DenseTensor],
                 pairwise_domain: GrowableBinaryVectorDomain[str]) -> None:
        """Hold the learned weight tensors of the model's factor families.

        :param all_children_weights: weights of the all-children factors
        :param pairwise_pk_weights: weights of the pairwise primary-key factors
        :param pairwise_scope_weights: weights of the pairwise scope factors
        :param duplication_weights: per-key weights of the duplication factors
        :param pairwise_domain: feature domain shared by the pairwise factors
        """
        self.settings = Settings.get_instance()

        self.all_children_weights: Weights = Weights(all_children_weights)
        self.pairwise_pk_weights: Weights = Weights(pairwise_pk_weights)
        self.pairwise_scope_weights: Weights = Weights(pairwise_scope_weights)
        self.duplication_weights: Dict[str, Weights] = {
            k: Weights(v)
            for k, v in duplication_weights.items()
        }

        self.pairwise_domain = pairwise_domain
        self.boolean_domain = BooleanVectorDomain.get_instance()
        # use to compute pairwise factor's feature tensor
        # similar to DotTensor1WithSufficientStatisticFactor.get_feature_tensor#domain_tensor
        # 4x4 identity reshaped to (4, 4, 1): one indicator per pairwise state
        self.pairwise_indice_func_tensor = DenseTensorFunc.from_array(
            [[1, 0, 0, 0], [0, 1, 0, 0], [0, 0, 1, 0], [0, 0, 0,
                                                        1]]).view(4, 4, 1)
Esempio n. 16
0
def serialize_stype_assistant(dataset: str, sms: List[SemanticModel],
                              train_sms: List[SemanticModel],
                              test_sms: List[SemanticModel]):
    """Predict parent semantic types and convert them into plain dicts.

    Returns one dict per semantic model in ``sms`` (an empty dict when no
    prediction exists for it), mapping attribute label to a list of
    {"stype": ..., "parent_stypes": [...]} entries.
    """
    predicted_parent_stypes = SemanticTyper.get_instance(
        dataset, train_sms).semantic_labeling_parent(
            train_sms,
            test_sms,
            top_n=Settings.get_instance().semantic_labeling_top_n_stypes,
            eval_train=True)

    results = []
    for sm in sms:
        if sm.id not in predicted_parent_stypes:
            results.append({})
            continue

        sm_result = {}
        for attr_id, g_parent_stypes in predicted_parent_stypes[sm.id].items():
            entries = []
            for stype, score, parent_stypes in g_parent_stypes:
                # missing parent stypes are serialized as empty strings
                parents = []
                for parent_stype, parent_stype_score in parent_stypes:
                    parents.append({
                        "domain":
                        parent_stype[0] if parent_stype is not None else "",
                        "type":
                        parent_stype[1] if parent_stype is not None else "",
                        "confidence_score":
                        parent_stype_score
                    })
                entries.append({
                    "stype": {
                        "domain": stype[0],
                        "type": stype[1],
                        "confidence_score": score
                    },
                    "parent_stypes": parents
                })
            sm_result[sm.graph.get_node_by_id(attr_id).label] = entries
        results.append(sm_result)

    return results
Esempio n. 17
0
    def __init__(self, dataset: str, train_source_ids: List[str]) -> None:
        """Spawn the worker processes that annotate examples in parallel.

        Each worker gets its own bidirectional pipe; after all workers are
        started, every one receives a "start" message carrying the dataset and
        train source ids so it can build its own annotator.
        """
        self.n_processes = Settings.get_instance().parallel_n_annotators
        # local annotator kept in the parent process as well
        self.annotator = ExampleAnnotator(dataset, train_source_ids)

        self.processes = []
        for i in range(self.n_processes):
            parent_conn, child_conn = multiprocessing.Pipe()
            process = multiprocessing.Process(
                target=ParallelAnnotator.parallel_annotate,
                name=f"parallel-annotator-{i}",
                args=(child_conn, ))
            self.processes.append({
                "parent_conn": parent_conn,
                "child_conn": child_conn,
                "process": process
            })
            process.start()

        # tell every worker to initialize itself with this configuration
        for proc_info in self.processes:
            proc_info['parent_conn'].send({
                "message": "start",
                "dataset": dataset,
                "train_sm_ids": train_source_ids
            })
Esempio n. 18
0
def generate_candidate_sm(dataset: str, test_sm: SemanticModel,
                          stat: Statistic, model_bundle, train_source_ids):
    """Beam-search candidate semantic models for ``test_sm`` and report their
    quality against the gold model.

    :param dataset: dataset name used to load the ontology
    :param test_sm: the source to predict a semantic model for
    :param stat: triple statistics collected from the training sources
    :param model_bundle: (dataset, model, tf_domain, pairwise_domain) tuple
        used to reconstruct the scoring ``Model`` (e.g. inside a worker)
    :param train_source_ids: ids of the training sources
    :return: a triple ``(pred_sms, performances, search_history)`` where
        ``pred_sms`` is [(score, graph)] for the final results,
        ``performances`` is [(iter_no, score, precision, recall, f1)] for the
        best node of each search iteration plus the final results, and
        ``search_history`` is the list of explored graphs per iteration with
        the final result graphs appended last.
    """
    ont = get_ontology(dataset)
    ont_graph = get_ont_graph(dataset)
    settings = Settings.get_instance()

    # attribute label (utf-8 bytes) -> candidate semantic types
    dnodes: Dict[bytes, List[KarmaSemanticType]] = {
        attr.label.encode('utf-8'): attr.semantic_types
        for attr in test_sm.attrs
    }

    ota = EmpiricalTripleAdviser(
        ont_graph, ont, stat.p_triple,
        settings.searching_triple_adviser_max_candidate)
    graph_explorer_builder = GraphExplorerBuilder(
        ota,
        max_data_node_hop=settings.searching_max_data_node_hop,
        max_class_node_hop=settings.searching_max_class_node_hop)

    for attr, semantic_types in dnodes.items():
        ota.add_data_node(attr, semantic_types)

    model = Model(*model_bundle)
    args = PGMBeamSearchArgs(
        test_sm.id,
        discovering_func,
        # track search nodes so we can report per-iteration performance below
        Tracker(track_search_nodes=True),
        partial(model.predict_sm_probs, test_sm.id, train_source_ids),
        graph_explorer_builder,
        early_terminate_func=None,
        beam_width=settings.searching_beam_width,
        gold_sm=test_sm.graph,
        source_attributes=test_sm.attrs,
        pre_filter_func=filter_unlikely_graph,
    )
    started_nodes = [
        PGMStartSearchNode(args.get_and_increment_id(), args,
                           [a.label.encode('utf-8') for a in test_sm.attrs])
    ]

    results: List[PGMSearchNode] = beam_search(
        started_nodes,
        beam_width=settings.searching_beam_width,
        n_results=settings.searching_n_explore_result,
        args=args)

    # report performance of the best graph at the previous and final iteration
    print(
        f"{test_sm.id}: Performance at prev iter:",
        smodel_eval.f1_precision_recall(
            test_sm.graph,
            args.tracker.list_search_nodes[-1][0].get_value().graph,
            DataNodeMode.NO_TOUCH))
    print(
        f"{test_sm.id}: Performance at final iter:",
        smodel_eval.f1_precision_recall(test_sm.graph,
                                        results[0].get_value().graph,
                                        DataNodeMode.NO_TOUCH))

    # performance of the top-ranked candidate at every search iteration
    performances = []
    for iter_no, search_nodes in enumerate(args.tracker.list_search_nodes):
        if len(search_nodes) == 0:
            continue

        x = smodel_eval.f1_precision_recall(test_sm.graph,
                                            search_nodes[0].get_value().graph,
                                            DataNodeMode.NO_TOUCH)
        performances.append((iter_no, search_nodes[0].get_score(),
                             x['precision'], x['recall'], x['f1']))

    # append the performance of the final best result as the last row
    x = smodel_eval.f1_precision_recall(test_sm.graph,
                                        results[0].get_value().graph,
                                        DataNodeMode.NO_TOUCH)
    performances.append((len(performances), results[0].get_score(),
                         x['precision'], x['recall'], x['f1']))

    pred_sms = [(search_node.get_score(), search_node.get_value().graph)
                for search_node in results]
    search_history = [[n.get_value().graph for n in search_nodes]
                      for search_nodes in args.tracker.list_search_nodes]
    search_history.append([n.get_value().graph for n in results])

    return pred_sms, performances, search_history
Esempio n. 19
0
    test_examples = []
    for sid in search_history:
        for i, gs in enumerate(search_history[sid]):
            for j, g in enumerate(gs):
                eid = Example.generate_example_id(sid, j, i)
                example = make_example(evaluate_sms[sid], Graph.from_dict(g),
                                       eid, train_sm_ids)
                test_examples.append(example)

    serializeJSON(test_examples, workdir / "examples" / "test.json")
    return test_examples


if __name__ == '__main__':
    dataset = "museum_edm"
    Settings.get_instance(False).parallel_n_process = 6
    Settings.get_instance().max_n_tasks = 160
    Settings.get_instance().semantic_labeling_top_n_stypes = 4
    Settings.get_instance().searching_beam_width = 5
    Settings.get_instance().log_current_settings()

    source_models = get_semantic_models(dataset)
    train_sms = source_models[:6]
    test_sms = [sm for sm in source_models if sm not in train_sms]

    workdir = Path(config.fsys.debug.as_path(
    )) / dataset / "main_experiments" / get_short_train_name(train_sms)
    workdir.mkdir(exist_ok=True, parents=True)

    create_semantic_typer(dataset, train_sms).semantic_labeling(
        train_sms,
Esempio n. 20
0
                        help='Number of samples')
    parser.add_argument('--seed',
                        type=int,
                        required=True,
                        default=120,
                        help='Random seed')

    args = parser.parse_args()
    return args


if __name__ == '__main__':
    args = get_shell_args()
    dataset = args.dataset

    settings = Settings.get_instance(False)
    settings.n_samples = args.n_samples
    settings.random_seed = args.seed
    settings.log_current_settings()

    ont = get_ontology(dataset)
    source_dir = Path(
        config.datasets[dataset].as_path()) / "karma-version" / "sources"
    source_dir.mkdir(exist_ok=True, parents=True)
    meta_file = source_dir / ".meta"

    if meta_file.exists():
        meta = deserializeJSON(meta_file)

        if meta['n_samples'] == settings.n_samples and meta[
                'random_seed'] == settings.random_seed:
Esempio n. 21
0
    return "scores_%s---%s" % (domain, type)


if __name__ == '__main__':
    dataset = "museum_edm"
    sms = get_semantic_models(dataset)
    ont = get_ontology(dataset)

    serene_dir = Path(
        "/workspace/tmp/serene-python-client/datasets/") / dataset
    top_n = 4
    normalize_score = False
    for semantic_type in [
            "MohsenJWS", "ReImplMinhISWC", "OracleSL-Constraint"
    ]:
        Settings.get_instance(False).semantic_labeling_method = semantic_type

        for kfold in ["kfold-s01-s14", "kfold-s15-s28", "kfold-s08-s21"]:
            serene_stypes = {}
            for sm in sms:
                for n in sm.graph.iter_data_nodes():
                    e = n.get_first_incoming_link()
                    stype = get_serene_style(
                        e.get_source_node().label.decode(), e.label.decode())
                    if stype not in serene_stypes:
                        serene_stypes[stype] = len(serene_stypes)
            serene_stypes["scores_unknown"] = len(serene_stypes)

            header = [
                "", "column_id", "column_name", "confidence", "dataset_id",
                "model_id", "label", "user_label"
Esempio n. 22
0
        raise

    return args


if __name__ == '__main__':
    args = get_shell_args()
    # split the dataset's sources into train/test folds per the CLI k-fold spec
    source_models: List[SemanticModel] = get_semantic_models(args.dataset)
    train_sms = [
        sm for sm in source_models if sm.id in args.kfold['train_sm_ids']
    ]
    test_sms = [
        sm for sm in source_models if sm.id in args.kfold['test_sm_ids']
    ]

    # select the typer implementation globally before constructing it
    Settings.get_instance(False).semantic_labeling_method = args.semantic_typer
    Settings.get_instance().log_current_settings()

    # label both folds with top-4 candidate types, evaluating on train too
    typer = create_semantic_typer(args.dataset, train_sms)
    typer.semantic_labeling(train_sms, test_sms, 4, eval_train=True)

    # write per-fold evaluation CSVs named after the typer and train fold
    exp_dir = Path(args.exp_dir)
    eval_sources(
        train_sms, exp_dir /
        f"{typer.__class__.__name__}_{get_short_train_name(train_sms)}_eval.train.csv"
    )
    eval_sources(
        test_sms, exp_dir /
        f"{typer.__class__.__name__}_{get_short_train_name(train_sms)}_eval.test.csv"
    )
Esempio n. 23
0
                           for n in sm.graph.iter_nodes()})
        example.set_meta(Example.generate_example_id(sm.id, 0, 0),
                         [sm.id for sm in train_sms])
        train_examples.append(example)

    raw_model, tf_domain, pairwise_domain, __ = train_model(
        dataset, [sm.id for sm in train_sms], 120, train_examples, [],
        training_args, basedir)
    return Model(dataset, raw_model, tf_domain, pairwise_domain)


if __name__ == '__main__':
    from semantic_modeling.assembling.learning.evaluate import predict_sm
    # DenseTensorFunc.set_default_type(DType.Double)

    Settings.get_instance(False).parallel_gmtk_n_threads = 12
    Settings.get_instance().log_current_settings()

    dataset = "museum_edm"
    source_models = get_semantic_models(dataset)
    train_sms = source_models[:6]
    train_sm_ids = [sm.id for sm in train_sms]
    test_sms = [sm for sm in source_models if sm.id not in train_sm_ids]

    workdir = Path(config.fsys.debug.as_path(
    )) / dataset / "main_experiments" / get_short_train_name(train_sms)
    workdir.mkdir(exist_ok=True, parents=True)

    create_semantic_typer(dataset, train_sms).semantic_labeling(
        train_sms,
        test_sms,
Esempio n. 24
0
def run_evaluation_workflow(dataset: str, scenario: Scenario, train_sms,
                            test_sms):
    """Run the end-to-end Karma/Mohsen semantic-modeling evaluation pipeline.

    Pipeline (mirrors the STEP comments below):
      1. semantic labeling: predict semantic types for all attributes and
         write them into a temporary Karma model-json directory;
      2. run the Mohsen semantic modeler to predict full semantic models;
      3. score the predicted models (precision/recall/F1/stype accuracy)
         and serialize the results as CSV;
      4. score the raw semantic-labeling output on the test sources;
      5. render colorized prediction-vs-gold visualizations as PNGs.

    :param dataset: dataset name used to load the ontology, Karma models and
        semantic models.
    :param scenario: evaluation scenario; SCENARIO_1 ignores data nodes when
        scoring, otherwise data nodes are scored as-is.
    :param train_sms: training semantic models (presumably
        List[SemanticModel] aligned with `dataset` — TODO confirm).
    :param test_sms: held-out semantic models to predict and evaluate.
    :return: eval_hist — list of rows [source, precision, recall, f1,
        stype-acc], with a header row first and an 'average' row last.

    Side effects: deletes and recreates `sm_type_dir`, writes model-json,
    CSVs and PNGs under `exec_dir`, and mutates the loaded Karma models'
    node semantic types in place.
    """
    ont: Ontology = get_ontology(dataset)
    karma_models: List[KarmaModel] = get_karma_models(dataset)
    semantic_models: List[SemanticModel] = get_semantic_models(dataset)
    train_sm_ids = [sm.id for sm in train_sms]

    sdesc_args = dict(
        dataset=dataset,
        train_sm_ids=train_sm_ids,
        use_correct_type=
        False,  # we always put semantic types to learnedSemanticTypes, even for userSetSemanticTypes
        use_old_semantic_typer=False,
        exec_dir=get_cache_dir(dataset, train_sms) / "mohsen_jws2015",
        sm_type_dir=Path(config.fsys.debug.as_path()) / "tmp" /
        "models-json-temp")
    # STEP 1: run semantic typing to generate semantic typing and put result to a temporal folder
    # Start from a clean temp dir so stale model-json files from a previous
    # run cannot leak into this evaluation.
    if sdesc_args['sm_type_dir'].exists():
        shutil.rmtree(sdesc_args['sm_type_dir'])
    sdesc_args['sm_type_dir'].mkdir(exist_ok=True, parents=True)

    top_k_types = Settings.get_instance().semantic_labeling_top_n_stypes
    typer = create_semantic_typer(dataset, train_sms)
    typer.semantic_labeling(train_sms, test_sms, top_k_types, eval_train=True)

    # NOTE(review): zip assumes semantic_models and karma_models are parallel
    # lists (same order, same sources) — invariant maintained by the loaders.
    for sm, ksm in zip(semantic_models, karma_models):
        # assign semantic types to learnedSemanticTypes
        sm_alignment = SemanticModelAlignment(sm, ksm)
        for col in ksm.source_columns:
            # Map the Karma source column back to the corresponding attribute
            # of the semantic model via the alignment, then copy its predicted
            # types onto the Karma graph node.
            attr = sm.get_attr_by_label(
                sm.graph.get_node_by_id(
                    sm_alignment.alignment[col.id]).label.decode('utf-8'))
            node = ksm.karma_graph.get_node_by_id(col.id)
            link = node.get_first_incoming_link()

            # Predicted types go to learnedSemanticTypes; the gold type
            # (taken from the node's incoming link) is recorded as the user
            # type with full confidence.
            node.learned_semantic_types = [
                KarmaSemanticType(node.id, stype.domain, stype.type,
                                  typer.__class__.__name__,
                                  stype.confidence_score)
                for stype in attr.semantic_types
            ]
            node.user_semantic_types = [
                KarmaSemanticType(node.id,
                                  link.get_source_node().label.decode(),
                                  link.label.decode(), "User", 1.0)
            ]

        serializeJSON(ksm.to_normalized_json_model(ont),
                      sdesc_args['sm_type_dir'] / f"{ksm.id}-model.json",
                      indent=4)

    # STEP 2: invoking semantic modeling
    modeler = MohsenSemanticModeling(**sdesc_args)
    pred_sms = modeler.sm_prediction(train_sms, test_sms)

    # STEP 3: prediction semantic mapping result
    eval_hist = [["source", "precision", "recall", "f1", "stype-acc"]]
    if scenario == Scenario.SCENARIO_1:
        # Scenario 1 evaluates structure only: data nodes are excluded.
        data_node_mode = DataNodeMode.IGNORE_DATA_NODE
    else:
        data_node_mode = DataNodeMode.NO_TOUCH

    for sm, pred_sm in zip(test_sms, pred_sms):
        eval_result = smodel_eval.f1_precision_recall(sm.graph, pred_sm.graph,
                                                      data_node_mode)
        eval_hist.append([
            sm.id, eval_result["precision"], eval_result["recall"],
            eval_result["f1"],
            smodel_eval.stype_acc(sm.graph, pred_sm.graph)
        ])

    # Append a macro-average row over all per-source rows (skip the header).
    eval_hist.append([
        'average',
        np.average([float(x[1]) for x in eval_hist[1:]]),
        np.average([float(x[2]) for x in eval_hist[1:]]),
        np.average([float(x[3]) for x in eval_hist[1:]]),
        np.average([float(x[4]) for x in eval_hist[1:]])
    ])
    serializeCSV(
        eval_hist,
        sdesc_args["exec_dir"] / f"evaluation_result_{scenario.value}.csv")

    # STEP 4: prediction semantic labeling result
    pred_stypes = modeler.semantic_labeling(train_sms, test_sms)
    for pred_stype, sm in zip(pred_stypes, test_sms):
        for attr in sm.attrs:
            # Overwrite the attributes' types with the modeler's predictions;
            # attributes with no prediction get an empty type list.
            if attr.label not in pred_stype:
                attr.semantic_types = []
            else:
                attr.semantic_types = pred_stype[attr.label]
    eval_sources(
        test_sms, sdesc_args["exec_dir"] /
        f"evaluation_result_{scenario.value}_stype.csv")

    # STEP 5: visualize the prediction
    (sdesc_args['exec_dir'] / "prediction-viz").mkdir(exist_ok=True)
    # Each item pairs a colorized (auto-labeled vs gold) graph with its
    # output PNG path; rendering is fanned out over a thread pool since
    # graph rendering is I/O-bound.
    need_render_graphs = [
        (colorize_prediction(
            pred_sm.graph,
            AutoLabel.auto_label_max_f1(sm.graph, pred_sm.graph, False)[0]),
         sdesc_args['exec_dir'] / "prediction-viz" / f"{sm.id}.png")
        for sm, pred_sm in zip(test_sms, pred_sms)
    ]
    with ThreadPool(32) as p:
        p.map(render_graph, need_render_graphs)

    return eval_hist