Beispiel #1
0
def make_example(sm: SemanticModel,
                 g: Graph,
                 example_id,
                 train_sids: List[str] = None) -> Example:
    """Build an ``Example`` for graph ``g`` labelled against ``sm.graph``.

    The link labels and the node mapping are produced by the auto-labelling
    algorithm selected in ``Settings``; only ``ALGO_AUTO_LBL_MAX_F1`` is
    handled here.

    Args:
        sm: semantic model; its ``graph`` is the first argument given to the
            labeller and to the ``Example`` constructor.
        g: graph to be labelled.
        example_id: identifier stored on the example through ``set_meta``.
        train_sids: ids of the training sources, stored through ``set_meta``.

    Returns:
        The newly created example.

    Raises:
        AssertionError: if the configured auto-labelling method is not
            ``Settings.ALGO_AUTO_LBL_MAX_F1``.
    """
    method = Settings.get_instance().auto_labeling_method
    if method != Settings.ALGO_AUTO_LBL_MAX_F1:
        # Raised explicitly rather than via `assert False`: assert statements
        # are removed under `python -O`, which would make this function
        # silently return None for an unsupported method.
        raise AssertionError(
            f"Unsupported auto labeling method: {method!r}")

    # only the first two items of the labeller's result are needed
    link2label, prime2x = AutoLabel.auto_label_max_f1(sm.graph, g, False)[:2]
    example = Example(sm.graph, g, link2label, prime2x)
    example.set_meta(example_id, train_sids)
    return example
Beispiel #2
0
def build_test_data(model: Model, dataset: str, train_sms: List[SemanticModel],
                    discover_sources: List[SemanticModel], output_dir: Path,
                    n_iter):
    """Generate test examples for ``discover_sources`` and dump them as JSON.

    For every source the example set starts with the source's own graph
    (id built from ``(sm.id, 0, 0)``) and is then extended with the candidates
    returned by ``generate_data`` that are not already present. The examples,
    sorted by ``example_id``, are written to
    ``<output_dir>/examples/test.<n_iter>.json``.

    Note that ``generate_data`` is always invoked with iteration number 1;
    ``n_iter`` only affects the output file name.
    """
    examples_by_sid: Dict[str, Dict[bytes, Example]] = {
        source.id: {}
        for source in discover_sources
    }
    discover_sids = {source.id for source in discover_sources}

    examples_dir = output_dir / "examples"
    examples_dir.mkdir(exist_ok=True, parents=True)

    # seed each source with the example built from its own graph
    for source in discover_sources:
        seed_example = make_example(
            source, source.graph, Example.generate_example_id(source.id, 0, 0),
            [train_sm.id for train_sm in train_sms])
        seed_key = graph_to_hashable_string(source.graph)
        examples_by_sid[source.id][seed_key] = seed_example

    # add generated candidates, never overwriting an example already stored
    generated = generate_data(model, dataset, train_sms, discover_sources, 1)
    for source in discover_sources:
        known = examples_by_sid[source.id]
        for key, candidate in generated[source.id].items():
            known.setdefault(key, candidate)

    collected = []
    for sid in discover_sids:
        collected.extend(examples_by_sid[sid].values())
    test_examples = sorted(collected, key=lambda ex: ex.example_id)

    serializeJSON(test_examples, examples_dir / f"test.{n_iter}.json")
Beispiel #3
0
def generate_data(model: Model, dataset: str, train_sms: List[SemanticModel],
                  discover_sources: List[SemanticModel], n_iter):
    """Generate candidate examples for each source in ``discover_sources``.

    One ``generate_candidate_sm`` job per source is submitted to a process
    pool; each returned candidate graph is then turned into an ``Example``
    via ``make_example``.

    Returns:
        A dict mapping source id to a dict of candidate key -> ``Example``.
        The candidate index inside the job result and ``n_iter`` are used to
        build each example id.
    """
    data = {}
    stat = Statistic.get_instance(train_sms)
    train_sids = [sm.id for sm in train_sms]
    # the pieces of the model that are shipped to the worker processes
    model_bundle = (model.dataset, model.model, model.tf_domain,
                    model.pairwise_domain)

    with get_pool(Settings.get_instance().parallel_n_process) as pool:
        # submit everything first so the jobs can run concurrently
        pending_jobs = [
            pool.apply_async(
                generate_candidate_sm,
                (dataset, source, stat, model_bundle, train_sids))
            for source in discover_sources
        ]

        for source, job in zip(discover_sources, pending_jobs):
            candidate_graphs = job.get()
            data[source.id] = {
                key: make_example(
                    source, graph,
                    Example.generate_example_id(source.id, idx, n_iter),
                    train_sids)
                for idx, (key, graph) in enumerate(candidate_graphs.items())
            }

    return data
Beispiel #4
0
def create_default_model(dataset: str, train_sms: List[SemanticModel],
                         training_args, basedir: Path) -> Model:
    """Train an initial model from the graphs of ``train_sms`` alone.

    Each training source yields one example in which its graph is compared
    with itself: every link is labelled ``True`` and every node id maps to
    itself. The examples are passed to ``train_model`` with an empty list as
    the fifth argument, and the result is wrapped in a ``Model``.
    """
    train_sids = [sm.id for sm in train_sms]

    train_examples = []
    for sm in train_sms:
        all_links_true = {link.id: True for link in sm.graph.iter_links()}
        identity_mapping = {node.id: node.id for node in sm.graph.iter_nodes()}
        example = Example(sm.graph, sm.graph, all_links_true, identity_mapping)
        # each example receives its own copy of the id list
        example.set_meta(Example.generate_example_id(sm.id, 0, 0),
                         list(train_sids))
        train_examples.append(example)

    training_result = train_model(dataset, train_sids, 120, train_examples,
                                  [], training_args, basedir)
    # the fourth item of the result is not needed here
    raw_model, tf_domain, pairwise_domain, _unused = training_result
    return Model(dataset, raw_model, tf_domain, pairwise_domain)
Beispiel #5
0
def online_learning(model: Model,
                    dataset: str,
                    train_sms: List[SemanticModel],
                    discover_sources: List[SemanticModel],
                    output_dir: Path,
                    training_args,
                    iter_range=(1, 3)):
    """Alternately generate candidate examples and retrain the model.

    For each iteration in ``range(*iter_range)``:

    1. ``generate_data`` is run with the current model;
    2. previously unseen candidates are added to the per-source example pool
       (a source that yields nothing new is no longer merged afterwards);
    3. the pooled examples of the training sources that are also discovery
       sources are written to ``examples/train.<n_iter>.json`` (and copied to
       ``examples/train.json``);
    4. a new model is trained on them and replaces the current one.

    Returns:
        The model produced by the last iteration, or the input ``model`` when
        the range is empty.
    """
    pool: Dict[str, Dict[bytes, Example]] = {
        source.id: {}
        for source in discover_sources
    }
    discover_sids = {source.id for source in discover_sources}
    # sources that stopped producing new candidates in an earlier iteration
    exhausted_sids = set()
    logger = get_logger("app")
    examples_dir = output_dir / "examples"
    examples_dir.mkdir(exist_ok=True, parents=True)

    # every pool starts with the example built from the source's own graph
    for source in discover_sources:
        seed_example = make_example(
            source, source.graph, Example.generate_example_id(source.id, 0, 0),
            [train_sm.id for train_sm in train_sms])
        pool[source.id][graph_to_hashable_string(source.graph)] = seed_example

    for n_iter in range(*iter_range):
        logger.info("==================================> Iter: %s", n_iter)
        generated = generate_data(model, dataset, train_sms, discover_sources,
                                  n_iter)

        for source in discover_sources:
            if source.id in exhausted_sids:
                continue

            known = pool[source.id]
            unseen = {
                key: candidate
                for key, candidate in generated[source.id].items()
                if key not in known
            }
            if not unseen:
                logger.info("No new candidate for source: %s", source.id)
                exhausted_sids.add(source.id)
                continue
            known.update(unseen)

        train_examples = []
        for train_sm in train_sms:
            if train_sm.id in discover_sids:
                train_examples.extend(pool[train_sm.id].values())
        train_examples.sort(key=lambda ex: ex.example_id)

        snapshot_file = examples_dir / f"train.{n_iter}.json"
        serializeJSON(train_examples, snapshot_file)
        shutil.copyfile(snapshot_file, examples_dir / "train.json")

        training_result = train_model(
            dataset, [train_sm.id for train_sm in train_sms], 120,
            train_examples, [], training_args, output_dir / "models")
        raw_model, tf_domain, pairwise_domain, _unused = training_result
        model = Model(dataset, raw_model, tf_domain, pairwise_domain)

    return model
Beispiel #6
0
    def predict_sm_probs(self, sm_id: str, train_sm_ids: List[str],
                         gs: List[MergeGraph]):
        """Wrap each graph in ``gs`` in an ``Example`` and score them.

        Every link of a graph is labelled ``True``; the examples carry no
        first graph and no node mapping (both ``None``). All examples get the
        id built from ``(sm_id, 0, 0)`` before being handed to
        ``predict_probs``, whose result is returned.
        """
        examples = []
        for graph in gs:
            all_links_true = {link.id: True for link in graph.iter_links()}
            examples.append(Example(None, graph, all_links_true, None))

        for example in examples:
            example_id = Example.generate_example_id(sm_id, 0, 0)
            example.set_meta(example_id, train_sm_ids)

        return self.predict_probs(examples)
Beispiel #7
0
def make_test_from_prediction(train_sms: List[SemanticModel],
                              evaluate_sms: List[SemanticModel], workdir: Path,
                              model_dir: Path):
    """Turn a saved search history into test examples and write them to disk.

    ``<model_dir>/search_history.json`` is read as a mapping from source id to
    a list of lists of serialized graphs. Each graph is rebuilt with
    ``Graph.from_dict`` and converted with ``make_example`` against the
    semantic model in ``evaluate_sms`` that has the same id.

    Args:
        train_sms: training sources; only their ids are used.
        evaluate_sms: sources the search history refers to.
        workdir: the examples are written to ``<workdir>/examples/test.json``.
        model_dir: directory containing ``search_history.json``.

    Returns:
        The list of created examples, in search-history order.

    Raises:
        KeyError: if the history contains a graph for a source id that is not
            in ``evaluate_sms``.
    """
    search_history: Dict[str, List[List[dict]]] = deserializeJSON(
        model_dir / "search_history.json")
    # a separate name is used so the list parameter is not rebound to a dict
    sm_by_id = {sm.id: sm for sm in evaluate_sms}
    train_sm_ids = [sm.id for sm in train_sms]

    test_examples = []
    for sid, graph_batches in search_history.items():
        for i, gs in enumerate(graph_batches):
            for j, g in enumerate(gs):
                # the position inside the inner list comes before the outer index
                eid = Example.generate_example_id(sid, j, i)
                test_examples.append(
                    make_example(sm_by_id[sid], Graph.from_dict(g), eid,
                                 train_sm_ids))

    # Ensure the target directory exists, as the other writers of the
    # "examples" directory in this file do before serializing.
    examples_dir = workdir / "examples"
    examples_dir.mkdir(exist_ok=True, parents=True)
    serializeJSON(test_examples, examples_dir / "test.json")
    return test_examples
Beispiel #8
0
    def get_stype_score(self, example: Example) -> Dict[int, float]:
        """Return a map from data node id to its semantic type score.

        For each data node of ``example.pred_sm`` the semantic types of the
        attribute with the same label are scanned; the confidence score of
        the first one whose domain and type equal the labels of the node's
        parent and of its first incoming link is stored. Nodes without such
        a semantic type are mapped to ``None``.
        """
        source_desc = self.source_models[example.get_model_id()]

        scores = {}
        for data_node in example.pred_sm.iter_data_nodes():
            in_link = data_node.get_first_incoming_link()
            parent = in_link.get_source_node()
            attr = source_desc.get_attr_by_label(
                data_node.label.decode("utf-8"))

            # first matching semantic type wins; None when nothing matches
            scores[data_node.id] = next(
                (stype.confidence_score for stype in attr.semantic_types
                 if stype.domain.encode("utf-8") == parent.label
                 and stype.type.encode("utf-8") == in_link.label), None)
        return scores
Beispiel #9
0
    def annotate(self, example: Example) -> Example:
        """Compute node and link features for ``example`` and store them on it.

        Sets ``example.annotator`` to this object, resets
        ``example.node2features`` and ``example.link2features``, then fills
        them: node features for every node id returned by ``self.node_prob``,
        link features for every outgoing link of each class node in
        ``example.pred_sm``.

        Returns:
            The same ``example`` object, mutated in place.

        Raises:
            AssertionError: if the example's model id is not a key of
                ``self.source_models``.
        """
        # STEP 1: adding semantic types is not needed here, because the example must be either in train or test
        sm_id: str = example.get_model_id()
        assert sm_id in self.source_models
        example.annotator = self

        # NOTE(review): `is_train_example` is only referenced by the disabled
        # data-constraint code further down.
        is_train_example: bool = sm_id in self.train_source_ids
        source: SemanticModel = self.source_models[sm_id]

        # id2attrs: Dict[int, Attribute] = {attr.id: attr for attr in sources[sm_id].attrs}
        example.node2features = {}
        example.link2features = {}
        # data node id => score of the matching semantic type (or None)
        stype_score = self.get_stype_score(example)

        # add node features from node_prob weak model
        node_prob_features = self.node_prob.feature_extraction(
            example.pred_sm, stype_score)
        node_probs = self.node_prob.compute_prob(node_prob_features)
        for nid, prob in node_probs.items():
            # copy the extracted features, then add the computed probability
            example.node2features[nid] = dict(node_prob_features[nid])
            example.node2features[nid]['node_prob'] = prob

        # looked up by link id below; links it does not contain get None
        stype_assistant = self.stype_assistant.compute_prob(
            sm_id, example.pred_sm)

        # add link features
        for node in example.pred_sm.iter_class_nodes():
            outgoing_links = list(node.iter_outgoing_links())
            # indexed by link id; passed to the multi-value predicate below
            numbered_links = numbering_link_labels(outgoing_links)

            for link in outgoing_links:
                target = link.get_target_node()
                # every feature defaults to None and is only set by the
                # branch that applies to this link
                total_stype_score = None
                delta_stype_score = None
                ratio_stype_score = None
                p_link_given_so = None
                p_triple = None
                stype_order = None
                # NOTE(review): only referenced by the disabled code below
                data_constraint_features = {}

                if target.is_class_node():
                    # link between two class nodes: probability comes from
                    # self.statistic, with 0.5 used as the default
                    p_link_given_so = self.statistic.p_l_given_so(
                        node.label, link.label, target.label,
                        default=0.5)  # half half
                    p_triple = p_link_given_so * example.node2features[
                        link.source_id]['node_prob'] * example.node2features[
                            link.target_id]['node_prob']
                else:
                    # target is not a class node: use the semantic types of
                    # the attribute that has the same label as the target
                    target_stypes = source.get_attr_by_label(
                        target.label.decode("utf-8")).semantic_types
                    n_target_stypes = len(target_stypes)
                    total_stype_score = sum(stype.confidence_score
                                            for stype in target_stypes)

                    # find the first semantic type matching (node, link)
                    for i, stype in enumerate(target_stypes):
                        if stype.domain.encode(
                                "utf-8") == node.label and stype.type.encode(
                                    "utf-8") == link.label:
                            # data node, p_link = score of semantic type
                            p_link_given_so = stype.confidence_score
                            if i == 0 and n_target_stypes > 1:
                                # first entry matched: gap to the second entry
                                delta_stype_score = stype.confidence_score - target_stypes[
                                    1].confidence_score
                            else:
                                # otherwise: gap to the first entry (0 when
                                # the match is the only entry)
                                delta_stype_score = stype.confidence_score - target_stypes[
                                    0].confidence_score

                            # NOTE(review): raises ZeroDivisionError if the
                            # first entry's confidence_score is 0 -- confirm
                            # that cannot happen
                            ratio_stype_score = stype.confidence_score / target_stypes[
                                0].confidence_score
                            stype_order = i
                            break

                    # no matching semantic type leaves p_triple as None
                    if p_link_given_so is not None:
                        p_triple = p_link_given_so * example.node2features[
                            link.source_id]['node_prob']

                    # add data constraint
                    # if is_train_example:
                    #     # we can use link2label, because of known models
                    #     data_constraint_features = self.data_constraint.extract_feature(sm_id, example.pred_sm, target.id,
                    #                                                                     example.link2label)
                    # else:
                    #     data_constraint_features = self.data_constraint.extract_feature(sm_id, example.pred_sm, target.id)

                example.link2features[link.id] = {
                    'p_triple': p_triple,
                    'p_link_given_so': p_link_given_so,
                    'total_stype_score': total_stype_score,
                    'stype_order': stype_order,
                    'delta_stype_score': delta_stype_score,
                    'ratio_stype_score': ratio_stype_score,
                    # 'local_constraint': data_constraint_features.get("local", None),
                    # 'global_constraint': data_constraint_features.get("global", None),
                    'stype_prob': stype_assistant.get(link.id, None)
                }

                # the key is only added when a probability is available
                multi_val_prob = self.multival_predicate.compute_prob(
                    link.label, numbered_links[link.id])
                if multi_val_prob is not None:
                    example.link2features[
                        link.id]["multi_val_prob"] = multi_val_prob

        return example