Beispiel #1
0
    def __init__(self, sm: SemanticModel, karma_sm: KarmaModel):
        """Align the data nodes of a Karma semantic model with the data nodes of our semantic model.

        Both models are expected to describe the same semantic structure and to differ only in the names of
        their data nodes. The constructor builds ``self.alignment``, a mapping from a data node id in
        ``karma_sm`` to a data node id in ``sm``, so that renaming the data nodes of ``karma_sm`` through this
        mapping yields a tree with the same semantic as ``sm``.

        Working assumption: the semantic of a data node is unchanged as long as the path from the root node
        to that data node is unchanged.

        :param sm: our semantic model
        :param karma_sm: the Karma semantic model to align with ``sm``
        :raises PermutationExploding: re-raised (after logging) when the graph comparison explodes
        :raises Exception: any failure while building the alignment is re-raised after both graphs have been
            rendered to ``/tmp`` and the error has been logged
        """
        try:
            match_result = f1_precision_recall(
                sm.graph, karma_sm.graph, DataNodeMode.IGNORE_LABEL_DATA_NODE)
        except PermutationExploding:
            logging.error("PermutationExploding at source: %s", sm.id)
            raise

        bijection = match_result['_bijection']
        try:
            # the two graphs must match perfectly and every node needs a counterpart on the other side
            assert match_result['f1'] == 1.0
            assert None not in bijection.prime2x
            assert None not in bijection.x2prime
            assert len(bijection.x2prime) == len(bijection.prime2x)

            def keyed_by_label(link):
                return link.label, link

            def points_to_data_node(link):
                return link.get_target_node().is_data_node()

            # mapping from karma node's id to our sm node's id
            alignment: Dict[int, int] = {}

            for karma_node in karma_sm.graph.iter_class_nodes():
                our_node = sm.graph.get_node_by_id(
                    bijection.prime2x[karma_node.id])
                our_links_by_label: Dict[bytes, List[GraphLink]] = _(
                    our_node.iter_outgoing_links()) \
                    .imap(keyed_by_label).group_by_key().todict()
                karma_link_groups = _(karma_node.iter_outgoing_links()) \
                    .imap(keyed_by_label).group_by_key().get_value()

                for label, karma_links in karma_link_groups:
                    our_links = our_links_by_label[label]
                    assert len(karma_links) == len(our_links)

                    karma_data_links = [
                        link for link in karma_links
                        if points_to_data_node(link)
                    ]
                    our_data_links = [
                        link for link in our_links
                        if points_to_data_node(link)
                    ]
                    assert len(karma_data_links) == len(our_data_links)

                    # pairing order is irrelevant: links sharing a label under the same class node are
                    # semantically interchangeable
                    for karma_link, our_link in zip(karma_data_links,
                                                    our_data_links):
                        alignment[karma_link.target_id] = our_link.target_id

            self.alignment = alignment
            self.sm = sm
            self.karma_sm = karma_sm
        except Exception:
            # dump both graphs so the mismatch can be inspected, then propagate the error
            sm.graph.render2pdf("/tmp/sm.pdf")
            karma_sm.graph.render2pdf("/tmp/karma_sm.pdf")

            logging.error(
                f"Error when trying to build alignment between models. Source = {sm.id}"
            )
            raise
Beispiel #2
0
    def pred_type(self, col: Column,
                  top_n: int) -> List[Tuple[Tuple[bytes, bytes], float]]:
        """Predict the ``top_n`` best-scoring semantic types for ``col``.

        ``col`` is compared against every training column except itself. The
        entry of the similarity matrix for each (col, reference column) pair is
        fed to ``self.model.predict_proba`` and column 1 of its output is used
        as the score of the reference column's semantic type. Each semantic
        type keeps the best score among the reference columns carrying it.

        :param col: column whose semantic type is predicted
        :param top_n: maximum number of distinct semantic types to return
        :return: ``(stype, score)`` pairs sorted by decreasing score; an empty
            list when ``top_n <= 0`` or when there is no reference column
        """
        if top_n <= 0:
            # previously every type was returned because the stop condition
            # ``len(top_k_st) == top_n`` could never be met
            return []

        stype_db = self.stype_db
        refcols = [
            refcol for refcol in stype_db.train_columns
            if refcol.id != col.id
        ]
        if not refcols:
            # nothing to compare against; do not hand an empty matrix to the model
            return []

        j = stype_db.col2idx[col.id]
        X = [
            stype_db.similarity_matrix[j, stype_db.col2idx[refcol.id]]
            for refcol in refcols
        ]

        scores = self.model.predict_proba(X)[:, 1]
        ranked = sorted(zip(scores,
                            (stype_db.col2types[rc.id] for rc in refcols)),
                        key=lambda pair: pair[0],
                        reverse=True)

        # scores arrive in decreasing order, so the first score seen for a
        # semantic type is its best one
        top_k_st = {}
        for score, stype in ranked:
            if stype not in top_k_st:
                top_k_st[stype] = score
                if len(top_k_st) == top_n:
                    break

        return sorted(top_k_st.items(), reverse=True, key=lambda x: x[1])
Beispiel #3
0
def filter_unlikely_graph(g: MergeGraph) -> bool:
    """Tell whether ``g`` passes the plausibility filters.

    A graph is rejected (``False``) as soon as one of its class nodes
    * is a "middle" node: exactly one incoming link, exactly one outgoing link, and that outgoing link
      points to another class node; or
    * has more than ``mrf_max_n_duplications`` outgoing links sharing the same label; or
    * has more than ``mrf_max_n_duplication_types`` distinct labels that are used by several outgoing links.

    Both limits are read from ``Settings``.

    :param g: graph to check
    :return: ``True`` when no class node violates a filter
    """
    settings = Settings.get_instance()
    dup_limit = settings.mrf_max_n_duplications
    dup_type_limit = settings.mrf_max_n_duplication_types

    for class_node in g.iter_class_nodes():
        # filter 1: middle nodes
        has_single_in_and_out = (class_node.n_incoming_links == 1
                                 and class_node.n_outgoing_links == 1)
        if has_single_in_and_out:
            only_link = next(iter(class_node.iter_outgoing_links()))
            if only_link.get_target_node().is_class_node():
                return False

        # filter 2: duplicated outgoing labels, limits come from the settings above
        label_counts = {}
        for link in class_node.iter_outgoing_links():
            label_counts[link.label] = label_counts.get(link.label, 0) + 1

        if any(count > dup_limit for count in label_counts.values()):
            return False

        n_duplicated_labels = sum(1 for count in label_counts.values()
                                  if count > 1)
        if n_duplicated_labels > dup_type_limit:
            return False

    return True
Beispiel #4
0
    def predict_log_probs(self, examples: List[Example]):
        """Score every example with the log-probability of its all-``True`` assignment.

        Examples are processed in chunks of ``self.max_n_tasks``. For each example the factor scores of the
        assignment that sets every variable to ``True`` are summed, and the value returned for that example
        by ``parallel_marginal_inference`` (stored as ``logZ``) is subtracted.

        :param examples: examples to score
        :return: one value per example, in input order
        """
        log_probs = []

        for batch in _(examples).isplit(self.max_n_tasks):
            batch_vars = [self.get_variables(example) for example in batch]
            batch_factors = [
                self.model.get_factors(variables) for variables in batch_vars
            ]
            inferences = [
                self.inference(factors, variables)
                for factors, variables in zip(batch_factors, batch_vars)
            ]

            # the assignment being scored: every variable takes the value True
            all_true_assignments = [{
                variable: variable.domain.encode_value(True)
                for variable in variables
            } for variables in batch_vars]

            n_threads = Settings.get_instance().parallel_gmtk_n_threads
            log_zs = parallel_marginal_inference(inferences,
                                                 n_threads=n_threads)

            batch_log_probs = []
            for idx, factors in enumerate(batch_factors):
                assignment = all_true_assignments[idx]
                unnormalized = sum(
                    factor.score_assignment(assignment) for factor in factors)
                batch_log_probs.append(unnormalized - log_zs[idx])
            log_probs += batch_log_probs

        return log_probs
Beispiel #5
0
def get_latest_model_id(workdir) -> int:
    """Return the highest experiment number stored under ``workdir``.

    Experiments are sub-directories named ``exp_no_<number>``. Plain files and
    directories whose suffix is not an integer (e.g. ``exp_no_backup``) are
    ignored instead of aborting the lookup with a ``ValueError``.

    :param workdir: directory to scan; must provide ``iterdir()`` (e.g. a
        ``pathlib.Path``)
    :return: the largest experiment number, or ``-1`` when there is none
    """
    prefix = 'exp_no_'
    model_ids = []
    for entry in workdir.iterdir():
        if not (entry.is_dir() and entry.name.startswith(prefix)):
            continue
        try:
            model_ids.append(int(entry.name.replace(prefix, '')))
        except ValueError:
            # not a numbered experiment directory
            continue

    if not model_ids:
        return -1
    return max(model_ids)
Beispiel #6
0
def get_example_detail(model_prefix, no_sample, link_id, model: LogLinearModel,
                       graphs: List[List[TripleLabel]]):
    """Print, for one link of one example, how each factor scores it (debugging aid).

    The first graph whose example has a ``model_id`` starting with ``model_prefix`` and a ``no_sample``
    equal to ``no_sample`` is selected. The variable attached to link ``link_id`` is then set to ``False``
    and to ``True`` in turn; for each value the features of every factor touching the variable and the
    summed score of those factors are printed. Nothing is printed when no graph matches.

    :param model_prefix: prefix of the example's model id
    :param no_sample: sample number of the example
    :param link_id: id of the link to explain
    :param model: model providing the factors
    :param graphs: list of variable lists, one per example
    """
    for graph in graphs:
        example = graph[0].triple.example
        is_requested = (example.model_id.startswith(model_prefix)
                        and example.no_sample == no_sample)
        if not is_requested:
            continue

        target_var = _(graph).first(lambda v: v.triple.link.id == link_id)
        assignment = {v: v.get_label_value() for v in graph}
        factors = model.get_factors(graph)
        print(target_var.triple.link.label)

        for value in (False, True):
            encoded = target_var.domain.encode_value(value)
            assignment[target_var] = encoded
            print("When value of variable is: ", encoded.val)

            total_score = 0.0
            for factor in factors:
                if not factor.touch(target_var):
                    continue

                total_score += factor.score_assignment(assignment)
                if isinstance(factor, TripleFactorTemplate.TripleFactor):
                    # pair every active feature of the variable with its value in this factor
                    factor_features = factor.assignment2features(assignment)
                    named_features = [
                        (target_var.triple.features.domain.get_category(idx),
                         factor_features[idx]) for idx in
                        target_var.triple.features.get_active_index()
                    ]
                    print('\t. Factor features: ', named_features)
                else:
                    raw_features = factor.assignment2features(assignment)
                    print("\t .Factor features: ", raw_features.tolist())
            print("\t .Score = ", total_score)

        # only the first matching example is explained
        break
Beispiel #7
0
 def parallel_get_variables(self, examples: List[Example]):
     """Build the variables of several examples after annotating them through ``ParallelAnnotator``.

     Not usable yet: the method deliberately fails on its first statement.

     :param examples: examples to convert
     :return: for each example, the labels of its feature-enriched triples
     """
     assert False, "Not ready to use yet"
     ParallelAnnotator.get_instance().annotate(examples)

     def labels_of(example):
         triples = _(self.example_annotator.example2vars(example))
         enriched = triples.imap(
             lambda triple: self.example_annotator.build_triple_features(triple, self.tf_domain))
         return enriched.map(lambda triple: triple.label)

     return [labels_of(example) for example in examples]
Beispiel #8
0
    def get_variables(self, example: Example):
        """Annotate ``example`` and return the labels of its feature-enriched triples.

        The example annotator is created on first use from ``self.dataset`` and the training sources of the
        example being processed; later calls reuse it.

        :param example: example to convert
        :return: the ``label`` of every triple of the example, after ``build_triple_features`` has been
            applied with ``self.tf_domain``
        """
        if self.example_annotator is None:
            # TODO: should handle top_k_semantic_types configuration, and check if training_sources has been changed!!
            self.example_annotator = ExampleAnnotator(
                self.dataset, example.training_sources)

        annotator = self.example_annotator
        annotator.annotate(example)

        def with_features(triple):
            return annotator.build_triple_features(triple, self.tf_domain)

        triples = _(annotator.example2vars(example))
        return triples.imap(with_features).map(lambda triple: triple.label)
Beispiel #9
0
    def pred_full_stype(
        self, col: Column, top_n: int
    ) -> List[Tuple[Tuple[bytes, bytes], float, Dict[Tuple[bytes, bytes],
                                                     float]]]:
        """Predict the ``top_n`` best semantic types of ``col`` together with their parent semantic types.

        ``col`` is scored against every training column except itself (column 1 of
        ``self.model.predict_proba`` applied to the similarity-matrix entries). The semantic type of a
        reference column is ``(label of the class node owning its data node, label of the link to the data
        node)``. Its parent semantic type is built the same way from the first incoming link of that class
        node, or is ``None`` when the class node has no incoming link.

        :param col: column whose semantic type is predicted
        :param top_n: maximum number of distinct semantic types to return
        :return: ``(stype, score, parent_stypes)`` triples sorted by decreasing score, where
            ``parent_stypes`` maps every parent semantic type observed for ``stype`` (possibly ``None``) to
            its best score; an empty list when ``top_n <= 0`` or when there is no reference column
        """
        if top_n <= 0:
            return []

        stype_db = self.stype_db
        refcols = [
            refcol for refcol in stype_db.train_columns
            if refcol.id != col.id
        ]
        if not refcols:
            # nothing to compare against; do not hand an empty matrix to the model
            return []

        j = stype_db.col2idx[col.id]
        X = [
            stype_db.similarity_matrix[j, stype_db.col2idx[refcol.id]]
            for refcol in refcols
        ]

        scores = self.model.predict_proba(X)[:, 1]
        ranked = sorted(zip(scores,
                            (stype_db.col2dnodes[rc.id] for rc in refcols)),
                        key=lambda pair: pair[0],
                        reverse=True)

        # stype -> (best score, {parent stype (None for a root class node) -> best score})
        top_k_st: Dict[Tuple[bytes, bytes],
                       Tuple[float, Dict[Optional[Tuple[bytes, bytes]],
                                         float]]] = {}
        for score, dnode in ranked:
            link = dnode.get_first_incoming_link()
            parent = link.get_source_node()
            parent_link = parent.get_first_incoming_link()
            if parent_link is None:
                parent_stype = None
            else:
                parent_stype = (parent_link.get_source_node().label,
                                parent_link.label)

            stype = (parent.label, link.label)
            entry = top_k_st.get(stype)
            if entry is None:
                # a stype that did not make it into the top n is ignored
                if len(top_k_st) < top_n:
                    top_k_st[stype] = (score, {parent_stype: score})
                continue

            # keep scanning to collect every parent stype of an accepted stype; scores arrive in
            # decreasing order, so the first score recorded for a parent stype is already its best
            entry[1].setdefault(parent_stype, score)

        return sorted([(stype, score, parent_stypes)
                       for stype, (score, parent_stypes) in top_k_st.items()],
                      reverse=True,
                      key=lambda x: x[1])
Beispiel #10
0
    def feature_extraction(self, graph: Graph,
                           stype_score: Dict[int, Optional[float]]):
        """Compute two features for every class node of ``graph``.

        * ``prob_data_nodes``: sum of ``stype_score`` over the data nodes directly attached to the class
          node, a missing (``None``) score counting as 0;
        * ``minimum_merged_cost``: smallest ``get_merged_cost`` between the class node and the nodes
          returned by ``graph.iter_nodes_by_label`` for its label.

        :param graph: graph whose class nodes are described
        :param stype_score: score of each data node, keyed by node id
        :return: mapping from class node id to its list of ``(feature name, value)`` pairs
        """
        node2features = {}
        for class_node in graph.iter_class_nodes():
            prob_data_nodes = 0
            for link in class_node.iter_outgoing_links():
                child = link.get_target_node()
                if child.is_data_node():
                    prob_data_nodes = prob_data_nodes + (stype_score[child.id]
                                                         or 0)

            merged_costs = (
                get_merged_cost(class_node, other, self.multival_predicate)
                for other in graph.iter_nodes_by_label(class_node.label))

            node2features[class_node.id] = [
                ('prob_data_nodes', prob_data_nodes),
                ('minimum_merged_cost', min(merged_costs)),
            ]
        return node2features
Beispiel #11
0
def evaluate(map_examples) -> Optional[ConfusionMatrix]:
    """Accumulate the confusion matrices of all ``map_examples``.

    For each example the confusion matrix between its MAP assignment and its target assignment is computed;
    the matrices are combined with ``+``.

    :param map_examples: sized iterable of examples exposing ``get_map_assignment`` and
        ``get_target_assignment``
    :return: the combined confusion matrix, or ``None`` when there is no example
    """
    if len(map_examples) == 0:
        return None

    def confusion_matrix_of(example):
        return Evaluation.get_confusion_matrix(example.get_map_assignment(),
                                               example.get_target_assignment())

    remaining = iter(map_examples)
    total = confusion_matrix_of(next(remaining))
    for example in remaining:
        total = operator.add(total, confusion_matrix_of(example))
    return total
Beispiel #12
0
def render_factor_graph(model_or_factors: Union[LogLinearModel, List[Factor]],
                        vars: List[TripleLabel], fpath: str):
    """Render the factor graph of ``vars`` to a PDF file, for debugging.

    Every variable and every factor becomes a node. A factor is linked to each of its unobserved
    variables, and a variable is linked from the label variable of its parent triple when it has one.

    Side effect: a ``myid`` attribute is set on every variable and every factor so that they can be
    matched with the nodes of the rendered graph.

    :param model_or_factors: either a model, from which the factors of ``vars`` are obtained, or the
        factors themselves
    :param vars: variables of the factor graph
    :param fpath: path of the PDF file to write
    """
    if isinstance(model_or_factors, LogLinearModel):
        factors = model_or_factors.get_factors(vars)
    else:
        factors = model_or_factors

    def get_fnode_lbl(fnode: Union[TripleLabel, Factor]) -> bytes:
        """Return the node label: the class name for a factor, ``source--target`` for a variable."""
        if isinstance(fnode, Factor):
            label = fnode.__class__.__name__
        else:
            s = fnode.triple.link.get_source_node()
            t = fnode.triple.link.get_target_node()
            label = "%s:%s--%s:%s" % (s.id, s.label.decode('utf-8'), t.id,
                                      t.label.decode('utf-8'))

        return label.encode('utf-8')

    class Node(GraphNode):
        """Graph node wrapping a variable or a factor."""

        def __init__(self, fnode: Union[TripleLabel, Factor]) -> None:
            super().__init__()
            self.fnode = fnode

        def get_dot_format(self, max_text_width: int):
            """Return the dot statement of the node: gold filled for variables, gray plain text otherwise."""
            label = self.get_printed_label(max_text_width).encode(
                'unicode_escape').decode()
            if isinstance(self.fnode, Variable):
                return '"%s"[style="filled",color="white",fillcolor="gold",label="%s"];' % (
                    self.id, label)

            return '"%s"[shape="plaintext",style="filled",fillcolor="lightgray",label="%s"];' % (
                self.id, label)

    class Link(GraphLink):
        """Graph link that is either a variable-factor link or a variable-variable link."""

        # the two supported values of ``link_type``
        var2factor = "var2factor"
        var2var = "var2var"

        def __init__(self, link_type: str) -> None:
            super().__init__()
            self.link_type = link_type

        def get_dot_format(self, max_text_width: int):
            """Return the dot statement of the link: undirected for var2factor, dashed arrow otherwise."""
            label = self.get_printed_label(max_text_width).encode(
                'unicode_escape').decode()
            if self.link_type == Link.var2factor:
                return '"%s" -> "%s"[dir=none,color="brown",fontcolor="black",label="%s"];' % (
                    self.source_id, self.target_id, label)
            return '"%s" -> "%s"[color="brown",style="dashed",fontcolor="black",label="%s"];' % (
                self.source_id, self.target_id, label)

    # NOTE: the string below is a no-op expression statement, not the function's docstring
    """Render factor graph for debugging"""
    g = Graph()

    # build graphs
    # NOTE(review): presumably `_(vars, factors)` iterates over vars followed by factors, and the graph
    # assigns node ids in insertion order starting at 0, so that the enumeration index equals the id of
    # the node added below -- TODO confirm
    fnode2id: Dict[Union[Variable, Factor], int] = _(
        vars, factors).enumerate().imap(lambda v: (v[1], v[0])).todict()
    _(vars, factors).forall(lambda fnode: g.real_add_new_node(
        Node(fnode), GraphNodeType.CLASS_NODE, get_fnode_lbl(fnode)))

    # link each factor with the variables it has not observed
    for factor in factors:
        for var in factor.unobserved_variables:
            g.real_add_new_link(Link(Link.var2factor),
                                GraphLinkType.UNSPECIFIED, b"", fnode2id[var],
                                fnode2id[factor])
    # link each variable from the label variable of its parent triple
    for var in vars:
        if var.triple.parent is not None:
            g.real_add_new_link(Link(Link.var2var), GraphLinkType.UNSPECIFIED,
                                b"", fnode2id[var.triple.parent.label],
                                fnode2id[var])

    # tag variables and factors with their node id
    for var in vars:
        var.myid = "%s: %s" % (fnode2id[var], g.get_node_by_id(
            fnode2id[var]).label)
    for factor in factors:
        factor.myid = fnode2id[factor]

    g.render2pdf(fpath)
Beispiel #13
0
def train_model(dataset: str, train_sids: List[str], manual_seed: int,
                train_examples: List[Example], test_examples: List[Example],
                args: TrainingArgs, basedir: Path):
    """Train a template log-linear model and store the model and its evaluation under ``basedir``.

    Outline:

    1. Annotate the train and test examples and build their triple features. The feature domain grows
       with the training data only; it is frozen before the test examples are processed.
    2. Build a model made of a ``TripleFactorTemplate`` and a ``SubstructureFactorTemplate``.
    3. Optimize the negative log-likelihood with the optimizer named by ``args.optimizer``
       (``'SGD'``, ``'ADAM'`` or ``'LBFGS'``). Mini-batches are used during the first ``args.n_switch``
       epochs, the full batch afterwards. Training stops early when the loss becomes negative or, for
       ADAM, when the loss increased during three consecutive epochs.
    4. Restore the parameters with the lowest loss among the last (at most three) epochs, serialize the
       model, the evaluation outputs and the input examples, back up the log file and move everything
       into a new ``exp_no_<id>`` slot.

    :param dataset: name of the dataset, passed to the example annotator
    :param train_sids: ids of the training sources, passed to the example annotator
    :param manual_seed: seed given to ``DenseTensorFunc.manual_seed``
    :param train_examples: training examples
    :param test_examples: test examples (may be empty)
    :param args: training configuration
    :param basedir: working directory receiving all outputs
    :return: ``(model, tf_domain, pairwise_domain, optimizer state dict)``
    """
    DenseTensorFunc.manual_seed(manual_seed)

    tf_domain = GrowableBinaryVectorDomain()

    timer = pyutils.progress.Timer().start()
    input_train_examples = train_examples
    input_test_examples = test_examples

    # BUILDING VARIABLES NEEDED FOR THE TRAINING
    example_annotator = ExampleAnnotator(dataset,
                                         train_sids,
                                         training_examples=train_examples)
    train_examples = sequential_map(example_annotator.annotate, train_examples)
    train_examples = _(train_examples) \
        .imap(example_annotator.example2vars) \
        .submap(partial(example_annotator.build_triple_features, domain=tf_domain))

    pairwise_domain = example_annotator.build_pairwise_domain()
    # Freeze domain now, we've added all feature values observed in training data
    tf_domain.freeze()

    test_examples = sequential_map(example_annotator.annotate, test_examples)
    test_examples = _(test_examples) \
        .imap(example_annotator.example2vars) \
        .submap(partial(example_annotator.build_triple_features, domain=tf_domain))

    # print domain to debug
    logger.info("Preprocessing take %s" % timer.lap().get_total_time())
    # build random variables
    train_graphs = _(train_examples).submap(lambda t: t.label)
    test_graphs = _(test_examples).submap(lambda t: t.label)

    # build models, select inference method
    model = TemplateLogLinearModel([
        TripleFactorTemplate(
            *TripleFactorTemplate.get_default_args(tf_domain)),
        SubstructureFactorTemplate(
            *SubstructureFactorTemplate.get_default_args(
                pairwise_domain, example_annotator.get_obj_props())),
        # ExternalModelFactorTemplate(*ExternalModelFactorTemplate.get_default_weights())
    ])
    # or load previous training
    # model_dir = config.fsys.debug.as_path() + "/%s/models/exp_no_2" % dataset
    # model, ___, state_dict = deserialize(model_dir + '/gmtk_model.bin')

    inference = BeliefPropagation.get_constructor(InferProb.MARGINAL)
    map_inference = BeliefPropagation.get_constructor(InferProb.MAP)

    train_nll_examples = _(
        train_graphs).map(lambda vars: NegativeLogLikelihoodExample(
            vars, model.get_factors(vars), inference))
    train_map_examples = _(train_nll_examples).map(
        lambda example: MAPAssignmentExample.from_nll_example(
            example, map_inference))
    test_nll_examples = _(
        test_graphs).map(lambda vars: NegativeLogLikelihoodExample(
            vars, model.get_factors(vars), inference))
    test_map_examples = _(test_nll_examples).map(
        lambda example: MAPAssignmentExample.from_nll_example(
            example, map_inference))

    # select training method/parameters, and evaluation
    n_epoch = args.n_epoch
    params = args.optparams
    mini_batch_size = args.mini_batch_size
    n_switch = args.n_switch

    global_step = 0
    # LBFGS re-evaluates the loss itself and therefore needs a closure
    require_closure = False
    if args.optimizer == 'SGD':
        optimizer = PyTorchOptimizer.SGD(parameters=model.get_parameters(),
                                         **params)
    elif args.optimizer == 'ADAM':
        optimizer = PyTorchOptimizer.Adam(parameters=model.get_parameters(),
                                          **params)
    elif args.optimizer == 'LBFGS':
        optimizer = PyTorchOptimizer.LBFGS(parameters=model.get_parameters(),
                                           **params)
        require_closure = True
    else:
        assert False, "Unsupported optimizer: %s" % args.optimizer
    # optimizer.optimizer.load_state_dict(state_dict)

    for template in model.templates:
        if hasattr(template, 'after_update_weights'):
            optimizer.register_on_step(template.after_update_weights)

    logger.info(args.to_string())
    # one line per template; the previous `"\n" % [...]` expression dropped the template info because
    # a list operand is treated as a mapping by `%` and the format string had no placeholder
    logger.info("Template info: \n%s" %
                "\n".join("\t" + template.get_info()
                          for template in model.templates))
    logger.info("Train size: %s, Test size: %s", len(train_nll_examples),
                len(test_nll_examples))

    reporter = TensorBoard(log_dir=basedir)
    # cast to list to keep train_map_examples & train_nll_examples aligned with each other (batch example may shuffle)
    if args.parallel_training:
        batch_nll_example = ParallelBatchExample(list(train_nll_examples), 0)
    else:
        batch_nll_example = BatchExample(list(train_nll_examples), 0)

    # *********************************************** DEBUG CODE
    # for i, triples in enumerate(train_examples):
    #     example = triples[0].example
    #     if example.model_id.startswith("s03") and example.no_sample == 29:
    #         example.pred_sm.render()
    #         render_factor_graph(model.get_factors(train_graphs[i]), train_graphs[i],
    #                     config.fsys.debug.as_path() + "/tmp/factor_graph.pdf")
    #         exit(0)
    #
    # render_factor_graph(train_nll_examples[0].factors, train_nll_examples[0].variables,
    #                     config.fsys.debug.as_path() + "/tmp/factor_graph.pdf")
    #
    # loss_val_accum = ValueAccumulator()
    # gradient_accum = Tensor1AccumulatorDict()
    # for weights in model.get_parameters():
    #     gradient_accum.track_obj(weights, DenseTensorFunc.zeros_like(weights.val))
    # **********************************************************

    progress = pyutils.progress.Progress(n_epoch)
    progress.start()

    # mini-batches for the first `n_switch` epochs, the full batch otherwise
    if n_switch > 0:
        examples = list(batch_nll_example.split_random(mini_batch_size))
    else:
        examples = [batch_nll_example]

    cm_train, cm_test = None, None
    loss_history = []
    # parameters of the most recent epochs (at most 3), aligned with loss_history[-3:]
    param_hists = []

    for i in range(n_epoch):
        logger.info("Iter %s" % i)

        if i >= n_switch:
            examples = [batch_nll_example]

        if args.shuffle_mini_batch and 0 < i < n_switch:
            examples = batch_nll_example.split_random(mini_batch_size)

        average_loss_val = []
        if not require_closure:
            for example in examples:
                optimizer.zero_grad()
                example.accumulate_value_and_gradient(
                    optimizer.get_value_accumulator(),
                    optimizer.get_gradient_accumulator())
                optimizer.average(example.size())

                logger.info("Accum loss: %.10f" %
                            optimizer.get_value_accumulator().get_value())
                average_loss_val.append(
                    optimizer.get_value_accumulator().get_value())

                # *********************************************** DEBUG GRADIENT
                # numerical_gradient = NumericalGradient(1e-5)
                # for j, e in enumerate(example.examples):
                #     print(f"\rExample {j}/{len(example.examples)}", end="", flush=True)
                #     gradient_accum.clear()
                #     loss_val_accum.clear()
                #     e.accumulate_value_and_gradient(loss_val_accum, gradient_accum)
                #     for template in model.templates:
                #         for weights in template.get_weights():
                #             gradient = gradient_accum.get_value(weights)
                #             approx_gradients = numerical_gradient.compute_gradient(weights, lambda: nll_func(e))
                #             try:
                #                 np.testing.assert_almost_equal(gradient.numpy(), approx_gradients.numpy(), 6)
                #             except Exception:
                #                 logger.exception("Incorrect gradient...")
                #                 print(template,  weights.val.tolist())
                #                 print(["%11.8f" % x for x in gradient.tolist()])
                #                 print(["%11.8f" % x for x in approx_gradients.tolist()])
                #                 print(["%11d" % int(np.isclose(x, y, rtol=0, atol=1e-6)) for x, y in zip(gradient, approx_gradients)])
                #
                #                 raise
                # print("\n")
                # **************************************************************

                optimizer.step()
                reporter.loss_val(
                    optimizer.get_value_accumulator().get_value(), global_step)
                global_step += 1
        else:
            for example in examples:

                # the closure is consumed by `optimizer.step` right away, so capturing the loop
                # variable `example` is safe
                def closure():
                    optimizer.zero_grad()
                    example.accumulate_value_and_gradient(
                        optimizer.get_value_accumulator(),
                        optimizer.get_gradient_accumulator())
                    optimizer.average(example.size())
                    optimizer.copy_grad()
                    return optimizer.get_value_accumulator().get_value()

                optimizer.step(closure)
                logger.info("Accum loss: %.10f" %
                            optimizer.get_value_accumulator().get_value())
                average_loss_val.append(
                    optimizer.get_value_accumulator().get_value())
                reporter.loss_val(
                    optimizer.get_value_accumulator().get_value(), global_step)
                global_step += 1

        if len(average_loss_val) > 1:
            logger.info("Average accum loss: %.10f" %
                        np.average(average_loss_val))

        # a negative loss stops the training
        if optimizer.get_value_accumulator().get_value() < 0:
            break

        if i % args.n_iter_eval == 0 or i == n_epoch - 1:
            cm_train = evaluate(train_map_examples)
            # fall back to the train matrix when the test evaluation yields nothing
            cm_test = evaluate(test_map_examples) or cm_train
            logger.info('train (class_idx=0): %s',
                        cm_train.precision_recall_fbeta(class_idx=0))
            logger.info('train (class_idx=1): %s',
                        cm_train.precision_recall_fbeta(class_idx=1))
            logger.info('test  (class_idx=0): %s',
                        cm_test.precision_recall_fbeta(class_idx=0))
            logger.info('test  (class_idx=1): %s',
                        cm_test.precision_recall_fbeta(class_idx=1))

            reporter.precision_recall_fbeta(cm_train,
                                            global_step,
                                            group='train')
            reporter.precision_recall_fbeta(cm_test, global_step, group='test')

        loss_history.append(np.average(average_loss_val))
        param_hists.append(model.clone_parameters())
        if len(param_hists) > 3:
            param_hists.pop(0)

        # ADAM only: stop once each of the last three losses is greater than the one before it
        if args.optimizer == "ADAM" and len(loss_history) > 4 and all(
                x - y > 0
                for x, y in zip(loss_history[-3:], loss_history[-4:-1])):
            logger.info("Loss increase after 3 epoches. Stop training!")
            break

        progress.finish_one()

    if args.report_final_loss:
        loss_val_accum = ValueAccumulator()
        batch_nll_example.accumulate_value_and_gradient(loss_val_accum, None)
        logger.info("Average accum loss: %.10f" %
                    (loss_val_accum.get_value() / batch_nll_example.size()))

    logger.info("\n\r%s" % progress.summary())
    cm_train.pretty_print("** TRAIN **",
                          precision_recall_fbeta=True,
                          output_stream=logger.info)
    cm_test.pretty_print("** TEST **",
                         precision_recall_fbeta=True,
                         output_stream=logger.info)

    # save model and move everything into another folder for storage
    reporter.close()
    reporter.export(basedir / 'tensorboard_raw.json')

    # clear all cache
    for template in model.templates:
        if isinstance(template, CachedTemplateFactorConstructor):
            template.clear_cache()

    # restore the parameters with the lowest loss among the last (at most three) epochs
    assert len(param_hists) == len(loss_history[-3:])
    min_loss, min_params, min_idx = min(zip(loss_history[-3:], param_hists,
                                            [-3, -2, -1]),
                                        key=lambda x: x[0])
    logger.info("Select parameters at index: %d. Loss = %s", min_idx, min_loss)
    model.update_parameters(min_params)

    serialize(
        (model, tf_domain, pairwise_domain, optimizer.optimizer.state_dict()),
        basedir / 'gmtk_model.bin')
    save_evaluation_result(zip(train_map_examples, train_nll_examples),
                           basedir / 'train.output.json')
    save_evaluation_result(zip(test_map_examples, test_nll_examples),
                           basedir / 'test.output.json')
    serializeJSON(input_train_examples, basedir / "train.json")
    serializeJSON(input_test_examples, basedir / "test.json")

    # attempt to copy log file; best effort, but let KeyboardInterrupt/SystemExit propagate
    try:
        logger.handlers[1].flush()
        shutil.copy(logger.handlers[1].file_handler.baseFilename,
                    str(basedir / "train.log"))
    except Exception:
        logger.exception("Cannot backup log...")

    model_id = get_latest_model_id(basedir) + 1
    move_current_files(basedir, model_id)
    logger.info("Save model %s", model_id)
    return model, tf_domain, pairwise_domain, optimizer.optimizer.state_dict()
Beispiel #14
0
    def build_triple_features(
            self, triple: Triple,
            domain: GrowableBinaryVectorDomain[str]) -> Triple:
        """Compute the features of ``triple`` and store them in ``triple.features``.

        The raw values come from ``triple.example.link2features`` (keyed by link id). Probability-like
        values are clamped from below at 0.01. Feature names embed the label of the source node and, for
        links to data nodes, the label of the link.

        Experimental features that are currently disabled: ``multi_val_prob``/``single_val_prob``,
        ``norm-p_semantic_type``, ``local_constraint`` and ``global_constraint``.

        :param triple: triple to describe; modified in place
        :param domain: feature domain backing the created ``TripleFeatures``
        :return: the same ``triple``
        """
        json_features = triple.example.link2features[triple.link.id]
        features = TripleFeatures(domain)
        target = triple.target
        source_label = triple.source.label.decode('utf-8')

        def clamped(value):
            # keep feature values away from zero
            return max(value, 0.01)

        if json_features['p_link_given_so'] is not None:
            if target.is_data_node():
                name = "(%s---%s)" % (source_label,
                                      triple.link.label.decode("utf-8"))

                p_link = json_features['p_link_given_so']
                features += (name + '=True.p_semantic_type', clamped(p_link))
                features += (name + '=False.p_semantic_type',
                             clamped(1 - p_link))

                delta = json_features['delta_stype_score']
                features += (name + '=True.delta_p_semantic_type',
                             clamped(delta))
                features += (name + '=False.delta_p_semantic_type',
                             clamped(-1 * delta))

                ratio = json_features['ratio_stype_score']
                features += (name + '=True.ratio_p_semantic_type', 1 / ratio)
                features += (name + '=False.ratio_p_semantic_type', ratio)

                features += (f'{name}-order={json_features["stype_order"]}', 1)

            # emitted for data-node and class-node targets alike, always after the features above
            p_triple = json_features['p_triple']
            features += (source_label + '=True.p_triple', clamped(p_triple))
            features += (source_label + '=False.p_triple',
                         clamped(1 - p_triple))

        stype_prob = json_features['stype_prob']
        if stype_prob is not None:
            features += ("True.stype_prob", clamped(stype_prob))
            features += ("False.stype_prob", clamped(1 - stype_prob))

        if target.is_class_node():
            class_prefix = "class=" + source_label

            # the source node has no data node among the siblings of this link
            if _(triple.siblings).all(lambda t: t.target.is_class_node()):
                features += (class_prefix + ".source_node_no_data_child", 1)

            if len(triple.siblings) == 0:
                features += (class_prefix + ".no_siblings", 1)

                if triple.parent is None:
                    features += (class_prefix + ".no_parent_&_no_siblings", 1)

        triple.features = features
        return triple
Beispiel #15
0
    topK = 20
    class_idx = 0
    assert len(model.templates) == 2
    triple_factor = model.templates[0]
    triple_factor_weights = triple_factor.weights.view(2, -1)
    features = [(tf_domain.get_category(i), x, triple_factor_weights[1, i])
                for i, x in enumerate(triple_factor_weights[0, :])]
    features.sort(key=lambda x: x[1], reverse=True)
    for f in features:
        print(f)
    substructure = model.templates[1].weights
    print(substructure)

    # re-populate data and output evaluation
    train_examples, test_examples = load_data(data_source)
    _(train_examples, test_examples).iflatten().forall(
        lambda x: build_triple_features(x, tf_domain))
    train_graphs = _(train_examples).submap(lambda t: t.label)
    test_graphs = _(test_examples).submap(lambda t: t.label)

    # get detail explanation of one link
    get_example_detail('s00', 0, 'L014', model, test_graphs)

    # NOTE: uncomment the code below to run the full evaluation

    # inference = BeliefPropagation.get_constructor(InferProb.MARGINAL)  # , max_iter=5)
    # map_inference = BeliefPropagation.get_constructor(InferProb.MAP)  # , max_iter=5)
    #
    # train_nll_examples = _(train_graphs).map(lambda vars: NegativeLogLikelihoodExample(vars, model, inference))
    # train_map_examples = _(train_nll_examples).map(lambda example: MAPAssignmentExample(example, map_inference))
    # test_nll_examples = _(test_graphs).map(lambda vars: NegativeLogLikelihoodExample(vars, model, inference))
    # test_map_examples = _(test_nll_examples).map(lambda example: MAPAssignmentExample(example, map_inference))