Example #1
    def get_stype_db(dataset: str, train_source_ids: List[str],
                     cache_dir: Path) -> 'SemanticTypeDB':
        if SemanticTypeDB.instance is None:
            cache_file = cache_dir / 'stype_db.pkl'
            if cache_file.exists():
                SemanticTypeDB.logger.debug(
                    "Load SemanticTypeDB from cache file...")
                stype_db: SemanticTypeDB = deserialize(cache_file)
                if (set(train_source_ids) != {tbl.id for tbl in stype_db.train_tables}
                        or stype_db.dataset != dataset):
                    stype_db = None
            else:
                stype_db = None

            if stype_db is None:
                SemanticTypeDB.logger.debug(
                    "Have to re-create SemanticTypeDB...")
                stype_db = SemanticTypeDB.create(dataset, train_source_ids)
                stype_db._build_db()
                serialize(stype_db, cache_file)

            SemanticTypeDB.instance = stype_db

        return SemanticTypeDB.instance
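All of the examples on this page rely on a pair of serialize/deserialize helpers for writing and reading cache files. Their implementation is not included in the snippets; below is a minimal sketch assuming a plain pickle-based approach (the project's actual helpers may differ):

import pickle
from pathlib import Path
from typing import Any


def serialize(obj: Any, path: Path) -> None:
    # Hypothetical helper: dump any Python object to a cache file via pickle.
    with path.open("wb") as f:
        pickle.dump(obj, f)


def deserialize(path: Path) -> Any:
    # Hypothetical helper: load an object previously written by serialize().
    with path.open("rb") as f:
        return pickle.load(f)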
Example #2
def get_data_tables(dataset: str) -> List[DataTable]:
    global _data_io_vars
    if dataset not in _data_io_vars['data_tables']:
        # if it has been cached...
        cache_file = get_cache_dir(dataset) / 'tables.pkl'
        if cache_file.exists():
            tables = deserialize(cache_file)
        else:
            mapping_dir = Path(config.datasets[dataset].models_y2rml.as_path())
            raw_tables = get_raw_data_tables(dataset)
            R2RML.load_python_scripts(Path(config.datasets[dataset].python_code.as_path()))
            tables = []
            semantic_models = []
            for i, raw_tbl in enumerate(raw_tables):
                r2rml_file = mapping_dir / f"{raw_tbl.id}-model.yml"
                tbl, sm = R2RML.load_from_file(r2rml_file).apply_build(raw_tbl)
                tables.append(tbl)
                semantic_models.append(sm)

            serialize(tables, cache_file)
            _data_io_vars['semantic_models'][dataset] = semantic_models  # avoid applying R2RML twice!

        _data_io_vars["data_tables"][dataset] = tables

    return _data_io_vars["data_tables"][dataset]
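Several loaders in these examples (get_data_tables, get_ontology, get_sampled_data_tables, get_raw_data_tables) share a module-level cache dictionary _data_io_vars whose initialization is not shown. Below is a minimal sketch of the assumed structure, with one sub-dictionary per artifact type, each keyed by dataset name:

# Assumed module-level in-memory cache; each sub-dict maps a dataset name to the loaded artifact.
_data_io_vars = {
    "data_tables": {},
    "sampled_data_tables": {},
    "raw_data_tables": {},
    "semantic_models": {},
    "ont": {},
}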
Example #3
    def semantic_labeling(self,
                          train_sources: List[SemanticModel],
                          test_sources: List[SemanticModel],
                          top_n: int,
                          eval_train: bool = False) -> None:
        """Generate semantic labels and store it in test sources"""
        train_sources: Dict[str, SemanticModel] = {s.id: s for s in train_sources}
        test_sources: Dict[str, SemanticModel] = {s.id: s for s in test_sources}
        assert set(train_sources.keys()) == self.train_source_ids

        if self.model is None:
            model_file = self.exec_dir / 'model.pkl'

            if model_file.exists():
                self.logger.debug("Load previous trained model...")
                self.model = deserialize(model_file)
            else:
                self.logger.debug("Train new model...")
                x_train, y_train, x_test, y_test = generate_training_data(
                    self.stype_db)
                # clf = LogisticRegression(class_weight="balanced")
                clf = RandomForestClassifier(n_estimators=200,
                                             max_depth=10,
                                             class_weight="balanced",
                                             random_state=120)
                clf = clf.fit(x_train, y_train)
                self.logger.debug("Save model...")
                serialize(clf, model_file)
                self.model = clf

        col_attrs = []
        if eval_train:
            for col in self.stype_db.train_columns:
                if col.table_name not in train_sources: continue
                col_attrs.append(
                    (col, train_sources[col.table_name].get_attr_by_label(
                        col.name)))

        for col in self.stype_db.test_columns:
            if col.table_name not in test_sources: continue
            col_attrs.append(
                (col,
                 test_sources[col.table_name].get_attr_by_label(col.name)))

        for col, attr in col_attrs:
            pred_stypes = self.pred_type(col, top_n)
            attr.semantic_types = [
                SemanticType(stype[0].decode("utf-8"),
                             stype[1].decode("utf-8"), score)
                for stype, score in pred_stypes if score > 0
            ]
Example #4
    def load_model(self):
        """Try to load previous model if possible"""
        if self.model is not None:
            return

        model_file = self.exec_dir / 'model.pkl'
        if model_file.exists():
            self.logger.debug("Load previous trained model...")
            self.model = deserialize(model_file)
        else:
            self.logger.error("Cannot load model...")
            raise Exception("Model doesn't exist..")
Example #5
def get_ontology(dataset: str) -> Ontology:
    """Get ontology of a given dataset"""
    global _data_io_vars
    if dataset not in _data_io_vars["ont"]:
        # if it has been cached ...
        cache_file = get_cache_dir(dataset) / 'ont.pkl'
        cache_file.parent.mkdir(exist_ok=True, parents=True)
        if cache_file.exists():
            ont = deserialize(cache_file)
        else:
            ont = Ontology.from_dataset(dataset)
            serialize(ont, cache_file)
        _data_io_vars["ont"][dataset] = ont

    return _data_io_vars["ont"][dataset]
Example #6
def get_sampled_data_tables(dataset: str) -> List[DataTable]:
    global _data_io_vars
    if dataset not in _data_io_vars['sampled_data_tables']:
        # if it has been cached...
        cache_file = get_cache_dir(dataset) / "sampled_tables.pkl"
        if cache_file.exists():
            tables = deserialize(cache_file)
        else:
            tables = get_data_tables(dataset)
            settings = Settings.get_instance()
            tables = [tbl.sample(settings.n_samples, settings.random_seed) for tbl in tables]
            serialize(tables, cache_file)
        _data_io_vars["sampled_data_tables"][dataset] = tables

    return _data_io_vars["sampled_data_tables"][dataset]
Example #7
def get_data_constraint_model(
    dataset: str,
    train_sms: List[SemanticModel],
) -> DataConstraint:
    global _instance
    if _instance is None:
        cache_file = get_cache_dir(
            dataset, train_sms) / "weak_models" / "data_constraint.pkl"
        cache_file.parent.mkdir(exist_ok=True, parents=True)
        need_rebuilt = True

        settings = Settings.get_instance()
        valid_threshold = settings.data_constraint_valid_threshold
        guess_datetime_threshold = settings.data_constraint_guess_datetime_threshold
        n_comparison_samples = settings.data_constraint_n_comparison_samples
        random_seed = settings.random_seed
        n_sample = settings.n_samples

        if cache_file.exists():
            DataConstraint.logger.debug("Try to load previous run...")
            model, cached_dataset, cached_train_sm_ids, extra_args = deserialize(
                cache_file)
            if cached_dataset == dataset \
                    and cached_train_sm_ids == {sm.id for sm in train_sms} \
                    and extra_args == (
                        valid_threshold, guess_datetime_threshold, n_comparison_samples,
                        random_seed, n_sample):
                need_rebuilt = False

        if need_rebuilt:
            DataConstraint.logger.debug("Re-build data-constraint model...")
            data_tables = [
                ColumnBasedTable.from_table(tbl)
                for tbl in get_sampled_data_tables(dataset)
            ]
            model = DataConstraint(train_sms, data_tables, valid_threshold,
                                   guess_datetime_threshold,
                                   n_comparison_samples)
            serialize((model, dataset, {sm.id for sm in train_sms},
                       (valid_threshold, guess_datetime_threshold,
                        n_comparison_samples, random_seed, n_sample)),
                      cache_file)

        _instance = model
    return _instance
Example #8
def get_raw_data_tables(dataset: str) -> List[DataTable]:
    global _data_io_vars
    if dataset not in _data_io_vars['raw_data_tables']:
        # if it has been cached...
        cache_file = get_cache_dir(dataset) / 'raw_tables.pkl'
        if cache_file.exists():
            raw_tables = deserialize(cache_file)
        else:
            raw_tables = []
            source_dir = Path(config.datasets[dataset].data.as_path())
            for file in sorted(source_dir.iterdir()):
                if file.name.startswith("."):
                    continue
                raw_tables.append(DataTable.load_from_file(file))

            serialize(raw_tables, cache_file)
        _data_io_vars["raw_data_tables"][dataset] = raw_tables

    return _data_io_vars["raw_data_tables"][dataset]
Example #9
def get_stype_assistant_model(dataset: str, train_sms: List[SemanticModel]):
    global _instance
    if _instance is None:
        cache_file = get_cache_dir(dataset, train_sms) / "weak_models" / "stype_assistant.pkl"
        cache_file.parent.mkdir(exist_ok=True, parents=True)
        need_rebuilt = True

        if cache_file.exists():
            SemanticTypeAssistant.logger.debug("Try to load previous run...")
            model, cache_dataset, cache_train_sm_ids = deserialize(cache_file)
            if cache_dataset == dataset and cache_train_sm_ids == {sm.id for sm in train_sms}:
                need_rebuilt = False

            ont_graph = get_ont_graph(dataset)
            ont = get_ontology(dataset)
            stat = Statistic.get_instance(train_sms)
            ota = EmpiricalTripleAdviser(ont_graph, ont, stat.p_triple, 15)
            model.triple_adviser = ota

        if need_rebuilt:
            ont_graph = get_ont_graph(dataset)
            ont = get_ontology(dataset)
            stat = Statistic.get_instance(train_sms)
            ota = EmpiricalTripleAdviser(ont_graph, ont, stat.p_triple, 15)

            typer = SemanticTyper.get_instance(dataset, train_sms)
            try:
                typer.load_model()
            except Exception:
                sms = get_semantic_models(dataset)
                train_ids = {sm.id for sm in train_sms}
                typer.semantic_labeling(train_sms, [sm for sm in sms if sm.id not in train_ids], 4)

            model = SemanticTypeAssistant(train_sms, typer, ota)
            model.triple_adviser = None
            serialize((model, dataset, {sm.id for sm in train_sms}), cache_file)
            model.triple_adviser = ota

        _instance = model

    return _instance
Example #10
    def get_classifier(self, retrain: bool, train_examples: List[Example]):
        # TODO: implement this properly, currently, we have to train and save manually
        cached_file = get_cache_dir(
            self.example_annotator.dataset,
            list(self.example_annotator.train_source_ids)
        ) / "weak_models" / "node_prob_classifier.pkl"
        if not cached_file.exists() or retrain:
            self.logger.debug("Retrain new model")
            raw_X_train = make_data(self, train_examples)
            classifier = LogisticRegression(fit_intercept=True)

            X_train = numpy.asarray(
                [list(features.values())[1:] for features in raw_X_train])
            X_train, y_train = X_train[:, :-1], [
                int(x) for x in X_train[:, -1]
            ]

            scaler = StandardScaler().fit(X_train)
            # transform() returns a new array rather than scaling in place
            X_train = scaler.transform(X_train)

            try:
                classifier.fit(X_train, y_train)
            except ValueError as e:
                assert str(e).startswith(
                    "This solver needs samples of at least 2 classes in the data"
                )
                # this should be at a starter phase when we don't have any data but use ground-truth to build
                X_train = numpy.vstack([X_train, [0, 0]])
                y_train.append(0)
                classifier.fit(X_train, y_train)

            cached_file.parent.mkdir(exist_ok=True, parents=True)
            serialize((scaler, classifier), cached_file)
            return scaler, classifier

        return deserialize(cached_file)
Example #11
    #     else:
    #         print(".", end="")
    #     return trace
    #
    # args = {'children_uris': set(),
    #     'parents_uris': set(),
    #     'uri': 'http://www.w3.org/2000/01/rdf-schema#Resource'}
    # # OntGraphNode("haha", set(), set())
    # a = OntGraphNode(**args)
    # print("==========")
    # # print(a.uri)
    # # sys.settrace(trace)
    ont_graph: OntGraph = deserializeJSON(
        config.fsys.debug.as_path() + '/%s/cached/ont_graph.json' % dataset,
        OntGraph)
    ont: Ontology = deserialize(config.fsys.debug.as_path() +
                                '/%s/cached/ont.pkl' % dataset)
    # print(a.uri)
    # print("========SIGSEGV IN DEBUG MODE==")

    # ont = Ontology.from_data_source(data_source)
    # ont_graph = build_ont_graph(data_source)
    #
    # # %%
    #
    # # ont_graph.render2txt(config.fsys.debug.as_path() + '/%s/ont_graph.txt' % data_source)
    #
    # # %%
    s1 = ont.full_uri('crm:E63_Beginning_of_Existence')
    s2 = ont.full_uri('crm:E52_Time-Span')
    for predicate in ont_graph.get_possible_predicates(s1, s2):
        # if link.label.find('aggregatedCHO') != -1:
        print(predicate)
Example #12
    def from_file(dataset: str, input_dir: Path) -> 'Model':
        model_bin = deserialize(input_dir / 'gmtk_model.bin')
        model: TemplateLogLinearModel = model_bin[0]
        tf_domain: GrowableBinaryVectorDomain = model_bin[1]
        pairwise_domain = model_bin[2]
        return Model(dataset, model, tf_domain, pairwise_domain)
Example #13
    if None in bijection.prime2x:
        bijection.prime2x.pop(None)

    return {
        'f1': f1,
        'precision': precision,
        'recall': recall,
        '_bijection': bijection
    }


if __name__ == '__main__':
    dataset = "museum_crm"
    # %%

    ont = deserialize(config.fsys.debug.as_path() +
                      '/%s/cached/ont.pkl' % dataset)
    ont_graph = deserialize(config.fsys.debug.as_path() +
                            '/%s/cached/ont_graph.pkl' % dataset)
    karma_models = get_karma_models(dataset)

    karma_model = karma_models[59]
    # %%

    pred_sm = deserialize(
        config.fsys.debug.as_path() +
        "/tmp/pred_sm.pkl").get_semantic_model_reassign_id().graph
    gold_sm = karma_model.graph

    # %%
    res = f1_precision_recall(
        gold_sm, pred_sm, data_node_mode=DataNodeMode.IGNORE_LABEL_DATA_NODE)
Example #14
                            (var.triple.features.domain.get_category(idx),
                             features[idx])
                            for idx in var.triple.features.get_active_index()
                        ])
                    else:
                        print("\t .Factor features: ",
                              factor.assignment2features(assignment).tolist())
            print("\t .Score = ", score)
        break


if __name__ == '__main__':
    data_source = 'museum_crm'
    model_dir = config.fsys.debug.as_path() + "/%s/models/exp_no_10" % data_source
    model, tf_domain = deserialize(model_dir + '/gmtk_model.bin')

    # print out top-K features
    topK = 20
    class_idx = 0
    assert len(model.templates) == 2
    triple_factor = model.templates[0]
    triple_factor_weights = triple_factor.weights.view(2, -1)
    features = [(tf_domain.get_category(i), x, triple_factor_weights[1, i])
                for i, x in enumerate(triple_factor_weights[0, :])]
    features.sort(key=lambda x: x[1], reverse=True)
    for f in features:
        print(f)
    substructure = model.templates[1].weights
    print(substructure)