def get_stype_db(dataset: str, train_source_ids: List[str], cache_dir: Path) -> 'SemanticTypeDB':
    if SemanticTypeDB.instance is None:
        cache_file = cache_dir / 'stype_db.pkl'
        if cache_file.exists():
            SemanticTypeDB.logger.debug("Load SemanticTypeDB from cache file...")
            stype_db: SemanticTypeDB = deserialize(cache_file)
            if set(train_source_ids) != {tbl.id for tbl in stype_db.train_tables} \
                    or stype_db.dataset != dataset:
                stype_db = None
        else:
            stype_db = None

        if stype_db is None:
            SemanticTypeDB.logger.debug("Have to re-create SemanticTypeDB...")
            stype_db = SemanticTypeDB.create(dataset, train_source_ids)
            stype_db._build_db()
            serialize(stype_db, cache_file)

        SemanticTypeDB.instance = stype_db

    return SemanticTypeDB.instance
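# Usage sketch (illustrative, not part of the original module). It assumes the
# get_semantic_models and get_cache_dir helpers used elsewhere in this repo, and a
# hypothetical split where the first models are the training sources:
#
#     train_sms = get_semantic_models("museum_crm")[:14]
#     stype_db = get_stype_db("museum_crm",
#                             [sm.id for sm in train_sms],
#                             get_cache_dir("museum_crm"))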
def get_data_tables(dataset: str) -> List[DataTable]:
    global _data_io_vars
    if dataset not in _data_io_vars['data_tables']:
        # if it has been cached...
        cache_file = get_cache_dir(dataset) / 'tables.pkl'
        if cache_file.exists():
            tables = deserialize(cache_file)
        else:
            mapping_dir = Path(config.datasets[dataset].models_y2rml.as_path())
            raw_tables = get_raw_data_tables(dataset)
            R2RML.load_python_scripts(Path(config.datasets[dataset].python_code.as_path()))

            tables = []
            semantic_models = []
            for i, raw_tbl in enumerate(raw_tables):
                r2rml_file = mapping_dir / f"{raw_tbl.id}-model.yml"
                tbl, sm = R2RML.load_from_file(r2rml_file).apply_build(raw_tbl)
                tables.append(tbl)
                semantic_models.append(sm)

            serialize(tables, cache_file)
            _data_io_vars['semantic_models'][dataset] = semantic_models  # avoid apply R2RML twice!

        _data_io_vars["data_tables"][dataset] = tables

    return _data_io_vars["data_tables"][dataset]
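# Usage sketch (illustrative): the first call applies R2RML, caches the tables on
# disk, and memoizes both the tables and their semantic models in _data_io_vars;
# later calls in the same process return the memoized result. The dataset name is
# the one used in the debug scripts further down.
#
#     tables = get_data_tables("museum_crm")
#     print(tables[0].id)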
def semantic_labeling(self,
                      train_sources: List[SemanticModel],
                      test_sources: List[SemanticModel],
                      top_n: int,
                      eval_train: bool = False) -> None:
    """Generate semantic labels and store them in the test sources"""
    train_sources: Dict[str, SemanticModel] = {s.id: s for s in train_sources}
    test_sources: Dict[str, SemanticModel] = {s.id: s for s in test_sources}
    assert set(train_sources.keys()) == self.train_source_ids

    if self.model is None:
        model_file = self.exec_dir / 'model.pkl'
        if model_file.exists():
            self.logger.debug("Load previous trained model...")
            self.model = deserialize(model_file)
        else:
            self.logger.debug("Train new model...")
            x_train, y_train, x_test, y_test = generate_training_data(self.stype_db)
            # clf = LogisticRegression(class_weight="balanced")
            clf = RandomForestClassifier(n_estimators=200,
                                         max_depth=10,
                                         class_weight="balanced",
                                         random_state=120)
            clf = clf.fit(x_train, y_train)
            self.logger.debug("Save model...")
            serialize(clf, model_file)
            self.model = clf

    col_attrs = []
    if eval_train:
        for col in self.stype_db.train_columns:
            if col.table_name not in train_sources:
                continue
            col_attrs.append((col, train_sources[col.table_name].get_attr_by_label(col.name)))

    for col in self.stype_db.test_columns:
        if col.table_name not in test_sources:
            continue
        col_attrs.append((col, test_sources[col.table_name].get_attr_by_label(col.name)))

    for col, attr in col_attrs:
        pred_stypes = self.pred_type(col, top_n)
        attr.semantic_types = [
            SemanticType(stype[0].decode("utf-8"), stype[1].decode("utf-8"), score)
            for stype, score in pred_stypes if score > 0
        ]
def load_model(self):
    """Try to load a previously trained model if possible"""
    if self.model is not None:
        return

    model_file = self.exec_dir / 'model.pkl'
    if model_file.exists():
        self.logger.debug("Load previous trained model...")
        self.model = deserialize(model_file)
    else:
        self.logger.error("Cannot load model...")
        raise Exception("Model doesn't exist..")
def get_ontology(dataset: str) -> Ontology:
    """Get the ontology of a given dataset"""
    global _data_io_vars
    if dataset not in _data_io_vars["ont"]:
        # if it has been cached ...
        cache_file = get_cache_dir(dataset) / 'ont.pkl'
        cache_file.parent.mkdir(exist_ok=True, parents=True)
        if cache_file.exists():
            ont = deserialize(cache_file)
        else:
            ont = Ontology.from_dataset(dataset)
            serialize(ont, cache_file)
        _data_io_vars["ont"][dataset] = ont

    return _data_io_vars["ont"][dataset]
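# Usage sketch (illustrative): resolving a prefixed class name against the cached
# ontology, mirroring the full_uri calls in the debug script further down.
#
#     ont = get_ontology("museum_crm")
#     uri = ont.full_uri('crm:E52_Time-Span')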
def get_sampled_data_tables(dataset: str) -> List[DataTable]:
    global _data_io_vars
    if dataset not in _data_io_vars['sampled_data_tables']:
        # if it has been cached...
        cache_file = get_cache_dir(dataset) / "sampled_tables.pkl"
        if cache_file.exists():
            tables = deserialize(cache_file)
        else:
            tables = get_data_tables(dataset)
            settings = Settings.get_instance()
            tables = [tbl.sample(settings.n_samples, settings.random_seed) for tbl in tables]
            serialize(tables, cache_file)
        _data_io_vars["sampled_data_tables"][dataset] = tables

    return _data_io_vars["sampled_data_tables"][dataset]
def get_data_constraint_model(
        dataset: str,
        train_sms: List[SemanticModel],
) -> DataConstraint:
    global _instance
    if _instance is None:
        cache_file = get_cache_dir(dataset, train_sms) / "weak_models" / "data_constraint.pkl"
        cache_file.parent.mkdir(exist_ok=True, parents=True)
        need_rebuilt = True

        settings = Settings.get_instance()
        valid_threshold = settings.data_constraint_valid_threshold
        guess_datetime_threshold = settings.data_constraint_guess_datetime_threshold
        n_comparison_samples = settings.data_constraint_n_comparison_samples
        random_seed = settings.random_seed
        n_sample = settings.n_samples

        if cache_file.exists():
            DataConstraint.logger.debug("Try to load previous run...")
            model, cached_dataset, cached_train_sm_ids, extra_args = deserialize(cache_file)
            if cached_dataset == dataset \
                    and cached_train_sm_ids == {sm.id for sm in train_sms} \
                    and extra_args == (valid_threshold, guess_datetime_threshold,
                                       n_comparison_samples, random_seed, n_sample):
                need_rebuilt = False

        if need_rebuilt:
            DataConstraint.logger.debug("Re-build data-constraint model...")
            data_tables = [
                ColumnBasedTable.from_table(tbl)
                for tbl in get_sampled_data_tables(dataset)
            ]
            model = DataConstraint(train_sms, data_tables, valid_threshold,
                                   guess_datetime_threshold, n_comparison_samples)
            serialize((model, dataset, {sm.id for sm in train_sms},
                       (valid_threshold, guess_datetime_threshold,
                        n_comparison_samples, random_seed, n_sample)), cache_file)

        _instance = model
    return _instance
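# Usage sketch (illustrative): building the data-constraint weak model for a set of
# training semantic models. get_semantic_models and the particular train split are
# assumptions; the cache is keyed on the dataset, the training ids, and the settings.
#
#     train_sms = get_semantic_models("museum_crm")[:14]
#     data_constraint = get_data_constraint_model("museum_crm", train_sms)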
def get_raw_data_tables(dataset: str) -> List[DataTable]:
    global _data_io_vars
    if dataset not in _data_io_vars['raw_data_tables']:
        # if it has been cached...
        cache_file = get_cache_dir(dataset) / 'raw_tables.pkl'
        if cache_file.exists():
            raw_tables = deserialize(cache_file)
        else:
            raw_tables = []
            source_dir = Path(config.datasets[dataset].data.as_path())
            for file in sorted(source_dir.iterdir()):
                if file.name.startswith("."):
                    continue
                raw_tables.append(DataTable.load_from_file(file))
            serialize(raw_tables, cache_file)
        _data_io_vars["raw_data_tables"][dataset] = raw_tables

    return _data_io_vars["raw_data_tables"][dataset]
def get_stype_assistant_model(dataset: str, train_sms: List[SemanticModel]):
    global _instance
    if _instance is None:
        cache_file = get_cache_dir(dataset, train_sms) / "weak_models" / "stype_assistant.pkl"
        cache_file.parent.mkdir(exist_ok=True, parents=True)
        need_rebuilt = True

        if cache_file.exists():
            SemanticTypeAssistant.logger.debug("Try to load previous run...")
            model, cache_dataset, cache_train_sm_ids = deserialize(cache_file)
            if cache_dataset == dataset and cache_train_sm_ids == {sm.id for sm in train_sms}:
                need_rebuilt = False
                ont_graph = get_ont_graph(dataset)
                ont = get_ontology(dataset)
                stat = Statistic.get_instance(train_sms)
                ota = EmpiricalTripleAdviser(ont_graph, ont, stat.p_triple, 15)
                model.triple_adviser = ota

        if need_rebuilt:
            ont_graph = get_ont_graph(dataset)
            ont = get_ontology(dataset)
            stat = Statistic.get_instance(train_sms)
            ota = EmpiricalTripleAdviser(ont_graph, ont, stat.p_triple, 15)

            typer = SemanticTyper.get_instance(dataset, train_sms)
            try:
                typer.load_model()
            except Exception:
                sms = get_semantic_models(dataset)
                train_ids = {sm.id for sm in train_sms}
                typer.semantic_labeling(train_sms,
                                        [sm for sm in sms if sm.id not in train_ids],
                                        4)

            model = SemanticTypeAssistant(train_sms, typer, ota)
            # the triple adviser is not picklable/cache-friendly, so detach it before serializing
            model.triple_adviser = None
            serialize((model, dataset, {sm.id for sm in train_sms}), cache_file)
            model.triple_adviser = ota

        _instance = model
    return _instance
def get_classifier(self, retrain: bool, train_examples: List[Example]):
    # TODO: implement this properly; currently we have to train and save manually
    cached_file = get_cache_dir(
        self.example_annotator.dataset,
        list(self.example_annotator.train_source_ids)
    ) / "weak_models" / "node_prob_classifier.pkl"

    if not cached_file.exists() or retrain:
        self.logger.debug("Retrain new model")
        raw_X_train = make_data(self, train_examples)

        classifier = LogisticRegression(fit_intercept=True)
        X_train = numpy.asarray([list(features.values())[1:] for features in raw_X_train])
        X_train, y_train = X_train[:, :-1], [int(x) for x in X_train[:, -1]]

        scaler = StandardScaler().fit(X_train)
        # note: the original called scaler.transform without keeping the result,
        # so the classifier was fitted on unscaled features; assign it back here
        X_train = scaler.transform(X_train)

        try:
            classifier.fit(X_train, y_train)
        except ValueError as e:
            assert str(e).startswith("This solver needs samples of at least 2 classes in the data")
            # this should only happen at the starting phase, when we don't have any data
            # and rely on the ground truth to build the examples
            X_train = numpy.vstack([X_train, [0, 0]])
            y_train.append(0)
            classifier.fit(X_train, y_train)

        cached_file.parent.mkdir(exist_ok=True, parents=True)
        serialize((scaler, classifier), cached_file)
        return scaler, classifier

    return deserialize(cached_file)
# else:
#     print(".", end="")
#     return trace
#
# args = {'children_uris': set(),
#         'parents_uris': set(),
#         'uri': 'http://www.w3.org/2000/01/rdf-schema#Resource'}
#
# OntGraphNode("haha", set(), set())
# a = OntGraphNode(**args)
# print("==========")
# print(a.uri)
# sys.settrace(trace)

ont_graph: OntGraph = deserializeJSON(
    config.fsys.debug.as_path() + '/%s/cached/ont_graph.json' % dataset, OntGraph)
ont: Ontology = deserialize(config.fsys.debug.as_path() + '/%s/cached/ont.pkl' % dataset)

# print(a.uri)
# print("========SIGSEGV IN DEBUG MODE==")
# ont = Ontology.from_data_source(data_source)
# ont_graph = build_ont_graph(data_source)

# %%
# ont_graph.render2txt(config.fsys.debug.as_path() + '/%s/ont_graph.txt' % data_source)

# %%
s1 = ont.full_uri('crm:E63_Beginning_of_Existence')
s2 = ont.full_uri('crm:E52_Time-Span')

for predicate in ont_graph.get_possible_predicates(s1, s2):
    # if link.label.find('aggregatedCHO') != -1:
def from_file(dataset: str, input_dir: Path) -> 'Model':
    model_bin = deserialize(input_dir / 'gmtk_model.bin')
    model: TemplateLogLinearModel = model_bin[0]
    tf_domain: GrowableBinaryVectorDomain = model_bin[1]
    pairwise_domain = model_bin[2]
    return Model(dataset, model, tf_domain, pairwise_domain)
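# Usage sketch (illustrative), assuming from_file is exposed as a staticmethod on
# Model (otherwise call it directly). The directory mirrors the exp_no_10 model
# directory used in the debug script below and is an assumption.
#
#     model = Model.from_file(
#         "museum_crm",
#         Path(config.fsys.debug.as_path()) / "museum_crm" / "models" / "exp_no_10")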
    if None in bijection.prime2x:
        bijection.prime2x.pop(None)

    return {
        'f1': f1,
        'precision': precision,
        'recall': recall,
        '_bijection': bijection
    }


if __name__ == '__main__':
    dataset = "museum_crm"

    # %%
    ont = deserialize(config.fsys.debug.as_path() + '/%s/cached/ont.pkl' % dataset)
    ont_graph = deserialize(config.fsys.debug.as_path() + '/%s/cached/ont_graph.pkl' % dataset)
    karma_models = get_karma_models(dataset)
    karma_model = karma_models[59]

    # %%
    pred_sm = deserialize(config.fsys.debug.as_path() +
                          "/tmp/pred_sm.pkl").get_semantic_model_reassign_id().graph
    gold_sm = karma_model.graph

    # %%
    res = f1_precision_recall(gold_sm,
                              pred_sm,
                              data_node_mode=DataNodeMode.IGNORE_LABEL_DATA_NODE)
                (var.triple.features.domain.get_category(idx), features[idx])
                for idx in var.triple.features.get_active_index()
            ])
        else:
            print("\t .Factor features: ",
                  factor.assignment2features(assignment).tolist())
        print("\t .Score = ", score)
        break


if __name__ == '__main__':
    data_source = 'museum_crm'
    model_dir = config.fsys.debug.as_path() + "/%s/models/exp_no_10" % data_source
    model, tf_domain = deserialize(model_dir + '/gmtk_model.bin')

    # print out top-K features
    topK = 20
    class_idx = 0
    assert len(model.templates) == 2

    triple_factor = model.templates[0]
    triple_factor_weights = triple_factor.weights.view(2, -1)
    features = [(tf_domain.get_category(i), x, triple_factor_weights[1, i])
                for i, x in enumerate(triple_factor_weights[0, :])]
    features.sort(key=lambda x: x[1], reverse=True)
    for f in features:
        print(f)

    substructure = model.templates[1].weights
    print(substructure)