def generate_candidate_sm(dataset: str, test_sm: SemanticModel, stat: Statistic,
                          model_bundle, train_source_ids):
    """Run beam search over graph candidates for `test_sm` and return the set of
    candidate semantic models discovered during the search.

    :param dataset: dataset identifier used to load the ontology/ontology graph
    :param test_sm: source whose candidate semantic models are being generated
    :param stat: statistics object supplying empirical triple probabilities
    :param model_bundle: tuple of arguments unpacked into `Model(...)` (passed as a
        plain tuple so this function can be shipped to a worker process)
    :param train_source_ids: ids of the training sources, forwarded to the model's
        probability predictor
    :return: dict mapping `graph_to_hashable_string(g)` -> graph, deduplicating
        candidates by their serialized form
    """
    # generate candidate
    ont = get_ontology(dataset)
    ont_graph = get_ont_graph(dataset)
    settings = Settings.get_instance()
    # map: attribute label (bytes) -> predicted semantic types for that attribute
    dnodes: Dict[bytes, List[KarmaSemanticType]] = {
        attr.label.encode('utf-8'): attr.semantic_types
        for attr in test_sm.attrs
    }
    ota = EmpiricalTripleAdviser(
        ont_graph, ont, stat.p_triple,
        settings.searching_triple_adviser_max_candidate)
    graph_explorer_builder = GraphExplorerBuilder(
        ota,
        max_data_node_hop=settings.searching_max_data_node_hop,
        max_class_node_hop=settings.searching_max_class_node_hop)
    # register every attribute with the triple adviser before exploring
    for attr, semantic_types in dnodes.items():
        ota.add_data_node(attr, semantic_types)
    model = Model(*model_bundle)
    args = PGMBeamSearchArgs(
        test_sm.id,
        custom_search_discovery,
        Tracker(track_search_nodes=False),
        partial(model.predict_sm_probs, test_sm.id, train_source_ids),
        graph_explorer_builder,
        early_terminate_func=None,
        beam_width=settings.training_beam_width,
        gold_sm=test_sm.graph,
        source_attributes=test_sm.attrs,
        pre_filter_func=filter_unlikely_graph,
    )
    started_node = PGMStartSearchNode(
        args.get_and_increment_id(), args,
        [a.label.encode('utf-8') for a in test_sm.attrs])
    # seeded RNG so candidate generation is reproducible across runs
    args._tmp_random_state = numpy.random.RandomState(
        Settings.get_instance().random_seed)
    results: List[PGMSearchNode] = beam_search(
        [started_node],
        beam_width=settings.training_beam_width,
        n_results=settings.searching_n_explore_result,
        args=args)
    # collect candidates both from intermediate discovery nodes and final results,
    # keyed by a hashable serialization so duplicates collapse
    candidate_sms = {}
    for search_node in args._tmp_tracker_for_storing_search_discovery_nodes:
        g = search_node.get_value().graph
        candidate_sms[graph_to_hashable_string(g)] = g
    for search_node in results:
        g = search_node.get_value().graph
        candidate_sms[graph_to_hashable_string(g)] = g
    return candidate_sms
def filter_unlikely_graph(g: MergeGraph) -> bool:
    """Return False when the candidate graph has an unlikely structure.

    Two structural rules are applied to every class node:
      1. a class node with exactly one incoming and one outgoing link whose
         target is another class node (a pure "middle" node) is rejected;
      2. outgoing links are grouped by label: any group larger than
         `mrf_max_n_duplications` rejects the graph, and having more than
         `mrf_max_n_duplication_types` duplicated groups rejects it as well.
    """
    settings = Settings.get_instance()
    dup_limit = settings.mrf_max_n_duplications
    dup_type_limit = settings.mrf_max_n_duplication_types

    for node in g.iter_class_nodes():
        # rule 1: reject pure middle nodes (class -> class pass-through)
        if node.n_incoming_links == 1 and node.n_outgoing_links == 1:
            only_link = next(iter(node.iter_outgoing_links()))
            if only_link.get_target_node().is_class_node():
                return False

        # rule 2: bucket outgoing links by their label
        groups = {}
        for link in node.iter_outgoing_links():
            groups.setdefault(link.label, []).append(link)

        duplicated_groups = 0
        for links in groups.values():
            if len(links) > dup_limit:
                return False
            if len(links) > 1:
                duplicated_groups += 1
        if duplicated_groups > dup_type_limit:
            return False

    return True
def __init__(self, dataset: str, use_correct_type: bool, use_old_semantic_typer: bool,
             train_sm_ids: List[str],
             exec_dir: Optional[Union[str, Path]] = None,
             sm_type_dir: Optional[Union[str, Path]] = None):
    """Set up the wrapper around Mohsen's (JWS 2015) semantic modeling algorithm.

    :param dataset: dataset identifier
    :param use_correct_type: use gold semantic types instead of predicted ones
    :param use_old_semantic_typer: switch to the legacy semantic typer
    :param train_sm_ids: ids of the semantic models used for training
    :param exec_dir: working directory; defaults to the dataset cache dir
    :param sm_type_dir: directory holding serialized models with semantic types
    """
    self.dataset: str = dataset
    self.train_sm_ids = train_sm_ids
    self.ont = get_ontology(dataset)
    self.karma_models: Dict[str, KarmaModel] = {km.id: km for km in get_karma_models(dataset)}

    # modeling can only run once; re-invoking raises an error
    self.__has_run_modeling = False
    if exec_dir is None:
        exec_dir = get_cache_dir(dataset, train_sm_ids) / "mohsen_jws2015"
    self.exec_dir: Path = Path(exec_dir)
    self.sm_type_dir = sm_type_dir

    # parameters for Mohsen's algorithm
    self.use_old_semantic_typer = use_old_semantic_typer
    self.use_correct_type = use_correct_type
    # the algorithm is hard-wired to 4 candidate types; global setting must not exceed it
    assert Settings.get_instance().semantic_labeling_top_n_stypes <= 4
    self.num_candidate_semantic_type = 4
    self.multiple_same_property_per_node = True

    self.coherence = 1.0
    self.confidence = 1.0
    self.size_reduction = 0.5

    self.num_candidate_mappings = 50
    self.mapping_branching_factor = 50
    self.topk_steiner_tree = 10

    # effectively no cut-off: keep everything
    self.cut_off = int(1e6)
    self.our_and_karma_sm_alignments = {}
def predict_log_probs(self, examples: List[Example]):
    """Compute log P(desired assignment) for each example.

    Examples are processed in chunks of `self.max_n_tasks`; for each chunk the
    factor graphs are built, the partition functions (logZ) are computed in
    parallel, and the log probability of the all-True assignment is
    score(assignment) - logZ.

    :param examples: examples to score
    :return: list of floats, one log probability per example, in input order
    """
    log_probs = []
    for es in _(examples).isplit(self.max_n_tasks):
        varss = [self.get_variables(e) for e in es]
        # varss = self.parallel_get_variables(es)
        factorss = [self.model.get_factors(vars) for vars in varss]
        inferences = [
            self.inference(f, v) for f, v in zip(factorss, varss)
        ]
        # desired assignment: every variable set to True (encoded in its domain)
        desired_assignments = [{
            var: var.domain.encode_value(True)
            for var in vars
        } for vars in varss]
        # partition functions computed in parallel across the chunk
        logZs = parallel_marginal_inference(
            inferences,
            n_threads=Settings.get_instance().parallel_gmtk_n_threads)
        # log prob = unnormalized score of desired assignment minus logZ
        log_probs += [
            sum(
                f.score_assignment(desired_assignments[i])
                for f in factorss[i]) - logZs[i] for i in range(len(es))
        ]
    return log_probs
def generate_data(model: Model, dataset: str, train_sms: List[SemanticModel],
                  discover_sources: List[SemanticModel], n_iter):
    """Generate labeled training examples from candidate semantic models.

    Candidate generation is fanned out across a process pool (one task per
    source); each returned candidate graph is converted into an `Example`
    carrying an id derived from (source id, candidate index, iteration).

    :param model: trained model whose components are shipped to the workers
    :param dataset: dataset identifier
    :param train_sms: training semantic models (used for statistics and meta)
    :param discover_sources: sources to generate candidates for
    :param n_iter: iteration number, embedded in example ids
    :return: dict source_id -> {candidate key -> Example}
    """
    stat = Statistic.get_instance(train_sms)
    train_sids = [sm.id for sm in train_sms]
    # plain tuple so the model can cross the process boundary
    model_bundle = (model.dataset, model.model, model.tf_domain,
                    model.pairwise_domain)

    data = {}
    with get_pool(Settings.get_instance().parallel_n_process) as pool:
        pending = [
            pool.apply_async(generate_candidate_sm,
                             (dataset, source, stat, model_bundle, train_sids))
            for source in discover_sources
        ]
        for source, task in zip(discover_sources, pending):
            candidates: Dict[bytes, Graph] = task.get()
            data[source.id] = {
                key: make_example(
                    source, g,
                    Example.generate_example_id(source.id, idx, n_iter),
                    train_sids)
                for idx, (key, g) in enumerate(candidates.items())
            }
    return data
def __init__(self, multi_val_predicate: MultiValuePredicate, structure: LocalStructure):
    """Precompute duplication feature tensors for every (node label, child link).

    For each possible number of duplications n_dup (2..mrf_max_n_duplications),
    a tensor of shape [2]*n_dup + [n_features] is built whose rows hold the
    multi-value probability features for each boolean combination of the
    duplicated links.

    :param multi_val_predicate: model giving P(a predicate holds multiple values)
    :param structure: local structure describing node/child link spaces
    """
    # tensors[label][child_idx] is a list indexed by n_dup; slots 0 and 1 are
    # placeholders (duplication only starts at n_dup = 2)
    self.tensors: Dict[bytes, Dict[int, List[DenseTensor]]] = {}
    # reusable 2-element feature buffer: [p(multi-val), 1 - p(multi-val)]
    features = DenseTensorFunc.from_array([0, 0])
    n_features = features.size()[0]
    max_n_dups = Settings.get_instance().mrf_max_n_duplications
    for lbl, space in structure.node_structure_space.items():
        self.tensors[lbl] = {}
        for ctriple, child_idx in space.children.items():
            self.tensors[lbl][child_idx] = [None, None]
            for n_dup in range(2, max_n_dups + 1):
                # one row per boolean combination of the n_dup duplicated links
                tensor = DenseTensorFunc.zeros((2 ** n_dup, n_features))
                dims = [2] * n_dup
                dims.append(n_features)
                for count, current_val_index, values in iter_values(n_dup, 0):
                    if len(values) <= 1:
                        # zero or one active link: no duplication signal
                        features[0] = 0
                        features[1] = 0
                    else:
                        multi_val_prob = multi_val_predicate.compute_prob(ctriple[0], len(values))
                        # clamp at 0.01 to avoid zero-probability features
                        features[0] = max(multi_val_prob, 0.01)
                        features[1] = max(1 - multi_val_prob, 0.01)
                    tensor[count, :] = features
                self.tensors[lbl][child_idx].append(tensor.view_shape(dims))
def __init__(self, dataset: str, train_source_ids: List[str], load_circular_dependency: bool = True, training_examples: Optional[List[Example]] = None):
    """Build the annotator and all of its weak models for a dataset split.

    :param dataset: dataset identifier
    :param train_source_ids: ids of sources used for training
    :param load_circular_dependency: when True, also load components with a
        circular dependency on this object (e.g. node_prob)
    :param training_examples: training examples used to build weak models; not
        needed at testing time (i.e. None) because the weak models were built
        beforehand
    """
    self.dataset = dataset
    self.source_models = {sm.id: sm for sm in get_semantic_models(dataset)}
    self.train_source_ids = set(train_source_ids)
    self.top_k_semantic_types = Settings.get_instance(
    ).semantic_labeling_top_n_stypes
    self.training_models = [
        self.source_models[sid] for sid in train_source_ids
    ]
    self.typer: SemanticTyper = create_semantic_typer(
        dataset, self.training_models)
    # testing models = everything that is not in the training split
    self.testing_models = [
        self.source_models[sid] for sid in set(
            self.source_models.keys()).difference(train_source_ids)
    ]
    self.training_examples = training_examples

    # local (weak) models, all built from the training split only
    self.multival_predicate = MultiValuePredicate.get_instance(
        self.training_models)
    self.statistic = Statistic.get_instance(self.training_models)
    # self.data_constraint = get_data_constraint_model(dataset, self.training_models)
    self.stype_assistant = get_stype_assistant_model(
        dataset, self.training_models)
    self.local_structure = LocalStructure.get_instance(
        self.training_models)
    self.attribute_same_scope = AttributeScope.get_instance(self.dataset)
    self.duplication_tensors = DuplicationTensors.get_instance(
        self.training_models)
    self.primary_key: PrimaryKey = PrimaryKey.get_instance(
        dataset, self.training_models)
    self.cardinality = CardinalityFeatures.get_instance(dataset)

    # STEP 1: attach predicted semantic types to both splits
    self.typer.semantic_labeling(self.training_models,
                                 self.testing_models,
                                 self.top_k_semantic_types,
                                 eval_train=True)

    # STEP 2: load components with a circular dependency, e.g. node_prob
    if load_circular_dependency:
        self.node_prob = NodeProb(self, load_classifier=True)
def make_example(sm: SemanticModel, g: Graph, example_id,
                 train_sids: List[str] = None) -> Example:
    """Build a labeled `Example` from a gold semantic model and a candidate graph.

    Links of `g` are auto-labeled against the gold graph `sm.graph` using the
    configured auto-labeling method; currently only max-F1 matching is supported.

    :param sm: gold semantic model providing the reference graph
    :param g: candidate graph to label
    :param example_id: identifier stored in the example's metadata
    :param train_sids: ids of training sources stored in the example's metadata
    :return: the labeled example
    :raises ValueError: if the configured auto-labeling method is not supported
    """
    settings = Settings.get_instance()
    if settings.auto_labeling_method == Settings.ALGO_AUTO_LBL_MAX_F1:
        link2label, prime2x = AutoLabel.auto_label_max_f1(sm.graph, g, False)[:2]
        example = Example(sm.graph, g, link2label, prime2x)
        example.set_meta(example_id, train_sids)
        return example
    # Previously this was `assert False`, which is stripped under `python -O`
    # and would silently return None; raise explicitly instead.
    raise ValueError(
        f"Unsupported auto labeling method: {settings.auto_labeling_method}")
def semantic_labeling(dataset: str, train_sms: List[SemanticModel],
                      test_sms: List[SemanticModel]):
    """Run semantic labeling on the train/test split.

    In the default mode, a typer built from the full training set labels both
    splits (evaluating on train as well). When `semantic_labeling_simulate_testing`
    is enabled, each training source is first labeled in leave-one-out fashion
    (so its types look like test-time predictions), then the test sources are
    labeled with a typer built from the full training set.
    """
    top_n = Settings.get_instance().semantic_labeling_top_n_stypes

    if not Settings.get_instance().semantic_labeling_simulate_testing:
        create_semantic_typer(dataset, train_sms).semantic_labeling(
            train_sms, test_sms, top_n=top_n, eval_train=True)
        return

    # leave-one-out labeling of each training source
    for held_out in train_sms:
        remainder = [s for s in train_sms if s.id != held_out.id]
        create_semantic_typer(dataset, remainder).semantic_labeling(
            remainder, [held_out], top_n=top_n, eval_train=False)
        # reset singletons so the next fold builds a fresh typer
        SemanticTyper.instance = None
        SemanticTypeDB.instance = None

    create_semantic_typer(dataset, train_sms).semantic_labeling(
        train_sms, test_sms, top_n=top_n, eval_train=False)
def create_semantic_typer(dataset: str, train_sms: List[SemanticModel]) -> SemanticTyper:
    """Instantiate the semantic typer selected by `settings.semantic_labeling_method`.

    :param dataset: dataset identifier
    :param train_sms: training semantic models handed to the typer
    :return: a SemanticTyper (or compatible) instance
    :raises Exception: when the configured method is unknown, or when the
        Serene method is requested with an unrecognized train split
    """
    settings = Settings.get_instance()
    if settings.semantic_labeling_method == Settings.MohsenJWS:
        # noinspection PyTypeChecker
        return MohsenSemanticTyper.get_instance(dataset, train_sms)

    if settings.semantic_labeling_method == Settings.ReImplMinhISWC:
        return SemanticTyper.get_instance(dataset, train_sms)

    if settings.semantic_labeling_method == Settings.MohsenJWS + "-Oracle":
        # semi-oracle: wrap the real typer with oracle corrections
        # noinspection PyTypeChecker
        return SemiOracleSemanticLabeling(
            MohsenSemanticTyper.get_instance(dataset, train_sms))

    if settings.semantic_labeling_method == Settings.ReImplMinhISWC + "-Oracle":
        # noinspection PyTypeChecker
        return SemiOracleSemanticLabeling(
            SemanticTyper.get_instance(dataset, train_sms))

    if settings.semantic_labeling_method == Settings.OracleSL:
        # noinspection PyTypeChecker
        return OracleSemanticLabeling()

    if settings.semantic_labeling_method == "OracleSL-Constraint":
        # noinspection PyTypeChecker
        return ConstraintOracleSemanticLabeling()

    if settings.semantic_labeling_method == "SereneSemanticType":
        # Serene types are precomputed per k-fold split; the train split is
        # matched positionally against the dataset's source list.
        sms = get_semantic_models(dataset)
        if dataset == "museum_edm" and train_sms == sms[:14]:
            serene_dir = "/workspace/tmp/serene-python-client/datasets/GOLD/museum_edm_stypes/kfold-s01-s14"
        elif dataset == "museum_edm" and train_sms == sms[14:]:
            serene_dir = "/workspace/tmp/serene-python-client/datasets/GOLD/museum_edm_stypes/kfold-s15-s28"
        elif dataset == "museum_edm" and train_sms == sms[7:21]:
            serene_dir = "/workspace/tmp/serene-python-client/datasets/GOLD/museum_edm_stypes/kfold-s08-s21"
        elif dataset == "museum_crm" and train_sms == sms[:14]:
            serene_dir = "/workspace/tmp/serene-python-client/datasets/GOLD/museum_crm_stypes/kfold-s01-s14"
        elif dataset == "museum_crm" and train_sms == sms[14:]:
            serene_dir = "/workspace/tmp/serene-python-client/datasets/GOLD/museum_crm_stypes/kfold-s15-s28"
        elif dataset == "museum_crm" and train_sms == sms[7:21]:
            serene_dir = "/workspace/tmp/serene-python-client/datasets/GOLD/museum_crm_stypes/kfold-s08-s21"
        else:
            raise Exception("Invalid configuration of serene semantic types")
        # noinspection PyTypeChecker
        return SereneSemanticTypes(dataset, Path(serene_dir))

    raise Exception(
        f"Invalid semantic typer: {settings.semantic_labeling_method}")
def get_sampled_data_tables(dataset: str) -> List[DataTable]:
    """Return sampled data tables for `dataset`, memoized in memory and on disk.

    On a cache miss the full tables are loaded, down-sampled according to the
    global settings (n_samples / random_seed), and persisted to
    `sampled_tables.pkl` for subsequent runs.
    """
    global _data_io_vars
    registry = _data_io_vars["sampled_data_tables"]
    if dataset in registry:
        return registry[dataset]

    cache_file = get_cache_dir(dataset) / "sampled_tables.pkl"
    if cache_file.exists():
        # disk cache hit from a previous run
        tables = deserialize(cache_file)
    else:
        settings = Settings.get_instance()
        tables = [
            tbl.sample(settings.n_samples, settings.random_seed)
            for tbl in get_data_tables(dataset)
        ]
        serialize(tables, cache_file)

    registry[dataset] = tables
    return registry[dataset]
def get_data_constraint_model(
        dataset: str,
        train_sms: List[SemanticModel],
) -> DataConstraint:
    """Return the singleton DataConstraint model, rebuilding it if the on-disk
    cache is missing or was produced with different data/parameters.

    NOTE(review): the singleton is keyed only on "is None" — calling this again
    with a different dataset/train split returns the first instance unchanged;
    confirm callers never mix splits within one process.

    :param dataset: dataset identifier
    :param train_sms: training semantic models the constraint model is fit on
    :return: the cached or freshly built DataConstraint instance
    """
    global _instance
    if _instance is None:
        cache_file = get_cache_dir(
            dataset, train_sms) / "weak_models" / "data_constraint.pkl"
        cache_file.parent.mkdir(exist_ok=True, parents=True)
        need_rebuilt = True

        # parameters that participate in cache validation
        settings = Settings.get_instance()
        valid_threshold = settings.data_constraint_valid_threshold
        guess_datetime_threshold = settings.data_constraint_guess_datetime_threshold
        n_comparison_samples = settings.data_constraint_n_comparison_samples
        random_seed = settings.random_seed
        n_sample = settings.n_samples

        if cache_file.exists():
            DataConstraint.logger.debug("Try to load previous run...")
            model, cached_dataset, cached_train_sm_ids, extra_args = deserialize(
                cache_file)
            # cache is valid only if dataset, train split, and every parameter match
            if cached_dataset == dataset \
                    and cached_train_sm_ids == {sm.id for sm in train_sms} \
                    and extra_args == (
                            valid_threshold, guess_datetime_threshold,
                            n_comparison_samples, random_seed, n_sample):
                need_rebuilt = False

        if need_rebuilt:
            DataConstraint.logger.debug("Re-build data-constraint model...")
            data_tables = [
                ColumnBasedTable.from_table(tbl)
                for tbl in get_sampled_data_tables(dataset)
            ]
            model = DataConstraint(train_sms, data_tables, valid_threshold,
                                   guess_datetime_threshold,
                                   n_comparison_samples)
            # persist model together with everything needed to validate the cache
            serialize((model, dataset, {sm.id for sm in train_sms},
                       (valid_threshold, guess_datetime_threshold,
                        n_comparison_samples, random_seed, n_sample)),
                      cache_file)

        _instance = model
    return _instance
def __init__(self, dataset: str, model: TemplateLogLinearModel,
             tf_domain: GrowableBinaryVectorDomain,
             pairwise_domain: GrowableBinaryVectorDomain) -> None:
    """Wrap a trained template log-linear model for prediction on a dataset.

    :param dataset: dataset identifier
    :param model: the trained log-linear model
    :param tf_domain: feature domain for template factors
    :param pairwise_domain: feature domain for pairwise factors
    """
    self.dataset = dataset
    self.source_models: Dict[str, SemanticModel] = {
        s.id: s
        for s in get_semantic_models(dataset)
    }
    # constructors for marginal and MAP inference, instantiated per factor graph
    self.inference = BeliefPropagation.get_constructor(InferProb.MARGINAL)
    self.map_inference = BeliefPropagation.get_constructor(InferProb.MAP)
    self.model: TemplateLogLinearModel = model
    # caching is disabled because factors are built per prediction here
    for template in model.templates:
        if isinstance(template, CachedTemplateFactorConstructor):
            template.disable_cache()
    self.tf_domain: GrowableBinaryVectorDomain = tf_domain
    self.pairwise_domain = pairwise_domain
    # lazily populated elsewhere before annotation is needed
    self.example_annotator: ExampleAnnotator = None
    self.max_n_tasks = Settings.get_instance().max_n_tasks
def predict_sm(model: Model, dataset: str, train_sms: List[SemanticModel],
               evaluate_sms: List[SemanticModel], workdir):
    """Predict a semantic model for each evaluation source and persist results.

    Candidate generation runs in a process pool (one task per source). The best
    candidate of each source becomes its prediction; per-iteration search
    performance and the full search history are serialized to `workdir`.

    :param model: trained model whose components are shipped to the workers
    :param dataset: dataset identifier
    :param train_sms: training semantic models
    :param evaluate_sms: sources to predict
    :param workdir: output directory (Path-like) for the JSON artifacts
    :return: dict source_id -> predicted Graph
    """
    train_sids = [sm.id for sm in train_sms]
    predictions: Dict[str, Graph] = {}
    stat = Statistic.get_instance(train_sms)
    # plain tuple so the model can cross the process boundary
    model_bundle = (model.dataset, model.model, model.tf_domain,
                    model.pairwise_domain)
    search_performance_history = {}
    search_history = {}

    with get_pool(Settings.get_instance().parallel_n_process) as pool:
        results = []
        for sm in evaluate_sms:
            result = pool.apply_async(
                generate_candidate_sm,
                (dataset, sm, stat, model_bundle, train_sids))
            results.append(result)

        # worker returns (scored candidates, per-iteration performance, history)
        pred_sms: Tuple[List[Tuple[float, Graph]], List[Tuple[
            int, float, float, float, float]], List[List[Graph]]]
        for sm, result in zip(evaluate_sms, results):
            pred_sms = result.get()
            # best-scoring candidate's graph is the prediction
            predictions[sm.id] = pred_sms[0][0][1]
            search_performance_history[sm.id] = pred_sms[1]
            search_history[sm.id] = pred_sms[2]

    serializeJSON({sid: o.to_dict() for sid, o in predictions.items()},
                  workdir / "predicted_sms.json")
    serializeJSON(search_performance_history,
                  workdir / "search_performance_history.json",
                  indent=4)
    serializeJSON(
        {
            sid: [[o.to_dict() for o in os] for os in oss]
            for sid, oss in search_history.items()
        }, workdir / "search_history.json")
    return predictions
def __init__(self, all_children_weights: DenseTensor,
             pairwise_pk_weights: DenseTensor,
             pairwise_scope_weights: DenseTensor,
             duplication_weights: Dict[str, DenseTensor],
             pairwise_domain: GrowableBinaryVectorDomain[str]) -> None:
    """Hold all learned weight tensors used by the factor templates.

    :param all_children_weights: weights for the all-children factor
    :param pairwise_pk_weights: weights for pairwise primary-key factors
    :param pairwise_scope_weights: weights for pairwise attribute-scope factors
    :param duplication_weights: per-key weights for duplication factors
    :param pairwise_domain: feature domain of the pairwise factors
    """
    self.settings = Settings.get_instance()
    self.all_children_weights: Weights = Weights(all_children_weights)
    self.pairwise_pk_weights: Weights = Weights(pairwise_pk_weights)
    self.pairwise_scope_weights: Weights = Weights(pairwise_scope_weights)
    self.duplication_weights: Dict[str, Weights] = {
        k: Weights(v)
        for k, v in duplication_weights.items()
    }
    self.pairwise_domain = pairwise_domain
    self.boolean_domain = BooleanVectorDomain.get_instance()
    # 4x4 identity (viewed as 4x4x1) used to compute the pairwise factor's
    # feature tensor; similar to
    # DotTensor1WithSufficientStatisticFactor.get_feature_tensor#domain_tensor
    self.pairwise_indice_func_tensor = DenseTensorFunc.from_array(
        [[1, 0, 0, 0], [0, 1, 0, 0], [0, 0, 1, 0], [0, 0, 0,
                                                    1]]).view(4, 4, 1)
def serialize_stype_assistant(dataset: str, sms: List[SemanticModel],
                              train_sms: List[SemanticModel],
                              test_sms: List[SemanticModel]):
    """Serialize predicted parent semantic types into plain dicts, one per source.

    For each source in `sms`, produces a mapping from attribute label to a list
    of candidate stypes, each with its confidence score and its own list of
    candidate parent stypes. Sources without predictions get an empty dict, so
    the output list is aligned index-for-index with `sms`.

    :return: list of dicts, one per source in `sms`
    """
    predicted_parent_stypes = SemanticTyper.get_instance(
        dataset, train_sms).semantic_labeling_parent(
            train_sms,
            test_sms,
            top_n=Settings.get_instance().semantic_labeling_top_n_stypes,
            eval_train=True)
    results = []
    for sm in sms:
        if sm.id not in predicted_parent_stypes:
            # keep positional alignment with `sms`
            results.append({})
            continue
        all_parent_stypes = predicted_parent_stypes[sm.id]
        result = {}
        for attr_id, g_parent_stypes in all_parent_stypes.items():
            # key by the attribute's label in the source graph
            result[sm.graph.get_node_by_id(attr_id).label] = [{
                "stype": {
                    "domain": stype[0],
                    "type": stype[1],
                    "confidence_score": score
                },
                # a parent stype of None means "no parent"; encode as empty strings
                "parent_stypes": [{
                    "domain":
                    parent_stype[0] if parent_stype is not None else "",
                    "type":
                    parent_stype[1] if parent_stype is not None else "",
                    "confidence_score":
                    parent_stype_score
                } for parent_stype, parent_stype_score in parent_stypes]
            } for stype, score, parent_stypes in g_parent_stypes]
        results.append(result)
    return results
def __init__(self, dataset: str, train_source_ids: List[str]) -> None:
    """Spawn worker processes for parallel example annotation.

    Each worker receives one end of a Pipe and is started before a "start"
    message (carrying the dataset and train split) is sent to every worker.

    :param dataset: dataset identifier forwarded to the workers
    :param train_source_ids: training source ids forwarded to the workers
    """
    self.n_processes = Settings.get_instance().parallel_n_annotators
    # local annotator kept for in-process work alongside the workers
    self.annotator = ExampleAnnotator(dataset, train_source_ids)
    self.processes = []
    for i in range(self.n_processes):
        parent_conn, child_conn = multiprocessing.Pipe()
        process = multiprocessing.Process(
            target=ParallelAnnotator.parallel_annotate,
            name=f"parallel-annotator-{i}",
            args=(child_conn, ))
        # NOTE(review): child_conn is kept open in the parent as well — confirm
        # this is intentional (it prevents EOF detection on the pipe).
        self.processes.append({
            "parent_conn": parent_conn,
            "child_conn": child_conn,
            "process": process
        })
        process.start()
    # all workers are started before any of them is told to begin
    for proc_info in self.processes:
        proc_info['parent_conn'].send({
            "message": "start",
            "dataset": dataset,
            "train_sm_ids": train_source_ids
        })
def generate_candidate_sm(dataset: str, test_sm: SemanticModel, stat: Statistic,
                          model_bundle, train_source_ids):
    """Search for candidate semantic models of `test_sm` and report performance.

    Unlike the training-time variant, this one tracks search nodes per
    iteration and evaluates every iteration's best candidate against the gold
    graph.

    :param dataset: dataset identifier used to load the ontology/ontology graph
    :param test_sm: source whose candidate semantic models are being generated
    :param stat: statistics object supplying empirical triple probabilities
    :param model_bundle: tuple of arguments unpacked into `Model(...)`
    :param train_source_ids: ids of the training sources
    :return: tuple of
        - pred_sms: list of (score, graph) for the final beam results,
        - performances: list of (iter_no, score, precision, recall, f1),
        - search_history: per-iteration lists of candidate graphs, with the
          final results appended as the last entry
    """
    # generate candidate
    ont = get_ontology(dataset)
    ont_graph = get_ont_graph(dataset)
    settings = Settings.get_instance()
    # map: attribute label (bytes) -> predicted semantic types for that attribute
    dnodes: Dict[bytes, List[KarmaSemanticType]] = {
        attr.label.encode('utf-8'): attr.semantic_types
        for attr in test_sm.attrs
    }
    ota = EmpiricalTripleAdviser(
        ont_graph, ont, stat.p_triple,
        settings.searching_triple_adviser_max_candidate)
    graph_explorer_builder = GraphExplorerBuilder(
        ota,
        max_data_node_hop=settings.searching_max_data_node_hop,
        max_class_node_hop=settings.searching_max_class_node_hop)
    for attr, semantic_types in dnodes.items():
        ota.add_data_node(attr, semantic_types)
    early_stopping = EarlyStopping()
    model = Model(*model_bundle)
    args = PGMBeamSearchArgs(
        test_sm.id,
        discovering_func,
        Tracker(track_search_nodes=True),
        partial(model.predict_sm_probs, test_sm.id, train_source_ids),
        graph_explorer_builder,
        # early_terminate_func=early_stopping.early_stopping,
        early_terminate_func=None,
        beam_width=settings.searching_beam_width,
        gold_sm=test_sm.graph,
        source_attributes=test_sm.attrs,
        pre_filter_func=filter_unlikely_graph,
    )
    started_nodes = [
        PGMStartSearchNode(args.get_and_increment_id(), args,
                           [a.label.encode('utf-8') for a in test_sm.attrs])
    ]
    results: List[PGMSearchNode] = beam_search(
        started_nodes,
        beam_width=settings.searching_beam_width,
        n_results=settings.searching_n_explore_result,
        args=args)
    # *****************************************************************************************************************'
    # DEBUG CODE: the directory below was used by (now removed) commented-out
    # code that rendered/serialized intermediate search nodes for inspection.
    output_dir = Path(config.fsys.debug.as_path() + "/tmp/final/")

    # STEP 4: report performance of the last tracked iteration vs the final result
    print(
        f"{test_sm.id}: Performance at prev iter:",
        smodel_eval.f1_precision_recall(
            test_sm.graph,
            args.tracker.list_search_nodes[-1][0].get_value().graph,
            DataNodeMode.NO_TOUCH))
    print(
        f"{test_sm.id}: Performance at final iter:",
        smodel_eval.f1_precision_recall(test_sm.graph,
                                        results[0].get_value().graph,
                                        DataNodeMode.NO_TOUCH))
    # *****************************************************************************************************************'
    # evaluate the best candidate of every iteration against the gold graph
    performances = []
    for iter_no, search_nodes in enumerate(args.tracker.list_search_nodes):
        if len(search_nodes) == 0:
            continue
        x = smodel_eval.f1_precision_recall(test_sm.graph,
                                            search_nodes[0].get_value().graph,
                                            DataNodeMode.NO_TOUCH)
        performances.append((iter_no, search_nodes[0].get_score(),
                             x['precision'], x['recall'], x['f1']))
    # final results appended as the last performance entry
    x = smodel_eval.f1_precision_recall(test_sm.graph,
                                        results[0].get_value().graph,
                                        DataNodeMode.NO_TOUCH)
    performances.append((len(performances), results[0].get_score(),
                         x['precision'], x['recall'], x['f1']))
    pred_sms = [(search_node.get_score(), search_node.get_value().graph)
                for search_node in results]
    search_history = [[n.get_value().graph for n in search_nodes]
                      for search_nodes in args.tracker.list_search_nodes]
    search_history.append([n.get_value().graph for n in results])
    return pred_sms, performances, search_history
test_examples = [] for sid in search_history: for i, gs in enumerate(search_history[sid]): for j, g in enumerate(gs): eid = Example.generate_example_id(sid, j, i) example = make_example(evaluate_sms[sid], Graph.from_dict(g), eid, train_sm_ids) test_examples.append(example) serializeJSON(test_examples, workdir / "examples" / "test.json") return test_examples if __name__ == '__main__': dataset = "museum_edm" Settings.get_instance(False).parallel_n_process = 6 Settings.get_instance().max_n_tasks = 160 Settings.get_instance().semantic_labeling_top_n_stypes = 4 Settings.get_instance().searching_beam_width = 5 Settings.get_instance().log_current_settings() source_models = get_semantic_models(dataset) train_sms = source_models[:6] test_sms = [sm for sm in source_models if sm not in train_sms] workdir = Path(config.fsys.debug.as_path( )) / dataset / "main_experiments" / get_short_train_name(train_sms) workdir.mkdir(exist_ok=True, parents=True) create_semantic_typer(dataset, train_sms).semantic_labeling( train_sms,
help='Number of samples') parser.add_argument('--seed', type=int, required=True, default=120, help='Random seed') args = parser.parse_args() return args if __name__ == '__main__': args = get_shell_args() dataset = args.dataset settings = Settings.get_instance(False) settings.n_samples = args.n_samples settings.random_seed = args.seed settings.log_current_settings() ont = get_ontology(dataset) source_dir = Path( config.datasets[dataset].as_path()) / "karma-version" / "sources" source_dir.mkdir(exist_ok=True, parents=True) meta_file = source_dir / ".meta" if meta_file.exists(): meta = deserializeJSON(meta_file) if meta['n_samples'] == settings.n_samples and meta[ 'random_seed'] == settings.random_seed:
return "scores_%s---%s" % (domain, type) if __name__ == '__main__': dataset = "museum_edm" sms = get_semantic_models(dataset) ont = get_ontology(dataset) serene_dir = Path( "/workspace/tmp/serene-python-client/datasets/") / dataset top_n = 4 normalize_score = False for semantic_type in [ "MohsenJWS", "ReImplMinhISWC", "OracleSL-Constraint" ]: Settings.get_instance(False).semantic_labeling_method = semantic_type for kfold in ["kfold-s01-s14", "kfold-s15-s28", "kfold-s08-s21"]: serene_stypes = {} for sm in sms: for n in sm.graph.iter_data_nodes(): e = n.get_first_incoming_link() stype = get_serene_style( e.get_source_node().label.decode(), e.label.decode()) if stype not in serene_stypes: serene_stypes[stype] = len(serene_stypes) serene_stypes["scores_unknown"] = len(serene_stypes) header = [ "", "column_id", "column_name", "confidence", "dataset_id", "model_id", "label", "user_label"
raise return args if __name__ == '__main__': args = get_shell_args() source_models: List[SemanticModel] = get_semantic_models(args.dataset) train_sms = [ sm for sm in source_models if sm.id in args.kfold['train_sm_ids'] ] test_sms = [ sm for sm in source_models if sm.id in args.kfold['test_sm_ids'] ] Settings.get_instance(False).semantic_labeling_method = args.semantic_typer Settings.get_instance().log_current_settings() typer = create_semantic_typer(args.dataset, train_sms) typer.semantic_labeling(train_sms, test_sms, 4, eval_train=True) exp_dir = Path(args.exp_dir) eval_sources( train_sms, exp_dir / f"{typer.__class__.__name__}_{get_short_train_name(train_sms)}_eval.train.csv" ) eval_sources( test_sms, exp_dir / f"{typer.__class__.__name__}_{get_short_train_name(train_sms)}_eval.test.csv" )
for n in sm.graph.iter_nodes()}) example.set_meta(Example.generate_example_id(sm.id, 0, 0), [sm.id for sm in train_sms]) train_examples.append(example) raw_model, tf_domain, pairwise_domain, __ = train_model( dataset, [sm.id for sm in train_sms], 120, train_examples, [], training_args, basedir) return Model(dataset, raw_model, tf_domain, pairwise_domain) if __name__ == '__main__': from semantic_modeling.assembling.learning.evaluate import predict_sm # DenseTensorFunc.set_default_type(DType.Double) Settings.get_instance(False).parallel_gmtk_n_threads = 12 Settings.get_instance().log_current_settings() dataset = "museum_edm" source_models = get_semantic_models(dataset) train_sms = source_models[:6] train_sm_ids = [sm.id for sm in train_sms] test_sms = [sm for sm in source_models if sm.id not in train_sm_ids] workdir = Path(config.fsys.debug.as_path( )) / dataset / "main_experiments" / get_short_train_name(train_sms) workdir.mkdir(exist_ok=True, parents=True) create_semantic_typer(dataset, train_sms).semantic_labeling( train_sms, test_sms,
def run_evaluation_workflow(dataset: str, scenario: Scenario, train_sms, test_sms):
    """End-to-end evaluation of Mohsen's semantic modeling on a train/test split.

    Runs semantic typing, writes typed models to a temp dir, invokes the
    modeler, evaluates predictions (mapping + labeling), and renders
    visualizations of the predicted graphs.

    :param dataset: dataset identifier
    :param scenario: evaluation scenario; SCENARIO_1 ignores data nodes
    :param train_sms: training semantic models
    :param test_sms: test semantic models
    :return: evaluation table (header row + one row per test source + average row)
    """
    ont: Ontology = get_ontology(dataset)
    karma_models: List[KarmaModel] = get_karma_models(dataset)
    semantic_models: List[SemanticModel] = get_semantic_models(dataset)
    train_sm_ids = [sm.id for sm in train_sms]
    sdesc_args = dict(
        dataset=dataset,
        train_sm_ids=train_sm_ids,
        use_correct_type=
        False,  # we always put semantic types to learnedSemanticTypes, even for userSetSemanticTypes
        use_old_semantic_typer=False,
        exec_dir=get_cache_dir(dataset, train_sms) / "mohsen_jws2015",
        sm_type_dir=Path(config.fsys.debug.as_path()) / "tmp" /
        "models-json-temp")
    # STEP 1: run semantic typing and put the result in a temporary folder
    if sdesc_args['sm_type_dir'].exists():
        shutil.rmtree(sdesc_args['sm_type_dir'])
    sdesc_args['sm_type_dir'].mkdir(exist_ok=True, parents=True)
    top_k_types = Settings.get_instance().semantic_labeling_top_n_stypes
    typer = create_semantic_typer(dataset, train_sms)
    typer.semantic_labeling(train_sms, test_sms, top_k_types, eval_train=True)
    for sm, ksm in zip(semantic_models, karma_models):
        # assign predicted semantic types to learnedSemanticTypes; gold types
        # (from the graph structure) go to userSetSemanticTypes
        sm_alignment = SemanticModelAlignment(sm, ksm)
        for col in ksm.source_columns:
            attr = sm.get_attr_by_label(
                sm.graph.get_node_by_id(
                    sm_alignment.alignment[col.id]).label.decode('utf-8'))
            node = ksm.karma_graph.get_node_by_id(col.id)
            link = node.get_first_incoming_link()
            node.learned_semantic_types = [
                KarmaSemanticType(node.id, stype.domain, stype.type,
                                  typer.__class__.__name__,
                                  stype.confidence_score)
                for stype in attr.semantic_types
            ]
            node.user_semantic_types = [
                KarmaSemanticType(node.id,
                                  link.get_source_node().label.decode(),
                                  link.label.decode(), "User", 1.0)
            ]
        serializeJSON(ksm.to_normalized_json_model(ont),
                      sdesc_args['sm_type_dir'] / f"{ksm.id}-model.json",
                      indent=4)
    # STEP 2: invoke semantic modeling
    modeler = MohsenSemanticModeling(**sdesc_args)
    pred_sms = modeler.sm_prediction(train_sms, test_sms)
    # STEP 3: evaluate the predicted semantic mappings
    eval_hist = [["source", "precision", "recall", "f1", "stype-acc"]]
    if scenario == Scenario.SCENARIO_1:
        data_node_mode = DataNodeMode.IGNORE_DATA_NODE
    else:
        data_node_mode = DataNodeMode.NO_TOUCH
    for sm, pred_sm in zip(test_sms, pred_sms):
        eval_result = smodel_eval.f1_precision_recall(sm.graph, pred_sm.graph,
                                                      data_node_mode)
        eval_hist.append([
            sm.id, eval_result["precision"], eval_result["recall"],
            eval_result["f1"],
            smodel_eval.stype_acc(sm.graph, pred_sm.graph)
        ])
    # append per-metric averages over all test sources
    eval_hist.append([
        'average',
        np.average([float(x[1]) for x in eval_hist[1:]]),
        np.average([float(x[2]) for x in eval_hist[1:]]),
        np.average([float(x[3]) for x in eval_hist[1:]]),
        np.average([float(x[4]) for x in eval_hist[1:]])
    ])
    serializeCSV(
        eval_hist,
        sdesc_args["exec_dir"] / f"evaluation_result_{scenario.value}.csv")
    # STEP 4: evaluate the predicted semantic labeling
    pred_stypes = modeler.semantic_labeling(train_sms, test_sms)
    for pred_stype, sm in zip(pred_stypes, test_sms):
        for attr in sm.attrs:
            if attr.label not in pred_stype:
                attr.semantic_types = []
            else:
                attr.semantic_types = pred_stype[attr.label]
    eval_sources(
        test_sms, sdesc_args["exec_dir"] /
        f"evaluation_result_{scenario.value}_stype.csv")
    # STEP 5: visualize the predictions (rendered concurrently)
    (sdesc_args['exec_dir'] / "prediction-viz").mkdir(exist_ok=True)
    need_render_graphs = [
        (colorize_prediction(
            pred_sm.graph,
            AutoLabel.auto_label_max_f1(sm.graph, pred_sm.graph, False)[0]),
         sdesc_args['exec_dir'] / "prediction-viz" / f"{sm.id}.png")
        for sm, pred_sm in zip(test_sms, pred_sms)
    ]
    with ThreadPool(32) as p:
        p.map(render_graph, need_render_graphs)
    return eval_hist