Example #1
def get_data_tables(dataset: str) -> List[DataTable]:
    global _data_io_vars
    if dataset not in _data_io_vars['data_tables']:
        # if it has been cached...
        cache_file = get_cache_dir(dataset) / 'tables.pkl'
        if cache_file.exists():
            tables = deserialize(cache_file)
        else:
            mapping_dir = Path(config.datasets[dataset].models_y2rml.as_path())
            raw_tables = get_raw_data_tables(dataset)
            R2RML.load_python_scripts(Path(config.datasets[dataset].python_code.as_path()))
            tables = []
            semantic_models = []
            for i, raw_tbl in enumerate(raw_tables):
                r2rml_file = mapping_dir / f"{raw_tbl.id}-model.yml"
                tbl, sm = R2RML.load_from_file(r2rml_file).apply_build(raw_tbl)
                tables.append(tbl)
                semantic_models.append(sm)

            serialize(tables, cache_file)
            _data_io_vars['semantic_models'][dataset] = semantic_models  # avoid applying R2RML twice

        _data_io_vars["data_tables"][dataset] = tables

    return _data_io_vars["data_tables"][dataset]
Example #2
    def get_stype_db(dataset: str, train_source_ids: List[str],
                     cache_dir: Path) -> 'SemanticTypeDB':
        if SemanticTypeDB.instance is None:
            cache_file = cache_dir / 'stype_db.pkl'
            if cache_file.exists():
                SemanticTypeDB.logger.debug(
                    "Load SemanticTypeDB from cache file...")
                stype_db: SemanticTypeDB = deserialize(cache_file)
                if set(train_source_ids) != {tbl.id for tbl in stype_db.train_tables} \
                        or stype_db.dataset != dataset:
                    stype_db = None
            else:
                stype_db = None

            if stype_db is None:
                SemanticTypeDB.logger.debug(
                    "Have to re-create SemanticTypeDB...")
                stype_db = SemanticTypeDB.create(dataset, train_source_ids)
                stype_db._build_db()
                serialize(stype_db, cache_file)

            SemanticTypeDB.instance = stype_db

        return SemanticTypeDB.instance
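get_stype_db caches a single SemanticTypeDB per process on the class itself and discards a deserialized copy whenever the dataset or the set of training table ids differs. A minimal sketch of the class-level state it relies on; the attribute names come from the example, everything else (and the omitted create/_build_db methods) is assumed:

import logging
from typing import List, Optional

class SemanticTypeDB:
    instance: Optional['SemanticTypeDB'] = None   # process-wide singleton
    logger = logging.getLogger('SemanticTypeDB')

    def __init__(self, dataset: str, train_tables: List['DataTable']):
        self.dataset = dataset
        self.train_tables = train_tables  # their ids form the cache-validity key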
Example #3
    def serialize(self, output: str):
        serialize(
            {
                'list_search_nodes': [
                    [(n.get_value(), n.get_score()) for n in search_nodes]
                    for search_nodes in self.list_search_nodes
                ]
            }, output)
Example #4
    def semantic_labeling(self,
                          train_sources: List[SemanticModel],
                          test_sources: List[SemanticModel],
                          top_n: int,
                          eval_train: bool = False) -> None:
        """Generate semantic labels and store it in test sources"""
        train_sources: Dict[str, SemanticModel] = {s.id: s for s in train_sources}
        test_sources: Dict[str, SemanticModel] = {s.id: s for s in test_sources}
        assert set(train_sources.keys()) == self.train_source_ids

        if self.model is None:
            model_file = self.exec_dir / 'model.pkl'

            if model_file.exists():
                self.logger.debug("Load previous trained model...")
                self.model = deserialize(model_file)
            else:
                self.logger.debug("Train new model...")
                x_train, y_train, x_test, y_test = generate_training_data(
                    self.stype_db)
                # clf = LogisticRegression(class_weight="balanced")
                clf = RandomForestClassifier(n_estimators=200,
                                             max_depth=10,
                                             class_weight="balanced",
                                             random_state=120)
                clf = clf.fit(x_train, y_train)
                self.logger.debug("Save model...")
                serialize(clf, model_file)
                self.model = clf

        col_attrs = []
        if eval_train:
            for col in self.stype_db.train_columns:
                if col.table_name not in train_sources: continue
                col_attrs.append(
                    (col, train_sources[col.table_name].get_attr_by_label(
                        col.name)))

        for col in self.stype_db.test_columns:
            if col.table_name not in test_sources: continue
            col_attrs.append(
                (col,
                 test_sources[col.table_name].get_attr_by_label(col.name)))

        for col, attr in col_attrs:
            pred_stypes = self.pred_type(col, top_n)
            attr.semantic_types = [
                SemanticType(stype[0].decode("utf-8"),
                             stype[1].decode("utf-8"), score)
                for stype, score in pred_stypes if score > 0
            ]
Example #5
def get_ontology(dataset: str) -> Ontology:
    """Get ontology of a given dataset"""
    global _data_io_vars
    if dataset not in _data_io_vars["ont"]:
        # if it has been cached ...
        cache_file = get_cache_dir(dataset) / 'ont.pkl'
        cache_file.parent.mkdir(exist_ok=True, parents=True)
        if cache_file.exists():
            ont = deserialize(cache_file)
        else:
            ont = Ontology.from_dataset(dataset)
            serialize(ont, cache_file)
        _data_io_vars["ont"][dataset] = ont

    return _data_io_vars["ont"][dataset]
Example #6
def get_sampled_data_tables(dataset: str) -> List[DataTable]:
    global _data_io_vars
    if dataset not in _data_io_vars['sampled_data_tables']:
        # if it has been cached...
        cache_file = get_cache_dir(dataset) / "sampled_tables.pkl"
        if cache_file.exists():
            tables = deserialize(cache_file)
        else:
            tables = get_data_tables(dataset)
            settings = Settings.get_instance()
            tables = [tbl.sample(settings.n_samples, settings.random_seed) for tbl in tables]
            serialize(tables, cache_file)
        _data_io_vars["sampled_data_tables"][dataset] = tables

    return _data_io_vars["sampled_data_tables"][dataset]
Example #7
def get_data_constraint_model(
    dataset: str,
    train_sms: List[SemanticModel],
) -> DataConstraint:
    global _instance
    if _instance is None:
        cache_file = get_cache_dir(
            dataset, train_sms) / "weak_models" / "data_constraint.pkl"
        cache_file.parent.mkdir(exist_ok=True, parents=True)
        need_rebuilt = True

        settings = Settings.get_instance()
        valid_threshold = settings.data_constraint_valid_threshold
        guess_datetime_threshold = settings.data_constraint_guess_datetime_threshold
        n_comparison_samples = settings.data_constraint_n_comparison_samples
        random_seed = settings.random_seed
        n_sample = settings.n_samples

        if cache_file.exists():
            DataConstraint.logger.debug("Try to load previous run...")
            model, cached_dataset, cached_train_sm_ids, extra_args = deserialize(
                cache_file)
            if cached_dataset == dataset \
                    and cached_train_sm_ids == {sm.id for sm in train_sms} \
                    and extra_args == (
                        valid_threshold, guess_datetime_threshold, n_comparison_samples,
                        random_seed, n_sample):
                need_rebuilt = False

        if need_rebuilt:
            DataConstraint.logger.debug("Re-build data-constraint model...")
            data_tables = [
                ColumnBasedTable.from_table(tbl)
                for tbl in get_sampled_data_tables(dataset)
            ]
            model = DataConstraint(train_sms, data_tables, valid_threshold,
                                   guess_datetime_threshold,
                                   n_comparison_samples)
            serialize((model, dataset, {sm.id for sm in train_sms},
                       (valid_threshold, guess_datetime_threshold,
                        n_comparison_samples, random_seed, n_sample)),
                      cache_file)

        _instance = model
    return _instance
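Examples #6 and #7 read their sampling and threshold parameters from a shared Settings singleton. A hypothetical sketch of that object is given below; only the attribute names are taken from the examples, the default values and the get_instance implementation are illustrative assumptions.

# Hypothetical Settings singleton referenced by Examples #6 and #7; the
# values are placeholders, not the project's real defaults.
class Settings:
    _instance = None

    def __init__(self):
        self.n_samples = 100
        self.random_seed = 120
        self.data_constraint_valid_threshold = 0.95
        self.data_constraint_guess_datetime_threshold = 0.5
        self.data_constraint_n_comparison_samples = 150

    @classmethod
    def get_instance(cls) -> 'Settings':
        if cls._instance is None:
            cls._instance = Settings()
        return cls._instance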
Example #8
def get_raw_data_tables(dataset: str) -> List[DataTable]:
    global _data_io_vars
    if dataset not in _data_io_vars['raw_data_tables']:
        # if it has been cached...
        cache_file = get_cache_dir(dataset) / 'raw_tables.pkl'
        if cache_file.exists():
            raw_tables = deserialize(cache_file)
        else:
            raw_tables = []
            source_dir = Path(config.datasets[dataset].data.as_path())
            for file in sorted(source_dir.iterdir()):
                if file.name.startswith("."):
                    continue
                raw_tables.append(DataTable.load_from_file(file))

            serialize(raw_tables, cache_file)
        _data_io_vars["raw_data_tables"][dataset] = raw_tables

    return _data_io_vars["raw_data_tables"][dataset]
Example #9
def get_stype_assistant_model(dataset: str, train_sms: List[SemanticModel]):
    global _instance
    if _instance is None:
        cache_file = get_cache_dir(dataset, train_sms) / "weak_models" / "stype_assistant.pkl"
        cache_file.parent.mkdir(exist_ok=True, parents=True)
        need_rebuilt = True

        if cache_file.exists():
            SemanticTypeAssistant.logger.debug("Try to load previous run...")
            model, cache_dataset, cache_train_sm_ids = deserialize(cache_file)
            if cache_dataset == dataset and cache_train_sm_ids == {sm.id for sm in train_sms}:
                need_rebuilt = False

            ont_graph = get_ont_graph(dataset)
            ont = get_ontology(dataset)
            stat = Statistic.get_instance(train_sms)
            ota = EmpiricalTripleAdviser(ont_graph, ont, stat.p_triple, 15)
            model.triple_adviser = ota

        if need_rebuilt:
            ont_graph = get_ont_graph(dataset)
            ont = get_ontology(dataset)
            stat = Statistic.get_instance(train_sms)
            ota = EmpiricalTripleAdviser(ont_graph, ont, stat.p_triple, 15)

            typer = SemanticTyper.get_instance(dataset, train_sms)
            try:
                typer.load_model()
            except Exception:  # no saved model could be loaded; label the test sources on the fly
                sms = get_semantic_models(dataset)
                train_ids = {sm.id for sm in train_sms}
                typer.semantic_labeling(train_sms, [sm for sm in sms if sm.id not in train_ids], 4)

            model = SemanticTypeAssistant(train_sms, typer, ota)
            model.triple_adviser = None
            serialize((model, dataset, {sm.id for sm in train_sms}), cache_file)
            model.triple_adviser = ota

        _instance = model

    return _instance
Example #10
    def get_classifier(self, retrain: bool, train_examples: List[Example]):
        # TODO: implement this properly; currently we have to train and save manually
        cached_file = get_cache_dir(
            self.example_annotator.dataset,
            list(self.example_annotator.train_source_ids)
        ) / "weak_models" / "node_prob_classifier.pkl"
        if not cached_file.exists() or retrain:
            self.logger.debug("Retrain new model")
            raw_X_train = make_data(self, train_examples)
            classifier = LogisticRegression(fit_intercept=True)

            X_train = numpy.asarray(
                [list(features.values())[1:] for features in raw_X_train])
            X_train, y_train = X_train[:, :-1], [
                int(x) for x in X_train[:, -1]
            ]

            scaler = StandardScaler().fit(X_train)
            X_train = scaler.transform(X_train)  # transform is not in-place; keep the scaled array

            try:
                classifier.fit(X_train, y_train)
            except ValueError as e:
                assert str(e).startswith(
                    "This solver needs samples of at least 2 classes in the data"
                )
                # this happens at the bootstrap phase, when we have no data yet and build from the ground truth
                X_train = numpy.vstack([X_train, [0, 0]])
                y_train.append(0)
                classifier.fit(X_train, y_train)

            cached_file.parent.mkdir(exist_ok=True, parents=True)
            serialize((scaler, classifier), cached_file)
            return scaler, classifier

        return deserialize(cached_file)
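Because the scaler is pickled together with the classifier, a caller is presumably expected to apply the same scaling at prediction time. A minimal sketch of that assumed call site, reusing deserialize and cached_file from the example; the feature row is purely illustrative:

import numpy

# Hypothetical caller: reload the pair and scale new feature rows the same
# way the training data was scaled before predicting.
scaler, classifier = deserialize(cached_file)
features = numpy.asarray([[0.9, 0.0]])               # illustrative values
label = classifier.predict(scaler.transform(features))[0]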
Example #11
                                        index=False)

    # create classifier to train
    classifier = LogisticRegression(fit_intercept=True)

    X_train = numpy.asarray(
        [list(features.values())[1:] for features in raw_X_train])
    X_test = numpy.asarray(
        [list(features.values())[1:] for features in raw_X_test])
    X_train, y_train = X_train[:, :-1], [int(x) for x in X_train[:, -1]]
    X_test, y_test = X_test[:, :-1], [int(x) for x in X_test[:, -1]]

    scaler = StandardScaler().fit(X_train)
    X_train = scaler.transform(X_train)  # transform is not in-place; keep the scaled arrays
    X_test = scaler.transform(X_test)

    classifier.fit(X_train, list(y_train))
    print("Weights", classifier.coef_, classifier.intercept_)

    # report performance
    y_train_pred = classifier.predict(X_train)
    res = precision_recall_fscore_support(y_train, y_train_pred)
    print('Train', res)
    res = precision_recall_fscore_support(y_test, classifier.predict(X_test))
    print('Test', res)

    # save classifier
    serialize((scaler, classifier), output_dir / "classifier.pkl")

    print(classifier.predict(scaler.transform([[0.938, 0]])))
Example #12
def train_model(dataset: str, train_sids: List[str], manual_seed: int,
                train_examples: List[Example], test_examples: List[Example],
                args: TrainingArgs, basedir: Path):
    DenseTensorFunc.manual_seed(manual_seed)

    tf_domain = GrowableBinaryVectorDomain()

    timer = pyutils.progress.Timer().start()
    input_train_examples = train_examples
    input_test_examples = test_examples

    # BUILDING VARIABLES NEEDED FOR THE TRAINING
    example_annotator = ExampleAnnotator(dataset,
                                         train_sids,
                                         training_examples=train_examples)
    train_examples = sequential_map(example_annotator.annotate, train_examples)
    train_examples = _(train_examples) \
        .imap(example_annotator.example2vars) \
        .submap(partial(example_annotator.build_triple_features, domain=tf_domain))

    pairwise_domain = example_annotator.build_pairwise_domain()
    # Freeze domain now, we've added all feature values observed in training data
    tf_domain.freeze()

    test_examples = sequential_map(example_annotator.annotate, test_examples)
    test_examples = _(test_examples) \
        .imap(example_annotator.example2vars) \
        .submap(partial(example_annotator.build_triple_features, domain=tf_domain))

    # print domain to debug
    logger.info("Preprocessing take %s" % timer.lap().get_total_time())
    # build random variables
    train_graphs = _(train_examples).submap(lambda t: t.label)
    test_graphs = _(test_examples).submap(lambda t: t.label)

    # build models, select inference method
    model = TemplateLogLinearModel([
        TripleFactorTemplate(
            *TripleFactorTemplate.get_default_args(tf_domain)),
        SubstructureFactorTemplate(
            *SubstructureFactorTemplate.get_default_args(
                pairwise_domain, example_annotator.get_obj_props())),
        # ExternalModelFactorTemplate(*ExternalModelFactorTemplate.get_default_weights())
    ])
    # or load previous training
    # model_dir = config.fsys.debug.as_path() + "/%s/models/exp_no_2" % dataset
    # model, ___, state_dict = deserialize(model_dir + '/gmtk_model.bin')

    inference = BeliefPropagation.get_constructor(InferProb.MARGINAL)
    map_inference = BeliefPropagation.get_constructor(InferProb.MAP)

    train_nll_examples = _(
        train_graphs).map(lambda vars: NegativeLogLikelihoodExample(
            vars, model.get_factors(vars), inference))
    train_map_examples = _(train_nll_examples).map(
        lambda example: MAPAssignmentExample.from_nll_example(
            example, map_inference))
    test_nll_examples = _(
        test_graphs).map(lambda vars: NegativeLogLikelihoodExample(
            vars, model.get_factors(vars), inference))
    test_map_examples = _(test_nll_examples).map(
        lambda example: MAPAssignmentExample.from_nll_example(
            example, map_inference))

    # select training method/parameters, and evaluation
    n_epoch = args.n_epoch
    params = args.optparams
    mini_batch_size = args.mini_batch_size
    n_switch = args.n_switch

    global_step = 0
    require_closure = False
    if args.optimizer == 'SGD':
        optimizer = PyTorchOptimizer.SGD(parameters=model.get_parameters(),
                                         **params)
    elif args.optimizer == 'ADAM':
        optimizer = PyTorchOptimizer.Adam(parameters=model.get_parameters(),
                                          **params)
    elif args.optimizer == 'LBFGS':
        optimizer = PyTorchOptimizer.LBFGS(parameters=model.get_parameters(),
                                           **params)
        require_closure = True
    else:
        raise ValueError("Unsupported optimizer: %s" % args.optimizer)
    # optimizer.optimizer.load_state_dict(state_dict)

    for template in model.templates:
        if hasattr(template, 'after_update_weights'):
            optimizer.register_on_step(template.after_update_weights)

    logger.info(args.to_string())
    logger.info("Template info: \n%s" %
                ("\n" %
                 (["\t" + template.get_info()
                   for template in model.templates])))
    logger.info("Train size: %s, Test size: %s", len(train_nll_examples),
                len(test_nll_examples))

    reporter = TensorBoard(log_dir=basedir)
    # cast to list to keep train_map_examples & train_nll_examples aligned with each other (batch example may shuffle)
    if args.parallel_training:
        batch_nll_example = ParallelBatchExample(list(train_nll_examples), 0)
    else:
        batch_nll_example = BatchExample(list(train_nll_examples), 0)

    # *********************************************** DEBUG CODE
    # for i, triples in enumerate(train_examples):
    #     example = triples[0].example
    #     if example.model_id.startswith("s03") and example.no_sample == 29:
    #         example.pred_sm.render()
    #         render_factor_graph(model.get_factors(train_graphs[i]), train_graphs[i],
    #                     config.fsys.debug.as_path() + "/tmp/factor_graph.pdf")
    #         exit(0)
    #
    # render_factor_graph(train_nll_examples[0].factors, train_nll_examples[0].variables,
    #                     config.fsys.debug.as_path() + "/tmp/factor_graph.pdf")
    #
    # loss_val_accum = ValueAccumulator()
    # gradient_accum = Tensor1AccumulatorDict()
    # for weights in model.get_parameters():
    #     gradient_accum.track_obj(weights, DenseTensorFunc.zeros_like(weights.val))
    # **********************************************************

    progress = pyutils.progress.Progress(n_epoch)
    progress.start()

    if n_switch > 0:
        examples = list(batch_nll_example.split_random(mini_batch_size))
    else:
        examples = [batch_nll_example]

    cm_train, cm_test = None, None
    loss_history = []
    param_hists = []

    for i in range(n_epoch):
        logger.info("Iter %s" % i)

        if i >= n_switch:
            examples = [batch_nll_example]

        if args.shuffle_mini_batch and 0 < i < n_switch:
            examples = batch_nll_example.split_random(mini_batch_size)

        average_loss_val = []
        if not require_closure:
            for example in examples:
                optimizer.zero_grad()
                example.accumulate_value_and_gradient(
                    optimizer.get_value_accumulator(),
                    optimizer.get_gradient_accumulator())
                optimizer.average(example.size())

                logger.info("Accum loss: %.10f" %
                            optimizer.get_value_accumulator().get_value())
                average_loss_val.append(
                    optimizer.get_value_accumulator().get_value())

                # *********************************************** DEBUG GRADIENT
                # numerical_gradient = NumericalGradient(1e-5)
                # for j, e in enumerate(example.examples):
                #     print(f"\rExample {j}/{len(example.examples)}", end="", flush=True)
                #     gradient_accum.clear()
                #     loss_val_accum.clear()
                #     e.accumulate_value_and_gradient(loss_val_accum, gradient_accum)
                #     for template in model.templates:
                #         for weights in template.get_weights():
                #             gradient = gradient_accum.get_value(weights)
                #             approx_gradients = numerical_gradient.compute_gradient(weights, lambda: nll_func(e))
                #             try:
                #                 np.testing.assert_almost_equal(gradient.numpy(), approx_gradients.numpy(), 6)
                #             except Exception:
                #                 logger.exception("Incorrect gradient...")
                #                 print(template,  weights.val.tolist())
                #                 print(["%11.8f" % x for x in gradient.tolist()])
                #                 print(["%11.8f" % x for x in approx_gradients.tolist()])
                #                 print(["%11d" % int(np.isclose(x, y, rtol=0, atol=1e-6)) for x, y in zip(gradient, approx_gradients)])
                #
                #                 raise
                # print("\n")
                # **************************************************************

                optimizer.step()
                reporter.loss_val(
                    optimizer.get_value_accumulator().get_value(), global_step)
                global_step += 1
        else:
            for example in examples:

                def closure():
                    optimizer.zero_grad()
                    example.accumulate_value_and_gradient(
                        optimizer.get_value_accumulator(),
                        optimizer.get_gradient_accumulator())
                    optimizer.average(example.size())
                    optimizer.copy_grad()
                    return optimizer.get_value_accumulator().get_value()

                optimizer.step(closure)
                logger.info("Accum loss: %.10f" %
                            optimizer.get_value_accumulator().get_value())
                average_loss_val.append(
                    optimizer.get_value_accumulator().get_value())
                reporter.loss_val(
                    optimizer.get_value_accumulator().get_value(), global_step)
                global_step += 1

        if len(average_loss_val) > 1:
            logger.info("Average accum loss: %.10f" %
                        np.average(average_loss_val))

        if optimizer.get_value_accumulator().get_value() < 0:
            break

        if i % args.n_iter_eval == 0 or i == n_epoch - 1:
            cm_train = evaluate(train_map_examples)
            cm_test = evaluate(test_map_examples) or cm_train
            logger.info('train (class_idx=0): %s',
                        cm_train.precision_recall_fbeta(class_idx=0))
            logger.info('train (class_idx=1): %s',
                        cm_train.precision_recall_fbeta(class_idx=1))
            logger.info('test  (class_idx=0): %s',
                        cm_test.precision_recall_fbeta(class_idx=0))
            logger.info('test  (class_idx=1): %s',
                        cm_test.precision_recall_fbeta(class_idx=1))

            reporter.precision_recall_fbeta(cm_train,
                                            global_step,
                                            group='train')
            reporter.precision_recall_fbeta(cm_test, global_step, group='test')

        loss_history.append(np.average(average_loss_val))
        param_hists.append(model.clone_parameters())
        if len(param_hists) > 3:
            param_hists.pop(0)

        if args.optimizer == "ADAM" and len(loss_history) > 4 and all(
                x - y > 0
                for x, y in zip(loss_history[-3:], loss_history[-4:-1])):
            logger.info("Loss increase after 3 epoches. Stop training!")
            break

        progress.finish_one()

    if args.report_final_loss:
        loss_val_accum = ValueAccumulator()
        batch_nll_example.accumulate_value_and_gradient(loss_val_accum, None)
        logger.info("Average accum loss: %.10f" %
                    (loss_val_accum.get_value() / batch_nll_example.size()))

    logger.info("\n\r%s" % progress.summary())
    cm_train.pretty_print("** TRAIN **",
                          precision_recall_fbeta=True,
                          output_stream=logger.info)
    cm_test.pretty_print("** TEST **",
                         precision_recall_fbeta=True,
                         output_stream=logger.info)

    # save model and move everything into another folder for storage
    reporter.close()
    reporter.export(basedir / 'tensorboard_raw.json')

    # clear all cache
    for template in model.templates:
        if isinstance(template, CachedTemplateFactorConstructor):
            template.clear_cache()

    assert len(param_hists) == len(loss_history[-3:])
    min_loss, min_params, min_idx = min(zip(loss_history[-3:], param_hists,
                                            [-3, -2, -1]),
                                        key=lambda x: x[0])
    logger.info("Select parameters at index: %d. Loss = %s", min_idx, min_loss)
    model.update_parameters(min_params)

    serialize(
        (model, tf_domain, pairwise_domain, optimizer.optimizer.state_dict()),
        basedir / 'gmtk_model.bin')
    save_evaluation_result(zip(train_map_examples, train_nll_examples),
                           basedir / 'train.output.json')
    save_evaluation_result(zip(test_map_examples, test_nll_examples),
                           basedir / 'test.output.json')
    serializeJSON(input_train_examples, basedir / "train.json")
    serializeJSON(input_test_examples, basedir / "test.json")

    # attempt to copy log file
    try:
        logger.handlers[1].flush()
        shutil.copy(logger.handlers[1].file_handler.baseFilename,
                    str(basedir / "train.log"))
    except Exception:
        logger.exception("Cannot backup log...")

    model_id = get_latest_model_id(basedir) + 1
    move_current_files(basedir, model_id)
    logger.info("Save model %s", model_id)
    return model, tf_domain, pairwise_domain, optimizer.optimizer.state_dict()
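The commented-out lines near the top of this example ("or load previous training") hint at how a saved run would be resumed. A minimal sketch of that round trip, assuming the same four-tuple layout written by the final serialize call and an optimizer constructed as above:

# Hypothetical resume of a previous run; the tuple mirrors what
# serialize(...) wrote to gmtk_model.bin above.
model, tf_domain, pairwise_domain, state_dict = deserialize(
    basedir / 'gmtk_model.bin')
optimizer.optimizer.load_state_dict(state_dict)  # restore optimizer state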