def test_models_holdout():
    """Run a repeated-holdout experiment over five DINT models and one NNet model.

    Every model and dataset listed by the SchemaMatcher at localhost:8080 is
    removed first. The freshly built models are then passed to an
    ``Experiment`` of type ``repeated_holdout`` created with ``holdout=0.5``
    and ``num=10``, which is run immediately.
    """
    matcher = SchemaMatcher(host="localhost", port=8080)

    # Clear whatever the server currently holds: models first, then datasets.
    logging.info("Cleaning models from DINT server")
    for stale_model in matcher.models:
        matcher.remove_model(stale_model)
    logging.info("Cleaning datasets from DINT server")
    for stale_dataset in matcher.datasets:
        matcher.remove_dataset(stale_dataset)

    # One DINT model per feature set, all with the "NoResampling" strategy.
    feature_sets = ("full", "single", "chardist", "noheader", "chardistonly")
    candidates = [
        create_dint_model(matcher, feature_set, "NoResampling")
        for feature_set in feature_sets
    ]

    # The rf@charfreq model is appended last, after the DINT models.
    rf_debug_path = os.path.join("results", "debug_nnet_rf_holdout.csv")
    candidates.append(
        NNetModel(['rf@charfreq'],
                  'rf@charfreq model: no headers',
                  add_headers=False,
                  p_header=0,
                  debug_csv=rf_debug_path))

    performance_path = os.path.join('results',
                                    "performance_models_holdout.csv")
    debug_path = os.path.join("results", "debug_holdout.csv")
    holdout_run = Experiment(candidates,
                             experiment_type="repeated_holdout",
                             description="repeated_holdout_0.5_10",
                             result_csv=performance_path,
                             debug_csv=debug_path,
                             holdout=0.5,
                             num=10)
    holdout_run.run()
def test_resampletomean():
    """Run a leave-one-out experiment over five "ResampleToMean" DINT models.

    Every model and dataset listed by the SchemaMatcher at localhost:8080 is
    removed before the new models are created.
    """
    matcher = SchemaMatcher(host="localhost", port=8080)

    # Clear whatever the server currently holds: models first, then datasets.
    logging.info("Cleaning models from DINT server")
    for stale_model in matcher.models:
        matcher.remove_model(stale_model)
    logging.info("Cleaning datasets from DINT server")
    for stale_dataset in matcher.datasets:
        matcher.remove_dataset(stale_dataset)

    # Same resampling strategy for every model; only the feature set varies.
    strategy = "ResampleToMean"
    candidates = []
    for feature_set in ("full", "single", "full_chardist", "noheader",
                        "chardistonly"):
        candidates.append(create_dint_model(matcher, feature_set, strategy))

    performance_path = os.path.join('results',
                                    "performance_dint_resampletomean.csv")
    debug_path = os.path.join("results", "debug_dint_resampletomean.csv")
    Experiment(candidates,
               experiment_type="leave_one_out",
               description="plain loo",
               result_csv=performance_path,
               debug_csv=debug_path).run()
def test_simple_holdout():
    """Run a repeated-holdout experiment on one DINT model with seven features.

    The model uses the "NoResampling" strategy and the experiment is created
    with ``holdout=0.5`` and ``num=2``.
    """
    matcher = SchemaMatcher(host="localhost", port=8080)

    # Feature configuration: individual features only, no feature groups.
    active_features = [
        "num-unique-vals",
        "prop-unique-vals",
        "prop-missing-vals",
        "ratio-alpha-chars",
        "prop-numerical-chars",
        "prop-whitespace-chars",
        "prop-entries-with-at-sign",
    ]
    feature_config = {"activeFeatures": active_features}

    model_debug_path = os.path.join("results", "debug_dint_simple_holdout.csv")
    simple_model = DINTModel(matcher,
                             feature_config,
                             "NoResampling",
                             "DINTModel with simple feature config",
                             debug_csv=model_debug_path)

    holdout_run = Experiment(
        [simple_model],
        experiment_type="repeated_holdout",
        description="repeated_holdout_0.5_2",
        result_csv=os.path.join('results', "performance_simple_holdout.csv"),
        debug_csv=os.path.join("results", "debug_simple_holdout.csv"),
        holdout=0.5,
        num=2)
    holdout_run.run()
def benchmark_bagging_num(dm_session):
    """Run a repeated-holdout experiment over Bagging models with varying ``num_bags``.

    Four DINT models are built that differ only in ``num_bags``
    (100, 10, 150, 50); ``bag_size`` is 100 and ``ignore_unknown`` is True
    for all of them.

    Args:
        dm_session: passed as the first argument to every ``DINTModel``.
    """
    print("Picking strategy: Bagging")
    print("Performing experiment: repeated_holdout")
    print("Setting ignore_unknown: True")

    char_dist_config = {
        "activeFeatures": ["shannon-entropy"],
        "activeFeatureGroups": ["char-dist-features"]
    }

    # NOTE(review): all four models share this description and debug file,
    # so the two give no way to tell the models apart. Confirm it is intended.
    shared_description = "DINTModel: resampling Bagging, chardistonly"
    shared_debug_path = os.path.join("results",
                                     "debug_dint_Bagging_chardist.csv")

    # Construction order matches the bag counts listed here.
    bagging_models = [
        DINTModel(dm_session,
                  char_dist_config,
                  "Bagging",
                  shared_description,
                  debug_csv=shared_debug_path,
                  ignore_unknown=True,
                  num_bags=bag_count,
                  bag_size=100)
        for bag_count in (100, 10, 150, 50)
    ]

    performance_path = os.path.join(
        'results', "performance_dint_Bagging_ignoreTrue.csv")
    debug_path = os.path.join("results", "debug_dint_Bagging_ignoreTrue.csv")
    bagging_run = Experiment(
        bagging_models,
        experiment_type="repeated_holdout",
        description="repeated_holdout_ignoreTrue_strategyBagging",
        result_csv=performance_path,
        debug_csv=debug_path,
        holdout=0.2,
        num=10)
    bagging_run.run()
def test_fullfeature_resampletomean():
    """Run a leave-one-out experiment on one fully configured DINT model.

    The model uses the "ResampleToMean" strategy together with fifteen
    individual features, seven feature groups and three sets of feature
    extractor parameters.
    """
    matcher = SchemaMatcher(host="localhost", port=8080)

    individual_features = [
        "num-unique-vals",
        "prop-unique-vals",
        "prop-missing-vals",
        "ratio-alpha-chars",
        "prop-numerical-chars",
        "prop-whitespace-chars",
        "prop-entries-with-at-sign",
        "prop-entries-with-hyphen",
        "prop-entries-with-paren",
        "prop-entries-with-currency-symbol",
        "mean-commas-per-entry",
        "mean-forward-slashes-per-entry",
        "prop-range-format",
        "is-discrete",
        "entropy-for-discrete-values",
    ]
    feature_groups = [
        "stats-of-text-length",
        "stats-of-numerical-type",
        "prop-instances-per-class-in-knearestneighbours",
        "mean-character-cosine-similarity-from-class-examples",
        "min-editdistance-from-class-examples",
        "min-wordnet-jcn-distance-from-class-examples",
        "min-wordnet-lin-distance-from-class-examples",
    ]
    extractor_params = [
        {
            "name": "prop-instances-per-class-in-knearestneighbours",
            "num-neighbours": 5
        },
        {
            "name": "min-wordnet-jcn-distance-from-class-examples",
            "max-comparisons-per-class": 5
        },
        {
            "name": "min-wordnet-lin-distance-from-class-examples",
            "max-comparisons-per-class": 5
        },
    ]
    # Key order is kept as features, groups, extractor parameters.
    feature_config = {
        "activeFeatures": individual_features,
        "activeFeatureGroups": feature_groups,
        "featureExtractorParams": extractor_params,
    }

    model_description = "DINTModel with full feature config and resampleToMean  and filtered types and no parallel"
    model_debug_path = os.path.join("results",
                                    "debug_dint_full_resampletomean_no.csv")
    full_model = DINTModel(matcher,
                           feature_config,
                           "ResampleToMean",
                           model_description,
                           debug_csv=model_debug_path)

    performance_path = os.path.join('results',
                                    "performance_resample_filter_no.csv")
    debug_path = os.path.join("results", "debug_resample.csv")
    Experiment([full_model],
               experiment_type="leave_one_out",
               description="plain loo",
               result_csv=performance_path,
               debug_csv=debug_path).run()
def make_experiment(dm_session,
                    strategies,
                    cur_experiments,
                    cur_features,
                    unknown_ignore,
                    domains=None):
    """Run one experiment per (strategy, experiment type, ignore flag) combination.

    For each combination a DINT model is created for every entry of
    ``cur_features`` and the resulting list is run as a single ``Experiment``
    with ``holdout=0.2`` and ``num=10``.

    Args:
        dm_session: forwarded to ``create_dint_model``.
        strategies: iterable of resampling strategy names (outermost loop).
        cur_experiments: iterable of experiment type names.
        cur_features: iterable of feature identifiers, one model each.
        unknown_ignore: iterable of values passed as the fourth argument of
            ``create_dint_model`` (innermost loop).
        domains: when truthy, handed to ``Experiment.change_domains`` before
            each run.

    Note:
        The CSV file names are derived from the strategy and the ignore flag
        only; the experiment type is not part of them.
    """
    for strategy in strategies:
        print("Picking strategy: ", strategy)
        for experiment_type in cur_experiments:
            print("Performing experiment:", experiment_type)
            for ignore_flag in unknown_ignore:
                print("Setting ignore_unknown: ", ignore_flag)

                candidates = []
                for feature in cur_features:
                    candidates.append(
                        create_dint_model(dm_session, feature, strategy,
                                          ignore_flag))

                label = (experiment_type + "_ignore" + str(ignore_flag) +
                         "_strategy" + strategy)
                performance_name = "performance_dint_{}_ignore{}.csv".format(
                    strategy, ignore_flag)
                debug_name = "debug_dint_{}_ignore{}.csv".format(
                    strategy, ignore_flag)

                current_run = Experiment(
                    candidates,
                    experiment_type=experiment_type,
                    description=label,
                    result_csv=os.path.join('results', performance_name),
                    debug_csv=os.path.join("results", debug_name),
                    holdout=0.2,
                    num=10)

                if domains:
                    current_run.change_domains(domains)
                current_run.run()
def test_singlefeatures():
    """Run a leave-one-out experiment on one DINT model using individual features only.

    The model is configured with fifteen individual features, no feature
    groups, and the "BaggingToMax" resampling strategy.
    """
    matcher = SchemaMatcher(host="localhost", port=8080)

    individual_features = [
        "num-unique-vals",
        "prop-unique-vals",
        "prop-missing-vals",
        "ratio-alpha-chars",
        "prop-numerical-chars",
        "prop-whitespace-chars",
        "prop-entries-with-at-sign",
        "prop-entries-with-hyphen",
        "prop-entries-with-paren",
        "prop-entries-with-currency-symbol",
        "mean-commas-per-entry",
        "mean-forward-slashes-per-entry",
        "prop-range-format",
        "is-discrete",
        "entropy-for-discrete-values",
    ]
    feature_config = {"activeFeatures": individual_features}

    model_description = "DINTModel with single feature config and baggingtomax  and filtered types and no parallel"
    model_debug_path = os.path.join("results", "debug_dint_single_no.csv")
    single_model = DINTModel(matcher,
                             feature_config,
                             "BaggingToMax",
                             model_description,
                             debug_csv=model_debug_path)

    performance_path = os.path.join('results',
                                    "performance_bagging_filter_no.csv")
    debug_path = os.path.join("results", "debug_bagging.csv")
    Experiment([single_model],
               experiment_type="leave_one_out",
               description="plain loo",
               result_csv=performance_path,
               debug_csv=debug_path).run()
def test_city(dm):
    """Run a leave-one-out experiment restricted to the "dbpedia" domain.

    A single DINT model with the "Bagging" resampling strategy is built from
    sixteen individual features, four feature groups and three sets of
    feature extractor parameters.

    Args:
        dm: passed as the first argument to ``DINTModel``.

    Note:
        The output file names contain "noresampling" although the strategy
        used here is "Bagging"; they are kept exactly as they were.
    """
    individual_features = [
        "num-unique-vals",
        "prop-unique-vals",
        "prop-missing-vals",
        "ratio-alpha-chars",
        "prop-numerical-chars",
        "prop-whitespace-chars",
        "prop-entries-with-at-sign",
        "prop-entries-with-hyphen",
        "prop-entries-with-paren",
        "prop-entries-with-currency-symbol",
        "mean-commas-per-entry",
        "mean-forward-slashes-per-entry",
        "prop-range-format",
        "is-discrete",
        "entropy-for-discrete-values",
        "shannon-entropy",
    ]
    feature_groups = [
        "char-dist-features",
        "stats-of-text-length",
        "stats-of-numerical-type",
        "mean-character-cosine-similarity-from-class-examples",
    ]
    extractor_params = [
        {
            "name": "prop-instances-per-class-in-knearestneighbours",
            "num-neighbours": 5
        },
        {
            "name": "min-wordnet-jcn-distance-from-class-examples",
            "max-comparisons-per-class": 5
        },
        {
            "name": "min-wordnet-lin-distance-from-class-examples",
            "max-comparisons-per-class": 5
        },
    ]
    # Key order is kept as features, groups, extractor parameters.
    feature_config = {
        "activeFeatures": individual_features,
        "activeFeatureGroups": feature_groups,
        "featureExtractorParams": extractor_params,
    }

    strategy = "Bagging"
    model_description = (
        "DINTModel: resampling {}, features fullchardist".format(strategy))
    model_debug_path = os.path.join(
        "results", "debug_dint_chardist_noresampling_head.csv")
    city_model = DINTModel(dm,
                           feature_config,
                           strategy,
                           model_description,
                           debug_csv=model_debug_path)

    performance_path = os.path.join('results',
                                    "performance_noresampling_chardist.csv")
    debug_path = os.path.join("results", "debug_noresampling_head.csv")
    city_run = Experiment([city_model],
                          experiment_type="leave_one_out",
                          description="plain loo",
                          result_csv=performance_path,
                          debug_csv=debug_path)
    # Limit the experiment to the dbpedia domain before running it.
    city_run.change_domains(domains=["dbpedia"])
    city_run.run()
def test_simple(ignore_uknown=True, domains=None):
    """Run a leave-one-out experiment on a DINT model with a small feature set.

    Args:
        ignore_uknown: forwarded to ``DINTModel`` as ``ignore_unknown``
            (the parameter keeps its original spelling for callers).
        domains: when truthy, handed to ``Experiment.change_domains`` before
            the first experiment is run.

    After the first run the function continues with a second section that
    builds models through ``create_models``; see the review notes in the body.
    """
    # ******* setting up DINTModel
    dm = SchemaMatcher(host="localhost", port=8080)
    # dictionary with features

    single_feature_config = {
        "activeFeatures": [
            "num-unique-vals", "prop-unique-vals", "prop-missing-vals",
            "ratio-alpha-chars", "prop-numerical-chars", "shannon-entropy",
            "prop-whitespace-chars", "prop-entries-with-at-sign"
        ]
    }
    # resampling strategy
    resampling_strategy = "NoResampling"
    dint_model = DINTModel(dm,
                           single_feature_config,
                           resampling_strategy,
                           "DINTModel with simple feature config",
                           debug_csv=os.path.join("results",
                                                  "debug_dint_simple.csv"),
                           ignore_unknown=ignore_uknown)

    models = [dint_model]

    loo_experiment = Experiment(models,
                                experiment_type="leave_one_out",
                                description="plain loo",
                                result_csv=os.path.join(
                                    'results', "performance_simple.csv"),
                                debug_csv=os.path.join("results",
                                                       "debug_simple.csv"))

    # NOTE(review): this list is never read anywhere in the function.
    weapons = [
        "www.theoutdoorstrader.com.csv", "www.tennesseegunexchange.com.csv",
        "www.montanagunclassifieds.com.csv", "www.kyclassifieds.com.csv",
        "www.hawaiiguntrader.com.csv", "www.gunsinternational.com.csv",
        "www.floridaguntrader.com.csv", "www.floridagunclassifieds.com.csv",
        "www.elpasoguntrader.com.csv", "www.dallasguns.com.csv",
        "www.armslist.com.csv", "www.alaskaslist.com.csv"
    ]

    if domains:
        loo_experiment.change_domains(domains)
    loo_experiment.run()
    # NOTE(review): the caller-supplied domains are discarded here, so the
    # `if domains:` check inside the loop below can never be true.
    domains = None
    experiments = ["leave_one_out", "repeated_holdout"]

    ######################ignore unmapped attributes##########################
    print("Setting ignore_unknown: ", True)
    for experiment_type in experiments:

        # NOTE(review): create_models, dsl, resampling_strategies and features
        # are not defined in this function; presumably they are module-level
        # names. Verify they exist before relying on this section.
        models = create_models(dm, dsl, experiment_type, True,
                               resampling_strategies, features)

        experiment = Experiment(models,
                                experiment_type=experiment_type,
                                description=experiment_type,
                                result_csv=os.path.join(
                                    'results',
                                    "performance_{}_ignore{}.csv".format(
                                        experiment_type, True)),
                                debug_csv=os.path.join("results", "debug.csv"),
                                holdout=0.2,
                                num=1)

        if domains:
            experiment.change_domains(domains)
        experiment.run()

    ######################unknown attributes##########################
    print("Setting ignore_unknown: ", False)
    for experiment_type in experiments:
        # NOTE(review): the models are created here, but no Experiment is
        # built or run with them in this loop.
        models = create_models(dm, dsl, experiment_type, False,
                               resampling_strategies, features)
def test_cnn(ignore_unknown=True,
             experiment_type="leave_one_out",
             domains=None):
    """Run one experiment over the rf, cnn and mlp NNet models.

    Args:
        ignore_unknown: forwarded to every ``NNetModel`` and embedded in the
            output file names and the experiment description.
        experiment_type: passed to ``Experiment`` and embedded in the output
            file names and the experiment description.
        domains: when truthy, handed to ``Experiment.change_domains`` before
            the run.
    """
    name_parts = (ignore_unknown, experiment_type)

    def results_file(template):
        # Fill the two placeholders of *template* and place it in "results".
        return os.path.join("results", template.format(*name_parts))

    # Models are constructed in this order: cnn, mlp, cnn with headers, rf.
    cnn_plain = NNetModel(
        ['cnn@charseq'],
        'cnn@charseq model: no headers',
        add_headers=False,
        p_header=0,
        debug_csv=results_file("debug_nnet_cnn_ignore{}_{}.csv"),
        ignore_unknown=ignore_unknown)
    mlp_plain = NNetModel(
        ['mlp@charfreq'],
        'mlp@charfreq model: no headers',
        add_headers=False,
        p_header=0,
        debug_csv=results_file("debug_nnet_mlp_ignore{}_{}.csv"),
        ignore_unknown=ignore_unknown)
    # NOTE(review): this model is constructed but is not part of the list
    # handed to the experiment below. Confirm whether that is intended.
    cnn_with_headers = NNetModel(
        ['cnn@charseq'],
        'cnn@charseq model: headers, p=0.4',
        add_headers=True,
        p_header=0.4,
        debug_csv=results_file("debug_nnet_cnn_head_ignore{}_{}.csv"),
        ignore_unknown=ignore_unknown)
    rf_plain = NNetModel(
        ['rf@charfreq'],
        'rf@charfreq model: no headers',
        add_headers=False,
        p_header=0,
        debug_csv=results_file("debug_nnet_rf_ignore{}_{}.csv"),
        ignore_unknown=ignore_unknown)

    label = experiment_type + "_ignore" + str(ignore_unknown)
    performance_path = results_file("performance_nnet_ignore{}_{}.csv")
    debug_path = results_file("debug_nnet_ignore{}_{}.csv")
    nnet_run = Experiment([rf_plain, cnn_plain, mlp_plain],
                          experiment_type=experiment_type,
                          description=label,
                          result_csv=performance_path,
                          debug_csv=debug_path,
                          holdout=0.2,
                          num=1)

    if domains:
        nnet_run.change_domains(domains)
    nnet_run.run()