Example 1
    def setUp(self):
        """Initialize the global variables for the tests."""
        # problem_type=0 loads the classification view of the dataset.
        self.dataset_classification = DataLoader.get_openml_dataset(
            openml_id=46,
            problem_type=0
        )
        # problem_type=1 loads the regression view of the same dataset.
        self.dataset_regression = DataLoader.get_openml_dataset(
            openml_id=46,
            problem_type=1
        )
Example 2
    def test_model_by_metric_all_instances_all_metrics(self):
        """Test the that models by metric works for every record in the DB."""
        dataset = DataLoader.get_openml_dataset(openml_id=46, problem_type=0)

        metrics = [
            'accuracy', 'average_precision', 'balanced_accuracy', 'f1',
            'f1_macro', 'f1_micro', 'f1_multiclass', 'f1_weighted', 'log_loss',
            'pac_score', 'precision', 'precision_macro', 'precision_micro',
            'precision_multiclass', 'precision_weighted', 'recall',
            'recall_macro', 'recall_micro', 'recall_weighted', 'roc_auc'
        ]

        instances_ids, _ = \
            MetaKnowledge().load_datasets_info().weighted_matrix()

        for metric in metrics:
            for instance in instances_ids:
                # Do one by one for testing
                try:
                    models = LandmarkModelParser.models_by_metric([instance],
                                                                  dataset,
                                                                  metric)
                    self.assertTrue(models)
                    # Assert type of list
                    self.assertTrue(isinstance(models, list))

                    # Assert type of each of the list's elements.
                    for model in models:
                        self.assertTrue(isinstance(model, MLSuggestion))
                except ValueError:
                    # Some instances lack metaknowledge for a given metric
                    # (the condition the ValueError test below exercises).
                    pass
Example 3
    def test_discovery(self):
        """Smoke-test pipeline discovery on a classification dataset."""
        dataset = DataLoader.get_openml_dataset(openml_id=46, problem_type=0)
        p_disc = PipelineDiscovery(dataset=dataset)
        pipeline = p_disc.discover()

        print(pipeline)
        print(p_disc.validation_score)
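
A possible strengthening of this smoke test, assuming the discovered pipeline is an sklearn Pipeline (as Example 4 asserts) and that the validation score is a float (an assumption):

        # Sketch: replace the prints above with explicit assertions.
        self.assertIsInstance(pipeline, Pipeline)
        self.assertIsInstance(p_disc.validation_score, float)  # assumed float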
Example 4
    def test_optimize_pipeline(self):
        """Test that Bayesian optimization yields a Pipeline and a float score."""
        dataset = DataLoader.get_openml_dataset(openml_id=46, problem_type=0)
        pipeline = Pipeline([
            ('normalize-1', Normalizer(norm="max")),
            ('fastica', FastICA()),
            ('Union-1', FeatureUnion([
                ('pca-3', PCA(n_components=0.3)),
                ('Union-2', FeatureUnion([
                    ('pca-5', PCA(n_components=0.5)),
                    ('normalize-2', Normalizer(norm="l1"))
                ])),
                ('pca-7', PCA(n_components=0.7))
            ])),
            ('stacking-1',
             StackingEstimator(estimator=ExtraTreesClassifier(n_estimators=10))),
            ('stacking-2',
             StackingEstimator(estimator=ExtraTreesClassifier(n_estimators=20))),
            ('stacking-3',
             StackingEstimator(estimator=ExtraTreesClassifier(n_estimators=300))),
            ('Xgboost', XGBClassifier(
                base_score=0.9,
                booster="dart",
                min_child_weight=21,
                n_estimators=10,
                reg_alpha=1e-10))
        ])
        bayesian = BayesianOptimizationPipeline(
            dataset, pipeline, optimize_on="quality", iteration=7)

        bayesian.optimize_pipeline()
        score = bayesian.get_optimized_score()
        opt_pipeline = bayesian.get_optimized_pipeline()
        self.assertIsInstance(opt_pipeline, Pipeline)
        self.assertIsInstance(score, float)
Example 5
    def test_model_by_metric_error(self):
        """Test the behaviour of the model_by_metric method for ValueError."""
        dataset = DataLoader.get_openml_dataset(openml_id=46, problem_type=0)
        instances_ids = [1]
        metric = 'accuracy'

        with self.assertRaises(ValueError):
            LandmarkModelParser.models_by_metric(instances_ids, dataset,
                                                 metric)
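
A minimal sketch of how calling code might guard against this error, using only the API shown in these examples (the empty-list fallback is an assumption, not confirmed project behaviour):

        # Sketch: fall back to no suggestions when the requested instance ids
        # have no recorded metaknowledge (the condition this test exercises).
        try:
            models = LandmarkModelParser.models_by_metric(instances_ids,
                                                          dataset, metric)
        except ValueError:
            models = []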
Example 6
    def test_value_distance_metric(self):
        """Test that a different distance metric evaluates correctly.

        For this, we test with cosine similarity.
        """
        dataset = DataLoader.get_openml_dataset(openml_id=46, problem_type=0)
        dists, _ = MKDatabaseClient().nearest_datasets(
            dataset=dataset, distance_metric='cosine')

        for distance in dists[0]:
            # A distance in [0, 1] could hold by chance, but it is a
            # reasonable sanity check for cosine similarity.
            self.assertTrue(0 <= distance <= 1)
Example 7
    def test_suggestions_type(self):
        """Test that returned value from meta_suggestions() is correct.

        We know that the internal methods have already been tested, so testing
        the type is the only thing left.
        """
        dataset = DataLoader.get_openml_dataset(openml_id=46, problem_type=0)
        _, idx = MKDatabaseClient().nearest_datasets(dataset=dataset)

        res = MKDatabaseClient().meta_suggestions(dataset, idx[0])

        self.assertTrue(isinstance(res, MLSuggestion))
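
A hedged sketch of how the returned suggestion might be inspected; the attribute names are assumptions modelled on the reduced search space printed in the workflow example below, not a confirmed MLSuggestion interface:

        # Sketch (hypothetical attributes): inspect the suggested components.
        print(res.classifiers)    # assumed attribute
        print(res.preprocessors)  # assumed attribute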
Example 8
    def setUp(self):
        """Initialize global values for the test."""
        self.dataset = \
            DataLoader.get_openml_dataset(openml_id=46, problem_type=0)
        self.expected_metafeatures = [
            mfi.STATS_CLASS_ENTROPY,
            mfi.STATS_CLASS_PROB_MAX,
            mfi.STATS_CLASS_PROB_MEAN,
            mfi.STATS_CLASS_PROB_MIN,
            mfi.STATS_CLASS_PROB_STD,
            mfi.STATS_DS_RATIO,
            mfi.STATS_INV_DS_RATIO,
            mfi.STATS_KURT_MAX,
            mfi.STATS_KURT_MEAN,
            mfi.STATS_KURT_MIN,
            mfi.STATS_KURT_STD,
            mfi.STATS_LANDMARK_1NN,
            mfi.STATS_LANDMARK_DNL,
            mfi.STATS_LANDMARK_DT,
            mfi.STATS_LANDMARK_LDA,
            mfi.STATS_LANDMARK_NB,
            mfi.STATS_LANDMARK_RNL,
            mfi.STATS_LOG_DS_RATIO,
            mfi.STATS_LOG_INV_DS_RATIO,
            mfi.STATS_LOG_N_FEAT,
            mfi.STATS_LOG_N_INST,
            mfi.STATS_N_CAT_FEAT,
            mfi.STATS_N_CLASS,
            mfi.STATS_N_FEAT,
            mfi.STATS_N_FEAT_NA,
            mfi.STATS_N_INST,
            mfi.STATS_N_INST_NA,
            mfi.STATS_N_NA,
            mfi.STATS_N_NF,
            mfi.STATS_PCA_F95V,
            mfi.STATS_PCA_KURT_1PC,
            mfi.STATS_PCA_SKEW_1PC,
            mfi.STATS_PERC_FEAT_NA,
            mfi.STATS_PERC_INST_NA,
            mfi.STATS_PERC_NA,
            mfi.STATS_RATIO_NOM_NUM,
            mfi.STATS_RATIO_NUM_NOM,
            mfi.STATS_SKEW_MAX,
            mfi.STATS_SKEW_MEAN,
            mfi.STATS_SKEW_MIN,
            mfi.STATS_SKEW_STD,
            mfi.STATS_SYM_MAX,
            mfi.STATS_SYM_MEAN,
            mfi.STATS_SYM_MIN,
            mfi.STATS_SYM_STD,
            mfi.STATS_SYM_SUM,
        ]
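
A hypothetical companion test (not present in this snippet) showing how such a setUp is typically consumed; the dataset's metafeatures accessor is an assumed name, not a confirmed API:

    def test_expected_metafeatures(self):
        """Sketch: check that every expected meta-feature is computed."""
        computed = self.dataset.metafeatures  # hypothetical accessor
        for metafeature in self.expected_metafeatures:
            self.assertIn(metafeature, computed)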
Example 9
    def test_model_by_metric_return_type(self):
        """Test the behaviour of the model_by_metric method."""
        dataset = DataLoader.get_openml_dataset(openml_id=46, problem_type=0)
        instances_ids = [251, 75115]
        metric = 'accuracy'

        models = LandmarkModelParser.models_by_metric(instances_ids, dataset,
                                                      metric)

        # Assert type of list
        self.assertTrue(isinstance(models, list))

        # Assert type of each of the list's elements.
        for model in models:
            self.assertTrue(isinstance(model, MLSuggestion))
Example 10
    def test_workflow(self):
        """Test a workflow for the Assistant."""
        # Get dataset
        dataset = DataLoader.get_openml_dataset(openml_id=46, problem_type=0)

        # Start the assistant
        assistant = Assistant(dataset)

        # Compute similar datasets
        assistant.compute_similar_datasets()
        print("Similar datasets", assistant.similar_datasets)

        # Output the resulting reduced search space
        red_ss = assistant.reduced_search_space

        print("classifiers:", red_ss.classifiers)
        print("encoders:", red_ss.encoders)
        print("rescalers:", red_ss.rescalers)
        print("preprocessors:", red_ss.preprocessors)
        print("imputations:", red_ss.imputations)

        # TPOT pipeline generation
        print("Generating pipeline...")
        pipeline_obj = assistant.generate_pipeline()

        # Save TPOT's pipeline and print its validation score
        pipeline_obj.save_pipeline(target_dir="results")
        print(pipeline_obj.validation_score)

        # Run Bayesian optimization on the generated pipeline
        pipeline = pipeline_obj.pipeline
        # Alternatively, the optimizer can be invoked manually:
        # bayesian = BayesianOptimizationPipeline(
        #     dataset,
        #     pipeline,
        #     optimize_on="quality",
        #     iteration=20)

        bopt = assistant.optimize(optimize_on="quality", iteration=5)

        # Print the Bayesian optimizer's score
        print("Score", bopt.score)
        print(bopt.opt_pipeline)

        print("TPOT: {} vs. Bayesian: {}".format(pipeline_obj.validation_score,
                                                 bopt.score))
Example 11
    def test_value_k(self):
        """Test that if a k > maximum_allowd is passed, error raises."""
        dataset = DataLoader.get_openml_dataset(openml_id=46, problem_type=0)
        max_k = len(MKDatabaseClient().metaknwoledge.weighted_matrix()[0])

        # Lower values -> do not fail
        for k in range(1, max_k + 1):
            try:
                _, _ = MKDatabaseClient().nearest_datasets(dataset=dataset,
                                                           k=k)
            except Exception as error:  # pylint: disable=W0703
                self.fail(
                    "nearest_datasets raised for valid k={}: {}".format(
                        k, error))

        # Greater than max -> fails
        with self.assertRaises(ValueError):
            MKDatabaseClient().nearest_datasets(dataset=dataset, k=max_k + 1)

        # 0 -> fails
        with self.assertRaises(ValueError):
            MKDatabaseClient().nearest_datasets(dataset=dataset, k=0)
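
Based on the bounds exercised above, a caller can pre-clamp k to the valid range before querying; requested_k is a hypothetical input, while max_k is computed exactly as in the test:

        # Sketch: keep k within [1, max_k] so nearest_datasets() does not
        # raise ValueError for out-of-range values.
        max_k = len(MKDatabaseClient().metaknwoledge.weighted_matrix()[0])
        k = min(max(requested_k, 1), max_k)  # requested_k: hypothetical input
        dists, idx = MKDatabaseClient().nearest_datasets(dataset=dataset, k=k)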
Example 12
from automl.datahandler.dataloader import DataLoader
from automl.discovery.assistant import Assistant

dataset = DataLoader.get_openml_dataset(openml_id=46, problem_type=0)
assistant = Assistant(dataset)

# Import Pipeline and all the components needed to build it
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import Normalizer
from sklearn.decomposition import FastICA
from tpot.builtins import StackingEstimator
from sklearn.ensemble import ExtraTreesClassifier
from xgboost import XGBClassifier

# Build the Pipeline
pipe = Pipeline([('normalize', Normalizer(norm="max")),
                 ('fast_ica', FastICA()),
                 ('stacking_estimator',
                  StackingEstimator(estimator=ExtraTreesClassifier())),
                 ('Xgboost',
                  XGBClassifier(base_score=0.9,
                                booster="dart",
                                min_child_weight=21,
                                n_estimators=10,
                                reg_alpha=1e-10))])

from automl.bayesianoptimizationpiepeline.base import BayesianOptimizationPipeline

bayesian = BayesianOptimizationPipeline(
    dataset=dataset,
    pipeline=pipe,
    optimize_on="quality",  # same settings as in the test example above
    iteration=7)
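
To run the optimization and read back its results, the same calls exercised in the Bayesian optimization test above apply:

bayesian.optimize_pipeline()
score = bayesian.get_optimized_score()
opt_pipeline = bayesian.get_optimized_pipeline()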