Ejemplo n.º 1
0
 def __init__(self, settings, models=None):
     """Initialize with configuration and an optional model collection.

     ``settings`` may already be an AbstractSettings instance; any other
     value is wrapped in one. ``models`` is stored as given.
     """
     is_wrapped = isinstance(settings, AbstractSettings)
     self.settings = settings if is_wrapped else AbstractSettings(settings)
     self.models = models
     self.logger = logging.getLogger(__name__)
     self.logger.info("Initializing Wallace.")
Ejemplo n.º 2
0
    def setUp(self):
        """Build a four-member model population and the DE independent
        variable selection under test.

        Fix: iterate with ``range`` instead of the Python-2-only
        ``xrange`` so the fixture also runs under Python 3 (behavior is
        identical for simple iteration).
        """
        dependent_variable = DatasetVariable(0)
        independent_variables = [
            DatasetVariable(1),
            DatasetVariable(2),
            DatasetVariable(3)
        ]
        settings = AbstractSettings({
            "differential_evolution.crossover_probability": 1.0,
            "differential_evolution.differential_weight": 1.0,
            "independent_variable_selection.initial_independent_variables_percentage": 0.25
        })

        model_population = []
        # Four identical wrappers; index 0 doubles as the DE target below.
        for _ in range(4):
            model = PredictiveModel(settings, ParameterSet({}),
                                    dependent_variable, independent_variables)
            independent_variable_selection = IndependentVariableSelection(
                settings, dependent_variable, independent_variables)
            model_population.append(
                OptimizationAlgorithmModelWrapper(
                    model, independent_variable_selection))
        self.model_population = model_population
        self.settings = settings
        self.potential_independent_variables = independent_variables
        self.target_wrapper = model_population[0]

        self.de_variable_selection = DEIndependentVariableSelection(
            self.settings, self.target_wrapper, self.model_population,
            self.potential_independent_variables)
 def setUp(self):
     """Create a DESelection configured for guaranteed crossover."""
     config = {
         "differential_evolution.crossover_probability": 1.0,
         "differential_evolution.differential_weight": 1.0,
     }
     self.de_selection = DESelection(AbstractSettings(config))
Ejemplo n.º 4
0
 def __init__(self, settings, models=None):
     """Store configuration, model list, and a module-level logger.

     Raw settings values are wrapped in an AbstractSettings instance;
     an existing AbstractSettings is used as-is.
     """
     if not isinstance(settings, AbstractSettings):
         settings = AbstractSettings(settings)
     self.settings = settings
     self.models = models
     self.logger = logging.getLogger(__name__)
     self.logger.info("Initializing Wallace.")
Ejemplo n.º 5
0
    def setUp(self):
        """Load the sample regression CSV that ships next to this test."""
        self.settings = AbstractSettings()
        self.initialization = WallaceInitialization(self.settings)

        sample = os.path.join(os.path.dirname(__file__),
                              './sample_regression_data.csv')
        self.dataset = self.initialization.read_filename(
            os.path.abspath(sample))
        self.dependent_variable = DatasetVariable("X1")
Ejemplo n.º 6
0
    def test_initializing_small_dataset_with_header(self):
        """Initializing two variables from the headered dataset yields
        variables named after the dataset's header columns."""
        settings = AbstractSettings()
        independent_variables = self.header_dataset.get_independent_variables(
            self.headered_dependent_variable)
        selection = IndependentVariableSelection(
            settings, self.headered_dependent_variable, independent_variables)

        chosen = selection.initialize_independent_variables(2)
        chosen_headers = [variable.variable for variable in chosen]
        self.assertIn("column_1", chosen_headers)
        self.assertIn("column_2", chosen_headers)
    def test_mutate_parameters_without_crossover_probability(self):
        """With a zero crossover probability, mutation returns None."""
        settings = AbstractSettings({
            "differential_evolution.crossover_probability": 0.0,
            "differential_evolution.differential_weight": 1.0
        })
        de_selection = DESelection(settings)
        result = de_selection.mutate(1.0, 1.5, 2.0)

        self.assertEqual(None, result)
Ejemplo n.º 8
0
    def test_increasing_probability_of_variables(self):
        """Boosting one variable puts its selection probability above the
        0.5 mark while another variable stays below it."""
        settings = AbstractSettings()
        independent_variables = self.header_dataset.get_independent_variables(
            self.headered_dependent_variable)
        selection = IndependentVariableSelection(
            settings, self.headered_dependent_variable, independent_variables)
        selection.increase_probability(DatasetVariable("column_1"))

        self.assertLess(
            0.5, selection.get_probability(DatasetVariable("column_1")))
        self.assertGreater(
            0.5, selection.get_probability(DatasetVariable("column_2")))
Ejemplo n.º 9
0
    def setUp(self):
        """Create a DatasetCleaner over a small matrix mixing valid values,
        an empty cell, and "NaN" markers."""
        self.settings = AbstractSettings({
            "dataset.remove_rows_with_missing_data": True,
            "dataset.maximum_missing_data_percentage": 1.0
        })
        self.data_matrix = [
            ["3421", "1232", "hello", "t", "5/12/2003"],
            ["2123", "2221", "mello", "f", "3/12/1995"],
            ["5234", "", "treble", "f", "5/5/2013"],
            ["NaN", "NaN", "bobble", "t", "3/1/2004"],
        ]

        self.dataset_cleaner = DatasetCleaner(self.settings, self.data_matrix)
Ejemplo n.º 10
0
    def test_training_decision_tree_on_simple_dataset(self):
        """Training on a 4-row dataset produces one prediction per row."""
        settings = AbstractSettings({})
        dependent_variable = DatasetVariable(0)
        independent_variables = [DatasetVariable(1)]
        regression = RandomForestRegression(
            settings, self.parameter_set, dependent_variable,
            independent_variables)

        dataset = Dataset([[1, 1], [2, 2], [3, 3], [4, 4]])

        predictions = regression.train(dataset).predict(dataset)

        self.assertEqual(4, len(predictions))
    def setUp(self):
        """Initialize an optimization-algorithm population of five fake
        predictive models over a tiny numeric dataset."""
        settings = AbstractSettings(
            {"optimization_algorithm.population_size": 5})
        predictive_model_generator = PredictiveModelGenerator(settings)
        predictive_model_generator.add_model_type(FakePredictiveModel)
        dataset = Dataset([[1, 2, 3, 4], [2, 3, 4, 5], [3, 4, 5, 6],
                           [0, 1, 3, 4]])
        dependent_variable = DatasetVariable(0)

        algorithm = OptimizationAlgorithm(
            dataset, dependent_variable, settings, predictive_model_generator)
        algorithm.initialize_population()
        self.model_population = algorithm.model_population
Ejemplo n.º 12
0
    def setUp(self):
        """Construct an SklearnModel with variable 0 as the target and
        variables 1-3 as predictors."""
        rows = [[1, 2, 3, "a"], [2, 3, 2, "b"], [3, 2, 1, "a"],
                [5, 5, 1, "c"], [2, 2, 2, "a"]]
        self.dataset = Dataset(rows)
        self.settings = AbstractSettings({})
        self.parameter_set = ParameterSet({})
        self.dependent_variable = DatasetVariable(0)
        self.independent_variables = [DatasetVariable(index)
                                      for index in (1, 2, 3)]

        self.sklearn_model = SklearnModel(self.settings, self.parameter_set,
                                          self.dependent_variable,
                                          self.independent_variables)
    def test_writing_tracking_history_to_file(self):
        """Two tracked steps round-trip through the tracking log file."""
        temporary_file = tempfile.NamedTemporaryFile()
        settings = AbstractSettings({
            "optimization_algorithm_tracking.tracking_log_filename":
            temporary_file.name
        })

        tracking = OptimizationAlgorithmTracking(settings)
        for step in (0, 1):
            tracking.track_step(step, self.model_population)

        logging_results = tracking.read_logging_file(temporary_file.name)

        self.assertEqual(2, len(logging_results))
        # Each entry records its step index and the full 5-model population.
        for step, entry in enumerate(logging_results):
            self.assertEqual(step, entry["step"])
            self.assertEqual(5, len(entry["model_population"]))
Ejemplo n.º 14
0
    def test_cleaning_inconsistent_columns(self):
        """Rows containing "null" markers are dropped and numeric strings
        in the surviving rows are coerced to floats."""
        data_matrix = [
            ["102", "512", "null", "null"],
            ["212", "234", "ss", "mm"],
            ["2.1", "4.3", "ss", "mm"],
            ["231", "321", "ss", "mm"],
            ["null", "null", "bb", "cc"],
            ["4.1", "3.2", "kk", "vv"],
        ]
        settings = AbstractSettings({
            "dataset.remove_rows_with_missing_data": True,
            "dataset.maximum_missing_data_percentage": 0.25
        })

        result_matrix = DatasetCleaner(settings, data_matrix).clean()

        expected_rows = [
            [212.0, 234.0, "ss", "mm"],
            [2.1, 4.3, "ss", "mm"],
            [231.0, 321.0, "ss", "mm"],
            [4.1, 3.2, "kk", "vv"],
        ]
        self.assertEqual(4, len(result_matrix))
        for expected, actual in zip(expected_rows, result_matrix):
            self.assertListEqual(expected, actual)
Ejemplo n.º 15
0
    def setUp(self):
        """Prepare an OptimizationAlgorithm over a small numeric dataset
        with DE crossover and weight both fixed at 1.0."""
        dataset = Dataset([[1, 2, 3, 4], [2, 3, 4, 5], [3, 4, 5, 6],
                           [0, 1, 3, 4]])
        dependent_variable = DatasetVariable(0)
        settings = AbstractSettings({
            "differential_evolution.crossover_probability": 1.0,
            "differential_evolution.differential_weight": 1.0,
            "optimization_algorithm.population_size": 5,
            "independent_variable_selection.initial_independent_variables_percentage": 1.0
        })

        predictive_model_generator = PredictiveModelGenerator(settings)
        predictive_model_generator.add_model_type(FakePredictiveModel)

        self.optimization_algorithm = OptimizationAlgorithm(
            dataset, dependent_variable, settings, predictive_model_generator)
    def test_generation_of_parameter_set_without_crossover(self):
        """With crossover disabled, the generated range parameters match
        the target wrapper's own values."""
        settings = AbstractSettings({
            "differential_evolution.crossover_probability": 0.0,
            "differential_evolution.differential_weight": 0.75,
        })
        selection = DEParameterSelection(settings, self.target_wrapper,
                                         self.model_population,
                                         self.validity_check)
        parameter_set = selection.generate_parameter_set()

        self.assertEqual(0.25, parameter_set.get("range_param_0"))
        self.assertEqual(0.2, parameter_set.get("range_param_1"))
        self.assertEqual(0.5, parameter_set.get("range_param_2"))
        for name in ("category_param_0", "category_param_1"):
            self.assertIn(parameter_set.get(name), ["0", "1", "2", "3"])
    def test_generation_of_parameter_set_with_different_mutation_params(self):
        """With full crossover and weight 0.75, the mutated first range
        parameter lands on one of a small set of reachable values."""
        settings = AbstractSettings({
            "differential_evolution.crossover_probability": 1.0,
            "differential_evolution.differential_weight": 0.75,
        })
        selection = DEParameterSelection(settings, self.target_wrapper,
                                         self.model_population,
                                         self.validity_check)
        parameter_set = selection.generate_parameter_set()

        reachable = [0.275, 0.300, 0.425, 0.475, 0.600, 0.625]
        self.assertIn(round(parameter_set.get("range_param_0"), 3), reachable)
        self.assertAlmostEqual(0.3, parameter_set.get("range_param_1"))
        self.assertAlmostEqual(0.5, parameter_set.get("range_param_2"))
        for name in ("category_param_0", "category_param_1"):
            self.assertIn(parameter_set.get(name), ["0", "1", "2", "3"])
Ejemplo n.º 18
0
    def test_cleaning_sparse_columns(self):
        """Dense columns have rows with missing values removed; a column
        that is mostly None keeps its None entries after cleaning."""
        settings = AbstractSettings({
            "dataset.remove_rows_with_missing_data": True,
            "dataset.maximum_missing_data_percentage": 0.25
        })
        non_sparse_data_matrix = [["13", "123"], ["23", "234"], ["34", "455"],
                                  ["12", "345"], ["11", "235"], ["34", "234"],
                                  [None, "234"]]
        sparse_data_matrix = [
            [None, "324"],
            [None, "232"],
            [None, "123"],
            [None, "234"],
            [None, "234"],
            ["1111", "234"],
            ["4324", None],
        ]

        # Dense first column: the single row with a missing value is dropped
        # and numeric strings are coerced to numbers.
        matrix = DatasetCleaner(settings, non_sparse_data_matrix).clean()
        expected = [[13, 123], [23, 234], [34, 455],
                    [12, 345], [11, 235], [34, 234]]
        self.assertEqual(6, len(matrix))
        for expected_row, actual_row in zip(expected, matrix):
            self.assertListEqual(expected_row, actual_row)

        # Sparse first column: the None entries survive; only the row with
        # a missing value in the dense second column is removed.
        matrix = DatasetCleaner(settings, sparse_data_matrix).clean()
        expected = [[None, 324], [None, 232], [None, 123],
                    [None, 234], [None, 234], [1111, 234]]
        self.assertEqual(6, len(matrix))
        for expected_row, actual_row in zip(expected, matrix):
            self.assertListEqual(expected_row, actual_row)
    def setUp(self):
        """Build a four-model population with known parameter values and
        the DE parameter selection under test.

        Fix: iterate with ``range`` instead of the Python-2-only
        ``xrange`` so the fixture also runs under Python 3 (behavior is
        identical for simple iteration).
        """
        dependent_variable = DatasetVariable(0)
        independent_variables = [DatasetVariable(1), DatasetVariable(2)]
        settings = AbstractSettings({
            "differential_evolution.crossover_probability": 1.0,
            "differential_evolution.differential_weight": 1.0,
        })

        model_population = []
        for i in range(4):
            # The target (i == 0) gets a distinct range_param_1 so
            # mutation effects are observable in the tests.
            if i == 0:
                param1 = 0.2
            else:
                param1 = 0.3
            parameter_values = {
                "range_param_0": 0.25 + 0.1 * i,
                "range_param_1": param1,
                "range_param_2": 0.5,
                "category_param_0": "0",
                "category_param_1": "0"
            }
            parameter_set = ParameterSet(parameter_values)
            model = FakePredictiveModel(settings, parameter_set,
                                        dependent_variable,
                                        independent_variables)
            wrapper = OptimizationAlgorithmModelWrapper(
                model, "fake_independent_variable_selection")
            model_population.append(wrapper)

        self.model_population = model_population
        self.validity_check = FakePredictiveModel.validity_check()
        self.settings = settings
        self.target_wrapper = model_population[0]
        self.de_parameter_selection = DEParameterSelection(
            settings, self.target_wrapper, model_population,
            self.validity_check)
Ejemplo n.º 20
0
 def setUp(self):
     """Point a DatasetFileReader at the sample CSV beside this test."""
     settings = AbstractSettings({})
     directory = os.path.dirname(__file__)
     path = os.path.abspath(os.path.join(directory, './sample_dataset.csv'))
     self.dataset_file_reader = DatasetFileReader(settings, path)
Ejemplo n.º 21
0
class WallaceInitialization(object):
    """Entry point wiring together dataset loading, predictive model
    generation, and a differential evolution run."""

    # Model types available by default. The None weights are normalized
    # into equal weights by WeightedSelection.
    DEFAULT_PREDICTIVE_MODELS = {
            BayesianRidgeRegression: None,
            DecisionTreeRegression: None,
            LarsLassoRegression: None,
            LarsRegression: None,
            LassoRegression: None,
            OLSLinearRegression: None,
            RidgeRegression: None,
            SvmSvcRegression: None
        }

    def __init__(self, settings, models=None):
        """Store settings (wrapping raw values in AbstractSettings), the
        optional model collection, and a module logger."""
        if isinstance(settings, AbstractSettings):
            self.settings = settings
        else:
            self.settings = AbstractSettings(settings)
        self.models = models
        self.logger = logging.getLogger(__name__)
        self.logger.info("Initializing Wallace.")

    def create_predictive_model_generator(self, models=None):
        """Build a PredictiveModelGenerator from ``models``.

        ``models`` may be None (use DEFAULT_PREDICTIVE_MODELS), a list of
        model classes (all unweighted), or a mapping of model class to
        weight. Weights are normalized via WeightedSelection.
        """
        # Fix: identity comparison with None (PEP 8) instead of `==`,
        # which would invoke arbitrary __eq__ implementations.
        if models is None:
            models = WeightedSelection(self.DEFAULT_PREDICTIVE_MODELS).normalize_weights()
        elif isinstance(models, list):
            predictive_models = {}
            for model in models:
                predictive_models[model] = None
            models = WeightedSelection(predictive_models).normalize_weights()
        else:
            models = WeightedSelection(models).normalize_weights()

        predictive_model_generator = PredictiveModelGenerator(self.settings)
        predictive_model_generator.set_model_types(models)

        return predictive_model_generator

    def run_differential_evolution(self, dataset, dependent_variable):
        """Run differential evolution over ``dataset`` using the default
        predictive model generator."""
        self.logger.info("Running differential evolution on dataset.")
        predictive_model_generator = self.create_predictive_model_generator()
        differential_evolution = DifferentialEvolution(dataset, dependent_variable, self.settings, predictive_model_generator)
        differential_evolution.run()

    def read_filename(self, dataset_filename):
        """Read a dataset from ``dataset_filename``, optionally cleaning
        and transforming it per the settings flag."""
        dataset = DatasetFileReader(self.settings, dataset_filename).read()
        if self.settings.get("dataset_transformation.transform_datasets"):
            return self.clean_and_transform_data(dataset)
        else:
            return dataset

    def clean_and_transform_data(self, dataset):
        """Apply the configured dataset transformations."""
        return DatasetTransformer(self.settings).transform(dataset)

    @classmethod
    def initialize(klass, settings, dependent_variable, dataset_filename):
        """Convenience entry point: read the dataset and run DE on it."""
        initialization = WallaceInitialization(settings)
        initialization.settings.set("dataset.dataset_filename", dataset_filename)
        dataset = initialization.read_filename(dataset_filename)

        if not isinstance(dependent_variable, DatasetVariable):
            dependent_variable = DatasetVariable(dependent_variable)

        initialization.run_differential_evolution(dataset, dependent_variable)

    @classmethod
    def initialize_multiprocess_pool(klass, settings, dependent_variable, dataset_filename, processes=10):
        """Run ``initialize`` asynchronously on a worker pool.

        Fix: return the AsyncResult so callers can wait on the run and
        surface worker exceptions (previously it was discarded, losing
        any error raised in the worker).
        NOTE(review): the pool is never closed or joined here — confirm
        whether callers are expected to manage its lifetime.
        """
        pool = Pool(processes=processes)
        result = pool.apply_async(klass.initialize, args=(settings, dependent_variable, dataset_filename))
        return result
Ejemplo n.º 22
0
class WallaceInitialization(object):
    """Entry point wiring together dataset loading, predictive model
    generation, and a differential evolution run."""

    # Model types available by default. The None weights are normalized
    # into equal weights by WeightedSelection.
    DEFAULT_PREDICTIVE_MODELS = {
        BayesianRidgeRegression: None,
        DecisionTreeRegression: None,
        LarsLassoRegression: None,
        LarsRegression: None,
        LassoRegression: None,
        OLSLinearRegression: None,
        RidgeRegression: None,
        SvmSvcRegression: None
    }

    def __init__(self, settings, models=None):
        """Store settings (wrapping raw values in AbstractSettings), the
        optional model collection, and a module logger."""
        if isinstance(settings, AbstractSettings):
            self.settings = settings
        else:
            self.settings = AbstractSettings(settings)
        self.models = models
        self.logger = logging.getLogger(__name__)
        self.logger.info("Initializing Wallace.")

    def create_predictive_model_generator(self, models=None):
        """Build a PredictiveModelGenerator from ``models``.

        ``models`` may be None (use DEFAULT_PREDICTIVE_MODELS), a list of
        model classes (all unweighted), or a mapping of model class to
        weight. Weights are normalized via WeightedSelection.
        """
        # Fix: identity comparison with None (PEP 8) instead of `==`,
        # which would invoke arbitrary __eq__ implementations.
        if models is None:
            models = WeightedSelection(
                self.DEFAULT_PREDICTIVE_MODELS).normalize_weights()
        elif isinstance(models, list):
            predictive_models = {}
            for model in models:
                predictive_models[model] = None
            models = WeightedSelection(predictive_models).normalize_weights()
        else:
            models = WeightedSelection(models).normalize_weights()

        predictive_model_generator = PredictiveModelGenerator(self.settings)
        predictive_model_generator.set_model_types(models)

        return predictive_model_generator

    def run_differential_evolution(self, dataset, dependent_variable):
        """Run differential evolution over ``dataset`` using the default
        predictive model generator."""
        self.logger.info("Running differential evolution on dataset.")
        predictive_model_generator = self.create_predictive_model_generator()
        differential_evolution = DifferentialEvolution(
            dataset, dependent_variable, self.settings,
            predictive_model_generator)
        differential_evolution.run()

    def read_filename(self, dataset_filename):
        """Read a dataset from ``dataset_filename``, optionally cleaning
        and transforming it per the settings flag."""
        dataset = DatasetFileReader(self.settings, dataset_filename).read()
        if self.settings.get("dataset_transformation.transform_datasets"):
            return self.clean_and_transform_data(dataset)
        else:
            return dataset

    def clean_and_transform_data(self, dataset):
        """Apply the configured dataset transformations."""
        return DatasetTransformer(self.settings).transform(dataset)

    @classmethod
    def initialize(klass, settings, dependent_variable, dataset_filename):
        """Convenience entry point: read the dataset and run DE on it."""
        initialization = WallaceInitialization(settings)
        initialization.settings.set("dataset.dataset_filename",
                                    dataset_filename)
        dataset = initialization.read_filename(dataset_filename)

        if not isinstance(dependent_variable, DatasetVariable):
            dependent_variable = DatasetVariable(dependent_variable)

        initialization.run_differential_evolution(dataset, dependent_variable)

    @classmethod
    def initialize_multiprocess_pool(klass,
                                     settings,
                                     dependent_variable,
                                     dataset_filename,
                                     processes=10):
        """Run ``initialize`` asynchronously on a worker pool.

        Fix: return the AsyncResult so callers can wait on the run and
        surface worker exceptions (previously it was discarded, losing
        any error raised in the worker).
        NOTE(review): the pool is never closed or joined here — confirm
        whether callers are expected to manage its lifetime.
        """
        pool = Pool(processes=processes)
        result = pool.apply_async(klass.initialize,
                                  args=(settings, dependent_variable,
                                        dataset_filename))
        return result
Ejemplo n.º 23
0
 def setUp(self):
     """Create a DatasetTransformer limited to the identity transformation."""
     self.transformations = [IdentityTransformation]
     self.settings = AbstractSettings(
         {"dataset_transformation.default_transformations": True})
     self.transformer = DatasetTransformer(
         self.settings, self.transformations)
Ejemplo n.º 24
0
 def setUp(self):
     """Provide default (argument-less) AbstractSettings for each test."""
     self.settings = AbstractSettings()
Ejemplo n.º 25
0
 def setUp(self):
     """Create an IdentityTransformation backed by default settings."""
     self.dataset_transformation = IdentityTransformation(AbstractSettings())