Example #1
    def test_serialize_complex_flow(self):
        ohe = sklearn.preprocessing.OneHotEncoder(categorical_features=[0])
        scaler = sklearn.preprocessing.StandardScaler(with_mean=False)
        boosting = sklearn.ensemble.AdaBoostClassifier(
            base_estimator=sklearn.tree.DecisionTreeClassifier())
        model = sklearn.pipeline.Pipeline(steps=(
            ('ohe', ohe), ('scaler', scaler), ('boosting', boosting)))
        parameter_grid = {'n_estimators': [1, 5, 10, 100],
                          'learning_rate': scipy.stats.uniform(0.01, 0.99),
                          'base_estimator__max_depth': scipy.stats.randint(1,
                                                                           10)}
        cv = sklearn.model_selection.StratifiedKFold(n_splits=5, shuffle=True)
        rs = sklearn.model_selection.RandomizedSearchCV(
            estimator=model, param_distributions=parameter_grid, cv=cv)
        serialized = sklearn_to_flow(rs)

        fixture_name = 'sklearn.model_selection._search.RandomizedSearchCV(' \
                       'estimator=sklearn.pipeline.Pipeline(' \
                       'ohe=sklearn.preprocessing.data.OneHotEncoder,' \
                       'scaler=sklearn.preprocessing.data.StandardScaler,' \
                       'boosting=sklearn.ensemble.weight_boosting.AdaBoostClassifier(' \
                       'base_estimator=sklearn.tree.tree.DecisionTreeClassifier)))'
        self.assertEqual(serialized.name, fixture_name)

        # now do deserialization
        deserialized = flow_to_sklearn(serialized)

        # Checks that sklearn_to_flow is idempotent.
        serialized2 = sklearn_to_flow(deserialized)
        self.assertNotEqual(rs, deserialized)
        # Would raise an exception if the flows were unequal
        assert_flows_equal(serialized, serialized2)
Example #2
    def test_serialize_complex_flow(self):
        ohe = sklearn.preprocessing.OneHotEncoder(categorical_features=[0])
        scaler = sklearn.preprocessing.StandardScaler(with_mean=False)
        boosting = sklearn.ensemble.AdaBoostClassifier(
            base_estimator=sklearn.tree.DecisionTreeClassifier())
        model = sklearn.pipeline.Pipeline(steps=(('ohe', ohe), ('scaler',
                                                                scaler),
                                                 ('boosting', boosting)))
        parameter_grid = {
            'n_estimators': [1, 5, 10, 100],
            'learning_rate': scipy.stats.uniform(0.01, 0.99),
            'base_estimator__max_depth': scipy.stats.randint(1, 10)
        }
        cv = sklearn.model_selection.StratifiedKFold(n_splits=5, shuffle=True)
        rs = sklearn.model_selection.RandomizedSearchCV(
            estimator=model, param_distributions=parameter_grid, cv=cv)
        serialized = sklearn_to_flow(rs)

        fixture_name = 'sklearn.model_selection._search.RandomizedSearchCV(' \
                       'estimator=sklearn.pipeline.Pipeline(' \
                       'ohe=sklearn.preprocessing.data.OneHotEncoder,' \
                       'scaler=sklearn.preprocessing.data.StandardScaler,' \
                       'boosting=sklearn.ensemble.weight_boosting.AdaBoostClassifier(' \
                       'base_estimator=sklearn.tree.tree.DecisionTreeClassifier)))'
        self.assertEqual(serialized.name, fixture_name)

        # now do deserialization
        deserialized = flow_to_sklearn(serialized)

        # Checks that sklearn_to_flow is idempotent.
        serialized2 = sklearn_to_flow(deserialized)
        self.assertNotEqual(rs, deserialized)
        # Would raise an exception if the flows were unequal
        assert_flows_equal(serialized, serialized2)
Example #3
    def test_serialize_feature_union(self):
        ohe = sklearn.preprocessing.OneHotEncoder(sparse=False)
        scaler = sklearn.preprocessing.StandardScaler()
        fu = sklearn.pipeline.FeatureUnion(
            transformer_list=[('ohe', ohe), ('scaler', scaler)])
        serialization = sklearn_to_flow(fu)
        self.assertEqual(
            serialization.name, 'sklearn.pipeline.FeatureUnion('
            'ohe=sklearn.preprocessing.data.OneHotEncoder,'
            'scaler=sklearn.preprocessing.data.StandardScaler)')
        new_model = flow_to_sklearn(serialization)

        self.assertEqual(type(new_model), type(fu))
        self.assertIsNot(new_model, fu)
        self.assertEqual(new_model.transformer_list[0][0],
                         fu.transformer_list[0][0])
        self.assertEqual(new_model.transformer_list[0][1].get_params(),
                         fu.transformer_list[0][1].get_params())
        self.assertEqual(new_model.transformer_list[1][0],
                         fu.transformer_list[1][0])
        self.assertEqual(new_model.transformer_list[1][1].get_params(),
                         fu.transformer_list[1][1].get_params())

        self.assertEqual([step[0] for step in new_model.transformer_list],
                         [step[0] for step in fu.transformer_list])
        self.assertIsNot(new_model.transformer_list[0][1],
                         fu.transformer_list[0][1])
        self.assertIsNot(new_model.transformer_list[1][1],
                         fu.transformer_list[1][1])

        new_model_params = new_model.get_params()
        del new_model_params['ohe']
        del new_model_params['scaler']
        del new_model_params['transformer_list']
        fu_params = fu.get_params()
        del fu_params['ohe']
        del fu_params['scaler']
        del fu_params['transformer_list']

        self.assertEqual(new_model_params, fu_params)
        new_model.fit(self.X, self.y)

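        # Setting a transformer to None should drop it from the serialized
        # flow name and deserialize back to None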
        fu.set_params(scaler=None)
        serialization = sklearn_to_flow(fu)
        self.assertEqual(
            serialization.name, 'sklearn.pipeline.FeatureUnion('
            'ohe=sklearn.preprocessing.data.OneHotEncoder)')
        new_model = flow_to_sklearn(serialization)
        self.assertEqual(type(new_model), type(fu))
        self.assertIsNot(new_model, fu)
        self.assertIs(new_model.transformer_list[1][1], None)
Example #4
    def test_serialize_feature_union(self):
        ohe = sklearn.preprocessing.OneHotEncoder(sparse=False)
        scaler = sklearn.preprocessing.StandardScaler()
        fu = sklearn.pipeline.FeatureUnion(transformer_list=[('ohe', ohe),
                                                             ('scaler', scaler)])
        serialization = sklearn_to_flow(fu)
        self.assertEqual(serialization.name,
                         'sklearn.pipeline.FeatureUnion('
                         'ohe=sklearn.preprocessing.data.OneHotEncoder,'
                         'scaler=sklearn.preprocessing.data.StandardScaler)')
        new_model = flow_to_sklearn(serialization)

        self.assertEqual(type(new_model), type(fu))
        self.assertIsNot(new_model, fu)
        self.assertEqual(new_model.transformer_list[0][0],
                         fu.transformer_list[0][0])
        self.assertEqual(new_model.transformer_list[0][1].get_params(),
                         fu.transformer_list[0][1].get_params())
        self.assertEqual(new_model.transformer_list[1][0],
                         fu.transformer_list[1][0])
        self.assertEqual(new_model.transformer_list[1][1].get_params(),
                         fu.transformer_list[1][1].get_params())

        self.assertEqual([step[0] for step in new_model.transformer_list],
                         [step[0] for step in fu.transformer_list])
        self.assertIsNot(new_model.transformer_list[0][1], fu.transformer_list[0][1])
        self.assertIsNot(new_model.transformer_list[1][1], fu.transformer_list[1][1])

        new_model_params = new_model.get_params()
        del new_model_params['ohe']
        del new_model_params['scaler']
        del new_model_params['transformer_list']
        fu_params = fu.get_params()
        del fu_params['ohe']
        del fu_params['scaler']
        del fu_params['transformer_list']

        self.assertEqual(new_model_params, fu_params)
        new_model.fit(self.X, self.y)

        fu.set_params(scaler=None)
        serialization = sklearn_to_flow(fu)
        self.assertEqual(serialization.name,
                         'sklearn.pipeline.FeatureUnion('
                         'ohe=sklearn.preprocessing.data.OneHotEncoder)')
        new_model = flow_to_sklearn(serialization)
        self.assertEqual(type(new_model), type(fu))
        self.assertIsNot(new_model, fu)
        self.assertIs(new_model.transformer_list[1][1], None)
Example #5
def testFunction(data):
    #clf = sklearn.ensemble.forest.RandomForestClassifier(bootstrap:true,weight:null,criterion:"gini",depth:null,features:"auto",nodes:null,decrease:0.0,split:null,leaf:1,split:2,leaf:0.0,estimators:10,jobs:1,score:false,state:6826,verbose:0,start:false)
    #X, y, features = data.get_data(target=data.default_target_attribute, return_attribute_names=True);

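    # Fetch an existing run from the OpenML server and print the id of the
    # flow it used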
    run = oml.runs.get_run(1836360)
    print(run.flow_id)
    #flow = oml.flows.get_flow(4834)
    flow = oml.flows.get_flow(8900)
    #flow = oml.flows.get_flow(8426)
    #flow = oml.flows.get_flow(7650)
    flow = oml.flows.flow_to_sklearn(flow)
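    # Wrap the deserialized estimator in a pipeline that imputes missing
    # values first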
    clf = pipeline.Pipeline(steps=[('imputer', impute.SimpleImputer()),
                                   ('estimator', flow)])
    flow = flows.sklearn_to_flow(clf)
    print(flow.model)
    taskId = tasks.get_task(55)

    run = runs.run_flow_on_task(taskId, flow, avoid_duplicate_runs=True)

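    # Average the per-fold predictive accuracy (assumes the task uses 10 folds)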
    feval = dict(run.fold_evaluations['predictive_accuracy'][0])
    acc = 0
    for val in feval.values():
        acc += val
    print(acc / 10)
Example #6
    def test_serialize_cvobject(self):
        methods = [
            sklearn.model_selection.KFold(3),
            sklearn.model_selection.LeaveOneOut()
        ]
        fixtures = [
            OrderedDict([
                ('oml-python:serialized_object', 'cv_object'),
                ('value',
                 OrderedDict([('name', 'sklearn.model_selection._split.KFold'),
                              ('parameters',
                               OrderedDict([('n_splits', '3'),
                                            ('random_state', 'null'),
                                            ('shuffle', 'false')]))]))
            ]),
            OrderedDict([
                ('oml-python:serialized_object', 'cv_object'),
                ('value',
                 OrderedDict([('name',
                               'sklearn.model_selection._split.LeaveOneOut'),
                              ('parameters', OrderedDict())]))
            ])
        ]
        for method, fixture in zip(methods, fixtures):
            m = sklearn_to_flow(method)
            self.assertEqual(m, fixture)

            m_new = flow_to_sklearn(m)
            self.assertIsNot(m_new, m)
            self.assertIsInstance(m_new, type(method))
Example #7
 def test_serialize_resampling(self):
     kfold = sklearn.model_selection.StratifiedKFold(n_splits=4,
                                                     shuffle=True)
     serialized = sklearn_to_flow(kfold)
     deserialized = flow_to_sklearn(serialized)
     # Best approximation to get_params()
     self.assertEqual(str(deserialized), str(kfold))
     self.assertIsNot(deserialized, kfold)
Example #8
 def test_serialize_resampling(self):
     kfold = sklearn.model_selection.StratifiedKFold(
         n_splits=4, shuffle=True)
     serialized = sklearn_to_flow(kfold)
     deserialized = flow_to_sklearn(serialized)
     # Best approximation to get_params()
     self.assertEqual(str(deserialized), str(kfold))
     self.assertIsNot(deserialized, kfold)
Example #9
 def test_serialize_feature_union_switched_names(self):
     ohe = sklearn.preprocessing.OneHotEncoder()
     scaler = sklearn.preprocessing.StandardScaler()
     fu1 = sklearn.pipeline.FeatureUnion(transformer_list=[('ohe', ohe), ('scaler', scaler)])
     fu2 = sklearn.pipeline.FeatureUnion(transformer_list=[('scaler', ohe), ('ohe', scaler)])
     fu1_serialization = sklearn_to_flow(fu1)
     fu2_serialization = sklearn_to_flow(fu2)
     self.assertEqual(
         fu1_serialization.name,
         "sklearn.pipeline.FeatureUnion("
         "ohe=sklearn.preprocessing.data.OneHotEncoder,"
         "scaler=sklearn.preprocessing.data.StandardScaler)")
     self.assertEqual(
         fu2_serialization.name,
         "sklearn.pipeline.FeatureUnion("
         "scaler=sklearn.preprocessing.data.OneHotEncoder,"
         "ohe=sklearn.preprocessing.data.StandardScaler)")
Example #10
 def test_serialize_feature_union_switched_names(self):
     ohe = sklearn.preprocessing.OneHotEncoder()
     scaler = sklearn.preprocessing.StandardScaler()
     fu1 = sklearn.pipeline.FeatureUnion(
         transformer_list=[('ohe', ohe), ('scaler', scaler)])
     fu2 = sklearn.pipeline.FeatureUnion(
         transformer_list=[('scaler', ohe), ('ohe', scaler)])
     fu1_serialization = sklearn_to_flow(fu1)
     fu2_serialization = sklearn_to_flow(fu2)
     self.assertEqual(
         fu1_serialization.name, "sklearn.pipeline.FeatureUnion("
         "ohe=sklearn.preprocessing.data.OneHotEncoder,"
         "scaler=sklearn.preprocessing.data.StandardScaler)")
     self.assertEqual(
         fu2_serialization.name, "sklearn.pipeline.FeatureUnion("
         "scaler=sklearn.preprocessing.data.OneHotEncoder,"
         "ohe=sklearn.preprocessing.data.StandardScaler)")
Example #11
    def test_serialize_type(self):
        supported_types = [float, np.float, np.float32, np.float64,
                           int, np.int, np.int32, np.int64]

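        # Each supported type should survive a serialize/deserialize round
        # trip unchanged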
        for supported_type in supported_types:
            serialized = sklearn_to_flow(supported_type)
            deserialized = flow_to_sklearn(serialized)
            self.assertEqual(deserialized, supported_type)
Example #12
    def test_serialize_pipeline(self):
        scaler = sklearn.preprocessing.StandardScaler(with_mean=False)
        dummy = sklearn.dummy.DummyClassifier(strategy='prior')
        model = sklearn.pipeline.Pipeline(steps=(
            ('scaler', scaler), ('dummy', dummy)))

        fixture_name = 'sklearn.pipeline.Pipeline(' \
                       'scaler=sklearn.preprocessing.data.StandardScaler,' \
                       'dummy=sklearn.dummy.DummyClassifier)'
        fixture_description = 'Automatically created scikit-learn flow.'

        serialization = sklearn_to_flow(model)

        self.assertEqual(serialization.name, fixture_name)
        self.assertEqual(serialization.description, fixture_description)

        # Comparing the pipeline
        # The parameters only have the name of the base objects (not the
        # whole flow) as value
        self.assertEqual(len(serialization.parameters), 1)
        # Hard to compare two representations of a dict due to possibly
        # different sorting; converting to JSON makes it easier
        self.assertEqual(
            json.loads(serialization.parameters['steps']),
            [{'oml-python:serialized_object': 'component_reference',
              'value': {'key': 'scaler', 'step_name': 'scaler'}},
             {'oml-python:serialized_object': 'component_reference',
              'value': {'key': 'dummy', 'step_name': 'dummy'}}])

        # Checking the sub-component
        self.assertEqual(len(serialization.components), 2)
        self.assertIsInstance(serialization.components['scaler'],
                              OpenMLFlow)
        self.assertIsInstance(serialization.components['dummy'],
                              OpenMLFlow)

        #del serialization.model
        new_model = flow_to_sklearn(serialization)

        self.assertEqual(type(new_model), type(model))
        self.assertIsNot(new_model, model)

        self.assertEqual([step[0] for step in new_model.steps],
                         [step[0] for step in model.steps])
        self.assertIsNot(new_model.steps[0][1], model.steps[0][1])
        self.assertIsNot(new_model.steps[1][1], model.steps[1][1])

        new_model_params = new_model.get_params()
        del new_model_params['scaler']
        del new_model_params['dummy']
        del new_model_params['steps']
        fu_params = model.get_params()
        del fu_params['scaler']
        del fu_params['dummy']
        del fu_params['steps']

        self.assertEqual(new_model_params, fu_params)
        new_model.fit(self.X, self.y)
Example #13
    def test_serialize_type(self):
        supported_types = [
            float, np.float, np.float32, np.float64, int, np.int, np.int32,
            np.int64
        ]

        for supported_type in supported_types:
            serialized = sklearn_to_flow(supported_type)
            deserialized = flow_to_sklearn(serialized)
            self.assertEqual(deserialized, supported_type)
Example #14
    def test_hypothetical_parameter_values(self):
        # The hypothetical parameter values of true, 1, 0.1 formatted as a
        # string (and their correct serialization and deserialization) can
        # only be checked inside a model

        model = Model('true', '1', '0.1')

        serialized = sklearn_to_flow(model)
        deserialized = flow_to_sklearn(serialized)
        self.assertEqual(deserialized.get_params(), model.get_params())
        self.assertIsNot(deserialized, model)
Example #15
    def test_hypothetical_parameter_values(self):
        # The hypothetical parameter values of true, 1, 0.1 formatted as a
        # string (and their correct serialization and deserialization) can
        # only be checked inside a model

        model = Model('true', '1', '0.1')

        serialized = sklearn_to_flow(model)
        deserialized = flow_to_sklearn(serialized)
        self.assertEqual(deserialized.get_params(), model.get_params())
        self.assertIsNot(deserialized, model)
Example #16
    def test_serialize_advanced_grid(self):
        # TODO: a GridSearchCV object should be serialized instead

        # This needs to be in its own function because we cannot simply check
        # the grids for equality: scikit-learn objects don't really support
        # the equality operator. This will only work with sklearn==0.18.
        N_FEATURES_OPTIONS = [2, 4, 8]
        C_OPTIONS = [1, 10, 100, 1000]
        grid = [
            {'reduce_dim': [sklearn.decomposition.PCA(iterated_power=7),
                            sklearn.decomposition.NMF()],
             'reduce_dim__n_components': N_FEATURES_OPTIONS,
             'classify__C': C_OPTIONS},
            {'reduce_dim': [sklearn.feature_selection.SelectKBest(
                sklearn.feature_selection.chi2)],
             'reduce_dim__k': N_FEATURES_OPTIONS,
             'classify__C': C_OPTIONS},
        ]

        serialized = sklearn_to_flow(grid)
        deserialized = flow_to_sklearn(serialized)

        self.assertEqual(grid[0]['reduce_dim'][0].get_params(),
                         deserialized[0]['reduce_dim'][0].get_params())
        self.assertIsNot(grid[0]['reduce_dim'][0],
                         deserialized[0]['reduce_dim'][0])
        self.assertEqual(grid[0]['reduce_dim'][1].get_params(),
                         deserialized[0]['reduce_dim'][1].get_params())
        self.assertIsNot(grid[0]['reduce_dim'][1],
                         deserialized[0]['reduce_dim'][1])
        self.assertEqual(grid[0]['reduce_dim__n_components'],
                         deserialized[0]['reduce_dim__n_components'])
        self.assertEqual(grid[0]['classify__C'],
                         deserialized[0]['classify__C'])
        self.assertEqual(grid[1]['reduce_dim'][0].get_params(),
                         deserialized[1]['reduce_dim'][0].get_params())
        self.assertIsNot(grid[1]['reduce_dim'][0],
                         deserialized[1]['reduce_dim'][0])
        self.assertEqual(grid[1]['reduce_dim__k'],
                         deserialized[1]['reduce_dim__k'])
        self.assertEqual(grid[1]['classify__C'],
                         deserialized[1]['classify__C'])
Example #17
    def test_serialize_rvs(self):
        supported_rvs = [scipy.stats.norm(loc=1, scale=5),
                         scipy.stats.expon(loc=1, scale=5),
                         scipy.stats.randint(low=-3, high=15)]

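        # Frozen scipy distributions cannot be compared directly, so compare
        # the type of .dist and the remaining instance attributes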
        for supported_rv in supported_rvs:
            serialized = sklearn_to_flow(supported_rv)
            deserialized = flow_to_sklearn(serialized)
            self.assertEqual(type(deserialized.dist), type(supported_rv.dist))
            del deserialized.dist
            del supported_rv.dist
            self.assertEqual(deserialized.__dict__,
                             supported_rv.__dict__)
Example #18
    def test_serialize_simple_parameter_grid(self):

        # We cannot easily test for scipy random variables in here, but they
        # should be covered

        # Examples from the scikit-learn documentation
        models = [sklearn.svm.SVC(), sklearn.ensemble.RandomForestClassifier()]
        grids = \
            [[{'C': [1, 10, 100, 1000], 'kernel': ['linear']},
              {'C': [1, 10, 100, 1000], 'gamma': [0.001, 0.0001],
               'kernel': ['rbf']}],
             {"max_depth": [3, None],
              "max_features": [1, 3, 10],
              "min_samples_split": [1, 3, 10],
              "min_samples_leaf": [1, 3, 10],
              "bootstrap": [True, False],
              "criterion": ["gini", "entropy"]}]

        for grid, model in zip(grids, models):
            serialized = sklearn_to_flow(grid)
            deserialized = flow_to_sklearn(serialized)

            self.assertEqual(deserialized, grid)
            self.assertIsNot(deserialized, grid)

            hpo = sklearn.model_selection.GridSearchCV(
                param_grid=grid, estimator=model)

            serialized = sklearn_to_flow(hpo)
            deserialized = flow_to_sklearn(serialized)
            self.assertEqual(hpo.param_grid, deserialized.param_grid)
            self.assertEqual(hpo.estimator.get_params(),
                             deserialized.estimator.get_params())
            hpo_params = hpo.get_params(deep=False)
            deserialized_params = deserialized.get_params(deep=False)
            del hpo_params['estimator']
            del deserialized_params['estimator']
            self.assertEqual(hpo_params, deserialized_params)
Example #19
    def test_serialize_simple_parameter_grid(self):

        # We cannot easily test for scipy random variables in here, but they
        # should be covered

        # Examples from the scikit-learn documentation
        models = [sklearn.svm.SVC(), sklearn.ensemble.RandomForestClassifier()]
        grids = \
            [[{'C': [1, 10, 100, 1000], 'kernel': ['linear']},
              {'C': [1, 10, 100, 1000], 'gamma': [0.001, 0.0001],
               'kernel': ['rbf']}],
             {"max_depth": [3, None],
              "max_features": [1, 3, 10],
              "min_samples_split": [1, 3, 10],
              "min_samples_leaf": [1, 3, 10],
              "bootstrap": [True, False],
              "criterion": ["gini", "entropy"]}]

        for grid, model in zip(grids, models):
            serialized = sklearn_to_flow(grid)
            deserialized = flow_to_sklearn(serialized)

            self.assertEqual(deserialized, grid)
            self.assertIsNot(deserialized, grid)

            hpo = sklearn.model_selection.GridSearchCV(param_grid=grid,
                                                       estimator=model)

            serialized = sklearn_to_flow(hpo)
            deserialized = flow_to_sklearn(serialized)
            self.assertEqual(hpo.param_grid, deserialized.param_grid)
            self.assertEqual(hpo.estimator.get_params(),
                             deserialized.estimator.get_params())
            hpo_params = hpo.get_params(deep=False)
            deserialized_params = deserialized.get_params(deep=False)
            del hpo_params['estimator']
            del deserialized_params['estimator']
            self.assertEqual(hpo_params, deserialized_params)
Example #20
    def test_serialize_rvs(self):
        supported_rvs = [
            scipy.stats.norm(loc=1, scale=5),
            scipy.stats.expon(loc=1, scale=5),
            scipy.stats.randint(low=-3, high=15)
        ]

        for supported_rv in supported_rvs:
            serialized = sklearn_to_flow(supported_rv)
            deserialized = flow_to_sklearn(serialized)
            self.assertEqual(type(deserialized.dist), type(supported_rv.dist))
            del deserialized.dist
            del supported_rv.dist
            self.assertEqual(deserialized.__dict__, supported_rv.__dict__)
Example #21
 def test_subflow_version_propagated(self):
     this_directory = os.path.dirname(os.path.abspath(__file__))
     tests_directory = os.path.abspath(os.path.join(this_directory, '..', '..'))
     sys.path.append(tests_directory)
     import tests.test_flows.dummy_learn.dummy_forest
     pca = sklearn.decomposition.PCA()
     dummy = tests.test_flows.dummy_learn.dummy_forest.DummyRegressor()
     pipeline = sklearn.pipeline.Pipeline((('pca', pca), ('dummy', dummy)))
     flow = sklearn_to_flow(pipeline)
     # In python2.7, the unit tests work differently on travis-ci; therefore,
     # I put the alternative travis-ci answer here as well. While it has a
     # different value, it is still correct as it is a propagation of the
     # subclasses' module name
     self.assertEqual(flow.external_version, '%s,%s,%s' % (
         _format_external_version('openml', openml.__version__),
         _format_external_version('sklearn', sklearn.__version__),
         _format_external_version('tests', '0.1')))
Example #22
    def test_serialize_model_with_subcomponent(self):
        model = sklearn.ensemble.AdaBoostClassifier(
            n_estimators=100,
            base_estimator=sklearn.tree.DecisionTreeClassifier())

        fixture_name = 'sklearn.ensemble.weight_boosting.AdaBoostClassifier' \
                       '(base_estimator=sklearn.tree.tree.DecisionTreeClassifier)'
        fixture_class_name = 'sklearn.ensemble.weight_boosting.AdaBoostClassifier'
        fixture_description = 'Automatically created scikit-learn flow.'
        fixture_subcomponent_name = 'sklearn.tree.tree.DecisionTreeClassifier'
        fixture_subcomponent_class_name = 'sklearn.tree.tree.DecisionTreeClassifier'
        fixture_subcomponent_description = 'Automatically created scikit-learn flow.'

        serialization = sklearn_to_flow(model)

        self.assertEqual(serialization.name, fixture_name)
        self.assertEqual(serialization.class_name, fixture_class_name)
        self.assertEqual(serialization.description, fixture_description)
        self.assertEqual(serialization.parameters['algorithm'], '"SAMME.R"')
        self.assertIsInstance(serialization.parameters['base_estimator'], str)
        self.assertEqual(serialization.parameters['learning_rate'], '1.0')
        self.assertEqual(serialization.parameters['n_estimators'], '100')
        self.assertEqual(serialization.components['base_estimator'].name,
                         fixture_subcomponent_name)
        self.assertEqual(serialization.components['base_estimator'].class_name,
                         fixture_subcomponent_class_name)
        self.assertEqual(
            serialization.components['base_estimator'].description,
            fixture_subcomponent_description)

        new_model = flow_to_sklearn(serialization)

        self.assertEqual(type(new_model), type(model))
        self.assertIsNot(new_model, model)

        self.assertIsNot(new_model.base_estimator, model.base_estimator)
        self.assertEqual(new_model.base_estimator.get_params(),
                         model.base_estimator.get_params())
        new_model_params = new_model.get_params()
        del new_model_params['base_estimator']
        model_params = model.get_params()
        del model_params['base_estimator']

        self.assertEqual(new_model_params, model_params)
        new_model.fit(self.X, self.y)
Example #23
    def test_serialize_cvobject(self):
        methods = [sklearn.model_selection.KFold(3),
                   sklearn.model_selection.LeaveOneOut()]
        fixtures = [
            OrderedDict([
                ('oml-python:serialized_object', 'cv_object'),
                ('value', OrderedDict([
                    ('name', 'sklearn.model_selection._split.KFold'),
                    ('parameters', OrderedDict([('n_splits', '3'),
                                                ('random_state', 'null'),
                                                ('shuffle', 'false')])),
                ])),
            ]),
            OrderedDict([
                ('oml-python:serialized_object', 'cv_object'),
                ('value', OrderedDict([
                    ('name', 'sklearn.model_selection._split.LeaveOneOut'),
                    ('parameters', OrderedDict()),
                ])),
            ]),
        ]
        for method, fixture in zip(methods, fixtures):
            m = sklearn_to_flow(method)
            self.assertEqual(m, fixture)

            m_new = flow_to_sklearn(m)
            self.assertIsNot(m_new, m)
            self.assertIsInstance(m_new, type(method))
Example #24
 def test_subflow_version_propagated(self):
     this_directory = os.path.dirname(os.path.abspath(__file__))
     tests_directory = os.path.abspath(
         os.path.join(this_directory, '..', '..'))
     sys.path.append(tests_directory)
     import tests.test_flows.dummy_learn.dummy_forest
     pca = sklearn.decomposition.PCA()
     dummy = tests.test_flows.dummy_learn.dummy_forest.DummyRegressor()
     pipeline = sklearn.pipeline.Pipeline((('pca', pca), ('dummy', dummy)))
     flow = sklearn_to_flow(pipeline)
     # In python2.7, the unit tests work differently on travis-ci; therefore,
     # I put the alternative travis-ci answer here as well. While it has a
     # different value, it is still correct as it is a propagation of the
     # subclasses' module name
     self.assertEqual(
         flow.external_version, '%s,%s' %
         (_format_external_version('sklearn', sklearn.__version__),
          _format_external_version('tests', '0.1')))
Example #25
    def test_serialize_model_with_subcomponent(self):
        model = sklearn.ensemble.AdaBoostClassifier(
            n_estimators=100, base_estimator=sklearn.tree.DecisionTreeClassifier())

        fixture_name = 'sklearn.ensemble.weight_boosting.AdaBoostClassifier' \
                       '(base_estimator=sklearn.tree.tree.DecisionTreeClassifier)'
        fixture_class_name = 'sklearn.ensemble.weight_boosting.AdaBoostClassifier'
        fixture_description = 'Automatically created scikit-learn flow.'
        fixture_subcomponent_name = 'sklearn.tree.tree.DecisionTreeClassifier'
        fixture_subcomponent_class_name = 'sklearn.tree.tree.DecisionTreeClassifier'
        fixture_subcomponent_description = 'Automatically created scikit-learn flow.'

        serialization = sklearn_to_flow(model)

        self.assertEqual(serialization.name, fixture_name)
        self.assertEqual(serialization.class_name, fixture_class_name)
        self.assertEqual(serialization.description, fixture_description)
        self.assertEqual(serialization.parameters['algorithm'], '"SAMME.R"')
        self.assertIsInstance(serialization.parameters['base_estimator'], str)
        self.assertEqual(serialization.parameters['learning_rate'], '1.0')
        self.assertEqual(serialization.parameters['n_estimators'], '100')
        self.assertEqual(serialization.components['base_estimator'].name,
                         fixture_subcomponent_name)
        self.assertEqual(serialization.components['base_estimator'].class_name,
                         fixture_subcomponent_class_name)
        self.assertEqual(serialization.components['base_estimator'].description,
                         fixture_subcomponent_description)

        new_model = flow_to_sklearn(serialization)

        self.assertEqual(type(new_model), type(model))
        self.assertIsNot(new_model, model)

        self.assertIsNot(new_model.base_estimator, model.base_estimator)
        self.assertEqual(new_model.base_estimator.get_params(),
                         model.base_estimator.get_params())
        new_model_params = new_model.get_params()
        del new_model_params['base_estimator']
        model_params = model.get_params()
        del model_params['base_estimator']

        self.assertEqual(new_model_params, model_params)
        new_model.fit(self.X, self.y)
Example #26
    def test_serialize_advanced_grid(self):
        # TODO: a GridSearchCV object should be serialized instead

        # This needs to be in its own function because we cannot simply check
        # the grids for equality: scikit-learn objects don't really support
        # the equality operator. This will only work with sklearn==0.18.
        N_FEATURES_OPTIONS = [2, 4, 8]
        C_OPTIONS = [1, 10, 100, 1000]
        grid = [{'reduce_dim': [sklearn.decomposition.PCA(iterated_power=7),
                                sklearn.decomposition.NMF()],
                 'reduce_dim__n_components': N_FEATURES_OPTIONS,
                 'classify__C': C_OPTIONS},
                {'reduce_dim': [sklearn.feature_selection.SelectKBest(
                                sklearn.feature_selection.chi2)],
                 'reduce_dim__k': N_FEATURES_OPTIONS,
                 'classify__C': C_OPTIONS}]

        serialized = sklearn_to_flow(grid)
        deserialized = flow_to_sklearn(serialized)

        self.assertEqual(grid[0]['reduce_dim'][0].get_params(),
                         deserialized[0]['reduce_dim'][0].get_params())
        self.assertIsNot(grid[0]['reduce_dim'][0],
                         deserialized[0]['reduce_dim'][0])
        self.assertEqual(grid[0]['reduce_dim'][1].get_params(),
                         deserialized[0]['reduce_dim'][1].get_params())
        self.assertIsNot(grid[0]['reduce_dim'][1],
                         deserialized[0]['reduce_dim'][1])
        self.assertEqual(grid[0]['reduce_dim__n_components'],
                         deserialized[0]['reduce_dim__n_components'])
        self.assertEqual(grid[0]['classify__C'],
                         deserialized[0]['classify__C'])
        self.assertEqual(grid[1]['reduce_dim'][0].get_params(),
                         deserialized[1]['reduce_dim'][0].get_params())
        self.assertIsNot(grid[1]['reduce_dim'][0],
                         deserialized[1]['reduce_dim'][0])
        self.assertEqual(grid[1]['reduce_dim__k'],
                         deserialized[1]['reduce_dim__k'])
        self.assertEqual(grid[1]['classify__C'],
                         deserialized[1]['classify__C'])
Example #27
def runMLAlgorithm(estimator, name, settings, RTPName=None, tooLong=False):
    acc = 0
    expectedRuntime = -1
    if settings.showRuntimePrediction and RTPName is not None:
        expectedRuntime = getAverageRuntime(RTPName, settings.task)
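    # Proceed only if the predicted runtime fits the limit, or if no
    # prediction is available (-1) and the algorithm is not flagged as too long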
    if (expectedRuntime <= settings.timeLimit and expectedRuntime != -1) \
            or (not tooLong and expectedRuntime == -1):
        if settings.removeOutliers:
            name += "_noOutlier"
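            # WithoutOutliersClassifier presumably drops training samples
            # flagged by IsolationForest before fitting the estimator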
            clf = pipeline.Pipeline(steps=[
                ('imputer', impute.SimpleImputer()),
                ('estimator', WithoutOutliersClassifier(
                    IsolationForest(behaviour='new', contamination='auto'),
                    estimator)),
            ])
        else:
            clf = pipeline.Pipeline(steps=[
                ('imputer', impute.SimpleImputer()),
                ('estimator', estimator),
            ])
        flow = flows.sklearn_to_flow(clf)
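        # With avoid_duplicate_runs=True the run is not re-executed if an
        # identical run already exists (surfaces as a PyOpenMLError below)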
        try:
            run = runs.run_flow_on_task(settings.taskId,
                                        flow,
                                        avoid_duplicate_runs=True)
        except PyOpenMLError:
            print("Run already exists in OpenML, WIP")
            return
        except Exception:
            print("An unexpected error occurred")
            return
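        # Average predictive accuracy over the folds (assumes 10 folds)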
        feval = dict(run.fold_evaluations['predictive_accuracy'][0])

        for val in feval.values():
            acc += val
        settings.addAlgorithm(name, acc / 10)
        run.publish()
        run.push_tag("auto-jupyter-notebook")
    else:
        print("Skipping run because of time limit set")
Example #28
    def test_serialize_model(self, check_dependencies_mock):
        model = sklearn.tree.DecisionTreeClassifier(criterion='entropy',
                                                    max_features='auto',
                                                    max_leaf_nodes=2000)

        fixture_name = 'sklearn.tree.tree.DecisionTreeClassifier'
        fixture_description = 'Automatically created scikit-learn flow.'
        version_fixture = 'sklearn==%s\nnumpy>=1.6.1\nscipy>=0.9' \
                          % sklearn.__version__
        fixture_parameters = \
            OrderedDict((('class_weight', 'null'),
                         ('criterion', '"entropy"'),
                         ('max_depth', 'null'),
                         ('max_features', '"auto"'),
                         ('max_leaf_nodes', '2000'),
                         ('min_impurity_split', '1e-07'),
                         ('min_samples_leaf', '1'),
                         ('min_samples_split', '2'),
                         ('min_weight_fraction_leaf', '0.0'),
                         ('presort', 'false'),
                         ('random_state', 'null'),
                         ('splitter', '"best"')))

        serialization = sklearn_to_flow(model)

        self.assertEqual(serialization.name, fixture_name)
        self.assertEqual(serialization.class_name, fixture_name)
        self.assertEqual(serialization.description, fixture_description)
        self.assertEqual(serialization.parameters, fixture_parameters)
        self.assertEqual(serialization.dependencies, version_fixture)

        new_model = flow_to_sklearn(serialization)

        self.assertEqual(type(new_model), type(model))
        self.assertIsNot(new_model, model)

        self.assertEqual(new_model.get_params(), model.get_params())
        new_model.fit(self.X, self.y)

        self.assertEqual(check_dependencies_mock.call_count, 1)
Example #29
    def test_serialize_model(self, check_dependencies_mock):
        model = sklearn.tree.DecisionTreeClassifier(criterion='entropy',
                                                    max_features='auto',
                                                    max_leaf_nodes=2000)

        fixture_name = 'sklearn.tree.tree.DecisionTreeClassifier'
        fixture_description = 'Automatically created scikit-learn flow.'
        version_fixture = 'sklearn==%s\nnumpy>=1.6.1\nscipy>=0.9' \
                          % sklearn.__version__
        fixture_parameters = \
            OrderedDict((('class_weight', 'null'),
                         ('criterion', '"entropy"'),
                         ('max_depth', 'null'),
                         ('max_features', '"auto"'),
                         ('max_leaf_nodes', '2000'),
                         ('min_impurity_split', '1e-07'),
                         ('min_samples_leaf', '1'),
                         ('min_samples_split', '2'),
                         ('min_weight_fraction_leaf', '0.0'),
                         ('presort', 'false'),
                         ('random_state', 'null'),
                         ('splitter', '"best"')))

        serialization = sklearn_to_flow(model)

        self.assertEqual(serialization.name, fixture_name)
        self.assertEqual(serialization.class_name, fixture_name)
        self.assertEqual(serialization.description, fixture_description)
        self.assertEqual(serialization.parameters, fixture_parameters)
        self.assertEqual(serialization.dependencies, version_fixture)

        new_model = flow_to_sklearn(serialization)

        self.assertEqual(type(new_model), type(model))
        self.assertIsNot(new_model, model)

        self.assertEqual(new_model.get_params(), model.get_params())
        new_model.fit(self.X, self.y)

        self.assertEqual(check_dependencies_mock.call_count, 1)
Example #30
    def test_serialize_model_clustering(self, check_dependencies_mock):
        model = sklearn.cluster.KMeans()

        fixture_name = 'sklearn.cluster.k_means_.KMeans'
        fixture_description = 'Automatically created scikit-learn flow.'
        version_fixture = 'sklearn==%s\nnumpy>=1.6.1\nscipy>=0.9' \
                          % sklearn.__version__
        fixture_parameters = \
            OrderedDict((('algorithm', '"auto"'),
                         ('copy_x', 'true'),
                         ('init', '"k-means++"'),
                         ('max_iter', '300'),
                         ('n_clusters', '8'),
                         ('n_init', '10'),
                         ('n_jobs', '1'),
                         ('precompute_distances', '"auto"'),
                         ('random_state', 'null'),
                         ('tol', '0.0001'),
                         ('verbose', '0')))

        serialization = sklearn_to_flow(model)

        self.assertEqual(serialization.name, fixture_name)
        self.assertEqual(serialization.class_name, fixture_name)
        self.assertEqual(serialization.description, fixture_description)
        self.assertEqual(serialization.parameters, fixture_parameters)
        self.assertEqual(serialization.dependencies, version_fixture)

        new_model = flow_to_sklearn(serialization)

        self.assertEqual(type(new_model), type(model))
        self.assertIsNot(new_model, model)

        self.assertEqual(new_model.get_params(), model.get_params())
        new_model.fit(self.X)

        self.assertEqual(check_dependencies_mock.call_count, 1)
Example #31
 def test_serialize_function(self):
     serialized = sklearn_to_flow(sklearn.feature_selection.chi2)
     deserialized = flow_to_sklearn(serialized)
     self.assertEqual(deserialized, sklearn.feature_selection.chi2)
Example #32
 def test_serialize_function(self):
     serialized = sklearn_to_flow(sklearn.feature_selection.chi2)
     deserialized = flow_to_sklearn(serialized)
     self.assertEqual(deserialized, sklearn.feature_selection.chi2)
Example #33
    def test_serialize_pipeline_clustering(self):
        scaler = sklearn.preprocessing.StandardScaler(with_mean=False)
        km = sklearn.cluster.KMeans()
        model = sklearn.pipeline.Pipeline(steps=(('scaler', scaler),
                                                 ('clusterer', km)))

        fixture_name = 'sklearn.pipeline.Pipeline(' \
                       'scaler=sklearn.preprocessing.data.StandardScaler,' \
                       'clusterer=sklearn.cluster.k_means_.KMeans)'
        fixture_description = 'Automatically created scikit-learn flow.'

        serialization = sklearn_to_flow(model)

        self.assertEqual(serialization.name, fixture_name)
        self.assertEqual(serialization.description, fixture_description)

        # Comparing the pipeline
        # The parameters only have the name of the base objects (not the
        # whole flow) as value
        self.assertEqual(len(serialization.parameters), 1)
        # Hard to compare two representations of a dict due to possibly
        # different sorting; converting to JSON makes it easier
        self.assertEqual(
            json.loads(serialization.parameters['steps']),
            [{
                'oml-python:serialized_object': 'component_reference',
                'value': {
                    'key': 'scaler',
                    'step_name': 'scaler'
                }
            }, {
                'oml-python:serialized_object': 'component_reference',
                'value': {
                    'key': 'clusterer',
                    'step_name': 'clusterer'
                }
            }])

        # Checking the sub-component
        self.assertEqual(len(serialization.components), 2)
        self.assertIsInstance(serialization.components['scaler'], OpenMLFlow)
        self.assertIsInstance(serialization.components['clusterer'],
                              OpenMLFlow)

        # del serialization.model
        new_model = flow_to_sklearn(serialization)

        self.assertEqual(type(new_model), type(model))
        self.assertIsNot(new_model, model)

        self.assertEqual([step[0] for step in new_model.steps],
                         [step[0] for step in model.steps])
        self.assertIsNot(new_model.steps[0][1], model.steps[0][1])
        self.assertIsNot(new_model.steps[1][1], model.steps[1][1])

        new_model_params = new_model.get_params()
        del new_model_params['scaler']
        del new_model_params['clusterer']
        del new_model_params['steps']
        fu_params = model.get_params()
        del fu_params['scaler']
        del fu_params['clusterer']
        del fu_params['steps']

        self.assertEqual(new_model_params, fu_params)
        new_model.fit(self.X, self.y)