Example #1
    def test_configured(self):
        X_train, X_test, y_train, y_test, feature_names = self.load_data(
            multiclass=False)

        actual = GradientBoostingClassifier(random_state=42)
        config = self.get_config(actual)

        actual.set_hyperparameters(config)
        actual.fit(X_train, y_train)
        y_actual = actual.predict(X_test)

        config['max_depth'] = max(
            resolve_factor(config['max_depth_factor'], X_train.shape[1]), 2)
        del config['max_depth_factor']

        config['max_leaf_nodes'] = max(
            resolve_factor(config['max_leaf_nodes_factor'], X_train.shape[0]),
            2)
        del config['max_leaf_nodes_factor']

        config['min_samples_leaf'] = resolve_factor(
            config['min_samples_leaf_factor'], X_train.shape[0])
        del config['min_samples_leaf_factor']

        expected = HistGradientBoostingClassifier(**config,
                                                  scoring='f1_weighted',
                                                  random_state=42)
        expected.fit(X_train, y_train)
        y_expected = expected.predict(X_test)

        assert actual.get_feature_names_out(feature_names).tolist() == [
            'prediction'
        ]
        assert repr(actual.estimator_) == repr(expected)
        assert np.allclose(y_actual, y_expected)
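Every snippet on this page leans on a resolve_factor helper (and Example #10 also calls check_none) that the excerpts never define. Below is a minimal sketch reconstructed from the call sites; the exact rounding, the floor of 1, and the sentinel values accepted by check_none are assumptions, not the library's definitive implementation.

def check_none(value) -> bool:
    # Configuration spaces commonly encode None as a string sentinel (assumption).
    return value in (None, 'none', 'None')


def resolve_factor(factor, n, default=None, cs_default=None):
    # Scale a relative hyperparameter ("factor") by a dataset dimension n
    # (n_samples or n_features) to get an absolute value. If the factor still
    # equals its configuration-space default (cs_default), return `default`
    # instead so that sklearn's own default takes over.
    if factor is None or factor == cs_default:
        return default
    return max(int(round(factor * n)), 1)  # floor of 1 is an assumption

For example, with 20 features a max_depth_factor of 0.5 resolves to a concrete max_depth of 10, while the configuration-space default of 1.0 resolves to None and leaves the depth to sklearn:

assert resolve_factor(0.5, 20, cs_default=1.) == 10
assert resolve_factor(1.0, 20, cs_default=1.) is None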
Example #2
    def test_configured(self):
        X_train, X_test, y_train, y_test, feature_names = self.load_data()

        actual = RandomTreesEmbeddingComponent(random_state=42)
        config = self.get_config(actual)

        actual.set_hyperparameters(config)
        actual.fit(X_train, y_train)
        X_actual = actual.transform(np.copy(X_test))

        config['max_depth'] = max(
            resolve_factor(config['max_depth_factor'], X_train.shape[1]), 2)
        del config['max_depth_factor']

        config['min_samples_leaf'] = resolve_factor(
            config['min_samples_leaf_factor'], X_train.shape[0])
        del config['min_samples_leaf_factor']

        config['min_samples_split'] = max(
            resolve_factor(config['min_samples_split_factor'],
                           X_train.shape[0]), 2)
        del config['min_samples_split_factor']

        expected = sklearn.ensemble.RandomTreesEmbedding(**config,
                                                         n_jobs=1,
                                                         sparse_output=False,
                                                         random_state=42)
        expected.fit(X_train, y_train)
        X_expected = expected.transform(X_test)

        # assert actual.get_feature_names_out(feature_names).tolist() == []
        assert repr(actual.estimator_) == repr(expected)
        assert np.allclose(X_actual, X_expected)
Example #3
    def test_configured(self):
        X_train, X_test, y_train, y_test, feature_names = self.load_data()

        actual = ExtraTreesClassifier(random_state=42)
        config = self.get_config(actual)

        actual.set_hyperparameters(config)
        actual.fit(X_train, y_train)
        y_actual = actual.predict(X_test)

        config['max_depth'] = max(
            resolve_factor(config['max_depth_factor'], X_train.shape[1]), 2)
        del config['max_depth_factor']

        config['max_leaf_nodes'] = max(
            resolve_factor(config['max_leaf_nodes_factor'], X_train.shape[0]),
            2)
        del config['max_leaf_nodes_factor']

        expected = sklearn.ensemble.ExtraTreesClassifier(**config,
                                                         random_state=42)
        expected.fit(X_train, y_train)
        y_expected = expected.predict(X_test)

        assert actual.get_feature_names_out(feature_names).tolist() == [
            'prediction'
        ]
        assert repr(actual.estimator_) == repr(expected)
        assert np.allclose(y_actual, y_expected)
Example #4
    def to_sklearn(self, n_samples: int = 0, n_features: int = 0, **kwargs):
        from sklearn.tree import DecisionTreeClassifier

        # Heuristic to set the tree depth
        max_depth = resolve_factor(self.max_depth_factor,
                                   n_features,
                                   cs_default=1.)
        if max_depth is not None:
            max_depth = max(max_depth, 2)

        # Heuristic to set the tree width
        max_leaf_nodes = resolve_factor(self.max_leaf_nodes_factor,
                                        n_samples,
                                        cs_default=1.)
        if max_leaf_nodes is not None:
            max_leaf_nodes = max(max_leaf_nodes, 2)

        # Heuristic to set max features
        max_features = resolve_factor(self.max_features_factor,
                                      n_features,
                                      cs_default=1.,
                                      default=None)
        if max_features is not None:
            max_features = max(max_features, 1)

        # Heuristic to set min_samples_split
        min_samples_split = resolve_factor(self.min_samples_split_factor,
                                           n_samples,
                                           default=2,
                                           cs_default=0.0001)
        if min_samples_split is not None:
            min_samples_split = max(min_samples_split, 2)

        # Heuristic to set min_samples_leaf
        min_samples_leaf = resolve_factor(self.min_samples_leaf_factor,
                                          n_samples,
                                          default=1,
                                          cs_default=0.0001)
        if min_samples_leaf is not None:
            min_samples_leaf = max(min_samples_leaf, 1)

        return DecisionTreeClassifier(
            criterion=self.criterion,
            splitter=self.splitter,
            max_depth=max_depth,
            max_features=max_features,
            min_samples_split=min_samples_split,
            min_samples_leaf=min_samples_leaf,
            max_leaf_nodes=max_leaf_nodes,
            min_weight_fraction_leaf=self.min_weight_fraction_leaf,
            min_impurity_decrease=self.min_impurity_decrease,
            class_weight=self.class_weight,
            ccp_alpha=self.ccp_alpha,
            random_state=self.random_state)
Example #5
    def to_sklearn(self, n_samples: int = 0, n_features: int = 0, **kwargs):
        from sklearn.ensemble import RandomTreesEmbedding

        self.n_estimators = int(self.n_estimators)
        self.min_weight_fraction_leaf = float(self.min_weight_fraction_leaf)

        # Heuristic to set the tree depth
        if isinstance(self.max_depth_factor, int):
            max_depth = self.max_depth_factor
        else:
            max_depth = resolve_factor(self.max_depth_factor,
                                       n_features,
                                       default=5,
                                       cs_default=1.)
        if max_depth is not None:
            max_depth = max(max_depth, 2)

        # Heuristic to set the tree width
        max_leaf_nodes = resolve_factor(self.max_leaf_nodes_factor,
                                        n_samples,
                                        cs_default=1.)
        if max_leaf_nodes is not None:
            max_leaf_nodes = max(max_leaf_nodes, 2)

        # Heuristic to set min_samples_split
        min_samples_split = resolve_factor(self.min_samples_split_factor,
                                           n_samples,
                                           default=2,
                                           cs_default=0.0001)
        if min_samples_split is not None:
            min_samples_split = max(min_samples_split, 2)

        # Heuristic to set min_samples_leaf
        min_samples_leaf = resolve_factor(self.min_samples_leaf_factor,
                                          n_samples,
                                          default=1,
                                          cs_default=0.0001)
        if min_samples_leaf is not None:
            min_samples_leaf = max(min_samples_leaf, 1)

        return RandomTreesEmbedding(
            n_estimators=self.n_estimators,
            max_depth=max_depth,
            min_samples_split=min_samples_split,
            min_samples_leaf=min_samples_leaf,
            max_leaf_nodes=max_leaf_nodes,
            min_weight_fraction_leaf=self.min_weight_fraction_leaf,
            min_impurity_decrease=self.min_impurity_decrease,
            sparse_output=False,
            n_jobs=1,
            random_state=self.random_state)
Example #6
    def test_configured(self):
        X_train, X_test, y_train, y_test, feature_names = self.load_data()

        actual = SelectKBestComponent()
        config = self.get_config(actual, seed=0)

        actual.set_hyperparameters(config)
        actual.fit(X_train, y_train)
        X_actual = actual.transform(np.copy(X_test))

        if config['score_func'] == "chi2":
            config['score_func'] = sklearn.feature_selection.chi2
        elif config['score_func'] == "f_classif":
            config['score_func'] = sklearn.feature_selection.f_classif
        elif config['score_func'] == "mutual_info":
            config['score_func'] = sklearn.feature_selection.mutual_info_classif
        elif config['score_func'] == "f_regression":
            config['score_func'] = sklearn.feature_selection.f_regression

        config['k'] = resolve_factor(config['k_factor'], X_train.shape[1])
        del config['k_factor']

        expected = sklearn.feature_selection.SelectKBest(**config)
        expected.fit(X_train, y_train)
        X_expected = expected.transform(X_test)

        assert actual.get_feature_names_out(feature_names).tolist() == ['petal length (cm)', 'petal width (cm)']
        assert repr(actual.estimator_) == repr(expected)
        assert np.allclose(X_actual, X_expected)
Example #7
    def to_sklearn(self, n_samples: int = 0, n_features: int = 0, **kwargs):
        from sklearn.cluster import FeatureAgglomeration

        if self.pooling_func == "mean":
            pooling_func = np.mean
        elif self.pooling_func == "median":
            pooling_func = np.median
        elif self.pooling_func == "max":
            pooling_func = np.max
        else:
            raise ValueError(f"Unknown pooling function '{self.pooling_func}'")

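        # sklearn requires compute_full_tree=True whenever distance_threshold is set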
        if self.distance_threshold is not None:
            n_clusters = None
            self.compute_full_tree = True
        else:
            if isinstance(self.n_clusters_factor, int):
                n_clusters = self.n_clusters_factor
            else:
                n_clusters = max(min(resolve_factor(self.n_clusters_factor, n_features, default=2, cs_default=1.),
                                     (n_features - 1)), 2)

        return FeatureAgglomeration(n_clusters=n_clusters,
                                    affinity=self.affinity,
                                    compute_full_tree=self.compute_full_tree,
                                    linkage=self.linkage,
                                    distance_threshold=self.distance_threshold,
                                    pooling_func=pooling_func)
Example #8
    def test_configured(self):
        X_train, X_test, y_train, y_test, feature_names = self.load_data()

        actual = FeatureAgglomerationComponent()
        config = self.get_config(actual)

        actual.set_hyperparameters(config)
        actual.fit(X_train, y_train)
        X_actual = actual.transform(np.copy(X_test))

        if config['pooling_func'] == "mean":
            config['pooling_func'] = np.mean
        elif config['pooling_func'] == "median":
            config['pooling_func'] = np.median
        elif config['pooling_func'] == "max":
            config['pooling_func'] = np.max

        config['n_clusters'] = max(
            min(resolve_factor(config['n_clusters_factor'], X_train.shape[1]),
                (X_train.shape[1] - 1)), 2)
        del config['n_clusters_factor']

        expected = sklearn.cluster.FeatureAgglomeration(**config)
        expected.fit(X_train, y_train)
        X_expected = expected.transform(X_test)

        assert actual.get_feature_names_out(feature_names).tolist() == [
            'cluster_{}'.format(i) for i in range(expected.n_clusters)
        ]
        assert repr(actual.estimator_) == repr(expected)
        assert np.allclose(X_actual, X_expected)
Example #9
    def test_configured(self):
        X_train, X_test, y_train, y_test, feature_names = self.load_data()

        actual = LinearDiscriminantAnalysis()
        config = self.get_config(actual)

        actual.set_hyperparameters(config)
        actual.fit(X_train, y_train)
        y_actual = actual.predict(X_test)

        config['n_components'] = resolve_factor(
            config['n_components_factor'],
            min(X_train.shape[1],
                len(np.unique(y_train)) - 1),
            cs_default=0.5)
        del config['n_components_factor']

        expected = sklearn.discriminant_analysis.LinearDiscriminantAnalysis(
            **config)
        expected.fit(X_train, y_train)
        y_expected = expected.predict(X_test)

        assert actual.get_feature_names_out(feature_names).tolist() == [
            'prediction'
        ]
        assert repr(actual.estimator_) == repr(expected)
        assert np.allclose(y_actual, y_expected)
Example #10
    def to_sklearn(self, n_samples: int = 0, n_features: int = 0, **kwargs):
        from sklearn.ensemble import HistGradientBoostingClassifier

        if check_none(self.scoring):
            self.scoring = None

        # Heuristic to set the tree depth
        max_depth = resolve_factor(self.max_depth_factor, n_features, cs_default=1.)
        if max_depth is not None:
            max_depth = max(max_depth, 2)

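        # 1e-07 is presumably this hyperparameter's config-space default; map it back to sklearn's default of 0.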
        l2_regularization = 0. if self.l2_regularization == 1e-07 else self.l2_regularization

        # Heuristic to set the tree width
        if isinstance(self.max_leaf_nodes_factor, int):
            max_leaf_nodes = self.max_leaf_nodes_factor
        else:
            max_leaf_nodes = resolve_factor(self.max_leaf_nodes_factor, n_samples, default=31, cs_default=1.)
        if max_leaf_nodes is not None:
            max_leaf_nodes = max(max_leaf_nodes, 2)

        # Heuristic to set min_samples_leaf
        if isinstance(self.min_samples_leaf_factor, int):
            min_samples_leaf = self.min_samples_leaf_factor
        else:
            min_samples_leaf = resolve_factor(self.min_samples_leaf_factor, n_samples, default=20, cs_default=0.0001)

        n_iter_no_change = None if self.n_iter_no_change == 0 else self.n_iter_no_change

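        # Repair a misspelling ('balanced_accurary') that can occur in stored configurations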
        if self.scoring == 'balanced_accurary':
            self.scoring = 'balanced_accuracy'

        return HistGradientBoostingClassifier(
            loss=self.loss,
            learning_rate=self.learning_rate,
            max_iter=self.max_iter,
            min_samples_leaf=min_samples_leaf,
            max_depth=max_depth,
            max_leaf_nodes=max_leaf_nodes,
            max_bins=self.max_bins,
            l2_regularization=l2_regularization,
            tol=self.tol,
            scoring=self.scoring,
            n_iter_no_change=n_iter_no_change,
            validation_fraction=self.validation_fraction,
            random_state=self.random_state,
        )
Example #11
    def to_sklearn(self, n_samples: int = 0, n_features: int = 0, **kwargs):
        from sklearn.decomposition import PCA

        n_components = resolve_factor(self.keep_variance,
                                      min(n_samples, n_features),
                                      cs_default=0.9999)
        return PCA(n_components=n_components,
                   whiten=self.whiten,
                   random_state=self.random_state,
                   svd_solver=self.svd_solver,
                   tol=self.tol,
                   iterated_power=self.iterated_power)
Example #12
    def to_sklearn(self, n_samples: int = 0, n_features: int = 0, **kwargs):
        from sklearn.decomposition import TruncatedSVD

        n_components = min(
            resolve_factor(self.n_components_factor,
                           min(n_samples, n_features)),
            min(n_samples, n_features) - 1)
        return TruncatedSVD(n_components=n_components,
                            algorithm=self.algorithm,
                            n_iter=self.n_iter,
                            tol=self.tol,
                            random_state=self.random_state)
Example #13
    def to_sklearn(self, n_samples: int = 0, n_features: int = 0, **kwargs):
        from sklearn.decomposition import FactorAnalysis

        n_components = resolve_factor(self.n_components_factor,
                                      n_features,
                                      cs_default=1.)
        return FactorAnalysis(n_components=n_components,
                              svd_method=self.svd_method,
                              max_iter=self.max_iter,
                              iterated_power=self.iterated_power,
                              tol=self.tol,
                              random_state=self.random_state)
Example #14
    def to_sklearn(self, n_samples: int = 0, n_features: int = 0, **kwargs):
        from sklearn.decomposition import FastICA

        n_components = resolve_factor(self.n_components_factor,
                                      min(n_samples, n_features),
                                      cs_default=1.)
        return FastICA(n_components=n_components,
                       algorithm=self.algorithm,
                       whiten=self.whiten,
                       fun=self.fun,
                       max_iter=self.max_iter,
                       random_state=self.random_state,
                       tol=self.tol)
Example #15
    def to_sklearn(self, n_samples: int = 0, n_features: int = 0, **kwargs):
        from sklearn.ensemble import ExtraTreesClassifier

        # Heuristic to set the tree depth
        max_depth = resolve_factor(self.max_depth_factor,
                                   n_features,
                                   cs_default=1.)
        if max_depth is not None:
            max_depth = max(max_depth, 2)

        # Heuristic to set the tree width
        max_leaf_nodes = resolve_factor(self.max_leaf_nodes_factor,
                                        n_samples,
                                        cs_default=1.)
        if max_leaf_nodes is not None:
            max_leaf_nodes = max(max_leaf_nodes, 2)

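        # 0.5 and 0.0001 are presumably config-space defaults; map them back to the sklearn defaults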
        max_features = 'auto' if self.max_features == 0.5 else self.max_features
        min_samples_leaf = 1 if self.min_samples_leaf == 0.0001 else self.min_samples_leaf
        min_samples_split = 2 if self.min_samples_split == 0.0001 else self.min_samples_split

        return ExtraTreesClassifier(
            n_estimators=self.n_estimators,
            criterion=self.criterion,
            max_features=max_features,
            max_depth=max_depth,
            min_samples_split=min_samples_split,
            min_samples_leaf=min_samples_leaf,
            min_weight_fraction_leaf=self.min_weight_fraction_leaf,
            bootstrap=self.bootstrap,
            max_leaf_nodes=max_leaf_nodes,
            min_impurity_decrease=self.min_impurity_decrease,
            ccp_alpha=self.ccp_alpha,
            random_state=self.random_state,
            class_weight=self.class_weight)
Example #16
    def to_sklearn(self, n_samples: int = 0, n_features: int = 0, **kwargs):
        from sklearn.feature_selection import chi2, f_classif, mutual_info_classif, f_regression
        if self.score_func == "chi2":
            score_func = chi2
        elif self.score_func == "f_classif":
            score_func = f_classif
        elif self.score_func == "mutual_info":
            score_func = mutual_info_classif
        elif self.score_func == "f_regression":
            score_func = f_regression
        else:
            raise ValueError("score_func must be in ('chi2, 'f_classif', 'mutual_info'), but is: %s" % self.score_func)

        from sklearn.feature_selection import SelectKBest
        k = resolve_factor(self.k_factor, n_features)
        return SelectKBest(score_func=score_func, k=k)
Example #17
    def to_sklearn(self,
                   n_samples: int = 0,
                   n_features: int = 0,
                   n_classes: int = 0,
                   **kwargs):
        from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

        n_components = resolve_factor(self.n_components_factor,
                                      min(n_features, n_classes - 1),
                                      cs_default=1.,
                                      default=None)

        return LinearDiscriminantAnalysis(solver=self.solver,
                                          shrinkage=self.shrinkage,
                                          tol=self.tol,
                                          n_components=n_components)
Example #18
    def test_configured(self):
        X_train, X_test, y_train, y_test, feature_names = self.load_data()

        actual = KernelPCAComponent(random_state=42)
        config = self.get_config(actual, seed=0)

        actual.set_hyperparameters(config)
        actual.fit(X_train, y_train)
        X_actual = actual.transform(np.copy(X_test))

        config['n_components'] = resolve_factor(config['n_components_factor'], min(*X_train.shape))
        del config['n_components_factor']

        expected = sklearn.decomposition.KernelPCA(**config, n_jobs=1, copy_X=False, random_state=42)
        expected.fit(X_train, y_train)
        X_expected = expected.transform(X_test)

        assert actual.get_feature_names_out(feature_names).tolist() == ['principal_component_0',
                                                                        'principal_component_1']
        assert repr(actual.estimator_) == repr(expected)
        assert np.allclose(X_actual, X_expected)
Example #19
    def test_configured(self):
        X_train, X_test, y_train, y_test, feature_names = self.load_data()

        actual = FastICAComponent(random_state=42)
        config = self.get_config(actual, seed=0)

        actual.set_hyperparameters(config)
        actual.fit(X_train, y_train)
        X_actual = actual.transform(np.copy(X_test))

        config['n_components'] = resolve_factor(config['n_components_factor'],
                                                min(*X_train.shape))
        del config['n_components_factor']

        expected = sklearn.decomposition.FastICA(**config, random_state=42)
        expected.fit(X_train, y_train)
        X_expected = expected.transform(X_test)

        assert actual.get_feature_names_out(feature_names).tolist() == [
            'independent_component_{}'.format(i) for i in range(4)
        ]
        assert repr(actual.estimator_) == repr(expected)
        assert np.allclose(X_actual, X_expected)
Example #20
    def test_configured(self):
        X_train, X_test, y_train, y_test, feature_names = self.load_data()

        actual = FactorAnalysisComponent(random_state=42)
        config = self.get_config(actual)

        actual.set_hyperparameters(config)
        actual.fit(np.copy(X_train), np.copy(y_train))
        X_actual = actual.transform(np.copy(X_test))

        config['n_components'] = resolve_factor(config['n_components_factor'],
                                                X_train.shape[1])
        del config['n_components_factor']

        expected = sklearn.decomposition.FactorAnalysis(**config,
                                                        random_state=42)
        expected.fit(X_train, y_train)
        X_expected = expected.transform(X_test)

        assert actual.get_feature_names_out(feature_names).tolist() == [
            f'factor_{f}' for f in range(4)
        ]
        assert repr(actual.estimator_) == repr(expected)
        assert np.allclose(X_actual, X_expected)
Example #21
    def test_configured(self):
        X_train, X_test, y_train, y_test, feature_names = self.load_data()

        actual = PCAComponent(random_state=42)
        config = self.get_config(actual, seed=0)

        actual.set_hyperparameters(config)
        actual.fit(np.copy(X_train), np.copy(y_train))
        X_actual = actual.transform(np.copy(X_test))

        config['n_components'] = resolve_factor(config['keep_variance'],
                                                min(*X_train.shape))
        del config['keep_variance']

        expected = PCA(**config, random_state=42)
        expected.fit(np.copy(X_train), np.copy(y_train))
        X_expected = expected.transform(X_test)

        assert actual.get_feature_names_out(feature_names).tolist() == [
            'principal_component_0', 'principal_component_1',
            'principal_component_2'
        ]
        assert repr(actual.estimator_) == repr(expected)
        assert np.allclose(X_actual, X_expected)
Example #22
    def to_sklearn(self, n_samples: int = 0, n_features: int = 0, **kwargs):
        from sklearn.decomposition import KernelPCA

        n_components = resolve_factor(self.n_components_factor,
                                      min(n_samples, n_features),
                                      cs_default=1.)

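        # gamma=1.0 and max_iter=100 presumably mark the config-space defaults; fall back to sklearn's None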
        gamma = None if self.gamma == 1. else self.gamma
        max_iter = None if self.max_iter == 100 else self.max_iter

        return KernelPCA(n_components=n_components,
                         kernel=self.kernel,
                         degree=self.degree,
                         gamma=gamma,
                         coef0=self.coef0,
                         random_state=self.random_state,
                         alpha=self.alpha,
                         fit_inverse_transform=self.fit_inverse_transform,
                         eigen_solver=self.eigen_solver,
                         tol=self.tol,
                         max_iter=max_iter,
                         remove_zero_eig=self.remove_zero_eig,
                         n_jobs=1,
                         copy_X=False)