def test_configured(self):
    X_train, X_test, y_train, y_test, feature_names = self.load_data(multiclass=False)

    actual = GradientBoostingClassifier(random_state=42)
    config = self.get_config(actual)
    actual.set_hyperparameters(config)
    actual.fit(X_train, y_train)
    y_actual = actual.predict(X_test)

    config['max_depth'] = max(resolve_factor(config['max_depth_factor'], X_train.shape[1]), 2)
    del config['max_depth_factor']

    config['max_leaf_nodes'] = max(resolve_factor(config['max_leaf_nodes_factor'], X_train.shape[0]), 2)
    del config['max_leaf_nodes_factor']

    config['min_samples_leaf'] = resolve_factor(config['min_samples_leaf_factor'], X_train.shape[0])
    del config['min_samples_leaf_factor']

    expected = HistGradientBoostingClassifier(**config, scoring='f1_weighted', random_state=42)
    expected.fit(X_train, y_train)
    y_expected = expected.predict(X_test)

    assert actual.get_feature_names_out(feature_names).tolist() == ['prediction']
    assert repr(actual.estimator_) == repr(expected)
    assert np.allclose(y_actual, y_expected)
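# Every snippet in this section relies on a `resolve_factor` helper that turns
# a relative *_factor hyperparameter into an absolute integer. Its definition
# is not shown here; the following is a minimal sketch of the assumed
# behaviour, not the library's actual implementation: the factor is scaled by
# a reference size (n_features or n_samples), and a factor equal to the
# config-space default `cs_default` maps back to the plain `default`.

def resolve_factor(factor, n, default=None, cs_default=None):
    # A missing factor, or one equal to the config-space default, means
    # "fall back to the estimator's own default" (often None).
    if factor is None or factor == cs_default:
        return default
    # Otherwise scale the factor by the reference size, with a floor of 1.
    return max(int(round(factor * n)), 1)

# Worked example under this sketch: with n_features=20 and
# max_depth_factor=0.25, resolve_factor(0.25, 20) == 5,
# so max_depth = max(5, 2) == 5.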
def test_configured(self):
    X_train, X_test, y_train, y_test, feature_names = self.load_data()

    actual = RandomTreesEmbeddingComponent(random_state=42)
    config = self.get_config(actual)
    actual.set_hyperparameters(config)
    actual.fit(X_train, y_train)
    X_actual = actual.transform(np.copy(X_test))

    config['max_depth'] = max(resolve_factor(config['max_depth_factor'], X_train.shape[1]), 2)
    del config['max_depth_factor']

    config['min_samples_leaf'] = resolve_factor(config['min_samples_leaf_factor'], X_train.shape[0])
    del config['min_samples_leaf_factor']

    config['min_samples_split'] = max(resolve_factor(config['min_samples_split_factor'], X_train.shape[0]), 2)
    del config['min_samples_split_factor']

    expected = sklearn.ensemble.RandomTreesEmbedding(**config, n_jobs=1, sparse_output=False, random_state=42)
    expected.fit(X_train, y_train)
    X_expected = expected.transform(X_test)

    # assert actual.get_feature_names_out(feature_names).tolist() == []
    assert repr(actual.estimator_) == repr(expected)
    assert np.allclose(X_actual, X_expected)
def test_configured(self):
    X_train, X_test, y_train, y_test, feature_names = self.load_data()

    actual = ExtraTreesClassifier(random_state=42)
    config = self.get_config(actual)
    actual.set_hyperparameters(config)
    actual.fit(X_train, y_train)
    y_actual = actual.predict(X_test)

    config['max_depth'] = max(resolve_factor(config['max_depth_factor'], X_train.shape[1]), 2)
    del config['max_depth_factor']

    config['max_leaf_nodes'] = max(resolve_factor(config['max_leaf_nodes_factor'], X_train.shape[0]), 2)
    del config['max_leaf_nodes_factor']

    expected = sklearn.ensemble.ExtraTreesClassifier(**config, random_state=42)
    expected.fit(X_train, y_train)
    y_expected = expected.predict(X_test)

    assert actual.get_feature_names_out(feature_names).tolist() == ['prediction']
    assert repr(actual.estimator_) == repr(expected)
    assert np.allclose(y_actual, y_expected)
def to_sklearn(self, n_samples: int = 0, n_features: int = 0, **kwargs):
    from sklearn.tree import DecisionTreeClassifier

    # Heuristic to set the tree depth
    max_depth = resolve_factor(self.max_depth_factor, n_features, cs_default=1.)
    if max_depth is not None:
        max_depth = max(max_depth, 2)

    # Heuristic to set the tree width
    max_leaf_nodes = resolve_factor(self.max_leaf_nodes_factor, n_samples, cs_default=1.)
    if max_leaf_nodes is not None:
        max_leaf_nodes = max(max_leaf_nodes, 2)

    # Heuristic to set max_features
    max_features = resolve_factor(self.max_features_factor, n_features, cs_default=1., default=None)
    if max_features is not None:
        max_features = max(max_features, 1)

    # Heuristic to set min_samples_split
    min_samples_split = resolve_factor(self.min_samples_split_factor, n_samples, default=2, cs_default=0.0001)
    if min_samples_split is not None:
        min_samples_split = max(min_samples_split, 2)

    # Heuristic to set min_samples_leaf
    min_samples_leaf = resolve_factor(self.min_samples_leaf_factor, n_samples, default=1, cs_default=0.0001)
    if min_samples_leaf is not None:
        min_samples_leaf = max(min_samples_leaf, 1)

    return DecisionTreeClassifier(
        criterion=self.criterion,
        splitter=self.splitter,
        max_depth=max_depth,
        max_features=max_features,
        min_samples_split=min_samples_split,
        min_samples_leaf=min_samples_leaf,
        max_leaf_nodes=max_leaf_nodes,
        min_weight_fraction_leaf=self.min_weight_fraction_leaf,
        min_impurity_decrease=self.min_impurity_decrease,
        class_weight=self.class_weight,
        ccp_alpha=self.ccp_alpha,
        random_state=self.random_state)
def to_sklearn(self, n_samples: int = 0, n_features: int = 0, **kwargs):
    from sklearn.ensemble import RandomTreesEmbedding

    self.n_estimators = int(self.n_estimators)
    self.min_weight_fraction_leaf = float(self.min_weight_fraction_leaf)

    # Heuristic to set the tree depth
    if isinstance(self.max_depth_factor, int):
        max_depth = self.max_depth_factor
    else:
        max_depth = resolve_factor(self.max_depth_factor, n_features, default=5, cs_default=1.)
    if max_depth is not None:
        max_depth = max(max_depth, 2)

    # Heuristic to set the tree width
    max_leaf_nodes = resolve_factor(self.max_leaf_nodes_factor, n_samples, cs_default=1.)
    if max_leaf_nodes is not None:
        max_leaf_nodes = max(max_leaf_nodes, 2)

    # Heuristic to set min_samples_split
    min_samples_split = resolve_factor(self.min_samples_split_factor, n_samples, default=2, cs_default=0.0001)
    if min_samples_split is not None:
        min_samples_split = max(min_samples_split, 2)

    # Heuristic to set min_samples_leaf
    min_samples_leaf = resolve_factor(self.min_samples_leaf_factor, n_samples, default=1, cs_default=0.0001)
    if min_samples_leaf is not None:
        min_samples_leaf = max(min_samples_leaf, 1)

    return RandomTreesEmbedding(
        n_estimators=self.n_estimators,
        max_depth=max_depth,
        min_samples_split=min_samples_split,
        min_samples_leaf=min_samples_leaf,
        max_leaf_nodes=max_leaf_nodes,
        min_weight_fraction_leaf=self.min_weight_fraction_leaf,
        min_impurity_decrease=self.min_impurity_decrease,
        sparse_output=False,
        n_jobs=1,
        random_state=self.random_state)
def test_configured(self):
    X_train, X_test, y_train, y_test, feature_names = self.load_data()

    actual = SelectKBestComponent()
    config = self.get_config(actual, seed=0)
    actual.set_hyperparameters(config)
    actual.fit(X_train, y_train)
    X_actual = actual.transform(np.copy(X_test))

    if config['score_func'] == "chi2":
        config['score_func'] = sklearn.feature_selection.chi2
    elif config['score_func'] == "f_classif":
        config['score_func'] = sklearn.feature_selection.f_classif
    elif config['score_func'] == "mutual_info":
        config['score_func'] = sklearn.feature_selection.mutual_info_classif
    elif config['score_func'] == "f_regression":
        config['score_func'] = sklearn.feature_selection.f_regression

    config['k'] = resolve_factor(config['k_factor'], X_train.shape[1])
    del config['k_factor']

    expected = sklearn.feature_selection.SelectKBest(**config)
    expected.fit(X_train, y_train)
    X_expected = expected.transform(X_test)

    assert actual.get_feature_names_out(feature_names).tolist() == ['petal length (cm)', 'petal width (cm)']
    assert repr(actual.estimator_) == repr(expected)
    assert np.allclose(X_actual, X_expected)
def to_sklearn(self, n_samples: int = 0, n_features: int = 0, **kwargs):
    from sklearn.cluster import FeatureAgglomeration

    if self.pooling_func == "mean":
        pooling_func = np.mean
    elif self.pooling_func == "median":
        pooling_func = np.median
    elif self.pooling_func == "max":
        pooling_func = np.max
    else:
        raise ValueError(f"Unknown pooling function '{self.pooling_func}'")

    if self.distance_threshold is not None:
        n_clusters = None
        self.compute_full_tree = True
    else:
        if isinstance(self.n_clusters_factor, int):
            n_clusters = self.n_clusters_factor
        else:
            n_clusters = max(min(resolve_factor(self.n_clusters_factor, n_features, default=2, cs_default=1.),
                                 n_features - 1), 2)

    return FeatureAgglomeration(
        n_clusters=n_clusters,
        affinity=self.affinity,
        compute_full_tree=self.compute_full_tree,
        linkage=self.linkage,
        distance_threshold=self.distance_threshold,
        pooling_func=pooling_func)
def test_configured(self):
    X_train, X_test, y_train, y_test, feature_names = self.load_data()

    actual = FeatureAgglomerationComponent()
    config = self.get_config(actual)
    actual.set_hyperparameters(config)
    actual.fit(X_train, y_train)
    X_actual = actual.transform(np.copy(X_test))

    if config['pooling_func'] == "mean":
        config['pooling_func'] = np.mean
    elif config['pooling_func'] == "median":
        config['pooling_func'] = np.median
    elif config['pooling_func'] == "max":
        config['pooling_func'] = np.max

    config['n_clusters'] = max(min(resolve_factor(config['n_clusters_factor'], X_train.shape[1]),
                                   X_train.shape[1] - 1), 2)
    del config['n_clusters_factor']

    expected = sklearn.cluster.FeatureAgglomeration(**config)
    expected.fit(X_train, y_train)
    X_expected = expected.transform(X_test)

    assert actual.get_feature_names_out(feature_names).tolist() == [
        'cluster_{}'.format(i) for i in range(expected.n_clusters)
    ]
    assert repr(actual.estimator_) == repr(expected)
    assert np.allclose(X_actual, X_expected)
def test_configured(self):
    X_train, X_test, y_train, y_test, feature_names = self.load_data()

    actual = LinearDiscriminantAnalysis()
    config = self.get_config(actual)
    actual.set_hyperparameters(config)
    actual.fit(X_train, y_train)
    y_actual = actual.predict(X_test)

    config['n_components'] = resolve_factor(config['n_components_factor'],
                                            min(X_train.shape[1], len(np.unique(y_train)) - 1),
                                            cs_default=0.5)
    del config['n_components_factor']

    expected = sklearn.discriminant_analysis.LinearDiscriminantAnalysis(**config)
    expected.fit(X_train, y_train)
    y_expected = expected.predict(X_test)

    assert actual.get_feature_names_out(feature_names).tolist() == ['prediction']
    assert repr(actual.estimator_) == repr(expected)
    assert np.allclose(y_actual, y_expected)
def to_sklearn(self, n_samples: int = 0, n_features: int = 0, **kwargs):
    from sklearn.ensemble._hist_gradient_boosting.gradient_boosting import HistGradientBoostingClassifier

    if check_none(self.scoring):
        self.scoring = None

    # Heuristic to set the tree depth
    max_depth = resolve_factor(self.max_depth_factor, n_features, cs_default=1.)
    if max_depth is not None:
        max_depth = max(max_depth, 2)

    # Map the config-space default back to the sklearn default
    l2_regularization = 0. if self.l2_regularization == 1e-07 else self.l2_regularization

    # Heuristic to set the tree width
    if isinstance(self.max_leaf_nodes_factor, int):
        max_leaf_nodes = self.max_leaf_nodes_factor
    else:
        max_leaf_nodes = resolve_factor(self.max_leaf_nodes_factor, n_samples, default=31, cs_default=1.)
    if max_leaf_nodes is not None:
        max_leaf_nodes = max(max_leaf_nodes, 2)

    # Heuristic to set min_samples_leaf
    if isinstance(self.min_samples_leaf_factor, int):
        min_samples_leaf = self.min_samples_leaf_factor
    else:
        min_samples_leaf = resolve_factor(self.min_samples_leaf_factor, n_samples, default=20, cs_default=0.0001)

    n_iter_no_change = None if self.n_iter_no_change == 0 else self.n_iter_no_change

    # Repair a misspelled scoring value
    if self.scoring == 'balanced_accurary':
        self.scoring = 'balanced_accuracy'

    return HistGradientBoostingClassifier(
        loss=self.loss,
        learning_rate=self.learning_rate,
        max_iter=self.max_iter,
        min_samples_leaf=min_samples_leaf,
        max_depth=max_depth,
        max_leaf_nodes=max_leaf_nodes,
        max_bins=self.max_bins,
        l2_regularization=l2_regularization,
        tol=self.tol,
        scoring=self.scoring,
        n_iter_no_change=n_iter_no_change,
        validation_fraction=self.validation_fraction,
        random_state=self.random_state,
    )
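# `check_none` (used above) is another helper whose definition is not shown in
# this section. A minimal sketch, assuming it treats the None object and its
# string serialization as missing; this is an assumption, not the library's
# actual code:

def check_none(value):
    # Config values may arrive as the string "None" rather than the object.
    return value is None or (isinstance(value, str) and value.lower() == 'none')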
def to_sklearn(self, n_samples: int = 0, n_features: int = 0, **kwargs):
    from sklearn.decomposition import PCA

    n_components = resolve_factor(self.keep_variance, min(n_samples, n_features), cs_default=0.9999)

    return PCA(n_components=n_components,
               whiten=self.whiten,
               random_state=self.random_state,
               svd_solver=self.svd_solver,
               tol=self.tol,
               iterated_power=self.iterated_power)
def to_sklearn(self, n_samples: int = 0, n_features: int = 0, **kwargs):
    from sklearn.decomposition import TruncatedSVD

    n_components = min(resolve_factor(self.n_components_factor, min(n_samples, n_features)),
                       min(n_samples, n_features) - 1)

    return TruncatedSVD(n_components=n_components,
                        algorithm=self.algorithm,
                        n_iter=self.n_iter,
                        tol=self.tol,
                        random_state=self.random_state)
def to_sklearn(self, n_samples: int = 0, n_features: int = 0, **kwargs):
    from sklearn.decomposition import FactorAnalysis

    n_components = resolve_factor(self.n_components_factor, n_features, cs_default=1.)

    return FactorAnalysis(n_components=n_components,
                          svd_method=self.svd_method,
                          max_iter=self.max_iter,
                          iterated_power=self.iterated_power,
                          tol=self.tol,
                          random_state=self.random_state)
def to_sklearn(self, n_samples: int = 0, n_features: int = 0, **kwargs):
    from sklearn.decomposition import FastICA

    n_components = resolve_factor(self.n_components_factor, min(n_samples, n_features), cs_default=1.)

    return FastICA(n_components=n_components,
                   algorithm=self.algorithm,
                   whiten=self.whiten,
                   fun=self.fun,
                   max_iter=self.max_iter,
                   random_state=self.random_state,
                   tol=self.tol)
def to_sklearn(self, n_samples: int = 0, n_features: int = 0, **kwargs):
    from sklearn.ensemble import ExtraTreesClassifier

    # Heuristic to set the tree depth
    max_depth = resolve_factor(self.max_depth_factor, n_features, cs_default=1.)
    if max_depth is not None:
        max_depth = max(max_depth, 2)

    # Heuristic to set the tree width
    max_leaf_nodes = resolve_factor(self.max_leaf_nodes_factor, n_samples, cs_default=1.)
    if max_leaf_nodes is not None:
        max_leaf_nodes = max(max_leaf_nodes, 2)

    # Map config-space defaults back to the sklearn defaults
    max_features = 'auto' if self.max_features == 0.5 else self.max_features
    min_samples_leaf = 1 if self.min_samples_leaf == 0.0001 else self.min_samples_leaf
    min_samples_split = 2 if self.min_samples_split == 0.0001 else self.min_samples_split

    return ExtraTreesClassifier(
        n_estimators=self.n_estimators,
        criterion=self.criterion,
        max_features=max_features,
        max_depth=max_depth,
        min_samples_split=min_samples_split,
        min_samples_leaf=min_samples_leaf,
        min_weight_fraction_leaf=self.min_weight_fraction_leaf,
        bootstrap=self.bootstrap,
        max_leaf_nodes=max_leaf_nodes,
        min_impurity_decrease=self.min_impurity_decrease,
        ccp_alpha=self.ccp_alpha,
        random_state=self.random_state,
        class_weight=self.class_weight)
def to_sklearn(self, n_samples: int = 0, n_features: int = 0, **kwargs):
    from sklearn.feature_selection import SelectKBest, chi2, f_classif, mutual_info_classif, f_regression

    if self.score_func == "chi2":
        score_func = chi2
    elif self.score_func == "f_classif":
        score_func = f_classif
    elif self.score_func == "mutual_info":
        score_func = mutual_info_classif
    elif self.score_func == "f_regression":
        score_func = f_regression
    else:
        raise ValueError("score_func must be in ('chi2', 'f_classif', 'mutual_info', 'f_regression'), "
                         "but is: %s" % self.score_func)

    k = resolve_factor(self.k_factor, n_features)
    return SelectKBest(score_func=score_func, k=k)
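# For reference, a quick standalone check of the resulting selector using the
# standard scikit-learn API. The data set is iris, matching the feature names
# asserted in the SelectKBest test earlier in this section; with f_classif,
# k=2 keeps the two petal features:

from sklearn.datasets import load_iris
from sklearn.feature_selection import SelectKBest, f_classif

X, y = load_iris(return_X_y=True)
selector = SelectKBest(score_func=f_classif, k=2).fit(X, y)
print(selector.get_support())  # boolean mask marking the two selected features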
def to_sklearn(self, n_samples: int = 0, n_features: int = 0, n_classes: int = 0, **kwargs):
    from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

    # n_components is bounded by min(n_features, n_classes - 1)
    n_components = resolve_factor(self.n_components_factor, min(n_features, n_classes - 1),
                                  cs_default=1., default=None)

    return LinearDiscriminantAnalysis(solver=self.solver,
                                      shrinkage=self.shrinkage,
                                      tol=self.tol,
                                      n_components=n_components)
def test_configured(self):
    X_train, X_test, y_train, y_test, feature_names = self.load_data()

    actual = KernelPCAComponent(random_state=42)
    config = self.get_config(actual, seed=0)
    actual.set_hyperparameters(config)
    actual.fit(X_train, y_train)
    X_actual = actual.transform(np.copy(X_test))

    config['n_components'] = resolve_factor(config['n_components_factor'], min(*X_train.shape))
    del config['n_components_factor']

    expected = sklearn.decomposition.KernelPCA(**config, n_jobs=1, copy_X=False, random_state=42)
    expected.fit(X_train, y_train)
    X_expected = expected.transform(X_test)

    assert actual.get_feature_names_out(feature_names).tolist() == ['principal_component_0', 'principal_component_1']
    assert repr(actual.estimator_) == repr(expected)
    assert np.allclose(X_actual, X_expected)
def test_configured(self):
    X_train, X_test, y_train, y_test, feature_names = self.load_data()

    actual = FastICAComponent(random_state=42)
    config = self.get_config(actual, seed=0)
    actual.set_hyperparameters(config)
    actual.fit(X_train, y_train)
    X_actual = actual.transform(np.copy(X_test))

    config['n_components'] = resolve_factor(config['n_components_factor'], min(*X_train.shape))
    del config['n_components_factor']

    expected = sklearn.decomposition.FastICA(**config, random_state=42)
    expected.fit(X_train, y_train)
    X_expected = expected.transform(X_test)

    assert actual.get_feature_names_out(feature_names).tolist() == [
        'independent_component_{}'.format(i) for i in range(4)
    ]
    assert repr(actual.estimator_) == repr(expected)
    assert np.allclose(X_actual, X_expected)
def test_configured(self):
    X_train, X_test, y_train, y_test, feature_names = self.load_data()

    actual = FactorAnalysisComponent(random_state=42)
    config = self.get_config(actual)
    actual.set_hyperparameters(config)
    actual.fit(np.copy(X_train), np.copy(y_train))
    X_actual = actual.transform(np.copy(X_test))

    config['n_components'] = resolve_factor(config['n_components_factor'], X_train.shape[1])
    del config['n_components_factor']

    expected = sklearn.decomposition.FactorAnalysis(**config, random_state=42)
    expected.fit(X_train, y_train)
    X_expected = expected.transform(X_test)

    assert actual.get_feature_names_out(feature_names).tolist() == [
        f'factor_{f}' for f in range(4)
    ]
    assert repr(actual.estimator_) == repr(expected)
    assert np.allclose(X_actual, X_expected)
def test_configured(self):
    X_train, X_test, y_train, y_test, feature_names = self.load_data()

    actual = PCAComponent(random_state=42)
    config = self.get_config(actual, seed=0)
    actual.set_hyperparameters(config)
    actual.fit(np.copy(X_train), np.copy(y_train))
    X_actual = actual.transform(np.copy(X_test))

    config['n_components'] = resolve_factor(config['keep_variance'], min(*X_train.shape))
    del config['keep_variance']

    expected = PCA(**config, random_state=42)
    expected.fit(np.copy(X_train), np.copy(y_train))
    X_expected = expected.transform(X_test)

    assert actual.get_feature_names_out(feature_names).tolist() == [
        'principal_component_0', 'principal_component_1', 'principal_component_2'
    ]
    assert repr(actual.estimator_) == repr(expected)
    assert np.allclose(X_actual, X_expected)
def to_sklearn(self, n_samples: int = 0, n_features: int = 0, **kwargs):
    from sklearn.decomposition import KernelPCA

    n_components = resolve_factor(self.n_components_factor, min(n_samples, n_features), cs_default=1.)

    # Map config-space defaults back to the sklearn defaults
    gamma = None if self.gamma == 1. else self.gamma
    max_iter = None if self.max_iter == 100 else self.max_iter

    return KernelPCA(n_components=n_components,
                     kernel=self.kernel,
                     degree=self.degree,
                     gamma=gamma,
                     coef0=self.coef0,
                     random_state=self.random_state,
                     alpha=self.alpha,
                     fit_inverse_transform=self.fit_inverse_transform,
                     eigen_solver=self.eigen_solver,
                     tol=self.tol,
                     max_iter=max_iter,
                     remove_zero_eig=self.remove_zero_eig,
                     n_jobs=1,
                     copy_X=False)