def _get_map_of_algorithms():
    """Build the table of scikit-learn objects that daal4py can replace.

    Returns
    -------
    dict
        Maps a short algorithm key (for example ``'pca'``) to a one-element
        list holding a ``[(module, attribute_name, replacement), None]``
        descriptor. The second slot of every descriptor starts as ``None``.
        The ``'log_reg'`` and ``'roc_auc_score'`` entries are only added when
        the corresponding ``daal_check_version`` call succeeds.
    """
    # (key, module to patch, attribute name inside that module, replacement)
    patch_targets = [
        ('pca', decomposition_module, 'PCA', PCA_daal4py),
        ('kmeans', cluster_module, 'KMeans', KMeans_daal4py),
        ('dbscan', cluster_module, 'DBSCAN', DBSCAN_daal4py),
        ('distances', pairwise, 'pairwise_distances',
         daal_pairwise_distances),
        ('linear', linear_model_module, 'LinearRegression',
         LinearRegression_daal4py),
        ('ridge', linear_model_module, 'Ridge', Ridge_daal4py),
        ('elasticnet', linear_model_module, 'ElasticNet',
         ElasticNet_daal4py),
        ('lasso', linear_model_module, 'Lasso', Lasso_daal4py),
        ('svm', svm_module, 'SVC', SVC_daal4py),
        ('logistic', logistic_module, _patched_log_reg_path_func_name,
         daal_optimized_logistic_path),
        ('knn_classifier', neighbors_module, 'KNeighborsClassifier',
         KNeighborsClassifier_daal4py),
        ('nearest_neighbors', neighbors_module, 'NearestNeighbors',
         NearestNeighbors_daal4py),
        ('knn_regressor', neighbors_module, 'KNeighborsRegressor',
         KNeighborsRegressor_daal4py),
        ('random_forest_classifier', ensemble_module,
         'RandomForestClassifier', RandomForestClassifier_daal4py),
        ('random_forest_regressor', ensemble_module,
         'RandomForestRegressor', RandomForestRegressor_daal4py),
        ('train_test_split', model_selection, 'train_test_split',
         _daal_train_test_split),
        ('fin_check', validation, '_assert_all_finite',
         _daal_assert_all_finite),
        ('tsne', manifold_module, 'TSNE', TSNE_daal4py),
    ]

    # Entries that depend on the installed library version.
    if daal_check_version((2021, 'P', 1)):
        patch_targets.append(
            ('log_reg', linear_model_module, 'LogisticRegression',
             LogisticRegression_daal4py))
    if daal_check_version((2021, 'P', 2)):
        patch_targets.append(
            ('roc_auc_score', metrics, 'roc_auc_score',
             _daal_roc_auc_score))

    # Every key gets its own freshly built (mutable) descriptor list.
    return {
        key: [[(module, attr_name, replacement), None]]
        for key, module, attr_name, replacement in patch_targets
    }
def test_models(model_head):
    """Run the checks for one model description, skipping unstable models.

    Parameters
    ----------
    model_head : dict
        Must provide the ``'model'``, ``'methods'`` and ``'dataset'`` entries
        that are forwarded to ``_run_test``.

    Notes
    -----
    The random forest estimators are taken off the module-level ``TO_SKIP``
    list when ``daal_check_version((2021, 'P', 200))`` succeeds. The removal
    is guarded by a membership test: this function runs once per
    parametrized case, so an unguarded ``list.remove`` would raise
    ``ValueError`` on the second case that uses the same estimator class.
    """
    class_name = get_class_name(model_head['model'])
    forest_algos = ('RandomForestClassifier', 'RandomForestRegressor')

    if class_name in forest_algos and class_name in TO_SKIP \
            and daal_check_version((2021, 'P', 200)):
        TO_SKIP.remove(class_name)

    if class_name in TO_SKIP:
        pytest.skip("Unstable", allow_module_level=False)
    _run_test(model_head['model'], model_head['methods'],
              model_head['dataset'])
def _fit_classifier(self, X, y, sample_weight=None):
    """Fit the classifier with oneDAL when supported, else with the parent class.

    Parameters
    ----------
    X : array-like or sparse matrix
        Training samples. Sparse input is not handled by the oneDAL path.
    y : array-like
        Target values. A sparse ``y`` is rejected.
    sample_weight : array-like or None, default=None
        Optional per-sample weights, validated with ``check_sample_weight``.

    Returns
    -------
    self on the oneDAL path, otherwise whatever the parent ``fit`` returns.

    Raises
    ------
    ValueError
        If ``y`` is sparse.
    """
    if sp.issparse(y):
        raise ValueError(
            "sparse multilabel-indicator for y is not supported."
        )
    _check_parameters(self)
    if sample_weight is not None:
        sample_weight = check_sample_weight(sample_weight, X)

    # Collect the conditions under which the oneDAL implementation is usable;
    # each tuple pairs a condition with the message describing its failure.
    _patching_status = PatchingConditionsChain(
        "sklearn.ensemble.RandomForestClassifier.fit")
    _dal_ready = _patching_status.and_conditions([
        # True when OOB scoring is off, or when it is on and the version
        # check passes.
        (self.oob_score and daal_check_version((2021, 'P', 500)) or not self.oob_score,
            "OOB score is only supported starting from 2021.5 version of oneDAL."),
        (self.warm_start is False, "Warm start is not supported."),
        (self.criterion == "gini",
            f"'{self.criterion}' criterion is not supported. "
            "Only 'gini' criterion is supported."),
        (self.ccp_alpha == 0.0,
            f"Non-zero 'ccp_alpha' ({self.ccp_alpha}) is not supported."),
        (not sp.issparse(X), "X is sparse. Sparse input is not supported.")
    ])

    if _dal_ready:
        if sklearn_check_version("1.0"):
            self._check_feature_names(X, reset=True)
        X = check_array(X, dtype=[np.float32, np.float64])
        y = np.asarray(y)
        y = np.atleast_1d(y)

        if y.ndim == 2 and y.shape[1] == 1:
            warnings.warn("A column-vector y was passed when a 1d array was"
                          " expected. Please change the shape of y to "
                          "(n_samples,), for example using ravel().",
                          DataConversionWarning, stacklevel=2)

        check_consistent_length(X, y)

        if y.ndim == 1:
            # reshape is necessary to preserve the data contiguity, unlike
            # [:, np.newaxis], which does not.
            y = np.reshape(y, (-1, 1))

        self.n_outputs_ = y.shape[1]
        # The number of outputs is only known after y has been reshaped, so
        # this condition is appended to the chain here.
        _dal_ready = _patching_status.and_conditions([
            (self.n_outputs_ == 1, f"Number of outputs ({self.n_outputs_}) is not 1.")])

    _patching_status.write_log()

    if _dal_ready:
        _daal_fit_classifier(self, X, y, sample_weight=sample_weight)

        self.estimators_ = self._estimators_

        # Decapsulate classes_ attributes
        self.n_classes_ = self.n_classes_[0]
        self.classes_ = self.classes_[0]

        return self
    # Fallback: delegate to the parent class implementation.
    return super(RandomForestClassifier, self).fit(
        X, y, sample_weight=sample_weight)
def predict_proba(self, X):
    """
    Predict class probabilities for X.

    The oneDAL implementation is used when a oneDAL model has been trained,
    ``X`` is dense, the oneDAL version is at least 2021.4 and, if
    ``n_outputs_`` is set, it equals 1. In every other case the call is
    delegated to the parent class implementation.

    Parameters
    ----------
    X : {array-like, sparse matrix} of shape (n_samples, n_features)
        The input samples. On the oneDAL path ``X`` is validated with
        ``check_array`` as ``np.float64`` or ``np.float32`` data.

    Returns
    -------
    p : ndarray of shape (n_samples, n_classes), or the parent class result
        when the call is delegated
        The class probabilities of the input samples.

    Raises
    ------
    ValueError
        If ``n_features_in_`` is set and ``X`` has a different number of
        features.
    """
    if sklearn_check_version("1.0"):
        self._check_feature_names(X, reset=False)

    if hasattr(self, 'n_features_in_'):
        try:
            n_cols = _daal_num_features(X)
        except TypeError:
            # Same fallback as before: use the sample count when the
            # feature count cannot be determined.
            n_cols = _num_samples(X)
        if n_cols != self.n_features_in_:
            raise ValueError(
                f'X has {n_cols} features, '
                'but RandomForestClassifier is expecting '
                f'{self.n_features_in_} features as input')

    status = PatchingConditionsChain(
        "sklearn.ensemble.RandomForestClassifier.predict_proba")
    base_conditions = [
        (hasattr(self, 'daal_model_'), "oneDAL model was not trained."),
        (not sp.issparse(X), "X is sparse. Sparse input is not supported."),
        (daal_check_version((2021, 'P', 400)),
         "oneDAL version is lower than 2021.4."),
    ]
    use_daal = status.and_conditions(base_conditions)
    if hasattr(self, 'n_outputs_'):
        use_daal = status.and_conditions([
            (self.n_outputs_ == 1,
             f"Number of outputs ({self.n_outputs_}) is not 1."),
        ])
    status.write_log()

    if use_daal:
        X = check_array(X, dtype=[np.float64, np.float32])
        check_is_fitted(self)
        if sklearn_check_version('0.23'):
            self._check_n_features(X, reset=False)
        return _daal_predict_proba(self, X)

    return super(RandomForestClassifier, self).predict_proba(X)
def test_sklearnex_import_rf_regression():
    """Check that sklearnex exposes the daal4py-backed random forest regressor.

    The estimator is fitted on a small synthetic regression problem and its
    prediction for the all-zero sample is compared with a reference value
    that depends on ``daal_check_version((2021, 'P', 400))``.
    """
    from sklearnex.ensemble import RandomForestRegressor

    features, targets = make_regression(n_features=4, n_informative=2,
                                        random_state=0, shuffle=False)
    model = RandomForestRegressor(max_depth=2, random_state=0)
    model = model.fit(features, targets)
    assert 'daal4py' in model.__module__

    # The reference prediction differs between library versions.
    expected = [-6.97] if daal_check_version((2021, 'P', 400)) else [-6.66]
    assert_allclose(expected, model.predict([[0, 0, 0, 0]]), atol=1e-2)
def test_models(model_head):
    """Run the checks for one model description, skipping unstable models.

    ``model_head`` must provide the ``'model'``, ``'methods'`` and
    ``'dataset'`` entries that are forwarded to ``_run_test``. Estimators
    listed in ``stable_algos`` (currently none) are taken off the
    module-level ``TO_SKIP`` list when
    ``daal_check_version((2021, 'P', 300))`` succeeds.
    """
    stable_algos = []
    class_name = get_class_name(model_head['model'])

    is_stable = class_name in stable_algos \
        and daal_check_version((2021, 'P', 300))
    # Removing only when present mirrors "remove and ignore ValueError".
    if is_stable and class_name in TO_SKIP:
        TO_SKIP.remove(class_name)

    if class_name in TO_SKIP:
        pytest.skip("Unstable", allow_module_level=False)
    _run_test(model_head['model'], model_head['methods'],
              model_head['dataset'])
def test_models(model_head):
    """Run the checks for one model description, skipping unstable models.

    Parameters
    ----------
    model_head : dict
        Must provide the ``'model'``, ``'methods'`` and ``'dataset'`` entries
        that are forwarded to ``_run_test``.

    Notes
    -----
    Estimators listed in ``stable_algos`` are taken off the module-level
    ``TO_SKIP`` list when ``daal_check_version((2021, 'P', 200))`` succeeds.
    The removal is guarded by a membership test: this function runs once per
    parametrized case, so an unguarded ``list.remove`` would raise
    ``ValueError`` on the second case for the same estimator class, or for a
    stable estimator that was never listed in ``TO_SKIP``.
    """
    stable_algos = [
        'RandomForestClassifier',
        'RandomForestRegressor',
        'PCA',
        'LinearRegression',
        'Ridge',
    ]
    class_name = get_class_name(model_head['model'])

    if class_name in stable_algos and class_name in TO_SKIP \
            and daal_check_version((2021, 'P', 200)):
        TO_SKIP.remove(class_name)

    if class_name in TO_SKIP:
        pytest.skip("Unstable", allow_module_level=False)
    _run_test(model_head['model'], model_head['methods'],
              model_head['dataset'])
def get_patch_map():
    """Return the patching map, with SVM entries replaced when available.

    The map starts as a shallow copy of daal4py's ``_get_map_of_algorithms()``
    result. When ``daal_check_version((2021, 'P', 300))`` succeeds, the
    daal4py ``'svm'``/``'svc'`` entries are dropped and the sklearnex
    ``SVR``, ``SVC``, ``NuSVR`` and ``NuSVC`` descriptors are registered.

    Returns
    -------
    dict
        Algorithm key -> list of ``[(module, attribute_name, replacement),
        None]`` descriptors.
    """
    from daal4py.sklearn.monkeypatch.dispatcher import _get_map_of_algorithms

    mapping = _get_map_of_algorithms().copy()

    if daal_check_version((2021, 'P', 300)):
        # The base map does not necessarily define both keys; pop with a
        # default so a missing key cannot raise KeyError.
        mapping.pop('svm', None)
        mapping.pop('svc', None)
        mapping['svr'] = [[(svm_module, 'SVR', SVR_sklearnex), None]]
        mapping['svc'] = [[(svm_module, 'SVC', SVC_sklearnex), None]]
        mapping['nusvr'] = [[(svm_module, 'NuSVR', NuSVR_sklearnex), None]]
        mapping['nusvc'] = [[(svm_module, 'NuSVC', NuSVC_sklearnex), None]]
    return mapping
def _daal_predict_proba(self, X):
    """Return the class probabilities computed by the trained oneDAL model.

    ``X`` is first passed through ``self._validate_X_predict`` when
    ``daal_check_version((2021, 'P', 200))`` fails. The prediction algorithm
    is configured with ``self.n_classes_`` and the floating point type
    reported by ``getFPType(X)``, and is run against ``self.daal_model_``.
    """
    if not daal_check_version((2021, 'P', 200)):
        X = self._validate_X_predict(X)

    fptype = getFPType(X)
    predictor = daal4py.decision_forest_classification_prediction(
        nClasses=int(self.n_classes_),
        fptype=fptype,
        resultsToEvaluate="computeClassProbabilities")

    return predictor.compute(X, self.daal_model_).probabilities
def _daal_predict_regressor(self, X):
    """Return the flattened predictions of the trained oneDAL regressor.

    Raises
    ------
    ValueError
        If the second dimension of ``X`` differs from
        ``self.n_features_in_``.
    """
    n_cols = X.shape[1]
    if n_cols != self.n_features_in_:
        raise ValueError(
            f'X has {n_cols} features, '
            'but RandomForestRegressor is expecting '
            f'{self.n_features_in_} features as input')

    if not daal_check_version((2021, 'P', 200)):
        X = self._validate_X_predict(X)

    fptype = getFPType(X)
    predictor = daal4py.decision_forest_regression_prediction(fptype=fptype)
    result = predictor.compute(X, self.daal_model_)
    return result.prediction.ravel()
def test_models(model_head):
    """Run the checks for one model description, skipping unstable models.

    ``model_head`` must provide the ``'model'``, ``'methods'`` and
    ``'dataset'`` entries that are forwarded to ``_run_test``. Estimators
    listed in ``stable_algos`` are taken off the module-level ``TO_SKIP``
    list when ``daal_check_version((2021, 'P', 200))`` succeeds.
    """
    stable_algos = [
        'RandomForestClassifier',
        'RandomForestRegressor',
        'PCA',
        'LinearRegression',
        'Ridge',
        'KNeighborsClassifier',
        'NearestNeighbors',
        'KMeans',
        'ElasticNet',
        'Lasso',
    ]
    class_name = get_class_name(model_head['model'])

    is_stable = class_name in stable_algos \
        and daal_check_version((2021, 'P', 200))
    # Removing only when present mirrors "remove and ignore ValueError".
    if is_stable and class_name in TO_SKIP:
        TO_SKIP.remove(class_name)

    if class_name in TO_SKIP:
        pytest.skip("Unstable", allow_module_level=False)
    _run_test(model_head['model'], model_head['methods'],
              model_head['dataset'])
def predict_proba(self, X):
    """
    Predict class probabilities for X.

    The oneDAL implementation is used when a oneDAL model has been trained,
    ``X`` is dense, ``n_outputs_`` equals 1 and
    ``daal_check_version((2021, 'P', 400))`` succeeds. In every other case
    the call is delegated to the parent class implementation. The chosen
    backend is reported through ``logging.info``.

    Parameters
    ----------
    X : {array-like, sparse matrix} of shape (n_samples, n_features)
        The input samples. On the oneDAL path ``X`` is validated with
        ``check_array`` as ``np.float64`` or ``np.float32`` data.

    Returns
    -------
    p : ndarray of shape (n_samples, n_classes), or the parent class result
        when the call is delegated
        The class probabilities of the input samples.

    Raises
    ------
    ValueError
        If ``n_features_in_`` is set and ``X`` has a different number of
        features.
    """
    if hasattr(self, 'n_features_in_'):
        try:
            n_cols = _daal_num_features(X)
        except TypeError:
            # Same fallback as before: use the sample count when the
            # feature count cannot be determined.
            n_cols = _num_samples(X)
        if n_cols != self.n_features_in_:
            raise ValueError(
                f'X has {n_cols} features, '
                'but RandomForestClassifier is expecting '
                f'{self.n_features_in_} features as input')

    log_prefix = "sklearn.ensemble.RandomForestClassifier.predict_proba: "

    # Checks are short-circuited in this order so that n_outputs_ is only
    # read once a oneDAL model is known to exist.
    use_daal = (hasattr(self, 'daal_model_')
                and not sp.issparse(X)
                and self.n_outputs_ == 1
                and daal_check_version((2021, 'P', 400)))

    if use_daal:
        logging.info(log_prefix + get_patch_message("daal"))
        X = check_array(X, dtype=[np.float64, np.float32])
        check_is_fitted(self)
        if sklearn_check_version('0.23'):
            self._check_n_features(X, reset=False)
        return _daal_predict_proba(self, X)

    logging.info(log_prefix + get_patch_message("sklearn"))
    return super(RandomForestClassifier, self).predict_proba(X)
def _daal_predict_classifier(self, X):
    """Return the class labels predicted by the trained oneDAL model.

    The raw oneDAL predictions are flattened, cast to ``np.int64`` and used
    as indices into ``self.classes_``.

    Raises
    ------
    ValueError
        If the second dimension of ``X`` differs from
        ``self.n_features_in_``.
    """
    if not daal_check_version((2021, 'P', 200)):
        X = self._validate_X_predict(X)

    fptype = getFPType(X)
    predictor = daal4py.decision_forest_classification_prediction(
        nClasses=int(self.n_classes_),
        fptype=fptype,
        resultsToEvaluate="computeClassLabels")

    n_cols = X.shape[1]
    if n_cols != self.n_features_in_:
        raise ValueError(
            f'X has {n_cols} features, '
            'but RandomForestClassifier is expecting '
            f'{self.n_features_in_} features as input')

    raw_labels = predictor.compute(X, self.daal_model_).prediction
    class_indices = raw_labels.ravel().astype(np.int64, casting='unsafe')
    return np.take(self.classes_, class_indices)
import random from sklearn.datasets import load_iris from sklearn.model_selection import train_test_split from sklearn.metrics import (accuracy_score, roc_auc_score, mean_squared_error, log_loss) from sklearn.ensemble \ import RandomForestClassifier as ScikitRandomForestClassifier from daal4py.sklearn.ensemble \ import RandomForestClassifier as DaalRandomForestClassifier from sklearn.ensemble \ import RandomForestRegressor as ScikitRandomForestRegressor from daal4py.sklearn.ensemble \ import RandomForestRegressor as DaalRandomForestRegressor from daal4py.sklearn._utils import daal_check_version ACCURACY_RATIO = 0.95 if daal_check_version((2021, 'P', 400)) else 0.85 MSE_RATIO = 1.07 LOG_LOSS_RATIO = 1.4 if daal_check_version((2021, 'P', 400)) else 1.55 ROC_AUC_RATIO = 0.96 RNG = np.random.RandomState(0) IRIS = load_iris() def _compare_with_sklearn_classifier_iris(n_estimators=100, class_weight=None, sample_weight=None, description=""): x_train, x_test, y_train, y_test = \ train_test_split(IRIS.data, IRIS.target, test_size=0.33, random_state=31) # models scikit_model = ScikitRandomForestClassifier(n_estimators=n_estimators, class_weight=class_weight,
None]], 'lasso': [[(linear_model_module, 'Lasso', Lasso_daal4py), None]], 'svm': [[(svm_module, 'SVC', SVC_daal4py), None]], 'logistic': [[(logistic_module, _patched_log_reg_path_func_name, daal_optimized_logistic_path), None]], } del _patched_log_reg_path_func_name try: from ..cluster.dbscan import DBSCAN as DBSCAN_daal4py _mapping['dbscan'] = [[(cluster_module, 'DBSCAN', DBSCAN_daal4py), None]] except ImportError: pass if daal_check_version((2020, 1), (2021, 5)): _mapping['fin_check'] = [[(validation, '_assert_all_finite', _daal_assert_all_finite), None]] if daal_check_version((2020, 2), (2021, 8)): _mapping['tt_split'] = [[(model_selection, 'train_test_split', _daal_train_test_split), None]] def do_patch(name): lname = name.lower() if lname in _mapping: for descriptor in _mapping[lname]: which, what, replacer = descriptor[0] if descriptor[1] is None: descriptor[1] = getattr(which, what, None)
# # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. #=============================================================================== # Other imports import sys from distutils.version import LooseVersion from functools import lru_cache from daal4py.sklearn._utils import daal_check_version # Classes for patching if daal_check_version((2021, 'P', 300)): from .svm import SVR as SVR_sklearnex from .svm import SVC as SVC_sklearnex from .svm import NuSVR as NuSVR_sklearnex from .svm import NuSVC as NuSVC_sklearnex # Scikit-learn* modules import sklearn.svm as svm_module @lru_cache(maxsize=None) def get_patch_map(): from daal4py.sklearn.monkeypatch.dispatcher import _get_map_of_algorithms mapping = _get_map_of_algorithms().copy() if daal_check_version((2021, 'P', 300)):
def _is_new_patching_available():
    """Tell whether the new patching interface may be used.

    Returns ``False`` whenever the ``OFF_ONEDAL_IFACE`` environment variable
    is set (to any value); otherwise returns the result of
    ``daal_check_version((2021, 'P', 300))``.
    """
    if os.environ.get('OFF_ONEDAL_IFACE') is not None:
        return False
    return daal_check_version((2021, 'P', 300))
class LogRegModelBuilder(unittest.TestCase):
    """Check daal4py logistic regression models built from scikit-learn fits.

    Every test fits a scikit-learn ``LogisticRegression``, copies its
    coefficients and intercept into ``d4p.logistic_regression_model_builder``
    and verifies that daal4py predictions on the training data match
    ``clf.predict``.
    """

    # Shared skip rule: the builder must exist in daal4py and the version
    # check must pass.
    _requires_model_builder = unittest.skipUnless(
        hasattr(d4p, 'logistic_regression_model_builder')
        and daal_check_version((2021, 'P', 1)),
        str((2021, 'P', 1))
        + " not supported in this library version "
        + str(daal_version))

    def _assert_matches_sklearn(self, X, y, n_classes, fit_intercept,
                                max_iter):
        """Fit scikit-learn, rebuild the model in daal4py, compare labels."""
        clf = LogisticRegression(fit_intercept=fit_intercept,
                                 max_iter=max_iter,
                                 random_state=0).fit(X, y)

        builder = d4p.logistic_regression_model_builder(
            n_classes=n_classes, n_features=X.shape[1])
        builder.set_beta(clf.coef_, clf.intercept_)

        predictor = d4p.logistic_regression_prediction(nClasses=n_classes)
        daal_labels = predictor.compute(X, builder.model).prediction.flatten()
        sklearn_labels = clf.predict(X)

        self.assertTrue(np.allclose(daal_labels, sklearn_labels))

    @_requires_model_builder
    def test_iris_with_intercept(self):
        X, y = load_iris(return_X_y=True)
        self._assert_matches_sklearn(X, y, n_classes=3,
                                     fit_intercept=True, max_iter=1000)

    @_requires_model_builder
    def test_iris_without_intercept(self):
        X, y = load_iris(return_X_y=True)
        self._assert_matches_sklearn(X, y, n_classes=3,
                                     fit_intercept=False, max_iter=1000)

    @_requires_model_builder
    def test_breast_cancer_with_intercept(self):
        X, y = load_breast_cancer(return_X_y=True)
        self._assert_matches_sklearn(X, y, n_classes=2,
                                     fit_intercept=True, max_iter=10000)

    @_requires_model_builder
    def test_breast_cancer_without_intercept(self):
        X, y = load_breast_cancer(return_X_y=True)
        self._assert_matches_sklearn(X, y, n_classes=2,
                                     fit_intercept=False, max_iter=10000)
from sklearn.datasets import load_iris from sklearn.model_selection import train_test_split from sklearn.metrics import (accuracy_score, roc_auc_score, mean_squared_error, log_loss) from sklearn.ensemble \ import RandomForestClassifier as ScikitRandomForestClassifier from daal4py.sklearn.ensemble \ import RandomForestClassifier as DaalRandomForestClassifier from sklearn.ensemble \ import RandomForestRegressor as ScikitRandomForestRegressor from daal4py.sklearn.ensemble \ import RandomForestRegressor as DaalRandomForestRegressor from daal4py.sklearn._utils import daal_check_version N_TRIES = 10 ACCURACY_RATIO = 0.85 if daal_check_version((2021, 'P', 200)) else 0.7 MSE_RATIO = 1.05 if daal_check_version((2021, 'P', 200)) else 1.42 LOG_LOSS_RATIO = 1.55 if daal_check_version((2021, 'P', 200)) else 2.28 ROC_AUC_RATIO = 0.978 IRIS = load_iris() random.seed(777) CLASS_WEIGHTS_IRIS = [ { 0: 0, 1: 0, 2: 0 }, { 0: 0, 1: 1,
def _daal_train_test_split(*arrays, **options):
    """Split arrays into train and test subsets, using daal4py where possible.

    Parameters
    ----------
    *arrays : indexable objects
        At least one is required. The number of samples is taken from the
        first one.
    **options
        Recognised keys: ``test_size``, ``train_size``, ``random_state``,
        ``stratify`` (all default ``None``), ``shuffle`` (default ``True``)
        and ``rng`` (default ``'OPTIMIZED_MT19937'``).

    Returns
    -------
    list
        For every input array, its train part followed by its test part.

    Raises
    ------
    ValueError
        If no arrays are given, ``rng`` is not an available generator,
        ``stratify`` is combined with ``shuffle=False``, or an array cannot
        be converted to a supported format.
    TypeError
        If unrecognised options remain.
    """
    n_arrays = len(arrays)
    if n_arrays == 0:
        raise ValueError("At least one array required as input")
    test_size = options.pop('test_size', None)
    train_size = options.pop('train_size', None)
    random_state = options.pop('random_state', None)
    stratify = options.pop('stratify', None)
    shuffle = options.pop('shuffle', True)
    rng = options.pop('rng', 'OPTIMIZED_MT19937')

    available_rngs = [
        'default', 'MT19937', 'SFMT19937', 'MT2203', 'R250', 'WH', 'MCG31',
        'MCG59', 'MRG32K3A', 'PHILOX4X32X10', 'NONDETERM', 'OPTIMIZED_MT19937'
    ]
    if rng not in available_rngs:
        # str(list)[1:-1] drops the surrounding brackets of the list repr.
        raise ValueError("Wrong random numbers generator is chosen. "
                         "Available generators: %s" % str(available_rngs)[1:-1])

    # Every known option has been popped; anything left is unsupported.
    if options:
        raise TypeError("Invalid parameters passed: %s" % str(options))

    arrays = indexable(*arrays)

    n_samples = _num_samples(arrays[0])
    n_train, n_test = _validate_shuffle_split(n_samples, test_size, train_size,
                                              default_test_size=0.25)

    # Step 1: choose the train/test indices.
    if shuffle is False:
        if stratify is not None:
            raise ValueError(
                "Stratified train/test split is not implemented for "
                "shuffle=False")

        # Without shuffling, train is the leading block and test follows it.
        train = np.arange(n_train)
        test = np.arange(n_train, n_train + n_test)
    else:
        if stratify is not None:
            cv = StratifiedShuffleSplit(test_size=n_test, train_size=n_train,
                                        random_state=random_state)
            train, test = next(cv.split(X=arrays[0], y=stratify))
        else:
            # mkl_random serves the generators other than 'default' and
            # 'OPTIMIZED_MT19937' when random_state is an int or None.
            if mkl_random_is_imported and rng not in [
                    'default', 'OPTIMIZED_MT19937'
            ] and (isinstance(random_state, int) or random_state is None):
                random_state = mkl_random.RandomState(random_state, rng)
                indexes = random_state.permutation(n_train + n_test)
                # Test indices come from the head of the permutation.
                test, train = indexes[:n_test], indexes[n_test:]
            elif rng == 'OPTIMIZED_MT19937' and daal_check_version(((2020,'P', 3), (2021,'B',9))) \
                and (isinstance(random_state, int) or random_state is None) \
                    and platform.system() != 'Windows':
                # int32 indices unless the sample count exceeds the int32
                # range.
                indexes = np.empty(shape=(n_train + n_test, ),
                                   dtype=np.int64 if n_train + n_test > 2**31 - 1 else np.int32)
                random_state = np.random.RandomState(random_state)
                # Second element of NumPy's state tuple; it is handed to
                # daal4py as the generator state.
                random_state = random_state.get_state()[1]
                d4p.daal_generate_shuffled_indices([indexes], [random_state])
                test, train = indexes[:n_test], indexes[n_test:]
            else:
                cv = ShuffleSplit(test_size=n_test, train_size=n_train,
                                  random_state=random_state)
                train, test = next(cv.split(X=arrays[0], y=stratify))

    # Step 2: apply the indices to every array.
    res = []
    for arr in arrays:
        # Any failed check below routes this array through safe_indexing
        # instead of the daal4py routine.
        fallback = False

        # input format check
        if not isinstance(arr, np.ndarray):
            if pandas_is_imported:
                if not isinstance(arr, pd.core.frame.DataFrame) and not isinstance(
                        arr, pd.core.series.Series):
                    fallback = True
            else:
                fallback = True

        # dimensions check
        if hasattr(arr, 'ndim'):
            if arr.ndim > 2:
                fallback = True
        else:
            fallback = True

        # data types check
        dtypes = get_dtypes(arr)
        if dtypes is None:
            fallback = True
        else:
            # Only dtypes whose name contains 'float' or 'int' are accepted.
            for i, dtype in enumerate(dtypes):
                if 'float' not in str(dtype) and 'int' not in str(dtype):
                    fallback = True
                    break
        if fallback:
            res.append(safe_indexing(arr, train))
            res.append(safe_indexing(arr, test))
        else:
            # Inputs that are not 2-D are handled as a single column and
            # flattened again afterwards.
            if len(arr.shape) == 2:
                n_cols = arr.shape[1]
                reshape_later = False
            else:
                n_cols = 1
                reshape_later = True

            arr_copy = d4p.get_data(arr)
            if not isinstance(arr_copy, list):
                arr_copy = arr_copy.reshape((arr_copy.shape[0], n_cols), order='A')
            if isinstance(arr_copy, np.ndarray):
                # Output buffers keep the memory layout of the input.
                order = 'C' if arr_copy.flags['C_CONTIGUOUS'] else 'F'
                train_arr = np.empty(shape=(n_train, n_cols), dtype=arr_copy.dtype, order=order)
                test_arr = np.empty(shape=(n_test, n_cols), dtype=arr_copy.dtype, order=order)
                d4p.daal_train_test_split(arr_copy, train_arr, test_arr, [train], [test])
                if reshape_later:
                    train_arr, test_arr = train_arr.reshape(
                        (n_train, )), test_arr.reshape((n_test, ))
            elif isinstance(arr_copy, list):
                # NOTE(review): this branch reads arr.columns below, so the
                # list presumably holds one array per DataFrame column -
                # confirm against d4p.get_data.
                train_arr = [
                    np.empty(shape=(n_train, ), dtype=el.dtype,
                             order='C' if el.flags['C_CONTIGUOUS'] else 'F')
                    for el in arr_copy
                ]
                test_arr = [
                    np.empty(shape=(n_test, ), dtype=el.dtype,
                             order='C' if el.flags['C_CONTIGUOUS'] else 'F')
                    for el in arr_copy
                ]
                d4p.daal_train_test_split(arr_copy, train_arr, test_arr, [train], [test])
                # Re-key the per-column buffers by column label.
                train_arr = {
                    col: train_arr[i]
                    for i, col in enumerate(arr.columns)
                }
                test_arr = {
                    col: test_arr[i]
                    for i, col in enumerate(arr.columns)
                }
            else:
                raise ValueError('Array can\'t be converted to needed format')

            # Wrap the results back into the pandas type of the input.
            if pandas_is_imported:
                if isinstance(arr, pd.core.frame.DataFrame):
                    train_arr, test_arr = pd.DataFrame(
                        train_arr), pd.DataFrame(test_arr)
                if isinstance(arr, pd.core.series.Series):
                    train_arr, test_arr = train_arr.reshape(
                        n_train), test_arr.reshape(n_test)
                    train_arr, test_arr = pd.Series(train_arr), pd.Series(
                        test_arr)

            # Inputs exposing an 'index' attribute get the selected
            # positions as the index of their parts.
            if hasattr(arr, 'index'):
                train_arr.index = train
                test_arr.index = test

            res.append(train_arr)
            res.append(test_arr)

    return res
def _daal_fit_classifier(self, X, y, sample_weight=None):
    """Train a oneDAL decision forest classifier and store it on ``self``.

    Parameters
    ----------
    X : array with a ``shape`` attribute
        Training samples; ``X.shape[0]`` and ``X.shape[1]`` are read as the
        sample and feature counts.
    y : array-like
        Target values, validated with ``check_array`` and
        ``self._validate_y_class_weight``.
    sample_weight : array-like or None, default=None
        Optional per-sample weights. They are multiplied by the expanded
        class weights when both are present.

    Returns
    -------
    self, with ``daal_model_`` set and ``_cached_estimators_`` reset.

    Raises
    ------
    ValueError
        If fewer than two classes are present, or if ``oob_score`` is
        requested while ``bootstrap`` is disabled.
    """
    y = check_array(y, ensure_2d=False, dtype=None)
    y, expanded_class_weight = self._validate_y_class_weight(y)
    n_classes_ = self.n_classes_[0]
    self.n_features_ = X.shape[1]

    if expanded_class_weight is not None:
        if sample_weight is not None:
            sample_weight = sample_weight * expanded_class_weight
        else:
            sample_weight = expanded_class_weight
    if sample_weight is not None:
        sample_weight = [sample_weight]

    rs_ = check_random_state(self.random_state)
    seed_ = rs_.randint(0, np.iinfo('i').max)

    if n_classes_ < 2:
        raise ValueError(
            "Training data only contain information about one class.")

    # create algorithm
    X_fptype = getFPType(X)

    # limitation on the number of stream for mt2203 is 6024
    # more details here:
    # https://oneapi-src.github.io/oneDAL/daal/algorithms/engines/mt2203.html
    # Same engine selection as in _daal_fit_regressor: larger forests use
    # mt19937 instead.
    max_stream_count = 6024
    if self.n_estimators <= max_stream_count:
        daal_engine_ = daal4py.engines_mt2203(seed=seed_, fptype=X_fptype)
    else:
        daal_engine_ = daal4py.engines_mt19937(seed=seed_, fptype=X_fptype)

    features_per_node_ = _to_absolute_max_features(self.max_features,
                                                   X.shape[1],
                                                   is_classification=True)

    n_samples_bootstrap_ = _get_n_samples_bootstrap(
        n_samples=X.shape[0],
        max_samples=self.max_samples)

    if not self.bootstrap and self.oob_score:
        raise ValueError("Out of bag estimation only available"
                         " if bootstrap=True")

    dfc_algorithm = daal4py.decision_forest_classification_training(
        nClasses=int(n_classes_),
        fptype=X_fptype,
        method='hist' if daal_check_version(
            (2021, 'P', 200)) else 'defaultDense',
        nTrees=int(self.n_estimators),
        observationsPerTreeFraction=n_samples_bootstrap_
        if self.bootstrap is True else 1.,
        featuresPerNode=int(features_per_node_),
        # A depth of 0 is passed when max_depth is None.
        maxTreeDepth=int(0 if self.max_depth is None else self.max_depth),
        # Integral values are passed as-is; other values are multiplied by
        # the number of samples and rounded up.
        minObservationsInLeafNode=(self.min_samples_leaf if isinstance(
            self.min_samples_leaf, numbers.Integral) else int(
                ceil(self.min_samples_leaf * X.shape[0]))),
        engine=daal_engine_,
        impurityThreshold=float(0.0 if self.min_impurity_split is None else
                                self.min_impurity_split),
        varImportance="MDI",
        resultsToCompute="",
        memorySavingMode=False,
        bootstrap=bool(self.bootstrap),
        minObservationsInSplitNode=(self.min_samples_split if isinstance(
            self.min_samples_split, numbers.Integral) else int(
                ceil(self.min_samples_split * X.shape[0]))),
        minWeightFractionInLeafNode=self.min_weight_fraction_leaf,
        minImpurityDecreaseInSplitNode=self.min_impurity_decrease,
        maxLeafNodes=0 if self.max_leaf_nodes is None else
        self.max_leaf_nodes,
        maxBins=self.maxBins,
        minBinSize=self.minBinSize)
    self._cached_estimators_ = None
    # compute
    dfc_trainingResult = dfc_algorithm.compute(X, y, sample_weight)

    # get resulting model
    model = dfc_trainingResult.model
    self.daal_model_ = model

    # compute oob_score_
    if self.oob_score:
        self.estimators_ = self._estimators_
        self._set_oob_score(X, y)

    return self
def get_sklearnex_version(rule):
    """Return the result of daal4py's ``daal_check_version`` for ``rule``.

    The daal4py utilities are imported lazily, at call time.
    """
    from daal4py.sklearn import _utils as daal_utils

    return daal_utils.daal_check_version(rule)
def _daal_fit_regressor(self, X, y, sample_weight=None):
    """Train a oneDAL decision forest regressor and store it on ``self``.

    Parameters
    ----------
    X : array with a ``shape`` attribute
        Training samples; ``X.shape[0]`` and ``X.shape[1]`` are read as the
        sample and feature counts.
    y : array-like
        Target values, passed to the oneDAL algorithm unchanged.
    sample_weight : array-like or None, default=None
        Optional per-sample weights; wrapped in a list before being passed
        to oneDAL.

    Returns
    -------
    self, with ``n_features_in_`` and ``daal_model_`` set and
    ``_cached_estimators_`` reset.

    Raises
    ------
    ValueError
        If ``oob_score`` is requested while ``bootstrap`` is disabled.
    """
    n_rows = X.shape[0]
    n_cols = X.shape[1]

    self.n_features_in_ = n_cols
    if not sklearn_check_version('1.0'):
        self.n_features_ = self.n_features_in_

    random_state = check_random_state(self.random_state)

    if not self.bootstrap and self.oob_score:
        raise ValueError("Out of bag estimation only available"
                         " if bootstrap=True")

    fptype = getFPType(X)
    seed = random_state.randint(0, np.iinfo('i').max)

    # mt2203 supports at most 6024 streams, see
    # https://oneapi-src.github.io/oneDAL/daal/algorithms/engines/mt2203.html
    # so larger forests use mt19937 instead.
    mt2203_stream_limit = 6024
    if self.n_estimators > mt2203_stream_limit:
        engine = daal4py.engines_mt19937(seed=seed, fptype=fptype)
    else:
        engine = daal4py.engines_mt2203(seed=seed, fptype=fptype)

    features_per_node = _to_absolute_max_features(self.max_features,
                                                  n_cols,
                                                  is_classification=False)
    bootstrap_fraction = _get_n_samples_bootstrap(
        n_samples=n_rows, max_samples=self.max_samples)

    if sample_weight is not None:
        sample_weight = [sample_weight]

    def _as_count(value):
        # Integral values are passed as-is; other values are multiplied by
        # the number of samples and rounded up.
        if isinstance(value, numbers.Integral):
            return value
        return int(ceil(value * n_rows))

    if self.bootstrap is True:
        per_tree_fraction = bootstrap_fraction
    else:
        per_tree_fraction = 1.

    if self.min_impurity_split is None:
        impurity_threshold = float(0.0)
    else:
        impurity_threshold = float(self.min_impurity_split)

    # create algorithm
    trainer = daal4py.decision_forest_regression_training(
        fptype=getFPType(X),
        method='hist' if daal_check_version((2021, 'P', 200))
        else 'defaultDense',
        nTrees=int(self.n_estimators),
        observationsPerTreeFraction=per_tree_fraction,
        featuresPerNode=int(features_per_node),
        # A depth of 0 is passed when max_depth is None.
        maxTreeDepth=int(0 if self.max_depth is None else self.max_depth),
        minObservationsInLeafNode=_as_count(self.min_samples_leaf),
        engine=engine,
        impurityThreshold=impurity_threshold,
        varImportance="MDI",
        resultsToCompute="",
        memorySavingMode=False,
        bootstrap=bool(self.bootstrap),
        minObservationsInSplitNode=_as_count(self.min_samples_split),
        minWeightFractionInLeafNode=self.min_weight_fraction_leaf,
        minImpurityDecreaseInSplitNode=self.min_impurity_decrease,
        maxLeafNodes=0 if self.max_leaf_nodes is None
        else self.max_leaf_nodes,
        maxBins=self.maxBins,
        minBinSize=self.minBinSize)

    self._cached_estimators_ = None

    # Train and keep the resulting model.
    self.daal_model_ = trainer.compute(X, y, sample_weight).model

    # NOTE: oob_score_ is not computed here; the code that would do so is
    # disabled in this function.
    return self
'lasso': [[(linear_model_module, 'Lasso', Lasso_daal4py), None]], 'svm': [[(svm_module, 'SVC', SVC_daal4py), None]], 'logistic': [[(logistic_module, _patched_log_reg_path_func_name, daal_optimized_logistic_path), None]], 'knn_clsf': [[(neighbors_module, 'KNeighborsClassifier', KNeighborsClassifier_daal4py), None]], 'knn_mxn': [[(neighbors_module, 'KNeighborsMixin', KNeighborsMixin_daal4py), None]] } del _patched_log_reg_path_func_name try: from ..cluster.dbscan import DBSCAN as DBSCAN_daal4py _mapping['dbscan'] = [[(cluster_module, 'DBSCAN', DBSCAN_daal4py), None]] except ImportError: pass if daal_check_version(((2020,'P', 1), (2021,'B',5))): _mapping['fin_check'] = [[(validation, '_assert_all_finite', _daal_assert_all_finite), None]] if daal_check_version(((2020,'P', 2), (2021,'B',8))): _mapping['tt_split'] = [[(model_selection, 'train_test_split', _daal_train_test_split), None]] if daal_check_version((2020,'P', 3)): _mapping['df_classifier'] = [[(ensemble_module, 'RandomForestClassifier', RandomForestClassifier_daal4py), None]] _mapping['df_regressor'] = [[(ensemble_module, 'RandomForestRegressor', RandomForestRegressor_daal4py), None]] def do_patch(name): lname = name.lower() if lname in _mapping: for descriptor in _mapping[lname]: which, what, replacer = descriptor[0] if descriptor[1] is None:
import pytest from sklearn.neighbors \ import KNeighborsClassifier as ScikitKNeighborsClassifier from daal4py.sklearn.neighbors \ import KNeighborsClassifier as DaalKNeighborsClassifier from sklearn.datasets import load_iris from sklearn.metrics import (accuracy_score, log_loss, roc_auc_score) from sklearn.model_selection import train_test_split from daal4py.sklearn._utils import daal_check_version DISTANCES = ['minkowski'] ALGORITHMS = ['brute', 'kd_tree', 'auto'] WEIGHTS = ['uniform', 'distance'] KS = [1, 3, 7, 15, 31] N_TRIES = 10 ACCURACY_RATIO = 1.0 if daal_check_version(((2020, 'P', 300))) else 0.9 LOG_LOSS_RATIO = 1.02 ROC_AUC_RATIO = 0.999 IRIS = load_iris() def _test_determenistic(distance, algorithm, weight, k): x_train, x_test, y_train, y_test = \ train_test_split(IRIS.data, IRIS.target, test_size=0.33, random_state=31) alg_results = [] for _ in range(N_TRIES): # models scikit_model = ScikitKNeighborsClassifier(n_neighbors=k, weights=weight,
""" The 'daal4py.sklearn.ensemble' module implements daal4py-based RandomForestClassifier and RandomForestRegressor classes. """ from daal4py.sklearn._utils import daal_check_version if daal_check_version((2020, 3)): from .forest import (RandomForestClassifier, RandomForestRegressor) else: from .decision_forest import (RandomForestClassifier, RandomForestRegressor) from .GBTDAAL import (GBTDAALClassifier, GBTDAALRegressor) from .AdaBoostClassifier import AdaBoostClassifier __all__ = ['RandomForestClassifier', 'RandomForestRegressor', 'GBTDAALClassifier', 'GBTDAALRegressor', 'AdaBoostClassifier']
np.int64, np.float16, np.float32, np.float64, np.uint8, np.uint16, np.uint32, np.uint64, ] TO_SKIP = [ # --------------- NO INFO --------------- r'KMeans .*transform', r'KMeans .*score', r'PCA .*score', r'LogisticRegression .*decision_function', r'LogisticRegressionCV .*decision_function', r'LogisticRegressionCV .*predict', r'LogisticRegressionCV .*predict_proba', r'LogisticRegressionCV .*predict_log_proba', r'LogisticRegressionCV .*score', # --------------- Scikit --------------- r'Ridge float16 predict', r'Ridge float16 score', r'RandomForestClassifier .*predict_proba', r'RandomForestClassifier .*predict_log_proba', r'pairwise_distances .*pairwise_distances', # except float64 r'roc_auc_score .*roc_auc_score' \ if not daal_check_version((2021, 'P', 200)) else None, ]
def _fit(self, X, skip_num_points=0):
    """Private function to fit the model using X as training data.

    Validates the estimator's hyper-parameters and ``X``, builds the joint
    probability matrix ``P`` for the input space, initialises the embedding
    and finally delegates the optimisation either to ``self._daal_tsne`` or
    to ``self._tsne``.

    Parameters
    ----------
    X : array-like or sparse matrix
        Training data. It is validated to float32/float64; for
        ``method='barnes_hut'`` only CSR sparse input is accepted and at
        least two samples are required, for ``method='exact'`` CSR, CSC
        and COO are accepted. With ``metric='precomputed'`` it must be a
        square, non-negative distance matrix.
    skip_num_points : int, default=0
        Forwarded unchanged to ``self._tsne``. It is not passed to
        ``self._daal_tsne``.

    Returns
    -------
    The value returned by ``self._daal_tsne`` or ``self._tsne``
    (presumably the embedded coordinates -- confirm against those methods).

    Raises
    ------
    TypeError
        For PCA initialisation with sparse ``X``, or a sparse precomputed
        matrix with ``method='exact'``.
    ValueError
        For an invalid ``method``, ``angle``, ``learning_rate``,
        ``square_distances``, ``n_components``, ``early_exaggeration``,
        ``n_iter`` or ``init``, for a non-square precomputed matrix, or
        for negative distances.
    AssertionError
        In the exact branch, if ``P`` contains non-finite values or values
        outside [0, 1] (checks are plain ``assert`` statements).
    """
    # 'warn' is a sentinel value for ``init``: emit a FutureWarning and fall
    # back to random initialisation. The resolved value lives in self._init.
    if isinstance(self.init, str) and self.init == 'warn':
        warnings.warn(
            "The default initialization in TSNE will change "
            "from 'random' to 'pca' in 1.2.", FutureWarning)
        self._init = 'random'
    else:
        self._init = self.init

    if isinstance(self._init, str) and self._init == 'pca' and issparse(X):
        raise TypeError("PCA initialization is currently not suported "
                        "with the sparse input matrix. Use "
                        "init=\"random\" instead.")
    if self.method not in ['barnes_hut', 'exact']:
        raise ValueError("'method' must be 'barnes_hut' or 'exact'")
    if self.angle < 0.0 or self.angle > 1.0:
        raise ValueError("'angle' must be between 0.0 - 1.0")
    # Same sentinel handling for ``learning_rate``; the resolved value lives
    # in self._learning_rate.
    if self.learning_rate == 'warn':
        warnings.warn(
            "The default learning rate in TSNE will change "
            "from 200.0 to 'auto' in 1.2.", FutureWarning)
        self._learning_rate = 200.0
    else:
        self._learning_rate = self.learning_rate
    if self._learning_rate == 'auto':
        # n_samples / early_exaggeration / 4, but never below 50.
        self._learning_rate = X.shape[0] / self.early_exaggeration / 4
        self._learning_rate = np.maximum(self._learning_rate, 50)
    else:
        if not (self._learning_rate > 0):
            raise ValueError("'learning_rate' must be a positive number "
                             "or 'auto'.")
    # ``square_distances`` is only validated when the attribute exists on
    # the estimator; later reads use getattr(..., True) for the same reason.
    if hasattr(self, 'square_distances'):
        if self.square_distances not in [True, 'legacy']:
            raise ValueError(
                "'square_distances' must be True or 'legacy'.")
        if self.metric != "euclidean" and self.square_distances is not True:
            warnings.warn(
                ("'square_distances' has been introduced in 0.24"
                 "to help phase out legacy squaring behavior. The "
                 "'legacy' setting will be removed in 0.26, and the "
                 "default setting will be changed to True. In 0.28, "
                 "'square_distances' will be removed altogether,"
                 "and distances will be squared by default. Set "
                 "'square_distances'=True to silence this warning."),
                FutureWarning)

    # Input validation: self._validate_data is used when
    # sklearn_check_version('0.23') is true, plain check_array otherwise.
    if self.method == 'barnes_hut':
        if sklearn_check_version('0.23'):
            X = self._validate_data(X, accept_sparse=['csr'],
                                    ensure_min_samples=2,
                                    dtype=[np.float32, np.float64])
        else:
            X = check_array(X, accept_sparse=['csr'],
                            ensure_min_samples=2,
                            dtype=[np.float32, np.float64])
    else:
        if sklearn_check_version('0.23'):
            X = self._validate_data(X, accept_sparse=['csr', 'csc', 'coo'],
                                    dtype=[np.float32, np.float64])
        else:
            X = check_array(X, accept_sparse=['csr', 'csc', 'coo'],
                            dtype=[np.float32, np.float64])

    # Extra constraints when X already is a distance matrix.
    if self.metric == "precomputed":
        if isinstance(self._init, str) and self._init == 'pca':
            raise ValueError("The parameter init=\"pca\" cannot be "
                             "used with metric=\"precomputed\".")
        if X.shape[0] != X.shape[1]:
            raise ValueError("X should be a square distance matrix")

        check_non_negative(
            X, "TSNE.fit(). With metric='precomputed', X "
               "should contain positive distances.")

        if self.method == "exact" and issparse(X):
            raise TypeError(
                'TSNE with method="exact" does not accept sparse '
                'precomputed distance matrix. Use method="barnes_hut" '
                'or provide the dense distance matrix.')

    if self.method == 'barnes_hut' and self.n_components > 3:
        raise ValueError("'n_components' should be inferior to 4 for the "
                         "barnes_hut algorithm as it relies on "
                         "quad-tree or oct-tree.")
    random_state = check_random_state(self.random_state)

    if self.early_exaggeration < 1.0:
        raise ValueError(
            "early_exaggeration must be at least 1, but is {}".format(
                self.early_exaggeration))

    if self.n_iter < 250:
        raise ValueError("n_iter should be at least 250")

    n_samples = X.shape[0]

    # Never reassigned below, so self._tsne always receives neighbors=None.
    neighbors_nn = None
    if self.method == "exact":
        # Retrieve the distance matrix, either using the precomputed one or
        # computing it.
        if self.metric == "precomputed":
            distances = X
        else:
            if self.verbose:
                print("[t-SNE] Computing pairwise distances...")

            if self.metric == "euclidean":
                # Euclidean is squared here, rather than using **= 2,
                # because euclidean_distances already calculates
                # squared distances, and returns np.sqrt(dist) for
                # squared=False.
                # Also, Euclidean is slower for n_jobs>1, so don't set here
                distances = pairwise_distances(X, metric=self.metric,
                                               squared=True)
            else:
                distances = pairwise_distances(X, metric=self.metric,
                                               n_jobs=self.n_jobs)

        if np.any(distances < 0):
            raise ValueError("All distances should be positive, the "
                             "metric given is not correct")

        # Non-euclidean distances are squared in place unless the estimator
        # carries square_distances='legacy'.
        if self.metric != "euclidean" and \
                getattr(self, 'square_distances', True) is True:
            distances **= 2

        # compute the joint probability distribution for the input space
        P = _joint_probabilities(distances, self.perplexity, self.verbose)
        # NOTE(review): these sanity checks are plain asserts and therefore
        # disappear when Python runs with -O.
        assert np.all(np.isfinite(P)), "All probabilities should be finite"
        assert np.all(P >= 0), "All probabilities should be non-negative"
        assert np.all(P <= 1), ("All probabilities should be less "
                                "or then equal to one")

    else:
        # Compute the number of nearest neighbors to find.
        # LvdM uses 3 * perplexity as the number of neighbors.
        # In the event that we have very small # of points
        # set the neighbors to n - 1.
        n_neighbors = min(n_samples - 1, int(3. * self.perplexity + 1))

        if self.verbose:
            print("[t-SNE] Computing {} nearest neighbors...".format(
                n_neighbors))

        # Find the nearest neighbors for every point
        knn = NearestNeighbors(
            algorithm='auto',
            n_jobs=self.n_jobs,
            n_neighbors=n_neighbors,
            metric=self.metric,
        )
        t0 = time()
        knn.fit(X)
        duration = time() - t0
        if self.verbose:
            print("[t-SNE] Indexed {} samples in {:.3f}s...".format(
                n_samples, duration))

        t0 = time()
        distances_nn = knn.kneighbors_graph(mode='distance')
        duration = time() - t0
        if self.verbose:
            print("[t-SNE] Computed neighbors for {} samples "
                  "in {:.3f}s...".format(n_samples, duration))

        # Free the memory used by the ball_tree
        del knn

        if getattr(self, 'square_distances', True) is True or \
                self.metric == "euclidean":
            # knn return the euclidean distance but we need it squared
            # to be consistent with the 'exact' method. Note that the
            # the method was derived using the euclidean method as in the
            # input space. Not sure of the implication of using a different
            # metric.
            distances_nn.data **= 2

        # compute the joint probability distribution for the input space
        P = _joint_probabilities_nn(distances_nn, self.perplexity,
                                    self.verbose)

    # Initial embedding: a user-supplied array, a PCA projection, or small
    # Gaussian noise.
    if isinstance(self._init, np.ndarray):
        X_embedded = self._init
    elif self._init == 'pca':
        pca = PCA(
            n_components=self.n_components,
            svd_solver='randomized',
            random_state=random_state,
        )
        X_embedded = pca.fit_transform(X).astype(np.float32, copy=False)
        warnings.warn(
            "The PCA initialization in TSNE will change to "
            "have the standard deviation of PC1 equal to 1e-4 "
            "in 1.2. This will ensure better convergence.",
            FutureWarning)
    elif self._init == 'random':
        # The embedding is initialized with iid samples from Gaussians with
        # standard deviation 1e-4.
        X_embedded = 1e-4 * random_state.randn(
            n_samples, self.n_components).astype(np.float32)
    else:
        raise ValueError("'init' must be 'pca', 'random', or "
                         "a numpy array")

    # Degrees of freedom of the Student's t-distribution. The suggestion
    # degrees_of_freedom = n_components - 1 comes from
    # "Learning a Parametric Embedding by Preserving Local Structure"
    # Laurens van der Maaten, 2009.
    degrees_of_freedom = max(self.n_components - 1, 1)

    # The daal4py path is taken only for Barnes-Hut with a 2-D embedding,
    # verbose == 0 and a passing version check; since it requires
    # method == 'barnes_hut', P then comes from _joint_probabilities_nn.
    daal_ready = self.method == 'barnes_hut' and self.n_components == 2 and \
        self.verbose == 0 and daal_check_version((2021, 'P', 600))

    if daal_ready:
        X_embedded = check_array(X_embedded, dtype=[np.float32, np.float64])
        return self._daal_tsne(P, n_samples, X_embedded=X_embedded)
    # Fallback: the stock optimisation routine.
    return self._tsne(P, degrees_of_freedom, n_samples,
                      X_embedded=X_embedded,
                      neighbors=neighbors_nn,
                      skip_num_points=skip_num_points)
def main():
    """Rebuild a scikit-learn logistic regression in daal4py and compare.

    Fits ``LogisticRegression`` on the iris data, copies its coefficients
    and intercepts into a daal4py logistic-regression model builder, runs
    daal4py prediction on the training data and asserts that the predicted
    labels agree with scikit-learn's.

    Returns
    -------
    tuple
        ``(builder, predict_result)``: the daal4py model builder and the
        daal4py prediction result.
    """
    features, labels = load_iris(return_X_y=True)
    n_classes = 3

    # Reference model trained with scikit-learn.
    sk_model = LogisticRegression(
        fit_intercept=True, max_iter=1000, random_state=0
    ).fit(features, labels)

    # Transfer the fitted coefficients into a daal4py model.
    builder = d4p.logistic_regression_model_builder(
        n_classes=n_classes, n_features=features.shape[1]
    )
    builder.set_beta(sk_model.coef_, sk_model.intercept_)

    # Configure the daal4py prediction algorithm, then run it.
    predictor = d4p.logistic_regression_prediction(
        nClasses=n_classes, resultsToEvaluate="computeClassLabels"
    )
    daal_result = predictor.compute(features, builder.model)
    sklearn_labels = sk_model.predict(features)

    assert np.allclose(daal_result.prediction.flatten(), sklearn_labels)
    return builder, daal_result


if __name__ == "__main__":
    if daal_check_version(((2021,'P', 1))):
        builder, predict_result_daal = main()
        print("\nLogistic Regression coefficients:\n", builder.model)
        # Each attribute is looked up just before it is printed, so the
        # evaluation order matches three consecutive print calls.
        for caption, field in (
            ("\nLogistic regression prediction results (first 10 rows):\n",
             "prediction"),
            ("\nLogistic regression prediction probabilities (first 10 rows):\n",
             "probabilities"),
            ("\nLogistic regression prediction log probabilities (first 10 rows):\n",
             "logProbabilities"),
        ):
            print(caption, getattr(predict_result_daal, field)[0:10])
        print('All looks good!')
def _fit_regressor(self, X, y, sample_weight=None):
    """Fit a random-forest regressor via oneDAL when possible.

    A chain of patching conditions is evaluated. If all of them hold, ``X``
    and ``y`` are validated and converted and ``_daal_fit_regressor`` does
    the training; otherwise the call falls through to the stock
    ``RandomForestRegressor.fit``.

    Parameters
    ----------
    self : estimator being fitted
    X : training samples; converted to float64/float32 on the oneDAL path
    y : target values; sparse ``y`` is rejected
    sample_weight : optional per-sample weights, validated against ``X``

    Returns
    -------
    ``self`` on the oneDAL path, otherwise the return value of the parent
    ``fit``.

    Raises
    ------
    ValueError
        If ``y`` is a sparse matrix.
    """
    if sp.issparse(y):
        raise ValueError(
            "sparse multilabel-indicator for y is not supported."
        )

    _check_parameters(self)

    if sample_weight is not None:
        sample_weight = check_sample_weight(sample_weight, X)

    legacy_criterion = sklearn_check_version('1.0') and self.criterion == "mse"
    if legacy_criterion:
        warnings.warn(
            "Criterion 'mse' was deprecated in v1.0 and will be "
            "removed in version 1.2. Use `criterion='squared_error'` "
            "which is equivalent.",
            FutureWarning
        )

    status = PatchingConditionsChain(
        "sklearn.ensemble.RandomForestRegressor.fit")

    # (condition, message) pairs describing what the oneDAL path supports.
    requirements = [
        (self.oob_score and daal_check_version((2021, 'P', 500)) or
         not self.oob_score,
         "OOB score is only supported starting from 2021.5 version of oneDAL."),
        (self.warm_start is False, "Warm start is not supported."),
        (self.criterion in ["mse", "squared_error"],
         f"'{self.criterion}' criterion is not supported. "
         "Only 'mse' and 'squared_error' criteria are supported."),
        (self.ccp_alpha == 0.0,
         f"Non-zero 'ccp_alpha' ({self.ccp_alpha}) is not supported."),
        (not sp.issparse(X), "X is sparse. Sparse input is not supported.")
    ]
    use_onedal = status.and_conditions(requirements)

    if use_onedal:
        if sklearn_check_version("1.0"):
            self._check_feature_names(X, reset=True)
        X = check_array(X, dtype=[np.float64, np.float32])
        y = np.atleast_1d(np.asarray(y))

        is_column_vector = y.ndim == 2 and y.shape[1] == 1
        if is_column_vector:
            warnings.warn("A column-vector y was passed when a 1d array was"
                          " expected. Please change the shape of y to "
                          "(n_samples,), for example using ravel().",
                          DataConversionWarning, stacklevel=2)

        y = check_array(y, ensure_2d=False, dtype=X.dtype)
        check_consistent_length(X, y)

        if y.ndim == 1:
            # reshape is necessary to preserve the data contiguity against vs
            # [:, np.newaxis] that does not.
            y = np.reshape(y, (-1, 1))

        self.n_outputs_ = y.shape[1]
        # Only single-output regression is handed to oneDAL.
        use_onedal = status.and_conditions([
            (self.n_outputs_ == 1,
             f"Number of outputs ({self.n_outputs_}) is not 1.")])

    status.write_log()

    if not use_onedal:
        # Fall back to the parent-class implementation.
        return super(RandomForestRegressor, self).fit(
            X, y, sample_weight=sample_weight)

    _daal_fit_regressor(self, X, y, sample_weight=sample_weight)
    self.estimators_ = self._estimators_
    return self