Ejemplo n.º 1
0
def _get_map_of_algorithms():
    """Build the table of scikit-learn objects and their daal4py replacements.

    Every value is a list holding a single descriptor of the form
    ``[(module, attribute_name, replacement), None]``; the second slot is
    always ``None`` when the table is built here.  The ``'log_reg'`` and
    ``'roc_auc_score'`` entries are added only when ``daal_check_version``
    accepts ``(2021, 'P', 1)`` and ``(2021, 'P', 2)`` respectively.

    Returns
    -------
    dict
        Freshly built mapping from algorithm key to its descriptor list.
    """
    # (key, module to patch, attribute name in that module, replacement)
    unconditional = (
        ('pca', decomposition_module, 'PCA', PCA_daal4py),
        ('kmeans', cluster_module, 'KMeans', KMeans_daal4py),
        ('dbscan', cluster_module, 'DBSCAN', DBSCAN_daal4py),
        ('distances', pairwise, 'pairwise_distances',
         daal_pairwise_distances),
        ('linear', linear_model_module, 'LinearRegression',
         LinearRegression_daal4py),
        ('ridge', linear_model_module, 'Ridge', Ridge_daal4py),
        ('elasticnet', linear_model_module, 'ElasticNet',
         ElasticNet_daal4py),
        ('lasso', linear_model_module, 'Lasso', Lasso_daal4py),
        ('svm', svm_module, 'SVC', SVC_daal4py),
        ('logistic', logistic_module, _patched_log_reg_path_func_name,
         daal_optimized_logistic_path),
        ('knn_classifier', neighbors_module, 'KNeighborsClassifier',
         KNeighborsClassifier_daal4py),
        ('nearest_neighbors', neighbors_module, 'NearestNeighbors',
         NearestNeighbors_daal4py),
        ('knn_regressor', neighbors_module, 'KNeighborsRegressor',
         KNeighborsRegressor_daal4py),
        ('random_forest_classifier', ensemble_module,
         'RandomForestClassifier', RandomForestClassifier_daal4py),
        ('random_forest_regressor', ensemble_module,
         'RandomForestRegressor', RandomForestRegressor_daal4py),
        ('train_test_split', model_selection, 'train_test_split',
         _daal_train_test_split),
        ('fin_check', validation, '_assert_all_finite',
         _daal_assert_all_finite),
        ('tsne', manifold_module, 'TSNE', TSNE_daal4py),
    )
    # A new nested list is created per key, so descriptors are never shared.
    mapping = {
        key: [[(module, attr_name, replacement), None]]
        for key, module, attr_name, replacement in unconditional
    }

    if daal_check_version((2021, 'P', 1)):
        mapping['log_reg'] = [[
            (linear_model_module, 'LogisticRegression',
             LogisticRegression_daal4py), None]]
    if daal_check_version((2021, 'P', 2)):
        mapping['roc_auc_score'] = [[
            (metrics, 'roc_auc_score', _daal_roc_auc_score), None]]
    return mapping
def test_models(model_head):
    """Run the shared model check for one case unless the model is skipped.

    Parameters
    ----------
    model_head : mapping
        Must provide the keys ``'model'``, ``'methods'`` and ``'dataset'``,
        which are forwarded to ``_run_test``.
    """
    model_name = get_class_name(model_head['model'])
    # The random forests are taken off the skip list when daal_check_version
    # accepts (2021, 'P', 200).  TO_SKIP is defined outside this function and
    # is mutated here, so on a later call the name may already be gone; the
    # membership guard keeps remove() from raising in that case.
    if model_name in ('RandomForestClassifier', 'RandomForestRegressor') \
            and model_name in TO_SKIP \
            and daal_check_version((2021, 'P', 200)):
        TO_SKIP.remove(model_name)
    if model_name in TO_SKIP:
        pytest.skip("Unstable", allow_module_level=False)
    _run_test(model_head['model'], model_head['methods'],
              model_head['dataset'])
Ejemplo n.º 3
0
def _fit_classifier(self, X, y, sample_weight=None):
    """Fit the classifier with oneDAL when all conditions hold.

    The patching conditions below are recorded in a
    ``PatchingConditionsChain``; when any of them fails, the call is
    delegated to the parent class ``fit`` of ``RandomForestClassifier``.

    Parameters
    ----------
    X : training data; must not be sparse for the oneDAL path.
    y : target values; sparse ``y`` is rejected outright.
    sample_weight : optional sample weights, validated with
        ``check_sample_weight`` when given.

    Returns
    -------
    ``self`` on the oneDAL path, otherwise whatever the parent ``fit``
    returns.

    Raises
    ------
    ValueError
        If ``y`` is sparse.
    """
    if sp.issparse(y):
        raise ValueError(
            "sparse multilabel-indicator for y is not supported."
        )
    _check_parameters(self)
    if sample_weight is not None:
        sample_weight = check_sample_weight(sample_weight, X)

    # First group of conditions depends only on hyper-parameters and on
    # whether X is sparse.
    _patching_status = PatchingConditionsChain(
        "sklearn.ensemble.RandomForestClassifier.fit")
    _dal_ready = _patching_status.and_conditions([
        (self.oob_score and daal_check_version((2021, 'P', 500)) or not self.oob_score,
            "OOB score is only supported starting from 2021.5 version of oneDAL."),
        (self.warm_start is False, "Warm start is not supported."),
        (self.criterion == "gini",
            f"'{self.criterion}' criterion is not supported. "
            "Only 'gini' criterion is supported."),
        (self.ccp_alpha == 0.0,
            f"Non-zero 'ccp_alpha' ({self.ccp_alpha}) is not supported."),
        (not sp.issparse(X), "X is sparse. Sparse input is not supported.")
    ])

    if _dal_ready:
        # Input validation is done only on the candidate oneDAL path; note
        # that X and y are rebound here, so a later fallback receives the
        # converted values.
        if sklearn_check_version("1.0"):
            self._check_feature_names(X, reset=True)
        X = check_array(X, dtype=[np.float32, np.float64])
        y = np.asarray(y)
        y = np.atleast_1d(y)

        if y.ndim == 2 and y.shape[1] == 1:
            warnings.warn("A column-vector y was passed when a 1d array was"
                          " expected. Please change the shape of y to "
                          "(n_samples,), for example using ravel().",
                          DataConversionWarning, stacklevel=2)

        check_consistent_length(X, y)

        if y.ndim == 1:
            # reshape is necessary to preserve the data contiguity, unlike
            # [:, np.newaxis], which does not.
            y = np.reshape(y, (-1, 1))

        self.n_outputs_ = y.shape[1]
        # Second condition: only single-output targets go to oneDAL.
        _dal_ready = _patching_status.and_conditions([
            (self.n_outputs_ == 1, f"Number of outputs ({self.n_outputs_}) is not 1.")])

    _patching_status.write_log()
    if _dal_ready:
        _daal_fit_classifier(self, X, y, sample_weight=sample_weight)

        self.estimators_ = self._estimators_

        # Decapsulate classes_ attributes
        self.n_classes_ = self.n_classes_[0]
        self.classes_ = self.classes_[0]
        return self
    # Fallback to the parent class implementation.
    return super(RandomForestClassifier, self).fit(
        X, y, sample_weight=sample_weight)
Ejemplo n.º 4
0
    def predict_proba(self, X):
        """
        Predict class probabilities for X.

        The oneDAL implementation is used when a oneDAL model has been
        trained, X is not sparse, ``daal_check_version`` accepts
        ``(2021, 'P', 400)`` and, if ``n_outputs_`` is set, it equals 1.
        Otherwise the call is delegated to the parent class
        ``predict_proba``.

        Parameters
        ----------
        X : {array-like, sparse matrix} of shape (n_samples, n_features)
            The input samples.  On the oneDAL path the data is validated
            with ``check_array`` for float64/float32 dtypes.

        Returns
        -------
        p : class probabilities as returned by ``_daal_predict_proba`` or by
            the parent class implementation.

        Raises
        ------
        ValueError
            If ``n_features_in_`` is set and does not match the feature
            count determined for X.
        """
        if sklearn_check_version("1.0"):
            self._check_feature_names(X, reset=False)

        if hasattr(self, 'n_features_in_'):
            try:
                seen_features = _daal_num_features(X)
            except TypeError:
                seen_features = _num_samples(X)
            if seen_features != self.n_features_in_:
                raise ValueError(
                    (f'X has {seen_features} features, '
                     f'but RandomForestClassifier is expecting '
                     f'{self.n_features_in_} features as input'))

        status = PatchingConditionsChain(
            "sklearn.ensemble.RandomForestClassifier.predict_proba")
        base_conditions = [
            (hasattr(self, 'daal_model_'), "oneDAL model was not trained."),
            (not sp.issparse(X), "X is sparse. Sparse input is not supported."),
            (daal_check_version((2021, 'P', 400)),
                "oneDAL version is lower than 2021.4."),
        ]
        use_onedal = status.and_conditions(base_conditions)
        if hasattr(self, 'n_outputs_'):
            output_condition = (
                self.n_outputs_ == 1,
                f"Number of outputs ({self.n_outputs_}) is not 1.")
            use_onedal = status.and_conditions([output_condition])
        status.write_log()

        if use_onedal:
            X = check_array(X, dtype=[np.float64, np.float32])
            check_is_fitted(self)
            if sklearn_check_version('0.23'):
                self._check_n_features(X, reset=False)
            return _daal_predict_proba(self, X)
        return super(RandomForestClassifier, self).predict_proba(X)
Ejemplo n.º 5
0
def test_sklearnex_import_rf_regression():
    """Check that sklearnex's RandomForestRegressor is the daal4py-backed one.

    The expected prediction for the all-zero sample depends on whether
    ``daal_check_version`` accepts ``(2021, 'P', 400)``.
    """
    from sklearnex.ensemble import RandomForestRegressor

    features, target = make_regression(n_features=4, n_informative=2,
                                       random_state=0, shuffle=False)
    forest = RandomForestRegressor(max_depth=2, random_state=0)
    fitted = forest.fit(features, target)
    assert 'daal4py' in fitted.__module__

    expected = -6.97 if daal_check_version((2021, 'P', 400)) else -6.66
    assert_allclose([expected], fitted.predict([[0, 0, 0, 0]]), atol=1e-2)
def test_models(model_head):
    """Run the shared model check for one case unless the model is skipped.

    ``model_head`` must provide the keys ``'model'``, ``'methods'`` and
    ``'dataset'``.  The list of stable algorithms is empty here, so no name
    is ever taken off ``TO_SKIP`` by this version.
    """
    name = get_class_name(model_head['model'])
    stable_algos = []
    promoted = name in stable_algos and daal_check_version((2021, 'P', 300))
    if promoted:
        try:
            TO_SKIP.remove(name)
        except ValueError:
            # Already removed by an earlier invocation.
            pass
    if name in TO_SKIP:
        pytest.skip("Unstable", allow_module_level=False)
    _run_test(model_head['model'], model_head['methods'], model_head['dataset'])
Ejemplo n.º 7
0
def test_models(model_head):
    """Run the shared model check for one case unless the model is skipped.

    Parameters
    ----------
    model_head : mapping
        Must provide the keys ``'model'``, ``'methods'`` and ``'dataset'``,
        which are forwarded to ``_run_test``.
    """
    stable_algos = (
        'RandomForestClassifier', 'RandomForestRegressor', 'PCA',
        'LinearRegression', 'Ridge'
    )
    model_name = get_class_name(model_head['model'])
    # Stable algorithms are taken off the skip list when daal_check_version
    # accepts (2021, 'P', 200).  TO_SKIP is defined outside this function and
    # is mutated here, so the name may be absent (never listed, or removed by
    # an earlier call); the membership guard keeps remove() from raising.
    if model_name in stable_algos \
            and model_name in TO_SKIP \
            and daal_check_version((2021, 'P', 200)):
        TO_SKIP.remove(model_name)
    if model_name in TO_SKIP:
        pytest.skip("Unstable", allow_module_level=False)
    _run_test(model_head['model'], model_head['methods'],
              model_head['dataset'])
Ejemplo n.º 8
0
def get_patch_map():
    """Return the daal4py patch map with SVM entries swapped for sklearnex.

    The base map comes from daal4py's ``_get_map_of_algorithms`` and is
    shallow-copied before modification.  When ``daal_check_version`` accepts
    ``(2021, 'P', 300)``, the daal4py SVM entries are dropped and the
    sklearnex ``SVR``, ``SVC``, ``NuSVR`` and ``NuSVC`` replacements are
    registered instead.

    Returns
    -------
    dict
        Mapping from algorithm key to its list of patch descriptors.
    """
    from daal4py.sklearn.monkeypatch.dispatcher import _get_map_of_algorithms
    mapping = _get_map_of_algorithms().copy()

    if daal_check_version((2021, 'P', 300)):
        # The base map may expose the daal4py SVC under 'svm', 'svc', both or
        # neither; pop with a default so a missing key is not a KeyError.
        mapping.pop('svm', None)
        mapping.pop('svc', None)
        mapping['svr'] = [[(svm_module, 'SVR', SVR_sklearnex), None]]
        mapping['svc'] = [[(svm_module, 'SVC', SVC_sklearnex), None]]
        mapping['nusvr'] = [[(svm_module, 'NuSVR', NuSVR_sklearnex), None]]
        mapping['nusvc'] = [[(svm_module, 'NuSVC', NuSVC_sklearnex), None]]
    return mapping
Ejemplo n.º 9
0
def _daal_predict_proba(self, X):
    """Compute class probabilities for X with the stored oneDAL model.

    When ``daal_check_version`` rejects ``(2021, 'P', 200)``, X is first
    passed through ``self._validate_X_predict``.  Returns the
    ``probabilities`` attribute of the daal4py prediction result.
    """
    needs_validation = not daal_check_version((2021, 'P', 200))
    if needs_validation:
        X = self._validate_X_predict(X)

    fptype = getFPType(X)
    predictor = daal4py.decision_forest_classification_prediction(
        nClasses=int(self.n_classes_),
        fptype=fptype,
        resultsToEvaluate="computeClassProbabilities")
    result = predictor.compute(X, self.daal_model_)
    return result.probabilities
Ejemplo n.º 10
0
def _daal_predict_regressor(self, X):
    """Predict regression targets for X with the stored oneDAL model.

    Raises ``ValueError`` when the second dimension of X differs from
    ``self.n_features_in_``.  Returns the flattened ``prediction`` of the
    daal4py result.
    """
    n_columns = X.shape[1]
    if n_columns != self.n_features_in_:
        raise ValueError((f'X has {n_columns} features, '
                          f'but RandomForestRegressor is expecting '
                          f'{self.n_features_in_} features as input'))

    needs_validation = not daal_check_version((2021, 'P', 200))
    if needs_validation:
        X = self._validate_X_predict(X)

    fptype = getFPType(X)
    predictor = daal4py.decision_forest_regression_prediction(fptype=fptype)
    result = predictor.compute(X, self.daal_model_)
    return result.prediction.ravel()
Ejemplo n.º 11
0
def test_models(model_head):
    """Run the shared model check for one case unless the model is skipped.

    ``model_head`` must provide the keys ``'model'``, ``'methods'`` and
    ``'dataset'``.  Algorithms listed as stable are taken off ``TO_SKIP``
    when ``daal_check_version`` accepts ``(2021, 'P', 200)``.
    """
    stable_algos = (
        'RandomForestClassifier', 'RandomForestRegressor', 'PCA',
        'LinearRegression', 'Ridge', 'KNeighborsClassifier',
        'NearestNeighbors', 'KMeans', 'ElasticNet', 'Lasso',
    )
    name = get_class_name(model_head['model'])
    promoted = name in stable_algos and daal_check_version((2021, 'P', 200))
    if promoted:
        try:
            TO_SKIP.remove(name)
        except ValueError:
            # Already removed by an earlier invocation.
            pass
    if name in TO_SKIP:
        pytest.skip("Unstable", allow_module_level=False)
    _run_test(model_head['model'], model_head['methods'],
              model_head['dataset'])
Ejemplo n.º 12
0
    def predict_proba(self, X):
        """
        Predict class probabilities for X.

        The oneDAL implementation is used when a oneDAL model has been
        trained, X is not sparse, ``n_outputs_`` equals 1 and
        ``daal_check_version`` accepts ``(2021, 'P', 400)``.  Otherwise the
        call is delegated to the parent class ``predict_proba``.  The chosen
        path is reported through ``logging.info``.

        Parameters
        ----------
        X : {array-like, sparse matrix} of shape (n_samples, n_features)
            The input samples.  On the oneDAL path the data is validated
            with ``check_array`` for float64/float32 dtypes.

        Returns
        -------
        p : class probabilities as returned by ``_daal_predict_proba`` or by
            the parent class implementation.

        Raises
        ------
        ValueError
            If ``n_features_in_`` is set and does not match the feature
            count determined for X.
        """
        if hasattr(self, 'n_features_in_'):
            try:
                seen_features = _daal_num_features(X)
            except TypeError:
                seen_features = _num_samples(X)
            if seen_features != self.n_features_in_:
                raise ValueError((f'X has {seen_features} features, '
                                  f'but RandomForestClassifier is expecting '
                                  f'{self.n_features_in_} features as input'))

        log_prefix = "sklearn.ensemble.RandomForestClassifier.predict_proba: "
        # Same checks, in the same order, as the fallback test they replace.
        use_onedal = (hasattr(self, 'daal_model_')
                      and not sp.issparse(X)
                      and self.n_outputs_ == 1
                      and daal_check_version((2021, 'P', 400)))
        if use_onedal:
            logging.info(log_prefix + get_patch_message("daal"))
            X = check_array(X, dtype=[np.float64, np.float32])
            check_is_fitted(self)
            if sklearn_check_version('0.23'):
                self._check_n_features(X, reset=False)
            return _daal_predict_proba(self, X)

        logging.info(log_prefix + get_patch_message("sklearn"))
        return super(RandomForestClassifier, self).predict_proba(X)
Ejemplo n.º 13
0
def _daal_predict_classifier(self, X):
    """Predict class labels for X with the stored oneDAL model.

    The daal4py result holds indices, which are cast to int64 and used to
    look up the labels in ``self.classes_``.  Raises ``ValueError`` when
    the second dimension of X differs from ``self.n_features_in_``.
    """
    needs_validation = not daal_check_version((2021, 'P', 200))
    if needs_validation:
        X = self._validate_X_predict(X)

    fptype = getFPType(X)
    predictor = daal4py.decision_forest_classification_prediction(
        nClasses=int(self.n_classes_),
        fptype=fptype,
        resultsToEvaluate="computeClassLabels")

    n_columns = X.shape[1]
    if n_columns != self.n_features_in_:
        raise ValueError((f'X has {n_columns} features, '
                          f'but RandomForestClassifier is expecting '
                          f'{self.n_features_in_} features as input'))

    result = predictor.compute(X, self.daal_model_)
    class_indices = result.prediction.ravel().astype(np.int64,
                                                     casting='unsafe')
    return np.take(self.classes_, class_indices)
import random
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.metrics import (accuracy_score, roc_auc_score,
                             mean_squared_error, log_loss)
from sklearn.ensemble \
    import RandomForestClassifier as ScikitRandomForestClassifier
from daal4py.sklearn.ensemble \
    import RandomForestClassifier as DaalRandomForestClassifier
from sklearn.ensemble \
    import RandomForestRegressor as ScikitRandomForestRegressor
from daal4py.sklearn.ensemble \
    import RandomForestRegressor as DaalRandomForestRegressor
from daal4py.sklearn._utils import daal_check_version

# Module-level thresholds and shared fixtures.
# ACCURACY_RATIO and LOG_LOSS_RATIO take a different value depending on
# whether daal_check_version accepts (2021, 'P', 400).
# NOTE(review): presumably these are daal4py-vs-scikit-learn metric ratios
# used by the comparison helpers in this module; confirm against their use.
ACCURACY_RATIO = 0.95 if daal_check_version((2021, 'P', 400)) else 0.85
MSE_RATIO = 1.07
LOG_LOSS_RATIO = 1.4 if daal_check_version((2021, 'P', 400)) else 1.55
ROC_AUC_RATIO = 0.96
# Fixed-seed NumPy random state and the iris dataset, created once at import.
RNG = np.random.RandomState(0)
IRIS = load_iris()


def _compare_with_sklearn_classifier_iris(n_estimators=100, class_weight=None,
                                          sample_weight=None, description=""):
    x_train, x_test, y_train, y_test = \
        train_test_split(IRIS.data, IRIS.target,
                         test_size=0.33, random_state=31)
    # models
    scikit_model = ScikitRandomForestClassifier(n_estimators=n_estimators,
                                                class_weight=class_weight,
Ejemplo n.º 15
0
                    None]],
    'lasso': [[(linear_model_module, 'Lasso', Lasso_daal4py), None]],
    'svm': [[(svm_module, 'SVC', SVC_daal4py), None]],
    'logistic': [[(logistic_module, _patched_log_reg_path_func_name,
                   daal_optimized_logistic_path), None]],
}

# The helper name is only needed while building _mapping above; drop it from
# the module namespace.
del _patched_log_reg_path_func_name

# DBSCAN is registered only if its daal4py implementation can be imported.
try:
    from ..cluster.dbscan import DBSCAN as DBSCAN_daal4py
    _mapping['dbscan'] = [[(cluster_module, 'DBSCAN', DBSCAN_daal4py), None]]
except ImportError:
    pass

# The two entries below are registered only when daal_check_version accepts
# the given version rules.
if daal_check_version((2020, 1), (2021, 5)):
    _mapping['fin_check'] = [[(validation, '_assert_all_finite',
                               _daal_assert_all_finite), None]]

if daal_check_version((2020, 2), (2021, 8)):
    _mapping['tt_split'] = [[(model_selection, 'train_test_split',
                              _daal_train_test_split), None]]


def do_patch(name):
    lname = name.lower()
    if lname in _mapping:
        for descriptor in _mapping[lname]:
            which, what, replacer = descriptor[0]
            if descriptor[1] is None:
                descriptor[1] = getattr(which, what, None)
Ejemplo n.º 16
0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#===============================================================================

# Other imports
import sys
from distutils.version import LooseVersion
from functools import lru_cache
from daal4py.sklearn._utils import daal_check_version

# Classes for patching
if daal_check_version((2021, 'P', 300)):
    from .svm import SVR as SVR_sklearnex
    from .svm import SVC as SVC_sklearnex
    from .svm import NuSVR as NuSVR_sklearnex
    from .svm import NuSVC as NuSVC_sklearnex

# Scikit-learn* modules
import sklearn.svm as svm_module


@lru_cache(maxsize=None)
def get_patch_map():
    from daal4py.sklearn.monkeypatch.dispatcher import _get_map_of_algorithms
    mapping = _get_map_of_algorithms().copy()

    if daal_check_version((2021, 'P', 300)):
Ejemplo n.º 17
0
def _is_new_patching_available():
    return os.environ.get('OFF_ONEDAL_IFACE') is None \
        and daal_check_version((2021, 'P', 300))
Ejemplo n.º 18
0
class LogRegModelBuilder(unittest.TestCase):
    """Check daal4py's logistic regression model builder against scikit-learn.

    Each test fits a scikit-learn ``LogisticRegression``, copies its
    coefficients and intercept into a daal4py model via
    ``logistic_regression_model_builder`` and asserts that daal4py's
    predictions on the training data match scikit-learn's.
    """

    # One skip decorator shared by all tests: they run only when daal4py
    # exposes the model builder and daal_check_version accepts (2021, 'P', 1).
    _requires_model_builder = unittest.skipUnless(
        hasattr(d4p, 'logistic_regression_model_builder')
        and daal_check_version((2021, 'P', 1)),
        str((2021, 'P', 1)) + " not supported in this library version "
        + str(daal_version))

    def _check_builder_matches_sklearn(self, X, y, n_classes, fit_intercept,
                                       max_iter):
        """Fit scikit-learn, rebuild the model in daal4py, compare labels."""
        clf = LogisticRegression(fit_intercept=fit_intercept,
                                 max_iter=max_iter, random_state=0).fit(X, y)
        builder = d4p.logistic_regression_model_builder(
            n_classes=n_classes, n_features=X.shape[1])
        builder.set_beta(clf.coef_, clf.intercept_)

        alg_pred = d4p.logistic_regression_prediction(nClasses=n_classes)

        pred_daal = alg_pred.compute(X, builder.model).prediction.flatten()
        pred_sklearn = clf.predict(X)
        self.assertTrue(np.allclose(pred_daal, pred_sklearn))

    @_requires_model_builder
    def test_iris_with_intercept(self):
        X, y = load_iris(return_X_y=True)
        self._check_builder_matches_sklearn(
            X, y, n_classes=3, fit_intercept=True, max_iter=1000)

    @_requires_model_builder
    def test_iris_without_intercept(self):
        X, y = load_iris(return_X_y=True)
        self._check_builder_matches_sklearn(
            X, y, n_classes=3, fit_intercept=False, max_iter=1000)

    @_requires_model_builder
    def test_breast_cancer_with_intercept(self):
        X, y = load_breast_cancer(return_X_y=True)
        self._check_builder_matches_sklearn(
            X, y, n_classes=2, fit_intercept=True, max_iter=10000)

    @_requires_model_builder
    def test_breast_cancer_without_intercept(self):
        X, y = load_breast_cancer(return_X_y=True)
        self._check_builder_matches_sklearn(
            X, y, n_classes=2, fit_intercept=False, max_iter=10000)

    # The decorator is only needed while the class body executes.
    del _requires_model_builder
Ejemplo n.º 19
0
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.metrics import (accuracy_score, roc_auc_score, mean_squared_error,
                             log_loss)
from sklearn.ensemble \
    import RandomForestClassifier as ScikitRandomForestClassifier
from daal4py.sklearn.ensemble \
    import RandomForestClassifier as DaalRandomForestClassifier
from sklearn.ensemble \
    import RandomForestRegressor as ScikitRandomForestRegressor
from daal4py.sklearn.ensemble \
    import RandomForestRegressor as DaalRandomForestRegressor
from daal4py.sklearn._utils import daal_check_version

# Module-level thresholds and shared fixtures.
# ACCURACY_RATIO, MSE_RATIO and LOG_LOSS_RATIO take a different value
# depending on whether daal_check_version accepts (2021, 'P', 200).
# NOTE(review): presumably N_TRIES is a repetition count and the ratios are
# daal4py-vs-scikit-learn metric bounds; confirm against their use below.
N_TRIES = 10
ACCURACY_RATIO = 0.85 if daal_check_version((2021, 'P', 200)) else 0.7
MSE_RATIO = 1.05 if daal_check_version((2021, 'P', 200)) else 1.42
LOG_LOSS_RATIO = 1.55 if daal_check_version((2021, 'P', 200)) else 2.28
ROC_AUC_RATIO = 0.978
# The iris dataset is loaded once at import time.
IRIS = load_iris()

random.seed(777)
CLASS_WEIGHTS_IRIS = [
    {
        0: 0,
        1: 0,
        2: 0
    },
    {
        0: 0,
        1: 1,
Ejemplo n.º 20
0
def _daal_train_test_split(*arrays, **options):
    """Split arrays into train and test subsets, using daal4py where possible.

    Parameters
    ----------
    *arrays : one or more indexable inputs; the sample count is taken from
        the first one.
    **options : accepted keys are ``test_size``, ``train_size``,
        ``random_state``, ``stratify``, ``shuffle`` (default ``True``) and
        ``rng`` (default ``'OPTIMIZED_MT19937'``).

    Returns
    -------
    list
        For every input array, its train part followed by its test part.

    Raises
    ------
    ValueError
        If no arrays are given, ``rng`` is not one of the available
        generators, ``stratify`` is combined with ``shuffle=False``, or an
        array cannot be converted to the needed format.
    TypeError
        If any unrecognized option is passed.
    """
    n_arrays = len(arrays)
    if n_arrays == 0:
        raise ValueError("At least one array required as input")
    test_size = options.pop('test_size', None)
    train_size = options.pop('train_size', None)
    random_state = options.pop('random_state', None)
    stratify = options.pop('stratify', None)
    shuffle = options.pop('shuffle', True)
    rng = options.pop('rng', 'OPTIMIZED_MT19937')

    available_rngs = [
        'default', 'MT19937', 'SFMT19937', 'MT2203', 'R250', 'WH', 'MCG31',
        'MCG59', 'MRG32K3A', 'PHILOX4X32X10', 'NONDETERM', 'OPTIMIZED_MT19937'
    ]
    if rng not in available_rngs:
        raise ValueError("Wrong random numbers generator is chosen. "
                         "Available generators: %s" %
                         str(available_rngs)[1:-1])

    # Every known option has been popped; anything left is unsupported.
    if options:
        raise TypeError("Invalid parameters passed: %s" % str(options))

    arrays = indexable(*arrays)

    n_samples = _num_samples(arrays[0])
    n_train, n_test = _validate_shuffle_split(n_samples,
                                              test_size,
                                              train_size,
                                              default_test_size=0.25)
    # Step 1: compute the train/test index arrays.
    if shuffle is False:
        if stratify is not None:
            raise ValueError(
                "Stratified train/test split is not implemented for "
                "shuffle=False")

        train = np.arange(n_train)
        test = np.arange(n_train, n_train + n_test)
    else:
        if stratify is not None:
            cv = StratifiedShuffleSplit(test_size=n_test,
                                        train_size=n_train,
                                        random_state=random_state)
            train, test = next(cv.split(X=arrays[0], y=stratify))
        else:
            # mkl_random path: an explicitly chosen generator with an int or
            # None seed.
            if mkl_random_is_imported and rng not in [
                    'default', 'OPTIMIZED_MT19937'
            ] and (isinstance(random_state, int) or random_state is None):
                random_state = mkl_random.RandomState(random_state, rng)
                indexes = random_state.permutation(n_train + n_test)
                test, train = indexes[:n_test], indexes[n_test:]
            # daal4py path: default generator, supported library version,
            # int or None seed, and not on Windows.
            elif rng == 'OPTIMIZED_MT19937' and daal_check_version(((2020,'P', 3), (2021,'B',9))) \
            and (isinstance(random_state, int) or random_state is None) \
            and platform.system() != 'Windows':
                # int32 indices unless the total count exceeds 2**31 - 1.
                indexes = np.empty(shape=(n_train + n_test, ),
                                   dtype=np.int64 if
                                   n_train + n_test > 2**31 - 1 else np.int32)
                random_state = np.random.RandomState(random_state)
                random_state = random_state.get_state()[1]
                d4p.daal_generate_shuffled_indices([indexes], [random_state])
                test, train = indexes[:n_test], indexes[n_test:]
            else:
                # Everything else goes through scikit-learn's ShuffleSplit.
                cv = ShuffleSplit(test_size=n_test,
                                  train_size=n_train,
                                  random_state=random_state)
                train, test = next(cv.split(X=arrays[0], y=stratify))

    # Step 2: extract the train/test rows from every input array.
    res = []
    for arr in arrays:
        # fallback=True means the array is indexed with safe_indexing instead
        # of daal4py.
        fallback = False

        # input format check
        if not isinstance(arr, np.ndarray):
            if pandas_is_imported:
                if not isinstance(arr,
                                  pd.core.frame.DataFrame) and not isinstance(
                                      arr, pd.core.series.Series):
                    fallback = True
            else:
                fallback = True

        # dimensions check
        if hasattr(arr, 'ndim'):
            if arr.ndim > 2:
                fallback = True
        else:
            fallback = True

        # data types check
        dtypes = get_dtypes(arr)
        if dtypes is None:
            fallback = True
        else:
            for i, dtype in enumerate(dtypes):
                if 'float' not in str(dtype) and 'int' not in str(dtype):
                    fallback = True
                    break

        if fallback:
            res.append(safe_indexing(arr, train))
            res.append(safe_indexing(arr, test))
        else:

            # 1-D inputs are processed as a single column and flattened back
            # afterwards.
            if len(arr.shape) == 2:
                n_cols = arr.shape[1]
                reshape_later = False
            else:
                n_cols = 1
                reshape_later = True

            # d4p.get_data yields either one ndarray or a list of arrays;
            # the two cases are handled separately below.
            arr_copy = d4p.get_data(arr)
            if not isinstance(arr_copy, list):
                arr_copy = arr_copy.reshape((arr_copy.shape[0], n_cols),
                                            order='A')
            if isinstance(arr_copy, np.ndarray):
                # Output buffers keep the dtype and memory order of the input.
                order = 'C' if arr_copy.flags['C_CONTIGUOUS'] else 'F'
                train_arr = np.empty(shape=(n_train, n_cols),
                                     dtype=arr_copy.dtype,
                                     order=order)
                test_arr = np.empty(shape=(n_test, n_cols),
                                    dtype=arr_copy.dtype,
                                    order=order)
                d4p.daal_train_test_split(arr_copy, train_arr, test_arr,
                                          [train], [test])
                if reshape_later:
                    train_arr, test_arr = train_arr.reshape(
                        (n_train, )), test_arr.reshape((n_test, ))
            elif isinstance(arr_copy, list):
                # One 1-D output buffer per element of the list.
                train_arr = [
                    np.empty(shape=(n_train, ),
                             dtype=el.dtype,
                             order='C' if el.flags['C_CONTIGUOUS'] else 'F')
                    for el in arr_copy
                ]
                test_arr = [
                    np.empty(shape=(n_test, ),
                             dtype=el.dtype,
                             order='C' if el.flags['C_CONTIGUOUS'] else 'F')
                    for el in arr_copy
                ]
                d4p.daal_train_test_split(arr_copy, train_arr, test_arr,
                                          [train], [test])
                # Key the buffers by the input's column labels.
                train_arr = {
                    col: train_arr[i]
                    for i, col in enumerate(arr.columns)
                }
                test_arr = {
                    col: test_arr[i]
                    for i, col in enumerate(arr.columns)
                }
            else:
                raise ValueError('Array can\'t be converted to needed format')

            # Wrap the results back into the pandas type of the input.
            if pandas_is_imported:
                if isinstance(arr, pd.core.frame.DataFrame):
                    train_arr, test_arr = pd.DataFrame(
                        train_arr), pd.DataFrame(test_arr)
                if isinstance(arr, pd.core.series.Series):
                    train_arr, test_arr = train_arr.reshape(
                        n_train), test_arr.reshape(n_test)
                    train_arr, test_arr = pd.Series(train_arr), pd.Series(
                        test_arr)

            # Inputs that carry an index get the selected positions as the
            # index of the outputs.
            if hasattr(arr, 'index'):
                train_arr.index = train
                test_arr.index = test

            res.append(train_arr)
            res.append(test_arr)

    return res
Ejemplo n.º 21
0
def _daal_fit_classifier(self, X, y, sample_weight=None):
    """Train a daal4py decision forest classifier and store it on ``self``.

    Sets ``self.n_features_``, ``self._cached_estimators_`` and
    ``self.daal_model_``; when ``self.oob_score`` is truthy it also sets
    ``self.estimators_`` and calls ``self._set_oob_score``.

    Parameters
    ----------
    X : training data; ``X.shape`` is read, so it must be array-like with
        two dimensions.
    y : target values, validated with ``check_array`` and
        ``self._validate_y_class_weight``.
    sample_weight : optional sample weights; multiplied by the expanded
        class weights when both are present.

    Returns
    -------
    ``self``.

    Raises
    ------
    ValueError
        If fewer than two classes are found, or ``oob_score`` is requested
        with ``bootstrap`` disabled.
    """
    y = check_array(y, ensure_2d=False, dtype=None)
    y, expanded_class_weight = self._validate_y_class_weight(y)
    n_classes_ = self.n_classes_[0]
    self.n_features_ = X.shape[1]

    # Fold class weights into the per-sample weights.
    if expanded_class_weight is not None:
        if sample_weight is not None:
            sample_weight = sample_weight * expanded_class_weight
        else:
            sample_weight = expanded_class_weight
    if sample_weight is not None:
        # The weights are passed to compute() wrapped in a list.
        sample_weight = [sample_weight]

    # Draw an integer seed for the daal4py engine from random_state.
    rs_ = check_random_state(self.random_state)
    seed_ = rs_.randint(0, np.iinfo('i').max)

    if n_classes_ < 2:
        raise ValueError(
            "Training data only contain information about one class.")

    # create algorithm
    X_fptype = getFPType(X)
    daal_engine_ = daal4py.engines_mt2203(seed=seed_, fptype=X_fptype)
    features_per_node_ = _to_absolute_max_features(self.max_features,
                                                   X.shape[1],
                                                   is_classification=True)

    n_samples_bootstrap_ = _get_n_samples_bootstrap(
        n_samples=X.shape[0], max_samples=self.max_samples)

    if not self.bootstrap and self.oob_score:
        raise ValueError("Out of bag estimation only available"
                         " if bootstrap=True")

    # Unset scikit-learn parameters (max_depth, min_impurity_split,
    # max_leaf_nodes) are passed as 0; float min_samples_* values are
    # converted to counts with ceil(fraction * n_samples).
    dfc_algorithm = daal4py.decision_forest_classification_training(
        nClasses=int(n_classes_),
        fptype=X_fptype,
        method='hist' if daal_check_version(
            (2021, 'P', 200)) else 'defaultDense',
        nTrees=int(self.n_estimators),
        observationsPerTreeFraction=n_samples_bootstrap_
        if self.bootstrap is True else 1.,
        featuresPerNode=int(features_per_node_),
        maxTreeDepth=int(0 if self.max_depth is None else self.max_depth),
        minObservationsInLeafNode=(self.min_samples_leaf if isinstance(
            self.min_samples_leaf, numbers.Integral) else int(
                ceil(self.min_samples_leaf * X.shape[0]))),
        engine=daal_engine_,
        impurityThreshold=float(0.0 if self.min_impurity_split is None else
                                self.min_impurity_split),
        varImportance="MDI",
        resultsToCompute="",
        memorySavingMode=False,
        bootstrap=bool(self.bootstrap),
        minObservationsInSplitNode=(self.min_samples_split if isinstance(
            self.min_samples_split, numbers.Integral) else int(
                ceil(self.min_samples_split * X.shape[0]))),
        minWeightFractionInLeafNode=self.min_weight_fraction_leaf,
        minImpurityDecreaseInSplitNode=self.min_impurity_decrease,
        maxLeafNodes=0 if self.max_leaf_nodes is None else self.max_leaf_nodes,
        maxBins=self.maxBins,
        minBinSize=self.minBinSize)
    # Reset the estimator cache before training.
    # NOTE(review): presumably read by the _estimators_ property; confirm.
    self._cached_estimators_ = None
    # compute
    dfc_trainingResult = dfc_algorithm.compute(X, y, sample_weight)

    # get resulting model
    model = dfc_trainingResult.model
    self.daal_model_ = model

    # compute oob_score_
    if self.oob_score:
        self.estimators_ = self._estimators_
        self._set_oob_score(X, y)

    return self
Ejemplo n.º 22
0
def get_sklearnex_version(rule):
    """Evaluate a version rule with daal4py's ``daal_check_version``.

    The daal4py import happens at call time, inside the function body.

    Parameters
    ----------
    rule
        Version rule, forwarded unchanged to ``daal_check_version``.

    Returns
    -------
    The value returned by ``daal_check_version(rule)``.
    """
    from daal4py.sklearn import _utils as daal_utils
    return daal_utils.daal_check_version(rule)
Ejemplo n.º 23
0
def _daal_fit_regressor(self, X, y, sample_weight=None):
    """Train a oneDAL decision forest regressor and store its model on ``self``.

    Sets ``n_features_in_`` (and ``n_features_`` when the scikit-learn
    version check for '1.0' fails), builds a
    ``daal4py.decision_forest_regression_training`` algorithm from the
    estimator's hyper-parameters, runs it and stores the trained model in
    ``self.daal_model_``. ``self._cached_estimators_`` is reset to ``None``.

    Parameters
    ----------
    X
        Training data; ``X.shape[0]`` is used as the number of samples and
        ``X.shape[1]`` as the number of features.
    y
        Targets, passed unchanged to the algorithm's ``compute``.
    sample_weight : optional
        When not ``None`` it is wrapped in a one-element list before being
        passed to ``compute``.

    Returns
    -------
    self

    Raises
    ------
    ValueError
        If ``oob_score`` is requested while ``bootstrap`` is disabled.
    """
    n_samples, n_features = X.shape[0], X.shape[1]

    self.n_features_in_ = n_features
    if not sklearn_check_version('1.0'):
        self.n_features_ = self.n_features_in_

    random_state = check_random_state(self.random_state)

    if self.oob_score and not self.bootstrap:
        raise ValueError("Out of bag estimation only available"
                         " if bootstrap=True")

    X_fptype = getFPType(X)
    seed_ = random_state.randint(0, np.iinfo('i').max)

    # The mt2203 engine is limited to 6024 streams, so larger forests
    # switch to mt19937. More details here:
    # https://oneapi-src.github.io/oneDAL/daal/algorithms/engines/mt2203.html
    max_stream_count = 6024
    if self.n_estimators <= max_stream_count:
        make_engine = daal4py.engines_mt2203
    else:
        make_engine = daal4py.engines_mt19937
    daal_engine = make_engine(seed=seed_, fptype=X_fptype)

    features_per_node = _to_absolute_max_features(
        self.max_features, n_features, is_classification=False)

    bootstrap_fraction = _get_n_samples_bootstrap(
        n_samples=n_samples, max_samples=self.max_samples)

    if sample_weight is not None:
        sample_weight = [sample_weight]

    def _as_observation_count(setting):
        # Integral settings are used as given; anything else is treated as
        # a fraction of the sample count and rounded up.
        if isinstance(setting, numbers.Integral):
            return setting
        return int(ceil(setting * n_samples))

    # Collect the training parameters; entries are evaluated top to bottom.
    training_params = dict(
        fptype=getFPType(X),
        method='hist' if daal_check_version(
            (2021, 'P', 200)) else 'defaultDense',
        nTrees=int(self.n_estimators),
        observationsPerTreeFraction=(
            bootstrap_fraction if self.bootstrap is True else 1.),
        featuresPerNode=int(features_per_node),
        # A max_depth of None is passed to oneDAL as 0.
        maxTreeDepth=int(0 if self.max_depth is None else self.max_depth),
        minObservationsInLeafNode=_as_observation_count(
            self.min_samples_leaf),
        engine=daal_engine,
        impurityThreshold=float(0.0 if self.min_impurity_split is None
                                else self.min_impurity_split),
        varImportance="MDI",
        resultsToCompute="",
        memorySavingMode=False,
        bootstrap=bool(self.bootstrap),
        minObservationsInSplitNode=_as_observation_count(
            self.min_samples_split),
        minWeightFractionInLeafNode=self.min_weight_fraction_leaf,
        minImpurityDecreaseInSplitNode=self.min_impurity_decrease,
        # A max_leaf_nodes of None is passed to oneDAL as 0.
        maxLeafNodes=(0 if self.max_leaf_nodes is None
                      else self.max_leaf_nodes),
        maxBins=self.maxBins,
        minBinSize=self.minBinSize,
    )
    dfr_algorithm = daal4py.decision_forest_regression_training(
        **training_params)

    self._cached_estimators_ = None

    training_result = dfr_algorithm.compute(X, y, sample_weight)

    # Keep the trained oneDAL model on the estimator.
    self.daal_model_ = training_result.model

    # The out-of-bag score is not computed in this function.

    return self
Ejemplo n.º 24
0
    'lasso':         [[(linear_model_module, 'Lasso', Lasso_daal4py), None]],
    'svm':           [[(svm_module, 'SVC', SVC_daal4py), None]],
    'logistic':      [[(logistic_module, _patched_log_reg_path_func_name, daal_optimized_logistic_path), None]],
    'knn_clsf':      [[(neighbors_module, 'KNeighborsClassifier', KNeighborsClassifier_daal4py), None]],
    'knn_mxn':       [[(neighbors_module, 'KNeighborsMixin', KNeighborsMixin_daal4py), None]]
}

# The helper name has already been stored in the 'logistic' mapping entry,
# so it is removed from the module namespace.
del _patched_log_reg_path_func_name

# DBSCAN is registered only when the daal4py implementation can be imported;
# an ImportError leaves it out of the mapping.
try:
    from ..cluster.dbscan import DBSCAN as DBSCAN_daal4py
    _mapping['dbscan'] = [[(cluster_module, 'DBSCAN', DBSCAN_daal4py), None]]
except ImportError:
    pass

# The entries below are registered only when daal_check_version accepts the
# given rule.
# NOTE(review): the tuple-of-tuples rules presumably list alternative minimum
# versions (release 'P' / beta 'B') - confirm against daal_check_version.
if daal_check_version(((2020,'P', 1), (2021,'B',5))):
    _mapping['fin_check'] = [[(validation, '_assert_all_finite', _daal_assert_all_finite), None]]

if daal_check_version(((2020,'P', 2), (2021,'B',8))):
    _mapping['tt_split'] = [[(model_selection, 'train_test_split', _daal_train_test_split), None]]

if daal_check_version((2020,'P', 3)):
    _mapping['df_classifier'] = [[(ensemble_module, 'RandomForestClassifier', RandomForestClassifier_daal4py), None]]
    _mapping['df_regressor']  = [[(ensemble_module, 'RandomForestRegressor', RandomForestRegressor_daal4py), None]]

def do_patch(name):
    lname = name.lower()
    if lname in _mapping:
        for descriptor in _mapping[lname]:
            which, what, replacer = descriptor[0]
            if descriptor[1] is None:
Ejemplo n.º 25
0
import pytest
from sklearn.neighbors \
    import KNeighborsClassifier as ScikitKNeighborsClassifier
from daal4py.sklearn.neighbors \
    import KNeighborsClassifier as DaalKNeighborsClassifier
from sklearn.datasets import load_iris
from sklearn.metrics import (accuracy_score, log_loss, roc_auc_score)
from sklearn.model_selection import train_test_split
from daal4py.sklearn._utils import daal_check_version

# Candidate settings for the KNeighborsClassifier comparison tests.
# NOTE(review): presumably combined into a parameter grid by the test
# functions further down - confirm against the callers.
DISTANCES = ['minkowski']
ALGORITHMS = ['brute', 'kd_tree', 'auto']
WEIGHTS = ['uniform', 'distance']
KS = [1, 3, 7, 15, 31]
# Number of repetitions of the fitting loop in _test_determenistic.
N_TRIES = 10
# 1.0 when the oneDAL version check passes, 0.9 otherwise.
ACCURACY_RATIO = 1.0 if daal_check_version(((2020, 'P', 300))) else 0.9
LOG_LOSS_RATIO = 1.02
ROC_AUC_RATIO = 0.999
# The iris dataset is loaded once, at import time.
IRIS = load_iris()


def _test_determenistic(distance, algorithm, weight, k):
    x_train, x_test, y_train, y_test = \
        train_test_split(IRIS.data, IRIS.target,
                         test_size=0.33, random_state=31)

    alg_results = []
    for _ in range(N_TRIES):
        # models
        scikit_model = ScikitKNeighborsClassifier(n_neighbors=k,
                                                  weights=weight,
Ejemplo n.º 26
0
"""
The 'daal4py.sklearn.ensemble' module implements daal4py-based 
RandomForestClassifier and RandomForestRegressor classes.
"""
from daal4py.sklearn._utils import daal_check_version
# Pick the random forest implementation module based on the oneDAL version
# check: `.forest` when it passes, `.decision_forest` otherwise.
# NOTE(review): the rule here is a two-element tuple; confirm this matches
# the rule format expected by the installed daal_check_version.
if daal_check_version((2020, 3)):
    from .forest import (RandomForestClassifier, RandomForestRegressor)
else:
    from .decision_forest import (RandomForestClassifier, RandomForestRegressor)
from .GBTDAAL import (GBTDAALClassifier, GBTDAALRegressor)
from .AdaBoostClassifier import AdaBoostClassifier

# Public names exported by this package.
__all__ = ['RandomForestClassifier', 'RandomForestRegressor', 'GBTDAALClassifier', 'GBTDAALRegressor', 'AdaBoostClassifier']

Ejemplo n.º 27
0
    np.int64,
    np.float16,
    np.float32,
    np.float64,
    np.uint8,
    np.uint16,
    np.uint32,
    np.uint64,
]

# Regular-expression patterns, written as "<estimator> <dtype> <method>"
# style strings, grouped by the reason headers below.
# NOTE(review): presumably matched against test-case descriptions to decide
# which cases to skip - confirm against the code that consumes TO_SKIP.
TO_SKIP = [
    # --------------- NO INFO ---------------
    r'KMeans .*transform',
    r'KMeans .*score',
    r'PCA .*score',
    r'LogisticRegression .*decision_function',
    r'LogisticRegressionCV .*decision_function',
    r'LogisticRegressionCV .*predict',
    r'LogisticRegressionCV .*predict_proba',
    r'LogisticRegressionCV .*predict_log_proba',
    r'LogisticRegressionCV .*score',
    # --------------- Scikit ---------------
    r'Ridge float16 predict',
    r'Ridge float16 score',
    r'RandomForestClassifier .*predict_proba',
    r'RandomForestClassifier .*predict_log_proba',
    r'pairwise_distances .*pairwise_distances',  # except float64
    # This entry is the pattern only when the (2021, 'P', 200) version check
    # fails; otherwise the list element is None.
    # NOTE(review): consumers of TO_SKIP must tolerate a None element -
    # confirm they do before passing entries to the `re` module.
    r'roc_auc_score .*roc_auc_score' \
    if not daal_check_version((2021, 'P', 200)) else None,
]
Ejemplo n.º 28
0
    def _fit(self, X, skip_num_points=0):
        """Private function to fit the model using X as training data.

        Validates the estimator parameters and ``X``, computes the joint
        probability matrix ``P`` (from full pairwise distances for the
        'exact' method, from a nearest-neighbors graph otherwise), builds the
        initial embedding and runs the optimization.

        Parameters
        ----------
        X : array-like or sparse matrix
            Training data, or a square distance matrix when
            ``metric='precomputed'``.
        skip_num_points : default=0
            Forwarded unchanged to ``self._tsne``; it is not passed to
            ``self._daal_tsne``.

        Returns
        -------
        The result of ``self._daal_tsne`` when the oneDAL path is selected,
        otherwise the result of ``self._tsne``.

        Raises
        ------
        TypeError
            For PCA initialization with sparse ``X``, or for a sparse
            precomputed distance matrix with ``method='exact'``.
        ValueError
            For invalid ``method``, ``angle``, ``learning_rate``,
            ``square_distances``, ``init``, ``early_exaggeration``,
            ``n_iter`` or ``n_components`` settings, for a non-square
            precomputed matrix, or for negative distances.
        """
        # Resolve the effective initialization: the 'warn' placeholder falls
        # back to 'random' after emitting a FutureWarning.
        if isinstance(self.init, str) and self.init == 'warn':
            warnings.warn(
                "The default initialization in TSNE will change "
                "from 'random' to 'pca' in 1.2.", FutureWarning)
            self._init = 'random'
        else:
            self._init = self.init

        if isinstance(self._init, str) and self._init == 'pca' and issparse(X):
            raise TypeError("PCA initialization is currently not suported "
                            "with the sparse input matrix. Use "
                            "init=\"random\" instead.")

        if self.method not in ['barnes_hut', 'exact']:
            raise ValueError("'method' must be 'barnes_hut' or 'exact'")
        if self.angle < 0.0 or self.angle > 1.0:
            raise ValueError("'angle' must be between 0.0 - 1.0")
        # Resolve the effective learning rate: 'warn' falls back to 200.0,
        # 'auto' is derived from the sample count and early exaggeration
        # with a lower bound of 50; any other value must be positive.
        if self.learning_rate == 'warn':
            warnings.warn(
                "The default learning rate in TSNE will change "
                "from 200.0 to 'auto' in 1.2.", FutureWarning)
            self._learning_rate = 200.0
        else:
            self._learning_rate = self.learning_rate
        if self._learning_rate == 'auto':
            self._learning_rate = X.shape[0] / self.early_exaggeration / 4
            self._learning_rate = np.maximum(self._learning_rate, 50)
        else:
            if not (self._learning_rate > 0):
                raise ValueError("'learning_rate' must be a positive number "
                                 "or 'auto'.")

        # 'square_distances' is validated only when the estimator has the
        # attribute.
        # NOTE(review): presumably it is absent on some scikit-learn
        # versions - confirm.
        if hasattr(self, 'square_distances'):
            if self.square_distances not in [True, 'legacy']:
                raise ValueError(
                    "'square_distances' must be True or 'legacy'.")
            if self.metric != "euclidean" and self.square_distances is not True:
                warnings.warn(
                    ("'square_distances' has been introduced in 0.24"
                     "to help phase out legacy squaring behavior. The "
                     "'legacy' setting will be removed in 0.26, and the "
                     "default setting will be changed to True. In 0.28, "
                     "'square_distances' will be removed altogether,"
                     "and distances will be squared by default. Set "
                     "'square_distances'=True to silence this warning."),
                    FutureWarning)

        # Validate X. barnes_hut accepts only CSR sparse input and requires
        # at least two samples; the other method also accepts CSC and COO.
        # self._validate_data is used when the scikit-learn '0.23' version
        # check passes, check_array otherwise.
        if self.method == 'barnes_hut':
            if sklearn_check_version('0.23'):
                X = self._validate_data(X,
                                        accept_sparse=['csr'],
                                        ensure_min_samples=2,
                                        dtype=[np.float32, np.float64])
            else:
                X = check_array(X,
                                accept_sparse=['csr'],
                                ensure_min_samples=2,
                                dtype=[np.float32, np.float64])
        else:
            if sklearn_check_version('0.23'):
                X = self._validate_data(X,
                                        accept_sparse=['csr', 'csc', 'coo'],
                                        dtype=[np.float32, np.float64])
            else:
                X = check_array(X,
                                accept_sparse=['csr', 'csc', 'coo'],
                                dtype=[np.float32, np.float64])

        # With a precomputed metric, X itself is the distance matrix and must
        # be square and non-negative.
        if self.metric == "precomputed":
            if isinstance(self._init, str) and self._init == 'pca':
                raise ValueError("The parameter init=\"pca\" cannot be "
                                 "used with metric=\"precomputed\".")
            if X.shape[0] != X.shape[1]:
                raise ValueError("X should be a square distance matrix")

            check_non_negative(
                X, "TSNE.fit(). With metric='precomputed', X "
                "should contain positive distances.")

            if self.method == "exact" and issparse(X):
                raise TypeError(
                    'TSNE with method="exact" does not accept sparse '
                    'precomputed distance matrix. Use method="barnes_hut" '
                    'or provide the dense distance matrix.')

        if self.method == 'barnes_hut' and self.n_components > 3:
            raise ValueError("'n_components' should be inferior to 4 for the "
                             "barnes_hut algorithm as it relies on "
                             "quad-tree or oct-tree.")
        random_state = check_random_state(self.random_state)

        if self.early_exaggeration < 1.0:
            raise ValueError(
                "early_exaggeration must be at least 1, but is {}".format(
                    self.early_exaggeration))

        if self.n_iter < 250:
            raise ValueError("n_iter should be at least 250")

        n_samples = X.shape[0]

        # Never reassigned in this method; passed as `neighbors` to
        # self._tsne at the end.
        neighbors_nn = None
        if self.method == "exact":
            # Retrieve the distance matrix, either using the precomputed one or
            # computing it.
            if self.metric == "precomputed":
                distances = X
            else:
                if self.verbose:
                    print("[t-SNE] Computing pairwise distances...")

                if self.metric == "euclidean":
                    # Euclidean is squared here, rather than using **= 2,
                    # because euclidean_distances already calculates
                    # squared distances, and returns np.sqrt(dist) for
                    # squared=False.
                    # Also, Euclidean is slower for n_jobs>1, so don't set here
                    distances = pairwise_distances(X,
                                                   metric=self.metric,
                                                   squared=True)
                else:
                    distances = pairwise_distances(X,
                                                   metric=self.metric,
                                                   n_jobs=self.n_jobs)

            if np.any(distances < 0):
                raise ValueError("All distances should be positive, the "
                                 "metric given is not correct")

            # Non-euclidean distances are squared in place unless
            # 'square_distances' exists and is not True.
            if self.metric != "euclidean" and \
                    getattr(self, 'square_distances', True) is True:
                distances **= 2

            # compute the joint probability distribution for the input space
            P = _joint_probabilities(distances, self.perplexity, self.verbose)
            assert np.all(np.isfinite(P)), "All probabilities should be finite"
            assert np.all(P >= 0), "All probabilities should be non-negative"
            assert np.all(P <= 1), ("All probabilities should be less "
                                    "or then equal to one")

        else:
            # Compute the number of nearest neighbors to find.
            # LvdM uses 3 * perplexity as the number of neighbors.
            # In the event that we have very small # of points
            # set the neighbors to n - 1.
            n_neighbors = min(n_samples - 1, int(3. * self.perplexity + 1))

            if self.verbose:
                print("[t-SNE] Computing {} nearest neighbors...".format(
                    n_neighbors))

            # Find the nearest neighbors for every point
            knn = NearestNeighbors(
                algorithm='auto',
                n_jobs=self.n_jobs,
                n_neighbors=n_neighbors,
                metric=self.metric,
            )
            t0 = time()
            knn.fit(X)
            duration = time() - t0
            if self.verbose:
                print("[t-SNE] Indexed {} samples in {:.3f}s...".format(
                    n_samples, duration))

            t0 = time()
            distances_nn = knn.kneighbors_graph(mode='distance')
            duration = time() - t0
            if self.verbose:
                print("[t-SNE] Computed neighbors for {} samples "
                      "in {:.3f}s...".format(n_samples, duration))

            # Free the memory used by the ball_tree
            del knn

            if getattr(self, 'square_distances', True) is True or \
                    self.metric == "euclidean":
                # knn return the euclidean distance but we need it squared
                # to be consistent with the 'exact' method. Note that the
                # the method was derived using the euclidean method as in the
                # input space. Not sure of the implication of using a different
                # metric.
                distances_nn.data **= 2

            # compute the joint probability distribution for the input space
            P = _joint_probabilities_nn(distances_nn, self.perplexity,
                                        self.verbose)

        # Build the initial embedding: a user-supplied array is used as is,
        # 'pca' runs a randomized PCA, 'random' draws small Gaussian noise.
        if isinstance(self._init, np.ndarray):
            X_embedded = self._init
        elif self._init == 'pca':
            pca = PCA(
                n_components=self.n_components,
                svd_solver='randomized',
                random_state=random_state,
            )
            X_embedded = pca.fit_transform(X).astype(np.float32, copy=False)
            warnings.warn(
                "The PCA initialization in TSNE will change to "
                "have the standard deviation of PC1 equal to 1e-4 "
                "in 1.2. This will ensure better convergence.", FutureWarning)
        elif self._init == 'random':
            # The embedding is initialized with iid samples from Gaussians with
            # standard deviation 1e-4.
            X_embedded = 1e-4 * random_state.randn(
                n_samples, self.n_components).astype(np.float32)
        else:
            raise ValueError("'init' must be 'pca', 'random', or "
                             "a numpy array")

        # Degrees of freedom of the Student's t-distribution. The suggestion
        # degrees_of_freedom = n_components - 1 comes from
        # "Learning a Parametric Embedding by Preserving Local Structure"
        # Laurens van der Maaten, 2009.
        degrees_of_freedom = max(self.n_components - 1, 1)

        # The oneDAL path is taken only for barnes_hut with a two-component
        # embedding, verbose == 0 and a oneDAL build that passes the
        # (2021, 'P', 600) version check; everything else goes to self._tsne.
        daal_ready = self.method == 'barnes_hut' and self.n_components == 2 and \
            self.verbose == 0 and daal_check_version((2021, 'P', 600))

        if daal_ready:
            X_embedded = check_array(X_embedded,
                                     dtype=[np.float32, np.float64])
            return self._daal_tsne(P, n_samples, X_embedded=X_embedded)
        return self._tsne(P,
                          degrees_of_freedom,
                          n_samples,
                          X_embedded=X_embedded,
                          neighbors=neighbors_nn,
                          skip_num_points=skip_num_points)
Ejemplo n.º 29
0
def main():
    """Build a daal4py logistic regression model from scikit-learn weights.

    Trains ``LogisticRegression`` on the iris data, copies its coefficients
    and intercepts into a daal4py model builder, runs daal4py prediction
    with the built model and asserts that the predicted class labels match
    those of the trained estimator.

    Returns
    -------
    tuple
        ``(builder, predict_result_daal)``: the model builder and the
        daal4py prediction result.
    """
    n_classes = 3
    X, y = load_iris(return_X_y=True)

    # Train the reference classifier.
    estimator = LogisticRegression(fit_intercept=True, max_iter=1000,
                                   random_state=0)
    sk_model = estimator.fit(X, y)

    # Transfer the fitted coefficients and intercepts to the model builder.
    n_features = X.shape[1]
    model_builder = d4p.logistic_regression_model_builder(
        n_classes=n_classes, n_features=n_features)
    model_builder.set_beta(sk_model.coef_, sk_model.intercept_)

    # Configure prediction to return class labels.
    predictor = d4p.logistic_regression_prediction(
        nClasses=n_classes,
        resultsToEvaluate="computeClassLabels")

    # Predict with both implementations and compare the labels.
    daal_result = predictor.compute(X, model_builder.model)
    sklearn_labels = sk_model.predict(X)
    daal_labels = daal_result.prediction.flatten()
    assert np.allclose(daal_labels, sklearn_labels)
    return model_builder, daal_result


if __name__ == "__main__":
    # Run the example only when the oneDAL version check passes.
    if daal_check_version((2021, 'P', 1)):
        builder, predict_result_daal = main()
        print("\nLogistic Regression coefficients:\n", builder.model)
        print("\nLogistic regression prediction results (first 10 rows):\n",
              predict_result_daal.prediction[:10])
        print("\nLogistic regression prediction probabilities (first 10 rows):\n",
              predict_result_daal.probabilities[:10])
        print("\nLogistic regression prediction log probabilities (first 10 rows):\n",
              predict_result_daal.logProbabilities[:10])
        print('All looks good!')
Ejemplo n.º 30
0
def _fit_regressor(self, X, y, sample_weight=None):
    """Fit the forest regressor, using oneDAL when every condition holds.

    The estimator parameters are checked first, then a
    ``PatchingConditionsChain`` collects the conditions under which the
    oneDAL implementation can be used. If all of them hold, ``X`` and ``y``
    are validated and ``_daal_fit_regressor`` performs the training;
    otherwise the call is delegated to
    ``super(RandomForestRegressor, self).fit``.

    Parameters
    ----------
    X
        Training data.
    y
        Target values; a 1-D ``y`` is reshaped to a single column on the
        oneDAL path.
    sample_weight : optional
        Sample weights; validated with ``check_sample_weight`` when given.

    Returns
    -------
    ``self`` on the oneDAL path, otherwise the result of the parent ``fit``.

    Raises
    ------
    ValueError
        If ``y`` is sparse.
    """
    if sp.issparse(y):
        raise ValueError(
            "sparse multilabel-indicator for y is not supported."
        )
    _check_parameters(self)
    if sample_weight is not None:
        sample_weight = check_sample_weight(sample_weight, X)

    if sklearn_check_version('1.0') and self.criterion == "mse":
        mse_deprecation = (
            "Criterion 'mse' was deprecated in v1.0 and will be "
            "removed in version 1.2. Use `criterion='squared_error'` "
            "which is equivalent."
        )
        warnings.warn(mse_deprecation, FutureWarning)

    patching_log = PatchingConditionsChain(
        "sklearn.ensemble.RandomForestRegressor.fit")
    # Each entry pairs a condition with the message logged when it fails.
    estimator_checks = [
        (self.oob_score and daal_check_version((2021, 'P', 500)) or not self.oob_score,
            "OOB score is only supported starting from 2021.5 version of oneDAL."),
        (self.warm_start is False, "Warm start is not supported."),
        (self.criterion in ["mse", "squared_error"],
            f"'{self.criterion}' criterion is not supported. "
            "Only 'mse' and 'squared_error' criteria are supported."),
        (self.ccp_alpha == 0.0,
            f"Non-zero 'ccp_alpha' ({self.ccp_alpha}) is not supported."),
        (not sp.issparse(X), "X is sparse. Sparse input is not supported."),
    ]
    use_onedal = patching_log.and_conditions(estimator_checks)

    if use_onedal:
        if sklearn_check_version("1.0"):
            self._check_feature_names(X, reset=True)
        X = check_array(X, dtype=[np.float64, np.float32])
        y = np.atleast_1d(np.asarray(y))

        is_column_vector = y.ndim == 2 and y.shape[1] == 1
        if is_column_vector:
            warnings.warn("A column-vector y was passed when a 1d array was"
                          " expected. Please change the shape of y to "
                          "(n_samples,), for example using ravel().",
                          DataConversionWarning, stacklevel=2)

        y = check_array(y, ensure_2d=False, dtype=X.dtype)
        check_consistent_length(X, y)

        if y.ndim == 1:
            # np.reshape keeps the data contiguous, which indexing with
            # [:, np.newaxis] does not.
            y = np.reshape(y, (-1, 1))

        self.n_outputs_ = y.shape[1]
        single_output_check = (
            self.n_outputs_ == 1,
            f"Number of outputs ({self.n_outputs_}) is not 1.")
        use_onedal = patching_log.and_conditions([single_output_check])

    patching_log.write_log()
    if not use_onedal:
        # Fall back to the parent implementation.
        return super(RandomForestRegressor, self).fit(
            X, y, sample_weight=sample_weight)

    _daal_fit_regressor(self, X, y, sample_weight=sample_weight)

    self.estimators_ = self._estimators_
    return self