    def predict(self, X):
        """Predict using the linear model

        Parameters
        ----------
        X : array-like or sparse matrix, shape = (n_samples, n_features)
            Samples.

        Returns
        -------
        C : array, shape = (n_samples,)
            Returns predicted values.
        """

        X = check_array(X,
                        accept_sparse=['csr', 'csc', 'coo'],
                        dtype=[np.float64, np.float32])
        good_shape_for_daal = X.ndim <= 1 or X.shape[0] >= X.shape[1]

        if not hasattr(self, 'daal_model_') or \
                sp.issparse(X) or \
                not good_shape_for_daal:
            logging.info("sklearn.linear_model.Lasso."
                         "predict: " + get_patch_message("sklearn"))
            return self._decision_function(X)
        logging.info("sklearn.linear_model.Lasso."
                     "predict: " + get_patch_message("daal"))
        return _daal4py_predict_lasso(self, X)
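
A minimal usage sketch for the patched predict path above, assuming the drop-in estimator is importable as daal4py.sklearn.linear_model.Lasso (the import path is an assumption). With a tall, dense float64 matrix the daal branch is taken; sparse or wide input falls back to stock scikit-learn:

    import numpy as np
    from daal4py.sklearn.linear_model import Lasso  # assumed import path

    # n_samples >= n_features keeps good_shape_for_daal True
    X = np.random.rand(100, 5)
    y = X @ np.arange(1.0, 6.0)

    model = Lasso(alpha=0.1).fit(X, y)
    preds = model.predict(X)  # dispatches to _daal4py_predict_lasso
    print(preds[:3])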
    def fit(self, X, y=None, sample_weight=None):
        """Perform DBSCAN clustering from features, or distance matrix.

        Parameters
        ----------
        X : array-like or sparse matrix, shape (n_samples, n_features), or \
            (n_samples, n_samples)
            Training instances to cluster, or distances between instances if
            ``metric='precomputed'``. If a sparse matrix is provided, it will
            be converted into a sparse ``csr_matrix``.

        sample_weight : array, shape (n_samples,), optional
            Weight of each sample, such that a sample with a weight of at least
            ``min_samples`` is by itself a core sample; a sample with a
            negative weight may inhibit its eps-neighbor from being core.
            Note that weights are absolute, and default to 1.

        y : Ignored
            Not used, present here for API consistency by convention.

        Returns
        -------
        self

        """
        X = check_array(X, accept_sparse='csr', dtype=[np.float64, np.float32])

        if self.eps <= 0.0:
            raise ValueError("eps must be positive.")

        if sample_weight is not None:
            sample_weight = _check_sample_weight(sample_weight, X)

        _daal_ready = self.algorithm in ['auto', 'brute'] and \
            (self.metric == 'euclidean' or
             (self.metric == 'minkowski' and self.p == 2)) and \
            isinstance(X, np.ndarray)
        if _daal_ready:
            logging.info(
                "sklearn.cluster.DBSCAN."
                "fit: " + get_patch_message("daal"))
            core_ind, assignments = _daal_dbscan(
                X, self.eps,
                self.min_samples,
                sample_weight=sample_weight)
            self.core_sample_indices_ = core_ind
            self.labels_ = assignments
            self.components_ = np.take(X, core_ind, axis=0)
            return self
        logging.info(
            "sklearn.cluster.DBSCAN."
            "fit: " + get_patch_message("sklearn"))
        return super().fit(X, y, sample_weight=sample_weight)
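
A hedged usage sketch for the fit above, assuming the patched class is importable as daal4py.sklearn.cluster.DBSCAN (the import path is an assumption). Dense euclidean input satisfies _daal_ready; metric='precomputed' or sparse input would route to stock scikit-learn:

    import numpy as np
    from daal4py.sklearn.cluster import DBSCAN  # assumed import path

    X = np.array([[1.0, 2.0], [2.0, 2.0], [2.0, 3.0],
                  [8.0, 7.0], [8.0, 8.0], [25.0, 80.0]])

    db = DBSCAN(eps=3.0, min_samples=2).fit(X)  # ndarray + euclidean -> daal
    print(db.labels_)                # [0 0 0 1 1 -1]; -1 marks noise
    print(db.core_sample_indices_)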
    def fit(self, X, y, sample_weight=None, check_input=True):
        """Fit model with coordinate descent.

        Parameters
        ----------
        X : {ndarray, sparse matrix} of shape (n_samples, n_features)
            Data.

        y : {ndarray, sparse matrix} of shape (n_samples,) or \
            (n_samples, n_targets)
            Target. Will be cast to X's dtype if necessary.

        sample_weight : float or array-like of shape (n_samples,), default=None
            Sample weight.

        check_input : bool, default=True
            Allow to bypass several input checking steps.
            Don't use this parameter unless you know what you're doing.

        Notes
        -----

        Coordinate descent is an algorithm that considers one column of
        data at a time, hence it will automatically convert the X input
        to a Fortran-contiguous numpy array if necessary.

        To avoid memory re-allocation it is advised to allocate the
        initial data in memory directly using that format.
        """
        # check X and y
        if check_input:
            X, y = check_X_y(X,
                             y,
                             copy=False,
                             accept_sparse='csc',
                             dtype=[np.float64, np.float32],
                             multi_output=True,
                             y_numeric=True)
            y = check_array(y, copy=False, dtype=X.dtype.type, ensure_2d=False)
        else:
            # only for compliance with scikit-learn; this check is not
            # required by the Intel(R) oneAPI Data Analytics Library
            if isinstance(X, np.ndarray) and \
                    not X.flags['F_CONTIGUOUS']:
                raise ValueError("ndarray is not Fortran contiguous")

        if isinstance(X, np.ndarray):
            self.fit_shape_good_for_daal_ = \
                X.ndim <= 1 or X.shape[0] >= X.shape[1]
        else:
            self.fit_shape_good_for_daal_ = False

        if sp.issparse(X) or \
                sample_weight is not None or \
                not self.fit_shape_good_for_daal_ or \
                not (X.dtype == np.float64 or X.dtype == np.float32):
            if hasattr(self, 'daal_model_'):
                del self.daal_model_
            logging.info("sklearn.linear_model.Lasso."
                         "fit: " + get_patch_message("sklearn"))
            res_new = super(ElasticNet, self).fit(X,
                                                  y,
                                                  sample_weight=sample_weight,
                                                  check_input=check_input)
            self._gap = res_new.dual_gap_
            return res_new
        self.n_iter_ = None
        self._gap = None
        # only to pass the tests
        # "check_estimators_fit_returns_self(readonly_memmap=True)" and
        # "check_regressors_train(readonly_memmap=True)"
        if not X.flags.writeable:
            X = np.copy(X)
        if not y.flags.writeable:
            y = np.copy(y)
        logging.info("sklearn.linear_model.Lasso."
                     "fit: " + get_patch_message("daal"))
        res = _daal4py_fit_lasso(self, X, y, check_input=check_input)
        if res is None:
            if hasattr(self, 'daal_model_'):
                del self.daal_model_
            logging.info("sklearn.linear_model.Lasso."
                         "fit: " + get_patch_message("sklearn_after_daal"))
            res_new = super(ElasticNet, self).fit(X,
                                                  y,
                                                  sample_weight=sample_weight,
                                                  check_input=check_input)
            self._gap = res_new.dual_gap_
            return res_new
        return res
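
A minimal sketch of how fit_shape_good_for_daal_ decides the dispatch above, assuming the patched estimator is importable as daal4py.sklearn.linear_model.Lasso (the import path is an assumption). A wide matrix, sparse input, or explicit sample weights force the stock scikit-learn branch:

    import logging
    import numpy as np
    from daal4py.sklearn.linear_model import Lasso  # assumed import path

    logging.basicConfig(level=logging.INFO)  # surfaces the patch messages

    X_tall = np.random.rand(50, 10)  # n_samples >= n_features -> daal path
    X_wide = np.random.rand(10, 50)  # n_samples < n_features -> sklearn path

    Lasso(alpha=0.5).fit(X_tall, np.random.rand(50))
    Lasso(alpha=0.5).fit(X_wide, np.random.rand(10))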
def _fit(self, X, y, sample_weight=None, check_input=True):
    if sklearn_check_version('1.0'):
        self._check_feature_names(X, reset=True)
    # check X and y
    if check_input:
        X, y = check_X_y(
            X,
            y,
            copy=False,
            accept_sparse='csc',
            dtype=[np.float64, np.float32],
            multi_output=True,
            y_numeric=True,
        )
        y = check_array(y, copy=False, dtype=X.dtype.type, ensure_2d=False)

    if not sp.issparse(X):
        self.fit_shape_good_for_daal_ = \
            X.ndim <= 1 or X.shape[0] >= X.shape[1]
    else:
        self.fit_shape_good_for_daal_ = False

    _function_name = f"sklearn.linear_model.{self.__class__.__name__}.fit"
    _patching_status = PatchingConditionsChain(_function_name)
    _dal_ready = _patching_status.and_conditions([
        (not sp.issparse(X), "X is sparse. Sparse input is not supported."),
        (self.fit_shape_good_for_daal_,
            "The shape of X does not satisfy oneDAL requirements: "
            "number of features > number of samples."),
        (X.dtype == np.float64 or X.dtype == np.float32,
            f"'{X.dtype}' X data type is not supported. "
            "Only np.float32 and np.float64 are supported."),
        (sample_weight is None, "Sample weights are not supported.")])
    _patching_status.write_log()

    if not _dal_ready:
        if hasattr(self, 'daal_model_'):
            del self.daal_model_
        if sklearn_check_version('0.23'):
            res_new = super(ElasticNet, self).fit(
                X, y, sample_weight=sample_weight, check_input=check_input)
        else:
            res_new = super(ElasticNet, self).fit(
                X, y, check_input=check_input)
        self._gap = res_new.dual_gap_
        return res_new
    self.n_iter_ = None
    self._gap = None

    if not check_input:
        # only for compliance with scikit-learn; this check is not
        # required by the Intel(R) oneAPI Data Analytics Library
        if isinstance(X, np.ndarray) and \
                not X.flags['F_CONTIGUOUS']:
            raise ValueError("ndarray is not Fortran contiguous")

    if sklearn_check_version('1.0'):
        self._normalize = _deprecate_normalize(
            self.normalize,
            default=False,
            estimator_name=self.__class__.__name__)

    # only to pass the tests
    # "check_estimators_fit_returns_self(readonly_memmap=True)" and
    # "check_regressors_train(readonly_memmap=True)"
    if not X.flags.writeable:
        X = np.copy(X)
    if not y.flags.writeable:
        y = np.copy(y)

    if self.__class__.__name__ == "ElasticNet":
        res = _daal4py_fit_enet(self, X, y, check_input=check_input)
    else:
        res = _daal4py_fit_lasso(self, X, y, check_input=check_input)
    if res is None:
        if hasattr(self, 'daal_model_'):
            del self.daal_model_
        logging.info(
            _function_name + ": " + get_patch_message("sklearn_after_daal")
        )
        if sklearn_check_version('0.23'):
            res_new = super(ElasticNet, self).fit(
                X, y, sample_weight=sample_weight, check_input=check_input)
        else:
            res_new = super(ElasticNet, self).fit(
                X, y, check_input=check_input)
        self._gap = res_new.dual_gap_
        return res_new
    return res
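
Since this helper dispatches on self.__class__.__name__, both patched estimators share it; a minimal sketch, assuming ElasticNet and Lasso are the daal4py drop-ins (import path is an assumption):

    import numpy as np
    from daal4py.sklearn.linear_model import ElasticNet, Lasso  # assumed

    X = np.random.rand(200, 8)
    y = np.random.rand(200)

    # Same _fit body: the class name routes to _daal4py_fit_enet or
    # _daal4py_fit_lasso respectively.
    enet = ElasticNet(alpha=0.3, l1_ratio=0.5).fit(X, y)
    lasso = Lasso(alpha=0.3).fit(X, y)
    print(enet.coef_[:3], lasso.coef_[:3])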
def _fit(self, X, y, sample_weight=None, check_input=True):
    # check X and y
    if check_input:
        X, y = check_X_y(
            X,
            y,
            copy=False,
            accept_sparse='csc',
            dtype=[np.float64, np.float32],
            multi_output=True,
            y_numeric=True,
        )
        y = check_array(y, copy=False, dtype=X.dtype.type, ensure_2d=False)

    if not sp.issparse(X):
        self.fit_shape_good_for_daal_ = \
            X.ndim <= 1 or X.shape[0] >= X.shape[1]
    else:
        self.fit_shape_good_for_daal_ = False

    log_str = "sklearn.linear_model." + self.__class__.__name__ + ".fit: "
    sklearn_ready = sp.issparse(X) or not self.fit_shape_good_for_daal_ or \
        X.dtype not in [np.float64, np.float32] or sample_weight is not None

    if sklearn_ready:
        if hasattr(self, 'daal_model_'):
            del self.daal_model_
        logging.info(
            log_str + get_patch_message("sklearn")
        )
        if sklearn_check_version('0.23'):
            res_new = super(ElasticNet, self).fit(
                X, y, sample_weight=sample_weight, check_input=check_input)
        else:
            res_new = super(ElasticNet, self).fit(
                X, y, check_input=check_input)
        self._gap = res_new.dual_gap_
        return res_new
    self.n_iter_ = None
    self._gap = None

    if not check_input:
        # only for compliance with scikit-learn; this check is not
        # required by the Intel(R) oneAPI Data Analytics Library
        if isinstance(X, np.ndarray) and \
                not X.flags['F_CONTIGUOUS']:
            raise ValueError("ndarray is not Fortran contiguous")

    if sklearn_check_version('1.0'):
        self._normalize = _deprecate_normalize(
            self.normalize,
            default=False,
            estimator_name=self.__class__.__name__)

    # only to pass the tests
    # "check_estimators_fit_returns_self(readonly_memmap=True)" and
    # "check_regressors_train(readonly_memmap=True)"
    if not X.flags.writeable:
        X = np.copy(X)
    if not y.flags.writeable:
        y = np.copy(y)
    logging.info(log_str + get_patch_message("daal"))

    if self.__class__.__name__ == "ElasticNet":
        res = _daal4py_fit_enet(self, X, y, check_input=check_input)
    else:
        res = _daal4py_fit_lasso(self, X, y, check_input=check_input)
    if res is None:
        if hasattr(self, 'daal_model_'):
            del self.daal_model_
        logging.info(
            log_str + get_patch_message("sklearn_after_daal")
        )
        if sklearn_check_version('0.23'):
            res_new = super(ElasticNet, self).fit(
                X, y, sample_weight=sample_weight, check_input=check_input)
        else:
            res_new = super(ElasticNet, self).fit(
                X, y, check_input=check_input)
        self._gap = res_new.dual_gap_
        return res_new
    return res
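
When check_input=False is passed, the caller takes over validation; a hedged sketch of preparing input that passes the Fortran-contiguity check in both _fit variants above, assuming the patched Lasso import path:

    import numpy as np
    from daal4py.sklearn.linear_model import Lasso  # assumed import path

    X = np.random.rand(300, 4)   # C-contiguous by default
    Xf = np.asfortranarray(X)    # satisfies X.flags['F_CONTIGUOUS']
    y = np.random.rand(300)

    # With check_input=False the estimator skips check_X_y, so the caller
    # must supply a Fortran-contiguous float32/float64 ndarray itself.
    model = Lasso(alpha=0.1).fit(Xf, y, check_input=False)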
def _daal_train_test_split(*arrays, **options):
    n_arrays = len(arrays)
    if n_arrays == 0:
        raise ValueError("At least one array required as input")
    test_size = options.pop('test_size', None)
    train_size = options.pop('train_size', None)
    random_state = options.pop('random_state', None)
    stratify = options.pop('stratify', None)
    shuffle = options.pop('shuffle', True)
    rng = options.pop('rng', 'OPTIMIZED_MT19937')

    available_rngs = ['default', 'MT19937', 'SFMT19937', 'MT2203', 'R250',
                      'WH', 'MCG31', 'MCG59', 'MRG32K3A', 'PHILOX4X32X10',
                      'NONDETERM', 'OPTIMIZED_MT19937']
    if rng not in available_rngs:
        raise ValueError(
            "Wrong random number generator chosen. "
            "Available generators: %s" % str(available_rngs)[1:-1])

    if options:
        raise TypeError("Invalid parameters passed: %s" % str(options))

    arrays = indexable(*arrays)

    n_samples = _num_samples(arrays[0])
    n_train, n_test = _validate_shuffle_split(
        n_samples, test_size, train_size, default_test_size=0.25
    )
    if not shuffle:
        if stratify is not None:
            raise ValueError(
                "Stratified train/test split is not implemented for shuffle=False")

        train = np.arange(n_train)
        test = np.arange(n_train, n_train + n_test)
    else:
        if stratify is not None:
            cv = StratifiedShuffleSplit(
                test_size=n_test,
                train_size=n_train,
                random_state=random_state
            )
            train, test = next(cv.split(X=arrays[0], y=stratify))
        else:
            if mkl_random_is_imported and \
               rng not in ['default', 'OPTIMIZED_MT19937'] and \
               (isinstance(random_state, int) or random_state is None):
                random_state = mkl_random.RandomState(random_state, rng)
                indexes = random_state.permutation(n_samples)
                test, train = indexes[:n_test], \
                    indexes[n_test:(n_test + n_train)]
            elif rng == 'OPTIMIZED_MT19937' and \
                (isinstance(random_state, int) or random_state is None) and \
                    platform.system() != 'Windows':
                indexes = np.empty(
                    shape=(n_samples,),
                    dtype=np.int64 if n_train + n_test > 2 ** 31 - 1 else np.int32
                )
                random_state = np.random.RandomState(random_state)
                random_state = random_state.get_state()[1]
                d4p.daal_generate_shuffled_indices([indexes], [random_state])
                test, train = indexes[:n_test], \
                    indexes[n_test:(n_test + n_train)]
            else:
                cv = ShuffleSplit(
                    test_size=n_test,
                    train_size=n_train,
                    random_state=random_state
                )
                train, test = next(cv.split(X=arrays[0], y=stratify))

    res = []
    for arr in arrays:
        fallback = False

        # input format check
        if not isinstance(arr, np.ndarray):
            if pandas_is_imported:
                if not isinstance(arr, pd.core.frame.DataFrame) and \
                   not isinstance(arr, pd.core.series.Series):
                    fallback = True
            else:
                fallback = True

        # dimensions check
        if hasattr(arr, 'ndim'):
            if arr.ndim > 2:
                fallback = True
        else:
            fallback = True

        # data types check
        dtypes = get_dtypes(arr)
        if dtypes is None:
            fallback = True
        else:
            for dtype in dtypes:
                if 'float' not in str(dtype) and 'int' not in str(dtype):
                    fallback = True
                    break

        if fallback:
            logging.info(
                "sklearn.model_selection."
                "train_test_split: " + get_patch_message("sklearn"))
            res.append(safe_indexing(arr, train))
            res.append(safe_indexing(arr, test))
        else:
            logging.info(
                "sklearn.model_selection."
                "train_test_split: " + get_patch_message("daal"))
            if len(arr.shape) == 2:
                n_cols = arr.shape[1]
                reshape_later = False
            else:
                n_cols = 1
                reshape_later = True

            arr_copy = d4p.get_data(arr)
            if not isinstance(arr_copy, list):
                arr_copy = arr_copy.reshape(
                    (arr_copy.shape[0], n_cols),
                    order='A',
                )
            if isinstance(arr_copy, np.ndarray):
                order = 'C' if arr_copy.flags['C_CONTIGUOUS'] else 'F'
                train_arr = np.empty(
                    shape=(n_train, n_cols),
                    dtype=arr_copy.dtype,
                    order=order,
                )
                test_arr = np.empty(
                    shape=(n_test, n_cols),
                    dtype=arr_copy.dtype,
                    order=order,
                )
                d4p.daal_train_test_split(
                    arr_copy, train_arr, test_arr, [train], [test]
                )
                if reshape_later:
                    train_arr = train_arr.reshape((n_train,))
                    test_arr = test_arr.reshape((n_test,))
            elif isinstance(arr_copy, list):
                train_arr = [
                    np.empty(
                        shape=(n_train,),
                        dtype=el.dtype,
                        order='C' if el.flags['C_CONTIGUOUS'] else 'F',
                    ) for el in arr_copy
                ]
                test_arr = [
                    np.empty(
                        shape=(n_test,),
                        dtype=el.dtype,
                        order='C' if el.flags['C_CONTIGUOUS'] else 'F'
                    ) for el in arr_copy
                ]
                d4p.daal_train_test_split(
                    arr_copy, train_arr, test_arr, [train], [test])
                train_arr = {col: train_arr[i]
                             for i, col in enumerate(arr.columns)}
                test_arr = {col: test_arr[i]
                            for i, col in enumerate(arr.columns)}
            else:
                raise ValueError(
                    "Array can't be converted to the needed format")

            if pandas_is_imported:
                if isinstance(arr, pd.core.frame.DataFrame):
                    train_arr, test_arr = \
                        pd.DataFrame(train_arr), pd.DataFrame(test_arr)
                if isinstance(arr, pd.core.series.Series):
                    train_arr, test_arr = \
                        train_arr.reshape(n_train), test_arr.reshape(n_test)
                    train_arr, test_arr = pd.Series(
                        train_arr), pd.Series(test_arr)

            if hasattr(arr, 'index'):
                train_arr.index = train
                test_arr.index = test

            res.append(train_arr)
            res.append(test_arr)

    return res
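
A hedged usage sketch of the extra rng keyword this replacement adds on top of the stock signature, assuming patch_sklearn() from sklearnex installs it as sklearn.model_selection.train_test_split:

    import numpy as np
    from sklearnex import patch_sklearn
    patch_sklearn()  # routes train_test_split to _daal_train_test_split

    from sklearn.model_selection import train_test_split

    X = np.random.rand(1000, 3)
    y = np.random.randint(0, 2, size=1000)

    # 'SFMT19937' takes the mkl_random path only when mkl_random is
    # installed; otherwise the code silently falls back to ShuffleSplit.
    X_tr, X_te, y_tr, y_te = train_test_split(
        X, y, test_size=0.2, random_state=0, rng='SFMT19937')
    print(X_tr.shape, X_te.shape)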