Example 1
    def fit(self, X, y=None):
        """Fit the imputer on X.
        Parameters
        ----------
        X : array-like shape of (n_samples, n_features)
            Input data, where `n_samples` is the number of samples and
            `n_features` is the number of features.
        Returns
        -------
        self : object
        """
        # Check data integrity and calling arguments
        if not is_scalar_nan(self.missing_values):
            force_all_finite = True
        else:
            force_all_finite = "allow-nan"
            if self.metric not in _NAN_METRICS and not callable(self.metric):
                raise ValueError(
                    "The selected metric does not support NaN values")
        if self.n_neighbors <= 0:
            raise ValueError("Expected n_neighbors > 0. Got {}".format(
                self.n_neighbors))

        X = check_array(X,
                        accept_sparse=False,
                        dtype=FLOAT_DTYPES,
                        force_all_finite=force_all_finite,
                        copy=self.copy)
        super()._fit_indicator(X)

        _check_weights(self.weights)
        self._fit_X = X
        self._mask_fit_X = _get_mask(self._fit_X, self.missing_values)
        return self
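A minimal usage sketch for this fit() (assuming the snippet is scikit-learn's KNNImputer; the import path below comes from that library):

import numpy as np
from sklearn.impute import KNNImputer

X = np.array([[1.0, 2.0, np.nan],
              [3.0, 4.0, 3.0],
              [np.nan, 6.0, 5.0],
              [8.0, 8.0, 7.0]])

# fit() validates X, stores it as the donor set, and records the missing-value
# mask; transform() then fills each NaN from the nearest donor rows.
imputer = KNNImputer(n_neighbors=2)
print(imputer.fit(X).transform(X))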
Example 2
    def __init__(
        self,
        n_neighbors=1,
        weights="uniform",
        distance="dtw",
        distance_params=None,
        **kwargs
    ):
        # Default to an empty dict when no distance parameters are given.
        self._distance_params = {} if distance_params is None else distance_params
        self.distance = distance
        self.distance_params = distance_params

        if isinstance(self.distance, str):
            distance = distance_factory(metric=self.distance)

        super(KNeighborsTimeSeriesClassifier, self).__init__(
            n_neighbors=n_neighbors,
            algorithm="brute",
            metric=distance,
            metric_params=None,  # Extra distance params handled in _fit
            **kwargs
        )
        BaseClassifier.__init__(self)
        self.weights = _check_weights(weights)

        # We need to add is-fitted state when inheriting from scikit-learn
        self._is_fitted = False
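A usage sketch for this constructor (assuming it is sktime's KNeighborsTimeSeriesClassifier; the exact import path is an assumption about the installed version):

import numpy as np
from sktime.classification.distance_based import KNeighborsTimeSeriesClassifier

# Synthetic panel data: (n_instances, n_channels, series_length).
rng = np.random.default_rng(0)
X = rng.normal(size=(20, 1, 50))
y = np.repeat(["a", "b"], 10)

# distance="dtw" is resolved to a callable via distance_factory and handed to
# the scikit-learn parent as metric=..., always with algorithm="brute".
clf = KNeighborsTimeSeriesClassifier(n_neighbors=1, distance="dtw")
clf.fit(X, y)
print(clf.predict(X[:2]))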
Example 3
    def fit(self, X, y=None):
        """Fit the imputer on X.

        Parameters
        ----------
        X : {array-like}, shape (n_samples, n_features)
            Input data, where ``n_samples`` is the number of samples and
            ``n_features`` is the number of features.

        Returns
        -------
        self : object
            Returns self.
        """

        # Check data integrity and calling arguments
        force_all_finite = self.missing_values not in ["NaN", np.nan]
        if not force_all_finite:
            if self.metric not in _MASKED_METRICS and not callable(
                    self.metric):
                raise ValueError(
                    "The selected metric does not support NaN values.")
        X = check_array(X, accept_sparse=False, dtype=np.float64,
                        force_all_finite=force_all_finite, copy=self.copy)
        self.weights = _check_weights(self.weights)

        # Check for +/- inf
        if np.any(np.isinf(X)):
            raise ValueError("+/- inf values are not allowed.")

        # Check if % missing in any column > col_max_missing
        mask = _get_mask(X, self.missing_values)
        if np.any(mask.sum(axis=0) > (X.shape[0] * self.col_max_missing)):
            raise ValueError("Some column(s) have more than {}% missing values"
                             .format(self.col_max_missing * 100))
        X_col_means = np.ma.array(X, mask=mask).mean(axis=0).data

        # Check if % missing in any row > row_max_missing
        bad_rows = mask.sum(axis=1) > (mask.shape[1] * self.row_max_missing)
        if np.any(bad_rows):
            warnings.warn(
                "There are rows with more than {0}% missing values. These "
                "rows are not included as donor neighbors.".format(
                    self.row_max_missing * 100))

            # Remove rows that have more than row_max_missing % missing
            X = X[~bad_rows, :]

        # Check if sufficient neighboring samples available
        if X.shape[0] < self.n_neighbors:
            raise ValueError("There are only %d samples, but n_neighbors=%d."
                             % (X.shape[0], self.n_neighbors))
        self.fitted_X_ = X
        self.statistics_ = X_col_means

        return self
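The masked-array step on the X_col_means line is worth isolating: column means are taken over observed entries only, and the same mask drives the column and row missingness thresholds. A standalone sketch (np.isnan stands in for _get_mask):

import numpy as np

X = np.array([[1.0, np.nan],
              [3.0, 4.0],
              [np.nan, 6.0]])
mask = np.isnan(X)

# Column means over observed values only, as in the fit() above.
col_means = np.ma.array(X, mask=mask).mean(axis=0).data
print(col_means)                      # [2. 5.]

# Per-column and per-row missing fractions, compared in fit() against
# col_max_missing and row_max_missing respectively.
print(mask.sum(axis=0) / X.shape[0])  # [0.333... 0.333...]
print(mask.sum(axis=1) / X.shape[1])  # [0.5 0.  0.5]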
Example 4
    def __init__(self, n_neighbors=5, *, weights='uniform',
                 algorithm='auto', leaf_size=30,
                 p=2, metric='minkowski', metric_params=None, n_jobs=None,
                 **kwargs):
        super().__init__(
            n_neighbors=n_neighbors,
            algorithm=algorithm,
            leaf_size=leaf_size, metric=metric, p=p,
            metric_params=metric_params, n_jobs=n_jobs, **kwargs)
        self.weights = _check_weights(weights)
Example 5
    def __init__(self, n_neighbors=1, weights='uniform', algorithm='brute',
                 metric='dtw', metric_params=None, **kwargs):

        self._cv_for_params = False

        if metric == 'dtw':
            metric = dtw_distance
        elif metric == 'dtwcv':  # special case to force loocv grid search
            # cv in training
            if metric_params is not None:
                warnings.warn(
                    "Warning: measure parameters have been specified for "
                    "dtwcv. "
                    "These will be ignored and parameter values will be "
                    "found using LOOCV.")
            metric = dtw_distance
            self._cv_for_params = True
            self._param_matrix = {
                'metric_params': [{'w': x / 100} for x in range(0, 100)]}
        elif metric == 'ddtw':
            metric = ddtw_distance
        elif metric == 'wdtw':
            metric = wdtw_distance
        elif metric == 'wddtw':
            metric = wddtw_distance
        elif metric == 'lcss':
            metric = lcss_distance
        elif metric == 'erp':
            metric = erp_distance
        elif metric == 'msm':
            metric = msm_distance
        elif metric == 'twe':
            metric = twe_distance
        elif metric == 'mpdist':
            metric = mpdist
            # When mpdist is used, the subsequence length (parameter m) must
            # be set, e.g. KNeighborsTimeSeriesClassifier(
            #     metric='mpdist', metric_params={'m': 30})
        else:
            if isinstance(metric, str):
                raise ValueError(
                    "Unrecognised distance measure: " + metric + ". Allowed "
                    "values are names from [dtw, dtwcv, ddtw, wdtw, wddtw, "
                    "lcss, erp, msm, twe, mpdist] or please pass a callable "
                    "distance measure into the constructor directly.")

        super(KNeighborsTimeSeriesClassifier, self).__init__(
            n_neighbors=n_neighbors,
            algorithm=algorithm,
            metric=metric,
            metric_params=metric_params,
            **kwargs)
        self.weights = _check_weights(weights)

        # We need to add is-fitted state when inheriting from scikit-learn
        self._is_fitted = False
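The dtwcv branch only stores the window grid; the leave-one-out search itself happens later, outside this constructor. A small sketch of what the stored _param_matrix expands to, using scikit-learn's ParameterGrid purely for illustration:

from sklearn.model_selection import ParameterGrid

# DTW window weights w = 0.00, 0.01, ..., 0.99, as built in the constructor.
param_matrix = {'metric_params': [{'w': x / 100} for x in range(0, 100)]}

grid = list(ParameterGrid(param_matrix))
print(len(grid))  # 100 candidate settings for the LOOCV search
print(grid[:2])   # [{'metric_params': {'w': 0.0}}, {'metric_params': {'w': 0.01}}]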
Example 6
    def __init__(
        self,
        n_neighbors=1,
        weights="uniform",
        distance="dtw",
        distance_params=None,
        **kwargs
    ):
        self._cv_for_params = False
        self.distance = distance
        self.distance_params = distance_params

        if distance == "euclidean":  # Euclidean will default to the base class distance
            distance = euclidean_distance
        elif distance == "dtw":
            distance = dtw_distance
        elif distance == "dtwcv":  # special case to force loocv grid search
            # cv in training
            if distance_params is not None:
                warnings.warn(
                    "Warning: measure parameters have been specified for "
                    "dtwcv. "
                    "These will be ignored and parameter values will be "
                    "found using LOOCV."
                )
            distance = dtw_distance
            self._cv_for_params = True
            self._param_matrix = {
                "metric_params": [{"w": x / 100} for x in range(0, 100)]
            }
        elif distance == "ddtw":
            distance = ddtw_distance
        elif distance == "wdtw":
            distance = wdtw_distance
        elif distance == "wddtw":
            distance = wddtw_distance
        elif distance == "lcss":
            distance = lcss_distance
        elif distance == "erp":
            distance = erp_distance
        elif distance == "msm":
            distance = msm_distance
        elif distance == "twe":
            distance = twe_distance
        elif distance == "mpdist":
            distance = mpdist
            # When mpdist is used, the subsequence length (parameter m) must
            # be set, e.g. KNeighborsTimeSeriesClassifier(
            #     distance="mpdist", distance_params={"m": 30})
        else:
            if isinstance(distance, str):
                raise ValueError(
                    "Unrecognised distance measure: " + distance + ". Allowed "
                    "values are names from [euclidean, dtw, dtwcv, ddtw, wdtw, "
                    "wddtw, lcss, erp, msm, twe, mpdist] or please pass a "
                    "callable distance measure into the constructor."
                )

        super(KNeighborsTimeSeriesClassifier, self).__init__(
            n_neighbors=n_neighbors,
            algorithm="brute",
            metric=distance,
            metric_params=distance_params,
            **kwargs
        )
        self.weights = _check_weights(weights)

        # We need to add is-fitted state when inheriting from scikit-learn
        self._is_fitted = False
Example 7
    def _fit(self, X, y=None):
        if self.metric_params is not None and 'p' in self.metric_params:
            if self.p is not None:
                warnings.warn("Parameter p is found in metric_params. "
                              "The corresponding parameter from __init__ "
                              "is ignored.", SyntaxWarning, stacklevel=2)

        if hasattr(self, 'weights') and sklearn_check_version("1.0"):
            self.weights = _check_weights(self.weights)

        if sklearn_check_version("1.0"):
            self._check_feature_names(X, reset=True)

        X_incorrect_type = isinstance(
            X, (KDTree, BallTree, NeighborsBase, BaseNeighborsBase))
        single_output = True
        self._daal_model = None
        shape = None
        correct_n_classes = True

        try:
            requires_y = self._get_tags()["requires_y"]
        except KeyError:
            requires_y = False

        if y is not None or requires_y:
            if not X_incorrect_type or y is None:
                X, y = validate_data(
                    self, X, y, accept_sparse="csr", multi_output=True,
                    dtype=[np.float64, np.float32])
                single_output = not (y.ndim > 1 and y.shape[1] > 1)

            shape = y.shape

            if is_classifier(self) or is_regressor(self):
                if y.ndim == 1 or (y.ndim == 2 and y.shape[1] == 1):
                    self.outputs_2d_ = False
                    y = y.reshape((-1, 1))
                else:
                    self.outputs_2d_ = True

                if is_classifier(self):
                    check_classification_targets(y)
                self.classes_ = []
                self._y = np.empty(y.shape, dtype=int)
                for k in range(self._y.shape[1]):
                    classes, self._y[:, k] = np.unique(
                        y[:, k], return_inverse=True)
                    self.classes_.append(classes)

                if not self.outputs_2d_:
                    self.classes_ = self.classes_[0]
                    self._y = self._y.ravel()

                n_classes = len(self.classes_)
                if n_classes < 2:
                    correct_n_classes = False
            else:
                self._y = y
        else:
            if not X_incorrect_type:
                X, _ = validate_data(
                    self, X, accept_sparse='csr', dtype=[np.float64, np.float32])

        if not X_incorrect_type:
            self.n_samples_fit_ = X.shape[0]
            self.n_features_in_ = X.shape[1]

        try:
            fptype = getFPType(X)
        except ValueError:
            fptype = None

        weights = getattr(self, 'weights', 'uniform')

        def stock_fit(self, X, y):
            if sklearn_check_version("0.24"):
                result = super(NeighborsBase, self)._fit(X, y)
            else:
                result = super(NeighborsBase, self)._fit(X)
            return result

        if self.n_neighbors is not None:
            if self.n_neighbors <= 0:
                raise ValueError(
                    "Expected n_neighbors > 0. Got %d" %
                    self.n_neighbors
                )
            if not isinstance(self.n_neighbors, numbers.Integral):
                raise TypeError(
                    "n_neighbors does not take %s value, "
                    "enter integer value" %
                    type(self.n_neighbors))

        _patching_status = PatchingConditionsChain(
            "sklearn.neighbors.KNeighborsMixin.kneighbors")
        _dal_ready = _patching_status.and_conditions([
            ((self.metric == 'minkowski' and self.p == 2) or
                self.metric == 'euclidean',
                f"'{self.metric}' (p={self.p}) metric is not supported. "
                "Only 'euclidean' or 'minkowski' with p=2 metrics are supported."),
            (not X_incorrect_type, "X is not Tree or Neighbors instance or array."),
            (weights in ['uniform', 'distance'],
                f"'{weights}' weights is not supported. "
                "Only 'uniform' and 'distance' weights are supported."),
            (self.algorithm in ['brute', 'kd_tree', 'auto', 'ball_tree'],
                f"'{self.algorithm}' algorithm is not supported. "
                "Only 'brute', 'kd_tree', 'auto' and 'ball_tree' "
                "algorithms are supported."),
            (single_output, "Multiple outputs are not supported."),
            (fptype is not None, "Unable to get dtype."),
            (not sp.issparse(X), "X is sparse. Sparse input is not supported."),
            (correct_n_classes, "Number of classes < 2.")])
        _patching_status.write_log()
        if _dal_ready:
            try:
                daal4py_fit(self, X, fptype)
                result = self
            except RuntimeError:
                logging.info(
                    "sklearn.neighbors.KNeighborsMixin."
                    "kneighbors: " + get_patch_message("sklearn_after_daal"))
                result = stock_fit(self, X, y)
        else:
            result = stock_fit(self, X, y)

        if y is not None and is_regressor(self):
            self._y = y if shape is None else y.reshape(shape)

        return result
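PatchingConditionsChain.and_conditions pairs each gating condition with the message logged when it fails; only if all conditions hold does the daal4py path run, with a fallback to the stock scikit-learn fit on RuntimeError. A minimal sketch of that gate-and-fallback pattern (the helper below is illustrative, not the real daal4py API):

import logging

def and_conditions(conditions):
    # Log the message of every condition that fails; True only if all hold.
    ready = True
    for ok, fail_message in conditions:
        if not ok:
            logging.info(fail_message)
            ready = False
    return ready

dal_ready = and_conditions([
    (True, "metric is not supported."),
    (False, "X is sparse. Sparse input is not supported."),
])
print(dal_ready)  # False: the caller would fall back to the stock _fit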
Example 8
    def __init__(
        self,
        n_neighbors=1,
        weights="uniform",
        algorithm="brute",
        metric="dtw",
        metric_params=None,
        **kwargs
    ):
        if algorithm == "kd_tree":
            raise ValueError(
                "KNeighborsTimeSeriesClassifier cannot work with kd_tree since kd_tree "
                "cannot be used with a callable distance metric and we do not support "
                "precalculated distances as yet."
            )
        if algorithm == "ball_tree":
            raise ValueError(
                "KNeighborsTimeSeriesClassifier cannot work with ball_tree since "
                "ball_tree has a list of hard coded distances it can use, and cannot "
                "work with 3-D arrays"
            )

        self._cv_for_params = False
        # TODO: add in capacity for euclidean
        # if metric != "euclidean":  # Euclidean will default to the base class distance
        if metric == "dtw":
            metric = dtw_distance
        elif metric == "dtwcv":  # special case to force loocv grid search
            # cv in training
            if metric_params is not None:
                warnings.warn(
                    "Warning: measure parameters have been specified for "
                    "dtwcv. "
                    "These will be ignored and parameter values will be "
                    "found using LOOCV."
                )
            metric = dtw_distance
            self._cv_for_params = True
            self._param_matrix = {
                "metric_params": [{"w": x / 100} for x in range(0, 100)]
            }
        elif metric == "ddtw":
            metric = ddtw_distance
        elif metric == "wdtw":
            metric = wdtw_distance
        elif metric == "wddtw":
            metric = wddtw_distance
        elif metric == "lcss":
            metric = lcss_distance
        elif metric == "erp":
            metric = erp_distance
        elif metric == "msm":
            metric = msm_distance
        elif metric == "twe":
            metric = twe_distance
        elif metric == "mpdist":
            metric = mpdist
            # When mpdist is used, the subsequence length (parameter m) must be set
            # Example: knn_mpdist = KNeighborsTimeSeriesClassifier(
            # metric='mpdist', metric_params={'m':30})
        else:
            if isinstance(metric, str):
                raise ValueError(
                    "Unrecognised distance measure: " + metric + ". Allowed "
                    "values are names from [dtw, dtwcv, ddtw, wdtw, wddtw, "
                    "lcss, erp, msm, twe, mpdist] or please pass a callable "
                    "distance measure into the constructor."
                )

        super(KNeighborsTimeSeriesClassifier, self).__init__(
            n_neighbors=n_neighbors,
            algorithm=algorithm,
            metric=metric,
            metric_params=metric_params,
            **kwargs
        )
        self.weights = _check_weights(weights)

        # We need to add is-fitted state when inheriting from scikit-learn
        self._is_fitted = False
Example 9
    def _fit(self, X, y=None):
        if self.metric_params is not None and 'p' in self.metric_params:
            if self.p is not None:
                warnings.warn(
                    "Parameter p is found in metric_params. "
                    "The corresponding parameter from __init__ "
                    "is ignored.",
                    SyntaxWarning,
                    stacklevel=2)

        if hasattr(self, 'weights') and sklearn_check_version("1.0"):
            self.weights = _check_weights(self.weights)

        X_incorrect_type = isinstance(
            X, (KDTree, BallTree, NeighborsBase, BaseNeighborsBase))
        single_output = True
        self._daal_model = None
        shape = None
        correct_n_classes = True

        try:
            requires_y = self._get_tags()["requires_y"]
        except KeyError:
            requires_y = False

        if y is not None or requires_y:
            if not X_incorrect_type or y is None:
                X, y = validate_data(self,
                                     X,
                                     y,
                                     accept_sparse="csr",
                                     multi_output=True,
                                     dtype=[np.float64, np.float32])
                single_output = not (y.ndim > 1 and y.shape[1] > 1)

            shape = y.shape

            if is_classifier(self) or is_regressor(self):
                if y.ndim == 1 or (y.ndim == 2 and y.shape[1] == 1):
                    self.outputs_2d_ = False
                    y = y.reshape((-1, 1))
                else:
                    self.outputs_2d_ = True

                if is_classifier(self):
                    check_classification_targets(y)
                self.classes_ = []
                self._y = np.empty(y.shape, dtype=int)
                for k in range(self._y.shape[1]):
                    classes, self._y[:, k] = np.unique(y[:, k],
                                                       return_inverse=True)
                    self.classes_.append(classes)

                if not self.outputs_2d_:
                    self.classes_ = self.classes_[0]
                    self._y = self._y.ravel()

                n_classes = len(self.classes_)
                if n_classes < 2:
                    correct_n_classes = False
            else:
                self._y = y
        else:
            if not X_incorrect_type:
                X, _ = validate_data(self,
                                     X,
                                     accept_sparse='csr',
                                     dtype=[np.float64, np.float32])

        if not X_incorrect_type:
            self.n_samples_fit_ = X.shape[0]
            self.n_features_in_ = X.shape[1]

        try:
            fptype = getFPType(X)
        except ValueError:
            fptype = None

        weights = getattr(self, 'weights', 'uniform')

        def stock_fit(self, X, y):
            if sklearn_check_version("0.24"):
                result = super(NeighborsBase, self)._fit(X, y)
            else:
                result = super(NeighborsBase, self)._fit(X)
            return result

        if self.n_neighbors is not None:
            if self.n_neighbors <= 0:
                raise ValueError("Expected n_neighbors > 0. Got %d" %
                                 self.n_neighbors)
            if not isinstance(self.n_neighbors, numbers.Integral):
                raise TypeError("n_neighbors does not take %s value, "
                                "enter integer value" % type(self.n_neighbors))

        condition = (self.metric == 'minkowski' and self.p == 2) or \
            self.metric == 'euclidean'
        _dal_ready = (not X_incorrect_type and
                      weights in ['uniform', 'distance'] and
                      self.algorithm in ['brute', 'kd_tree', 'auto', 'ball_tree'] and
                      condition and single_output and fptype is not None and
                      not sp.issparse(X) and correct_n_classes)
        if _dal_ready:
            try:
                logging.info("sklearn.neighbors.KNeighborsMixin."
                             "kneighbors: " + get_patch_message("daal"))
                daal4py_fit(self, X, fptype)
                result = self
            except RuntimeError:
                logging.info("sklearn.neighbors.KNeighborsMixin."
                             "kneighbors: " +
                             get_patch_message("sklearn_after_daal"))
                result = stock_fit(self, X, y)
        else:
            logging.info("sklearn.neighbors.KNeighborsMixin."
                         "kneighbors: " + get_patch_message("sklearn"))
            result = stock_fit(self, X, y)

        if y is not None and is_regressor(self):
            self._y = y if shape is None else y.reshape(shape)

        return result
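Both _fit variants encode classification targets column-by-column with np.unique(..., return_inverse=True). A standalone sketch of that encoding step:

import numpy as np

y = np.array([["cat"], ["dog"], ["cat"], ["bird"]])
classes_ = []
_y = np.empty(y.shape, dtype=int)
for k in range(_y.shape[1]):
    classes, _y[:, k] = np.unique(y[:, k], return_inverse=True)
    classes_.append(classes)

print(classes_[0])  # ['bird' 'cat' 'dog'] -- sorted class labels
print(_y.ravel())   # [1 2 1 0] -- integer codes into classes_[0]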