Code Example #1
def test_check_X_bad_input_args(X):
    """Test for the correct reaction for bad input in check_X."""
    with pytest.raises(ValueError):
        check_X(X)

    with pytest.raises(ValueError):
        check_X_y(X, y)
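As written, this excerpt relies on pytest parametrization to supply the bad `X` values and on a valid `y` defined at module level; neither appears in the snippet. A minimal, purely illustrative way to drive it — the import path, the `BAD_X_ARGS` list, and the module-level `y` are assumptions, not part of the original test file:

import numpy as np
import pytest
# assumed import path for the validation helpers used throughout these examples
from sktime.utils.validation.panel import check_X, check_X_y

y = np.zeros(10)                                    # assumed module-level valid labels
BAD_X_ARGS = [None, 42, "not a panel", np.ones(3)]  # illustrative bad inputs only

@pytest.mark.parametrize("X", BAD_X_ARGS)
def test_check_X_bad_input_args(X):
    """Each bad X should be rejected by both check_X and check_X_y."""
    with pytest.raises(ValueError):
        check_X(X)
    with pytest.raises(ValueError):
        check_X_y(X, y)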
Code Example #2
def test_check_X_enforce_univariate():
    X, y = make_classification_problem(n_columns=2)
    msg = r"univariate"
    with pytest.raises(ValueError, match=msg):
        check_X(X, enforce_univariate=True)

    with pytest.raises(ValueError, match=msg):
        check_X_y(X, y, enforce_univariate=True)
Code Example #3
def test_check_X_enforce_min_columns():
    X, y = make_classification_problem(n_columns=2)
    msg = r"columns"
    with pytest.raises(ValueError, match=msg):
        check_X(X, enforce_min_columns=3)

    with pytest.raises(ValueError, match=msg):
        check_X_y(X, y, enforce_min_columns=3)
Code Example #4
def test_check_enforce_min_instances():
    X, y = make_classification_problem(n_instances=3)
    msg = r"instance"
    with pytest.raises(ValueError, match=msg):
        check_X(X, enforce_min_instances=4)

    with pytest.raises(ValueError, match=msg):
        check_X_y(X, y, enforce_min_instances=4)

    with pytest.raises(ValueError, match=msg):
        check_y(y, enforce_min_instances=4)
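Taken together, examples #1–#4 document the validation flags used throughout the listings below. A condensed, purely illustrative sketch (the import paths are assumptions, mirroring the tests above):

# assumed import paths, mirroring the tests above
from sktime.utils._testing.panel import make_classification_problem
from sktime.utils.validation.panel import check_X, check_X_y, check_y

X, y = make_classification_problem(n_instances=3, n_columns=2)
check_X_y(X, y)  # passes: a valid nested panel and matching labels

for bad_call in (
    lambda: check_X(X, enforce_univariate=True),   # 2 columns -> ValueError
    lambda: check_X(X, enforce_min_columns=3),     # fewer than 3 columns -> ValueError
    lambda: check_y(y, enforce_min_instances=4),   # only 3 instances -> ValueError
):
    try:
        bad_call()
    except ValueError as err:
        print(err)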
Code Example #5
File: _ensemble.py Project: AidenRushbrooke/sktime
    def _set_oob_score(self, X, y):
        """Compute out-of-bag score."""
        check_X_y(X, y)
        check_X(X, enforce_univariate=True)

        n_classes_ = self.n_classes_
        n_samples = y.shape[0]

        oob_decision_function = []
        oob_score = 0.0
        predictions = [
            np.zeros((n_samples, n_classes_[k]))
            for k in range(self.n_outputs_)
        ]

        n_samples_bootstrap = _get_n_samples_bootstrap(n_samples,
                                                       self.max_samples)

        for estimator in self.estimators_:
            final_estimator = estimator.steps[-1][1]
            unsampled_indices = _generate_unsampled_indices(
                final_estimator.random_state, n_samples, n_samples_bootstrap)
            p_estimator = estimator.predict_proba(X.iloc[unsampled_indices, :])

            if self.n_outputs_ == 1:
                p_estimator = [p_estimator]

            for k in range(self.n_outputs_):
                predictions[k][unsampled_indices, :] += p_estimator[k]

        for k in range(self.n_outputs_):
            if (predictions[k].sum(axis=1) == 0).any():
                warn("Some inputs do not have OOB scores. "
                     "This probably means too few trees were used "
                     "to compute any reliable oob estimates.")

            decision = predictions[k] / predictions[k].sum(axis=1)[:,
                                                                   np.newaxis]
            oob_decision_function.append(decision)
            oob_score += np.mean(y[:, k] == np.argmax(predictions[k], axis=1),
                                 axis=0)

        if self.n_outputs_ == 1:
            self.oob_decision_function_ = oob_decision_function[0]
        else:
            self.oob_decision_function_ = oob_decision_function

        self.oob_score_ = oob_score / self.n_outputs_
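The per-class loop at the end turns accumulated out-of-bag votes into a decision function by dividing each row by its sum; rows that never appeared out of bag stay all-zero and trigger the warning. A standalone numpy sketch of that normalisation:

import numpy as np

predictions = np.array([[3.0, 1.0],
                        [0.0, 0.0],   # instance never left out of any bootstrap
                        [2.0, 2.0]])
with np.errstate(invalid="ignore"):
    decision = predictions / predictions.sum(axis=1)[:, np.newaxis]
print(decision)  # second row becomes NaN, matching the "no OOB score" warning case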
Code Example #6
File: _boss.py Project: whackteachers/sktime
    def fit(self, X, y):
        """Fit a single boss classifier on n_instances cases (X,y).

        Parameters
        ----------
        X : pd.DataFrame of shape [n_instances, 1]
            Nested dataframe with univariate time-series in cells.
        y : array-like, shape = [n_instances] The class labels.

        Returns
        -------
        self : object
        """
        X, y = check_X_y(X, y, enforce_univariate=True, coerce_to_numpy=True)

        sfa = self.transformer.fit_transform(X)
        self.transformed_data = sfa[0]

        self.class_vals = y
        self.num_classes = np.unique(y).shape[0]
        self.classes_ = class_distribution(np.asarray(y).reshape(-1, 1))[0][0]
        for index, classVal in enumerate(self.classes_):
            self.class_dictionary[classVal] = index

        self._is_fitted = True
        return self
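The `classes_` / `class_dictionary` idiom above (repeated in several later examples) just maps each distinct label to its position. An equivalent sketch, using `np.unique` as a stand-in for `class_distribution(...)[0][0]`:

import numpy as np

y = np.array(["b", "a", "c", "b"])
classes_ = np.unique(y)                                   # array(['a', 'b', 'c'])
class_dictionary = {c: i for i, c in enumerate(classes_)}
print(class_dictionary)                                   # {'a': 0, 'b': 1, 'c': 2}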
Code Example #7
    def fit(self, X, y):
        """Fit an estimator using transformed data from the MatrixProfile transformer.

        Parameters
        ----------
        X : nested pandas DataFrame of shape [n_instances, 1]
            Nested dataframe with univariate time-series in cells.
        y : array-like, shape = [n_instances] The class labels.

        Returns
        -------
        self : object
        """
        X, y = check_X_y(X, y, enforce_univariate=True)
        self.classes_ = np.unique(y)
        self.n_classes = self.classes_.shape[0]

        self._transformer = MatrixProfile(m=self.subsequence_length)
        self._estimator = _clone_estimator(
            KNeighborsClassifier(
                n_neighbors=1) if self.estimator is None else self.estimator,
            self.random_state,
        )

        # propagate n_jobs to the wrapped estimator only if it exposes that parameter
        m = getattr(self._estimator, "n_jobs", None)
        if m is not None:
            self._estimator.n_jobs = self.n_jobs

        X_t = self._transformer.fit_transform(X, y)
        self._estimator.fit(X_t, y)

        self._is_fitted = True
        return self
Code Example #8
    def fit(self, X, y):
        """Build a pipeline containing the ROCKET transformer and RidgeClassifierCV.

        Parameters
        ----------
        X : nested pandas DataFrame of shape [n_instances, 1]
            Nested dataframe with univariate time-series in cells.
        y : array-like, shape = [n_instances] The class labels.

        Returns
        -------
        self : object
        """
        X, y = check_X_y(X, y)

        self.n_classes = np.unique(y).shape[0]
        self.classes_ = class_distribution(np.asarray(y).reshape(-1, 1))[0][0]
        for index, classVal in enumerate(self.classes_):
            self.class_dictionary[classVal] = index

        self.classifier = rocket_pipeline = make_pipeline(
            Rocket(
                num_kernels=self.num_kernels,
                random_state=self.random_state,
                n_jobs=self.n_jobs,
            ),
            RidgeClassifierCV(alphas=np.logspace(-3, 3, 10), normalize=True),
        )
        rocket_pipeline.fit(X, y)

        self._is_fitted = True
        return self
Code Example #9
    def fit(self, X, y):
        """Build a forest of trees from the training set (X, y) using supervised
        intervals and summary features
        Parameters
        ----------
        X : array-like or sparse matrix of shape = [n_instances,
        series_length] or shape = [n_instances,n_columns]
            The training input samples.  If a Pandas data frame is passed it
            must have a single column (i.e. univariate
            classification. STSF has no bespoke method for multivariate
            classification as yet.
        y : array-like, shape =  [n_instances]    The class labels.

        Returns
        -------
        self : object
        """
        X, y = check_X_y(
            X,
            y,
            enforce_univariate=True,
            coerce_to_numpy=True,
        )
        X = X.squeeze(1)
        n_instances, _ = X.shape

        rng = check_random_state(self.random_state)

        cls, class_counts = np.unique(y, return_counts=True)
        self.n_classes = class_counts.shape[0]
        self.classes_ = class_distribution(np.asarray(y).reshape(-1, 1))[0][0]

        self.intervals_ = [[[] for _ in range(3)]
                           for _ in range(self.n_estimators)]

        _, X_p = signal.periodogram(X)
        X_d = np.diff(X, 1)

        balance_cases = np.zeros(0, dtype=np.int32)
        average = math.floor(n_instances / self.n_classes)
        for i, c in enumerate(cls):
            if class_counts[i] < average:
                cls_idx = np.where(y == c)[0]
                balance_cases = np.concatenate(
                    (rng.choice(cls_idx, size=average - class_counts[i]),
                     balance_cases))

        self.estimators_ = Parallel(n_jobs=self.n_jobs)(
            delayed(self._fit_estimator)(
                X,
                X_p,
                X_d,
                y,
                np.concatenate((rng.choice(n_instances, size=n_instances),
                                balance_cases)),
                i,
            ) for i in range(self.n_estimators))

        self._is_fitted = True
        return self
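The `balance_cases` block oversamples under-represented classes so each bootstrap passed to `_fit_estimator` is roughly class-balanced. A self-contained numpy sketch of that step:

import math
import numpy as np

rng = np.random.RandomState(0)
y = np.array([0, 0, 0, 0, 1, 1])              # class 1 is under-represented
cls, class_counts = np.unique(y, return_counts=True)
average = math.floor(len(y) / len(cls))       # 3

balance_cases = np.zeros(0, dtype=np.int32)
for i, c in enumerate(cls):
    if class_counts[i] < average:
        cls_idx = np.where(y == c)[0]
        balance_cases = np.concatenate(
            (rng.choice(cls_idx, size=average - class_counts[i]), balance_cases))

# one estimator's training indices: a bootstrap sample plus the balancing extras
sample = np.concatenate((rng.choice(len(y), size=len(y)), balance_cases))
print(sample)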
Code Example #10
File: base.py Project: whackteachers/sktime
    def fit(self, X, y):
        """Fit time series classifier to training data.

        Parameters
        ----------
        X : 3D np.array, array-like or sparse matrix
                of shape = [n_instances,n_dimensions,series_length]
                or shape = [n_instances,series_length]
            or single-column pd.DataFrame with pd.Series entries
        y : array-like, shape =  [n_instances] - the class labels.

        Returns
        -------
        self : reference to self.

        State change
        ------------
        creates fitted model (attributes ending in "_")
        sets is_fitted flag to true
        """

        coerce_to_numpy = self._all_tags()["coerce-X-to-numpy"]

        X, y = check_X_y(X, y, coerce_to_numpy=coerce_to_numpy)

        self._fit(X, y)

        # this should happen last
        self._is_fitted = True

        return self
Code Example #11
    def _get_train_probs(self, X, y):
        self.check_is_fitted()
        X, y = check_X_y(X, y, coerce_to_numpy=True, enforce_univariate=True)

        n_instances, _, series_length = X.shape

        if n_instances != self.n_instances_ or series_length != self.series_length_:
            raise ValueError(
                "n_instances, series_length mismatch. X should be "
                "the same as the training data used in fit for generating train "
                "probabilities.")

        results = np.zeros((n_instances, self.n_classes_))
        divisors = np.zeros(n_instances)

        for i, clf in enumerate(self.estimators_):
            subsample = clf._subsample
            preds = (clf._train_predictions if self.save_train_predictions else
                     Parallel(n_jobs=self._threads_to_use)(
                         delayed(clf._train_predict)(i, )
                         for i in range(len(subsample))))

            for n, pred in enumerate(preds):
                results[subsample[n]][
                    self._class_dictionary[pred]] += self.weights_[i]
                divisors[subsample[n]] += self.weights_[i]

        for i in range(n_instances):
            results[i] = (np.ones(self.n_classes_) * (1 / self.n_classes_)
                          if divisors[i] == 0 else results[i] /
                          (np.ones(self.n_classes_) * divisors[i]))

        return results
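Each estimator's leave-out prediction adds that estimator's weight to the predicted class and to the instance's divisor; instances that never received a vote fall back to a uniform distribution. A one-instance numpy sketch of the final loop:

import numpy as np

n_classes = 3
votes = np.array([2.0, 0.0, 1.0])   # weighted votes accumulated for one instance
divisor = votes.sum()

probs = (np.ones(n_classes) / n_classes      # uniform fallback: no votes at all
         if divisor == 0 else votes / divisor)
print(probs)                                 # [0.667 0.    0.333]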
Code Example #12
File: _checks.py Project: whackteachers/sktime
        def wrapper(self, data, labels=None, **kwargs):
            # Check if pandas so we can convert back
            is_pandas = True if isinstance(data, pd.DataFrame) else False
            pd_idx = data.index if is_pandas else None

            # Fit checks
            if check_fitted:
                self.check_is_fitted()

            # First convert to pandas so everything is the same format
            if labels is None:
                data = check_X(data, coerce_to_pandas=True)
            else:
                data, labels = check_X_y(data, labels, coerce_to_pandas=True)

            # Now convert it to a numpy array
            # Note sktime uses [N, C, L] whereas signature code uses shape
            # [N, L, C] (C being channels) so we must transpose.
            data = np.transpose(from_nested_to_3d_numpy(data), [0, 2, 1])

            # Apply the function to the transposed array
            if labels is None:
                output = func(self, data, **kwargs)
            else:
                output = func(self, data, labels, **kwargs)

            # Convert back
            if all(
                [is_pandas,
                 isinstance(output, np.ndarray), not force_numpy]):
                output = pd.DataFrame(index=pd_idx, data=output)

            return output
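The transpose in this decorator bridges the two layout conventions it mentions: sktime panels are [N, C, L] while the signature code expects [N, L, C]. A shape-only sketch:

import numpy as np

panel_sktime = np.zeros((8, 2, 100))                 # [N, C, L]
panel_signature = np.transpose(panel_sktime, [0, 2, 1])
print(panel_signature.shape)                         # (8, 100, 2) -> [N, L, C]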
Code Example #13
File: _checks.py Project: juanitorduz/sktime
        def wrapper(self, data, labels=None, **kwargs):
            # Check if pandas so we can convert back
            is_pandas = True if isinstance(data, pd.DataFrame) else False
            pd_idx = data.index if is_pandas else None

            # Fit checks
            if check_fitted:
                self.check_is_fitted()

            # First convert to pandas so everything is the same format
            if labels is None:
                data = check_X(data, coerce_to_pandas=True)
            else:
                data, labels = check_X_y(data, labels, coerce_to_pandas=True)

            # Apply the function to the pandas-coerced data
            if labels is None:
                output = func(self, data, **kwargs)
            else:
                output = func(self, data, labels, **kwargs)

            # Convert back
            if all(
                [is_pandas,
                 isinstance(output, np.ndarray), not force_numpy]):
                output = pd.DataFrame(index=pd_idx, data=output)

            return output
Code Example #14
    def fit(self, X, y):
        """Fit an estimator using transformed data from the Catch22 transformer.

        Parameters
        ----------
        X : nested pandas DataFrame of shape [n_instances, n_dims]
            Nested dataframe with univariate time-series in cells.
        y : array-like, shape = [n_instances] The class labels.

        Returns
        -------
        self : object
        """
        X, y = check_X_y(X, y)
        self.classes_ = class_distribution(np.asarray(y).reshape(-1, 1))[0][0]
        self.n_classes = np.unique(y).shape[0]

        self._transformer = Catch22(outlier_norm=self.outlier_norm)
        self._estimator = _clone_estimator(
            RandomForestClassifier(n_estimators=200)
            if self.estimator is None else self.estimator,
            self.random_state,
        )

        # propagate n_jobs to the wrapped estimator only if it exposes that parameter
        m = getattr(self._estimator, "n_jobs", None)
        if m is not None:
            self._estimator.n_jobs = self.n_jobs

        X_t = self._transformer.fit_transform(X, y)
        X_t = np.nan_to_num(X_t, False, 0, 0, 0)
        self._estimator.fit(X_t, y)

        self._is_fitted = True
        return self
Code Example #15
File: _proximity_forest.py Project: wh28325/sktime
    def fit(self, X, y):
        """
        Build the classifier on the training set (X, y)
        ----------
        X : array-like or sparse matrix of shape = [n_instances, n_columns]
            The training input samples.  If a Pandas data frame is passed,
            column 0 is extracted.
        y : array-like, shape = [n_instances]
            The class labels.
        Returns
        -------
        self : object
        """
        X, y = check_X_y(X, y, enforce_univariate=True, coerce_to_pandas=True)

        self.X = dataset_properties.positive_dataframe_indices(X)
        self.random_state = check_random_state(self.random_state)
        # setup label encoding
        if self.label_encoder is None:
            self.label_encoder = LabelEncoder()
            y = self.label_encoder.fit_transform(y)
        self.y = y
        self.classes_ = self.label_encoder.classes_
        if self.distance_measure is None:
            if self.get_distance_measure is None:
                self.get_distance_measure = self.setup_distance_measure(self)
            self.distance_measure = self.get_distance_measure(self)
        self.X_exemplar, self.y_exemplar = self.pick_exemplars(self)
        self._is_fitted = True
        return self
Code Example #16
File: _tsf.py Project: AidenRushbrooke/sktime
    def fit(self, X, y, **kwargs):
        """Wrap BaseForest._fit.

        This is a temporary measure prior to the BaseRegressor refactor.
        """
        X, y = check_X_y(X, y, coerce_to_numpy=True, enforce_univariate=True)
        return BaseTimeSeriesForest._fit(self, X, y, **kwargs)
Code Example #17
File: base.py Project: fspinna/sktime_forked
    def fit(self, X, y):
        """Fit time series classifier to training data.

        Parameters
        ----------
        X : 3D np.array, array-like or sparse matrix
                of shape = [n_instances,n_dimensions,series_length]
                or shape = [n_instances,series_length]
            or single-column pd.DataFrame with pd.Series entries
        y : array-like, shape =  [n_instances] - the class labels.

        Returns
        -------
        self :
            Reference to self.

        Notes
        -----
        Changes state by creating a fitted model that updates attributes
        ending in "_" and sets is_fitted flag to True.
        """
        coerce_to_numpy = self.get_tag("coerce-X-to-numpy", False)

        X, y = check_X_y(X, y, coerce_to_numpy=coerce_to_numpy)

        self._fit(X, y)

        # this should happen last
        self._is_fitted = True

        return self
Code Example #18
    def fit(self, X, y):
        """Build a forest of trees from the training set (X, y).

         Uses random intervals and catch22/tsf summary features.

        Parameters
        ----------
        X : array-like or sparse matrix of shape = [n_instances, n_dimensions,
            series_length] or shape = [n_instances, series_length]
            The training input samples.
        y : array-like, shape = [n_instances]
            The class labels.

        Returns
        -------
        self : object
        """
        X, y = check_X_y(X, y, coerce_to_numpy=True)

        self.n_instances, self.n_dims, self.series_length = X.shape
        self.n_classes = np.unique(y).shape[0]
        self.classes_ = class_distribution(np.asarray(y).reshape(-1, 1))[0][0]

        if self.base_estimator is None or self.base_estimator == "DTC":
            self.tree = DecisionTreeClassifier(criterion="entropy")
        elif self.base_estimator == "CIT":
            self.tree = ContinuousIntervalTree()
        elif isinstance(self.base_estimator, BaseEstimator):
            self.tree = self.base_estimator
        else:
            raise ValueError("DrCIF invalid base estimator given.")

        if self.n_intervals is None:
            self.__n_intervals = int(
                math.sqrt(self.series_length) * math.sqrt(self.n_dims)
            )
        if self.__n_intervals <= 0:
            self.__n_intervals = 1

        if self.series_length < self.min_interval:
            self.min_interval = self.series_length

        if self.max_interval is None:
            self.__max_interval = self.series_length / 2
        if self.__max_interval < self.min_interval:
            self.__max_interval = self.min_interval

        fit = Parallel(n_jobs=self.n_jobs)(
            delayed(self._fit_estimator)(
                X,
                y,
                i,
            )
            for i in range(self.n_estimators)
        )

        self.classifiers, self.intervals, self.dims, self.atts = zip(*fit)

        self._is_fitted = True
        return self
Code Example #19
    def fit(self, X, y):
        """Fit a single TD classifier on n_instances cases (X,y).

        Parameters
        ----------
        X : pd.DataFrame of shape [n_instances, 1]
            Nested dataframe with univariate time-series in cells.
        y : array-like, shape = [n_instances] The class labels.

        Returns
        -------
        self : object
        """
        X, y = check_X_y(X, y, coerce_to_numpy=True)

        self.n_instances, self.n_dims, self.series_length = X.shape
        self.class_vals = y
        self.num_classes = np.unique(y).shape[0]
        self.classes_ = class_distribution(np.asarray(y).reshape(-1, 1))[0][0]
        for index, classVal in enumerate(self.classes_):
            self.class_dictionary[classVal] = index

        # select dimensions using accuracy estimate if multivariate
        if self.n_dims > 1:
            self.dims, self.transformers = self._select_dims(X, y)

            words = [defaultdict(int) for _ in range(self.n_instances)]

            for i, dim in enumerate(self.dims):
                X_dim = X[:, dim, :].reshape(self.n_instances, 1,
                                             self.series_length)
                dim_words = self.transformers[i].transform(X_dim, y)
                dim_words = dim_words[0]

                for i in range(self.n_instances):
                    for word, count in dim_words[i].items():
                        words[i][word << self.highest_dim_bit | dim] = count

            self.transformed_data = words
        else:
            self.transformers.append(
                SFA(
                    word_length=self.word_length,
                    alphabet_size=self.alphabet_size,
                    window_size=self.window_size,
                    norm=self.norm,
                    levels=self.levels,
                    binning_method="information-gain"
                    if self.igb else "equi-depth",
                    bigrams=self.bigrams,
                    remove_repeat_words=True,
                    save_words=False,
                    n_jobs=self.n_jobs,
                ))
            sfa = self.transformers[0].fit_transform(X, y)
            self.transformed_data = sfa[0]

        self._is_fitted = True
        return self
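In the multivariate branch, each per-dimension SFA word is combined with its dimension index by packing the index into the low bits of the key. A minimal sketch of that encoding (the bit width is an assumption for illustration):

highest_dim_bit = 3          # assumed: enough bits to address every dimension
word, dim = 0b1011, 2
key = word << highest_dim_bit | dim
print(bin(key))              # 0b1011010 : word in the high bits, dim in the low bits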
Code Example #20
def check_and_clean_data(X, y=None, input_checks=True):
    '''
    Performs basic sktime data checks and prepares the train data for input to
    Keras models.

    Parameters
    ----------
    X: the train data
    y: the train labels
    input_checks: whether to perform the basic sktime checks

    Returns
    -------
    X
    '''
    if input_checks:
        if y is None:
            check_X(X)
        else:
            check_X_y(X, y)

    # want data in form: [instances = n][timepoints = m][dimensions = d]
    if isinstance(X, pd.DataFrame):
        if _is_nested_dataframe(X):
            if X.shape[1] > 1:
                # we have multiple columns, AND each cell contains a series,
                # so this is a multidimensional problem
                X = _multivariate_nested_df_to_array(X)
            else:
                # we have a single column containing a series, treat this as
                # a univariate problem
                X = _univariate_nested_df_to_array(X)
        else:
            # we have multiple columns each containing a primitive, treat as
            # univariate series
            X = _univariate_df_to_array(X)

    if len(X.shape) == 2:
        # add a dimension to make it multivariate with one dimension
        X = X.values.reshape(
            X.shape[0], X.shape[1], 1
        )  # go from [n][m] to [n][m][d=1]
    # return transposed data to conform with current model formats
    return X.transpose(0, 2, 1)
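The final reshape and transpose only rearrange axes; a plain 2D frame of primitives gains a singleton channel axis before the transpose. A shape-only sketch:

import numpy as np
import pandas as pd

X = pd.DataFrame(np.arange(12).reshape(3, 4))      # [n=3, m=4] primitives
X3 = X.values.reshape(X.shape[0], X.shape[1], 1)   # [n, m, d=1]
print(X3.transpose(0, 2, 1).shape)                 # (3, 1, 4)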
Code Example #21
    def fit(self, X, y):
        """Build the classifier on the training set (X, y).

        Parameters
        ----------
        X : array-like or sparse matrix of shape = [n_instances, n_columns]
            The training input samples.  If a Pandas data frame is passed,
            column 0 is extracted.
        y : array-like, shape = [n_instances]
            The class labels.

        Returns
        -------
        self : object
        """
        X, y = check_X_y(X, y, enforce_univariate=True, coerce_to_pandas=True)
        self.X = positive_dataframe_indices(X)
        self.random_state = check_random_state(self.random_state)
        if self.find_stump is None:
            self.find_stump = best_of_n_stumps(self.n_stump_evaluations)
        # setup label encoding
        if self.label_encoder is None:
            self.label_encoder = LabelEncoder()
            y = self.label_encoder.fit_transform(y)
        self.y = y
        self.classes_ = self.label_encoder.classes_
        if self.distance_measure is None:
            if self.get_distance_measure is None:
                self.get_distance_measure = self.setup_distance_measure(self)
            self.distance_measure = self.get_distance_measure(self)
        self.stump = self.find_stump(self)
        n_branches = len(self.stump.y_exemplar)
        self.branches = [None] * n_branches
        if self.depth < self.max_depth:
            for index in range(n_branches):
                sub_y = self.stump.y_branches[index]
                if not self.is_leaf(sub_y):
                    sub_tree = ProximityTree(
                        random_state=self.random_state,
                        get_exemplars=self.get_exemplars,
                        distance_measure=self.distance_measure,
                        setup_distance_measure=self.setup_distance_measure,
                        get_distance_measure=self.get_distance_measure,
                        get_gain=self.get_gain,
                        is_leaf=self.is_leaf,
                        verbosity=self.verbosity,
                        max_depth=self.max_depth,
                        n_jobs=self.n_jobs,
                    )
                    sub_tree.label_encoder = self.label_encoder
                    sub_tree.depth = self.depth + 1
                    self.branches[index] = sub_tree
                    sub_X = self.stump.X_branches[index]
                    sub_tree.fit(sub_X, sub_y)
        self._is_fitted = True
        return self
Code Example #22
File: _hivecote_v1.py Project: ynjacobs/sktime
    def fit(self, X, y):
        X, y = check_X_y(X, y, enforce_univariate=True, coerce_to_numpy=True)

        self.n_classes = np.unique(y).shape[0]
        self.classes_ = class_distribution(np.asarray(y).reshape(-1, 1))[0][0]

        cv_size = 10
        _, counts = np.unique(y, return_counts=True)
        min_class = np.min(counts)
        if min_class < cv_size:
            cv_size = min_class

        self.stc = ShapeletTransformClassifier(
            random_state=self.random_state,
            time_contract_in_mins=60,
        )
        self.stc.fit(X, y)
        train_preds = cross_val_predict(
            ShapeletTransformClassifier(
                random_state=self.random_state,
                time_contract_in_mins=60,
            ),
            X=X,
            y=y,
            cv=cv_size,
        )
        self.stc_weight = accuracy_score(y, train_preds)**4

        self.tsf = TimeSeriesForest(random_state=self.random_state)
        self.tsf.fit(X, y)
        train_preds = cross_val_predict(
            TimeSeriesForest(random_state=self.random_state),
            X=X,
            y=y,
            cv=cv_size,
        )
        self.tsf_weight = accuracy_score(y, train_preds)**4

        self.rise = RandomIntervalSpectralForest(
            random_state=self.random_state)
        self.rise.fit(X, y)
        train_preds = cross_val_predict(
            RandomIntervalSpectralForest(random_state=self.random_state),
            X=X,
            y=y,
            cv=cv_size,
        )
        self.rise_weight = accuracy_score(y, train_preds)**4

        self.cboss = ContractableBOSS(random_state=self.random_state)
        self.cboss.fit(X, y)
        train_probs = self.cboss._get_train_probs(X)
        train_preds = self.cboss.classes_[np.argmax(train_probs, axis=1)]
        self.cboss_weight = accuracy_score(y, train_preds)**4

        return self
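Each component's ensemble weight is its cross-validated train accuracy raised to the fourth power, so stronger members dominate the combined vote. A quick arithmetic sketch with made-up accuracies:

accuracies = {"stc": 0.90, "tsf": 0.80, "rise": 0.85, "cboss": 0.95}
weights = {name: acc ** 4 for name, acc in accuracies.items()}
print(weights)   # stc: 0.6561, tsf: 0.4096, rise: ~0.522, cboss: ~0.815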
Code Example #23
File: _tde.py Project: juanitorduz/sktime
    def _get_train_probs(self,
                         X,
                         y,
                         train_estimate_method="loocv") -> np.ndarray:
        self.check_is_fitted()
        X, y = check_X_y(X, y, coerce_to_numpy=True)

        n_instances, n_dims, series_length = X.shape

        if (n_instances != self.n_instances_ or n_dims != self.n_dims_
                or series_length != self.series_length_):
            raise ValueError(
                "n_instances, n_dims, series_length mismatch. X should be "
                "the same as the training data used in fit for generating train "
                "probabilities.")

        results = np.zeros((n_instances, self.n_classes_))
        divisors = np.zeros(n_instances)

        if train_estimate_method.lower() == "loocv":
            for i, clf in enumerate(self.estimators_):
                subsample = clf._subsample
                preds = (clf._train_predictions if self.save_train_predictions
                         else Parallel(n_jobs=self._threads_to_use)(
                             delayed(clf._train_predict)(i, )
                             for i in range(len(subsample))))

                for n, pred in enumerate(preds):
                    results[subsample[n]][
                        self._class_dictionary[pred]] += self.weights_[i]
                    divisors[subsample[n]] += self.weights_[i]
        elif train_estimate_method.lower() == "oob":
            indices = range(n_instances)
            for i, clf in enumerate(self.estimators_):
                oob = [n for n in indices if n not in clf._subsample]

                if len(oob) == 0:
                    continue

                preds = clf.predict(X[oob])

                for n, pred in enumerate(preds):
                    results[oob[n]][
                        self._class_dictionary[pred]] += self.weights_[i]
                    divisors[oob[n]] += self.weights_[i]
        else:
            raise ValueError(
                "Invalid train_estimate_method. Available options: loocv, oob")

        for i in range(n_instances):
            results[i] = (np.ones(self.n_classes_) * (1 / self.n_classes_)
                          if divisors[i] == 0 else results[i] /
                          (np.ones(self.n_classes_) * divisors[i]))

        return results
Code Example #24
    def fit(self, X, y):
        """Build a forest of trees from the training set (X, y) using random
        intervals and summary features
        Parameters
        ----------
        X : array-like or sparse matrix of shape = [n_instances,
        series_length] or shape = [n_instances,n_columns]
            The training input samples.  If a Pandas data frame is passed it
            must have a single column (i.e. univariate
            classification. TSF has no bespoke method for multivariate
            classification as yet.
        y : array-like, shape =  [n_instances]    The class labels.

        Returns
        -------
        self : object
        """
        X, y = check_X_y(
            X,
            y,
            enforce_univariate=not TimeSeriesForest.capabilities["multivariate"],
            coerce_to_numpy=True,
        )
        X = X.squeeze(1)
        n_instances, self.series_length = X.shape

        rng = check_random_state(self.random_state)

        self.n_classes = np.unique(y).shape[0]

        self.classes_ = class_distribution(np.asarray(y).reshape(-1, 1))[0][0]
        self.n_intervals = int(math.sqrt(self.series_length))
        if self.n_intervals == 0:
            self.n_intervals = 1
        if self.series_length < self.min_interval:
            self.min_interval = self.series_length

        self.intervals_ = [
            _get_intervals(self.n_intervals, self.min_interval, self.series_length, rng)
            for _ in range(self.n_estimators)
        ]

        self.estimators_ = Parallel(n_jobs=self.n_jobs)(
            delayed(_fit_estimator)(
                X,
                y,
                self.base_estimator,
                self.intervals_[i],
                self.random_state,
            )
            for i in range(self.n_estimators)
        )

        self._is_fitted = True
        return self
Code Example #25
File: base.py Project: RNKuhns/sktime
    def fit(self, X, y):
        """Fit time series classifier to training data.

        Parameters
        ----------
        X : 2D np.array (univariate, equal length series) of shape = [n_instances,
        series_length]
            or 3D np.array (any number of dimensions, equal length series) of shape =
            [n_instances,n_dimensions,series_length]
            or pd.DataFrame with each column a dimension, each cell a pd.Series (any
            number of dimensions, equal or unequal length series)
        y : 1D np.array of shape =  [n_instances] - the class labels.

        Returns
        -------
        self :
            Reference to self.

        Notes
        -----
        Changes state by creating a fitted model that updates attributes
        ending in "_" and sets is_fitted flag to True.
        """
        coerce_to_numpy = self.get_tag("coerce-X-to-numpy")
        coerce_to_pandas = self.get_tag("coerce-X-to-pandas")
        allow_multivariate = self.get_tag("capability:multivariate")
        X, y = check_X_y(
            X,
            y,
            coerce_to_numpy=coerce_to_numpy,
            coerce_to_pandas=coerce_to_pandas,
            enforce_univariate=not allow_multivariate,
        )

        multithread = self.get_tag("capability:multithreading")
        if multithread:
            try:
                self._threads_to_use = check_n_jobs(self.n_jobs)
            except NameError:
                raise AttributeError(
                    "self.n_jobs must be set if capability:multithreading is True"
                )

        self.classes_ = np.unique(y)
        self.n_classes_ = self.classes_.shape[0]
        for index, classVal in enumerate(self.classes_):
            self._class_dictionary[classVal] = index

        self._fit(X, y)

        # this should happen last
        self._is_fitted = True

        return self
Code Example #26
    def fit(self, X, y):
        """Build a forest of trees from the training set (X, y) using random
        intervals and catch22/tsf summary features
        Parameters
        ----------
        X : array-like or sparse matrix of shape = [n_instances,
        series_length] or shape = [n_instances,series_length]
            The training input samples.  If a Pandas data frame is passed it
            must have a single column (i.e. univariate
            classification).
        y : array-like, shape =  [n_instances]    The class labels.

        Returns
        -------
        self : object
        """
        X, y = check_X_y(X, y, coerce_to_numpy=True)

        self.n_instances, self.n_dims, self.series_length = X.shape
        self.n_classes = np.unique(y).shape[0]
        self.classes_ = class_distribution(np.asarray(y).reshape(-1, 1))[0][0]

        if self.n_intervals is None:
            self.__n_intervals = 4 + int(
                (math.sqrt(self.series_length) * math.sqrt(self.n_dims)) / 3)
        if self.__n_intervals <= 0:
            self.__n_intervals = 1
        if self.series_length < self.min_interval:
            self.min_interval = self.series_length

        if self.max_interval is None:
            self.__max_interval = self.series_length / 2
        if self.__max_interval < self.min_interval:
            self.__max_interval = self.min_interval

        _, X_p = signal.periodogram(X)
        X_d = np.diff(X, 1)
        self.total_intervals = self.__n_intervals * 2 + int(
            self.__n_intervals / 2)

        fit = Parallel(n_jobs=self.n_jobs)(delayed(self._fit_estimator)(
            X,
            X_p,
            X_d,
            y,
            i,
        ) for i in range(self.n_estimators))

        self.classifiers, self.intervals, self.dims, self.atts = zip(*fit)

        self._is_fitted = True
        return self
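The interval counts are derived directly from the series length and dimensionality. Worked numbers for an assumed 100-point, 3-dimensional panel:

import math

series_length, n_dims = 100, 3
n_intervals = 4 + int((math.sqrt(series_length) * math.sqrt(n_dims)) / 3)   # 9
total_intervals = n_intervals * 2 + int(n_intervals / 2)                    # 22
print(n_intervals, total_intervals)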
Code Example #27
    def fit(self, X, y):
        X, y = check_X_y(X, y, enforce_univariate=True, coerce_to_numpy=True)

        sfa = self.transformer.fit_transform(X, y)
        self.transformed_data = sfa[0]  # .iloc[:, 0]

        self.class_vals = y
        self.num_classes = np.unique(y).shape[0]
        self.classes_ = class_distribution(np.asarray(y).reshape(-1, 1))[0][0]
        for index, classVal in enumerate(self.classes_):
            self.class_dictionary[classVal] = index

        self._is_fitted = True
        return self
Code Example #28
    def fit(self, X, y):
        """Build a forest of trees from the training set (X, y).

        Parameters
        ----------
        X : np.ndarray or pd.DataFrame
            Panel training data.
        y : np.ndarray
            The class labels.

        Returns
        -------
        self : object
            A fitted instance of the classifier.
        """
        X, y = check_X_y(
            X,
            y,
            enforce_univariate=not self.capabilities["multivariate"],
            coerce_to_numpy=True,
        )
        X = X.squeeze(1)
        n_instances, self.series_length = X.shape

        n_jobs = check_n_jobs(self.n_jobs)

        rng = check_random_state(self.random_state)

        self.n_classes = np.unique(y).shape[0]

        self.classes_ = class_distribution(np.asarray(y).reshape(-1, 1))[0][0]
        self.n_intervals = int(math.sqrt(self.series_length))
        if self.n_intervals == 0:
            self.n_intervals = 1
        if self.series_length < self.min_interval:
            self.min_interval = self.series_length

        self.intervals_ = [
            _get_intervals(self.n_intervals, self.min_interval,
                           self.series_length, rng)
            for _ in range(self.n_estimators)
        ]

        self.estimators_ = Parallel(n_jobs=n_jobs)(
            delayed(_fit_estimator)(_clone_estimator(self.base_estimator, rng),
                                    X, y, self.intervals_[i])
            for i in range(self.n_estimators))

        self._is_fitted = True
        return self
Code Example #29
    def fit(self, X, y):
        """Perform a shapelet transform then builds a random forest.

        Contract default for ST is 5 hours

        Parameters
        ----------
        X : array-like or sparse matrix of shape = [n_instances,
        series_length] or shape = [n_instances,n_columns]
            The training input samples.  If a Pandas data frame is passed it
            must have a single column (i.e. univariate
            classification. RISE has no bespoke method for multivariate
            classification as yet.
        y : array-like, shape =  [n_instances]    The class labels.

        Returns
        -------
        self : object
        """
        X, y = check_X_y(X, y, enforce_univariate=True)

        # if y is a pd.series then convert to array.
        if isinstance(y, pd.Series):
            y = y.to_numpy()

        # generate pipeline in fit so that random state can be propagated properly.
        self.classifier_ = Pipeline([
            (
                "st",
                ContractedShapeletTransform(
                    time_contract_in_mins=self.transform_contract_in_mins,
                    verbose=False,
                    random_state=self.random_state,
                ),
            ),
            (
                "rf",
                RandomForestClassifier(n_estimators=self.n_estimators,
                                       random_state=self.random_state),
            ),
        ])

        self.n_classes_ = np.unique(y).shape[0]
        self.classes_ = class_distribution(np.asarray(y).reshape(-1, 1))[0][0]

        self.classifier_.fit(X, y)

        self._is_fitted = True
        return self
Code Example #30
File: tsfresh.py Project: zhaoyun0071/sktime
    def fit(self, X, y=None):
        """Fit.

        Parameters
        ----------
        X : pd.DataFrame
            nested pandas DataFrame of shape [n_samples, n_columns]
        y : pd.Series or np.array
            Target variable

        Returns
        -------
        self : an instance of self
        """
        # lazy imports to avoid hard dependency
        from tsfresh.transformers.feature_selector import FeatureSelector

        # input checks
        if y is None:
            raise ValueError(
                f"{self.__class__.__name__} requires `y` in `fit`.")
        X, y = check_X_y(X, y, coerce_to_pandas=True)

        self.extractor_ = TSFreshFeatureExtractor(
            default_fc_parameters=self.default_fc_parameters,
            kind_to_fc_parameters=self.kind_to_fc_parameters,
            chunksize=self.chunksize,
            n_jobs=self.n_jobs,
            show_warnings=self.show_warnings,
            disable_progressbar=self.disable_progressbar,
            profiling=self.profiling,
            profiling_filename=self.profiling_filename,
            profiling_sorting=self.profiling_sorting,
        )

        selection_params = self._get_selection_params()
        extraction_param = self._get_extraction_params()
        self.selector_ = FeatureSelector(
            n_jobs=extraction_param["n_jobs"],
            chunksize=extraction_param["chunksize"],
            ml_task=self.ml_task,
            **selection_params,
        )

        Xt = self.extractor_.fit_transform(X)
        self.selector_.fit(Xt, y)
        self._is_fitted = True
        return self