Example no. 1
    def _get_extraction_params(self):
        """Helper function to set default parameters from tsfresh"""
        # make n_jobs compatible with scikit-learn
        self.n_jobs = check_n_jobs(self.n_jobs)

        # lazy imports to avoid hard dependency
        from tsfresh.defaults import (
            CHUNKSIZE,
            DISABLE_PROGRESSBAR,
            N_PROCESSES,
            PROFILING,
            PROFILING_FILENAME,
            PROFILING_SORTING,
            SHOW_WARNINGS,
        )
        from tsfresh.feature_extraction.settings import (
            ComprehensiveFCParameters,
            EfficientFCParameters,
            MinimalFCParameters,
        )
        from tsfresh.utilities.dataframe_functions import impute

        # Set defaults from tsfresh
        extraction_params = {
            "kind_to_fc_parameters": self.kind_to_fc_parameters,
            "n_jobs": N_PROCESSES,
            "chunksize": CHUNKSIZE,
            "show_warnings": SHOW_WARNINGS,
            "disable_progressbar": DISABLE_PROGRESSBAR,
            "impute_function": impute,
            "profiling_sorting": PROFILING_SORTING,
            "profiling_filename": PROFILING_FILENAME,
            "profile": PROFILING,
        }

        # Replace defaults with user defined parameters
        for name in extraction_params.keys():
            if hasattr(self, name):
                value = getattr(self, name)
                if value is not None:
                    extraction_params[name] = value

        # Convert convenience string arguments to tsfresh parameters classes
        fc_param_lookup = {
            "minimal": MinimalFCParameters(),
            "efficient": EfficientFCParameters(),
            "comprehensive": ComprehensiveFCParameters(),
        }
        if isinstance(self.default_fc_parameters, str):
            if self.default_fc_parameters not in fc_param_lookup:
                raise ValueError(
                    f"If `default_fc_parameters` is passed as a string, it must "
                    f"be one of {list(fc_param_lookup.keys())}, but found: "
                    f"{self.default_fc_parameters}"
                )
            else:
                fc_parameters = fc_param_lookup[self.default_fc_parameters]
        else:
            fc_parameters = self.default_fc_parameters
        extraction_params["default_fc_parameters"] = fc_parameters

        return extraction_params
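
The first statement of this example normalises `n_jobs` in the scikit-learn style before the value is forwarded to tsfresh. A rough, minimal sketch of what such a helper can do (an illustration only, not sktime's actual `check_n_jobs` implementation), assuming the usual scikit-learn convention for negative values:

import os


def _check_n_jobs_sketch(n_jobs):
    """Illustrative only: map a scikit-learn style n_jobs to a positive worker count."""
    n_cpus = os.cpu_count() or 1
    if n_jobs is None:
        return 1
    if n_jobs < 0:
        # -1 means "all CPUs", -2 "all but one", and so on
        return max(1, n_cpus + 1 + n_jobs)
    return n_jobs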
Example no. 2
    def fit(self, X: TimeSeriesInstances, y=None) -> BaseEstimator:
        """Fit time series clusterer to training data.

        Parameters
        ----------
        X : np.ndarray or pd.DataFrame
            Training time series instances to cluster. Either a 2D or 3D np.ndarray
            of shape (n_instances, series_length) or
            (n_instances, n_dimensions, series_length), or a pd.DataFrame where each
            column is a dimension and each cell is a pd.Series (any number of
            dimensions, equal or unequal length series).
            Converted to type _tags["X_inner_mtype"].
        y : ignored, exists for API consistency reasons.

        Returns
        -------
        self:
            Fitted estimator.
        """
        X = self._check_clusterer_input(X)

        multithread = self.get_tag("capability:multithreading")
        if multithread:
            try:
                self._threads_to_use = check_n_jobs(self.n_jobs)
            except AttributeError:
                raise AttributeError(
                    "self.n_jobs must be set if capability:multithreading is True"
                )

        start = int(round(time.time() * 1000))
        self._fit(X)
        self.fit_time_ = int(round(time.time() * 1000)) - start
        self._is_fitted = True
        return self
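
The clusterer `fit` above follows a template-method pattern: validate the input, time a call to the subclass hook `_fit`, record the elapsed milliseconds, and set the fitted flag only after fitting succeeded. A stripped-down sketch of just that pattern, independent of sktime:

import time


class _FitTimerSketch:
    """Minimal sketch of the timing and fitted-flag pattern used above."""

    def _fit(self, X):
        pass  # subclass-specific fitting logic would go here

    def fit(self, X, y=None):
        start = int(round(time.time() * 1000))
        self._fit(X)
        self.fit_time_ = int(round(time.time() * 1000)) - start
        self._is_fitted = True  # set last, so a failed fit never looks fitted
        return self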
Example no. 3
    def fit(self, X, y):
        """Fit time series classifier to training data.

        Parameters
        ----------
        X : 3D np.array (any number of dimensions, equal length series)
                of shape [n_instances, n_dimensions, series_length]
            or 2D np.array (univariate, equal length series)
                of shape [n_instances, series_length]
            or pd.DataFrame with each column a dimension, each cell a pd.Series
                (any number of dimensions, equal or unequal length series)
            or of any other supported Panel mtype
                for list of mtypes, see datatypes.SCITYPE_REGISTER
                for specifications, see examples/AA_datatypes_and_datasets.ipynb
        y : 1D np.array of int, of shape [n_instances] - class labels for fitting
            indices correspond to instance indices in X

        Returns
        -------
        self : Reference to self.

        Notes
        -----
        Changes state by creating a fitted model that updates attributes
        ending in "_" and sets is_fitted flag to True.
        """
        start = int(round(time.time() * 1000))
        # convenience conversions to allow user flexibility:
        # if X is 2D array, convert to 3D, if y is Series, convert to numpy
        X, y = _internal_convert(X, y)
        X_metadata = _check_classifier_input(X, y)
        missing = X_metadata["has_nans"]
        multivariate = not X_metadata["is_univariate"]
        unequal = not X_metadata["is_equal_length"]
        # Check this classifier can handle characteristics
        self._check_capabilities(missing, multivariate, unequal)
        # Convert data as dictated by the classifier tags
        X = self._convert_X(X)
        multithread = self.get_tag("capability:multithreading")
        if multithread:
            try:
                self._threads_to_use = check_n_jobs(self.n_jobs)
            except AttributeError:
                raise AttributeError(
                    "self.n_jobs must be set if capability:multithreading is True"
                )

        self.classes_ = np.unique(y)
        self.n_classes_ = self.classes_.shape[0]
        self._class_dictionary = {}
        for index, class_val in enumerate(self.classes_):
            self._class_dictionary[class_val] = index
        self._fit(X, y)
        self.fit_time_ = int(round(time.time() * 1000)) - start
        # this should happen last
        self._is_fitted = True
        return self
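
The block that fills `classes_`, `n_classes_` and `_class_dictionary` just indexes the unique labels; the dictionary later maps class labels to column positions of the probability matrix. A small standalone illustration:

import numpy as np

y = np.array(["cat", "dog", "cat", "fish"])
classes_ = np.unique(y)                                   # ['cat' 'dog' 'fish']
class_dictionary = {c: i for i, c in enumerate(classes_)}
# {'cat': 0, 'dog': 1, 'fish': 2}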
Example no. 4
    def fit(self, X, y):
        """Fit time series classifier to training data.

        Parameters
        ----------
        X : 2D np.array (univariate, equal length series)
                of shape [n_instances, series_length]
            or 3D np.array (any number of dimensions, equal length series)
                of shape [n_instances, n_dimensions, series_length]
            or pd.DataFrame with each column a dimension, each cell a pd.Series
                (any number of dimensions, equal or unequal length series)
        y : 1D np.array of shape [n_instances] - the class labels.

        Returns
        -------
        self :
            Reference to self.

        Notes
        -----
        Changes state by creating a fitted model that updates attributes
        ending in "_" and sets is_fitted flag to True.
        """
        coerce_to_numpy = self.get_tag("coerce-X-to-numpy")
        coerce_to_pandas = self.get_tag("coerce-X-to-pandas")
        allow_multivariate = self.get_tag("capability:multivariate")
        X, y = check_X_y(
            X,
            y,
            coerce_to_numpy=coerce_to_numpy,
            coerce_to_pandas=coerce_to_pandas,
            enforce_univariate=not allow_multivariate,
        )

        multithread = self.get_tag("capability:multithreading")
        if multithread:
            try:
                self._threads_to_use = check_n_jobs(self.n_jobs)
            except AttributeError:
                raise AttributeError(
                    "self.n_jobs must be set if capability:multithreading is True"
                )

        self.classes_ = np.unique(y)
        self.n_classes_ = self.classes_.shape[0]
        for index, classVal in enumerate(self.classes_):
            self._class_dictionary[classVal] = index

        self._fit(X, y)

        # this should happen last
        self._is_fitted = True

        return self
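
The three `get_tag` lookups decide how `check_X_y` validates and coerces the input. A sketch of the resulting keyword arguments for a hypothetical univariate classifier that works on numpy input (the tag values below are made up for illustration):

tags = {
    "coerce-X-to-numpy": True,
    "coerce-X-to-pandas": False,
    "capability:multivariate": False,
}
check_X_y_kwargs = dict(
    coerce_to_numpy=tags["coerce-X-to-numpy"],
    coerce_to_pandas=tags["coerce-X-to-pandas"],
    enforce_univariate=not tags["capability:multivariate"],
)
# {'coerce_to_numpy': True, 'coerce_to_pandas': False, 'enforce_univariate': True}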
Example no. 5
    def _fit(self, X, y):
        self._n_jobs = check_n_jobs(self.n_jobs)

        self.n_instances, self.n_dims, self.series_length = X.shape
        self.n_classes = np.unique(y).shape[0]
        self.classes_ = class_distribution(np.asarray(y).reshape(-1, 1))[0][0]

        if self.time_limit_in_minutes > 0:
            # contracting 2/3 transform (with 1/5 of that taken away for final
            # transform), 1/3 classifier
            third = self.time_limit_in_minutes / 3
            self._classifier_limit_in_minutes = third
            self._transform_limit_in_minutes = (third * 2) / 5 * 4
        elif self.transform_limit_in_minutes > 0:
            self._transform_limit_in_minutes = self.transform_limit_in_minutes

        self._transformer = RandomShapeletTransform(
            n_shapelet_samples=self.n_shapelet_samples,
            max_shapelets=self.max_shapelets,
            max_shapelet_length=self.max_shapelet_length,
            time_limit_in_minutes=self._transform_limit_in_minutes,
            contract_max_n_shapelet_samples=self.contract_max_n_shapelet_samples,
            n_jobs=self.n_jobs,
            batch_size=self.batch_size,
            random_state=self.random_state,
        )

        self._estimator = _clone_estimator(
            RotationForest() if self.estimator is None else self.estimator,
            self.random_state,
        )

        if isinstance(self._estimator, RotationForest):
            self._estimator.save_transformed_data = self.save_transformed_data

        m = getattr(self._estimator, "n_jobs", None)
        if m is not None:
            self._estimator.n_jobs = self._n_jobs

        m = getattr(self._estimator, "time_limit_in_minutes", None)
        if m is not None and self.time_limit_in_minutes > 0:
            self._estimator.time_limit_in_minutes = self._classifier_limit_in_minutes

        X_t = self._transformer.fit_transform(X, y).to_numpy()

        if self.save_transformed_data:
            self.transformed_data = X_t

        self._estimator.fit(X_t, y)
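
The contract split above gives one third of the time budget to the classifier and four fifths of the remaining two thirds to the shapelet transform, holding the last fifth back for the final transform. Worked through for a 60 minute contract:

time_limit_in_minutes = 60
third = time_limit_in_minutes / 3        # 20.0 minutes for the classifier
transform_limit = (third * 2) / 5 * 4    # 32.0 minutes for shapelet search
# the remaining (third * 2) / 5 == 8.0 minutes are left for the final transform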
Example no. 6
    def fit(self, X, y):
        """Build a forest of trees from the training set (X, y).

        Parameters
        ----------
        X : np.ndarray or pd.DataFrame
            Panel training data.
        y : np.ndarray
            The class labels.

        Returns
        -------
        self : object
            A fitted instance of the classifier.
        """
        X, y = check_X_y(
            X,
            y,
            enforce_univariate=not self.capabilities["multivariate"],
            coerce_to_numpy=True,
        )
        X = X.squeeze(1)
        n_instances, self.series_length = X.shape

        n_jobs = check_n_jobs(self.n_jobs)

        rng = check_random_state(self.random_state)

        self.n_classes = np.unique(y).shape[0]

        self.classes_ = class_distribution(np.asarray(y).reshape(-1, 1))[0][0]
        self.n_intervals = int(math.sqrt(self.series_length))
        if self.n_intervals == 0:
            self.n_intervals = 1
        if self.series_length < self.min_interval:
            self.min_interval = self.series_length

        self.intervals_ = [
            _get_intervals(self.n_intervals, self.min_interval,
                           self.series_length, rng)
            for _ in range(self.n_estimators)
        ]

        self.estimators_ = Parallel(n_jobs=n_jobs)(
            delayed(_fit_estimator)(_clone_estimator(self.base_estimator, rng),
                                    X, y, self.intervals_[i])
            for i in range(self.n_estimators))

        self._is_fitted = True
        return self
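
The number of intervals drawn per tree is the square root of the series length, floored and clamped to at least one, and `min_interval` is clamped to the series length. Worked through for a hypothetical series of length 100 with an assumed minimum interval of 3:

import math

series_length = 100
n_intervals = max(int(math.sqrt(series_length)), 1)  # 10 intervals per tree
min_interval = min(3, series_length)                 # 3, since the series is long enough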
Example no. 7
    def _instantiate_model(self):
        n_jobs = check_n_jobs(self.n_jobs)
        sp = check_sp(self.sp, enforce_list=True)

        return self._ModelClass(
            use_box_cox=self.use_box_cox,
            box_cox_bounds=self.box_cox_bounds,
            use_trend=self.use_trend,
            use_damped_trend=self.use_damped_trend,
            seasonal_periods=sp,
            use_arma_errors=self.use_arma_errors,
            show_warnings=self.show_warnings,
            n_jobs=n_jobs,
            multiprocessing_start_method=self.multiprocessing_start_method,
            context=self.context,
        )
Example no. 8
    def _fit(self, X, y):
        self._n_jobs = check_n_jobs(self.n_jobs)

        self.n_instances, self.n_dims, self.series_length = X.shape
        self._class_vals = y
        self.n_classes = np.unique(y).shape[0]
        self.classes_ = class_distribution(np.asarray(y).reshape(-1, 1))[0][0]
        for index, classVal in enumerate(self.classes_):
            self._class_dictionary[classVal] = index

        # select dimensions using accuracy estimate if multivariate
        if self.n_dims > 1:
            self._dims, self._transformers = self._select_dims(X, y)

            words = [defaultdict(int) for _ in range(self.n_instances)]

            for i, dim in enumerate(self._dims):
                X_dim = X[:, dim, :].reshape(self.n_instances, 1,
                                             self.series_length)
                dim_words = self._transformers[i].transform(X_dim, y)
                dim_words = dim_words[0]

                for n in range(self.n_instances):
                    for word, count in dim_words[n].items():
                        words[n][word << self._highest_dim_bit | dim] = count

            self._transformed_data = words
        else:
            self._transformers.append(
                SFA(
                    word_length=self.word_length,
                    alphabet_size=self.alphabet_size,
                    window_size=self.window_size,
                    norm=self.norm,
                    levels=self.levels,
                    binning_method="information-gain"
                    if self.igb else "equi-depth",
                    bigrams=self.bigrams,
                    remove_repeat_words=True,
                    lower_bounding=False,
                    save_words=False,
                    use_fallback_dft=True,
                    n_jobs=self._n_jobs,
                ))
            sfa = self._transformers[0].fit_transform(X, y)
            self._transformed_data = sfa[0]
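
In the multivariate branch, the per-dimension bags of SFA words are merged into one bag by packing the dimension index into the low bits of each word key, so identical words from different dimensions stay distinct. A worked example, assuming `_highest_dim_bit` reserves 3 bits (enough for up to 8 dimensions):

highest_dim_bit = 3                 # assumed number of bits reserved for the dimension
word, dim, count = 0b1011, 2, 5
key = word << highest_dim_bit | dim
# key == 0b1011010: the SFA word sits in the high bits, the dimension in the low bits
bag = {key: count}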
Example no. 9
    def _fit(self, X, y):
        self._n_jobs = check_n_jobs(self.n_jobs)

        self.n_instances, self.n_dims, self.series_length = X.shape
        self.n_classes = np.unique(y).shape[0]
        self.classes_ = class_distribution(np.asarray(y).reshape(-1, 1))[0][0]

        if self.base_estimator == "DTC":
            self._base_estimator = DecisionTreeClassifier(criterion="entropy")
        elif self.base_estimator == "CIT":
            self._base_estimator = ContinuousIntervalTree()
        elif isinstance(self.base_estimator, BaseEstimator):
            self._base_estimator = self.base_estimator
        else:
            raise ValueError("DrCIF invalid base estimator given.")

        if self.n_intervals is None:
            self._n_intervals = int(
                math.sqrt(self.series_length) * math.sqrt(self.n_dims)
            )
        else:
            self._n_intervals = self.n_intervals
        if self._n_intervals <= 0:
            self._n_intervals = 1

        self._att_subsample_size = self.att_subsample_size
        if self.att_subsample_size > 25:
            self._att_subsample_size = 25

        self._min_interval = self.min_interval
        if self.series_length < self.min_interval:
            self._min_interval = self.series_length
        elif self.min_interval < 3:
            self._min_interval = 3

        if self.max_interval is None:
            self._max_interval = self.series_length / 2
        else:
            self._max_interval = self.max_interval
        if self._max_interval < self._min_interval:
            self._max_interval = self._min_interval

        fit = Parallel(n_jobs=self._n_jobs)(
            delayed(self._fit_estimator)(
                X,
                y,
                i,
            )
            for i in range(self.n_estimators)
        )

        self.estimators_, self.intervals, self.dims, self.atts = zip(*fit)
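
Here, and in several of the later examples, the ensemble members are built with joblib's `Parallel`/`delayed` idiom: one delayed call per estimator, fanned out over the worker pool. A self-contained sketch of the idiom:

from joblib import Parallel, delayed


def _build_member(i):
    # stand-in for a _fit_estimator helper; returns one ensemble member's state
    return i * i


members = Parallel(n_jobs=2)(delayed(_build_member)(i) for i in range(4))
# [0, 1, 4, 9] -- results come back in submission order, one per member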
Example no. 10
    def fit(self, X, y):
        """
        Build an ensemble of pipelines containing the ROCKET transformer and
        RidgeClassifierCV classifier.

        Parameters
        ----------
        X : nested pandas DataFrame of shape [n_instances, 1]
            Nested dataframe with univariate time-series in cells.
        y : array-like, shape = [n_instances] The class labels.

        Returns
        -------
        self : object
        """
        X, y = check_X_y(X, y)
        n_jobs = check_n_jobs(self.n_jobs)

        self.n_classes = np.unique(y).shape[0]
        self.classes_ = class_distribution(np.asarray(y).reshape(-1, 1))[0][0]
        for index, classVal in enumerate(self.classes_):
            self.class_dictionary[classVal] = index

        base_estimator = _make_estimator(self.num_kernels, self.random_state)
        self.estimators_ = Parallel(n_jobs=n_jobs)(
            delayed(_fit_estimator)(_clone_estimator(base_estimator,
                                                     self.random_state), X, y)
            for _ in range(self.n_estimators))

        self.weights = []
        self.weight_sum = 0
        for rocket_pipeline in self.estimators_:
            weight = rocket_pipeline.steps[1][1].best_score_
            self.weights.append(weight)
            self.weight_sum += weight

        self._is_fitted = True
        return self
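
Each pipeline's weight is the cross-validated accuracy (`best_score_`) of its RidgeClassifierCV step, and `weight_sum` is kept so the votes can be normalised at prediction time. In miniature, with made-up scores:

best_scores = [0.91, 0.85, 0.88]      # hypothetical best_score_ values of three pipelines
weights = list(best_scores)
weight_sum = sum(weights)             # 2.64
normalised_votes = [w / weight_sum for w in weights]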
Example no. 11
    def _fit(self, X, y):
        self._n_jobs = check_n_jobs(self.n_jobs)

        if self.n_parameter_samples <= self.randomly_selected_params:
            print(  # noqa
                "TDE Warning: n_parameter_samples <= randomly_selected_params, ",
                "ensemble member parameters will be fully randomly selected.",
            )

        self.n_instances, self.n_dims, self.series_length = X.shape
        self.n_classes = np.unique(y).shape[0]
        self.classes_ = class_distribution(np.asarray(y).reshape(-1, 1))[0][0]
        for index, classVal in enumerate(self.classes_):
            self._class_dictionary[classVal] = index

        self.estimators_ = []
        self.weights = []
        self._prev_parameters_x = []
        self._prev_parameters_y = []

        # Window length parameter space dependent on series length
        max_window_searches = self.series_length / 4
        max_window = int(self.series_length * self.max_win_len_prop)
        win_inc = int((max_window - self.min_window) / max_window_searches)
        if win_inc < 1:
            win_inc = 1
        if self.min_window > max_window + 1:
            raise ValueError(
                f"Error in TemporalDictionaryEnsemble: min_window = "
                f"{self.min_window} is larger than max_window = {max_window} "
                f"(series length is {self.series_length}). Try setting "
                f"min_window to be smaller than the series length in the "
                f"constructor, but the classifier may not work at all with "
                f"very short series."
            )

        possible_parameters = self._unique_parameters(max_window, win_inc)
        num_classifiers = 0
        subsample_size = int(self.n_instances * 0.7)
        lowest_acc = 1
        lowest_acc_idx = 0

        time_limit = self.time_limit_in_minutes * 60
        start_time = time.time()
        train_time = 0
        if time_limit > 0:
            n_parameter_samples = 0
        else:
            n_parameter_samples = self.n_parameter_samples

        rng = check_random_state(self.random_state)

        if self.bigrams is None:
            if self.n_dims > 1:
                use_bigrams = False
            else:
                use_bigrams = True
        else:
            use_bigrams = self.bigrams

        # use time limit or n_parameter_samples if limit is 0
        while (train_time < time_limit or num_classifiers < n_parameter_samples
               ) and len(possible_parameters) > 0:
            if num_classifiers < self.randomly_selected_params:
                parameters = possible_parameters.pop(
                    rng.randint(0, len(possible_parameters)))
            else:
                scaler = preprocessing.StandardScaler()
                scaler.fit(self._prev_parameters_x)
                gp = KernelRidge(kernel="poly", degree=1)
                gp.fit(scaler.transform(self._prev_parameters_x),
                       self._prev_parameters_y)
                preds = gp.predict(scaler.transform(possible_parameters))
                parameters = possible_parameters.pop(
                    rng.choice(np.flatnonzero(preds == preds.max())))

            subsample = rng.choice(self.n_instances,
                                   size=subsample_size,
                                   replace=False)
            X_subsample = X[subsample]
            y_subsample = y[subsample]

            tde = IndividualTDE(
                *parameters,
                alphabet_size=self._alphabet_size,
                bigrams=use_bigrams,
                dim_threshold=self.dim_threshold,
                max_dims=self.max_dims,
                random_state=self.random_state,
            )
            tde.fit(X_subsample, y_subsample)
            tde._subsample = subsample

            if self.save_train_predictions:
                tde._train_predictions = np.zeros(subsample_size)

            tde._accuracy = self._individual_train_acc(
                tde,
                y_subsample,
                subsample_size,
                0 if num_classifiers < self.max_ensemble_size else lowest_acc,
            )
            if tde._accuracy > 0:
                weight = math.pow(tde._accuracy, 4)
            else:
                weight = 0.000000001

            if num_classifiers < self.max_ensemble_size:
                if tde._accuracy < lowest_acc:
                    lowest_acc = tde._accuracy
                    lowest_acc_idx = num_classifiers
                self.weights.append(weight)
                self.estimators_.append(tde)
            elif tde._accuracy > lowest_acc:
                self.weights[lowest_acc_idx] = weight
                self.estimators_[lowest_acc_idx] = tde
                lowest_acc, lowest_acc_idx = self._worst_ensemble_acc()

            self._prev_parameters_x.append(parameters)
            self._prev_parameters_y.append(tde._accuracy)

            num_classifiers += 1
            train_time = time.time() - start_time

        self.n_estimators = len(self.estimators_)
        self._weight_sum = np.sum(self.weights)
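
Ensemble members are weighted by their subsample accuracy raised to the fourth power, which widens the gap between strong and weak members; a member with zero accuracy gets a tiny positive weight instead. For example:

import math

accuracies = [0.95, 0.80, 0.60, 0.0]
weights = [math.pow(a, 4) if a > 0 else 1e-9 for a in accuracies]
# [0.81450625, 0.4096, 0.1296, 1e-09]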
Example no. 12
    def _fit(self, X, y):
        self._n_jobs = check_n_jobs(self.n_jobs)

        self.n_instances, self.n_dims, self.series_length = X.shape
        self.n_classes = np.unique(y).shape[0]
        self.classes_ = class_distribution(np.asarray(y).reshape(-1, 1))[0][0]

        time_limit = self.time_limit_in_minutes * 60
        start_time = time.time()
        train_time = 0

        if self.base_estimator == "DTC":
            self._base_estimator = DecisionTreeClassifier(criterion="entropy")
        elif self.base_estimator == "CIT":
            self._base_estimator = ContinuousIntervalTree()
        elif isinstance(self.base_estimator, BaseEstimator):
            self._base_estimator = self.base_estimator
        else:
            raise ValueError("DrCIF invalid base estimator given.")

        X_p = np.zeros(
            (
                self.n_instances,
                self.n_dims,
                int(
                    math.pow(2, math.ceil(math.log(self.series_length, 2)))
                    - self.series_length
                ),
            )
        )
        X_p = np.concatenate((X, X_p), axis=2)
        X_p = np.abs(np.fft.fft(X_p)[:, :, : int(X_p.shape[2] / 2)])

        X_d = np.diff(X, 1)

        if self.n_intervals is None:
            self._n_intervals = [None, None, None]
            self._n_intervals[0] = 4 + int(
                (math.sqrt(self.series_length) * math.sqrt(self.n_dims)) / 3
            )
            self._n_intervals[1] = 4 + int(
                (math.sqrt(X_p.shape[2]) * math.sqrt(self.n_dims)) / 3
            )
            self._n_intervals[2] = 4 + int(
                (math.sqrt(X_d.shape[2]) * math.sqrt(self.n_dims)) / 3
            )
        elif isinstance(self.n_intervals, int):
            self._n_intervals = [self.n_intervals, self.n_intervals, self.n_intervals]
        elif isinstance(self.n_intervals, list) and len(self.n_intervals) == 3:
            self._n_intervals = self.n_intervals
        else:
            raise ValueError("DrCIF n_intervals must be an int or list of length 3.")
        for i, n in enumerate(self._n_intervals):
            if n <= 0:
                self._n_intervals[i] = 1

        self._att_subsample_size = self.att_subsample_size
        if self.att_subsample_size > 25:
            self._att_subsample_size = 25

        if isinstance(self.min_interval, int):
            self._min_interval = [
                self.min_interval,
                self.min_interval,
                self.min_interval,
            ]
        elif isinstance(self.min_interval, list) and len(self.min_interval) == 3:
            self._min_interval = self.min_interval
        else:
            raise ValueError("DrCIF min_interval must be an int or list of length 3.")
        if self.series_length < self._min_interval[0]:
            self._min_interval[0] = self.series_length
        if X_p.shape[2] < self._min_interval[1]:
            self._min_interval[1] = X_p.shape[2]
        if X_d.shape[2] < self._min_interval[2]:
            self._min_interval[2] = X_d.shape[2]

        if self.max_interval is None:
            self._max_interval = [
                self.series_length / 2,
                X_p.shape[2] / 2,
                X_d.shape[2] / 2,
            ]
        elif isinstance(self.max_interval, int):
            self._max_interval = [
                self.max_interval,
                self.max_interval,
                self.max_interval,
            ]
        elif isinstance(self.max_interval, list) and len(self.max_interval) == 3:
            self._max_interval = self.max_interval
        else:
            raise ValueError("DrCIF max_interval must be an int or list of length 3.")
        for i, n in enumerate(self._max_interval):
            if n < self._min_interval[i]:
                self._max_interval[i] = self._min_interval[i]

        self.total_intervals = sum(self._n_intervals)

        if time_limit > 0:
            self._n_estimators = 0
            self.estimators_ = []
            self.intervals = []
            self.atts = []
            self.dims = []
            self.transformed_data = []

            while (
                train_time < time_limit
                and self._n_estimators < self.contract_max_n_estimators
            ):
                fit = Parallel(n_jobs=self._n_jobs)(
                    delayed(self._fit_estimator)(
                        X,
                        X_p,
                        X_d,
                        y,
                        i,
                    )
                    for i in range(self._n_jobs)
                )

                (
                    estimators,
                    intervals,
                    dims,
                    atts,
                    transformed_data,
                ) = zip(*fit)

                self.estimators_ += estimators
                self.intervals += intervals
                self.atts += atts
                self.dims += dims
                self.transformed_data += transformed_data

                self._n_estimators += self._n_jobs
                train_time = time.time() - start_time
        else:
            fit = Parallel(n_jobs=self._n_jobs)(
                delayed(self._fit_estimator)(
                    X,
                    X_p,
                    X_d,
                    y,
                    i,
                )
                for i in range(self._n_estimators)
            )

            (
                self.estimators_,
                self.intervals,
                self.dims,
                self.atts,
                self.transformed_data,
            ) = zip(*fit)
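
The periodogram channel X_p pads each series with zeros up to the next power of two, takes the FFT, and keeps only the first half of the magnitude spectrum. Worked through for a series of length 100:

import math

series_length = 100
padded_length = int(math.pow(2, math.ceil(math.log(series_length, 2))))  # 128
pad = padded_length - series_length                                      # 28 zeros appended
kept_bins = padded_length // 2                                           # 64 magnitude bins kept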
Example no. 13
    def _fit(self, X, y):
        """Fit HIVE-COTE 1.0 to training data.

        Parameters
        ----------
        X : 3D np.array of shape = [n_instances, n_dimensions, series_length]
            The training data.
        y : array-like, shape = [n_instances]
            The class labels.

        Returns
        -------
        self :
            Reference to self.

        Notes
        -----
        Changes state by creating a fitted model that updates attributes
        ending in "_" and sets is_fitted flag to True.
        """
        self.n_jobs_ = check_n_jobs(self.n_jobs)

        self.n_classes_ = np.unique(y).shape[0]
        self.classes_ = class_distribution(np.asarray(y).reshape(-1, 1))[0][0]

        # Default values from HC1 paper
        if self.stc_params is None:
            self._stc_params = {"transform_limit_in_minutes": 120}
        if self.tsf_params is None:
            self._tsf_params = {"n_estimators": 500}
        if self.rise_params is None:
            self._rise_params = {"n_estimators": 500}
        if self.cboss_params is None:
            self._cboss_params = {}

        # Cross-validation size for TSF and RISE
        cv_size = 10
        _, counts = np.unique(y, return_counts=True)
        min_class = np.min(counts)
        if min_class < cv_size:
            cv_size = min_class

        # Build STC
        self._stc = ShapeletTransformClassifier(
            **self._stc_params,
            save_transformed_data=True,
            random_state=self.random_state,
            n_jobs=self.n_jobs_,
        )
        self._stc.fit(X, y)

        if self.verbose > 0:
            print("STC ", datetime.now().strftime("%H:%M:%S %d/%m/%Y"))  # noqa

        # Find STC weight using train set estimate
        train_probs = self._stc._get_train_probs(X, y)
        train_preds = self._stc.classes_[np.argmax(train_probs, axis=1)]
        self.stc_weight_ = accuracy_score(y, train_preds)**4

        if self.verbose > 0:
            print(  # noqa
                "STC train estimate ",
                datetime.now().strftime("%H:%M:%S %d/%m/%Y"),
            )
            print("STC weight = " + str(self.stc_weight_))  # noqa

        # Build TSF
        self._tsf = TimeSeriesForestClassifier(
            **self._tsf_params,
            random_state=self.random_state,
            n_jobs=self.n_jobs_,
        )
        self._tsf.fit(X, y)

        if self.verbose > 0:
            print("TSF ", datetime.now().strftime("%H:%M:%S %d/%m/%Y"))  # noqa

        # Find TSF weight using train set estimate found through CV
        train_preds = cross_val_predict(
            TimeSeriesForestClassifier(**self._tsf_params,
                                       random_state=self.random_state),
            X=X,
            y=y,
            cv=cv_size,
            n_jobs=self.n_jobs_,
        )
        self.tsf_weight_ = accuracy_score(y, train_preds)**4

        if self.verbose > 0:
            print(  # noqa
                "TSF train estimate ",
                datetime.now().strftime("%H:%M:%S %d/%m/%Y"),
            )
            print("TSF weight = " + str(self.tsf_weight_))  # noqa

        # Build RISE
        self._rise = RandomIntervalSpectralForest(
            **self._rise_params,
            random_state=self.random_state,
            n_jobs=self.n_jobs_,
        )
        self._rise.fit(X, y)

        if self.verbose > 0:
            print("RISE ",
                  datetime.now().strftime("%H:%M:%S %d/%m/%Y"))  # noqa

        # Find RISE weight using train set estimate found through CV
        train_preds = cross_val_predict(
            RandomIntervalSpectralForest(
                **self._rise_params,
                random_state=self.random_state,
            ),
            X=X,
            y=y,
            cv=cv_size,
            n_jobs=self.n_jobs_,
        )
        self.rise_weight_ = accuracy_score(y, train_preds)**4

        if self.verbose > 0:
            print(  # noqa
                "RISE train estimate ",
                datetime.now().strftime("%H:%M:%S %d/%m/%Y"),
            )
            print("RISE weight = " + str(self.rise_weight_))  # noqa

        # Build cBOSS
        self._cboss = ContractableBOSS(**self._cboss_params,
                                       random_state=self.random_state,
                                       n_jobs=self.n_jobs_)
        self._cboss.fit(X, y)

        # Find cBOSS weight using train set estimate
        train_probs = self._cboss._get_train_probs(X, y)
        train_preds = self._cboss.classes_[np.argmax(train_probs, axis=1)]
        self.cboss_weight_ = accuracy_score(y, train_preds)**4

        if self.verbose > 0:
            print(  # noqa
                "cBOSS (estimate included)",
                datetime.now().strftime("%H:%M:%S %d/%m/%Y"),
            )
            print("cBOSS weight = " + str(self.cboss_weight_))  # noqa
Example no. 14
    def _fit(self, X, y):
        self._n_jobs = check_n_jobs(self.n_jobs)

        self.n_instances, self.n_dims, self.series_length = X.shape
        self.n_classes = np.unique(y).shape[0]
        self.classes_ = class_distribution(np.asarray(y).reshape(-1, 1))[0][0]
        for index, classVal in enumerate(self.classes_):
            self._class_dictionary[classVal] = index

        time_limit = self.time_limit_in_minutes * 60
        start_time = time.time()
        train_time = 0

        base_rocket = Rocket(num_kernels=self.num_kernels)

        if time_limit > 0:
            self.n_estimators = 0
            self.estimators_ = []
            self.transformed_data = []

            while (
                train_time < time_limit
                and self.n_estimators < self.contract_max_n_estimators
            ):
                fit = Parallel(n_jobs=self._n_jobs)(
                    delayed(self._fit_estimator)(
                        _clone_estimator(
                            base_rocket,
                            None
                            if self.random_state is None
                            else (255 if self.random_state == 0 else self.random_state)
                            * 37
                            * (i + 1),
                        ),
                        X,
                        y,
                    )
                    for i in range(self._n_jobs)
                )

                estimators, transformed_data = zip(*fit)

                self.estimators_ += estimators
                self.transformed_data += transformed_data

                self.n_estimators += self._n_jobs
                train_time = time.time() - start_time
        else:
            fit = Parallel(n_jobs=self._n_jobs)(
                delayed(self._fit_estimator)(
                    _clone_estimator(
                        base_rocket,
                        None
                        if self.random_state is None
                        else (255 if self.random_state == 0 else self.random_state)
                        * 37
                        * (i + 1),
                    ),
                    X,
                    y,
                )
                for i in range(self.n_estimators)
            )

            self.estimators_, self.transformed_data = zip(*fit)

        self.weights = []
        self._weight_sum = 0
        for rocket_pipeline in self.estimators_:
            weight = rocket_pipeline.steps[1][1].best_score_
            self.weights.append(weight)
            self._weight_sum += weight
Example no. 15
    def fit(self, X, y):
        """Fit a forest of trees on cases (X,y), where y is the target variable.

        Parameters
        ----------
        X : ndarray of shape = [n_instances,n_attributes]
            The training input samples.
        y : array-like, shape = [n_instances]
            The class labels.

        Returns
        -------
        self : object
        """
        if isinstance(X, np.ndarray) and len(X.shape) == 3 and X.shape[1] == 1:
            X = np.reshape(X, (X.shape[0], -1))
        elif isinstance(X, pd.DataFrame) and len(X.shape) == 2:
            X = X.to_numpy()
        elif not isinstance(X, np.ndarray) or len(X.shape) > 2:
            raise ValueError("RotationForest is not a time series classifier. "
                             "A 2d numpy array is required.")
        X, y = check_X_y(X, y)

        self._n_jobs = check_n_jobs(self.n_jobs)

        self.n_instances, self.n_atts = X.shape
        self.classes_ = np.unique(y)
        self.n_classes = self.classes_.shape[0]
        for index, classVal in enumerate(self.classes_):
            self._class_dictionary[classVal] = index

        time_limit = self.time_limit_in_minutes * 60
        start_time = time.time()
        train_time = 0

        if self.base_estimator is None:
            self._base_estimator = DecisionTreeClassifier(criterion="entropy")

        # replace missing values with 0 and remove useless attributes
        X = np.nan_to_num(X, False, 0, 0, 0)
        self._useful_atts = ~np.all(X[1:] == X[:-1], axis=0)
        X = X[:, self._useful_atts]

        self._n_atts = X.shape[1]

        # normalise attributes
        self._min = X.min(axis=0)
        self._ptp = X.max(axis=0) - self._min
        X = (X - self._min) / self._ptp

        X_cls_split = [X[np.where(y == i)] for i in self.classes_]

        if time_limit > 0:
            self._n_estimators = 0
            self.estimators_ = []
            self._pcas = []
            self._groups = []
            self.transformed_data = []

            while (train_time < time_limit
                   and self._n_estimators < self.contract_max_n_estimators):
                fit = Parallel(n_jobs=self._n_jobs)(
                    delayed(self._fit_estimator)(
                        X,
                        X_cls_split,
                        y,
                        i,
                    ) for i in range(self._n_jobs))

                estimators, pcas, groups, transformed_data = zip(*fit)

                self.estimators_ += estimators
                self._pcas += pcas
                self._groups += groups
                self.transformed_data += transformed_data

                self._n_estimators += self._n_jobs
                train_time = time.time() - start_time
        else:
            fit = Parallel(n_jobs=self._n_jobs)(delayed(self._fit_estimator)(
                X,
                X_cls_split,
                y,
                i,
            ) for i in range(self._n_estimators))

            self.estimators_, self._pcas, self._groups, self.transformed_data = zip(
                *fit)

        self._is_fitted = True
        return self
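
Before the per-group PCA, constant attributes are dropped and the remaining attributes are rescaled to [0, 1]. A tiny worked example of those two steps:

import numpy as np

X = np.array([[1.0, 5.0, 2.0],
              [1.0, 7.0, 4.0],
              [1.0, 6.0, 3.0]])
useful_atts = ~np.all(X[1:] == X[:-1], axis=0)  # drops the constant first column
X = X[:, useful_atts]
X = (X - X.min(axis=0)) / (X.max(axis=0) - X.min(axis=0))
# [[0.  0. ], [1.  1. ], [0.5 0.5]]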
Example no. 16
    def _fit(self, X, y=None):
        """Fit the shapelet transform to a specified X and y.

        Parameters
        ----------
        X: pandas DataFrame or np.ndarray
            The training input samples.
        y: array-like or list
            The class values for X.

        Returns
        -------
        self : RandomShapeletTransform
            This estimator.
        """
        # reflected lists are only pending deprecation in numba, and heaps don't
        # support the typed-list replacement yet, so silence the warning
        warnings.simplefilter("ignore", category=NumbaPendingDeprecationWarning)

        self._n_jobs = check_n_jobs(self.n_jobs)

        self.classes_, self._class_counts = np.unique(y, return_counts=True)
        self.n_classes = self.classes_.shape[0]
        for index, classVal in enumerate(self.classes_):
            self._class_dictionary[classVal] = index

        le = preprocessing.LabelEncoder()
        y = le.fit_transform(y)

        self.n_instances, self.n_dims, self.series_length = X.shape

        if self.max_shapelets is None:
            self._max_shapelets = min(10 * self.n_instances, 1000)

        if self.max_shapelet_length is None:
            self._max_shapelet_length = self.series_length

        time_limit = self.time_limit_in_minutes * 60
        start_time = time.time()
        fit_time = 0

        max_shapelets_per_class = self._max_shapelets / self.n_classes
        shapelets = [[(-1.0, -1, -1, -1, -1, -1)]
                     for _ in range(self.n_classes)]
        n_shapelets_extracted = 0

        if time_limit > 0:
            while (fit_time < time_limit and n_shapelets_extracted <
                   self.contract_max_n_shapelet_samples):
                candidate_shapelets = Parallel(n_jobs=self._n_jobs)(
                    delayed(self._extract_random_shapelet)(
                        X,
                        y,
                        n_shapelets_extracted + i,
                        shapelets,
                        max_shapelets_per_class,
                    ) for i in range(self._batch_size))

                for i, heap in enumerate(shapelets):
                    RandomShapeletTransform._merge_shapelets(
                        heap,
                        candidate_shapelets,
                        max_shapelets_per_class,
                        i,
                    )

                if self.remove_self_similar:
                    for i, heap in enumerate(shapelets):
                        to_keep = (RandomShapeletTransform.
                                   _remove_self_similar_shapelets(heap))
                        shapelets[i] = [
                            n for (n, b) in zip(heap, to_keep) if b
                        ]

                n_shapelets_extracted += self._batch_size
                fit_time = time.time() - start_time
        else:
            while n_shapelets_extracted < self._n_shapelet_samples:
                n_shapelets_to_extract = (
                    self._batch_size if n_shapelets_extracted +
                    self._batch_size <= self._n_shapelet_samples else
                    self._n_shapelet_samples - n_shapelets_extracted)

                candidate_shapelets = Parallel(n_jobs=self._n_jobs)(
                    delayed(self._extract_random_shapelet)(
                        X,
                        y,
                        n_shapelets_extracted + i,
                        shapelets,
                        max_shapelets_per_class,
                    ) for i in range(n_shapelets_to_extract))

                for i, heap in enumerate(shapelets):
                    RandomShapeletTransform._merge_shapelets(
                        heap,
                        candidate_shapelets,
                        max_shapelets_per_class,
                        i,
                    )

                if self.remove_self_similar:
                    for i, heap in enumerate(shapelets):
                        to_keep = (RandomShapeletTransform.
                                   _remove_self_similar_shapelets(heap))
                        shapelets[i] = [
                            n for (n, b) in zip(heap, to_keep) if b
                        ]

                n_shapelets_extracted += n_shapelets_to_extract

        self.shapelets = [(
            s[0],
            s[1],
            s[2],
            s[3],
            s[4],
            self.classes_[s[5]],
            z_normalise_series(X[s[4], s[3], s[2]:s[2] + s[1]]),
        ) for class_shapelets in shapelets for s in class_shapelets
                          if s[0] > 0]
        self.shapelets.sort(reverse=True,
                            key=lambda s: (s[0], s[1], s[2], s[3], s[4]))

        to_keep = RandomShapeletTransform._remove_identical_shapelets(
            self.shapelets)
        self.shapelets = [n for (n, b) in zip(self.shapelets, to_keep) if b]

        self._sorted_indicies = []
        for s in self.shapelets:
            sabs = np.abs(s[6])
            self._sorted_indicies.append(
                sorted(range(s[1]), reverse=True, key=lambda i: sabs[i]))
        return self
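
The default shapelet budget is ten shapelet candidates kept per training instance, capped at 1000, and that budget is split evenly between the class-wise heaps. For example:

n_instances, n_classes = 60, 3
max_shapelets = min(10 * n_instances, 1000)           # 600
max_shapelets_per_class = max_shapelets / n_classes   # 200.0 kept in each class heap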
Example no. 17
    def _fit(self, X, y):
        """Fit HIVE-COTE 2.0 to training data.

        Parameters
        ----------
        X : 3D np.array of shape = [n_instances, n_dimensions, series_length]
            The training data.
        y : array-like, shape = [n_instances]
            The class labels.

        Returns
        -------
        self :
            Reference to self.

        Notes
        -----
        Changes state by creating a fitted model that updates attributes
        ending in "_" and sets is_fitted flag to True.
        """
        self.n_jobs_ = check_n_jobs(self.n_jobs)

        self.n_classes_ = np.unique(y).shape[0]
        self.classes_ = class_distribution(np.asarray(y).reshape(-1, 1))[0][0]

        # Default values from HC2 paper
        if self.stc_params is None:
            self._stc_params = {"transform_limit_in_minutes": 120}
        if self.drcif_params is None:
            self._drcif_params = {"n_estimators": 500}
        if self.arsenal_params is None:
            self._arsenal_params = {}
        if self.tde_params is None:
            self._tde_params = {}

        # If we are contracting, split the contract time between the components
        if self.time_limit_in_minutes > 0:
            # 1/6 of the contract per component, leaving 1/3 for train estimates
            ct = self.time_limit_in_minutes / 6
            self._stc_params["time_limit_in_minutes"] = ct
            self._drcif_params["time_limit_in_minutes"] = ct
            self._arsenal_params["time_limit_in_minutes"] = ct
            self._tde_params["time_limit_in_minutes"] = ct

        # Build STC
        self._stc = ShapeletTransformClassifier(
            **self._stc_params,
            save_transformed_data=True,
            random_state=self.random_state,
            n_jobs=self.n_jobs_,
        )
        self._stc.fit(X, y)

        if self.verbose > 0:
            print("STC ", datetime.now().strftime("%H:%M:%S %d/%m/%Y"))  # noqa

        # Find STC weight using train set estimate
        train_probs = self._stc._get_train_probs(X, y)
        train_preds = self._stc.classes_[np.argmax(train_probs, axis=1)]
        self.stc_weight_ = accuracy_score(y, train_preds) ** 4

        if self.verbose > 0:
            print(  # noqa
                "STC train estimate ",
                datetime.now().strftime("%H:%M:%S %d/%m/%Y"),
            )
            print("STC weight = " + str(self.stc_weight_))  # noqa

        # Build DrCIF
        self._drcif = DrCIF(
            **self._drcif_params,
            save_transformed_data=True,
            random_state=self.random_state,
            n_jobs=self.n_jobs_,
        )
        self._drcif.fit(X, y)

        if self.verbose > 0:
            print("DrCIF ", datetime.now().strftime("%H:%M:%S %d/%m/%Y"))  # noqa

        # Find DrCIF weight using train set estimate
        train_probs = self._drcif._get_train_probs(X, y)
        train_preds = self._drcif.classes_[np.argmax(train_probs, axis=1)]
        self.drcif_weight_ = accuracy_score(y, train_preds) ** 4

        if self.verbose > 0:
            print(  # noqa
                "DrCIF train estimate ",
                datetime.now().strftime("%H:%M:%S %d/%m/%Y"),
            )
            print("DrCIF weight = " + str(self.drcif_weight_))  # noqa

        # Build Arsenal
        self._arsenal = Arsenal(
            **self._arsenal_params,
            save_transformed_data=True,
            random_state=self.random_state,
            n_jobs=self.n_jobs_,
        )
        self._arsenal.fit(X, y)

        if self.verbose > 0:
            print("Arsenal ", datetime.now().strftime("%H:%M:%S %d/%m/%Y"))  # noqa

        # Find Arsenal weight using train set estimate
        train_probs = self._arsenal._get_train_probs(X, y)
        train_preds = self._arsenal.classes_[np.argmax(train_probs, axis=1)]
        self.arsenal_weight_ = accuracy_score(y, train_preds) ** 4

        if self.verbose > 0:
            print(  # noqa
                "Arsenal train estimate ",
                datetime.now().strftime("%H:%M:%S %d/%m/%Y"),
            )
            print("Arsenal weight = " + str(self.arsenal_weight_))  # noqa

        # Build TDE
        self._tde = TemporalDictionaryEnsemble(
            **self._tde_params,
            save_train_predictions=True,
            random_state=self.random_state,
            n_jobs=self.n_jobs_,
        )
        self._tde.fit(X, y)

        if self.verbose > 0:
            print("TDE ", datetime.now().strftime("%H:%M:%S %d/%m/%Y"))  # noqa

        # Find TDE weight using train set estimate
        train_probs = self._tde._get_train_probs(X, y, train_estimate_method="oob")
        train_preds = self._tde.classes_[np.argmax(train_probs, axis=1)]
        self.tde_weight_ = accuracy_score(y, train_preds) ** 4

        if self.verbose > 0:
            print(  # noqa
                "TDE train estimate ",
                datetime.now().strftime("%H:%M:%S %d/%m/%Y"),
            )
            print("TDE weight = " + str(self.tde_weight_))  # noqa