def test_check_X_enforce_min_columns():
    X, y = make_classification_problem(n_columns=2)
    msg = r"columns"
    with pytest.raises(ValueError, match=msg):
        check_X(X, enforce_min_columns=3)

    with pytest.raises(ValueError, match=msg):
        check_X_y(X, y, enforce_min_columns=3)
def test_check_X_enforce_univariate():
    X, y = make_classification_problem(n_columns=2)
    msg = r"univariate"
    with pytest.raises(ValueError, match=msg):
        check_X(X, enforce_univariate=True)

    with pytest.raises(ValueError, match=msg):
        check_X_y(X, y, enforce_univariate=True)
def test_check_enforce_min_instances():
    X, y = make_classification_problem(n_instances=3)
    msg = r"instance"
    with pytest.raises(ValueError, match=msg):
        check_X(X, enforce_min_instances=4)

    with pytest.raises(ValueError, match=msg):
        check_X_y(X, y, enforce_min_instances=4)

    with pytest.raises(ValueError, match=msg):
        check_y(y, enforce_min_instances=4)
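For contrast with the failure cases above, a passing call might look like the
following minimal sketch. The import paths are an assumption (these utilities
have moved between sktime versions), but the keyword arguments are the ones
exercised by the tests:

from sktime.utils._testing.panel import make_classification_problem  # path assumed
from sktime.utils.validation.panel import check_X, check_X_y, check_y  # path assumed

X, y = make_classification_problem(n_instances=5, n_columns=3)

# none of the constraints are violated, so no ValueError is raised and the
# validated containers are returned
X_checked = check_X(X, enforce_min_columns=3)
X_checked, y_checked = check_X_y(X, y, enforce_min_instances=4)
y_checked = check_y(y, enforce_min_instances=4)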
Example no. 4
    def _set_oob_score(self, X, y):
        """Compute out-of-bag score"""
        check_X_y(X, y)
        check_X(X, enforce_univariate=True)

        n_classes_ = self.n_classes_
        n_samples = y.shape[0]

        oob_decision_function = []
        oob_score = 0.0
        predictions = [
            np.zeros((n_samples, n_classes_[k]))
            for k in range(self.n_outputs_)
        ]

        n_samples_bootstrap = _get_n_samples_bootstrap(n_samples,
                                                       self.max_samples)

        for estimator in self.estimators_:
            final_estimator = estimator.steps[-1][1]
            unsampled_indices = _generate_unsampled_indices(
                final_estimator.random_state, n_samples, n_samples_bootstrap)
            p_estimator = estimator.predict_proba(X.iloc[unsampled_indices, :])

            if self.n_outputs_ == 1:
                p_estimator = [p_estimator]

            for k in range(self.n_outputs_):
                predictions[k][unsampled_indices, :] += p_estimator[k]

        for k in range(self.n_outputs_):
            if (predictions[k].sum(axis=1) == 0).any():
                warn("Some inputs do not have OOB scores. "
                     "This probably means too few trees were used "
                     "to compute any reliable oob estimates.")

            decision = (predictions[k] /
                        predictions[k].sum(axis=1)[:, np.newaxis])
            oob_decision_function.append(decision)
            oob_score += np.mean(y[:, k] == np.argmax(predictions[k], axis=1),
                                 axis=0)

        if self.n_outputs_ == 1:
            self.oob_decision_function_ = oob_decision_function[0]
        else:
            self.oob_decision_function_ = oob_decision_function

        self.oob_score_ = oob_score / self.n_outputs_
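The out-of-bag bookkeeping above leans on two private scikit-learn helpers.
A minimal sketch of what they return, assuming they keep the signatures used
here (they are private API and may change between scikit-learn versions):

import numpy as np
from sklearn.ensemble._forest import (  # private API, signatures assumed
    _generate_unsampled_indices,
    _get_n_samples_bootstrap,
)

n_samples = 10
# with max_samples=None the bootstrap sample is as large as the training set
n_samples_bootstrap = _get_n_samples_bootstrap(n_samples, max_samples=None)

# indices NOT drawn in the bootstrap sample seeded by random_state=42;
# on average roughly a third of the instances are out-of-bag
oob_indices = _generate_unsampled_indices(42, n_samples, n_samples_bootstrap)
print(sorted(oob_indices))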
Example no. 5
    def fit(self, X, y):
        """Perform a shapelet transform then builds a random forest.
        Contract default for ST is 5 hours
        ----------
        X : array-like or sparse matrix of shape = [n_instances,
        series_length] or shape = [n_instances,n_columns]
            The training input samples.  If a Pandas data frame is passed it
            must have a single column (i.e. univariate
            classification. RISE has no bespoke method for multivariate
            classification as yet.
        y : array-like, shape =  [n_instances]    The class labels.

        Returns
        -------
        self : object
         """
        X, y = check_X_y(X, y, enforce_univariate=True)
        self.n_classes = np.unique(y).shape[0]
        self.classes_ = class_distribution(np.asarray(y).reshape(-1, 1))[0][0]

        self.classifier.fit(X, y)

        #        self.shapelet_transform.fit(X,y)
        #        print("Shapelet Search complete")
        #        self.st_X =self.shapelet_transform.transform(X)
        #        print("Transform complete")
        #        X = np.asarray([a.values for a in X.iloc[:, 0]])
        #        self.classifier.fit(X,y)
        #       print("Build classifier complete")
        self._is_fitted = True
        return self
    def fit(self, X, y):
        """
        Build the classifier on the training set (X, y).

        Parameters
        ----------
        X : array-like or sparse matrix of shape = [n_instances, n_columns]
            The training input samples.  If a Pandas data frame is passed,
            column 0 is extracted.
        y : array-like, shape = [n_instances]
            The class labels.
        Returns
        -------
        self : object
        """
        X, y = check_X_y(X, y, enforce_univariate=True)

        self.X = dataset_properties.positive_dataframe_indices(X)
        self.random_state = check_random_state(self.random_state)
        # setup label encoding
        if self.label_encoder is None:
            self.label_encoder = LabelEncoder()
            y = self.label_encoder.fit_transform(y)
        self.y = y
        self.classes_ = self.label_encoder.classes_
        if self.distance_measure is None:
            if self.get_distance_measure is None:
                self.get_distance_measure = self.setup_distance_measure(self)
            self.distance_measure = self.get_distance_measure(self)
        self.X_exemplar, self.y_exemplar = self.pick_exemplars(self)
        self._is_fitted = True
        return self
Example no. 7
 def fit(self, X, y):
     """
     Build the classifier on the training set (X, y).

     Parameters
     ----------
     X : array-like or sparse matrix of shape = [n_instances, n_columns]
         The training input samples.  If a Pandas data frame is passed,
         column 0 is extracted.
     y : array-like, shape = [n_instances]
         The class labels.
     Returns
     -------
     self : object
     """
     X, y = check_X_y(X, y, enforce_univariate=True)
     self.X = dataset_properties.positive_dataframe_indices(X)
     self.random_state = check_random_state(self.random_state)
     if self.find_stump is None:
         self.find_stump = best_of_n_stumps(self.n_stump_evaluations)
     # setup label encoding
     if self.label_encoder is None:
         self.label_encoder = LabelEncoder()
         y = self.label_encoder.fit_transform(y)
     self.y = y
     self.classes_ = self.label_encoder.classes_
     if self.distance_measure is None:
         if self.get_distance_measure is None:
             self.get_distance_measure = self.setup_distance_measure(self)
         self.distance_measure = self.get_distance_measure(self)
     self.stump = self.find_stump(self)
     n_branches = len(self.stump.y_exemplar)
     self.branches = [None] * n_branches
     if self.depth < self.max_depth:
         for index in range(n_branches):
             sub_y = self.stump.y_branches[index]
             if not self.is_leaf(sub_y):
                 sub_tree = ProximityTree(
                     random_state=self.random_state,
                     get_exemplars=self.get_exemplars,
                     distance_measure=self.distance_measure,
                     setup_distance_measure=self.setup_distance_measure,
                     get_distance_measure=self.get_distance_measure,
                     get_gain=self.get_gain,
                     is_leaf=self.is_leaf,
                     verbosity=self.verbosity,
                     max_depth=self.max_depth,
                     n_jobs=self.n_jobs,
                 )
                 sub_tree.label_encoder = self.label_encoder
                 sub_tree.depth = self.depth + 1
                 self.branches[index] = sub_tree
                 sub_X = self.stump.X_branches[index]
                 sub_tree.fit(sub_X, sub_y)
     self._is_fitted = True
     return self
Example no. 8
def check_and_clean_data(X, y=None, input_checks=True):
    '''
    Performs basic sktime data checks and prepares the train data for input to
    Keras models.

    :param X: the train data
    :param y: the train labels
    :param input_checks: whether to perform the basic sktime checks
    :return: X
    '''
    if input_checks:
        if y is None:
            check_X(X)
        else:
            check_X_y(X, y)

    # want data in form: [instances = n][timepoints = m][dimensions = d]
    if isinstance(X, pd.DataFrame):
        if _is_nested_dataframe(X):
            if X.shape[1] > 1:
                # we have multiple columns, AND each cell contains a series,
                # so this is a multidimensional problem
                X = _multivariate_nested_df_to_array(X)
            else:
                # we have a single column containing a series, treat this as
                # a univariate problem
                X = _univariate_nested_df_to_array(X)
        else:
            # we have multiple columns each containing a primitive, treat as
            # univariate series
            X = _univariate_df_to_array(X)

    if len(X.shape) == 2:
        # add a dimension to make it multivariate with one dimension
        X = X.reshape(
            X.shape[0], X.shape[1], 1
        )  # go from [n][m] to [n][m][d=1]

    return X
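A minimal usage sketch of the reshaping path for a plain 2-D array (input
checks are skipped here so the snippet only depends on the function defined
above):

import numpy as np

X = np.random.rand(8, 50)  # 8 instances, 50 timepoints
X3d = check_and_clean_data(X, input_checks=False)
print(X3d.shape)  # (8, 50, 1), i.e. [n][m][d=1]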
Example no. 9
    def fit(self, X, y):
        X, y = check_X_y(X, y, enforce_univariate=True)

        sfa = self.transformer.fit_transform(X, y)
        self.transformed_data = [series.to_dict() for series in sfa.iloc[:, 0]]

        self.class_vals = y
        self.num_classes = np.unique(y).shape[0]
        self.classes_ = class_distribution(np.asarray(y).reshape(-1, 1))[0][0]
        for index, classVal in enumerate(self.classes_):
            self.class_dictionary[classVal] = index

        self._is_fitted = True
        return self
Example no. 10
    def fit(self, X, y):
        """Build a forest of trees from the training set (X, y) using random
        intervals and summary features
        Parameters
        ----------
        X : array-like or sparse matrix of shape = [n_instances,
        series_length] or shape = [n_instances,n_columns]
            The training input samples.  If a Pandas data frame is passed it
            must have a single column (i.e. univariate
            classification. RISE has no bespoke method for multivariate
            classification as yet.
        y : array-like, shape =  [n_instances]    The class labels.

        Returns
        -------
        self : object
        """
        X, y = check_X_y(X, y, enforce_univariate=True, coerce_to_numpy=True)
        X = X.squeeze(1)
        n_instances, self.series_length = X.shape

        rng = check_random_state(self.random_state)

        self.n_classes = np.unique(y).shape[0]

        self.classes_ = class_distribution(np.asarray(y).reshape(-1, 1))[0][0]
        self.n_intervals = int(math.sqrt(self.series_length))
        if self.n_intervals == 0:
            self.n_intervals = 1
        if self.series_length < self.min_interval:
            self.min_interval = self.series_length

        self.intervals_ = [
            _get_intervals(self.n_intervals, self.min_interval,
                           self.series_length, rng)
            for _ in range(self.n_estimators)
        ]

        self.estimators_ = Parallel(n_jobs=self.n_jobs)(
            delayed(_fit_estimator)(
                X,
                y,
                self.base_estimator,
                self.intervals_[i],
                self.random_state,
            ) for i in range(self.n_estimators))

        self._is_fitted = True
        return self
Example no. 11
    def fit(self, X, y=None):
        """Fit.

        Parameters
        ----------
        X : pd.DataFrame
            nested pandas DataFrame of shape [n_samples, n_columns]
        y : pd.Series or np.array
            Target variable

        Returns
        -------
        self : an instance of self
        """
        # lazy imports to avoid hard dependency
        from tsfresh.transformers.feature_selector import FeatureSelector

        # input checks
        if y is None:
            raise ValueError(
                f"{self.__class__.__name__} requires `y` in `fit`.")
        X, y = check_X_y(X, y, coerce_to_pandas=True)

        self.extractor_ = TSFreshFeatureExtractor(
            default_fc_parameters=self.default_fc_parameters,
            kind_to_fc_parameters=self.kind_to_fc_parameters,
            chunksize=self.chunksize,
            n_jobs=self.n_jobs,
            show_warnings=self.show_warnings,
            disable_progressbar=self.disable_progressbar,
            profiling=self.profiling,
            profiling_filename=self.profiling_filename,
            profiling_sorting=self.profiling_sorting,
        )

        selection_params = self._get_selection_params()
        extraction_param = self._get_extraction_params()
        self.selector_ = FeatureSelector(
            n_jobs=extraction_param["n_jobs"],
            chunksize=extraction_param["chunksize"],
            ml_task=self.ml_task,
            **selection_params,
        )

        Xt = self.extractor_.fit_transform(X)
        self.selector_.fit(Xt, y)
        self._is_fitted = True
        return self
Example no. 12
    def _set_oob_score(self, X, y):
        """
        Compute out-of-bag scores."""
        X, y = check_X_y(X, y, enforce_univariate=True)

        n_samples = y.shape[0]

        predictions = np.zeros((n_samples, self.n_outputs_))
        n_predictions = np.zeros((n_samples, self.n_outputs_))

        n_samples_bootstrap = _get_n_samples_bootstrap(
            n_samples, self.max_samples
        )

        for estimator in self.estimators_:
            final_estimator = estimator.steps[-1][1]
            unsampled_indices = _generate_unsampled_indices(
                final_estimator.random_state, n_samples, n_samples_bootstrap)
            p_estimator = estimator.predict(
                X[unsampled_indices, :], check_input=False)

            if self.n_outputs_ == 1:
                p_estimator = p_estimator[:, np.newaxis]

            predictions[unsampled_indices, :] += p_estimator
            n_predictions[unsampled_indices, :] += 1

        if (n_predictions == 0).any():
            warn("Some inputs do not have OOB scores. "
                 "This probably means too few trees were used "
                 "to compute any reliable oob estimates.")
            n_predictions[n_predictions == 0] = 1

        predictions /= n_predictions
        self.oob_prediction_ = predictions

        if self.n_outputs_ == 1:
            self.oob_prediction_ = \
                self.oob_prediction_.reshape((n_samples, ))

        self.oob_score_ = 0.0

        for k in range(self.n_outputs_):
            self.oob_score_ += r2_score(y[:, k],
                                        predictions[:, k])

        self.oob_score_ /= self.n_outputs_
Example no. 13
    def fit(self, X, y):

        if isinstance(X, pd.Series) or isinstance(X, pd.DataFrame):
            X, y = check_X_y(X, y, enforce_univariate=True)
            X = tabularize(X, return_array=True)

        sfa = self.transformer.fit_transform(X, y)
        self.transformed_data = sfa[0]  # .iloc[:, 0]

        self.class_vals = y
        self.num_classes = np.unique(y).shape[0]
        self.classes_ = class_distribution(np.asarray(y).reshape(-1, 1))[0][0]
        for index, classVal in enumerate(self.classes_):
            self.class_dictionary[classVal] = index

        self._is_fitted = True
        return self
Example no. 14
    def fit(self, X, y):
        """
        Method to perform training on the classifier.

        Parameters
        ----------
        X : pandas DataFrame of training data of shape [n_instances, 1].
        y : list of class labels of shape [n_instances].

        Returns
        -------
        self : the shapeDTW object
        """
        # Perform preprocessing on params.
        if not isinstance(self.shape_descriptor_function, str):
            raise TypeError(
                "shape_descriptor_function must be a 'str'. Found '" +
                type(self.shape_descriptor_function).__name__ +
                "' instead.")

        X, y = check_X_y(X, y, enforce_univariate=False)

        if self.metric_params is None:
            self.metric_params = {}

        # If the shape descriptor is 'compound',
        # calculate the appropriate weighting_factor
        if self.shape_descriptor_function == "compound":
            self._calculate_weighting_factor_value(X, y)

        # Fit the SlidingWindowSegmenter
        sw = SlidingWindowSegmenter(self.subsequence_length)
        sw.fit(X)
        self.sw = sw

        # Transform the training data.
        X = self._preprocess(X)

        # Fit the kNN classifier
        self.knn = KNeighborsTimeSeriesClassifier(n_neighbors=self.n_neighbors)
        self.knn.fit(X, y)
        self.classes_ = self.knn.classes_

        return self
Example no. 15
    def fit(self, X, y):
        """
        Build the classifier on the training set (X, y).

        Parameters
        ----------
        X : array-like or sparse matrix of shape = [n_instances, n_columns]
            The training input samples.  If a Pandas data frame is passed,
            column 0 is extracted.
        y : array-like, shape = [n_instances]
            The class labels.

        Returns
        -------
        self : object
        """
        X, y = check_X_y(X, y, enforce_univariate=True)
        self.X = dataset_properties.positive_dataframe_indices(X)
        self.random_state = check_random_state(self.random_state)
        # setup label encoding
        if self.label_encoder is None:
            self.label_encoder = LabelEncoder()
            y = self.label_encoder.fit_transform(y)
        self.y = y
        self.classes_ = self.label_encoder.classes_
        if self.distance_measure is None:
            if self.get_distance_measure is None:
                self.get_distance_measure = self.setup_distance_measure_getter(
                    self)
            self.distance_measure = self.get_distance_measure(self)
        if self.n_jobs > 1 or self.n_jobs < 0:
            parallel = Parallel(self.n_jobs)
            self.trees = parallel(
                delayed(self._fit_tree)
                (X, y, index, self.random_state.randint(0, self.n_estimators))
                for index in range(self.n_estimators))
        else:
            self.trees = [
                self._fit_tree(X, y, index,
                               self.random_state.randint(0, self.n_estimators))
                for index in range(self.n_estimators)
            ]
        self._is_fitted = True
        return self
Example no. 16
    def fit(self, X, y):
        """Perform a shapelet transform then builds a random forest.
        Contract default for ST is 5 hours
        ----------
        X : array-like or sparse matrix of shape = [n_instances,
        series_length] or shape = [n_instances,n_columns]
            The training input samples.  If a Pandas data frame is passed it
            must have a single column (i.e. univariate
            classification. RISE has no bespoke method for multivariate
            classification as yet.
        y : array-like, shape =  [n_instances]    The class labels.

        Returns
        -------
        self : object
         """
        X, y = check_X_y(X, y, enforce_univariate=True)

        # if y is a pd.series then convert to array.
        if isinstance(y, pd.Series):
            y = y.to_numpy()

        # generate pipeline in fit so that the random state can be
        # propagated properly.
        self.classifier_ = Pipeline([
            ('st',
             ContractedShapeletTransform(
                 time_contract_in_mins=self.time_contract_in_mins,
                 verbose=False,
                 random_state=self.random_state)),
            ('rf',
             RandomForestClassifier(n_estimators=self.n_estimators,
                                    random_state=self.random_state))
        ])

        self.n_classes_ = np.unique(y).shape[0]
        self.classes_ = class_distribution(np.asarray(y).reshape(-1, 1))[0][0]

        self.classifier_.fit(X, y)

        self._is_fitted = True
        return self
Example no. 17
    def fit(self, X, y):
        """Build an ensemble of 1-NN classifiers from th training set (X, y),
        Parameters
        ----------
        X : array-like or sparse matrix of shape = [n_instances, n_columns]
            The training input samples.  If a Pandas data frame is passed,
            it must have a single column. BOSS not configured
            to handle multivariate
        y : array-like, shape = [n_instances] The class labels.

        Returns
        -------
        self : object
        """

        X, y = check_X_y(X, y, enforce_univariate=True)

        # Derivative DTW (DDTW) uses the regular DTW algorithm on data that
        # are transformed into derivatives.
        # To increase the efficiency of DDTW we can pre-transform the data
        # into derivatives, and then call the
        # standard DTW algorithm on it, rather than transforming each series
        # every time a distance calculation
        # is made. Please note that using DDTW elsewhere will not benefit
        # from this speed enhancement
        if ddtw_c in self.distance_measures or \
                wddtw_c in self.distance_measures:
            der_X = DerivativeSlopeTransformer().fit_transform(X)
            # reshape X for use with the efficient cython distance measures
            der_X = np.array(
                [np.asarray([x]).reshape(len(x), 1) for x in der_X.iloc[:, 0]])
        else:
            der_X = None

        # reshape X for use with the efficient cython distance measures
        X = np.array(
            [np.asarray([x]).reshape(len(x), 1) for x in X.iloc[:, 0]])

        self.train_accs_by_classifier = np.zeros(len(self.distance_measures))
        self.train_preds_by_classifier = [None] * len(self.distance_measures)
        self.estimators_ = [None] * len(self.distance_measures)
        self.classes_ = class_distribution(np.asarray(y).reshape(-1, 1))[0][0]
        rand = np.random.RandomState(self.random_state)

        # The default EE uses all training instances for setting parameters,
        # and 100 parameter options per elastic measure. The
        # prop_train_in_param_finding and prop_of_param_options attributes of
        # this class can be used to control this, however, by using fewer
        # cases to optimise parameters on the training data and/or fewer
        # parameter options.
        #
        # For using fewer training instances the appropriate number of cases
        # must be sampled from the data. This is achieved through the use of
        # a deterministic StratifiedShuffleSplit.
        #
        # For using fewer parameter options a RandomizedSearchCV is used in
        # place of a GridSearchCV.

        param_train_x = None
        der_param_train_x = None
        param_train_y = None

        # If using fewer cases for parameter optimisation, use the
        # StratifiedShuffleSplit:
        if self.proportion_train_in_param_finding < 1:
            if self.verbose > 0:
                print(
                    "Restricting training cases for parameter optimisation: ",
                    end="")
            sss = StratifiedShuffleSplit(
                n_splits=1,
                test_size=1 - self.proportion_train_in_param_finding,
                random_state=rand)
            for train_index, test_index in sss.split(X, y):
                param_train_x = X[train_index, :]
                param_train_y = y[train_index]
                if der_X is not None:
                    der_param_train_x = der_X[train_index, :]
                if self.verbose > 0:
                    print("using " + str(len(param_train_x)) +
                          " training cases instead of " + str(len(X)) +
                          " for parameter optimisation")
        # else, use the full training data for optimising parameters
        else:
            if self.verbose > 0:
                print("Using all training cases for parameter optimisation")
            param_train_x = X
            param_train_y = y
            if der_X is not None:
                der_param_train_x = der_X

        self.constituent_build_times = []

        if self.verbose > 0:
            print("Using " + str(100 * self.proportion_of_param_options) +
                  " parameter "
                  "options per "
                  "measure")
        for dm in range(0, len(self.distance_measures)):
            this_measure = self.distance_measures[dm]

            # uses the appropriate training data as required (either full or
            # smaller sample as per the StratifiedShuffleSplit)
            param_train_to_use = param_train_x
            full_train_to_use = X
            if this_measure is ddtw_c or this_measure is wddtw_c:
                param_train_to_use = der_param_train_x
                full_train_to_use = der_X
                if this_measure is ddtw_c:
                    this_measure = dtw_c
                elif this_measure is wddtw_c:
                    this_measure = wdtw_c

            start_build_time = time.time()
            if self.verbose > 0:
                if self.distance_measures[dm] is ddtw_c or \
                        self.distance_measures[dm] is wddtw_c:
                    print("Currently evaluating " +
                          str(self.distance_measures[dm].__name__) +
                          " (implemented as " + str(this_measure.__name__) +
                          " with pre-transformed derivative data)")
                else:
                    print("Currently evaluating " +
                          str(self.distance_measures[dm].__name__))

            # If 100 parameter options are being considered per measure,
            # use a GridSearchCV
            if self.proportion_of_param_options == 1:

                grid = GridSearchCV(
                    estimator=KNeighborsTimeSeriesClassifier(
                        metric=this_measure, n_neighbors=1, algorithm="brute"),
                    param_grid=ElasticEnsemble._get_100_param_options(
                        self.distance_measures[dm], X),
                    cv=LeaveOneOut(),
                    scoring='accuracy',
                    verbose=self.verbose)
                grid.fit(param_train_to_use, param_train_y)

            # Else, use RandomizedSearchCV to randomly sample parameter
            # options for each measure
            else:
                grid = RandomizedSearchCV(
                    estimator=KNeighborsTimeSeriesClassifier(
                        metric=this_measure, n_neighbors=1, algorithm="brute"),
                    param_distributions=ElasticEnsemble._get_100_param_options(
                        self.distance_measures[dm], X),
                    cv=LeaveOneOut(),
                    scoring='accuracy',
                    n_iter=100 * self.proportion_of_param_options,
                    random_state=rand,
                    verbose=self.verbose)
                grid.fit(param_train_to_use, param_train_y)

            # once the best parameter option has been estimated on the
            # training data, perform a final pass with this parameter option
            # to get the individual predictions with cross_val_predict (
            # Note: optimisation potentially possible here if a GridSearchCV
            # was used previously. TO-DO: determine how to extract
            # predictions for the best param option from GridSearchCV)
            best_model = KNeighborsTimeSeriesClassifier(
                algorithm="brute",
                n_neighbors=1,
                metric=this_measure,
                metric_params=grid.best_params_['metric_params'])
            preds = cross_val_predict(best_model,
                                      full_train_to_use,
                                      y,
                                      cv=LeaveOneOut())
            acc = accuracy_score(y, preds)

            if self.verbose > 0:
                print("Training accuracy for " +
                      str(self.distance_measures[dm].__name__) + ": " +
                      str(acc) + " (with parameter setting: " +
                      str(grid.best_params_['metric_params']) + ")")

            # Finally, reset the classifier for this measure and parameter
            # option, ready to be called for test classification
            best_model = KNeighborsTimeSeriesClassifier(
                algorithm="brute",
                n_neighbors=1,
                metric=this_measure,
                metric_params=grid.best_params_['metric_params'])
            best_model.fit(full_train_to_use, y)
            end_build_time = time.time()

            self.constituent_build_times.append(
                str(end_build_time - start_build_time))
            self.estimators_[dm] = best_model
            self.train_accs_by_classifier[dm] = acc
            self.train_preds_by_classifier[dm] = preds

        self._is_fitted = True
        return self
Example no. 18
    def fit(self, X, y):
        """Fit the model using X as training data and y as target values

        Parameters
        ----------
        X : sktime-format pandas dataframe with shape([n_cases,n_dimensions]),
        or numpy ndarray with shape([n_cases,n_readings,n_dimensions])

        y : {array-like, sparse matrix}
            Target values of shape = [n_samples]

        """
        X, y = check_X_y(X, y, enforce_univariate=False)
        y = np.asarray(y)
        X = nested_to_3d_numpy(X)
        check_classification_targets(y)

        # print(X)
        # if internal cv is desired, the relevant flag forces a grid search
        # to evaluate the possible values,
        # find the best, and then set this classifier's params to match
        if self._cv_for_params:
            grid = GridSearchCV(
                estimator=KNeighborsTimeSeriesClassifier(metric=self.metric,
                                                         n_neighbors=1,
                                                         algorithm="brute"),
                param_grid=self._param_matrix,
                cv=LeaveOneOut(),
                scoring='accuracy'
            )
            grid.fit(X, y)
            self.metric_params = grid.best_params_['metric_params']

        if y.ndim == 1 or y.ndim == 2 and y.shape[1] == 1:
            if y.ndim != 1:
                warnings.warn("A column-vector y was passed when a 1d array "
                              "was expected. Please change the shape of y to "
                              "(n_samples, ), for example using ravel().",
                              DataConversionWarning, stacklevel=2)

            self.outputs_2d_ = False
            y = y.reshape((-1, 1))
        else:
            self.outputs_2d_ = True

        self.classes_ = []
        self._y = np.empty(y.shape, dtype=int)
        for k in range(self._y.shape[1]):
            classes, self._y[:, k] = np.unique(y[:, k], return_inverse=True)
            self.classes_.append(classes)

        if not self.outputs_2d_:
            self.classes_ = self.classes_[0]
            self._y = self._y.ravel()

        if hasattr(check_array, '__wrapped__'):
            temp = check_array.__wrapped__.__code__
            check_array.__wrapped__.__code__ = _check_array_ts.__code__
        else:
            temp = check_array.__code__
            check_array.__code__ = _check_array_ts.__code__

        fx = self._fit(X)

        if hasattr(check_array, '__wrapped__'):
            check_array.__wrapped__.__code__ = temp
        else:
            check_array.__code__ = temp

        self._is_fitted = True
        return fx
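The "sktime-format" nested DataFrame mentioned in the docstring stores one
pd.Series per cell. The conversion below is a hand-rolled equivalent of the
nested_to_3d_numpy call used above, under the assumption that all series have
equal length; it is only meant to illustrate the data layout:

import numpy as np
import pandas as pd

# two cases, two dimensions, three readings each (each cell holds a pd.Series)
nested = pd.DataFrame({
    "dim_0": [pd.Series([1.0, 2.0, 3.0]), pd.Series([4.0, 5.0, 6.0])],
    "dim_1": [pd.Series([7.0, 8.0, 9.0]), pd.Series([0.0, 1.0, 2.0])],
})

X3d = np.stack([
    np.stack([nested.iloc[r, c].to_numpy() for c in range(nested.shape[1])])
    for r in range(nested.shape[0])
])
print(X3d.shape)  # (2, 2, 3): [n_cases, n_dimensions, n_readings]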
Example no. 19
def test_check_X_bad_input_args(X):
    with pytest.raises(ValueError):
        check_X(X)

    with pytest.raises(ValueError):
        check_X_y(X, y)
Example no. 20
    def fit(self, X, y):
        """Build a WEASEL+MUSE classifiers from the training set (X, y),

        Parameters
        ----------
        X : nested pandas DataFrame of shape [n_instances, 1]
            Nested dataframe with univariate time-series in cells.
        y : array-like, shape = [n_instances]
            The class labels.

        Returns
        -------
        self : object
        """

        X, y = check_X_y(X, y, coerce_to_pandas=True)
        y = np.asarray(y)

        # add first order differences in each dimension to TS
        if self.use_first_order_differences:
            X = self.add_first_order_differences(X)

        # Window length parameter space dependent on series length
        self.col_names = X.columns

        rng = check_random_state(self.random_state)

        self.n_dims = len(self.col_names)
        self.highest_dim_bit = (math.ceil(math.log2(self.n_dims))) + 1
        self.highest_bits = np.zeros(self.n_dims)

        self.SFA_transformers = [[] for _ in range(self.n_dims)]

        # the words of all dimensions and all time series
        all_words = [dict() for _ in range(X.shape[0])]

        # On each dimension, perform SFA
        for ind, column in enumerate(self.col_names):
            X_dim = X[[column]]
            X_dim = from_nested_to_3d_numpy(X_dim)
            series_length = X_dim.shape[
                -1]  # TODO compute minimum over all ts ?

            # increment window size in steps of 'win_inc'
            win_inc = self.compute_window_inc(series_length)

            self.max_window = int(min(series_length, self.max_window))
            self.window_sizes.append(
                list(range(self.min_window, self.max_window, win_inc)))

            self.highest_bits[ind] = math.ceil(math.log2(self.max_window)) + 1

            for window_size in self.window_sizes[ind]:

                transformer = SFA(
                    word_length=rng.choice(self.word_lengths),
                    alphabet_size=self.alphabet_size,
                    window_size=window_size,
                    norm=rng.choice(self.norm_options),
                    anova=self.anova,
                    binning_method=rng.choice(self.binning_strategies),
                    bigrams=self.bigrams,
                    remove_repeat_words=False,
                    lower_bounding=False,
                    save_words=False,
                )

                sfa_words = transformer.fit_transform(X_dim, y)

                self.SFA_transformers[ind].append(transformer)
                bag = sfa_words[0]  # .iloc[:, 0]

                # chi-squared test to keep only relevant features
                relevant_features = {}
                apply_chi_squared = self.chi2_threshold > 0
                if apply_chi_squared:
                    bag_vec = DictVectorizer(sparse=False).fit_transform(bag)
                    chi2_statistics, p = chi2(bag_vec, y)
                    relevant_features = np.where(
                        chi2_statistics >= self.chi2_threshold)[0]

                # merging bag-of-patterns of different window_sizes
                # to single bag-of-patterns with prefix indicating
                # the used window-length
                highest = np.int32(self.highest_bits[ind])
                for j in range(len(bag)):
                    for (key, value) in bag[j].items():
                        # chi-squared test
                        if (not apply_chi_squared) or (key
                                                       in relevant_features):
                            # append the prefices to the words to
                            # distinguish between window-sizes
                            word = MUSE.shift_left(key, highest, ind,
                                                   self.highest_dim_bit,
                                                   window_size)

                            all_words[j][word] = value

        self.clf = make_pipeline(
            DictVectorizer(sparse=False),
            StandardScaler(with_mean=True, copy=False),
            LogisticRegression(
                max_iter=5000,
                solver="liblinear",
                dual=True,
                # class_weight="balanced",
                penalty="l2",
                random_state=self.random_state,
            ),
        )

        self.clf.fit(all_words, y)
        self._is_fitted = True
        return self
Example no. 21
    def fit(self, X, y=None):
        """A method to fit the shapelet transform to a specified X and y

        Parameters
        ----------
        X: pandas DataFrame
            The training input samples.
        y: array-like or list
            The class values for X

        Returns
        -------
        self : FullShapeletTransform
            This estimator
        """
        X, y = check_X_y(X, y, enforce_univariate=True)

        # if y is a pd.series then convert to array.
        if isinstance(y, pd.Series):
            y = y.to_numpy()

        if type(self) is ContractedShapeletTransform and \
                self.time_contract_in_mins <= 0:
            raise ValueError(
                "Error: time limit cannot be equal to or less than 0")

        X_lens = np.array([
            len(X.iloc[r, 0]) for r in range(len(X))
        ])  # note, assumes all dimensions of a case are the same
        # length. A shapelet would not be well defined if indices do not match!
        X = np.array([[X.iloc[r, c].values for c in range(len(X.columns))]
                      for r in range(len(X))
                      ])  # may need to pad with nans here for uneq length,
        # look at later

        num_ins = len(y)
        distinct_class_vals = \
            class_distribution(np.asarray(y).reshape(-1, 1))[0][0]

        candidates_evaluated = 0

        if type(self) is _RandomEnumerationShapeletTransform:
            num_series_to_visit = min(self.num_cases_to_sample, len(y))
        else:
            num_series_to_visit = num_ins

        shapelet_heaps_by_class = {
            i: ShapeletPQ()
            for i in distinct_class_vals
        }

        self._random_state = check_random_state(self.random_state)

        # Here we establish the order of cases to sample. We need to sample
        # x cases and y shapelets from each (where x = num_cases_to_sample
        # and y = num_shapelets_to_sample_per_case). We could simply sample
        # x cases without replacement and y shapelets from each case, but
        # the idea is that if we are using a time contract we may extract
        # all y shapelets from each x candidate and still have time remaining.
        # Therefore, if we get a list of the indices of the series and
        # shuffle them appropriately, we can go through the list again and
        # extract
        # another y shapelets from each series (if we have time).

        # We also want to ensure that we visit all classes so we will visit
        # in round-robin order. Therefore, the code below extracts the indices
        # of all series by class, shuffles the indices for each class
        # independently, and then combines them in alternating order. This
        # results in
        # a shuffled list of indices that are in alternating class order (
        # e.g. 1,2,3,1,2,3,1,2,3,1...)

        def _round_robin(*iterables):
            sentinel = object()
            return (a for x in zip_longest(*iterables, fillvalue=sentinel)
                    for a in x if a != sentinel)

        case_ids_by_class = {
            i: np.where(y == i)[0]
            for i in distinct_class_vals
        }

        # if transform is random/contract then shuffle the data initially
        # when determining which cases to visit
        if type(self) is _RandomEnumerationShapeletTransform or type(
                self) is ContractedShapeletTransform:
            for i in range(len(distinct_class_vals)):
                self._random_state.shuffle(
                    case_ids_by_class[distinct_class_vals[i]])

        num_train_per_class = {
            i: len(case_ids_by_class[i])
            for i in case_ids_by_class
        }
        round_robin_case_order = _round_robin(
            *[list(v) for k, v in case_ids_by_class.items()])
        cases_to_visit = [(i, y[i]) for i in round_robin_case_order]
        # this dictionary will be used to store all possible starting
        # positions and shapelet lengths for a given series length. This
        # is because we enumerate all possible candidates and sample without
        # replacement when assessing a series. If we have two series
        # of the same length then they will obviously have the same valid
        # shapelet starting positions and lengths (especially in standard
        # datasets where all series are equal length) so it makes sense to
        # store the possible candidates and reuse, rather than
        # recalculating each time

        # Initially the dictionary will be empty, and each time a new series
        # length is seen the dict will be updated. Next time that length
        # is used the dict will have an entry so can simply reuse
        possible_candidates_per_series_length = {}

        # a flag to indicate if extraction should stop (contract has ended)
        time_finished = False

        # max time calculating a shapelet
        # for timing the extraction when contracting
        start_time = time.time()

        def time_taken():
            return time.time() - start_time

        max_time_calc_shapelet = -1
        time_last_shapelet = time_taken()

        # for every series
        case_idx = 0
        while case_idx < len(cases_to_visit):

            series_id = cases_to_visit[case_idx][0]
            this_class_val = cases_to_visit[case_idx][1]

            # minus 1 to remove this candidate from sums
            binary_ig_this_class_count = num_train_per_class[this_class_val] - 1
            binary_ig_other_class_count = (num_ins -
                                           binary_ig_this_class_count - 1)

            if self.verbose:
                if type(self) == _RandomEnumerationShapeletTransform:
                    print("visiting series: " + str(series_id) + " (#" +
                          str(case_idx + 1) + "/" + str(num_series_to_visit) +
                          ")")
                else:
                    print("visiting series: " + str(series_id) + " (#" +
                          str(case_idx + 1) + ")")

            this_series_len = len(X[series_id][0])

            # The bound on possible shapelet lengths will differ
            # series-to-series if using unequal length data.
            # However, shapelets cannot be longer than the series, so set to
            # the minimum of the series length
            # and max shapelet length (which is inf by default)
            if self.max_shapelet_length == -1:
                this_shapelet_length_upper_bound = this_series_len
            else:
                this_shapelet_length_upper_bound = min(
                    this_series_len, self.max_shapelet_length)

            # all possible start and lengths for shapelets within this
            # series (calculates if series length is new, a simple look-up
            # if not)
            # enumerate all possible candidate starting positions and lengths.

            # First, try to reuse if they have been calculated for a series
            # of the same length before.
            candidate_starts_and_lens = \
                possible_candidates_per_series_length.get(
                    this_series_len)
            # else calculate them for this series length and store for
            # possible use again
            if candidate_starts_and_lens is None:
                candidate_starts_and_lens = [
                    [start, length] for start in range(
                        0, this_series_len - self.min_shapelet_length + 1)
                    for length in range(self.min_shapelet_length,
                                        this_shapelet_length_upper_bound + 1)
                    if start + length <= this_series_len
                ]
                possible_candidates_per_series_length[
                    this_series_len] = candidate_starts_and_lens

            # default for full transform
            candidates_to_visit = candidate_starts_and_lens
            num_candidates_per_case = len(candidate_starts_and_lens)

            # limit search otherwise:
            if hasattr(self, "num_candidates_to_sample_per_case"):
                num_candidates_per_case = min(
                    self.num_candidates_to_sample_per_case,
                    num_candidates_per_case)
                cand_idx = list(
                    self._random_state.choice(list(
                        range(0, len(candidate_starts_and_lens))),
                                              num_candidates_per_case,
                                              replace=False))
                candidates_to_visit = [
                    candidate_starts_and_lens[x] for x in cand_idx
                ]

            for candidate_idx in range(num_candidates_per_case):

                # if shapelet heap for this class is not full yet, set entry
                # criteria to be the predetermined IG threshold
                ig_cutoff = self.predefined_ig_rejection_level
                # otherwise if we have max shapelets already, set the
                # threshold as the IG of the current 'worst' shapelet we have
                if shapelet_heaps_by_class[
                    this_class_val].get_size() >= \
                        self.max_shapelets_to_store_per_class:
                    ig_cutoff = max(
                        shapelet_heaps_by_class[this_class_val].peek()[0],
                        ig_cutoff)

                cand_start_pos = candidates_to_visit[candidate_idx][0]
                cand_len = candidates_to_visit[candidate_idx][1]

                candidate = ShapeletTransform.zscore(
                    X[series_id][:, cand_start_pos:cand_start_pos + cand_len])

                # now go through all other series and get a distance from
                # the candidate to each
                orderline = []

                # initialise here as copy, decrease the new val each time we
                # evaluate a comparison series
                num_visited_this_class = 0
                num_visited_other_class = 0

                candidate_rejected = False

                for comparison_series_idx in range(len(cases_to_visit)):
                    i = cases_to_visit[comparison_series_idx][0]

                    if y[i] != cases_to_visit[comparison_series_idx][1]:
                        raise ValueError("class match sanity test broken")

                    if i == series_id:
                        # don't evaluate candidate against own series
                        continue

                    if y[i] == this_class_val:
                        num_visited_this_class += 1
                        binary_class_identifier = 1  # positive for this class
                    else:
                        num_visited_other_class += 1
                        binary_class_identifier = -1  # negative for any
                        # other class

                    bsf_dist = np.inf

                    start_left = cand_start_pos
                    start_right = cand_start_pos + 1

                    if X_lens[i] == cand_len:
                        start_left = 0
                        start_right = 0

                    for num_cals in range(
                            max(1, int(np.ceil(
                                (X_lens[i] - cand_len) / 2)))):  # max
                        # used to force iteration where series len ==
                        # candidate len
                        if start_left < 0:
                            start_left = X_lens[i] - 1 - cand_len

                        comparison = ShapeletTransform.zscore(
                            X[i][:, start_left:start_left + cand_len])
                        dist_left = np.linalg.norm(candidate - comparison)
                        bsf_dist = min(dist_left * dist_left, bsf_dist)

                        # for odd lengths
                        if start_left == start_right:
                            continue

                        # right
                        if start_right == X_lens[i] - cand_len + 1:
                            start_right = 0
                        comparison = ShapeletTransform.zscore(
                            X[i][:, start_right:start_right + cand_len])
                        dist_right = np.linalg.norm(candidate - comparison)
                        bsf_dist = min(dist_right * dist_right, bsf_dist)

                        start_left -= 1
                        start_right += 1

                    orderline.append((bsf_dist, binary_class_identifier))
                    # sorting required after each add for early IG abandon.
                    # timsort should be efficient as array is almost in
                    # order - insertion-sort like behaviour in this case.
                    # Can't use heap as need to traverse in order multiple
                    # times, not just access root
                    orderline.sort()

                    if len(orderline) > 2:
                        ig_upper_bound = \
                            ShapeletTransform.calc_early_binary_ig(
                                orderline, num_visited_this_class,
                                num_visited_other_class,
                                binary_ig_this_class_count -
                                num_visited_this_class,
                                binary_ig_other_class_count -
                                num_visited_other_class)
                        # print("upper: "+str(ig_upper_bound))
                        if ig_upper_bound <= ig_cutoff:
                            candidate_rejected = True
                            break

                candidates_evaluated += 1
                if self.verbose > 3 and candidates_evaluated % 100 == 0:
                    print("candidates evaluated: " + str(candidates_evaluated))

                # only do if candidate was not rejected
                if candidate_rejected is False:
                    final_ig = ShapeletTransform.calc_binary_ig(
                        orderline, binary_ig_this_class_count,
                        binary_ig_other_class_count)
                    accepted_candidate = Shapelet(series_id, cand_start_pos,
                                                  cand_len, final_ig,
                                                  candidate)

                    # add to min heap to store shapelets for this class
                    shapelet_heaps_by_class[this_class_val].push(
                        accepted_candidate)

                    # informal, but extra 10% allowance for self similar later
                    if shapelet_heaps_by_class[
                        this_class_val].get_size() > \
                            self.max_shapelets_to_store_per_class * 3:
                        shapelet_heaps_by_class[this_class_val].pop()

                # Takes into account the use of the MAX shapelet calculation
                # time to not exceed the time_limit (not exact, but likely a
                # good guess).
                if hasattr(self,
                           'time_contract_in_mins') and \
                        self.time_contract_in_mins \
                        > 0:
                    time_now = time_taken()
                    time_this_shapelet = (time_now - time_last_shapelet)
                    if time_this_shapelet > max_time_calc_shapelet:
                        max_time_calc_shapelet = time_this_shapelet
                        print(max_time_calc_shapelet)
                    time_last_shapelet = time_now

                    # add a little 1% leeway to the timing in case one run
                    # was slightly faster than another based on the CPU.
                    time_in_seconds = self.time_contract_in_mins * 60
                    max_shapelet_time_percentage = (max_time_calc_shapelet /
                                                    100.0) * 0.75
                    if (time_now + max_shapelet_time_percentage) > \
                            time_in_seconds:
                        if self.verbose > 0:
                            print("No more time available! It's been {0:02d}:{"
                                  "1:02}".format(
                                      int(round(time_now / 60, 3)),
                                      int((round(time_now / 60, 3) -
                                           int(round(time_now / 60, 3))) *
                                          60)))
                        time_finished = True
                        break
                    else:
                        if self.verbose > 0:
                            if candidate_rejected is False:
                                print(
                                    "Candidate finished. {0:02d}:{1:02} "
                                    "remaining".format(
                                        int(
                                            round(
                                                self.time_contract_in_mins -
                                                time_now / 60, 3)),
                                        int((round(
                                            self.time_contract_in_mins -
                                            time_now / 60, 3) - int(
                                                round(
                                                    (self.time_contract_in_mins
                                                     - time_now) / 60, 3))) *
                                            60)))
                            else:
                                print(
                                    "Candidate rejected. {0:02d}:{1:02} "
                                    "remaining".format(
                                        int(
                                            round((self.time_contract_in_mins -
                                                   time_now) / 60, 3)),
                                        int((round(
                                            (self.time_contract_in_mins -
                                             time_now) / 60,
                                            3) - int(
                                                round(
                                                    (self.time_contract_in_mins
                                                     - time_now) / 60, 3))) *
                                            60)))

            # stopping condition: in case of iterative transform (i.e.
            # num_cases_to_sample have been visited)
            #                     in case of contracted transform (i.e. time
            #                     limit has been reached)
            case_idx += 1

            if case_idx >= num_series_to_visit:
                if hasattr(self,
                           'time_contract_in_mins') and time_finished is not \
                        True:
                    case_idx = 0
            elif case_idx >= num_series_to_visit or time_finished:
                if self.verbose > 0:
                    print("Stopping search")
                break

        # remove self similar here
        # for each class value
        #       get list of shapelets
        #       sort by quality
        #       remove self similar

        self.shapelets = []
        for class_val in distinct_class_vals:
            by_class_descending_ig = sorted(
                shapelet_heaps_by_class[class_val].get_array(),
                key=itemgetter(0),
                reverse=True)

            if self.remove_self_similar and len(by_class_descending_ig) > 0:
                by_class_descending_ig = \
                    ShapeletTransform.remove_self_similar_shapelets(
                        by_class_descending_ig)
            else:
                # need to extract shapelets from tuples
                by_class_descending_ig = [x[2] for x in by_class_descending_ig]

            # if we have more than max_shapelet_per_class, trim to that
            # amount here
            if len(by_class_descending_ig) > \
                    self.max_shapelets_to_store_per_class:
                max_n = self.max_shapelets_to_store_per_class
                by_class_descending_ig = by_class_descending_ig[:max_n]

            self.shapelets.extend(by_class_descending_ig)

        # final sort so that all shapelets from all classes are in
        # descending order of information gain
        self.shapelets.sort(key=lambda x: x.info_gain, reverse=True)
        self.is_fitted_ = True

        # warn the user if fit did not produce any valid shapelets
        if len(self.shapelets) == 0:
            warnings.warn(
                "No valid shapelets were extracted from this dataset and "
                "calling the transform method "
                "will raise an Exception. Please re-fit the transform with "
                "other data and/or "
                "parameter options.")

        self._is_fitted = True
        return self
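A minimal sketch of the per-class trimming step above, assuming each candidate is stored as an (info_gain, series_id, shapelet) tuple and that a caller-supplied overlaps() predicate (a hypothetical helper, not the library's remove_self_similar_shapelets) decides whether two candidates are self-similar:

# Sketch only: rank candidates by information gain, drop self-similar ones,
# and cap the number kept per class.
from operator import itemgetter


def trim_candidates(candidates, max_per_class, overlaps):
    # sort candidates by information gain, best first
    ranked = sorted(candidates, key=itemgetter(0), reverse=True)

    kept = []
    for _, series_id, shapelet in ranked:
        # reject a candidate that is self-similar to an already kept shapelet
        if any(overlaps(shapelet, series_id, k_shp, k_sid)
               for k_shp, k_sid in kept):
            continue
        kept.append((shapelet, series_id))
        if len(kept) == max_per_class:
            break
    return [shp for shp, _ in kept]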
Example 22
    def fit(self, X, y):
        """Build a forest of trees from the training set (X, y) using random
        intervals and summary features
        Parameters
        ----------
        X : array-like or sparse matrix of shape = [n_instances,
        series_length] or shape = [n_instances,n_columns]
            The training input samples.  If a Pandas data frame is passed it
            must have a single column (i.e. univariate
            classification. RISE has no bespoke method for multivariate
            classification as yet.
        y : array-like, shape =  [n_instances]    The class labels.

        Returns
        -------
        self : object
        """
        X, y = check_X_y(X, y, enforce_univariate=True)
        X = tabularize(X, return_array=True)
        n_instances, self.series_length = X.shape

        rng = check_random_state(self.random_state)

        self.n_classes = np.unique(y).shape[0]

        self.classes_ = class_distribution(np.asarray(y).reshape(-1, 1))[0][0]
        self.n_intervals = int(math.sqrt(self.series_length))
        if self.n_intervals == 0:
            self.n_intervals = 1
        if self.series_length < self.min_interval:
            self.min_interval = self.series_length
        self.intervals = np.zeros((self.n_estimators, self.n_intervals, 2),
                                  dtype=int)
        for i in range(self.n_estimators):
            transformed_x = np.empty(shape=(3 * self.n_intervals, n_instances))
            # Find the random intervals for classifier i and concatenate
            # the features
            for j in range(self.n_intervals):
                self.intervals[i][j][0] = rng.randint(self.series_length -
                                                      self.min_interval)
                length = rng.randint(self.series_length -
                                     self.intervals[i][j][0] - 1)
                if length < self.min_interval:
                    length = self.min_interval
                self.intervals[i][j][1] = self.intervals[i][j][0] + length
                # Summary transforms are hard coded here, so not configurable
                means = np.mean(
                    X[:, self.intervals[i][j][0]:self.intervals[i][j][1]],
                    axis=1)
                std_dev = np.std(
                    X[:, self.intervals[i][j][0]:self.intervals[i][j][1]],
                    axis=1)
                slope = self._lsq_fit(
                    X[:, self.intervals[i][j][0]:self.intervals[i][j][1]])
                transformed_x[3 * j] = means
                transformed_x[3 * j + 1] = std_dev
                transformed_x[3 * j + 2] = slope
            tree = clone(self.base_estimator)
            tree.set_params(**{"random_state": self.random_state})
            transformed_x = transformed_x.T
            tree.fit(transformed_x, y)
            self.classifiers.append(tree)
        self._is_fitted = True
        return self
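A small sketch of the three per-interval summary features computed above (mean, standard deviation and least-squares slope) for a plain numpy array; np.polyfit stands in for the snippet's private _lsq_fit helper, so treat it as an illustration rather than the actual implementation:

import numpy as np


def interval_features(X, start, end):
    # X is a (n_instances, series_length) array; slice out one interval
    segment = X[:, start:end]
    means = segment.mean(axis=1)
    std_dev = segment.std(axis=1)
    # degree-1 least-squares fit per instance; polyfit returns [slope, intercept]
    t = np.arange(segment.shape[1])
    slope = np.polyfit(t, segment.T, deg=1)[0]
    return means, std_dev, slope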
Example 23
    def fit(self, X, y, sample_weight=None):
        """
        Build a forest of trees from the training set (X, y).

        Parameters
        ----------
        X : array-like or sparse matrix of shape (n_samples, n_features)
            The training input samples. Internally, its dtype will be converted
            to ``dtype=np.float32``. If a sparse matrix is provided, it will be
            converted into a sparse ``csc_matrix``.
        y : array-like of shape (n_samples,) or (n_samples, n_outputs)
            The target values (class labels in classification, real numbers in
            regression).
        sample_weight : array-like of shape (n_samples,), default=None
            Sample weights. If None, then samples are equally weighted. Splits
            that would create child nodes with net zero or negative weight are
            ignored while searching for a split in each node. In the case of
            classification, splits are also ignored if they would result in any
            single class carrying a negative weight in either child node.

        Returns
        -------
        self : object
        """
        X, y = check_X_y(X, y, enforce_univariate=True)

        # Validate or convert input data
        if sample_weight is not None:
            sample_weight = check_array(sample_weight, ensure_2d=False)
        if issparse(X):
            # Pre-sort indices to avoid that each individual tree of the
            # ensemble sorts the indices.
            X.sort_indices()

        # Remap output
        self.n_columns = X.shape[1]
        self.n_features_ = X.shape[1] if X.ndim == 2 else 1

        y = np.atleast_1d(y)
        if y.ndim == 2 and y.shape[1] == 1:
            warn(
                "A column-vector y was passed when a 1d array was"
                " expected. Please change the shape of y to "
                "(n_samples,), for example using ravel().",
                DataConversionWarning,
                stacklevel=2)

        if y.ndim == 1:
            # reshape is necessary to preserve data contiguity, which
            # [:, np.newaxis] does not.
            y = np.reshape(y, (-1, 1))

        self.n_outputs_ = y.shape[1]

        y, expanded_class_weight = self._validate_y_class_weight(y)

        if getattr(y, "dtype", None) != DOUBLE or not y.flags.contiguous:
            y = np.ascontiguousarray(y, dtype=DOUBLE)

        if expanded_class_weight is not None:
            if sample_weight is not None:
                sample_weight = sample_weight * expanded_class_weight
            else:
                sample_weight = expanded_class_weight

        # Get bootstrap sample size
        n_samples_bootstrap = _get_n_samples_bootstrap(
            n_samples=X.shape[0], max_samples=self.max_samples)

        # Check parameters
        self._validate_estimator()

        if not self.bootstrap and self.oob_score:
            raise ValueError("Out of bag estimation only available"
                             " if bootstrap=True")

        random_state = check_random_state(self.random_state)

        if not self.warm_start or not hasattr(self, "estimators_"):
            # Free allocated memory, if any
            self.estimators_ = []

        n_more_estimators = self.n_estimators - len(self.estimators_)

        if n_more_estimators < 0:
            raise ValueError('n_estimators=%d must be larger than or equal to '
                             'len(estimators_)=%d when warm_start==True' %
                             (self.n_estimators, len(self.estimators_)))

        elif n_more_estimators == 0:
            warn("Warm-start fitting without increasing n_estimators does not "
                 "fit new trees.")
        else:
            if self.warm_start and len(self.estimators_) > 0:
                # We draw from the random state to get the random state we
                # would have got if we hadn't used a warm_start.
                random_state.randint(MAX_INT, size=len(self.estimators_))

            trees = [
                self._make_estimator(append=False, random_state=random_state)
                for i in range(n_more_estimators)
            ]

            # Parallel loop: for standard random forests, the threading
            # backend is preferred as the Cython code for fitting the trees
            # internally releases the Python GIL, making threading more
            # efficient than multiprocessing in that case.
            # Here, however, whole pipelines are fitted in parallel, so
            # multiprocessing is more efficient.
            trees = Parallel(n_jobs=self.n_jobs, verbose=self.verbose)(
                delayed(_parallel_build_trees)(
                    t,
                    self,
                    X,
                    y,
                    sample_weight,
                    i,
                    len(trees),
                    verbose=self.verbose,
                    class_weight=self.class_weight,
                    n_samples_bootstrap=n_samples_bootstrap)
                for i, t in enumerate(trees))

            # Collect newly grown trees
            self.estimators_.extend(trees)

        if self.oob_score:
            self._set_oob_score(X, y)

        # Decapsulate classes_ attributes
        if hasattr(self, "classes_") and self.n_outputs_ == 1:
            self.n_classes_ = self.n_classes_[0]
            self.classes_ = self.classes_[0]

        self._is_fitted = True
        return self
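For reference, a sketch of the max_samples convention that _get_n_samples_bootstrap encapsulates (None uses all samples, an int is capped at n_samples, a float is treated as a fraction); this mirrors the intent only and is not the private scikit-learn function:

import numbers


def n_samples_bootstrap(n_samples, max_samples):
    # Sketch of the sklearn-style forest convention for bootstrap sample size.
    if max_samples is None:
        return n_samples                      # draw as many as there are samples
    if isinstance(max_samples, numbers.Integral):
        return min(max_samples, n_samples)    # explicit count, capped
    return max(round(n_samples * max_samples), 1)  # fraction of the data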
Example 24
    def fit(self, X, y):
        """Build a WEASEL classifiers from the training set (X, y),

        Parameters
        ----------
        X : nested pandas DataFrame of shape [n_instances, 1]
            Nested dataframe with univariate time-series in cells.
        y : array-like, shape = [n_instances] The class labels.

        Returns
        -------
        self : object
        """
        X, y = check_X_y(X, y, enforce_univariate=True, coerce_to_numpy=True)

        # Window length parameter space dependent on series length
        self.n_instances, self.series_length = X.shape[0], X.shape[-1]

        win_inc = self.compute_window_inc()

        self.max_window = int(min(self.series_length, self.max_window))
        self.window_sizes = list(
            range(self.min_window, self.max_window, win_inc))

        self.highest_bit = (math.ceil(math.log2(self.max_window))) + 1
        rng = check_random_state(self.random_state)

        all_words = [dict() for x in range(len(X))]

        for window_size in self.window_sizes:

            transformer = SFA(
                word_length=rng.choice(self.word_lengths),
                alphabet_size=self.alphabet_size,
                window_size=window_size,
                norm=rng.choice(self.norm_options),
                anova=self.anova,
                # levels=rng.choice([1, 2, 3]),
                binning_method=self.binning_strategy,
                bigrams=self.bigrams,
                remove_repeat_words=False,
                lower_bounding=False,
                save_words=False,
            )

            sfa_words = transformer.fit_transform(X, y)

            self.SFA_transformers.append(transformer)
            bag = sfa_words[0]  # .iloc[:, 0]

            # chi-squared test to keep only relevant features
            relevant_features = {}
            apply_chi_squared = self.chi2_threshold > 0
            if apply_chi_squared:
                bag_vec = DictVectorizer(sparse=False).fit_transform(bag)
                chi2_statistics, p = chi2(bag_vec, y)
                relevant_features = np.where(
                    chi2_statistics >= self.chi2_threshold)[0]

            # merging bag-of-patterns of different window_sizes
            # to single bag-of-patterns with prefix indicating
            # the used window-length
            for j in range(len(bag)):
                for (key, value) in bag[j].items():
                    # chi-squared test
                    if (not apply_chi_squared) or (key in relevant_features):
                        # prepend the window-size prefix to the words to
                        # distinguish between window sizes
                        if isinstance(key, tuple):
                            word = (((key[0] << self.highest_bit) | key[1]) <<
                                    3) | window_size
                        else:
                            # word = ((key << self.highest_bit) << 3) \
                            #        | window_size
                            word = WEASEL.shift_left(key, self.highest_bit,
                                                     window_size)

                        all_words[j][word] = value

        self.clf = make_pipeline(
            DictVectorizer(sparse=False),
            StandardScaler(with_mean=True, copy=False),
            LogisticRegression(
                max_iter=5000,
                solver="liblinear",
                dual=True,
                # class_weight="balanced",
                penalty="l2",
                random_state=self.random_state,
            ),
        )

        self.clf.fit(all_words, y)
        self._is_fitted = True
        return self
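The bag-merging loop above disambiguates identical SFA words found at different window sizes by packing the window size into the low bits of the word. A toy sketch of that idea, with the bit width derived from the maximum window size as an assumption rather than the exact layout used above:

import math


def pack_word(key, window_size, max_window):
    # Encode the window size into the low bits of the SFA word so that the
    # same word seen at different window sizes stays distinct in the merged
    # bag-of-patterns.
    window_bits = math.ceil(math.log2(max_window)) + 1
    return (key << window_bits) | window_size


# e.g. the same SFA word 0b1011 seen at window sizes 8 and 16 maps to two
# different keys in the merged dictionary:
assert pack_word(0b1011, 8, 32) != pack_word(0b1011, 16, 32)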
Example 25
    def fit(self, X, y):
        """Build a forest of trees from the training set (X, y) using random
        intervals and spectral features
        Parameters
        ----------
        X : array-like or sparse matrix of shape = [n_instances,
        series_length] or shape = [n_instances,n_columns]
            The training input samples.  If a Pandas data frame is passed it
            must have a single column (i.e. univariate
            classification. RISE has no bespoke method for multivariate
            classification as yet.
        y : array-like, shape =  [n_instances]    The class labels.

        Returns
        -------
        self : object
        """
        X, y = check_X_y(X, y, enforce_univariate=True, coerce_to_numpy=True)
        X = X.squeeze(1)

        n_instances, self.series_length = X.shape

        rng = check_random_state(self.random_state)

        self.estimators_ = []
        self.n_classes = np.unique(y).shape[0]
        self.classes_ = class_distribution(np.asarray(y).reshape(-1, 1))[0][0]
        self.intervals = np.zeros((self.n_estimators, 2), dtype=int)
        self.intervals[0][0] = 0
        self.intervals[0][1] = self.series_length
        for i in range(1, self.n_estimators):
            self.intervals[i][0] = rng.randint(self.series_length -
                                               self.min_interval)
            self.intervals[i][1] = rng.randint(
                self.intervals[i][0] + self.min_interval, self.series_length)
        # Check lag against global properties
        self.acf_lag_ = self.acf_lag
        if self.acf_lag > self.series_length - self.acf_min_values:
            self.acf_lag_ = self.series_length - self.acf_min_values
        if self.acf_lag < 0:
            self.acf_lag_ = 1
        self.lags = np.zeros(self.n_estimators, dtype=int)
        for i in range(0, self.n_estimators):
            temp_lag = self.acf_lag_
            if (temp_lag > self.intervals[i][1] - self.intervals[i][0] -
                    self.acf_min_values):
                temp_lag = (self.intervals[i][1] - self.intervals[i][0] -
                            self.acf_min_values)
            if temp_lag < 0:
                temp_lag = 1
            self.lags[i] = int(temp_lag)
            acf_x = np.empty(shape=(n_instances, self.lags[i]))
            ps_len = (self.intervals[i][1] - self.intervals[i][0]) / 2
            ps_x = np.empty(shape=(n_instances, int(ps_len)))
            for j in range(0, n_instances):
                acf_x[j] = acf(X[j, self.intervals[i][0]:self.intervals[i][1]],
                               temp_lag)
                ps_x[j] = ps(X[j, self.intervals[i][0]:self.intervals[i][1]])
            transformed_x = np.concatenate((acf_x, ps_x), axis=1)
            #            transformed_x=acf_x
            tree = clone(self.base_estimator)
            # set random state, but not the same, so that estimators vary
            tree.set_params(
                **{"random_state": rng.randint(np.iinfo(np.int32).max)})
            tree.fit(transformed_x, y)
            self.estimators_.append(tree)

        self._is_fitted = True
        return self
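A rough numpy sketch of the two feature blocks concatenated above; the acf and ps helpers in the snippet are sktime internals, so the autocorrelation and periodogram below only illustrate the idea:

import numpy as np


def acf_features(x, max_lag):
    # autocorrelation at lags 1..max_lag for a single, zero-meaned interval
    x = x - x.mean()
    denom = np.dot(x, x)
    return np.array([np.dot(x[:-lag], x[lag:]) / denom
                     for lag in range(1, max_lag + 1)])


def ps_features(x):
    # periodogram-style power spectrum of the interval (DC term dropped),
    # standing in for the snippet's ps() helper
    fft = np.fft.rfft(x)
    return (np.abs(fft) ** 2)[1:]


x = np.sin(np.linspace(0, 8 * np.pi, 64)) + 0.1 * np.random.randn(64)
features = np.concatenate([acf_features(x, 8), ps_features(x)])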
Example 26
    def fit(self, X, y):
        """Build an ensemble of individual TDE classifiers from the training
        set (X,y), through randomising over the parameter space to a set
        number of times then selecting new parameters using Gaussian
        processes

        Parameters
        ----------
        X : nested pandas DataFrame of shape [n_instances, 1]
            Nested dataframe with univariate time-series in cells.
        y : array-like, shape = [n_instances] The class labels.

        Returns
        -------
        self : object
        """

        X, y = check_X_y(X, y, enforce_univariate=True)
        y = y.values if isinstance(y, pd.Series) else y

        self.time_limit = self.time_limit * 60
        self.n_instances, self.series_length = X.shape[0], len(X.iloc[0, 0])
        self.n_classes = np.unique(y).shape[0]
        self.classes_ = class_distribution(np.asarray(y).reshape(-1, 1))[0][0]
        for index, classVal in enumerate(self.classes_):
            self.class_dictionary[classVal] = index

        self.classifiers = []
        self.weights = []
        self.prev_parameters_x = []
        self.prev_parameters_y = []

        # Window length parameter space dependent on series length
        max_window_searches = self.series_length / 4
        max_window = int(self.series_length * self.max_win_len_prop)
        win_inc = int((max_window - self.min_window) / max_window_searches)
        if win_inc < 1:
            win_inc = 1

        possible_parameters = self._unique_parameters(max_window, win_inc)
        num_classifiers = 0
        start_time = time.time()
        train_time = 0
        subsample_size = int(self.n_instances * 0.7)
        lowest_acc = 0
        lowest_acc_idx = 0

        if self.time_limit > 0:
            self.n_parameter_samples = 0

        rng = check_random_state(self.random_state)

        while (train_time < self.time_limit or num_classifiers <
               self.n_parameter_samples) and len(possible_parameters) > 0:
            if num_classifiers < self.randomly_selected_params:
                parameters = possible_parameters.pop(
                    rng.randint(0, len(possible_parameters)))
            else:
                gp = GaussianProcessRegressor(random_state=self.random_state)
                gp.fit(self.prev_parameters_x, self.prev_parameters_y)
                preds = gp.predict(possible_parameters)
                parameters = possible_parameters.pop(
                    rng.choice(np.flatnonzero(preds == preds.max())))

            subsample = rng.choice(self.n_instances,
                                   size=subsample_size,
                                   replace=False)
            X_subsample = X.iloc[subsample, :]
            y_subsample = y[subsample]

            tde = IndividualTDE(*parameters,
                                alphabet_size=self.alphabet_size,
                                random_state=self.random_state)
            tde.fit(X_subsample, y_subsample)

            tde.accuracy = self._individual_train_acc(tde, y_subsample,
                                                      subsample_size,
                                                      lowest_acc)
            weight = math.pow(tde.accuracy, 4)

            if num_classifiers < self.max_ensemble_size:
                if tde.accuracy < lowest_acc:
                    lowest_acc = tde.accuracy
                    lowest_acc_idx = num_classifiers
                self.weights.append(weight)
                self.classifiers.append(tde)
            elif tde.accuracy > lowest_acc:
                self.weights[lowest_acc_idx] = weight
                self.classifiers[lowest_acc_idx] = tde
                lowest_acc, lowest_acc_idx = self._worst_ensemble_acc()

            self.prev_parameters_x.append(parameters)
            self.prev_parameters_y.append(tde.accuracy)

            num_classifiers += 1
            train_time = time.time() - start_time

        self.n_estimators = len(self.classifiers)
        self.weight_sum = np.sum(self.weights)

        self._is_fitted = True
        return self
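Once randomly_selected_params random draws have been made, the loop above fits a Gaussian process to the (parameters, accuracy) history and picks the untried parameter set with the highest predicted accuracy. A condensed sketch of that selection step, assuming the parameter sets are numeric vectors:

import numpy as np
from sklearn.gaussian_process import GaussianProcessRegressor


def select_next_parameters(tried_x, tried_y, candidates, rng):
    # Fit a GP to the (parameters, accuracy) history and pop the candidate
    # with the highest predicted accuracy, ties broken at random.
    gp = GaussianProcessRegressor()
    gp.fit(np.asarray(tried_x), np.asarray(tried_y))
    preds = gp.predict(np.asarray(candidates))
    best = rng.choice(np.flatnonzero(preds == preds.max()))
    return candidates.pop(best)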
Example 27
    def fit(self, X, y):
        """Build an ensemble of BOSS classifiers from the training set (X,
        y), either through randomising over the para
         space to make a fixed size ensemble quickly or by creating a
         variable size ensemble of those within a threshold
         of the best
        Parameters
        ----------
        X : nested pandas DataFrame of shape [n_instances, 1]
            Nested dataframe with univariate time-series in cells.
        y : array-like, shape = [n_instances] The class labels.

        Returns
        -------
        self : object
        """

        X, y = check_X_y(X, y, enforce_univariate=True)
        y = y.values if isinstance(y, pd.Series) else y

        self.time_limit = self.time_limit * 60
        self.n_instances, self.series_length = X.shape[0], len(X.iloc[0, 0])
        self.n_classes = np.unique(y).shape[0]
        self.classes_ = class_distribution(np.asarray(y).reshape(-1, 1))[0][0]
        for index, classVal in enumerate(self.classes_):
            self.class_dictionary[classVal] = index

        self.classifiers = []
        self.weights = []

        # Window length parameter space dependent on series length

        max_window_searches = self.series_length / 4
        max_window = int(self.series_length * self.max_win_len_prop)
        win_inc = int((max_window - self.min_window) / max_window_searches)
        if win_inc < 1:
            win_inc = 1

        # cBOSS
        if self.randomised_ensemble:
            possible_parameters = self._unique_parameters(max_window, win_inc)
            num_classifiers = 0
            start_time = time.time()
            train_time = 0
            subsample_size = int(self.n_instances * 0.7)
            lowest_acc = 0
            lowest_acc_idx = 0

            rng = check_random_state(self.random_state)

            if self.time_limit > 0:
                self.n_parameter_samples = 0

            while (train_time < self.time_limit or num_classifiers <
                   self.n_parameter_samples) and len(possible_parameters) > 0:
                parameters = possible_parameters.pop(
                    rng.randint(0, len(possible_parameters)))

                subsample = rng.choice(self.n_instances, size=subsample_size,
                                       replace=False)
                X_subsample = X.iloc[subsample, :]
                y_subsample = y[subsample]

                boss = BOSSIndividual(*parameters,
                                      alphabet_size=self.alphabet_size,
                                      save_words=False,
                                      random_state=self.random_state)
                boss.fit(X_subsample, y_subsample)
                boss._clean()

                boss.accuracy = self._individual_train_acc(boss, y_subsample,
                                                           subsample_size,
                                                           lowest_acc)
                weight = math.pow(boss.accuracy, 4)

                if num_classifiers < self.max_ensemble_size:
                    if boss.accuracy < lowest_acc:
                        lowest_acc = boss.accuracy
                        lowest_acc_idx = num_classifiers
                    self.weights.append(weight)
                    self.classifiers.append(boss)

                elif boss.accuracy > lowest_acc:
                    self.weights[lowest_acc_idx] = weight
                    self.classifiers[lowest_acc_idx] = boss
                    lowest_acc, lowest_acc_idx = self._worst_ensemble_acc()

                num_classifiers += 1
                train_time = time.time() - start_time
        # BOSS
        else:
            max_acc = -1
            min_max_acc = -1

            for i, normalise in enumerate(self.norm_options):
                for win_size in range(self.min_window, max_window + 1,
                                      win_inc):
                    boss = BOSSIndividual(win_size, self.word_lengths[0],
                                          normalise, self.alphabet_size,
                                          save_words=True,
                                          random_state=self.random_state)
                    boss.fit(X, y)

                    best_classifier_for_win_size = boss
                    best_acc_for_win_size = -1

                    # the word length actually used may be shorter
                    best_word_len = boss.transformer.word_length

                    for n, word_len in enumerate(self.word_lengths):
                        if n > 0:
                            boss = boss._shorten_bags(word_len)

                        boss.accuracy = self._individual_train_acc(
                            boss, y, self.n_instances, best_acc_for_win_size)

                        # print(win_size, boss.accuracy)
                        if boss.accuracy >= best_acc_for_win_size:
                            best_acc_for_win_size = boss.accuracy
                            best_classifier_for_win_size = boss
                            best_word_len = word_len

                    if self._include_in_ensemble(best_acc_for_win_size,
                                                 max_acc,
                                                 min_max_acc,
                                                 len(self.classifiers)):
                        best_classifier_for_win_size._clean()
                        best_classifier_for_win_size._set_word_len(
                            best_word_len)
                        self.classifiers.append(best_classifier_for_win_size)

                        # print("appending", best_acc_for_win_size, win_size)
                        if best_acc_for_win_size > max_acc:
                            max_acc = best_acc_for_win_size
                            self.classifiers = list(compress(
                                self.classifiers, [
                                    classifier.accuracy >= max_acc *
                                    self.threshold for c, classifier in
                                    enumerate(self.classifiers)]))

                        min_max_acc, min_acc_ind = \
                            self._worst_ensemble_acc()

                        if len(self.classifiers) > self.max_ensemble_size:
                            if min_acc_ind > -1:
                                del self.classifiers[min_acc_ind]
                                min_max_acc, min_acc_ind = \
                                    self._worst_ensemble_acc()

            self.weights = [1 for n in range(len(self.classifiers))]

        self.n_estimators = len(self.classifiers)
        self.weight_sum = np.sum(self.weights)

        self._is_fitted = True
        return self
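In the non-randomised branch, whenever a new best accuracy is found the ensemble is filtered to members within self.threshold of it. A standalone sketch of that retention rule, using a hypothetical prune_ensemble helper:

def prune_ensemble(classifiers, accuracies, threshold, max_acc):
    # Keep only members whose accuracy is within `threshold` of the best
    # accuracy seen so far, mirroring the compress(...) filter above.
    kept = [(clf, acc) for clf, acc in zip(classifiers, accuracies)
            if acc >= threshold * max_acc]
    return [clf for clf, _ in kept], [acc for _, acc in kept]


# e.g. with threshold=0.92 and max_acc=0.9, a member at accuracy 0.80
# (below 0.828) is dropped while members at 0.9 and 0.85 are kept.
clfs, accs = prune_ensemble(["a", "b", "c"], [0.9, 0.85, 0.80], 0.92, 0.9)
assert clfs == ["a", "b"]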
Example 28
    def fit(self, X, y):
        """Build a WEASEL classifiers from the training set (X, y),

        Parameters
        ----------
        X : nested pandas DataFrame of shape [n_instances, 1]
            Nested dataframe with univariate time-series in cells.
        y : array-like, shape = [n_instances] The class labels.

        Returns
        -------
        self : object
        """

        X, y = check_X_y(X, y, enforce_univariate=True)
        y = y.values if isinstance(y, pd.Series) else y

        # Window length parameter space dependent on series length
        self.n_instances, self.series_length = X.shape[0], len(X.iloc[0, 0])
        self.max_window = min(self.series_length, self.max_window)
        self.window_sizes = list(
            range(self.min_window, self.max_window, self.win_inc))

        max_acc = -1
        self.highest_bit = (math.ceil(math.log2(self.max_window)) + 1)

        final_bag_vec = None

        for norm in self.norm_options:
            # transformers = []

            for w, word_length in enumerate(self.word_lengths):
                all_words = [dict() for x in range(len(X))]
                transformers = []

                for i, window_size in enumerate(self.window_sizes):
                    # if w == 0:  # only compute once, otherwise shorten
                    transformer = SFA(word_length=np.max(word_length),
                                      alphabet_size=self.alphabet_size,
                                      window_size=window_size,
                                      norm=norm,
                                      anova=self.anova,
                                      binning_method=self.binning_strategy,
                                      bigrams=self.bigrams,
                                      remove_repeat_words=False,
                                      lower_bounding=False,
                                      save_words=False)
                    sfa_words = transformer.fit_transform(X, y)
                    transformers.append(transformer)

                    # use the shortening of words trick
                    # sfa_words = transformers[i]._shorten_bags(word_length)

                    # TODO refactor? dicts not really needed here ...
                    bag = sfa_words.iloc[:, 0]

                    # chi-squared test to keep only relevant features
                    # bag_vec = DictVectorizer(sparse=False).fit_transform(bag)
                    # chi2_statistics, p = chi2(bag_vec, y)
                    # relevant_features = np.where(
                    #    chi2_statistics >= self.chi2_threshold)[0]

                    # merging bag-of-patterns of different window_sizes
                    # to single bag-of-patterns with prefix indicating
                    # the used window-length
                    for j in range(len(bag)):
                        for (key, value) in bag[j].items():
                            # if key in relevant_features:  # chi-squared test
                            # prepend the window-size prefix to the words to
                            # distinguish between window sizes
                            word = (key << self.highest_bit) | window_size
                            # X_all_words[j].append((word, value))
                            all_words[j][word] = value

                # TODO use CountVectorizer instead on actual words ... ???
                vectorizer = DictVectorizer(sparse=True)
                bag_vec = vectorizer.fit_transform(all_words)

                clf = LogisticRegression(max_iter=5000,
                                         solver="liblinear",
                                         dual=True,
                                         penalty="l2",
                                         random_state=self.random_state)
                current_acc = cross_val_score(clf, bag_vec, y, cv=5).mean()

                # clf = RandomForestClassifier(oob_score=True,
                #                              n_estimators=1000,
                #                              n_jobs=-1).fit(bag_vec, y)
                # current_acc = clf.oob_score_

                # print("Train acc:", norm, word_length, current_acc)

                if current_acc > max_acc:
                    max_acc = current_acc
                    self.vectorizer = vectorizer
                    self.clf = clf
                    self.SFA_transformers = transformers
                    self.best_word_length = word_length
                    final_bag_vec = bag_vec

                if max_acc == 1.0:
                    break  # there can be no better model than 1.0

        # # fit final model using all words
        # for i, window_size in enumerate(self.window_sizes):
        #     self.SFA_transformers[i] = \
        #         SFA(word_length=np.max(self.word_lengths),
        #             alphabet_size=self.alphabet_size,
        #             window_size=window_size,
        #             norm=norm,
        #             anova=self.anova,
        #             binning_method=self.binning_strategy,
        #             bigrams=self.bigrams,
        #             remove_repeat_words=False,
        #             lower_bounding=False,
        #             save_words=False)
        #     self.SFA_transformers[i].fit_transform(X, y)

        self.clf.fit(final_bag_vec, y)
        self._is_fitted = True
        return self
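The outer loops above amount to model selection over (norm, word_length) configurations scored by 5-fold cross-validation, with an early exit at a perfect score. A stripped-down sketch of that selection pattern, where feature_builders is a hypothetical mapping from a configuration label to a callable producing its vectorised bag (the SFA feature extraction itself is omitted):

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score


def select_best_config(feature_builders, y, random_state=None):
    # Score each configuration with 5-fold CV and keep the classifier fitted
    # on the best-scoring bag; stop early when a perfect score is reached.
    best = (None, None, -1.0)
    for config, build in feature_builders.items():
        bag_vec = build()
        clf = LogisticRegression(max_iter=5000, solver="liblinear",
                                 dual=True, penalty="l2",
                                 random_state=random_state)
        acc = cross_val_score(clf, bag_vec, y, cv=5).mean()
        if acc > best[2]:
            best = (config, clf.fit(bag_vec, y), acc)
        if acc == 1.0:
            break  # no configuration can beat a perfect CV score
    return best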