Example #1
def test_knn_on_arrowhead():
    # load arrowhead data
    X_train, y_train = load_arrow_head(split="train", return_X_y=True)
    X_test, y_test = load_arrow_head(split="test", return_X_y=True)
    for distance in distance_functions:
        knn = KNeighborsTimeSeriesClassifier(distance=distance)
        knn.fit(X_train, y_train)
        pred = knn.predict(X_test)
        correct = sum(p == actual for p, actual in zip(pred, y_test))
        assert correct == expected_correct[distance]
def test_knn_on_unit_test():
    """Test function for elastic knn, to be reinstated soon."""
    # load unit test data
    X_train, y_train = load_unit_test(split="train", return_X_y=True)
    X_test, y_test = load_unit_test(split="test", return_X_y=True)
    for distance in distance_functions:
        knn = KNeighborsTimeSeriesClassifier(distance=distance)
        knn.fit(X_train, y_train)
        pred = knn.predict(X_test)
        correct = sum(p == actual for p, actual in zip(pred, y_test))
        assert correct == expected_correct[distance]
def test_knn_bounding_matrix():
    """Test knn with custom bounding parameters."""
    X_train, y_train = load_unit_test(split="train", return_X_y=True)
    X_test, y_test = load_unit_test(split="test", return_X_y=True)
    for distance in distance_functions:
        knn = KNeighborsTimeSeriesClassifier(
            distance=distance, distance_params={"window": 0.5}
        )
        knn.fit(X_train, y_train)
        pred = knn.predict(X_test)
        correct = sum(p == actual for p, actual in zip(pred, y_test))
        assert correct == expected_correct[distance]
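For orientation, here is a minimal standalone version of the pattern these tests exercise. This is a sketch, not part of the test suite: the import paths assume the sktime package layout used by these examples, and the "dtw" distance key is an assumption rather than something taken from the tests above.

# Minimal sketch of the 1-NN time-series classification pattern above.
# Assumptions: sktime-style imports; "dtw" as a valid distance key.
from sktime.classification.distance_based import KNeighborsTimeSeriesClassifier
from sktime.datasets import load_unit_test

X_train, y_train = load_unit_test(split="train", return_X_y=True)
X_test, y_test = load_unit_test(split="test", return_X_y=True)

knn = KNeighborsTimeSeriesClassifier(n_neighbors=1, distance="dtw")
knn.fit(X_train, y_train)
accuracy = (knn.predict(X_test) == y_test).mean()
print(f"1-NN DTW accuracy on unit_test: {accuracy:.3f}")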
Example #4
    def bop_pipeline(X, y):
        steps = [
            ("transform", SAX(remove_repeat_words=True)),
            (
                "clf",
                KNeighborsTimeSeriesClassifier(
                    n_neighbors=1, metric=euclidean_distance
                ),
            ),
        ]
        pipeline = Pipeline(steps)

        series_length = X.iloc[0, 0].shape[0]
        max_window_searches = series_length / 4
        win_inc = max(1, int((series_length - 10) / max_window_searches))
        window_sizes = list(range(10, series_length + 1, win_inc))

        cv_params = {
            "transform__word_length": [8, 10, 12, 14, 16],
            "transform__alphabet_size": [2, 3, 4],
            "transform__window_size": window_sizes,
        }
        model = GridSearchCV(pipeline, cv_params, cv=5)
        model.fit(X, y)
        return model
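A hedged usage sketch for bop_pipeline follows: the loader name is an assumption, and any nested pandas DataFrame of univariate series (the format that X.iloc[0, 0] implies) with matching labels would work in its place.

# Hypothetical driver for bop_pipeline (load_gunpoint is an assumption).
from sktime.datasets import load_gunpoint

X_train, y_train = load_gunpoint(split="train", return_X_y=True)
X_test, y_test = load_gunpoint(split="test", return_X_y=True)

model = bop_pipeline(X_train, y_train)  # grid-searched SAX + 1-NN pipeline
print("best params:", model.best_params_)
print("test accuracy:", model.score(X_test, y_test))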
Example #5
    def _fit(self, X, y):
        """Train the classifier.

        Parameters
        ----------
        X - pandas dataframe of training data of shape [n_instances,1].
        y - list of class labels of shape [n_instances].

        Returns
        -------
        self : the shapeDTW object
        """
        # Perform preprocessing on params.
        if not isinstance(self.shape_descriptor_function, str):
            raise TypeError(
                "shape_descriptor_function must be a str. Found '"
                + type(self.shape_descriptor_function).__name__
                + "' instead."
            )

        _reset = False
        if self.metric_params is None:
            self.metric_params = {}
            _reset = True

        # If the shape descriptor is 'compound',
        # calculate the appropriate weighting_factor
        if self.shape_descriptor_function == "compound":
            self._calculate_weighting_factor_value(X, y)

        # Fit the SlidingWindowSegmenter
        sw = SlidingWindowSegmenter(self.subsequence_length)
        sw.fit(X)
        self.sw = sw

        # Transform the training data.
        X = self._preprocess(X)

        # Fit the kNN classifier
        self.knn = KNeighborsTimeSeriesClassifier(n_neighbors=self.n_neighbors)
        self.knn.fit(X, y)
        self.classes_ = self.knn.classes_
        # Hack to pass the unit tests
        if _reset:
            self.metric_params = None
        return self
Example #6
    def fit(self, X, y):
        """Build an ensemble of 1-NN classifiers from th training set (X, y),
        Parameters
        ----------
        X : array-like or sparse matrix of shape = [n_instances, n_columns]
            The training input samples.  If a Pandas data frame is passed,
            it must have a single column. BOSS not configured
            to handle multivariate
        y : array-like, shape = [n_instances] The class labels.

        Returns
        -------
        self : object
        """

        X, y = check_X_y(X, y, enforce_univariate=True)

        # Derivative DTW (DDTW) uses the regular DTW algorithm on data that
        # are transformed into derivatives.
        # To increase the efficiency of DDTW we can pre-transform the data
        # into derivatives, and then call the
        # standard DTW algorithm on it, rather than transforming each series
        # every time a distance calculation
        # is made. Please note that using DDTW elsewhere will not benefit
        # from this speed enhancement
        if ddtw_c in self.distance_measures or wddtw_c in self.distance_measures:
            der_X = DerivativeSlopeTransformer().fit_transform(X)
            # reshape X for use with the efficient cython distance measures
            der_X = np.array(
                [np.asarray([x]).reshape(len(x), 1) for x in der_X.iloc[:, 0]])
        else:
            der_X = None

        # reshape X for use with the efficient cython distance measures
        X = np.array(
            [np.asarray([x]).reshape(len(x), 1) for x in X.iloc[:, 0]])

        self.train_accs_by_classifier = np.zeros(len(self.distance_measures))
        self.train_preds_by_classifier = [None] * len(self.distance_measures)
        self.estimators_ = [None] * len(self.distance_measures)
        self.classes_ = class_distribution(np.asarray(y).reshape(-1, 1))[0][0]
        rand = np.random.RandomState(self.random_state)

        # The default EE uses all training instances for setting parameters,
        # and 100 parameter options per
        # elastic measure. The prop_train_in_param_finding and
        # prop_of_param_options attributes of this class
        # can be used to control this however, using less cases to optimise
        # parameters on the training data
        # and/or using less parameter options.
        #
        # For using less training instances the appropriate number of cases
        # must be sampled from the data.
        # This is achieved through the use of a deterministic
        # StratifiedShuffleSplit
        #
        # For using less parameter options a RandomizedSearchCV is used in
        # place of a GridSearchCV

        param_train_x = None
        der_param_train_x = None
        param_train_y = None

        # If using less cases for parameter optimisation, use the
        # StratifiedShuffleSplit:
        if self.proportion_train_in_param_finding < 1:
            if self.verbose > 0:
                print(
                    "Restricting training cases for parameter optimisation: ",
                    end="")
            sss = StratifiedShuffleSplit(
                n_splits=1,
                test_size=1 - self.proportion_train_in_param_finding,
                random_state=rand)
            for train_index, test_index in sss.split(X, y):
                param_train_x = X[train_index, :]
                param_train_y = y[train_index]
                if der_X is not None:
                    der_param_train_x = der_X[train_index, :]
                if self.verbose > 0:
                    print("using " + str(len(param_train_x)) +
                          " training cases instead of " + str(len(X)) +
                          " for parameter optimisation")
        # else, use the full training data for optimising parameters
        else:
            if self.verbose > 0:
                print("Using all training cases for parameter optimisation")
            param_train_x = X
            param_train_y = y
            if der_X is not None:
                der_param_train_x = der_X

        self.constituent_build_times = []

        if self.verbose > 0:
            print("Using " + str(100 * self.proportion_of_param_options) +
                  " parameter options per measure")
        for dm in range(0, len(self.distance_measures)):
            this_measure = self.distance_measures[dm]

            # uses the appropriate training data as required (either full or
            # smaller sample as per the StratifiedShuffleSplit)
            param_train_to_use = param_train_x
            full_train_to_use = X
            if this_measure is ddtw_c or this_measure is wddtw_c:
                param_train_to_use = der_param_train_x
                full_train_to_use = der_X
                if this_measure is ddtw_c:
                    this_measure = dtw_c
                elif this_measure is wddtw_c:
                    this_measure = wdtw_c

            start_build_time = time.time()
            if self.verbose > 0:
                if self.distance_measures[dm] is ddtw_c or \
                        self.distance_measures[dm] is wddtw_c:
                    print("Currently evaluating " +
                          str(self.distance_measures[dm].__name__) +
                          " (implemented as " + str(this_measure.__name__) +
                          " with pre-transformed derivative data)")
                else:
                    print("Currently evaluating " +
                          str(self.distance_measures[dm].__name__))

            # If 100 parameter options are being considered per measure,
            # use a GridSearchCV
            if self.proportion_of_param_options == 1:

                grid = GridSearchCV(
                    estimator=KNeighborsTimeSeriesClassifier(
                        metric=this_measure, n_neighbors=1, algorithm="brute"),
                    param_grid=ElasticEnsemble._get_100_param_options(
                        self.distance_measures[dm], X),
                    cv=LeaveOneOut(),
                    scoring='accuracy',
                    verbose=self.verbose)
                grid.fit(param_train_to_use, param_train_y)

            # Else, use RandomizedSearchCV to randomly sample parameter
            # options for each measure
            else:
                grid = RandomizedSearchCV(
                    estimator=KNeighborsTimeSeriesClassifier(
                        metric=this_measure, n_neighbors=1, algorithm="brute"),
                    param_distributions=ElasticEnsemble._get_100_param_options(
                        self.distance_measures[dm], X),
                    cv=LeaveOneOut(),
                    scoring='accuracy',
                    n_iter=int(100 * self.proportion_of_param_options),
                    random_state=rand,
                    verbose=self.verbose)
                grid.fit(param_train_to_use, param_train_y)

            # once the best parameter option has been estimated on the
            # training data, perform a final pass with this parameter option
            # to get the individual predictions with cross_val_predict
            # (Note: optimisation potentially possible here if a GridSearchCV
            # was used previously. TODO: determine how to extract
            # predictions for the best param option from GridSearchCV)
            best_model = KNeighborsTimeSeriesClassifier(
                algorithm="brute",
                n_neighbors=1,
                metric=this_measure,
                metric_params=grid.best_params_['metric_params'])
            preds = cross_val_predict(best_model,
                                      full_train_to_use,
                                      y,
                                      cv=LeaveOneOut())
            acc = accuracy_score(y, preds)

            if self.verbose > 0:
                print("Training accuracy for " +
                      str(self.distance_measures[dm].__name__) + ": " +
                      str(acc) + " (with parameter setting: " +
                      str(grid.best_params_['metric_params']) + ")")

            # Finally, reset the classifier for this measure and parameter
            # option, ready to be called for test classification
            best_model = KNeighborsTimeSeriesClassifier(
                algorithm="brute",
                n_neighbors=1,
                metric=this_measure,
                metric_params=grid.best_params_['metric_params'])
            best_model.fit(full_train_to_use, y)
            end_build_time = time.time()

            self.constituent_build_times.append(
                str(end_build_time - start_build_time))
            self.estimators_[dm] = best_model
            self.train_accs_by_classifier[dm] = acc
            self.train_preds_by_classifier[dm] = preds

        self._is_fitted = True
        return self
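The attributes set in the final lines (estimators_, train_accs_by_classifier) are what the Elastic Ensemble uses to weight each constituent's vote at prediction time. The sketch below illustrates that accuracy-weighted vote; it is an illustration only, not the class's own predict, and it ignores the derivative pre-transform that the ddtw/wddtw constituents would need applied to test data.

# Illustrative accuracy-weighted vote over the fitted constituents.
# Not the class's own predict; ignores the DDTW/WDDTW derivative transform.
import numpy as np

def weighted_vote(ensemble, X):
    votes = np.zeros((X.shape[0], len(ensemble.classes_)))
    for clf, acc in zip(ensemble.estimators_,
                        ensemble.train_accs_by_classifier):
        for i, pred in enumerate(clf.predict(X)):
            votes[i, np.flatnonzero(ensemble.classes_ == pred)[0]] += acc
    return ensemble.classes_[votes.argmax(axis=1)]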
Example #7
class ShapeDTW(BaseClassifier):
    """ShapeDTW classifier.

    ShapeDTW[1] works by initially extracting a set of subsequences
    describing local neighbourhoods around each data point in a time series.
    These subsequences are then passed into a shape descriptor function that
    transforms these local neighbourhoods into a new representation. This
    new representation is then sent into DTW with 1-NN.

    Parameters
    ----------
    n_neighbors                 : int, sets k for the kNN (default = 1).
    subsequence_length          : int, defines the length of the
                                  subsequences (default = sqrt(n_timepoints)).

    shape_descriptor_function   : string, defines the function to describe
                                  the set of subsequences
                                  (default = 'raw').


    The possible shape descriptor functions are as follows:

        - 'raw'                 : use the raw subsequence as the
                                  shape descriptor function.
                                - params = None

        - 'paa'                 : use PAA as the shape descriptor function.
                                - params = num_intervals_paa (default=8)

        - 'dwt'                 : use DWT (Discrete Wavelet Transform)
                                  as the shape descriptor function.
                                - params = num_levels_dwt (default=3)

        - 'slope'               : use the gradient of each subsequence
                                  fitted by a total least squares
                                  regression as the shape descriptor
                                  function.
                                - params = num_intervals_slope (default=8)

        - 'derivative'          : use the derivative of each subsequence
                                  as the shape descriptor function.
                                - params = None

        - 'hog1d'               : use a histogram of gradients in one
                                  dimension as the shape descriptor
                                  function.
                                - params = num_intervals_hog1d
                                                    (default=2)
                                         = num_bins_hog1d
                                                    (default=8)
                                         = scaling_factor_hog1d
                                                    (default=0.1)

        - 'compound'            : use a combination of two shape
                                  descriptors simultaneously.
                                - params = weighting_factor
                                          (default=None)
                                           Defines how to scale
                                           values of a shape
                                           descriptor.
                                           If a value is not given,
                                           this value is tuned
                                           by 10-fold cross-validation
                                           on the training data.


    shape_descriptor_functions  : string list, only applicable when the
                                  shape_descriptor_function is
                                  set to 'compound'.
                                  Use a list of shape descriptor
                                  functions at the same time.
                                  (default = ['raw','derivative'])

    metric_params               : dictionary for metric parameters
                                  (default = None).

    Notes
    -----
    .. [1] Jiaping Zhao and Laurent Itti, "shapeDTW: Shape Dynamic Time Warping",
        Pattern Recognition, 74, pp 171-184, 2018.
        http://www.sciencedirect.com/science/article/pii/S0031320317303710

    """
    def __init__(
        self,
        n_neighbors=1,
        subsequence_length=30,
        shape_descriptor_function="raw",
        shape_descriptor_functions=["raw",
                                    "derivative"],  # noqa from flake8 B006
        metric_params=None,
    ):
        self.n_neighbors = n_neighbors
        self.subsequence_length = subsequence_length
        self.shape_descriptor_function = shape_descriptor_function
        self.shape_descriptor_functions = shape_descriptor_functions
        self.metric_params = metric_params

        super(ShapeDTW, self).__init__()

    def _fit(self, X, y):
        """Train the classifier.

        Parameters
        ----------
        X - pandas dataframe of training data of shape [n_instances,1].
        y - list of class labels of shape [n_instances].

        Returns
        -------
        self : the shapeDTW object
        """
        # Perform preprocessing on params.
        if not isinstance(self.shape_descriptor_function, str):
            raise TypeError(
                "shape_descriptor_function must be a str. Found '"
                + type(self.shape_descriptor_function).__name__
                + "' instead."
            )

        _reset = False
        if self.metric_params is None:
            self.metric_params = {}
            _reset = True

        # If the shape descriptor is 'compound',
        # calculate the appropriate weighting_factor
        if self.shape_descriptor_function == "compound":
            self._calculate_weighting_factor_value(X, y)

        # Fit the SlidingWindowSegmenter
        sw = SlidingWindowSegmenter(self.subsequence_length)
        sw.fit(X)
        self.sw = sw

        # Transform the training data.
        X = self._preprocess(X)

        # Fit the kNN classifier
        self.knn = KNeighborsTimeSeriesClassifier(n_neighbors=self.n_neighbors)
        self.knn.fit(X, y)
        self.classes_ = self.knn.classes_
        # Hack to pass the unit tests
        if _reset:
            self.metric_params = None
        return self

    def _calculate_weighting_factor_value(self, X, y):
        """Calculate the appropriate weighting_factor.

        Check for the compound shape descriptor.
        If a value is given, the weighting_factor is set
        as the given value. If not, its tuned via
        a 10-fold cross-validation on the training data.

        Parameters
        ----------
        X - training data in a dataframe of shape [n_instances,1]
        y - training data classes of shape [n_instances].
        """
        self.metric_params = {
            k.lower(): v
            for k, v in self.metric_params.items()
        }

        # Get the weighting_factor if one is provided
        if self.metric_params.get("weighting_factor") is not None:
            self.weighting_factor = self.metric_params.get("weighting_factor")
        else:
            # Tune it otherwise
            self._param_matrix = {
                "metric_params": [
                    {"weighting_factor": 0.1},
                    {"weighting_factor": 0.125},
                    {"weighting_factor": (1 / 6)},
                    {"weighting_factor": 0.25},
                    {"weighting_factor": 0.5},
                    {"weighting_factor": 1},
                    {"weighting_factor": 2},
                    {"weighting_factor": 4},
                    {"weighting_factor": 6},
                    {"weighting_factor": 8},
                    {"weighting_factor": 10},
                ]
            }

            n = self.n_neighbors
            sl = self.subsequence_length
            sdf = self.shape_descriptor_function
            sdfs = self.shape_descriptor_functions
            if sdfs is None or len(sdfs) != 2:
                raise ValueError("When using 'compound', " +
                                 "shape_descriptor_functions must be a " +
                                 "string array of length 2.")
            mp = self.metric_params

            grid = GridSearchCV(
                estimator=ShapeDTW(
                    n_neighbors=n,
                    subsequence_length=sl,
                    shape_descriptor_function=sdf,
                    shape_descriptor_functions=sdfs,
                    metric_params=mp,
                ),
                param_grid=self._param_matrix,
                cv=KFold(n_splits=10, shuffle=True),
                scoring="accuracy",
            )
            grid.fit(X, y)
            self.weighting_factor = grid.best_params_["metric_params"][
                "weighting_factor"]

    def _preprocess(self, X):
        # private method for performing the transformations on
        # the test/training data. It extracts the subsequences
        # and then performs the shape descriptor function on
        # each subsequence.
        X = self.sw.transform(X)

        # Feed X into the appropriate shape descriptor function
        X = self._generate_shape_descriptors(X)

        return X

    def _predict_proba(self, X):
        """Perform predictions on the testing data X.

        This function returns the probabilities for each class.

        Parameters
        ----------
        X - pandas dataframe of testing data of shape [n_instances,1].

        Returns
        -------
        output : numpy array of shape =
                [n_instances, num_classes] of probabilities
        """
        # Transform the test data in the same way as the training data.
        X = self._preprocess(X)

        # Classify the test data
        return self.knn.predict_proba(X)

    def _predict(self, X):
        """Find predictions for all cases in X.

        Parameters
        ----------
        X : The testing input samples of shape [n_instances,1].

        Returns
        -------
        output : numpy array of shape = [n_instances]
        """
        # Transform the test data in the same way as the training data.
        X = self._preprocess(X)

        # Classify the test data
        return self.knn.predict(X)

    def _generate_shape_descriptors(self, data):
        """Generate shape descriptors.

        This function is used to convert a list of
        subsequences into a list of shape descriptors
        to be used for classification.
        """
        # Get the appropriate transformer objects
        if self.shape_descriptor_function != "compound":
            self.transformer = [
                self._get_transformer(self.shape_descriptor_function)
            ]
        else:
            self.transformer = []
            for x in self.shape_descriptor_functions:
                self.transformer.append(self._get_transformer(x))
            if len(self.transformer) != 2:
                raise ValueError("When using 'compound', " +
                                 "shape_descriptor_functions must be a " +
                                 "string array of length 2.")

        # To hold the result of each transformer
        dataFrames = []
        col_names = list(range(len(data.columns)))

        # Apply each transformer on the set of subsequences
        for t in self.transformer:
            if t is None:
                # Do no transformations
                dataFrames.append(data)
            else:
                # Do the transformation and extract the resulting data frame.
                t.fit(data)
                newData = t.transform(data)
                dataFrames.append(newData)

        # Combine the arrays into one dataframe
        if self.shape_descriptor_function == "compound":
            result = self._combine_data_frames(dataFrames,
                                               self.weighting_factor,
                                               col_names)
        else:
            result = dataFrames[0]
            result.columns = col_names

        return result

    def _get_transformer(self, tName):
        """Extract the appropriate transformer.

        Parameters
        ----------
        self   : the ShapeDTW object.
        tName  : the name of the required transformer.

        Returns
        -------
        output : Base Transformer object corresponding to the class
                 (or classes if its a compound transformer) of the
                 required transformer. The transformer is
                 configured with the parameters given in self.metric_params.

        throws : ValueError if a shape descriptor doesn't exist.
        """
        parameters = self.metric_params

        tName = tName.lower()

        if parameters is None:
            parameters = {}

        parameters = {k.lower(): v for k, v in parameters.items()}

        self._check_metric_params(parameters)

        if tName == "raw":
            return None
        elif tName == "paa":
            num_intervals = parameters.get("num_intervals_paa")
            if num_intervals is None:
                return PAA()
            return PAA(num_intervals)
        elif tName == "dwt":
            num_levels = parameters.get("num_levels_dwt")
            if num_levels is None:
                return DWTTransformer()
            return DWTTransformer(num_levels)
        elif tName == "slope":
            num_intervals = parameters.get("num_intervals_slope")
            if num_intervals is None:
                return SlopeTransformer()
            return SlopeTransformer(num_intervals)
        elif tName == "derivative":
            return DerivativeSlopeTransformer()
        elif tName == "hog1d":
            num_intervals = parameters.get("num_intervals_hog1d")
            num_bins = parameters.get("num_bins_hog1d")
            scaling_factor = parameters.get("scaling_factor_hog1d")

            # Build kwargs from whichever of the three parameters were
            # supplied, falling back to the transformer defaults for the rest.
            kwargs = {}
            if num_intervals is not None:
                kwargs["num_intervals"] = num_intervals
            if num_bins is not None:
                kwargs["num_bins"] = num_bins
            if scaling_factor is not None:
                kwargs["scaling_factor"] = scaling_factor
            return HOG1DTransformer(**kwargs)
        else:
            raise ValueError("Invalid shape desciptor function.")

    def _check_metric_params(self, parameters):
        """Check for an invalid metric_params."""
        valid_metric_params = [
            "num_intervals_paa",
            "num_levels_dwt",
            "num_intervals_slope",
            "num_intervals_hog1d",
            "num_bins_hog1d",
            "scaling_factor_hog1d",
            "weighting_factor",
        ]

        names = list(parameters.keys())

        for x in names:
            if x not in valid_metric_params:
                raise ValueError(x + " is not a valid metric parameter. " +
                                 "Make sure the shape descriptor function" +
                                 " name is at the end of the metric " +
                                 "parameter name.")

    def _combine_data_frames(self, dataFrames, weighting_factor, col_names):
        """Combine two dataframes together into a single dataframe.

        Used when the shape_descriptor_function is set to "compound".
        """
        first_desc = dataFrames[0]
        second_desc = dataFrames[1]

        first_desc_array = []
        second_desc_array = []

        # Convert the dataframes into arrays
        for x in first_desc.columns:
            first_desc_array.append(
                from_nested_to_2d_array(first_desc[x], return_numpy=True))

        for x in second_desc.columns:
            second_desc_array.append(
                from_nested_to_2d_array(second_desc[x], return_numpy=True))

        # Concatenate the arrays together
        res = []
        for x in range(len(first_desc_array)):
            dim1 = []
            for y in range(len(first_desc_array[x])):
                dim2 = []
                dim2.extend(first_desc_array[x][y])
                dim2.extend(second_desc_array[x][y] * weighting_factor)
                dim1.append(dim2)
            res.append(dim1)

        res = np.asarray(res)

        # Convert to pandas dataframe
        df = pd.DataFrame()

        for col in col_names:
            colToAdd = []
            for row in range(len(res[col])):
                inst = res[col][row]
                colToAdd.append(pd.Series(inst))
            df[col] = colToAdd
        return df
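To close, a hedged end-to-end sketch of the ShapeDTW classifier above, exercising the 'hog1d' descriptor with metric_params keys taken from _check_metric_params; the dataset loader is an assumption.

# Hypothetical usage of ShapeDTW (loader assumed; parameter names taken
# from the valid_metric_params list above).
from sktime.datasets import load_unit_test

X_train, y_train = load_unit_test(split="train", return_X_y=True)
X_test, y_test = load_unit_test(split="test", return_X_y=True)

clf = ShapeDTW(
    n_neighbors=1,
    subsequence_length=30,
    shape_descriptor_function="hog1d",
    metric_params={"num_bins_hog1d": 8, "scaling_factor_hog1d": 0.1},
)
clf.fit(X_train, y_train)
print("test accuracy:", (clf.predict(X_test) == y_test).mean())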