Exemple #1
0
 def distance(instance_a, instance_b, **params):
     """Apply ``distance_measure`` to two nested instances.

     Each instance is converted to a 2D numpy array and transposed before
     being passed, together with ``params``, to ``distance_measure``.
     """
     # TODO: convert only the specific dimension rather than the whole thing?
     array_a = np.transpose(
         from_nested_to_2d_array(instance_a, return_numpy=True)
     )
     array_b = np.transpose(
         from_nested_to_2d_array(instance_b, return_numpy=True)
     )
     return distance_measure(array_a, array_b, **params)
Exemple #2
0
def test_from_nested_to_2d_array(n_instances, n_columns, n_timepoints):
    """Check the shape and the index of the tabularized output."""
    X_nested, _ = make_classification_problem(n_instances, n_columns, n_timepoints)

    X_tabular = from_nested_to_2d_array(X_nested)

    # All time points of all columns end up side by side in a single row.
    expected_shape = (n_instances, n_columns * n_timepoints)
    assert X_tabular.shape == expected_shape
    # The row index must be carried over unchanged.
    assert X_tabular.index.equals(X_nested.index)
Exemple #3
0
    def transform(self, X, y=None):
        """Concatenate all columns of a panel into one long univariate panel.

        The time series of every column are joined one after another in time.

        Parameters
        ----------
        X : nested pandas DataFrame of shape [n_samples, n_features]
            Nested dataframe with time-series in cells.
        y : ignored, not used by this method.

        Returns
        -------
        Xt : pandas DataFrame
          Transformed pandas DataFrame with same number of rows and single
          column
        """
        self.check_is_fitted()
        X = check_X(X)

        # Flatten all columns into one wide 2D table first; folding that table
        # back into nested form then yields a single concatenated column.
        to_tabular = (
            from_nested_to_2d_array
            if isinstance(X, pd.DataFrame)
            else from_3d_numpy_to_2d_array
        )
        return from_2d_array_to_nested(to_tabular(X))
def test_output_format_dim(len_series, n_instances, n_components):
    """Check output type, row count and component count of PCATransformer."""
    np.random.seed(42)
    raw = pd.DataFrame(data=np.random.randn(n_instances, len_series))
    X = from_2d_array_to_nested(raw)

    transformer = PCATransformer(n_components=n_components)
    Xt = transformer.fit_transform(X)

    # The output is a dataframe with one row per input row.
    assert isinstance(Xt, pd.DataFrame)
    assert Xt.shape[0] == X.shape[0]

    # The number of output components is capped by the input width.
    n_columns_in = from_nested_to_2d_array(X).shape[1]
    n_columns_out = from_nested_to_2d_array(Xt).shape[1]
    assert n_columns_out == min(n_components, n_columns_in)
Exemple #5
0
    def _combine_data_frames(self, dataFrames, weighting_factor, col_names):
        """Merge two descriptor dataframes into a single nested dataframe.

        Used when the shape_descriptor_function is set to "compound".

        For every column and row, the values of the first dataframe are
        followed by the values of the second dataframe multiplied by
        ``weighting_factor``.

        Parameters
        ----------
        dataFrames : sequence whose first two items are the nested
            dataframes to combine.
        weighting_factor : multiplier applied to the second dataframe's values.
        col_names : iterable used both to index the combined array and to
            label the columns of the result.

        Returns
        -------
        pandas DataFrame with one pandas Series per cell.
        """
        first_desc, second_desc = dataFrames[0], dataFrames[1]

        # Tabularize every column of both descriptors.
        first_arrays = [
            from_nested_to_2d_array(first_desc[name], return_numpy=True)
            for name in first_desc.columns
        ]
        second_arrays = [
            from_nested_to_2d_array(second_desc[name], return_numpy=True)
            for name in second_desc.columns
        ]

        # Row by row, append the weighted second descriptor to the first.
        combined = []
        for dim, first_block in enumerate(first_arrays):
            merged_rows = []
            for idx, first_row in enumerate(first_block):
                merged = list(first_row)
                merged.extend(second_arrays[dim][idx] * weighting_factor)
                merged_rows.append(merged)
            combined.append(merged_rows)
        combined = np.asarray(combined)

        # Wrap each combined row in a Series to get back to nested format.
        result = pd.DataFrame()
        for col in col_names:
            result[col] = [pd.Series(inst) for inst in combined[col]]
        return result
 def row_first(X):
     """Build a dataframe from the first tabularized column of each column.

     A pandas Series input is first wrapped in a DataFrame. Every column is
     tabularized, its first column is kept, and the kept columns are joined
     side by side.
     """
     frame = pd.DataFrame(X) if isinstance(X, pd.Series) else X

     first_columns = []
     for _, column in frame.items():
         tabular = from_nested_to_2d_array(column)
         first_columns.append(pd.Series(tabular.iloc[:, 0]))

     return pd.concat(first_columns, axis=1)
Exemple #7
0
def test_padding_fill_value_transformer():
    """Pad with an explicit length and fill value, then check the width."""
    X_train, _ = load_basic_motions(split="train", return_X_y=True)

    padder = PaddingTransformer(pad_length=120, fill_value=1)
    X_padded = padder.fit_transform(X_train)

    # The tabular form lays the 6 dimensions side by side, each one padded
    # to 120 time points.
    n_tabular_columns = len(from_nested_to_2d_array(X_padded).columns)
    assert n_tabular_columns == 120 * 6
Exemple #8
0
def test_padding_transformer():
    """Pad with default settings and check the resulting width."""
    X_train, _ = load_basic_motions(split="train", return_X_y=True)

    padder = PaddingTransformer()
    X_padded = padder.fit_transform(X_train)

    # The tabular form lays the 6 dimensions side by side; with default
    # padding each one keeps its normal length of 100.
    n_tabular_columns = len(from_nested_to_2d_array(X_padded).columns)
    assert n_tabular_columns == 100 * 6
Exemple #9
0
def test_truncation_transformer():
    """Truncate with a single bound of 5 and check the resulting width."""
    X_train, _ = load_basic_motions(split="train", return_X_y=True)

    truncator = TruncationTransformer(5)
    X_truncated = truncator.fit_transform(X_train)

    # The tabular form lays the 6 dimensions side by side, each one
    # truncated to 5 time points.
    n_tabular_columns = len(from_nested_to_2d_array(X_truncated).columns)
    assert n_tabular_columns == 5 * 6
Exemple #10
0
def test_truncation_paramterised_transformer():
    """Truncate with user-defined bounds (2, 10) and check the width."""
    X_train, _ = load_basic_motions(split="train", return_X_y=True)

    truncator = TruncationTransformer(2, 10)
    X_truncated = truncator.fit_transform(X_train)

    # The tabular form lays the 6 dimensions side by side, each one
    # truncated to (10 - 2) = 8 time points.
    n_tabular_columns = len(from_nested_to_2d_array(X_truncated).columns)
    assert n_tabular_columns == 8 * 6
Exemple #11
0
def test_dft_mft(use_fallback_dft, norm):
    """Check that SFA's ``_mft`` agrees with its direct Fourier transforms.

    NOTE(review): the tolerance checks below are one-sided (``mft - dft``
    without an absolute value), so large negative deviations would pass.
    Confirm whether ``abs`` was intended before tightening them.
    """
    X, y = load_gunpoint(split="train", return_X_y=True)
    X_tab = from_nested_to_2d_array(X, return_numpy=True)
    first_series = X_tab[0]

    word_length = 6
    alphabet_size = 4

    def _fitted_sfa(window_size):
        # Same configuration for both scenarios; only the window size differs.
        return SFA(
            word_length=word_length,
            alphabet_size=alphabet_size,
            window_size=window_size,
            norm=norm,
            use_fallback_dft=use_fallback_dft,
        ).fit(X, y)

    def _direct_transform(sfa, segment):
        # Pick the reference transform that matches the parametrization.
        if use_fallback_dft:
            return sfa._discrete_fourier_transform(
                segment, word_length, norm, 1, True
            )
        return sfa._fast_fourier_transform(segment)

    # Single window spanning the whole series.
    sfa = _fitted_sfa(np.shape(X_tab)[1])
    dft = _direct_transform(sfa, first_series)
    mft = sfa._mft(first_series)

    assert (mft - dft < 0.0001).all()

    # Sliding window shorter than the series.
    window_size = 140
    sfa = _fitted_sfa(window_size)
    mft = sfa._mft(first_series)

    n_windows = len(first_series) - window_size + 1
    for start in range(n_windows):
        dft = _direct_transform(sfa, X_tab[0, start : window_size + start])
        assert (mft[start] - dft < 0.001).all()

    assert len(mft) == n_windows
    assert len(mft[0]) == word_length
Exemple #12
0
def test_tsfresh_extractor(default_fc_parameters):
    """Compare the extracted ``__mean`` feature with a direct row mean."""
    X, y = make_classification_problem()
    X_train, _, y_train, _ = train_test_split(X, y)

    extractor = TSFreshFeatureExtractor(
        default_fc_parameters=default_fc_parameters, disable_progressbar=True
    )
    features = extractor.fit_transform(X_train, y_train)

    # Reference: mean over the tabularized row of each training instance.
    tabular = from_nested_to_2d_array(X_train)
    expected = tabular.mean(axis=1).values
    actual = features.filter(like="__mean", axis=1).values.ravel()

    # Sanity check of the reference itself against the first nested cell.
    assert expected[0] == X_train.iloc[0, 0].mean()
    np.testing.assert_allclose(actual, expected)
def test_pca_results(n_components):
    """Check that PCATransformer reproduces the output of ``PCA``."""
    np.random.seed(42)
    X = pd.DataFrame(data=np.random.randn(10, 5))

    # Reference result from the plain PCA estimator on the tabular data.
    reference = PCA(n_components=n_components).fit_transform(X)

    # Same data in nested format through the time series transformer.
    X_nested = from_2d_array_to_nested(X)
    transformer = PCATransformer(n_components=n_components)
    Xt_nested = transformer.fit_transform(X_nested)
    Xt_tabular = from_nested_to_2d_array(Xt_nested)

    assert np.allclose(np.asarray(reference), np.asarray(Xt_tabular))
Exemple #14
0
    def transform(self, X, y=None):
        """Replace every series by the output of ``_get_gradients_of_lines``.

        Parameters
        ----------
        X : a pandas dataframe of shape = [n_samples, num_dims]
            The training input samples.
        y : ignored, not used by this method.

        Returns
        -------
        df: a pandas data frame with one row per sample and one column per
            column of X; each cell holds a pandas Series with the values
            computed for that series.
        """
        # Validate the estimator state and the input.
        self.check_is_fitted()
        X = check_X(X, coerce_to_pandas=True)

        # The length of the first series drives the parameter check.
        n_timepoints = len(X.iloc[0, 0])
        num_instances = X.shape[0]
        col_names = X.columns

        self._check_parameters(n_timepoints)

        result = pd.DataFrame()
        for name in col_names:
            # Tabularize this single column into a 2D numpy array.
            arr = from_nested_to_2d_array(pd.DataFrame(X[name]), return_numpy=True)

            # One gradient vector per instance, stacked into an array.
            gradients = np.asarray(
                [
                    self._get_gradients_of_lines(arr[idx])
                    for idx in range(num_instances)
                ]
            )

            # Back to nested format: one Series per row.
            result[name] = [pd.Series(row) for row in gradients]

        return result
Exemple #15
0
def plot_cluster_algorithm(model: BaseClusterer, predict_series: NumpyOrDF,
                           k: int):
    """
    Method that is used to plot a clustering algorithms output

    One subplot is drawn per cluster, showing the series assigned to the
    cluster together with the cluster center.

    Parameters
    ----------
    model: BaseClusterer
        Clustering model to plot

    predict_series: Numpy or Dataframe
        The series to predict the values for

    k: int
        Number of centers
    """
    _check_soft_dependencies("matplotlib")
    import matplotlib.pyplot as plt
    import matplotlib.patches as mpatches

    if isinstance(predict_series, pd.DataFrame):
        predict_series = from_nested_to_2d_array(predict_series,
                                                 return_numpy=True)
    # NOTE(review): this figure is never drawn on, because plt.subplots below
    # creates its own figure. Kept to avoid changing the rendered output;
    # consider passing figsize to plt.subplots instead.
    plt.figure(figsize=(5, 10))
    plt.rcParams["figure.dpi"] = 100
    indexes = model.predict(predict_series)
    centers = model.get_centers()

    series_values = TimeSeriesLloydsPartitioning.get_cluster_values(
        indexes, predict_series, k)
    # squeeze=False always returns a 2D array of axes, so indexing also works
    # for k == 1, where the default would return a single Axes object.
    _, axes = plt.subplots(nrows=k, ncols=1, squeeze=False)
    for i in range(k):
        _plot(series_values[i], centers[i], axes[i, 0])

    blue_patch = mpatches.Patch(color="blue",
                                label="Series that belong to the cluster")
    red_patch = mpatches.Patch(color="red", label="Cluster centers")
    plt.legend(
        handles=[red_patch, blue_patch],
        loc="upper center",
        bbox_to_anchor=(0.5, -0.40),
        fancybox=True,
        shadow=True,
        ncol=5,
    )
    plt.tight_layout()
    plt.show()
Exemple #16
0
    def _perform_paa_along_dim(self, X):
        """Average each series over ``self.num_intervals`` equal-width frames.

        The frame width is ``series length / num_intervals`` and may be
        fractional; a time point that straddles two frames contributes to
        both, weighted by the fraction that falls into each.

        Parameters
        ----------
        X : input accepted by ``from_nested_to_2d_array``; after conversion
            each row is treated as one series.

        Returns
        -------
        dims : pandas DataFrame with a single column ``0`` holding one
            pandas Series of frame means per input row.
        """
        X = from_nested_to_2d_array(X, return_numpy=True)

        num_atts = X.shape[1]
        num_insts = X.shape[0]
        dims = pd.DataFrame()
        data = []

        for i in range(num_insts):
            series = X[i, :]

            frames = []
            current_frame = 0
            current_frame_size = 0
            # True division: the frame width is a float and may be fractional.
            frame_length = num_atts / self.num_intervals
            frame_sum = 0

            for n in range(num_atts):
                # Width still missing to complete the current frame.
                remaining = frame_length - current_frame_size

                if remaining > 1:
                    # The whole time point fits into the current frame.
                    frame_sum += series[n]
                    current_frame_size += 1
                else:
                    # Only a fraction of the time point fits into the frame.
                    frame_sum += remaining * series[n]
                    current_frame_size += remaining

                if current_frame_size == frame_length:
                    # Frame complete: store its mean and start the next one.
                    frames.append(frame_sum / frame_length)
                    current_frame += 1

                    # The rest of the time point opens the next frame.
                    frame_sum = (1 - remaining) * series[n]
                    current_frame_size = 1 - remaining

            # if the last frame was lost due to double imprecision
            if current_frame == self.num_intervals - 1:
                frames.append(frame_sum / frame_length)

            data.append(pd.Series(frames))

        # Store the per-instance Series as a single nested column.
        dims[0] = data

        return dims
Exemple #17
0
    def transform(self, X, y=None):
        """Replace every series by its extracted wavelet coefficients.

        Parameters
        ----------
        X : a pandas dataframe of shape = [n_samples, num_dims]
            The training input samples.
        y : ignored, not used by this method.

        Returns
        -------
        dims: a pandas data frame of shape
              = [n_samples, num_dims]
        """
        # Validate the estimator state, the input and the parameters.
        self.check_is_fitted()
        X = check_X(X, enforce_univariate=False, coerce_to_pandas=True)

        self._check_parameters()

        col_names = X.columns

        result = pd.DataFrame()
        for name in col_names:
            # Tabularize this single column into a 2D numpy array.
            arr = from_nested_to_2d_array(pd.DataFrame(X[name]),
                                          return_numpy=True)

            coefficients = np.asarray(self._extract_wavelet_coefficients(arr))

            # Back to nested format: one Series per row.
            result[name] = [pd.Series(row) for row in coefficients]

        return result
Exemple #18
0
    def transform(self, X, y=None):
        """Convert panel data into a tabular (2D) representation.

        Parameters
        ----------
        X : pandas DataFrame
            Nested dataframe with pandas series or numpy arrays in cells.
        y : array-like, optional (default=None)
            Not used by this method.

        Returns
        -------
        Xt : pandas DataFrame
            Transformed dataframe with only primitives in cells.
        """
        self.check_is_fitted()
        X = check_X(X)

        # check_X may return something other than a dataframe; that case is
        # handled by the 3D numpy converter.
        if not isinstance(X, pd.DataFrame):
            return from_3d_numpy_to_2d_array(X)
        return from_nested_to_2d_array(X)
Exemple #19
0
    def _transform_single_feature(self, X, feature):
        """transforms data into a specified catch22 feature

        Parameters
        ----------
        X : pandas DataFrame, input time series
        feature : int, catch22 feature id or String, catch22 feature
                  name.

        Returns
        -------
        Numpy array containing a catch22 feature for each input series
        """
        if isinstance(feature,
                      (int, np.integer)) or isinstance(feature,
                                                       (float, np.float)):
            if feature > 21 or feature < 0:
                raise ValueError("Invalid catch22 feature ID")
        elif isinstance(feature, str):
            if feature in feature_names:
                feature = feature_names.index(feature)
            else:
                raise ValueError("Invalid catch22 feature name")
        else:
            raise ValueError("catch22 feature name or ID required")

        if isinstance(X, pd.DataFrame):
            X = from_nested_to_2d_array(X, return_numpy=True)

        n_instances = X.shape[0]
        X = np.reshape(X, (n_instances, -1))

        c22_list = Parallel(n_jobs=self.n_jobs)(
            delayed(self._transform_case_single)(
                X[i],
                feature,
            ) for i in range(n_instances))

        return np.asarray(c22_list)
Exemple #20
0
    def predict(self, X: NumpyOrDF, y=None) -> NumpyArray:
        """
        Assign each sample to a cluster center.

        Parameters
        ----------
        X: 2D np.array with shape (n_instances, n_timepoints)
           or pd.DataFrame in nested format
            panel of time series to cluster

        y: ignored, exists for API consistency reasons

        Returns
        -------
        Numpy_Array: 1D np.array of length n_instances
            Index of the cluster each sample belongs to
        """
        self.check_is_fitted()

        # Nested dataframes are tabularized; anything else is passed through.
        data = (
            from_nested_to_2d_array(X, return_numpy=True)
            if isinstance(X, pd.DataFrame)
            else X
        )

        self._check_params(data)
        return self._predict(data)
Exemple #21
0
    def fit(self, X: NumpyOrDF, y=None):
        """
        Train the clustering algorithm on the dataset X

        Parameters
        ----------
        X: 2D np.array with shape (n_instances, n_timepoints)
           or pd.DataFrame in nested format
            panel of univariate time series to train the clustering model on

        y: ignored, exists for API consistency reasons

        Returns
        -------
        reference to self
        """
        # Nested dataframes are tabularized; anything else is passed through.
        data = (
            from_nested_to_2d_array(X, return_numpy=True)
            if isinstance(X, pd.DataFrame)
            else X
        )

        self._check_params(data)
        self._fit(data)

        # Only flag the estimator as fitted once fitting has succeeded.
        self._is_fitted = True
        return self
Exemple #22
0
    def transform_single_feature(self, X, feature, case_id=None):
        """Transform data into a specified catch22 feature.

        Parameters
        ----------
        X : pandas DataFrame, input time series.
        feature : int, catch22 feature id or String, catch22 feature
                  name.
        case_id : int, identifier for the current set of cases. If the case_id is not
                  None and the same as the previously used case_id, calculations from
                  previous features will be reused.

        Returns
        -------
        Numpy array containing a catch22 feature for each input series.

        Raises
        ------
        ValueError
            If ``feature`` is a number outside [0, 21], a string that is not
            in ``feature_names``, or neither a number nor a string; also if
            ``case_id`` matches the previous one but the number of instances
            or the series length differs from the previous call.
        """
        # ``np.float`` was only an alias of the builtin ``float`` and has been
        # removed from NumPy; referencing it raised AttributeError for every
        # non-integer ``feature``, including valid feature names.
        if isinstance(feature, (int, np.integer, float)):
            if feature > 21 or feature < 0:
                raise ValueError("Invalid catch22 feature ID")
        elif isinstance(feature, str):
            if feature in feature_names:
                feature = feature_names.index(feature)
            else:
                raise ValueError("Invalid catch22 feature name")
        else:
            raise ValueError("catch22 feature name or ID required")

        if isinstance(X, pd.DataFrame):
            X = from_nested_to_2d_array(X, return_numpy=True)

        # Flatten everything but the instance axis into one row per case.
        n_instances = X.shape[0]
        X = np.reshape(X, (n_instances, -1))
        series_length = X.shape[1]

        if case_id is not None:
            if case_id != self._case_id:
                # New set of cases: remember its dimensions and reset all
                # per-instance cache slots.
                self._case_id = case_id
                self._st_n_instances = n_instances
                self._st_series_length = series_length
                self._outlier_series = [None] * n_instances
                self._smin = [None] * n_instances
                self._smax = [None] * n_instances
                self._smean = [None] * n_instances
                self._fft = [None] * n_instances
                self._ac = [None] * n_instances
                self._acfz = [None] * n_instances
            else:
                # Same case_id: the cached values are only valid if the data
                # has the same dimensions as when they were stored.
                if (n_instances != self._st_n_instances
                        or series_length != self._st_series_length):
                    raise ValueError(
                        "Catch22: case_is the same, but n_instances and "
                        "series_length do not match last seen for single "
                        "feature transform.")

        c22_list = Parallel(n_jobs=self.n_jobs)(
            delayed(self._transform_case_single)(
                X[i],
                feature,
                case_id,
                i,
            ) for i in range(n_instances))

        if self.replace_nans:
            # Replace NaN and both infinities by 0.
            c22_list = np.nan_to_num(
                c22_list, copy=False, nan=0, posinf=0, neginf=0
            )

        return np.asarray(c22_list)