Example #1
 def test_kmeans_univariate(self):
     data_matrix = [[1, 1, 2, 3, 2.5, 2], [0.5, 0.5, 1, 2, 1.5, 1],
                    [-1, -1, -0.5, 1, 1, 0.5],
                    [-0.5, -0.5, -0.5, -1, -1, -1]]
     sample_points = [0, 2, 4, 6, 8, 10]
     fd = FDataGrid(data_matrix, sample_points)
     init = np.array([[0, 0, 0, 0, 0, 0], [2, 1, -1, 0.5, 0, -0.5]])
     init_fd = FDataGrid(init, sample_points)
     kmeans = KMeans(init=init_fd)
     kmeans.fit(fd)
     np.testing.assert_array_equal(kmeans.predict(fd),
                                   np.array([0, 0, 0, 1]))
     np.testing.assert_allclose(kmeans.transform(fd),
                                np.array([[2.98142397, 9.23534876],
                                          [0.68718427, 6.50960828],
                                          [3.31243449, 4.39222798],
                                          [6.49679408, 0.]]))
     centers = FDataGrid(data_matrix=np.array(
         [[0.16666667, 0.16666667, 0.83333333, 2., 1.66666667, 1.16666667],
          [-0.5, -0.5, -0.5, -1., -1., -1.]]),
         sample_points=sample_points)
     np.testing.assert_array_almost_equal(
         kmeans.cluster_centers_.data_matrix,
         centers.data_matrix)
     np.testing.assert_allclose(kmeans.score(fd), np.array([-20.33333333]))
     np.testing.assert_array_equal(kmeans.n_iter_, np.array([3.]))

 def test_fuzzy_kmeans_univariate(self):
     data_matrix = [[1, 1, 2, 3, 2.5, 2], [0.5, 0.5, 1, 2, 1.5, 1],
                    [-1, -1, -0.5, 1, 1, 0.5],
                    [-0.5, -0.5, -0.5, -1, -1, -1]]
     sample_points = [0, 2, 4, 6, 8, 10]
     fd = FDataGrid(data_matrix, sample_points)
     fuzzy_kmeans = FuzzyKMeans()
     fuzzy_kmeans.fit(fd)
     np.testing.assert_array_equal(
         fuzzy_kmeans.predict(fd).round(3),
         np.array([[0.965, 0.035], [0.94, 0.06], [0.227, 0.773],
                   [0.049, 0.951]]))
     np.testing.assert_allclose(
         fuzzy_kmeans.transform(fd),
         np.array([[1.49228858, 7.87898791], [1.29380155, 5.12696975],
                   [4.85542339, 2.63309793], [7.77455633, 1.75920889]]))
     centers = FDataGrid(
         data_matrix=np.array(
             [[0.7065078, 0.7065078, 1.45508111, 2.46698825, 1.98143302,
               1.48206743],
              [-0.69456401, -0.69456401, -0.49444239, -0.19713489,
               -0.19872214, -0.39844583]]),
         sample_points=sample_points)
     np.testing.assert_allclose(fuzzy_kmeans.cluster_centers_.data_matrix,
                                centers.data_matrix)
     np.testing.assert_allclose(fuzzy_kmeans.score(fd),
                                np.array([-13.928868250627902]))
     np.testing.assert_array_equal(fuzzy_kmeans.n_iter_, np.array([18.]))
Example #3
    def test_concatenate(self):
        sample1 = np.arange(0, 10)
        sample2 = np.arange(10, 20)
        fd1 = FDataGrid([sample1]).to_basis(Fourier(n_basis=5))
        fd2 = FDataGrid([sample2]).to_basis(Fourier(n_basis=5))

        fd = concatenate([fd1, fd2])

        np.testing.assert_equal(fd.n_samples, 2)
        np.testing.assert_equal(fd.dim_codomain, 1)
        np.testing.assert_equal(fd.dim_domain, 1)
        np.testing.assert_array_equal(fd.coefficients, np.concatenate(
            [fd1.coefficients, fd2.coefficients]))
Example #4
 def test_fuzzy_kmeans_univariate(self):
     data_matrix = [[1, 1, 2, 3, 2.5, 2], [0.5, 0.5, 1, 2, 1.5, 1],
                    [-1, -1, -0.5, 1, 1, 0.5],
                    [-0.5, -0.5, -0.5, -1, -1, -1]]
     sample_points = [0, 2, 4, 6, 8, 10]
     fd = FDataGrid(data_matrix, sample_points)
     fuzzy_kmeans = FuzzyCMeans()
     fuzzy_kmeans.fit(fd)
     np.testing.assert_array_equal(fuzzy_kmeans.predict(fd).round(3),
                                   np.array([[0.965, 0.035],
                                             [0.94, 0.06],
                                             [0.227, 0.773],
                                             [0.049, 0.951]]))
     np.testing.assert_allclose(fuzzy_kmeans.transform(fd).round(3),
                                np.array([[1.492, 7.879],
                                          [1.294, 5.127],
                                          [4.856, 2.633],
                                          [7.775, 1.759]]))
     centers = np.array([[0.707, 0.707, 1.455, 2.467, 1.981, 1.482],
                         [-0.695, -0.695, -0.494, -0.197, -0.199, -0.398]])
     np.testing.assert_allclose(
         fuzzy_kmeans.cluster_centers_.data_matrix[..., 0].round(3),
         centers)
     np.testing.assert_allclose(fuzzy_kmeans.score(fd),
                                np.array([-12.025179]))
     self.assertEqual(fuzzy_kmeans.n_iter_, 19)
Example #5
 def __init__(self,
              target: pd.DataFrame,
              ref: pd.DataFrame,
              var: str,
              mpt: float = 0.001,
              **kwargs):
     y1, y2, x = estimate_pdfs(target, ref, var)
     landmarks = [peaks(y, x, mph=mpt * y.max(), **kwargs) for y in [y1, y2]]
     plabels = np.concatenate([[0 for i in range(len(landmarks[0]))],
                               [1 for i in range(len(landmarks[1]))]])
     landmarks = np.array([x for sl in landmarks for x in sl])
     self.landmarks = match_landmarks(landmarks, plabels)
     self.original_functions = FDataGrid([y1, y2], grid_points=x)
     self.warping_function = None
     self.adjusted_functions = None
     self.landmark_shift_deltas = None
Example #6
 def __init__(self, grid: FDataGrid, smoothed=False):
     self.init_grid = grid.copy()
     self.sample_points = self.init_grid.sample_points[0]
     self._nSeries = self.init_grid.data_matrix.shape[0]
     self._nObs = self.init_grid.data_matrix.shape[1]
     self._nVar = self.init_grid.data_matrix.shape[2]
     self.coordinates_grids = list(self.init_grid.coordinates)
     self.coordinate_names = self.init_grid.coordinate_names
     self._smoothed = smoothed
     if self._smoothed:
         self.coordinates_grids_dx1 = [
             grid.derivative(order=1) for grid in self.coordinates_grids
         ]
         self.coordinates_grids_dx2 = [
             grid.derivative(order=2) for grid in self.coordinates_grids
         ]
     self._scaled = False
Example #7
 def compute_arc_length(self):
     if not self._smoothed:
         _ = self.smooth_grids()
     dx1_mat = np.empty([self._nVar, self._nObs])
     result_matrix = np.empty([self._nSeries, self._nObs])
     for i in range(self._nSeries):
         for j in range(self._nVar):
             dx1_mat[j, :] = self.coordinates_grids_dx1[j].data_matrix[i, :, 0]
         result_matrix[i, :] = _calculate_arc_length(DX1=dx1_mat,
                                                     t=self.sample_points)
     return FDataGrid(data_matrix=result_matrix,
                      sample_points=self.sample_points,
                      dataset_label="arc_length")
Example #8
 def test_qr(self):
     t = np.linspace(0, 1, 5)
     x = np.sin(2 * np.pi * t) + np.cos(2 * np.pi * t)
     basis = BSpline((0, 1), n_basis=5)
     fd = FDataGrid(data_matrix=x, sample_points=t)
     smoother = smoothing.BasisSmoother(basis=basis,
                                        smoothing_parameter=10,
                                        penalty=2,
                                        method='qr',
                                        return_basis=True)
     fd_basis = smoother.fit_transform(fd)
     np.testing.assert_array_almost_equal(
         fd_basis.coefficients.round(2),
         np.array([[0.60, 0.47, 0.20, -0.07, -0.20]]))
Example #9
    def data_to_basis(self, X, fit_fPCA=True):
        """Project the data to basis functions.

        Parameters
        ----------
        X: array, shape (n,n_points,d)
            Array of paths. It is a 3-dimensional array, containing the coordinates in R^d of n piecewise linear paths,
            each composed of n_points.

        fit_fPCA: boolean, default=True
            If basis_type='fPCA' and fit_fPCA=True, the basis functions are fitted to be the functional
            principal components of X.

        Returns
        -------
        fd_basis: object
            Instance of skfda.representation.basis.FDataBasis, the basis representation of X, where the
            type of basis is determined by self.basis_type.
        """
        grid_points = np.linspace(0, 1, X.shape[1])
        fd = FDataGrid(X, grid_points)
        basis_vec = []
        for i in range(X.shape[2]):
            if self.basis_type == 'bspline':
                basis_vec.append(BSpline(n_basis=self.nbasis))
            elif self.basis_type == 'fourier':
                basis_vec.append(Fourier(n_basis=self.nbasis))
            elif self.basis_type == 'fPCA':
                basis_vec.append(BSpline(n_basis=7))

        basis = VectorValued(basis_vec)
        fd_basis = fd.to_basis(basis)
        if self.basis_type == 'fPCA':
            if fit_fPCA:
                self.fpca_basis = self.fpca_basis.fit(fd_basis)
            fd_basis = self.fpca_basis.transform(fd_basis)
        return fd_basis
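
For context, here is a minimal standalone sketch of the same idea: representing a batch of paths with a vector-valued basis in scikit-fda. The toy data, the number of basis functions and the variable names are illustrative assumptions, not part of the original class.

import numpy as np
from skfda import FDataGrid
from skfda.representation.basis import BSpline, VectorValued

# Toy batch of n piecewise linear paths in R^d sampled at n_points (made-up data).
n, n_points, d = 5, 50, 2
X = np.cumsum(np.random.randn(n, n_points, d), axis=1)

grid_points = np.linspace(0, 1, n_points)
fd = FDataGrid(X, grid_points)

# One basis per coordinate, combined into a vector-valued basis.
basis = VectorValued([BSpline(n_basis=7) for _ in range(d)])
fd_basis = fd.to_basis(basis)
print(fd_basis.coefficients.shape)  # (n, d * 7) = (5, 14)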
Example #10
 def test_cholesky(self):
     t = np.linspace(0, 1, 5)
     x = np.sin(2 * np.pi * t) + np.cos(2 * np.pi * t)
     basis = BSpline((0, 1), n_basis=5)
     fd = FDataGrid(data_matrix=x, sample_points=t)
     smoother = smoothing.BasisSmoother(
         basis=basis,
         smoothing_parameter=10,
         regularization=TikhonovRegularization(
             LinearDifferentialOperator(2)),
         method='cholesky',
         return_basis=True)
     fd_basis = smoother.fit_transform(fd)
     np.testing.assert_array_almost_equal(
         fd_basis.coefficients.round(2),
         np.array([[0.60, 0.47, 0.20, -0.07, -0.20]]))
Example #11
 def test_monomial_smoothing(self):
     # It does not make much sense to apply smoothing in this basic case,
     # where the fit is already very good, but it is just for testing purposes.
     t = np.linspace(0, 1, 5)
     x = np.sin(2 * np.pi * t) + np.cos(2 * np.pi * t)
     basis = Monomial(n_basis=4)
     fd = FDataGrid(data_matrix=x, sample_points=t)
     smoother = smoothing.BasisSmoother(basis=basis,
                                        smoothing_parameter=1,
                                        penalty=2,
                                        return_basis=True)
     fd_basis = smoother.fit_transform(fd)
     # These results were extracted from the R package fda
     np.testing.assert_array_almost_equal(
         fd_basis.coefficients.round(2),
         np.array([[0.61, -0.88, 0.06, 0.02]]))
Example #12
    def scale_grids(self, axis=0, with_std=False):
        '''
        Scale each time series of every variable.

        If axis=0, each time series is centred by its own mean; otherwise every
        timestep is centred by the mean across time series at that timestep.
        '''
        if self._scaled:
            print("Data was already scaled, no additional scaling done")
            return

        def _scale(x, with_std=False):
            xi = np.array(x)
            mean = np.mean(xi)
            if with_std:
                sd = np.std(xi)
                return (xi - mean) / sd
            else:
                return xi - mean

        if axis > 1:
            raise ValueError("axis should be either 0 or 1")

        for grid in self.coordinates_grids:
            for i in range(grid.data_matrix.shape[axis]):
                if axis == 0:
                    grid.data_matrix[i, :, 0] = _scale(grid.data_matrix[i, :, 0],
                                                       with_std=with_std)
                else:
                    grid.data_matrix[:, i, 0] = _scale(grid.data_matrix[:, i, 0],
                                                       with_std=with_std)
            grid = FDataGrid(data_matrix=grid.data_matrix,
                             sample_points=self.sample_points,
                             domain_range=grid.domain_range,
                             dataset_label=grid.dataset_label)
        self._scaled = True
        return None
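
A quick numpy illustration of the axis semantics described in the docstring above (made-up values, independent of the class):

import numpy as np

x = np.array([[1.0, 2.0, 3.0],
              [4.0, 5.0, 6.0]])  # 2 time series, 3 timesteps

# axis=0: each series is centred by its own mean
print(x - x.mean(axis=1, keepdims=True))  # [[-1. 0. 1.] [-1. 0. 1.]]

# axis=1: each timestep is centred by the mean across series
print(x - x.mean(axis=0, keepdims=True))  # [[-1.5 -1.5 -1.5] [ 1.5 1.5 1.5]]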
Example #13
import numpy as np

from skfda import datasets
from skfda.representation.grid import FDataGrid
from skfda.ml.clustering.base_kmeans import KMeans
from skfda.exploratory.visualization.clustering_plots import *

##################################################################################
# First, the Canadian Weather dataset is downloaded from the package 'fda' in CRAN.
# It contains an FDataGrid with daily temperatures and precipitations, that is, it
# has a 2-dimensional image. We are interested only in the daily average temperatures,
# so another FDataGrid is constructed with the desired values.

dataset = datasets.fetch_weather()
fd = dataset["data"]
fd_temperatures = FDataGrid(data_matrix=fd.data_matrix[:, :, 0],
                            sample_points=fd.sample_points,
                            dataset_label=fd.dataset_label,
                            axes_labels=fd.axes_labels[0:2])

# The desired FDataGrid only contains 10 random samples, so that the example provides
# clearer plots.
indices_samples = np.array([1, 3, 5, 10, 14, 17, 21, 25, 27, 30])
fd = fd_temperatures[indices_samples]

############################################################################################
# The data is plotted to show the curves we are working with. They are divided according to the
# target. In this case, it includes the different climates to which the weather stations belong.

climate_by_sample = [dataset["target"][i] for i in indices_samples]
# Note that the samples chosen belong to three of the four possible target groups. By
# coincidence, these three groups correspond to indices 1, 2, 3, which is why the indices
# (`climate_by_sample`) are decremented by 1. In case of reproducing the example with other
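
The plot call itself is not included in the excerpt; below is a minimal matplotlib sketch of how the grouping could be used to colour the curves. The label decrement and the colour mapping are illustrative assumptions.

import matplotlib.pyplot as plt

labels = np.asarray(climate_by_sample) - 1  # assumed integer climate labels, shifted to start at 0
fig, ax = plt.subplots()
for curve, label in zip(fd.data_matrix[..., 0], labels):
    ax.plot(fd.sample_points[0], curve, color=f"C{label}")
plt.show()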
Example #14
class LandmarkReg:
    """
    One technique for handling technical variation in cytometry data is local normalisation by
    aligning the probability density function of some data to a reference sample. This should
    be applied to a population immediately prior to applying a gate.

    The alignment algorithm is inspired by previous work [1, 2] and is performed as follows:
    1. The probability density function of some target data and a reference sample are estimated
    using a convolution based fast kernel density estimation algorithm (KDEpy.FFTKDE)
    2. Landmarks are identified in both samples as peaks of local maximal density.
    3. The peaks from both target and reference are combined and clustered using k-means clustering; the
    number of clusters is chosen as the number of peaks identified in the target
    4. Unique pairings of peaks between samples, closest to the centroid of a cluster, are generated and
    used as landmarks.
    5. Landmark registration is performed using the Scikit-FDA package to generate a warping function, with
    the target location being the mean between paired peaks
    6. The warping function is applied to the target data, generating a new adjusted vector with high
    density regions matched to the reference sample

    [1] Hahne F, Khodabakhshi AH, Bashashati A, Wong CJ, Gascoyne RD,
    Weng AP, Seyfert-Margolis V, Bourcier K, Asare A, Lumley T, Gentleman R,
    Brinkman RR. Per-channel basis normalization methods for flow cytometry data.
    Cytometry A. 2010 Feb;77(2):121-31. doi: 10.1002/cyto.a.20823. PMID: 19899135; PMCID: PMC3648208.

    [2] Finak G, Jiang W, Krouse K, et al. High-throughput flow cytometry data normalization
    for clinical trials. Cytometry A. 2014;85(3):277-286. doi:10.1002/cyto.a.22433

    Parameters
    ----------
    target: Pandas.DataFrame
        Target data to be transformed; must contain column corresponding to 'var'
    ref: Pandas.DataFrame
        Reference data for computing alignment; must contain column corresponding to 'var'
    var: str
        Name of the target variable to align
    mpt: float (default=0.001)
        Minimum peak threshold; peaks that are less than the given percentage of the 'highest' peak
        (max density) will be ignored. Use this to remove small perturbations.
    kwargs:
        Additional keyword arguments passed to cytopy.flow.fda_norm.peaks call

    Attributes
    ----------
    landmarks: numpy.ndarray
        (2, n) array, where n is the number of clusters. Order conserved between samples; first
        row is peaks from target, second row is peaks from reference.
    original_functions: skfda.representation.grid.FDataGrid
        Original PDFs for target and reference
    warping_function: skfda.representation.grid.FDataGrid
        Warping function
    adjusted_functions: skfda.representation.grid.FDataGrid
        Registered curves following function composition of original PDFs and warping function
    landmark_shift_deltas: numpy.ndarray
        Corresponding shifts to align the landmarks of the PDFs described in original_functions
    """
    def __init__(self,
                 target: pd.DataFrame,
                 ref: pd.DataFrame,
                 var: str,
                 mpt: float = 0.001,
                 **kwargs):
        y1, y2, x = estimate_pdfs(target, ref, var)
        landmarks = [peaks(y, x, mph=mpt * y.max(), **kwargs) for y in [y1, y2]]
        plabels = np.concatenate([[0 for i in range(len(landmarks[0]))],
                                  [1 for i in range(len(landmarks[1]))]])
        landmarks = np.array([x for sl in landmarks for x in sl])
        self.landmarks = match_landmarks(landmarks, plabels)
        self.original_functions = FDataGrid([y1, y2], grid_points=x)
        self.warping_function = None
        self.adjusted_functions = None
        self.landmark_shift_deltas = None

    def __call__(self):
        """
        Calculate the warping function, registered curves and landmark shift deltas

        Returns
        -------
        self
        """
        self.warping_function = landmark_registration_warping(self.original_functions,
                                                              self.landmarks,
                                                              location=np.mean(self.landmarks, axis=0))
        self.adjusted_functions = self.original_functions.compose(self.warping_function)
        self.landmark_shift_deltas = landmark_shift_deltas(self.original_functions, self.landmarks)
        return self

    def plot_warping(self, ax: list or None = None):
        """
        Generate a figure that plots the PDFs prior to landmark registration,
        the warping function, and the registered curves.

        Parameters
        ----------
        ax: Matplotlib.Axes, optional

        Returns
        -------
        Matplotlib.Axes
        """
        assert self.warping_function is not None, "Call object prior to plot"
        ax = ax or plt.subplots(1, 3, figsize=(15, 4))[1]
        assert len(ax) == 3, "Must provide exactly 3 axis objects"
        self.original_functions.plot(axes=ax[0])
        ax[0].set_title("Before")
        self.warping_function.plot(axes=ax[1])
        ax[1].set_title("Warping function")
        self.adjusted_functions.plot(axes=ax[2])
        ax[2].set_title("After")
        ax[0].legend(labels=["Target", "Reference"])
        return ax

    def shift_data(self,
                   x: np.ndarray):
        """
        Given the original vector of data to transform, use the warping
        function to normalise the data and align it to the reference.

        Parameters
        ----------
        x: numpy.ndarray

        Returns
        -------
        numpy.ndarray

        Raises
        ------
        AssertionError
            If the class has not been called and therefore a warping function has not
            been defined
        """
        assert self.warping_function is not None, "No warping function defined"
        return self.warping_function.evaluate(x)[1].reshape(-1)

    def plot_shift(self,
                   x: np.ndarray,
                   ax: plt.Axes or None = None):
        """
        Plot the reference PDF and overlay the target data before and after landmark
        registration.

        Parameters
        ----------
        x: numpy.ndarray
            Target data
        ax: Matplotlib.Axes, optional

        Returns
        -------
        Matplotlib.Axes
        """
        ax = ax or plt.subplots(figsize=(5, 5))[1]
        shifted = self.shift_data(x)
        x = np.linspace(np.min(x) - 0.1,
                        np.max(x) + 0.1,
                        10000)
        y2 = (FFTKDE(kernel="gaussian",
                     bw="silverman")
              .fit(shifted)
              .evaluate(x))

        self.original_functions.plot(axes=ax)
        ax.plot(x, y2)
        ax.legend(labels=["Before", "Ref", "After"])
        return ax
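
A hedged usage sketch of LandmarkReg as defined above. The data frames, the column name "FL1-A" and the distribution parameters are made up for illustration, and the module-level helpers (estimate_pdfs, peaks, match_landmarks, landmark_registration_warping, landmark_shift_deltas) are assumed to be importable alongside the class.

import numpy as np
import pandas as pd

# Made-up single-channel target and reference frames with two density peaks each.
rng = np.random.default_rng(0)
target = pd.DataFrame({"FL1-A": np.concatenate([rng.normal(1.0, 0.3, 5000),
                                                rng.normal(4.2, 0.5, 5000)])})
ref = pd.DataFrame({"FL1-A": np.concatenate([rng.normal(1.5, 0.3, 5000),
                                             rng.normal(5.0, 0.5, 5000)])})

reg = LandmarkReg(target, ref, var="FL1-A", mpt=0.001)()  # __call__ fits the warping function
aligned = reg.shift_data(target["FL1-A"].values)          # target values mapped towards the reference
reg.plot_warping()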
Example #15
# var_angles = [np.var(x, axis=1) for x in angles]
# min_angles = [mean_angles[i] - var_angles[i] for i in range(len(mean_angles))]
# max_angles = [mean_angles[i] + var_angles[i] for i in range(len(mean_angles))]

# torques = [np.array(x[101:202,:]) for x in dfs]
# mean_torques = [np.mean(x, axis=1) for x in torques]
# var_torques = [np.var(x, axis=1) for x in torques]
# min_torques = [mean_torques[i] - var_torques[i] for i in range(len(mean_torques))]
# max_torques = [mean_torques[i] + var_torques[i] for i in range(len(mean_torques))]

# plt.plot(angles[0])
# plt.show()

# a0 = angles[0].T
a0 = ka.T
df = FDataGrid(a0)

dataset = skfda.datasets.fetch_growth()
# y = dataset['target']
# fd = dataset['data']
# df = dataset['data']
fd = copy.deepcopy(df)
fd.plot()
plt.show()

##############################################################################
# FPCA can be done in two ways. The first way is to operate directly with the
# raw data. We call it discretized FPCA as the functional data in this case
# consists of finite values dispersed over points in a domain range.
# We initialize and setup the FPCADiscretized object and run the fit method to
# obtain the first two components. By default, if we do not specify the number
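
A hedged sketch of this discretized FPCA step with a newer scikit-fda API; the import path is an assumption, since the FPCADiscretized class mentioned above comes from an older release and has since been unified into FPCA.

from skfda.preprocessing.dim_reduction import FPCA

fpca = FPCA(n_components=2)   # keep the first two principal components
fpca.fit(fd)
print(fpca.explained_variance_ratio_)
fpca.components_.plot()
plt.show()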
Example #16
    def _fit_grid(self, X: FDataGrid, y=None):
        r"""Computes the n_components first principal components and saves them.

        The eigenvalues associated with these principal
        components are also saved. For more details about how it is implemented
        please view the referenced book, chapter 8.

        In summary, we are performing standard multivariate PCA over
        :math:`\frac{1}{\sqrt{N}} \mathbf{X} \mathbf{W}^{1/2}` where :math:`N`
        is the number of samples in the dataset, :math:`\mathbf{X}` is the data
        matrix and :math:`\mathbf{W}` is the weight matrix (this matrix
        defines the numerical integration). By default the weight matrix is
        obtained using the trapezoidal rule.

        Args:
            X (FDataGrid):
                the functional data object to be analysed in basis
                representation
            y (None, not used):
                only present for convention of a fit function

        Returns:
            self (object)

        References:
            .. [RS05-8-4-1] Ramsay, J., Silverman, B. W. (2005). Discretizing
            the functions. In *Functional Data Analysis* (p. 161). Springer.
        """

        # check that the number of components is smaller than the sample size
        if self.n_components > X.n_samples:
            raise AttributeError("The sample size must be bigger than the "
                                 "number of components")

        # check that we do not exceed limits for n_components as it should
        # be smaller than the number of attributes of the functional data object
        if self.n_components > X.data_matrix.shape[1]:
            raise AttributeError("The number of components should be "
                                 "smaller than the number of discretization "
                                 "points of the functional data object.")

        # data matrix initialization
        fd_data = X.data_matrix.reshape(X.data_matrix.shape[:-1])

        # get the number of samples and the number of points of discretization
        n_samples, n_points_discretization = fd_data.shape

        # if centering is True then subtract the mean function from each function
        # in the FDataGrid
        X = self._center_if_necessary(X)

        # establish weights for each point of discretization
        if not self.weights:
            # sample_points is a list with one array in the 1D case
            # in trapezoidal rule, suppose \deltax_k = x_k - x_{k-1}, the weight
            # vector is as follows: [\deltax_1/2, \deltax_1/2 + \deltax_2/2,
            # \deltax_2/2 + \deltax_3/2, ... , \deltax_n/2]
            differences = np.diff(X.sample_points[0])
            differences = np.concatenate(((0, ), differences, (0, )))
            self.weights = (differences[:-1] + differences[1:]) / 2
        elif callable(self.weights):
            self.weights = self.weights(X.sample_points[0])
            # if its a FDataGrid then we need to reduce the dimension to 1-D
            # array
            if isinstance(self.weights, FDataGrid):
                self.weights = np.squeeze(self.weights.data_matrix)

        weights_matrix = np.diag(self.weights)

        basis = FDataGrid(data_matrix=np.identity(n_points_discretization),
                          sample_points=X.sample_points)

        regularization_matrix = compute_penalty_matrix(
            basis_iterable=(basis, ),
            regularization_parameter=1,
            regularization=self.regularization)

        fd_data = np.transpose(
            np.linalg.solve(
                np.transpose(basis.data_matrix[..., 0] +
                             regularization_matrix), np.transpose(fd_data)))

        # see docstring for more information
        final_matrix = fd_data @ np.sqrt(weights_matrix) / np.sqrt(n_samples)

        pca = PCA(n_components=self.n_components)
        pca.fit(final_matrix)
        self.components_ = X.copy(data_matrix=np.transpose(
            np.linalg.solve(np.sqrt(weights_matrix),
                            np.transpose(pca.components_))))
        self.explained_variance_ratio_ = pca.explained_variance_ratio_
        self.explained_variance_ = pca.explained_variance_

        return self
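
A small self-contained check of the trapezoidal weight vector that the comments above describe; the grid points are toy values, not taken from the tests.

import numpy as np

grid_points = np.array([0.0, 0.5, 1.5, 2.0])        # toy, non-uniform grid
differences = np.diff(grid_points)                   # [0.5, 1.0, 0.5]
differences = np.concatenate(((0,), differences, (0,)))
weights = (differences[:-1] + differences[1:]) / 2   # [0.25, 0.75, 0.75, 0.25]

# Integrating the constant function 1 with these weights recovers the domain length.
print(weights.sum())  # 2.0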