def test_rmh(self):
    """Check the points selected by recursive maxima hunting.

    Two classes of Gaussian processes are simulated: one with the default
    mean and one whose mean function has kinks at 0.25, 0.5 and 0.75. The
    selected grid points are expected to be close to those three values.
    """
    n_samples = 10000
    n_features = 100
    half = n_samples // 2

    def kinked_mean(t):
        # Piecewise-linear mean with kinks at 0.25, 0.5 and 0.75.
        return (
            np.abs(t - 0.25)
            - 2 * np.abs(t - 0.5)
            + np.abs(t - 0.75)
        )

    class_0 = make_gaussian_process(
        n_samples=half,
        n_features=n_features,
        random_state=0,
    )
    class_1 = make_gaussian_process(
        n_samples=half,
        n_features=n_features,
        mean=kinked_mean,
        random_state=1,
    )
    X = skfda.concatenate((class_0, class_1))

    # First half of the samples is labelled 0, second half is labelled 1.
    y = np.concatenate((np.zeros(half), np.ones(n_samples - half)))

    rmh = vs.RecursiveMaximaHunting(
        correction=vs.recursive_maxima_hunting.GaussianSampleCorrection(),
        stopping_condition=vs.recursive_maxima_hunting.ScoreThresholdStop(
            threshold=0.05,
        ),
    )
    rmh.fit(X, y)

    selected = X.grid_points[0][rmh.get_support()]
    np.testing.assert_allclose(selected, [0.25, 0.5, 0.75], rtol=1e-1)
def test_concatenate(self):
    """Concatenating two basis objects stacks their coefficients."""
    first, second = (
        FDataGrid([values]).to_basis(Fourier(n_basis=5))
        for values in (np.arange(0, 10), np.arange(10, 20))
    )

    joined = concatenate([first, second])

    # One sample per input, scalar-valued functions of one variable.
    expected_attributes = (
        ("n_samples", 2),
        ("dim_codomain", 1),
        ("dim_domain", 1),
    )
    for attribute, expected in expected_attributes:
        np.testing.assert_equal(getattr(joined, attribute), expected)

    stacked = np.concatenate([first.coefficients, second.coefficients])
    np.testing.assert_array_equal(joined.coefficients, stacked)
def test_concatenate2(self):
    """Concatenating grids stacks the data and keeps the first names."""
    rows = [np.arange(0, 10), np.arange(10, 20)]

    named = FDataGrid([rows[0]])
    unnamed = FDataGrid([rows[1]])
    # Only the first operand gets explicit axis names.
    named.argument_names = ["x"]
    named.coordinate_names = ["y"]

    joined = concatenate([named, unnamed])

    expected_attributes = (
        ("n_samples", 2),
        ("dim_codomain", 1),
        ("dim_domain", 1),
    )
    for attribute, expected in expected_attributes:
        np.testing.assert_equal(getattr(joined, attribute), expected)

    np.testing.assert_array_equal(joined.data_matrix[..., 0], rows)
    np.testing.assert_array_equal(
        named.argument_names,
        joined.argument_names,
    )
    np.testing.assert_array_equal(
        named.coordinate_names,
        joined.coordinate_names,
    )
def _anova_bootstrap(fd_grouped, n_reps, random_state=None, p=2, equal_var=True): n_groups = len(fd_grouped) if n_groups < 2: raise ValueError("At least two groups must be passed in fd_grouped.") for fd in fd_grouped[1:]: if not np.array_equal(fd.domain_range, fd_grouped[0].domain_range): raise ValueError("Domain range must match for every FData in " "fd_grouped.") start, stop = fd_grouped[0].domain_range[0] sizes = [fd.n_samples for fd in fd_grouped] # List with sizes of each group # Instance a random state object in case random_state is an int random_state = check_random_state(random_state) if equal_var: k_est = concatenate(fd_grouped).cov().data_matrix[0, ..., 0] k_est = [k_est] * len(fd_grouped) else: # Estimating covariances for each group k_est = [fd.cov().data_matrix[0, ..., 0] for fd in fd_grouped] # Number of sample points for gaussian processes have to match # the features of the covariances. n_features = k_est[0].shape[0] # Simulating n_reps observations for each of the n_groups gaussian # processes sim = [make_gaussian_process(n_reps, n_features=n_features, start=start, stop=stop, cov=k_est[i], random_state=random_state) for i in range(n_groups)] v_samples = np.empty(n_reps) for i in range(n_reps): fd = FDataGrid([s.data_matrix[i, ..., 0] for s in sim]) v_samples[i] = v_asymptotic_stat(fd, sizes, p=p) return v_samples
def oneway_anova(*args, n_reps=2000, return_dist=False, random_state=None,
                 p=2, equal_var=True):
    r"""
    Perform a one-way functional ANOVA test.

    An asymptotic method is used to test the following null hypothesis:

    Let :math:`\{X_i\}_{i=1}^k` be a set of :math:`k` independent samples,
    each one with :math:`n_i` trajectories, and let
    :math:`E(X_i) = m_i(t)`. The null hypothesis is defined as:

    .. math::
        H_0: m_1(t) = \dots = m_k(t)

    The value of the statistic
    :func:`~skfda.inference.anova.v_sample_stat` :math:`V_n` is computed
    from the means of the given samples. Under the null hypothesis this
    statistic is asymptotically equivalent to
    :func:`~skfda.inference.anova.v_asymptotic_stat`, where each sample is
    replaced by a gaussian process with mean zero and the same covariance
    function as the original.

    The distribution of the asymptotic statistic :math:`V` is simulated
    with a bootstrap procedure: one observation of each of the :math:`k`
    gaussian processes defined above is simulated and the value of
    :func:`~skfda.inference.anova.v_asymptotic_stat` is calculated. This is
    repeated `n_reps` times, creating a sampling distribution of the
    statistic.

    This procedure is from Cuevas[1].

    Args:
        fd1, fd2, ... (FData): The sample measurements for each group. All
            of them must have the same type and the same domain range, and
            either the same sample points or the same basis.
        n_reps (int, optional): Number of simulations for the bootstrap
            procedure. Defaults to 2000 (This value may change in future
            versions).
        return_dist (bool, optional): Flag to indicate if the function
            should also return a numpy.array with the simulated sampling
            distribution.
        random_state (optional): Random state.
        p (int, optional): p of the lp norm. Must be greater or equal
            than 1. If p='inf' or p=np.inf the L infinity metric is used.
            Defaults to 2.
        equal_var (bool, optional): If True (default), perform a one-way
            ANOVA assuming the same covariance operator for all the groups,
            else consider an independent covariance operator for each
            group.

    Returns:
        Value of the sample statistic and p-value. If `return_dist` is
        True, the sampling distribution of the simulated asymptotic
        statistic is returned as a third element.

    Return type:
        (float, float) or (float, float, numpy.array)

    Raises:
        ValueError: If fewer than two groups are passed, an argument is not
            an FData, `n_reps` is lower than 1, the domain ranges differ or
            the sample points of the FDataGrid objects differ.
        TypeError: If the groups are not all of the same FData type.
        NotImplementedError: If the groups are expressed in different
            bases.

    Examples:

        >>> from skfda.inference.anova import oneway_anova
        >>> from skfda.datasets import fetch_gait
        >>> from numpy.random import RandomState
        >>> from numpy import printoptions

        >>> fd = fetch_gait()["data"].coordinates[1]
        >>> fd1, fd2, fd3 = fd[:13], fd[13:26], fd[26:]
        >>> oneway_anova(fd1, fd2, fd3, random_state=RandomState(42))
        (179.52499999999998, 0.5945)
        >>> _, _, dist = oneway_anova(fd1, fd2, fd3, n_reps=3,
        ...     random_state=RandomState(42),
        ...     return_dist=True)
        >>> with printoptions(precision=4):
        ...     print(dist)
        [ 184.0698 212.7395 195.3663]

    References:
        [1] Antonio Cuevas, Manuel Febrero-Bande, and Ricardo Fraiman. "An
        anova test for functional data". *Computational Statistics Data
        Analysis*, 47:111-112, 02 2004
    """
    if len(args) < 2:
        raise ValueError("At least two groups must be passed as parameter.")
    if any(not isinstance(group, FData) for group in args):
        raise ValueError("Argument type must inherit FData.")
    if n_reps < 1:
        raise ValueError("Number of simulations must be positive.")

    fd_groups = args
    reference, *others = fd_groups

    # Every group has to be of the same kind of FData as the first one.
    reference_type = type(reference)
    if any(not isinstance(group, reference_type) for group in others):
        raise TypeError('Found mixed FData types in arguments.')

    if any(
        not np.array_equal(group.domain_range, reference.domain_range)
        for group in others
    ):
        raise ValueError("Domain range must match for every FData passed.")

    if isinstance(reference, FDataGrid):
        # All the grids must be discretized at the same points.
        grids = [group.sample_points[0].tolist() for group in fd_groups]
        if not all(grid == grids[0] for grid in grids[1:]):
            raise ValueError(
                "All FDataGrid passed must have the same sample points."
            )
    else:
        # Otherwise the data is expected to be in basis form, and all the
        # groups must share the same basis.
        bases = [group.basis for group in fd_groups]
        if not all(basis == bases[0] for basis in bases[1:]):
            raise NotImplementedError(
                "Not implemented for FDataBasis with different basis."
            )

    # FData where each sample is the mean of one group.
    fd_means = concatenate([group.mean() for group in fd_groups])
    group_sizes = [group.n_samples for group in fd_groups]

    # Base statistic.
    vn = v_sample_stat(fd_means, group_sizes, p=p)

    # Simulated sampling distribution of the asymptotic statistic.
    simulation = _anova_bootstrap(
        fd_groups,
        n_reps,
        random_state=random_state,
        p=p,
        equal_var=equal_var,
    )

    # Proportion of simulated values strictly above the observed statistic.
    n_exceeding = np.sum(simulation > vn)
    p_value = n_exceeding / len(simulation)

    return (vn, p_value, simulation) if return_dist else (vn, p_value)
# consists in 39 different trajectories, each representing the movement of the # hip of each of the boys studied. fig = fd_hip.plot() ############################################################################### # The example is going to be divided in three different groups. Then we are # going to apply the ANOVA procedure to this groups to test if the means of this # three groups are equal or not. fd_hip1 = fd_hip[0:13] fd_hip2 = fd_hip[13:26] fd_hip3 = fd_hip[26:39] fd_hip.plot(group=[0 if i < 13 else 1 if i < 26 else 39 for i in range(39)]) means = [fd_hip1.mean(), fd_hip2.mean(), fd_hip3.mean()] fd_means = skfda.concatenate(means) fig = fd_means.plot() ############################################################################### # At this point is time to perform the *ANOVA* test. This functionality is # implemented in the function :func:`~skfda.inference.anova.oneway_anova`. As # it consists in an asymptotic method it is possible to set the number of # simulations necessary to approximate the result of the statistic. It is # possible to set the :math:`p` of the :math:`L_p` norm used in the # calculations (defaults 2). v_n, p_val = oneway_anova(fd_hip1, fd_hip2, fd_hip3) ################################################################################ # The function returns first the statistic :func:`~skfda.inference.anova # .v_sample_stat` used to measure the variability between groups,