import numpy as np
from scipy import sparse

# helpers such as has_numba, _get_cluster_fun, _find_clusters_mne and
# _find_clusters_borsar are defined elsewhere in the surrounding module


def _prepare_clustering(data, adjacency, cluster_fun, backend, min_adj_ch=0):
    '''Prepare clustering - perform checks and create necessary variables.'''
    # FIXME - some of these lines should be put in _get_cluster_fun
    if cluster_fun is None and backend == 'auto':
        if data.ndim < 3:
            backend = 'auto' if has_numba() else 'mne'

    if data.ndim < 3 and min_adj_ch > 0:
        if backend not in ['auto', 'numba']:
            raise ValueError('currently ``min_adj_ch`` is implemented only '
                             'for 3d clustering.')

    # mne_reshape_clusters=True,
    if backend == 'mne':
        # prepare mne clustering, maybe put this in a separate function?
        if min_adj_ch > 0:
            raise ValueError('mne backend does not support ``min_adj_ch`` '
                             'filtering')

        try:
            from mne.stats.cluster_level import _setup_connectivity
            argname = 'connectivity'
        except ImportError:
            from mne.stats.cluster_level import (_setup_adjacency
                                                 as _setup_connectivity)
            argname = 'adjacency'

        if adjacency is not None and isinstance(adjacency, np.ndarray):
            if not sparse.issparse(adjacency):
                adjacency = sparse.coo_matrix(adjacency)
            if adjacency.ndim == 2:
                adjacency = _setup_connectivity(adjacency,
                                                np.prod(data.shape),
                                                data.shape[0])

        return _find_clusters_mne, adjacency, argname
    else:
        if cluster_fun is None:
            cluster_fun = _get_cluster_fun(data, adjacency=adjacency,
                                           min_adj_ch=min_adj_ch,
                                           backend=backend)
        return _find_clusters_borsar, adjacency, cluster_fun
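# A standalone sketch (an assumption, not part of the module) of what the
# 'mne' branch above does with a dense adjacency matrix: coerce it to a
# sparse COO matrix and expand it over all tests with MNE's private setup
# helper. The channel count and chain adjacency below are hypothetical.
import numpy as np
from scipy import sparse

try:
    from mne.stats.cluster_level import _setup_connectivity as _setup_adj
except ImportError:
    from mne.stats.cluster_level import _setup_adjacency as _setup_adj

n_channels, n_times = 4, 10
dense_adj = np.eye(n_channels, k=1) + np.eye(n_channels, k=-1)  # chain of neighbours
adj = _setup_adj(sparse.coo_matrix(dense_adj), n_channels * n_times, n_times)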
import numpy as np
import pandas as pd
import mne
from mne.stats.cluster_level import _setup_connectivity

# opt, subjs and proc_dir are defined earlier in the script

spacing = "ico4"
conds = ["audio", "visselten", "visual"]
wavs = ["4000Hz", "4000cheby", "7000Hz", "4000fftf"]
band = opt.band
indep_var = "Angenehm"
n_freqs = 1
n_srcs = 5124
n_subjs = len(subjs)
perm_n = opt.perm

# setup connectivity
fs_src = mne.read_source_spaces("{}{}_{}-src.fif".format(proc_dir, "fsaverage",
                                                         spacing))
cnx = mne.spatial_src_connectivity(fs_src)
del fs_src
connectivity = _setup_connectivity(cnx, n_srcs, n_freqs)
exclude = np.load("{}fsaverage_{}_exclude.npy".format(proc_dir, spacing))
include = np.ones(cnx.shape[0], dtype="bool")
include[exclude] = 0

# threshold for clustering (TFCE-style start/step dictionary)
threshold = dict(start=0, step=0.2)
# random_state = 42
random = np.random.RandomState()

df_laut = pd.read_pickle("/scratch/jeffhanna/ATT_dat/behave/laut")
df_ang = pd.read_pickle("/scratch/jeffhanna/ATT_dat/behave/ang")
predictor_vars = ["Laut", "Subj", "Block", "Wav"]
dm_laut = df_laut.copy()[predictor_vars]
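# A possible next step (an assumption, not part of the original script): turn
# dm_laut into a purely numerical design matrix by z-scoring the continuous
# "Laut" predictor and dummy-coding the categorical "Block" and "Wav" columns;
# "Subj" is left as-is for grouping observations by subject.
dm = dm_laut.copy()
dm["Laut"] = (dm["Laut"] - dm["Laut"].mean()) / dm["Laut"].std()
dm = pd.get_dummies(dm, columns=["Block", "Wav"], drop_first=True)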
import numpy as np
from mne.channels import find_ch_connectivity
from mne.stats.cluster_level import _setup_connectivity

# betas, epochs_info and n_times come from the preceding analysis steps

# set random state for replication
random_state = 42
random = np.random.RandomState(random_state)

# number of random samples
boot = 2000

# placeholders for bootstrap samples
cluster_H0 = np.zeros(boot)
f_H0 = np.zeros(boot)

# setup connectivity
n_tests = betas.shape[1]
connectivity, ch_names = find_ch_connectivity(epochs_info, ch_type='eeg')
connectivity = _setup_connectivity(connectivity, n_tests, n_times)

# threshold for clustering
threshold = 100.

# run bootstrap for regression coefficients
for i in range(boot):
    # extract random subjects from overall sample
    resampled_subjects = random.choice(range(betas.shape[0]),
                                       betas.shape[0],
                                       replace=True)

    # resampled betas
    resampled_betas = betas[resampled_subjects, :]

    # compute standard error of bootstrap sample
    se = resampled_betas.std(axis=0) / np.sqrt(resampled_betas.shape[0])
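    # A possible continuation of this iteration (an assumption, not part of
    # the original snippet): convert the bootstrap sample into t values, run
    # the clustering step on them and store the maxima that build up the H0
    # distributions prepared above.
    from mne.stats.cluster_level import _find_clusters  # local import for the sketch

    t_vals = resampled_betas.mean(axis=0) / se
    # positional arguments: (stat map, threshold, tail, adjacency/connectivity)
    _, cluster_stats = _find_clusters(t_vals.ravel(), threshold, 0,
                                      connectivity)
    if len(cluster_stats) > 0:
        cluster_H0[i] = cluster_stats.max()
    f_H0[i] = np.abs(t_vals).max()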
import numpy as np


def cluster_based_regression(data, preds, conn, n_permutations=1000,
                             progressbar=True):
    # data has to have observations as 1st dim and channels/vert as last dim
    from mypy.stats import compute_regression_t
    from mne.stats.cluster_level import (_setup_connectivity, _find_clusters,
                                         _cluster_indices_to_mask)

    # TODO - move this piece of code to utils
    #      - maybe a simple ProgressBar class?
    #      - then support tqdm pbar as input
    if progressbar:
        if not progressbar == 'text':
            from tqdm import tqdm_notebook
            pbar = tqdm_notebook(total=n_permutations)
        else:
            from tqdm import tqdm
            pbar = tqdm(total=n_permutations)

    n_obs, n_times, n_channels = data.shape
    connectivity = _setup_connectivity(conn, n_channels * n_times, n_times)

    pos_dist = np.zeros(n_permutations)
    neg_dist = np.zeros(n_permutations)
    perm_preds = preds.copy()

    # regression on non-permuted data
    t_values = compute_regression_t(data, preds)
    clusters, cluster_stats = _find_clusters(t_values.ravel(), threshold=2.,
                                             tail=0,
                                             connectivity=connectivity)
    clusters = _cluster_indices_to_mask(clusters, n_channels * n_times)
    clusters = [clst.reshape((n_times, n_channels)) for clst in clusters]

    if not clusters:
        print('No clusters found, permutations are not performed.')
        return t_values, clusters, cluster_stats
    else:
        msg = 'Found {} clusters, computing permutations.'
        print(msg.format(len(clusters)))

    # compute permutations
    for perm in range(n_permutations):
        perm_inds = np.random.permutation(n_obs)
        this_perm = perm_preds[perm_inds]
        perm_tvals = compute_regression_t(data, this_perm)
        _, perm_cluster_stats = _find_clusters(
            perm_tvals.ravel(), threshold=2., tail=0,
            connectivity=connectivity)

        # if any clusters were found - add max statistic
        if perm_cluster_stats.shape[0] > 0:
            max_val = perm_cluster_stats.max()
            min_val = perm_cluster_stats.min()

            if max_val > 0:
                pos_dist[perm] = max_val
            if min_val < 0:
                neg_dist[perm] = min_val

        if progressbar:
            pbar.update(1)

    # compute permutation probability
    cluster_p = np.array([(pos_dist > cluster_stat).mean()
                          if cluster_stat > 0
                          else (neg_dist < cluster_stat).mean()
                          for cluster_stat in cluster_stats])
    cluster_p *= 2  # because we use a two-tailed test
    cluster_p[cluster_p > 1.] = 1.  # probability has to be <= 1.

    # sort clusters by p value
    cluster_order = np.argsort(cluster_p)
    cluster_p = cluster_p[cluster_order]
    clusters = [clusters[i] for i in cluster_order]

    return t_values, clusters, cluster_p
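# Hypothetical usage sketch (assumes the mypy/MNE private imports inside the
# function resolve; the data, predictor and adjacency below are made up):
# regress a single random predictor on (observations x times x channels) data
# with a simple chain adjacency between channels.
import numpy as np
from scipy import sparse

n_obs, n_times, n_channels = 25, 40, 8
data = np.random.randn(n_obs, n_times, n_channels)
preds = np.random.randn(n_obs)
conn = sparse.coo_matrix(np.eye(n_channels, k=1) + np.eye(n_channels, k=-1))

t_vals, clusters, cluster_p = cluster_based_regression(
    data, preds, conn, n_permutations=250, progressbar='text')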
import numpy as np
from mne.parallel import check_n_jobs  # location in older MNE versions
from mne.stats import cluster_level
from mne.utils import logger


def _permutation_cluster_test_AT(X, threshold, tail, n_permutations,
                                 connectivity, n_jobs, seed, max_step,
                                 exclude, step_down_p, t_power, out_type,
                                 check_disjoint, buffer_size):
    """Aux function.

    Note. X is required to be a list. Depending on the length of X either
    a 1 sample t-test or an F test / more sample permutation scheme is
    elicited.
    """
    n_jobs = check_n_jobs(n_jobs)
    if out_type not in ['mask', 'indices']:
        raise ValueError("out_type must be either 'mask' or 'indices'")
    if not isinstance(threshold, dict) and (tail < 0 and threshold > 0 or
                                            tail > 0 and threshold < 0 or
                                            tail == 0 and threshold < 0):
        raise ValueError('incompatible tail and threshold signs, got '
                         '%s and %s' % (tail, threshold))

    # check dimensions for each group in X (a list at this stage).
    X = [x[:, np.newaxis] if x.ndim == 1 else x for x in X]
    n_times = X[0].shape[0]

    sample_shape = X[0].shape[1:]
    for x in X:
        if x.shape[1:] != sample_shape:
            raise ValueError('All samples must have the same size')

    # # flatten the last dimensions in case the data is high dimensional
    # X = [np.reshape(x, (x.shape[0], -1)) for x in X]
    n_tests = X[0].shape[1]

    if connectivity is not None and connectivity is not False:
        connectivity = cluster_level._setup_connectivity(connectivity,
                                                         n_tests, n_times)

    if (exclude is not None) and not exclude.size == n_tests:
        raise ValueError('exclude must be the same shape as X[0]')

    # determine if connectivity itself can be separated into disjoint sets
    if check_disjoint is True and (connectivity is not None and
                                   connectivity is not False):
        partitions = cluster_level._get_partitions_from_connectivity(
            connectivity, n_times)
    else:
        partitions = None

    max_clu_lens = np.zeros(n_permutations)
    for i in range(n_permutations):
        # logger.info('Running initial clustering')
        include = None
        out = cluster_level._find_clusters(X[i][0], threshold, tail,
                                           connectivity, max_step=max_step,
                                           include=include,
                                           partitions=partitions,
                                           t_power=t_power, show_info=True)
        clusters, cluster_stats = out
        logger.info('Found %d clusters' % len(clusters))

        # convert clusters to old format
        if connectivity is not None and connectivity is not False:
            # our algorithms output lists of indices by default
            if out_type == 'mask':
                clusters = cluster_level._cluster_indices_to_mask(clusters,
                                                                  n_tests)
        else:
            # ndimage outputs slices or boolean masks by default
            if out_type == 'indices':
                clusters = cluster_level._cluster_mask_to_indices(clusters)

        # the clusters should have the same shape as the samples
        clusters = cluster_level._reshape_clusters(clusters, sample_shape)

        max_clu_len = 0
        for j in range(len(clusters)):
            max_new = len(clusters[j][0])
            if max_new > max_clu_len:
                max_clu_len = max_new
        logger.info('Max cluster length %d' % max_clu_len)
        max_clu_lens[i] = max_clu_len

    return max_clu_lens, clusters
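# Hedged usage sketch (an assumption about the intended call pattern, and it
# relies on the private MNE helpers keeping the signatures this wrapper
# expects): pass one pre-computed statistic map per permutation, cluster each
# map with TFCE and collect the length of its largest cluster.
import numpy as np

n_perm, n_tests = 100, 500
X = [np.random.randn(1, n_tests) for _ in range(n_perm)]  # hypothetical stat maps
tfce = dict(start=0, step=0.2)
max_clu_lens, last_clusters = _permutation_cluster_test_AT(
    X, threshold=tfce, tail=0, n_permutations=n_perm, connectivity=None,
    n_jobs=1, seed=None, max_step=1, exclude=None, step_down_p=0, t_power=1,
    out_type='indices', check_disjoint=False, buffer_size=None)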
def cluster_based_regression(data, preds, adjacency=None, n_permutations=1000,
                             stat_threshold=None, alpha_threshold=0.05,
                             cluster_pred=None, backend='auto',
                             progressbar=True, return_distribution=False,
                             stat_fun=None):
    '''Compute cluster-based permutation test with regression as the
    statistical function.

    Parameters
    ----------
    data : numpy array
        N-dimensional numpy array with data to predict with regression. The
        first dimension has to correspond to observations. If ``adjacency``
        was given, the last dimension has to correspond to adjacency space
        (for example channels or vertices).
    preds : numpy array
        Predictors array of shape ``(n_observations, n_predictors)`` to use
        in regression.
    adjacency : numpy array, optional
        Adjacency matrix for the last ``data`` dimension. If ``None``
        (default) lattice/grid adjacency is used.
    n_permutations : int
        Number of permutations to perform to get a Monte Carlo estimate of
        the null hypothesis distribution. More permutations result in more
        accurate p values. Default is 1000.
    stat_threshold : float | None
        Cluster inclusion threshold in t value. Only data points exceeding
        this value of the t statistic (either ``t value > stat_threshold`` or
        ``t value < -stat_threshold``) form clusters. Default is ``None``,
        which means that the cluster inclusion threshold is set according to
        ``alpha_threshold``. If both ``stat_threshold`` and
        ``alpha_threshold`` are set, ``stat_threshold`` takes priority.
    alpha_threshold : float | None
        Cluster inclusion threshold in critical p value. Only data points
        where the p value of the predictor effect is lower than this critical
        value form clusters. Default is 0.05.
    cluster_pred : int
        Specify which predictor to use in clustering. Must be an integer: a
        zero-based index for the t values matrix returned by
        ``compute_regression_t``. Use values higher than zero - the zero
        index indicates the intercept, which should be tested using a
        different permutation scheme than the one used here.
    backend : str
        Clustering backend. The default is ``'auto'``; ``'numba'`` can also
        be chosen and should be faster for 3d clustering, but requires the
        numba package.
    progressbar : bool
        Whether to report the progress of permutations using a progress bar.
        The default is ``True``, which uses a tqdm progress bar.
    return_distribution : bool
        Whether to return the permutation distribution as an additional,
        fourth output argument.
    stat_fun : None | callable
        Function to compute regression. The function should take two
        arguments: data (data to predict) and preds (predictors to use) and
        return a matrix of regression parameters.

    Returns
    -------
    t_values : numpy array
        Statistical map of t values for the effect of the predictor of
        interest.
    clusters : list of numpy arrays
        List of boolean numpy arrays. Consecutive arrays correspond to
        boolean cluster masks.
    cluster_p : numpy array
        Numpy array of cluster-level p values.
    distributions : dict
        Dictionary of positive null distribution (``distributions['pos']``)
        and negative null distribution (``distributions['neg']``). Returned
        only if ``return_distribution`` was set to ``True``.
    '''
    # data has to have observations as 1st dim and channels/vert as last dim
    # FIXME: add checks for input types
    preds = _handle_preds(preds)

    if stat_threshold is None:
        from scipy.stats import t
        df = data.shape[0] - 2  # in future: preds.shape[1]
        stat_threshold = t.ppf(1 - alpha_threshold / 2, df)

    if stat_fun is None:
        stat_fun = compute_regression_t

    use_3d_clustering = data.ndim > 3 and adjacency is not None
    n_obs = data.shape[0]

    if adjacency is not None and not use_3d_clustering:
        try:
            from mne.stats.cluster_level import _setup_connectivity
        except ImportError:
            from mne.stats.cluster_level import (_setup_adjacency
                                                 as _setup_connectivity)
        adjacency = _setup_connectivity(adjacency, np.prod(data.shape[1:]),
                                        data.shape[1])

    pos_dist = np.zeros(n_permutations)
    neg_dist = np.zeros(n_permutations)
    perm_preds = preds.copy()

    if cluster_pred is None:
        cluster_pred = 1

    # regression on non-permuted data
    t_values = stat_fun(data, preds)[cluster_pred]

    if use_3d_clustering:
        # use 3d clustering
        cluster_fun = _get_cluster_fun(t_values, adjacency=adjacency,
                                       backend=backend)
        # we need to transpose dimensions for 3d clustering
        # FIXME/TODO - this could be eliminated by creating a single unified
        #              clustering function / API
        data_dims = np.array(list(range(data.ndim)))
        # swap the second and the last dimension (adjacency dimension first)
        data_dims[1], data_dims[-1] = data_dims[-1], data_dims[1]
        data = data.transpose(data_dims)
        t_values = t_values.transpose(data_dims[1:] - 1)
    else:
        backend = 'mne'
        cluster_fun = None

    clusters, cluster_stats = find_clusters(t_values, stat_threshold,
                                            adjacency=adjacency,
                                            cluster_fun=cluster_fun,
                                            backend=backend)

    if use_3d_clustering:
        t_values = t_values.transpose(data_dims[1:] - 1)

    if not clusters:
        print('No clusters found, permutations are not performed.')
        return t_values, clusters, cluster_stats
    else:
        msg = 'Found {} clusters, computing permutations.'
        print(msg.format(len(clusters)))

    # TODO - move progressbar code from DiamSar!
    #      - then support tqdm pbar as input
    if progressbar:
        from tqdm import tqdm
        pbar = tqdm(total=n_permutations)

    # compute permutations
    for perm in range(n_permutations):
        # permute predictors
        perm_inds = np.random.permutation(n_obs)
        perm_preds[:, cluster_pred] = preds[perm_inds, cluster_pred]
        perm_tvals = stat_fun(data, perm_preds)[cluster_pred]

        # cluster
        _, perm_cluster_stats = find_clusters(
            perm_tvals, stat_threshold, adjacency=adjacency,
            cluster_fun=cluster_fun, backend=backend,
            mne_reshape_clusters=False)

        # if any clusters were found - add max statistic
        if len(perm_cluster_stats) > 0:
            max_val = perm_cluster_stats.max()
            min_val = perm_cluster_stats.min()

            if max_val > 0:
                pos_dist[perm] = max_val
            if min_val < 0:
                neg_dist[perm] = min_val

        if progressbar:
            pbar.update(1)

    # compute permutation probability
    cluster_p = np.array([(pos_dist > cluster_stat).mean()
                          if cluster_stat > 0
                          else (neg_dist < cluster_stat).mean()
                          for cluster_stat in cluster_stats])
    cluster_p *= 2  # because we use a two-tailed test
    cluster_p[cluster_p > 1.] = 1.  # probability has to be <= 1.

    # sort clusters by p value
    cluster_order = np.argsort(cluster_p)
    cluster_p = cluster_p[cluster_order]
    clusters = [clusters[i] for i in cluster_order]

    if use_3d_clustering:
        clusters = [clst.transpose(data_dims[1:] - 1) for clst in clusters]

    out = t_values, clusters, cluster_p

    if return_distribution:
        distribution = dict(pos=pos_dist, neg=neg_dist)
        out += (distribution, )

    return out
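# Hypothetical usage sketch (the simulated shapes and predictor are made up;
# assumes borsar helpers such as _handle_preds, compute_regression_t and
# find_clusters are importable at module level): with adjacency=None the
# function falls back to the 'mne' backend and lattice adjacency over the
# trailing dimensions of the data.
import numpy as np

n_obs, n_channels, n_times = 30, 12, 50
data = np.random.randn(n_obs, n_channels, n_times)
preds = np.random.randn(n_obs)  # single hypothetical predictor

t_vals, clusters, cluster_p = cluster_based_regression(
    data, preds, n_permutations=500, alpha_threshold=0.05)
print(cluster_p)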