def calc_rdm_correlation(dataset, descriptor=None):
    """
    Build an RDM whose entries are correlation distances (1 - Pearson r)
    between the patterns of the input dataset.

    If multiple instances of the same condition are found in the dataset
    they are averaged.

    Args:
        dataset (rsatoolbox.data.DatasetBase):
            The dataset the RDM is computed from
        descriptor (String):
            obs_descriptor used to define the rows/columns of the RDM
            defaults to one row/column per row in the dataset

    Returns:
        rsatoolbox.rdm.rdms.RDMs: RDMs object with the one RDM

    """
    patterns, pattern_desc, descriptor = _parse_input(dataset, descriptor)
    # remove each pattern's mean across channels ...
    centered = patterns - patterns.mean(axis=1, keepdims=True)
    # ... and scale every pattern to unit length, so that the inner
    # products below are correlation coefficients
    row_norms = np.sqrt(np.einsum('ij,ij->i', centered, centered))
    centered /= row_norms[:, None]
    dissim = 1 - np.einsum('ik,jk', centered, centered)
    result = RDMs(
        dissimilarities=np.array([dissim]),
        dissimilarity_measure='correlation',
        rdm_descriptors=deepcopy(dataset.descriptors))
    result.pattern_descriptors[descriptor] = pattern_desc
    return result
def calc_rdm_poisson(dataset, descriptor=None, prior_lambda=1,
                     prior_weight=0.1):
    """
    Build an RDM from the symmetrized KL-divergence between patterns,
    assuming a poisson distribution.

    If multiple instances of the same condition are found in the dataset
    they are averaged.

    Args:
        dataset (rsatoolbox.data.DatasetBase):
            The dataset the RDM is computed from
        descriptor (String):
            obs_descriptor used to define the rows/columns of the RDM
            defaults to one row/column per row in the dataset
        prior_lambda (float):
            value the measurements are shrunk towards before the
            logarithm is taken
        prior_weight (float):
            weight of ``prior_lambda`` relative to the measurement

    Returns:
        rsatoolbox.rdm.rdms.RDMs: RDMs object with the one RDM

    """
    rates, pattern_desc, descriptor = _parse_input(dataset, descriptor)
    # weighted mix of the data and the prior
    rates = (rates + prior_lambda * prior_weight) / (1 + prior_weight)
    cross = rates @ np.log(rates).T
    self_terms = np.diag(cross)
    dissim = np.expand_dims(self_terms, 0) \
        + np.expand_dims(self_terms, 1) \
        - cross - cross.T
    # keep the upper triangle only and normalise by the channel count
    dissim = _extract_triu_(dissim) / rates.shape[1]
    result = RDMs(
        dissimilarities=np.array([dissim]),
        dissimilarity_measure='poisson',
        rdm_descriptors=deepcopy(dataset.descriptors))
    result.pattern_descriptors[descriptor] = pattern_desc
    return result
def test_weighted_mean(self):
    """Weights given as an array or as the name of an rdm_descriptor
    are both used when averaging
    """
    from rsatoolbox.rdm.rdms import RDMs
    dissims = array([
        [  1,   2, nan,   3, nan, nan],
        [  2,   1, nan,   4,   5,   6],
    ])
    weights = array([
        [  1,   1, nan,   1, nan, nan],
        [  2,   2, nan,   2,   2,   2],
    ])
    expected = array([[1.6667, 1.3333, nan, 3.6667, 5.0000, 6.0000]])
    partial_rdms = RDMs(dissimilarities=dissims)

    # weights passed directly
    mean_from_array = partial_rdms.mean(weights=weights)
    assert_almost_equal(
        mean_from_array.dissimilarities, expected, decimal=3)

    # weights looked up by descriptor name
    partial_rdms.rdm_descriptors['theWeights'] = weights
    mean_from_name = partial_rdms.mean(weights='theWeights')
    assert_almost_equal(
        mean_from_name.dissimilarities, expected, decimal=3)
def calc_rdm_poisson_cv(dataset, descriptor=None, prior_lambda=1,
                        prior_weight=0.1, cv_descriptor=None):
    """
    calculates an RDM from an input dataset using the crossvalidated
    symmetrized KL-divergence assuming a poisson distribution

    Every crossvalidation fold is used once as the test set, with the
    remaining folds forming the training set; the RDMs obtained for the
    individual folds are averaged.

    To assert equal ordering in the folds the dataset is initially
    sorted according to the descriptor used to define the patterns.
    Note that this sorts the passed dataset in place, and that a
    'cv_desc' obs_descriptor is added to it when no cv_descriptor is
    given.

    Args:
        dataset (rsatoolbox.data.DatasetBase):
            The dataset the RDM is computed from
        descriptor (String):
            obs_descriptor used to define the rows/columns of the RDM.
            Required: crossvalidation needs measurements to be grouped.
        prior_lambda (float):
            value the averaged measurements are shrunk towards
        prior_weight (float):
            weight of ``prior_lambda`` relative to the measurement
        cv_descriptor (str):
            The descriptor that indicates the folds to use for
            crossvalidation

    Returns:
        rsatoolbox.rdm.rdms.RDMs: RDMs object with the one RDM

    Raises:
        ValueError: if ``descriptor`` is None or no crossvalidation
            fold is found in the dataset

    """
    if descriptor is None:
        raise ValueError('descriptor must be a string! Crossvalidation '
                         'requires multiple measurements to be grouped')
    if cv_descriptor is None:
        cv_desc = _gen_default_cv_descriptor(dataset, descriptor)
        dataset.obs_descriptors['cv_desc'] = cv_desc
        cv_descriptor = 'cv_desc'
    dataset.sort_by(descriptor)
    cv_folds = np.unique(np.array(dataset.obs_descriptors[cv_descriptor]))
    if len(cv_folds) == 0:
        raise ValueError('no crossvalidation folds found in the dataset')
    fold_rdms = []
    for fold in cv_folds:
        data_test = dataset.subset_obs(cv_descriptor, fold)
        data_train = dataset.subset_obs(cv_descriptor,
                                        np.setdiff1d(cv_folds, fold))
        measurements_train, _, _ = average_dataset_by(data_train, descriptor)
        measurements_test, _, _ = average_dataset_by(data_test, descriptor)
        measurements_train = (measurements_train
                              + prior_lambda * prior_weight) \
            / (1 + prior_weight)
        measurements_test = (measurements_test
                             + prior_lambda * prior_weight) \
            / (1 + prior_weight)
        kernel = measurements_train @ np.log(measurements_test).T
        self_terms = np.diag(kernel)
        # entry (i, j) equals
        # (train_i - train_j) . (log test_i - log test_j)
        fold_rdm = np.expand_dims(self_terms, 0) \
            + np.expand_dims(self_terms, 1) \
            - kernel - kernel.T
        fold_rdms.append(
            _extract_triu_(fold_rdm) / measurements_train.shape[1])
    # average over folds instead of keeping only the last one
    rdm = np.mean(np.array(fold_rdms), axis=0)
    rdm = RDMs(dissimilarities=np.array([rdm]),
               dissimilarity_measure='poisson_cv',
               rdm_descriptors=deepcopy(dataset.descriptors))
    _, desc, _ = average_dataset_by(dataset, descriptor)
    rdm.pattern_descriptors[descriptor] = desc
    return rdm
def test_mean_no_weights(self):
    """Without weights, RDMs.mean() yields a single RDM holding the
    mean over rdms with nan entries left out
    """
    from rsatoolbox.rdm.rdms import RDMs
    dissims = array([
        [  1,   2, nan,   3, nan, nan],
        [  2,   1, nan,   4,   5,   6],
    ])
    expected = array([[1.5, 1.5, nan, 3.5, 5, 6]])
    partial_rdms = RDMs(dissimilarities=dissims)
    averaged = partial_rdms.mean()
    assert_almost_equal(averaged.dissimilarities, expected)
def test_rescale(self):
    """rescale() scales partial RDMs so that they agree as closely as
    possible, and stores the weights it used as an rdm_descriptor
    """
    from rsatoolbox.rdm.rdms import RDMs
    from rsatoolbox.rdm.combine import rescale
    partial = array([
        [  1,   2, nan,   3, nan, nan],
        [nan, nan, nan,   4,   5,   6],
    ])
    expected_rescaled = array([
        [0.1438, 0.2877, nan, 0.4315, nan, nan],
        [   nan,    nan, nan, 0.4316, 0.5395, 0.6474]
    ])
    expected_weights = array([
        [  1,   4, nan,   9, nan, nan],
        [nan, nan, nan,  16,  25,  36],
    ])
    rescaled_rdms = rescale(RDMs(dissimilarities=partial))
    rescaled = rescaled_rdms.dissimilarities

    # the one pair present in both rdms must end up (nearly) equal
    assert_almost_equal(rescaled[0, 3], rescaled[1, 3], decimal=4)
    # the non-nan values stay perfectly correlated with the input
    correlation = pearsonr(non_nan(partial), non_nan(rescaled))[0]
    assert_almost_equal(correlation, 1, decimal=7)
    assert_almost_equal(rescaled, expected_rescaled, decimal=4)
    assert_array_equal(
        rescaled_rdms.rdm_descriptors.get('rescalingWeights'),
        expected_weights
    )
def test_from_partials_with_list_of_pattern_descriptors(self):
    """The caller names the full set of patterns explicitly

    A list holding one RDMs object with a single RDM is passed,
    together with a pattern list that contains one pattern the RDM
    does not cover.
    """
    from rsatoolbox.rdm.rdms import RDMs
    from rsatoolbox.rdm.combine import from_partials
    all_conds = ['a', 'b', 'c', 'd']
    rdms1 = RDMs(
        dissimilarities=array([[1, 2, 3]]),
        dissimilarity_measure='measure',
        pattern_descriptors=dict(conds=['b', 'c', 'd']),
    )
    rdms = from_partials([rdms1], all_patterns=['a', 'b', 'c', 'd'])
    self.assertEqual(rdms.n_rdm, 1)
    self.assertEqual(rdms.n_cond, 4)
    assert_array_equal(rdms.pattern_descriptors.get('conds'), all_conds)
    # every pair that involves the uncovered pattern 'a' is nan
    expected = array([
        [nan, nan, nan, 1, 2, 3],
    ])
    assert_array_equal(rdms.dissimilarities, expected)
def test_from_partials_based_on_list_of_rdms_objects(self):
    """Here the full set of conditions is derived from the RDMs
    objects that are passed in.
    """
    from rsatoolbox.rdm.rdms import RDMs
    from rsatoolbox.rdm.combine import from_partials
    rdms1 = RDMs(
        dissimilarities=array([[1, 2, 3]]),
        dissimilarity_measure='shared_measure',
        descriptors=dict(shared_desc='shared_val', diff_desc='one'),
        rdm_descriptors=dict(rdesc=['foo1']),
        pattern_descriptors=dict(conds=['a', 'b', 'c']),
    )
    rdms23 = RDMs(
        dissimilarities=array([[4, 5, 6], [7, 8, 9]]),
        dissimilarity_measure='shared_measure',
        descriptors=dict(shared_desc='shared_val', diff_desc='two-three'),
        rdm_descriptors=dict(rdesc=['foo2', 'foo3']),
        pattern_descriptors=dict(conds=['b', 'c', 'd']),
    )
    combined = from_partials([rdms1, rdms23])

    # sizes and shared metadata
    self.assertEqual(combined.n_rdm, 3)
    self.assertEqual(combined.n_cond, 4)
    self.assertEqual(combined.dissimilarity_measure, 'shared_measure')
    self.assertEqual(
        combined.descriptors.get('shared_desc'), 'shared_val')

    # descriptors that differ between inputs show up per rdm
    assert_array_equal(
        combined.rdm_descriptors.get('diff_desc'),
        ['one', 'two-three', 'two-three']
    )
    assert_array_equal(
        combined.rdm_descriptors.get('rdesc'),
        ['foo1', 'foo2', 'foo3']
    )
    assert_array_equal(
        combined.pattern_descriptors.get('conds'),
        ['a', 'b', 'c', 'd']
    )

    expected = array([
        [  1,   2, nan,   3, nan, nan],
        [nan, nan, nan,   4,   5,   6],
        [nan, nan, nan,   7,   8,   9]
    ])
    assert_array_equal(combined.dissimilarities, expected)
def load_rdms(fpath: str, sort: bool = True) -> RDMs:
    """Load the RDMs stored in a Meadows results file

    Args:
        fpath (str): path to a Meadows results file; the 'mat' and
            'json' file types are handled
        sort (bool): whether to sort the RDM based on the stimulus names

    Raises:
        ValueError: Will raise an error if the file type is not
            supported, or if the file is missing an expected variable.
            The latter can happen if the file does not contain MA task
            data.

    Returns:
        RDMs: All rdms found in the data file as an RDMs object
    """
    info = extract_filename_segments(fpath)
    filetype = info['filetype']
    if filetype == 'mat':
        read_components = load_rdms_comps_mat
    elif filetype == 'json':
        read_components = load_rdms_comps_json
    else:
        raise ValueError('Unsupported file type')
    utvs, stimuli, pnames, tnames, tidx = read_components(fpath, info)

    # condition name: stimulus file name up to its first dot
    conds = [stim_name.split('.')[0] for stim_name in stimuli]

    rdm_descriptors = dict(participant=pnames)
    if tnames is not None:
        rdm_descriptors['task'] = tnames
    if tidx is not None:
        rdm_descriptors['task_index'] = tidx

    rdms = RDMs(
        utvs,
        dissimilarity_measure='euclidean',
        descriptors=dict(experiment_name=info['experiment_name']),
        rdm_descriptors=rdm_descriptors,
        pattern_descriptors=dict(conds=conds),
    )
    if sort:
        rdms.sort_by(conds='alpha')
    return rdms
def calc_rdm_euclid(dataset, descriptor=None):
    """
    Build an RDM of squared euclidean distances between the patterns
    of the input dataset, divided by the number of channels.

    Args:
        dataset (rsatoolbox.data.DatasetBase):
            The dataset the RDM is computed from
        descriptor (String):
            obs_descriptor used to define the rows/columns of the RDM
            defaults to one row/column per row in the dataset

    Returns:
        rsatoolbox.rdm.rdms.RDMs: RDMs object with the one RDM

    """
    patterns, pattern_desc, descriptor = _parse_input(dataset, descriptor)
    # |a - b|^2 = |a|^2 + |b|^2 - 2 a.b, for all pairs at once
    sq_norms = (patterns**2).sum(axis=1, keepdims=True)
    dissim = sq_norms + sq_norms.T \
        - 2 * np.dot(patterns, patterns.T)
    dissim = _extract_triu_(dissim) / patterns.shape[1]
    result = RDMs(
        dissimilarities=np.array([dissim]),
        dissimilarity_measure='squared euclidean',
        rdm_descriptors=deepcopy(dataset.descriptors))
    result.pattern_descriptors[descriptor] = pattern_desc
    return result
def calc_rdm_mahalanobis(dataset, descriptor=None, noise=None):
    """
    Build an RDM of squared mahalanobis distances between the patterns
    of the input dataset.

    If multiple instances of the same condition are found in the dataset
    they are averaged.

    Args:
        dataset (rsatoolbox.data.dataset.DatasetBase):
            The dataset the RDM is computed from
        descriptor (String):
            obs_descriptor used to define the rows/columns of the RDM
            defaults to one row/column per row in the dataset
        noise (numpy.ndarray):
            dataset.n_channel x dataset.n_channel
            precision matrix used to calculate the RDM
            default: identity matrix, i.e. euclidean distance

    Returns:
        rsatoolbox.rdm.rdms.RDMs: RDMs object with the one RDM

    """
    # without a precision matrix this is the euclidean case
    if noise is None:
        return calc_rdm_euclid(dataset, descriptor)

    patterns, pattern_desc, descriptor = _parse_input(dataset, descriptor)
    noise = _check_noise(noise, dataset.n_channel)
    gram = patterns @ noise @ patterns.T
    self_terms = np.diag(gram)
    dissim = np.expand_dims(self_terms, 0) \
        + np.expand_dims(self_terms, 1) \
        - 2 * gram
    dissim = _extract_triu_(dissim) / patterns.shape[1]
    result = RDMs(
        dissimilarities=np.array([dissim]),
        dissimilarity_measure='squared mahalanobis',
        rdm_descriptors=deepcopy(dataset.descriptors))
    result.pattern_descriptors[descriptor] = pattern_desc
    result.descriptors['noise'] = noise
    return result
def test_rescale_setsize(self):
    """rescale() with method='setsize' stores the expected
    'rescalingWeights' rdm_descriptor
    """
    from rsatoolbox.rdm.rdms import RDMs
    from rsatoolbox.rdm.combine import rescale
    dissims = array([
        [  1,   2, nan,   3, nan, nan],
        [nan, nan, nan,   4,   5, nan],
    ])
    expected_weights = array([
        [0.3333, 0.3333, nan, 0.3333, nan, nan],
        [   nan,    nan, nan,    0.5, 0.5, nan],
    ])
    partial_rdms = RDMs(dissimilarities=dissims)
    rescaled_rdms = rescale(partial_rdms, method='setsize')
    stored_weights = rescaled_rdms.rdm_descriptors.get('rescalingWeights')
    assert_almost_equal(stored_weights, expected_weights, decimal=4)
def calc_rdm_crossnobis(dataset, descriptor, noise=None,
                        cv_descriptor=None):
    """
    calculates an RDM from an input dataset using Cross-nobis distance
    This performs leave one out crossvalidation over the cv_descriptor.

    As the minimum input provide a dataset and a descriptor-name to
    define the rows & columns of the RDM.
    You may pass a noise precision. If you don't an identity is assumed.
    Also a cv_descriptor can be passed to define the crossvalidation folds.
    It is recommended to do this, to assure correct calculations. If you do
    not, this function infers a split in order of the dataset, which is
    guaranteed to fail if there are any unbalances.

    This function also accepts a list of noise precision matrices.
    It is then assumed that this is the precision of the mean from
    the corresponding crossvalidation fold, i.e. if multiple measurements
    enter a fold, please compute the resulting noise precision in advance!
    In that case every pair of folds is compared once, using the inverse
    of the average of the two folds' inverted precision matrices.

    To assert equal ordering in the folds the dataset is initially
    sorted according to the descriptor used to define the patterns.
    Note that this sorts the passed dataset in place, and that a
    'cv_desc' obs_descriptor is added to it when no cv_descriptor is
    given.

    Args:
        dataset (rsatoolbox.data.dataset.DatasetBase):
            The dataset the RDM is computed from
        descriptor (String):
            obs_descriptor used to define the rows/columns of the RDM.
            Required: crossvalidation needs measurements to be grouped.
        noise (numpy.ndarray):
            dataset.n_channel x dataset.n_channel
            precision matrix used to calculate the RDM
            default: identity matrix, i.e. euclidean distance
        cv_descriptor (String):
            obs_descriptor which determines the cross-validation folds

    Returns:
        rsatoolbox.rdm.rdms.RDMs: RDMs object with the one RDM

    Raises:
        ValueError: if ``descriptor`` is None

    """
    noise = _check_noise(noise, dataset.n_channel)
    if noise is None:
        noise = np.eye(dataset.n_channel)
    if descriptor is None:
        raise ValueError('descriptor must be a string! Crossvalidation '
                         'requires multiple measurements to be grouped')
    if cv_descriptor is None:
        cv_desc = _gen_default_cv_descriptor(dataset, descriptor)
        dataset.obs_descriptors['cv_desc'] = cv_desc
        cv_descriptor = 'cv_desc'
    dataset.sort_by(descriptor)
    cv_folds = np.unique(np.array(dataset.obs_descriptors[cv_descriptor]))
    rdms = []
    # noise can no longer be None here: it was replaced by the identity
    if isinstance(noise, np.ndarray) and noise.ndim == 2:
        # one shared precision matrix: leave-one-fold-out
        for fold in cv_folds:
            data_test = dataset.subset_obs(cv_descriptor, fold)
            data_train = dataset.subset_obs(cv_descriptor,
                                            np.setdiff1d(cv_folds, fold))
            measurements_train, _, _ = \
                average_dataset_by(data_train, descriptor)
            measurements_test, _, _ = \
                average_dataset_by(data_test, descriptor)
            rdm = _calc_rdm_crossnobis_single(
                measurements_train, measurements_test, noise)
            rdms.append(rdm)
    else:  # a list of noises was provided
        measurements = []
        variances = []
        for i_fold, fold in enumerate(cv_folds):
            data = dataset.subset_obs(cv_descriptor, fold)
            measurements.append(average_dataset_by(data, descriptor)[0])
            variances.append(np.linalg.inv(noise[i_fold]))
        n_folds = len(cv_folds)
        # each unordered pair of distinct folds once
        for i_fold in range(n_folds):
            for j_fold in range(i_fold + 1, n_folds):
                rdm = _calc_rdm_crossnobis_single(
                    measurements[i_fold], measurements[j_fold],
                    np.linalg.inv(
                        (variances[i_fold] + variances[j_fold]) / 2)
                    )
                rdms.append(rdm)
    rdms = np.array(rdms)
    # mean over the per-fold (or per-pair) RDMs
    rdm = np.einsum('ij->j', rdms) / rdms.shape[0]
    rdm = RDMs(dissimilarities=np.array([rdm]),
               dissimilarity_measure='crossnobis',
               rdm_descriptors=deepcopy(dataset.descriptors))
    _, desc, _ = average_dataset_by(dataset, descriptor)
    rdm.pattern_descriptors[descriptor] = desc
    rdm.descriptors['noise'] = noise
    rdm.descriptors['cv_descriptor'] = cv_descriptor
    return rdm
def calc_rdm_unbalanced(dataset, method='euclidean', descriptor=None,
                        noise=None, cv_descriptor=None,
                        prior_lambda=1, prior_weight=0.1,
                        weighting='number', enforce_same=False):
    """
    calculate a RDM from an input dataset for unbalanced datasets.

    When an iterable of datasets is passed, one RDM is computed per
    dataset and the results are concatenated.

    Args:
        dataset (rsatoolbox.data.dataset.DatasetBase):
            The dataset the RDM is computed from,
            or an iterable of such datasets
        method (String):
            a description of the dissimilarity measure (e.g. 'Euclidean')
        descriptor (String):
            obs_descriptor used to define the rows/columns of the RDM.
            If None, a copy of the dataset is used in which every
            observation gets its own 'index' value.
        noise (numpy.ndarray):
            dataset.n_channel x dataset.n_channel
            precision matrix used to calculate the RDM
            used only for Mahalanobis and Crossnobis estimators
            defaults to an identity matrix, i.e. euclidean distance.
            For an iterable of datasets, a 2-D array is shared by all
            datasets, any other iterable is indexed per dataset.
        cv_descriptor (String):
            obs_descriptor which determines the cross-validation folds.
            For 'crossnobis' and 'poisson_cv' it defaults to the 'index'
            obs_descriptor, which is added to the dataset if missing,
            and a warning is issued.
        prior_lambda (float):
            forwarded to calc_one_similarity
        prior_weight (float):
            forwarded to calc_one_similarity
        weighting (String):
            forwarded to calc_one_similarity
        enforce_same (bool):
            only passed on to the per-dataset calls for an iterable
            input; not otherwise used by this function

    Returns:
        rsatoolbox.rdm.rdms.RDMs: RDMs object with the one RDM

    """
    if isinstance(dataset, Iterable):
        # Handled before the descriptor default, because a list of
        # datasets has no obs_descriptors; each recursive call applies
        # the default to its own dataset.
        rdms = []
        for i_dat, dat in enumerate(dataset):
            if noise is None or \
                    (isinstance(noise, np.ndarray) and noise.ndim == 2):
                noise_dat = noise
            elif isinstance(noise, Iterable):
                noise_dat = noise[i_dat]
            else:
                # NOTE(review): any other kind of noise makes the
                # dataset be skipped silently, as before -- confirm
                # whether this should be an error instead
                continue
            rdms.append(
                calc_rdm_unbalanced(dat, method=method,
                                    descriptor=descriptor,
                                    noise=noise_dat,
                                    cv_descriptor=cv_descriptor,
                                    prior_lambda=prior_lambda,
                                    prior_weight=prior_weight,
                                    weighting=weighting,
                                    enforce_same=enforce_same))
        return concat(rdms)

    if descriptor is None:
        # work on a copy so that the caller's dataset is left untouched
        dataset = deepcopy(dataset)
        dataset.obs_descriptors['index'] = np.arange(dataset.n_obs)
        descriptor = 'index'
    if method in ('crossnobis', 'poisson_cv') and cv_descriptor is None:
        if 'index' not in dataset.obs_descriptors.keys():
            dataset.obs_descriptors['index'] = np.arange(dataset.n_obs)
        cv_descriptor = 'index'
        warnings.warn('cv_descriptor not set, using index for now. '
                      'This will only remove self-similarities. '
                      'Effectively this assumes independent trials')
    # fix one iteration order, used for both loops and the descriptor
    conditions = list(set(dataset.obs_descriptors[descriptor]))
    similarity_kwargs = dict(
        method=method, noise=noise, weighting=weighting,
        prior_lambda=prior_lambda, prior_weight=prior_weight,
        cv_descriptor=cv_descriptor)
    cross_sim = []
    weights = []
    self_sim = []
    for i, i_des in enumerate(conditions):
        v, _ = calc_one_similarity(
            dataset, descriptor, i_des, i_des, **similarity_kwargs)
        self_sim.append(v)
        # upper triangle only: conditions that come after i
        for j_des in conditions[i + 1:]:
            v, w = calc_one_similarity(
                dataset, descriptor, i_des, j_des, **similarity_kwargs)
            cross_sim.append(v)
            weights.append(w)
    row_idx, col_idx = row_col_indicator_rdm(len(conditions))
    self_sim = np.array(self_sim)
    cross_sim = np.array(cross_sim)
    # d(i, j) = s(i, i) + s(j, j) - 2 s(i, j)
    rdm = row_idx @ self_sim + col_idx @ self_sim - 2 * cross_sim
    rdm = RDMs(
        dissimilarities=np.array([rdm]),
        dissimilarity_measure=method,
        rdm_descriptors=deepcopy(dataset.descriptors))
    rdm.pattern_descriptors[descriptor] = conditions
    rdm.rdm_descriptors['weights'] = [weights]
    return rdm