class PDist(Measure): """Compute dissimilarity matrix for samples in a dataset This `Measure` returns the upper triangle of the n x n dissimilarity matrix defined as the pairwise distances between samples in the dataset, where n is the number of samples. """ is_trained = True # Indicate that this measure is always trained. pairwise_metric = Parameter('correlation', constraints='str', doc="""\ Distance metric to use for calculating pairwise vector distances for the dissimilarity matrix (DSM). See scipy.spatial.distance.pdist for all possible metrics.""") center_data = Parameter(False, constraints='bool', doc="""\ If True then center each column of the data matrix by subtracting the column mean from each element. This is recommended especially when using pairwise_metric='correlation'.""") square = Parameter(False, constraints='bool', doc="""\ If True, return the square distance matrix; if False, return the flattened upper triangle.""") def __init__(self, **kwargs): """ Returns ------- Dataset If square is False, contains a column vector of length = n(n-1)/2 of pairwise distances between all samples. A sample attribute ``pairs`` identifies the indices of input samples for each individual pair. If square is True, the dataset contains a square dissimilarity matrix and the entire sample attributes collection of the input dataset. """ Measure.__init__(self, **kwargs) def _call(self, ds): data = ds.samples # center data if specified if self.params.center_data: data = data - np.mean(data, 0) # get dsm dsm = pdist(data, metric=self.params.pairwise_metric) # if square return value make dsm square if self.params.square: # re-add the sample attributes -- should still be valid out = Dataset(squareform(dsm), sa=ds.sa) else: # add some attributes out = Dataset(dsm, sa=dict(pairs=list(combinations(range(len(ds)), 2)))) return out
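# A minimal sketch (not part of PyMVPA) of what PDist computes, using scipy
# directly on a hypothetical random data matrix: pairwise distances between
# samples, optionally column-centered, returned flat or as a square matrix.
def _sketch_pdist_dsm():
    import numpy as np
    from scipy.spatial.distance import pdist, squareform
    data = np.random.randn(10, 50)               # 10 samples x 50 features
    data = data - data.mean(axis=0)              # center_data=True equivalent
    dsm = pdist(data, metric='correlation')      # upper triangle, n*(n-1)/2 values
    return dsm, squareform(dsm)                  # flat vector and 10x10 matrix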
class PolyKernel(NumpyKernel): """Polynomial kernel: K(a,b) = (gamma*a*b.T+coef0)**degree""" gamma = Parameter(1, doc='Gamma scaling coefficient') degree = Parameter(2, doc="Polynomial degree") coef0 = Parameter(1, doc="Offset added to dot product before exponent") def _compute(self, d1, d2): self._k = np.power( self.params.gamma * np.dot(d1, d2.T) + self.params.coef0, self.params.degree)
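# Hedged sketch (a standalone helper, not the PyMVPA API) of the polynomial
# kernel formula used by PolyKernel._compute above.
def _sketch_poly_kernel(d1, d2, gamma=1.0, degree=2, coef0=1.0):
    import numpy as np
    # (gamma * <a, b> + coef0) ** degree evaluated for every pair of rows
    return np.power(gamma * np.dot(d1, d2.T) + coef0, degree)
# e.g. _sketch_poly_kernel(np.random.randn(5, 3), np.random.randn(4, 3)) -> (5, 4) matrix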
class SigmoidLSKernel(LSKernel): """Sigmoid kernel: K(a,b) = tanh(gamma*a*b.T + coef0)""" __kernel_type__ = _svmc.SIGMOID __kernel_name__ = 'sigmoid' gamma = Parameter(1, doc='Gamma multiplying parameter for SigmoidKernel') coef0 = Parameter(1, doc='Offset inside tanh') def __init__(self, **kwargs): # Necessary for proper docstring construction LSKernel.__init__(self, **kwargs)
class PolyLSKernel(LSKernel): """Polynomial kernel: K(a,b) = (gamma*a*b.T + coef0)**degree""" __kernel_type__ = _svmc.POLY __kernel_name__ = 'poly' gamma = Parameter(1, doc='Gamma multiplying parameter for Polynomial') degree = Parameter(2, doc='Degree of polynomial') coef0 = Parameter(1, doc='Offset inside polynomial') # aka coef0 def __init__(self, **kwargs): # Necessary for proper docstring construction LSKernel.__init__(self, **kwargs)
class MeanRemoval(Mapper): """Subtract sample mean from features.""" is_trained = True in_place = Parameter( False, doc="""If False: a copy of the dataset will be made before demeaning. If True: demeaning will be performed in-place, i.e. input data is modified. This is faster, but can have side-effects when the original dataset is used elsewhere again, and implies that floating point data types are required to prevent rounding errors in this case.""", constraints=EnsureBool()) def __init__(self, in_place=False, **kwargs): Mapper.__init__(self, **kwargs) self.in_place = in_place def _forward_data(self, data): mdata = data mean = np.mean(mdata, axis=1) if self.in_place: if not np.issubdtype(mdata.dtype, float): warning("Integer dtype. Mean removal won't work correctly for " "this implementation. Rounding errors will occur. " "Use in_place=False instead") mdata -= mean[:, np.newaxis] else: mdata = mdata - mean[:, np.newaxis] return mdata
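# Minimal numpy sketch of the demeaning step in MeanRemoval._forward_data
# (hypothetical demo data; the copying branch, i.e. in_place=False).
def _sketch_mean_removal():
    import numpy as np
    data = np.random.randn(4, 10)
    demeaned = data - data.mean(axis=1)[:, np.newaxis]   # subtract each sample's mean
    assert np.allclose(demeaned.mean(axis=1), 0.0)       # every row now has zero mean
    return demeaned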
class WithFuncChoices(ClassWithCollections): C = Parameter('choice1', constraints=EnsureChoice('choice1', np.sum), doc="documentation") # We need __init__ to get 'custom' docstring def __init__(self, **kwargs): super(type(self), self).__init__(**kwargs)
class RandomClassifier(Classifier): """Dummy classifier deciding on labels absolutely randomly """ __tags__ = ['random', 'non-deterministic', 'oneclass-binary'] same = Parameter( False, constraints='bool', doc="If a dataset is passed to predict, assign an identical (but random) label " "to all samples having the same label in the original dataset, thus mimicking the " "situation where testing samples are not independent.") def __init__(self, **kwargs): Classifier.__init__(self, **kwargs) self._ulabels = None def _train(self, data): self._ulabels = data.sa[self.get_space()].unique @accepts_dataset_as_samples def _predict(self, data): l = len(self._ulabels) # oh those lovely random estimates, for now just an estimate # per sample. Since we are random after all -- keep it random self.ca.estimates = np.random.normal(size=len(data)) if is_datasetlike(data) and self.params.same: # decide on mapping between original labels labels_map = dict( (t, rt) for t, rt in zip(self._ulabels, self._ulabels[npr.randint(0, l, size=l)])) return [labels_map[t] for t in data.sa[self.get_space()].value] else: # random one per each return self._ulabels[npr.randint(0, l, size=len(data))]
def test_deprecated_allowedtype(self): with assert_warnings( [(DeprecationWarning, "allowedtype option was deprecated in favor of constraints. " "Adjust your code, provided value 'str' was ignored")]): p = Parameter(1.0, allowedtype="str") self.assertRaises(AttributeError, lambda p: p.allowedtype, p) self.assertEqual(p.constraints, None)
class RbfSGKernel(_BasicSGKernel): """Radial basis function: K(a,b) = exp(-||a-b||**2/sigma)""" __kernel_cls__ = sgk.GaussianKernel __kernel_name__ = 'rbf' sigma = Parameter(1, doc="Width/division parameter for gaussian kernel") def __init__(self, **kwargs): # Necessary for proper docstring construction _BasicSGKernel.__init__(self, **kwargs)
class BinomialProportionCI(Mapper): """Compute binomial proportion confidence intervals This is a convenience frontend for binomial_proportion_ci_from_bool() and supports all methods implemented in this function. The confidence interval is computed independently for each feature column. The returned dataset contains two samples. The first one contains the lower CI boundary and the second sample the upper boundary. Returns ------- dataset """ is_trained = True width = Parameter(.95, constraints=EnsureFloat() & EnsureRange(min=0, max=1), doc="Confidence interval width") meth = Parameter('jeffreys', constraints=EnsureChoice('wald', 'wilson', 'agresti-coull', 'jeffreys', 'clopper-pearson', 'arc-sine', 'logit', 'anscombe'), doc="Interval estimation method") def __init__(self, **kwargs): Mapper.__init__(self, **kwargs) def _train(self, ds): pass def _forward_data(self, data): from mvpa2.misc.stats import binomial_proportion_ci_from_bool return binomial_proportion_ci_from_bool(data, axis=0, alpha=1 - self.params.width, meth=self.params.meth) def _forward_dataset(self, ds): msamp = self._forward_data(ds.samples) mds = Dataset(msamp, sa=dict(ci_boundary=['lower', 'upper'])) return mds
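# Hedged sketch only: BinomialProportionCI delegates to
# binomial_proportion_ci_from_bool() (default method 'jeffreys').  The simplest
# of the supported methods, 'wald', can be written directly as
# p_hat +/- z * sqrt(p_hat * (1 - p_hat) / n); this hypothetical helper just
# illustrates the two-row (lower, upper) result shape per feature.
def _sketch_wald_ci(bool_samples, width=0.95):
    import numpy as np
    from scipy.stats import norm
    n = bool_samples.shape[0]
    p = bool_samples.mean(axis=0)                      # proportion of successes
    z = norm.ppf(1 - (1 - width) / 2.0)                # two-sided critical value
    half = z * np.sqrt(p * (1 - p) / n)
    return np.vstack([p - half, p + half])             # rows: lower CI, upper CI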
class RbfKernel(NumpyKernel): """Radial basis function (aka Gaussian) kernel K(a,b) = exp(-||a-b||**2/sigma) """ sigma = Parameter(1.0, allowedtype=float, doc="Width parameter sigma") def _compute(self, d1, d2): # Do the Rbf self._k = np.exp(-squared_euclidean_distance(d1, d2) / self.params.sigma)
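# Minimal numpy sketch (hypothetical helper) of the RBF formula above,
# K(a,b) = exp(-||a-b||**2 / sigma), expanding the squared Euclidean distance.
def _sketch_rbf_kernel(d1, d2, sigma=1.0):
    import numpy as np
    sq = (np.sum(d1 ** 2, axis=1)[:, None]
          + np.sum(d2 ** 2, axis=1)[None, :]
          - 2 * np.dot(d1, d2.T))                      # ||a-b||**2 for all pairs
    return np.exp(-sq / sigma)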
class PolySGKernel(_BasicSGKernel): """Polynomial kernel: K(a,b) = (a*b.T + c)**degree c is 1 if and only if 'inhomogenous' is True """ __kernel_cls__ = sgk.PolyKernel __kernel_name__ = 'poly' __kp_order__ = ('degree', 'inhomogenous') degree = Parameter(2, allowedtype=int, doc="Polynomial order of the kernel") inhomogenous = Parameter(True, allowedtype=bool, doc="Whether +1 is added within the expression") if not exists('sg ge 0.6.5'): use_normalization = Parameter(False, allowedtype=bool, doc="Optional normalization") __kp_order__ = __kp_order__ + ('use_normalization',) def __init__(self, **kwargs): # Necessary for proper docstring construction _BasicSGKernel.__init__(self, **kwargs)
class RbfLSKernel(LSKernel): """Radial Basis Function kernel (aka Gaussian): K(a,b) = exp(-gamma*||a-b||**2) """ __kernel_type__ = _svmc.RBF __kernel_name__ = 'rbf' gamma = Parameter(1, doc='Gamma multiplying parameter for Rbf') def __init__(self, **kwargs): # Necessary for proper docstring construction LSKernel.__init__(self, **kwargs)
class ConstantKernel(NumpyKernel): """The constant kernel class. """ sigma_0 = Parameter(1.0, doc=""" A simple constant whose squared value is broadcast across the kernel matrix. In the case of GPR -- standard deviation of the Gaussian prior probability N(0,sigma_0**2) of the intercept of the constant regression.""") def _compute(self, data1, data2): """Compute kernel matrix. Parameters ---------- data1 : numpy.ndarray lhs data data2 : numpy.ndarray rhs data """ self._k = \ (self.params.sigma_0 ** 2) * np.ones((data1.shape[0], data2.shape[0])) ## def set_hyperparameters(self, hyperparameter): ## if hyperparameter < 0: ## raise InvalidHyperparameterError() ## self.sigma_0 = hyperparameter ## return def compute_lml_gradient(self, alphaalphaT_Kinv, data): K_grad_sigma_0 = 2 * self.params.sigma_0 # self.lml_gradient = 0.5*(np.trace(np.dot(alphaalphaT_Kinv,K_grad_sigma_0*np.ones(alphaalphaT_Kinv.shape))) # Faster formula: np.trace(np.dot(A,B)) = (A*(B.T)).sum() # Fastest when B is a constant: B*A.sum() self.lml_gradient = 0.5 * np.array( K_grad_sigma_0 * alphaalphaT_Kinv.sum()) #return self.lml_gradient def compute_lml_gradient_logscale(self, alphaalphaT_Kinv, data): K_grad_sigma_0 = 2 * self.params.sigma_0**2 self.lml_gradient = 0.5 * np.array( K_grad_sigma_0 * alphaalphaT_Kinv.sum()) #return self.lml_gradient pass
def __init__(self, **kwargs): # XXX Determine which parameters depend on each other and implement # safety/simplifying logic around them # already done for: nr_weight # thought: weight and weight_label should be a dict """Interface class to LIBSVM classifiers and regressions. Default implementation (C/nu/epsilon SVM) is chosen depending on the given parameters (C/nu/tube_epsilon). """ svm_impl = kwargs.get('svm_impl', None) # Depending on given arguments, figure out desired SVM # implementation if svm_impl is None: for arg, impl in [('tube_epsilon', 'EPSILON_SVR'), ('C', 'C_SVC'), ('nu', 'NU_SVC')]: if arg in kwargs: svm_impl = impl if __debug__: debug( 'SVM', 'No implementation was specified. Since ' '%s is given among arguments, assume %s' % (arg, impl)) break if svm_impl is None: svm_impl = 'C_SVC' if __debug__: debug('SVM', 'Assign C_SVC "by default"') kwargs['svm_impl'] = svm_impl # init base class _SVM.__init__(self, **kwargs) self._svm_type = self._KNOWN_IMPLEMENTATIONS[svm_impl][0] if 'nu' in self._KNOWN_PARAMS and 'epsilon' in self._KNOWN_PARAMS: # overwrite eps param with new default value (information # taken from libSVM docs self.params['epsilon']._set_default(0.001) self.params['nr_weight'] = Parameter(len(self.params['weight'].value)) self.__model = None """Holds the trained SVM."""
def __init__(self, kernel_cls, kernel_params=[], **kwargs): """Initialize CustomSGKernel. Parameters ---------- kernel_cls : Shogun.Kernel Class of a Kernel from Shogun kernel_params : list Each item in this list should be a tuple of (kernelparamname, value), and the order is the explicit order required by the Shogun constructor """ self.__kernel_cls__ = kernel_cls # These are normally static _BasicSGKernel.__init__(self, **kwargs) order = [] for k, v in kernel_params: self.params[k] = Parameter(default=v) order.append(k) self.__kp_order__ = tuple(order)
class ConnectivityHyperalignment(SearchlightHyperalignment): """ Given a list of datasets, provide a list of mappers into common space using connectivity based hyperalignment. This time on Surface!!! - Compute the mean time-series for each connectivity target. - Use these mean time-series to align each target region and get `npc` PC time-series per region that are aligned across individuals (optional). - Compute a connectivity profile for each feature (e.g., vertex) depicting its connectivities to the targets. If `npc` is None, the mean time-series of each target is used; otherwise, the `npc` PC time-series are used. - Use SL HA to align the whole cortex based on connectivity profiles. See :ref:`Guntupalli et al., Plos Comp. Bio (2018)` for details. """ mask_ids = Parameter( None, constraints=EnsureListOf(int) | EnsureNone(), doc="""You can specify a mask to compute searchlight hyperalignment only within this mask..""") seed_indices = Parameter( None, constraints=EnsureListOf(int) | EnsureNone(), doc="""A list of node indices that correspond to seed centers for seed queryengines. If None, all centers of seed_queryengines are used.""") seed_queryengines = Parameter( None, doc="""A list of queryengines to determine seed searchlights for connectomes. If a single queryengine is given in the list, then it is assumed that it applies to all datasets.""") seed_radius = Parameter( None, constraints=EnsureInt() & EnsureRange(min=1) | EnsureNone(), doc=""" Radius in voxels for seed size in volume.""") conn_metric = Parameter( lambda x, y: np.dot(x.samples.T, y.samples) / x.nsamples, # doc="""How to compute the connectivity metric between features. Default is the dot product of samples (which on zscored data becomes correlation if you normalize by nsamples.""") npcs = Parameter( 3, constraints=EnsureInt() & EnsureRange(min=1) | EnsureNone(), doc= """Maximum number of PCs to be considered in each surface searchlight. If None, use seed mean instead of PCs. """) connectomes = Parameter( None, constraints=EnsureStr() | EnsureNone(), doc="""Precomputed connectomes supplied as hdf5 filename (for now). It is expected to be a dictionary with key 'hmappers' (for now).""" ) common_model = Parameter( None, constraints=EnsureStr() | EnsureNone(), doc="""Precomputed common model supplied as hdf5 filename (for now). It is expected to be a dict with feature-targets connectome and common models in each target ROI with appropriate pcs (for now). Expects 'local_models' and 'connectome_model' keys.""") save_model = Parameter( None, constraints=EnsureStr() | EnsureNone(), doc="""Precomputed common model supplied as hdf5 filename (for now). 
It is expected to be a tuple with feature-targets connectome and common models in each target ROI with appropriate pcs (for now).""" ) def __init__(self, **kwargs): SearchlightHyperalignment.__init__(self, **kwargs) def _get_seed_means(self, measure, queryengine, dataset, seed_indices): # Computing seed data as mean timeseries in each SL seed_data = Searchlight(measure, queryengine=queryengine, nproc=self.params.nproc, roi_ids=seed_indices) seed_data = seed_data(dataset) zscore(seed_data, chunks_attr=None) return seed_data def _get_sl_connectomes(self, seed_means, qe_all, datasets, inode, connectivity_mapper): # For each SL, computing connectivity of features to seed means sl_connectomes = [] # Looping over each subject for seed_mean, qe_, sd in zip(seed_means, qe_all, datasets): connectivity_mapper.train(seed_mean) sl_ids = qe_[inode] if is_datasetlike(sl_ids): assert (sl_ids.nsamples == 1) sl_ids = sl_ids.samples[0, :].tolist() sl_connectomes.append(connectivity_mapper.forward(sd[:, sl_ids])) return sl_connectomes def _get_hypesvs(self, sl_connectomes, local_common_model=None): ''' Hyperalign connectomes and return mapppers and trained SVDMapper of common space. Parameters ---------- sl_connectomes: a list of connectomes to hyperalign local_common_model: a reference common model to be used. Returns ------- a tuple (sl_hmappers, svm, local_common_model) sl_hmappers: a list of mappers corresponding to input list in that order. svm: a svm mapper based on the input data. if given a common model, this is None. local_common_model: If local_common_model is provided as input, this will be None. Otherwise, local_common_model will be computed here and returned. ''' # TODO Should we z-score sl_connectomes? return_model = False if self.params.save_model is None else True if local_common_model is not None: ha = Hyperalignment(level2_niter=0) if not is_datasetlike(local_common_model): local_common_model = Dataset(samples=local_common_model) ha.train([local_common_model]) sl_hmappers = ha(sl_connectomes) return sl_hmappers, None, None ha = Hyperalignment() sl_hmappers = ha(sl_connectomes) sl_connectomes = [ slhm.forward(slc) for slhm, slc in zip(sl_hmappers, sl_connectomes) ] _ = [zscore(slc, chunks_attr=None) for slc in sl_connectomes] sl_connectomes = np.dstack(sl_connectomes).mean(axis=-1) svm = SVDMapper(force_train=True) svm.train(sl_connectomes) if return_model: local_common_model = svm.forward(sl_connectomes) else: local_common_model = None return sl_hmappers, svm, local_common_model def _get_connectomes(self, datasets): params = self.params # If no precomputed connectomes are supplied, compute them. 
if params.connectomes is not None and os.path.exists( params.connectomes): _chpaldebug("Loading pre-computed connectomes from ", params.connectomes) connectomes = h5load(params.connectomes) return connectomes connectivity_mapper = FxyMapper(params.conn_metric) # Initializing datasets with original anatomically aligned datasets mfm = MeanFeatureMeasure() # TODO Handle seed_radius if seed queryengines are not provided seed_radius = params.seed_radius _chpaldebug( "Performing surface connectivity hyperalignment with seeds") _chpaldebug("Computing connectomes.") ndatasets = len(datasets) if params.seed_queryengines is None: raise NotImplementedError("For now, we need seed queryengines.") qe_all = super(ConnectivityHyperalignment, self)._get_trained_queryengines( datasets, params.seed_queryengines, seed_radius, params.ref_ds) # If seed_indices are not supplied, use all as centers if not params.seed_indices: roi_ids = super(ConnectivityHyperalignment, self)._get_verified_ids(qe_all) else: roi_ids = params.seed_indices if len(qe_all) == 1: qe_all *= ndatasets # Computing Seed means to be used for aligning seed features seed_means = [ self._get_seed_means(MeanFeatureMeasure(), qe, ds, params.seed_indices) for qe, ds in zip(qe_all, datasets) ] if params.npcs is None: conn_targets = [] for seed_mean in seed_means: zscore(seed_mean, chunks_attr=None) conn_targets.append(seed_mean) else: # compute all PC-seed connectivity in each subject # 1. make common model SVs in each seed SL based on connectivity to seed_means # 2. Use these SVs for computing connectomes _chpaldebug("Aligning SVs in each searchlight across subjects") # Looping over all seeds in which SVD is done pc_data = [[] for isub in range(ndatasets)] sl_common_models = dict() if params.common_model is not None and os.path.exists( params.common_model): _chpaldebug("Loading common model from %s" % params.common_model) common_model = h5load(params.common_model) sl_common_models = common_model['local_models'] for inode in roi_ids: # For each SL, computing connectivity of features to seed means # This line below doesn't need common model sl_connectomes = self._get_sl_connectomes( seed_means, qe_all, datasets, inode, connectivity_mapper) # Hyperalign connectomes in SL # XXX TODO Common model input to below function should be updated. 
local_common_model = sl_common_models[inode][:, :params.npcs] \ if params.common_model else None sl_hmappers, svm, sl_common_model = self._get_hypesvs( sl_connectomes, local_common_model=local_common_model) if sl_common_model is not None: sl_common_models[inode] = sl_common_model # make common model SV timeseries data in each subject for sd, slhm, qe, pcd in zip(datasets, sl_hmappers, qe_all, pc_data): sd_svs = slhm.forward(sd[:, qe[inode]]) zscore(sd_svs, chunks_attr=None) if svm is not None: sd_svs = svm.forward(sd_svs) sd_svs = sd_svs[:, :params.npcs] zscore(sd_svs, chunks_attr=None) pcd.append(sd_svs) if params.save_model is not None: # TODO: should use debug print('Saving local models to %s' % params.save_model) h5save(params.save_model, sl_common_models) pc_data = [hstack(pcd) for pcd in pc_data] conn_targets = pc_data #print pc_data[-1] # compute connectomes using connectivity targets (PCs or seed means) connectomes = [] if params.common_model is not None and os.path.exists( params.common_model): # TODO: should use debug print('Loading from saved common model: %s' % params.common_model) connectome_model = common_model['connectome_model'] connectomes.append(connectome_model) for t_, ds in zip(conn_targets, datasets): connectivity_mapper.train(t_) connectome = connectivity_mapper.forward(ds) t_ = None connectome.fa = ds.fa if connectome.samples.dtype == 'float64': connectome.samples = connectome.samples.astype('float32') zscore(connectome, chunks_attr=None) connectomes.append(connectome) if params.connectomes is not None and not os.path.exists( params.connectomes): _chpaldebug("Saving connectomes to ", params.connectomes) h5save(params.connectomes, connectomes) return connectomes @due.dcite(Doi('10.1371/journal.pcbi.1006120'), description="Connectivity-based hyperalignment", tags=["implementation"]) def __call__(self, datasets): """Estimate mappers for each dataset Parameters ---------- datasets : list or tuple of datasets Returns ------- A list of trained Mappers of the same length as datasets """ connectomes = self._get_connectomes(datasets) # TODO Add assertion about nsamples matching across connectomes _chpaldebug( "Performing hyperalignment of %d connectomes with %d samples" % (len(connectomes), connectomes[0].nsamples)) _chpaldebug("Running searchlight hyperalignment") conhypmappers = super(ConnectivityHyperalignment, self).__call__(connectomes) _chpaldebug("Finished Connectivity hyperalignment. Returning mappers.") return conhypmappers
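# Hedged sketch of the default conn_metric above: on column-wise z-scored data,
# dot(x.T, y) / nsamples is exactly the Pearson correlation between every
# feature in x and every seed time series in y (demo arrays are hypothetical).
def _sketch_conn_metric():
    import numpy as np
    nsamples = 100
    x = np.random.randn(nsamples, 20)                  # dataset features
    y = np.random.randn(nsamples, 5)                   # seed mean time series
    xz = (x - x.mean(0)) / x.std(0)                    # z-score columns
    yz = (y - y.mean(0)) / y.std(0)
    conn = np.dot(xz.T, yz) / nsamples                 # (20, 5) correlation matrix
    return conn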
class PDistTargetSimilarity(Measure): """Calculate the correlation of a PDist measure with a target Target dissimilarity correlation `Measure`. Computes the correlation between the dissimilarity matrix defined over the pairwise distances between the samples of a dataset and the target dissimilarity matrix. """ is_trained = True """Indicate that this measure is always trained.""" pairwise_metric = Parameter('correlation', constraints='str', doc="""\ Distance metric to use for calculating pairwise vector distances for the dissimilarity matrix (DSM). See scipy.spatial.distance.pdist for all possible metrics.""") comparison_metric = Parameter('pearson', constraints=EnsureChoice('pearson', 'spearman'), doc="""\ Similarity measure to be used for comparing the dataset DSM with the target DSM.""") center_data = Parameter(False, constraints='bool', doc="""\ If True then center each column of the data matrix by subtracting the column mean from each element. This is recommended especially when using pairwise_metric='correlation'.""") corrcoef_only = Parameter(False, constraints='bool', doc="""\ If True, return only the correlation coefficient (rho), otherwise return rho and probability, p.""") def __init__(self, target_dsm, **kwargs): """ Parameters ---------- target_dsm : array (length N*(N-1)/2) Target dissimilarity matrix Returns ------- Dataset If ``corrcoef_only`` is True, contains one feature: the correlation coefficient (rho); otherwise two features: rho plus p. """ # init base classes first Measure.__init__(self, **kwargs) self.target_dsm = target_dsm if self.params.comparison_metric == 'spearman': self.target_dsm = rankdata(target_dsm) def _call(self, dataset): data = dataset.samples if self.params.center_data: data = data - np.mean(data, 0) dsm = pdist(data, self.params.pairwise_metric) if self.params.comparison_metric == 'spearman': dsm = rankdata(dsm) rho, p = pearsonr(dsm, self.target_dsm) if self.params.corrcoef_only: return Dataset([rho], fa={'metrics': ['rho']}) else: return Dataset([[rho, p]], fa={'metrics': ['rho', 'p']})
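# Minimal scipy sketch (hypothetical data) of what PDistTargetSimilarity._call
# computes: correlate the dataset's DSM with a target DSM, with rank
# transformation when comparison_metric='spearman'.
def _sketch_dsm_target_similarity():
    import numpy as np
    from scipy.spatial.distance import pdist
    from scipy.stats import pearsonr, rankdata
    data = np.random.randn(8, 30)
    target_dsm = np.random.rand(8 * 7 // 2)            # length n*(n-1)/2
    dsm = pdist(data, 'correlation')
    rho, p = pearsonr(dsm, target_dsm)                 # 'pearson' comparison
    rho_s, p_s = pearsonr(rankdata(dsm), rankdata(target_dsm))  # 'spearman'
    return (rho, p), (rho_s, p_s)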
class Classifier(Learner): """Abstract classifier class to be inherited by all classifiers """ # Kept separate from the docstring so as not to pollute help(clf), especially if # we include help for the parent class _DEV__doc__ = """ Required behavior: For every classifier it has to be possible to be instantiated without having to specify the training pattern. Repeated calls to the train() method with different training data have to result in a valid classifier, trained for the particular dataset. It must be possible to specify all classifier parameters as keyword arguments to the constructor. Recommended behavior: Derived classifiers should provide access to *estimates* -- i.e. the information that is finally used to determine the predicted class label. Michael: Maybe it works well if each classifier provides an 'estimates' state member. This variable is a list as long as, and in the same order as, Dataset.uniquetargets (training data). Each item in the list corresponds to the likelihood of a sample belonging to the respective class. However the semantics might differ between classifiers, e.g. kNN would probably store distances to class- neighbors, whereas PLR would store the raw function value of the logistic function. So in the case of kNN low is predictive and for PLR high is predictive. Don't know if there is a need to unify that. As the storage and/or computation of this information might be demanding, its collection should be switchable and off by default. Nomenclature * predictions : result of the last call to .predict() * estimates : might be different from predictions if a classifier's predict() makes a decision based on some internal value such as probability or a distance. """ # Dict that contains the parameters of a classifier. # This shall provide an interface to plug a generic parameter optimizer # into all classifiers (e.g. grid- or line-search optimizer) # A dictionary is used because Michael thinks that access by name is nicer. # Additionally Michael thinks ATM that additional information might be # necessary in some situations (e.g. reasonably predefined parameter range, # minimal iteration stepsize, ...), therefore the value to each key should # also be a dict or we should use mvpa2.base.param.Parameter'...
training_stats = ConditionalAttribute( enabled=False, doc="Confusion matrix of learning performance") predictions = ConditionalAttribute(enabled=True, doc="Most recent set of predictions") estimates = ConditionalAttribute( enabled=True, doc="Internal classifier estimates the most recent " + "predictions are based on") predicting_time = ConditionalAttribute( enabled=True, doc="Time (in seconds) which took classifier to predict") __tags__ = [] """Describes some specifics about the classifier -- is that it is doing regression for instance....""" # TODO: make it available only for actually retrainable classifiers retrainable = Parameter( False, constraints='bool', doc="""Either to enable retraining for 'retrainable' classifier.""", index=1002) def __init__(self, space=None, **kwargs): # by default we want classifiers to use the 'targets' sample attribute # for training/testing if space is None: space = 'targets' Learner.__init__(self, space=space, **kwargs) # XXX # the place to map literal to numerical labels (and back) # this needs to be in the base class, since some classifiers also # have this nasty 'regression' mode, and the code in this class # needs to deal with converting the regression output into discrete # labels # however, preferably the mapping should be kept in the respective # low-level implementations that need it self._attrmap = AttributeMap() self.__trainednfeatures = 0 """Stores number of features for which classifier was trained. If 0 -- it wasn't trained at all""" self._set_retrainable(self.params.retrainable, force=True) # deprecate #self.__trainedidhash = None #"""Stores id of the dataset on which it was trained to signal #in trained() if it was trained already on the same dataset""" @property def __summary_class__(self): if 'regression' in self.__tags__: return RegressionStatistics else: return ConfusionMatrix @property def __is_regression__(self): return 'regression' in self.__tags__ def __str__(self, *args, **kwargs): if __debug__ and 'CLF_' in debug.active: return "%s / %s" % (repr(self), super(Classifier, self).__str__()) else: return _str(self, *args, **kwargs) def _pretrain(self, dataset): """Functionality prior to training """ # So we reset all conditional attributes and may be free up some memory # explicitly params = self.params if not params.retrainable: self.untrain() else: # just reset the ca, do not untrain self.ca.reset() if not self.__changedData_isset: self.__reset_changed_data() _changedData = self._changedData __idhashes = self.__idhashes __invalidatedChangedData = self.__invalidatedChangedData # if we don't know what was changed we need to figure # them out if __debug__: debug('CLF_', "IDHashes are %s", (__idhashes, )) # Look at the data if any was changed for key, data_ in (('traindata', dataset.samples), ('targets', dataset.sa[self.get_space()].value)): _changedData[key] = self.__was_data_changed(key, data_) # if those idhashes were invalidated by retraining # we need to adjust _changedData accordingly if __invalidatedChangedData.get(key, False): if __debug__ and not _changedData[key]: debug( 'CLF_', 'Found that idhash for %s was ' 'invalidated by retraining', (key, )) _changedData[key] = True # Look at the parameters for col in self._paramscols: changedParams = self._collections[col].which_set() if len(changedParams): _changedData[col] = changedParams self.__invalidatedChangedData = {} # reset it on training if __debug__: debug('CLF_', "Obtained _changedData is %s", (self._changedData, )) def _posttrain(self, dataset): """Functionality post training 
For instance -- computing confusion matrix. Parameters ---------- dataset : Dataset Data which was used for training """ super(Classifier, self)._posttrain(dataset) ca = self.ca # needs to be assigned first since below we use predict self.__trainednfeatures = dataset.nfeatures if __debug__ and 'CHECK_TRAINED' in debug.active: self.__trainedidhash = dataset.idhash if ca.is_enabled('training_stats') and \ not ca.is_set('training_stats'): # we should not store predictions for training data, # it is confusing imho (yoh) ca.change_temporarily(disable_ca=["predictions"]) if self.params.retrainable: # we would need to recheck if data is the same, # XXX think if there is a way to make this all # efficient. For now, probably, retrainable # classifiers have no chance but not to use # training_stats... sad self.__changedData_isset = False predictions = self.predict(dataset) ca.reset_changed_temporarily() targets = dataset.sa[self.get_space()].value if is_datasetlike(predictions) and (self.get_space() in predictions.fa): # e.g. in case of pair-wise uncombined results - provide # stats per each of the targets pairs prediction_targets = predictions.fa[self.get_space()].value ca.training_stats = dict( (t, self.__summary_class__(targets=targets, predictions=predictions.samples[:, i]) ) for i, t in enumerate(prediction_targets)) else: ca.training_stats = self.__summary_class__( targets=targets, predictions=predictions) def summary(self): """Providing summary over the classifier""" s = "Classifier %s" % self ca = self.ca ca_enabled = ca.enabled if self.trained: s += "\n trained" if ca.is_set('training_time'): s += ' in %.3g sec' % ca.training_time s += ' on data with' if ca.is_set('trained_targets'): s += ' targets:%s' % list(ca.trained_targets) nsamples, nchunks = None, None if ca.is_set('trained_nsamples'): nsamples = ca.trained_nsamples if ca.is_set('trained_dataset'): td = ca.trained_dataset nsamples, nchunks = td.nsamples, len(td.sa['chunks'].unique) if nsamples is not None: s += ' #samples:%d' % nsamples if nchunks is not None: s += ' #chunks:%d' % nchunks s += " #features:%d" % self.__trainednfeatures if ca.is_set('training_stats'): s += ", training error:%.3g" % ca.training_stats.error else: s += "\n not yet trained" if len(ca_enabled): s += "\n enabled ca:%s" % ', '.join( [str(ca[x]) for x in ca_enabled]) return s def clone(self): """Create full copy of the classifier. It might require classifier to be untrained first due to present SWIG bindings. 
TODO: think about proper re-implementation, without enrollment of deepcopy """ if __debug__: debug("CLF", "Cloning %s%s", (self, _strid(self))) try: return deepcopy(self) except: self.untrain() return deepcopy(self) def _train(self, dataset): """Function to be actually overridden in derived classes """ raise NotImplementedError def _prepredict(self, dataset): """Functionality prior prediction """ if not ('notrain2predict' in self.__tags__): # check if classifier was trained if that is needed if not self.trained: raise FailedToPredictError( "Classifier %s wasn't yet trained, therefore can't " "predict" % self) nfeatures = dataset.nfeatures #data.shape[1] # check if number of features is the same as in the data # it was trained on if nfeatures != self.__trainednfeatures: raise ValueError, \ "Classifier %s was trained on data with %d features, " % \ (self, self.__trainednfeatures) + \ "thus can't predict for %d features" % nfeatures if self.params.retrainable: if not self.__changedData_isset: self.__reset_changed_data() _changedData = self._changedData data = np.asanyarray(dataset.samples) _changedData['testdata'] = \ self.__was_data_changed('testdata', data) if __debug__: debug('CLF_', "prepredict: Obtained _changedData is %s", (_changedData, )) def _postpredict(self, dataset, result): """Functionality after prediction is computed """ self.ca.predictions = result if self.params.retrainable: self.__changedData_isset = False def _predict(self, dataset): """Actual prediction """ raise NotImplementedError @accepts_samples_as_dataset def predict(self, dataset): """Predict classifier on data Shouldn't be overridden in subclasses unless explicitly needed to do so. Also subclasses trying to call super class's predict should call _predict if within _predict instead of predict() since otherwise it would loop """ ## ??? yoh: changed to asany from as without exhaustive check data = np.asanyarray(dataset.samples) if __debug__: # Verify that we have no NaN/Inf's which we do not "support" ATM if not np.all(np.isfinite(data)): raise ValueError( "Some input data for predict is not finite (NaN or Inf)") debug("CLF", "Predicting classifier %s on ds %s", (self, dataset)) # remember the time when started computing predictions t0 = time.time() ca = self.ca # to assure that those are reset (could be set due to testing # post-training) ca.reset(['estimates', 'predictions']) self._prepredict(dataset) if self.__trainednfeatures > 0 \ or 'notrain2predict' in self.__tags__: result = self._predict(dataset) else: warning( "Trying to predict using classifier trained on no features") if __debug__: debug("CLF", "No features were present for training, prediction is " \ "bogus") result = [None] * data.shape[0] ca.predicting_time = time.time() - t0 # with labels mapping in-place, we also need to go back to the # literal labels if self._attrmap: try: result = self._attrmap.to_literal(result) except KeyError, e: raise FailedToPredictError, \ "Failed to convert predictions from numeric into " \ "literals: %s" % e self._postpredict(dataset, result) return result
class ProcrusteanMapper(ProjectionMapper): """Mapper to project from one space to another using Procrustean transformation (shift + scaling + rotation). Training this mapper requires data for both source and target space to be present in the training dataset. The source space data is taken from the training dataset's ``samples``, while the target space is taken from a sample attribute corresponding to the ``space`` setting of the ProcrusteanMapper. See: http://en.wikipedia.org/wiki/Procrustes_transformation """ scaling = Parameter( True, constraints='bool', doc="""Estimate a global scaling factor for the transformation (no longer rigid body)""") reflection = Parameter( True, constraints='bool', doc="""Allow for the data to be reflected (so it might not be a rotation. Effective only for non-oblique transformations. """) reduction = Parameter( True, constraints='bool', doc="""If true, it is allowed to map into lower-dimensional space. Forward transformation might be suboptimal then and reverse transformation might not recover all original variance.""") oblique = Parameter( False, constraints='bool', doc="""Either to allow non-orthogonal transformation -- might heavily overfit the data if there is less samples than dimensions. Use `oblique_rcond`.""") oblique_rcond = Parameter( -1, constraints='float', doc="""Cutoff for 'small' singular values to regularize the inverse. See :class:`~numpy.linalg.lstsq` for more information.""") svd = Parameter( 'numpy', constraints=EnsureChoice('numpy', 'scipy', 'dgesvd'), doc="""Implementation of SVD to use. dgesvd requires ctypes to be available.""") def __init__(self, space='targets', **kwargs): ProjectionMapper.__init__(self, space=space, **kwargs) self._scale = None """Estimated scale""" if self.params.svd == 'dgesvd' and not externals.exists( 'liblapack.so'): warning( "Reverting choice of svd for ProcrusteanMapper to be default " "'numpy' since liblapack.so seems not to be available for " "'dgesvd'") self.params.svd = 'numpy' def _train(self, source): params = self.params # Since it is unsupervised, we don't care about labels datas = () odatas = () means = () shapes = () assess_residuals = __debug__ and 'MAP_' in debug.active target = source.sa[self.get_space()].value for i, ds in enumerate((source, target)): if is_datasetlike(ds): data = np.asarray(ds.samples) else: data = ds if assess_residuals: odatas += (data, ) if self._demean: if i == 0: mean = self._offset_in else: mean = data.mean(axis=0) data = data - mean else: # no demeaning === zero means mean = np.zeros(shape=data.shape[1:]) means += (mean, ) datas += (data, ) shapes += (data.shape, ) # shortcuts for sizes sn, sm = shapes[0] tn, tm = shapes[1] # Check the sizes if sn != tn: raise ValueError, "Data for both spaces should have the same " \ "number of samples. Got %d in source and %d in target space" \ % (sn, tn) # Sums of squares ssqs = [np.sum(d**2, axis=0) for d in datas] # XXX check for being invariant? 
# needs to be tuned up properly and not raise but handle for i in xrange(2): if np.all(ssqs[i] <= np.abs((np.finfo(datas[i].dtype).eps * sn * means[i])**2)): raise ValueError, "For now do not handle invariant in time datasets" norms = [np.sqrt(np.sum(ssq)) for ssq in ssqs] normed = [data / norm for (data, norm) in zip(datas, norms)] # add new blank dimensions to source space if needed if sm < tm: normed[0] = np.hstack((normed[0], np.zeros((sn, tm - sm)))) if sm > tm: if params.reduction: normed[1] = np.hstack((normed[1], np.zeros((sn, sm - tm)))) else: raise ValueError, "reduction=False, so mapping from " \ "higher dimensionality " \ "source space is not supported. Source space had %d " \ "while target %d dimensions (features)" % (sm, tm) source, target = normed if params.oblique: # Just do silly linear system of equations ;) or naive # inverse problem if sn == sm and tm == 1: T = np.linalg.solve(source, target) else: T = np.linalg.lstsq(source, target, rcond=params.oblique_rcond)[0] ss = 1.0 else: # Orthogonal transformation # figure out optimal rotation if params.svd == 'numpy': U, s, Vh = np.linalg.svd(np.dot(target.T, source), full_matrices=False) elif params.svd == 'scipy': # would raise exception if not present externals.exists('scipy', raise_=True) import scipy U, s, Vh = scipy.linalg.svd(np.dot(target.T, source), full_matrices=False) elif params.svd == 'dgesvd': from mvpa2.support.lapack_svd import svd as dgesvd U, s, Vh = dgesvd(np.dot(target.T, source), full_matrices=True, algo='svd') else: raise ValueError('Unknown type of svd %r' % (params.svd)) T = np.dot(Vh.T, U.T) if not params.reflection: # then we need to assure that it is only rotation # "recipe" from # http://en.wikipedia.org/wiki/Orthogonal_Procrustes_problem # for more and info and original references, see # http://dx.doi.org/10.1007%2FBF02289451 s_new = np.ones_like(s) s_new[-1] = np.linalg.det(T) T = np.dot(Vh.T * s_new, U.T) # figure out scale and final translation if not params.reflection: ss = np.sum(s_new * s) else: ss = np.sum(s) # if we were to collect standardized distance # std_d = 1 - sD**2 # select out only relevant dimensions if sm != tm: T = T[:sm, :tm] self._scale = scale = ss * norms[1] / norms[0] # Assign projection if self.params.scaling: proj = scale * T else: proj = T self._proj = proj if self._demean: self._offset_out = means[1] if __debug__ and 'MAP_' in debug.active: # compute the residuals res_f = self.forward(odatas[0]) d_f = np.linalg.norm(odatas[1] - res_f) / np.linalg.norm(odatas[1]) res_r = self.reverse(odatas[1]) d_r = np.linalg.norm(odatas[0] - res_r) / np.linalg.norm(odatas[0]) debug( 'MAP_', "%s, residuals are forward: %g," " reverse: %g" % (repr(self), d_f, d_r)) def _compute_recon(self): """For Procrustean mapper, inverse is transpose. So, let's skip computing inverse in the super class. """ # XXX Change pinv to superclass compute_recon? if self.params.oblique: #return ProjectionMapper._compute_recon(self) return np.linalg.pinv(self._proj) else: return np.transpose( self._proj / self._scale**2) if self.params.scaling else np.transpose( self._proj)
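# Hedged numpy sketch of the orthogonal branch of ProcrusteanMapper._train
# (scaling and reflection allowed, equal dimensionality, no reduction or
# oblique handling); names and demo data are hypothetical.
def _sketch_procrustes(source, target, scaling=True):
    import numpy as np
    sm, tm = source.mean(axis=0), target.mean(axis=0)
    S, T = source - sm, target - tm                     # demean both spaces
    ns, nt = np.linalg.norm(S), np.linalg.norm(T)
    S, T = S / ns, T / nt                               # normalize Frobenius norms
    U, s, Vh = np.linalg.svd(np.dot(T.T, S), full_matrices=False)
    R = np.dot(Vh.T, U.T)                               # optimal rotation/reflection
    scale = s.sum() * nt / ns if scaling else 1.0
    return lambda X: np.dot(X - sm, scale * R) + tm     # forward mapping
# e.g. for a rotated, scaled, shifted copy of some array src:
#   f = _sketch_procrustes(src, 2.0 * np.dot(src, rot) + 1.0); f(src) recovers
#   the target up to numerical precision.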
class GDA(Classifier): """Gaussian Discriminant Analysis -- base for LDA and QDA """ __tags__ = ['binary', 'multiclass', 'oneclass'] prior = Parameter('laplacian_smoothing', constraints=EnsureChoice('laplacian_smoothing', 'uniform', 'ratio'), doc="""How to compute prior distribution.""") allow_pinv = Parameter( True, constraints='bool', doc="""Allow pseudo-inverse in case of degenerate covariance(s).""") def __init__(self, **kwargs): """Initialize a GDA classifier. """ # init base class first Classifier.__init__(self, **kwargs) # pylint friendly initializations self.means = None """Means of features per class""" self.cov = None """Co-variances per class, but "vars" is taken ;)""" self.ulabels = None """Labels classifier was trained on""" self.priors = None """Class probabilities""" self.nsamples_per_class = None """Number of samples per class - used by derived classes""" # Define internal state of classifier self._norm_weight = None def _get_priors(self, nlabels, nsamples, nsamples_per_class): """Return prior probabilities given data """ prior = self.params.prior if prior == 'uniform': priors = np.ones((nlabels, )) / nlabels elif prior == 'laplacian_smoothing': priors = (1+np.squeeze(nsamples_per_class)) \ / (float(nsamples) + nlabels) elif prior == 'ratio': priors = np.squeeze(nsamples_per_class) / float(nsamples) else: raise ValueError, \ "No idea on how to handle '%s' way to compute priors" \ % self.params.prior return np.atleast_1d(priors) def _train(self, dataset): """Train the classifier using `dataset` (`Dataset`). """ params = self.params targets_sa_name = self.get_space() targets_sa = dataset.sa[targets_sa_name] # get the dataset information into easy vars X = dataset.samples labels = targets_sa.value self.ulabels = ulabels = targets_sa.unique nlabels = len(ulabels) label2index = dict((l, il) for il, l in enumerate(ulabels)) # set the feature dimensions nsamples = len(X) nfeatures = dataset.nfeatures self.means = means = \ np.zeros((nlabels, nfeatures)) # degenerate dimension are added for easy broadcasting later on # XXX might want to remove -- for now taken from GNB as is self.nsamples_per_class = nsamples_per_class \ = np.zeros((nlabels, 1)) self.cov = cov = \ np.zeros((nlabels, nfeatures, nfeatures)) # Estimate cov # better loop than repmat! ;) for l, il in label2index.iteritems(): Xl = X[labels == l] nsamples_per_class[il] = len(Xl) # TODO: degenerate case... no samples for known label for # some reason? means[il] = np.mean(Xl, axis=0) # since we have means already lets do manually cov here Xldm = Xl - means[il] cov[il] = np.dot(Xldm.T, Xldm) # scaling will be done correspondingly in LDA or QDA # Store prior probabilities self.priors = self._get_priors(nlabels, nsamples, nsamples_per_class) if __debug__ and 'GDA' in debug.active: debug( 'GDA', "training finished on data.shape=%s " % (X.shape, ) + "min:max(data)=%f:%f" % (np.min(X), np.max(X))) def _untrain(self): """Untrain classifier and reset all learnt params """ self.means = None self.cov = None self.ulabels = None self.priors = None super(GDA, self)._untrain() @accepts_dataset_as_samples def _predict(self, data): """Predict the output for the provided data. """ params = self.params self.ca.estimates = prob_cs_cp = self._g_k(data) # Take the class with maximal (log)probability # XXX in GNB it is axis=0, i.e. 
classes were first winners = prob_cs_cp.argmax(axis=1) predictions = [self.ulabels[c] for c in winners] if __debug__ and 'GDA' in debug.active: debug( 'GDA', "predict on data.shape=%s min:max(data)=%f:%f " % (data.shape, np.min(data), np.max(data))) return predictions def _inv(self, cov): try: return np.linalg.inv(cov) except Exception, e: if self.params.allow_pinv: try: return np.linalg.pinv(cov) except Exception, e: pass raise DegenerateInputError, \ "Data is probably singular, since inverse fails. Got %s"\ % (e,)
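# Small numpy sketch of GDA._get_priors for the three supported settings
# (class counts below are hypothetical).
def _sketch_gda_priors():
    import numpy as np
    nsamples_per_class = np.array([30., 10., 60.])
    nsamples, nlabels = nsamples_per_class.sum(), len(nsamples_per_class)
    return {
        'uniform': np.ones(nlabels) / nlabels,
        'laplacian_smoothing': (1 + nsamples_per_class) / (nsamples + nlabels),
        'ratio': nsamples_per_class / nsamples,
    }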
class LinearSVMWeights(Sensitivity): """`SensitivityAnalyzer` for the LIBSVM implementation of a linear SVM. """ _ATTRIBUTE_COLLECTIONS = ['params'] split_weights = Parameter( False, allowedtype='bool', doc="If binary classification either to sum SVs per each " "class separately. Note: be careful with interpretation" " of the values") def __init__(self, clf, **kwargs): """Initialize the analyzer with the classifier it shall use. Parameters ---------- clf : LinearSVM classifier to use. Only classifiers sub-classed from `LinearSVM` may be used. """ # init base classes first Sensitivity.__init__(self, clf, **kwargs) def _call(self, dataset, callables=[]): # local bindings clf = self.clf model = clf.model # Labels for sensitivities to be returned sens_labels = None if clf.__is_regression__: nr_class = None svm_labels = None # shouldn't bother to provide "targets" for regressions else: nr_class = model.nr_class svm_labels = model.labels # No need to warn since now we by default we do not do # anything evil and provide labels -- so it is up for a user # to decide either he wants to do something silly #if nr_class != 2: # warning("You are estimating sensitivity for SVM %s trained on %d" % # (str(clf), nr_class) + # " classes. Make sure that it is what you intended to do" ) svcoef = np.matrix(model.get_sv_coef()) svs = np.matrix(model.get_sv()) rhos = np.asarray(model.get_rho()) if self.params.split_weights: if nr_class != 2: raise NotImplementedError, \ "Cannot compute per-class weights for" \ " non-binary classification task" # libsvm might have different idea on the ordering # of labels, so we would need to map them back explicitely ds_labels = list( dataset.sa[clf.get_space()].unique) # labels in the dataset senses = [None for i in ds_labels] # first label is given positive value for i, (c, l) in enumerate([(svcoef > 0, lambda x: x), (svcoef < 0, lambda x: x * -1)]): # convert to array, and just take the meaningful dimension c_ = c.A[0] # NOTE svm_labels are numerical; ds_labels are literal senses[ds_labels.index( clf._attrmap.to_literal(svm_labels[i]))] = \ (l(svcoef[:, c_] * svs[c_, :])).A[0] weights = np.array(senses) sens_labels = svm_labels else: # XXX yoh: .mean() is effectively # averages across "sensitivities" of all paired classifiers (I # think). See more info on this topic in svm.py on how sv_coefs # are stored # # First multiply SV coefficients with the actual SVs to get # weighted impact of SVs on decision, then for each feature # take mean across SVs to get a single weight value # per feature if nr_class is None or nr_class <= 2: # as simple as this weights = (svcoef * svs).A # and only in case of classification if nr_class: # ??? First label seems corresponds to positive sens_labels = [tuple(svm_labels[::-1])] else: # we need to compose correctly per each pair of classifiers. # See docstring for get_sv_coef for more details on internal # structure of bloody storage # total # of pairs npairs = nr_class * (nr_class - 1) / 2 # # of SVs in each class NSVs_perclass = model.get_n_sv() # indices where each class starts in each row of SVs # name is after similar variable in libsvm internals nz_start = np.cumsum([0] + NSVs_perclass[:-1]) nz_end = nz_start + NSVs_perclass # reserve storage weights = np.zeros((npairs, svs.shape[1])) ipair = 0 # index of the pair """ // classifier (i,j): coefficients with // i are in sv_coef[j-1][nz_start[i]...], // j are in sv_coef[i][nz_start[j]...] 
""" sens_labels = [] for i in xrange(nr_class): for j in xrange(i + 1, nr_class): weights[ipair, :] = np.asarray( svcoef[j - 1, nz_start[i]:nz_end[i]] * svs[nz_start[i]:nz_end[i]] + svcoef[i, nz_start[j]:nz_end[j]] * svs[nz_start[j]:nz_end[j]]) # ??? First label corresponds to positive # that is why [j], [i] sens_labels += [(svm_labels[j], svm_labels[i])] ipair += 1 # go to the next pair assert (ipair == npairs) if __debug__ and 'SVM' in debug.active: if nr_class: nsvs = model.get_n_sv() else: nsvs = model.get_total_n_sv() if clf.__is_regression__: svm_type = clf._svm_impl # type of regression else: svm_type = '%d-class SVM(%s)' % (nr_class, clf._svm_impl) debug('SVM', "Extracting weights for %s: #SVs=%s, " % \ (svm_type, nsvs) + \ " SVcoefshape=%s SVs.shape=%s Rhos=%s." % \ (svcoef.shape, svs.shape, rhos) + \ " Result: min=%f max=%f" % (np.min(weights), np.max(weights))) ds_kwargs = {} if nr_class: # for classification only # and we should have prepared the labels assert (sens_labels is not None) if len(clf._attrmap): if isinstance(sens_labels[0], tuple): sens_labels = asobjarray(sens_labels) sens_labels = clf._attrmap.to_literal(sens_labels, recurse=True) # NOTE: `weights` is already and always 2D ds_kwargs = dict(sa={clf.get_space(): sens_labels}) weights_ds = Dataset(weights, **ds_kwargs) weights_ds.sa['biases'] = rhos return weights_ds _customizeDocInherit = True
class SVM(_SVM): """Support Vector Machine Classifier(s) based on Shogun This is a simple base interface """ __default_kernel_class__ = _default_kernel_class_ num_threads = Parameter(1, min=1, doc='Number of threads to utilize') _KNOWN_PARAMS = [ 'epsilon' ] __tags__ = _SVM.__tags__ + [ 'sg', 'retrainable' ] # Some words of wisdom from shogun author: # XXX remove after proper comments added to implementations """ If you'd like to train linear SVMs use SGD or OCAS. These are (I am serious) the fastest linear SVM-solvers to date. (OCAS cannot do SVMs with standard additive bias, but will L2 reqularize it - though it should not matter much in practice (although it will give slightly different solutions)). Note that SGD has no stopping criterion (you simply have to specify the number of iterations) and that OCAS has a different stopping condition than svmlight for example which may be more tight and more loose depending on the problem - I sugeest 1e-2 or 1e-3 for epsilon. If you would like to train kernel SVMs use libsvm/gpdt/svmlight - depending on the problem one is faster than the other (hard to say when, I *think* when your dataset is very unbalanced chunking methods like svmlight/gpdt are better), for smaller problems definitely libsvm. If you use string kernels then gpdt/svmlight have a special 'linadd' speedup for this (requires sg 0.6.2 - there was some inefficiency in the code for python-modular before that). This is effective for big datasets and (I trained on 10 million strings based on this). And yes currently we only implemented parallel training for svmlight, however all SVMs can be evaluated in parallel. """ _KNOWN_SENSITIVITIES={'linear':LinearSVMWeights, } _KNOWN_IMPLEMENTATIONS = {} if externals.exists('shogun', raise_=True): _KNOWN_IMPLEMENTATIONS = { "libsvm" : (shogun.Classifier.LibSVM, ('C',), ('multiclass', 'binary'), "LIBSVM's C-SVM (L2 soft-margin SVM)"), "gmnp" : (shogun.Classifier.GMNPSVM, ('C',), ('multiclass', 'binary'), "Generalized Nearest Point Problem SVM"), # XXX should have been GPDT, shogun has it fixed since some version "gpbt" : (shogun.Classifier.GPBTSVM, ('C',), ('binary',), "Gradient Projection Decomposition Technique for " \ "large-scale SVM problems"), "gnpp" : (shogun.Classifier.GNPPSVM, ('C',), ('binary',), "Generalized Nearest Point Problem SVM"), ## TODO: Needs sparse features... # "svmlin" : (shogun.Classifier.SVMLin, ''), # "liblinear" : (shogun.Classifier.LibLinear, ''), # "subgradient" : (shogun.Classifier.SubGradientSVM, ''), ## good 2-class linear SVMs # "ocas" : (shogun.Classifier.SVMOcas, ''), # "sgd" : ( shogun.Classifier.SVMSGD, ''), # regressions "libsvr": (shogun.Regression.LibSVR, ('C', 'tube_epsilon',), ('regression',), "LIBSVM's epsilon-SVR"), } def __init__(self, **kwargs): """Interface class to Shogun's classifiers and regressions. Default implementation is 'libsvm'. """ svm_impl = kwargs.get('svm_impl', 'libsvm').lower() kwargs['svm_impl'] = svm_impl # init base class _SVM.__init__(self, **kwargs) self.__svm = None """Holds the trained svm.""" self.__svm_apply = None """Compatibility convenience to bind to the classify/apply method of __svm""" # Need to store original data... 
# TODO: keep 1 of them -- just __traindata or __traindataset # For now it is needed for computing sensitivities self.__traindataset = None # internal SG swig proxies self.__traindata = None self.__kernel = None self.__kernel_test = None self.__testdata = None # remove kernel-based for some # TODO RF: provide separate handling for non-kernel machines if svm_impl in ['svmocas']: if not (self.__kernel is None or self.__kernel.__kernel_name__ == 'linear'): raise ValueError( "%s is inherently linear, thus provided kernel %s " "is of no effect" % (svm_impl, self.__kernel)) self.__tags__.pop(self.__tags__.index('kernel-based')) self.__tags__.pop(self.__tags__.index('retrainable')) # TODO: integrate with kernel framework #def __condition_kernel(self, kernel): ## XXX I thought that it is needed only for retrainable classifier, ## but then krr gets confused, and svrlight needs it to provide ## meaningful results even without 'retraining' #if self._svm_impl in ['svrlight', 'lightsvm']: #try: #kernel.set_precompute_matrix(True, True) #except Exception, e: ## N/A in shogun 0.9.1... TODO: RF #if __debug__: #debug('SG_', "Failed call to set_precompute_matrix for %s: %s" #% (self, e)) def _train(self, dataset): """Train SVM """ # XXX watchout # self.untrain() newkernel, newsvm = False, False # local bindings for faster lookup params = self.params retrainable = self.params.retrainable targets_sa_name = self.get_space() # name of targets sa targets_sa = dataset.sa[targets_sa_name] # actual targets sa if retrainable: _changedData = self._changedData # LABELS ul = None self.__traindataset = dataset # OK -- we have to map labels since # binary ones expect -1/+1 # Multiclass expect labels starting with 0, otherwise they puke # when ran from ipython... yikes if __debug__: debug("SG_", "Creating labels instance") if self.__is_regression__: labels_ = np.asarray(targets_sa.value, dtype='double') else: ul = targets_sa.unique # ul.sort() if len(ul) == 2: # assure that we have -1/+1 _labels_dict = {ul[0]:-1.0, ul[1]:+1.0} elif len(ul) < 2: raise FailedToTrainError, \ "We do not have 1-class SVM brought into SG yet" else: # can't use plain enumerate since we need them swapped _labels_dict = dict([ (ul[i], i) for i in range(len(ul))]) # Create SG-customized attrmap to assure -1 / +1 if necessary self._attrmap = AttributeMap(_labels_dict, mapnumeric=True) if __debug__: debug("SG__", "Mapping labels using dict %s" % _labels_dict) labels_ = self._attrmap.to_numeric(targets_sa.value).astype(float) labels = shogun.Features.Labels(labels_) _setdebug(labels, 'Labels') # KERNEL # XXX cruel fix for now... 
whole retraining business needs to # be rethought if retrainable: _changedData['kernel_params'] = _changedData.get('kernel_params', False) # TODO: big RF to move non-kernel classifiers away if 'kernel-based' in self.__tags__ and (not retrainable or _changedData['traindata'] or _changedData['kernel_params']): # If needed compute or just collect arguments for SVM and for # the kernel if retrainable and __debug__: if _changedData['traindata']: debug("SG", "Re-Creating kernel since training data has changed") if _changedData['kernel_params']: debug("SG", "Re-Creating kernel since params %s has changed" % _changedData['kernel_params']) k = self.params.kernel k.compute(dataset) self.__kernel = kernel = k.as_raw_sg() newkernel = True self.kernel_params.reset() # mark them as not-changed #_setdebug(kernel, 'Kernels') #self.__condition_kernel(kernel) if retrainable: if __debug__: debug("SG_", "Resetting test kernel for retrainable SVM") self.__kernel_test = None # TODO -- handle _changedData['params'] correctly, ie without recreating # whole SVM Cs = None if not retrainable or self.__svm is None or _changedData['params']: # SVM if self.params.has_key('C'): Cs = self._get_cvec(dataset) # XXX do not jump over the head and leave it up to the user # ie do not rescale automagically by the number of samples #if len(Cs) == 2 and not ('regression' in self.__tags__) and len(ul) == 2: # # we were given two Cs # if np.max(C) < 0 and np.min(C) < 0: # # and both are requested to be 'scaled' TODO : # # provide proper 'features' to the parameters, # # so we could specify explicitely if to scale # # them by the number of samples here # nl = [np.sum(labels_ == _labels_dict[l]) for l in ul] # ratio = np.sqrt(float(nl[1]) / nl[0]) # #ratio = (float(nl[1]) / nl[0]) # Cs[0] *= ratio # Cs[1] /= ratio # if __debug__: # debug("SG_", "Rescaled Cs to %s to accomodate the " # "difference in number of training samples" % # Cs) # Choose appropriate implementation svm_impl_class = self.__get_implementation(ul) if __debug__: debug("SG", "Creating SVM instance of %s" % `svm_impl_class`) if self._svm_impl in ['libsvr', 'svrlight']: # for regressions constructor a bit different self.__svm = svm_impl_class(Cs[0], self.params.tube_epsilon, self.__kernel, labels) # we need to set epsilon explicitly self.__svm.set_epsilon(self.params.epsilon) elif self._svm_impl in ['krr']: self.__svm = svm_impl_class(self.params.tau, self.__kernel, labels) elif 'kernel-based' in self.__tags__: self.__svm = svm_impl_class(Cs[0], self.__kernel, labels) self.__svm.set_epsilon(self.params.epsilon) else: traindata_sg = _tosg(dataset.samples) self.__svm = svm_impl_class(Cs[0], traindata_sg, labels) self.__svm.set_epsilon(self.params.epsilon) # To stay compatible with versions across API changes in sg 1.0.0 self.__svm_apply = externals.versions['shogun'] >= '1' \ and self.__svm.apply \ or self.__svm.classify # the last one for old API # Set shrinking if 'shrinking' in params: shrinking = params.shrinking if __debug__: debug("SG_", "Setting shrinking to %s" % shrinking) self.__svm.set_shrinking_enabled(shrinking) if Cs is not None and len(Cs) == 2: if __debug__: debug("SG_", "Since multiple Cs are provided: %s, assign them" % Cs) self.__svm.set_C(Cs[0], Cs[1]) self.params.reset() # mark them as not-changed newsvm = True _setdebug(self.__svm, 'SVM') # Set optimization parameters if self.params.has_key('tube_epsilon') and \ hasattr(self.__svm, 'set_tube_epsilon'): self.__svm.set_tube_epsilon(self.params.tube_epsilon) 
self.__svm.parallel.set_num_threads(self.params.num_threads) else: if __debug__: debug("SG_", "SVM instance is not re-created") if _changedData['targets']: # labels were changed if __debug__: debug("SG__", "Assigning new labels") self.__svm.set_labels(labels) if newkernel: # kernel was replaced if __debug__: debug("SG__", "Assigning new kernel") self.__svm.set_kernel(self.__kernel) assert(_changedData['params'] is False) # we should never get here if retrainable: # we must assign it only if it is retrainable self.ca.retrained = not newsvm or not newkernel # Train if __debug__ and 'SG' in debug.active: if not self.__is_regression__: lstr = " with labels %s" % targets_sa.unique else: lstr = "" debug("SG", "%sTraining %s on data%s" % (("","Re-")[retrainable and self.ca.retrained], self, lstr)) self.__svm.train() if __debug__: debug("SG_", "Done training SG_SVM %s" % self) # Report on training if (__debug__ and 'SG__' in debug.active) or \ self.ca.is_enabled('training_stats'): if __debug__: debug("SG_", "Assessing predictions on training data") trained_targets = self.__svm_apply().get_labels() else: trained_targets = None if __debug__ and "SG__" in debug.active: debug("SG__", "Original labels: %s, Trained labels: %s" % (targets_sa.value, trained_targets)) # Assign training confusion right away here since we are ready # to do so. # XXX TODO use some other conditional attribute like 'trained_targets' and # use it within base Classifier._posttrain to assign predictions # instead of duplicating code here # XXX For now it can be done only for regressions since labels need to # be remapped and that becomes even worse if we use regression # as a classifier so mapping happens upstairs if self.__is_regression__ and self.ca.is_enabled('training_stats'): self.ca.training_stats = self.__summary_class__( targets=targets_sa.value, predictions=trained_targets) # XXX actually this is the beast which started this evil conversion # so -- make use of dataset here! ;) @accepts_samples_as_dataset def _predict(self, dataset): """Predict values for the data """ retrainable = self.params.retrainable if retrainable: changed_testdata = self._changedData['testdata'] or \ self.__kernel_test is None if not retrainable: if __debug__: debug("SG__", "Initializing SVMs kernel of %s with training/testing samples" % self) self.params.kernel.compute(self.__traindataset, dataset) self.__kernel_test = self.params.kernel.as_sg()._k # We can just reuse kernel used for training #self.__condition_kernel(self.__kernel) else: if changed_testdata: #if __debug__: #debug("SG__", #"Re-creating testing kernel of %s giving " #"arguments %s" % #(`self._kernel_type`, self.__kernel_args)) self.params.kernel.compute(self.__traindataset, dataset) #_setdebug(kernel_test, 'Kernels') #_setdebug(kernel_test_custom, 'Kernels') self.__kernel_test = self.params.kernel.as_raw_sg() elif __debug__: debug("SG__", "Re-using testing kernel") assert(self.__kernel_test is not None) if 'kernel-based' in self.__tags__: self.__svm.set_kernel(self.__kernel_test) # doesn't do any good imho although on unittests helps tiny bit... 
hm #self.__svm.init_kernel_optimization() values_ = self.__svm_apply() else: testdata_sg = _tosg(dataset.samples) self.__svm.set_features(testdata_sg) values_ = self.__svm_apply() if __debug__: debug("SG_", "Classifying testing data") if values_ is None: raise RuntimeError, "We got empty list of values from %s" % self values = values_.get_labels() if retrainable: # we must assign it only if it is retrainable self.ca.repredicted = repredicted = not changed_testdata if __debug__: debug("SG__", "Re-assigning learing kernel. Repredicted is %s" % repredicted) # return back original kernel if 'kernel-based' in self.__tags__: self.__svm.set_kernel(self.__kernel) if __debug__: debug("SG__", "Got values %s" % values) if (self.__is_regression__): predictions = values else: if len(self._attrmap.keys()) == 2: predictions = np.sign(values) # since np.sign(0) == 0 predictions[predictions==0] = 1 else: predictions = values # remap labels back adjusting their type # XXX YOH: This is done by topclass now (needs RF) #predictions = self._attrmap.to_literal(predictions) if __debug__: debug("SG__", "Tuned predictions %s" % predictions) # store conditional attribute # TODO: extract values properly for multiclass SVMs -- # ie 1 value per label or pairs for all 1-vs-1 classifications self.ca.estimates = values ## to avoid leaks with not yet properly fixed shogun if not retrainable: try: testdata.free_features() except: pass return predictions def _untrain(self): super(SVM, self)._untrain() # untrain/clean the kernel -- we might not allow to drag SWIG # instance around BUT XXX -- make it work fine with # CachedKernel -- we might not want to fully "untrain" in such # case self.params.kernel.cleanup() # XXX unify naming if not self.params.retrainable: if __debug__: debug("SG__", "Untraining %(clf)s and destroying sg's SVM", msgargs={'clf':self}) # to avoid leaks with not yet properly fixed shogun # XXX make it nice... now it is just stable ;-) if True: # not self.__traindata is None: if True: # try: if self.__kernel is not None: del self.__kernel self.__kernel = None if self.__kernel_test is not None: del self.__kernel_test self.__kernel_test = None if self.__svm is not None: del self.__svm self.__svm = None self.__svm_apply = None if self.__traindata is not None: # Let in for easy demonstration of the memory leak in shogun #for i in xrange(10): # debug("SG__", "cachesize pre free features %s" % # (self.__svm.get_kernel().get_cache_size())) self.__traindata.free_features() del self.__traindata self.__traindata = None self.__traindataset = None #except: # pass if __debug__: debug("SG__", "Done untraining %(self)s and destroying sg's SVM", msgargs=locals()) elif __debug__: debug("SG__", "Not untraining %(self)s since it is retrainable", msgargs=locals()) def __get_implementation(self, ul): if self.__is_regression__ or len(ul) == 2: svm_impl_class = SVM._KNOWN_IMPLEMENTATIONS[self._svm_impl][0] else: if self._svm_impl == 'libsvm': svm_impl_class = shogun.Classifier.LibSVMMultiClass elif self._svm_impl == 'gmnp': svm_impl_class = shogun.Classifier.GMNPSVM else: raise RuntimeError, \ "Shogun: Implementation %s doesn't handle multiclass " \ "data. Got labels %s. 
Use some other classifier" % \ (self._svm_impl, self.__traindataset.sa[self.get_space()].unique) if __debug__: debug("SG_", "Using %s for multiclass data of %s" % (svm_impl_class, self._svm_impl)) return svm_impl_class svm = property(fget=lambda self: self.__svm) """Access to the SVM model.""" traindataset = property(fget=lambda self: self.__traindataset) """Dataset which was used for training"""
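# Illustrative, standalone sketch (not part of the class above, names are
# hypothetical) of the label remapping that _train performs before handing
# targets to shogun: binary problems are mapped to -1/+1, multiclass problems
# to 0..n-1, both as double arrays suitable for shogun Labels.

import numpy as np

def _map_targets_for_shogun_sketch(targets):
    """Return (numeric_labels, mapping) following the convention used in _train."""
    ul = np.unique(targets)
    if len(ul) < 2:
        raise ValueError("1-class problems are not supported")
    elif len(ul) == 2:
        # assure that binary problems get -1/+1
        mapping = {ul[0]: -1.0, ul[1]: +1.0}
    else:
        # multiclass implementations expect labels starting with 0
        mapping = dict((l, float(i)) for i, l in enumerate(ul))
    numeric = np.array([mapping[t] for t in targets], dtype='double')
    return numeric, mapping

# e.g. _map_targets_for_shogun_sketch([1, 2, 1]) -> (array([-1., 1., -1.]), {1: -1.0, 2: 1.0})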
class GroupClusterThreshold_NN3(Learner): """Statistical evaluation of group-level average accuracy maps This algorithm can be used to perform cluster-thresholding of searchlight-based group analyses. It implements a two-stage procedure that uses the results of within-subject permutation analyses, estimates a per-feature cluster-forming threshold (via bootstrap), and uses the thresholded bootstrap samples to estimate the distribution of cluster sizes in group-average accuracy maps under the NULL hypothesis, as described in [1]_. Note: this class implements a modified version of that algorithm. The present implementation differs in, at least, four aspects from the description in that paper. 1) Cluster p-values refer to the probability of observing a particular cluster size or a larger one (original paper: probability to observe a larger cluster only). Consequently, probabilities reported by this implementation will have a tendency to be higher in comparison. 2) Clusters found in the original (unpermuted) accuracy map are always included in the NULL distribution estimate of cluster sizes. This provides an explicit lower bound for probabilities, as there will always be at least one observed cluster for every cluster size found in the original accuracy map. Consequently, it is impossible to get a probability of zero for clusters of any size (see [2]_ for more information). 3) Bootstrap accuracy maps that contain no clusters are counted in a dedicated size-zero bin in the NULL distribution of cluster sizes. This change yields reliable cluster probabilities even for very low featurewise threshold probabilities, where some portion of the bootstrap accuracy maps does not contain any clusters. 4) The method for FWE-correction used by the original authors is not provided. Instead, a range of alternatives implemented by the statsmodels package are available. Moreover, this implementation minimizes the required memory demands and allows for computing large numbers of bootstrap samples without a significant increase in memory demand (CPU time trade-off). Instances of this class must be trained before they can be used to threshold accuracy maps. The training dataset must match the following criteria: 1) For every subject in the group, it must contain multiple accuracy maps that are the result of a within-subject classification analysis based on permuted class labels. One map must correspond to one fixed permutation for all features in the map, as described in [1]_. The original authors recommend 100 accuracy maps per subject for a typical searchlight analysis. 2) It must contain a sample attribute indicating which sample is associated with which subject, because bootstrapping average accuracy maps is implemented by randomly drawing one map from each subject. The name of the attribute can be configured via the ``chunk_attr`` parameter. After training, an instance can be called with a dataset to perform thresholding and statistical evaluation. Unless a single-sample dataset is passed, all samples in the input dataset will be averaged prior to thresholding. Returns ------- Dataset This is a shallow copy of the input dataset (after a potential averaging), hence contains the same data and attributes. In addition it includes the following attributes: ``fa.featurewise_thresh`` Vector with feature-wise cluster-forming thresholds. ``fa.clusters_featurewise_thresh`` Vector with labels for clusters after thresholding the input data with the desired feature-wise probability.
Each unique non-zero element corresponds to an individual super-threshold cluster. Cluster values are sorted by cluster size (number of features). The largest cluster is always labeled with ``1``. ``fa.clusters_fwe_thresh`` Vector with labels for super-threshold clusters after correction for multiple comparisons. The attribute is derived from ``fa.clusters_featurewise_thresh`` by removing all clusters that do not pass the threshold when controlling for the family-wise error rate. ``a.clusterstats`` Record array with information on all detected clusters. The array is sorted according to cluster size, starting with the largest cluster in terms of number of features. The array contains the fields ``size`` (number of features comprising the cluster), ``mean``, ``median``, ``min``, ``max``, ``std`` (respective descriptive statistics for all clusters), and ``prob_raw`` (probability of observing a cluster of this size or larger under the NULL hypothesis). If correction for multiple comparisons is enabled, an additional field ``prob_corrected`` (probability after correction) is added. ``a.clusterlocations`` Record array with information on the location of all detected clusters. The array is sorted according to cluster size (same order as ``a.clusterstats``). The array contains the fields ``max`` (feature coordinate of the maximum score within the cluster) and ``center_of_mass`` (coordinate of the center of mass, weighted by the feature values within the cluster). References ---------- .. [1] Johannes Stelzer, Yi Chen and Robert Turner (2013). Statistical inference and multiple testing correction in classification-based multi-voxel pattern analysis (MVPA): Random permutations and cluster size control. NeuroImage, 65, 69--82. .. [2] Smyth, G. K., & Phipson, B. (2010). Permutation P-values Should Never Be Zero: Calculating Exact P-values When Permutations Are Randomly Drawn. Statistical Applications in Genetics and Molecular Biology, 9, 1--12. """ n_bootstrap = Parameter( 100000, constraints=EnsureInt() & EnsureRange(min=1), doc="""Number of bootstrap samples to be generated from the training dataset. For each sample, an average map will be computed from a set of randomly drawn samples (one from each chunk). Bootstrap samples will be used to estimate a featurewise NULL distribution of accuracy values for initial thresholding, and to estimate the NULL distribution of cluster sizes under the NULL hypothesis. A larger number of bootstrap samples reduces the lower bound of probabilities, which may be beneficial for multiple comparison correction.""") feature_thresh_prob = Parameter( 0.001, constraints=EnsureFloat() & EnsureRange(min=0.0, max=1.0), doc="""Feature-wise probability threshold. The value corresponding to this probability in the NULL distribution of accuracies will be used as threshold for cluster forming. Given that the NULL distribution is estimated per feature, the actual threshold value will vary across features, yielding a threshold vector. The number of bootstrap samples needs to be adequate for a desired probability.
A ``ValueError`` is raised otherwise.""") chunk_attr = Parameter( 'chunks', doc="""Name of the attribute indicating the individual chunks from which a single sample each is drawn for averaging into a bootstrap sample.""") fwe_rate = Parameter( 0.05, constraints=EnsureFloat() & EnsureRange(min=0.0, max=1.0), doc="""Family-wise error rate for multiple comparison correction of cluster size probabilities.""") multicomp_correction = Parameter( 'fdr_bh', constraints=EnsureChoice('bonferroni', 'sidak', 'holm-sidak', 'holm', 'simes-hochberg', 'hommel', 'fdr_bh', 'fdr_by', None), doc="""Strategy for multiple comparison correction of cluster probabilities. All methods supported by statsmodels' ``multitest`` are available. In addition, ``None`` can be specified to disable correction.""") n_blocks = Parameter( 1, constraints=EnsureInt() & EnsureRange(min=1), doc="""Number of segments used to compute the feature-wise NULL distributions. This parameter determines the peak memory demand. In case of a single segment a matrix of size (n_bootstrap x nfeatures) will be allocated. Increasing the number of segments reduces the peak memory demand by that roughly factor. """) n_proc = Parameter( 1, constraints=EnsureInt() & EnsureRange(min=1), doc="""Number of parallel processes to use for computation. Requires `joblib` external module.""") def __init__(self, **kwargs): # force disable auto-train: would make no sense Learner.__init__(self, auto_train=False, **kwargs) if 1. / (self.params.n_bootstrap + 1) > self.params.feature_thresh_prob: raise ValueError('number of bootstrap samples is insufficient for' ' the desired threshold probability') self.untrain() def _untrain(self): self._thrmap = None self._null_cluster_sizes = None @due.dcite( Doi("10.1016/j.neuroimage.2012.09.063"), description="Statistical assessment of (searchlight) MVPA results", tags=['implementation']) def _train(self, ds): # shortcuts chunk_attr = self.params.chunk_attr # # Step 0: bootstrap maps by drawing one for each chunk and average them # (do N iterations) # this could take a lot of memory, hence instead of computing the maps # we compute the source maps they can be computed from and then (re)build # the matrix of bootstrapped maps either row-wise or column-wise (as # needed) to save memory by a factor of (close to) `n_bootstrap` # which samples belong to which chunk chunk_samples = dict([(c, np.where(ds.sa[chunk_attr].value == c)[0]) for c in ds.sa[chunk_attr].unique]) # pre-built the bootstrap combinations bcombos = [[random.sample(v, 1)[0] for v in chunk_samples.values()] for i in xrange(self.params.n_bootstrap)] bcombos = np.array(bcombos, dtype=int) # # Step 1: find the per-feature threshold that corresponds to some p # in the NULL segwidth = ds.nfeatures / self.params.n_blocks # speed things up by operating on an array not a dataset ds_samples = ds.samples if __debug__: debug('GCTHR', 'Compute per-feature thresholds in %i blocks of %i features' % (self.params.n_blocks, segwidth)) # Execution can be done in parallel as the estimation is independent # across features def featuresegment_producer(ncols): for segstart in xrange(0, ds.nfeatures, ncols): # one average map for every stored bcombo # this also slices the input data into feature subsets # for the compute blocks yield [np.mean( # get a view to a subset of the features # -- should be somewhat efficient as feature axis is # sliced ds_samples[sidx, segstart:segstart + ncols], axis=0) for sidx in bcombos] if self.params.n_proc == 1: # Serial execution thrmap = np.hstack( # merge 
across compute blocks [get_thresholding_map(d, self.params.feature_thresh_prob) # compute a partial threshold map for as many features # as fit into a compute block for d in featuresegment_producer(segwidth)]) else: # Parallel execution verbose_level_parallel = 50 \ if (__debug__ and 'GCTHR' in debug.active) else 0 # local import as only parallel execution needs this from joblib import Parallel, delayed # same code as above, just in parallel with joblib's Parallel thrmap = np.hstack( Parallel(n_jobs=self.params.n_proc, pre_dispatch=self.params.n_proc, verbose=verbose_level_parallel)( delayed(get_thresholding_map) (d, self.params.feature_thresh_prob) for d in featuresegment_producer(segwidth))) # store for later thresholding of input data self._thrmap = thrmap # # Step 2: threshold all NULL maps and build distribution of NULL cluster # sizes # cluster_sizes = Counter() # recompute the bootstrap average maps to threshold them and determine # cluster sizes dsa = dict(mapper=ds.a.mapper) if 'mapper' in ds.a else {} if __debug__: debug('GCTHR', 'Estimating NULL distribution of cluster sizes') # this step can be computed in parallel chunks to speeds things up if self.params.n_proc == 1: # Serial execution for sidx in bcombos: avgmap = np.mean(ds_samples[sidx], axis=0)[None] # apply threshold clustermap = avgmap > thrmap # wrap into a throw-away dataset to get the reverse mapping right bds = Dataset(clustermap, a=dsa) # this function reverse-maps every sample one-by-one, hence no need # to collect chunks of bootstrapped maps cluster_sizes = get_cluster_sizes(bds, cluster_sizes) else: # Parallel execution # same code as above, just restructured for joblib's Parallel for jobres in Parallel(n_jobs=self.params.n_proc, pre_dispatch=self.params.n_proc, verbose=verbose_level_parallel)( delayed(get_cluster_sizes) (Dataset(np.mean(ds_samples[sidx], axis=0)[None] > thrmap, a=dsa)) for sidx in bcombos): # aggregate cluster_sizes += jobres # store cluster size histogram for later p-value evaluation # use a sparse matrix for easy consumption (max dim is the number of # features, i.e. biggest possible cluster) scl = dok_matrix((1, ds.nfeatures + 1), dtype=int) for s in cluster_sizes: scl[0, s] = cluster_sizes[s] self._null_cluster_sizes = scl def _call(self, ds): if len(ds) > 1: # average all samples into one, assuming we got something like one # sample per subject as input avgr = mean_sample() ds = avgr(ds) # threshold input; at this point we only have one sample left thrd = ds.samples[0] > self._thrmap # mapper default mapper = IdentityMapper() # overwrite if possible if hasattr(ds, 'a') and 'mapper' in ds.a: mapper = ds.a.mapper # reverse-map input othrd = _verified_reverse1(mapper, thrd) # TODO: what is your purpose in life osamp? 
;-) osamp = _verified_reverse1(mapper, ds.samples[0]) # prep output dataset outds = ds.copy(deep=False) outds.fa['featurewise_thresh'] = self._thrmap # determine clusters labels, num = measurements.label(othrd,structure=np.ones([3,3,3])) area = measurements.sum(othrd, labels, index=np.arange(1, num + 1)).astype(int) com = measurements.center_of_mass( osamp, labels=labels, index=np.arange(1, num + 1)) maxpos = measurements.maximum_position( osamp, labels=labels, index=np.arange(1, num + 1)) # for the rest we need the labels flattened labels = mapper.forward1(labels) # relabel clusters starting with the biggest and increase index with # decreasing size ordered_labels = np.zeros(labels.shape, dtype=int) ordered_area = np.zeros(area.shape, dtype=int) ordered_com = np.zeros((num, len(osamp.shape)), dtype=float) ordered_maxpos = np.zeros((num, len(osamp.shape)), dtype=float) for i, idx in enumerate(np.argsort(area)): ordered_labels[labels == idx + 1] = num - i # kinda ugly, but we are looping anyway ordered_area[i] = area[idx] ordered_com[i] = com[idx] ordered_maxpos[i] = maxpos[idx] labels = ordered_labels area = ordered_area[::-1] com = ordered_com[::-1] maxpos = ordered_maxpos[::-1] del ordered_labels # this one can be big # store cluster labels after forward-mapping outds.fa['clusters_featurewise_thresh'] = labels.copy() # location info outds.a['clusterlocations'] = \ np.rec.fromarrays( [com, maxpos], names=('center_of_mass', 'max')) # update cluster size histogram with the actual result to get a # proper lower bound for p-values # this will make a copy, because the original matrix is int cluster_probs_raw = _transform_to_pvals( area, self._null_cluster_sizes.astype('float')) clusterstats = ( [area, cluster_probs_raw], ['size', 'prob_raw'] ) # evaluate a bunch of stats for all clusters morestats = {} for cid in xrange(len(area)): # keep clusters on outer loop, because selection is more expensive clvals = ds.samples[0, labels == cid + 1] for id_, fx in ( ('mean', np.mean), ('median', np.median), ('min', np.min), ('max', np.max), ('std', np.std)): stats = morestats.get(id_, []) stats.append(fx(clvals)) morestats[id_] = stats for k, v in morestats.items(): clusterstats[0].append(v) clusterstats[1].append(k) if self.params.multicomp_correction is not None: # do a local import as only this tiny portion needs statsmodels import statsmodels.stats.multitest as smm rej, probs_corr = smm.multipletests( cluster_probs_raw, alpha=self.params.fwe_rate, method=self.params.multicomp_correction)[:2] # store corrected per-cluster probabilities clusterstats[0].append(probs_corr) clusterstats[1].append('prob_corrected') # remove cluster labels that did not pass the FWE threshold for i, r in enumerate(rej): if not r: labels[labels == i + 1] = 0 outds.fa['clusters_fwe_thresh'] = labels outds.a['clusterstats'] = \ np.rec.fromarrays(clusterstats[0], names=clusterstats[1]) return outds
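# Illustrative, standalone sketch (not the actual _transform_to_pvals helper
# used above; all names are hypothetical) of how a NULL histogram of cluster
# sizes can be turned into right-tailed probabilities: the chance of observing
# a cluster of a given size or larger under the NULL hypothesis. The observed
# cluster itself is counted as well, so, as noted in the class docstring,
# probabilities can never become zero.

import numpy as np

def _cluster_size_pvals_sketch(observed_sizes, null_size_counts):
    """null_size_counts[s] = number of NULL clusters of exactly size s."""
    null_size_counts = np.asarray(null_size_counts, dtype=float)
    total = null_size_counts.sum()
    pvals = []
    for size in observed_sizes:
        # number of NULL clusters at least as large as the observed one,
        # plus one for the observed cluster itself (explicit lower bound)
        tail = null_size_counts[int(size):].sum()
        pvals.append((tail + 1) / (total + 1))
    return np.array(pvals)

# e.g. with 999 NULL clusters, none of size >= 50, an observed cluster of
# size 50 gets p = 1 / 1000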
class _SVM(Classifier): """Support Vector Machine Classifier. Base class for all external SVM implementations. """ """ Derived classes should define: * _KERNELS: map(dict) should define assignment to a tuple containing implementation kernel type, list of parameters adherent to the kernel, and sensitivity analyzer e.g.:: _KERNELS = { 'linear': (shogun.Kernel.LinearKernel, (), LinearSVMWeights), 'rbf' : (shogun.Kernel.GaussianKernel, ('gamma',), None), ... } * _KNOWN_IMPLEMENTATIONS: map(dict) should define assignment to a tuple containing implementation of the SVM, list of parameters adherent to the implementation, additional internals, and description e.g.:: _KNOWN_IMPLEMENTATIONS = { 'C_SVC' : (svm.svmc.C_SVC, ('C',), ('binary', 'multiclass'), 'C-SVM classification'), ... } """ _ATTRIBUTE_COLLECTIONS = ['params'] # enforce presence of params collections # Placeholder: map kernel names to sensitivity classes, ie # 'linear':LinearSVMWeights, for each backend _KNOWN_SENSITIVITIES={} kernel = Parameter(None, # XXX: Currently, can't be ensured using constraints # allowedtype=Kernel, doc='Kernel object', index=-1) _SVM_PARAMS = { 'C' : Parameter(-1.0, doc='Trade-off parameter between width of the ' 'margin and number of support vectors. Higher C -- ' 'more rigid margin SVM. In linear kernel, negative ' 'values provide automatic scaling of their value ' 'according to the norm of the data'), 'nu' : Parameter(0.5, min=0.0, max=1.0, doc='Fraction of datapoints within the margin'), 'cache_size': Parameter(100, doc='Size of the kernel cache, specified in megabytes'), 'tube_epsilon': Parameter(0.01, doc='Epsilon in epsilon-insensitive loss function of ' 'epsilon-SVM regression (SVR)'), 'tau': Parameter(1e-6, doc='TAU parameter of KRR regression in shogun'), 'probability': Parameter(0, doc='Flag to signal either probability estimate is obtained ' 'within LIBSVM'), 'shrinking': Parameter(1, doc='Either shrinking is to be conducted'), 'weight_label': Parameter([], constraints=EnsureListOf(int), doc='To be used in conjunction with weight for custom ' 'per-label weight'), # TODO : merge them into a single dictionary 'weight': Parameter([], constraints=EnsureListOf(float), doc='Custom weights per label'), # For some reason setting up epsilon to 1e-5 slowed things down a bit # in comparison to how it was before (in yoh/master) by up to 20%... not clear why # may be related to 1e-3 default within _svm.py? 'epsilon': Parameter(5e-5, min=1e-10, doc='Tolerance of termination criteria. (For nu-SVM default is 0.001)') } _KNOWN_PARAMS = () # just a placeholder to please lintian """Parameters which are specific to a given instantiation of SVM """ __tags__ = [ 'svm', 'kernel-based', 'swig' ] def __init__(self, **kwargs): """Init base class of SVMs. *Not to be publicly used* TODO: handling of parameters might migrate to be generic for all classifiers. SVMs are chosen to be testbase for that functionality to see how well it would fit. """ # Check if requested implementation is known svm_impl = kwargs.get('svm_impl', None) if not svm_impl in self._KNOWN_IMPLEMENTATIONS: raise ValueError, \ "Unknown SVM implementation '%s' is requested for %s." 
\ "Known are: %s" % (svm_impl, self.__class__, self._KNOWN_IMPLEMENTATIONS.keys()) self._svm_impl = svm_impl impl, add_params, add_internals, descr = \ self._KNOWN_IMPLEMENTATIONS[svm_impl] # Add corresponding parameters to 'known' depending on the # implementation chosen if add_params is not None: self._KNOWN_PARAMS = \ self._KNOWN_PARAMS[:] + list(add_params) # Assign per-instance __tags__ self.__tags__ = self.__tags__[:] + [svm_impl] # Add corresponding internals if add_internals is not None: self.__tags__ += list(add_internals) self.__tags__.append(svm_impl) k = kwargs.get('kernel', None) if k is None: kwargs['kernel'] = self.__default_kernel_class__() if 'linear' in ('%s'%kwargs['kernel']).lower(): # XXX not necessarily best self.__tags__ += [ 'linear', 'has_sensitivity' ] else: self.__tags__ += [ 'non-linear' ] # pop out all args from **kwargs which are known to be SVM parameters _args = {} for param in self._KNOWN_PARAMS + ['svm_impl']: # Update to remove kp's? if param in kwargs: _args[param] = kwargs.pop(param) try: Classifier.__init__(self, **kwargs) except TypeError, e: if "__init__() got an unexpected keyword argument " in e.args[0]: # TODO: make it even more specific -- if that argument is listed # within _SVM_PARAMS e.args = tuple( [e.args[0] + "\n Given SVM instance of class %s knows following parameters: %s" % (self.__class__, self._KNOWN_PARAMS) + \ list(e.args)[1:]]) raise e # populate collections and add values from arguments for paramfamily, paramset in ( (self._KNOWN_PARAMS, self.params),): for paramname in paramfamily: if not (paramname in self._SVM_PARAMS): raise ValueError, "Unknown parameter %s" % paramname + \ ". Known SVM params are: %s" % self._SVM_PARAMS.keys() param = deepcopy(self._SVM_PARAMS[paramname]) if paramname in _args: param.value = _args[paramname] # XXX might want to set default to it -- not just value paramset[paramname] = param # TODO: Below commented out because kernel_type has been removed. # Find way to set default C as necessary # tune up C if it has one and non-linear classifier is used #if self.params.has_key('C') and kernel_type != "linear" \ #and self.params['C'].is_default: #if __debug__: #debug("SVM_", "Assigning default C value to be 1.0 for SVM " #"%s with non-linear kernel" % self) #self.params['C'].default = 1.0 # Some postchecks if 'weight' in self.params and 'weight_label' in self.params: if not len(self.params.weight_label) == len(self.params.weight): raise ValueError, "Lenghts of 'weight' and 'weight_label' lists " \ "must be equal." if __debug__: debug("SVM", "Initialized %s with kernel %s" % (self, self.params.kernel))
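# Illustrative, standalone sketch of the argument routing done in
# _SVM.__init__ above: keyword arguments whose names are listed in the
# per-implementation _KNOWN_PARAMS are separated from the generic Classifier
# arguments. Function and variable names here are hypothetical.

def _split_svm_kwargs_sketch(known_params, kwargs):
    """Return (svm_args, classifier_kwargs) without modifying the input dict."""
    remaining = dict(kwargs)
    svm_args = {}
    for param in list(known_params) + ['svm_impl']:
        if param in remaining:
            svm_args[param] = remaining.pop(param)
    return svm_args, remaining

# e.g. _split_svm_kwargs_sketch(('C', 'epsilon'),
#                               dict(C=1.0, svm_impl='libsvm', enable_ca=['estimates']))
# -> ({'C': 1.0, 'svm_impl': 'libsvm'}, {'enable_ca': ['estimates']})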
class GNB(Classifier): """Gaussian Naive Bayes `Classifier`. `GNB` is a probabilistic classifier relying on Bayes rule to estimate posterior probabilities of labels given the data. Its naive assumption is the independence of the features, which allows per-feature likelihoods to be combined into a joint likelihood by a simple product across the "independent" features. See http://en.wikipedia.org/wiki/Naive_bayes for more information. The implementation provided here is "naive" on its own -- various aspects could be improved, but it has its own advantages: - implementation is simple and straightforward - no data copying while considering samples of a specific class - provides alternative ways to assess prior distribution of the classes in the case of unbalanced sets of samples (see parameter `prior`) - makes use of NumPy broadcasting mechanism, so should be relatively efficient - should work for any dimensionality of samples `GNB` is listed both as a linear and a non-linear classifier, since the specifics of the separating boundary depend on the data and/or parameters: linear separation is achieved whenever samples are balanced (or ``prior='uniform'``) and features have the same variance across different classes (set ``common_variance=True`` to enforce this). Whenever decisions are made based on log-probabilities (parameter ``logprob=True``, which is the default), the conditional attribute `values`, if enabled, will also contain log-probabilities. Note that normalization by the evidence (P(data)) is disabled by default since it has no impact per se on the classification decision. Set the parameter `normalize` to True if you want to access properly scaled probabilities in the `values` conditional attribute. """ # XXX decide when should we set corresponding internal, # since it depends actually on the data -- no clear way, # so set both linear and non-linear __tags__ = [ 'gnb', 'linear', 'non-linear', 'binary', 'multiclass' ] common_variance = Parameter(False, constraints='bool', doc="""Use the same variance across all classes.""") prior = Parameter('laplacian_smoothing', constraints=EnsureChoice('laplacian_smoothing', 'uniform', 'ratio'), doc="""How to compute prior distribution.""") logprob = Parameter(True, constraints='bool', doc="""Operate on log probabilities. Preferable to avoid unneeded exponentiation and to not lose precision. If set, logprobs are stored in `values`""") normalize = Parameter(False, constraints='bool', doc="""Normalize (log)prob by P(data). Requires probabilities, thus in the `logprob` case it would require exponentiation of the 'logprob's; disabled by default since it does not impact the classification output. """) def __init__(self, **kwargs): """Initialize a GNB classifier.
""" # init base class first Classifier.__init__(self, **kwargs) # pylint friendly initializations self.means = None """Means of features per class""" self.variances = None """Variances per class, but "vars" is taken ;)""" self.ulabels = None """Labels classifier was trained on""" self.priors = None """Class probabilities""" # Define internal state of classifier self._norm_weight = None def _get_priors(self, nlabels, nsamples, nsamples_per_class): """Return prior probabilities given data """ # helper function - squash all dimensions but 1 squash = lambda x: np.atleast_1d(x.squeeze()) prior = self.params.prior if prior == 'uniform': priors = np.ones((nlabels,))/nlabels elif prior == 'laplacian_smoothing': priors = (1+squash(nsamples_per_class)) \ / (float(nsamples) + nlabels) elif prior == 'ratio': priors = squash(nsamples_per_class) / float(nsamples) else: raise ValueError( "No idea on how to handle '%s' way to compute priors" % self.params.prior) return priors def _train(self, dataset): """Train the classifier using `dataset` (`Dataset`). """ params = self.params targets_sa_name = self.get_space() targets_sa = dataset.sa[targets_sa_name] # get the dataset information into easy vars X = dataset.samples labels = targets_sa.value self.ulabels = ulabels = targets_sa.unique nlabels = len(ulabels) label2index = dict((l, il) for il, l in enumerate(ulabels)) # set the feature dimensions nsamples = len(X) s_shape = X.shape[1:] # shape of a single sample self.means = means = \ np.zeros((nlabels, ) + s_shape) self.variances = variances = \ np.zeros((nlabels, ) + s_shape) # degenerate dimension are added for easy broadcasting later on nsamples_per_class = np.zeros((nlabels,) + (1,)*len(s_shape)) # Estimate means and number of samples per each label for s, l in zip(X, labels): il = label2index[l] # index of the label nsamples_per_class[il] += 1 means[il] += s # helper function - squash all dimensions but 1 squash = lambda x: np.atleast_1d(x.squeeze()) ## Actually compute the means non0labels = (squash(nsamples_per_class) != 0) means[non0labels] /= nsamples_per_class[non0labels] # Store prior probabilities self.priors = self._get_priors(nlabels, nsamples, nsamples_per_class) # Estimate variances # better loop than repmat! ;) for s, l in zip(X, labels): il = label2index[l] # index of the label variances[il] += (s - means[il])**2 ## Actually compute the variances if params.common_variance: # we need to get global std cvar = np.sum(variances, axis=0)/nsamples # sum across labels # broadcast the same variance across labels variances[:] = cvar else: variances[non0labels] /= nsamples_per_class[non0labels] # Precompute and store weighting coefficient for Gaussian if params.logprob: # it would be added to exponent self._norm_weight = -0.5 * np.log(2*np.pi*variances) else: self._norm_weight = 1.0/np.sqrt(2*np.pi*variances) if __debug__ and 'GNB' in debug.active: debug('GNB', "training finished on data.shape=%s " % (X.shape, ) + "min:max(data)=%f:%f" % (np.min(X), np.max(X))) def _untrain(self): """Untrain classifier and reset all learnt params """ self.means = None self.variances = None self.ulabels = None self.priors = None super(GNB, self)._untrain() @accepts_dataset_as_samples def _predict(self, data): """Predict the output for the provided data. 
""" params = self.params # argument of exponentiation scaled_distances = \ -0.5 * (((data - self.means[:, np.newaxis, ...])**2) \ / self.variances[:, np.newaxis, ...]) if params.logprob: # if self.params.common_variance: # XXX YOH: # For decision there is no need to actually compute # properly scaled p, ie 1/sqrt(2pi * sigma_i) could be # simply discarded since it is common across features AND # classes # For completeness -- computing everything now even in logprob lprob_csfs = self._norm_weight[:, np.newaxis, ...] \ + scaled_distances # XXX for now just cut/paste with different operators, but # could just bind them and reuse in the same equations # Naive part -- just a product of probabilities across features ## First we need to reshape to get class x samples x features lprob_csf = lprob_csfs.reshape( lprob_csfs.shape[:2] + (-1,)) ## Now -- sum across features lprob_cs = lprob_csf.sum(axis=2) # Incorporate class probabilities: prob_cs_cp = lprob_cs + np.log(self.priors[:, np.newaxis]) else: # Just a regular Normal distribution with per # feature/class mean and variances prob_csfs = \ self._norm_weight[:, np.newaxis, ...] \ * np.exp(scaled_distances) # Naive part -- just a product of probabilities across features ## First we need to reshape to get class x samples x features prob_csf = prob_csfs.reshape( prob_csfs.shape[:2] + (-1,)) ## Now -- product across features prob_cs = prob_csf.prod(axis=2) # Incorporate class probabilities: prob_cs_cp = prob_cs * self.priors[:, np.newaxis] # Normalize by evidence P(data) if params.normalize: if params.logprob: prob_cs_cp_real = np.exp(prob_cs_cp) else: prob_cs_cp_real = prob_cs_cp prob_s_cp_marginals = np.sum(prob_cs_cp_real, axis=0) if params.logprob: prob_cs_cp -= np.log(prob_s_cp_marginals) else: prob_cs_cp /= prob_s_cp_marginals # Take the class with maximal (log)probability winners = prob_cs_cp.argmax(axis=0) predictions = [self.ulabels[c] for c in winners] # set to the probabilities per class self.ca.estimates = prob_cs_cp.T if __debug__ and 'GNB' in debug.active: debug('GNB', "predict on data.shape=%s min:max(data)=%f:%f " % (data.shape, np.min(data), np.max(data))) return predictions
class SearchlightHyperalignment(ClassWithCollections): """ Given a list of datasets, provide a list of mappers into common space using searchlight based hyperalignment. :ref:`Guntupalli et al., Cerebral Cortex (2016)` 1) Input datasets should all be of the same size in terms of nsamples and nfeatures, and be coarsely aligned (using anatomy). 2) All features in all datasets should be zscored. 3) Datasets should have feature attribute `voxel_indices` containing spatial coordinates of all features """ # TODO: add {training_,}residual_errors .ca ? ## Parameters common with Hyperalignment but overriden ref_ds = Parameter( 0, constraints=EnsureInt() & EnsureRange(min=0), doc="""Index of a dataset to use as a reference. First dataset is used as default. If you supply exclude_from_model list, you should supply the ref_ds index as index before you remove those excluded datasets. Note that unlike regular Hyperalignment, there is no automagic choosing of the "best" ref_ds by default.""") ## Parameters specific to SearchlightHyperalignment queryengine = Parameter( None, doc="""A single (or a list of query engines, one per each dataset) to be used. If not provided, volumetric searchlight, with spherical neighborhood as instructed by radius parameter will be used.""") radius = Parameter( 3, constraints=EnsureInt() & EnsureRange(min=1), doc="""Radius of a searchlight sphere in number of voxels to be used if no `queryengine` argument was provided.""") nproc = Parameter(1, constraints=EnsureInt() & EnsureRange(min=1) | EnsureNone(), doc="""Number of cores to use.""") nblocks = Parameter( None, constraints=EnsureInt() & EnsureRange(min=1) | EnsureNone(), doc="""Number of blocks to divide to process. Higher number results in smaller memory consumption.""") sparse_radius = Parameter( None, constraints=(EnsureRange(min=1) & EnsureInt() | EnsureNone()), doc="""Radius supplied to scatter_neighborhoods in units of voxels. This is effectively the distance between the centers where hyperalignment is performed in searchlights. ATM applicable only if no custom queryengine was provided. If None, hyperalignment is performed at every voxel (default).""") hyperalignment = Parameter( Hyperalignment(ref_ds=None), doc="""Hyperalignment instance to be used in each searchlight sphere. Default is just the Hyperalignment instance with default parameters. Its `ref_ds` parameter would be overridden by the `ref_ds` parameter of this SearchlightHyperalignment instance because we want to be consistent and only need one `ref_ds`.""") combine_neighbormappers = Parameter( True, constraints=EnsureBool(), doc="""This param determines whether to combine mappers for each voxel from its neighborhood searchlights or just use the mapper for which it is the center voxel. This will not be applicable for certain queryengines whose ids and neighborhoods are from different spaces, such as for SurfaceVerticesQueryEngine""") compute_recon = Parameter( True, constraints=EnsureBool(), doc="""This param determines whether to compute reverse mappers for each subject from common-space to subject space. These will be stored in the StaticProjectionMapper() and used when reverse() is called. Enabling it will double the size of the mappers returned.""") featsel = Parameter( 1.0, constraints=EnsureFloat() & EnsureRange(min=0.0, max=1.0) | EnsureInt() & EnsureRange(min=2), doc= """Determines if feature selection will be performed in each searchlight. 1.0: Use all features. 
< 1.0 is understood as selecting that proportion of features in each searchlight of ref_ds using feature scores; > 1.0 is understood as selecting at most that many features in each searchlight.""") # TODO: Should we get rid of this feature? use_same_features = Parameter( False, constraints=EnsureBool(), doc="""Select the same (best) features when doing feature selection for all datasets.""") exclude_from_model = Parameter( [], constraints=EnsureListOf(int), doc="""List of dataset indices that will not participate in building common model. These will still get mappers back but they don't influence the model or voxel selection.""") mask_node_ids = Parameter( None, constraints=EnsureListOf(int) | EnsureNone(), doc="""You can specify a mask to compute searchlight hyperalignment only within this mask. These would be a list of voxel indices.""") dtype = Parameter( 'float32', constraints='str', doc="""dtype of elements transformation matrices to save on memory for big datasets""") results_backend = Parameter( 'hdf5', constraints=EnsureChoice('hdf5', 'native'), doc="""'hdf5' or 'native'. See Searchlight documentation.""") tmp_prefix = Parameter( 'tmpsl', constraints='str', doc="""Prefix for temporary files. See Searchlight documentation.""") def __init__(self, **kwargs): _shpaldebug("Initializing.") ClassWithCollections.__init__(self, **kwargs) self.ndatasets = 0 self.nfeatures = 0 self.projections = None # This option makes the roi_seed in each SL to be selected during feature selection self.force_roi_seed = True if self.params.nproc is not None and self.params.nproc > 1 \ and not externals.exists('pprocess'): raise RuntimeError("The 'pprocess' module is required for " "multiprocess searchlights. Please either " "install python-pprocess, or reduce `nproc` " "to 1 (got nproc=%i) or set to default None" % self.params.nproc) if not externals.exists('scipy'): raise RuntimeError("The 'scipy' module is required for " "searchlight hyperalignment.") if self.params.results_backend == 'native': raise NotImplementedError( "'native' mode to handle results is still a " "work in progress.") #warning("results_backend is set to 'native'. This has been known" # "to result in longer run time when working with big datasets.") if self.params.results_backend == 'hdf5' and \ not externals.exists('h5py'): raise RuntimeError("The 'hdf5' module is required for " "when results_backend is set to 'hdf5'") def _proc_block(self, block, datasets, featselhyper, queryengines, seed=None, iblock='main'): if seed is not None: mvpa2.seed(seed) if __debug__: debug('SLC', 'Starting computing block for %i elements' % len(block)) bar = ProgressBar() projections = [ csc_matrix((self.nfeatures, self.nfeatures), dtype=self.params.dtype) for isub in range(self.ndatasets) ] for i, node_id in enumerate(block): # retrieve the feature ids of all features in the ROI from the query # engine # Find the neighborhood for that selected nearest node roi_feature_ids_all = [qe[node_id] for qe in queryengines] # handling queryengines that return AttrDatasets for isub in range(len(roi_feature_ids_all)): if is_datasetlike(roi_feature_ids_all[isub]): # making sure queryengine returned proper shaped output assert (roi_feature_ids_all[isub].nsamples == 1) roi_feature_ids_all[isub] = roi_feature_ids_all[ isub].samples[0, :].tolist() if len(roi_feature_ids_all) == 1: # just one was provided to be "broadcasted" roi_feature_ids_all *= len(datasets) # if qe returns zero-sized ROI for any subject, pass... 
if any(len(x) == 0 for x in roi_feature_ids_all): continue # selecting neighborhood for all subject for hyperalignment ds_temp = [ sd[:, ids] for sd, ids in zip(datasets, roi_feature_ids_all) ] if self.force_roi_seed: roi_seed = np.array( roi_feature_ids_all[self.params.ref_ds]) == node_id ds_temp[self.params.ref_ds].fa['roi_seed'] = roi_seed if __debug__: msg = 'ROI (%i/%i), %i features' % ( i + 1, len(block), ds_temp[self.params.ref_ds].nfeatures) debug('SLC', bar(float(i + 1) / len(block), msg), cr=True) hmappers = featselhyper(ds_temp) assert (len(hmappers) == len(datasets)) roi_feature_ids_ref_ds = roi_feature_ids_all[self.params.ref_ds] for isub, roi_feature_ids in enumerate(roi_feature_ids_all): if not self.params.combine_neighbormappers: I = roi_feature_ids #J = [roi_feature_ids[node_id]] * len(roi_feature_ids) J = [node_id] * len(roi_feature_ids) V = hmappers[isub].tolist() if np.isscalar(V): V = [V] else: I, J, V = [], [], [] for f2, roi_feature_id_ref_ds in enumerate( roi_feature_ids_ref_ds): I += roi_feature_ids J += [roi_feature_id_ref_ds] * len(roi_feature_ids) V += hmappers[isub][:, f2].tolist() proj = coo_matrix( (V, (I, J)), shape=(max(self.nfeatures, max(I) + 1), max(self.nfeatures, max(J) + 1)), dtype=self.params.dtype) proj = proj.tocsc() # Cleaning up the current subject's projections to free up memory hmappers[isub] = [[] for _ in hmappers] projections[isub] = projections[isub] + proj if self.params.results_backend == 'native': return projections elif self.params.results_backend == 'hdf5': # store results in a temporary file and return a filename results_file = mktemp(prefix=self.params.tmp_prefix, suffix='-%s.hdf5' % iblock) if __debug__: debug('SLC', "Storing results into %s" % results_file) h5save(results_file, projections) if __debug__: debug('SLC_', "Results stored") return results_file else: raise RuntimeError("Must not reach this point") def __handle_results(self, results): if self.params.results_backend == 'hdf5': # 'results' must be just a filename assert (isinstance(results, str)) if __debug__: debug('SLC', "Loading results from %s" % results) results_data = h5load(results) os.unlink(results) if __debug__: debug('SLC_', "Loaded results of len=%d from" % len(results_data)) for isub, res in enumerate(results_data): self.projections[isub] = self.projections[isub] + res if __debug__: debug('SLC_', "Finished adding results") return def __handle_all_results(self, results): """Helper generator to decorate passing the results out to results_fx """ for r in results: yield self.__handle_results(r) @due.dcite( Doi('10.1093/cercor/bhw068'), description="Full cortex hyperalignment of data to a common space", tags=["implementation"]) def __call__(self, datasets): """Estimate mappers for each dataset using searchlight-based hyperalignment. Parameters ---------- datasets : list or tuple of datasets Returns ------- A list of trained StaticProjectionMappers of the same length as datasets """ # Perform some checks first before modifying internal state params = self.params ndatasets = len(datasets) if len(datasets) <= 1: raise ValueError("SearchlightHyperalignment needs > 1 dataset to " "operate on. Got: %d" % self.ndatasets) if params.ref_ds in params.exclude_from_model: raise ValueError("Requested reference dataset %i is also " "in the exclude list." % params.ref_ds) if params.ref_ds >= ndatasets: raise ValueError("Requested reference dataset %i is out of " "bounds. 
We have only %i datasets provided" % (params.ref_ds, self.ndatasets)) # The rest of the checks are just warnings self.ndatasets = ndatasets _shpaldebug("SearchlightHyperalignment %s for %i datasets" % (self, self.ndatasets)) selected = [ _ for _ in range(ndatasets) if _ not in params.exclude_from_model ] ref_ds_train = selected.index(params.ref_ds) params.hyperalignment.params.ref_ds = ref_ds_train warning('Using %dth dataset as the reference dataset (%dth after ' 'excluding datasets)' % (params.ref_ds, ref_ds_train)) if len(params.exclude_from_model) > 0: warning("These datasets will not participate in building common " "model: %s" % params.exclude_from_model) if __debug__: # verify that datasets were zscored prior the alignment since it is # assumed/required preprocessing step for ids, ds in enumerate(datasets): for f, fname, tval in ((np.mean, 'means', 0), (np.std, 'stds', 1)): vals = f(ds, axis=0) vals_comp = np.abs(vals - tval) > 1e-5 if np.any(vals_comp): warning( '%d %s are too different (max diff=%g) from %d in ' 'dataset %d to come from a zscored dataset. ' 'Please zscore datasets first for correct operation ' '(unless if was intentional)' % (np.sum(vals_comp), fname, np.max( np.abs(vals)), tval, ids)) # Setting up SearchlightHyperalignment # we need to know which original features where comprising the # individual SL ROIs _shpaldebug('Initializing FeatureSelectionHyperalignment.') hmeasure = FeatureSelectionHyperalignment( ref_ds=params.ref_ds, featsel=params.featsel, hyperalignment=params.hyperalignment, full_matrix=params.combine_neighbormappers, use_same_features=params.use_same_features, exclude_from_model=params.exclude_from_model, dtype=params.dtype) # Performing SL processing manually _shpaldebug("Setting up for searchlights") if params.nproc is None and externals.exists('pprocess'): import pprocess try: params.nproc = pprocess.get_number_of_cores() or 1 except AttributeError: warning("pprocess version %s has no API to figure out maximal " "number of cores. Using 1" % externals.versions['pprocess']) params.nproc = 1 # XXX I think this class should already accept a single dataset only. # It should have a ``space`` setting that names a sample attribute that # can be used to identify individual/original datasets. # Taking a single dataset as argument would be cleaner, because the # algorithm relies on the assumption that there is a coarse feature # alignment, i.e. the SL ROIs cover roughly the same area queryengines = self._get_trained_queryengines(datasets, params.queryengine, params.radius, params.ref_ds) # For surface nodes to voxels queryengines, roi_seed hardly makes sense qe = queryengines[(0 if len(queryengines) == 1 else params.ref_ds)] if isinstance(qe, SurfaceVerticesQueryEngine): self.force_roi_seed = False if not self.params.combine_neighbormappers: raise NotImplementedError( "Mapping from voxels to surface nodes is not " "implmented yet. Try setting combine_neighbormappers to True." 
) self.nfeatures = datasets[params.ref_ds].nfeatures _shpaldebug("Performing Hyperalignment in searchlights") # Setting up centers for running SL Hyperalignment if params.sparse_radius is None: roi_ids = self._get_verified_ids(queryengines) \ if params.mask_node_ids is None \ else params.mask_node_ids else: if params.queryengine is not None: raise NotImplementedError( "using sparse_radius whenever custom queryengine is " "provided is not yet supported.") _shpaldebug("Setting up sparse neighborhood") from mvpa2.misc.neighborhood import scatter_neighborhoods if params.mask_node_ids is None: scoords, sidx = scatter_neighborhoods( Sphere(params.sparse_radius), datasets[params.ref_ds].fa.voxel_indices, deterministic=True) roi_ids = sidx else: scoords, sidx = scatter_neighborhoods( Sphere(params.sparse_radius), datasets[params.ref_ds].fa.voxel_indices[ params.mask_node_ids], deterministic=True) roi_ids = [params.mask_node_ids[sid] for sid in sidx] # Initialize projections _shpaldebug('Initializing projection matrices') self.projections = [ csc_matrix((self.nfeatures, self.nfeatures), dtype=params.dtype) for isub in range(self.ndatasets) ] # compute if params.nproc is not None and params.nproc > 1: # split all target ROIs centers into `nproc` equally sized blocks nproc_needed = min(len(roi_ids), params.nproc) params.nblocks = nproc_needed \ if params.nblocks is None else params.nblocks params.nblocks = min(len(roi_ids), params.nblocks) node_blocks = np.array_split(roi_ids, params.nblocks) # the next block sets up the infrastructure for parallel computing # this can easily be changed into a ParallelPython loop, if we # decide to have a PP job server in PyMVPA import pprocess p_results = pprocess.Map(limit=nproc_needed) if __debug__: debug( 'SLC', "Starting off %s child processes for nblocks=%i" % (nproc_needed, params.nblocks)) compute = p_results.manage(pprocess.MakeParallel(self._proc_block)) seed = mvpa2.get_random_seed() for iblock, block in enumerate(node_blocks): # should we maybe deepcopy the measure to have a unique and # independent one per process? compute(block, datasets, copy.copy(hmeasure), queryengines, seed=seed, iblock=iblock) else: # otherwise collect the results in an 1-item list _shpaldebug('Using 1 process to compute mappers.') if params.nblocks is None: params.nblocks = 1 params.nblocks = min(len(roi_ids), params.nblocks) node_blocks = np.array_split(roi_ids, params.nblocks) p_results = [ self._proc_block(block, datasets, hmeasure, queryengines) for block in node_blocks ] results_ds = self.__handle_all_results(p_results) # Dummy iterator for, you know, iteration list(results_ds) _shpaldebug( 'Wrapping projection matrices into StaticProjectionMappers') self.projections = [ StaticProjectionMapper(proj=proj, recon=proj.T) if params.compute_recon else StaticProjectionMapper(proj=proj) for proj in self.projections ] return self.projections def _get_verified_ids(self, queryengines): """Helper to return ids of queryengines, verifying that they are the same""" qe0 = queryengines[0] roi_ids = qe0.ids for qe in queryengines: if qe is not qe0: # if a different query engine (so wasn't just replicated) if np.any(qe.ids != qe0.ids): raise RuntimeError( "Query engine %s provided different ids than %s. 
Not supported" % (qe0, qe)) return roi_ids def _get_trained_queryengines(self, datasets, queryengine, radius, ref_ds): """Helper to return trained query engine(s): either a list with a single engine or one engine per dataset. If queryengine is None, an IndexQueryEngine based on radius is created. """ ndatasets = len(datasets) if queryengine: if isinstance(queryengine, (list, tuple)): queryengines = queryengine if len(queryengines) != ndatasets: raise ValueError( "%d query engines were specified although %d datasets " "were provided" % (len(queryengines), ndatasets)) _shpaldebug("Training provided query engines") for qe, ds in zip(queryengines, datasets): qe.train(ds) else: queryengine.train(datasets[ref_ds]) queryengines = [queryengine] else: _shpaldebug( 'No custom query engines were provided. Setting up the ' 'volumetric query engine on voxel_indices.') queryengine = IndexQueryEngine(voxel_indices=Sphere(radius)) queryengine.train(datasets[ref_ds]) queryengines = [queryengine] return queryengines
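# Illustrative, standalone sketch of how _proc_block above accumulates
# per-searchlight projections: every sphere contributes a small set of
# (row, col, value) triplets of an nfeatures x nfeatures matrix, and the
# contributions of all spheres for one subject are summed in sparse form.
# All sizes and values below are toy numbers.

import numpy as np
from scipy.sparse import coo_matrix, csc_matrix

nfeatures = 6
total = csc_matrix((nfeatures, nfeatures), dtype='float32')

# two fake "searchlights": subject feature ids I map onto reference ids J with weights V
for I, J, V in [([0, 1, 2], [0, 0, 0], [0.5, 0.3, 0.2]),
                ([2, 3], [3, 3], [0.9, 0.1])]:
    proj = coo_matrix((V, (I, J)), shape=(nfeatures, nfeatures),
                      dtype='float32').tocsc()
    total = total + proj

# total.toarray() now holds the combined (still unnormalized) projection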
class PDistConsistency(Measure): """Calculate the correlations of PDist measures across chunks This measures the consistency in similarity structure across runs within individuals, or across individuals if the target dataset is made from several subjects in some common space and where the sample attribute specified as the chunks_attr codes for subject identity. @author: ACC Aug 2013 """ is_trained = True """Indicate that this measure is always trained.""" chunks_attr = Parameter('chunks', constraints='str', doc="""\ Chunks attribute to use for chunking dataset. Can be any samples attribute.""") pairwise_metric = Parameter('correlation', constraints='str', doc="""\ Distance metric to use for calculating dissimilarity matrices from the set of samples in each chunk specified. See spatial.distance.pdist for all possible metrics.""") consistency_metric = Parameter('pearson', constraints=EnsureChoice('pearson', 'spearman'), doc="""\ Correlation measure to use for the correlation between dissimilarity matrices.""") center_data = Parameter(False, constraints='bool', doc="""\ If True then center each column of the data matrix by subtracing the column mean from each element. This is recommended especially when using pairwise_metric='correlation'.""") square = Parameter(False, constraints='bool', doc="""\ If True return the square distance matrix, if False, returns the flattened upper triangle.""") def __init__(self, **kwargs): """ Returns ------- Dataset Contains the pairwise correlations between the DSMs computed from each chunk of the input dataset. If square is False, this is a column vector of length N(N-1)/2 for N chunks. If square is True, this is a square matrix of size NxN for N chunks. """ # TODO: Another metric for consistency metric could be the "Rv" # coefficient... (ac) # init base classes first Measure.__init__(self, **kwargs) def _call(self, dataset): """Computes the average correlation in similarity structure across chunks.""" chunks_attr = self.params.chunks_attr nchunks = len(dataset.sa[chunks_attr].unique) if nchunks < 2: raise StandardError("This measure calculates similarity consistency across " "chunks and is not meaningful for datasets with only " "one chunk:") dsms = [] chunks = [] for chunk in dataset.sa[chunks_attr].unique: data = np.atleast_2d( dataset.samples[dataset.sa[chunks_attr].value == chunk,:]) if self.params.center_data: data = data - np.mean(data,0) dsm = pdist(data, self.params.pairwise_metric) dsms.append(dsm) chunks.append(chunk) dsms = np.vstack(dsms) if self.params.consistency_metric=='spearman': dsms = np.apply_along_axis(rankdata, 1, dsms) corrmat = np.corrcoef(dsms) if self.params.square: ds = Dataset(corrmat, sa={self.params.chunks_attr: chunks}) else: ds = Dataset(squareform(corrmat,checks=False), sa=dict(pairs=list(combinations(chunks, 2)))) return ds
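# Illustrative, standalone sketch (plain numpy/scipy, no Dataset objects) of
# the computation PDistConsistency performs: one dissimilarity vector per
# chunk, then the correlation between those vectors across chunks. The random
# data is only for demonstration.

import numpy as np
from scipy.spatial.distance import pdist

rng = np.random.RandomState(0)
chunk_data = [rng.randn(8, 20) for _ in range(3)]   # 3 chunks, 8 samples, 20 features

dsms = np.vstack([pdist(d, 'correlation') for d in chunk_data])
consistency = np.corrcoef(dsms)                      # 3 x 3 matrix of DSM correlations
# the upper triangle of `consistency` is what the measure returns when square=False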
class Hyperalignment(ClassWithCollections): """Align the features across multiple datasets into a common feature space. This is a three-level algorithm. In the first level, a series of input datasets is projected into a common feature space using a configurable mapper. The common space is initially defined by a chosen exemplar from the list of input datasets, but is subsequently refined by iteratively combining the common space with the projected input datasets. In the second (optional) level, the original input datasets are again aligned with (or projected into) the intermediate first-level common space. Through a configurable number of iterations the common space is further refined by repeated projections of the input datasets and combination/aggregation of these projections into an updated common space. In the third level, the input datasets are again aligned with the, now final, common feature space. The output of this algorithm are trained mappers (one for each input dataset) that transform the individual features spaces into the common space. Level 1 and 2 are performed by the ``train()`` method, and level 3 is performed when the trained Hyperalignment instance is called with a list of datasets. This dataset list may or may not be identical to the training datasets. The default values for the parameters of the algorithm (e.g. projection via Procrustean transformation, common space aggregation by averaging) resemble the setup reported in :ref:`Haxby et al., Neuron (2011) <HGC+11>` *A common, high-dimensional model of the representational space in human ventral temporal cortex.* Examples -------- >>> # get some example data >>> from mvpa2.testing.datasets import datasets >>> from mvpa2.misc.data_generators import random_affine_transformation >>> ds4l = datasets['uni4large'] >>> # generate a number of distorted variants of this data >>> dss = [random_affine_transformation(ds4l) for i in xrange(4)] >>> ha = Hyperalignment() >>> ha.train(dss) >>> mappers = ha(dss) >>> len(mappers) 4 """ training_residual_errors = ConditionalAttribute( enabled=False, doc="""Residual error (norm of the difference between common space and projected data) per each training dataset at each level. The residuals are stored in a dataset with one row per level, and one column per input dataset. The first row corresponds to the error 1st-level of hyperalignment the remaining rows store the residual errors for each 2nd-level iteration.""") residual_errors = ConditionalAttribute( enabled=False, doc="""Residual error (norm of the difference between common space and projected data) per each dataset. The residuals are stored in a single-row dataset with one column per input dataset.""") # XXX Who cares whether it was chosen, or specified? This should be just # 'ref_ds' chosen_ref_ds = ConditionalAttribute( enabled=True, doc="""Index of the input dataset used as 1st-level reference dataset.""") # Lets use built-in facilities to specify parameters which # constructor should accept # the ``space`` of the mapper determines where the algorithm places the # common space definition in the datasets alignment = Parameter( ProcrusteanMapper(space='commonspace'), # might provide allowedtype # XXX Currently, there's no way to handle this with constraints doc="""The multidimensional transformation mapper. 
If `None` (default) an instance of :class:`~mvpa2.mappers.procrustean.ProcrusteanMapper` is used.""") output_dim = Parameter( None, constraints=(EnsureInt() & EnsureRange(min=1) | EnsureNone()), doc="""Output common space dimensionality. If None, datasets are aligned to the features of the `ref_ds`. Otherwise, dimensionality reduction is performed using SVD and only the top SVs are kept. To get all features in SVD-aligned space, give output_dim>=nfeatures. """) alpha = Parameter( 1, constraints=EnsureFloat() & EnsureRange(min=0, max=1), doc="""Regularization parameter to traverse between (Shrinkage)-CCA (canonical correlation analysis) and regular hyperalignment. Setting alpha to 1 makes the algorithm identical to hyperalignment and alpha of 0 makes it CCA. By default, it is 1, therefore hyperalignment. """) level2_niter = Parameter(1, constraints=EnsureInt() & EnsureRange(min=0), doc="Number of 2nd-level iterations.") ref_ds = Parameter( None, constraints=(EnsureRange(min=0) & EnsureInt() | EnsureNone()), doc="""Index of a dataset to use as 1st-level common space reference. If `None`, then the dataset with the maximum number of features is used.""") nproc = Parameter( 1, constraints=EnsureInt(), doc="""Number of processes to use to parallelize the last step of alignment. If different from 1, it passes it as n_jobs to `joblib.Parallel`. Requires joblib package.""") zscore_all = Parameter( False, constraints='bool', doc="""Flag to Z-score all datasets prior hyperalignment. Turn it off if Z-scoring is not desired or was already performed. If True, returned mappers are ChainMappers with the Z-scoring prepended to the actual projection.""") zscore_common = Parameter( True, constraints='bool', doc="""Flag to Z-score the common space after each adjustment. This should be left enabled in most cases.""") combiner1 = Parameter( mean_xy, # doc="""How to update common space in the 1st-level loop. This must be a callable that takes two arguments. The first argument is one of the input datasets after projection onto the 1st-level common space. The second argument is the current 1st-level common space. The 1st-level combiner is called iteratively for each projected input dataset, except for the reference dataset. By default the new common space is the average of the current common space and the recently projected dataset.""") level1_equal_weight = Parameter( False, constraints='bool', doc="""Flag to force all datasets to have the same weight in the level 1 iteration. False (default) means each time the new common space is the average of the current common space and the newly aligned dataset, and therefore earlier datasets have less weight.""" ) combiner2 = Parameter( mean_axis0, doc="""How to combine all individual spaces to common space. This must be a callable that take a sequence of datasets as an argument. The callable must return a single array. This combiner is called once with all datasets after 1st-level projection to create an updated common space, and is subsequently called again after each 2nd-level iteration.""") joblib_backend = Parameter( None, constraints=EnsureChoice('multiprocessing', 'threading') | EnsureNone(), doc="""Backend to use for joblib when using nproc>1. Options are 'multiprocessing' and 'threading'. Default is to use 'multiprocessing' unless run on OSX which have known issues with joblib v0.10.3. 
If it is set to specific value here, then that will be used at the risk of failure.""") def __init__(self, **kwargs): ClassWithCollections.__init__(self, **kwargs) self.commonspace = None # mapper to a low-dimensional subspace derived using SVD on training data # Initializing here so that call can access it without passing after train. # Moreover, it is similar to commonspace, in that, it is required for mapping # new subjects self._svd_mapper = None @due.dcite(Doi('10.1016/j.neuron.2011.08.026'), description="Hyperalignment of data to a common space", tags=["implementation"]) def train(self, datasets): """Derive a common feature space from a series of datasets. Parameters ---------- datasets : sequence of datasets Returns ------- A list of trained Mappers matching the number of input datasets. """ params = self.params # for quicker access ;) ca = self.ca # Check to make sure we get a list of datasets as input. if not isinstance(datasets, (list, tuple, np.ndarray)): raise TypeError("Input datasets should be a sequence " "(of type list, tuple, or ndarray) of datasets.") ndatasets = len(datasets) nfeatures = [ds.nfeatures for ds in datasets] alpha = params.alpha residuals = None if ca['training_residual_errors'].enabled: residuals = np.zeros((1 + params.level2_niter, ndatasets)) ca.training_residual_errors = Dataset( samples=residuals, sa={ 'levels': ['1'] + ['2:%i' % i for i in xrange(params.level2_niter)] }) if __debug__: debug('HPAL', "Hyperalignment %s for %i datasets" % (self, ndatasets)) if params.ref_ds is None: ref_ds = np.argmax(nfeatures) else: ref_ds = params.ref_ds # Making sure that ref_ds is within range. #Parameter() already checks for it being a non-negative integer if ref_ds >= ndatasets: raise ValueError, "Requested reference dataset %i is out of " \ "bounds. We have only %i datasets provided" \ % (ref_ds, ndatasets) ca.chosen_ref_ds = ref_ds # zscore all data sets # ds = [ zscore(ds, chunks_attr=None) for ds in datasets] # TODO since we are doing in-place zscoring create deep copies # of the datasets with pruned targets and shallow copies of # the collections (if they would come needed in the transformation) # TODO: handle floats and non-floats differently to prevent # waste of memory if there is no need (e.g. no z-scoring) #otargets = [ds.sa.targets for ds in datasets] datasets = [ds.copy(deep=False) for ds in datasets] #datasets = [Dataset(ds.samples.astype(float), sa={'targets': [None] * len(ds)}) #datasets = [Dataset(ds.samples, sa={'targets': [None] * len(ds)}) # for ds in datasets] if params.zscore_all: if __debug__: debug('HPAL', "Z-scoring all datasets") for ids in xrange(len(datasets)): zmapper = ZScoreMapper(chunks_attr=None) zmapper.train(datasets[ids]) datasets[ids] = zmapper.forward(datasets[ids]) if alpha < 1: datasets, wmappers = self._regularize(datasets, alpha) # initial common space is the reference dataset commonspace = datasets[ref_ds].samples # the reference dataset might have been zscored already, don't do it # twice if params.zscore_common and not params.zscore_all: if __debug__: debug( 'HPAL_', "Creating copy of a commonspace and assuring " "it is of a floating type") commonspace = commonspace.astype(float) zscore(commonspace, chunks_attr=None) # If there is only one dataset in training phase, there is nothing to be done # just use that data as the common space if len(datasets) < 2: self.commonspace = commonspace else: # create a mapper per dataset # might prefer some other way to initialize... 
later mappers = [deepcopy(params.alignment) for ds in datasets] # # Level 1 -- initial projection # lvl1_projdata = self._level1(datasets, commonspace, ref_ds, mappers, residuals) # # Level 2 -- might iterate multiple times # # this is the final common space self.commonspace = self._level2(datasets, lvl1_projdata, mappers, residuals) if params.output_dim is not None: mappers = self._level3(datasets) self._svd_mapper = SVDMapper() self._svd_mapper.train(self._map_and_mean(datasets, mappers)) self._svd_mapper = StaticProjectionMapper( proj=self._svd_mapper.proj[:, :params.output_dim]) def __call__(self, datasets): """Derive a common feature space from a series of datasets. Parameters ---------- datasets : sequence of datasets Returns ------- A list of trained Mappers matching the number of input datasets. """ if self.commonspace is None: self.train(datasets) else: # Check to make sure we get a list of datasets as input. if not isinstance(datasets, (list, tuple, np.ndarray)): raise TypeError( "Input datasets should be a sequence " "(of type list, tuple, or ndarray) of datasets.") # place datasets into a copy of the list since items # will be reassigned datasets = list(datasets) params = self.params # for quicker access ;) alpha = params.alpha # for letting me be lazy ;) if params.zscore_all: if __debug__: debug('HPAL', "Z-scoring all datasets") # zscore them once while storing corresponding ZScoreMapper's # so we can assemble a comprehensive mapper at the end # (together with procrustes) zmappers = [] for ids in xrange(len(datasets)): zmapper = ZScoreMapper(chunks_attr=None) zmappers.append(zmapper) zmapper.train(datasets[ids]) datasets[ids] = zmapper.forward(datasets[ids]) if alpha < 1: datasets, wmappers = self._regularize(datasets, alpha) # # Level 3 -- final, from-scratch, alignment to final common space # mappers = self._level3(datasets) # return trained mappers for projection from all datasets into the # common space if params.zscore_all: # We need to construct new mappers which would chain # zscore and then final transformation if params.alpha < 1: mappers = [ ChainMapper([zm, wm, m]) for zm, wm, m in zip(zmappers, wmappers, mappers) ] else: mappers = [ ChainMapper([zm, m]) for zm, m in zip(zmappers, mappers) ] elif params.alpha < 1: mappers = [ ChainMapper([wm, m]) for wm, m in zip(wmappers, mappers) ] if params.output_dim is not None: mappers = [ChainMapper([m, self._svd_mapper]) for m in mappers] return mappers def _regularize(self, datasets, alpha): if __debug__: debug('HPAL', "Using regularized hyperalignment with alpha of %d" % alpha) wmappers = [] for ids in xrange(len(datasets)): U, S, Vh = np.linalg.svd(datasets[ids]) S = 1 / np.sqrt((1 - alpha) * np.square(S) + alpha) S.resize(len(Vh)) S = np.matrix(np.diag(S)) W = np.matrix(Vh.T) * S * np.matrix(Vh) wmapper = StaticProjectionMapper(proj=W, auto_train=False) wmapper.train(datasets[ids]) wmappers.append(wmapper) datasets[ids] = wmapper.forward(datasets[ids]) return datasets, wmappers def _level1(self, datasets, commonspace, ref_ds, mappers, residuals): params = self.params # for quicker access ;) data_mapped = [ds.samples for ds in datasets] counts = 1 # number of datasets used so far for generating commonspace for i, (m, ds_new) in enumerate(zip(mappers, datasets)): if __debug__: debug('HPAL_', "Level 1: ds #%i" % i) if i == ref_ds: continue # assign common space to ``space`` of the mapper, because this is # where it will be looking for it ds_new.sa[m.get_space()] = commonspace # find transformation of this dataset into the 
current common space m.train(ds_new) # remove common space attribute again to save on memory when the # common space is updated for the next iteration del ds_new.sa[m.get_space()] # project this dataset into the current common space ds_ = m.forward(ds_new.samples) if params.zscore_common: zscore(ds_, chunks_attr=None) # replace original dataset with mapped one -- only the reference # dataset will remain unchanged data_mapped[i] = ds_ # compute first-level residuals wrt to the initial common space if residuals is not None: residuals[0, i] = np.linalg.norm(ds_ - commonspace) # Update the common space. This is an incremental update after # processing each 1st-level dataset. Maybe there should be a flag # to make a batch update after processing all 1st-level datasets # to an identical 1st-level common space # TODO: make just a function so we dont' waste space if params.level1_equal_weight: commonspace = params.combiner1(ds_, commonspace, weights=(float(counts), 1.0)) else: commonspace = params.combiner1(ds_, commonspace) counts += 1 if params.zscore_common: zscore(commonspace, chunks_attr=None) return data_mapped def _level2(self, datasets, lvl1_data, mappers, residuals): params = self.params # for quicker access ;) data_mapped = lvl1_data # aggregate all processed 1st-level datasets into a new 2nd-level # common space commonspace = params.combiner2(data_mapped) # XXX Why is this commented out? Who knows what combiner2 is doing and # whether it changes the distribution of the data #if params.zscore_common: #zscore(commonspace, chunks_attr=None) ndatasets = len(datasets) for loop in xrange(params.level2_niter): # 2nd-level alignment starts from the original/unprojected datasets # again for i, (m, ds_new) in enumerate(zip(mappers, datasets)): if __debug__: debug('HPAL_', "Level 2 (%i-th iteration): ds #%i" % (loop, i)) # Optimization speed up heuristic # Slightly modify the common space towards other feature # spaces and reduce influence of this feature space for the # to-be-computed projection temp_commonspace = (commonspace * ndatasets - data_mapped[i]) \ / (ndatasets - 1) if params.zscore_common: zscore(temp_commonspace, chunks_attr=None) # assign current common space ds_new.sa[m.get_space()] = temp_commonspace # retrain the mapper for this dataset m.train(ds_new) # remove common space attribute again to save on memory when the # common space is updated for the next iteration del ds_new.sa[m.get_space()] # obtain the 2nd-level projection ds_ = m.forward(ds_new.samples) if params.zscore_common: zscore(ds_, chunks_attr=None) # store for 2nd-level combiner data_mapped[i] = ds_ # compute residuals if residuals is not None: residuals[1 + loop, i] = np.linalg.norm(ds_ - commonspace) commonspace = params.combiner2(data_mapped) # and again if params.zscore_common: zscore(commonspace, chunks_attr=None) # return the final common space return commonspace def _level3(self, datasets): params = self.params # for quicker access ;) # create a mapper per dataset mappers = [deepcopy(params.alignment) for ds in datasets] # key different from level-2; the common space is uniform #temp_commonspace = commonspace # Fixing nproc=0 if params.nproc == 0: from mvpa2.base import warning warning("nproc of 0 doesn't make sense. Setting nproc to 1.") params.nproc = 1 # Checking for joblib, if not, set nproc to 1 if params.nproc != 1: from mvpa2.base import externals, warning if not externals.exists('joblib'): warning( "Setting nproc different from 1 requires joblib package, which " "does not seem to exist. 
Setting nproc to 1.") params.nproc = 1 # start from original input datasets again if params.nproc == 1: residuals = [] for i, (m, ds_new) in enumerate(zip(mappers, datasets)): if __debug__: debug('HPAL_', "Level 3: ds #%i" % i) m, residual = get_trained_mapper( ds_new, self.commonspace, m, self.ca['residual_errors'].enabled) if self.ca['residual_errors'].enabled: residuals.append(residual) else: if __debug__: debug('HPAL_', "Level 3: Using joblib with nproc = %d " % params.nproc) verbose_level_parallel = 20 \ if (__debug__ and 'HPAL' in debug.active) else 0 from joblib import Parallel, delayed import sys # joblib's 'multiprocessing' backend has known issues of failure on OSX # Tested with MacOS 10.12.13, python 2.7.13, joblib v0.10.3 if params.joblib_backend is None: params.joblib_backend = 'threading' if sys.platform == 'darwin' \ else 'multiprocessing' res = Parallel(n_jobs=params.nproc, pre_dispatch=params.nproc, backend=params.joblib_backend, verbose=verbose_level_parallel)( delayed(get_trained_mapper)( ds, self.commonspace, mapper, self.ca['residual_errors'].enabled) for ds, mapper in zip(datasets, mappers)) mappers = [m for m, r in res] if self.ca['residual_errors'].enabled: residuals = [r for m, r in res] if self.ca['residual_errors'].enabled: self.ca.residual_errors = Dataset( samples=np.array(residuals)[None, :]) return mappers def _map_and_mean(self, datasets, mappers): params = self.params data_mapped = [[] for ds in datasets] for i, (m, ds_new) in enumerate(zip(mappers, datasets)): if __debug__: debug('HPAL_', "Mapping training data for SVD: ds #%i" % i) ds_ = m.forward(ds_new.samples) # XXX should we zscore data before averaging and running SVD? # zscore(ds_, chunks_attr=None) data_mapped[i] = ds_ dss_mean = params.combiner2(data_mapped) return dss_mean