def test_deep_copying_state_variable(self):
    # A deep copy of a ConditionalAttribute has to agree with its source.
    # The `enabled` flag is flipped after construction, so the copy is
    # compared against a modified attribute rather than a pristine one.
    for initial in (True, False):
        original = ConditionalAttribute(enabled=initial, doc="Testing")
        original.enabled = not initial
        duplicate = copy.deepcopy(original)
        # same fields, same order as they are checked one by one
        for field in ('enabled', 'name', '_instance_index'):
            self.failUnlessEqual(getattr(original, field),
                                 getattr(duplicate, field))
class TestClassParametrized(TestClassProper, ClassWithCollections):
    # Test fixture: adds one parameter and one (disabled) conditional
    # attribute on top of whatever TestClassProper declares.
    p1 = Parameter(0)
    state0 = ConditionalAttribute(enabled=False)

    def __init__(self, **kwargs):
        # XXX turn this into a proper example once there is a real need
        #     to invoke the TestClassProper constructor as well:
        #       TestClassProper.__init__(self, **kwargs)
        # For now only the collections base class gets initialized.
        ClassWithCollections.__init__(self, **kwargs)
class FeatureSelection(ClassWithCollections):
    """Base class for any feature selection

    Common parent of the functors which implement feature selection on
    datasets.  Derived classes provide the actual work in ``_call``.
    """

    selected_ids = ConditionalAttribute(enabled=False)

    def __init__(self, **kwargs):
        # nothing of our own to set up -- initialize the collections only
        ClassWithCollections.__init__(self, **kwargs)

    def __call__(self, dataset, testdataset=None):
        """Invocation of the feature selection

        Parameters
        ----------
        dataset : Dataset
          dataset used to select features
        testdataset : Dataset
          dataset that might be used to compute a stopping criterion

        Returns
        -------
        Dataset or tuple
          The dataset containing the selected features.  If a
          ``testdataset`` has been passed, whatever ``_call`` produced
          (a tuple with both processed datasets) is returned instead.
          Note that the resulting dataset(s) reference the same values
          for samples attributes (e.g. labels and chunks) of the input
          dataset(s): be careful if you alter them later.
        """
        # Derived classes must provide an interface to access other
        # information relevant to the feature selection process (e.g.
        # mask, elimination step (in RFE), etc)
        processed = self._call(dataset, testdataset)

        if testdataset is not None:
            return processed
        # without a test dataset only the first element is of interest
        return processed[0]

    def untrain(self):
        """'Untrain' feature selection

        Necessary for full 'untraining' of the classifiers.  Does
        nothing by default; feature selections which need to pass the
        request on to the sensitivities have to override it.
        """
        pass
class GPRLinearWeights(Sensitivity):
    """`SensitivityAnalyzer` that reports the weights GPR trained
    on a given `Dataset`.

    For a LinearKernel the coefficients of the linear regression are
    computed explicitly, together with their variances (if requested).

    Note that the intercept is not computed.
    """

    variances = ConditionalAttribute(
        enabled=False,
        doc="Variances of the weights (for GeneralizedLinearKernel)")

    _LEGAL_CLFS = [GPR]

    def _call(self, dataset):
        """Extract weights from GPR
        """
        gpr = self.clf
        kern = gpr.kernel
        fv = gpr._train_fv

        # a plain LinearKernel gets unit scaling; any other kernel
        # supplies its own Sigma_p parameter
        Sigma_p = 1.0
        if not isinstance(kern, LinearKernel):
            Sigma_p = kern.params.Sigma_p

        projected_alpha = Ndot(fv.T, gpr._alpha)
        weights = Ndot(Sigma_p, projected_alpha)

        if self.ca.is_enabled('variances'):
            # super ugly formulas that can be quite surely improved:
            L_inv = np.linalg.inv(gpr._L)
            Kyinv = Ndot(L_inv.T, L_inv)
            # XXX in such lengthy matrix manipulations you might better off
            #     using np.matrix where * is a matrix product
            scaled_fv = Ndot(fv, Sigma_p)
            inner = Ndot(fv.T, Ndot(Kyinv, scaled_fv))
            self.ca.variances = Ndiag(Sigma_p - Ndot(Sigma_p, inner))

        return Dataset(np.atleast_2d(weights))
class SMLRWeights(Sensitivity):
    """`SensitivityAnalyzer` that reports the weights SMLR trained
    on a given `Dataset`.

    By default SMLR provides multiple weights per feature (one per label in
    training dataset). By default, all weights are combined into a single
    sensitivity value. Please, see the `FeaturewiseDatasetMeasure` constructor
    arguments how to customize this behavior.
    """

    biases = ConditionalAttribute(enabled=True,
                                  doc="A 1-d ndarray of biases")

    _LEGAL_CLFS = [SMLR]

    def _call(self, dataset=None):
        """Extract weights from SMLR classifier.

        SMLR always has weights available, so nothing has to be computed
        here; `dataset` is not used.

        Returns
        -------
        Dataset
          Transposed classifier weights; the samples attribute named by
          ``clf.params.targets_attr`` holds the matching leading labels.
        """
        clf = self.clf
        # transpose to have the number of features on the second axis
        # (as usual)
        weights = clf.weights.T

        if clf.params.has_bias:
            self.ca.biases = clf.biases

        if __debug__:
            # both parts go into a single message; they used to be glued
            # together without any separator ("...SMLRResult: ...")
            debug('SMLR',
                  "Extracting weights for %d-class SMLR. "
                  "Result: min=%f max=%f"
                  % (len(weights) + 1, np.min(weights), np.max(weights)))

        # limit the labels to the number of sensitivity sets, to deal
        # with the case of `fit_all_weights=False`
        return Dataset(
            weights,
            sa={clf.params.targets_attr: clf._ulabels[:len(weights)]})
class ElementSelector(ClassWithCollections):
    """Base class to implement functors to select some elements based on a
    sequence of values.
    """

    ndiscarded = ConditionalAttribute(
        enabled=True, doc="Store number of discarded elements.")

    def __init__(self, mode='discard', **kwargs):
        """
        Parameters
        ----------
        mode : {'discard', 'select'}
          Decides whether to `select` or to `discard` features.

        Raises
        ------
        ValueError
          If `mode` is neither 'discard' nor 'select'.
        """
        ClassWithCollections.__init__(self, **kwargs)

        # flag whether to select or to discard elements (validated below)
        self._set_mode(mode)

    ##REF: Name was automagically refactored
    def _set_mode(self, mode):
        """Choose `select` or `discard` mode.

        Raises
        ------
        ValueError
          If `mode` is neither 'discard' nor 'select'.
        """
        if mode not in ('discard', 'select'):
            # wrap into a tuple so that a tuple-valued `mode` still ends
            # up in a ValueError instead of breaking the formatting
            raise ValueError(
                "Unknown selection mode [%s]. Can only be one "
                "of 'select' or 'discard'." % (mode,))

        self.__mode = mode

    def __call__(self, seq):
        """
        Parameters
        ----------
        seq
          Sequence based on values of which to perform the selection.
          If `Dataset`, then only 1st sample is taken.

        Returns
        -------
        Whatever the ``_call`` of the derived class returns for the
        (possibly unwrapped) sequence.

        Raises
        ------
        ValueError
          If a dataset with more than one sample, or an input with more
          than one dimension is given.
        """
        if isinstance(seq, AttrDataset):
            if len(seq) > 1:
                raise ValueError(
                    "Feature selectors cannot handle multiple "
                    "sequences in a Dataset at once.  We got dataset %s "
                    "as input."
                    % (seq,))
            seq = seq.samples[0]
        elif hasattr(seq, 'shape'):
            shape = seq.shape
            if len(shape) > 1:
                raise ValueError(
                    "Feature selectors cannot handle multidimensional "
                    "inputs (such as ndarrays with more than a single "
                    "dimension).  We got %s with shape %s "
                    "as input." % (seq.__class__, shape))
        return self._call(seq)

    def _call(self, seq):
        """Implementations in derived classes have to return a list of
        selected element IDs based on the given sequence.
        """
        raise NotImplementedError

    mode = property(fget=lambda self: self.__mode, fset=_set_mode)
class BLR(Classifier):
    """Bayesian Linear Regression (BLR).

    """

    predicted_variances = ConditionalAttribute(
        enabled=False, doc="Variance per each predicted value")

    log_marginal_likelihood = ConditionalAttribute(
        enabled=False, doc="Log Marginal Likelihood")

    __tags__ = ['blr', 'regression', 'linear']

    def __init__(self, sigma_p=None, sigma_noise=1.0, **kwargs):
        """Initialize a BLR regression analysis.

        Parameters
        ----------
        sigma_p : None or scalar or 2-d array
          Covariance of the prior N(0, sigma_p) on the weights (including
          the intercept).  `None` makes training fall back to an identity
          matrix; a scalar is expanded into a scaled identity matrix at
          training time.
        sigma_noise : float
          the standard deviation of the gaussian noise.
          (Defaults to 1.0)
        """
        # init base class first
        Classifier.__init__(self, **kwargs)

        # pylint happiness
        self.w = None

        # It does not make sense to calculate a confusion matrix for a
        # BLR:
        self.ca.enable('training_confusion', False)

        # set the prior on w: N(0,sigma_p) , specifying the covariance
        # sigma_p on w:
        self.sigma_p = sigma_p

        # set noise level:
        self.sigma_noise = sigma_noise

        self.ca.predicted_variances = None
        self.ca.log_marginal_likelihood = None
        # Yarik: what was those about??? just for future in
        #        compute_log_marginal_likelihood ?
        # self.targets = None

    def __repr__(self):
        """String summary of the object
        """
        return """BLR(w=%s, sigma_p=%s, sigma_noise=%f, enable_ca=%s)""" % \
               (self.w, self.sigma_p, self.sigma_noise, str(self.ca.enabled))

    def compute_log_marginal_likelihood(self):
        """
        Compute log marginal likelihood using self.train_fv and self.targets.

        Not implemented yet: always raises NotImplementedError.
        """
        # log_marginal_likelihood = None
        # return log_marginal_likelihood
        raise NotImplementedError

    def _train(self, data):
        """Train regression using `data` (`Dataset`).

        Stores the training samples (extended by a constant column for
        the intercept) in ``samples_train``, the inverse of the posterior
        precision in ``A_inv`` and the weights in ``w``.  ``sigma_p`` is
        replaced by the matrix which was actually used.
        """
        # BLR relies on numerical labels
        train_labels = self._attrmap.to_numeric(
            data.sa[self.params.targets_attr].value)

        # one weight per feature plus one for the intercept
        n_weights = data.samples.shape[1] + 1

        # provide a basic (i.e. identity matrix) and correct prior
        # sigma_p, if not provided before or not compliant to 'data':
        sigma_p = self.sigma_p
        if sigma_p is None:
            # case: not provided.  Must be an identity test -- `== None`
            # on an ndarray is evaluated elementwise and cannot be used
            # as a condition
            sigma_p = np.eye(n_weights)
        elif not isinstance(sigma_p, np.ndarray):
            # case: sigma_p is a number -- convert into a matrix.  Has to
            # be checked before the dimensions, since a number has no
            # usable `.shape`
            sigma_p = np.eye(n_weights) * sigma_p
        elif sigma_p.shape[1] != n_weights:
            # case: wrong dimensions
            # NOTE(review): a 0-d or 1-d array still fails on `.shape[1]`
            sigma_p = np.eye(n_weights)
        # ...otherwise everything is OK :)
        self.sigma_p = sigma_p

        # add one fake column of '1.0' to model the intercept:
        self.samples_train = np.hstack(
            [data.samples, np.ones((data.samples.shape[0], 1))])

        self.A_inv = np.linalg.inv(
            1.0 / (self.sigma_noise ** 2) *
            np.dot(self.samples_train.T, self.samples_train) +
            np.linalg.inv(self.sigma_p))
        self.w = 1.0 / (self.sigma_noise ** 2) * np.dot(
            self.A_inv, np.dot(self.samples_train.T, train_labels))

    @accepts_dataset_as_samples
    def _predict(self, data):
        """
        Predict the output for the provided data.

        Returns the predictions, which are also stored in
        ``ca.estimates``.
        """
        # same constant column as appended to the training samples
        data = np.hstack([data,
                          np.ones((data.shape[0], 1), dtype=data.dtype)])
        predictions = np.dot(data, self.w)

        if self.ca.is_enabled('predicted_variances'):
            # do computation only if conditional attribute was enabled
            self.ca.predicted_variances = np.dot(
                data, np.dot(self.A_inv, data.T)).diagonal()[:, np.newaxis]
        self.ca.estimates = predictions
        return predictions

    def set_hyperparameters(self, *args):
        """
        Set hyperparameters' values.

        Note that this is a list so the order of the values is
        important: the first one is `sigma_noise`, any further ones
        become `sigma_p`.
        """
        args = args[0]
        self.sigma_noise = args[0]
        if len(args) > 1:
            self.sigma_p = np.array(args[1:])  # XXX check if this is ok
        return
class CrossValidatedTransferError(DatasetMeasure, Harvestable):
    """Classifier cross-validation.

    This class provides a simple interface to cross-validate a classifier
    on datasets generated by a splitter from a single source dataset.

    Arbitrary performance/error values can be computed by specifying an error
    function (used to compute an error value for each cross-validation fold)
    and a combiner function that aggregates all computed error values across
    cross-validation folds.
    """

    results = ConditionalAttribute(
        enabled=False,
        doc="""Store individual results in the state""")
    splits = ConditionalAttribute(
        enabled=False,
        doc="Store the actual splits of the data. "
            "Can be memory expensive")
    transerrors = ConditionalAttribute(
        enabled=False,
        doc="Store copies of transerrors at each step. "
            "If enabled - operates on clones of transerror, "
            "but for the last split original transerror is used")
    confusion = ConditionalAttribute(
        enabled=False,
        doc="""Store total confusion matrix (if available)""")
    training_confusion = ConditionalAttribute(
        enabled=False,
        doc="""Store total training confusion matrix (if available)""")
    samples_error = ConditionalAttribute(
        enabled=False,
        doc="Per sample errors.")

    def __init__(self,
                 transerror,
                 splitter=None,
                 expose_testdataset=False,
                 harvest_attribs=None,
                 copy_attribs='copy',
                 samples_idattr='origids',
                 **kwargs):
        """
        Parameters
        ----------
        transerror : TransferError instance
          Provides the classifier used for cross-validation.
        splitter : Splitter or None
          Used to split the dataset for cross-validation folds. By
          convention the first dataset in the tuple returned by the
          splitter is used to train the provided classifier. If the
          first element is 'None' no training is performed. The second
          dataset is used to generate predictions with the (trained)
          classifier. If `None` (default) an instance of
          :class:`~mvpa.datasets.splitters.NoneSplitter` is used.
        expose_testdataset : bool, optional
          In the proper pipeline, classifier must not know anything
          about testing data, but in some cases it might lead only
          to marginal harm, thus might be wanted to be enabled (provide
          testdataset for RFE to determine stopping point).
        harvest_attribs : list of str
          What attributes of call to store and return within
          harvested conditional attribute
        copy_attribs : None or str, optional
          Force copying values of attributes on harvesting
        samples_idattr : str, optional
          What samples attribute to use to identify and store samples_errors
          conditional attribute
        **kwargs
          All additional arguments are passed to the
          :class:`~mvpa.measures.base.DatasetMeasure` base class.
        """
        DatasetMeasure.__init__(self, **kwargs)
        Harvestable.__init__(self, harvest_attribs, copy_attribs)

        if splitter is None:
            self.__splitter = NoneSplitter()
        else:
            self.__splitter = splitter

        self.__transerror = transerror
        self.__expose_testdataset = expose_testdataset
        self.__samples_idattr = samples_idattr

    # TODO: put back in ASAP
    # def __repr__(self):
    #     """String summary over the object
    #     """
    #     return """CrossValidatedTransferError /
    #  splitter: %s
    #  classifier: %s
    #  errorfx: %s
    #  combiner: %s""" % (indent_doc(self.__splitter), indent_doc(self.__clf),
    #                       indent_doc(self.__errorfx), indent_doc(self.__combiner))

    def _call(self, dataset):
        """Perform cross-validation on a dataset.

        'dataset' is passed to the splitter instance and serves as the source
        dataset to generate split for the single cross-validation folds.

        Returns
        -------
        Dataset
          One result per split; the samples attribute 'cv_fold' describes
          which values of the splitter's `splitattr` went into each part.
        """
        # NOTE: `locals()` of this method is handed to `self._harvest`
        # below, so the names of the local variables are part of the
        # interface -- do not rename them.

        # store the results of the splitprocessor
        results = []
        self.ca.splits = []

        # local bindings
        ca = self.ca
        clf = self.__transerror.clf
        expose_testdataset = self.__expose_testdataset

        # what ca to enable in terr
        terr_enable = []
        for state_var in ['confusion', 'training_confusion', 'samples_error']:
            if ca.is_enabled(state_var):
                terr_enable += [state_var]

        # charge ca with initial values
        summaryClass = clf.__summary_class__
        clf_hastestdataset = hasattr(clf, 'testdataset')

        self.ca.confusion = summaryClass()
        self.ca.training_confusion = summaryClass()
        self.ca.transerrors = []
        if ca.is_enabled('samples_error'):
            dataset.init_origids('samples',
                                 attr=self.__samples_idattr, mode='existing')
            self.ca.samples_error = dict(
                [(id_, [])
                 for id_ in dataset.sa[self.__samples_idattr].value])

        # enable requested ca in child TransferError instance (restored
        # again below)
        if len(terr_enable):
            self.__transerror.ca.change_temporarily(
                enable_ca=terr_enable)

        # We better ensure that underlying classifier is not trained if we
        # are going to deepcopy transerror
        if ca.is_enabled("transerrors"):
            self.__transerror.untrain()

        # collect sum info about the split that where made for the resulting
        # dataset
        splitinfo = []

        # make sure the name is bound even if the splitter yields nothing,
        # since it is bound back to the instance after the loop
        transerror = self.__transerror

        # splitter
        for split in self.__splitter(dataset):
            splitinfo.append(
                "%s->%s"
                % (','.join([str(c)
                    for c in split[0].sa[self.__splitter.splitattr].unique]),
                   ','.join([str(c)
                    for c in split[1].sa[self.__splitter.splitattr].unique])))

            # only train classifier if splitter provides something in first
            # element of tuple -- the is the behavior of TransferError
            if ca.is_enabled("splits"):
                self.ca.splits.append(split)

            if ca.is_enabled("transerrors"):
                # copy first and then train, as some classifiers cannot be copied
                # when already trained, e.g. SWIG'ed stuff
                lastsplit = None
                for ds in split:
                    if ds is not None:
                        lastsplit = ds.a.lastsplit
                        break
                if lastsplit:
                    # only if we could deduce that it was last split
                    # use the 'mother' transerror
                    transerror = self.__transerror
                else:
                    # otherwise -- deep copy
                    transerror = deepcopy(self.__transerror)
            else:
                transerror = self.__transerror

            # assign testing dataset if given classifier can digest it
            if clf_hastestdataset and expose_testdataset:
                transerror.clf.testdataset = split[1]

            # run the beast
            result = transerror(split[1], split[0])

            # unbind the testdataset from the classifier
            if clf_hastestdataset and expose_testdataset:
                transerror.clf.testdataset = None

            # next line is important for 'self._harvest' call
            self._harvest(locals())

            # XXX Look below -- may be we should have not auto added .?
            #     then transerrors also could be deprecated
            if ca.is_enabled("transerrors"):
                self.ca.transerrors.append(transerror)

            # XXX: could be merged with next for loop using a utility class
            # that can add dict elements into a list
            if ca.is_enabled("samples_error"):
                for k, v in transerror.ca.samples_error.items():
                    self.ca.samples_error[k].append(v)

            # pull in child ca
            for state_var in ['confusion', 'training_confusion']:
                if ca.is_enabled(state_var):
                    ca[state_var].value.__iadd__(
                        transerror.ca[state_var].value)

            if __debug__:
                debug("CROSSC", "Split #%d: result %s"
                      % (len(results), repr(result)))
            results.append(result)

        # Since we could have operated with a copy -- bind the last used one back
        self.__transerror = transerror

        # put ca of child TransferError back into original config
        if len(terr_enable):
            self.__transerror.ca.reset_changed_temporarily()

        # store conditional attribute if it is enabled
        self.ca.results = results

        results = Dataset(results, sa={'cv_fold': splitinfo})
        return results

    splitter = property(fget=lambda self: self.__splitter,
                        doc="Access to the Splitter instance.")
    transerror = property(fget=lambda self: self.__transerror,
                          doc="Access to the TransferError instance.")
class SVM(_SVM):
    """Support Vector Machine Classifier.

    This is a simple interface to the libSVM package.
    """

    # Since this is internal feature of LibSVM, this conditional attribute is present
    # here
    probabilities = ConditionalAttribute(
        enabled=False,
        doc="Estimates of samples probabilities as provided by LibSVM")

    # TODO p is specific for SVR
    _KNOWN_PARAMS = ['epsilon', 'probability', 'shrinking',
                     'weight_label', 'weight']

    #_KNOWN_KERNEL_PARAMS = [ 'cache_size' ]

    _KNOWN_SENSITIVITIES = {'linear': LinearSVMWeights,
                            }
    _KNOWN_IMPLEMENTATIONS = {
        'C_SVC': (_svm.svmc.C_SVC, ('C',),
                  ('binary', 'multiclass'), 'C-SVM classification'),
        'NU_SVC': (_svm.svmc.NU_SVC, ('nu',),
                   ('binary', 'multiclass'), 'nu-SVM classification'),
        'ONE_CLASS': (_svm.svmc.ONE_CLASS, (),
                      ('oneclass',), 'one-class-SVM'),
        'EPSILON_SVR': (_svm.svmc.EPSILON_SVR, ('C', 'tube_epsilon'),
                        ('regression',), 'epsilon-SVM regression'),
        'NU_SVR': (_svm.svmc.NU_SVR, ('nu', 'tube_epsilon'),
                   ('regression',), 'nu-SVM regression')
        }

    __default_kernel_class__ = LinearLSKernel
    __tags__ = _SVM.__tags__ + ['libsvm']

    def __init__(self, **kwargs):
        """Interface class to LIBSVM classifiers and regressions.

        Default implementation (C/nu/epsilon SVM) is chosen depending
        on the given parameters (C/nu/tube_epsilon).
        """
        # XXX Determine which parameters depend on each other and implement
        # safety/simplifying logic around them
        # already done for: nr_weight
        # thought: weight and weight_label should be a dict

        svm_impl = kwargs.get('svm_impl', None)
        # Depending on given arguments, figure out desired SVM
        # implementation
        if svm_impl is None:
            for arg, impl in [('tube_epsilon', 'EPSILON_SVR'),
                              ('C', 'C_SVC'),
                              ('nu', 'NU_SVC')]:
                if arg in kwargs:
                    svm_impl = impl
                    if __debug__:
                        debug('SVM', 'No implementation was specified. Since '
                              '%s is given among arguments, assume %s' %
                              (arg, impl))
                    break
            if svm_impl is None:
                svm_impl = 'C_SVC'
                if __debug__:
                    debug('SVM', 'Assign C_SVC "by default"')
        kwargs['svm_impl'] = svm_impl

        # init base class
        _SVM.__init__(self, **kwargs)

        self._svm_type = self._KNOWN_IMPLEMENTATIONS[svm_impl][0]

        if 'nu' in self._KNOWN_PARAMS and 'epsilon' in self._KNOWN_PARAMS:
            # overwrite eps param with new default value (information
            # taken from libSVM docs
            self.params['epsilon']._set_default(0.001)

        # holds the trained SVM
        self.__model = None

    def _train(self, dataset):
        """Train SVM

        Raises
        ------
        ValueError
          If more than one C was obtained, but their number does not
          match the number of unique labels in the dataset.
        """
        targets_sa_name = self.params.targets_attr    # name of targets sa
        targets_sa = dataset.sa[targets_sa_name]      # actual targets sa

        # libsvm needs doubles
        src = _data2ls(dataset)

        # libsvm cannot handle literal labels
        labels = self._attrmap.to_numeric(targets_sa.value).tolist()

        svmprob = _svm.SVMProblem(labels, src)

        # Translate few params
        TRANSLATEDICT = {'epsilon': 'eps',
                         'tube_epsilon': 'p'}
        args = []
        # explicit lists, so that the concatenation does not depend on
        # items() returning a list
        for paramname, param in list(self.params.items()) \
                + list(self.kernel_params.items()):
            if paramname in TRANSLATEDICT:
                argname = TRANSLATEDICT[paramname]
            elif paramname in _svm.SVMParameter.default_parameters:
                argname = paramname
            else:
                if __debug__:
                    debug("SVM_", "Skipping parameter %s since it is not known "
                          "to libsvm" % paramname)
                continue
            args.append((argname, param.value))

        # ??? All those parameters should be fetched if present from
        # **kwargs and create appropriate parameters within .params or
        # .kernel_params
        # SVM parameters in libSVM compatible format
        libsvm_param = _svm.SVMParameter(
            kernel_type=self.params.kernel.as_raw_ls(),  # Just an integer ID
            svm_type=self._svm_type,
            **dict(args))

        # NOTE(review): `has_key` is left as is -- the API of the
        # parameter collection is not visible from here
        if self.params.has_key('C'):  #svm_type in [_svm.svmc.C_SVC]:
            Cs = self._get_cvec(dataset)
            if len(Cs) > 1:
                C0 = abs(Cs[0])
                scale = 1.0 / (C0)  #*np.sqrt(C0))
                # so we got 1 C per label
                uls = self._attrmap.to_numeric(targets_sa.unique)
                if len(Cs) != len(uls):
                    raise ValueError(
                        "SVM was parameterized with %d Cs but "
                        "there are %d labels in the dataset" %
                        (len(Cs), len(targets_sa.unique)))
                weight = [c * scale for c in Cs]
                # All 3 need to be set to take an effect
                libsvm_param._set_parameter('weight', weight)
                libsvm_param._set_parameter('nr_weight', len(weight))
                libsvm_param._set_parameter('weight_label', uls)
            libsvm_param._set_parameter('C', Cs[0])

        self.__model = _svm.SVMModel(svmprob, libsvm_param)

    @accepts_samples_as_dataset
    def _predict(self, data):
        """Predict values for the data

        Returns the list of predictions; `estimates` and `probabilities`
        conditional attributes are filled in if they are enabled.
        """
        # libsvm needs doubles
        src = _data2ls(data)
        ca = self.ca

        predictions = [self.model.predict(p) for p in src]

        if ca.is_enabled('estimates'):
            if self.__is_regression__:
                estimates = [self.model.predict_values_raw(p)[0] for p in src]
            else:
                # if 'trained_targets' are literal they have to be mapped
                if np.issubdtype(self.ca.trained_targets.dtype, 'c'):
                    trained_targets = self._attrmap.to_numeric(
                        self.ca.trained_targets)
                else:
                    trained_targets = self.ca.trained_targets
                nlabels = len(trained_targets)
                # XXX We do duplicate work. model.predict calls
                # predict_values_raw internally and then does voting or
                # thresholding. So if speed becomes a factor we might
                # want to move out logic from libsvm over here to base
                # predictions on obtained values, or adjust libsvm to
                # spit out values from predict() as well
                if nlabels == 2:
                    # Apparently libsvm reorders labels so we need to
                    # track (1,0) values instead of (0,1) thus just
                    # lets take negative reverse
                    estimates = [
                        self.model.predict_values(p)[(trained_targets[1],
                                                      trained_targets[0])]
                        for p in src]
                    if len(estimates) > 0:
                        if __debug__:
                            debug("SVM",
                                  "Forcing estimates to be ndarray and reshaping"
                                  " them into 1D vector")
                        estimates = np.asarray(estimates).reshape(
                            len(estimates))
                else:
                    # In multiclass we return dictionary for all pairs
                    # of labels, since libsvm does 1-vs-1 pairs
                    estimates = [self.model.predict_values(p) for p in src]
            ca.estimates = estimates

        if ca.is_enabled("probabilities"):
            # XXX Is this really necessary? yoh don't think so since
            # assignment to ca is doing the same
            #self.probabilities = [ self.model.predict_probability(p)
            #                       for p in src ]
            try:
                ca.probabilities = [self.model.predict_probability(p)
                                    for p in src]
            except TypeError:
                warning("Current SVM %s doesn't support probability " %
                        self + " estimation.")
        return predictions

    def summary(self):
        """Provide quick summary over the SVM classifier"""
        s = super(SVM, self).summary()
        if self.trained:
            s += '\n # of SVs: %d' % self.__model.get_total_n_sv()
            try:
                prm = _svm.svmc.svm_model_param_get(self.__model.model)
                C = _svm.svmc.svm_parameter_C_get(prm)
                # extract information of how many SVs sit inside the margin,
                # i.e. so called 'bounded SVs'
                inside_margin = np.sum(
                    # take 0.99 to avoid rounding issues
                    np.abs(self.__model.get_sv_coef())
                    >= 0.99 * _svm.svmc.svm_parameter_C_get(prm))
                s += ' #bounded SVs:%d' % inside_margin
                s += ' used C:%5g' % C
            except Exception:
                # those details are best effort only -- but do not swallow
                # KeyboardInterrupt/SystemExit along the way
                pass
        return s

    def untrain(self):
        """Untrain libsvm's SVM: forget the model
        """
        if __debug__ and "SVM" in debug.active:
            debug("SVM", "Untraining %s and destroying libsvm model" % self)
        super(SVM, self).untrain()
        del self.__model
        self.__model = None

    # access to the SVM model
    model = property(fget=lambda self: self.__model)
class FeatureSelectionPipeline(FeatureSelection):
    """Feature elimination through the list of FeatureSelection's.

    Given as list of FeatureSelections it applies them in turn.
    """

    nfeatures = ConditionalAttribute(
        doc="Number of features before each step in pipeline")
    # TODO: may be we should also append resultant number of features?

    def __init__(self, feature_selections, **kwargs):
        """Initialize feature selection pipeline

        Parameters
        ----------
        feature_selections : list of FeatureSelection
          selections which to use. Order matters
        """
        # base init first
        FeatureSelection.__init__(self, **kwargs)

        # selectors to use in turn
        self.__feature_selections = feature_selections

    def untrain(self):
        """Untrain every feature selection of the pipeline."""
        if __debug__:
            debug("FS_", "Untraining FS pipeline: %s" % self)
        for fs in self.__feature_selections:
            fs.untrain()

    def _call(self, dataset, testdataset=None, **kwargs):
        """Invocation of the feature selection

        Every feature selection is applied to the output of the previous
        one; `kwargs` are passed on to each of them.

        Returns
        -------
        tuple
          (dataset, testdataset) as left by the last feature selection.
        """
        wdataset = dataset
        wtestdataset = testdataset

        self.ca.selected_ids = None

        # number of features at each step (before running selection)
        self.ca.nfeatures = []

        for fs in self.__feature_selections:

            # enable selected_ids state if it was requested from this class
            fs.ca.change_temporarily(
                enable_ca=["selected_ids"], other=self)
            if self.ca.is_enabled("nfeatures"):
                self.ca.nfeatures.append(wdataset.nfeatures)

            if __debug__:
                debug('FSPL', 'Invoking %s on (%s, %s)' %
                      (fs, wdataset, wtestdataset))
            wdataset, wtestdataset = fs(wdataset, wtestdataset, **kwargs)

            if self.ca.is_enabled("selected_ids"):
                # identity test is required here: once ids of a previous
                # step are stored, `== None` would be evaluated
                # elementwise on the array and could not serve as a
                # condition
                if self.ca.selected_ids is None:
                    self.ca.selected_ids = fs.ca.selected_ids
                else:
                    # ids of this step index into the ids selected so far
                    self.ca.selected_ids = self.ca.selected_ids[
                        fs.ca.selected_ids]

            fs.ca.reset_changed_temporarily()

        return (wdataset, wtestdataset)

    feature_selections = property(fget=lambda self: self.__feature_selections,
                                  doc="List of `FeatureSelections`")
def _set_retrainable(self, value, force=False):
    """Assign value of retrainable parameter

    If retrainable flag is to be changed, classifier has to be
    untrained.  Also internal attributes such as _changedData,
    __changedData_isset, and __idhashes should be initialized if
    it becomes retrainable

    Parameters
    ----------
    value
      New value of the 'retrainable' parameter; only its truth value
      is used to decide which branch to take.
    force : bool
      If True, the assignment is carried out even if `value` equals
      the current value of the parameter.

    Nothing happens unless 'retrainable' is listed in ``__tags__``.
    """
    pretrainable = self.params['retrainable']
    if (force or value != pretrainable.value) \
           and 'retrainable' in self.__tags__:
        if __debug__:
            debug("CLF_", "Setting retrainable to %s" % value)
        if 'meta' in self.__tags__:
            warning("Retrainability is not yet crafted/tested for "
                    "meta classifiers. Unpredictable behavior might occur")
        # assure that we don't drag anything behind
        if self.trained:
            self.untrain()
        ca = self.ca
        # drop the bookkeeping attributes when switching retraining off
        if not value and ca.has_key('retrained'):
            ca.pop('retrained')
            ca.pop('repredicted')
        if value:
            # NOTE(review): this warning looks unreachable -- the
            # enclosing condition already requires 'retrainable' to be
            # in __tags__; confirm before removing
            if not 'retrainable' in self.__tags__:
                warning(
                    "Setting of flag retrainable for %s has no effect"
                    " since classifier has no such capability. It would"
                    " just lead to resources consumption and slowdown"
                    % self)
            ca['retrained'] = ConditionalAttribute(
                enabled=True,
                doc="Either retrainable classifier was retrained")
            ca['repredicted'] = ConditionalAttribute(
                enabled=True,
                doc="Either retrainable classifier was repredicted")

        pretrainable.value = value

        # if retrainable we need to keep track of things
        if value:
            self.__idhashes = {'traindata': None, 'targets': None,
                               'testdata': None} #, 'testtraindata': None}
            if __debug__ and 'CHECK_RETRAIN' in debug.active:
                # ??? it is not clear though if idhash is faster than
                # simple comparison of (dataset != __traineddataset).any(),
                # but if we like to get rid of __traineddataset then we
                # should use idhash anyways
                self.__trained = self.__idhashes.copy() # just same Nones
            self.__reset_changed_data()
            self.__invalidatedChangedData = {}
        elif 'retrainable' in self.__tags__:
            # retraining switched off: reset the tracking attributes
            #self.__reset_changed_data()
            self.__changedData_isset = False
            self._changedData = None
            self.__idhashes = None
            if __debug__ and 'CHECK_RETRAIN' in debug.active:
                self.__trained = None
class MixedClass(ClassWithCollections):
    # Fixture mixing both kinds of collectables: two parameters bounded
    # from below by 0 and a single conditional attribute.
    C = Parameter(1.0, doc="C parameter", min=0)
    D = Parameter(3.0, doc="D parameter", min=0)
    state1 = ConditionalAttribute(doc="bogus")
class TestClassProper(ClassWithCollections):
    # Fixture with one conditional attribute disabled and one enabled
    # by default.
    state1 = ConditionalAttribute(doc="state1 doc", enabled=False)
    state2 = ConditionalAttribute(doc="state2 doc", enabled=True)
class MCNullDist(NullDist):
    """Null-hypothesis distribution is estimated from randomly permuted data labels.

    The distribution is estimated by calling fit() with an appropriate
    `DatasetMeasure` or `TransferError` instance and a training and a
    validation dataset (in case of a `TransferError`). For a customizable
    amount of cycles the training data labels are permuted and the
    corresponding measure computed. In case of a `TransferError` this is the
    error when predicting the *correct* labels of the validation dataset.

    The distribution can be queried using the `cdf()` method, which can be
    configured to report probabilities/frequencies from `left` or `right` tail,
    i.e. fraction of the distribution that is lower or larger than some
    critical value.

    This class also supports `FeaturewiseDatasetMeasure`. In that case `cdf()`
    returns an array of featurewise probabilities/frequencies.
    """

    _DEV_DOC = """
    TODO automagically decide on the number of samples/permutations needed
    Caution should be paid though since resultant distributions might be
    quite far from some conventional ones (e.g. Normal) -- it is expected to
    them to be bimodal (or actually multimodal) in many scenarios.
    """

    dist_samples = ConditionalAttribute(
        enabled=False,
        doc='Samples obtained for each permutation')

    # XXX shouldn't we may be RF permute_attr into a Permutator class? ;)
    def __init__(self, dist_class=Nonparametric, permutations=100,
                 permute_attr='targets', chunks_attr=None,
                 permute_col='sa', assure_permute=False,
                 **kwargs):
        """Initialize Monte-Carlo Permutation Null-hypothesis testing

        Parameters
        ----------
        dist_class : class
          This can be any class which provides parameters estimate
          using `fit()` method to initialize the instance, and
          provides `cdf(x)` method for estimating value of x in CDF.
          All distributions from SciPy's 'stats' module can be used.
        permutations : int
          This many permutations of label will be performed to
          determine the distribution under the null hypothesis.
        permute_attr : str
          Name of the samples attribute to permute. ('targets' by default)
        chunks_attr : None or str
          If not None, permutes labels within the chunks, i.e. blocks of
          data having the same value of `chunks_attr`.
        permute_col : str, optional
          What collection `permute_attr` belongs to.
        assure_permute : bool
          Passed to func:`~mvpa.datasets.misc.permute_attr`.  If True,
          assures that targets are permuted, i.e. any one is different from
          the original one
        """
        NullDist.__init__(self, **kwargs)

        self._dist_class = dist_class
        self._dist = []                 # actual distributions

        # number of permutations to compute the estimate of the null
        # distribution
        self.__permutations = permutations

        self.permute_attr = permute_attr
        self.chunks_attr = chunks_attr
        self.assure_permute = assure_permute
        self.permute_col = permute_col

    def __repr__(self, prefixes=[]):
        """String summary listing the settings which differ from defaults.

        `prefixes` is only read (never modified), hence the shared
        default list is harmless here.
        """
        prefixes_ = ["permutations=%s" % self.__permutations]
        if self.permute_attr != 'targets':
            # use the name of the constructor argument, like all the
            # other entries do
            prefixes_ += ['permute_attr=%r' % self.permute_attr]
        if self.chunks_attr:
            prefixes_ += ['chunks_attr=%r' % self.chunks_attr]
        if self.permute_col != 'sa':
            prefixes_ += ['permute_col=%r' % self.permute_col]
        if self.assure_permute:
            prefixes_ += ['assure_permute=%r' % self.assure_permute]
        if self._dist_class != Nonparametric:
            prefixes_.insert(0, 'dist_class=%r' % (self._dist_class,))
        return super(MCNullDist, self).__repr__(
            prefixes=prefixes_ + prefixes)

    def fit(self, measure, wdata, vdata=None):
        """Fit the distribution by performing multiple cycles which repeatedly
        permuted labels in the training dataset.

        Parameters
        ----------
        measure: (`Featurewise`)`DatasetMeasure` or `TransferError`
          TransferError instance used to compute all errors.
        wdata: `Dataset` which gets permuted and used to compute the
          measure/transfer error multiple times.
        vdata: `Dataset` used for validation.
          If provided measure is assumed to be a `TransferError` and
          working and validation dataset are passed onto it.

        Permutations for which `measure` raises `LearnerError` are
        skipped with a warning and contribute no sample.
        """
        # TODO: place exceptions separately so we could avoid circular imports
        from mvpa.clfs.base import LearnerError

        # holds the values for randomized labels
        dist_samples = []

        # estimate null-distribution
        for p in xrange(self.__permutations):
            # new permutation all the time
            # but only permute the training data and keep the testdata constant
            #
            if __debug__:
                debug('STATMC', "Doing %i permutations: %i"
                      % (self.__permutations, p + 1), cr=True)

            # TODO this really needs to be more clever! If data samples are
            # shuffled within a class it really makes no difference for the
            # classifier, hence the number of permutations to estimate the
            # null-distribution of transfer errors can be reduced dramatically
            # when the *right* permutations (the ones that matter) are done.
            permuted_wdata = wdata.copy('shallow')
            permuted_wdata.permute_attr(attr=self.permute_attr,
                                        chunks_attr=self.chunks_attr,
                                        col=self.permute_col,
                                        assure_permute=self.assure_permute)

            # decide on the arguments to measure
            if not vdata is None:
                measure_args = [vdata, permuted_wdata]
            else:
                measure_args = [permuted_wdata]

            # compute and store the measure of this permutation
            # assume it has `TransferError` interface
            try:
                res = measure(*measure_args)
            except LearnerError as e:
                warning('Failed to obtain value from %s due to %s. Measurement'
                        ' was skipped, which could lead to unstable and/or'
                        ' incorrect assessment of the null_dist' % (measure, e))
                # really skip it: there is no (new) result to store, so
                # neither fail on an unbound `res` nor store the result
                # of the previous permutation once more
                continue
            res = np.asanyarray(res)
            dist_samples.append(res)

        if __debug__:
            debug('STATMC', '')

        # store samples
        self.ca.dist_samples = dist_samples = np.asarray(dist_samples)

        # fit distribution per each element

        # to decide either it was done on scalars or vectors
        shape = dist_samples.shape
        nshape = len(shape)
        # if just 1 dim, original data was scalar, just create an
        # artif dimension for it
        if nshape == 1:
            dist_samples = dist_samples[:, np.newaxis]

        # fit per each element.
        # XXX could be more elegant? may be use np.vectorize?
        dist_samples_rs = dist_samples.reshape((shape[0], -1))
        dist = []
        for samples in dist_samples_rs.T:
            params = self._dist_class.fit(samples)
            if __debug__ and 'STAT' in debug.active:
                debug('STAT', 'Estimated parameters for the %s are %s'
                      % (self._dist_class, str(params)))
            dist.append(self._dist_class(*params))
        self._dist = dist
# Joins the `S1__` and `S2` class hierarchies through multiple inheritance
# and declares one additional conditional attribute, `v12`, created with the
# default `ConditionalAttribute` arguments.
class S12(S1__, S2):
    v12 = ConditionalAttribute()
# Extends `S1_` with one more conditional attribute, `v1__`, which is
# declared disabled (enabled=False).
class S1__(S1_):
    v1__ = ConditionalAttribute(enabled=False)
# Minimal `ClassWithCollections` subclass declaring a single conditional
# attribute, `v2`, which is enabled by default.
class S2(ClassWithCollections):
    v2 = ConditionalAttribute(enabled=True, doc="values12 is ...")
# Minimal `ClassWithCollections` subclass declaring two conditional
# attributes: `v1` (enabled by default) and `v1XXX` (disabled by default).
class S1(ClassWithCollections):
    v1 = ConditionalAttribute(enabled=True, doc="values1 is ...")
    v1XXX = ConditionalAttribute(enabled=False, doc="values1 is ...")
class GeneralizedLinearKernel(NumpyKernel):
    """The linear kernel class.

    Computes ``k(x_A, x_B) = x_A^T Sigma_p x_B + sigma_0**2`` where
    `Sigma_p` may be a scalar, a vector (used as a diagonal) or a square
    matrix.  Optionally stores gradients of the kernel with respect to the
    parameters in the `gradients` / `gradientslog` conditional attributes.
    """

    sigma_0 = Parameter(1.0, doc="""
       A simple constant squared value which is broadcasted across
       kernel. In the case of GPR -- standard deviation of the Gaussian
       prior probability Normal(0, sigma_0**2) of the intercept of the
       linear regression.""")

    Sigma_p = Parameter(1.0, doc=r"""
       A generic scalar or vector, or diagonal matrix to scale all
       dimensions or associate different scaling to each dimensions
       while computing te kernel matrix:
       :math:`k(x_A,x_B) = x_A^\top \Sigma_p x_B + \sigma_0^2`.
       In the case of GPR -- a scalar or a diagonal of covariance matrix
       of the Gaussian prior probability Normal(0, Sigma_p) on the weights
       of the linear regression.""")

    gradients = ConditionalAttribute(
        enabled=False,
        doc="Dictionary of gradients per a parameter")

    gradientslog = ConditionalAttribute(
        enabled=False,
        doc="Dictionary of gradients per a parameter in logspace")

    def __init__(self, *args, **kwargs):
        """Initialize the kernel; all arguments go to `NumpyKernel`."""
        # for docstring holder
        NumpyKernel.__init__(self, *args, **kwargs)

    # XXX ??? would we reset correctly to the original value... model
    #     selection currently depends on this I believe
    def reset(self):
        """Reset the base kernel and restore `_Sigma_p` from `_Sigma_p_orig`.
        """
        super(GeneralizedLinearKernel, self).reset()
        # NOTE(review): `_Sigma_p_orig` is not assigned anywhere in this
        # class -- presumably provided elsewhere; confirm.
        self._Sigma_p = self._Sigma_p_orig

    def _compute(self, data1, data2):
        """Compute kernel matrix.

        Parameters
        ----------
        data1, data2 : 2D arrays
          Samples in rows; `data2` must have as many columns as `data1`
          (and as many as a non-scalar `Sigma_p` has entries).

        Raises
        ------
        ValueError
          If `Sigma_p` is neither a scalar, a vector of matching length,
          nor a square matrix of matching size.

        The result is stored in ``self._k``.  If enabled, `gradients` and
        `gradientslog` are stored as dictionaries keyed by parameter name.
        """
        # it is better to use separate lines of computation, to don't
        # incure computation cost without need (otherwise
        # np.dot(self.Sigma_p, data2.T) can take forever for relatively
        # large number of features)
        Sigma_p = self.params.Sigma_p          # local binding
        sigma_0 = self.params.sigma_0

        if np.isscalar(Sigma_p):
            # scalar -- scale second term appropriately (skip the product
            # completely for the neutral value)
            if Sigma_p == 1.0:
                data2_sc = data2.T
            else:
                data2_sc = Sigma_p * data2.T
        elif len(Sigma_p.shape) == 1 and \
                 Sigma_p.shape[0] == data2.shape[1]:
            # vector -- use it as diagonal matrix, i.e. scale each feature
            # by the given value (numpy broadcasting does that for us)
            data2_sc = (Sigma_p * data2).T
        elif len(Sigma_p.shape) == 2 and \
                 Sigma_p.shape[0] == Sigma_p.shape[1] == data2.shape[1]:
            # (diagonal or) full matrix -- full-featured and lengthy matrix
            # product
            data2_sc = np.dot(Sigma_p, data2.T)
        else:
            raise ValueError("Please provide Sigma_p as a scalar, vector, "
                             "or square (diagonal) matrix.")

        # XXX if Sigma_p is changed a warning should be issued!
        # XXX other cases of incorrect Sigma_p could be catched
        self._k = np.dot(data1, data2_sc) + sigma_0 ** 2

        # Compute gradients if any was requested
        do_g = self.ca.is_enabled('gradients')
        do_gl = self.ca.is_enabled('gradientslog')
        if not (do_g or do_gl):
            return

        if np.isscalar(Sigma_p):
            g_Sigma_p = np.dot(data1, data2.T)
            gl_Sigma_p = Sigma_p * g_Sigma_p
        else:
            nfeat = len(Sigma_p)
            gsize = (len(data1), len(data2), nfeat)
            if do_g:
                g_Sigma_p = np.empty(gsize)
            if do_gl:
                gl_Sigma_p = np.empty(gsize)
            for i in xrange(nfeat):
                outer = np.multiply.outer(data1[:, i], data2[:, i])
                if do_g:
                    g_Sigma_p[:, :, i] = outer
                if do_gl:
                    # fill the i-th slice; rebinding the name here would
                    # throw away the preallocated array and leave only
                    # the contribution of the last feature
                    gl_Sigma_p[:, :, i] = Sigma_p[i] * outer

        if do_g:
            self.ca.gradients = dict(
                sigma_0=2*sigma_0,
                Sigma_p=g_Sigma_p)
        if do_gl:
            self.ca.gradientslog = dict(
                sigma_0=2*sigma_0**2,
                Sigma_p=gl_Sigma_p)
class LinearSVMWeights(Sensitivity):
    """`Sensitivity` that reports the weights of a linear SVM trained
    on a given `Dataset`.

    The weights are returned as a `Dataset` with one row per (pairwise)
    SVM; the corresponding offsets are stored in the `biases` conditional
    attribute.
    """

    biases = ConditionalAttribute(enabled=True,
                                  doc="Offsets of separating hyperplanes")

    def __init__(self, clf, **kwargs):
        """Initialize the analyzer with the classifier it shall use.

        Parameters
        ----------
        clf : LinearSVM
          classifier to use. Only classifiers sub-classed from
          `LinearSVM` may be used.
        """
        # base class takes care of everything
        Sensitivity.__init__(self, clf, **kwargs)

    def __sg_helper(self, svm):
        """Return ``(weights, bias)`` for a single given SVM.

        Weights are the SV coefficients multiplied with the support
        vectors (taken from the classifier's training dataset) and
        averaged over the first axis.
        """
        offset = svm.get_bias()
        alphas = np.matrix(svm.get_alphas())
        sv_idx = svm.get_support_vectors()
        sv_samples = self.clf.traindataset.samples[sv_idx, :]
        return (alphas * sv_samples).mean(axis=0).A1, offset

    def _call(self, dataset):
        """Extract weights (and biases) from the trained shogun SVM.

        Returns a `Dataset` of weights; for classification its samples
        attribute named by ``clf.params.targets_attr`` carries the pair of
        labels each row discriminates.
        """
        # XXX Hm... it might make sense to unify access functions
        # naming across our swig libsvm wrapper and sg access
        # functions for svm
        clf = self.clf
        sgsvm = clf.svm
        pair_labels = None

        if not isinstance(sgsvm, shogun.Classifier.MultiClassSVM):
            # single machine: binary classification or regression
            single_sens, single_bias = self.__sg_helper(sgsvm)
            biases = np.atleast_1d(single_bias)
            sens = np.atleast_2d(single_sens)
            if not clf.__is_regression__:
                assert (set(clf._attrmap.values()) == set([-1.0, 1.0]))
                assert (sens.shape[0] == 1)
                pair_labels = [(-1.0, 1.0)]
        else:
            sens = []
            biases = []
            nsvms = sgsvm.get_num_svms()
            known = sorted(clf._attrmap.values())
            nknown = len(known)
            pair_labels = []
            svm_index = 0               # index for svm among known
            for first in xrange(nknown):
                for second in xrange(first + 1, nknown):
                    pair_svm = sgsvm.get_svm(svm_index)
                    pair = (known[first], known[second])
                    # Since we gave the labels in incremental order,
                    # we always should be right - but it does not
                    # hurt to check if set of labels is the same
                    if __debug__ and _shogun_exposes_slavesvm_labels:
                        if not pair_svm.get_labels():
                            # We need to call classify() so labels get
                            # assigned to the multiclass SVM
                            sgsvm.classify()
                        seen = set([pair_svm.get_label(int(sv))
                                    for sv in pair_svm.get_support_vectors()])
                        assert (seen == set(pair))
                    pair_sens, pair_bias = self.__sg_helper(pair_svm)
                    sens.append(pair_sens)
                    biases.append(pair_bias)
                    pair_labels.append(pair[::-1])   # ??? positive first
                    svm_index += 1
            assert (len(sens) == nsvms)     # we should have covered all

        ds = Dataset(np.atleast_2d(sens))
        if pair_labels is not None:
            if isinstance(pair_labels[0], tuple):
                # Need to have them in array of dtype object
                pair_labels = asobjarray(pair_labels)
            if len(clf._attrmap):
                pair_labels = clf._attrmap.to_literal(pair_labels,
                                                      recurse=True)
            ds.sa[clf.params.targets_attr] = pair_labels

        self.ca.biases = biases
        return ds
# Child of `TestClassProper` that adds one more conditional attribute,
# `state4`, declared disabled by default.
class TestClassProperChild(TestClassProper):
    state4 = ConditionalAttribute(enabled=False, doc="state4 doc")
class RFE(FeatureSelection):
    """Recursive feature elimination.

    A `FeaturewiseDatasetMeasure` is used to compute sensitivity maps given
    a certain dataset. These sensitivity maps are in turn used to discard
    unimportant features. For each feature selection the transfer error on
    some testdatset is computed. This procedure is repeated until a given
    `StoppingCriterion` is reached.

    References
    ----------
    Such strategy after
      Guyon, I., Weston, J., Barnhill, S., & Vapnik, V. (2002). Gene
      selection for cancer classification using support vector
      machines. Mach. Learn., 46(1-3), 389--422.
    was applied to SVM-based analysis of fMRI data in
      Hanson, S. J. & Halchenko, Y. O. (2008). Brain reading using
      full brain support vector machines for object recognition:
      there is no "face identification area". Neural Computation, 20,
      486--503.
    """

    errors = ConditionalAttribute(
        doc="History of errors through RFE")
    nfeatures = ConditionalAttribute(
        doc="History of # of features left")
    history = ConditionalAttribute(
        doc="Last step # when each feature was still present")
    sensitivities = ConditionalAttribute(
        enabled=False,
        doc="History of sensitivities (might consume too much memory")

    def __init__(self,
                 sensitivity_analyzer,
                 transfer_error,
                 feature_selector=FractionTailSelector(0.05),
                 bestdetector=BestDetector(),
                 stopping_criterion=NBackHistoryStopCrit(BestDetector()),
                 train_clf=None,
                 update_sensitivity=True,
                 **kargs
                 ):
        # XXX Allow for multiple stopping criterions, e.g. error not decreasing
        # anymore OR number of features less than threshold
        """Initialize recursive feature elimination

        Parameters
        ----------
        sensitivity_analyzer : FeaturewiseDatasetMeasure object
        transfer_error : TransferError object
          used to compute the transfer error of a classifier based on a
          certain feature set on the test dataset.
          NOTE: If sensitivity analyzer is based on the same
          classifier as transfer_error is using, make sure you
          initialize transfer_error with train=False, otherwise
          it would train classifier twice without any necessity.
        feature_selector : Functor
          Given a sensitivity map it has to return the ids of those
          features that should be kept.
        bestdetector : Functor
          Given a list of error values it has to return a boolean that
          signals whether the latest error value is the total minimum.
        stopping_criterion : Functor
          Given a list of error values it has to return whether the
          criterion is fulfilled.
        train_clf : bool
          Flag whether the classifier in `transfer_error` should be
          trained before computing the error. In general this is
          required, but if the `sensitivity_analyzer` and
          `transfer_error` share and make use of the same classifier it
          can be switched off to save CPU cycles. Default `None` checks
          if sensitivity_analyzer is based on a classifier and doesn't train
          if so.
        update_sensitivity : bool
          If False the sensitivity map is only computed once and reused
          for each iteration. Otherwise the senstitivities are
          recomputed at each selection step.
        **kargs
          Passed on to `FeatureSelection`.
        """
        # base init first
        FeatureSelection.__init__(self, **kargs)

        # Sensitivity analyzer used to call at each step.
        self.__sensitivity_analyzer = sensitivity_analyzer

        # Compute transfer error for each feature set.
        self.__transfer_error = transfer_error

        # Functor which takes care about removing some features.
        self.__feature_selector = feature_selector

        self.__stopping_criterion = stopping_criterion
        self.__bestdetector = bestdetector

        # Flag whether training classifier is required.
        if train_clf is None:
            # NOTE(review): the docstring says training is *skipped* when
            # the analyzer is classifier-based, whereas this enables
            # training exactly in that case.  Behavior is kept as it was;
            # confirm which of the two is intended.
            self.__train_clf = isinstance(sensitivity_analyzer,
                                          Sensitivity)
        else:
            self.__train_clf = train_clf

        # Flag whether sensitivity map is recomputed for each step.
        self.__update_sensitivity = update_sensitivity

        # force clf training when sensitivities are not updated as otherwise
        # shared classifiers are not retrained
        if not self.__update_sensitivity \
               and isinstance(self.__transfer_error, ClassifierError) \
               and not self.__train_clf:
            if __debug__:
                debug("RFEC", "Forcing training of classifier since " +
                      "sensitivities aren't updated at each step")
            self.__train_clf = True

    def _call(self, dataset, testdataset):
        """Proceed and select the features recursively eliminating less
        important ones.

        Parameters
        ----------
        dataset : Dataset
          used to compute sensitivity maps and train a classifier
          to determine the transfer error
        testdataset : Dataset
          used to test the trained classifer to determine the
          transfer error

        Returns
        -------
        tuple or None
          Two datasets with the feature subset of `dataset` that had the
          lowest transfer error of all tested sets until the stopping
          criterion was reached: first the feature subset of the training
          data, second the selection of the test dataset.  Stays None if
          the best-detector never flagged any step as the best.

        Raises
        ------
        ValueError
          If the sensitivity analyzer returns more than one sensitivity.
        """
        # Computed error for each tested features set.
        errors = []

        ca = self.ca
        # Number of features at each step. Since it is not used by the
        # algorithm it is stored directly in the conditional attribute
        ca.nfeatures = []

        # Store the last step # when the feature was still present
        ca.history = arange(dataset.nfeatures)

        ca.sensitivities = []

        # Flag when RFE should be stopped.
        stop = False

        # Will hold the best feature set ever.
        results = None

        # Operate on working dataset initially identical.
        wdataset = dataset

        # Same feature selection has to be performs on test dataset as well.
        # This will hold the current testdataset.
        wtestdataset = testdataset

        # Counter how many selection step where done.
        step = 0

        # List of feature Ids as per original dataset remaining at any given
        # step
        orig_feature_ids = arange(dataset.nfeatures)

        # Contains the latest sensitivity map.
        sensitivity = None

        # Resultant ids of selected features. Since the best is not
        # necessarily is the last - we better keep this one around. By
        # default -- all features are there
        result_selected_ids = orig_feature_ids
        selected_ids = result_selected_ids

        while wdataset.nfeatures > 0:

            if __debug__:
                debug('RFEC',
                      "Step %d: nfeatures=%d" % (step, wdataset.nfeatures))

            # mark the features which are present at this step
            # if it brings anyb mentionable computational burden in the future,
            # only mark on removed features at each step
            ca.history[orig_feature_ids] = step

            # Compute sensitivity map.  Identity test against None: `==`
            # on an array/dataset-like sensitivity could be evaluated
            # elementwise and give an ambiguous truth value.
            if self.__update_sensitivity or sensitivity is None:
                sensitivity = self.__sensitivity_analyzer(wdataset)
                if len(sensitivity) > 1:
                    raise ValueError(
                        "RFE cannot handle multiple sensitivities at once. "
                        "'%s' returned %i sensitivities."
                        % (self.__sensitivity_analyzer.__class__.__name__,
                           len(sensitivity)))

            if ca.is_enabled("sensitivities"):
                ca.sensitivities.append(sensitivity)

            # do not retrain clf if not necessary
            if self.__train_clf:
                error = self.__transfer_error(wtestdataset, wdataset)
            else:
                error = self.__transfer_error(wtestdataset, None)

            # Record the error
            errors.append(error)

            # Check if it is time to stop and if we got
            # the best result
            stop = self.__stopping_criterion(errors)
            isthebest = self.__bestdetector(errors)

            nfeatures = wdataset.nfeatures

            if ca.is_enabled("nfeatures"):
                ca.nfeatures.append(wdataset.nfeatures)

            # store result
            if isthebest:
                results = (wdataset, wtestdataset)
                result_selected_ids = orig_feature_ids

            if __debug__:
                debug('RFEC',
                      "Step %d: nfeatures=%d error=%.4f best/stop=%d/%d " %
                      (step, nfeatures, error, isthebest, stop))

            # stop if it is time to finish
            if nfeatures == 1 or stop:
                break

            # Select features to preserve
            selected_ids = self.__feature_selector(sensitivity)

            if __debug__:
                debug('RFEC_',
                      "Sensitivity: %s, nfeatures_selected=%d, selected_ids: %s" %
                      (sensitivity, len(selected_ids), selected_ids))

            # Create a dataset only with selected features
            wdataset = wdataset[:, selected_ids]

            # select corresponding sensitivity values if they are not
            # recomputed
            if not self.__update_sensitivity:
                sensitivity = sensitivity[selected_ids]

            # need to update the test dataset as well
            # XXX why should it ever become None?
            # yoh: because we can have __transfer_error computed
            #      using wdataset. See xia-generalization estimate
            #      in lightsvm. Or for god's sake leave-one-out
            #      on a wdataset
            # TODO: document these cases in this class
            if testdataset is not None:
                wtestdataset = wtestdataset[:, selected_ids]

            step += 1

            # WARNING: THIS MUST BE THE LAST THING TO DO ON selected_ids
            selected_ids.sort()
            if self.ca.is_enabled("history") \
                   or self.ca.is_enabled('selected_ids'):
                orig_feature_ids = orig_feature_ids[selected_ids]

            if hasattr(self.__transfer_error, "clf"):
                self.__transfer_error.clf.untrain()

        # charge conditional attributes
        self.ca.errors = errors
        self.ca.selected_ids = result_selected_ids

        # best dataset ever is returned
        return results
class LinearSVMWeights(Sensitivity):
    """`SensitivityAnalyzer` for the LIBSVM implementation of a linear SVM.

    Weights are obtained by multiplying the SV coefficients with the
    support vectors of the trained model.  The offsets (rho) are stored in
    the `biases` conditional attribute.
    """

    _ATTRIBUTE_COLLECTIONS = ['params']

    # XXX TODO: should become just as sa may be?
    biases = ConditionalAttribute(enabled=True,
                                  doc="Offsets of separating hyper-planes")

    split_weights = Parameter(
        False, allowedtype='bool',
        doc="If binary classification either to sum SVs per each "
            "class separately.  Note: be careful with interpretation"
            " of the values")

    def __init__(self, clf, **kwargs):
        """Initialize the analyzer with the classifier it shall use.

        Parameters
        ----------
        clf : LinearSVM
          classifier to use. Only classifiers sub-classed from
          `LinearSVM` may be used.
        """
        # init base classes first
        Sensitivity.__init__(self, clf, **kwargs)

    def _call(self, dataset, callables=[]):
        """Extract the weights of the trained libsvm model.

        Parameters
        ----------
        dataset : Dataset
          Only consulted when `split_weights` is enabled, to look up the
          ordering of the (literal) labels.
        callables : list
          Not used by this implementation.

        Returns
        -------
        Dataset
          2D weights; for classification the samples attribute named by
          ``clf.params.targets_attr`` carries the labels per row.

        Raises
        ------
        NotImplementedError
          If `split_weights` is requested for a non-binary task.
        """
        # local bindings
        clf = self.clf
        model = clf.model

        # Labels for sensitivities to be returned
        sens_labels = None

        if clf.__is_regression__:
            nr_class = None
            # shouldn't bother to provide "targets" for regressions
            svm_labels = None
        else:
            nr_class = model.nr_class
            svm_labels = model.labels

        # No need to warn since now we by default we do not do
        # anything evil and provide labels -- so it is up for a user
        # to decide either he wants to do something silly
        #if nr_class != 2:
        #    warning("You are estimating sensitivity for SVM %s trained on %d" %
        #            (str(clf), nr_class) +
        #            " classes. Make sure that it is what you intended to do" )

        svcoef = np.matrix(model.get_sv_coef())
        svs = np.matrix(model.get_sv())
        rhos = np.asarray(model.get_rho())

        self.ca.biases = rhos

        if self.params.split_weights:
            if nr_class != 2:
                raise NotImplementedError(
                    "Cannot compute per-class weights for"
                    " non-binary classification task")
            # libsvm might have different idea on the ordering
            # of labels, so we would need to map them back explicitely
            # labels in the dataset
            ds_labels = list(dataset.sa[clf.params.targets_attr].unique)
            senses = [None for i in ds_labels]
            # first label is given positive value
            for i, (c, l) in enumerate([(svcoef > 0, lambda x: x),
                                        (svcoef < 0, lambda x: x * -1)]):
                # convert to array, and just take the meaningful dimension
                c_ = c.A[0]
                # NOTE svm_labels are numerical; ds_labels are literal
                senses[ds_labels.index(
                    clf._attrmap.to_literal(svm_labels[i]))] = \
                    (l(svcoef[:, c_] * svs[c_, :])).A[0]
            weights = np.array(senses)
            sens_labels = svm_labels
        else:
            # XXX yoh: .mean() is effectively
            # averages across "sensitivities" of all paired classifiers (I
            # think). See more info on this topic in svm.py on how sv_coefs
            # are stored
            #
            # First multiply SV coefficients with the actual SVs to get
            # weighted impact of SVs on decision, then for each feature
            # take mean across SVs to get a single weight value
            # per feature
            if nr_class is None or nr_class <= 2:
                # as simple as this
                weights = (svcoef * svs).A
                # and only in case of classification
                if nr_class:
                    # ??? First label seems corresponds to positive
                    sens_labels = [tuple(svm_labels[::-1])]
            else:
                # we need to compose correctly per each pair of classifiers.
                # See docstring for get_sv_coef for more details on internal
                # structure of bloody storage

                # total # of pairs; floor division keeps it an integer
                # (the product is always even) so that it remains usable
                # as an array dimension under true division as well
                npairs = nr_class * (nr_class - 1) // 2
                # # of SVs in each class
                NSVs_perclass = model.get_n_sv()
                # indices where each class starts in each row of SVs
                # name is after similar variable in libsvm internals
                nz_start = np.cumsum([0] + NSVs_perclass[:-1])
                nz_end = nz_start + NSVs_perclass
                # reserve storage
                weights = np.zeros((npairs, svs.shape[1]))
                ipair = 0               # index of the pair
                # From libsvm internals:
                #   classifier (i,j): coefficients with
                #   i are in sv_coef[j-1][nz_start[i]...],
                #   j are in sv_coef[i][nz_start[j]...]
                sens_labels = []
                for i in xrange(nr_class):
                    for j in xrange(i + 1, nr_class):
                        weights[ipair, :] = np.asarray(
                            svcoef[j - 1, nz_start[i]:nz_end[i]]
                            * svs[nz_start[i]:nz_end[i]]
                            +
                            svcoef[i, nz_start[j]:nz_end[j]]
                            * svs[nz_start[j]:nz_end[j]])
                        # ??? First label corresponds to positive
                        # that is why [j], [i]
                        sens_labels += [(svm_labels[j], svm_labels[i])]
                        ipair += 1      # go to the next pair
                assert (ipair == npairs)

        if __debug__ and 'SVM' in debug.active:
            if nr_class:
                nsvs = model.get_n_sv()
            else:
                nsvs = model.get_total_n_sv()
            debug('SVM',
                  "Extracting weights for %s-class SVM: #SVs=%s, " % \
                  (nr_class, nsvs) + \
                  " SVcoefshape=%s SVs.shape=%s Rhos=%s." % \
                  (svcoef.shape, svs.shape, rhos) + \
                  " Result: min=%f max=%f" % (np.min(weights), np.max(weights)))

        ds_kwargs = {}
        if nr_class:                    # for classification only
            # and we should have prepared the labels
            assert (sens_labels is not None)

            if len(clf._attrmap):
                if isinstance(sens_labels[0], tuple):
                    sens_labels = asobjarray(sens_labels)
                sens_labels = clf._attrmap.to_literal(sens_labels,
                                                      recurse=True)

            # NOTE: `weights` is already and always 2D
            ds_kwargs = dict(sa={clf.params.targets_attr: sens_labels})

        weights_ds = Dataset(weights, **ds_kwargs)
        return weights_ds

    _customizeDocInherit = True
class CombinedFeatureSelection(FeatureSelection):
    """Meta feature selection utilizing several embedded selection methods.

    Each embedded feature selection method is computed individually.
    Afterwards all feature sets are combined by either taking the union or
    intersection of all sets.

    The individual feature sets of all embedded methods are optionally
    avialable from the `selections_ids` conditional attribute.
    """

    selections_ids = ConditionalAttribute(
        doc="List of feature id sets for each performed method.")

    def __init__(self, feature_selections, combiner, **kwargs):
        """
        Parameters
        ----------
        feature_selections : list
          FeatureSelection instances to run. Order is not important.
        combiner : 'union', 'intersection'
          which method to be used to combine the feature selection set of
          all computed methods.
        **kwargs
          Passed on to `FeatureSelection`.
        """
        FeatureSelection.__init__(self, **kwargs)

        self.__feature_selections = feature_selections
        self.__combiner = combiner

    def untrain(self):
        """Untrain every embedded feature selection."""
        if __debug__:
            debug("FS_", "Untraining combined FS: %s" % self)
        for fs in self.__feature_selections:
            fs.untrain()

    def _call(self, dataset, testdataset=None):
        """Really run it.

        Runs every embedded feature selection on `dataset` (and
        `testdataset`), combines the selected feature ids according to the
        combiner and applies the combined selection to the input datasets.

        Returns
        -------
        tuple
          ``(selected dataset, selected testdataset)``; the second item
          is None if no `testdataset` was given.

        Raises
        ------
        ValueError
          If the combiner is neither 'union' nor 'intersection' (detected
          when the second selection is about to be combined).
        """
        # to hold the union
        selected_ids = None
        # to hold the individuals
        self.ca.selections_ids = []

        for fs in self.__feature_selections:
            # we need the feature ids that were selection by each method,
            # so enable them temporarily
            fs.ca.change_temporarily(
                enable_ca=["selected_ids"], other=self)
            try:
                # compute feature selection, but ignore return datasets
                fs(dataset, testdataset)

                # retrieve feature ids and determined union of all
                # selections
                if selected_ids is None:
                    selected_ids = set(fs.ca.selected_ids)
                elif self.__combiner == 'union':
                    selected_ids.update(fs.ca.selected_ids)
                elif self.__combiner == 'intersection':
                    selected_ids.intersection_update(fs.ca.selected_ids)
                else:
                    raise ValueError("Unknown combiner '%s'"
                                     % self.__combiner)

                # store individual set in state
                self.ca.selections_ids.append(fs.ca.selected_ids)
            finally:
                # restore ca to previous settings, also if the embedded
                # selection failed
                fs.ca.reset_changed_temporarily()

        # finally apply feature set union selection to original datasets
        selected_ids = sorted(selected_ids)

        # take care of optional second dataset.  Use the freshly combined
        # ids: the `selected_ids` conditional attribute is only assigned
        # further below and would be unset (or stale from a previous call)
        # at this point.
        td_sel = None
        if testdataset is not None:
            td_sel = testdataset[:, selected_ids]

        # and main dataset
        d_sel = dataset[:, selected_ids]

        # finally store ids in state
        self.ca.selected_ids = selected_ids

        return (d_sel, td_sel)

    feature_selections = property(
        fget=lambda self: self.__feature_selections,
        doc="List of `FeatureSelections`")
    combiner = property(
        fget=lambda self: self.__combiner,
        doc="Selection set combination method.")
class Classifier(ClassWithCollections): """Abstract classifier class to be inherited by all classifiers """ # Kept separate from doc to don't pollute help(clf), especially if # we including help for the parent class _DEV__doc__ = """ Required behavior: For every classifier is has to be possible to be instantiated without having to specify the training pattern. Repeated calls to the train() method with different training data have to result in a valid classifier, trained for the particular dataset. It must be possible to specify all classifier parameters as keyword arguments to the constructor. Recommended behavior: Derived classifiers should provide access to *estimates* -- i.e. that information that is finally used to determine the predicted class label. Michael: Maybe it works well if each classifier provides a 'estimates' state member. This variable is a list as long as and in same order as Dataset.uniquetargets (training data). Each item in the list corresponds to the likelyhood of a sample to belong to the respective class. However the semantics might differ between classifiers, e.g. kNN would probably store distances to class- neighbors, where PLR would store the raw function value of the logistic function. So in the case of kNN low is predictive and for PLR high is predictive. Don't know if there is the need to unify that. As the storage and/or computation of this information might be demanding its collection should be switchable and off be default. Nomenclature * predictions : result of the last call to .predict() * estimates : might be different from predictions if a classifier's predict() makes a decision based on some internal value such as probability or a distance. """ # Dict that contains the parameters of a classifier. # This shall provide an interface to plug generic parameter optimizer # on all classifiers (e.g. grid- or line-search optimizer) # A dictionary is used because Michael thinks that access by name is nicer. 
# Additionally Michael thinks ATM that additional information might be # necessary in some situations (e.g. reasonably predefined parameter range, # minimal iteration stepsize, ...), therefore the value to each key should # also be a dict or we should use mvpa.misc.param.Parameter'... trained_targets = ConditionalAttribute( enabled=True, doc="Set of unique targets it has been trained on") trained_nsamples = ConditionalAttribute( enabled=True, doc="Number of samples it has been trained on") trained_dataset = ConditionalAttribute( enabled=False, doc="The dataset it has been trained on") training_confusion = ConditionalAttribute( enabled=False, doc="Confusion matrix of learning performance") predictions = ConditionalAttribute(enabled=True, doc="Most recent set of predictions") estimates = ConditionalAttribute( enabled=True, doc="Internal classifier estimates the most recent " + "predictions are based on") training_time = ConditionalAttribute( enabled=True, doc="Time (in seconds) which took classifier to train") predicting_time = ConditionalAttribute( enabled=True, doc="Time (in seconds) which took classifier to predict") feature_ids = ConditionalAttribute( enabled=False, doc="Feature IDS which were used for the actual training.") __tags__ = [] """Describes some specifics about the classifier -- is that it is doing regression for instance....""" targets_attr = Parameter( 'targets', allowedtype='bool', # ro=True, doc="""What samples attribute to use as targets.""", index=999) # TODO: make it available only for actually retrainable classifiers retrainable = Parameter( False, allowedtype='bool', doc="""Either to enable retraining for 'retrainable' classifier.""", index=1002) def __init__(self, **kwargs): ClassWithCollections.__init__(self, **kwargs) # XXX # the place to map literal to numerical labels (and back) # this needs to be in the base class, since some classifiers also # have this nasty 'regression' mode, and the code in this class # needs to deal with converting 
the regression output into discrete # labels # however, preferably the mapping should be kept in the respective # low-level implementations that need it self._attrmap = AttributeMap() self.__trainednfeatures = None """Stores number of features for which classifier was trained. If None -- it wasn't trained at all""" self._set_retrainable(self.params.retrainable, force=True) # deprecate #self.__trainedidhash = None #"""Stores id of the dataset on which it was trained to signal #in trained() if it was trained already on the same dataset""" @property def __summary_class__(self): if 'regression' in self.__tags__: return RegressionStatistics else: return ConfusionMatrix @property def __is_regression__(self): return 'regression' in self.__tags__ def __str__(self): if __debug__ and 'CLF_' in debug.active: return "%s / %s" % (repr(self), super(Classifier, self).__str__()) else: return repr(self) def __repr__(self, prefixes=[]): return super(Classifier, self).__repr__(prefixes=prefixes) def _pretrain(self, dataset): """Functionality prior to training """ # So we reset all conditional attributes and may be free up some memory # explicitly params = self.params if not params.retrainable: self.untrain() else: # just reset the ca, do not untrain self.ca.reset() if not self.__changedData_isset: self.__reset_changed_data() _changedData = self._changedData __idhashes = self.__idhashes __invalidatedChangedData = self.__invalidatedChangedData # if we don't know what was changed we need to figure # them out if __debug__: debug('CLF_', "IDHashes are %s" % (__idhashes)) # Look at the data if any was changed for key, data_ in (('traindata', dataset.samples), ('targets', dataset.sa[params.targets_attr].value)): _changedData[key] = self.__was_data_changed(key, data_) # if those idhashes were invalidated by retraining # we need to adjust _changedData accordingly if __invalidatedChangedData.get(key, False): if __debug__ and not _changedData[key]: debug( 'CLF_', 'Found that idhash for %s was ' 
'invalidated by retraining' % key) _changedData[key] = True # Look at the parameters for col in self._paramscols: changedParams = self._collections[col].which_set() if len(changedParams): _changedData[col] = changedParams self.__invalidatedChangedData = {} # reset it on training if __debug__: debug('CLF_', "Obtained _changedData is %s" % (self._changedData)) def _posttrain(self, dataset): """Functionality post training For instance -- computing confusion matrix. Parameters ---------- dataset : Dataset Data which was used for training """ ca = self.ca if ca.is_enabled('trained_targets'): ca.trained_targets = dataset.sa[self.params.targets_attr].unique ca.trained_dataset = dataset ca.trained_nsamples = dataset.nsamples # needs to be assigned first since below we use predict self.__trainednfeatures = dataset.nfeatures if __debug__ and 'CHECK_TRAINED' in debug.active: self.__trainedidhash = dataset.idhash if self.ca.is_enabled('training_confusion') and \ not self.ca.is_set('training_confusion'): # we should not store predictions for training data, # it is confusing imho (yoh) self.ca.change_temporarily(disable_ca=["predictions"]) if self.params.retrainable: # we would need to recheck if data is the same, # XXX think if there is a way to make this all # efficient. For now, probably, retrainable # classifiers have no chance but not to use # training_confusion... 
sad self.__changedData_isset = False predictions = self.predict(dataset) self.ca.reset_changed_temporarily() self.ca.training_confusion = self.__summary_class__( targets=dataset.sa[self.params.targets_attr].value, predictions=predictions) if self.ca.is_enabled('feature_ids'): self.ca.feature_ids = self._get_feature_ids() ##REF: Name was automagically refactored def _get_feature_ids(self): """Virtual method to return feature_ids used while training Is not intended to be called anywhere but from _posttrain, thus classifier is assumed to be trained at this point """ # By default all features are used return range(self.__trainednfeatures) def summary(self): """Providing summary over the classifier""" s = "Classifier %s" % self ca = self.ca ca_enabled = ca.enabled if self.trained: s += "\n trained" if ca.is_set('training_time'): s += ' in %.3g sec' % ca.training_time s += ' on data with' if ca.is_set('trained_targets'): s += ' targets:%s' % list(ca.trained_targets) nsamples, nchunks = None, None if ca.is_set('trained_nsamples'): nsamples = ca.trained_nsamples if ca.is_set('trained_dataset'): td = ca.trained_dataset nsamples, nchunks = td.nsamples, len(td.sa['chunks'].unique) if nsamples is not None: s += ' #samples:%d' % nsamples if nchunks is not None: s += ' #chunks:%d' % nchunks s += " #features:%d" % self.__trainednfeatures if ca.is_set('feature_ids'): s += ", used #features:%d" % len(ca.feature_ids) if ca.is_set('training_confusion'): s += ", training error:%.3g" % ca.training_confusion.error else: s += "\n not yet trained" if len(ca_enabled): s += "\n enabled ca:%s" % ', '.join( [str(ca[x]) for x in ca_enabled]) return s def clone(self): """Create full copy of the classifier. It might require classifier to be untrained first due to present SWIG bindings. 
TODO: think about proper re-implementation, without enrollment of deepcopy """ if __debug__: debug("CLF", "Cloning %s#%s" % (self, id(self))) try: return deepcopy(self) except: self.untrain() return deepcopy(self) def _train(self, dataset): """Function to be actually overridden in derived classes """ raise NotImplementedError def train(self, dataset): """Train classifier on a dataset Shouldn't be overridden in subclasses unless explicitly needed to do so """ if dataset.nfeatures == 0 or dataset.nsamples == 0: raise DegenerateInputError, \ "Cannot train classifier on degenerate data %s" % dataset if __debug__: debug("CLF", "Training classifier %(clf)s on dataset %(dataset)s", msgargs={ 'clf': self, 'dataset': dataset }) self._pretrain(dataset) # remember the time when started training t0 = time.time() if dataset.nfeatures > 0: result = self._train(dataset) else: warning("Trying to train on dataset with no features present") if __debug__: debug("CLF", "No features present for training, no actual training " \ "is called") result = None self.ca.training_time = time.time() - t0 self._posttrain(dataset) return result def _prepredict(self, dataset): """Functionality prior prediction """ if not ('notrain2predict' in self.__tags__): # check if classifier was trained if that is needed if not self.trained: raise ValueError, \ "Classifier %s wasn't yet trained, therefore can't " \ "predict" % self nfeatures = dataset.nfeatures #data.shape[1] # check if number of features is the same as in the data # it was trained on if nfeatures != self.__trainednfeatures: raise ValueError, \ "Classifier %s was trained on data with %d features, " % \ (self, self.__trainednfeatures) + \ "thus can't predict for %d features" % nfeatures if self.params.retrainable: if not self.__changedData_isset: self.__reset_changed_data() _changedData = self._changedData data = np.asanyarray(dataset.samples) _changedData['testdata'] = \ self.__was_data_changed('testdata', data) if __debug__: debug( 'CLF_', 
"prepredict: Obtained _changedData is %s" % (_changedData)) def _postpredict(self, dataset, result): """Functionality after prediction is computed """ self.ca.predictions = result if self.params.retrainable: self.__changedData_isset = False def _predict(self, dataset): """Actual prediction """ raise NotImplementedError @accepts_samples_as_dataset def predict(self, dataset): """Predict classifier on data Shouldn't be overridden in subclasses unless explicitly needed to do so. Also subclasses trying to call super class's predict should call _predict if within _predict instead of predict() since otherwise it would loop """ ## ??? yoh: changed to asany from as without exhaustive check data = np.asanyarray(dataset.samples) if __debug__: debug("CLF", "Predicting classifier %(clf)s on ds %(dataset)s", msgargs={ 'clf': self, 'dataset': dataset }) # remember the time when started computing predictions t0 = time.time() ca = self.ca # to assure that those are reset (could be set due to testing # post-training) ca.reset(['estimates', 'predictions']) self._prepredict(dataset) if self.__trainednfeatures > 0 \ or 'notrain2predict' in self.__tags__: result = self._predict(dataset) else: warning( "Trying to predict using classifier trained on no features") if __debug__: debug("CLF", "No features were present for training, prediction is " \ "bogus") result = [None] * data.shape[0] ca.predicting_time = time.time() - t0 # with labels mapping in-place, we also need to go back to the # literal labels if self._attrmap: try: result = self._attrmap.to_literal(result) except KeyError, e: raise FailedToPredictError, \ "Failed to convert predictions from numeric into " \ "literals: %s" % e self._postpredict(dataset, result) return result
class kNN(Classifier):
    """
    k-Nearest-Neighbour classifier.

    This is a simple classifier that bases its decision on the distances
    between the training dataset samples and the test sample(s). Distances
    are computed using a customizable distance function. A certain number
    (`k`) of nearest neighbors is selected based on the smallest distances
    and the labels of these neighboring samples are fed into a voting
    function to determine the labels of the test sample.

    Training a kNN classifier is extremely quick, as no actual training
    is performed: the training dataset is simply stored in the
    classifier. All computations are done during classifier prediction.

    Notes
    -----
    If enabled, kNN stores the votes per class in the 'estimates'
    conditional attribute after calling predict().
    """

    distances = ConditionalAttribute(
        enabled=False, doc="Distances computed for each sample")

    __tags__ = ['knn', 'non-linear', 'binary', 'multiclass',
                'notrain2predict']

    def __init__(self, k=2, dfx=squared_euclidean_distance,
                 voting='weighted', **kwargs):
        """
        Parameters
        ----------
        k : unsigned integer
          Number of nearest neighbours to be used for voting.
        dfx : functor
          Function to compute the distances between training and test
          samples.  Default: squared euclidean distance
        voting : str
          Voting method used to derive predictions from the nearest
          neighbors. Possible values are 'majority' (simple majority of
          classes determines vote) and 'weighted' (votes are weighted
          according to the relative frequencies of each class in the
          training data).
        **kwargs
          Additional arguments are passed to the base class.
        """
        # init base class first
        Classifier.__init__(self, **kwargs)

        self.__k = k
        self.__dfx = dfx
        self.__voting = voting
        self.__data = None


    def __repr__(self, prefixes=None):
        """Representation of the object
        """
        # None instead of a shared mutable default list
        if prefixes is None:
            prefixes = []
        return super(kNN, self).__repr__(
            ["k=%d" % self.__k,
             "dfx=%s" % self.__dfx,
             "voting=%s" % repr(self.__voting)]
            + prefixes)


    def __str__(self):
        return "%s\n data: %s" % \
            (Classifier.__str__(self), indent_doc(self.__data))


    def _train(self, data):
        """Train the classifier.

        For kNN it is degenerate -- just stores the data.
        """
        self.__data = data
        if __debug__:
            dtype_name = str(data.samples.dtype)
            if dtype_name.startswith('uint') \
                   or dtype_name.startswith('int'):
                warning("kNN: input data is in integers. "
                        "Overflow on arithmetic operations might result in"
                        " errors. Please convert dataset's samples into"
                        " floating datatype if any error is reported.")
        # class weights are computed lazily by get_weighted_vote()
        self.__weights = None

        # create dictionary with an item for each condition
        uniquelabels = data.sa[self.params.targets_attr].unique
        self.__votes_init = dict(zip(uniquelabels,
                                     [0] * len(uniquelabels)))


    @accepts_dataset_as_samples
    def _predict(self, data):
        """Predict the class labels for the provided data.

        Returns a list of class labels (one for each data sample).

        Raises `ValueError` for an unknown voting scheme and, in debug
        mode, for input that is not two-dimensional or whose number of
        features does not match the training data.
        """
        # make sure we're talking about arrays
        data = np.asarray(data)

        # checks only in debug mode
        if __debug__:
            if not data.ndim == 2:
                raise ValueError("Data array must be two-dimensional.")

            if not data.shape[1] == self.__data.nfeatures:
                raise ValueError("Length of data samples (features) does "
                                 "not match the classifier.")

        # compute the distance matrix between training and test data with
        # distances stored row-wise, ie. distances between test sample [0]
        # and all training samples will end up in row 0
        dists = self.__dfx(self.__data.samples, data).T
        if self.ca.is_enabled('distances'):
            # TODO: theoretically we should have used deepcopy for sa
            #       here
            self.ca.distances = Dataset(dists, fa=self.__data.sa.copy())

        # determine the k nearest neighbors per test sample
        knns = dists.argsort(axis=1)[:, :self.__k]

        if self.__voting == 'majority':
            vfx = self.get_majority_vote
        elif self.__voting == 'weighted':
            vfx = self.get_weighted_vote
        else:
            raise ValueError("kNN told to perform unknown voting '%s'."
                             % self.__voting)

        # perform voting: each result is a (label, votes) pair
        results = [vfx(knn) for knn in knns]

        # extract predictions
        predicted = [r[0] for r in results]

        # store the predictions in the state. Relies on State._setitem to do
        # nothing if the relevant state member is not enabled
        self.ca.predictions = predicted
        self.ca.estimates = np.array([r[1] for r in results])

        return predicted


    ##REF: Name was automagically refactored
    def get_majority_vote(self, knn_ids):
        """Simple voting by choosing the majority of class neighbors.

        Returns a tuple of the winning label and the list of votes per
        unique label.
        """
        # local bindings
        _data = self.__data
        targets_sa_name = self.params.targets_attr
        targets_sa = _data.sa[targets_sa_name]

        labels = targets_sa.value
        uniquelabels = targets_sa.unique

        # number of occurrences for each unique class in kNNs
        votes = self.__votes_init.copy()
        for nn in knn_ids:
            votes[labels[nn]] += 1

        # find the class with most votes
        # return votes as well to store them in the state
        if _dict_has_key:
            # approx 5% faster implementation than below
            maxvotes = max(votes.iteritems(), key=lambda x: x[1])[0]
        else:
            # no key keyword for max in elderly versions
            maxvotes = max([(v, k) for k, v in votes.iteritems()])[1]

        return maxvotes, \
               [votes[ul] for ul in uniquelabels] # transform into lists


    ##REF: Name was automagically refactored
    def get_weighted_vote(self, knn_ids):
        """Vote with classes weighted by the number of samples per class.

        Returns a tuple of the winning label and the list of weighted
        votes per unique label.
        """
        # local bindings
        _data = self.__data
        targets_sa_name = self.params.targets_attr
        targets_sa = _data.sa[targets_sa_name]

        uniquelabels = targets_sa.unique

        # Lazy evaluation
        if self.__weights is None:
            #
            # It seemed to Yarik that this has to be evaluated just once per
            # training dataset.
            #
            self.__labels = labels = targets_sa.value
            # float denominator: with two integers the division below would
            # truncate to 0 under classic division and every class weight
            # would end up being 1.0
            Nlabels = float(len(labels))

            # TODO: To get proper speed up for the next line only,
            #       histogram should be computed
            #       via sorting + counting "same" elements while reducing.
            #       Guaranteed complexity is NlogN whenever now it is N^2
            # compute the relative proportion of samples belonging to each
            # class
            weights = [1.0 - ((labels == label).sum() / Nlabels)
                       for label in uniquelabels]
            self.__weights = dict(zip(uniquelabels, weights))

        labels = self.__labels
        # number of occurrences for each unique class in kNNs
        votes = self.__votes_init.copy()
        for nn in knn_ids:
            votes[labels[nn]] += 1

        # weight votes
        votes = [self.__weights[ul] * votes[ul] for ul in uniquelabels]

        # find the class with most votes
        # return votes as well to store them in the state
        return uniquelabels[np.asarray(votes).argmax()], \
               votes


    def untrain(self):
        """Reset trained state"""
        self.__data = None
        # drop class weights cached for the previous training data
        self.__weights = None
        super(kNN, self).untrain()
class GPR(Classifier):
    """Gaussian Process Regression (GPR).

    """

    predicted_variances = ConditionalAttribute(
        enabled=False, doc="Variance per each predicted value")

    log_marginal_likelihood = ConditionalAttribute(
        enabled=False, doc="Log Marginal Likelihood")

    log_marginal_likelihood_gradient = ConditionalAttribute(
        enabled=False, doc="Log Marginal Likelihood Gradient")

    __tags__ = ['gpr', 'regression', 'retrainable']


    # NOTE XXX Parameters of the classifier. Values available as
    # clf.parameter or clf.params.parameter, or as
    # clf.params['parameter'] (as the full Parameter object)
    #
    # __doc__ and __repr__ for class is conveniently adjusted to
    # reflect values of those params

    # Kernel machines/classifiers should be refactored also to behave
    # the same and define kernel parameter appropriately... TODO, but SVMs
    # already kinda do it nicely ;-)

    sigma_noise = Parameter(
        0.001, allowedtype='float', min=1e-10,
        doc="the standard deviation of the gaussian noise.")

    # XXX For now I don't introduce kernel parameter since yet to unify
    # kernel machines
    #kernel = Parameter(None, allowedtype='Kernel',
    #    doc="Kernel object defining the covariance between instances. "
    #        "(Defaults to KernelSquaredExponential if None in arguments)")

    lm = Parameter(None, min=0.0, allowedtype='None or float',
                   doc="The regularization term lambda. "
                       "Increase this when the kernel matrix is not "
                       "positive definite. If None, some regularization "
                       "will be provided upon necessity")


    def __init__(self, kernel=None, **kwargs):
        """Initialize a GPR regression analysis.

        Parameters
        ----------
        kernel : Kernel
          a kernel object defining the covariance between instances.
          (Defaults to SquaredExponentialKernel if None in arguments)
        **kwargs
          Passed to the base class constructor.
        """
        # init base class first
        Classifier.__init__(self, **kwargs)

        # It does not make sense to calculate a confusion matrix for a GPR
        # XXX it does ;) it will be a RegressionStatistics actually ;-)
        # So if someone desires -- let him have it
        # self.ca.enable('training_confusion', False)

        # set kernel:
        if kernel is None:
            kernel = SquaredExponentialKernel()
            # guarded like every other debug() call in this class
            if __debug__:
                debug("GPR",
                      "No kernel was provided, falling back to default: %s"
                      % kernel)
        self.__kernel = kernel

        # append proper clf_internal depending on the kernel
        # TODO: add "__tags__" to kernels since the check
        #       below does not scale
        if isinstance(kernel, (GeneralizedLinearKernel, LinearKernel)):
            extra_tags = ['linear']
        else:
            extra_tags = ['non-linear']

        if externals.exists('openopt'):
            extra_tags.append('has_sensitivity')

        # Build a new per-instance list.  An in-place `+=` would extend the
        # class-level __tags__ list shared by all instances, leaking tags
        # from one instance into another.
        self.__tags__ = self.__tags__ + \
                        [tag for tag in extra_tags
                         if tag not in self.__tags__]

        # No need to initialize conditional attributes. Unless they got set
        # they would raise an exception self.predicted_variances =
        # None self.log_marginal_likelihood = None
        self._init_internals()


    def _init_internals(self):
        """Reset some internal variables to None.

        To be used in constructor and untrain()
        """
        self._train_fv = None
        self._labels = None
        self._km_train_train = None
        self._train_labels = None
        self._alpha = None
        self._L = None
        self._LL = None
        # XXX EO: useful for model selection but not working in general
        # self.__kernel.reset()


    def __repr__(self):
        """String summary of the object
        """
        return super(GPR, self).__repr__(
            prefixes=['kernel=%s' % self.__kernel])


    def compute_log_marginal_likelihood(self):
        """
        Compute log marginal likelihood using self.train_fv and self.targets.

        The value is stored in ``ca.log_marginal_likelihood`` and returned.
        Relies on ``_alpha``, ``_L`` and ``_km_train_train`` as set by
        `_train()`.
        """
        if __debug__:
            debug("GPR", "Computing log_marginal_likelihood")
        self.ca.log_marginal_likelihood = \
            -0.5*Ndot(self._train_labels, self._alpha) - \
            Nlog(self._L.diagonal()).sum() - \
            self._km_train_train.shape[0] * _halflog2pi
        return self.ca.log_marginal_likelihood


    def compute_gradient_log_marginal_likelihood(self):
        """Compute gradient of the log marginal likelihood. This
        version use a more compact formula provided by Williams and
        Rasmussen book.

        Returns the gradient as a flat array: the sigma_noise term first,
        followed by the terms provided by the kernel.
        """
        # XXX EO: check whether the precomputed self.alpha self.Kinv
        # are actually the ones corresponding to the hyperparameters
        # used to compute this gradient!
        # YYY EO: currently this is verified outside gpr.py but it is
        # not an efficient solution.
        # XXX EO: Do some memoizing since it could happen that some
        # hyperparameters are kept constant by user request, so we
        # don't need (sometimes) to recompute the corresponding
        # gradient again. COULD THIS BE TAKEN INTO ACCOUNT BY THE
        # NEW CACHED KERNEL INFRASTRUCTURE?

        # self.Kinv = np.linalg.inv(self._C)
        # Faster:
        Kinv = SLcho_solve(self._LL, np.eye(self._L.shape[0]))

        alphalphaT = np.dot(self._alpha[:, None], self._alpha[None, :])
        tmp = alphalphaT - Kinv
        # Pass tmp to __kernel and let it compute its gradient terms.
        # This scales up to huge number of hyperparameters:
        grad_LML_hypers = self.__kernel.compute_lml_gradient(
            tmp, self._train_fv)
        grad_K_sigma_n = 2.0 * self.params.sigma_noise * np.eye(tmp.shape[0])
        # Add the term related to sigma_noise:
        # grad_LML_sigma_n = 0.5 * np.trace(np.dot(tmp,grad_K_sigma_n))
        # Faster formula: tr(AB) = (A*B.T).sum()
        grad_LML_sigma_n = 0.5 * (tmp * (grad_K_sigma_n).T).sum()
        lml_gradient = np.hstack([grad_LML_sigma_n, grad_LML_hypers])
        self.log_marginal_likelihood_gradient = lml_gradient
        return lml_gradient


    def compute_gradient_log_marginal_likelihood_logscale(self):
        """Compute gradient of the log marginal likelihood when
        hyperparameters are in logscale. This version use a more
        compact formula provided by Williams and Rasmussen book.

        Returns the gradient as a flat array: the sigma_noise term first,
        followed by the terms provided by the kernel.
        """
        # Kinv = np.linalg.inv(self._C)
        # Faster:
        Kinv = SLcho_solve(self._LL, np.eye(self._L.shape[0]))
        alphalphaT = np.dot(self._alpha[:, None], self._alpha[None, :])
        tmp = alphalphaT - Kinv
        grad_LML_log_hypers = \
            self.__kernel.compute_lml_gradient_logscale(tmp, self._train_fv)
        grad_K_log_sigma_n = 2.0 * self.params.sigma_noise**2 * np.eye(
            Kinv.shape[0])
        # Add the term related to sigma_noise:
        # grad_LML_log_sigma_n = 0.5 * np.trace(np.dot(tmp, grad_K_log_sigma_n))
        # Faster formula: tr(AB) = (A * B.T).sum()
        grad_LML_log_sigma_n = 0.5 * (tmp * (grad_K_log_sigma_n).T).sum()
        lml_gradient = np.hstack([grad_LML_log_sigma_n, grad_LML_log_hypers])
        self.log_marginal_likelihood_gradient = lml_gradient
        return lml_gradient


    ##REF: Name was automagically refactored
    def get_sensitivity_analyzer(self, flavor='auto', **kwargs):
        """Returns a sensitivity analyzer for GPR.

        Parameters
        ----------
        flavor : str
          What sensitivity to provide. Valid values are
          'linear', 'model_select', 'auto'.
          In case of 'auto' selects 'linear' for linear kernel
          and 'model_select' for the rest. 'linear' corresponds to
          GPRLinearWeights and 'model_select' to GPRWeights
        **kwargs
          Passed to the sensitivity analyzer constructor.

        Raises
        ------
        ValueError
          If `flavor` is not recognized, or 'model_select' is requested
          while the 'has_sensitivity' tag is absent.
        """
        # XXX The following two lines does not work since
        # self.__kernel is instance of LinearKernel and not
        # just LinearKernel. How to fix?
        # YYY yoh is not sure what is the problem... LinearKernel is actually
        #     kernel.LinearKernel so everything should be ok
        if flavor == 'auto':
            if isinstance(self.__kernel,
                          (GeneralizedLinearKernel, LinearKernel)):
                flavor = 'linear'
            else:
                flavor = 'model_select'
            if __debug__:
                debug("GPR", "Returning '%s' sensitivity analyzer" % flavor)

        # Return proper sensitivity
        if flavor == 'linear':
            return GPRLinearWeights(self, **kwargs)
        elif flavor == 'model_select':
            # sanity check
            if not ('has_sensitivity' in self.__tags__):
                raise ValueError(
                    "model_select flavor is not available probably "
                    "due to not available 'openopt' module")
            return GPRWeights(self, **kwargs)
        else:
            raise ValueError("Flavor %s is not recognized" % flavor)


    def _train(self, data):
        """Train the classifier using `data` (`Dataset`).

        Computes the train/train kernel matrix, its regularized Cholesky
        factor and ``_alpha``.  For retrainable instances the kernel matrix
        and the factor are reused when `_changedData` reports no change.

        Raises
        ------
        SLAError
          If the Cholesky decomposition fails.
        """
        # local bindings for faster lookup
        params = self.params
        retrainable = params.retrainable
        if retrainable:
            newkernel = False
            newL = False
            _changedData = self._changedData

        self._train_fv = train_fv = data.samples
        # GRP relies on numerical labels
        # yoh: yeah -- GPR now is purely regression so no conversion
        #      is necessary
        train_labels = data.sa[params.targets_attr].value
        self._train_labels = train_labels

        if not retrainable or _changedData['traindata'] \
               or _changedData.get('kernel_params', False):
            if __debug__:
                debug("GPR", "Computing train train kernel matrix")
            self.__kernel.compute(train_fv)
            self._km_train_train = km_train_train = asarray(self.__kernel)
            newkernel = True
            if retrainable:
                self._km_train_test = None # reset to facilitate recomputation
        else:
            if __debug__:
                debug("GPR",
                      "Not recomputing kernel since retrainable and "
                      "nothing has changed")
            km_train_train = self._km_train_train # reuse

        if not retrainable or newkernel or _changedData['params']:
            if __debug__:
                debug("GPR", "Computing L. sigma_noise=%g" \
                      % params.sigma_noise)
            # XXX it seems that we do not need binding to object, but may be
            # commented out code would return?
            self._C = km_train_train + \
                  params.sigma_noise ** 2 * \
                  np.identity(km_train_train.shape[0], 'd')
            # The following decomposition could raise
            # np.linalg.linalg.LinAlgError because of numerical
            # reasons, due to the too rapid decay of 'self._C'
            # eigenvalues. In that case we try adding a small constant
            # to self._C, e.g. epsilon=1.0e-20. It should be a form of
            # Tikhonov regularization. This is equivalent to adding
            # little white gaussian noise to data.
            #
            # XXX EO: how to choose epsilon?
            #
            # Cholesky decomposition is provided by three different
            # NumPy/SciPy routines (fastest first):
            # 1) self._LL = scipy.linalg.cho_factor(self._C, lower=True)
            #    self._L = L = np.tril(self._LL[0])
            # 2) self._L = scipy.linalg.cholesky(self._C, lower=True)
            # 3) self._L = numpy.linalg.cholesky(self._C)
            # Even though 1 is the fastest we choose 2 since 1 does
            # not return a clean lower-triangular matrix (see docstring).

            # PBS: I just made it so the KernelMatrix is regularized
            # all the time.  I figured that if ever you were going to
            # use regularization, you would want to set it yourself
            # and use the same value for all folds of your data.
            # YOH: Ideally so, but in real "use cases" some might have no
            #      clue, also our unittests (actually clfs_examples) might
            #      fail without any good reason.  So lets return a magic with
            #      an option to forbid any regularization (if lm is None)
            try:
                # apply regularization
                lm, C = params.lm, self._C
                if lm is not None:
                    epsilon = lm * np.eye(C.shape[0])
                    self._L = SLcholesky(C + epsilon, lower=True)
                else:
                    # do 10 attempts to raise each time by 10
                    self._L = _SLcholesky_autoreg(C, nsteps=None, lower=True)
                self._LL = (self._L, True)
            except SLAError:
                raise SLAError("Kernel matrix is not positive, definite. "
                               "Try increasing the lm parameter.")
            newL = True
        else:
            if __debug__:
                debug("GPR",
                      "Not computing L since kernel, data and params "
                      "stayed the same")

        # XXX we leave _alpha being recomputed, although we could check
        #   if newL or _changedData['targets']
        #
        if __debug__:
            debug("GPR", "Computing alpha")
        # L = self._L                 # reuse
        # self._alpha = NLAsolve(L.transpose(),
        #                              NLAsolve(L, train_labels))
        # Faster:
        self._alpha = SLcho_solve(self._LL, train_labels)

        # compute only if the state is enabled
        if self.ca.is_enabled('log_marginal_likelihood'):
            self.compute_log_marginal_likelihood()

        if retrainable:
            # we must assign it only if it is retrainable
            self.ca.retrained = not newkernel or not newL

        if __debug__:
            debug("GPR", "Done training")


    @accepts_dataset_as_samples
    def _predict(self, data):
        """
        Predict the output for the provided data.

        Also stores the predictions in ``ca.estimates`` and, if that
        conditional attribute is enabled, fills ``ca.predicted_variances``.
        """
        retrainable = self.params.retrainable
        ca = self.ca

        if not retrainable or self._changedData['testdata'] \
               or self._km_train_test is None:
            if __debug__:
                debug('GPR', "Computing train test kernel matrix")
            self.__kernel.compute(self._train_fv, data)
            km_train_test = asarray(self.__kernel)
            if retrainable:
                self._km_train_test = km_train_test
                ca.repredicted = False
        else:
            if __debug__:
                debug('GPR', "Not recomputing train test kernel matrix")
            km_train_test = self._km_train_test
            ca.repredicted = True

        predictions = Ndot(km_train_test.transpose(), self._alpha)

        if ca.is_enabled('predicted_variances'):
            # do computation only if conditional attribute was enabled
            if not retrainable or self._km_test_test is None \
                   or self._changedData['testdata']:
                if __debug__:
                    debug('GPR', "Computing test test kernel matrix")
                self.__kernel.compute(data)
                km_test_test = asarray(self.__kernel)
                if retrainable:
                    self._km_test_test = km_test_test
            else:
                if __debug__:
                    debug('GPR', "Not recomputing test test kernel matrix")
                km_test_test = self._km_test_test

            if __debug__:
                debug("GPR", "Computing predicted variances")
            L = self._L
            # v = NLAsolve(L, km_train_test)
            # Faster:
            piv = np.arange(L.shape[0])
            v = SL.lu_solve((L.T, piv), km_train_test, trans=1)
            # self.predicted_variances = \
            #     Ndiag(km_test_test - Ndot(v.T, v)) \
            #     + self.sigma_noise**2
            # Faster formula: np.diag(Ndot(v.T, v)) = (v**2).sum(0):
            ca.predicted_variances = Ndiag(km_test_test) - (v ** 2).sum(0) \
                                     + self.params.sigma_noise ** 2

        if __debug__:
            debug("GPR", "Done predicting")
        ca.estimates = predictions
        return predictions


    ##REF: Name was automagically refactored
    def _set_retrainable(self, value, force=False):
        """Internal function : need to set _km_test_test
        """
        super(GPR, self)._set_retrainable(value, force)
        if force or (value and value != self.params.retrainable):
            self._km_test_test = None


    def untrain(self):
        """Reset the trained state, including the cached internals."""
        super(GPR, self).untrain()
        # XXX might need to take special care for retrainable. later
        self._init_internals()


    def set_hyperparameters(self, hyperparameter):
        """
        Set hyperparameters' values.

        Note that 'hyperparameter' is a sequence so the order of its
        values is important. First value must be sigma_noise, then
        other kernel's hyperparameters values follow in the exact
        order the kernel expect them to be.

        Raises
        ------
        InvalidHyperparameterError
          If the first value is below the minimum allowed for sigma_noise.
        """
        if hyperparameter[0] < self.params['sigma_noise'].min:
            raise InvalidHyperparameterError()
        self.params.sigma_noise = hyperparameter[0]
        # NOTE(review): `.size` presumes an array-like (e.g. numpy array)
        # rather than a plain list -- confirm against callers
        if hyperparameter.size > 1:
            self.__kernel.set_hyperparameters(hyperparameter[1:])
        return

    kernel = property(fget=lambda self: self.__kernel)
class GLM(FeaturewiseDatasetMeasure):
    """General linear model (GLM).

    Regressors are given as a design matrix, and a linear fit of the data
    is computed univariately (i.e. independently for each feature). The
    measure can report 'raw' parameter estimates (i.e. beta weights) of
    the linear model, as well as standardized parameters (z-stat) using an
    ordinary least squares (aka fixed-effects) approach to estimate the
    parameter estimate.

    The measure is reported in a (nfeatures x nregressors)-shaped array.
    """

    pe = ConditionalAttribute(
        enabled=False,
        doc="Parameter estimates (nfeatures x nparameters).")

    zstat = ConditionalAttribute(
        enabled=False,
        doc="Standardized parameter estimates (nfeatures x nparameters).")


    def __init__(self, design, voi='pe', **kwargs):
        """
        Parameters
        ----------
        design : array (nsamples x nregressors)
          GLM design matrix.
        voi : {'pe', 'zstat'}
          Variable of interest that should be reported as feature-wise
          measure. 'pe' are the parameter estimates and 'zstat' returns
          standardized parameter estimates.

        Raises
        ------
        ValueError
          If `voi` is neither 'pe' nor 'zstat'.
        """
        FeaturewiseDatasetMeasure.__init__(self, **kwargs)

        # keep the design as a matrix (no copying if already an array)
        self._design = np.asmatrix(design)

        # which quantity gets reported ('variable of interest')
        if voi not in ['pe', 'zstat']:
            raise ValueError("Unknown variable of interest '%s'" % str(voi))
        self._voi = voi

        # Moore-Penrose pseudo-inverse of the design matrix and the inverse
        # of its inner product (needed for the beta variance estimate);
        # both are computed lazily on the first call
        self._inv_design = None
        self._inv_ip = None


    def _call(self, dataset):
        """Fit the model to `dataset` and return the variable of interest.

        Returns a `Dataset` built from the transposed estimates (or
        z-stats), with a 'regressor' sample attribute enumerating its
        samples.
        """
        design = self._design

        # lazily precompute the transformation on first use
        if self._inv_design is None:
            inner_product_inv = (design.T * design).I
            self._inv_ip = inner_product_inv
            self._inv_design = inner_product_inv * design.T

        # parameter estimates for all features at once (betas x features)
        betas = self._inv_design * dataset.samples

        # charge state
        pe = betas.T.A
        self.ca.pe = pe

        # z-stats are only computed when they are reported or enabled
        need_zstat = self._voi != 'pe' or self.ca.is_enabled('zstat')
        if need_zstat:
            # compute residuals
            residuals = design * betas
            residuals -= dataset.samples

            # estimates of the parameter variance and compute zstats
            # assumption of mean(E) == 0 and equal variance
            # XXX next lines ignore off-diagonal elements and hence
            # covariance between regressors. The humble being writing these
            # lines asks the god of statistics for forgiveness, because it
            # knows not what it does
            diag_ip = np.diag(self._inv_ip)
            # (features x betas)
            per_feature_vars = [r.var() * diag_ip for r in residuals.T]
            beta_vars = np.array(per_feature_vars)
            # (parameter x feature)
            zstat = pe / np.sqrt(beta_vars)

            # charge state
            self.ca.zstat = zstat

        if self._voi == 'pe':
            # return as (beta x feature)
            result = Dataset(pe.T)
        elif self._voi == 'zstat':
            # return as (zstat x feature)
            result = Dataset(zstat.T)
        else:
            # we shall never get to this point
            raise ValueError(
                "Unknown variable of interest '%s'" % str(self._voi))

        result.sa['regressor'] = np.arange(len(result))
        return result
class SensitivityBasedFeatureSelection(FeatureSelection):
    """Feature elimination.

    A `FeaturewiseDatasetMeasure` computes a sensitivity map for a given
    dataset; a feature selector then decides, based on that map, which
    features are kept and which are discarded.
    """

    sensitivity = ConditionalAttribute(enabled=False)

    def __init__(self,
                 sensitivity_analyzer,
                 feature_selector=FractionTailSelector(0.05),
                 **kwargs):
        """Initialize feature selection

        Parameters
        ----------
        sensitivity_analyzer : FeaturewiseDatasetMeasure
          sensitivity analyzer to come up with sensitivity
        feature_selector : Functor
          Given a sensitivity map it has to return the ids of those
          features that should be kept.
        """
        # base init first
        FeatureSelection.__init__(self, **kwargs)

        # measure which produces the sensitivity map
        self.__sensitivity_analyzer = sensitivity_analyzer
        # functor which picks the features to keep
        self.__feature_selector = feature_selector


    def untrain(self):
        """Untrain the underlying sensitivity analyzer."""
        if __debug__:
            debug("FS_", "Untraining sensitivity-based FS: %s" % self)
        self.__sensitivity_analyzer.untrain()


    def _call(self, dataset, testdataset=None):
        """Select the most important features

        Parameters
        ----------
        dataset : Dataset
          used to compute sensitivity maps
        testdataset : Dataset
          optional dataset to select features on

        Returns a tuple of two new datasets with selected feature
        subset of `dataset`; the second item is None when no
        `testdataset` was given.
        """
        # compute the sensitivity map and expose it as a state
        sens_map = self.__sensitivity_analyzer(dataset)
        self.ca.sensitivity = sens_map

        # ids of the features to preserve
        keep_ids = self.__feature_selector(sens_map)

        if __debug__:
            debug("FS_",
                  "Sensitivity: %s Selected ids: %s" % (sens_map, keep_ids))

        # datasets restricted to the selected features
        selected_ds = dataset[:, keep_ids]
        if testdataset is None:
            selected_testds = None
        else:
            selected_testds = testdataset[:, keep_ids]

        results = (selected_ds, selected_testds)

        # WARNING: sorting is in-place, so it MUST stay the last thing done
        # with the ids -- i.e. after both datasets were sliced
        keep_ids.sort()
        self.ca.selected_ids = keep_ids

        # dataset with selected features is returned
        return results

    # make it accessible from outside
    sensitivity_analyzer = property(
        fget=lambda self: self.__sensitivity_analyzer,
        doc="Measure which was used to do selection")
class IFS(FeatureSelection):
    """Incremental feature search.

    A scalar `DatasetMeasure` is computed multiple times on variations of a
    certain dataset. These measures are in turn used to incrementally select
    important features. Starting with an empty feature set the dataset
    measure is first computed for each single feature. A number of features
    is selected based on the resulting data measure map (using an
    `ElementSelector`).

    Next the dataset measure is computed again using each feature in
    addition to the already selected feature set. Again the
    `ElementSelector` is used to select more features.

    For each feature selection the transfer error on some test dataset is
    computed. This procedure is repeated until a given `StoppingCriterion`
    is reached.
    """

    errors = ConditionalAttribute()

    def __init__(self,
                 data_measure,
                 transfer_error,
                 bestdetector=BestDetector(),
                 stopping_criterion=NBackHistoryStopCrit(BestDetector()),
                 feature_selector=FixedNElementTailSelector(1,
                                                            tail='upper',
                                                            mode='select'),
                 **kwargs):
        """Initialize incremental feature search

        Parameters
        ----------
        data_measure : DatasetMeasure
          Computed for each candidate feature selection. The measure has
          to compute a scalar value.
        transfer_error : TransferError
          Compute against a test dataset for each incremental feature
          set.
        bestdetector : Functor
          Given a list of error values it has to return a boolean that
          signals whether the latest error value is the total minimum.
        stopping_criterion : Functor
          Given a list of error values it has to return whether the
          criterion is fulfilled.
        feature_selector : Functor
          Given the list of measures computed for the current candidates
          it has to return the positions (within that list) of the
          candidates to be added to the selection.
        """
        # bases init first
        FeatureSelection.__init__(self, **kwargs)

        self.__data_measure = data_measure
        self.__transfer_error = transfer_error
        self.__feature_selector = feature_selector
        self.__bestdetector = bestdetector
        self.__stopping_criterion = stopping_criterion


    def _call(self, dataset, testdataset):
        """Proceed and select the features by incrementally adding the
        most promising ones.

        Parameters
        ----------
        dataset : Dataset
          used to select features and train classifiers to determine the
          transfer error.
        testdataset : Dataset
          used to test the trained classifier on a certain feature set
          to determine the transfer error.

        Returns
        -------
        A tuple with the dataset containing the feature subset of
        `dataset` that had the lowest transfer error of all tested
        sets until the stopping criterion was reached. The tuple also
        contains a dataset with the corresponding features from the
        `testdataset`.
        """
        # computed error for each tested feature set
        errors = []

        # feature candidates are all features in the pattern object;
        # a real list is needed since candidates get removed below
        candidates = list(range(dataset.nfeatures))

        # initially empty list of selected features
        selected = []

        # results in here please
        results = None

        # as long as there are candidates left
        # the loop will most likely get broken earlier if the stopping
        # criterion is reached
        while len(candidates):
            # measures for all candidates
            measures = []

            # for all possible candidates
            for i, candidate in enumerate(candidates):
                if __debug__:
                    debug('IFSC', "Tested %i" % i, cr=True)

                # take the new candidate and all already selected features
                # select a new temporary feature subset from the dataset
                # XXX assume MappedDataset and issue plain=True ??
                tmp_dataset = \
                    dataset[:, selected + [candidate]]

                # compute data measure on this feature set
                measures.append(self.__data_measure(tmp_dataset))

            measures = [np.asscalar(m) for m in measures]
            # Select promising feature candidates (staging)
            # IDs are only applicable to the current set of feature candidates
            tmp_staging_ids = self.__feature_selector(measures)

            # translate into real candidate ids
            staging_ids = [candidates[i] for i in tmp_staging_ids]

            # mark them as selected and remove from candidates
            selected += staging_ids
            for i in staging_ids:
                candidates.remove(i)

            # compute transfer error for the new set
            # XXX assume MappedDataset and issue plain=True ??
            error = self.__transfer_error(testdataset[:, selected],
                                          dataset[:, selected])
            errors.append(error)

            # Check if it is time to stop and if we got
            # the best result
            stop = self.__stopping_criterion(errors)
            isthebest = self.__bestdetector(errors)

            if __debug__:
                debug('IFSC',
                      "nselected %i; error: %.4f " \
                      "best/stop=%d/%d\n" \
                      % (len(selected), errors[-1], isthebest, stop),
                      cr=True, lf=True)

            if isthebest:
                # do copy to survive later selections
                results = copy(selected)

            # leave the loop when the criterion is reached
            if stop:
                break

        # charge state
        self.ca.errors = errors

        # No best set was ever recorded (e.g. there were no candidates at
        # all): fall back to an empty selection rather than indexing the
        # datasets with None
        if results is None:
            results = []

        # best dataset ever is returned
        return dataset[:, results], testdataset[:, results]