def clone(self):
    """Create a full copy of the classifier.

    It might require the classifier to be untrained first due to
    present SWIG bindings.

    TODO: think about proper re-implementation, without reliance on
    deepcopy.
    """
    try:
        return deepcopy(self)
    except:
        self.untrain()
        return deepcopy(self)
def clone(self):
    """Create a full copy of the classifier.

    It might require the classifier to be untrained first due to
    present SWIG bindings.

    TODO: think about proper re-implementation, without reliance on
    deepcopy.
    """
    if __debug__:
        debug("CLF", "Cloning %s#%s" % (self, id(self)))
    try:
        return deepcopy(self)
    except:
        self.untrain()
        return deepcopy(self)
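# A minimal sketch of the clone() fallback above, with hypothetical
# stand-in classes (not from the original source): if deepcopy() chokes
# on uncopyable internal state (as with SWIG-backed handles), untrain()
# drops that state so the copy can succeed.
from copy import deepcopy

class _FakeSwigHandle(object):
    def __deepcopy__(self, memo):
        raise TypeError("cannot deepcopy SWIG-backed handle")

class ToyClassifier(object):
    def __init__(self):
        self._model = None          # set by train(), may be uncopyable
    def train(self):
        self._model = _FakeSwigHandle()
    def untrain(self):
        self._model = None          # drop the uncopyable state
    def clone(self):
        try:
            return deepcopy(self)
        except:
            self.untrain()
            return deepcopy(self)

clf = ToyClassifier()
clf.train()
clone = clf.clone()                 # succeeds only after untrain()
assert clone._model is None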
def testFeatureSelection(self):
    """Testing feature selection: sorted/not sorted, feature groups"""
    origdata = datasets["uni2large"].samples[:10, :20]
    data = Dataset(samples=origdata, labels=2, chunks=2)

    # define some feature groups
    data.defineFeatureGroups(N.repeat(range(4), 5))

    unmasked = data.samples.copy()

    # default must be no mask
    self.failUnless(data.nfeatures == 20)

    features_to_select = [3, 0, 17]
    features_to_select_copy = copy.deepcopy(features_to_select)
    features_to_select_sorted = copy.deepcopy(features_to_select)
    features_to_select_sorted.sort()

    bsel = N.array([False] * 20)
    bsel[features_to_select] = True

    # check selection with feature list
    for sel, issorted in [
            (data.selectFeatures(features_to_select, sort=False), False),
            (data.selectFeatures(features_to_select, sort=True), True),
            (data.select(slice(None), features_to_select), True),
            (data.select(slice(None), N.array(features_to_select)), True),
            (data.select(slice(None), bsel), True)]:
        self.failUnless(sel.nfeatures == 3)

        # check size of the masked patterns
        self.failUnless(sel.samples.shape == (10, 3))

        # check that the right features are selected
        fts = (features_to_select, features_to_select_sorted)[int(issorted)]
        self.failUnless((unmasked[:, fts] == sel.samples).all())

        # check grouping information
        self.failUnless((sel._dsattr["featuregroups"] == [0, 0, 3]).all())

        # check side effect on features_to_select parameter:
        self.failUnless(features_to_select == features_to_select_copy)

    # check selection by feature group id
    gsel = data.selectFeatures(groups=[2, 3])
    self.failUnless(gsel.nfeatures == 10)
    self.failUnless(set(gsel._dsattr["featuregroups"]) == set([2, 3]))
def _forward_data(self, data):
    if self.__chunks_attr is not None:
        raise RuntimeError(
            "%s cannot do chunk-wise Z-scoring of plain data "
            "since it has to be parameterized with chunks_attr." % self)
    if self.__param_est is not None:
        raise RuntimeError("%s cannot do Z-scoring with estimating "
                           "parameters on some attributes of plain "
                           "data." % self)

    params = self.__params_dict
    if params is None:
        raise RuntimeError, \
              "ZScoreMapper needs to be trained before call to forward"

    # mappers should not modify the input data
    # cast the data to float, since in-place operations below do not upcast!
    if np.issubdtype(data.dtype, np.integer):
        if self._secret_inplace_zscore:
            raise TypeError(
                "Cannot perform inplace z-scoring since data is of integer "
                "type. Please convert to float before calling zscore")
        mdata = data.astype(self.__dtype)
    elif self._secret_inplace_zscore:
        mdata = data
    else:
        # do not call .copy() directly, since it might not be an array
        mdata = copy.deepcopy(data)

    self._zscore(mdata, *params['__all__'])
    return mdata
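# Why _forward_data() above refuses in-place z-scoring of integer data,
# as a minimal NumPy sketch (hypothetical variable names, not from the
# original source): in-place '-='/'/=' keep the integer dtype,
# truncating (or raising, depending on the NumPy version) instead of
# upcasting.
import numpy as np

data = np.arange(10)              # integer dtype
mdata = data.astype(float)        # upcast copy, leaving `data` intact
mdata -= mdata.mean()
mdata /= mdata.std()
assert abs(mdata.mean()) < 1e-12
assert abs(mdata.std() - 1.0) < 1e-12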
def selectFeatures(self, ids, plain=False, sort=False):
    """Select features given their ids.

    The method behaves similarly to Dataset.selectFeatures(), but
    additionally takes care of adjusting the embedded mapper
    appropriately.

    :Parameters:
      ids: sequence
        Iterable container of ids to select
      plain: boolean
        Flag whether to return MappedDataset (or just Dataset)
      sort: boolean
        Flag whether to sort ids. Order matters and selectFeatures
        assumes incremental order. If that is not the case, the
        non-optimized code in selectFeatures would verify the order
        and sort.
    """
    # call base method to get selected feature subset
    if plain:
        sdata = Dataset(self._data, self._dsattr, check_data=False,
                        copy_samples=False, copy_data=False,
                        copy_dsattr=False)
        return sdata.selectFeatures(ids=ids, sort=sort)
    else:
        sdata = Dataset.selectFeatures(self, ids=ids, sort=sort)
        # since we have a new Dataset we better have a new mapper
        sdata._dsattr['mapper'] = copy.deepcopy(sdata._dsattr['mapper'])
        if sort:
            sdata._dsattr['mapper'].selectOut(sorted(ids))
        else:
            sdata._dsattr['mapper'].selectOut(ids)
        return sdata
def test_deep_copying_state_variable(self):
    for v in (True, False):
        sv = ConditionalAttribute(enabled=v, doc="Testing")
        sv.enabled = not v
        sv_dc = copy.deepcopy(sv)
        self.failUnlessEqual(sv.enabled, sv_dc.enabled)
        self.failUnlessEqual(sv.name, sv_dc.name)
        self.failUnlessEqual(sv._instance_index, sv_dc._instance_index)
def setUp(self):
    self.backup = []
    # paranoid check
    self.cfgstr = str(cfg)
    # clean up externals cfg for proper testing
    if cfg.has_section('externals'):
        self.backup = copy.deepcopy(cfg.items('externals'))
    cfg.remove_section('externals')
def select_samples(self, selection):
    """Return new ColumnData with selected samples"""
    data = copy.deepcopy(self)
    for k, v in data.iteritems():
        data[k] = [v[x] for x in selection]
    data._check()
    return data
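# The select_samples() pattern above in isolation (hypothetical data,
# not from the original source): deepcopy first so the selection can
# never alias -- and later mutate -- the originating columns.
import copy

table = {'subject': ['s1', 's2', 's3'], 'score': [0.9, 0.4, 0.7]}
selection = [0, 2]
picked = copy.deepcopy(table)
for k, v in picked.items():
    picked[k] = [v[x] for x in selection]
assert picked == {'subject': ['s1', 's3'], 'score': [0.9, 0.7]}
assert table['subject'] == ['s1', 's2', 's3']   # original untouched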
def testMoreSVD(self):
    pm = SVDMapper()
    # train SVD
    pm.train(self.largefeat)

    # mixing matrix cannot be square
    self.failUnlessEqual(pm.proj.shape, (40, 10))

    # only first singular value significant
    self.failUnless(pm.sv[:1] > 10)
    self.failUnless((pm.sv[1:] < 10).all())

    # now project data into SVD space
    p = pm.forward(self.largefeat.samples)

    # only variance of first component significant
    var = p.var(axis=0)

    # test that only one component has variance
    self.failUnless(var[:1] > 1.0)
    self.failUnless((var[1:] < 0.0001).all())

    # check that the mapped data can be fully recovered by 'reverse()'
    rp = pm.reverse(p)
    self.failUnlessEqual(rp.shape, self.largefeat.samples.shape)
    self.failUnless((N.round(rp) == self.largefeat.samples).all())

    self.failUnlessEqual(pm.getInSize(), 40)
    self.failUnlessEqual(pm.getOutSize(), 10)

    # copy mapper
    pm2 = deepcopy(pm)

    # now remove all but the first 2 components from the mapper
    pm2.selectOut([0, 1])

    # sanity check
    self.failUnlessEqual(pm2.getInSize(), 40)
    self.failUnlessEqual(pm2.getOutSize(), 2)

    # but original mapper must be left intact
    self.failUnlessEqual(pm.getInSize(), 40)
    self.failUnlessEqual(pm.getOutSize(), 10)

    # data should still be fully recoverable by 'reverse()'
    rp2 = pm2.reverse(p[:, [0, 1]])
    self.failUnlessEqual(rp2.shape, self.largefeat.samples.shape)
    self.failUnless(N.abs(rp2 - self.largefeat.samples).sum() < 0.0001)

    # now make new random data and do forward->reverse check
    data = N.random.normal(size=(98, 40))
    data_f = pm.forward(data)
    self.failUnlessEqual(data_f.shape, (98, 10))
    data_r = pm.reverse(data_f)
    self.failUnlessEqual(data_r.shape, (98, 40))
def testCompareToZscore(self):
    """Test by comparing to results of elderly z-score function"""
    for ds in self.dss:
        ds1 = deepcopy(ds)
        ds2 = deepcopy(ds)

        zsm = ZScoreMapper()
        zsm.train(ds1)
        ds1z = zsm.forward(ds1.samples)

        zscore(ds2, perchunk=False)
        self.failUnless(N.linalg.norm(ds1z - ds2.samples) < 1e-12)
        self.failUnless((ds1.samples == ds.samples).all(),
                        msg="It seems we modified the original dataset!")

        ds0 = zsm.reverse(ds1z)
        self.failUnless(N.linalg.norm(ds0 - ds.samples) < 1e-12,
                        msg="Can't reconstruct from z-scores")
def test_mapper_vs_zscore():
    """Test by comparing to results of elderly z-score function"""
    # data: 40 sample feature line in 20d space (40x20; samples x features)
    dss = [
        dataset_wizard(np.concatenate(
            [np.arange(40) for i in range(20)]).reshape(20, -1).T,
            targets=1, chunks=1),
        ] + datasets.values()

    for ds in dss:
        ds1 = deepcopy(ds)
        ds2 = deepcopy(ds)

        zsm = ZScoreMapper(chunks_attr=None)
        assert_raises(RuntimeError, zsm.forward, ds1.samples)
        zsm.train(ds1)
        ds1z = zsm.forward(ds1.samples)

        zscore(ds2, chunks_attr=None)
        assert_array_almost_equal(ds1z, ds2.samples)
        assert_array_equal(ds1.samples, ds.samples)
def isSorted(items):
    """Check if listed items are in sorted order.

    :Parameters:
      `items`: iterable container

    :return: `True` if the items were sorted. Otherwise `False` + Warning
    """
    items_sorted = deepcopy(items)
    items_sorted.sort()
    equality = items_sorted == items
    # XXX yarik forgotten analog to isiterable
    if hasattr(equality, '__iter__'):
        equality = N.all(equality)
    return equality
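# Usage sketch for isSorted() above (assuming the deepcopy and
# numpy-as-N imports used by the function are in scope): the deepcopy
# is what keeps the caller's list in its original order -- sorting
# `items` directly would make the check a no-op and mutate the input.
items = [3, 1, 2]
assert not isSorted(items)
assert items == [3, 1, 2]        # input order preserved
assert isSorted([1, 2, 3])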
def testAutoOptimizePCA(self):
    # train PCA
    self.pm.train(self.largefeat)

    # mixing matrix cannot be square
    # self.failUnlessEqual(self.pm.mix.shape, (10, 40))

    # only first eigenvalue significant
    self.failUnless(self.pm.sv[:1] > 10)
    self.failUnless((self.pm.sv[1:] < 10).all())

    # now project data into PCA space
    p = self.pm.forward(self.largefeat.samples)

    # only variance of first component significant
    var = p.var(axis=0)

    # test that only one component has variance
    self.failUnless(var[:1] > 1.0)
    self.failUnless((var[1:] < 0.0001).all())

    # check that the mapped data can be fully recovered by 'reverse()'
    rp = self.pm.reverse(p)
    self.failUnlessEqual(rp.shape, self.largefeat.samples.shape)
    self.failUnless((N.round(rp) == self.largefeat.samples).all())

    self.failUnlessEqual(self.pm.getInSize(), 40)
    # self.failUnlessEqual(self.pm.getOutSize(), 10)
    self.failUnlessEqual(self.pm.getOutSize(), 40)

    # copy mapper
    pm2 = deepcopy(self.pm)

    # now remove all but the first 2 components from the mapper
    pm2.selectOut([0, 1])

    # sanity check
    self.failUnlessEqual(pm2.getInSize(), 40)
    self.failUnlessEqual(pm2.getOutSize(), 2)

    # but original mapper must be left intact
    self.failUnlessEqual(self.pm.getInSize(), 40)
    # self.failUnlessEqual(self.pm.getOutSize(), 10)
    self.failUnlessEqual(self.pm.getOutSize(), 40)

    # data should still be fully recoverable by 'reverse()'
    rp2 = pm2.reverse(p[:, [0, 1]])
    self.failUnlessEqual(rp2.shape, self.largefeat.samples.shape)
    self.failUnless((N.round(rp2) == self.largefeat.samples).all())
def test_id_hash(self, pair):
    a, b = pair
    a1 = deepcopy(a)
    a_1 = idhash(a)
    self.failUnless(a_1 == idhash(a), msg="Must be of the same idhash")
    self.failUnless(a_1 != idhash(b), msg="Must be of different idhash")
    if isinstance(a, np.ndarray):
        self.failUnless(a_1 != idhash(a.T),
                        msg=".T must be of different idhash")
    if not isinstance(a, tuple):
        self.failUnless(a_1 != idhash(a1),
                        msg="Must be of different idhash")
        a[2] += 1
        a_2 = idhash(a)
        self.failUnless(a_1 != a_2, msg="Idhash must change")
    else:
        a_2 = a_1
    a = a[2:]
    a_3 = idhash(a)
    self.failUnless(a_2 != a_3, msg="Idhash must change after slicing")
def is_sorted(items):
    """Check if listed items are in sorted order.

    Parameters
    ----------
    items : iterable container

    Returns
    -------
    `True` if the items were sorted. Otherwise `False` + Warning
    """
    items_sorted = deepcopy(items)
    items_sorted.sort()
    equality = items_sorted == items
    # XXX yarik forgotten analog to isiterable
    if hasattr(equality, '__iter__'):
        equality = np.all(equality)
    return equality
def __new__(cls, *args, **kwargs):
    """Instantiate ClassWithCollections object"""
    self = super(ClassWithCollections, cls).__new__(cls)

    s__dict__ = self.__dict__
    s__class__ = self.__class__

    # init variable
    # XXX: Added as pylint complained (rightfully) -- not sure if False
    #      is the proper default
    self.__params_set = False

    # need to check to avoid override of enabled ca in the case
    # of multiple inheritance, like both ClassWithCollections and
    # Harvestable
    if not s__dict__.has_key('_collections'):
        collections = copy.deepcopy(s__class__._collections_template)
        s__dict__['_collections'] = collections
        s__dict__['_known_attribs'] = {}
        """Dictionary to contain 'links' to the collections from each
        known attribute. Is used to gain some speed up in lookup within
        __getattribute__ and __setattr__
        """

        # Assign owner to all collections
        for col, collection in collections.iteritems():
            if col in s__dict__:
                raise ValueError, \
                      "Object %s already has attribute %s" % (self, col)
            s__dict__[col] = collection
            collection.name = col
            collection.owner = self

        self.__params_set = False

    if __debug__:
        descr = kwargs.get('descr', None)
        debug("COL", "ClassWithCollections.__new__ was done "
              "for %s#%s with descr=%s"
              % (s__class__.__name__, id(self), descr))

    return self
def asDescreteTime(self, dt, storeoffset=False):
    """Convert `onset` and `duration` information into discrete timepoints.

    :Parameters:
      dt: float
        Temporal distance between two timepoints in the same unit as
        `onset` and `duration`.
      storeoffset: bool
        If True, the temporal offset between original `onset` and
        discretized `onset` is stored as an additional item in
        `features`.

    :Return:
      A copy of the original `Event` with `onset` and optionally
      `duration` replaced by their corresponding discrete timepoint.
      The new onset will correspond to the timepoint just before or
      exactly at the original onset. The new duration will be the
      number of timepoints covering the event from the computed onset
      timepoint till the timepoint exactly at the end, or just after
      the event. Note again that the new values are expressed as
      #timepoints and not in their original unit!
    """
    dt = float(dt)
    onset = self['onset']
    out = deepcopy(self)

    # get the timepoint just prior to the onset
    out['onset'] = int(N.floor(onset / dt))

    if storeoffset:
        # compute offset
        offset = onset - (out['onset'] * dt)

        if out.has_key('features'):
            out['features'].append(offset)
        else:
            out['features'] = [offset]

    if out.has_key('duration'):
        # how many timepoints cover the event (from the computed onset
        # to the one timepoint just after the end of the event)
        out['duration'] = int(N.ceil((onset + out['duration']) / dt) \
                              - out['onset'])

    return out
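# Worked example of the discretization above, assuming a hypothetical
# Event with onset=3.0 and duration=6.0 (seconds) and dt=2.5 s per
# timepoint:
#   new onset    = floor(3.0 / 2.5)            = 1
#   new duration = ceil((3.0 + 6.0) / 2.5) - 1 = 4 - 1 = 3
# i.e. the event is covered by timepoints 1, 2 and 3, and with
# storeoffset=True the offset 3.0 - 1 * 2.5 = 0.5 is appended to
# `features`.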
def __new__(cls, *args, **kwargs):
    """Instantiate ClassWithCollections object"""
    self = super(ClassWithCollections, cls).__new__(cls)

    s__dict__ = self.__dict__
    s__class__ = self.__class__

    # init variable
    # XXX: Added as pylint complained (rightfully) -- not sure if False
    #      is the proper default
    self.__params_set = False

    # need to check to avoid override of enabled ca in the case
    # of multiple inheritance, like both ClassWithCollections and
    # Harvestable
    if not s__dict__.has_key('_collections'):
        collections = copy.deepcopy(s__class__._collections_template)
        s__dict__['_collections'] = collections
        s__dict__['_known_attribs'] = {}
        """Dictionary to contain 'links' to the collections from each
        known attribute. Is used to gain some speed up in lookup within
        __getattribute__ and __setattr__
        """

        # Assign owner to all collections
        for col, collection in collections.iteritems():
            if col in s__dict__:
                raise ValueError, \
                      "Object %s already has attribute %s" % (self, col)
            s__dict__[col] = collection
            collection.name = col

        self.__params_set = False

    if __debug__:
        descr = kwargs.get('descr', None)
        debug("COL", "ClassWithCollections.__new__ was done "
              "for %s#%s with descr=%s"
              % (s__class__.__name__, id(self), descr))

    return self
def _call(self, dataset):
    """Compute the sensitivity map.

    Returns a 1d array of sensitivities for all features in `dataset`.
    """
    # first cast to floating point dtype, because noise is most likely
    # floating point as well and '+=' on int would not do the right thing
    # XXX should we already deepcopy here to keep orig dtype?
    if not N.issubdtype(dataset.samples.dtype, N.float):
        dataset.setSamplesDType('float32')

    if __debug__:
        nfeatures = dataset.nfeatures

    sens_map = []

    # compute the datameasure on the original dataset
    # this is used as a baseline
    orig_measure = self.__datameasure(dataset)

    # do for every _single_ feature in the dataset
    for feature in xrange(dataset.nfeatures):
        if __debug__:
            debug('PSA', "Analyzing %i features: %i [%i%%]"
                  % (nfeatures, feature + 1,
                     float(feature + 1) / nfeatures * 100,), cr=True)

        # make a copy of the dataset to preserve data integrity
        wdata = deepcopy(dataset)

        # add noise to current feature
        wdata.samples[:, feature] += self.__noise(size=wdata.nsamples)

        # compute the datameasure on the perturbed dataset
        perturbed_measure = self.__datameasure(wdata)

        # difference from the original datameasure is the sensitivity
        sens_map.append(perturbed_measure - orig_measure)

    if __debug__:
        debug('PSA', '')

    return N.array(sens_map)
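# The perturbation analysis above, reduced to its core as a
# self-contained sketch (function and measure here are hypothetical,
# not from the original source): measure a baseline, add noise to one
# feature at a time on a deep copy, and record how far the measure
# moves.
import copy
import numpy as np

def perturbation_sensitivity(samples, measure, noise_sd=1.0, seed=0):
    rng = np.random.RandomState(seed)
    baseline = measure(samples)
    sens = []
    for feature in range(samples.shape[1]):
        wdata = copy.deepcopy(samples)       # keep the original intact
        wdata[:, feature] += rng.normal(scale=noise_sd,
                                        size=wdata.shape[0])
        sens.append(measure(wdata) - baseline)
    return np.array(sens)

samples = np.random.RandomState(1).normal(size=(30, 4))
sens_map = perturbation_sensitivity(samples, lambda s: s.var())
print(sens_map.shape)                        # (4,)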
def test_generic_tests(self):
    """Test all classifiers for conformant behavior"""
    for clf_, traindata in \
            [(clfswh['binary'], datasets['dumb2']),
             (clfswh['multiclass'], datasets['dumb'])]:
        traindata_copy = deepcopy(traindata)  # full copy of dataset
        for clf in clf_:
            clf.train(traindata)
            self.failUnless(
                (traindata.samples == traindata_copy.samples).all(),
                "Training of a classifier shouldn't change original dataset")

        # TODO: enforce uniform return from predict??
        #predicted = clf.predict(traindata.samples)
        #self.failUnless(isinstance(predicted, np.ndarray))

    # Just simple test that all of them are syntaxed correctly
    self.failUnless(str(clf) != "")
    self.failUnless(repr(clf) != "")
def add(self, targets, predictions, estimates=None):
    """Add new results to the set of known results"""
    if len(targets) != len(predictions):
        raise ValueError, \
              "Targets[%d] and predictions[%d]" \
              % (len(targets), len(predictions)) + \
              " have different number of samples"

    # extract value if necessary
    if isinstance(estimates, Collectable):
        estimates = estimates.value

    if estimates is not None and len(targets) != len(estimates):
        raise ValueError, \
              "Targets[%d] and estimates[%d]" \
              % (len(targets), len(estimates)) + \
              " have different number of samples"

    # enforce labels in predictions to be of the same datatype as in
    # targets, since otherwise we are getting doubles for labels unknown
    # at a given moment
    nonetype = type(None)
    for i in xrange(len(targets)):
        t1, t2 = type(targets[i]), type(predictions[i])
        # if there was no prediction made -- leave None, otherwise
        # convert to the appropriate type
        if t1 != t2 and t2 != nonetype:
            #warning("Obtained target %s and prediction %s are of " %
            #        (t1, t2) + "different datatypes.")
            if isinstance(predictions, tuple):
                predictions = list(predictions)
            predictions[i] = t1(predictions[i])

    if estimates is not None:
        # assure that we have a copy, or otherwise further in-place
        # modifications might screw things up (some classifiers share
        # estimates and spit out results)
        estimates = copy.deepcopy(estimates)

    self.__sets.append((targets, predictions, estimates))
    self._computed = False
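# The per-element type coercion loop above in isolation (hypothetical
# data, not from the original source): predictions get cast to the
# targets' type so that e.g. integer targets are never compared against
# float predictions of the same label, while missing predictions stay
# None.
targets = [0, 1, 1]
predictions = (0.0, 1.0, None)               # None = no prediction made
predictions = list(predictions)
for i in range(len(targets)):
    if type(targets[i]) != type(predictions[i]) \
       and predictions[i] is not None:
        predictions[i] = type(targets[i])(predictions[i])
assert predictions == [0, 1, None]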
def test_more_svd(self):
    pm = SVDMapper()
    # train SVD
    pm.train(self.largefeat)

    # mixing matrix cannot be square
    self.failUnlessEqual(pm.proj.shape, (40, 10))

    # only first singular value significant
    self.failUnless(pm.sv[:1] > 10)
    self.failUnless((pm.sv[1:] < 10).all())

    # now project data into SVD space
    p = pm.forward(self.largefeat)

    # only variance of first component significant
    var = p.var(axis=0)

    # test that only one component has variance
    self.failUnless(var[:1] > 1.0)
    self.failUnless((var[1:] < 0.0001).all())

    # check that the mapped data can be fully recovered by 'reverse()'
    rp = pm.reverse(p)
    self.failUnlessEqual(rp.shape, self.largefeat.shape)
    self.failUnless((np.round(rp) == self.largefeat).all())

    # copy mapper
    pm2 = deepcopy(pm)

    # now make new random data and do forward->reverse check
    data = np.random.normal(size=(98, 40))
    data_f = pm.forward(data)
    self.failUnlessEqual(data_f.shape, (98, 10))
    data_r = pm.reverse(data_f)
    self.failUnlessEqual(data_r.shape, (98, 40))
def test_proper_state(self):
    proper = TestClassProper()
    proper2 = TestClassProper(enable_ca=['state1'], disable_ca=['state2'])

    # disable_ca should override anything in enable_ca
    proper3 = TestClassProper(enable_ca=['all'], disable_ca='all')

    self.failUnlessEqual(
        len(proper3.ca.enabled), 0,
        msg="disable_ca should override anything in enable_ca")

    proper.ca.state2 = 1000
    value = proper.ca.state2
    self.failUnlessEqual(proper.ca.state2, 1000,
                         msg="Simple assignment/retrieval")

    proper.ca.disable('state2')
    proper.ca.state2 = 10000
    self.failUnlessEqual(proper.ca.state2, 1000,
                         msg="Simple assignment after being disabled")

    proper4 = copy.deepcopy(proper)

    proper.ca.reset('state2')
    self.failUnlessRaises(UnknownStateError,
                          proper.ca.__getattribute__, 'state2')
    """Must be blank after being reset"""

    self.failUnlessEqual(
        proper4.ca.state2, 1000,
        msg="Simple assignment after being reset in original instance")

    proper.ca.enable(['state2'])
    self.failUnlessEqual(set(proper.ca.keys()), set(['state1', 'state2']))
    if __debug__ and 'ENFORCE_CA_ENABLED' in debug.active:
        # skip testing since all ca are on now
        return
    self.failUnless(proper.ca.enabled == ['state2'])
    self.failUnless(set(proper2.ca.enabled) == set(['state1']))

    self.failUnlessRaises(AttributeError,
                          proper.__getattribute__, 'state12')

    # if documentation on the state is appropriate
    self.failUnlessEqual(proper2.ca.listing,
                         ['%sstate1+%s: state1 doc' % (_def_sep, _def_sep),
                          '%sstate2%s: state2 doc' % (_def_sep, _def_sep)])

    # if __str__ lists the correct number of ca
    str_ = str(proper2)
    self.failUnless(str_.find('2 ca:') != -1)

    # check if disable works
    self.failUnless(set(proper2.ca.enabled), set(['state1']))

    proper2.ca.disable("all")
    self.failUnlessEqual(set(proper2.ca.enabled), set())

    proper2.ca.enable("all")
    self.failUnlessEqual(len(proper2.ca.enabled), 2)

    proper2.ca.state1, proper2.ca.state2 = 1, 2
    self.failUnlessEqual(proper2.ca.state1, 1)
    self.failUnlessEqual(proper2.ca.state2, 2)

    # now reset them
    proper2.ca.reset('all')
    self.failUnlessRaises(UnknownStateError,
                          proper2.ca.__getattribute__, 'state1')
    self.failUnlessRaises(UnknownStateError,
                          proper2.ca.__getattribute__, 'state2')
def testCombinedMapper(self):
    # simple case: two arrays of different shape combined
    m = CombinedMapper([DenseArrayMapper(mask=N.ones((2, 3, 4))),
                        MaskMapper(mask=N.array((1, 1)))])

    self.failUnless(m.getInSize() == 26)
    self.failUnless(m.getOutSize() == 26)

    d1 = N.ones((5, 2, 3, 4))
    d2_broken = N.ones((6, 2)) + 1
    d2 = N.ones((5, 2)) + 1

    # should not work for sample mismatch
    self.failUnlessRaises(ValueError, m.forward, (d1, d2_broken))

    # check forward mapping (size and identity)
    mf = m.forward((d1, d2))
    self.failUnless(mf.shape == (5, 26))
    self.failUnless((mf[:, :24] == 1).all())
    self.failUnless((mf[:, -2:] == 2).all())

    # check reverse mapping
    self.failUnlessRaises(ValueError, m.reverse, N.arange(12))
    mr = m.reverse(N.arange(26) + 1)
    self.failUnless(len(mr) == 2)
    self.failUnless((mr[0] == N.arange(24).reshape((2, 3, 4)) + 1).all())
    self.failUnless((mr[1] == N.array((25, 26))).all())

    # check reverse mapping of multiple samples
    mr = m.reverse(N.array([N.arange(26) + 1 for i in range(4)]))
    self.failUnless(len(mr) == 2)
    self.failUnless(
        (mr[0] == N.array([N.arange(24).reshape((2, 3, 4)) + 1
                           for i in range(4)])).all())
    self.failUnless(
        (mr[1] == N.array([N.array((25, 26)) for i in range(4)])).all())

    # check dummy train
    m.train(Dataset(samples=N.random.rand(10, 26), labels=range(10)))
    self.failUnlessRaises(ValueError, m.train,
                          Dataset(samples=N.random.rand(10, 25),
                                  labels=range(10)))

    # check neighbor information
    # fail if invalid id
    self.failUnlessRaises(ValueError, m.getNeighbor, 26)
    # neighbors for last feature of first mapper, i.e.
    # close in out-space but infinite/undefined distance in in-space
    self.failUnless([n for n in m.getNeighbor(23, radius=2)]
                    == [6, 7, 10, 11, 15, 18, 19, 21, 22, 23])

    # check feature selection
    m.selectOut((23, 25))
    self.failUnless(m.getInSize() == 26)
    self.failUnless(m.getOutSize() == 2)

    # check reverse mapping of the truncated mapper
    mr = m.reverse(N.array((99, 88)))
    target1 = N.zeros((2, 3, 4))
    target1[1, 2, 3] = 99
    target2 = N.array((0, 88))
    self.failUnless(len(mr) == 2)
    self.failUnless((mr[0] == target1).all())
    self.failUnless((mr[1] == target2).all())

    # check forward mapping
    self.failUnless((m.forward((d1, d2))[0] == (1, 2)).all())

    # check copying
    mc = deepcopy(m)
    mc.selectOut([1])
    self.failUnless(m.getOutSize() == 2)
    self.failUnless(mc.getOutSize() == 1)
def test_proper_state(self):
    proper = TestClassProper()
    proper2 = TestClassProper(enable_ca=['state1'], disable_ca=['state2'])

    # disable_ca should override anything in enable_ca
    proper3 = TestClassProper(enable_ca=['all'], disable_ca='all')

    self.failUnlessEqual(
        len(proper3.ca.enabled), 0,
        msg="disable_ca should override anything in enable_ca")

    proper.ca.state2 = 1000
    value = proper.ca.state2
    self.failUnlessEqual(proper.ca.state2, 1000,
                         msg="Simple assignment/retrieval")

    proper.ca.disable('state2')
    proper.ca.state2 = 10000
    self.failUnlessEqual(proper.ca.state2, 1000,
                         msg="Simple assignment after being disabled")

    proper4 = copy.deepcopy(proper)

    proper.ca.reset('state2')
    self.failUnlessRaises(UnknownStateError,
                          proper.ca.__getattribute__, 'state2')
    """Must be blank after being reset"""

    self.failUnlessEqual(
        proper4.ca.state2, 1000,
        msg="Simple assignment after being reset in original instance")

    proper.ca.enable(['state2'])
    self.failUnlessEqual(Set(proper.ca.keys()), Set(['state1', 'state2']))
    if __debug__ and 'ENFORCE_CA_ENABLED' in debug.active:
        # skip testing since all ca are on now
        return
    self.failUnless(proper.ca.enabled == ['state2'])
    self.failUnless(Set(proper2.ca.enabled) == Set(['state1']))

    self.failUnlessRaises(AttributeError,
                          proper.__getattribute__, 'state12')

    # if documentation on the state is appropriate
    self.failUnlessEqual(proper2.ca.listing,
                         ['%sstate1+%s: state1 doc' % (_def_sep, _def_sep),
                          '%sstate2%s: state2 doc' % (_def_sep, _def_sep)])

    # if __str__ lists the correct number of ca
    str_ = str(proper2)
    self.failUnless(str_.find('2 ca:') != -1)

    # check if disable works
    self.failUnless(Set(proper2.ca.enabled), Set(['state1']))

    proper2.ca.disable("all")
    self.failUnlessEqual(Set(proper2.ca.enabled), Set())

    proper2.ca.enable("all")
    self.failUnlessEqual(len(proper2.ca.enabled), 2)

    proper2.ca.state1, proper2.ca.state2 = 1, 2
    self.failUnlessEqual(proper2.ca.state1, 1)
    self.failUnlessEqual(proper2.ca.state2, 2)

    # now reset them
    proper2.ca.reset('all')
    self.failUnlessRaises(UnknownStateError,
                          proper2.ca.__getattribute__, 'state1')
    self.failUnlessRaises(UnknownStateError,
                          proper2.ca.__getattribute__, 'state2')
def __init__(self, samples=None, events=None, mask=None, evconv=False,
             storeoffset=False, tr=None, enforce_dim=4, **kwargs):
    """
    :Parameters:
      mask: str | NiftiImage | ndarray
        Filename of a NIfTI image or a `NiftiImage` instance or an
        ndarray of appropriate shape.
      evconv: bool
        Convert event definitions using `onset` and `duration` in some
        temporal unit into #sample notation.
      storeoffset: bool
        Whether to store temporal offset information when converting
        Events into discrete time. Only considered when evconv == True.
      tr: float
        Temporal distance of two adjacent NIfTI volumes. This can be
        used to override the corresponding value in the NIfTI header.
      enforce_dim : int or None
        If not None, it is the dimensionality of the data to be
        enforced, commonly 4D for the data, and 3D for the mask in
        case of fMRI.
    """
    # check if we are in copy constructor mode
    if events is None:
        EventDataset.__init__(self, samples=samples, events=events,
                              mask=mask, **kwargs)
        return

    nifti = getNiftiFromAnySource(samples, ensure=True,
                                  enforce_dim=enforce_dim)
    # no copying
    samples = nifti.data

    # do not put the whole NiftiImage in the dict as this will most
    # likely be deepcopy'ed at some point and ensuring data integrity
    # of the complex Python-C-SWIG hybrid might be a tricky task.
    # Only storing the header dict should achieve the same and is more
    # memory efficient and even simpler
    dsattr = {'niftihdr': nifti.header}

    # determine TR, take from NIfTI header by default
    dt = nifti.rtime
    # override if necessary
    if not tr is None:
        dt = tr

    # NiftiDataset uses a DescreteMetric with cartesian
    # distance and element size from the NIfTI header
    # 'voxdim' is (x,y,z) while 'samples' are (t,z,y,x)
    elementsize = [dt] + [i for i in reversed(nifti.voxdim)]
    # XXX metric might be inappropriate if boxcar has length 1
    # might move metric setup after baseclass init and check what has
    # really happened
    metric = DescreteMetric(elementsize=elementsize,
                            distance_function=cartesianDistance)

    # convert EVs if necessary -- not altering the original
    if evconv:
        if dt == 0:
            raise ValueError, "'dt' cannot be zero when converting Events"
        events = [ev.asDescreteTime(dt, storeoffset) for ev in events]
    else:
        # do not touch the original
        events = deepcopy(events)

        # forcefully convert onset and duration into integers, as
        # expected by the baseclass
        for ev in events:
            oldonset = ev['onset']
            oldduration = ev['duration']
            ev['onset'] = int(ev['onset'])
            ev['duration'] = int(ev['duration'])
            if not oldonset == ev['onset'] \
               or not oldduration == ev['duration']:
                warning("Losing information during automatic integer "
                        "conversion of EVs. Consider an explicit "
                        "conversion by setting `evconv` in "
                        "ERNiftiDataset().")

    # pull mask array from NIfTI (if present)
    if mask is None:
        pass
    elif isinstance(mask, N.ndarray):
        # plain array can be passed on to base class
        pass
    else:
        mask_nim = getNiftiFromAnySource(mask)
        if not mask_nim is None:
            mask = getNiftiData(mask_nim)
        else:
            raise ValueError, "Cannot load mask from '%s'" % mask

    # finally init baseclass
    EventDataset.__init__(self, samples=samples, events=events,
                          mask=mask, dametric=metric, dsattr=dsattr,
                          **kwargs)
def _call(self, dataset):
    """Perform cross-validation on a dataset.

    'dataset' is passed to the splitter instance and serves as the
    source dataset to generate splits for the single cross-validation
    folds.
    """
    # store the results of the splitprocessor
    results = []
    self.ca.splits = []

    # local bindings
    ca = self.ca
    clf = self.__transerror.clf
    expose_testdataset = self.__expose_testdataset

    # what ca to enable in terr
    terr_enable = []
    for state_var in ['confusion', 'training_confusion', 'samples_error']:
        if ca.is_enabled(state_var):
            terr_enable += [state_var]

    # charge ca with initial values
    summaryClass = clf.__summary_class__
    clf_hastestdataset = hasattr(clf, 'testdataset')

    self.ca.confusion = summaryClass()
    self.ca.training_confusion = summaryClass()
    self.ca.transerrors = []
    if ca.is_enabled('samples_error'):
        dataset.init_origids('samples',
                             attr=self.__samples_idattr, mode='existing')
        self.ca.samples_error = dict(
            [(id_, []) for id_ in dataset.sa[self.__samples_idattr].value])

    # enable requested ca in child TransferError instance (restored
    # again below)
    if len(terr_enable):
        self.__transerror.ca.change_temporarily(
            enable_ca=terr_enable)

    # We better ensure that the underlying classifier is not trained
    # if we are going to deepcopy transerror
    if ca.is_enabled("transerrors"):
        self.__transerror.untrain()

    # collect summary info about the splits that were made for the
    # resulting dataset
    splitinfo = []

    # splitter
    for split in self.__splitter(dataset):
        splitinfo.append(
            "%s->%s"
            % (','.join([str(c) for c in
                         split[0].sa[self.__splitter.splitattr].unique]),
               ','.join([str(c) for c in
                         split[1].sa[self.__splitter.splitattr].unique])))

        # only train the classifier if the splitter provides something
        # in the first element of the tuple -- this is the behavior of
        # TransferError
        if ca.is_enabled("splits"):
            self.ca.splits.append(split)

        if ca.is_enabled("transerrors"):
            # copy first and then train, as some classifiers cannot be
            # copied when already trained, e.g. SWIG'ed stuff
            lastsplit = None
            for ds in split:
                if ds is not None:
                    lastsplit = ds.a.lastsplit
                    break
            if lastsplit:
                # only if we could deduce that it was the last split
                # use the 'mother' transerror
                transerror = self.__transerror
            else:
                # otherwise -- deep copy
                transerror = deepcopy(self.__transerror)
        else:
            transerror = self.__transerror

        # assign testing dataset if the given classifier can digest it
        if clf_hastestdataset and expose_testdataset:
            transerror.clf.testdataset = split[1]

        # run the beast
        result = transerror(split[1], split[0])

        # unbind the testdataset from the classifier
        if clf_hastestdataset and expose_testdataset:
            transerror.clf.testdataset = None

        # next line is important for the 'self._harvest' call
        self._harvest(locals())

        # XXX Look below -- may be we should have not auto added .?
        #     then transerrors also could be deprecated
        if ca.is_enabled("transerrors"):
            self.ca.transerrors.append(transerror)

        # XXX: could be merged with next for loop using a utility class
        # that can add dict elements into a list
        if ca.is_enabled("samples_error"):
            for k, v in \
                    transerror.ca.samples_error.iteritems():
                self.ca.samples_error[k].append(v)

        # pull in child ca
        for state_var in ['confusion', 'training_confusion']:
            if ca.is_enabled(state_var):
                ca[state_var].value.__iadd__(
                    transerror.ca[state_var].value)

        if __debug__:
            debug("CROSSC", "Split #%d: result %s"
                  % (len(results), `result`))
        results.append(result)

    # Since we could have operated with a copy -- bind the last used
    # one back
    self.__transerror = transerror

    # put ca of child TransferError back into original config
    if len(terr_enable):
        self.__transerror.ca.reset_changed_temporarily()

    self.ca.results = results
    """Store conditional attribute if it is enabled"""
    results = Dataset(results, sa={'cv_fold': splitinfo})
    return results
def test_multivariate(self):
    mv_perf = []
    mv_lin_perf = []
    uv_perf = []

    l_clf = clfswh['linear', 'svm'][0]
    nl_clf = clfswh['non-linear', 'svm'][0]
    #orig_keys = nl_clf.param._params.keys()
    #nl_param_orig = nl_clf.param._params.copy()
    # l_clf = LinearNuSVMC()

    # XXX ??? not sure what below meant and it is obsolete if
    # using SG... commenting out for now
    # for some reason order is not preserved thus dictionaries are not
    # the same any longer -- lets compare values
    #self.failUnlessEqual([nl_clf.param._params[k] for k in orig_keys],
    #                     [nl_param_orig[k] for k in orig_keys],
    #                     msg="New instance mustn't override values in previously created")
    ## and keys separately
    #self.failUnlessEqual(set(nl_clf.param._params.keys()),
    #                     set(orig_keys),
    #                     msg="New instance doesn't change set of parameters in original")

    # We must be able to deepcopy not yet trained SVMs now
    import mvpa.support.copy as copy
    try:
        nl_clf.untrain()
        nl_clf_copy = copy.deepcopy(nl_clf)
    except:
        self.fail(msg="Failed to deepcopy not-yet trained SVM %s" % nl_clf)

    for i in xrange(20):
        train = pure_multivariate_signal(20, 3)
        test = pure_multivariate_signal(20, 3)

        # use non-linear CLF on 2d data
        nl_clf.train(train)
        p_mv = nl_clf.predict(test.samples)
        mv_perf.append(np.mean(p_mv == test.targets))

        # use linear CLF on 2d data
        l_clf.train(train)
        p_lin_mv = l_clf.predict(test.samples)
        mv_lin_perf.append(np.mean(p_lin_mv == test.targets))

        # use non-linear CLF on 1d data
        nl_clf.train(train[:, 0])
        p_uv = nl_clf.predict(test[:, 0].samples)
        uv_perf.append(np.mean(p_uv == test.targets))

    mean_mv_perf = np.mean(mv_perf)
    mean_mv_lin_perf = np.mean(mv_lin_perf)
    mean_uv_perf = np.mean(uv_perf)

    # non-linear CLF has to be close to perfect
    self.failUnless(mean_mv_perf > 0.9)
    # linear CLF cannot learn this problem!
    self.failUnless(mean_mv_perf > mean_mv_lin_perf)
    # univariate has insufficient information
    self.failUnless(mean_uv_perf < mean_mv_perf)
def __call__(self, datasets):
    """Estimate mappers for each dataset.

    Parameters
    ----------
    datasets : list or tuple of datasets

    Returns
    -------
    A list of trained Mappers of the same length as datasets
    """
    params = self.params            # for quicker access ;)
    ca = self.ca
    ndatasets = len(datasets)
    nfeatures = [ds.nfeatures for ds in datasets]

    residuals = None
    if ca['residual_errors'].enabled:
        residuals = np.zeros((2 + params.level2_niter, ndatasets))
        ca.residual_errors = Dataset(
            samples=residuals,
            sa={'levels':
                ['1'] +
                ['2:%i' % i for i in xrange(params.level2_niter)] +
                ['3']})

    if __debug__:
        debug('HPAL', "Hyperalignment %s for %i datasets"
              % (self, ndatasets))

    if params.ref_ds is None:
        ref_ds = np.argmax(nfeatures)
    else:
        ref_ds = params.ref_ds
        if ref_ds < 0 or ref_ds >= ndatasets:
            raise ValueError, "Requested reference dataset %i is out of " \
                  "bounds. We have only %i datasets provided" \
                  % (ref_ds, ndatasets)
    ca.choosen_ref_ds = ref_ds
    # might prefer some other way to initialize... later
    mappers = [deepcopy(params.alignment) for ds in datasets]
    # zscore all data sets
    # ds = [ zscore(ds, chunks_attr=None) for ds in datasets]

    # Level 1 (first)
    commonspace = np.asanyarray(datasets[ref_ds])
    if params.zscore_common:
        zscore(commonspace, chunks_attr=None)
    data_mapped = [np.asanyarray(ds) for ds in datasets]
    for i, (m, data) in enumerate(zip(mappers, data_mapped)):
        if __debug__:
            debug('HPAL_', "Level 1: ds #%i" % i)
        if i == ref_ds:
            continue
        #ZSC zscore(data, chunks_attr=None)
        ds = dataset_wizard(samples=data, targets=commonspace)
        #ZSC zscore(ds, chunks_attr=None)
        m.train(ds)
        data_temp = m.forward(data)
        #ZSC zscore(data_temp, chunks_attr=None)
        data_mapped[i] = data_temp
        if residuals is not None:
            residuals[0, i] = np.linalg.norm(data_temp - commonspace)

        ## if ds_mapped == []:
        ##     ds_mapped = [zscore(m.forward(d), chunks_attr=None)]
        ## else:
        ##     ds_mapped += [zscore(m.forward(d), chunks_attr=None)]

        # zscore before adding
        # TODO: make just a function so we don't waste space
        commonspace = params.combiner1(data_mapped[i], commonspace)
        if params.zscore_common:
            zscore(commonspace, chunks_attr=None)

    # update commonspace to mean of ds_mapped
    commonspace = params.combiner2(data_mapped)
    if params.zscore_common:
        zscore(commonspace, chunks_attr=None)

    # Level 2 -- might iterate multiple times
    for loop in xrange(params.level2_niter):
        for i, (m, ds) in enumerate(zip(mappers, datasets)):
            if __debug__:
                debug('HPAL_', "Level 2 (%i-th iteration): ds #%i"
                      % (loop, i))

            ## ds_temp = zscore( (commonspace*ndatasets - ds_mapped[i])
            ##                   /(ndatasets-1), chunks_attr=None )
            ds_new = ds.copy()
            #ZSC zscore(ds_new, chunks_attr=None)
            #PRJ ds_temp = (commonspace*ndatasets - ds_mapped[i])/(ndatasets-1)
            #ZSC zscore(ds_temp, chunks_attr=None)
            ds_new.targets = commonspace #PRJ ds_temp
            m.train(ds_new) # ds_temp
            data_mapped[i] = m.forward(np.asanyarray(ds))
            if residuals is not None:
                residuals[1 + loop, i] = np.linalg.norm(
                    data_mapped[i] - commonspace)

            #ds_mapped[i] = zscore(m.forward(ds_temp), chunks_attr=None)

        commonspace = params.combiner2(data_mapped)
        if params.zscore_common:
            zscore(commonspace, chunks_attr=None)

    # Level 3 (last)
    for i, (m, ds) in enumerate(zip(mappers, datasets)):
        if __debug__:
            debug('HPAL_', "Level 3: ds #%i" % i)

        ## ds_temp = zscore( (commonspace*ndatasets - ds_mapped[i])
        ##                   /(ndatasets-1), chunks_attr=None )
        ds_new = ds.copy()          # shallow copy so we could assign new labels
        #ZSC zscore(ds_new, chunks_attr=None)
        #PRJ ds_temp = (commonspace*ndatasets - ds_mapped[i])/(ndatasets-1)
        #ZSC zscore(ds_temp, chunks_attr=None)
        ds_new.targets = commonspace #PRJ ds_temp
        m.train(ds_new) # ds_temp
        if residuals is not None:
            data_mapped = m.forward(ds_new)
            residuals[-1, i] = np.linalg.norm(data_mapped - commonspace)

    return mappers
def test_retrainables(self, clf):
    # XXX we agreed to not worry about this for the initial 0.6 release
    raise SkipTest

    # we need a copy since we will tune its internals later on
    clf = clf.clone()
    clf.ca.change_temporarily(enable_ca=['estimates'],
                              # ensure that it does do predictions
                              # while training
                              disable_ca=['training_stats'])
    clf_re = clf.clone()
    # TODO: .retrainable must have a callback to call smth like
    # _set_retrainable
    clf_re._set_retrainable(True)

    # need to have high snr so we don't 'cope' with problematic
    # datasets since otherwise unittests would fail.
    dsargs = {'perlabel': 50, 'nlabels': 2, 'nfeatures': 5, 'nchunks': 1,
              'nonbogus_features': [2, 4], 'snr': 5.0}

    ## !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
    # NB datasets will be changed by the end of testing, so if
    # we are to change to use generic datasets - make sure to copy
    # them here
    ds = deepcopy(datasets['uni2large'])
    clf.untrain()
    clf_re.untrain()
    trerr = TransferMeasure(clf, Splitter('train'),
                            postproc=BinaryFxNode(mean_mismatch_error,
                                                  'targets'))
    trerr_re = TransferMeasure(clf_re, Splitter('train'),
                               disable_ca=['training_stats'],
                               postproc=BinaryFxNode(mean_mismatch_error,
                                                     'targets'))

    # Just check for correctness of retraining
    err_1 = np.asscalar(trerr(ds))
    self.failUnless(err_1 < 0.3,
        msg="We should test here on an easy dataset. Got error of %s"
            % err_1)
    values_1 = clf.ca.estimates[:]
    # sometimes retraining gets into deeper optimization ;-)
    eps = 0.05
    corrcoef_eps = 0.85         # just to get no failures... usually > 0.95

    def batch_test(retrain=True, retest=True, closer=True):
        err = np.asscalar(trerr(ds))
        err_re = np.asscalar(trerr_re(ds))
        corr = np.corrcoef(
            clf.ca.estimates, clf_re.ca.estimates)[0, 1]
        corr_old = np.corrcoef(values_1, clf_re.ca.estimates)[0, 1]
        if __debug__:
            debug('TEST', "Retraining stats: errors %g %g corr %g "
                          "with old error %g corr %g"
                  % (err, err_re, corr, err_1, corr_old))
        self.failUnless(clf_re.ca.retrained == retrain,
                        ("Must fully train",
                         "Must retrain instead of full training")[retrain])
        self.failUnless(clf_re.ca.repredicted == retest,
                        ("Must fully test",
                         "Must retest instead of full testing")[retest])
        self.failUnless(corr > corrcoef_eps,
            msg="Result must be close to the one without retraining."
                " Got corrcoef=%s" % (corr))
        if closer:
            self.failUnless(
                corr >= corr_old,
                msg="Result must be closer to current without retraining"
                    " than to the old one. Got corrcoef=%s" % (corr_old))

    # Check sequential retraining/retesting
    for i in xrange(3):
        flag = bool(i != 0)
        # ok - on the 1st call we should train/test, then retrain/retest
        # and we can't compare for closeness to the old result since
        # we are working on the same data/classifier
        batch_test(retrain=flag, retest=flag, closer=False)

    # should retrain nicely if we change a parameter
    if 'C' in clf.params:
        clf.params.C *= 0.1
        clf_re.params.C *= 0.1
        batch_test()
    elif 'sigma_noise' in clf.params:
        clf.params.sigma_noise *= 100
        clf_re.params.sigma_noise *= 100
        batch_test()
    else:
        raise RuntimeError, \
              'Please implement testing while changing some of the ' \
              'params for clf %s' % clf

    # should retrain nicely if we change a kernel parameter
    if hasattr(clf, 'kernel_params') and len(clf.kernel_params):
        clf.kernel_params.gamma = 0.1
        clf_re.kernel_params.gamma = 0.1
        # retest is False since the kernel got recomputed, thus
        # we can't expect to use the same kernel
        batch_test(retest=not('gamma' in clf.kernel_params))

    # should retrain nicely if we change labels
    permute = AttributePermutator('targets', assure=True)
    oldlabels = dstrain.targets[:]
    dstrain = permute(dstrain)
    self.failUnless((oldlabels != dstrain.targets).any(),
        msg="We should succeed at permuting -- now got the same targets")
    ds = vstack((dstrain, dstest))
    batch_test()

    # Change labels in testing
    oldlabels = dstest.targets[:]
    dstest = permute(dstest)
    self.failUnless((oldlabels != dstest.targets).any(),
        msg="We should succeed at permuting -- now got the same targets")
    ds = vstack((dstrain, dstest))
    batch_test()

    # should re-train if we change data
    # reuse the trained SVM and its 'final' optimization point
    if not clf.__class__.__name__ in ['GPR']:
        # on GPR everything depends on the data ;-)
        oldsamples = dstrain.samples.copy()
        dstrain.samples[:] += dstrain.samples * 0.05
        self.failUnless((oldsamples != dstrain.samples).any())
        ds = vstack((dstrain, dstest))
        batch_test(retest=False)
    clf.ca.reset_changed_temporarily()

    # test retrain()
    # TODO XXX -- check validity
    clf_re.retrain(dstrain)
    self.failUnless(clf_re.ca.retrained)
    clf_re.retrain(dstrain, labels=True)
    self.failUnless(clf_re.ca.retrained)
    clf_re.retrain(dstrain, traindataset=True)
    self.failUnless(clf_re.ca.retrained)

    # test repredict()
    clf_re.repredict(dstest.samples)
    self.failUnless(clf_re.ca.repredicted)
    self.failUnlessRaises(RuntimeError, clf_re.repredict,
                          dstest.samples, labels=True)
    """for now retesting with anything changed makes no sense"""
    clf_re._set_retrainable(False)
def _call(self, dataset):
    """Perform cross-validation on a dataset.

    'dataset' is passed to the splitter instance and serves as the
    source dataset to generate splits for the single cross-validation
    folds.
    """
    # store the results of the splitprocessor
    results = []
    self.splits = []

    # local bindings
    states = self.states
    clf = self.__transerror.clf
    expose_testdataset = self.__expose_testdataset

    # what states to enable in terr
    terr_enable = []
    for state_var in ['confusion', 'training_confusion', 'samples_error']:
        if states.isEnabled(state_var):
            terr_enable += [state_var]

    # charge states with initial values
    summaryClass = clf._summaryClass
    clf_hastestdataset = hasattr(clf, 'testdataset')

    self.confusion = summaryClass()
    self.training_confusion = summaryClass()
    self.transerrors = []
    self.samples_error = dict([(id, []) for id in dataset.origids])

    # enable requested states in child TransferError instance (restored
    # again below)
    if len(terr_enable):
        self.__transerror.states._changeTemporarily(
            enable_states=terr_enable)

    # splitter
    for split in self.__splitter(dataset):
        # only train the classifier if the splitter provides something
        # in the first element of the tuple -- this is the behavior of
        # TransferError
        if states.isEnabled("splits"):
            self.splits.append(split)

        if states.isEnabled("transerrors"):
            # copy first and then train, as some classifiers cannot be
            # copied when already trained, e.g. SWIG'ed stuff
            transerror = deepcopy(self.__transerror)
        else:
            transerror = self.__transerror

        # assign testing dataset if the given classifier can digest it
        if clf_hastestdataset and expose_testdataset:
            clf.testdataset = split[1]

        # run the beast
        result = transerror(split[1], split[0])

        # unbind the testdataset from the classifier
        if clf_hastestdataset and expose_testdataset:
            clf.testdataset = None

        # next line is important for the 'self._harvest' call
        self._harvest(locals())

        # XXX Look below -- may be we should have not auto added .?
        #     then transerrors also could be deprecated
        if states.isEnabled("transerrors"):
            self.transerrors.append(transerror)

        # XXX: could be merged with next for loop using a utility class
        # that can add dict elements into a list
        if states.isEnabled("samples_error"):
            for k, v in \
                    transerror.states.getvalue("samples_error").iteritems():
                self.samples_error[k].append(v)

        # pull in child states
        for state_var in ['confusion', 'training_confusion']:
            if states.isEnabled(state_var):
                states.getvalue(state_var).__iadd__(
                    transerror.states.getvalue(state_var))

        if __debug__:
            debug("CROSSC", "Split #%d: result %s"
                  % (len(results), `result`))
        results.append(result)

    # Since we could have operated with a copy -- bind the last used
    # one back
    self.__transerror = transerror

    # put states of child TransferError back into original config
    if len(terr_enable):
        self.__transerror.states._resetEnabledTemporarily()

    self.results = results
    """Store state variable if it is enabled"""

    # Provide those labels_map if appropriate
    try:
        if states.isEnabled("confusion"):
            states.confusion.labels_map = dataset.labels_map
        if states.isEnabled("training_confusion"):
            states.training_confusion.labels_map = dataset.labels_map
    except:
        pass

    return self.__combiner(results)
def resample(self, nt=None, sr=None, dt=None, window='ham',
             inplace=True, **kwargs):
    """Convenience method to resample data sample channel-wise.

    The resampling target can be specified by a number of timepoints,
    a temporal distance, or a sampling rate. Please note that this
    method only operates on `ChannelDataset` and always returns such.

    :Parameters:
      nt: int
        Number of timepoints to resample to.
      dt: float
        Temporal distance of samples after resampling.
      sr: float
        Target sampling rate.
      inplace : bool
        If inplace=False, it would create and return a new dataset
        with new samples.
      **kwargs:
        All additional arguments are passed to resample() from
        scipy.signal.

    :Return:
      ChannelDataset
    """
    if nt is None and sr is None and dt is None:
        raise ValueError, \
              "Required argument missing. Either needs nt, sr or dt."

    # get data in original shape
    orig_data = self.O

    if len(orig_data.shape) != 3:
        raise ValueError, \
              "resample() only works with data from ChannelDataset."

    orig_nt = orig_data.shape[2]
    orig_length = self.dt * orig_nt

    if nt is None:
        # translate dt or sr into nt
        if not dt is None:
            nt = orig_nt * float(self.dt) / dt
        elif not sr is None:
            nt = orig_nt * float(sr) / self.samplingrate
        else:
            raise RuntimeError, 'This should not happen!'
    else:
        raise RuntimeError, 'This should not happen!'

    nt = N.round(nt)

    # downsample data
    data = signal.resample(orig_data, nt, axis=2, window=window, **kwargs)
    new_dt = float(orig_length) / nt

    # would be needed for non-inplace generation
    dsattr = self._dsattr

    if inplace:
        dsattr['ch_dt'] = new_dt
        # XXX We could have resampled range(nsamples) and rounded it,
        #     and then adjusted the mapper's mask accordingly instead
        #     of creating a new one. It would give us the opportunity
        #     to assess what resampling did...
        mapper = MaskMapper(N.ones(data.shape[1:], dtype='bool'))
        # reassign a new mapper.
        dsattr['mapper'] = mapper
        self.samples = mapper.forward(data)
        return self
    else:
        # we have to pass dsattr inside so we don't lose some
        # additional attributes such as labels_map
        dsattr = copy.deepcopy(dsattr)
        return ChannelDataset(data=self._data,
                              dsattr=dsattr,
                              samples=data,
                              t0=self.t0,
                              dt=new_dt,
                              channelids=self.channelids,
                              copy_data=True,
                              copy_dsattr=False)
class _SVM(Classifier):
    """Support Vector Machine Classifier.

    Base class for all external SVM implementations.
    """

    """
    Derived classes should define:

    * _KERNELS: map(dict) should define assignment to a tuple containing
      implementation kernel type, list of parameters adherent to the
      kernel, and sensitivity analyzer e.g.::

        _KERNELS = {
             'linear': (shogun.Kernel.LinearKernel, (), LinearSVMWeights),
             'rbf' :   (shogun.Kernel.GaussianKernel, ('gamma',), None),
             ...
             }

    * _KNOWN_IMPLEMENTATIONS: map(dict) should define assignment to a
      tuple containing implementation of the SVM, list of parameters
      adherent to the implementation, additional internals, and
      description e.g.::

        _KNOWN_IMPLEMENTATIONS = {
          'C_SVC' : (svm.svmc.C_SVC, ('C',),
                     ('binary', 'multiclass'), 'C-SVM classification'),
          ...
          }
    """

    _ATTRIBUTE_COLLECTIONS = ['params']  # enforce presence of params collections

    # Placeholder: map kernel names to sensitivity classes, i.e.
    # 'linear':LinearSVMWeights, for each backend
    _KNOWN_SENSITIVITIES = {}

    kernel = Parameter(None, allowedtype=Kernel,
                       doc='Kernel object', index=-1)

    _SVM_PARAMS = {
        'C': Parameter(-1.0,
                  doc='Trade-off parameter between width of the '
                      'margin and number of support vectors. Higher C -- '
                      'more rigid margin SVM. In linear kernel, negative '
                      'values provide automatic scaling of their value '
                      'according to the norm of the data'),
        'nu': Parameter(0.5, min=0.0, max=1.0,
                  doc='Fraction of datapoints within the margin'),
        'cache_size': Parameter(100,
                  doc='Size of the kernel cache, specified in megabytes'),
        'tube_epsilon': Parameter(0.01,
                  doc='Epsilon in epsilon-insensitive loss function of '
                      'epsilon-SVM regression (SVR)'),
        'tau': Parameter(1e-6,
                  doc='TAU parameter of KRR regression in shogun'),
        'probability': Parameter(0,
                  doc='Flag to signal whether a probability estimate is '
                      'obtained within LIBSVM'),
        'shrinking': Parameter(1,
                  doc='Whether shrinking is to be conducted'),
        'weight_label': Parameter([], allowedtype='[int]',
                  doc='To be used in conjunction with weight for custom '
                      'per-label weight'),
        # TODO : merge them into a single dictionary
        'weight': Parameter([], allowedtype='[double]',
                  doc='Custom weights per label'),
        # For some reason setting up epsilon to 1e-5 slowed things down a bit
        # in comparison to how it was before (in yoh/master) by up to 20%...
        # not clear why; may be related to the 1e-3 default within _svm.py?
        'epsilon': Parameter(5e-5, min=1e-10,
                  doc='Tolerance of termination criteria. '
                      '(For nu-SVM default is 0.001)')
        }

    _KNOWN_PARAMS = ()          # just a placeholder to please lintian
    """Parameters which are specific to a given instantiation of SVM
    """

    __tags__ = ['svm', 'kernel-based', 'swig']

    def __init__(self, **kwargs):
        """Init base class of SVMs. *Not to be publicly used*

        TODO: handling of parameters might migrate to be generic for
        all classifiers. SVMs are chosen to be the testbase for that
        functionality to see how well it would fit.
        """

        # Check if requested implementation is known
        svm_impl = kwargs.get('svm_impl', None)
        if not svm_impl in self._KNOWN_IMPLEMENTATIONS:
            raise ValueError, \
                  "Unknown SVM implementation '%s' is requested for %s. " \
                  "Known are: %s" % (svm_impl, self.__class__,
                                     self._KNOWN_IMPLEMENTATIONS.keys())
        self._svm_impl = svm_impl

        impl, add_params, add_internals, descr = \
              self._KNOWN_IMPLEMENTATIONS[svm_impl]

        # Add corresponding parameters to 'known' depending on the
        # implementation chosen
        if add_params is not None:
            self._KNOWN_PARAMS = \
                self._KNOWN_PARAMS[:] + list(add_params)

        # Assign per-instance __tags__
        self.__tags__ = self.__tags__[:]

        # Add corresponding internals
        if add_internals is not None:
            self.__tags__ += list(add_internals)
        self.__tags__.append(svm_impl)

        k = kwargs.get('kernel', None)
        if k is None:
            kwargs['kernel'] = self.__default_kernel_class__()
        if 'linear' in ('%s' % kwargs['kernel']).lower():
            # XXX not necessarily the best way to spot a linear kernel
            self.__tags__ += ['linear', 'has_sensitivity']
        else:
            self.__tags__ += ['non-linear']

        # pop out all args from **kwargs which are known to be SVM parameters
        _args = {}
        for param in self._KNOWN_PARAMS + ['svm_impl']: # Update to remove kp's?
            if param in kwargs:
                _args[param] = kwargs.pop(param)

        try:
            Classifier.__init__(self, **kwargs)
        except TypeError, e:
            if "__init__() got an unexpected keyword argument " in e.args[0]:
                # TODO: make it even more specific -- if that argument is
                #       listed within _SVM_PARAMS
                e.args = tuple(
                    [e.args[0] +
                     "\n Given SVM instance of class %s knows following "
                     "parameters: %s" % (self.__class__,
                                         self._KNOWN_PARAMS)] +
                    list(e.args)[1:])
            raise e

        # populate collections and add values from arguments
        for paramfamily, paramset in ((self._KNOWN_PARAMS, self.params),):
            for paramname in paramfamily:
                if not (paramname in self._SVM_PARAMS):
                    raise ValueError, \
                          "Unknown parameter %s" % paramname + \
                          ". Known SVM params are: %s" \
                          % self._SVM_PARAMS.keys()
                param = deepcopy(self._SVM_PARAMS[paramname])
                if paramname in _args:
                    param.value = _args[paramname]
                    # XXX might want to set default to it -- not just value
                paramset[paramname] = param

        # TODO: Below commented out because kernel_type has been removed.
        #       Find a way to set default C as necessary

        # tune up C if it has one and a non-linear classifier is used
        #if self.params.has_key('C') and kernel_type != "linear" \
        #       and self.params['C'].is_default:
        #    if __debug__:
        #        debug("SVM_", "Assigning default C value to be 1.0 for SVM "
        #              "%s with non-linear kernel" % self)
        #    self.params['C'].default = 1.0

        # Some postchecks
        if self.params.has_key('weight') \
           and self.params.has_key('weight_label'):
            if not len(self.params.weight_label) == len(self.params.weight):
                raise ValueError, \
                      "Lengths of 'weight' and 'weight_label' lists " \
                      "must be equal."

        if __debug__:
            debug("SVM", "Initialized %s with kernel %s"
                  % (self, self.params.kernel))
def _call(self, dataset):
    """Perform cross-validation on a dataset.

    'dataset' is passed to the splitter instance and serves as the source
    dataset to generate splits for the individual cross-validation folds.
    """
    # store the results of the split processor
    results = []
    self.ca.splits = []

    # local bindings
    ca = self.ca
    clf = self.__transerror.clf
    expose_testdataset = self.__expose_testdataset

    # what ca to enable in terr
    terr_enable = []
    for state_var in ['confusion', 'training_confusion', 'samples_error']:
        if ca.is_enabled(state_var):
            terr_enable += [state_var]

    # charge ca with initial values
    summaryClass = clf.__summary_class__
    clf_hastestdataset = hasattr(clf, 'testdataset')

    self.ca.confusion = summaryClass()
    self.ca.training_confusion = summaryClass()
    self.ca.transerrors = []
    if ca.is_enabled('samples_error'):
        dataset.init_origids('samples',
                             attr=self.__samples_idattr, mode='existing')
        self.ca.samples_error = dict(
            [(id_, []) for id_ in dataset.sa[self.__samples_idattr].value])

    # enable requested ca in child TransferError instance (restored
    # again below)
    if len(terr_enable):
        self.__transerror.ca.change_temporarily(enable_ca=terr_enable)

    # We better ensure that the underlying classifier is not trained if we
    # are going to deepcopy transerror
    if ca.is_enabled("transerrors"):
        self.__transerror.untrain()

    # collect summary info about the splits that were made, for the
    # resulting dataset
    splitinfo = []

    # splitter
    for split in self.__splitter(dataset):
        splitinfo.append(
            "%s->%s"
            % (','.join([str(c) for c in
                         split[0].sa[self.__splitter.splitattr].unique]),
               ','.join([str(c) for c in
                         split[1].sa[self.__splitter.splitattr].unique])))

        # only train the classifier if the splitter provides something in
        # the first element of the tuple -- this is the behavior of
        # TransferError
        if ca.is_enabled("splits"):
            self.ca.splits.append(split)

        if ca.is_enabled("transerrors"):
            # copy first and then train, as some classifiers cannot be
            # copied when already trained, e.g. SWIG'ed stuff
            lastsplit = None
            for ds in split:
                if ds is not None:
                    lastsplit = ds.a.lastsplit
                    break
            if lastsplit:
                # only if we could deduce that it was the last split --
                # use the 'mother' transerror
                transerror = self.__transerror
            else:
                # otherwise -- deep copy
                transerror = deepcopy(self.__transerror)
        else:
            transerror = self.__transerror

        # assign testing dataset if the given classifier can digest it
        if clf_hastestdataset and expose_testdataset:
            transerror.clf.testdataset = split[1]

        # run the beast
        result = transerror(split[1], split[0])

        # unbind the testdataset from the classifier
        if clf_hastestdataset and expose_testdataset:
            transerror.clf.testdataset = None

        # next line is important for the 'self._harvest' call
        self._harvest(locals())

        # XXX Look below -- maybe we should not have auto-added it?
        #     then transerrors could also be deprecated
        if ca.is_enabled("transerrors"):
            self.ca.transerrors.append(transerror)

        # XXX: could be merged with the next for-loop using a utility
        #      class that can add dict elements into a list
        if ca.is_enabled("samples_error"):
            for k, v in transerror.ca.samples_error.iteritems():
                self.ca.samples_error[k].append(v)

        # pull in child ca
        for state_var in ['confusion', 'training_confusion']:
            if ca.is_enabled(state_var):
                ca[state_var].value.__iadd__(
                    transerror.ca[state_var].value)

        if __debug__:
            debug("CROSSC", "Split #%d: result %s"
                  % (len(results), repr(result)))
        results.append(result)

    # Since we could have operated with a copy -- bind the last used one back
    self.__transerror = transerror

    # put ca of child TransferError back into original config
    if len(terr_enable):
        self.__transerror.ca.reset_changed_temporarily()

    self.ca.results = results
    """Store conditional attribute if it is enabled"""

    results = Dataset(results, sa={'cv_fold': splitinfo})
    return results
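# A hedged, self-contained sketch (plain NumPy, hypothetical toy_* names) of
# the control flow implemented by _call above: iterate over splits, evaluate a
# transfer error on each fold -- note the argument order, testing set first,
# training set second, mirroring transerror(split[1], split[0]) -- and collect
# per-fold results together with split bookkeeping.
import numpy as np

def toy_cross_validation(splits, transfer_error):
    results, splitinfo = [], []
    for train, test in splits:
        splitinfo.append("%d->%d" % (len(train[1]), len(test[1])))
        results.append(transfer_error(test, train))
    return np.asarray(results), splitinfo

def toy_transfer_error(test, train):
    # stand-in for TransferError: train a 1-nearest-neighbour classifier
    # on `train` and return the misclassification rate on `test`
    (Xtr, ytr), (Xte, yte) = train, test
    pred = [ytr[np.argmin(((Xtr - x) ** 2).sum(axis=1))] for x in Xte]
    return np.mean(np.asarray(pred) != np.asarray(yte))

# usage on two toy folds
X = np.random.randn(20, 3)
y = np.array([0, 1] * 10)
folds = [((X[:10], y[:10]), (X[10:], y[10:])),
         ((X[10:], y[10:]), (X[:10], y[:10]))]
errs, info = toy_cross_validation(folds, toy_transfer_error)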
def test_retrainables(self, clf):
    # we need a copy since we will tune its internals later on
    clf = clf.clone()
    clf.ca.change_temporarily(
        enable_ca=['estimates'],
        # ensure that it does do predictions while training
        disable_ca=['training_confusion'])
    clf_re = clf.clone()
    # TODO: .retrainable must have a callback to call smth like
    #       _set_retrainable
    clf_re._set_retrainable(True)

    # need high SNR so we don't have to cope with problematic datasets,
    # since otherwise unittests would fail
    dsargs = {'perlabel': 50, 'nlabels': 2, 'nfeatures': 5, 'nchunks': 1,
              'nonbogus_features': [2, 4], 'snr': 5.0}

    ## !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
    # NB datasets will be changed by the end of testing, so if we
    #    are to switch to generic datasets -- make sure to copy
    #    them here
    dstrain = deepcopy(datasets['uni2large_train'])
    dstest = deepcopy(datasets['uni2large_test'])

    clf.untrain()
    clf_re.untrain()
    trerr, trerr_re = TransferError(clf), \
                      TransferError(clf_re,
                                    disable_ca=['training_confusion'])

    # Just check for correctness of retraining
    err_1 = trerr(dstest, dstrain)
    self.failUnless(
        err_1 < 0.3,
        msg="We should test here on an easy dataset. Got error of %s" % err_1)
    values_1 = clf.ca.estimates[:]
    # sometimes retraining gets into deeper optimization ;-)
    eps = 0.05
    corrcoef_eps = 0.85        # just to get no failures... usually > 0.95

    def batch_test(retrain=True, retest=True, closer=True):
        err = trerr(dstest, dstrain)
        err_re = trerr_re(dstest, dstrain)
        corr = np.corrcoef(clf.ca.estimates, clf_re.ca.estimates)[0, 1]
        corr_old = np.corrcoef(values_1, clf_re.ca.estimates)[0, 1]
        if __debug__:
            debug('TEST', "Retraining stats: errors %g %g corr %g "
                  "with old error %g corr %g"
                  % (err, err_re, corr, err_1, corr_old))
        self.failUnless(clf_re.ca.retrained == retrain,
                        ("Must fully train",
                         "Must retrain instead of full training")[retrain])
        self.failUnless(clf_re.ca.repredicted == retest,
                        ("Must fully test",
                         "Must retest instead of full testing")[retest])
        self.failUnless(
            corr > corrcoef_eps,
            msg="Result must be close to the one without retraining."
                " Got corrcoef=%s" % (corr))
        if closer:
            self.failUnless(
                corr >= corr_old,
                msg="Result must be closer to the current one without "
                    "retraining than to the old one. Got corrcoef=%s"
                    % (corr_old))

    # Check sequential retraining/retesting
    for i in xrange(3):
        flag = bool(i != 0)
        # ok -- on the 1st call we should train/test, then retrain/retest,
        # and we can't compare for closeness to the old result since
        # we are working on the same data/classifier
        batch_test(retrain=flag, retest=flag, closer=False)

    # should retrain nicely if we change a parameter
    if 'C' in clf.params:
        clf.params.C *= 0.1
        clf_re.params.C *= 0.1
        batch_test()
    elif 'sigma_noise' in clf.params:
        clf.params.sigma_noise *= 100
        clf_re.params.sigma_noise *= 100
        batch_test()
    else:
        raise RuntimeError, \
              'Please implement testing while changing some of the ' \
              'params for clf %s' % clf

    # should retrain nicely if we change a kernel parameter
    if hasattr(clf, 'kernel_params') and len(clf.kernel_params):
        clf.kernel_params.gamma = 0.1
        clf_re.kernel_params.gamma = 0.1
        # retest is False since the kernel got recomputed, thus we
        # can't expect to reuse the same kernel
        batch_test(retest=not ('gamma' in clf.kernel_params))

    # should retrain nicely if we change labels
    oldlabels = dstrain.targets[:]
    dstrain.permute_attr(assure_permute=True)
    self.failUnless(
        (oldlabels != dstrain.targets).any(),
        msg="We should succeed at permuting -- got the same targets instead")
    batch_test()

    # change labels in testing
    oldlabels = dstest.targets[:]
    dstest.permute_attr(assure_permute=True)
    self.failUnless(
        (oldlabels != dstest.targets).any(),
        msg="We should succeed at permuting -- got the same targets instead")
    batch_test()

    # should re-train if we change the data
    # reuse the trained SVM and its 'final' optimization point
    if not clf.__class__.__name__ in ['GPR']:
        # on GPR everything depends on the data ;-)
        oldsamples = dstrain.samples.copy()
        dstrain.samples[:] += dstrain.samples * 0.05
        self.failUnless((oldsamples != dstrain.samples).any())
        batch_test(retest=False)
    clf.ca.reset_changed_temporarily()

    # test retrain()
    # TODO XXX -- check validity
    clf_re.retrain(dstrain)
    self.failUnless(clf_re.ca.retrained)
    clf_re.retrain(dstrain, labels=True)
    self.failUnless(clf_re.ca.retrained)
    clf_re.retrain(dstrain, traindataset=True)
    self.failUnless(clf_re.ca.retrained)

    # test repredict()
    clf_re.repredict(dstest.samples)
    self.failUnless(clf_re.ca.repredicted)
    self.failUnlessRaises(RuntimeError, clf_re.repredict,
                          dstest.samples, labels=True)
    """for now retesting with anything changed makes no sense"""
    clf_re._set_retrainable(False)