def _forward_dataset(self, ds):
    """Z-score a dataset using previously trained parameters.

    Applies the per-chunk (or global) mean/std stored in
    ``self.__params_dict`` to ``ds`` and returns the transformed dataset.
    Raises ``RuntimeError`` if the mapper has not been trained, or if a
    chunk in ``ds`` was not present during training.
    """
    # local binding
    chunks_attr = self.__chunks_attr
    dtype = self.__dtype

    if __debug__ and chunks_attr is not None:
        nsamples_per_chunk = get_nsamples_per_attr(ds, chunks_attr)
        # materialize .values() so np.min works on Python 3 dict views too
        min_nsamples_per_chunk = np.min(list(nsamples_per_chunk.values()))
        # chained comparison instead of `in range(3, 6)` membership test
        if 3 <= min_nsamples_per_chunk <= 5:
            warning("Z-scoring chunk-wise having a chunk with only "
                    "%d samples is 'discouraged'. "
                    "You have chunks with following number of samples: %s"
                    % (min_nsamples_per_chunk, nsamples_per_chunk,))
        if min_nsamples_per_chunk <= 2:
            warning("Z-scoring chunk-wise having a chunk with less "
                    "than three samples will set features in these "
                    "samples to either zero (with 1 sample in a chunk) "
                    "or -1/+1 (with 2 samples in a chunk). "
                    "You have chunks with following number of samples: %s"
                    % (nsamples_per_chunk,))

    params = self.__params_dict
    if params is None:
        # call-form raise: works on both Python 2 and 3 (statement form
        # `raise RuntimeError, msg` is Python-2-only)
        raise RuntimeError(
            "ZScoreMapper needs to be trained before call to forward")

    if self._secret_inplace_zscore:
        mds = ds
    else:
        # shallow copy to put the new stuff in
        mds = ds.copy(deep=False)
        # but deepcopy the samples since _zscore would modify inplace
        mds.samples = mds.samples.copy()

    # cast the data to float, since in-place operations below do not upcast!
    if np.issubdtype(mds.samples.dtype, np.integer):
        mds.samples = mds.samples.astype(dtype)

    if '__all__' in params:
        # we have a global parameter set
        mds.samples = self._zscore(mds.samples, *params['__all__'])
    else:
        # per chunk z-scoring
        for c in mds.sa[chunks_attr].unique:
            if c not in params:
                raise RuntimeError(
                    "%s has no parameters for chunk '%s'. It probably "
                    "wasn't present in the training dataset!?"
                    % (self.__class__.__name__, c))
            slicer = np.where(mds.sa[chunks_attr].value == c)[0]
            mds.samples[slicer] = self._zscore(mds.samples[slicer],
                                               *params[c])

    return mds
def _forward_dataset(self, ds):
    """Z-score a dataset with the parameters learned during training.

    Uses the per-chunk (or ``'__all__'`` global) statistics stored in
    ``self.__params_dict``. Raises ``RuntimeError`` if untrained or if a
    chunk of ``ds`` has no stored parameters.
    """
    # local binding
    chunks_attr = self.__chunks_attr
    dtype = self.__dtype

    # `is not None` reads better than `not ... is None`
    if __debug__ and chunks_attr is not None:
        nsamples_per_chunk = get_nsamples_per_attr(ds, chunks_attr)
        # wrap .values() in list() so np.min also accepts Py3 dict views
        min_nsamples_per_chunk = np.min(list(nsamples_per_chunk.values()))
        # chained comparison instead of membership in range(3, 6)
        if 3 <= min_nsamples_per_chunk <= 5:
            warning(
                "Z-scoring chunk-wise having a chunk with only "
                "%d samples is 'discouraged'. "
                "You have chunks with following number of samples: %s"
                % (min_nsamples_per_chunk, nsamples_per_chunk)
            )
        if min_nsamples_per_chunk <= 2:
            warning(
                "Z-scoring chunk-wise having a chunk with less "
                "than three samples will set features in these "
                "samples to either zero (with 1 sample in a chunk) "
                "or -1/+1 (with 2 samples in a chunk). "
                "You have chunks with following number of samples: %s"
                % (nsamples_per_chunk,)
            )

    params = self.__params_dict
    if params is None:
        # call-form raise: valid on Python 2 and 3 alike
        raise RuntimeError(
            "ZScoreMapper needs to be trained before call to forward")

    if self._secret_inplace_zscore:
        mds = ds
    else:
        # shallow copy to put the new stuff in
        mds = ds.copy(deep=False)
        # but deepcopy the samples since _zscore would modify inplace
        mds.samples = mds.samples.copy()

    # cast the data to float, since in-place operations below do not upcast!
    if np.issubdtype(mds.samples.dtype, np.integer):
        mds.samples = mds.samples.astype(dtype)

    if "__all__" in params:
        # we have a global parameter set
        mds.samples = self._zscore(mds.samples, *params["__all__"])
    else:
        # per chunk z-scoring
        for c in mds.sa[chunks_attr].unique:
            if c not in params:
                raise RuntimeError(
                    "%s has no parameters for chunk '%s'. It probably "
                    "wasn't present in the training dataset!?"
                    % (self.__class__.__name__, c)
                )
            slicer = np.where(mds.sa[chunks_attr].value == c)[0]
            mds.samples[slicer] = self._zscore(mds.samples[slicer],
                                               *params[c])

    return mds
def _forward_dataset(self, ds):
    """Z-score a dataset with previously trained parameters.

    Applies the stored per-chunk (or ``'__all__'`` global) statistics to
    ``ds``. Raises ``RuntimeError`` when the mapper is untrained or when a
    chunk in ``ds`` has no stored parameters.
    """
    # local binding
    chunks_attr = self.__chunks_attr
    dtype = self.__dtype

    # list() around .values() keeps this working with Py3 dict views;
    # `is not None` replaces `not ... is None`
    if (
        __debug__
        and chunks_attr is not None
        and np.array(
            list(get_nsamples_per_attr(ds, chunks_attr).values())
        ).min() <= 2
    ):
        warning(
            "Z-scoring chunk-wise having a chunk with less than three "
            "samples will set features in these samples to either zero "
            "(with 1 sample in a chunk) "
            "or -1/+1 (with 2 samples in a chunk)."
        )

    params = self.__params_dict
    if params is None:
        # call-form raise: valid on both Python 2 and 3
        raise RuntimeError(
            "ZScoreMapper needs to be trained before call to forward")

    if self._secret_inplace_zscore:
        mds = ds
    else:
        # shallow copy to put the new stuff in
        mds = ds.copy(deep=False)
        # deepcopy the samples as well -- _zscore below modifies them
        # in place, so without this a float-dtype input dataset would be
        # silently mutated despite the non-inplace mode (matches the
        # sibling implementations of this method)
        mds.samples = mds.samples.copy()

    # cast the data to float, since in-place operations below do not upcast!
    if np.issubdtype(mds.samples.dtype, np.integer):
        mds.samples = mds.samples.astype(dtype)

    if "__all__" in params:
        # we have a global parameter set
        mds.samples = self._zscore(mds.samples, *params["__all__"])
    else:
        # per chunk z-scoring
        for c in mds.sa[chunks_attr].unique:
            if c not in params:
                raise RuntimeError(
                    "%s has no parameters for chunk '%s'. It probably "
                    "wasn't present in the training dataset!?"
                    % (self.__class__.__name__, c)
                )
            slicer = np.where(mds.sa[chunks_attr].value == c)[0]
            mds.samples[slicer] = self._zscore(mds.samples[slicer],
                                               *params[c])

    return mds
def test_cper_class(self, clf):
    """Check that per-class C values can rebalance a skewed dataset.

    Runs cross-validation on a balanced set, a disbalanced set, and the
    disbalanced set again with ratio-derived per-class Cs, then asserts
    the expected TPR pattern for the minor class.
    """
    # `has_key` is deprecated (removed in Python 3) -- use `in`
    if 'C' not in clf.params:
        # skip those without C
        return

    ds = datasets['uni2medium'].copy()
    ds__ = datasets['uni2medium'].copy()
    #
    # balanced set
    # Lets add a bit of noise to drive classifier nuts. same
    # should be done for disbalanced set
    ds__.samples = ds__.samples + \
        0.5 * np.random.normal(size=(ds__.samples.shape))
    #
    # disbalanced set
    # lets overpopulate label 0
    times = 20
    # list(range(...)) so the concatenation also works on Python 3,
    # where range objects cannot be added
    ds_ = ds[(list(range(ds.nsamples))
              + list(range(ds.nsamples // 2)) * times)]
    ds_.samples = ds_.samples + \
        0.5 * np.random.normal(size=(ds_.samples.shape))
    spl = get_nsamples_per_attr(ds_, 'targets')  # _.samplesperlabel
    #print ds_.targets, ds_.chunks

    cve = CrossValidation(clf, NFoldPartitioner(), enable_ca='stats')
    # on balanced
    e = cve(ds__)
    tpr_1 = cve.ca.stats.stats["TPR"][1]

    # on disbalanced
    e = cve(ds_)
    tpr_2 = cve.ca.stats.stats["TPR"][1]

    # Set '1 C per label'
    # recreate cvte since previous might have operated on copies
    cve = CrossValidation(clf, NFoldPartitioner(), enable_ca='stats')
    oldC = clf.params.C
    # TODO: provide clf.params.C not with a tuple but dictionary
    #       with C per label (now order is deduced in a cruel way)
    ratio = np.sqrt(float(spl[ds_.UT[0]]) / spl[ds_.UT[1]])
    clf.params.C = (-1 / ratio, -1 * ratio)
    # try/finally guarantees C is restored on both success and failure,
    # replacing the bare-except restore-and-reraise idiom
    try:
        # on disbalanced but with balanced C
        e_ = cve(ds_)
    finally:
        # reassign C
        clf.params.C = oldC
    tpr_3 = cve.ca.stats.stats["TPR"][1]

    # Actual tests
    if cfg.getboolean('tests', 'labile', default='yes'):
        self.assertTrue(tpr_1 > 0.25,
                        msg="Without disballance we should have some "
                        "hits, but got TPR=%.3f" % tpr_1)
        self.assertTrue(tpr_2 < 0.25,
                        msg="With disballance we should have almost no "
                        "hits for minor, but got TPR=%.3f" % tpr_2)
        self.assertTrue(tpr_3 > 0.25,
                        msg="With disballanced data but ratio-based Cs "
                        "we should have some hits for minor, but got "
                        "TPR=%.3f" % tpr_3)
def test_cper_class(self, clf):
    """Check that per-class C values can rebalance a skewed dataset.

    Runs cross-validation on a balanced set, a disbalanced set, and the
    disbalanced set again with ratio-derived per-class Cs, then asserts
    the expected TPR pattern for the minor class. Also verifies the
    classifier summary output for libsvm SVMs.
    """
    if not ('C' in clf.params):
        # skip those without C
        return

    ds = datasets['uni2medium'].copy()
    ds__ = datasets['uni2medium'].copy()
    #
    # balanced set
    # Lets add a bit of noise to drive classifier nuts. same
    # should be done for disbalanced set
    ds__.samples += 0.5 * np.random.normal(size=(ds__.samples.shape))
    #
    # disbalanced set
    # lets overpopulate label 0
    times = 20
    ds_ = ds[(list(range(ds.nsamples))
              + list(range(ds.nsamples // 2)) * times)]
    ds_.samples += 0.5 * np.random.normal(size=(ds_.samples.shape))
    spl = get_nsamples_per_attr(ds_, 'targets')  # _.samplesperlabel

    cve = CrossValidation(clf, NFoldPartitioner(), enable_ca='stats')
    # on balanced
    e = cve(ds__)
    tpr_1 = cve.ca.stats.stats["TPR"][1]

    # we should be able to print summary for the classifier
    clf_summary = clf.summary()
    if externals.exists('libsvm') and isinstance(clf, libsvm.SVM):
        self.assertIn(" #SVs:", clf_summary)
        self.assertIn(" #bounded_SVs:", clf_summary)
        self.assertIn(" used_C:", clf_summary)

    # on disbalanced
    e = cve(ds_)
    tpr_2 = cve.ca.stats.stats["TPR"][1]

    # Set '1 C per label'
    # recreate cvte since previous might have operated on copies
    cve = CrossValidation(clf, NFoldPartitioner(), enable_ca='stats')
    oldC = clf.params.C
    # TODO: provide clf.params.C not with a tuple but dictionary
    #       with C per label (now order is deduced in a cruel way)
    ratio = np.sqrt(float(spl[ds_.UT[0]]) / spl[ds_.UT[1]])
    clf.params.C = (-1 / ratio, -1 * ratio)
    # try/finally guarantees C is restored on both success and failure,
    # replacing the bare-except restore-and-reraise idiom
    try:
        # on disbalanced but with balanced C
        e_ = cve(ds_)
    finally:
        # reassign C
        clf.params.C = oldC
    tpr_3 = cve.ca.stats.stats["TPR"][1]

    # Actual tests
    if cfg.getboolean('tests', 'labile', default='yes'):
        self.assertTrue(tpr_1 > 0.25,
                        msg="Without disballance we should have some "
                        "hits, but got TPR=%.3f" % tpr_1)
        self.assertTrue(tpr_2 < 0.25,
                        msg="With disballance we should have almost no "
                        "hits for minor, but got TPR=%.3f" % tpr_2)
        self.assertTrue(tpr_3 > 0.25,
                        msg="With disballanced data but ratio-based Cs "
                        "we should have some hits for minor, but got "
                        "TPR=%.3f" % tpr_3)