Example #1
    def _forward_dataset(self, ds):
        # local binding
        chunks_attr = self.__chunks_attr
        dtype = self.__dtype

        if __debug__ and chunks_attr is not None:
            nsamples_per_chunk = get_nsamples_per_attr(ds, chunks_attr)
            min_nsamples_per_chunk = np.min(list(nsamples_per_chunk.values()))
            if min_nsamples_per_chunk in range(3, 6):
                warning(
                    "Z-scoring chunk-wise with a chunk of only "
                    "%d samples is discouraged. "
                    "You have chunks with the following numbers of samples: %s" % (
                        min_nsamples_per_chunk,
                        nsamples_per_chunk,
                    ))
            if min_nsamples_per_chunk <= 2:
                warning(
                    "Z-scoring chunk-wise with a chunk of fewer "
                    "than three samples will set features in these "
                    "samples to either zero (with 1 sample in a chunk) "
                    "or -1/+1 (with 2 samples in a chunk). "
                    "You have chunks with the following numbers of samples: %s" %
                    (nsamples_per_chunk, ))

        params = self.__params_dict
        if params is None:
            raise RuntimeError(
                "ZScoreMapper needs to be trained before call to forward")

        if self._secret_inplace_zscore:
            mds = ds
        else:
            # shallow copy to put the new stuff in
            mds = ds.copy(deep=False)
            # but deepcopy the samples since _zscore would modify inplace
            mds.samples = mds.samples.copy()

        # cast the data to float, since in-place operations below do not upcast!
        if np.issubdtype(mds.samples.dtype, np.integer):
            mds.samples = mds.samples.astype(dtype)

        if '__all__' in params:
            # we have a global parameter set
            mds.samples = self._zscore(mds.samples, *params['__all__'])
        else:
            # per chunk z-scoring
            for c in mds.sa[chunks_attr].unique:
                if c not in params:
                    raise RuntimeError(
                        "%s has no parameters for chunk '%s'. It probably "
                        "wasn't present in the training dataset!?" %
                        (self.__class__.__name__, c))
                slicer = np.where(mds.sa[chunks_attr].value == c)[0]
                mds.samples[slicer] = self._zscore(mds.samples[slicer],
                                                   *params[c])

        return mds
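
For orientation, a mapper like this is trained first and then applied. A minimal usage sketch (PyMVPA-flavoured; the module path and train/forward API are assumed rather than verified against a particular release):

    # Hypothetical usage sketch -- train/forward is what eventually
    # invokes the _forward_dataset() method shown above.
    from mvpa2.mappers.zscore import ZScoreMapper  # assumed module path

    zm = ZScoreMapper(chunks_attr='chunks')  # estimate (mean, std) per chunk
    zm.train(ds_train)                       # fills the internal params dict
    ds_z = zm.forward(ds_test)               # applies the stored parameters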
Example #2
    def _forward_dataset(self, ds):
        # local binding
        chunks_attr = self.__chunks_attr
        dtype = self.__dtype

        if __debug__ and chunks_attr is not None:
            nsamples_per_chunk = get_nsamples_per_attr(ds, chunks_attr)
            min_nsamples_per_chunk = np.min(list(nsamples_per_chunk.values()))
            if min_nsamples_per_chunk in range(3, 6):
                warning(
                    "Z-scoring chunk-wise with a chunk of only "
                    "%d samples is discouraged. "
                    "You have chunks with the following numbers of samples: %s"
                    % (min_nsamples_per_chunk, nsamples_per_chunk)
                )
            if min_nsamples_per_chunk <= 2:
                warning(
                    "Z-scoring chunk-wise with a chunk of fewer "
                    "than three samples will set features in these "
                    "samples to either zero (with 1 sample in a chunk) "
                    "or -1/+1 (with 2 samples in a chunk). "
                    "You have chunks with the following numbers of samples: %s" % (nsamples_per_chunk,)
                )

        params = self.__params_dict
        if params is None:
            raise RuntimeError(
                "ZScoreMapper needs to be trained before call to forward")

        if self._secret_inplace_zscore:
            mds = ds
        else:
            # shallow copy to put the new stuff in
            mds = ds.copy(deep=False)
            # but deepcopy the samples since _zscore would modify inplace
            mds.samples = mds.samples.copy()

        # cast the data to float, since in-place operations below do not upcast!
        if np.issubdtype(mds.samples.dtype, np.integer):
            mds.samples = mds.samples.astype(dtype)

        if "__all__" in params:
            # we have a global parameter set
            mds.samples = self._zscore(mds.samples, *params["__all__"])
        else:
            # per chunk z-scoring
            for c in mds.sa[chunks_attr].unique:
                if c not in params:
                    raise RuntimeError(
                        "%s has no parameters for chunk '%s'. It probably "
                        "wasn't present in the training dataset!?" % (self.__class__.__name__, c)
                    )
                slicer = np.where(mds.sa[chunks_attr].value == c)[0]
                mds.samples[slicer] = self._zscore(mds.samples[slicer], *params[c])

        return mds
Example #3
    def _forward_dataset(self, ds):
        # local binding
        chunks_attr = self.__chunks_attr
        dtype = self.__dtype

        if (
            __debug__
            and chunks_attr is not None
            and np.min(list(get_nsamples_per_attr(ds, chunks_attr).values())) <= 2
        ):
            warning(
                "Z-scoring chunk-wise with a chunk of fewer than three "
                "samples will set features in these samples to either zero "
                "(with 1 sample in a chunk) "
                "or -1/+1 (with 2 samples in a chunk)."
            )

        params = self.__params_dict
        if params is None:
            raise RuntimeError(
                "ZScoreMapper needs to be trained before call to forward")

        if self._secret_inplace_zscore:
            mds = ds
        else:
            # shallow copy to put the new stuff in
            mds = ds.copy(deep=False)
            # but deepcopy the samples, since _zscore modifies them in place
            # (otherwise the non-inplace path would still mutate the input)
            mds.samples = mds.samples.copy()

        # cast the data to float, since in-place operations below do not upcast!
        if np.issubdtype(mds.samples.dtype, np.integer):
            mds.samples = mds.samples.astype(dtype)

        if "__all__" in params:
            # we have a global parameter set
            mds.samples = self._zscore(mds.samples, *params["__all__"])
        else:
            # per chunk z-scoring
            for c in mds.sa[chunks_attr].unique:
                if c not in params:
                    raise RuntimeError(
                        "%s has no parameters for chunk '%s'. It probably "
                        "wasn't present in the training dataset!?" % (self.__class__.__name__, c)
                    )
                slicer = np.where(mds.sa[chunks_attr].value == c)[0]
                mds.samples[slicer] = self._zscore(mds.samples[slicer], *params[c])

        return mds
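
All three variants delegate the actual math to self._zscore, which applies the standard transform z = (x - mean) / std with parameters estimated at training time. A standalone NumPy sketch of the per-chunk computation (illustrative only; params stands in for the trained __params_dict):

    import numpy as np

    def chunkwise_zscore(samples, chunks, params):
        # params: dict mapping chunk label -> (mean, std) estimated on
        # training data -- a stand-in for the mapper's __params_dict
        out = samples.astype(float, copy=True)
        for c in np.unique(chunks):
            mean, std = params[c]
            idx = np.where(chunks == c)[0]
            out[idx] = (out[idx] - mean) / std
        return out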
Example #4
    def test_cper_class(self, clf):
        if 'C' not in clf.params:
            # skip those without C
            return

        ds = datasets['uni2medium'].copy()
        ds__ = datasets['uni2medium'].copy()
        #
        # balanced set
        # Let's add a bit of noise to drive the classifier nuts; the same
        # is done for the imbalanced set below
        ds__.samples = ds__.samples + \
                       0.5 * np.random.normal(size=ds__.samples.shape)
        #
        # imbalanced set
        # let's overpopulate label 0
        times = 20
        ds_ = ds[list(range(ds.nsamples)) + list(range(ds.nsamples // 2)) * times]
        ds_.samples = ds_.samples + \
                      0.5 * np.random.normal(size=ds_.samples.shape)
        spl = get_nsamples_per_attr(ds_, 'targets')  # samples per label

        cve = CrossValidation(clf, NFoldPartitioner(), enable_ca='stats')
        # on balanced
        e = cve(ds__)
        tpr_1 = cve.ca.stats.stats["TPR"][1]

        # on imbalanced
        e = cve(ds_)
        tpr_2 = cve.ca.stats.stats["TPR"][1]

        # Set '1 C per label'
        # recreate cvte since previous might have operated on copies
        cve = CrossValidation(clf, NFoldPartitioner(), enable_ca='stats')
        oldC = clf.params.C
        # TODO: provide clf.params.C not with a tuple but dictionary
        #       with C per label (now order is deduced in a cruel way)
        ratio = np.sqrt(float(spl[ds_.UT[0]])/spl[ds_.UT[1]])
        clf.params.C = (-1/ratio, -1*ratio)
        try:
            # on imbalanced but with balanced C
            e_ = cve(ds_)
        finally:
            # restore the original C whether or not the run succeeded
            clf.params.C = oldC
        tpr_3 = cve.ca.stats.stats["TPR"][1]

        # Actual tests
        if cfg.getboolean('tests', 'labile', default='yes'):
            self.assertTrue(tpr_1 > 0.25,
                            msg="Without imbalance we should have some "
                            "hits, but got TPR=%.3f" % tpr_1)

            self.assertTrue(tpr_2 < 0.25,
                            msg="With imbalance we should have almost no "
                            "hits for the minor class, but got TPR=%.3f" % tpr_2)

            self.assertTrue(tpr_3 > 0.25,
                            msg="With imbalanced data but ratio-based Cs "
                            "we should have some hits for the minor class, "
                            "but got TPR=%.3f" % tpr_3)
Example #5
    def test_cper_class(self, clf):
        if 'C' not in clf.params:
            # skip those without C
            return

        ds = datasets['uni2medium'].copy()
        ds__ = datasets['uni2medium'].copy()
        #
        # balanced set
        # Let's add a bit of noise to drive the classifier nuts; the same
        # is done for the imbalanced set below
        ds__.samples += 0.5 * np.random.normal(size=ds__.samples.shape)
        #
        # imbalanced set
        # let's overpopulate label 0
        times = 20
        ds_ = ds[(list(range(ds.nsamples)) +
                  list(range(ds.nsamples // 2)) * times)]
        ds_.samples += 0.5 * np.random.normal(size=ds_.samples.shape)
        spl = get_nsamples_per_attr(ds_, 'targets')  # samples per label

        cve = CrossValidation(clf, NFoldPartitioner(), enable_ca='stats')
        # on balanced
        e = cve(ds__)
        tpr_1 = cve.ca.stats.stats["TPR"][1]

        # we should be able to print a summary for the classifier
        clf_summary = clf.summary()
        if externals.exists('libsvm') and isinstance(clf, libsvm.SVM):
            self.assertIn(" #SVs:", clf_summary)
            self.assertIn(" #bounded_SVs:", clf_summary)
            self.assertIn(" used_C:", clf_summary)

        # on imbalanced
        e = cve(ds_)
        tpr_2 = cve.ca.stats.stats["TPR"][1]

        # Set '1 C per label'
        # recreate cvte since previous might have operated on copies
        cve = CrossValidation(clf, NFoldPartitioner(), enable_ca='stats')
        oldC = clf.params.C
        # TODO: provide clf.params.C not with a tuple but dictionary
        #       with C per label (now order is deduced in a cruel way)
        ratio = np.sqrt(float(spl[ds_.UT[0]]) / spl[ds_.UT[1]])
        clf.params.C = (-1 / ratio, -1 * ratio)
        try:
            # on imbalanced but with balanced C
            e_ = cve(ds_)
        finally:
            # restore the original C whether or not the run succeeded
            clf.params.C = oldC
        tpr_3 = cve.ca.stats.stats["TPR"][1]

        # Actual tests
        if cfg.getboolean('tests', 'labile', default='yes'):
            self.assertTrue(tpr_1 > 0.25,
                            msg="Without imbalance we should have some "
                            "hits, but got TPR=%.3f" % tpr_1)

            self.assertTrue(tpr_2 < 0.25,
                            msg="With imbalance we should have almost no "
                            "hits for the minor class, but got TPR=%.3f" % tpr_2)

            self.assertTrue(tpr_3 > 0.25,
                            msg="With imbalanced data but ratio-based Cs "
                            "we should have some hits for the minor class, "
                            "but got TPR=%.3f" % tpr_3)