Example #1
    def selectFeatures(self, ids, plain=False, sort=False):
        """Select features given their ids.

        This method behaves like Dataset.selectFeatures(), but
        additionally takes care of adjusting the embedded mapper
        appropriately.

        :Parameters:
          ids: sequence
            Iterable container of feature ids to select
          plain: boolean
            Flag whether to return a plain Dataset instead of a MappedDataset
          sort: boolean
            Flag whether to sort the ids. Order matters: selectFeatures
            assumes increasing order, and if the ids are not sorted,
            non-optimized code would verify the order and sort them.
        """

        # call base method to get selected feature subset
        if plain:
            sdata = Dataset(self._data, self._dsattr, check_data=False,
                            copy_samples=False, copy_data=False,
                            copy_dsattr=False)
            return sdata.selectFeatures(ids=ids, sort=sort)
        else:
            sdata = Dataset.selectFeatures(self, ids=ids, sort=sort)
            # since we have a new Dataset we'd better have a new mapper
            sdata._dsattr['mapper'] = copy.deepcopy(sdata._dsattr['mapper'])
            if sort:
                sdata._dsattr['mapper'].selectOut(sorted(ids))
            else:
                sdata._dsattr['mapper'].selectOut(ids)
            return sdata
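
A minimal usage sketch for the method above (not part of the original example); `mds` stands for a hypothetical, already constructed MappedDataset with at least eight features:

    # Hypothetical usage sketch -- `mds` is assumed to be a MappedDataset.
    sub = mds.selectFeatures([2, 5, 7])               # MappedDataset, mapper adjusted
    flat = mds.selectFeatures([2, 5, 7], plain=True)  # plain Dataset, mapper dropped
    assert sub.nfeatures == flat.nfeatures == 3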
Example #2
    def testEvilSelects(self):
        """Test some obscure selections of samples via select() or __getitem__
        """
        origdata = datasets["uni2large"].samples[:100, :10].T
        data = Dataset(
            samples=origdata,
            #       0  1  2  3  4  5  6  7  8  9
            labels=[8, 9, 4, 3, 3, 3, 3, 2, 8, 9],
            chunks=[1, 2, 3, 2, 3, 1, 5, 6, 3, 6],
        )

        # malformed getitem
        if __debug__:
            # check is enforced only in __debug__
            self.failUnlessRaises(ValueError, data.__getitem__, "labels", "featu")

        # too many indices
        self.failUnlessRaises(ValueError, data.__getitem__, 1, 1, 1)

        # various getitems which should carry the same result
        for sel in [
            data.select("chunks", [2, 6], labels=[3, 2], features=slice(None)),
            data.select("all", "all", labels=[2, 3], chunks=[2, 6]),
            data["chunks", [2, 6], "labels", [3, 2]],
            data[:, :, "chunks", [2, 6], "labels", [3, 2]],
            # get warnings but should work as the rest for now
            data[3:8, "chunks", [2, 6, 2, 6], "labels", [3, 2]],
        ]:
            self.failUnless(N.all(sel.origids == [3, 7]))
            self.failUnless(sel.nfeatures == 100)
            self.failUnless(N.all(sel.samples == origdata[[3, 7]]))

        target = origdata[[3, 7]]
        target = target[:, [1, 3]]
        # various getitems which should carry the same result
        for sel in [
            data.select("all", [1, 3], "chunks", [2, 6], labels=[3, 2]),
            data[:, [1, 3], "chunks", [2, 6], "labels", [3, 2]],
            data[:, [1, 3], "chunks", [2, 6], "labels", [3, 2]],
            # get warnings but should work as the rest for now
            data[3:8, [1, 1, 3, 1], "chunks", [2, 6, 2, 6], "labels", [3, 2]],
        ]:
            self.failUnless(N.all(sel.origids == [3, 7]))
            self.failUnless(sel.nfeatures == 2)
            self.failUnless(N.all(sel.samples == target))

        # Check if we get empty selection if requesting impossible
        self.failUnless(data.select(chunks=[23]).nsamples == 0)

        # Check .where()
        self.failUnless(N.all(data.where(chunks=[2, 6]) == [1, 3, 7, 9]))
        self.failUnless(N.all(data.where(chunks=[2, 6], labels=[22, 3]) == [3]))
        # both samples and features
        idx = data.where("all", [1, 3, 10], labels=[2, 3, 4])
        self.failUnless(N.all(idx[1] == [1, 3, 10]))
        self.failUnless(N.all(idx[0] == range(2, 8)))
        # empty query
        self.failUnless(data.where() is None)
        # empty result
        self.failUnless(data.where(labels=[123]) == [])
Example #3
    def testCombinedPatternAndFeatureMasking(self):
        data = Dataset(samples=N.arange(20).reshape((4, 5)), labels=1, chunks=1)

        self.failUnless(data.nsamples == 4)
        self.failUnless(data.nfeatures == 5)
        fsel = data.selectFeatures([1, 2])
        fpsel = fsel.selectSamples([0, 3])
        self.failUnless(fpsel.nsamples == 2)
        self.failUnless(fpsel.nfeatures == 2)

        self.failUnless((fpsel.samples == [[1, 2], [16, 17]]).all())
Example #4
    def testFeatureMaskConversion(self):
        dataset = Dataset(samples=N.arange(12).reshape((4, 3)), labels=1, chunks=1)

        mask = dataset.convertFeatureIds2FeatureMask(range(dataset.nfeatures))
        self.failUnless(len(mask) == dataset.nfeatures)
        self.failUnless((mask == True).all())

        self.failUnless((dataset.convertFeatureMask2FeatureIds(mask) == range(3)).all())

        mask[1] = False

        self.failUnless((dataset.convertFeatureMask2FeatureIds(mask) == [0, 2]).all())
Example #5
    def testApplyMapper(self):
        """Test creation of new dataset by applying a mapper"""
        mapper = MaskMapper(N.array([1, 0, 1]))
        dataset = Dataset(samples=N.arange(12).reshape((4, 3)), labels=1, chunks=1)
        seldataset = dataset.applyMapper(featuresmapper=mapper)
        self.failUnless((dataset.selectFeatures([0, 2]).samples == seldataset.samples).all())

        # Let's do a simple test on MaskMapper reverse since it seems to
        # do evil things. Those checks are done only in __debug__
        if __debug__:
            # should fail since in mask we have just 2 features now
            self.failUnlessRaises(ValueError, mapper.reverse, [10, 20, 30])
            self.failUnlessRaises(ValueError, mapper.forward, [10, 20])
Example #6
    def testLabelsMapping(self):
        """Test mapping of the labels from strings to numericals
        """
        od = {"apple": 0, "orange": 1}
        samples = [[3], [2], [3]]
        labels_l = ["apple", "orange", "apple"]

        # test broadcasting of the label
        ds = Dataset(samples=samples, labels="orange")
        self.failUnless(N.all(ds.labels == ["orange"] * 3))

        # Test basic mapping of literal labels
        for ds in [
            Dataset(samples=samples, labels=labels_l, labels_map=od),
            # Figure out mapping
            Dataset(samples=samples, labels=labels_l, labels_map=True),
        ]:
            self.failUnless(N.all(ds.labels == [0, 1, 0]))
            self.failUnless(ds.labels_map == od)
            ds_ = ds[1]
            self.failUnless(ds_.labels_map == od, msg="selectSamples should preserve the full mapping")

        # We should complain about an insufficient mapping
        self.failUnlessRaises(ValueError, Dataset, samples=samples, labels=labels_l, labels_map={"apple": 0})

        # Conformance to older behavior -- if labels are given as
        # strings, no mapping occurs by default
        ds2 = Dataset(samples=samples, labels=labels_l)
        self.failUnlessEqual(ds2.labels_map, None)

        # Numerical labels should also be mapped if requested:
        od3 = {1: 100, 2: 101, 3: 100}
        ds3 = Dataset(samples=samples, labels=[1, 2, 3], labels_map=od3)
        self.failUnlessEqual(ds3.labels_map, od3)
        self.failUnless(N.all(ds3.labels == [100, 101, 100]))

        ds3_ = ds3[1]
        self.failUnlessEqual(ds3_.labels_map, od3)

        ds4 = Dataset(samples=samples, labels=labels_l)

        # Let's check setting the labels map
        ds = Dataset(samples=samples, labels=labels_l, labels_map=od)

        self.failUnlessRaises(ValueError, ds.setLabelsMap, {"orange": 1, "nonorange": 3})
        new_map = {"tasty": 0, "crappy": 1}
        ds.labels_map = new_map.copy()
        self.failUnlessEqual(ds.labels_map, new_map)
Example #7
    def _call(self, dataset):
        """Extract weights from GPR
        """

        clf = self.clf
        kernel = clf.kernel
        train_fv = clf._train_fv
        if isinstance(kernel, LinearKernel):
            Sigma_p = 1.0
        else:
            Sigma_p = kernel.params.Sigma_p

        weights = Ndot(Sigma_p, Ndot(train_fv.T, clf._alpha))

        if self.ca.is_enabled('variances'):
            # super ugly formulas that can be quite surely improved:
            tmp = np.linalg.inv(clf._L)
            Kyinv = Ndot(tmp.T, tmp)
            # XXX in such lengthy matrix manipulations you might be better
            #     off using np.matrix where * is a matrix product
            self.ca.variances = Ndiag(
                Sigma_p -
                Ndot(Sigma_p,
                     Ndot(train_fv.T, Ndot(Kyinv, Ndot(train_fv, Sigma_p)))))
        return Dataset(np.atleast_2d(weights))
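
Written out, the expressions above appear to be the standard Bayesian posterior over the weights of a linear model under a GP treatment with weight prior w ~ N(0, Sigma_p); this is an interpretation of the code, not something stated in the source:

    w_{\mathrm{MAP}} = \Sigma_p X^{\top} K_y^{-1} y, \qquad
    \operatorname{Var}[w] = \operatorname{diag}\!\left(\Sigma_p - \Sigma_p X^{\top} K_y^{-1} X \Sigma_p\right)

where X is the training feature matrix (train_fv), clf._alpha corresponds to K_y^{-1} y, and K_y^{-1} is rebuilt from the Cholesky factor clf._L.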
Example #8
    def _call(self, dataset):
        sensitivities = []
        for ind, analyzer in enumerate(self.__analyzers):
            if __debug__:
                debug("SA", "Computing sensitivity for SA#%d:%s" %
                      (ind, analyzer))
            sensitivity = analyzer(dataset)
            sensitivities.append(sensitivity)

        if __debug__:
            debug("SA",
                  "Returning %d sensitivities from %s" %
                  (len(sensitivities), self.__class__.__name__))

        sa_attr = self._sa_attr
        if isinstance(sensitivities[0], AttrDataset):
            smerged = None
            for i, s in enumerate(sensitivities):
                s.sa[sa_attr] = np.repeat(i, len(s))
                if smerged is None:
                    smerged = s
                else:
                    smerged.append(s)
            sensitivities = smerged
        else:
            sensitivities = \
                Dataset(sensitivities,
                        sa={sa_attr: np.arange(len(sensitivities))})

        self.ca.sensitivities = sensitivities

        return sensitivities
Example #9
    def _forward_dataset_helper(self, ds):
        # local binding
        num = self.__num

        pos = None
        if not self.__position_attr is None:
            # we know something about sample position
            pos = ds.sa[self.__position_attr].value
            rsamples, pos = resample(ds.samples,
                                     self.__num,
                                     t=pos,
                                     window=self.__window_args)
        else:
            # we know nothing about samples position
            rsamples = resample(ds.samples,
                                self.__num,
                                t=None,
                                window=self.__window_args)
        # new dataset that reuses the feature and dataset attributes of the
        # source
        mds = Dataset(rsamples, fa=ds.fa, a=ds.a)

        # the tricky part is what to do with the samples attributes, since
        # their number has changed
        if self.__attr_strategy == 'remove':
            # nothing to be done
            pass
        elif self.__attr_strategy == 'sample':
            step = int(len(ds) / num)
            sa = dict([(k, ds.sa[k].value[0::step][:num]) for k in ds.sa])
            mds.sa.update(sa)
        elif self.__attr_strategy == 'resample':
            # resample the attributes themselves
            sa = {}
            for k in ds.sa:
                v = ds.sa[k].value
                if pos is None:
                    sa[k] = resample(v,
                                     self.__num,
                                     t=None,
                                     window=self.__window_args)
                else:
                    if k == self.__position_attr:
                        # position attr will be handled separately at the end
                        continue
                    sa[k] = resample(v,
                                     self.__num,
                                     t=pos,
                                     window=self.__window_args)[0]
            # inject them all
            mds.sa.update(sa)
        else:
            raise ValueError("Unkown attribute handling strategy '%s'." %
                             self.__attr_strategy)

        if not pos is None:
            # we got the new sample positions and can store them
            mds.sa[self.__position_attr] = pos
        return mds
Example #10
 def test_linear_kernel(self):
     """Simplistic testing of linear kernel"""
     d1 = Dataset(np.asarray([range(5)] * 10, dtype=float))
     lk = npK.LinearKernel()
     lk.compute(d1)
     self.failUnless(lk._k.shape == (10, 10),
                     "Failure computing LinearKernel (Size mismatch)")
     self.failUnless((lk._k == 30).all(), "Failure computing LinearKernel")
Example #11
def test_resample():
    time = np.linspace(0, 2 * np.pi, 100)
    ds = Dataset(np.vstack((np.sin(time), np.cos(time))).T,
                 sa={
                     'time': time,
                     'section': np.repeat(range(10), 10)
                 })
    assert_equal(ds.shape, (100, 2))

    # downsample
    num = 10
    rm = FFTResampleMapper(num,
                           window=('gauss', 50),
                           position_attr='time',
                           attr_strategy='sample')
    mds = rm(ds)
    assert_equal(mds.shape, (num, ds.nfeatures))
    # didn't change the orig
    assert_equal(len(ds), 100)

    # check position-based resampling
    ds_partial = ds[0::10]
    mds_partial = rm(ds_partial)
    # despite different input sampling should yield the same output timepoints
    assert_array_almost_equal(mds.sa.time, mds_partial.sa.time)
    # exclude the first points to prevent edge effects, but the data should be
    # very similar too
    assert_array_almost_equal(mds.samples[2:],
                              mds_partial.samples[2:],
                              decimal=2)
    # simple sample of sa's should give meaningful stuff
    assert_array_equal(mds.sa.section, range(10))

    # and now for a dataset with chunks
    cds = vstack([ds.copy(), ds.copy()])
    cds.sa['chunks'] = np.repeat([0, 1], len(ds))
    rm = FFTResampleMapper(num,
                           attr_strategy='sample',
                           chunks_attr='chunks',
                           window=('gauss', 50))
    mcds = rm(cds)
    assert_equal(mcds.shape, (20, 2))
    assert_array_equal(mcds.sa.section, np.tile(range(10), 2))
    # each individual chunks should be identical to previous dataset
    assert_array_almost_equal(mds.samples, mcds.samples[:10])
    assert_array_almost_equal(mds.samples, mcds.samples[10:])
Example #12
    def testId(self):
        """Test Dataset.idhash() if it gets changed if any of the
        labels/chunks changes
        """

        dataset = Dataset(samples=N.arange(12).reshape((4, 3)), labels=1, chunks=1)
        origid = dataset.idhash
        dataset.labels = [3, 1, 2, 3]  # change all labels
        self.failUnless(origid != dataset.idhash, msg="Changing all labels should alter dataset's idhash")

        origid = dataset.idhash

        z = dataset.labels[1]
        self.failUnlessEqual(origid, dataset.idhash, msg="Accessing shouldn't change idhash")
        z = dataset.chunks
        self.failUnlessEqual(origid, dataset.idhash, msg="Accessing shouldn't change idhash")
        z[2] = 333
        self.failUnless(origid != dataset.idhash, msg="Changing value in attribute should change idhash")

        origid = dataset.idhash
        dataset.samples[1, 1] = 1000
        self.failUnless(origid != dataset.idhash, msg="Changing value in data should change idhash")

        origid = dataset.idhash
        dataset.permuteLabels(True)
        self.failUnless(origid != dataset.idhash, msg="Permutation also changes idhash")

        dataset.permuteLabels(False)
        self.failUnless(origid == dataset.idhash, msg="idhash should be restored after " "permuteLabels(False)")
Example #13
    def test_cached_kernel(self):
        nchunks = 5
        n = 50 * nchunks
        d = Dataset(np.random.randn(n, 132))
        d.sa.chunks = np.random.randint(nchunks, size=n)

        # We'll compare against an Rbf just because it has a parameter to change
        rk = npK.RbfKernel(sigma=1.5)

        # Assure two kernels are independent for this test
        ck = CachedKernel(kernel=npK.RbfKernel(sigma=1.5))
        ck.compute(d)  # Initial cache of all data

        self.failUnless(ck._recomputed,
                        'CachedKernel was not initially computed')

        # Try some splitting
        for chunk in [d[d.sa.chunks == i] for i in range(nchunks)]:
            rk.compute(chunk)
            ck.compute(chunk)
            self.kernel_equiv(rk, ck)  #, accuracy=1e-12)
            self.failIf(ck._recomputed,
                        "CachedKernel incorrectly recomputed it's kernel")

        # Test what happens when a parameter changes
        ck.params.sigma = 3.5
        ck.compute(d)
        self.failUnless(ck._recomputed,
                        "CachedKernel doesn't recompute on kernel change")
        rk.params.sigma = 3.5
        rk.compute(d)
        self.failUnless(np.all(rk._k == ck._k),
                        'Cached and rbf kernels disagree after kernel change')

        # Now test handling new data
        d2 = Dataset(np.random.randn(32, 43))
        ck.compute(d2)
        self.failUnless(
            ck._recomputed,
            "CachedKernel did not automatically recompute new data")
        ck.compute(d)
        self.failUnless(ck._recomputed,
                        "CachedKernel did not recompute old data which had\n" +\
                        "previously been computed, but had the cache overriden")
Example #14
    def testFeatureSelection(self):
        """Testing feature selection: sorted/not sorted, feature groups
        """
        origdata = datasets["uni2large"].samples[:10, :20]
        data = Dataset(samples=origdata, labels=2, chunks=2)

        # define some feature groups
        data.defineFeatureGroups(N.repeat(range(4), 5))

        unmasked = data.samples.copy()

        # default must be no mask
        self.failUnless(data.nfeatures == 20)

        features_to_select = [3, 0, 17]
        features_to_select_copy = copy.deepcopy(features_to_select)
        features_to_select_sorted = copy.deepcopy(features_to_select)
        features_to_select_sorted.sort()

        bsel = N.array([False] * 20)
        bsel[features_to_select] = True
        # check selection with feature list
        for sel, issorted in [
            (data.selectFeatures(features_to_select, sort=False), False),
            (data.selectFeatures(features_to_select, sort=True), True),
            (data.select(slice(None), features_to_select), True),
            (data.select(slice(None), N.array(features_to_select)), True),
            (data.select(slice(None), bsel), True),
        ]:
            self.failUnless(sel.nfeatures == 3)

            # check size of the masked patterns
            self.failUnless(sel.samples.shape == (10, 3))

            # check that the right features are selected
            fts = (features_to_select, features_to_select_sorted)[int(issorted)]
            self.failUnless((unmasked[:, fts] == sel.samples).all())

            # check grouping information
            self.failUnless((sel._dsattr["featuregroups"] == [0, 0, 3]).all())

            # check side effect on features_to_select parameter:
            self.failUnless(features_to_select == features_to_select_copy)

        # check selection by feature group id
        gsel = data.selectFeatures(groups=[2, 3])
        self.failUnless(gsel.nfeatures == 10)
        self.failUnless(set(gsel._dsattr["featuregroups"]) == set([2, 3]))
Example #15
    def testAttributes(self):
        """Test adding custom attributes to a dataset
        """
        # class BlobbyDataset(Dataset):
        #    pass
        # TODO: we can't assign attributes to those for now...
        ds = Dataset(samples=range(5), labels=1, chunks=1)
        self.failUnlessRaises(AttributeError, lambda x: x.blobs, ds)
        """Dataset.blobs should fail since .blobs wasn't yet registered"""

        # register new attribute but it would alter only new instances
        Dataset._registerAttribute("blobs", "_data", hasunique=True)
        ds = Dataset(samples=range(5), labels=1, chunks=1)
        self.failUnless(not ds.blobs != [0], msg="By default new attributes are supposed to get 0 as the value")

        try:
            ds.blobs = [1, 2]
            self.fail(msg="Dataset.blobs=[1,2] should fail since " "there are 5 samples")
        except ValueError, e:
            pass
Example #16
    def _call(self, dataset):
        """Computes featurewise I-RELIEF weights."""
        samples = dataset.samples
        NS, NF = samples.shape[:2]
        if self.w_guess == None:
            self.w = np.ones(NF, 'd')
        # do normalization in all cases to be safe :)
        self.w = self.w / (self.w**2).sum()

        M, H = self.compute_M_H(dataset.targets)

        while True:
            self.k = self.kernel(length_scale=self.kernel_width / self.w)
            d_w_k = self.k.computed(samples).as_raw_np()
            # set d_w_k to zero where distance=0 (i.e. kernel ==
            # 1.0), otherwise I-RELIEF could not converge.
            # XXX Note that kernel==1 for distance=0 only for
            # exponential kernels!!  IMPROVE
            d_w_k[np.abs(d_w_k - 1.0) < 1.0e-15] = 0.0
            ni = np.zeros(NF, 'd')
            for n in range(NS):
                # d_w_k[n,n] could be omitted since == 0.0
                gamma_n = 1.0 - np.nan_to_num(d_w_k[n, M[n]].sum() \
                                / (d_w_k[n, :].sum()-d_w_k[n, n]))
                alpha_n = np.nan_to_num(d_w_k[n, M[n]] /
                                        (d_w_k[n, M[n]].sum()))
                beta_n = np.nan_to_num(d_w_k[n, H[n]] / (d_w_k[n, H[n]].sum()))

                m_n = (np.abs(samples[n, :] - samples[M[n], :]) \
                        * alpha_n[:, None]).sum(0)
                h_n = (np.abs(samples[n, :] - samples[H[n], :]) \
                        * beta_n[:, None]).sum(0)
                ni += gamma_n * (m_n - h_n)
            ni = ni / NS

            ni_plus = np.clip(ni, 0.0,
                              np.inf)  # set all negative elements to zero
            w_new = np.nan_to_num(ni_plus / (np.sqrt((ni_plus**2).sum())))
            change = np.abs(w_new - self.w).sum()
            if __debug__ and 'IRELIEF' in debug.active:
                debug(
                    'IRELIEF',
                    "change=%.4f max=%f min=%.4f mean=%.4f std=%.4f #nan=%d" %
                    (change, w_new.max(), w_new.min(), w_new.mean(),
                     w_new.std(), np.isnan(w_new).sum()))

            # update weights:
            self.w = w_new
            if change < self.threshold:
                break

        return Dataset(self.w[np.newaxis])
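
For reference, the fixed-point iteration implemented above can be summarized as follows (my reading of the code; M(n) and H(n) are the index sets returned by compute_M_H, presumably the misses and hits of sample n):

    \gamma_n = 1 - \frac{\sum_{i \in M(n)} k_w(x_n, x_i)}{\sum_{j \neq n} k_w(x_n, x_j)}, \quad
    \alpha_{ni} = \frac{k_w(x_n, x_i)}{\sum_{j \in M(n)} k_w(x_n, x_j)}, \quad
    \beta_{ni} = \frac{k_w(x_n, x_i)}{\sum_{j \in H(n)} k_w(x_n, x_j)}

    \nu = \frac{1}{N} \sum_n \gamma_n \Big( \sum_{i \in M(n)} \alpha_{ni}\,|x_n - x_i| - \sum_{i \in H(n)} \beta_{ni}\,|x_n - x_i| \Big), \qquad
    w \leftarrow \frac{(\nu)_+}{\lVert (\nu)_+ \rVert_2}

with |x_n - x_i| taken featurewise and the loop stopping once \sum_f |w_f^{new} - w_f| drops below self.threshold.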
Example #17
def test_resample():
    time = np.linspace(0, 2*np.pi, 100)
    ds = Dataset(np.vstack((np.sin(time), np.cos(time))).T,
                 sa = {'time': time,
                       'section': np.repeat(range(10), 10)})
    assert_equal(ds.shape, (100, 2))

    # downsample
    num = 10
    rm = FFTResampleMapper(num, window=('gauss', 50),
                           position_attr='time',
                           attr_strategy='sample')
    mds = rm(ds)
    assert_equal(mds.shape, (num, ds.nfeatures))
    # didn't change the orig
    assert_equal(len(ds), 100)

    # check position-based resampling
    ds_partial = ds[0::10]
    mds_partial = rm(ds_partial)
    # despite different input sampling should yield the same output timepoints
    assert_array_almost_equal(mds.sa.time, mds_partial.sa.time)
    # exclude the first points to prevent edge effects, but the data should be
    # very similar too
    assert_array_almost_equal(mds.samples[2:], mds_partial.samples[2:], decimal=2)
    # simple sample of sa's should give meaningful stuff
    assert_array_equal(mds.sa.section, range(10))

    # and now for a dataset with chunks
    cds = vstack([ds.copy(), ds.copy()])
    cds.sa['chunks'] = np.repeat([0,1], len(ds))
    rm = FFTResampleMapper(num, attr_strategy='sample', chunks_attr='chunks',
                           window=('gauss', 50))
    mcds = rm(cds)
    assert_equal(mcds.shape, (20, 2))
    assert_array_equal(mcds.sa.section, np.tile(range(10),2))
    # each individual chunks should be identical to previous dataset
    assert_array_almost_equal(mds.samples, mcds.samples[:10])
    assert_array_almost_equal(mds.samples, mcds.samples[10:])
Example #18
def test_datasetmapping():
    # 6 samples, 4 features
    data = np.arange(24).reshape(6, 4)
    ds = Dataset(data,
                 sa={
                     'timepoints': np.arange(6),
                     'multidim': data.copy()
                 },
                 fa={'fid': np.arange(4)})
    # with overlapping and non-overlapping boxcars
    startpoints = [0, 1, 4]
    boxlength = 2
    bm = BoxcarMapper(startpoints, boxlength, inspace='boxy')
    # train is critical
    bm.train(ds)
    mds = bm.forward(ds)
    assert_equal(len(mds), len(startpoints))
    assert_equal(mds.nfeatures, boxlength)
    # all sample attributes remain, but they can be rotated/compressed into
    # multidimensional attributes
    assert_equal(sorted(mds.sa.keys()),
                 ['boxy_onsetidx'] + sorted(ds.sa.keys()))
    assert_equal(mds.sa.multidim.shape,
                 (len(startpoints), boxlength, ds.nfeatures))
    assert_equal(mds.sa.timepoints.shape, (len(startpoints), boxlength))
    assert_array_equal(mds.sa.timepoints.flatten(),
                       np.array([(s, s + 1) for s in startpoints]).flatten())
    assert_array_equal(mds.sa.boxy_onsetidx, startpoints)
    # feature attributes also get rotated and broadcasted
    assert_array_equal(mds.fa.fid, [ds.fa.fid, ds.fa.fid])
    # and finally there is a new one
    assert_array_equal(mds.fa.boxy_offsetidx,
                       np.repeat(np.arange(boxlength), 4).reshape(2, -1))

    # now see how it works on reverse()
    rds = bm.reverse(mds)
    # we got at least something of all original attributes back
    assert_equal(sorted(rds.sa.keys()), sorted(ds.sa.keys()))
    assert_equal(sorted(rds.fa.keys()), sorted(ds.fa.keys()))
    # it is not possible to reconstruct the full samples array
    # some samples might even show up multiple times (when there are
    # overlapping boxcars)
    assert_array_equal(
        rds.samples,
        np.array([[0, 1, 2, 3], [4, 5, 6, 7], [4, 5, 6, 7], [8, 9, 10, 11],
                  [16, 17, 18, 19], [20, 21, 22, 23]]))
    assert_array_equal(rds.sa.timepoints, [0, 1, 1, 2, 4, 5])
    assert_array_equal(rds.sa.multidim, ds.sa.multidim[rds.sa.timepoints])
    # but feature attributes should be fully recovered
    assert_array_equal(rds.fa.fid, ds.fa.fid)
Example #19
    def testSimple(self):
        data = N.arange(24).reshape(8,3)
        labels = [0, 1] * 4
        chunks = N.repeat(N.array((0,1)),4)

        # correct results
        csamples = [[3, 4, 5], [6, 7, 8], [15, 16, 17], [18, 19, 20]]
        clabels = [0, 1, 0, 1]
        cchunks = [0, 0, 1, 1]

        ds = Dataset(samples=data, labels=labels, chunks=chunks)

        # default behavior
        m = SampleGroupMapper()

        # error if not trained
        self.failUnlessRaises(RuntimeError, m, data)

        # train mapper first
        m.train(ds)

        self.failUnless((m.forward(ds.samples) == csamples).all())
        self.failUnless((m.forward(ds.labels) == clabels).all())
        self.failUnless((m.forward(ds.chunks) == cchunks).all())


        # directly apply to dataset
        # using untrained mapper!
        mapped = ds.applyMapper(samplesmapper=SampleGroupMapper())

        self.failUnless(mapped.nsamples == 4)
        self.failUnless(mapped.nfeatures == 3)
        self.failUnless((mapped.samples == csamples).all())
        self.failUnless((mapped.labels == clabels).all())
        self.failUnless((mapped.chunks == cchunks).all())
        # make sure origids get regenerated
        self.failUnless((mapped.origids == range(4)).all())
Example #20
 def test_1d_multispace_searchlight(self):
     ds = Dataset([np.arange(6)])
     ds.fa['coord1'] = np.repeat(np.arange(3), 2)
     # add a second space to the dataset
     ds.fa['coord2'] = np.tile(np.arange(2), 3)
     measure = lambda x: "+".join([str(x) for x in x.samples[0]])
     # simply select each feature once
     res = Searchlight(measure,
                       IndexQueryEngine(coord1=Sphere(0),
                                        coord2=Sphere(0)),
                       nproc=1)(ds)
     assert_array_equal(res.samples, [['0', '1', '2', '3', '4', '5']])
     res = Searchlight(measure,
                       IndexQueryEngine(coord1=Sphere(0),
                                        coord2=Sphere(1)),
                       nproc=1)(ds)
     assert_array_equal(res.samples,
                        [['0+1', '0+1', '2+3', '2+3', '4+5', '4+5']])
     res = Searchlight(measure,
                       IndexQueryEngine(coord1=Sphere(1),
                                        coord2=Sphere(0)),
                       nproc=1)(ds)
     assert_array_equal(res.samples,
                        [['0+2', '1+3', '0+2+4', '1+3+5', '2+4', '3+5']])
Example #22
def normalFeatureDataset(perlabel=50, nlabels=2, nfeatures=4, nchunks=5,
                         means=None, nonbogus_features=None, snr=1.0):
    """Generate a dataset where each label is some normally
    distributed beastie around specified mean (0 if None).

    snr is assuming that signal has std 1.0 so we just divide noise by snr

    Probably it is a generalization of pureMultivariateSignal where
    means=[ [0,1], [1,0] ]

    Specify either means or nonbogus_features so means get assigned
    accordingly
    """

    data = N.random.standard_normal((perlabel*nlabels, nfeatures))/N.sqrt(snr)
    if (means is None) and (not nonbogus_features is None):
        if len(nonbogus_features) > nlabels:
            raise ValueError, "Can't assign simply a feature to a " + \
                  "class: more nonbogus_features than labels"
        means = N.zeros((len(nonbogus_features), nfeatures))
        # pure multivariate -- single bit per feature
        for i in xrange(len(nonbogus_features)):
            means[i, nonbogus_features[i]] = 1.0
    if not means is None:
        # add mean
        data += N.repeat(N.array(means, ndmin=2), perlabel, axis=0)
    # bring it 'under 1', since otherwise some classifiers have difficulties
    # during optimization
    data = 1.0/(N.max(N.abs(data))) * data
    labels = N.concatenate([N.repeat('L%d' % i, perlabel)
                                for i in range(nlabels)])
    chunks = N.concatenate([N.repeat(range(nchunks),
                                     perlabel/nchunks) for i in range(nlabels)])
    ds = Dataset(samples=data, labels=labels, chunks=chunks, labels_map=True)
    ds.nonbogus_features = nonbogus_features
    return ds
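
A hedged usage sketch for the generator above; the argument values are arbitrary and only illustrate the signature:

    # Hypothetical call: two classes, one informative feature each, SNR of 3.
    ds = normalFeatureDataset(perlabel=20, nlabels=2, nfeatures=6, nchunks=4,
                              nonbogus_features=[0, 1], snr=3.0)
    assert ds.nsamples == 40 and ds.nfeatures == 6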
Example #23
    def build_streamline_things(self):
        # Build a dataset having samples of different lengths. This is
        # trying to mimic a possible interface for streamlines
        # datasets, i.e., an iterable container of Mx3 points, where M
        # depends on each single streamline.

        # trying to pack it into an 'object' array to prevent conversion in the
        # Dataset
        self.streamline_samples = np.array([
                                   np.random.rand(3,3),
                                   np.random.rand(5,3),
                                   np.random.rand(7,3)],
                                   dtype='object')
        self.dataset = Dataset(self.streamline_samples)
        self.similarities = [StreamlineSimilarity(distance=corouge)]
Example #24
    def _call(self, dataset):
        """Computes featurewise I-RELIEF weights."""
        samples = dataset.samples
        NS, NF = samples.shape[:2]

        if self.w_guess == None:
            w = np.ones(NF, 'd')

        w /= (w**2).sum()  # do normalization in all cases to be safe :)

        M, H = self.compute_M_H(dataset.targets)

        while True:
            d_w_k = self.k(pnorm_w(data1=samples, weight=w, p=1))
            ni = np.zeros(NF, 'd')
            for n in range(NS):
                # d_w_k[n, n] could be omitted since == 0.0
                gamma_n = 1.0 - np.nan_to_num(d_w_k[n, M[n]].sum() \
                                / (d_w_k[n, :].sum() - d_w_k[n, n]))
                alpha_n = np.nan_to_num(d_w_k[n, M[n]] /
                                        (d_w_k[n, M[n]].sum()))
                beta_n = np.nan_to_num(d_w_k[n, H[n]] / (d_w_k[n, H[n]].sum()))

                m_n = (np.abs(samples[n, :] - samples[M[n], :]) \
                       * alpha_n[:, None]).sum(0)
                h_n = (np.abs(samples[n, :] - samples[H[n], :]) \
                       * beta_n[:, None]).sum(0)
                ni += gamma_n * (m_n - h_n)

            ni = ni / NS

            ni_plus = np.clip(ni, 0.0,
                              np.inf)  # set all negative elements to zero
            w_new = np.nan_to_num(ni_plus / (np.sqrt((ni_plus**2).sum())))
            change = np.abs(w_new - w).sum()
            if __debug__ and 'IRELIEF' in debug.active:
                debug('IRELIEF',
                      "change=%.4f max=%f min=%.4f mean=%.4f std=%.4f #nan=%d" \
                      % (change, w_new.max(), w_new.min(), w_new.mean(),
                         w_new.std(), np.isnan(w_new).sum()))

            # update weights:
            w = w_new
            if change < self.threshold:
                break

        self.w = w
        return Dataset(self.w[np.newaxis])
Example #25
    def testLabelRandomizationAndSampling(self):
        """
        """
        data = Dataset(samples=N.ones((5, 1)), labels=range(5), chunks=1)
        data += Dataset(samples=N.ones((5, 1)) + 1, labels=range(5), chunks=2)
        data += Dataset(samples=N.ones((5, 1)) + 2, labels=range(5), chunks=3)
        data += Dataset(samples=N.ones((5, 1)) + 3, labels=range(5), chunks=4)
        data += Dataset(samples=N.ones((5, 1)) + 4, labels=range(5), chunks=5)
        self.failUnless(data.samplesperlabel == {0: 5, 1: 5, 2: 5, 3: 5, 4: 5})

        sample = data.getRandomSamples(2)
        self.failUnless(sample.samplesperlabel.values() == [2, 2, 2, 2, 2])

        self.failUnless((data.uniquechunks == range(1, 6)).all())

        # store the old labels
        origlabels = data.labels.copy()

        data.permuteLabels(True)

        self.failIf((data.labels == origlabels).all())

        data.permuteLabels(False)

        self.failUnless((data.labels == origlabels).all())

        # now try another object with the same data
        data2 = Dataset(samples=data.samples, labels=data.labels, chunks=data.chunks)

        # labels are the same as the originals
        self.failUnless((data2.labels == origlabels).all())

        # now permute in the new object
        data2.permuteLabels(True)

        # must not affect the old one
        self.failUnless((data.labels == origlabels).all())
        # but only the new one
        self.failIf((data2.labels == origlabels).all())
Example #26
def eep_dataset(samples, targets=None, chunks=None):
    """Create a dataset using an EEP binary file as source.

    EEP files are used by *eeprobe*, a software package for analysing event-related
    potentials (ERP), which was developed at the Max-Planck Institute for
    Cognitive Neuroscience in Leipzig, Germany.

      http://www.ant-neuro.com/products/eeprobe

    Parameters
    ----------
    samples : str or EEPBin instance
      This is either a filename of an EEP file, or an EEPBin instance, providing
      the samples data in EEP format.
    targets, chunks : sequence or scalar or None
      Values are passed through to `Dataset.from_wizard()`. See its documentation
      for more information.

    Returns
    -------
    Dataset
      Besides its usual attributes (e.g. targets, chunks, and a mapper), the
      returned dataset also includes feature attributes associating each sample
      with a channel (by id) and a specific timepoint -- based on information
      read from the EEP data.
    """
    if isinstance(samples, str):
        # open the eep file
        eb = EEPBin(samples)
    elif isinstance(samples, EEPBin):
        # nothing special
        eb = samples
    else:
        raise ValueError("eep_dataset takes the filename of an "
                         "EEP file or a EEPBin object as 'samples' argument.")

    # init dataset
    ds = Dataset.from_channeltimeseries(eb.data,
                                        targets=targets,
                                        chunks=chunks,
                                        t0=eb.t0,
                                        dt=eb.dt,
                                        channelids=eb.channels)
    return ds
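
A minimal usage sketch for the loader above; the filename is made up, and scalar targets/chunks are used so the call does not depend on the (unknown) number of samples in the file:

    # Hypothetical usage sketch -- 'sub01.eep' does not refer to a real file.
    ds = eep_dataset('sub01.eep', targets=1, chunks=1)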
Example #27
def bench_pymvpa(X, Y):
    """
    bench with pymvpa (by default uses a custom swig-generated wrapper
    around libsvm)
    """
    from mvpa.datasets import Dataset
    from mvpa.clfs import svm

    gc.collect()

    # start time
    tstart = datetime.now()
    data = Dataset(samples=X, labels=Y)
    clf = svm.RbfCSVMC(C=1.)
    clf.train(data)
    Z = clf.predict(X)
    delta = (datetime.now() - tstart)

    # stop time
    mvpa_results.append(delta.seconds + delta.microseconds / mu_second)
Example #28
def eep_dataset(samples, targets=None, chunks=None):
    """Create a dataset using an EEP binary file as source.

    EEP files are used by *eeprobe*, a software package for analysing event-related
    potentials (ERP), which was developed at the Max-Planck Institute for
    Cognitive Neuroscience in Leipzig, Germany.

      http://www.ant-neuro.com/products/eeprobe

    Parameters
    ----------
    samples : str or EEPBin instance
      This is either a filename of an EEP file, or an EEPBin instance, providing
      the samples data in EEP format.
    targets, chunks : sequence or scalar or None
      Values are passed through to `Dataset.from_wizard()`. See its documentation
      for more information.

    Returns
    -------
    Dataset
      Besides its usual attributes (e.g. targets, chunks, and a mapper), the
      returned dataset also includes feature attributes associating each sample
      with a channel (by id) and a specific timepoint -- based on information
      read from the EEP data.
    """
    if isinstance(samples, str):
        # open the eep file
        eb = EEPBin(samples)
    elif isinstance(samples, EEPBin):
        # nothing special
        eb = samples
    else:
        raise ValueError("eep_dataset takes the filename of an "
              "EEP file or a EEPBin object as 'samples' argument.")

    # init dataset
    ds = Dataset.from_channeltimeseries(
            eb.data, targets=targets, chunks=chunks, t0=eb.t0, dt=eb.dt,
            channelids=eb.channels)
    return ds
Example #29
    def __init__(self, samples=None, mapper=None, dsattr=None, **kwargs):
        """
        If `samples` and `mapper` arguments are not `None` the mapper is
        used to forward-map the samples array and the result is passed
        to the `Dataset` constructor.

        :Parameters:
          mapper: Instance of `Mapper`
            This mapper will be embedded in the dataset and is used and
            updated, by all subsequent mapping or feature selection
            procedures.
          **kwargs:
            All other arguments are simply passed to and handled by
            the constructor of `Dataset`.
        """
        # there are basically two modes for the constructor:
        # 1. internal mode - only data and dsattr dict
        # 2. user mode - samples != None # and mapper != None

        # see if dsattr is None; if so, set it to an empty dict
        if dsattr is None:
            dsattr = {}

        # if a mapper was passed, store it in dsattr dict that gets passed
        # to base Dataset
        if not mapper is None:
            # TODO: check mapper for compliance with dimensionality within _data
            #       may be only within __debug__
            dsattr['mapper'] = mapper

        # if the samples are passed to the special arg, use the mapper to
        # transform them.
        if not samples is None:
            if not dsattr.has_key('mapper') or dsattr['mapper'] is None:
                raise DatasetError, \
                      "Constructor of MappedDataset requires a mapper " \
                      "if unmapped samples are provided."
            Dataset.__init__(self,
                             samples=mapper.forward(samples),
                             dsattr=dsattr,
                             **(kwargs))
        else:
            Dataset._checkCopyConstructorArgs(samples=samples,
                                              dsattr=dsattr,
                                              **kwargs)
            Dataset.__init__(self, dsattr=dsattr, **(kwargs))
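
A minimal construction sketch for the class above, reusing the MaskMapper from Example #5; it is assumed here that MaskMapper is a suitable `Mapper` and that N refers to numpy, as in the other old-API examples:

    # Sketch only: forward-map 4x3 samples through a mask that keeps 2 features.
    mapper = MaskMapper(N.array([1, 0, 1]))
    mds = MappedDataset(samples=N.arange(12).reshape((4, 3)),
                        mapper=mapper, labels=1, chunks=1)
    assert mds.nfeatures == 2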
Example #30
    def testIdsonboundaries(self):
        """Test detection of transition points

        Shame on Yarik -- he didn't create unittests right away... damn me
        """
        ds = Dataset(
            samples=N.array(range(10), ndmin=2).T,
            labels=[0, 0, 1, 1, 0, 0, 1, 1, 0, 0],
            chunks=[0, 0, 0, 0, 0, 1, 1, 1, 1, 1],
        )
        self.failUnless(
            ds.idsonboundaries() == [0, 2, 4, 5, 6, 8],
            "We should have got ids whenever either chunk or " "label changes",
        )
        self.failUnless(ds.idsonboundaries(attributes_to_track=["chunks"]) == [0, 5])
        # Preceding samples
        self.failUnless(ds.idsonboundaries(prior=1, post=-1, attributes_to_track=["chunks"]) == [4, 9])
        self.failUnless(ds.idsonboundaries(prior=2, post=-1, attributes_to_track=["chunks"]) == [3, 4, 8, 9])
        self.failUnless(
            ds.idsonboundaries(prior=2, post=-1, attributes_to_track=["chunks"], revert=True) == [0, 1, 2, 5, 6, 7]
        )
        self.failUnless(ds.idsonboundaries(prior=1, post=1, attributes_to_track=["chunks"]) == [0, 1, 4, 5, 6, 9])
        # all should be there
        self.failUnless(ds.idsonboundaries(prior=2) == range(10))
Example #31
def test_polydetrend():
    samples_forwhole = np.array( [[1.0, 2, 3, 4, 5, 6],
                                 [-2.0, -4, -6, -8, -10, -12]], ndmin=2 ).T
    samples_forchunks = np.array( [[1.0, 2, 3, 3, 2, 1],
                                  [-2.0, -4, -6, -6, -4, -2]], ndmin=2 ).T
    chunks = [0, 0, 0, 1, 1, 1]
    chunks_bad = [ 0, 0, 1, 1, 1, 0]
    target_whole = np.array( [[-3.0, -2, -1, 1, 2, 3],
                             [-6, -4, -2,  2, 4, 6]], ndmin=2 ).T
    target_chunked = np.array( [[-1.0, 0, 1, 1, 0, -1],
                               [2, 0, -2, -2, 0, 2]], ndmin=2 ).T


    ds = Dataset(samples_forwhole)

    # this one will auto-train the mapper on first use
    dm = PolyDetrendMapper(polyord=1, inspace='police')
    mds = dm(ds)
    # features are linear trends, so detrending should remove all
    assert_array_almost_equal(mds.samples, np.zeros(mds.shape))
    # we get the information where each sample is assumed to be in the
    # space spanned by the polynomials
    assert_array_equal(mds.sa.police, np.arange(len(ds)))

    # hackish way to get the previous regressors into a dataset
    ds.sa['opt_reg_const'] = dm._regs[:,0]
    ds.sa['opt_reg_lin'] = dm._regs[:,1]
    # using these precomputed regressors, we should get the same result as
    # before even if we do not generate a regressor for linear
    dm_optreg = PolyDetrendMapper(polyord=0,
                                  opt_regs=['opt_reg_const', 'opt_reg_lin'])
    mds_optreg = dm_optreg(ds)
    assert_array_almost_equal(mds_optreg, np.zeros(mds.shape))


    ds = Dataset(samples_forchunks)
    # 'constant' detrending removes the mean
    mds = PolyDetrendMapper(polyord=0)(ds)
    assert_array_almost_equal(
            mds.samples,
            samples_forchunks - np.mean(samples_forchunks, axis=0))
    # if there is no GLOBAL linear trend it should be identical to mean removal
    # even if trying to remove linear
    mds2 = PolyDetrendMapper(polyord=1)(ds)
    assert_array_almost_equal(mds, mds2)

    # chunk-wise detrending
    ds = dataset_wizard(samples_forchunks, chunks=chunks)
    dm = PolyDetrendMapper(chunks_attr='chunks', polyord=1, inspace='police')
    mds = dm(ds)
    # features are chunkswise linear trends, so detrending should remove all
    assert_array_almost_equal(mds.samples, np.zeros(mds.shape))
    # we get the information where each sample is assumed to be in the
    # space spanned by the polynomials, which is the identical linspace in both
    # chunks
    assert_array_equal(mds.sa.police, range(3) * 2)
    # non-matching number of samples cannot be mapped
    assert_raises(ValueError, dm, ds[:-1])
    # however, if the dataset knows about the space it is possible
    ds.sa['police'] = mds.sa.police
    # XXX this should be
    #mds2 = dm(ds[1:-1])
    #assert_array_equal(mds[1:-1], mds2)
    # XXX but right now is
    assert_raises(NotImplementedError, dm, ds[1:-1])

    # Detrend must preserve the size of dataset
    assert_equal(mds.shape, ds.shape)

    # small additional test for break points
    # although they are no longer there
    ds = dataset_wizard(np.array([[1.0, 2, 3, 1, 2, 3]], ndmin=2).T,
                 targets=chunks, chunks=chunks)
    mds = PolyDetrendMapper(chunks_attr='chunks', polyord=1)(ds)
    assert_array_almost_equal(mds.samples, np.zeros(mds.shape))

    # test of different polyord on each chunk
    target_mixed = np.array( [[-1.0, 0, 1, 0, 0, 0],
                             [2.0, 0, -2, 0, 0, 0]], ndmin=2 ).T
    ds = dataset_wizard(samples_forchunks.copy(), targets=chunks, chunks=chunks)
    mds = PolyDetrendMapper(chunks_attr='chunks', polyord=[0,1])(ds)
    assert_array_almost_equal(mds, target_mixed)

    # test irregular spacing of samples, but with corrective time info
    samples_forwhole = np.array( [[1.0, 4, 6, 8, 2, 9],
                                 [-2.0, -8, -12, -16, -4, -18]], ndmin=2 ).T
    ds = Dataset(samples_forwhole, sa={'time': samples_forwhole[:,0]})
    # linear detrending that makes use of temporal info from dataset
    dm = PolyDetrendMapper(polyord=1, inspace='time')
    mds = dm(ds)
    assert_array_almost_equal(mds.samples, np.zeros(mds.shape))

    # and now the same stuff, but with chunking and ordered by time
    samples_forchunks = np.array( [[1.0, 3, 3, 2, 2, 1],
                                  [-2.0, -6, -6, -4, -4, -2]], ndmin=2 ).T
    chunks = [0, 1, 0, 1, 0, 1]
    time = [4, 4, 12, 8, 8, 12]
    ds = Dataset(samples_forchunks.copy(), sa={'chunks': chunks, 'time': time})
    mds = PolyDetrendMapper(chunks_attr='chunks', polyord=1, inspace='time')(ds)

    # the whole thing must not affect the source data
    assert_array_equal(ds, samples_forchunks)
    # but if done inplace that is no longer true
    poly_detrend(ds, chunks_attr='chunks', polyord=1, inspace='time')
    assert_array_equal(ds, mds)
Example #32
    def testSampleSelection(self):
        origdata = datasets["uni2large"].samples[:100, :10].T
        data = Dataset(samples=origdata, labels=2, chunks=2)

        self.failUnless(data.nsamples == 10)

        # set single pattern to enabled
        for sel in [data.selectSamples(5), data.select(5), data.select(slice(5, 6))]:
            self.failUnless(sel.nsamples == 1)
            self.failUnless(data.nfeatures == 100)
            self.failUnless(sel.origids == [5])

        # check duplicate selections
        for sel in [
            data.selectSamples([5, 5]),
            # Following ones would fail since select removes
            # repetitions (XXX)
            # data.select([5,5]),
            # data.select([5,5], 'all'),
            # data.select([5,5], slice(None)),
        ]:
            self.failUnless(sel.nsamples == 2)
            self.failUnless((sel.samples[0] == data.samples[5]).all())
            self.failUnless((sel.samples[0] == sel.samples[1]).all())
            self.failUnless(len(sel.labels) == 2)
            self.failUnless(len(sel.chunks) == 2)
            self.failUnless((sel.origids == [5, 5]).all())

            self.failUnless(sel.samples.shape == (2, 100))

        # check selection by labels
        for sel in [
            data.selectSamples(data.idsbylabels(2)),
            data.select(labels=2),
            data.select("labels", 2),
            data.select("labels", [2]),
            data["labels", [2]],
            data["labels":[2], "labels":2],
            data["labels":[2]],
        ]:
            self.failUnless(sel.nsamples == data.nsamples)
            self.failUnless(N.all(sel.samples == data.samples))
        # not present label
        for sel in [
            data.selectSamples(data.idsbylabels(3)),
            data.select(labels=3),
            data.select("labels", 3),
            data.select("labels", [3]),
        ]:
            self.failUnless(sel.nsamples == 0)

        data = Dataset(samples=origdata, labels=[8, 9, 4, 3, 3, 3, 4, 2, 8, 9], chunks=2)
        for sel in [
            data.selectSamples(data.idsbylabels([2, 3])),
            data.select("labels", [2, 3]),
            data.select("labels", [2, 3], labels=[1, 2, 3, 4]),
            data.select("labels", [2, 3], chunks=[1, 2, 3, 4]),
            data["labels":[2, 3], "chunks":[1, 2, 3, 4]],
            data["chunks":[1, 2, 3, 4], "labels":[2, 3]],
        ]:
            self.failUnless(N.all(sel.origids == [3.0, 4.0, 5.0, 7.0]))

        # let's cause it to compute unique labels
        self.failUnless((data.uniquelabels == [2, 3, 4, 8, 9]).all())

        # select some samples removing some labels completely
        sel = data.selectSamples(data.idsbylabels([3, 4, 8, 9]))
        self.failUnlessEqual(Set(sel.uniquelabels), Set([3, 4, 8, 9]))
        self.failUnless((sel.origids == [0, 1, 2, 3, 4, 5, 6, 8, 9]).all())
Example #33
        datasets[basename] = dataset

    # sample 3D
    total = 2 * spec['perlabel']
    nchunks = spec['nchunks']
    data = np.random.standard_normal((total, 3, 6, 6))
    labels = np.concatenate(
        (np.repeat(0, spec['perlabel']), np.repeat(1, spec['perlabel'])))
    data[:, 1, 0, 0] += 2 * labels  # add some signal
    chunks = np.asarray(range(nchunks) * (total / nchunks))
    mask = np.ones((3, 6, 6), dtype='bool')
    mask[0, 0, 0] = 0
    mask[1, 3, 2] = 0
    ds = Dataset.from_wizard(samples=data,
                             targets=labels,
                             chunks=chunks,
                             mask=mask,
                             space='myspace')
    datasets['3d%s' % kind] = ds

# some additional datasets
datasets['dumb2'] = dumb_feature_binary_dataset()
datasets['dumb'] = dumb_feature_dataset()
# dataset with few invariant features
_dsinv = dumb_feature_dataset()
_dsinv.samples = np.hstack((_dsinv.samples, np.zeros(
    (_dsinv.nsamples, 1)), np.ones((_dsinv.nsamples, 1))))
datasets['dumbinv'] = _dsinv

# Datasets for regressions testing
datasets['sin_modulated'] = multiple_chunks(sin_modulated, 4, 30, 1)
Example #34
def test_polydetrend():
    samples_forwhole = np.array(
        [[1.0, 2, 3, 4, 5, 6], [-2.0, -4, -6, -8, -10, -12]], ndmin=2).T
    samples_forchunks = np.array(
        [[1.0, 2, 3, 3, 2, 1], [-2.0, -4, -6, -6, -4, -2]], ndmin=2).T
    chunks = [0, 0, 0, 1, 1, 1]
    chunks_bad = [0, 0, 1, 1, 1, 0]
    target_whole = np.array([[-3.0, -2, -1, 1, 2, 3], [-6, -4, -2, 2, 4, 6]],
                            ndmin=2).T
    target_chunked = np.array([[-1.0, 0, 1, 1, 0, -1], [2, 0, -2, -2, 0, 2]],
                              ndmin=2).T

    ds = Dataset(samples_forwhole)

    # this one will auto-train the mapper on first use
    dm = PolyDetrendMapper(polyord=1, inspace='police')
    mds = dm(ds)
    # features are linear trends, so detrending should remove all
    assert_array_almost_equal(mds.samples, np.zeros(mds.shape))
    # we get the information where each sample is assumed to be in the
    # space spanned by the polynomials
    assert_array_equal(mds.sa.police, np.arange(len(ds)))

    # hackish way to get the previous regressors into a dataset
    ds.sa['opt_reg_const'] = dm._regs[:, 0]
    ds.sa['opt_reg_lin'] = dm._regs[:, 1]
    # using these precomputed regressors, we should get the same result as
    # before even if we do not generate a regressor for linear
    dm_optreg = PolyDetrendMapper(polyord=0,
                                  opt_regs=['opt_reg_const', 'opt_reg_lin'])
    mds_optreg = dm_optreg(ds)
    assert_array_almost_equal(mds_optreg, np.zeros(mds.shape))

    ds = Dataset(samples_forchunks)
    # 'constant' detrending removes the mean
    mds = PolyDetrendMapper(polyord=0)(ds)
    assert_array_almost_equal(
        mds.samples, samples_forchunks - np.mean(samples_forchunks, axis=0))
    # if there is no GLOBAL linear trend it should be identical to mean removal
    # even if trying to remove linear
    mds2 = PolyDetrendMapper(polyord=1)(ds)
    assert_array_almost_equal(mds, mds2)

    # chunk-wise detrending
    ds = dataset_wizard(samples_forchunks, chunks=chunks)
    dm = PolyDetrendMapper(chunks_attr='chunks', polyord=1, inspace='police')
    mds = dm(ds)
    # features are chunkswise linear trends, so detrending should remove all
    assert_array_almost_equal(mds.samples, np.zeros(mds.shape))
    # we get the information where each sample is assumed to be in the
    # space spanned by the polynomials, which is the identical linspace in both
    # chunks
    assert_array_equal(mds.sa.police, range(3) * 2)
    # non-matching number of samples cannot be mapped
    assert_raises(ValueError, dm, ds[:-1])
    # however, if the dataset knows about the space it is possible
    ds.sa['police'] = mds.sa.police
    # XXX this should be
    #mds2 = dm(ds[1:-1])
    #assert_array_equal(mds[1:-1], mds2)
    # XXX but right now is
    assert_raises(NotImplementedError, dm, ds[1:-1])

    # Detrend must preserve the size of dataset
    assert_equal(mds.shape, ds.shape)

    # small additional test for break points
    # although they are no longer there
    ds = dataset_wizard(np.array([[1.0, 2, 3, 1, 2, 3]], ndmin=2).T,
                        targets=chunks,
                        chunks=chunks)
    mds = PolyDetrendMapper(chunks_attr='chunks', polyord=1)(ds)
    assert_array_almost_equal(mds.samples, np.zeros(mds.shape))

    # test of different polyord on each chunk
    target_mixed = np.array([[-1.0, 0, 1, 0, 0, 0], [2.0, 0, -2, 0, 0, 0]],
                            ndmin=2).T
    ds = dataset_wizard(samples_forchunks.copy(),
                        targets=chunks,
                        chunks=chunks)
    mds = PolyDetrendMapper(chunks_attr='chunks', polyord=[0, 1])(ds)
    assert_array_almost_equal(mds, target_mixed)

    # test irregular spacing of samples, but with corrective time info
    samples_forwhole = np.array(
        [[1.0, 4, 6, 8, 2, 9], [-2.0, -8, -12, -16, -4, -18]], ndmin=2).T
    ds = Dataset(samples_forwhole, sa={'time': samples_forwhole[:, 0]})
    # linear detrending that makes use of temporal info from dataset
    dm = PolyDetrendMapper(polyord=1, inspace='time')
    mds = dm(ds)
    assert_array_almost_equal(mds.samples, np.zeros(mds.shape))

    # and now the same stuff, but with chunking and ordered by time
    samples_forchunks = np.array(
        [[1.0, 3, 3, 2, 2, 1], [-2.0, -6, -6, -4, -4, -2]], ndmin=2).T
    chunks = [0, 1, 0, 1, 0, 1]
    time = [4, 4, 12, 8, 8, 12]
    ds = Dataset(samples_forchunks.copy(), sa={'chunks': chunks, 'time': time})
    mds = PolyDetrendMapper(chunks_attr='chunks', polyord=1,
                            inspace='time')(ds)

    # the whole thing must not affect the source data
    assert_array_equal(ds, samples_forchunks)
    # but if done inplace that is no longer true
    poly_detrend(ds, chunks_attr='chunks', polyord=1, inspace='time')
    assert_array_equal(ds, mds)
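
The test above exercises PolyDetrendMapper end to end. Condensed below is a minimal usage sketch of the two entry points it relies on; only the import paths are an assumption (mvpa2-style layout) and may need adjusting to the PyMVPA version in use.

# Minimal sketch, assuming mvpa2-style import paths (adjust for older releases).
import numpy as np
from mvpa2.datasets import dataset_wizard
from mvpa2.mappers.detrend import PolyDetrendMapper, poly_detrend

samples = np.random.randn(6, 4)                      # 6 samples, 4 features
ds = dataset_wizard(samples, chunks=[0, 0, 0, 1, 1, 1])

# out-of-place: returns a detrended copy and leaves ds untouched
detrended = PolyDetrendMapper(chunks_attr='chunks', polyord=1)(ds)

# in-place convenience wrapper performing the same chunk-wise linear detrending
poly_detrend(ds, chunks_attr='chunks', polyord=1)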
Exemple #35
0
    def _call(self, dataset):
        """Computes featurewise I-RELIEF-2 weights. Online version."""
        NS = dataset.samples.shape[0]
        NF = dataset.samples.shape[1]
        if self.w_guess is None:
            self.w = np.ones(NF, 'd')
        else:
            # start from the user-supplied guess (assumed to be array-like of
            # length NF); copied so the caller's array is not modified
            self.w = np.array(self.w_guess, dtype='d')
        # do normalization in all cases to be safe :)
        self.w = self.w / (self.w**2).sum()

        M, H = self.compute_M_H(dataset.targets)

        ni = np.zeros(NF, 'd')
        pi = np.zeros(NF, 'd')

        if self.permute:
            # indices to go through samples in random order
            random_sequence = np.random.permutation(NS)
        else:
            random_sequence = np.arange(NS)

        change = self.threshold + 1.0
        iteration = 0
        counter = 0.0
        while change > self.threshold and iteration < self.max_iter:
            if __debug__:
                debug('IRELIEF', "Iteration %d" % iteration)

            for t in range(NS):
                counter += 1.0
                n = random_sequence[t]

                self.k = self.kernel(length_scale=self.kernel_width / self.w)
                d_w_k_xn_Mn = self.k.computed(
                    dataset.samples[None, n, :],
                    dataset.samples[M[n], :]).as_raw_np().squeeze()
                d_w_k_xn_Mn_sum = d_w_k_xn_Mn.sum()
                d_w_k_xn_x = self.k.computed(
                    dataset.samples[None, n, :],
                    dataset.samples).as_raw_np().squeeze()
                gamma_n = 1.0 - d_w_k_xn_Mn_sum / d_w_k_xn_x.sum()
                alpha_n = d_w_k_xn_Mn / d_w_k_xn_Mn_sum

                d_w_k_xn_Hn = self.k.computed(
                    dataset.samples[None, n, :],
                    dataset.samples[H[n], :]).as_raw_np().squeeze()
                beta_n = d_w_k_xn_Hn / d_w_k_xn_Hn.sum()

                m_n = (np.abs(dataset.samples[n, :] - dataset.samples[M[n], :]) \
                        * alpha_n[:, np.newaxis]).sum(0)
                h_n = (np.abs(dataset.samples[n, :] - dataset.samples[H[n], :]) \
                        * beta_n[:, np.newaxis]).sum(0)
                pi = gamma_n * (m_n - h_n)
                learning_rate = 1.0 / (counter * self.a + 1.0)
                ni_new = ni + learning_rate * (pi - ni)
                ni = ni_new

                # set all negative elements to zero
                ni_plus = np.clip(ni, 0.0, np.inf)
                w_new = np.nan_to_num(ni_plus / (np.sqrt((ni_plus**2).sum())))
                change = np.abs(w_new - self.w).sum()
                if t % 10 == 0 and __debug__ and 'IRELIEF' in debug.active:
                    debug(
                        'IRELIEF',
                        "t=%d change=%.4f max=%f min=%.4f mean=%.4f std=%.4f"
                        " #nan=%d" %
                        (t, change, w_new.max(), w_new.min(), w_new.mean(),
                         w_new.std(), np.isnan(w_new).sum()))

                self.w = w_new

                if change < self.threshold and iteration > 0:
                    break

            iteration += 1

        return Dataset(self.w[np.newaxis])
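
Both I-RELIEF variants in this listing rely on compute_M_H() for the hit/miss index sets without showing it. Below is a plain-NumPy sketch of the semantics it is assumed to have: for each sample n, M[n] holds the "misses" (samples with a different target) and H[n] the "hits" (samples with the same target, excluding n itself).

# Hedged sketch of the assumed hit/miss bookkeeping behind compute_M_H().
import numpy as np

def compute_M_H(targets):
    targets = np.asarray(targets)
    M, H = {}, {}
    for n in range(targets.size):
        M[n] = np.where(targets != targets[n])[0]    # misses: other classes
        same = np.where(targets == targets[n])[0]
        H[n] = same[same != n]                       # hits: same class, minus n
    return M, H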
Exemple #36
0
            datasets["%s_%s" % (basename, replication)] = dataset_

        # full dataset
        datasets[basename] = dataset

    # sample 3D
    total = 2 * spec['perlabel']
    nchunks = spec['nchunks']
    data = np.random.standard_normal((total, 3, 6, 6))
    labels = np.concatenate((np.repeat(0, spec['perlabel']),
                             np.repeat(1, spec['perlabel'])))
    chunks = np.asarray(range(nchunks) * (total / nchunks))
    mask = np.ones((3, 6, 6), dtype='bool')
    mask[0, 0, 0] = 0
    mask[1, 3, 2] = 0
    ds = Dataset.from_wizard(samples=data, targets=labels, chunks=chunks,
                             mask=mask, space='myspace')
    datasets['3d%s' % kind] = ds


# some additional datasets
datasets['dumb2'] = dumb_feature_binary_dataset()
datasets['dumb'] = dumb_feature_dataset()
# dataset with a few invariant features
_dsinv = dumb_feature_dataset()
_dsinv.samples = np.hstack((_dsinv.samples,
                           np.zeros((_dsinv.nsamples, 1)),
                           np.ones((_dsinv.nsamples, 1))))
datasets['dumbinv'] = _dsinv

# Datasets for regression testing
datasets['sin_modulated'] = multiple_chunks(sin_modulated, 4, 30, 1)
Exemple #37
0
    def _call(self, dataset):
        """Computes featurewise I-RELIEF-2 weights. Online version."""
        # local bindings
        samples = dataset.samples
        NS, NF = samples.shape[:2]
        threshold = self.threshold
        a = self.a

        if self.w_guess is None:
            w = np.ones(NF, 'd')
        else:
            # start from the user-supplied guess (assumed to be array-like of
            # length NF); copied, since w is modified in place below
            w = np.array(self.w_guess, dtype='d')

        # do normalization in all cases to be safe :)
        w /= (w**2).sum()

        M, H = self.compute_M_H(dataset.targets)

        ni = np.zeros(NF, 'd')
        pi = np.zeros(NF, 'd')

        if self.permute:
            # indices to go through x in random order
            random_sequence = np.random.permutation(NS)
        else:
            random_sequence = np.arange(NS)

        change = threshold + 1.0
        iteration = 0
        counter = 0.0
        while change > threshold and iteration < self.max_iter:
            if __debug__:
                debug('IRELIEF', "Iteration %d" % iteration)
            for t in range(NS):
                counter += 1.0
                n = random_sequence[t]

                d_xn_x = np.abs(samples[n, :] - samples)
                d_w_k_xn_x = self.k((d_xn_x * w).sum(1))

                d_w_k_xn_Mn = d_w_k_xn_x[M[n]]
                d_w_k_xn_Mn_sum = d_w_k_xn_Mn.sum()

                gamma_n = 1.0 - d_w_k_xn_Mn_sum / d_w_k_xn_x.sum()
                alpha_n = d_w_k_xn_Mn / d_w_k_xn_Mn_sum

                d_w_k_xn_Hn = d_w_k_xn_x[H[n]]
                beta_n = d_w_k_xn_Hn / d_w_k_xn_Hn.sum()

                m_n = (d_xn_x[M[n], :] * alpha_n[:, None]).sum(0)
                h_n = (d_xn_x[H[n], :] * beta_n[:, None]).sum(0)
                pi = gamma_n * (m_n - h_n)
                learning_rate = 1.0 / (counter * a + 1.0)
                ni_new = ni + learning_rate * (pi - ni)
                ni = ni_new

                # set all negative elements to zero
                ni_plus = np.clip(ni, 0.0, np.inf)
                w_new = np.nan_to_num(ni_plus / (np.sqrt((ni_plus**2).sum())))
                change = np.abs(w_new - w).sum()
                if t % 10 == 0 and __debug__ and 'IRELIEF' in debug.active:
                    debug(
                        'IRELIEF',
                        "t=%d change=%.4f max=%f min=%.4f mean=%.4f std=%.4f"
                        " #nan=%d" %
                        (t, change, w_new.max(), w_new.min(), w_new.mean(),
                         w_new.std(), np.isnan(w_new).sum()))

                w = w_new

                if change < threshold and iteration > 0:
                    break

            iteration += 1

        self.w = w
        return Dataset(self.w[np.newaxis])
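
The per-sample update inside the loop above can be read as one small standalone step. The sketch below restates it with plain NumPy, using an exponential kernel as a stand-in for self.k; the function name, its signature and the kernel choice are illustrative assumptions, not part of the original class.

# Hedged sketch of a single online I-RELIEF weight update on plain arrays.
import numpy as np

def irelief_online_step(X, M, H, w, ni, counter, n, kernel_width=1.0, a=0.1):
    """One online update for sample n; returns the new (w, ni)."""
    d_xn_x = np.abs(X[n] - X)                          # L1 distances to all samples
    d_w_k_xn_x = np.exp(-(d_xn_x * w).sum(1) / kernel_width)  # weighted, kernelized
    d_w_k_xn_Mn = d_w_k_xn_x[M[n]]                     # kernel mass on the misses
    gamma_n = 1.0 - d_w_k_xn_Mn.sum() / d_w_k_xn_x.sum()
    alpha_n = d_w_k_xn_Mn / d_w_k_xn_Mn.sum()
    d_w_k_xn_Hn = d_w_k_xn_x[H[n]]                     # kernel mass on the hits
    beta_n = d_w_k_xn_Hn / d_w_k_xn_Hn.sum()
    m_n = (d_xn_x[M[n]] * alpha_n[:, None]).sum(0)     # weighted miss margin
    h_n = (d_xn_x[H[n]] * beta_n[:, None]).sum(0)      # weighted hit margin
    pi = gamma_n * (m_n - h_n)
    learning_rate = 1.0 / (counter * a + 1.0)          # decays with the step count
    ni = ni + learning_rate * (pi - ni)
    ni_plus = np.clip(ni, 0.0, np.inf)                 # keep weights nonnegative
    w = np.nan_to_num(ni_plus / np.sqrt((ni_plus ** 2).sum()))
    return w, ni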
Exemple #38
0
# imports needed by this fragment (PyMVPA 0.x-style paths assumed)
import numpy as np
from mvpa.datasets import Dataset
from mvpa.clfs.knn import kNN

lbp = np.asarray(lbp)
i3_histo = np.asarray(i3_histo)
rgb_histo = np.asarray(rgb_histo)

id_index = 15
lbp_predictdata = lbp[[id_index]]
# the probe for the i3 histogram classifier must come from i3_histo, not lbp
i3_histo_predictdata = i3_histo[[id_index]]

print
#print predictdata
print classID[id_index]
#print "len lbp:", len(lbp)
#print "shape:", lbp.shape

# mvpa
lbp_training = Dataset(samples=lbp, labels=classID)
# the second training set uses the i3 histograms, not the lbp features again
i3_histo_training = Dataset(samples=i3_histo, labels=classID)
clf = kNN(k=1, voting='majority')
print "clf = ", clf
clf.train(lbp_training)
lbp_predicted_classID = clf.predict(lbp_predictdata)
# retrain the same classifier instance on the second feature set
clf.train(i3_histo_training)
i3_histo_predicted_classID = clf.predict(i3_histo_predictdata)



print "lbp_predicted_classID: ", lbp_predicted_classID 
print "i3_histo__predicted_classID :", i3_histo_predicted_classID
#if predicted_classID[0]  == 1.0: print "Image is of class: GRASS"
#if predicted_classID[0]  == 2.0: print "Image is of class: DIRT/GRAVEL"
#if predicted_classID[0]  == 3.0: print "Image is of class: CEMENT/ASPHALT"
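
With k=1 and a probe row taken from the very data the classifier was trained on, the prediction above is all but guaranteed to echo the probe's own label. A hedged follow-up sketch of a fairer check, holding the probe out of the training set and reusing only the calls already shown:

# Hedged sketch: leave-one-out style check for the probe sample.
keep = np.arange(len(classID)) != id_index
clf.train(Dataset(samples=lbp[keep], labels=np.asarray(classID)[keep]))
print "held-out lbp prediction:", clf.predict(lbp[[id_index]])
print "true classID:", classID[id_index]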
Exemple #39
0
import numpy as np
from numpy import genfromtxt
from mvpa.datasets import Dataset
from mvpa.clfs.knn import kNN  # kNN path assumes the PyMVPA 0.x layout

#pymvpa stuff
f_handle = open("classdatafile.txt", 'r')
f_handle2 = open("classidfile.txt", 'r')
f_handle3 = open("predictdata.txt", 'r')
features = genfromtxt(f_handle, dtype=float)
classes = genfromtxt(f_handle2, dtype=int)
predictdata = genfromtxt(f_handle3, dtype=float)
predictdata = np.expand_dims(predictdata, axis=0)
print predictdata
print np.shape(features), features.ndim, features.dtype
print np.shape(classes), classes.ndim, classes.dtype
print np.shape(predictdata), predictdata.ndim, predictdata.dtype
f_handle.close()
f_handle2.close()
f_handle3.close()

training = Dataset(samples=features, labels=classes)
clf = kNN(k=2)
print "clf = ", clf
clf.train(training)
#print np.mean(clf.predict(training.samples) == training.labels)
classID = clf.predict(predictdata)
print "classID = ", classID
#print clf.trained_labels
if classID[0] == 1: print "Image is of class: GRASS"
if classID[0] == 2: print "Image is of class: DIRT/GRAVEL"
if classID[0] == 3: print "Image is of class: CEMENT/ASPHALT"