Example #1
def test_mapper_vs_zscore():
    """Test by comparing to results of elderly z-score function
    """
    # data: 40 samples along a feature line in 20-d space (40x20; samples x features)
    dss = [
        dataset_wizard(np.concatenate(
            [np.arange(40) for i in range(20)]).reshape(20, -1).T,
                targets=1, chunks=1),
        ] + list(datasets.values())

    for ds in dss:
        ds1 = deepcopy(ds)
        ds2 = deepcopy(ds)

        zsm = ZScoreMapper(chunks_attr=None)
        assert_raises(RuntimeError, zsm.forward, ds1.samples)
        idhashes = (idhash(ds1), idhash(ds1.samples))
        zsm.train(ds1)
        idhashes_train = (idhash(ds1), idhash(ds1.samples))
        assert_equal(idhashes, idhashes_train)

        # forward dataset
        ds1z_ds = zsm.forward(ds1)
        idhashes_forwardds = (idhash(ds1), idhash(ds1.samples))
        # must not modify samples in place!
        assert_equal(idhashes, idhashes_forwardds)

        # forward samples explicitly
        ds1z = zsm.forward(ds1.samples)
        idhashes_forward = (idhash(ds1), idhash(ds1.samples))
        assert_equal(idhashes, idhashes_forward)

        zscore(ds2, chunks_attr=None)
        assert_array_almost_equal(ds1z, ds2.samples)
        assert_array_equal(ds1.samples, ds.samples)
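For reference, a minimal NumPy-only sketch of the global z-scoring that ZScoreMapper(chunks_attr=None) is compared against here (illustrative; not the PyMVPA implementation):

import numpy as np

def zscore_global(samples):
    # column-wise z-scoring: subtract each feature's mean and divide
    # by its standard deviation, computed over all samples
    samples = np.asarray(samples, dtype=float)  # avoid integer truncation
    mean = samples.mean(axis=0)
    std = samples.std(axis=0)
    std[std == 0] = 1.0  # leave constant features untouched
    return (samples - mean) / std

# every column of the result has (approximately) mean 0 and std 1
z = zscore_global(np.arange(40, dtype=float).reshape(8, 5))
assert np.allclose(z.mean(axis=0), 0) and np.allclose(z.std(axis=0), 1)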
Example #2
    def __call__(self, datasets):
        """Derive a common feature space from a series of datasets.

        Parameters
        ----------
        datasets : sequence of datasets

        Returns
        -------
        A list of trained Mappers matching the number of input datasets.
        """
        if self.commonspace is None:
            self.train(datasets)
        else:
            # Check to make sure we get a list of datasets as input.
            if not isinstance(datasets, (list, tuple, np.ndarray)):
                raise TypeError("Input datasets should be a sequence "
                                "(of type list, tuple, or ndarray) of datasets.")

        # place datasets into a copy of the list since items
        # will be reassigned
        datasets = list(datasets)

        params = self.params            # for quicker access ;)
        alpha = params.alpha             # for letting me be lazy ;)
        if params.zscore_all:
            if __debug__:
                debug('HPAL', "Z-scoring all datasets")
            # zscore them once while storing corresponding ZScoreMapper's
            # so we can assemble a comprehensive mapper at the end
            # (together with procrustes)
            zmappers = []
            for ids in range(len(datasets)):
                zmapper = ZScoreMapper(chunks_attr=None)
                zmappers.append(zmapper)
                zmapper.train(datasets[ids])
                datasets[ids] = zmapper.forward(datasets[ids])

        if alpha < 1:
            datasets, wmappers = self._regularize(datasets, alpha)

        #
        # Level 3 -- final, from-scratch, alignment to final common space
        #
        mappers = self._level3(datasets)
        # return trained mappers for projection from all datasets into the
        # common space
        if params.zscore_all:
            # We need to construct new mappers which would chain
            # zscore and then final transformation
            if params.alpha < 1:
                mappers = [ChainMapper([zm, wm, m])
                           for zm, wm, m in zip(zmappers, wmappers, mappers)]
            else:
                mappers = [ChainMapper([zm, m]) for zm, m in zip(zmappers, mappers)]
        elif params.alpha < 1:
            mappers = [ChainMapper([wm, m]) for wm, m in zip(wmappers, mappers)]
        if params.output_dim is not None:
            mappers = [ChainMapper([m, self._svd_mapper]) for m in mappers]
        return mappers
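The ChainMapper composition at the end simply applies the z-scoring, regularization, and alignment steps in sequence. A self-contained sketch of that chaining idea, with plain callables standing in for trained mappers (hypothetical helpers, not PyMVPA API):

import numpy as np

def make_zscore(mean, std):
    return lambda x: (x - mean) / std

def make_projection(proj):
    return lambda x: x @ proj

def chain(*steps):
    # apply steps left to right, like ChainMapper([zm, wm, m])
    def forward(x):
        for step in steps:
            x = step(x)
        return x
    return forward

rng = np.random.default_rng(0)
x = rng.normal(size=(10, 4))
zm = make_zscore(x.mean(axis=0), x.std(axis=0))
m = make_projection(rng.normal(size=(4, 3)))
assert chain(zm, m)(x).shape == (10, 3)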
Example #3
class FeatureWiseNormalizer(Transformer):
    def __init__(self, chunks_attr='chunks', param_est=None, **kwargs):
        self.node = ZScoreMapper(chunks_attr=chunks_attr, param_est=param_est)
        Transformer.__init__(self, name='feature_normalizer', **kwargs)

    def transform(self, ds):
        logger.info('Dataset preprocessing: Normalization feature-wise...')
        self.node.train(ds)
        return self.node.forward(ds)
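Assuming a PyMVPA-style dataset ds, usage of this wrapper would presumably look like the following (hypothetical snippet, since the surrounding Transformer framework is not shown):

normalizer = FeatureWiseNormalizer(chunks_attr='chunks')
ds_normalized = normalizer.transform(ds)  # trains the ZScoreMapper, then forwards ds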
Example #4
    def __call__(self, datasets):
        """Derive a common feature space from a series of datasets.

        Parameters
        ----------
        datasets : sequence of datasets

        Returns
        -------
        A list of trained Mappers matching the number of input datasets.
        """
        if self.commonspace is None:
            self.train(datasets)

        # place datasets into a copy of the list since items
        # will be reassigned
        datasets = list(datasets)

        params = self.params            # for quicker access ;)
        alpha = params.alpha             # for letting me be lazy ;)
        if params.zscore_all:
            if __debug__:
                debug('HPAL', "Z-scoring all datasets")
            # zscore them once while storing corresponding ZScoreMapper's
            # so we can assemble a comprehensive mapper at the end
            # (together with procrustes)
            zmappers = []
            for ids in range(len(datasets)):
                zmapper = ZScoreMapper(chunks_attr=None)
                zmappers.append(zmapper)
                zmapper.train(datasets[ids])
                datasets[ids] = zmapper.forward(datasets[ids])

        if alpha < 1:
            datasets, wmappers = self._regularize(datasets, alpha)

        #
        # Level 3 -- final, from-scratch, alignment to final common space
        #
        mappers = self._level3(datasets)
        # return trained mappers for projection from all datasets into the
        # common space
        if params.zscore_all:
            # We need to construct new mappers which would chain
            # zscore and then final transformation
            if params.alpha < 1:
                return [ChainMapper([zm, wm, m])
                        for zm, wm, m in zip(zmappers, wmappers, mappers)]
            else:
                return [ChainMapper([zm, m]) for zm, m in zip(zmappers, mappers)]
        elif params.alpha < 1:
            return [ChainMapper([wm, m]) for wm, m in zip(wmappers, mappers)]
        else:
            return mappers
Example #5
class FeatureWiseNormalizer(Transformer):
    def __init__(self, chunks_attr='chunks', param_est=None, **kwargs):
        self.node = ZScoreMapper(chunks_attr=chunks_attr, param_est=param_est)
        Transformer.__init__(self, name='feature_normalizer', **kwargs)

    def transform(self, ds):
        logger.info('Dataset preprocessing: Normalization feature-wise...')
        self.node.train(ds)
        return self.node.forward(ds)
Example #6
def test_mapper_vs_zscore():
    """Test by comparing to results of elderly z-score function
    """
    # data: 40 samples along a feature line in 20-d space (40x20; samples x features)
    dss = [
        dataset_wizard(np.concatenate([np.arange(40)
                                       for i in range(20)]).reshape(20, -1).T,
                       targets=1,
                       chunks=1),
    ] + list(datasets.values())

    for ds in dss:
        ds1 = deepcopy(ds)
        ds2 = deepcopy(ds)

        zsm = ZScoreMapper(chunks_attr=None)
        assert_raises(RuntimeError, zsm.forward, ds1.samples)
        idhashes = (idhash(ds1), idhash(ds1.samples))
        zsm.train(ds1)
        idhashes_train = (idhash(ds1), idhash(ds1.samples))
        assert_equal(idhashes, idhashes_train)

        # forward dataset
        ds1z_ds = zsm.forward(ds1)
        idhashes_forwardds = (idhash(ds1), idhash(ds1.samples))
        # must not modify samples in place!
        assert_equal(idhashes, idhashes_forwardds)

        # forward samples explicitly
        ds1z = zsm.forward(ds1.samples)
        idhashes_forward = (idhash(ds1), idhash(ds1.samples))
        assert_equal(idhashes, idhashes_forward)

        zscore(ds2, chunks_attr=None)
        assert_array_almost_equal(ds1z, ds2.samples)
        assert_array_equal(ds1.samples, ds.samples)
Example #7
    def train(self, datasets):
        """Derive a common feature space from a series of datasets.

        Parameters
        ----------
        datasets : sequence of datasets

        This derives and stores the common space; it does not return
        the per-dataset mappers.
        """
        params = self.params  # for quicker access ;)
        ca = self.ca
        # Check to make sure we get a list of datasets as input.
        if not isinstance(datasets, (list, tuple, np.ndarray)):
            raise TypeError("Input datasets should be a sequence "
                            "(of type list, tuple, or ndarray) of datasets.")

        ndatasets = len(datasets)
        nfeatures = [ds.nfeatures for ds in datasets]
        alpha = params.alpha

        residuals = None
        if ca['training_residual_errors'].enabled:
            residuals = np.zeros((1 + params.level2_niter, ndatasets))
            ca.training_residual_errors = Dataset(
                samples=residuals,
                sa={
                    'levels':
                    ['1'] + ['2:%i' % i for i in range(params.level2_niter)]
                })

        if __debug__:
            debug('HPAL',
                  "Hyperalignment %s for %i datasets" % (self, ndatasets))

        if params.ref_ds is None:
            ref_ds = np.argmax(nfeatures)
        else:
            ref_ds = params.ref_ds
            # Make sure that ref_ds is within range.
            # Parameter() already checks for it being a non-negative integer
            if ref_ds >= ndatasets:
                raise ValueError("Requested reference dataset %i is out of "
                                 "bounds. We have only %i datasets provided"
                                 % (ref_ds, ndatasets))
        ca.chosen_ref_ds = ref_ds
        # zscore all data sets
        # ds = [ zscore(ds, chunks_attr=None) for ds in datasets]

        # TODO since we are doing in-place zscoring create deep copies
        # of the datasets with pruned targets and shallow copies of
        # the collections (if they would come needed in the transformation)
        # TODO: handle floats and non-floats differently to prevent
        #       waste of memory if there is no need (e.g. no z-scoring)
        #otargets = [ds.sa.targets for ds in datasets]
        datasets = [ds.copy(deep=False) for ds in datasets]
        #datasets = [Dataset(ds.samples.astype(float), sa={'targets': [None] * len(ds)})
        #datasets = [Dataset(ds.samples, sa={'targets': [None] * len(ds)})
        #            for ds in datasets]

        if params.zscore_all:
            if __debug__:
                debug('HPAL', "Z-scoring all datasets")
            for ids in range(len(datasets)):
                zmapper = ZScoreMapper(chunks_attr=None)
                zmapper.train(datasets[ids])
                datasets[ids] = zmapper.forward(datasets[ids])

        if alpha < 1:
            datasets, wmappers = self._regularize(datasets, alpha)

        # initial common space is the reference dataset
        commonspace = datasets[ref_ds].samples
        # the reference dataset might have been zscored already, don't do it
        # twice
        if params.zscore_common and not params.zscore_all:
            if __debug__:
                debug(
                    'HPAL_', "Creating copy of a commonspace and ensuring "
                    "it is of a floating type")
            commonspace = commonspace.astype(float)
            zscore(commonspace, chunks_attr=None)
        # If there is only one dataset in the training phase, there is
        # nothing to do -- just use that data as the common space
        if len(datasets) < 2:
            self.commonspace = commonspace
        else:
            # create a mapper per dataset
            # might prefer some other way to initialize... later
            mappers = [deepcopy(params.alignment) for ds in datasets]

            #
            # Level 1 -- initial projection
            #
            lvl1_projdata = self._level1(datasets, commonspace, ref_ds,
                                         mappers, residuals)
            #
            # Level 2 -- might iterate multiple times
            #
            # this is the final common space
            self.commonspace = self._level2(datasets, lvl1_projdata, mappers,
                                            residuals)
        if params.output_dim is not None:
            mappers = self._level3(datasets)
            self._svd_mapper = SVDMapper()
            self._svd_mapper.train(self._map_and_mean(datasets, mappers))
            self._svd_mapper = StaticProjectionMapper(
                proj=self._svd_mapper.proj[:, :params.output_dim])
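The params.alignment mapper deep-copied per dataset is typically a Procrustean transformation. As a reference for what one alignment step computes, a minimal NumPy sketch of orthogonal Procrustes (illustrative only; PyMVPA's actual alignment mapper also supports scaling and other options):

import numpy as np

def procrustes_rotation(source, target):
    # orthogonal R minimizing ||source @ R - target||_F,
    # obtained from the SVD of the cross-covariance matrix
    u, _, vt = np.linalg.svd(source.T @ target)
    return u @ vt

rng = np.random.default_rng(1)
target = rng.normal(size=(20, 5))
rot = np.linalg.qr(rng.normal(size=(5, 5)))[0]  # random orthogonal matrix
source = target @ rot.T
r = procrustes_rotation(source, target)
assert np.allclose(source @ r, target)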
Example #8
    def train(self, datasets):
        """Derive a common feature space from a series of datasets.

        Parameters
        ----------
        datasets : sequence of datasets

        This derives and stores the common space; it does not return
        the per-dataset mappers.
        """
        params = self.params            # for quicker access ;)
        ca = self.ca
        ndatasets = len(datasets)
        nfeatures = [ds.nfeatures for ds in datasets]
        alpha = params.alpha
        
        residuals = None
        if ca['training_residual_errors'].enabled:
            residuals = np.zeros((1 + params.level2_niter, ndatasets))
            ca.training_residual_errors = Dataset(
                samples=residuals,
                sa={'levels':
                    ['1'] +
                    ['2:%i' % i for i in range(params.level2_niter)]})

        if __debug__:
            debug('HPAL', "Hyperalignment %s for %i datasets"
                  % (self, ndatasets))

        if params.ref_ds is None:
            ref_ds = np.argmax(nfeatures)
        else:
            ref_ds = params.ref_ds
            if ref_ds < 0 or ref_ds >= ndatasets:
                raise ValueError("Requested reference dataset %i is out of "
                                 "bounds. We have only %i datasets provided"
                                 % (ref_ds, ndatasets))
        ca.choosen_ref_ds = ref_ds
        # zscore all data sets
        # ds = [ zscore(ds, chunks_attr=None) for ds in datasets]

        # TODO since we are doing in-place zscoring create deep copies
        # of the datasets with pruned targets and shallow copies of
        # the collections (if they would come needed in the transformation)
        # TODO: handle floats and non-floats differently to prevent
        #       waste of memory if there is no need (e.g. no z-scoring)
        #otargets = [ds.sa.targets for ds in datasets]
        datasets = [ds.copy(deep=False) for ds in datasets]
        #datasets = [Dataset(ds.samples.astype(float), sa={'targets': [None] * len(ds)})
        #datasets = [Dataset(ds.samples, sa={'targets': [None] * len(ds)})
        #            for ds in datasets]

        if params.zscore_all:
            if __debug__:
                debug('HPAL', "Z-scoring all datasets")
            for ids in range(len(datasets)):
                zmapper = ZScoreMapper(chunks_attr=None)
                zmapper.train(datasets[ids])
                datasets[ids] = zmapper.forward(datasets[ids])

        if alpha < 1:
            datasets, wmappers = self._regularize(datasets, alpha)

        # initial common space is the reference dataset
        commonspace = datasets[ref_ds].samples
        # the reference dataset might have been zscored already, don't do it
        # twice
        if params.zscore_common and not params.zscore_all:
            if __debug__:
                debug('HPAL_',
                      "Creating copy of a commonspace and ensuring "
                      "it is of a floating type")
            commonspace = commonspace.astype(float)
            zscore(commonspace, chunks_attr=None)

        # create a mapper per dataset
        # might prefer some other way to initialize... later
        mappers = [deepcopy(params.alignment) for ds in datasets]

        #
        # Level 1 -- initial projection
        #
        lvl1_projdata = self._level1(datasets, commonspace, ref_ds, mappers,
                                     residuals)
        #
        # Level 2 -- might iterate multiple times
        #
        # this is the final common space
        self.commonspace = self._level2(datasets, lvl1_projdata, mappers,
                                        residuals)
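A compact sketch of the level-1/level-2 scheme above: align every dataset to a running common space, then iterate against the mean of the mapped data. Plain least-squares stands in for params.alignment, and the incremental level-1 update is simplified (illustrative only, not PyMVPA's implementation):

import numpy as np

def align(source, target):
    # least-squares W with source @ W ~= target
    return np.linalg.lstsq(source, target, rcond=None)[0]

def hyperalign_sketch(datasets, niter=3, ref=0):
    common = datasets[ref]
    mapped = [ds @ align(ds, common) for ds in datasets]  # level 1
    for _ in range(niter):                                # level 2
        common = np.mean(mapped, axis=0)
        mapped = [ds @ align(ds, common) for ds in datasets]
    return [align(ds, common) for ds in datasets], common

rng = np.random.default_rng(2)
base = rng.normal(size=(30, 6))
dss = [base @ np.linalg.qr(rng.normal(size=(6, 6)))[0] for _ in range(3)]
mappers, common = hyperalign_sketch(dss)
assert all(np.allclose(ds @ w, common) for ds, w in zip(dss, mappers))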
Example #9
def test_zscore():
    """Test z-scoring transformation
    """
    # dataset: mean=2, std=1
    samples = np.array((0, 1, 3, 4, 2, 2, 3, 1, 1, 3, 3, 1, 2, 2, 2, 2)).\
        reshape((16, 1))
    data = dataset_wizard(samples.copy(), targets=range(16), chunks=[0] * 16)
    assert_equal(data.samples.mean(), 2.0)
    assert_equal(data.samples.std(), 1.0)
    data_samples = data.samples.copy()
    zscore(data, chunks_attr='chunks')

    # copy should stay intact
    assert_equal(data_samples.mean(), 2.0)
    assert_equal(data_samples.std(), 1.0)
    # we should be able to operate on ndarrays,
    # but we can't change an array's dtype in place, can we?
    assert_raises(TypeError, zscore, data_samples, chunks_attr=None)
    # so let's do it manually
    data_samples = data_samples.astype(float)
    zscore(data_samples, chunks_attr=None)
    assert_array_equal(data.samples, data_samples)

    # check z-scoring
    check = np.array([-2, -1, 1, 2, 0, 0, 1, -1, -1, 1, 1, -1, 0, 0, 0, 0],
                     dtype='float64').reshape(16, 1)
    assert_array_equal(data.samples, check)

    data = dataset_wizard(samples.copy(), targets=range(16), chunks=[0] * 16)
    zscore(data, chunks_attr=None)
    assert_array_equal(data.samples, check)

    # check z-scoring taking set of labels as a baseline
    data = dataset_wizard(samples.copy(),
                          targets=[0, 2, 2, 2, 1] + [2] * 11,
                          chunks=[0] * 16)
    zscore(data, param_est=('targets', [0, 1]))
    assert_array_equal(samples, data.samples + 1.0)

    # check that zscore modifies in-place; only guaranteed if no upcasting is
    # necessary
    samples = samples.astype('float')
    data = dataset_wizard(samples,
                          targets=[0, 2, 2, 2, 1] + [2] * 11,
                          chunks=[0] * 16)
    zscore(data, param_est=('targets', [0, 1]))
    assert_array_equal(samples, data.samples)

    # verify that if param_est is set but chunks_attr is None
    # performs zscoring across entire dataset correctly
    data = data.copy()
    data_01 = data.select({'targets': [0, 1]})
    zscore(data_01, chunks_attr=None)
    zscore(data, chunks_attr=None, param_est=('targets', [0, 1]))
    assert_array_equal(data_01.samples, data.select({'targets': [0, 1]}))

    # these might be duplicating code above -- but twice is better than nothing

    # dataset: mean=2, std=1
    raw = np.array((0, 1, 3, 4, 2, 2, 3, 1, 1, 3, 3, 1, 2, 2, 2, 2))
    # dataset: mean=12, std=1
    raw2 = np.array((0, 1, 3, 4, 2, 2, 3, 1, 1, 3, 3, 1, 2, 2, 2, 2)) + 10
    # zscore target
    check = [-2, -1, 1, 2, 0, 0, 1, -1, -1, 1, 1, -1, 0, 0, 0, 0]

    ds = dataset_wizard(raw.copy(), targets=range(16), chunks=[0] * 16)
    pristine = dataset_wizard(raw.copy(), targets=range(16), chunks=[0] * 16)

    zm = ZScoreMapper()
    # should do global zscore by default
    zm.train(ds)                        # train
    assert_array_almost_equal(zm.forward(ds), np.transpose([check]))
    # should not modify the source
    assert_array_equal(pristine, ds)

    # if we tell it a different mean it should obey the order
    zm = ZScoreMapper(params=(3, 1))
    zm.train(ds)
    assert_array_almost_equal(zm.forward(ds), np.transpose([check]) - 1)
    assert_array_equal(pristine, ds)

    # let's look at chunk-wise z-scoring
    ds = dataset_wizard(np.hstack((raw.copy(), raw2.copy())),
                        targets=range(32),
                        chunks=[0] * 16 + [1] * 16)
    # by default chunk-wise
    zm = ZScoreMapper()
    zm.train(ds)                        # train
    assert_array_almost_equal(zm.forward(ds), np.transpose([check + check]))
    # we should be able to do that same manually
    zm = ZScoreMapper(params={0: (2, 1), 1: (12, 1)})
    zm.train(ds)                        # train
    assert_array_almost_equal(zm.forward(ds), np.transpose([check + check]))

    # And just a smoke test for warnings reporting whenever # of
    # samples per chunk is low.
    # on 1 sample per chunk
    zds1 = ZScoreMapper(chunks_attr='chunks', auto_train=True)(
        ds[[0, -1]])
    ok_(np.all(zds1.samples == 0))   # they all should be 0
    # on 2 samples per chunk
    zds2 = ZScoreMapper(chunks_attr='chunks', auto_train=True)(
        ds[[0, 1, -10, -1]])
    assert_array_equal(np.unique(zds2.samples),
                       [-1., 1])  # they all should be -1 or 1
    # on 3 samples per chunk -- different warning
    ZScoreMapper(chunks_attr='chunks', auto_train=True)(
        ds[[0, 1, 2, -3, -2, -1]])

    # test if std provided as a list not as an array is handled
    # properly -- should zscore all features (not just first/none
    # as it was before)
    ds = dataset_wizard(np.arange(32).reshape((8, -1)),
                        targets=range(8), chunks=[0] * 8)
    means = [0, 1, -10, 10]
    std0 = np.std(ds[:, 0])             # std deviation of first one
    stds = [std0, 10, .1, 1]

    zm = ZScoreMapper(params=(means, stds),
                      auto_train=True)
    dsz = zm(ds)

    assert_array_almost_equal(
        (np.mean(ds, axis=0) - np.asanyarray(means)) / np.array(stds),
        np.mean(dsz, axis=0))

    assert_array_almost_equal(np.std(ds, axis=0) / np.array(stds),
                              np.std(dsz, axis=0))
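For comparison, the chunk-wise behaviour exercised above can be written directly in NumPy (each chunk of rows gets its own mean and std; illustrative, not the ZScoreMapper implementation):

import numpy as np

def zscore_by_chunks(samples, chunks):
    # z-score each chunk of rows independently
    samples = np.asarray(samples, dtype=float).copy()
    for chunk in np.unique(chunks):
        rows = chunks == chunk
        block = samples[rows]
        samples[rows] = (block - block.mean(axis=0)) / block.std(axis=0)
    return samples

raw = np.array([0, 1, 3, 4, 2, 2, 3, 1, 1, 3, 3, 1, 2, 2, 2, 2], float)
data = np.hstack((raw, raw + 10)).reshape(-1, 1)  # two chunks: mean 2 and mean 12
z = zscore_by_chunks(data, np.array([0] * 16 + [1] * 16))
assert np.allclose(z[:16], z[16:])  # both chunks reduce to the same pattern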
Example #10
    def __call__(self, datasets):
        """Estimate mappers for each dataset

        Parameters
        ----------
        datasets : list or tuple of datasets

        Returns
        -------
        A list of trained Mappers of the same length as datasets
        """
        params = self.params            # for quicker access ;)
        ca = self.ca
        ndatasets = len(datasets)
        nfeatures = [ds.nfeatures for ds in datasets]

        residuals = None
        if ca['residual_errors'].enabled:
            residuals = np.zeros((2 + params.level2_niter, ndatasets))
            ca.residual_errors = Dataset(
                samples=residuals,
                sa={'levels':
                    ['1'] +
                    ['2:%i' % i for i in range(params.level2_niter)] +
                    ['3']})

        if __debug__:
            debug('HPAL', "Hyperalignment %s for %i datasets"
                  % (self, ndatasets))

        if params.ref_ds is None:
            ref_ds = np.argmax(nfeatures)
        else:
            ref_ds = params.ref_ds
            if ref_ds < 0 or ref_ds >= ndatasets:
                raise ValueError("Requested reference dataset %i is out of "
                                 "bounds. We have only %i datasets provided"
                                 % (ref_ds, ndatasets))
        ca.choosen_ref_ds = ref_ds
        # might prefer some other way to initialize... later
        mappers = [deepcopy(params.alignment) for ds in datasets]
        # zscore all data sets
        # ds = [ zscore(ds, chunks_attr=None) for ds in datasets]

        # Level 1 (first)

        # TODO since we are doing in-place zscoring create deep copies
        # of the datasets with pruned targets and shallow copies of
        # the collections (if they would come needed in the transformation)
        # TODO: handle floats and non-floats differently to prevent
        #       waste of memory if there is no need (e.g. no z-scoring)
        #otargets = [ds.sa.targets for ds in datasets]
        datasets = [ds.copy(deep=False) for ds in datasets]
        #datasets = [Dataset(ds.samples.astype(float), sa={'targets': [None] * len(ds)})
        #datasets = [Dataset(ds.samples, sa={'targets': [None] * len(ds)})
        #            for ds in datasets]

        if params.zscore_all:
            if __debug__:
                debug('HPAL', "Z-scoring all datasets")
            # zscore them once while storing corresponding ZScoreMapper's
            zmappers = []
            for ids in range(len(datasets)):
                zmapper = ZScoreMapper(chunks_attr=None)
                zmappers.append(zmapper)
                zmapper.train(datasets[ids])
                datasets[ids] = zmapper.forward(datasets[ids])

        commonspace = np.asanyarray(datasets[ref_ds])
        if params.zscore_common and not params.zscore_all:
            if __debug__:
                debug('HPAL_',
                      "Creating copy of a commonspace and ensuring "
                      "it is of a floating type")
            commonspace = commonspace.astype(float)
            zscore(commonspace, chunks_attr=None)

        data_mapped = [np.asanyarray(ds) for ds in datasets]
        #zscore(data_mapped[ref_ds],chunks_attr=None)
        for i, (m, ds_new) in enumerate(zip(mappers, datasets)):
            if __debug__:
                debug('HPAL_', "Level 1: ds #%i" % i)
            if i == ref_ds:
                continue
            #ds_new = ds.copy()
            #zscore(ds_new, chunks_attr=None);
            ds_new.targets = commonspace
            m.train(ds_new)
            ds_ = m.forward(np.asanyarray(ds_new))
            if params.zscore_common:
                zscore(ds_, chunks_attr=None)
            data_mapped[i] = ds_

            if residuals is not None:
                residuals[0, i] = np.linalg.norm(ds_ - commonspace)

            ## if ds_mapped == []:
            ##     ds_mapped = [zscore(m.forward(d), chunks_attr=None)]
            ## else:
            ##     ds_mapped += [zscore(m.forward(d), chunks_attr=None)]

            # zscore before adding
            # TODO: make just a function so we don't waste space
            commonspace = params.combiner1(data_mapped[i], commonspace)
            if params.zscore_common:
                zscore(commonspace, chunks_attr=None)

        # update commonspace to mean of ds_mapped
        commonspace = params.combiner2(data_mapped)
        #if params.zscore_common:
        #zscore(commonspace, chunks_attr=None)
        # Level 2 -- might iterate multiple times
        for loop in range(params.level2_niter):
            for i, (m, ds_new) in enumerate(zip(mappers, datasets)):
                if __debug__:
                    debug('HPAL_', "Level 2 (%i-th iteration): ds #%i" % (loop, i))

                ds_temp = (commonspace*ndatasets - data_mapped[i])/(ndatasets-1)
                if params.zscore_common:
                    zscore(ds_temp, chunks_attr=None)
                #ds_new = ds.copy()
                #zscore(ds_new, chunks_attr=None)
                ds_new.targets = ds_temp #commonspace #PRJ ds_temp
                m.train(ds_new) # ds_temp)
                ds_ = m.forward(np.asanyarray(ds_new))
                if params.zscore_common:
                    zscore(ds_, chunks_attr=None)
                data_mapped[i] = ds_
                if residuals is not None:
                    residuals[1+loop, i] = np.linalg.norm(ds_ - commonspace)

                #ds_mapped[i] = zscore( m.forward(ds_temp), chunks_attr=None)

            commonspace = params.combiner2(data_mapped)
            #if params.zscore_common:
                #zscore(commonspace, chunks_attr=None)

        # Level 3 (last) to params.levels
        for i, (m, ds_new) in enumerate(zip(mappers, datasets)):
            if __debug__:
                debug('HPAL_', "Level 3: ds #%i" % i)

            #ds_new = ds.copy()     # shallow copy so we could assign new labels
            #zscore(ds_new, chunks_attr=None)
            ds_temp = (commonspace*ndatasets - data_mapped[i])/(ndatasets-1)
            if params.zscore_common:
                zscore(ds_temp, chunks_attr=None)
            ds_new.targets = ds_temp #commonspace #PRJ ds_temp#
            m.train(ds_new) #ds_temp)
            data_mapped[i] = m.forward(np.asanyarray(ds_new))
            if residuals is not None:
                residuals[-1, i] = np.linalg.norm(data_mapped[i] - commonspace)

        if params.zscore_all:
            # We need to construct new mappers which would chain
            # zscore and then final transformation
            return [ChainMapper([zm, m]) for zm, m in zip(zmappers, mappers)]
        else:
            return mappers
Example #11
def test_zscore():
    """Test z-scoring transformation
    """
    # dataset: mean=2, std=1
    samples = np.array((0, 1, 3, 4, 2, 2, 3, 1, 1, 3, 3, 1, 2, 2, 2, 2)).\
        reshape((16, 1))
    data = dataset_wizard(samples.copy(), targets=range(16), chunks=[0] * 16)
    assert_equal(data.samples.mean(), 2.0)
    assert_equal(data.samples.std(), 1.0)
    data_samples = data.samples.copy()
    zscore(data, chunks_attr='chunks')

    # copy should stay intact
    assert_equal(data_samples.mean(), 2.0)
    assert_equal(data_samples.std(), 1.0)
    # we should be able to operate on ndarrays,
    # but we can't change an array's dtype in place, can we?
    assert_raises(TypeError, zscore, data_samples, chunks_attr=None)
    # so let's do it manually
    data_samples = data_samples.astype(float)
    zscore(data_samples, chunks_attr=None)
    assert_array_equal(data.samples, data_samples)

    # check z-scoring
    check = np.array([-2, -1, 1, 2, 0, 0, 1, -1, -1, 1, 1, -1, 0, 0, 0, 0],
                     dtype='float64').reshape(16, 1)
    assert_array_equal(data.samples, check)

    data = dataset_wizard(samples.copy(), targets=range(16), chunks=[0] * 16)
    zscore(data, chunks_attr=None)
    assert_array_equal(data.samples, check)

    # check z-scoring taking set of labels as a baseline
    data = dataset_wizard(samples.copy(),
                          targets=[0, 2, 2, 2, 1] + [2] * 11,
                          chunks=[0] * 16)
    zscore(data, param_est=('targets', [0, 1]))
    assert_array_equal(samples, data.samples + 1.0)

    # check that zscore modifies in-place; only guaranteed if no upcasting is
    # necessary
    samples = samples.astype('float')
    data = dataset_wizard(samples,
                          targets=[0, 2, 2, 2, 1] + [2] * 11,
                          chunks=[0] * 16)
    zscore(data, param_est=('targets', [0, 1]))
    assert_array_equal(samples, data.samples)

    # these might be duplicating code above -- but twice is better than nothing

    # dataset: mean=2, std=1
    raw = np.array((0, 1, 3, 4, 2, 2, 3, 1, 1, 3, 3, 1, 2, 2, 2, 2))
    # dataset: mean=12, std=1
    raw2 = np.array((0, 1, 3, 4, 2, 2, 3, 1, 1, 3, 3, 1, 2, 2, 2, 2)) + 10
    # zscore target
    check = [-2, -1, 1, 2, 0, 0, 1, -1, -1, 1, 1, -1, 0, 0, 0, 0]

    ds = dataset_wizard(raw.copy(), targets=range(16), chunks=[0] * 16)
    pristine = dataset_wizard(raw.copy(), targets=range(16), chunks=[0] * 16)

    zm = ZScoreMapper()
    # should do global zscore by default
    zm.train(ds)                        # train
    assert_array_almost_equal(zm.forward(ds), np.transpose([check]))
    # should not modify the source
    assert_array_equal(pristine, ds)

    # if we tell it a different mean it should obey the order
    zm = ZScoreMapper(params=(3, 1))
    zm.train(ds)
    assert_array_almost_equal(zm.forward(ds), np.transpose([check]) - 1)
    assert_array_equal(pristine, ds)

    # let's look at chunk-wise z-scoring
    ds = dataset_wizard(np.hstack((raw.copy(), raw2.copy())),
                        targets=range(32),
                        chunks=[0] * 16 + [1] * 16)
    # by default chunk-wise
    zm = ZScoreMapper()
    zm.train(ds)                        # train
    assert_array_almost_equal(zm.forward(ds), np.transpose([check + check]))
    # we should be able to do that same manually
    zm = ZScoreMapper(params={0: (2, 1), 1: (12, 1)})
    zm.train(ds)                        # train
    assert_array_almost_equal(zm.forward(ds), np.transpose([check + check]))
Example #12
def test_zscore():
    """Test z-scoring transformation
    """
    # dataset: mean=2, std=1
    samples = np.array((0, 1, 3, 4, 2, 2, 3, 1, 1, 3, 3, 1, 2, 2, 2, 2)).\
        reshape((16, 1))
    data = dataset_wizard(samples.copy(), targets=range(16), chunks=[0] * 16)
    assert_equal(data.samples.mean(), 2.0)
    assert_equal(data.samples.std(), 1.0)
    data_samples = data.samples.copy()
    zscore(data, chunks_attr='chunks')

    # copy should stay intact
    assert_equal(data_samples.mean(), 2.0)
    assert_equal(data_samples.std(), 1.0)
    # we should be able to operate on ndarrays,
    # but we can't change an array's dtype in place, can we?
    assert_raises(TypeError, zscore, data_samples, chunks_attr=None)
    # so let's do it manually
    data_samples = data_samples.astype(float)
    zscore(data_samples, chunks_attr=None)
    assert_array_equal(data.samples, data_samples)

    # check z-scoring
    check = np.array([-2, -1, 1, 2, 0, 0, 1, -1, -1, 1, 1, -1, 0, 0, 0, 0],
                     dtype='float64').reshape(16, 1)
    assert_array_equal(data.samples, check)

    data = dataset_wizard(samples.copy(), targets=range(16), chunks=[0] * 16)
    zscore(data, chunks_attr=None)
    assert_array_equal(data.samples, check)

    # check z-scoring taking set of labels as a baseline
    data = dataset_wizard(samples.copy(),
                          targets=[0, 2, 2, 2, 1] + [2] * 11,
                          chunks=[0] * 16)
    zscore(data, param_est=('targets', [0, 1]))
    assert_array_equal(samples, data.samples + 1.0)

    # check that zscore modifies in-place; only guaranteed if no upcasting is
    # necessary
    samples = samples.astype('float')
    data = dataset_wizard(samples,
                          targets=[0, 2, 2, 2, 1] + [2] * 11,
                          chunks=[0] * 16)
    zscore(data, param_est=('targets', [0, 1]))
    assert_array_equal(samples, data.samples)

    # these might be duplicating code above -- but twice is better than nothing

    # dataset: mean=2, std=1
    raw = np.array((0, 1, 3, 4, 2, 2, 3, 1, 1, 3, 3, 1, 2, 2, 2, 2))
    # dataset: mean=12, std=1
    raw2 = np.array((0, 1, 3, 4, 2, 2, 3, 1, 1, 3, 3, 1, 2, 2, 2, 2)) + 10
    # zscore target
    check = [-2, -1, 1, 2, 0, 0, 1, -1, -1, 1, 1, -1, 0, 0, 0, 0]

    ds = dataset_wizard(raw.copy(), targets=range(16), chunks=[0] * 16)
    pristine = dataset_wizard(raw.copy(), targets=range(16), chunks=[0] * 16)

    zm = ZScoreMapper()
    # should do global zscore by default
    zm.train(ds)  # train
    assert_array_almost_equal(zm.forward(ds), np.transpose([check]))
    # should not modify the source
    assert_array_equal(pristine, ds)

    # if we tell it a different mean it should obey the order
    zm = ZScoreMapper(params=(3, 1))
    zm.train(ds)
    assert_array_almost_equal(zm.forward(ds), np.transpose([check]) - 1)
    assert_array_equal(pristine, ds)

    # let's look at chunk-wise z-scoring
    ds = dataset_wizard(np.hstack((raw.copy(), raw2.copy())),
                        targets=range(32),
                        chunks=[0] * 16 + [1] * 16)
    # by default chunk-wise
    zm = ZScoreMapper()
    zm.train(ds)  # train
    assert_array_almost_equal(zm.forward(ds), np.transpose([check + check]))
    # we should be able to do that same manually
    zm = ZScoreMapper(params={0: (2, 1), 1: (12, 1)})
    zm.train(ds)  # train
    assert_array_almost_equal(zm.forward(ds), np.transpose([check + check]))

    # And just a smoke test for warnings reporting whenever # of
    # samples per chunk is low.
    # on 1 sample per chunk
    zds1 = ZScoreMapper(chunks_attr='chunks', auto_train=True)(ds[[0, -1]])
    ok_(np.all(zds1.samples == 0))  # they all should be 0
    # on 2 samples per chunk
    zds2 = ZScoreMapper(chunks_attr='chunks',
                        auto_train=True)(ds[[0, 1, -10, -1]])
    assert_array_equal(np.unique(zds2.samples),
                       [-1., 1])  # they all should be -1 or 1
    # on 3 samples per chunk -- different warning
    ZScoreMapper(chunks_attr='chunks',
                 auto_train=True)(ds[[0, 1, 2, -3, -2, -1]])

    # test if std provided as a list not as an array is handled
    # properly -- should zscore all features (not just first/none
    # as it was before)
    ds = dataset_wizard(np.arange(32).reshape((8, -1)),
                        targets=range(8),
                        chunks=[0] * 8)
    means = [0, 1, -10, 10]
    std0 = np.std(ds[:, 0])  # std deviation of first one
    stds = [std0, 10, .1, 1]

    zm = ZScoreMapper(params=(means, stds), auto_train=True)
    dsz = zm(ds)

    assert_array_almost_equal(
        (np.mean(ds, axis=0) - np.asanyarray(means)) / np.array(stds),
        np.mean(dsz, axis=0))

    assert_array_almost_equal(
        np.std(ds, axis=0) / np.array(stds), np.std(dsz, axis=0))
Example #13
def test_zscore_repr():
    # Just a basic sanity test... no proper comparison
    for m in (ZScoreMapper(chunks_attr=None),
              ZScoreMapper(params=(3, 1)),
              ZScoreMapper()):
        mr = eval(repr(m))
        ok_(isinstance(mr, ZScoreMapper))
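The eval(repr(m)) round-trip only works because the mapper's repr emits valid constructor syntax. The same pattern on a plain class (hypothetical, not PyMVPA code):

class Scaler:
    def __init__(self, factor=1.0):
        self.factor = factor

    def __repr__(self):
        # emit valid constructor syntax so eval(repr(obj)) rebuilds the object
        return 'Scaler(factor=%r)' % self.factor

s = eval(repr(Scaler(factor=2.5)))
assert isinstance(s, Scaler) and s.factor == 2.5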
Example #14
    def __init__(self, chunks_attr='chunks', param_est=None, **kwargs):
        self.node = ZScoreMapper(chunks_attr=chunks_attr, param_est=param_est)
        Transformer.__init__(self, name='feature_normalizer', **kwargs)