def test_mapper_vs_zscore():
    """Test by comparing to results of elderly z-score function
    """
    # data: 40 sample feature line in 20d space (40x20; samples x features)
    dss = [
        dataset_wizard(np.concatenate(
            [np.arange(40) for i in range(20)]).reshape(20, -1).T,
            targets=1, chunks=1),
        ] + datasets.values()

    for ds in dss:
        ds1 = deepcopy(ds)
        ds2 = deepcopy(ds)

        zsm = ZScoreMapper(chunks_attr=None)
        assert_raises(RuntimeError, zsm.forward, ds1.samples)
        idhashes = (idhash(ds1), idhash(ds1.samples))
        zsm.train(ds1)
        idhashes_train = (idhash(ds1), idhash(ds1.samples))
        assert_equal(idhashes, idhashes_train)

        # forward dataset
        ds1z_ds = zsm.forward(ds1)
        idhashes_forwardds = (idhash(ds1), idhash(ds1.samples))
        # must not modify samples in place!
        assert_equal(idhashes, idhashes_forwardds)

        # forward samples explicitly
        ds1z = zsm.forward(ds1.samples)
        idhashes_forward = (idhash(ds1), idhash(ds1.samples))
        assert_equal(idhashes, idhashes_forward)

        zscore(ds2, chunks_attr=None)
        assert_array_almost_equal(ds1z, ds2.samples)
        assert_array_equal(ds1.samples, ds.samples)
def __call__(self, datasets):
    """Derive a common feature space from a series of datasets.

    Parameters
    ----------
    datasets : sequence of datasets

    Returns
    -------
    A list of trained Mappers matching the number of input datasets.
    """
    if self.commonspace is None:
        self.train(datasets)
    else:
        # Check to make sure we get a list of datasets as input.
        if not isinstance(datasets, (list, tuple, np.ndarray)):
            raise TypeError("Input datasets should be a sequence "
                            "(of type list, tuple, or ndarray) of datasets.")

    # place datasets into a copy of the list since items
    # will be reassigned
    datasets = list(datasets)

    params = self.params            # for quicker access ;)
    alpha = params.alpha            # for letting me be lazy ;)
    if params.zscore_all:
        if __debug__:
            debug('HPAL', "Z-scoring all datasets")
        # zscore them once while storing corresponding ZScoreMapper's
        # so we can assemble a comprehensive mapper at the end
        # (together with procrustes)
        zmappers = []
        for ids in xrange(len(datasets)):
            zmapper = ZScoreMapper(chunks_attr=None)
            zmappers.append(zmapper)
            zmapper.train(datasets[ids])
            datasets[ids] = zmapper.forward(datasets[ids])

    if alpha < 1:
        datasets, wmappers = self._regularize(datasets, alpha)

    #
    # Level 3 -- final, from-scratch, alignment to final common space
    #
    mappers = self._level3(datasets)

    # return trained mappers for projection from all datasets into the
    # common space
    if params.zscore_all:
        # We need to construct new mappers which would chain
        # zscore and then final transformation
        if params.alpha < 1:
            mappers = [ChainMapper([zm, wm, m])
                       for zm, wm, m in zip(zmappers, wmappers, mappers)]
        else:
            mappers = [ChainMapper([zm, m])
                       for zm, m in zip(zmappers, mappers)]
    elif params.alpha < 1:
        mappers = [ChainMapper([wm, m])
                   for wm, m in zip(wmappers, mappers)]
    if params.output_dim is not None:
        mappers = [ChainMapper([m, self._svd_mapper]) for m in mappers]
    return mappers
class FeatureWiseNormalizer(Transformer):
    def __init__(self, chunks_attr='chunks', param_est=None, **kwargs):
        self.node = ZScoreMapper(chunks_attr=chunks_attr, param_est=param_est)
        Transformer.__init__(self, name='feature_normalizer', **kwargs)

    def transform(self, ds):
        logger.info('Dataset preprocessing: Normalization feature-wise...')
        self.node.train(ds)
        return self.node.forward(ds)
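# The wrapper above only delegates to ZScoreMapper's train/forward pair.  For
# reference, a minimal numpy-only sketch of what that pair computes in the
# chunks_attr=None case (global, feature-wise z-scoring); the names and the
# zero-std guard are illustrative assumptions, not the mapper's internals.
def _sketch_featurewise_zscore():
    import numpy as np
    X = np.arange(24, dtype=float).reshape(8, 3)   # 8 samples x 3 features
    mean = X.mean(axis=0)                          # "train": estimate parameters
    std = X.std(axis=0)
    std[std == 0] = 1.0                            # guard against constant features
    Xz = (X - mean) / std                          # "forward": apply, not in place
    assert np.allclose(Xz.mean(axis=0), 0.0)
    return Xz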
def __call__(self, datasets):
    """Derive a common feature space from a series of datasets.

    Parameters
    ----------
    datasets : sequence of datasets

    Returns
    -------
    A list of trained Mappers matching the number of input datasets.
    """
    if self.commonspace is None:
        self.train(datasets)

    # place datasets into a copy of the list since items
    # will be reassigned
    datasets = list(datasets)

    params = self.params            # for quicker access ;)
    alpha = params.alpha            # for letting me be lazy ;)
    if params.zscore_all:
        if __debug__:
            debug('HPAL', "Z-scoring all datasets")
        # zscore them once while storing corresponding ZScoreMapper's
        # so we can assemble a comprehensive mapper at the end
        # (together with procrustes)
        zmappers = []
        for ids in xrange(len(datasets)):
            zmapper = ZScoreMapper(chunks_attr=None)
            zmappers.append(zmapper)
            zmapper.train(datasets[ids])
            datasets[ids] = zmapper.forward(datasets[ids])

    if alpha < 1:
        datasets, wmappers = self._regularize(datasets, alpha)

    #
    # Level 3 -- final, from-scratch, alignment to final common space
    #
    mappers = self._level3(datasets)

    # return trained mappers for projection from all datasets into the
    # common space
    if params.zscore_all:
        # We need to construct new mappers which would chain
        # zscore and then final transformation
        if params.alpha < 1:
            return [ChainMapper([zm, wm, m])
                    for zm, wm, m in zip(zmappers, wmappers, mappers)]
        else:
            return [ChainMapper([zm, m])
                    for zm, m in zip(zmappers, mappers)]
    else:
        if params.alpha < 1:
            return [ChainMapper([wm, m])
                    for wm, m in zip(wmappers, mappers)]
        else:
            return mappers
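# Hedged usage sketch of the calling convention implemented above: train on a
# sequence of datasets, then call the instance to obtain one trained mapper
# per dataset and project each dataset into the common space.  The import
# paths and the synthetic data are assumptions (PyMVPA2-style layout), not
# something this snippet defines; adjust them to the actual package.
def _sketch_hyperalignment_usage():
    import numpy as np
    from mvpa2.datasets.base import dataset_wizard              # assumed location
    from mvpa2.algorithms.hyperalignment import Hyperalignment  # assumed location

    rng = np.random.RandomState(0)
    # three "subjects", 20 samples x 10 features each
    ds_list = [dataset_wizard(rng.normal(size=(20, 10)),
                              targets=range(20), chunks=[0] * 20)
               for _ in range(3)]

    ha = Hyperalignment()        # parameters left at their defaults
    mappers = ha(ds_list)        # trains if needed, returns one mapper per dataset
    # each mapper projects "its" dataset into the shared common space
    return [m.forward(ds) for m, ds in zip(mappers, ds_list)]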
def train(self, datasets):
    """Derive a common feature space from a series of datasets.

    Parameters
    ----------
    datasets : sequence of datasets

    Returns
    -------
    A list of trained Mappers matching the number of input datasets.
    """
    params = self.params            # for quicker access ;)
    ca = self.ca
    # Check to make sure we get a list of datasets as input.
    if not isinstance(datasets, (list, tuple, np.ndarray)):
        raise TypeError("Input datasets should be a sequence "
                        "(of type list, tuple, or ndarray) of datasets.")

    ndatasets = len(datasets)
    nfeatures = [ds.nfeatures for ds in datasets]
    alpha = params.alpha

    residuals = None
    if ca['training_residual_errors'].enabled:
        residuals = np.zeros((1 + params.level2_niter, ndatasets))
        ca.training_residual_errors = Dataset(
            samples=residuals,
            sa={'levels': ['1'] +
                          ['2:%i' % i for i in xrange(params.level2_niter)]})

    if __debug__:
        debug('HPAL', "Hyperalignment %s for %i datasets" % (self, ndatasets))

    if params.ref_ds is None:
        ref_ds = np.argmax(nfeatures)
    else:
        ref_ds = params.ref_ds
        # Making sure that ref_ds is within range.
        # Parameter() already checks for it being a non-negative integer
        if ref_ds >= ndatasets:
            raise ValueError("Requested reference dataset %i is out of "
                             "bounds. We have only %i datasets provided"
                             % (ref_ds, ndatasets))
    ca.chosen_ref_ds = ref_ds

    # zscore all data sets
    # ds = [ zscore(ds, chunks_attr=None) for ds in datasets]

    # TODO since we are doing in-place zscoring create deep copies
    # of the datasets with pruned targets and shallow copies of
    # the collections (if they would come needed in the transformation)
    # TODO: handle floats and non-floats differently to prevent
    #       waste of memory if there is no need (e.g. no z-scoring)
    #otargets = [ds.sa.targets for ds in datasets]
    datasets = [ds.copy(deep=False) for ds in datasets]
    #datasets = [Dataset(ds.samples.astype(float), sa={'targets': [None] * len(ds)})
    #datasets = [Dataset(ds.samples, sa={'targets': [None] * len(ds)})
    #            for ds in datasets]

    if params.zscore_all:
        if __debug__:
            debug('HPAL', "Z-scoring all datasets")
        for ids in xrange(len(datasets)):
            zmapper = ZScoreMapper(chunks_attr=None)
            zmapper.train(datasets[ids])
            datasets[ids] = zmapper.forward(datasets[ids])

    if alpha < 1:
        datasets, wmappers = self._regularize(datasets, alpha)

    # initial common space is the reference dataset
    commonspace = datasets[ref_ds].samples
    # the reference dataset might have been zscored already, don't do it
    # twice
    if params.zscore_common and not params.zscore_all:
        if __debug__:
            debug('HPAL_',
                  "Creating copy of a commonspace and assuring "
                  "it is of a floating type")
        commonspace = commonspace.astype(float)
        zscore(commonspace, chunks_attr=None)

    # If there is only one dataset in training phase, there is nothing to be
    # done -- just use that data as the common space
    if len(datasets) < 2:
        self.commonspace = commonspace
    else:
        # create a mapper per dataset
        # might prefer some other way to initialize... later
        mappers = [deepcopy(params.alignment) for ds in datasets]

        #
        # Level 1 -- initial projection
        #
        lvl1_projdata = self._level1(datasets, commonspace, ref_ds, mappers,
                                     residuals)
        #
        # Level 2 -- might iterate multiple times
        #
        # this is the final common space
        self.commonspace = self._level2(datasets, lvl1_projdata, mappers,
                                        residuals)
    if params.output_dim is not None:
        mappers = self._level3(datasets)
        self._svd_mapper = SVDMapper()
        self._svd_mapper.train(self._map_and_mean(datasets, mappers))
        self._svd_mapper = StaticProjectionMapper(
            proj=self._svd_mapper.proj[:, :params.output_dim])
def train(self, datasets):
    """Derive a common feature space from a series of datasets.

    Parameters
    ----------
    datasets : sequence of datasets

    Returns
    -------
    A list of trained Mappers matching the number of input datasets.
    """
    params = self.params            # for quicker access ;)
    ca = self.ca
    ndatasets = len(datasets)
    nfeatures = [ds.nfeatures for ds in datasets]
    alpha = params.alpha

    residuals = None
    if ca['training_residual_errors'].enabled:
        residuals = np.zeros((1 + params.level2_niter, ndatasets))
        ca.training_residual_errors = Dataset(
            samples=residuals,
            sa={'levels': ['1'] +
                          ['2:%i' % i for i in xrange(params.level2_niter)]})

    if __debug__:
        debug('HPAL', "Hyperalignment %s for %i datasets" % (self, ndatasets))

    if params.ref_ds is None:
        ref_ds = np.argmax(nfeatures)
    else:
        ref_ds = params.ref_ds
    # a requested reference dataset must exist among the provided ones
    # (the original check used `and`, which could never trigger)
    if ref_ds < 0 or ref_ds >= ndatasets:
        raise ValueError("Requested reference dataset %i is out of "
                         "bounds. We have only %i datasets provided"
                         % (ref_ds, ndatasets))
    ca.choosen_ref_ds = ref_ds

    # zscore all data sets
    # ds = [ zscore(ds, chunks_attr=None) for ds in datasets]

    # TODO since we are doing in-place zscoring create deep copies
    # of the datasets with pruned targets and shallow copies of
    # the collections (if they would come needed in the transformation)
    # TODO: handle floats and non-floats differently to prevent
    #       waste of memory if there is no need (e.g. no z-scoring)
    #otargets = [ds.sa.targets for ds in datasets]
    datasets = [ds.copy(deep=False) for ds in datasets]
    #datasets = [Dataset(ds.samples.astype(float), sa={'targets': [None] * len(ds)})
    #datasets = [Dataset(ds.samples, sa={'targets': [None] * len(ds)})
    #            for ds in datasets]

    if params.zscore_all:
        if __debug__:
            debug('HPAL', "Z-scoring all datasets")
        for ids in xrange(len(datasets)):
            zmapper = ZScoreMapper(chunks_attr=None)
            zmapper.train(datasets[ids])
            datasets[ids] = zmapper.forward(datasets[ids])

    if alpha < 1:
        datasets, wmappers = self._regularize(datasets, alpha)

    # initial common space is the reference dataset
    commonspace = datasets[ref_ds].samples
    # the reference dataset might have been zscored already, don't do it
    # twice
    if params.zscore_common and not params.zscore_all:
        if __debug__:
            debug('HPAL_',
                  "Creating copy of a commonspace and assuring "
                  "it is of a floating type")
        commonspace = commonspace.astype(float)
        zscore(commonspace, chunks_attr=None)

    # create a mapper per dataset
    # might prefer some other way to initialize... later
    mappers = [deepcopy(params.alignment) for ds in datasets]

    #
    # Level 1 -- initial projection
    #
    lvl1_projdata = self._level1(datasets, commonspace, ref_ds, mappers,
                                 residuals)
    #
    # Level 2 -- might iterate multiple times
    #
    # this is the final common space
    self.commonspace = self._level2(datasets, lvl1_projdata, mappers,
                                    residuals)
def test_zscore():
    """Test z-scoring transformation
    """
    # dataset: mean=2, std=1
    samples = np.array((0, 1, 3, 4, 2, 2, 3, 1, 1, 3, 3, 1, 2, 2, 2, 2)).\
        reshape((16, 1))
    data = dataset_wizard(samples.copy(),
                          targets=range(16), chunks=[0] * 16)
    assert_equal(data.samples.mean(), 2.0)
    assert_equal(data.samples.std(), 1.0)
    data_samples = data.samples.copy()
    zscore(data, chunks_attr='chunks')

    # copy should stay intact
    assert_equal(data_samples.mean(), 2.0)
    assert_equal(data_samples.std(), 1.0)

    # we should be able to operate on ndarrays
    # But we can't change type inplace for an array, can't we?
    assert_raises(TypeError, zscore, data_samples, chunks_attr=None)
    # so lets do manually
    data_samples = data_samples.astype(float)
    zscore(data_samples, chunks_attr=None)
    assert_array_equal(data.samples, data_samples)

    # check z-scoring
    check = np.array([-2, -1, 1, 2, 0, 0, 1, -1, -1, 1, 1, -1, 0, 0, 0, 0],
                     dtype='float64').reshape(16, 1)
    assert_array_equal(data.samples, check)

    data = dataset_wizard(samples.copy(),
                          targets=range(16), chunks=[0] * 16)
    zscore(data, chunks_attr=None)
    assert_array_equal(data.samples, check)

    # check z-scoring taking set of labels as a baseline
    data = dataset_wizard(samples.copy(),
                          targets=[0, 2, 2, 2, 1] + [2] * 11,
                          chunks=[0] * 16)
    zscore(data, param_est=('targets', [0, 1]))
    assert_array_equal(samples, data.samples + 1.0)

    # check that zscore modifies in-place; only guaranteed if no upcasting is
    # necessary
    samples = samples.astype('float')
    data = dataset_wizard(samples,
                          targets=[0, 2, 2, 2, 1] + [2] * 11,
                          chunks=[0] * 16)
    zscore(data, param_est=('targets', [0, 1]))
    assert_array_equal(samples, data.samples)

    # verify that if param_est is set but chunks_attr is None
    # performs zscoring across entire dataset correctly
    data = data.copy()
    data_01 = data.select({'targets': [0, 1]})
    zscore(data_01, chunks_attr=None)
    zscore(data, chunks_attr=None, param_est=('targets', [0, 1]))
    assert_array_equal(data_01.samples, data.select({'targets': [0, 1]}))

    # these might be duplicating code above -- but twice is better than nothing

    # dataset: mean=2, std=1
    raw = np.array((0, 1, 3, 4, 2, 2, 3, 1, 1, 3, 3, 1, 2, 2, 2, 2))
    # dataset: mean=12, std=1
    raw2 = np.array((0, 1, 3, 4, 2, 2, 3, 1, 1, 3, 3, 1, 2, 2, 2, 2)) + 10
    # zscore target
    check = [-2, -1, 1, 2, 0, 0, 1, -1, -1, 1, 1, -1, 0, 0, 0, 0]

    ds = dataset_wizard(raw.copy(), targets=range(16), chunks=[0] * 16)
    pristine = dataset_wizard(raw.copy(), targets=range(16), chunks=[0] * 16)

    zm = ZScoreMapper()
    # should do global zscore by default
    zm.train(ds)                        # train
    assert_array_almost_equal(zm.forward(ds), np.transpose([check]))
    # should not modify the source
    assert_array_equal(pristine, ds)

    # if we tell it a different mean it should obey the order
    zm = ZScoreMapper(params=(3, 1))
    zm.train(ds)
    assert_array_almost_equal(zm.forward(ds), np.transpose([check]) - 1)
    assert_array_equal(pristine, ds)

    # let's look at chunk-wise z-scoring
    ds = dataset_wizard(np.hstack((raw.copy(), raw2.copy())),
                        targets=range(32),
                        chunks=[0] * 16 + [1] * 16)
    # by default chunk-wise
    zm = ZScoreMapper()
    zm.train(ds)                        # train
    assert_array_almost_equal(zm.forward(ds), np.transpose([check + check]))
    # we should be able to do that same manually
    zm = ZScoreMapper(params={0: (2, 1), 1: (12, 1)})
    zm.train(ds)                        # train
    assert_array_almost_equal(zm.forward(ds), np.transpose([check + check]))

    # And just a smoke test for warnings reporting whenever # of
    # samples per chunk is low.
    # on 1 sample per chunk
    zds1 = ZScoreMapper(chunks_attr='chunks', auto_train=True)(
        ds[[0, -1]])
    ok_(np.all(zds1.samples == 0))      # they all should be 0
    # on 2 samples per chunk
    zds2 = ZScoreMapper(chunks_attr='chunks', auto_train=True)(
        ds[[0, 1, -10, -1]])
    assert_array_equal(np.unique(zds2.samples), [-1., 1])  # they all should be -1 or 1
    # on 3 samples per chunk -- different warning
    ZScoreMapper(chunks_attr='chunks', auto_train=True)(
        ds[[0, 1, 2, -3, -2, -1]])

    # test if std provided as a list not as an array is handled
    # properly -- should zscore all features (not just first/none
    # as it was before)
    ds = dataset_wizard(np.arange(32).reshape((8, -1)),
                        targets=range(8), chunks=[0] * 8)
    means = [0, 1, -10, 10]
    std0 = np.std(ds[:, 0])             # std deviation of first one
    stds = [std0, 10, .1, 1]

    zm = ZScoreMapper(params=(means, stds), auto_train=True)
    dsz = zm(ds)

    assert_array_almost_equal(
        (np.mean(ds, axis=0) - np.asanyarray(means)) / np.array(stds),
        np.mean(dsz, axis=0))
    assert_array_almost_equal(
        np.std(ds, axis=0) / np.array(stds),
        np.std(dsz, axis=0))
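# The param_est=('targets', [0, 1]) branch exercised above estimates mean and
# std only from the samples whose targets are 0 or 1 and then applies those
# parameters to every sample.  A numpy-only sketch of that arithmetic
# (illustrative names, not the zscore() implementation itself):
def _sketch_param_est_zscore():
    import numpy as np
    samples = np.array((0, 1, 3, 4, 2, 2, 3, 1, 1, 3, 3, 1, 2, 2, 2, 2),
                       dtype=float).reshape(16, 1)
    targets = np.array([0, 2, 2, 2, 1] + [2] * 11)
    # estimate parameters from the baseline samples only (targets 0 and 1)
    baseline = samples[np.in1d(targets, [0, 1])]
    mean, std = baseline.mean(axis=0), baseline.std(axis=0)
    # ... and apply them to the whole dataset; with this data mean == 1 and
    # std == 1, so the result is samples - 1, matching the assertion above
    return (samples - mean) / std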
def __call__(self, datasets):
    """Estimate mappers for each dataset

    Parameters
    ----------
    datasets : list or tuple of datasets

    Returns
    -------
    A list of trained Mappers of the same length as datasets
    """
    params = self.params            # for quicker access ;)
    ca = self.ca
    ndatasets = len(datasets)
    nfeatures = [ds.nfeatures for ds in datasets]

    residuals = None
    if ca['residual_errors'].enabled:
        residuals = np.zeros((2 + params.level2_niter, ndatasets))
        ca.residual_errors = Dataset(
            samples=residuals,
            sa={'levels': ['1'] +
                          ['2:%i' % i for i in xrange(params.level2_niter)] +
                          ['3']})

    if __debug__:
        debug('HPAL', "Hyperalignment %s for %i datasets" % (self, ndatasets))

    if params.ref_ds is None:
        ref_ds = np.argmax(nfeatures)
    else:
        ref_ds = params.ref_ds
    # a requested reference dataset must exist among the provided ones
    # (the original check used `and`, which could never trigger)
    if ref_ds < 0 or ref_ds >= ndatasets:
        raise ValueError("Requested reference dataset %i is out of "
                         "bounds. We have only %i datasets provided"
                         % (ref_ds, ndatasets))
    ca.choosen_ref_ds = ref_ds
    # might prefer some other way to initialize... later
    mappers = [deepcopy(params.alignment) for ds in datasets]
    # zscore all data sets
    # ds = [ zscore(ds, chunks_attr=None) for ds in datasets]

    # Level 1 (first)

    # TODO since we are doing in-place zscoring create deep copies
    # of the datasets with pruned targets and shallow copies of
    # the collections (if they would come needed in the transformation)
    # TODO: handle floats and non-floats differently to prevent
    #       waste of memory if there is no need (e.g. no z-scoring)
    #otargets = [ds.sa.targets for ds in datasets]
    datasets = [ds.copy(deep=False) for ds in datasets]
    #datasets = [Dataset(ds.samples.astype(float), sa={'targets': [None] * len(ds)})
    #datasets = [Dataset(ds.samples, sa={'targets': [None] * len(ds)})
    #            for ds in datasets]

    if params.zscore_all:
        if __debug__:
            debug('HPAL', "Z-scoring all datasets")
        # zscore them once while storing corresponding ZScoreMapper's
        zmappers = []
        for ids in xrange(len(datasets)):
            zmapper = ZScoreMapper(chunks_attr=None)
            zmappers.append(zmapper)
            zmapper.train(datasets[ids])
            datasets[ids] = zmapper.forward(datasets[ids])

    commonspace = np.asanyarray(datasets[ref_ds])
    if params.zscore_common and not params.zscore_all:
        if __debug__:
            debug('HPAL_',
                  "Creating copy of a commonspace and assuring "
                  "it is of a floating type")
        commonspace = commonspace.astype(float)
        zscore(commonspace, chunks_attr=None)

    data_mapped = [np.asanyarray(ds) for ds in datasets]
    #zscore(data_mapped[ref_ds], chunks_attr=None)
    for i, (m, ds_new) in enumerate(zip(mappers, datasets)):
        if __debug__:
            debug('HPAL_', "Level 1: ds #%i" % i)
        if i == ref_ds:
            continue
        #ds_new = ds.copy()
        #zscore(ds_new, chunks_attr=None)
        ds_new.targets = commonspace
        m.train(ds_new)
        ds_ = m.forward(np.asanyarray(ds_new))
        if params.zscore_common:
            zscore(ds_, chunks_attr=None)
        data_mapped[i] = ds_

        if residuals is not None:
            residuals[0, i] = np.linalg.norm(ds_ - commonspace)

        ## if ds_mapped == []:
        ##     ds_mapped = [zscore(m.forward(d), chunks_attr=None)]
        ## else:
        ##     ds_mapped += [zscore(m.forward(d), chunks_attr=None)]

        # zscore before adding
        # TODO: make just a function so we don't waste space
        commonspace = params.combiner1(data_mapped[i], commonspace)
        if params.zscore_common:
            zscore(commonspace, chunks_attr=None)

    # update commonspace to mean of ds_mapped
    commonspace = params.combiner2(data_mapped)
    #if params.zscore_common:
    #    zscore(commonspace, chunks_attr=None)

    # Level 2 -- might iterate multiple times
    for loop in xrange(params.level2_niter):
        for i, (m, ds_new) in enumerate(zip(mappers, datasets)):
            if __debug__:
                debug('HPAL_', "Level 2 (%i-th iteration): ds #%i"
                               % (loop, i))

            ds_temp = (commonspace * ndatasets - data_mapped[i]) \
                      / (ndatasets - 1)
            if params.zscore_common:
                zscore(ds_temp, chunks_attr=None)
            #ds_new = ds.copy()
            #zscore(ds_new, chunks_attr=None)
            ds_new.targets = ds_temp    # commonspace  #PRJ ds_temp
            m.train(ds_new)             # ds_temp)
            ds_ = m.forward(np.asanyarray(ds_new))
            if params.zscore_common:
                zscore(ds_, chunks_attr=None)
            data_mapped[i] = ds_
            if residuals is not None:
                residuals[1 + loop, i] = np.linalg.norm(ds_ - commonspace)

            #ds_mapped[i] = zscore( m.forward(ds_temp), chunks_attr=None)

        commonspace = params.combiner2(data_mapped)
        #if params.zscore_common:
        #    zscore(commonspace, chunks_attr=None)

    # Level 3 (last) to params.levels
    for i, (m, ds_new) in enumerate(zip(mappers, datasets)):
        if __debug__:
            debug('HPAL_', "Level 3: ds #%i" % i)
        #ds_new = ds.copy()     # shallow copy so we could assign new labels
        #zscore(ds_new, chunks_attr=None)
        ds_temp = (commonspace * ndatasets - data_mapped[i]) \
                  / (ndatasets - 1)
        if params.zscore_common:
            zscore(ds_temp, chunks_attr=None)
        ds_new.targets = ds_temp        # commonspace  #PRJ ds_temp
        m.train(ds_new)                 # ds_temp)
        data_mapped[i] = m.forward(np.asanyarray(ds_new))
        if residuals is not None:
            residuals[-1, i] = np.linalg.norm(data_mapped[i] - commonspace)

    if params.zscore_all:
        # We need to construct new mappers which would chain
        # zscore and then final transformation
        return [ChainMapper([zm, m]) for zm, m in zip(zmappers, mappers)]
    else:
        return mappers
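# The ds_temp update used in levels 2 and 3 above is a leave-one-out mean:
# assuming combiner2 is an element-wise mean across datasets (its usual
# default), (commonspace * N - data_mapped[i]) / (N - 1) equals the mean of
# all mapped datasets except the i-th.  A small numpy check of that identity
# (names are illustrative, not part of the class above):
def _sketch_leave_one_out_commonspace():
    import numpy as np
    rng = np.random.RandomState(0)
    data_mapped = [rng.normal(size=(5, 4)) for _ in range(3)]
    ndatasets = len(data_mapped)
    commonspace = np.mean(data_mapped, axis=0)      # mean over datasets
    i = 1
    ds_temp = (commonspace * ndatasets - data_mapped[i]) / (ndatasets - 1)
    # identical to averaging the other datasets directly
    others = np.mean([d for j, d in enumerate(data_mapped) if j != i], axis=0)
    assert np.allclose(ds_temp, others)
    return ds_temp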
def test_zscore():
    """Test z-scoring transformation
    """
    # dataset: mean=2, std=1
    samples = np.array((0, 1, 3, 4, 2, 2, 3, 1, 1, 3, 3, 1, 2, 2, 2, 2)).\
        reshape((16, 1))
    data = dataset_wizard(samples.copy(),
                          targets=range(16), chunks=[0] * 16)
    assert_equal(data.samples.mean(), 2.0)
    assert_equal(data.samples.std(), 1.0)
    data_samples = data.samples.copy()
    zscore(data, chunks_attr='chunks')

    # copy should stay intact
    assert_equal(data_samples.mean(), 2.0)
    assert_equal(data_samples.std(), 1.0)

    # we should be able to operate on ndarrays
    # But we can't change type inplace for an array, can't we?
    assert_raises(TypeError, zscore, data_samples, chunks_attr=None)
    # so lets do manually
    data_samples = data_samples.astype(float)
    zscore(data_samples, chunks_attr=None)
    assert_array_equal(data.samples, data_samples)

    # check z-scoring
    check = np.array([-2, -1, 1, 2, 0, 0, 1, -1, -1, 1, 1, -1, 0, 0, 0, 0],
                     dtype='float64').reshape(16, 1)
    assert_array_equal(data.samples, check)

    data = dataset_wizard(samples.copy(),
                          targets=range(16), chunks=[0] * 16)
    zscore(data, chunks_attr=None)
    assert_array_equal(data.samples, check)

    # check z-scoring taking set of labels as a baseline
    data = dataset_wizard(samples.copy(),
                          targets=[0, 2, 2, 2, 1] + [2] * 11,
                          chunks=[0] * 16)
    zscore(data, param_est=('targets', [0, 1]))
    assert_array_equal(samples, data.samples + 1.0)

    # check that zscore modifies in-place; only guaranteed if no upcasting is
    # necessary
    samples = samples.astype('float')
    data = dataset_wizard(samples,
                          targets=[0, 2, 2, 2, 1] + [2] * 11,
                          chunks=[0] * 16)
    zscore(data, param_est=('targets', [0, 1]))
    assert_array_equal(samples, data.samples)

    # these might be duplicating code above -- but twice is better than nothing

    # dataset: mean=2, std=1
    raw = np.array((0, 1, 3, 4, 2, 2, 3, 1, 1, 3, 3, 1, 2, 2, 2, 2))
    # dataset: mean=12, std=1
    raw2 = np.array((0, 1, 3, 4, 2, 2, 3, 1, 1, 3, 3, 1, 2, 2, 2, 2)) + 10
    # zscore target
    check = [-2, -1, 1, 2, 0, 0, 1, -1, -1, 1, 1, -1, 0, 0, 0, 0]

    ds = dataset_wizard(raw.copy(), targets=range(16), chunks=[0] * 16)
    pristine = dataset_wizard(raw.copy(), targets=range(16), chunks=[0] * 16)

    zm = ZScoreMapper()
    # should do global zscore by default
    zm.train(ds)                        # train
    assert_array_almost_equal(zm.forward(ds), np.transpose([check]))
    # should not modify the source
    assert_array_equal(pristine, ds)

    # if we tell it a different mean it should obey the order
    zm = ZScoreMapper(params=(3, 1))
    zm.train(ds)
    assert_array_almost_equal(zm.forward(ds), np.transpose([check]) - 1)
    assert_array_equal(pristine, ds)

    # let's look at chunk-wise z-scoring
    ds = dataset_wizard(np.hstack((raw.copy(), raw2.copy())),
                        targets=range(32),
                        chunks=[0] * 16 + [1] * 16)
    # by default chunk-wise
    zm = ZScoreMapper()
    zm.train(ds)                        # train
    assert_array_almost_equal(zm.forward(ds), np.transpose([check + check]))
    # we should be able to do that same manually
    zm = ZScoreMapper(params={0: (2, 1), 1: (12, 1)})
    zm.train(ds)                        # train
    assert_array_almost_equal(zm.forward(ds), np.transpose([check + check]))
def test_zscore():
    """Test z-scoring transformation
    """
    # dataset: mean=2, std=1
    samples = np.array((0, 1, 3, 4, 2, 2, 3, 1, 1, 3, 3, 1, 2, 2, 2, 2)).\
        reshape((16, 1))
    data = dataset_wizard(samples.copy(),
                          targets=range(16), chunks=[0] * 16)
    assert_equal(data.samples.mean(), 2.0)
    assert_equal(data.samples.std(), 1.0)
    data_samples = data.samples.copy()
    zscore(data, chunks_attr='chunks')

    # copy should stay intact
    assert_equal(data_samples.mean(), 2.0)
    assert_equal(data_samples.std(), 1.0)

    # we should be able to operate on ndarrays
    # But we can't change type inplace for an array, can't we?
    assert_raises(TypeError, zscore, data_samples, chunks_attr=None)
    # so lets do manually
    data_samples = data_samples.astype(float)
    zscore(data_samples, chunks_attr=None)
    assert_array_equal(data.samples, data_samples)

    # check z-scoring
    check = np.array([-2, -1, 1, 2, 0, 0, 1, -1, -1, 1, 1, -1, 0, 0, 0, 0],
                     dtype='float64').reshape(16, 1)
    assert_array_equal(data.samples, check)

    data = dataset_wizard(samples.copy(),
                          targets=range(16), chunks=[0] * 16)
    zscore(data, chunks_attr=None)
    assert_array_equal(data.samples, check)

    # check z-scoring taking set of labels as a baseline
    data = dataset_wizard(samples.copy(),
                          targets=[0, 2, 2, 2, 1] + [2] * 11,
                          chunks=[0] * 16)
    zscore(data, param_est=('targets', [0, 1]))
    assert_array_equal(samples, data.samples + 1.0)

    # check that zscore modifies in-place; only guaranteed if no upcasting is
    # necessary
    samples = samples.astype('float')
    data = dataset_wizard(samples,
                          targets=[0, 2, 2, 2, 1] + [2] * 11,
                          chunks=[0] * 16)
    zscore(data, param_est=('targets', [0, 1]))
    assert_array_equal(samples, data.samples)

    # these might be duplicating code above -- but twice is better than nothing

    # dataset: mean=2, std=1
    raw = np.array((0, 1, 3, 4, 2, 2, 3, 1, 1, 3, 3, 1, 2, 2, 2, 2))
    # dataset: mean=12, std=1
    raw2 = np.array((0, 1, 3, 4, 2, 2, 3, 1, 1, 3, 3, 1, 2, 2, 2, 2)) + 10
    # zscore target
    check = [-2, -1, 1, 2, 0, 0, 1, -1, -1, 1, 1, -1, 0, 0, 0, 0]

    ds = dataset_wizard(raw.copy(), targets=range(16), chunks=[0] * 16)
    pristine = dataset_wizard(raw.copy(), targets=range(16), chunks=[0] * 16)

    zm = ZScoreMapper()
    # should do global zscore by default
    zm.train(ds)                        # train
    assert_array_almost_equal(zm.forward(ds), np.transpose([check]))
    # should not modify the source
    assert_array_equal(pristine, ds)

    # if we tell it a different mean it should obey the order
    zm = ZScoreMapper(params=(3, 1))
    zm.train(ds)
    assert_array_almost_equal(zm.forward(ds), np.transpose([check]) - 1)
    assert_array_equal(pristine, ds)

    # let's look at chunk-wise z-scoring
    ds = dataset_wizard(np.hstack((raw.copy(), raw2.copy())),
                        targets=range(32),
                        chunks=[0] * 16 + [1] * 16)
    # by default chunk-wise
    zm = ZScoreMapper()
    zm.train(ds)                        # train
    assert_array_almost_equal(zm.forward(ds), np.transpose([check + check]))
    # we should be able to do that same manually
    zm = ZScoreMapper(params={0: (2, 1), 1: (12, 1)})
    zm.train(ds)                        # train
    assert_array_almost_equal(zm.forward(ds), np.transpose([check + check]))

    # And just a smoke test for warnings reporting whenever # of
    # samples per chunk is low.
    # on 1 sample per chunk
    zds1 = ZScoreMapper(chunks_attr='chunks', auto_train=True)(ds[[0, -1]])
    ok_(np.all(zds1.samples == 0))      # they all should be 0
    # on 2 samples per chunk
    zds2 = ZScoreMapper(chunks_attr='chunks', auto_train=True)(ds[[0, 1, -10, -1]])
    assert_array_equal(np.unique(zds2.samples), [-1., 1])  # they all should be -1 or 1
    # on 3 samples per chunk -- different warning
    ZScoreMapper(chunks_attr='chunks', auto_train=True)(ds[[0, 1, 2, -3, -2, -1]])

    # test if std provided as a list not as an array is handled
    # properly -- should zscore all features (not just first/none
    # as it was before)
    ds = dataset_wizard(np.arange(32).reshape((8, -1)),
                        targets=range(8), chunks=[0] * 8)
    means = [0, 1, -10, 10]
    std0 = np.std(ds[:, 0])             # std deviation of first one
    stds = [std0, 10, .1, 1]

    zm = ZScoreMapper(params=(means, stds), auto_train=True)
    dsz = zm(ds)

    assert_array_almost_equal(
        (np.mean(ds, axis=0) - np.asanyarray(means)) / np.array(stds),
        np.mean(dsz, axis=0))
    assert_array_almost_equal(
        np.std(ds, axis=0) / np.array(stds),
        np.std(dsz, axis=0))
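# The final assertions above pin down what a fixed params=(means, stds) does:
# each feature column is shifted by its entry of `means` and divided by its
# entry of `stds`, with nothing estimated from the data.  A numpy-only sketch
# of that arithmetic (illustrative names, not the mapper's internals):
def _sketch_fixed_params_zscore():
    import numpy as np
    X = np.arange(32, dtype=float).reshape(8, -1)    # same shape as the test data
    means = np.asanyarray([0, 1, -10, 10], dtype=float)
    stds = np.asanyarray([np.std(X[:, 0]), 10, .1, 1], dtype=float)
    Xz = (X - means) / stds                          # broadcast over the 4 features
    # matches the relations asserted in the test above
    assert np.allclose(Xz.mean(axis=0), (X.mean(axis=0) - means) / stds)
    assert np.allclose(Xz.std(axis=0), X.std(axis=0) / stds)
    return Xz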