def test_balancer(): ds = give_data() # only mark the selection in an attribute bal = Balancer() res = bal(ds) # we get a new dataset, with shared samples assert_false(ds is res) assert_true(ds.samples is res.samples.base) # should kick out 2 samples in each chunk of 10 assert_almost_equal(np.mean(res.sa.balanced_set), 0.8) # same as above, but actually apply the selection bal = Balancer(apply_selection=True, count=5) # just run it once res = bal(ds) # we get a new dataset, with shared samples assert_false(ds is res) # should kick out 2 samples in each chunk of 10 assert_equal(len(res), int(0.8 * len(ds))) # now use it as a generator dses = list(bal.generate(ds)) assert_equal(len(dses), 5) # with limit bal = Balancer(limit={'chunks': 3}, apply_selection=True) res = bal(ds) assert_equal(res.sa['chunks'].unique, (3,)) assert_equal(get_nelements_per_value(res.sa.targets).values(), [2] * 4) # fixed amount bal = Balancer(amount=1, limit={'chunks': 3}, apply_selection=True) res = bal(ds) assert_equal(get_nelements_per_value(res.sa.targets).values(), [1] * 4) # fraction bal = Balancer(amount=0.499, limit=None, apply_selection=True) res = bal(ds) assert_array_equal( np.round(np.array(get_nelements_per_value(ds.sa.targets).values()) * 0.5), np.array(get_nelements_per_value(res.sa.targets).values())) # check on feature attribute ds.fa['one'] = np.tile([1,2], 5) ds.fa['chk'] = np.repeat([1,2], 5) bal = Balancer(attr='one', amount=2, limit='chk', apply_selection=True) res = bal(ds) assert_equal(get_nelements_per_value(res.fa.one).values(), [4] * 2)
def get_nsamples_per_attr(dataset, attr):
    """Count dataset samples for each unique value of a sample attribute.

    Parameters
    ----------
    attr : str
      Name of the sample attribute

    Returns
    -------
    dict with the number of samples (value) per unique attribute (key).
    """
    # delegate counting to the generic helper on the attribute's values
    attr_values = dataset.sa[attr]
    return get_nelements_per_value(attr_values)
def _call(self, ds):
    """Compute (and optionally apply) a balanced selection over *ds*.

    For each "chunk" defined by the limit filter, an equal (or requested)
    number of elements per unique value of the balancing attribute is
    drawn at random.  Depending on ``apply_selection`` either the selected
    subset of the dataset is returned, or a shallow copy with a boolean
    selection attribute added to the appropriate collection.

    Parameters
    ----------
    ds : Dataset
      Input dataset; the balancing attribute is looked up via
      ``ds.get_attr`` in either the sample or feature collection.

    Returns
    -------
    Dataset
      New dataset (selection applied), or shallow copy with a boolean
      attribute named after this node's space marking the balanced set.

    Raises
    ------
    ValueError
      If the configured ``amount`` is neither 'equal', a float fraction,
      nor an integer count.
    """
    # local binding
    amount = self._amount
    attr, collection = ds.get_attr(self._attr)

    # get filter if not set already (maybe from generate())
    if self._limit_filter is None:
        limit_filter = get_limit_filter(self._limit, collection)
    else:
        limit_filter = self._limit_filter

    # ids of elements that are part of the balanced set
    balanced_set = []
    # for each chunk in the filter (might be just the selected ones)
    for limit_value in np.unique(limit_filter):
        # use builtin `bool`: `np.bool` was removed in NumPy 1.24
        if limit_filter.dtype == bool:
            # simple boolean filter -> do nothing on False
            if not limit_value:
                continue
            # otherwise get indices of "selected ones"
            limit_idx = limit_filter.nonzero()[0]
        else:
            # non-boolean limiter -> determine "chunk" and balance within
            limit_idx = (limit_filter == limit_value).nonzero()[0]

        # apply the current limit to the target attribute
        # need list to index properly
        attr_limited = attr[list(limit_idx)]
        uattr_limited = np.unique(attr_limited)

        # handle all types of supported arguments
        if amount == 'equal':
            # go for maximum possible number of samples provided
            # by each label in this dataset: clamp every class to the
            # size of the smallest class
            epa = get_nelements_per_value(attr_limited)
            min_epa = min(epa.values())
            for k in epa:
                epa[k] = min_epa
        elif isinstance(amount, float):
            # fraction of each class, rounded to nearest integer
            epa = get_nelements_per_value(attr_limited)
            for k in epa:
                epa[k] = int(round(epa[k] * amount))
        elif isinstance(amount, (int, np.integer)):
            # fixed count per class (also accept NumPy integer scalars)
            epa = dict(zip(uattr_limited, [amount] * len(uattr_limited)))
        else:
            raise ValueError("Unknown type of amount argument '%s'" % amount)

        # select determined number of elements per unique attribute value
        selected = []
        for ua in uattr_limited:
            # list() conversion: since Python 3.11 random.sample()
            # requires a Sequence, which ndarray is not
            candidates = list((attr_limited == ua).nonzero()[0])
            selected += random.sample(candidates, epa[ua])

        # determine the final indices of selected elements and store
        # as part of the balanced set
        balanced_set += list(limit_idx[selected])

    # make full-sized boolean selection attribute and put it into
    # the right collection of the output dataset
    battr = np.zeros(len(attr), dtype=bool)
    battr[balanced_set] = True

    if self._apply_selection:
        if collection is ds.sa:
            return ds[battr]
        elif collection is ds.fa:
            return ds[:, battr]
        else:
            # paranoid
            raise RuntimeError(
                    "Don't know where this collection comes from. "
                    "This should never happen!")
    else:
        # shallow copy of the dataset for output
        out = ds.copy(deep=False)
        if collection is ds.sa:
            out.sa[self.get_space()] = battr
        elif collection is ds.fa:
            out.fa[self.get_space()] = battr
        else:
            # paranoid
            raise RuntimeError(
                    "Don't know where this collection comes from. "
                    "This should never happen!")
        return out