Example #1
def test_balancer():
    ds = give_data()
    # only mark the selection in an attribute
    bal = Balancer()
    res = bal(ds)
    # we get a new dataset, with shared samples
    assert_false(ds is res)
    assert_true(ds.samples is res.samples.base)
    # should kick out 2 samples in each chunk of 10
    assert_almost_equal(np.mean(res.sa.balanced_set), 0.8)
    # same as above, but actually apply the selection
    bal = Balancer(apply_selection=True, count=5)
    # just run it once
    res = bal(ds)
    # we get a new dataset, with shared samples
    assert_false(ds is res)
    # should kick out 2 samples in each chunk of 10
    assert_equal(len(res), int(0.8 * len(ds)))
    # now use it as a generator
    dses = list(bal.generate(ds))
    assert_equal(len(dses), 5)
    # with limit
    bal = Balancer(limit={'chunks': 3}, apply_selection=True)
    res = bal(ds)
    assert_equal(res.sa['chunks'].unique, (3,))
    assert_equal(get_nelements_per_value(res.sa.targets).values(),
                 [2] * 4)
    # fixed amount
    bal = Balancer(amount=1, limit={'chunks': 3}, apply_selection=True)
    res = bal(ds)
    assert_equal(get_nelements_per_value(res.sa.targets).values(),
                 [1] * 4)
    # fraction
    bal = Balancer(amount=0.499, limit=None, apply_selection=True)
    res = bal(ds)
    assert_array_equal(
            np.round(np.array(get_nelements_per_value(ds.sa.targets).values()) * 0.5),
            np.array(get_nelements_per_value(res.sa.targets).values()))
    # check on feature attribute
    ds.fa['one'] = np.tile([1, 2], 5)
    ds.fa['chk'] = np.repeat([1, 2], 5)
    bal = Balancer(attr='one', amount=2, limit='chk', apply_selection=True)
    res = bal(ds)
    assert_equal(get_nelements_per_value(res.fa.one).values(),
                 [4] * 2)
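The 0.8 above follows from the layout of the test data. Assuming give_data() returns 10 chunks of 10 samples spread over 4 targets (an assumption for illustration; the fixture itself is not shown here), the per-target counts within a chunk are uneven, so balancing down to the smallest class keeps 8 of the 10 samples per chunk. A minimal sketch of that arithmetic:

import numpy as np

# hypothetical per-chunk target layout: 10 samples over 4 targets
targets_in_chunk = np.array([0, 1, 2, 3, 0, 1, 2, 3, 0, 1])
counts = np.bincount(targets_in_chunk)        # e.g. [3, 3, 2, 2]
kept = counts.min() * len(counts)             # keep 2 per target -> 8 samples
print(kept / float(len(targets_in_chunk)))    # 0.8, the balanced_set mean above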
Example #2
def test_balancer():
    ds = give_data()
    # only mark the selection in an attribute
    bal = Balancer()
    res = bal(ds)
    # we get a new dataset, with shared samples
    assert_false(ds is res)
    assert_true(ds.samples is res.samples.base)
    # should kick out 2 samples in each chunk of 10
    assert_almost_equal(np.mean(res.sa.balanced_set), 0.8)
    # same as above, but actually apply the selection
    bal = Balancer(apply_selection=True, count=5)
    # just run it once
    res = bal(ds)
    # we get a new dataset, with shared samples
    assert_false(ds is res)
    # should kick out 2 samples in each chunk of 10
    assert_equal(len(res), int(0.8 * len(ds)))
    # now use it as a generator
    dses = list(bal.generate(ds))
    assert_equal(len(dses), 5)
    # with limit
    bal = Balancer(limit={'chunks': 3}, apply_selection=True)
    res = bal(ds)
    assert_equal(res.sa['chunks'].unique, (3, ))
    assert_equal(get_nelements_per_value(res.sa.targets).values(), [2] * 4)
    # same but include all offlimit samples
    bal = Balancer(limit={'chunks': 3},
                   include_offlimit=True,
                   apply_selection=True)
    res = bal(ds)
    assert_array_equal(res.sa['chunks'].unique, range(10))
    # chunk three still balanced, but the rest is not, i.e. all samples included
    assert_equal(
        get_nelements_per_value(res[res.sa.chunks == 3].sa.targets).values(),
        [2] * 4)
    assert_equal(
        get_nelements_per_value(res.sa.chunks).values(),
        [10, 10, 10, 8, 10, 10, 10, 10, 10, 10])
    # fixed amount
    bal = Balancer(amount=1, limit={'chunks': 3}, apply_selection=True)
    res = bal(ds)
    assert_equal(get_nelements_per_value(res.sa.targets).values(), [1] * 4)
    # fraction
    bal = Balancer(amount=0.499, limit=None, apply_selection=True)
    res = bal(ds)
    assert_array_equal(
        np.round(
            np.array(get_nelements_per_value(ds.sa.targets).values()) * 0.5),
        np.array(get_nelements_per_value(res.sa.targets).values()))
    # check on feature attribute
    ds.fa['one'] = np.tile([1, 2], 5)
    ds.fa['chk'] = np.repeat([1, 2], 5)
    bal = Balancer(attr='one', amount=2, limit='chk', apply_selection=True)
    res = bal(ds)
    assert_equal(get_nelements_per_value(res.fa.one).values(), [4] * 2)
Example #3
def get_nsamples_per_attr(dataset, attr):
    """Returns the number of samples per unique value of a sample attribute.

    Parameters
    ----------
    dataset : Dataset
      Dataset whose sample attribute should be tallied.
    attr : str
      Name of the sample attribute.

    Returns
    -------
    dict with the number of samples (value) per unique attribute value (key).
    """
    return get_nelements_per_value(dataset.sa[attr])
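get_nsamples_per_attr is a thin wrapper: it simply tallies how often each unique value occurs in the named sample attribute via get_nelements_per_value. A hedged sketch of the same tally with plain NumPy and made-up labels (only the counting logic is shown, not the PyMVPA attribute machinery):

import numpy as np

targets = np.array(['a', 'a', 'b', 'b', 'b', 'c'])       # made-up labels
values, counts = np.unique(targets, return_counts=True)
print(dict(zip(values, counts)))                          # {'a': 2, 'b': 3, 'c': 1}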
Example #4
def test_balancer():
    ds = give_data()
    # only mark the selection in an attribute
    bal = Balancer()
    res = bal(ds)
    # we get a new dataset, with shared samples
    assert_false(ds is res)
    assert_true(ds.samples is res.samples.base)
    # should kick out 2 samples in each chunk of 10
    assert_almost_equal(np.mean(res.sa.balanced_set), 0.8)
    # same as above, but actually apply the selection
    bal = Balancer(apply_selection=True, count=5)
    # just run it once
    res = bal(ds)
    # we get a new dataset, with shared samples
    assert_false(ds is res)
    # should kick out 2 samples in each chunk of 10
    assert_equal(len(res), int(0.8 * len(ds)))
    # now use it as a generator
    dses = list(bal.generate(ds))
    assert_equal(len(dses), 5)
    # with limit
    bal = Balancer(limit={"chunks": 3}, apply_selection=True)
    res = bal(ds)
    assert_equal(res.sa["chunks"].unique, (3,))
    assert_equal(get_nelements_per_value(res.sa.targets).values(), [2] * 4)
    # same but include all offlimit samples
    bal = Balancer(limit={"chunks": 3}, include_offlimit=True, apply_selection=True)
    res = bal(ds)
    assert_array_equal(res.sa["chunks"].unique, range(10))
    # chunk three still balanced, but the rest is not, i.e. all samples included
    assert_equal(get_nelements_per_value(res[res.sa.chunks == 3].sa.targets).values(), [2] * 4)
    assert_equal(get_nelements_per_value(res.sa.chunks).values(), [10, 10, 10, 8, 10, 10, 10, 10, 10, 10])
    # fixed amount
    bal = Balancer(amount=1, limit={"chunks": 3}, apply_selection=True)
    res = bal(ds)
    assert_equal(get_nelements_per_value(res.sa.targets).values(), [1] * 4)
    # fraction
    bal = Balancer(amount=0.499, limit=None, apply_selection=True)
    res = bal(ds)
    assert_array_equal(
        np.round(np.array(get_nelements_per_value(ds.sa.targets).values()) * 0.5),
        np.array(get_nelements_per_value(res.sa.targets).values()),
    )
    # check on feature attribute
    ds.fa["one"] = np.tile([1, 2], 5)
    ds.fa["chk"] = np.repeat([1, 2], 5)
    bal = Balancer(attr="one", amount=2, limit="chk", apply_selection=True)
    res = bal(ds)
    assert_equal(get_nelements_per_value(res.fa.one).values(), [4] * 2)
Example #5
    def _call(self, ds):
        # local binding
        amount = self._amount
        attr, collection = ds.get_attr(self._attr)

        # get filter if not set already (maybe from generate())
        if self._limit_filter is None:
            limit_filter = get_limit_filter(self._limit, collection)
        else:
            limit_filter = self._limit_filter

        # ids of elements that are part of the balanced set
        balanced_set = []
        full_limit_set = []
        # for each chunk in the filter (might be just the selected ones)
        for limit_value in np.unique(limit_filter):
            if limit_filter.dtype == np.bool:
                # simple boolean filter -> do nothing on False
                if not limit_value:
                    continue
                # otherwise get indices of "selected ones"
                limit_idx = limit_filter.nonzero()[0]
            else:
                # non-boolean limiter -> determine "chunk" and balance within
                limit_idx = (limit_filter == limit_value).nonzero()[0]
            full_limit_set += list(limit_idx)

            # apply the current limit to the target attribute
            # need list to index properly
            attr_limited = attr[list(limit_idx)]
            uattr_limited = np.unique(attr_limited)

            # handle all types of supported arguments
            if amount == 'equal':
                # go for maximum possible number of samples provided
                # by each label in this dataset
                # determine the min number of samples per class
                epa = get_nelements_per_value(attr_limited)
                min_epa = min(epa.values())
                for k in epa:
                    epa[k] = min_epa
            elif isinstance(amount, float):
                epa = get_nelements_per_value(attr_limited)
                for k in epa:
                    epa[k] = int(round(epa[k] * amount))
            elif isinstance(amount, int):
                epa = dict(zip(uattr_limited, [amount] * len(uattr_limited)))
            else:
                raise ValueError("Unknown type of amount argument '%s'" %
                                 amount)

            # select determined number of elements per unique attribute value
            selected = []
            for ua in uattr_limited:
                selected += random.sample(
                    list((attr_limited == ua).nonzero()[0]), epa[ua])

            # determine the final indices of selected elements and store
            # as part of the balanced set
            balanced_set += list(limit_idx[selected])

        # make full-sized boolean selection attribute and put it into
        # the right collection of the output dataset
        if self._include_offlimit:
            # start with all-in
            battr = np.ones(len(attr), dtype=np.bool)
            # throw out all samples that could have been limited
            battr[full_limit_set] = False
            # put back the ones that got into the balanced set
            battr[balanced_set] = True
        else:
            # start with nothing
            battr = np.zeros(len(attr), dtype=np.bool)
            # only keep the balanced set
            battr[balanced_set] = True

        if self._apply_selection:
            if collection is ds.sa:
                return ds[battr]
            elif collection is ds.fa:
                return ds[:, battr]
            else:
                # paranoid
                raise RuntimeError(
                    "Don't know where this collection comes from. "
                    "This should never happen!")
        else:
            # shallow copy of the dataset for output
            out = ds.copy(deep=False)
            if collection is ds.sa:
                out.sa[self.get_space()] = battr
            elif collection is ds.fa:
                out.fa[self.get_space()] = battr
            else:
                # paranoid
                raise RuntimeError(
                    "Don't know where this collection comes from. "
                    "This should never happen!")
            return out
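The core of _call is the per-group selection: within each limit group the target attribute is tallied, a per-class quota is derived from amount ('equal', a fraction, or a fixed integer), and that many indices are drawn per class without replacement. A stripped-down sketch of just that inner step (limit handling, offlimit logic, and dataset slicing omitted; the names are illustrative, not part of the PyMVPA API):

import random
import numpy as np

def balance_indices(labels, amount='equal'):
    # pick a balanced subset of indices from a 1-D label array (sketch)
    labels = np.asarray(labels)
    uniq, counts = np.unique(labels, return_counts=True)
    if amount == 'equal':
        # downsample every class to the size of the smallest one
        quota = dict((u, int(counts.min())) for u in uniq)
    elif isinstance(amount, float):
        # keep a fraction of each class
        quota = dict((u, int(round(c * amount))) for u, c in zip(uniq, counts))
    elif isinstance(amount, int):
        # keep a fixed number per class
        quota = dict((u, amount) for u in uniq)
    else:
        raise ValueError("unsupported amount: %r" % (amount,))
    picked = []
    for u in uniq:
        picked += random.sample(list(np.flatnonzero(labels == u)), quota[u])
    return sorted(picked)

print(balance_indices(['a', 'a', 'a', 'b', 'b']))         # two indices per class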
Example #6
    def _call(self, ds):
        # local binding
        amount = self._amount
        attr, collection = ds.get_attr(self._attr)

        # get filter if not set already (maybe from generate())
        if self._limit_filter is None:
            limit_filter = get_limit_filter(self._limit, collection)
        else:
            limit_filter = self._limit_filter

        # ids of elements that are part of the balanced set
        balanced_set = []
        full_limit_set = []
        # for each chunk in the filter (might be just the selected ones)
        for limit_value in np.unique(limit_filter):
            if limit_filter.dtype == np.bool:
                # simple boolean filter -> do nothing on False
                if not limit_value:
                    continue
                # otherwise get indices of "selected ones"
                limit_idx = limit_filter.nonzero()[0]
            else:
                # non-boolean limiter -> determine "chunk" and balance within
                limit_idx = (limit_filter == limit_value).nonzero()[0]
            full_limit_set += list(limit_idx)

            # apply the current limit to the target attribute
            # need list to index properly
            attr_limited = attr[list(limit_idx)]
            uattr_limited = np.unique(attr_limited)

            # handle all types of supported arguments
            if amount == 'equal':
                # go for maximum possible number of samples provided
                # by each label in this dataset
                # determine the min number of samples per class
                epa = get_nelements_per_value(attr_limited)
                min_epa = min(epa.values())
                for k in epa:
                    epa[k] = min_epa
            elif isinstance(amount, float):
                epa = get_nelements_per_value(attr_limited)
                for k in epa:
                    epa[k] = int(round(epa[k] * amount))
            elif isinstance(amount, int):
                epa = dict(zip(uattr_limited, [amount] * len(uattr_limited)))
            else:
                raise ValueError("Unknown type of amount argument '%s'" % amount)

            # select determined number of elements per unique attribute value
            selected = []
            for ua in uattr_limited:
                selected += random.sample(list((attr_limited == ua).nonzero()[0]),
                                          epa[ua])

            # determine the final indices of selected elements and store
            # as part of the balanced set
            balanced_set += list(limit_idx[selected])

        # make full-sized boolean selection attribute and put it into
        # the right collection of the output dataset
        if self._include_offlimit:
            # start with all-in
            battr = np.ones(len(attr), dtype=np.bool)
            # throw out all samples that could have been limited
            battr[full_limit_set] = False
            # put back the ones that got into the balanced set
            battr[balanced_set] = True
        else:
            # start with nothing
            battr = np.zeros(len(attr), dtype=np.bool)
            # only keep the balanced set
            battr[balanced_set] = True

        if self._apply_selection:
            if collection is ds.sa:
                return ds[battr]
            elif collection is ds.fa:
                return ds[:, battr]
            else:
                # paranoid
                raise RuntimeError(
                        "Don't know where this collection comes from. "
                        "This should never happen!")
        else:
            # shallow copy of the dataset for output
            out = ds.copy(deep=False)
            if collection is ds.sa:
                out.sa[self.get_space()] = battr
            elif collection is ds.fa:
                out.fa[self.get_space()] = battr
            else:
                # paranoid
                raise RuntimeError(
                        "Don't know where this collection comes from. "
                        "This should never happen!")
            return out
Example #7
def test_balancer():
    ds = give_data()
    ds.sa['ids'] = np.arange(len(ds))  # some sa to ease tracking of samples

    # only mark the selection in an attribute
    bal = Balancer()
    res = bal(ds)
    # we get a new dataset, with shared samples
    assert_false(ds is res)
    assert_true(ds.samples is res.samples.base)
    # should kick out 2 samples in each chunk of 10
    assert_almost_equal(np.mean(res.sa.balanced_set), 0.8)
    # same as above, but actually apply the selection
    bal = Balancer(apply_selection=True, count=5)
    # just run it once
    res = bal(ds)
    # we get a new dataset, with shared samples
    assert_false(ds is res)
    # should kick out 2 samples in each chunk of 10
    assert_equal(len(res), int(0.8 * len(ds)))
    # now use it as a generator
    dses = list(bal.generate(ds))
    assert_equal(len(dses), 5)

    # if we rerun, it should (with high probability) be a different selection
    res2 = bal(ds)
    assert_true(np.any(res.sa.ids != res2.sa.ids))

    # but if we create a balancer providing seed rng int,
    # should be identical results
    bal = Balancer(apply_selection=True, count=5, rng=1)
    assert_false(np.any(bal(ds).sa.ids != bal(ds).sa.ids))

    # But results should differ if we use .generate to produce those multiple
    # balanced datasets
    b = Balancer(apply_selection=True, count=3, rng=1)
    balanced = list(b.generate(ds))
    assert_false(all(balanced[0].sa.ids == balanced[1].sa.ids))
    assert_false(all(balanced[0].sa.ids == balanced[2].sa.ids))
    assert_false(all(balanced[1].sa.ids == balanced[2].sa.ids))

    # And should be exactly the same
    for ds_a, ds_b in zip(balanced, b.generate(ds)):
        assert_datasets_equal(ds_a, ds_b)

    # Contribution by Chris Markiewicz
    # And interleaving __call__ and generator fetches
    gen1 = b.generate(ds)
    gen2 = b.generate(ds)

    seq1, seq2, seq3 = [], [], []

    for i in xrange(3):
        seq1.append(gen1.next())
        seq2.append(gen2.next())
        seq3.append(b(ds))

    # Produces expected sequences

    for i in xrange(3):
        assert_datasets_equal(balanced[i], seq1[i])
        assert_datasets_equal(balanced[i], seq2[i])

    # And all __call__s return the same result
    ds_a = seq3[0]
    for ds_b in seq3[1:]:
        assert_array_equal(ds_a.sa.ids, ds_b.sa.ids)

    # with limit
    bal = Balancer(limit={'chunks': 3}, apply_selection=True)
    res = bal(ds)
    assert_equal(res.sa['chunks'].unique, (3, ))
    assert_equal(get_nelements_per_value(res.sa.targets).values(), [2] * 4)
    # same but include all offlimit samples
    bal = Balancer(limit={'chunks': 3},
                   include_offlimit=True,
                   apply_selection=True)
    res = bal(ds)
    assert_array_equal(res.sa['chunks'].unique, range(10))
    # chunk three still balanced, but the rest is not, i.e. all samples included
    assert_equal(
        get_nelements_per_value(res[res.sa.chunks == 3].sa.targets).values(),
        [2] * 4)
    assert_equal(
        get_nelements_per_value(res.sa.chunks).values(),
        [10, 10, 10, 8, 10, 10, 10, 10, 10, 10])
    # fixed amount
    bal = Balancer(amount=1, limit={'chunks': 3}, apply_selection=True)
    res = bal(ds)
    assert_equal(get_nelements_per_value(res.sa.targets).values(), [1] * 4)
    # fraction
    bal = Balancer(amount=0.499, limit=None, apply_selection=True)
    res = bal(ds)
    assert_array_equal(
        np.round(
            np.array(get_nelements_per_value(ds.sa.targets).values()) * 0.5),
        np.array(get_nelements_per_value(res.sa.targets).values()))
    # check on feature attribute
    ds.fa['one'] = np.tile([1, 2], 5)
    ds.fa['chk'] = np.repeat([1, 2], 5)
    bal = Balancer(attr='one', amount=2, limit='chk', apply_selection=True)
    res = bal(ds)
    assert_equal(get_nelements_per_value(res.fa.one).values(), [4] * 2)
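The seeding checks above rely on a simple contract: re-seeding with the same integer replays the identical sequence of random selections, while consecutive draws within one run still differ. A toy sketch of that behaviour with a NumPy RandomState (this illustrates the reproducibility contract only, not the Balancer's internal rng handling):

import numpy as np

def draws(seed, n, pool=20, k=5):
    # one seeded stream, advanced n times
    rng = np.random.RandomState(seed)
    return [sorted(rng.choice(pool, size=k, replace=False)) for _ in range(n)]

a = draws(1, 3)
b = draws(1, 3)
assert a == b          # same seed -> identical sequence of selections
assert a[0] != a[1]    # consecutive draws differ (probabilistic, like the test above)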
Example #8
def test_balancer():
    ds = give_data()
    ds.sa['ids'] = np.arange(len(ds))  # some sa to ease tracking of samples

    # only mark the selection in an attribute
    bal = Balancer()
    res = bal(ds)
    # we get a new dataset, with shared samples
    assert_false(ds is res)
    assert_true(ds.samples is res.samples.base)
    # should kick out 2 samples in each chunk of 10
    assert_almost_equal(np.mean(res.sa.balanced_set), 0.8)
    # same as above, but actually apply the selection
    bal = Balancer(apply_selection=True, count=5)
    # just run it once
    res = bal(ds)
    # we get a new dataset, with shared samples
    assert_false(ds is res)
    # should kick out 2 samples in each chunk of 10
    assert_equal(len(res), int(0.8 * len(ds)))
    # now use it as a generator
    dses = list(bal.generate(ds))
    assert_equal(len(dses), 5)

    # if we rerun, it should (with high probability) be a different selection
    res2 = bal(ds)
    assert_true(np.any(res.sa.ids != res2.sa.ids))

    # but if we create a balancer providing seed rng int,
    # should be identical results
    bal = Balancer(apply_selection=True, count=5, rng=1)
    assert_false(np.any(bal(ds).sa.ids != bal(ds).sa.ids))

    # But results should differ if we use .generate to produce those multiple
    # balanced datasets
    b = Balancer(apply_selection=True, count=3, rng=1)
    balanced = list(b.generate(ds))
    assert_false(all(balanced[0].sa.ids == balanced[1].sa.ids))
    assert_false(all(balanced[0].sa.ids == balanced[2].sa.ids))
    assert_false(all(balanced[1].sa.ids == balanced[2].sa.ids))

    # And should be exactly the same
    for ds_a, ds_b in zip(balanced, b.generate(ds)):
        assert_datasets_equal(ds_a, ds_b)

    # Contribution by Chris Markiewicz
    # And interleaving __call__ and generator fetches
    gen1 = b.generate(ds)
    gen2 = b.generate(ds)

    seq1, seq2, seq3 = [], [], []

    for i in xrange(3):
        seq1.append(gen1.next())
        seq2.append(gen2.next())
        seq3.append(b(ds))

    # Produces expected sequences

    for i in xrange(3):
        assert_datasets_equal(balanced[i], seq1[i])
        assert_datasets_equal(balanced[i], seq2[i])

    # And all __call__s return the same result
    ds_a = seq3[0]
    for ds_b in seq3[1:]:
        assert_array_equal(ds_a.sa.ids, ds_b.sa.ids)

    # with limit
    bal = Balancer(limit={'chunks': 3}, apply_selection=True)
    res = bal(ds)
    assert_equal(res.sa['chunks'].unique, (3,))
    assert_equal(get_nelements_per_value(res.sa.targets).values(),
                 [2] * 4)
    # same but include all offlimit samples
    bal = Balancer(limit={'chunks': 3}, include_offlimit=True,
                   apply_selection=True)
    res = bal(ds)
    assert_array_equal(res.sa['chunks'].unique, range(10))
    # chunk three still balanced, but the rest is not, i.e. all samples included
    assert_equal(get_nelements_per_value(res[res.sa.chunks == 3].sa.targets).values(),
                 [2] * 4)
    assert_equal(get_nelements_per_value(res.sa.chunks).values(),
                 [10, 10, 10, 8, 10, 10, 10, 10, 10, 10])
    # fixed amount
    bal = Balancer(amount=1, limit={'chunks': 3}, apply_selection=True)
    res = bal(ds)
    assert_equal(get_nelements_per_value(res.sa.targets).values(),
                 [1] * 4)
    # fraction
    bal = Balancer(amount=0.499, limit=None, apply_selection=True)
    res = bal(ds)
    assert_array_equal(
            np.round(np.array(get_nelements_per_value(ds.sa.targets).values()) * 0.5),
            np.array(get_nelements_per_value(res.sa.targets).values()))
    # check on feature attribute
    ds.fa['one'] = np.tile([1, 2], 5)
    ds.fa['chk'] = np.repeat([1, 2], 5)
    bal = Balancer(attr='one', amount=2, limit='chk', apply_selection=True)
    res = bal(ds)
    assert_equal(get_nelements_per_value(res.fa.one).values(),
                 [4] * 2)