def test_random_upsample(sampleset):
    """Check util.upsample on a tiny sample list and on the sampleset fixture."""
    tiny = [('pos', 1), ('pos', 1), ('neg', 0)]
    expected = [('neg', 0), ('neg', 0), ('pos', 1), ('pos', 1)]
    upsampled = util.upsample(tiny, 1, rand=StableRandom(0))
    # Sorting makes the comparison independent of the output order.
    assert sorted(upsampled) == expected

    first_run = util.upsample(sampleset, 0, rand=StableRandom(0))
    _groups, label_counts = util.group_samples(first_run, 0)
    assert label_counts == {0: 50, 1: 50}

    # A different seed is expected to give a different result.
    second_run = util.upsample(sampleset, 0, rand=StableRandom(1))
    assert first_run != second_run, 'Order should be random'
Example #2
0
def test_Shuffle():
    """Check that Shuffle reorders data without losing or adding elements."""
    numbers = list(range(50))

    # For these arguments the order must change but the elements must not.
    for n in (100, 20):
        assert numbers >> Shuffle(n) >> Collect() != numbers
        assert numbers >> Shuffle(n) >> Collect(set) == set(numbers)

    # With an argument of 1 the input order is preserved.
    unchanged = numbers >> Shuffle(1) >> Collect()
    assert unchanged == numbers

    # Identically seeded generators must give identical shuffles.
    first = numbers >> Shuffle(10, rand=StableRandom(0)) >> Collect()
    second = numbers >> Shuffle(10, rand=StableRandom(0)) >> Collect()
    assert first == second
Example #3
0
def test_Stratify():
    """Exercise Stratify in 'up' and 'downrnd' mode and with an unknown mode."""
    samples = [('pos', 1), ('pos', 1), ('neg', 0)]
    upsampler = Stratify(1, mode='up', rand=StableRandom(0))
    result = samples >> upsampler >> Sort()
    expected_up = [('neg', 0), ('neg', 0), ('pos', 1), ('pos', 1)]
    assert result == expected_up

    samples = [('pos', 1), ('pos', 1), ('pos', 1), ('neg1', 0), ('neg2', 0)]
    downsampler = Stratify(1, mode='downrnd', rand=StableRandom(0))
    result = samples >> downsampler >> Sort()
    expected_down = [('neg1', 0), ('neg2', 0), ('pos', 1), ('pos', 1)]
    assert result == expected_down

    # An unsupported mode has to be rejected with a descriptive ValueError.
    with pytest.raises(ValueError) as ex:
        samples >> Stratify(1, mode='invalid') >> Collect()
    message = str(ex.value)
    assert message.startswith('Unknown mode')
Example #4
0
def test_SplitRandom_constraint():
    """Check SplitRandom when samples are grouped by their first element."""

    def same_letter(t):
        return t[0]

    pairs = zip('aabbccddee', range(10))
    splitter = SplitRandom(rand=StableRandom(0), ratio=0.6,
                           constraint=same_letter)
    train, val = pairs >> splitter >> Collect()
    print(train)
    print(val)

    # Pairs that share a letter end up in the same partition.
    expected_train = [('a', 1), ('a', 0), ('d', 7),
                      ('b', 2), ('d', 6), ('b', 3)]
    expected_val = [('c', 5), ('e', 8), ('e', 9), ('c', 4)]
    assert train == expected_train
    assert val == expected_val
Example #5
0
def test_Pick():
    """Check Pick with integer and float arguments and with invalid values."""
    # Integer argument: every n-th element is kept.
    every = Range(5) >> Pick(1) >> Collect()
    assert every == [0, 1, 2, 3, 4]
    every_second = Range(5) >> Pick(2) >> Collect()
    assert every_second == [0, 2, 4]

    with pytest.raises(ValueError) as ex:
        [1, 2, 3] >> Pick(-1) >> Consume()
    assert str(ex.value).startswith('p_n must not be negative')

    # Float argument with a seeded generator: the selection is reproducible.
    picked = Range(5) >> Pick(0.5, StableRandom(1)) >> Collect()
    assert picked == [0, 4]
    picked = Range(5) >> Pick(0.7, StableRandom(0)) >> Collect()
    assert picked == [0, 1, 4]

    # Boundary values keep everything or nothing.
    assert Range(10) >> Pick(1.0) >> Count() == 10
    assert Range(10) >> Pick(0.0) >> Count() == 0
    subset = Range(100) >> Pick(0.3) >> Collect(set)
    assert subset.issubset(set(range(100)))

    # Float values outside [0, 1] are rejected.
    for invalid in (1.1, -0.1):
        with pytest.raises(ValueError) as ex:
            [1, 2, 3] >> Pick(invalid) >> Consume()
        assert str(ex.value).startswith('Probability must be in [0, 1]')
Example #6
0
def test_Stratify():
    """
    Check that Stratify, given the label distribution, balances the classes.

    The input holds 1000 samples with label 1 and 100 with label 0. The same
    Stratify instance is applied twice; both results must keep all 100
    samples of label 0 and roughly 100 samples of label 1, while differing
    from each other.
    """
    samples = [('pos', 1)] * 1000 + [('neg', 0)] * 100
    dist = samples >> CountValues(1)

    stratify = Stratify(1, dist, rand=StableRandom(0))
    stratified1 = samples >> stratify >> Collect()
    stratified2 = samples >> stratify >> Collect()

    # Re-using the same nut must not reproduce the first selection.
    assert stratified1 != stratified2

    dist1 = stratified1 >> Get(1) >> CountValues()
    print(dist1)
    assert dist1[0] == 100
    assert 90 < dist1[1] < 110

    # Verify the second pass as well (previously dist1 was re-checked here,
    # leaving stratified2 effectively untested).
    dist2 = stratified2 >> Get(1) >> CountValues()
    print(dist2)
    assert dist2[0] == 100
    assert 90 < dist2[1] < 110
Example #7
0
def test_SplitRandom_seed():
    """Equal seeds must give equal splits, different seeds different splits."""

    def split_with(seed):
        return range(10) >> SplitRandom(rand=StableRandom(seed))

    seed0_first = split_with(0)
    seed0_second = split_with(0)
    seed1 = split_with(1)
    assert seed0_first == seed0_second
    assert seed0_first != seed1
Example #8
0
def test_StableRandom():
    """
    Pin the output of StableRandom and check its basic properties.

    The assertions in the first section depend on the exact sequence of calls
    made on the seeded generator, so statements must not be reordered.
    """
    # Seed 1: the expected values below pin the generated sequence.
    rnd = StableRandom(1)
    assert rnd.randint(1, 10) == 5
    assert rnd.uniform(-10, 10) == approx(9.943696167306904)
    assert rnd.random() == approx(0.7203244894557457)
    assert rnd.sample(range(10), 3) == [9, 0, 1]

    # shuffle() works in place on the given list.
    lst = [1, 2, 3, 4, 5]
    rnd.shuffle(lst)
    assert lst == [5, 3, 1, 4, 2]

    # random() must stay within [0, 1).
    rnd = StableRandom()
    assert max(rnd.random() for _ in range(1000)) < 1.0
    assert min(rnd.random() for _ in range(1000)) >= 0.0

    # Same seed => same sequence.
    rnd1, rnd2 = StableRandom(0), StableRandom(0)
    for _ in range(100):
        assert rnd1.random() == rnd2.random()

    # After jumpahead() on one generator the sequences differ ...
    rnd1, rnd2 = StableRandom(0), StableRandom(0)
    rnd2.jumpahead(10)
    for _ in range(100):
        assert rnd1.random() != rnd2.random()
    # ... until the state of the other generator is copied over.
    rnd2.setstate(rnd1.getstate())
    for _ in range(100):
        assert rnd1.random() == rnd2.random()

    # Different seeds => different sequences.
    rnd1, rnd2 = StableRandom(0), StableRandom(1)
    for _ in range(100):
        assert rnd1.random() != rnd2.random()

    # Unseeded generators created at different times must differ.
    rnd1 = StableRandom()
    sleep(0.5)  # seed is based on system time.
    rnd2 = StableRandom()
    for _ in range(100):
        assert rnd1.random() != rnd2.random()

    # _randbelow(10) must return values in [0, 10).
    rnd = StableRandom()
    numbers = [rnd._randbelow(10) for _ in range(1000)]
    assert max(numbers) < 10
    assert min(numbers) >= 0

    # gauss_next() samples should have mean ~0 and std ~1 (tolerance 0.1).
    rnd = StableRandom()
    numbers = [rnd.gauss_next() for _ in range(10000)]
    my, std = numbers >> MeanStd()
    assert 0.0 == approx(my, abs=0.1)
    assert 1.0 == approx(std, abs=0.1)
Example #9
0
def SplitRandom(iterable, ratio=0.7, constraint=None, rand=None):
    """
    Randomly split iterable into partitions.

    For the same input data the same split is created every time and is stable
    across different Python versions 2.x or 3.x. A random number generator
    can be provided to create varying splits.

    >>> train, val = range(10) >> SplitRandom(ratio=0.7)
    >>> train, val
    ([6, 3, 1, 7, 0, 2, 4], [5, 9, 8])

    >>> range(10) >> SplitRandom(ratio=0.7)  # Same split again
    [[6, 3, 1, 7, 0, 2, 4], [5, 9, 8]]

    >>> train, val, test = range(10) >> SplitRandom(ratio=(0.6, 0.3, 0.1))
    >>> train, val, test
    ([6, 1, 4, 0, 3, 2], [8, 7, 9], [5])

    >>> data = zip('aabbccddee', range(10))
    >>> same_letter = lambda t: t[0]
    >>> train, val = data >> SplitRandom(ratio=0.6, constraint=same_letter)
    >>> train
    [('a', 1), ('a', 0), ('d', 7), ('b', 2), ('d', 6), ('b', 3)]
    >>> val
    [('c', 5), ('e', 8), ('e', 9), ('c', 4)]

    :param iterable iterable: Iterable over anything. Will be consumed!
    :param float|tuple ratio: Ratio of two partitions, e.g. a ratio of 0.7
            means 70%, 30% split.
            Alternatively a list of ratios can be provided, e.g.
            ratio=(0.6, 0.3, 0.1). Note that ratios must sum up to one.
    :param function|None constraint: Function that returns key the elements of
        the iterable are grouped by before partitioning. Useful to ensure
        that a partition contains related elements, e.g. left and right eye
        images are not scattered across partitions.
        Note that constraints have precedence over ratios.
    :param Random|None rand: Random number generator. The default None
            ensures that the same split is created every time SplitRandom
            is called. This is important when continuing an interrupted
            training session or running the same training on machines with
            different Python versions. Note that Python's random.Random(0)
            generates different numbers for Python 2.x and 3.x!
    :return: partitions of iterable with sizes according to provided ratios.
             There is always exactly one partition per ratio; partitions
             may be empty if the iterable has too few elements.
    :rtype: (list, list, ...)
    :raises ValueError: if a sequence of ratios does not sum up to one.
    """
    rand = StableRandom(0) if rand is None else rand
    samples = list(iterable)
    if hasattr(ratio, '__iter__'):
        ratios = tuple(ratio)
        if abs(sum(ratios) - 1.0) > 1e-6:
            raise ValueError('Ratios must sum up to one: ' + str(ratios))
    else:
        ratios = (ratio, 1.0 - ratio)
    # Target size of each partition; truncation leaves the rest to the last.
    ns = [int(len(samples) * r) for r in ratios]

    if constraint is None:
        groups = [[s] for s in samples]
    else:
        # sort to make stable across python 2.x, 3.x
        groups = sorted(group_by(samples, constraint).values())
    rand.shuffle(groups)
    # A single shared iterator: each partition continues where the previous
    # one stopped consuming groups.
    groups = iter(groups)
    splits = []

    def append(split):
        rand.shuffle(split)
        splits.append(split)

    for n in ns[:-1]:
        split = []
        for group in groups:
            # Groups are never broken up, so a partition may exceed n.
            split.extend(group)
            if len(split) >= n:
                break
        # Append even when the groups ran out before n was reached;
        # otherwise the partition (and its samples) would be dropped and
        # fewer partitions than ratios would be returned.
        append(split)
    append([e for g in groups for e in g])  # append remaining groups
    return splits
Example #10
0
def test_shuffle_sublists():
    """util.shuffle_sublists must shuffle every sublist in place."""
    nested = [[1, 2, 3], [4, 5, 6, 7]]
    expected = [[1, 3, 2], [4, 5, 7, 6]]
    rand = StableRandom(0)
    util.shuffle_sublists(nested, rand)
    assert nested == expected