import pytest

from math import sqrt

from pdsa.cardinality.probabilistic_counter import ProbabilisticCounter


def test_init():
    pc = ProbabilisticCounter(10)
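    # 10 simple counters * 32 bits each = 320 bits = 40 bytes.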
    assert pc.sizeof() == 40, "Unexpected size in bytes"

    with pytest.raises(ValueError) as excinfo:
        pc = ProbabilisticCounter(0)
    assert str(excinfo.value) == 'At least one simple counter is required'


def test_count():
    num_of_counters = 256
    pc = ProbabilisticCounter(num_of_counters)
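    # 0.78 / sqrt(m) is the expected relative standard error of the
    # probabilistic counting (Flajolet-Martin) estimate with m counters.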
    std = 0.78 / sqrt(num_of_counters)

    errors = []

    boundary = 20 * num_of_counters
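    # Cardinalities below 20 * num_of_counters are treated as "small";
    # they are skipped here and exercised in the small-cardinality tests.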

    cardinality = 0
    for i in range(10000):
        cardinality += 1
        element = "element_{}".format(i)
        pc.add(element)

        if cardinality < boundary:
            # For small cardinalities the estimate needs a correction;
            # that regime is tested separately.
            continue

        error = (cardinality - pc.count()) / float(cardinality)
        errors.append(error)

    avg_error = abs(sum(errors)) / float(len(errors))

    assert avg_error >= 0
    assert avg_error <= std


def test_correction():
    pc_with_corr = ProbabilisticCounter(256,
                                        with_small_cardinality_correction=True)
    pc = ProbabilisticCounter(256)
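    # Both counters receive exactly the same elements; the only difference
    # is the small-cardinality correction flag.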

    errors = []
    errors_with_corr = []

    cardinality = 0
    for i in range(100):
        cardinality += 1
        element = "element_{}".format(i)
        pc_with_corr.add(element)
        pc.add(element)

        error_with_corr = abs(cardinality -
                              pc_with_corr.count()) / float(cardinality)
        errors_with_corr.append(error_with_corr)

        error = abs(cardinality - pc.count()) / float(cardinality)
        errors.append(error)

    avg_error_with_corr = abs(sum(errors_with_corr)) / \
        float(len(errors_with_corr))
    avg_error = abs(sum(errors)) / float(len(errors))

    assert avg_error_with_corr < avg_error


def test_count_small():
    num_of_counters = 256
    pc = ProbabilisticCounter(num_of_counters,
                              with_small_cardinality_correction=True)

    # For small cardinalities there is no reliable theoretical error
    # estimate; the errors simply tend to be bigger, hence the relaxed
    # bound below.
    std = 0.78 / sqrt(num_of_counters)

    boundary = 2 * num_of_counters
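    # Keep n at or below 2 * num_of_counters, i.e. firmly inside the
    # small-cardinality regime.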

    errors = []

    cardinality = 0
    for i in range(boundary):
        cardinality += 1
        element = "element_{}".format(i)
        pc.add(element)

        error = (cardinality - pc.count()) / float(cardinality)
        errors.append(error)

    avg_error = abs(sum(errors)) / float(len(errors))

    assert avg_error >= 0
    assert avg_error <= 3 * std  # There is no known theoretical expectation.
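

# NOTE: LOREM_TEXT is assumed to be provided by the shared test fixtures
# (e.g. conftest.py) as a dict with the sample text under "text" and the
# number of its unique words under "num_of_unique_words".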
def test_count_big():
    num_of_counters = 256
    pc = ProbabilisticCounter(num_of_counters)

    # NOTE: boost the number of unique elements so that n/m > 50, which
    # keeps the estimate out of the small-cardinality correction regime.
    boost = 50 * num_of_counters // LOREM_TEXT["num_of_unique_words"] + 1
    num_of_unique_words = boost * LOREM_TEXT["num_of_unique_words"]

    for i in range(boost):
        for word in LOREM_TEXT["text"].split():
            pc.add("{}_{}".format(word, i))

    cardinality = pc.count()
    assert cardinality >= 0.9 * num_of_unique_words
    assert cardinality <= 1.1 * num_of_unique_words


def test_count_small_extended_range():
    num_of_counters = 256
    pc = ProbabilisticCounter(num_of_counters,
                              with_small_cardinality_correction=True)

    std = 0.78 / sqrt(num_of_counters)

    errors = []

    cardinality = 0
    for i in range(1000):
        cardinality += 1
        element = "element_{}".format(i)
        pc.add(element)

        error = (cardinality - pc.count()) / float(cardinality)
        errors.append(error)

    avg_error = abs(sum(errors)) / float(len(errors))

    assert avg_error >= 0
    assert avg_error <= 2 * std  # Even with correction, still not so good


def test_count_small_lorem_text():
    pc = ProbabilisticCounter(64, with_small_cardinality_correction=True)
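    # Before any elements are added the counter must report zero.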
    assert pc.count() == 0

    for word in LOREM_TEXT["text"].split():
        pc.add(word)

    num_of_unique_words = LOREM_TEXT["num_of_unique_words"]

    cardinality = pc.count()
    assert cardinality >= 0.5 * num_of_unique_words
    assert cardinality <= 1.5 * num_of_unique_words


def test_len():
    pc = ProbabilisticCounter(10)
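    # Each simple counter occupies 32 bits, so 10 counters -> 320 bits.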
    assert len(pc) == 320


def test_add():
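    # add() should accept arbitrary element types, not only strings,
    # without raising.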
    pc = ProbabilisticCounter(10)

    for element in ["test", 1, {"hello": "world"}]:
        pc.add(element)


def test_repr():
    pc = ProbabilisticCounter(10)

    assert repr(pc) == (
        "<ProbabilisticCounter (length: 320, num_of_counters: 10)>")
"""Example how to use ProbabilisticCounter."""

from pdsa.cardinality.probabilistic_counter import ProbabilisticCounter

LOREM_IPSUM = (
    "Lorem ipsum dolor sit amet, consectetur adipiscing elit."
    " Mauris consequat leo ut vehicula placerat. In lacinia, nisl"
    " id maximus auctor, sem elit interdum urna, at efficitur tellus"
    " turpis at quam. Pellentesque eget iaculis turpis. Nam ac ligula"
    " ut nunc porttitor pharetra in non lorem. In purus metus,"
    " sollicitudin tristique sapien.")

if __name__ == '__main__':
    pc = ProbabilisticCounter(2048, with_small_cardinality_correction=True)
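    # With 2048 simple counters the expected standard error is about
    # 0.78 / sqrt(2048), i.e. roughly 1.7%.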

    print(pc)
    print("PC counter uses {} bytes in the memory".format(pc.sizeof()))

    print("Counter contains approx. {} unique elements".format(pc.count()))

    words = set(LOREM_IPSUM.split())
    for word in words:
        pc.add(word.strip(" .,"))

    print("Added {} words, in the counter approx. {} unique elements".format(
        len(words), pc.count()))