Beispiel #1
0
def test_ttopi():
    """ example code from the readme as "reversing a pivot tablite". """
    from random import seed, choice
    seed(11)

    records = 9
    t = Table()
    t.add_column('record id',
                 int,
                 allow_empty=False,
                 data=[i for i in range(records)])
    for column in [f"4.{i}.a" for i in range(5)]:
        t.add_column(
            column,
            str,
            allow_empty=True,
            data=[choice(['a', 'h', 'e', None]) for i in range(records)])

    print("\nshowing raw data:")
    t.show()
    # +=====+=====+
    # |  A  |  B  |
    # | str | str |
    # |False|False|
    # +-----+-----+
    # |4.2.a|e    |
    # |4.3.a|h    |
    # |4.2.a|h    |
    # |4.2.a|e    |
    # |4.3.a|e    |
    # |4.3.a|e    |
    # |4.1.a|e    |
    # |4.1.a|a    |
    # |4.3.a|e    |
    # |4.2.a|a    |
    # |4.3.a|e    |
    # |4.3.a|a    |
    # |4.1.a|a    |
    # |4.1.a|a    |
    # |4.2.a|a    |
    # |4.2.a|a    |
    # |4.1.a|e    |
    # |4.1.a|a    |
    # |4.3.a|h    |
    # |4.3.a|h    |
    # |4.3.a|h    |
    # |4.1.a|e    |
    # +=====+=====+

    # wanted output:
    # +=====+=====
    # |  A  | Count(A,B=a) | Count(A,B=h) | Count(A,B=e) |
    # | str |     int      |     int      |     int      |
    # |False|    False     |    False     |    False     |
    # +-----+--------------+--------------+--------------+
    # |4.1.a|            3 |            0 |            3 |
    # |4.2.a|            3 |            1 |            2 |
    # |4.3.a|            1 |            4 |            4 |
    # +=====+==============+==============+==============+

    reverse_pivot = Table()
    records = t['record id']
    reverse_pivot.add_column('record id', records.datatype, allow_empty=False)
    reverse_pivot.add_column('4.x', str, allow_empty=False)
    reverse_pivot.add_column('ahe', str, allow_empty=True)

    for name in t.columns:
        if not name.startswith('4.'):
            continue
        column = t[name]
        for index, entry in enumerate(column):
            new_row = records[index], name, entry  # record id, 4.x, ahe
            reverse_pivot.add_row(new_row)

    print("\nshowing reversed pivot of the raw data:")
    reverse_pivot.show()

    g = reverse_pivot.groupby(['4.x', 'ahe'],
                              functions=[('ahe', GroupBy.count)])
    print("\nshowing basic groupby of the reversed pivot")
    g.table.show()
    t2 = g.pivot('ahe')
    print("\nshowing the wanted output:")
    t2.show()
Beispiel #2
0
def test_add_rows():
    """ Example from docstring. """
    t = Table()
    t.add_column('test', int)
    t.add_column('A', int)
    t.add_column('B', int)
    t.add_column('C', int)
    test_number = count(1)

    # The following examples are all valid and append the row (1,2,3) to the tablite.

    t.add_row(next(test_number), 1, 2, 3)
    t.add_row([next(test_number), 1, 2, 3])
    t.add_row((next(test_number), 1, 2, 3))
    t.add_row(*(next(test_number), 1, 2, 3))
    t.add_row(test=next(test_number), A=1, B=2, C=3)
    t.add_row(**{'test': next(test_number), 'A': 1, 'B': 2, 'C': 3})

    # The following examples add two rows to the tablite

    t.add_row((next(test_number), 1, 2, 3), (next(test_number), 4, 5, 6))
    t.add_row([next(test_number), 1, 2, 3], [next(test_number), 4, 5, 6])
    t.add_row({
        'test': next(test_number),
        'A': 1,
        'B': 2,
        'C': 3
    }, {
        'test': next(test_number),
        'A': 4,
        'B': 5,
        'C': 6
    })  # two (or more) dicts as args.
    t.add_row(*[{
        'test': next(test_number),
        'A': 1,
        'B': 2,
        'C': 3
    }, {
        'test': next(test_number),
        'A': 1,
        'B': 2,
        'C': 3
    }])  # list of dicts.
    t.show()  # expects:
    # +=====+=====+=====+=====+
    # | test|  A  |  B  |  C  |
    # | int | int | int | int |
    # |False|False|False|False|
    # +-----+-----+-----+-----+
    # |    1|    1|    2|    3|
    # |    2|    1|    2|    3|
    # |    3|    1|    2|    3|
    # |    4|    1|    2|    3|
    # |    5|    1|    2|    3|
    # |    6|    1|    2|    3|
    # |    7|    1|    2|    3|
    # |    8|    4|    5|    6|
    # |    9|    1|    2|    3|
    # |   10|    4|    5|    6|
    # |   11|    1|    2|    3|
    # |   12|    4|    5|    6|
    # |   13|    1|    2|    3|
    # |   14|    1|    2|    3|
    # +=====+=====+=====+=====+
    assert next(test_number) == 14 + 1
Beispiel #3
0
def test_basic_table():
    # creating a tablite incrementally is straight forward:
    table = Table()
    table.add_column('A', int, False)
    assert 'A' in table

    table.add_column('B', str, allow_empty=False)
    assert 'B' in table

    # appending rows is easy:
    table.add_row((1, 'hello'))
    table.add_row((2, 'world'))

    # converting to and from json is easy:
    table_as_json = table.to_json()
    table2 = Table.from_json(table_as_json)

    zipped = zlib.compress(table_as_json.encode())
    a, b = len(zipped), len(table_as_json)
    print("zipping reduces to", a, "from", b, "bytes, e.g.",
          round(100 * a / b, 0), "% of original")

    # copying is easy:
    table3 = table.copy()

    table.rename_column('A', 'aa')
    assert list(table.columns) == ['aa', 'B']
    table.rename_column('aa', 'A')
    assert list(table.columns) == ['A', 'B']

    # and checking for headers is simple:
    assert 'A' in table
    assert 'Z' not in table

    # comparisons are straight forward:
    assert table == table2 == table3

    # even if you only want to check metadata:
    table.compare(table3)  # will raise exception if they're different.

    # append is easy as + also work:
    table3x2 = table3 + table3
    assert len(table3x2) == len(table3) * 2

    # and so does +=
    table3x2 += table3
    assert len(table3x2) == len(table3) * 3

    # type verification is included:
    try:
        table.columns['A'][0] = 'Hallo'
        assert False, "A TypeError should have been raised."
    except TypeError:
        assert True

    # updating values is familiar to any user who likes a list:
    assert 'A' in table.columns
    assert isinstance(table.columns['A'], (StoredList, list))
    last_row = -1
    table['A'][last_row] = 44
    table['B'][last_row] = "Hallo"

    assert table != table2

    # if you try to loop and forget the direction, Table will tell you
    try:
        for row in table:  # wont pass
            assert False, "not possible. Use for row in tablite.rows or for column in tablite.columns"
    except AttributeError:
        assert True

    _ = [table2.add_row(row) for row in table.rows]

    before = [r for r in table2.rows]
    assert before == [(1, 'hello'), (2, 'world'), (1, 'hello'), (44, 'Hallo')]

    # as is filtering for ALL that match:
    filter_1 = lambda x: 'llo' in x
    filter_2 = lambda x: x > 3

    after = table2.all(**{'B': filter_1, 'A': filter_2})

    assert list(after.rows) == [(44, 'Hallo')]

    # as is filtering or for ANY that match:
    after = table2.any(**{'B': filter_1, 'A': filter_2})

    assert list(after.rows) == [(1, 'hello'), (1, 'hello'), (44, 'Hallo')]

    # Imagine a tablite with columns a,b,c,d,e (all integers) like this:
    t = Table()
    for c in 'abcde':
        t.add_column(header=c,
                     datatype=int,
                     allow_empty=False,
                     data=[i for i in range(5)])

    # we want to add two new columns using the functions:
    def f1(a, b, c):
        return a + b + c + 1

    def f2(b, c, d):
        return b * c * d

    # and we want to compute two new columns 'f' and 'g':
    t.add_column(header='f', datatype=int, allow_empty=False)
    t.add_column(header='g', datatype=int, allow_empty=True)

    # we can now use the filter, to iterate over the tablite:
    for row in t.filter('a', 'b', 'c', 'd'):
        a, b, c, d = row

        # ... and add the values to the two new columns
        t['f'].append(f1(a, b, c))
        t['g'].append(f2(b, c, d))

    assert len(t) == 5
    assert list(t.columns) == list('abcdefg')
    t.show()

    # slicing is easy:
    table_chunk = table2[2:4]
    assert isinstance(table_chunk, Table)

    # we will handle duplicate names gracefully.
    table2.add_column('B', int, allow_empty=True)
    assert set(table2.columns) == {'A', 'B', 'B_1'}

    # you can delete a column as key...
    del table2['B_1']
    assert set(table2.columns) == {'A', 'B'}

    # adding a computed column is easy:
    table.add_column('new column',
                     str,
                     allow_empty=False,
                     data=[f"{r}" for r in table.rows])

    # part of or the whole tablite is easy:
    table.show()

    table.show('A', slice(0, 1))

    # updating a column with a function is easy:
    f = lambda x: x * 10
    table['A'] = [f(r) for r in table['A']]

    # using regular indexing will also work.
    for ix, r in enumerate(table['A']):
        table['A'][ix] = r * 10

    # and it will tell you if you're not allowed:
    try:
        f = lambda x: f"'{x} as text'"
        table['A'] = [f(r) for r in table['A']]
        assert False, "The line above must raise a TypeError"
    except TypeError as error:
        print("The error is:", str(error))

    # works with all datatypes:
    now = datetime.now()

    table4 = Table()
    table4.add_column('A', int, allow_empty=False, data=[-1, 1])
    table4.add_column('A', int, allow_empty=True, data=[None, 1])  # None!
    table4.add_column('A', float, False, data=[-1.1, 1.1])
    table4.add_column(
        'A', str, False,
        data=["", "1"])  # Empty string is not a None, when dtype is str!
    table4.add_column(
        'A', str, True,
        data=[None, "1"])  # Empty string is not a None, when dtype is str!
    table4.add_column('A', bool, False, data=[False, True])
    table4.add_column('A', datetime, False, data=[now, now])
    table4.add_column('A', date, False, data=[now.date(), now.date()])
    table4.add_column('A', time, False, data=[now.time(), now.time()])

    table4_json = table4.to_json()
    table5 = Table.from_json(table4_json)

    # .. to json and back.
    assert table4 == table5

    # And finally: I can add metadata:
    table5.metadata['db_mapping'] = {
        'A': 'customers.customer_name',
        'A_2': 'product.sku',
        'A_4': 'locations.sender'
    }

    # which also jsonifies without fuzz.
    table5_json = table5.to_json()
    table5_from_json = Table.from_json(table5_json)
    assert table5 == table5_from_json
Beispiel #4
0
def test_recreate_readme_comparison():
    try:
        import os
        import psutil
    except ImportError:
        return
    process = psutil.Process(os.getpid())
    baseline_memory = process.memory_info().rss
    from time import process_time

    from tablite import Table

    digits = 1_000_000

    records = Table()
    records.add_column('method', str)
    records.add_column('memory', int)
    records.add_column('time', float)

    records.add_row(('python', baseline_memory, 0.0))

    # Let's now use the common and convenient "row" based format:

    start = process_time()
    L = []
    for _ in range(digits):
        L.append(tuple([11 for _ in range(10)]))
    end = process_time()

    # go and check taskmanagers memory usage.
    # At this point we're using ~154.2 Mb to store 1 million lists with 10 items.
    records.add_row(
        ('1e6 lists w. 10 integers',
         process.memory_info().rss - baseline_memory, round(end - start, 4)))

    L.clear()

    # Let's now use a columnar format instead:
    start = process_time()
    L = [[11 for i in range(digits)] for _ in range(10)]
    end = process_time()

    # go and check taskmanagers memory usage.
    # at this point we're using ~98.2 Mb to store 10 lists with 1 million items.
    records.add_row(
        ('10 lists with 1e6 integers',
         process.memory_info().rss - baseline_memory, round(end - start, 4)))
    L.clear()

    # We've thereby saved 50 Mb by avoiding the overhead from managing 1 million lists.

    # Q: But why didn't I just use an array? It would have even lower memory footprint.
    # A: First, array's don't handle None's and we get that frequently in dirty csv data.
    # Second, Table needs even less memory.

    # Let's start with an array:

    import array
    start = process_time()
    L = [array.array('i', [11 for _ in range(digits)]) for _ in range(10)]
    end = process_time()
    # go and check taskmanagers memory usage.
    # at this point we're using 60.0 Mb to store 10 lists with 1 million integers.

    records.add_row(
        ('10 lists with 1e6 integers in arrays',
         process.memory_info().rss - baseline_memory, round(end - start, 4)))
    L.clear()

    # Now let's use Table:

    start = process_time()
    t = Table()
    for i in range(10):
        t.add_column(str(i),
                     int,
                     allow_empty=False,
                     data=[11 for _ in range(digits)])
    end = process_time()

    records.add_row(
        ('Table with 10 columns with 1e6 integers',
         process.memory_info().rss - baseline_memory, round(end - start, 4)))

    # go and check taskmanagers memory usage.
    # At this point we're using  97.5 Mb to store 10 columns with 1 million integers.

    # Next we'll use the api `use_stored_lists` to drop to disk:
    start = process_time()
    t.use_disk = True
    end = process_time()
    records.add_row(
        ('Table on disk with 10 columns with 1e6 integers',
         process.memory_info().rss - baseline_memory, round(end - start, 4)))

    # go and check taskmanagers memory usage.
    # At this point we're using  24.5 Mb to store 10 columns with 1 million integers.
    # Only the metadata remains in pythons memory.

    records.show()