def test_filereader_book1xlsx(): path = Path(__file__).parent / "data" / 'book1.xlsx' assert path.exists() start = process_time_ns() tables = list(file_reader(path)) end = process_time_ns() fields = sum(len(t) * len(t.columns) for t in tables) print("{:,} fields/seccond".format( round(1e9 * fields / max(1, end - start), 0))) sheet1 = Table(filename=path.name, sheet_name='Sheet1') for column_name in list('abcdef'): sheet1.add_column(column_name, int, False) sheet2 = Table( filename=path.name, sheet_name='Sheet2 ') # there's a deliberate white space at the end! sheet2.add_column('a', int, False) for column_name in list('bcdef'): sheet2.add_column(column_name, float, False) books = [sheet1, sheet2] for book, table in zip(books, tables): table.show(slice(5)) assert table.compare(book) assert len(table) == 45, len(table)
def test_filereader_gdocs1csv_no_header(): path = Path(__file__).parent / "data" / 'gdocs1.csv' assert path.exists() table = list(file_reader(path, has_headers=False))[0] table.show(slice(0, 10)) book1_csv = Table(filename=path.name) for idx, _ in enumerate(list('abcdef'), 1): book1_csv.add_column(f"_{idx}", str) assert table.compare(book1_csv), table.compare(book1_csv) assert len(table) == 46
def test_filereader_gdocs1xlsx(): path = Path(__file__).parent / "data" / 'gdocs1.xlsx' assert path.exists() table = list(file_reader(path))[0] table.show(slice(0, 10)) gdocs1xlsx = Table(filename=path.name, sheet_name='Sheet1') for float_type in list('abcdef'): gdocs1xlsx.add_column(float_type, int) assert table.compare(gdocs1xlsx), table.compare(gdocs1xlsx) assert len(table) == 45
def test_filereader_gdocs1xlsx_no_header(): path = Path(__file__).parent / "data" / 'gdocs1.xlsx' assert path.exists() table = list(file_reader(path, has_headers=False))[0] table.show(slice(0, 10)) gdocs1xlsx = Table(filename=path.name, sheet_name='Sheet1') for idx, _ in enumerate(list('abcdef'), 1): gdocs1xlsx.add_column(f"_{idx}", str) assert table.compare(gdocs1xlsx), table.compare(gdocs1xlsx) assert len(table) == 46
def test_filereader_exceldatesxlsx(): path = Path(__file__).parent / "data" / 'excel_dates.xlsx' assert path.exists() table = list(file_reader(path))[0] table.show() sheet1 = Table(filename=path.name, sheet_name='Sheet1') sheet1.add_column('Date', datetime, False) sheet1.add_column('numeric value', int, False) sheet1.add_column('string', date, False) sheet1.add_column('bool', bool, False) assert table.compare(sheet1) assert len(table) == 2, len(table)
def test_filereader_gdocsc1ods(): path = Path(__file__).parent / "data" / 'gdocs1.ods' assert path.exists() tables = file_reader(path) sheet1 = Table(filename=path.name, sheet_name='Sheet1') for int_type in list('abcdef'): sheet1.add_column(int_type, int) sheet2 = Table(filename=path.name, sheet_name='Sheet2') sheet2.add_column('a', int) for float_type in list('bcdef'): sheet2.add_column(float_type, float) sheets = [sheet1, sheet2] for sheet, table in zip(sheets, tables): table.compare(sheet) assert len(table) == 45, table.show(blanks="")
def test_groupby_02(): t = Table() t.add_column('A', int, data=[1, 1, 2, 2, 3, 3] * 2) t.add_column('B', int, data=[1, 2, 3, 4, 5, 6] * 2) t.add_column('C', int, data=[6, 5, 4, 3, 2, 1] * 2) t.show() # +=====+=====+=====+ # | A | B | C | # | int | int | int | # |False|False|False| # +-----+-----+-----+ # | 1| 1| 6| # | 1| 2| 5| # | 2| 3| 4| # | 2| 4| 3| # | 3| 5| 2| # | 3| 6| 1| # | 1| 1| 6| # | 1| 2| 5| # | 2| 3| 4| # | 2| 4| 3| # | 3| 5| 2| # | 3| 6| 1| # +=====+=====+=====+ g = t.groupby(keys=['A', 'C'], functions=[('B', gb.sum)]) g.table.show() t2 = g.pivot('A') t2.show() # +=====+==========+==========+==========+ # | C |Sum(B,A=1)|Sum(B,A=2)|Sum(B,A=3)| # | int | int | int | int | # |False| True | True | True | # +-----+----------+----------+----------+ # | 5| 4| None| None| # | 6| 2| None| None| # | 3| None| 8| None| # | 4| None| 6| None| # | 1| None| None| 12| # | 2| None| None| 10| # +=====+==========+==========+==========+ assert len(t2) == 6 and len(t2.columns) == 4
def test_lookup_functions(): # doing lookups is supported by indexing: table6 = Table() table6.add_column( 'A', str, data=['Alice', 'Bob', 'Bob', 'Ben', 'Charlie', 'Ben', 'Albert']) table6.add_column('B', str, data=[ 'Alison', 'Marley', 'Dylan', 'Affleck', 'Hepburn', 'Barnes', 'Einstein' ]) index = table6.index('A') # single key. assert index[('Bob', )] == {1, 2} index2 = table6.index('A', 'B') # multiple keys. assert index2[('Bob', 'Dylan')] == {2}
def test_filereader_utf8sig_encoding_csv(): path = Path(__file__).parent / "data" / 'utf8sig.csv' assert path.exists() table = list(file_reader(path, sep=','))[0] table.show(slice(0, 10)) table.show(slice(-15)) book1_csv = Table(filename=path.name) book1_csv.add_column('432', int) book1_csv.add_column('1', int) assert table.compare(book1_csv), table.compare(book1_csv) assert len(table) == 2, len(table)
def test_filereader_gdocsc1ods_no_header(): path = Path(__file__).parent / "data" / 'gdocs1.ods' assert path.exists() tables = file_reader(path, has_headers=False) sheet1 = Table(filename=path.name, sheet_name='Sheet1') for idx, _ in enumerate(list('abcdef'), 1): sheet1.add_column(f"_{idx}", str) sheet2 = Table(filename=path.name, sheet_name='Sheet2') for idx, _ in enumerate(list('abcdef'), 1): sheet2.add_column(f"_{idx}", str) sheets = [sheet1, sheet2] for sheet, table in zip(sheets, tables): table.show(slice(0, 10)) table.compare(sheet) assert len(table) == 46, table.show(blanks="")
def test_filereader_book1tsv(): path = Path(__file__).parent / "data" / 'book1.tsv' assert path.exists() table = list(file_reader(path))[0] table.show(slice(0, 10)) book1 = Table(filename=path.name) book1.add_column('a', int) for float_type in list('bcdef'): book1.add_column(float_type, float) assert table.compare(book1), table.compare(book1) assert len(table) == 45
def test_ttopi(): """ example code from the readme as "reversing a pivot tablite". """ from random import seed, choice seed(11) records = 9 t = Table() t.add_column('record id', int, allow_empty=False, data=[i for i in range(records)]) for column in [f"4.{i}.a" for i in range(5)]: t.add_column( column, str, allow_empty=True, data=[choice(['a', 'h', 'e', None]) for i in range(records)]) print("\nshowing raw data:") t.show() # +=====+=====+ # | A | B | # | str | str | # |False|False| # +-----+-----+ # |4.2.a|e | # |4.3.a|h | # |4.2.a|h | # |4.2.a|e | # |4.3.a|e | # |4.3.a|e | # |4.1.a|e | # |4.1.a|a | # |4.3.a|e | # |4.2.a|a | # |4.3.a|e | # |4.3.a|a | # |4.1.a|a | # |4.1.a|a | # |4.2.a|a | # |4.2.a|a | # |4.1.a|e | # |4.1.a|a | # |4.3.a|h | # |4.3.a|h | # |4.3.a|h | # |4.1.a|e | # +=====+=====+ # wanted output: # +=====+===== # | A | Count(A,B=a) | Count(A,B=h) | Count(A,B=e) | # | str | int | int | int | # |False| False | False | False | # +-----+--------------+--------------+--------------+ # |4.1.a| 3 | 0 | 3 | # |4.2.a| 3 | 1 | 2 | # |4.3.a| 1 | 4 | 4 | # +=====+==============+==============+==============+ reverse_pivot = Table() records = t['record id'] reverse_pivot.add_column('record id', records.datatype, allow_empty=False) reverse_pivot.add_column('4.x', str, allow_empty=False) reverse_pivot.add_column('ahe', str, allow_empty=True) for name in t.columns: if not name.startswith('4.'): continue column = t[name] for index, entry in enumerate(column): new_row = records[index], name, entry # record id, 4.x, ahe reverse_pivot.add_row(new_row) print("\nshowing reversed pivot of the raw data:") reverse_pivot.show() g = reverse_pivot.groupby(['4.x', 'ahe'], functions=[('ahe', GroupBy.count)]) print("\nshowing basic groupby of the reversed pivot") g.table.show() t2 = g.pivot('ahe') print("\nshowing the wanted output:") t2.show()
def test_lookup(): friends = Table() friends.add_column( "name", str, data=['Alice', 'Betty', 'Charlie', 'Dorethy', 'Edward', 'Fred']) friends.add_column("stop", str, data=[ 'Downtown-1', 'Downtown-2', 'Hillside View', 'Hillside Crescent', 'Downtown-2', 'Chicago' ]) friends.show() random.seed(11) table_size = 40 times = [ DataTypes.time(random.randint(21, 23), random.randint(0, 59)) for i in range(table_size) ] stops = [ 'Stadium', 'Hillside', 'Hillside View', 'Hillside Crescent', 'Downtown-1', 'Downtown-2', 'Central station' ] * 2 + [f'Random Road-{i}' for i in range(table_size)] route = [random.choice([1, 2, 3]) for i in stops] bustable = Table() bustable.add_column("time", DataTypes.time, data=times) bustable.add_column("stop", str, data=stops[:table_size]) bustable.add_column("route", int, data=route[:table_size]) bustable.sort(**{'time': False}) print("Departures from Concert Hall towards ...") bustable[:10].show() lookup_1 = friends.lookup(bustable, ('time', ">=", DataTypes.time(21, 10)), ('stop', "==", 'stop')) lookup_1.sort(**{'time': False}) lookup_1.show() expected = [ ('Fred', 'Chicago', None, None, None), ('Dorethy', 'Hillside Crescent', time(21, 5), 'Hillside Crescent', 2), ('Betty', 'Downtown-2', time(21, 51), 'Downtown-2', 1), ('Edward', 'Downtown-2', time(21, 51), 'Downtown-2', 1), ('Charlie', 'Hillside View', time(22, 19), 'Hillside View', 2), ('Alice', 'Downtown-1', time(23, 12), 'Downtown-1', 3), ] for row in lookup_1.rows: expected.remove(row) assert expected == []
def test_clipboard(): t = Table() t.add_column('dog', int, data=[1, 4]) t.add_column('cat', int, data=[2, 5]) t.add_column('hat', int, data=[3, 6]) t.show() try: t.copy_to_clipboard() t2 = t.copy_from_clipboard() assert t == t2 except pyperclip.PyperclipException: pass # travis VMs can't handle this.
def test_sortation(): # Sortation table7 = Table() table7.add_column('A', int, data=[1, None, 8, 3, 4, 6, 5, 7, 9], allow_empty=True) table7.add_column('B', int, data=[10, 100, 1, 1, 1, 1, 10, 10, 10]) table7.add_column('C', int, data=[0, 1, 0, 1, 0, 1, 0, 1, 0]) assert not table7.is_sorted() sort_order = {'B': False, 'C': False, 'A': False} table7.sort(**sort_order) assert list(table7.rows) == [(4, 1, 0), (8, 1, 0), (3, 1, 1), (6, 1, 1), (1, 10, 0), (5, 10, 0), (9, 10, 0), (7, 10, 1), (None, 100, 1)] assert list(table7.filter('A', 'B', slice(4, 8))) == [(1, 10), (5, 10), (9, 10), (7, 10)] assert table7.is_sorted(**sort_order)
def test_sql_joins(): # a couple of examples with SQL join: numbers = Table() numbers.add_column('number', int, allow_empty=True, data=[1, 2, 3, 4, None]) numbers.add_column('colour', str, data=['black', 'blue', 'white', 'white', 'blue']) letters = Table() letters.add_column('letter', str, allow_empty=True, data=['a', 'b', 'c', 'd', None]) letters.add_column('color', str, data=['blue', 'white', 'orange', 'white', 'blue']) # left join # SELECT number, letter FROM numbers LEFT JOIN letters ON numbers.colour == letters.color left_join = numbers.left_join(letters, left_keys=['colour'], right_keys=['color'], columns=['number', 'letter']) left_join.show() # +======+======+ # |number|letter| # | int | str | # | True | True | # +------+------+ # | 1|None | # | 2|a | # | 2|None | # | 3|b | # | 3|d | # | 4|b | # | 4|d | # |None |a | # |None |None | # +======+======+ assert [i for i in left_join['number']] == [1, 2, 2, 3, 3, 4, 4, None, None] assert [i for i in left_join['letter'] ] == [None, 'a', None, 'b', 'd', 'b', 'd', 'a', None] # inner join # SELECT number, letter FROM numbers JOIN letters ON numbers.colour == letters.color inner_join = numbers.inner_join(letters, left_keys=['colour'], right_keys=['color'], columns=['number', 'letter']) inner_join.show() # +======+======+ # |number|letter| # | int | str | # | True | True | # +------+------+ # | 2|a | # | 2|None | # |None |a | # |None |None | # | 3|b | # | 3|d | # | 4|b | # | 4|d | # +======+======+ assert [i for i in inner_join['number']] == [2, 2, None, None, 3, 3, 4, 4] assert [i for i in inner_join['letter'] ] == ['a', None, 'a', None, 'b', 'd', 'b', 'd'] # outer join # SELECT number, letter FROM numbers OUTER JOIN letters ON numbers.colour == letters.color outer_join = numbers.outer_join(letters, left_keys=['colour'], right_keys=['color'], columns=['number', 'letter']) outer_join.show() # +======+======+ # |number|letter| # | int | str | # | True | True | # +------+------+ # | 1|None | # | 2|a | # | 2|None | # | 3|b | # | 3|d | # | 4|b | # | 4|d | # |None |a | # |None |None | # |None |c | # +======+======+ assert [i for i in outer_join['number'] ] == [1, 2, 2, 3, 3, 4, 4, None, None, None] assert [i for i in outer_join['letter'] ] == [None, 'a', None, 'b', 'd', 'b', 'd', 'a', None, 'c'] assert left_join != inner_join assert inner_join != outer_join assert left_join != outer_join
def test_basic_table(): # creating a tablite incrementally is straight forward: table = Table() table.add_column('A', int, False) assert 'A' in table table.add_column('B', str, allow_empty=False) assert 'B' in table # appending rows is easy: table.add_row((1, 'hello')) table.add_row((2, 'world')) # converting to and from json is easy: table_as_json = table.to_json() table2 = Table.from_json(table_as_json) zipped = zlib.compress(table_as_json.encode()) a, b = len(zipped), len(table_as_json) print("zipping reduces to", a, "from", b, "bytes, e.g.", round(100 * a / b, 0), "% of original") # copying is easy: table3 = table.copy() table.rename_column('A', 'aa') assert list(table.columns) == ['aa', 'B'] table.rename_column('aa', 'A') assert list(table.columns) == ['A', 'B'] # and checking for headers is simple: assert 'A' in table assert 'Z' not in table # comparisons are straight forward: assert table == table2 == table3 # even if you only want to check metadata: table.compare(table3) # will raise exception if they're different. # append is easy as + also work: table3x2 = table3 + table3 assert len(table3x2) == len(table3) * 2 # and so does += table3x2 += table3 assert len(table3x2) == len(table3) * 3 # type verification is included: try: table.columns['A'][0] = 'Hallo' assert False, "A TypeError should have been raised." except TypeError: assert True # updating values is familiar to any user who likes a list: assert 'A' in table.columns assert isinstance(table.columns['A'], (StoredList, list)) last_row = -1 table['A'][last_row] = 44 table['B'][last_row] = "Hallo" assert table != table2 # if you try to loop and forget the direction, Table will tell you try: for row in table: # wont pass assert False, "not possible. Use for row in tablite.rows or for column in tablite.columns" except AttributeError: assert True _ = [table2.add_row(row) for row in table.rows] before = [r for r in table2.rows] assert before == [(1, 'hello'), (2, 'world'), (1, 'hello'), (44, 'Hallo')] # as is filtering for ALL that match: filter_1 = lambda x: 'llo' in x filter_2 = lambda x: x > 3 after = table2.all(**{'B': filter_1, 'A': filter_2}) assert list(after.rows) == [(44, 'Hallo')] # as is filtering or for ANY that match: after = table2.any(**{'B': filter_1, 'A': filter_2}) assert list(after.rows) == [(1, 'hello'), (1, 'hello'), (44, 'Hallo')] # Imagine a tablite with columns a,b,c,d,e (all integers) like this: t = Table() for c in 'abcde': t.add_column(header=c, datatype=int, allow_empty=False, data=[i for i in range(5)]) # we want to add two new columns using the functions: def f1(a, b, c): return a + b + c + 1 def f2(b, c, d): return b * c * d # and we want to compute two new columns 'f' and 'g': t.add_column(header='f', datatype=int, allow_empty=False) t.add_column(header='g', datatype=int, allow_empty=True) # we can now use the filter, to iterate over the tablite: for row in t.filter('a', 'b', 'c', 'd'): a, b, c, d = row # ... and add the values to the two new columns t['f'].append(f1(a, b, c)) t['g'].append(f2(b, c, d)) assert len(t) == 5 assert list(t.columns) == list('abcdefg') t.show() # slicing is easy: table_chunk = table2[2:4] assert isinstance(table_chunk, Table) # we will handle duplicate names gracefully. table2.add_column('B', int, allow_empty=True) assert set(table2.columns) == {'A', 'B', 'B_1'} # you can delete a column as key... del table2['B_1'] assert set(table2.columns) == {'A', 'B'} # adding a computed column is easy: table.add_column('new column', str, allow_empty=False, data=[f"{r}" for r in table.rows]) # part of or the whole tablite is easy: table.show() table.show('A', slice(0, 1)) # updating a column with a function is easy: f = lambda x: x * 10 table['A'] = [f(r) for r in table['A']] # using regular indexing will also work. for ix, r in enumerate(table['A']): table['A'][ix] = r * 10 # and it will tell you if you're not allowed: try: f = lambda x: f"'{x} as text'" table['A'] = [f(r) for r in table['A']] assert False, "The line above must raise a TypeError" except TypeError as error: print("The error is:", str(error)) # works with all datatypes: now = datetime.now() table4 = Table() table4.add_column('A', int, allow_empty=False, data=[-1, 1]) table4.add_column('A', int, allow_empty=True, data=[None, 1]) # None! table4.add_column('A', float, False, data=[-1.1, 1.1]) table4.add_column( 'A', str, False, data=["", "1"]) # Empty string is not a None, when dtype is str! table4.add_column( 'A', str, True, data=[None, "1"]) # Empty string is not a None, when dtype is str! table4.add_column('A', bool, False, data=[False, True]) table4.add_column('A', datetime, False, data=[now, now]) table4.add_column('A', date, False, data=[now.date(), now.date()]) table4.add_column('A', time, False, data=[now.time(), now.time()]) table4_json = table4.to_json() table5 = Table.from_json(table4_json) # .. to json and back. assert table4 == table5 # And finally: I can add metadata: table5.metadata['db_mapping'] = { 'A': 'customers.customer_name', 'A_2': 'product.sku', 'A_4': 'locations.sender' } # which also jsonifies without fuzz. table5_json = table5.to_json() table5_from_json = Table.from_json(table5_json) assert table5 == table5_from_json
def test_filereader_win1251_encoding_csv(): path = Path(__file__).parent / "data" / 'win1250_test.csv' assert path.exists() table = list(file_reader(path, sep=';'))[0] table.show(slice(0, 10)) table.show(slice(-15)) book1_csv = Table(filename=path.name) book1_csv.add_column('Item', int) book1_csv.add_column('Materiál', str) book1_csv.add_column('Objem', float) book1_csv.add_column('Jednotka objemu', str) book1_csv.add_column('Free Inv Pcs', int) assert table.compare(book1_csv), table.compare(book1_csv) assert len(table) == 99, len(table)
def test_add_rows(): """ Example from docstring. """ t = Table() t.add_column('test', int) t.add_column('A', int) t.add_column('B', int) t.add_column('C', int) test_number = count(1) # The following examples are all valid and append the row (1,2,3) to the tablite. t.add_row(next(test_number), 1, 2, 3) t.add_row([next(test_number), 1, 2, 3]) t.add_row((next(test_number), 1, 2, 3)) t.add_row(*(next(test_number), 1, 2, 3)) t.add_row(test=next(test_number), A=1, B=2, C=3) t.add_row(**{'test': next(test_number), 'A': 1, 'B': 2, 'C': 3}) # The following examples add two rows to the tablite t.add_row((next(test_number), 1, 2, 3), (next(test_number), 4, 5, 6)) t.add_row([next(test_number), 1, 2, 3], [next(test_number), 4, 5, 6]) t.add_row({ 'test': next(test_number), 'A': 1, 'B': 2, 'C': 3 }, { 'test': next(test_number), 'A': 4, 'B': 5, 'C': 6 }) # two (or more) dicts as args. t.add_row(*[{ 'test': next(test_number), 'A': 1, 'B': 2, 'C': 3 }, { 'test': next(test_number), 'A': 1, 'B': 2, 'C': 3 }]) # list of dicts. t.show() # expects: # +=====+=====+=====+=====+ # | test| A | B | C | # | int | int | int | int | # |False|False|False|False| # +-----+-----+-----+-----+ # | 1| 1| 2| 3| # | 2| 1| 2| 3| # | 3| 1| 2| 3| # | 4| 1| 2| 3| # | 5| 1| 2| 3| # | 6| 1| 2| 3| # | 7| 1| 2| 3| # | 8| 4| 5| 6| # | 9| 1| 2| 3| # | 10| 4| 5| 6| # | 11| 1| 2| 3| # | 12| 4| 5| 6| # | 13| 1| 2| 3| # | 14| 1| 2| 3| # +=====+=====+=====+=====+ assert next(test_number) == 14 + 1
def test_filereader_saptxt(): path = Path(__file__).parent / "data" / 'sap.txt' assert path.exists() # test part 1: split using user defined sequence. header = " | Delivery | Item|Pl.GI date|Route |SC|Ship-to |SOrg.|Delivery quantity|SU| TO Number|Material |Dest.act.qty.|BUn|Typ|Source Bin|Cty" split_sequence = ["|"] * header.count('|') table = list(file_reader(path, split_sequence=split_sequence))[0] table.show(slice(5)) sap_sample = Table(filename=path.name) sap_sample.add_column('None', str, True) sap_sample.add_column('Delivery', int, False) sap_sample.add_column('Item', int, False) sap_sample.add_column('Pl.GI date', date, False) sap_sample.add_column('Route', str, False) sap_sample.add_column('SC', str, False) sap_sample.add_column('Ship-to', str, False) sap_sample.add_column('SOrg.', str, False) sap_sample.add_column('Delivery quantity', int, False) sap_sample.add_column('SU', str, False) sap_sample.add_column('TO Number', int, False) sap_sample.add_column('Material', str, False) sap_sample.add_column('Dest.act.qty.', int, False) sap_sample.add_column('BUn', str, False) sap_sample.add_column('Typ', str, False) sap_sample.add_column('Source Bin', str, False) sap_sample.add_column('Cty|', str, False) assert table.compare(sap_sample) assert len(table) == 20, len(table)
def test_groupby(): t = Table() for c in 'abcde': t.add_column(header=c, datatype=int, allow_empty=False, data=[i for i in range(5)]) # we want to add two new columns using the functions: def f1(a, b, c): return a + b + c + 1 def f2(b, c, d): return b * c * d # and we want to compute two new columns 'f' and 'g': t.add_column(header='f', datatype=int, allow_empty=False) t.add_column(header='g', datatype=int, allow_empty=True) # we can now use the filter, to iterate over the tablite: for row in t.filter('a', 'b', 'c', 'd'): a, b, c, d = row # ... and add the values to the two new columns t['f'].append(f1(a, b, c)) t['g'].append(f2(b, c, d)) assert len(t) == 5 assert list(t.columns) == list('abcdefg') t.show() g = GroupBy(keys=['a', 'b'], functions=[('f', gb.max), ('f', gb.min), ('f', gb.sum), ('f', gb.first), ('f', gb.last), ('f', gb.count), ('f', gb.count_unique), ('f', gb.avg), ('f', gb.stdev), ('a', gb.stdev), ('f', gb.median), ('f', gb.mode), ('g', gb.median)]) t2 = t + t assert len(t2) == 2 * len(t) t2.show() g += t2 assert list(g.rows) == [ (0, 0, 1, 1, 2, 1, 1, 2, 1, 1.0, 0.0, 0.0, 1, 1, 0), (1, 1, 4, 4, 8, 4, 4, 2, 1, 4.0, 0.0, 0.0, 4, 4, 1), (2, 2, 7, 7, 14, 7, 7, 2, 1, 7.0, 0.0, 0.0, 7, 7, 8), (3, 3, 10, 10, 20, 10, 10, 2, 1, 10.0, 0.0, 0.0, 10, 10, 27), (4, 4, 13, 13, 26, 13, 13, 2, 1, 13.0, 0.0, 0.0, 13, 13, 64) ] g.table.show() g2 = GroupBy(keys=['a', 'b'], functions=[('f', gb.max), ('f', g.sum)]) g2 += t + t + t g2.table.show() pivot_table = g2.pivot('b') pivot_table.show()
def test_filereader_123csv(): csv_file = Path(__file__).parent / "data" / "123.csv" table7 = Table(filename=csv_file.name) table7.metadata['filename'] = '123.csv' table7.add_column('A', int, data=[1, None, 8, 3, 4, 6, 5, 7, 9], allow_empty=True) table7.add_column('B', int, data=[10, 100, 1, 1, 1, 1, 10, 10, 10]) table7.add_column('C', int, data=[0, 1, 0, 1, 0, 1, 0, 1, 0]) sort_order = {'B': False, 'C': False, 'A': False} table7.sort(**sort_order) headers = ", ".join([c for c in table7.columns]) data = [headers] for row in table7.rows: data.append(", ".join(str(v) for v in row)) s = "\n".join(data) print(s) csv_file.write_text(s) # write tr_table = list(file_reader(csv_file))[0] # read csv_file.unlink() # cleanup tr_table.show() find_format(tr_table) assert tr_table == table7
def test_recreate_readme_comparison(): try: import os import psutil except ImportError: return process = psutil.Process(os.getpid()) baseline_memory = process.memory_info().rss from time import process_time from tablite import Table digits = 1_000_000 records = Table() records.add_column('method', str) records.add_column('memory', int) records.add_column('time', float) records.add_row(('python', baseline_memory, 0.0)) # Let's now use the common and convenient "row" based format: start = process_time() L = [] for _ in range(digits): L.append(tuple([11 for _ in range(10)])) end = process_time() # go and check taskmanagers memory usage. # At this point we're using ~154.2 Mb to store 1 million lists with 10 items. records.add_row( ('1e6 lists w. 10 integers', process.memory_info().rss - baseline_memory, round(end - start, 4))) L.clear() # Let's now use a columnar format instead: start = process_time() L = [[11 for i in range(digits)] for _ in range(10)] end = process_time() # go and check taskmanagers memory usage. # at this point we're using ~98.2 Mb to store 10 lists with 1 million items. records.add_row( ('10 lists with 1e6 integers', process.memory_info().rss - baseline_memory, round(end - start, 4))) L.clear() # We've thereby saved 50 Mb by avoiding the overhead from managing 1 million lists. # Q: But why didn't I just use an array? It would have even lower memory footprint. # A: First, array's don't handle None's and we get that frequently in dirty csv data. # Second, Table needs even less memory. # Let's start with an array: import array start = process_time() L = [array.array('i', [11 for _ in range(digits)]) for _ in range(10)] end = process_time() # go and check taskmanagers memory usage. # at this point we're using 60.0 Mb to store 10 lists with 1 million integers. records.add_row( ('10 lists with 1e6 integers in arrays', process.memory_info().rss - baseline_memory, round(end - start, 4))) L.clear() # Now let's use Table: start = process_time() t = Table() for i in range(10): t.add_column(str(i), int, allow_empty=False, data=[11 for _ in range(digits)]) end = process_time() records.add_row( ('Table with 10 columns with 1e6 integers', process.memory_info().rss - baseline_memory, round(end - start, 4))) # go and check taskmanagers memory usage. # At this point we're using 97.5 Mb to store 10 columns with 1 million integers. # Next we'll use the api `use_stored_lists` to drop to disk: start = process_time() t.use_disk = True end = process_time() records.add_row( ('Table on disk with 10 columns with 1e6 integers', process.memory_info().rss - baseline_memory, round(end - start, 4))) # go and check taskmanagers memory usage. # At this point we're using 24.5 Mb to store 10 columns with 1 million integers. # Only the metadata remains in pythons memory. records.show()