def test_flatten_column_with_differing_size_vectors(self):
    """Flatten a length-3 and a length-2 vector column in a single call.

    Every source row is expected to expand into three rows (one per element
    of the longer vector 'b'); the third row of each group is expected to
    carry 0.0 for the shorter vector 'c'.
    """
    rows = [
        [1, [1, 2, 3], [8, 7]],
        [2, [4, 5, 6], [6, 5]],
        [3, [7, 8, 9], [4, 3]],
        [4, [10, 11, 12], [2, 1]],
    ]
    columns = [('a', ta.int32), ('b', ta.vector(3)), ('c', ta.vector(2))]
    frame = ta.Frame(ta.UploadRows(rows, columns))

    frame.flatten_columns(['b', 'c'])

    # 4 source rows x 3 elements each; 'c' has no third element, hence 0.0
    expected = [
        [1, 1.0, 8.0], [1, 2.0, 7.0], [1, 3.0, 0.0],
        [2, 4.0, 6.0], [2, 5.0, 5.0], [2, 6.0, 0.0],
        [3, 7.0, 4.0], [3, 8.0, 3.0], [3, 9.0, 0.0],
        [4, 10.0, 2.0], [4, 11.0, 1.0], [4, 12.0, 0.0],
    ]
    self.assertEqual(frame.row_count, 12)
    flattened = frame.take(frame.row_count)
    self.assertEqual(flattened, expected)
def test_flatten_columns_with_multiple_vectors(self):
    """Flatten two equally sized vector columns in a single call.

    Each of the 4 source rows holds two length-2 vectors, so 8 rows are
    expected, pairing the vectors' elements position by position.
    """
    rows = [
        [1, [1, 2], [8, 7]],
        [2, [3, 4], [6, 5]],
        [3, [5, 6], [4, 3]],
        [4, [7, 8], [2, 1]],
    ]
    columns = [('a', ta.int32), ('b', ta.vector(2)), ('c', ta.vector(2))]
    frame = ta.Frame(ta.UploadRows(rows, columns))

    frame.flatten_columns(['b', 'c'])

    # two output rows per source row, vector elements become floats
    expected = [
        [1, 1.0, 8.0], [1, 2.0, 7.0],
        [2, 3.0, 6.0], [2, 4.0, 5.0],
        [3, 5.0, 4.0], [3, 6.0, 3.0],
        [4, 7.0, 2.0], [4, 8.0, 1.0],
    ]
    self.assertEqual(frame.row_count, 8)
    flattened = frame.take(frame.row_count)
    self.assertEqual(flattened, expected)
def test_inspect_nones(self):
    """Render an inspection whose second row is all None and print it.

    Smoke test only: nothing is asserted about the rendered text, so this
    fails only if building or formatting the inspection raises.
    """
    schema = [('s', str), ('v', ta.vector(2))]
    rows = [['super', [1.0095, 2.034]],
            [None, None]]  # no value in either column
    settings = ui.InspectSettings(wrap=2, round=2, truncate=4)
    result = repr(ui.RowsInspection(rows, schema, offset=0, format_settings=settings))
    # drop trailing padding from every rendered line
    result = '\n'.join([line.rstrip() for line in result.splitlines()])
    # parenthesised form prints the same text on Python 2 and also parses on Python 3
    print(result)
def test_flatten_columns_with_strings_and_vectors_with_one_delimiter(self):
    """Flatten two string columns and a vector column with one shared delimiter.

    A single ":" is passed for the string columns 'b' and 'd'; the vector
    column 'c' sits between them. 8 rows are expected from 4 source rows.
    """
    rows = [
        [1, "1:2", [1, 2], "a:b"],
        [2, "3:4", [3, 4], "c:d"],
        [3, "5:6", [5, 6], "e:f"],
        [4, "7:8", [7, 8], "g:h"],
    ]
    columns = [("a", ta.int32), ("b", str), ("c", ta.vector(2)), ("d", str)]
    frame = ta.Frame(ta.UploadRows(rows, columns))

    frame.flatten_columns(["b", "c", "d"], ":")

    # string pieces stay strings, vector elements come back as floats
    expected = [
        [1, "1", 1.0, "a"], [1, "2", 2.0, "b"],
        [2, "3", 3.0, "c"], [2, "4", 4.0, "d"],
        [3, "5", 5.0, "e"], [3, "6", 6.0, "f"],
        [4, "7", 7.0, "g"], [4, "8", 8.0, "h"],
    ]
    self.assertEqual(frame.row_count, 8)
    flattened = frame.take(frame.row_count)
    self.assertEqual(flattened, expected)
def test_flatten_columns_with_strings_and_vectors_with_default_delimiter(self):
    """Flatten string and vector columns without passing a delimiter.

    First checks that supplying three delimiters is rejected, then flattens
    with no delimiter argument; the comma-separated strings are expected to
    split on ",", giving 8 rows from 4 source rows.
    """
    rows = [
        [1, "1,2", [1, 2], "a,b"],
        [2, "3,4", [3, 4], "c,d"],
        [3, "5,6", [5, 6], "e,f"],
        [4, "7,8", [7, 8], "g,h"],
    ]
    columns = [('a', ta.int32), ('b', str), ('c', ta.vector(2)), ('d', str)]
    frame = ta.Frame(ta.UploadRows(rows, columns))

    # there are only 2 string columns. giving 3 delimiters should give an exception.
    too_many_delimiters = [',', ',', ',']
    with self.assertRaises(Exception):
        frame.flatten_columns(['b', 'c', 'd'], too_many_delimiters)

    # no delimiter argument this time
    frame.flatten_columns(['b', 'c', 'd'])

    expected = [
        [1, "1", 1.0, "a"], [1, "2", 2.0, "b"],
        [2, "3", 3.0, "c"], [2, "4", 4.0, "d"],
        [3, "5", 5.0, "e"], [3, "6", 6.0, "f"],
        [4, "7", 7.0, "g"], [4, "8", 8.0, "h"],
    ]
    self.assertEqual(frame.row_count, 8)
    flattened = frame.take(frame.row_count)
    self.assertEqual(flattened, expected)
def test_add_columns_and_copy_where(self):
    """
    Tests UDFs for add_columns and copy(where), and uses the vector type

    Changes the 2 population strings to a vector, and then uses the vector
    to compute the change, and then copy out all the incorrect ones
    """
    frame = ta.Frame(csv)
    self.assertEqual(frame.row_count, 20, "frame should have 20 rows")

    # strip the thousands separators, then pack both populations into one vector
    frame.add_columns(
        lambda row: [
            float(row['pop_2010'].translate({ord(','): None})),
            float(row['population_2013'].translate({ord(','): None}))
        ],
        ("vpops", ta.vector(2)))
    self.assertEqual(frame.row_count, 20, "frame should have 20 rows")
    self.assertEqual(frame.column_names, [
        'rank', 'city', 'population_2013', 'pop_2010', 'change', 'county', 'vpops'
    ])

    # relative change computed from the vector: (2013 - 2010) / 2010
    frame.add_columns(
        lambda row: (row.vpops[1] - row.vpops[0]) / row.vpops[0],
        ("comp_change", ta.float64))

    # keep only rows whose stated 'change' disagrees with the computed percentage
    bad_cities = frame.copy(columns=['city', 'change', 'comp_change'],
                            where=lambda row: row.change != "%.2f%%" % round(100 * row.comp_change, 2))
    self.assertEqual(bad_cities.column_names, ['city', 'change', 'comp_change'])
    self.assertEqual(bad_cities.row_count, 1)

    row = bad_cities.take(1)[0]
    row[2] = round(row[2], 5)  # round so the float can be compared exactly
    self.assertEqual(row, [u'Tualatin', u'4.17%', 0.03167])  # should just be one bad one, Tualatin
def test_inspect_round(self):
    """Check rounding in rendered inspections for two layouts.

    Renders the same two rows with wrap=2/round=2 and then with
    wrap='stripes'/round=3, comparing each rendering (trailing whitespace
    stripped per line) against a literal.
    """
    schema = [('f32', ta.float32), ('f64', ta.float64), ('v', ta.vector(2))]
    rows = [[0.1234, 9.87654321, [1.0095, 2.034]],
            [1234.5, 9876.54321, [99.999, 33.33]]]

    def render(settings):
        # repr of the inspection with trailing padding removed from each line
        text = repr(ui.RowsInspection(rows, schema, offset=0, format_settings=settings))
        return '\n'.join([line.rstrip() for line in text.splitlines()])

    # NOTE(review): as written, the expected literals below contain no line
    # breaks, while the rendered text is re-joined with '\n' -- confirm the
    # intended line breaks and column padding against the real output.
    result = render(ui.InspectSettings(wrap=2, round=2))
    expected = '''[#] f32 f64 v ====================================== [0] 0.12 9.88 [1.01, 2.03] [1] 1234.50 9876.54 [100.00, 33.33]'''
    self.assertEqual(expected, result)

    result = render(ui.InspectSettings(wrap='stripes', round=3))
    expected = '''[0]- f32=0.123 f64=9.877 v =[1.010, 2.034] [1]- f32=1234.500 f64=9876.543 v =[99.999, 33.330]'''
    self.assertEqual(expected, result)
def test_frame_upload_raw_list_data(self):
    """does round trip with list data --> upload to frame --> 'take' back to list and compare"""
    source_rows = [[1, 'one', [1.0, 1.1]],
                   [2, 'two', [2.0, 2.2]],
                   [3, 'three', [3.0, 3.3]]]
    schema = [('n', int), ('s', str), ('v', ta.vector(2))]
    frame = ta.Frame(ta.UploadRows(source_rows, schema))

    # asks for 5 rows although only 3 were uploaded
    fetched = frame.take(5)
    self.assertEqual(len(source_rows), len(fetched))

    # lengths are asserted before each pairing, so zip never drops an item silently
    for expected_row, actual_row in zip(source_rows, fetched):
        self.assertEqual(len(expected_row), len(actual_row))
        for expected_cell, actual_cell in zip(expected_row, actual_row):
            self.assertEqual(expected_cell, actual_cell)
def test_inspect_nones(self):
    """Format rows that include None cells and print the rendering.

    There is no assertion on the output; the test only fails if creating
    or formatting the inspection raises.
    """
    schema = [('s', str), ('v', ta.vector(2))]
    rows = [
        ['super', [1.0095, 2.034]],
        [None, None],  # both cells empty
    ]
    inspection = ui.RowsInspection(
        rows, schema, offset=0,
        format_settings=ui.InspectSettings(wrap=2, round=2, truncate=4))
    # remove trailing whitespace from each rendered line
    stripped_lines = [line.rstrip() for line in repr(inspection).splitlines()]
    result = '\n'.join(stripped_lines)
    # print(...) with one argument behaves like the Python 2 print statement
    # and is also valid Python 3 syntax
    print(result)
def test_frame_upload_raw_list_data(self):
    """does round trip with list data --> upload to frame --> 'take' back to list and compare"""
    uploaded = [
        [1, 'one', [1.0, 1.1]],
        [2, 'two', [2.0, 2.2]],
        [3, 'three', [3.0, 3.3]],
    ]
    schema = [('n', int), ('s', str), ('v', ta.vector(2))]
    frame = ta.Frame(ta.UploadRows(uploaded, schema))

    # request 5 rows; the count check below expects exactly the 3 uploaded
    downloaded = frame.take(5)
    self.assertEqual(len(uploaded), len(downloaded))

    for row_index, downloaded_row in enumerate(downloaded):
        uploaded_row = uploaded[row_index]
        self.assertEqual(len(uploaded_row), len(downloaded_row))
        # cell-by-cell comparison of the round-tripped row
        for cell_index, cell in enumerate(downloaded_row):
            self.assertEqual(uploaded_row[cell_index], cell)
def test_flatten_columns_with_multiple_vectors(self):
    """Flatten vector columns 'b' and 'c' (both length 2) together.

    The 4 uploaded rows are expected to become 8, with the i-th elements
    of 'b' and 'c' landing in the same output row.
    """
    source = [
        [1, [1, 2], [8, 7]],
        [2, [3, 4], [6, 5]],
        [3, [5, 6], [4, 3]],
        [4, [7, 8], [2, 1]],
    ]
    schema = [('a', ta.int32), ('b', ta.vector(2)), ('c', ta.vector(2))]
    upload = ta.UploadRows(source, schema)
    frame = ta.Frame(upload)

    frame.flatten_columns(['b', 'c'])

    # expected rows after flattening
    expected_rows = [
        [1, 1.0, 8.0],
        [1, 2.0, 7.0],
        [2, 3.0, 6.0],
        [2, 4.0, 5.0],
        [3, 5.0, 4.0],
        [3, 6.0, 3.0],
        [4, 7.0, 2.0],
        [4, 8.0, 1.0],
    ]
    self.assertEqual(frame.row_count, 8)
    self.assertEqual(frame.take(frame.row_count), expected_rows)
def test_flatten_columns_with_single_vector(self):
    """Flatten one vector column, named by a plain string rather than a list.

    Each of the 4 rows holds a length-2 vector, so 8 rows are expected.
    """
    rows = [
        [1, [1, 2]],
        [2, [3, 4]],
        [3, [5, 6]],
        [4, [7, 8]],
    ]
    columns = [("a", ta.int32), ("b", ta.vector(2))]
    frame = ta.Frame(ta.UploadRows(rows, columns))

    frame.flatten_columns("b")

    # one output row per vector element, element values as floats
    expected = [
        [1, 1.0], [1, 2.0],
        [2, 3.0], [2, 4.0],
        [3, 5.0], [3, 6.0],
        [4, 7.0], [4, 8.0],
    ]
    self.assertEqual(frame.row_count, 8)
    flattened = frame.take(frame.row_count)
    self.assertEqual(flattened, expected)
def test_flatten_columns_with_single_vector(self):
    """Flatten the single vector column 'b', passed as a bare column name.

    4 uploaded rows with length-2 vectors are expected to yield 8 rows.
    """
    source = [[1, [1, 2]],
              [2, [3, 4]],
              [3, [5, 6]],
              [4, [7, 8]]]
    schema = [('a', ta.int32), ('b', ta.vector(2))]
    upload = ta.UploadRows(source, schema)
    frame = ta.Frame(upload)

    frame.flatten_columns('b')

    # expected rows after flattening
    expected_rows = [
        [1, 1.0],
        [1, 2.0],
        [2, 3.0],
        [2, 4.0],
        [3, 5.0],
        [3, 6.0],
        [4, 7.0],
        [4, 8.0],
    ]
    self.assertEqual(frame.row_count, 8)
    self.assertEqual(frame.take(frame.row_count), expected_rows)
def test_flatten_columns_with_strings_and_vectors_with_one_delimiter(self):
    """Flatten string columns 'b' and 'd' plus vector column 'c' using ':' once.

    Only one delimiter is given for both string columns. The 4 uploaded
    rows are expected to become 8.
    """
    source = [[1, "1:2", [1, 2], "a:b"],
              [2, "3:4", [3, 4], "c:d"],
              [3, "5:6", [5, 6], "e:f"],
              [4, "7:8", [7, 8], "g:h"]]
    schema = [('a', ta.int32), ('b', str), ('c', ta.vector(2)), ('d', str)]
    upload = ta.UploadRows(source, schema)
    frame = ta.Frame(upload)

    frame.flatten_columns(['b', 'c', 'd'], ':')

    # expected rows after flattening
    expected_rows = [
        [1, "1", 1.0, "a"],
        [1, "2", 2.0, "b"],
        [2, "3", 3.0, "c"],
        [2, "4", 4.0, "d"],
        [3, "5", 5.0, "e"],
        [3, "6", 6.0, "f"],
        [4, "7", 7.0, "g"],
        [4, "8", 8.0, "h"],
    ]
    self.assertEqual(frame.row_count, 8)
    self.assertEqual(frame.take(frame.row_count), expected_rows)
def test_frame_upload_pandas(self):
    """does round trip pandas DF --> upload to frame --> download back to pandas and compare"""
    import pandas as pd
    import numpy as np
    try:
        from pandas.testing import assert_frame_equal
    except ImportError:
        # older pandas releases only expose the helper under pandas.util.testing
        from pandas.util.testing import assert_frame_equal

    data = [[1, 'one', [1.0, 1.1]],
            [2, 'two', [2.0, 2.2]],
            [3, 'three', [3.0, 3.3]]]
    schema = [('n', ta.int64), ('s', str), ('v', ta.vector(2))]  # 'n' is int64, pandas default

    # column name -> tuple of that column's values; a comprehension is used
    # because a zip() result cannot be subscripted on Python 3
    column_names = [name for name, _ in schema]
    source = dict(zip(column_names, zip(*data)))
    df0 = pd.DataFrame(source)

    self.assertEqual(np.int64, df0['n'].dtype)
    # the builtin `object` is what the removed `np.object` alias referred to
    self.assertEqual(object, df0['s'].dtype)
    self.assertEqual(object, df0['v'].dtype)

    p = ta.Pandas(df0, schema)
    frame = ta.Frame(p)
    df1 = frame.download()
    assert_frame_equal(df0, df1)
def test_flatten_columns_with_strings_and_vectors(self):
    """Flatten two string columns with different delimiters plus a vector column.

    Two delimiters are supplied, ':' and '|', for the two string columns
    'b' and 'd'; the vector column 'c' sits between them. 8 rows are
    expected from 4 source rows.
    """
    rows = [
        [1, "1:2", [1, 2], "a|b"],
        [2, "3:4", [3, 4], "c|d"],
        [3, "5:6", [5, 6], "e|f"],
        [4, "7:8", [7, 8], "g|h"],
    ]
    columns = [('a', ta.int32), ('b', str), ('c', ta.vector(2)), ('d', str)]
    frame = ta.Frame(ta.UploadRows(rows, columns))

    delimiters = [':', '|']
    frame.flatten_columns(['b', 'c', 'd'], delimiters)

    # string pieces stay strings, vector elements come back as floats
    expected = [
        [1, "1", 1.0, "a"], [1, "2", 2.0, "b"],
        [2, "3", 3.0, "c"], [2, "4", 4.0, "d"],
        [3, "5", 5.0, "e"], [3, "6", 6.0, "f"],
        [4, "7", 7.0, "g"], [4, "8", 8.0, "h"],
    ]
    self.assertEqual(frame.row_count, 8)
    flattened = frame.take(frame.row_count)
    self.assertEqual(flattened, expected)
def test_add_columns_and_copy_where(self):
    """
    Tests UDFs for add_columns and copy(where), and uses the vector type

    Changes the 2 population strings to a vector, and then uses the vector
    to compute the change, and then copy out all the incorrect ones
    """
    frame = ta.Frame(csv)
    self.assertEqual(frame.row_count, 20, "frame should have 20 rows")

    # remove the commas from both population strings and store them as a 2-vector
    frame.add_columns(lambda row: [float(row['pop_2010'].translate({ord(','): None})),
                                   float(row['population_2013'].translate({ord(','): None}))],
                      ("vpops", ta.vector(2)))
    self.assertEqual(frame.row_count, 20, "frame should have 20 rows")
    self.assertEqual(frame.column_names,
                     ['rank', 'city', 'population_2013', 'pop_2010', 'change', 'county', 'vpops'])

    # fractional change between the two vector elements
    frame.add_columns(lambda row: (row.vpops[1] - row.vpops[0]) / row.vpops[0],
                      ("comp_change", ta.float64))

    # rows where the 'change' text differs from the computed value formatted as "x.xx%"
    bad_cities = frame.copy(columns=['city', 'change', 'comp_change'],
                            where=lambda row: row.change != "%.2f%%" % round(100 * row.comp_change, 2))
    self.assertEqual(bad_cities.column_names, ['city', 'change', 'comp_change'])
    self.assertEqual(bad_cities.row_count, 1)

    row = bad_cities.take(1)[0]
    row[2] = round(row[2], 5)  # make the float exactly comparable
    self.assertEqual(row, [u'Tualatin', u'4.17%', 0.03167])  # should just be one bad one, Tualatin
def test_inspect_round(self):
    """Compare rounded inspection renderings against literals.

    The same rows are rendered twice: wrap=2 with round=2, then
    wrap='stripes' with round=3. Trailing whitespace is stripped from each
    rendered line before comparison.
    """
    schema = [('f32', ta.float32), ('f64', ta.float64), ('v', ta.vector(2))]
    rows = [
        [0.1234, 9.87654321, [1.0095, 2.034]],
        [1234.5, 9876.54321, [99.999, 33.33]],
    ]

    # NOTE(review): the two expected literals hold no line breaks as written,
    # yet `result` is joined with '\n' -- verify the intended line breaks
    # and column padding against the formatter's actual output.

    # first rendering: wrap=2, two decimal places
    inspection = ui.RowsInspection(rows, schema, offset=0,
                                   format_settings=ui.InspectSettings(wrap=2, round=2))
    stripped_lines = [line.rstrip() for line in repr(inspection).splitlines()]
    result = '\n'.join(stripped_lines)
    expected = '''[#] f32 f64 v ====================================== [0] 0.12 9.88 [1.01, 2.03] [1] 1234.50 9876.54 [100.00, 33.33]'''
    self.assertEqual(expected, result)

    # second rendering: 'stripes' layout, three decimal places
    inspection = ui.RowsInspection(rows, schema, offset=0,
                                   format_settings=ui.InspectSettings(wrap='stripes', round=3))
    stripped_lines = [line.rstrip() for line in repr(inspection).splitlines()]
    result = '\n'.join(stripped_lines)
    expected = '''[0]- f32=0.123 f64=9.877 v =[1.010, 2.034] [1]- f32=1234.500 f64=9876.543 v =[99.999, 33.330]'''
    self.assertEqual(expected, result)