def test_frame_iter_group_items_c(self) -> None:
    # Exercise the optimized sorting approach: the data must have a
    # non-object dtype and the grouping key must be a single label.
    values = np.array([
        [0, 1, 1, 3],
        [3, 3, 2, 3],
        [5, 5, 1, 3],
        [7, 2, 2, 4],
    ])
    frame = sf.Frame(values, columns=tuple('abcd'), index=tuple('wxyz'))

    # Column: group the rows by the values found in column 'c'.
    groups = list(frame.iter_group_items('c', axis=0))
    expected_pairs = [
        (('a', (('w', 0), ('y', 5))),
         ('b', (('w', 1), ('y', 5))),
         ('c', (('w', 1), ('y', 1))),
         ('d', (('w', 3), ('y', 3)))),
        (('a', (('x', 3), ('z', 7))),
         ('b', (('x', 3), ('z', 2))),
         ('c', (('x', 2), ('z', 2))),
         ('d', (('x', 3), ('z', 4))))
    ]
    self.assertEqual([1, 2], [label for label, _ in groups])
    self.assertEqual(
        expected_pairs,
        [part.to_pairs(axis=0) for _, part in groups])

    # Index: group the columns by the values found in row 'w'.
    groups = list(frame.iter_group_items('w', axis=1))
    expected_pairs = [
        (('a', (('w', 0), ('x', 3), ('y', 5), ('z', 7))), ), #type: ignore
        (('b', (('w', 1), ('x', 3), ('y', 5), ('z', 2))), #type: ignore
         ('c', (('w', 1), ('x', 2), ('y', 1), ('z', 2)))),
        (('d', (('w', 3), ('x', 3), ('y', 3), ('z', 4))), )
    ] #type: ignore
    self.assertEqual([0, 1, 3], [label for label, _ in groups])
    self.assertEqual(
        expected_pairs,
        [part.to_pairs(axis=0) for _, part in groups])
def test_frame_iter_group_items_d1(self) -> None:
    # Iterate groups selected by more than one key: the group label is then
    # a tuple holding one value per key.
    values = np.array([
        [0, 1, 1, 3],
        [3, 3, 2, 3],
        [5, 5, 1, 3],
        [7, 2, 2, 4],
    ])
    frame = sf.Frame(values, columns=tuple('abcd'), index=tuple('wxyz'))

    # Column: group the rows by the value pairs found in columns 'c' and 'd'.
    groups = list(frame.iter_group_items(['c', 'd'], axis=0))
    self.assertEqual(
        [(1, 3), (2, 3), (2, 4)],
        [label for label, _ in groups])

    expected_pairs = [
        (('a', (('w', 0), ('y', 5))),
         ('b', (('w', 1), ('y', 5))),
         ('c', (('w', 1), ('y', 1))),
         ('d', (('w', 3), ('y', 3)))),
        (('a', (('x', 3),)),
         ('b', (('x', 3),)),
         ('c', (('x', 2),)),
         ('d', (('x', 3),))),
        (('a', (('z', 7),)),
         ('b', (('z', 2),)),
         ('c', (('z', 2),)),
         ('d', (('z', 4),)))]
    self.assertEqual(
        expected_pairs,
        [part.to_pairs(axis=0) for _, part in groups])
def test_frame_iter_group_f(self) -> None:
    # Build a one-row frame, drop that row, and check that grouping the
    # resulting frame produces no groups at all.
    single_row = sf.Frame(np.arange(3).reshape(1, 3), columns=tuple('abc'))
    without_rows = single_row.drop.loc[0]

    # Multiple-key selection.
    by_two_keys = tuple(without_rows.iter_group(['b', 'c']))
    self.assertEqual(by_two_keys, ())

    # Single-key selection.
    by_one_key = tuple(without_rows.iter_group('a'))
    self.assertEqual(by_one_key, ())
def sf(cls):
    # Build a frame whose columns come alternately from the two stored data
    # generators, then sanity-check the resulting shape.
    # NOTE(review): `sf` in the body must resolve to the module-level name
    # (presumably the static_frame module), not to this function -- confirm
    # this is defined inside a class body.
    data_func = SampleData.get('data_func')
    columns = SampleData.get('columns')
    index = SampleData.get('index')

    # Column i is produced by the generator at position i % 2.
    column_data = {}
    for i in range(len(columns)):
        column_data[i] = data_func[i % 2]()

    post = sf.Frame(column_data, index=index)
    assert post.shape == (10000, 1000)
def create(cls) -> None:
    """Build the 20-million-row group-by fixtures and cache them in ``cls._store``.

    Two pairs of pandas / static-frame containers are created from the same
    random data: one backed by a homogeneous numeric array and one backed by
    an object array whose group-by column is cast back to ``int``. For each
    container a group iterator is also created and advanced by one item, so
    the stored "primed" iterators have already yielded their first group.

    Progress messages with a timestamp are printed between phases.
    """
    print(f'({datetime.now().strftime(HMS) }) Building cache.')

    rows = 20_000_000
    cols = 9
    num_groups = 100_000
    columns = tuple('abcdefghi') + (GROUPBY_COL, )

    arr = np.random.random(rows * cols).reshape(rows, cols)
    # Vectorized modulo: same values as taking ``i % num_groups`` for each
    # element of the permutation, without a 20-million-iteration Python loop.
    groups = (np.random.permutation(rows) % num_groups).reshape(rows, 1)

    # hstack of float and integer columns yields a single numeric array.
    int_arr = np.hstack((arr, groups))
    df_int = pd.DataFrame(int_arr, columns=columns)
    frame_int = sf.Frame(int_arr, columns=columns)

    # astype() returns a new array, so the object variant can be derived
    # from int_arr directly instead of stacking the inputs a second time.
    obj_arr = int_arr.astype(object)
    df_obj = pd.DataFrame(obj_arr, columns=columns).astype({GROUPBY_COL: int})
    frame_obj = sf.Frame(obj_arr, columns=columns).astype[GROUPBY_COL](int)

    print(f'({datetime.now().strftime(HMS) }) Finished building cache.')

    cls._store['pdf_20mil_int'] = df_int
    cls._store['sff_20mil_int'] = frame_int
    cls._store['pdf_20mil_obj'] = df_obj
    cls._store['sff_20mil_obj'] = frame_obj

    print(f'({datetime.now().strftime(HMS) }) Priming generators.')

    # Each iterator is advanced once here so that the work done to produce
    # the first group has already happened when the iterator is stored.
    df_int_iterable_primed = iter(df_int.groupby(GROUPBY_COL, sort=False))
    next(df_int_iterable_primed)

    frame_int_iterable_primed = iter(
        frame_int.iter_group_items(GROUPBY_COL))
    next(frame_int_iterable_primed)

    df_obj_iterable_primed = iter(df_obj.groupby(GROUPBY_COL, sort=False))
    next(df_obj_iterable_primed)

    frame_obj_iterable_primed = iter(
        frame_obj.iter_group_items(GROUPBY_COL))
    next(frame_obj_iterable_primed)

    print(f'({datetime.now().strftime(HMS) }) Finished priming generators.')

    cls._store['pdf_20mil_int_iterable_primed'] = df_int_iterable_primed
    cls._store['sff_20mil_int_iterable_primed'] = frame_int_iterable_primed
    cls._store['pdf_20mil_obj_iterable_primed'] = df_obj_iterable_primed
    cls._store['sff_20mil_obj_iterable_primed'] = frame_obj_iterable_primed
def test_frame_iter_group_labels_array_items_b(self) -> None:
    # Group the columns of a frame by depth level 1 of its hierarchical
    # column labels and check the (label, array) items that come back.
    frame = sf.Frame(
        np.arange(8).reshape(2, 4),
        columns=sf.IndexHierarchy.from_labels(
            ((1, 'a'), (2, 'b'), (1, 'b'), (2, 'a'))),
    )
    items = tuple(frame.iter_group_labels_array_items(1, axis=1))

    self.assertEqual(len(items), 2)
    # Each item carries a plain NumPy array rather than a Frame.
    self.assertEqual(
        [item[1].__class__ for item in items],
        [np.ndarray, np.ndarray])
    self.assertEqual(
        [item[1].shape for item in items],
        [(2, 2), (2, 2)])
    self.assertEqual([item[0] for item in items], ['a', 'b'])
def create(cls):
    """Download the sample CSV if needed and populate ``cls._store``.

    The CSV at ``cls.URL_CSV`` is fetched only when ``cls.FP_CSV`` does not
    exist yet. The store is then filled with the file path, the JSON URL,
    string index/column labels, random data, and pandas / static-frame
    containers built from that data.
    """
    if not os.path.exists(cls.FP_CSV):
        # Read and decode the whole payload before touching the file system,
        # so a failed download cannot leave behind an empty file that the
        # existence check above would later mistake for a cached copy.
        with request.urlopen(cls.URL_CSV) as response:
            text = response.read().decode('utf-8')
        # Write with an explicit encoding: the payload was decoded as UTF-8,
        # and the locale default may not be able to represent it.
        with open(cls.FP_CSV, 'w', encoding='utf-8') as f:
            f.write(text)

    cls._store['data_csv_fp'] = cls.FP_CSV
    cls._store['data_json_url'] = cls.URL_JSON

    # All 4-letter combinations of the lowercase alphabet, used as labels.
    labels_src = [
        ''.join(x) for x in it.combinations(string.ascii_lowercase, 4)
    ]
    assert len(labels_src) > 10000

    index = labels_src[:10000]
    columns = labels_src[:1000]
    data_float = np.random.rand(len(index), len(columns))

    # alt floats, Bools
    data_func = [
        lambda: np.random.rand(len(index)),
        lambda: np.random.randint(-1, 1, len(index)).astype(bool)
    ]

    cls._store['index'] = index
    cls._store['index_target'] = [idx for idx in index if 'd' in idx]
    cls._store['columns'] = columns
    cls._store['columns_target'] = [c for c in columns if 'd' in c]
    cls._store['data_float'] = data_float
    cls._store['data_func'] = data_func

    cls._store['sf.FrameFloat'] = sf.Frame(
        data_float, index=index, columns=columns)
    cls._store['pd.FrameFloat'] = pd.DataFrame(
        data_float, index=index, columns=columns)

    # Columns alternate between the float and the Boolean generator.
    data_cols = {i: data_func[i % 2]() for i in range(len(columns))}
    cls._store['sf.FrameMixed'] = sf.Frame(data_cols, index=index)
    cls._store['pd.FrameMixed'] = pd.DataFrame(data_cols, index=index)
def test_display_html_table_a(self):
    # Render a frame with a two-level index and three-level columns as an
    # HTML table (type colors disabled) and compare against a fixed string.
    # NOTE(review): the expected markup below contains raw `<Frame>`-style
    # tokens inside cells -- confirm these are not meant to be HTML-escaped.
    frame = sf.Frame(
        dict(a=(1, 2, 3, 4), b=(True, False, True, False), c=list('qrst')))
    frame = frame.set_index_hierarchy(['a', 'b'])
    for level_label in ('I', 'J'):
        frame = frame.reindex_add_level(columns=level_label)

    config = sf.DisplayConfig(display_format='html_table', type_color=False)
    rendered = frame.display(config)

    html = ''' <table border="1"><thead><tr><th><Frame></th><th></th><th></th><th></th><th></th><th></th></tr><tr><th><IndexHierarchy></th><th></th><th>J</th><th>J</th><th>J</th><th><<U1></th></tr><tr><th></th><th></th><th>I</th><th>I</th><th>I</th><th><<U1></th></tr><tr><th></th><th></th><th>a</th><th>b</th><th>c</th><th><<U1></th></tr><tr><th><IndexHierarchy></th><th></th><th></th><th></th><th></th><th></th></tr></thead><tbody><tr><th>1</th><th>True</th><td>1</td><td>True</td><td>q</td><td></td></tr><tr><th>2</th><th>False</th><td>2</td><td>False</td><td>r</td><td></td></tr><tr><th>3</th><th>True</th><td>3</td><td>True</td><td>s</td><td></td></tr><tr><th>4</th><th>False</th><td>4</td><td>False</td><td>t</td><td></td></tr><tr><th><object></th><th><object></th><td><int64></td><td><bool></td><td><<U1></td><td></td></tr></tbody></table> '''

    # Surrounding whitespace is ignored on both sides of the comparison.
    self.assertEqual(html.strip(), str(rendered).strip())
def get_sample_frame_float_string_index(size=10000, columns=100):
    """Return matching pandas / static-frame containers of floats with string labels.

    Args:
        size: number of rows.
        columns: number of columns.

    Returns:
        A ``(pdf, sff, a1)`` tuple: the pandas DataFrame, the static-frame
        Frame, and the 2D float array both were built from. Index and column
        labels are SHA-224 hex digests of the integer positions.
    """
    a1 = (np.arange(size * columns)).reshape((size, columns)) * .001

    # Insert NaN into the first 100 rows of every other column. The column
    # step is driven by the actual width of the array, so any `columns`
    # value works (a hard-coded bound of 100 fails for narrower arrays and
    # skips columns in wider ones).
    a1[:100, ::2] = np.nan

    index = [
        hashlib.sha224(str(x).encode('utf-8')).hexdigest()
        for x in range(size)
    ]
    column_labels = [
        hashlib.sha224(str(x).encode('utf-8')).hexdigest()
        for x in range(columns)
    ]

    sff = sf.Frame(a1, index=index, columns=column_labels)
    pdf = pd.DataFrame(a1, index=index, columns=column_labels)
    return pdf, sff, a1
def test_frame_iter_array_f(self) -> None:
    # Apply np.sum over the arrays iterated along each axis of a 3x4 frame
    # with a date index, and check the name and the resulting pairs.
    frame = sf.Frame(
        np.arange(12).reshape(3, 4),
        index=IndexDate.from_date_range('2020-01-01', '2020-01-03'),
    )

    # The `name` argument is carried onto the result.
    named = frame.iter_array(axis=0).apply(np.sum, name='foo')
    self.assertEqual(named.name, 'foo')

    # axis=0: one sum per column label.
    per_column = frame.iter_array(axis=0).apply(np.sum)
    self.assertEqual(
        per_column.to_pairs(),
        ((0, 12), (1, 15), (2, 18), (3, 21)),
    )

    # axis=1: one sum per date in the index.
    per_row = frame.iter_array(axis=1).apply(np.sum)
    self.assertEqual(
        per_row.to_pairs(),
        ((np.datetime64('2020-01-01'), 6),
         (np.datetime64('2020-01-02'), 22),
         (np.datetime64('2020-01-03'), 38)),
    )
def test_display_type_color_a(self):
    # Visual check only: print a mixed-type frame and one of its rows, with
    # and without the color display config. Nothing is asserted.
    year_2014 = np.datetime64('2014')
    year_2015 = np.datetime64('2015')
    year_delta = year_2014 - year_2015

    mixed = Frame(
        dict(
            a=(1, 2),
            b=(1.2, 3.4),
            c=(False, True),
            d=(object(), []),
            e=(1j, 3j),
            f=(year_2014, year_2015),
            g=(year_delta, year_delta),
        ),
        index=tuple('xy'),
    )
    row_x = mixed.loc['x']

    print(mixed)
    print(row_x)
    print(mixed.display(DisplayConfigs.COLOR))
    print(mixed.loc['x'].display(DisplayConfigs.COLOR))

    # A frame with a hierarchical index and two added column levels.
    layered = sf.Frame(
        dict(a=(1, 2, 3, 4), b=(True, False, True, False), c=list('qrst')))
    layered = layered.set_index_hierarchy(['a', 'b'])
    for level_label in ('I', 'J'):
        layered = layered.reindex_add_level(columns=level_label)
    print(layered)
import numpy as np
import static_frame as sf
import frame_fixtures as ff

if __name__ == '__main__':
    # Original invocation note:
    # prlimit --as=850000000 python3 doc/source/articles/memmap_slice.py

    fp = '/tmp/big_frame'

    # One million rows by ten integer columns, labelled 'a' through 'j'.
    a1 = np.arange(10_000_000).reshape((1_000_000, 10))
    columns = tuple('abcdefghij')
    f1 = sf.Frame(a1, columns=columns)

    print('to npy')
    f1.to_npy(fp)

    # Per the original note, loading two of these fails:
    # print('start from_npy f2')
    # f2 = sf.Frame.from_npy(fp)
    # print('start from_npy f3')
    # f3 = sf.Frame.from_npy(fp)

    # ...whereas two memory-mapped loads can be created.
    print('start from_npy f2 from memmap')
    f2 = sf.Frame.from_npy(fp, memory_map=True)

    # NOTE(review): no f3 load follows this message in the visible source;
    # the script may continue beyond this view.
    print('start from_npy f3 from memmap')
def sf() -> None:
    # Build a labelled frame from the stored float array, taking as many
    # string labels as there are rows and columns respectively.
    data = SampleData.get('npf_float_10k')
    labels = SampleData.get('label_str')

    shape = data.shape
    row_labels = labels[:shape[0]]
    column_labels = labels[:shape[1]]

    # The result is intentionally unused; only the construction matters here.
    post = sf.Frame(data, index=row_labels, columns=column_labels)
def origin_data() -> TypeIterFrameItems:
    """Yield ten ``(label, frame)`` pairs labelled 'A' through 'J'.

    Each frame is 1000 x 100, holds ``arange(100000)`` scaled by the
    label's character code, and is named after its label.
    """
    for code in range(65, 75):  # character codes for A, B, ...
        label = chr(code)
        values = np.arange(100000).reshape(1000, 100) * code
        yield label, sf.Frame(values, name=label)
def sf(cls) -> None:
    # Build a frame from the stored float data with the stored labels and
    # sanity-check the resulting shape.
    data_float = SampleData.get('data_float')
    index = SampleData.get('index')
    columns = SampleData.get('columns')

    post = sf.Frame(data_float, index=index, columns=columns)
    assert post.shape == (10000, 1000)
def test_display_type_color_a(self) -> None:
    # Visual check only: print assorted frames under the color display
    # config. Nothing is asserted.
    #
    # The process-wide active display config is switched to COLOR part-way
    # through; it is captured first and restored in a `finally` block so the
    # setting does not leak into tests that run afterwards.
    from itertools import product

    f = sf.Frame.from_dict(
        dict(
            a=(1, 2),
            b=(1.2, 3.4),
            c=(False, True),
            e=(1j, 3j),
            f=(np.datetime64('2014'), np.datetime64('2015')),
        ),
        index=tuple('xy'))
    print(f)
    print(f.loc['x'])

    previous_config = sf.DisplayActive.get()
    sf.DisplayActive.set(sf.DisplayConfigs.COLOR)
    try:
        print(f.display(sf.DisplayConfigs.COLOR))
        print(f.loc['x'].display(sf.DisplayConfigs.COLOR))

        f = sf.Frame.from_dict(
            dict(a=(1, 2, 3, 4), b=(True, False, True, False), c=list('qrst')))
        f = f.set_index_hierarchy(['a', 'b'])
        f = f.relabel_add_level(columns='I')
        f = f.relabel_add_level(columns='J')
        print(f)

        # columns = sf.IndexHierarchy.from_product((96361, 96345), (0, 1))
        # index = sf.IndexHierarchy.from_product((32155, 32175), (0, 4))
        # columns = range(4)
        # index = range(4)
        # f = sf.Frame.from_records(
        #     ([y for y in range(x, x + 4)] for x in range(4)),
        #     index=index, columns=columns)

        # Each cell holds the character whose code point is the sum of its
        # (row label, column label) pair.
        index: tp.Iterable[tp.Hashable] = (0x2210, 0x2330)
        columns: tp.Iterable[tp.Hashable] = (0x1, 0xe)
        f = sf.Frame.from_element_loc_items(
            ((x, chr(sum(x))) for x in product(index, columns)), # type: ignore # Should probably open a typeshed issue for this.
            index=index,
            columns=columns,
            dtype=str)
        print(f)

        # 8x8 frame of one-character strings.
        columns = list('abcdefgh')
        index = range(1, 9)
        f = sf.Frame(np.empty((8, 8), dtype='U1'), columns=columns, index=index)
        print(f)

        columns = tuple('efgh')
        index = range(3, 0, -1)
        f = sf.Frame.from_element_loc_items(
            (
                ((2, 'f'), chr(0x265F)), # pawn
                ((2, 'g'), chr(0x265F)),
                ((2, 'h'), chr(0x265F)),
                ((1, 'e'), chr(0x265A)), # king
                ((1, 'h'), chr(0x265C)), # rook
            ),
            index=index,
            columns=columns,
            dtype=str)

        # part of Sicilian Defense Najdorf Variation
        columns = tuple('hgfe')
        index = range(6, 9)
        f = Frame.from_element_loc_items(
            (
                ((7, 'h'), chr(0x265F)), # pawn
                ((6, 'g'), chr(0x265F)),
                ((7, 'f'), chr(0x265F)),
                ((7, 'e'), chr(0x265F)),
                ((8, 'e'), chr(0x265A)), # king
                ((7, 'g'), chr(0x265D)), # bishop
                ((6, 'f'), chr(0x265E)), # knight
                ((8, 'h'), chr(0x265C)), # rook
            ),
            index=index,
            columns=columns,
            dtype=str)

        # Exercise row assignment with a partially matching Series; the
        # returned object is not kept.
        s1 = Series.from_items((('f', chr(0x265C)), ('g', chr(0x265A))))
        f.assign.loc[8, :](s1, fill_value='')
    finally:
        sf.DisplayActive.set(previous_config)
def sf():
    # Wrap the stored float array in a frame with default labels.
    # The result is intentionally unused; only the construction matters here.
    data = SampleData.get('npf_float_10k')
    post = sf.Frame(data)
def test_display_type_color_a(self):
    # Visual check only: print assorted frames, with and without the color
    # display config, then build a few chess-piece frames. Nothing is
    # asserted.
    from itertools import product

    year_2014 = np.datetime64('2014')
    year_2015 = np.datetime64('2015')
    year_delta = year_2014 - year_2015

    mixed = Frame(
        dict(
            a=(1, 2),
            b=(1.2, 3.4),
            c=(False, True),
            d=(object(), []),
            e=(1j, 3j),
            f=(year_2014, year_2015),
            g=(year_delta, year_delta),
        ),
        index=tuple('xy'),
    )
    print(mixed)
    print(mixed.loc['x'])
    print(mixed.display(DisplayConfigs.COLOR))
    print(mixed.loc['x'].display(DisplayConfigs.COLOR))

    # A frame with a hierarchical index and two added column levels.
    layered = sf.Frame(
        dict(a=(1, 2, 3, 4), b=(True, False, True, False), c=list('qrst')))
    layered = layered.set_index_hierarchy(['a', 'b'])
    for level_label in ('I', 'J'):
        layered = layered.reindex_add_level(columns=level_label)
    print(layered)

    # Each cell holds the character whose code point is the sum of its
    # (row label, column label) pair.
    index = (0x2210, 0x2330)
    columns = (0x1, 0xe)
    symbols = Frame.from_element_loc_items(
        ((loc, chr(sum(loc))) for loc in product(index, columns)),
        index=index,
        columns=columns,
        dtype=str,
    )
    print(symbols)

    # 8x8 frame of one-character strings.
    columns = list('abcdefgh')
    index = range(1, 9)
    board = sf.Frame(np.empty((8, 8), dtype='U1'), columns=columns, index=index)
    print(board)

    # Chess glyphs used by the two position frames below.
    pawn = chr(0x265F)
    king = chr(0x265A)
    rook = chr(0x265C)
    bishop = chr(0x265D)
    knight = chr(0x265E)

    columns = tuple('efgh')
    index = range(3, 0, -1)
    f = Frame.from_element_loc_items(
        (
            ((2, 'f'), pawn),
            ((2, 'g'), pawn),
            ((2, 'h'), pawn),
            ((1, 'e'), king),
            ((1, 'h'), rook),
        ),
        index=index,
        columns=columns,
        dtype=str,
    )

    # Part of Sicilian Defense Najdorf Variation.
    columns = tuple('hgfe')
    index = range(6, 9)
    f = Frame.from_element_loc_items(
        (
            ((7, 'h'), pawn),
            ((6, 'g'), pawn),
            ((7, 'f'), pawn),
            ((7, 'e'), pawn),
            ((8, 'e'), king),
            ((7, 'g'), bishop),
            ((6, 'f'), knight),
            ((8, 'h'), rook),
        ),
        index=index,
        columns=columns,
        dtype=str,
    )

    # Exercise row assignment with a partially matching Series; the
    # returned object is not kept.
    s = Series.from_items((('f', rook), ('g', king)))
    f.assign.loc[8, :](s, fill_value='')