Ejemplo n.º 1
0
    def test_frame_iter_group_items_c(self) -> None:
        """Group an integer frame by a single column label and by a single index label.

        Per the original note, this targets the optimized sorting approach,
        which needs a non-object dtype and a single key.
        """
        values = np.array([
            [0, 1, 1, 3],
            [3, 3, 2, 3],
            [5, 5, 1, 3],
            [7, 2, 2, 4],
        ])
        frame = sf.Frame(values, columns=tuple('abcd'), index=tuple('wxyz'))

        # Group rows by the values held in column 'c'.
        row_groups = list(frame.iter_group_items('c', axis=0))
        self.assertEqual([1, 2], [key for key, _ in row_groups])

        expected_by_column = [
            (
                ('a', (('w', 0), ('y', 5))),
                ('b', (('w', 1), ('y', 5))),
                ('c', (('w', 1), ('y', 1))),
                ('d', (('w', 3), ('y', 3))),
            ),
            (
                ('a', (('x', 3), ('z', 7))),
                ('b', (('x', 3), ('z', 2))),
                ('c', (('x', 2), ('z', 2))),
                ('d', (('x', 3), ('z', 4))),
            ),
        ]
        self.assertEqual(
            expected_by_column,
            [part.to_pairs(axis=0) for _, part in row_groups])

        # Group columns by the values held in row 'w'.
        column_groups = list(frame.iter_group_items('w', axis=1))
        self.assertEqual([0, 1, 3], [key for key, _ in column_groups])

        expected_by_index = [
            (
                ('a', (('w', 0), ('x', 3), ('y', 5), ('z', 7))),
            ),
            (
                ('b', (('w', 1), ('x', 3), ('y', 5), ('z', 2))),
                ('c', (('w', 1), ('x', 2), ('y', 1), ('z', 2))),
            ),
            (
                ('d', (('w', 3), ('x', 3), ('y', 3), ('z', 4))),
            ),
        ]
        self.assertEqual(
            expected_by_index,
            [part.to_pairs(axis=0) for _, part in column_groups])
Ejemplo n.º 2
0
    def test_frame_iter_group_items_d1(self) -> None:
        """Group rows by a list of two column labels and check keys and group contents."""
        values = np.array([
            [0, 1, 1, 3],
            [3, 3, 2, 3],
            [5, 5, 1, 3],
            [7, 2, 2, 4],
        ])
        frame = sf.Frame(values, columns=tuple('abcd'), index=tuple('wxyz'))

        # With a multi-label key, each group key is a tuple of (c, d) values.
        row_groups = list(frame.iter_group_items(['c', 'd'], axis=0))
        self.assertEqual(
            [(1, 3), (2, 3), (2, 4)],
            [key for key, _ in row_groups])

        group_1_3 = (
            ('a', (('w', 0), ('y', 5))),
            ('b', (('w', 1), ('y', 5))),
            ('c', (('w', 1), ('y', 1))),
            ('d', (('w', 3), ('y', 3))),
        )
        group_2_3 = (
            ('a', (('x', 3),)),
            ('b', (('x', 3),)),
            ('c', (('x', 2),)),
            ('d', (('x', 3),)),
        )
        group_2_4 = (
            ('a', (('z', 7),)),
            ('b', (('z', 2),)),
            ('c', (('z', 2),)),
            ('d', (('z', 4),)),
        )

        self.assertEqual(
            [group_1_3, group_2_3, group_2_4],
            [part.to_pairs(axis=0) for _, part in row_groups])
    def test_frame_iter_group_f(self) -> None:
        """Check that iter_group yields nothing once the only row (label 0) is dropped."""
        single_row = sf.Frame(np.arange(3).reshape(1, 3), columns=tuple('abc'))
        emptied = single_row.drop.loc[0]

        # Key given as a list of labels.
        by_label_list = tuple(emptied.iter_group(['b', 'c']))
        self.assertEqual(by_label_list, ())

        # Key given as a single label.
        by_single_label = tuple(emptied.iter_group('a'))
        self.assertEqual(by_single_label, ())
Ejemplo n.º 4
0
 def sf(cls):
     """Build a frame from per-column generated arrays and assert its shape.

     Column values come from the two callables stored under 'data_func' in
     SampleData, alternating by column position; the dictionary keys are the
     integer column positions.
     """
     factories = SampleData.get('data_func')
     column_labels = SampleData.get('columns')
     row_labels = SampleData.get('index')

     column_data = {}
     for position in range(len(column_labels)):
         # Even positions use the first factory, odd positions the second.
         column_data[position] = factories[position % 2]()

     frame = sf.Frame(column_data, index=row_labels)
     assert frame.shape == (10000, 1000)
Ejemplo n.º 5
0
    def create(cls) -> None:
        """Build the 20-million-row group-by fixtures and cache them in ``cls._store``.

        Stores four tables (a pandas DataFrame and a static-frame Frame, each
        built from a numeric array and from an object-dtype copy of it) and,
        for each table, a group iterator that has already been advanced past
        its first group. Progress messages are printed with a timestamp
        formatted by ``HMS``.
        """
        def stamp() -> str:
            # Current wall-clock time, formatted for the progress messages.
            return datetime.now().strftime(HMS)

        print(f'({stamp()}) Building cache.')
        rows = 20_000_000
        cols = 9
        num_groups = 100_000
        columns = tuple('abcdefghi') + (GROUPBY_COL, )

        arr = np.random.random(rows * cols).reshape(rows, cols)
        # Vectorized modulo: same values as taking ``i % num_groups`` per element,
        # without iterating 20 million items in Python.
        groups = (np.random.permutation(rows) % num_groups).reshape(rows, 1)

        # Stacking the float data with the integer group ids yields one
        # homogeneous numeric array (the group column is upcast with the rest).
        int_arr = np.hstack((arr, groups))
        df_int = pd.DataFrame(int_arr, columns=columns)
        frame_int = sf.Frame(int_arr, columns=columns)

        # Object-dtype copy of the same data; reuses the stacked array rather
        # than stacking a second time.
        obj_arr = int_arr.astype(object)
        df_obj = pd.DataFrame(obj_arr,
                              columns=columns).astype({GROUPBY_COL: int})
        frame_obj = sf.Frame(obj_arr, columns=columns).astype[GROUPBY_COL](int)
        print(f'({stamp()}) Finished building cache.')

        cls._store['pdf_20mil_int'] = df_int
        cls._store['sff_20mil_int'] = frame_int
        cls._store['pdf_20mil_obj'] = df_obj
        cls._store['sff_20mil_obj'] = frame_obj

        # Create each group iterator and consume its first item, so the stored
        # iterators have already produced one group.
        print(f'({stamp()}) Priming generators.')
        df_int_iterable_primed = iter(df_int.groupby(GROUPBY_COL, sort=False))
        next(df_int_iterable_primed)
        frame_int_iterable_primed = iter(
            frame_int.iter_group_items(GROUPBY_COL))
        next(frame_int_iterable_primed)
        df_obj_iterable_primed = iter(df_obj.groupby(GROUPBY_COL, sort=False))
        next(df_obj_iterable_primed)
        frame_obj_iterable_primed = iter(
            frame_obj.iter_group_items(GROUPBY_COL))
        next(frame_obj_iterable_primed)
        print(f'({stamp()}) Finished priming generators.')

        cls._store['pdf_20mil_int_iterable_primed'] = df_int_iterable_primed
        cls._store['sff_20mil_int_iterable_primed'] = frame_int_iterable_primed
        cls._store['pdf_20mil_obj_iterable_primed'] = df_obj_iterable_primed
        cls._store['sff_20mil_obj_iterable_primed'] = frame_obj_iterable_primed
 def test_frame_iter_group_labels_array_items_b(self) -> None:
     """Group columns by depth 1 of a hierarchical columns index and check the arrays."""
     values = np.arange(8).reshape(2, 4)
     hierarchy = sf.IndexHierarchy.from_labels(
         ((1, 'a'), (2, 'b'), (1, 'b'), (2, 'a')))
     frame = sf.Frame(values, columns=hierarchy)

     items = tuple(frame.iter_group_labels_array_items(1, axis=1))
     self.assertEqual(len(items), 2)

     # Each item is (group label, array); check the array side first.
     arrays = [item[1] for item in items]
     self.assertEqual([array.__class__ for array in arrays],
                      [np.ndarray, np.ndarray])
     self.assertEqual([array.shape for array in arrays], [(2, 2), (2, 2)])

     labels = [item[0] for item in items]
     self.assertEqual(labels, ['a', 'b'])
Ejemplo n.º 7
0
    def create(cls):
        """Download the sample CSV if it is missing and populate ``cls._store``.

        Stores the CSV path and JSON URL, 10000 index labels and 1000 column
        labels (four-letter strings), the subsets of those labels containing
        'd', a random float array, the two column-data factories, and
        static-frame / pandas frames built from the float array and from
        per-column generated data.
        """
        if not os.path.exists(cls.FP_CSV):
            # Read and decode the whole payload before creating the file, so a
            # failed download or decode does not leave an empty file behind
            # that would suppress later download attempts.
            with request.urlopen(cls.URL_CSV) as response:
                payload = response.read().decode('utf-8')
            # Write with an explicit encoding rather than the platform default.
            with open(cls.FP_CSV, 'w', encoding='utf-8') as f:
                f.write(payload)

        cls._store['data_csv_fp'] = cls.FP_CSV
        cls._store['data_json_url'] = cls.URL_JSON

        # All four-letter combinations of the lowercase alphabet, in order.
        labels_src = [
            ''.join(x) for x in it.combinations(string.ascii_lowercase, 4)
        ]
        assert len(labels_src) > 10000

        index = labels_src[:10000]
        columns = labels_src[:1000]

        data_float = np.random.rand(len(index), len(columns))

        # Alternating column factories: floats, then Booleans.
        data_func = [
            lambda: np.random.rand(len(index)),
            lambda: np.random.randint(-1, 1, len(index)).astype(bool)
        ]

        cls._store['index'] = index
        cls._store['index_target'] = [idx for idx in index if 'd' in idx]
        cls._store['columns'] = columns
        cls._store['columns_target'] = [c for c in columns if 'd' in c]
        cls._store['data_float'] = data_float
        cls._store['data_func'] = data_func

        cls._store['sf.FrameFloat'] = sf.Frame(data_float,
                                               index=index,
                                               columns=columns)
        cls._store['pd.FrameFloat'] = pd.DataFrame(data_float,
                                                   index=index,
                                                   columns=columns)

        # Mixed-type data keyed by integer column position; the same dict
        # feeds both libraries so they hold identical values.
        data_cols = {i: data_func[i % 2]() for i in range(len(columns))}
        cls._store['sf.FrameMixed'] = sf.Frame(data_cols, index=index)
        cls._store['pd.FrameMixed'] = pd.DataFrame(data_cols, index=index)
Ejemplo n.º 8
0
    def test_display_html_table_a(self):
        f = sf.Frame(
            dict(a=(1, 2, 3, 4), b=(True, False, True, False), c=list('qrst')))
        f = f.set_index_hierarchy(['a', 'b'])
        f = f.reindex_add_level(columns='I')
        f = f.reindex_add_level(columns='J')

        expected = f.display(
            sf.DisplayConfig(display_format='html_table', type_color=False))
        html = '''
<table border="1"><thead><tr><th>&lt;Frame&gt;</th><th></th><th></th><th></th><th></th><th></th></tr><tr><th>&lt;IndexHierarchy&gt;</th><th></th><th>J</th><th>J</th><th>J</th><th>&lt;&lt;U1&gt;</th></tr><tr><th></th><th></th><th>I</th><th>I</th><th>I</th><th>&lt;&lt;U1&gt;</th></tr><tr><th></th><th></th><th>a</th><th>b</th><th>c</th><th>&lt;&lt;U1&gt;</th></tr><tr><th>&lt;IndexHierarchy&gt;</th><th></th><th></th><th></th><th></th><th></th></tr></thead><tbody><tr><th>1</th><th>True</th><td>1</td><td>True</td><td>q</td><td></td></tr><tr><th>2</th><th>False</th><td>2</td><td>False</td><td>r</td><td></td></tr><tr><th>3</th><th>True</th><td>3</td><td>True</td><td>s</td><td></td></tr><tr><th>4</th><th>False</th><td>4</td><td>False</td><td>t</td><td></td></tr><tr><th>&lt;object&gt;</th><th>&lt;object&gt;</th><td>&lt;int64&gt;</td><td>&lt;bool&gt;</td><td>&lt;&lt;U1&gt;</td><td></td></tr></tbody></table>
        '''

        # import ipdb; ipdb.set_trace()
        self.assertEqual(html.strip(), str(expected).strip())
Ejemplo n.º 9
0
def get_sample_frame_float_string_index(size=10000, columns=100):
    """Build matching pandas and static-frame fixtures with hashed string labels.

    The data is a ``size`` x ``columns`` float array of consecutive values
    scaled by 0.001, with NaN written into the first 100 rows of every other
    column (starting at column 0).

    Args:
        size: Number of rows.
        columns: Number of columns.

    Returns:
        A tuple ``(pdf, sff, a1)``: the pandas DataFrame, the static-frame
        Frame, and the array both were built from.
    """
    a1 = (np.arange(size * columns)).reshape((size, columns)) * .001
    # Insert NaN in every other column. A strided slice follows the actual
    # column count; the previous fixed range(0, 100, 2) raised IndexError for
    # narrower arrays and skipped columns past 100 in wider ones.
    a1[:100, ::2] = np.nan

    index = [
        hashlib.sha224(str(x).encode('utf-8')).hexdigest() for x in range(size)
    ]
    # Kept under its own name so the ``columns`` argument is not shadowed.
    column_labels = [
        hashlib.sha224(str(x).encode('utf-8')).hexdigest()
        for x in range(columns)
    ]
    sff = sf.Frame(a1, index=index, columns=column_labels)
    pdf = pd.DataFrame(a1, index=index, columns=column_labels)
    return pdf, sff, a1
Ejemplo n.º 10
0
    def test_frame_iter_array_f(self) -> None:
        """Apply np.sum through iter_array along each axis of a date-indexed frame."""
        dates = IndexDate.from_date_range('2020-01-01', '2020-01-03')
        frame = sf.Frame(np.arange(12).reshape(3, 4), index=dates)

        # The name passed to apply is carried onto the result.
        named = frame.iter_array(axis=0).apply(np.sum, name='foo')
        self.assertEqual(named.name, 'foo')

        # axis=0: one sum per column label.
        per_column = frame.iter_array(axis=0).apply(np.sum).to_pairs()
        self.assertEqual(per_column, ((0, 12), (1, 15), (2, 18), (3, 21)))

        # axis=1: one sum per date label.
        per_row = frame.iter_array(axis=1).apply(np.sum).to_pairs()
        expected_per_row = (
            (np.datetime64('2020-01-01'), 6),
            (np.datetime64('2020-01-02'), 22),
            (np.datetime64('2020-01-03'), 38),
        )
        self.assertEqual(per_row, expected_per_row)
Ejemplo n.º 11
0
    def test_display_type_color_a(self):
        """Print a mixed-type frame and one of its rows, plain and with the color config.

        Nothing is asserted; the test passes if construction and display do
        not raise.
        """
        year_2014 = np.datetime64('2014')
        year_2015 = np.datetime64('2015')
        one_year_back = year_2014 - year_2015

        # One column per kind of value: int, float, bool, object, complex,
        # datetime64 and timedelta64.
        column_values = dict(
            a=(1, 2),
            b=(1.2, 3.4),
            c=(False, True),
            d=(object(), []),
            e=(1j, 3j),
            f=(year_2014, year_2015),
            g=(one_year_back, one_year_back),
        )
        mixed = Frame(column_values, index=tuple('xy'))
        print(mixed)
        print(mixed.loc['x'])

        color = DisplayConfigs.COLOR
        print(mixed.display(color))
        print(mixed.loc['x'].display(color))

        # A frame with a two-level index and two added column levels.
        hierarchical = sf.Frame(
            dict(a=(1, 2, 3, 4), b=(True, False, True, False), c=list('qrst')))
        hierarchical = hierarchical.set_index_hierarchy(['a', 'b'])
        for level in ('I', 'J'):
            hierarchical = hierarchical.reindex_add_level(columns=level)
        print(hierarchical)
Ejemplo n.º 12
0
import numpy as np

import static_frame as sf
import frame_fixtures as ff

if __name__ == '__main__':
    # Demonstration script: write a 1,000,000 x 10 integer frame to NPY
    # format, then load it back using memory mapping.

    # Suggested invocation from the original author (runs under an
    # address-space limit):
    # prlimit --as=850000000 python3 doc/source/articles/memmap_slice.py

    # Destination path passed to to_npy / from_npy below.
    fp = '/tmp/big_frame'

    # Source data: consecutive integers, one column per letter 'a'..'j'.
    a1 = np.arange(10_000_000).reshape(1_000_000, 10)
    columns = tuple('abcdefghij')
    f1 = sf.Frame(a1, columns=columns)

    print('to npy')
    f1.to_npy(fp)

    # Disabled variant without memory mapping; per the original note, loading
    # two frames this way fails (presumably under the memory limit above —
    # TODO confirm).
    # # loading two of these fails
    # print('start from_npy f2')
    # f2 = sf.Frame.from_npy(fp)

    # print('start from_npy f3')
    # f3 = sf.Frame.from_npy(fp)

    # we can create two of these

    print('start from_npy f2 from memmap')
    f2 = sf.Frame.from_npy(fp, memory_map=True)

    # NOTE(review): no f3 load follows this message in the visible source;
    # confirm whether the script continues beyond this point.
    print('start from_npy f3 from memmap')
Ejemplo n.º 13
0
 def sf() -> None:
     """Construct a frame from the 'npf_float_10k' sample array with string labels.

     Index and column labels are both taken from the start of the 'label_str'
     sample, sized to the array's first and second dimensions.
     """
     values = SampleData.get('npf_float_10k')
     label_pool = SampleData.get('label_str')

     shape = values.shape
     row_labels = label_pool[:shape[0]]
     column_labels = label_pool[:shape[1]]

     frame = sf.Frame(values, index=row_labels, columns=column_labels)
Ejemplo n.º 14
0
 def origin_data() -> TypeIterFrameItems:
     """Yield ten (label, frame) pairs, labelled 'A' through 'J'.

     Each frame holds a 1000 x 100 integer range multiplied by the label's
     code point, and carries the label as its name.
     """
     for code_point in range(65, 75):  # 65..74 are 'A'..'J'
         label = chr(code_point)
         values = np.arange(100000).reshape(1000, 100) * code_point
         yield label, sf.Frame(values, name=label)
Ejemplo n.º 15
0
 def sf(cls) -> None:
     """Build a frame from the stored float data and labels, and assert its shape."""
     values = SampleData.get('data_float')
     row_labels = SampleData.get('index')
     column_labels = SampleData.get('columns')

     frame = sf.Frame(values, index=row_labels, columns=column_labels)
     assert frame.shape == (10000, 1000)
Ejemplo n.º 16
0
    def test_display_type_color_a(self) -> None:
        """Print a sequence of frames, plain and in color, as a display smoke test.

        Nothing is asserted. The active display configuration is set to
        ``sf.DisplayConfigs.COLOR`` part-way through and is not restored by
        this method.
        """
        from itertools import product

        # Frame with one column per kind of value.
        typed = sf.Frame.from_dict(
            dict(
                a=(1, 2),
                b=(1.2, 3.4),
                c=(False, True),
                e=(1j, 3j),
                f=(np.datetime64('2014'), np.datetime64('2015')),
            ),
            index=tuple('xy'))
        print(typed)
        print(typed.loc['x'])

        color = sf.DisplayConfigs.COLOR
        sf.DisplayActive.set(color)

        print(typed.display(color))
        print(typed.loc['x'].display(color))

        # Frame with a two-level index and two added column levels.
        hierarchical = sf.Frame.from_dict(
            dict(a=(1, 2, 3, 4), b=(True, False, True, False), c=list('qrst')))
        hierarchical = hierarchical.set_index_hierarchy(['a', 'b'])
        for level in ('I', 'J'):
            hierarchical = hierarchical.relabel_add_level(columns=level)
        print(hierarchical)

        # Each cell holds the character whose code point is the sum of its
        # row and column labels.
        code_rows: tp.Iterable[tp.Hashable] = (0x2210, 0x2330)
        code_columns: tp.Iterable[tp.Hashable] = (0x1, 0xe)
        symbols = sf.Frame.from_element_loc_items(
            (
                (loc, chr(sum(loc))) for loc in product(code_rows, code_columns)
            ),  # type: ignore  # Should probably open a typeshed issue for this.
            index=code_rows,
            columns=code_columns,
            dtype=str)
        print(symbols)

        # An 8 x 8 board of single-character strings.
        board = sf.Frame(np.empty((8, 8), dtype='U1'),
                         columns=list('abcdefgh'),
                         index=range(1, 9))
        print(board)

        # Chess piece glyphs used by the two positions below.
        pawn = chr(0x265F)
        king = chr(0x265A)
        rook = chr(0x265C)
        bishop = chr(0x265D)
        knight = chr(0x265E)

        # Built but never printed or used afterwards.
        corner = sf.Frame.from_element_loc_items(
            (
                ((2, 'f'), pawn),
                ((2, 'g'), pawn),
                ((2, 'h'), pawn),
                ((1, 'e'), king),
                ((1, 'h'), rook),
            ),
            index=range(3, 0, -1),
            columns=tuple('efgh'),
            dtype=str)

        # Part of the Sicilian Defense, Najdorf Variation.
        najdorf = Frame.from_element_loc_items(
            (
                ((7, 'h'), pawn),
                ((6, 'g'), pawn),
                ((7, 'f'), pawn),
                ((7, 'e'), pawn),
                ((8, 'e'), king),
                ((7, 'g'), bishop),
                ((6, 'f'), knight),
                ((8, 'h'), rook),
            ),
            index=range(6, 9),
            columns=tuple('hgfe'),
            dtype=str)

        back_rank = Series.from_items((('f', rook), ('g', king)))

        # The result of the assignment is not bound to a name.
        najdorf.assign.loc[8, :](back_rank, fill_value='')
Ejemplo n.º 17
0
 def sf():
     """Construct a frame directly from the 'npf_float_10k' sample array."""
     values = SampleData.get('npf_float_10k')
     frame = sf.Frame(values)
Ejemplo n.º 18
0
    def test_display_type_color_a(self):
        """Print a sequence of frames, plain and with the color config, as a smoke test.

        Nothing is asserted; the test passes if construction and display do
        not raise.
        """
        from itertools import product

        year_2014 = np.datetime64('2014')
        year_2015 = np.datetime64('2015')
        one_year_back = year_2014 - year_2015

        # Frame with one column per kind of value: int, float, bool, object,
        # complex, datetime64 and timedelta64.
        column_values = dict(
            a=(1, 2),
            b=(1.2, 3.4),
            c=(False, True),
            d=(object(), []),
            e=(1j, 3j),
            f=(year_2014, year_2015),
            g=(one_year_back, one_year_back),
        )
        mixed = Frame(column_values, index=tuple('xy'))
        print(mixed)
        print(mixed.loc['x'])

        color = DisplayConfigs.COLOR
        print(mixed.display(color))
        print(mixed.loc['x'].display(color))

        # Frame with a two-level index and two added column levels.
        hierarchical = sf.Frame(
            dict(a=(1, 2, 3, 4), b=(True, False, True, False), c=list('qrst')))
        hierarchical = hierarchical.set_index_hierarchy(['a', 'b'])
        for level in ('I', 'J'):
            hierarchical = hierarchical.reindex_add_level(columns=level)
        print(hierarchical)

        # Each cell holds the character whose code point is the sum of its
        # row and column labels.
        code_rows = (0x2210, 0x2330)
        code_columns = (0x1, 0xe)
        symbols = Frame.from_element_loc_items(
            ((loc, chr(sum(loc))) for loc in product(code_rows, code_columns)),
            index=code_rows,
            columns=code_columns,
            dtype=str)
        print(symbols)

        # An 8 x 8 board of single-character strings.
        board = sf.Frame(np.empty((8, 8), dtype='U1'),
                         columns=list('abcdefgh'),
                         index=range(1, 9))
        print(board)

        # Chess piece glyphs used by the two positions below.
        pawn = chr(0x265F)
        king = chr(0x265A)
        rook = chr(0x265C)
        bishop = chr(0x265D)
        knight = chr(0x265E)

        # Built but never printed or used afterwards.
        corner = Frame.from_element_loc_items(
            (
                ((2, 'f'), pawn),
                ((2, 'g'), pawn),
                ((2, 'h'), pawn),
                ((1, 'e'), king),
                ((1, 'h'), rook),
            ),
            index=range(3, 0, -1),
            columns=tuple('efgh'),
            dtype=str)

        # Part of the Sicilian Defense, Najdorf Variation.
        najdorf = Frame.from_element_loc_items(
            (
                ((7, 'h'), pawn),
                ((6, 'g'), pawn),
                ((7, 'f'), pawn),
                ((7, 'e'), pawn),
                ((8, 'e'), king),
                ((7, 'g'), bishop),
                ((6, 'f'), knight),
                ((8, 'h'), rook),
            ),
            index=range(6, 9),
            columns=tuple('hgfe'),
            dtype=str)

        back_rank = Series.from_items((('f', rook), ('g', king)))

        # The result of the assignment is not bound to a name.
        najdorf.assign.loc[8, :](back_rank, fill_value='')