Beispiel #1
0
    def test_upload_pandas_categorical_ipc(self, con):

        con.execute("DROP TABLE IF EXISTS test_categorical;")

        df = pd.DataFrame({"A": ["a", "b", "c", "a"]})
        df["B"] = df["A"].astype('category')

        # test that table created correctly when it doesn't exist on server
        con.load_table("test_categorical", df)
        ans = con.execute("select * from test_categorical").fetchall()

        assert ans == [('a', 'a'), ('b', 'b'), ('c', 'c'), ('a', 'a')]

        assert con.get_table_details("test_categorical") == [
            ColumnDetails(
                name='A',
                type='STR',
                nullable=True,
                precision=0,
                scale=0,
                comp_param=32,
                encoding='DICT',
                is_array=False,
            ),
            ColumnDetails(
                name='B',
                type='STR',
                nullable=True,
                precision=0,
                scale=0,
                comp_param=32,
                encoding='DICT',
                is_array=False,
            ),
        ]

        # load row-wise
        con.load_table("test_categorical", df, method="rows")

        # load columnar
        con.load_table("test_categorical", df, method="columnar")

        # load arrow
        con.load_table("test_categorical", df, method="arrow")

        # test end result
        df_ipc = con.select_ipc("select * from test_categorical")
        assert df_ipc.shape == (16, 2)

        res = df.append([df, df, df]).reset_index(drop=True)
        res["A"] = res["A"].astype('category')
        res["B"] = res["B"].astype('category')
        assert pd.DataFrame.equals(df_ipc, res)

        # test that input df wasn't mutated
        # original input is object, categorical
        # to load via Arrow, converted internally to object, object
        assert is_object_dtype(df["A"])
        assert is_categorical_dtype(df["B"])
        con.execute("DROP TABLE IF EXISTS test_categorical;")
Beispiel #2
0
    def test_get_table_details(self, con):

        c = con.cursor()
        c.execute('drop table if exists stocks;')
        create = ('create table stocks (date_ text, trans text, symbol text, '
                  'qty int, price float, vol float);')
        c.execute(create)
        i1 = "INSERT INTO stocks VALUES ('2006-01-05','BUY','RHAT',100,35.14,1.1);"  # noqa
        i2 = "INSERT INTO stocks VALUES ('2006-01-05','BUY','GOOG',100,12.14,1.2);"  # noqa

        c.execute(i1)
        c.execute(i2)

        result = con.get_table_details('stocks')
        expected = [
            ColumnDetails(name='date_',
                          type='STR',
                          nullable=True,
                          precision=0,
                          scale=0,
                          comp_param=32,
                          encoding='DICT'),
            ColumnDetails(name='trans',
                          type='STR',
                          nullable=True,
                          precision=0,
                          scale=0,
                          comp_param=32,
                          encoding='DICT'),
            ColumnDetails(name='symbol',
                          type='STR',
                          nullable=True,
                          precision=0,
                          scale=0,
                          comp_param=32,
                          encoding='DICT'),
            ColumnDetails(name='qty',
                          type='INT',
                          nullable=True,
                          precision=0,
                          scale=0,
                          comp_param=0,
                          encoding='NONE'),
            ColumnDetails(name='price',
                          type='FLOAT',
                          nullable=True,
                          precision=0,
                          scale=0,
                          comp_param=0,
                          encoding='NONE'),
            ColumnDetails(name='vol',
                          type='FLOAT',
                          nullable=True,
                          precision=0,
                          scale=0,
                          comp_param=0,
                          encoding='NONE')
        ]
        assert result == expected
        c.execute('drop table if exists stocks;')
Beispiel #3
0
def get_col_types(col_properties: dict):
    common_col_params = dict(
        nullable=True,
        precision=0,
        scale=0,
        comp_param=0,
        encoding='NONE',
        # is_array=True,
    )

    return [
        ColumnDetails(**properties, **common_col_params)
        for properties in col_properties
    ]
Beispiel #4
0
    def test_extract_row_details(self):
        data = [
            TColumnType(col_name='date_',
                        col_type=TTypeInfo(type=6, encoding=4, nullable=True,
                                           is_array=False, precision=0,
                                           scale=0, comp_param=32),
                        is_reserved_keyword=False, src_name=''),
            TColumnType(col_name='trans',
                        col_type=TTypeInfo(type=6, encoding=4, nullable=True,
                                           is_array=False, precision=0,
                                           scale=0, comp_param=32),
                        is_reserved_keyword=False, src_name=''),
            TColumnType(col_name='symbol',
                        col_type=TTypeInfo(type=6, encoding=4, nullable=True,
                                           is_array=False, precision=0,
                                           scale=0, comp_param=32),
                        is_reserved_keyword=False, src_name=''),
            TColumnType(col_name='qty',
                        col_type=TTypeInfo(type=1, encoding=0, nullable=True,
                                           is_array=False, precision=0,
                                           scale=0, comp_param=0),
                        is_reserved_keyword=False, src_name=''),
            TColumnType(col_name='price',
                        col_type=TTypeInfo(type=3, encoding=0, nullable=True,
                                           is_array=False, precision=0,
                                           scale=0, comp_param=0),
                        is_reserved_keyword=False, src_name=''),
            TColumnType(col_name='vol',
                        col_type=TTypeInfo(type=3, encoding=0, nullable=True,
                                           is_array=False, precision=0,
                                           scale=0, comp_param=0),
                        is_reserved_keyword=False, src_name='')]
        result = _extract_column_details(data)

        expected = [
            ColumnDetails(name='date_', type='STR', nullable=True, precision=0,
                          scale=0, comp_param=32),
            ColumnDetails(name='trans', type='STR', nullable=True, precision=0,
                          scale=0, comp_param=32),
            ColumnDetails(name='symbol', type='STR', nullable=True,
                          precision=0, scale=0, comp_param=32),
            ColumnDetails(name='qty', type='INT', nullable=True, precision=0,
                          scale=0, comp_param=0),
            ColumnDetails(name='price', type='FLOAT', nullable=True,
                          precision=0, scale=0, comp_param=0),
            ColumnDetails(name='vol', type='FLOAT', nullable=True, precision=0,
                          scale=0, comp_param=0)
        ]
        assert result == expected
Beispiel #5
0
 def test_get_table_details(self, con, stocks):
     result = con.get_table_details('stocks')
     expected = [
         ColumnDetails(name='date_',
                       type='STR',
                       nullable=True,
                       precision=0,
                       scale=0,
                       comp_param=32,
                       encoding='DICT'),
         ColumnDetails(name='trans',
                       type='STR',
                       nullable=True,
                       precision=0,
                       scale=0,
                       comp_param=32,
                       encoding='DICT'),
         ColumnDetails(name='symbol',
                       type='STR',
                       nullable=True,
                       precision=0,
                       scale=0,
                       comp_param=32,
                       encoding='DICT'),
         ColumnDetails(name='qty',
                       type='INT',
                       nullable=True,
                       precision=0,
                       scale=0,
                       comp_param=0,
                       encoding='NONE'),
         ColumnDetails(name='price',
                       type='FLOAT',
                       nullable=True,
                       precision=0,
                       scale=0,
                       comp_param=0,
                       encoding='NONE'),
         ColumnDetails(name='vol',
                       type='FLOAT',
                       nullable=True,
                       precision=0,
                       scale=0,
                       comp_param=0,
                       encoding='NONE')
     ]
     assert result == expected
Beispiel #6
0
    def test_build_table_columnar_nulls(self):
        common_col_params = dict(
            nullable=True,
            scale=0,
            comp_param=0,
            encoding='NONE',
            is_array=False,
        )

        col_types = [
            ColumnDetails(name='boolean_',
                          type='BOOL',
                          precision=0,
                          **common_col_params),
            ColumnDetails(name='int_',
                          type='INT',
                          precision=0,
                          **common_col_params),
            ColumnDetails(name='bigint_',
                          type='BIGINT',
                          precision=0,
                          **common_col_params),
            ColumnDetails(name='double_',
                          type='DOUBLE',
                          precision=0,
                          **common_col_params),
            ColumnDetails(name='varchar_',
                          type='STR',
                          precision=0,
                          **common_col_params),
            ColumnDetails(name='text_',
                          type='STR',
                          precision=0,
                          **common_col_params),
            ColumnDetails(name='time_',
                          type='TIME',
                          precision=0,
                          **common_col_params),
            ColumnDetails(
                name='timestamp_',
                type='TIMESTAMP',
                **common_col_params,
                precision=0,
            ),
            ColumnDetails(name='date_',
                          type='DATE',
                          precision=0,
                          **common_col_params),
        ]

        data = pd.DataFrame({
            'boolean_': [True, False, None],
            # Currently Pandas does not support storing None or NaN
            # in integer columns, so int cols with null
            # need to be objects. This means our type detection will be
            # unreliable since if there is no number outside the int32
            # bounds in a column with nulls then we will be assuming int
            'int_':
            np.array([0, 1, None], dtype=np.object),
            'bigint_':
            np.array([0, 9223372036854775807, None], dtype=np.object),
            'double_':
            np.array([0, 1, None], dtype=np.float64),
            'varchar_': ['a', 'b', None],
            'text_': ['a', 'b', None],
            'time_': [datetime.time(0, 11, 59),
                      datetime.time(13), None],
            'timestamp_': [
                pd.Timestamp('2016'),
                pd.Timestamp('2017'),
                None,
            ],
            'date_': [
                datetime.date(1001, 1, 1),
                datetime.date(2017, 1, 1),
                None,
            ],
        })

        result = _pandas_loaders.build_input_columnar(
            data,
            preserve_index=False,
            col_names=data.columns,
            col_types=col_types,
        )

        nulls = [False, False, True]
        bool_na = -128
        int_na = -2147483648
        bigint_na = -9223372036854775808
        ns_na = -9223372037
        double_na = 0

        expected = [
            TColumn(TColumnData(int_col=[1, 0, bool_na]), nulls=nulls),
            TColumn(
                TColumnData(int_col=np.array([0, 1, int_na], dtype=np.int32)),
                nulls=nulls,
            ),  # noqa
            TColumn(
                TColumnData(int_col=np.array(
                    [0, 9223372036854775807, bigint_na], dtype=np.int64)),
                nulls=nulls,
            ),  # noqa
            TColumn(
                TColumnData(
                    real_col=np.array([0, 1, double_na], dtype=np.float64)),
                nulls=nulls,
            ),  # noqa
            TColumn(TColumnData(str_col=['a', 'b', '']), nulls=nulls),
            TColumn(TColumnData(str_col=['a', 'b', '']), nulls=nulls),
            TColumn(TColumnData(int_col=[719, 46800, bigint_na]), nulls=nulls),
            TColumn(
                TColumnData(int_col=[1451606400, 1483228800, ns_na]),
                nulls=nulls,
            ),  # noqa
            TColumn(
                TColumnData(int_col=[-30578688000, 1483228800, bigint_na]),
                nulls=nulls,
            ),  # noqa
        ]
        assert_columnar_equal(result[0], expected)
Beispiel #7
0
    def test_build_table_columnar_pandas(self):
        common_col_params = dict(
            nullable=True,
            precision=0,
            scale=0,
            comp_param=0,
            encoding='NONE',
            is_array=False,
        )

        col_types = [
            ColumnDetails(name='boolean_', type='BOOL', **common_col_params),
            ColumnDetails(name='smallint_',
                          type='SMALLINT',
                          **common_col_params),
            ColumnDetails(name='int_', type='INT', **common_col_params),
            ColumnDetails(name='bigint_', type='BIGINT', **common_col_params),
            ColumnDetails(name='float_', type='FLOAT', **common_col_params),
            ColumnDetails(name='double_', type='DOUBLE', **common_col_params),
            ColumnDetails(name='varchar_', type='STR', **common_col_params),
            ColumnDetails(name='text_', type='STR', **common_col_params),
            ColumnDetails(name='time_', type='TIME', **common_col_params),
            ColumnDetails(
                name='timestamp_',
                type='TIMESTAMP',
                nullable=True,
                precision=0,
                scale=0,
                comp_param=0,
                encoding='NONE',
                is_array=False,
            ),
            ColumnDetails(name='date_', type='DATE', **common_col_params),
        ]

        data = pd.DataFrame({
            'boolean_': [True, False],
            'smallint_':
            np.array([0, 1], dtype=np.int16),
            'int_':
            np.array([0, 1], dtype=np.int32),
            'bigint_':
            np.array([0, 1], dtype=np.int64),
            'float_':
            np.array([0, 1], dtype=np.float32),
            'double_':
            np.array([0, 1], dtype=np.float64),
            'varchar_': ['a', 'b'],
            'text_': ['a', 'b'],
            'time_': [datetime.time(0, 11, 59),
                      datetime.time(13)],
            'timestamp_': [pd.Timestamp('2016'),
                           pd.Timestamp('2017')],
            'date_': [
                datetime.date(2016, 1, 1),
                datetime.date(2017, 1, 1),
            ],
        })
        result = _pandas_loaders.build_input_columnar(
            data,
            preserve_index=False,
            col_names=data.columns,
            col_types=col_types,
        )

        nulls = [False, False]
        expected = [
            TColumn(TColumnData(int_col=[True, False]), nulls=nulls),
            TColumn(
                TColumnData(int_col=np.array([0, 1], dtype=np.int16)),
                nulls=nulls,
            ),  # noqa
            TColumn(
                TColumnData(int_col=np.array([0, 1], dtype=np.int32)),
                nulls=nulls,
            ),  # noqa
            TColumn(
                TColumnData(int_col=np.array([0, 1], dtype=np.int64)),
                nulls=nulls,
            ),  # noqa
            TColumn(
                TColumnData(real_col=np.array([0, 1], dtype=np.float32)),
                nulls=nulls,
            ),  # noqa
            TColumn(
                TColumnData(real_col=np.array([0, 1], dtype=np.float64)),
                nulls=nulls,
            ),  # noqa
            TColumn(TColumnData(str_col=['a', 'b']), nulls=nulls),
            TColumn(TColumnData(str_col=['a', 'b']), nulls=nulls),
            TColumn(TColumnData(int_col=[719, 46800]), nulls=nulls),
            TColumn(TColumnData(int_col=[1451606400, 1483228800]),
                    nulls=nulls),  # noqa
            TColumn(TColumnData(int_col=[1451606400, 1483228800]),
                    nulls=nulls),
        ]
        assert_columnar_equal(result[0], expected)