def get_expected(data, col_properties):
    """Build the list of Thrift ``TColumn`` objects we expect
    ``build_input_columnar`` to produce for ``data``.

    ``col_properties`` is an iterable of dicts with at least ``name``,
    ``type`` and ``is_array`` keys describing each column.
    """
    # Map server-side type names onto the TColumnData field carrying them.
    field_for_type = {'INT': 'int_col', 'DOUBLE': 'real_col', 'STR': 'str_col'}
    for geo_name in _pandas_loaders.GEO_TYPE_NAMES:
        # Geo values travel over the wire as WKT strings.
        field_for_type[geo_name] = 'str_col'
    null_mask = data.isnull()
    columns = []
    for prop in col_properties:
        name = prop['name']
        nulls = null_mask[name].tolist()
        field = field_for_type[prop['type']]
        if prop['is_array']:
            # Each array cell becomes its own nested TColumn.
            cells = [
                TColumn(data=TColumnData(**{field: value}))
                for value in data[name]
            ]
            col = TColumn(data=TColumnData(arr_col=cells), nulls=nulls)
        elif prop['type'] in _pandas_loaders.GEO_TYPE_NAMES:
            wkt = data[name].apply(lambda g: g.wkt)
            col = TColumn(data=TColumnData(**{field: wkt}), nulls=nulls)
        else:
            col = TColumn(
                data=TColumnData(**{field: data[name]}), nulls=nulls)
        columns.append(col)
    return columns
def get_expected(data, col_properties):
    """Build the list of Thrift ``TColumn`` objects we expect for ``data``.

    TIMESTAMP columns are converted to epoch integers (nanoseconds when any
    value carries sub-second precision, seconds otherwise) and DECIMAL
    columns are scaled to integers, matching the on-the-wire representation.

    Fixes: conversions are now performed on a per-column local series —
    previously the caller's ``data`` DataFrame was mutated in place.
    ``astype('int64')`` replaces ``astype(int)``, which maps to int32 on
    some platforms (e.g. Windows) and would truncate epoch values.
    """
    _map_col_types = {
        'INT': 'int_col',
        'DOUBLE': 'real_col',
        'STR': 'str_col',
        'TIMESTAMP': 'int_col',
        'DECIMAL': 'int_col',
    }
    # Geo values travel over the wire as WKT strings.
    _map_col_types.update(
        {k: 'str_col' for k in _pandas_loaders.GEO_TYPE_NAMES})
    isnull = data.isnull()
    expected = []
    for prop in col_properties:
        name = prop['name']
        nulls = isnull[name].tolist()
        field = _map_col_types[prop['type']]
        if prop['is_array']:
            # Each array cell becomes its own nested TColumn.
            arr_col = [
                TColumn(data=TColumnData(**{field: v})) for v in data[name]
            ]
            col = TColumn(data=TColumnData(arr_col=arr_col), nulls=nulls)
        elif prop['type'] in _pandas_loaders.GEO_TYPE_NAMES:
            col = TColumn(
                data=TColumnData(**{field: data[name].apply(lambda g: g.wkt)}),
                nulls=nulls,
            )
        else:
            series = data[name]
            if prop['type'] == 'TIMESTAMP':
                # Convert datetimes to an epoch integer: keep nanoseconds
                # when any value has sub-second precision, else use seconds.
                if series.dt.nanosecond.sum():
                    series = series.astype('int64')
                else:
                    series = series.astype('int64') // 10**9
            elif prop['type'] == 'DECIMAL':
                # Scale to integer: value * 10**precision, then shift by
                # 10**(scale - precision).
                series = (series * 10**prop['precision']).astype(
                    'int64') * 10**(prop['scale'] - prop['precision'])
            col = TColumn(data=TColumnData(**{field: series}), nulls=nulls)
        expected.append(col)
    return expected
def test_build_table_columnar_nulls(self):
    """Null handling: each null value becomes the type's sentinel plus a
    ``True`` entry in the column's nulls mask.

    Fixes: ``dtype=np.object`` (deprecated in NumPy 1.20, removed in 1.24)
    is replaced with the builtin ``object``.
    """
    common_col_params = dict(
        nullable=True,
        scale=0,
        comp_param=0,
        encoding='NONE',
        is_array=False,
    )
    col_types = [
        ColumnDetails(name='boolean_', type='BOOL', precision=0,
                      **common_col_params),
        ColumnDetails(name='int_', type='INT', precision=0,
                      **common_col_params),
        ColumnDetails(name='bigint_', type='BIGINT', precision=0,
                      **common_col_params),
        ColumnDetails(name='double_', type='DOUBLE', precision=0,
                      **common_col_params),
        ColumnDetails(name='varchar_', type='STR', precision=0,
                      **common_col_params),
        ColumnDetails(name='text_', type='STR', precision=0,
                      **common_col_params),
        ColumnDetails(name='time_', type='TIME', precision=0,
                      **common_col_params),
        ColumnDetails(name='timestamp_', type='TIMESTAMP', precision=0,
                      **common_col_params),
        ColumnDetails(name='date_', type='DATE', precision=0,
                      **common_col_params),
    ]
    data = pd.DataFrame({
        'boolean_': [True, False, None],
        # Currently Pandas does not support storing None or NaN in integer
        # columns, so int cols with null need to be objects. This means our
        # type detection will be unreliable since if there is no number
        # outside the int32 bounds in a column with nulls then we will be
        # assuming int.
        'int_': np.array([0, 1, None], dtype=object),
        'bigint_': np.array([0, 9223372036854775807, None], dtype=object),
        'double_': np.array([0, 1, None], dtype=np.float64),
        'varchar_': ['a', 'b', None],
        'text_': ['a', 'b', None],
        'time_': [datetime.time(0, 11, 59), datetime.time(13), None],
        'timestamp_': [
            pd.Timestamp('2016'),
            pd.Timestamp('2017'),
            None,
        ],
        'date_': [
            datetime.date(1001, 1, 1),
            datetime.date(2017, 1, 1),
            None,
        ],
    })
    result = _pandas_loaders.build_input_columnar(
        data,
        preserve_index=False,
        col_names=data.columns,
        col_types=col_types,
    )
    nulls = [False, False, True]
    # Per-type sentinel values the loader substitutes for nulls.
    bool_na = -128
    int_na = -2147483648
    bigint_na = -9223372036854775808
    ns_na = -9223372037
    double_na = 0
    expected = [
        TColumn(TColumnData(int_col=[1, 0, bool_na]), nulls=nulls),
        TColumn(
            TColumnData(int_col=np.array([0, 1, int_na], dtype=np.int32)),
            nulls=nulls,
        ),  # noqa
        TColumn(
            TColumnData(int_col=np.array(
                [0, 9223372036854775807, bigint_na], dtype=np.int64)),
            nulls=nulls,
        ),  # noqa
        TColumn(
            TColumnData(
                real_col=np.array([0, 1, double_na], dtype=np.float64)),
            nulls=nulls,
        ),  # noqa
        TColumn(TColumnData(str_col=['a', 'b', '']), nulls=nulls),
        TColumn(TColumnData(str_col=['a', 'b', '']), nulls=nulls),
        TColumn(TColumnData(int_col=[719, 46800, bigint_na]), nulls=nulls),
        TColumn(
            TColumnData(int_col=[1451606400, 1483228800, ns_na]),
            nulls=nulls,
        ),  # noqa
        TColumn(
            TColumnData(int_col=[-30578688000, 1483228800, bigint_na]),
            nulls=nulls,
        ),  # noqa
    ]
    assert_columnar_equal(result[0], expected)
def test_build_table_columnar_pandas(self):
    """Happy path: every supported pandas dtype maps onto the correct
    ``TColumnData`` field with a no-null mask.

    Fixes: the ``timestamp_`` ColumnDetails entry now uses the shared
    ``common_col_params`` dict like every sibling entry, instead of
    duplicating each keyword by hand.
    """
    common_col_params = dict(
        nullable=True,
        precision=0,
        scale=0,
        comp_param=0,
        encoding='NONE',
        is_array=False,
    )
    col_types = [
        ColumnDetails(name='boolean_', type='BOOL', **common_col_params),
        ColumnDetails(name='smallint_', type='SMALLINT',
                      **common_col_params),
        ColumnDetails(name='int_', type='INT', **common_col_params),
        ColumnDetails(name='bigint_', type='BIGINT', **common_col_params),
        ColumnDetails(name='float_', type='FLOAT', **common_col_params),
        ColumnDetails(name='double_', type='DOUBLE', **common_col_params),
        ColumnDetails(name='varchar_', type='STR', **common_col_params),
        ColumnDetails(name='text_', type='STR', **common_col_params),
        ColumnDetails(name='time_', type='TIME', **common_col_params),
        ColumnDetails(name='timestamp_', type='TIMESTAMP',
                      **common_col_params),
        ColumnDetails(name='date_', type='DATE', **common_col_params),
    ]
    data = pd.DataFrame({
        'boolean_': [True, False],
        'smallint_': np.array([0, 1], dtype=np.int16),
        'int_': np.array([0, 1], dtype=np.int32),
        'bigint_': np.array([0, 1], dtype=np.int64),
        'float_': np.array([0, 1], dtype=np.float32),
        'double_': np.array([0, 1], dtype=np.float64),
        'varchar_': ['a', 'b'],
        'text_': ['a', 'b'],
        'time_': [datetime.time(0, 11, 59), datetime.time(13)],
        'timestamp_': [pd.Timestamp('2016'), pd.Timestamp('2017')],
        'date_': [
            datetime.date(2016, 1, 1),
            datetime.date(2017, 1, 1),
        ],
    })
    result = _pandas_loaders.build_input_columnar(
        data,
        preserve_index=False,
        col_names=data.columns,
        col_types=col_types,
    )
    nulls = [False, False]
    expected = [
        TColumn(TColumnData(int_col=[True, False]), nulls=nulls),
        TColumn(
            TColumnData(int_col=np.array([0, 1], dtype=np.int16)),
            nulls=nulls,
        ),  # noqa
        TColumn(
            TColumnData(int_col=np.array([0, 1], dtype=np.int32)),
            nulls=nulls,
        ),  # noqa
        TColumn(
            TColumnData(int_col=np.array([0, 1], dtype=np.int64)),
            nulls=nulls,
        ),  # noqa
        TColumn(
            TColumnData(real_col=np.array([0, 1], dtype=np.float32)),
            nulls=nulls,
        ),  # noqa
        TColumn(
            TColumnData(real_col=np.array([0, 1], dtype=np.float64)),
            nulls=nulls,
        ),  # noqa
        TColumn(TColumnData(str_col=['a', 'b']), nulls=nulls),
        TColumn(TColumnData(str_col=['a', 'b']), nulls=nulls),
        TColumn(TColumnData(int_col=[719, 46800]), nulls=nulls),
        TColumn(TColumnData(int_col=[1451606400, 1483228800]),
                nulls=nulls),  # noqa
        TColumn(TColumnData(int_col=[1451606400, 1483228800]),
                nulls=nulls),
    ]
    assert_columnar_equal(result[0], expected)