def test_timestamp_with_timezone():
    """A tz-aware datetime column infers to a zoned ibis timestamp."""
    frame = pd.DataFrame(
        {'A': pd.date_range('20130101', periods=3, tz='US/Eastern')}
    )
    inferred = sch.infer(frame)
    expected = ibis.schema([('A', "timestamp('US/Eastern')")])
    assert inferred.equals(expected)
    assert inferred.types[0].equals(dt.Timestamp('US/Eastern'))
def table(self, name, path):
    """Return a table expression for *name* stored in the HDF file at *path*.

    Raises ``AttributeError`` when the store contains no such table.
    """
    if name not in self.list_tables(path):
        raise AttributeError(name)
    # Select zero rows: enough to recover column names and dtypes for
    # schema inference without loading any data.
    with pd.HDFStore(str(path), mode='r') as store:
        empty_frame = store.select(name, start=0, stop=0)
        schema = sch.infer(empty_frame)
    expr = self.table_class(name, schema, self).to_expr()
    self.dictionary[name] = path
    return expr
def test_insert(con, temp_table_db, exhaustive_df):
    """Two partial inserts round-trip back to the original dataframe."""
    tmp_db, table_name = temp_table_db
    con.create_table(
        table_name, database=tmp_db, schema=sch.infer(exhaustive_df)
    )
    # Insert in two chunks (first four rows, then the rest) to exercise
    # append behaviour, preserving the original insertion order.
    for chunk in (exhaustive_df.iloc[:4], exhaustive_df.iloc[4:]):
        con.insert(table_name, chunk, database=tmp_db)
    table = con.table(table_name, database=tmp_db)
    result = (
        table.execute().sort_values(by='tinyint_col').reset_index(drop=True)
    )
    assert_frame_equal(result, exhaustive_df)
def get_schema(self, table_name, database=None):
    """Return a Schema object for the indicated table and database.

    Parameters
    ----------
    table_name : string
        May be fully qualified
    database : string, default None

    Returns
    -------
    schema : ibis Schema
    """
    frame = self.dictionary[table_name]
    return sch.infer(frame)
def table(self, name, path=None):
    """Return a table expression for the named parquet file.

    Parameters
    ----------
    name : str
        Table (file stem) name; must appear in ``list_tables(path)``.
    path : Path, optional
        Directory containing ``<name>.parquet``; defaults to ``self.root``.

    Returns
    -------
    TableExpr

    Raises
    ------
    AttributeError
        If the table does not exist under *path*.
    """
    if name not in self.list_tables(path):
        raise AttributeError(name)
    if path is None:
        # Default to the client's root directory.  Previously the body
        # handled None here but the signature still forced callers to
        # pass an explicit path, making this branch unreachable in
        # practice; the default makes it consistent with the CSV backend.
        path = self.root
    # Read the schema from parquet metadata without loading any data.
    f = path / "{}.parquet".format(name)
    parquet_file = pq.ParquetFile(str(f))
    schema = sch.infer(parquet_file.schema)
    table = self.table_class(name, schema, self).to_expr()
    self.dictionary[name] = f
    return table
def table(self, name, path=None, schema=None, **kwargs):
    """Return a table expression backed by a delimited text file.

    A 50-row sample is read (guided by any user-supplied *schema*) and
    the table's schema is then inferred from that sample.
    """
    if name not in self.list_tables(path):
        raise AttributeError(name)
    if path is None:
        path = self.root
    # Locate the file backing this table.
    f = path / "{}.{}".format(name, self.extension)
    # Read a small sample; an empty schema is used when none was given.
    sample = _read_csv(
        f,
        schema=schema or sch.schema([]),
        header=0,
        nrows=50,
        **kwargs,
    )
    # Infer the sample's schema and define the table from it.
    schema = sch.infer(sample)
    table = self.table_class(name, schema, self, **kwargs).to_expr()
    self.dictionary[name] = f
    return table
def get_schema(self, table_name, database=None):
    """Return a Schema object for the indicated table and database.

    Parameters
    ----------
    table_name : string
        May be fully qualified
    database : string
        Spark does not have a database argument for its table() method,
        so this must be None

    Returns
    -------
    schema : ibis Schema

    Raises
    ------
    com.UnsupportedArgumentError
        If *database* is not None.
    """
    if database is not None:
        raise com.UnsupportedArgumentError(
            'Spark does not support database param for table')
    return sch.infer(self._session.table(table_name))
def table(
    self,
    name: str,
    schema: sch.Schema | None = None,
) -> ir.TableExpr:
    """Get an ibis expression representing a DataFusion table.

    Parameters
    ----------
    name
        The name of the table to retrieve
    schema
        An optional schema for the table; when omitted, the schema is
        inferred from the DataFusion table metadata

    Returns
    -------
    TableExpr
        A table expression
    """
    catalog = self._context.catalog()
    database = catalog.database('public')
    table = database.table(name)
    if schema is None:
        # Previously the user-supplied schema was unconditionally
        # overwritten by inference, making the parameter dead.
        schema = sch.infer(table.schema)
    return self.table_class(name, schema, self).to_expr()
def get_schema(self, name, database=None):
    """Fetch the BigQuery table *name* and infer its ibis schema."""
    project, dataset = self._parse_project_and_dataset(database)
    dataset_ref = self.client.dataset(dataset, project=project)
    bq_table = self.client.get_table(dataset_ref.table(name))
    return sch.infer(bq_table)
def get_schema(self):
    """Infer and return the ibis schema of the wrapped dataframe."""
    # define a temporary table using delimited data
    frame = self.df
    return sch.infer(frame)
def test_dtype_categorical():
    """Categorical pandas columns infer to the ibis Category type."""
    frame = pd.DataFrame({'col': ['a', 'b', 'c', 'a']}, dtype='category')
    assert sch.infer(frame) == ibis.schema([('col', dt.Category())])
def test_dtype_uint64():
    """uint64 numpy columns map to the ibis uint64 type."""
    frame = pd.DataFrame({'col': np.uint64([666, 2, 3])})
    assert sch.infer(frame) == ibis.schema([('col', 'uint64')])
def test_dtype_uint16():
    """uint16 numpy columns map to the ibis uint16 type."""
    frame = pd.DataFrame({'col': np.uint16([5569, 1, 33])})
    assert sch.infer(frame) == ibis.schema([('col', 'uint16')])
def test_dtype_float32():
    """float32 numpy columns map to the ibis float type."""
    frame = pd.DataFrame({'col': np.float32([45e-3, -0.4, 99.0])})
    assert sch.infer(frame) == ibis.schema([('col', 'float')])
def get_schema(self, name, database=None):
    """Infer the ibis schema of the fully-qualified BigQuery table."""
    qualified = self._fully_qualified_name(name, database)
    table_ref = bq.TableReference.from_string(qualified)
    return sch.infer(self.client.get_table(table_ref))
def test_schema_infer(col_data, schema_type):
    """A single-column frame infers to the parametrized schema type."""
    frame = pd.DataFrame({'col': col_data})
    assert sch.infer(frame) == ibis.schema([('col', schema_type)])
def test_infer_exhaustive_dataframe():
    """Schema inference covers every supported pandas dtype at once.

    Columns exercise all integer widths, bools (including object-dtype
    with NaNs), strings with Nones, float32/float64 with NaNs, and
    timestamps with missing values.
    """
    df = pd.DataFrame(
        {
            'bigint_col': np.array(
                [0, 10, 20, 30, 40, 50, 60, 70, 80, 90], dtype='i8'
            ),
            'bool_col': np.array(
                [
                    True,
                    False,
                    True,
                    False,
                    True,
                    None,
                    True,
                    False,
                    True,
                    False,
                ],
                dtype=np.bool_,
            ),
            # Object-dtype booleans with NaNs must still infer boolean.
            'bool_obj_col': np.array(
                [
                    True,
                    False,
                    np.nan,
                    False,
                    True,
                    np.nan,
                    True,
                    np.nan,
                    True,
                    False,
                ],
                dtype=np.object_,
            ),
            'date_string_col': [
                '11/01/10',
                None,
                '11/01/10',
                '11/01/10',
                '11/01/10',
                '11/01/10',
                '11/01/10',
                '11/01/10',
                '11/01/10',
                '11/01/10',
            ],
            'double_col': np.array(
                [
                    0.0,
                    10.1,
                    np.nan,
                    30.299999999999997,
                    40.399999999999999,
                    50.5,
                    60.599999999999994,
                    70.700000000000003,
                    80.799999999999997,
                    90.899999999999991,
                ],
                dtype=np.float64,
            ),
            'float_col': np.array(
                [
                    np.nan,
                    1.1000000238418579,
                    2.2000000476837158,
                    3.2999999523162842,
                    4.4000000953674316,
                    5.5,
                    6.5999999046325684,
                    7.6999998092651367,
                    8.8000001907348633,
                    9.8999996185302734,
                ],
                dtype='f4',
            ),
            'int_col': np.array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9], dtype='i4'),
            'month': [11, 11, 11, 11, 2, 11, 11, 11, 11, 11],
            'smallint_col': np.array(
                [0, 1, 2, 3, 4, 5, 6, 7, 8, 9], dtype='i2'
            ),
            'string_col': [
                '0',
                '1',
                None,
                'double , whammy',
                '4',
                '5',
                '6',
                '7',
                '8',
                '9',
            ],
            'timestamp_col': [
                pd.Timestamp('2010-11-01 00:00:00'),
                None,
                pd.Timestamp('2010-11-01 00:02:00.100000'),
                pd.Timestamp('2010-11-01 00:03:00.300000'),
                pd.Timestamp('2010-11-01 00:04:00.600000'),
                pd.Timestamp('2010-11-01 00:05:00.100000'),
                pd.Timestamp('2010-11-01 00:06:00.150000'),
                pd.Timestamp('2010-11-01 00:07:00.210000'),
                pd.Timestamp('2010-11-01 00:08:00.280000'),
                pd.Timestamp('2010-11-01 00:09:00.360000'),
            ],
            'tinyint_col': np.array(
                [0, 1, 2, 3, 4, 5, 6, 7, 8, 9], dtype='i1'
            ),
            'year': [
                2010,
                2010,
                2010,
                2010,
                2010,
                2009,
                2009,
                2009,
                2009,
                2009,
            ],
        }
    )
    # Expected inferred ibis type for each column above.
    expected = [
        ('bigint_col', dt.int64),
        ('bool_col', dt.boolean),
        ('bool_obj_col', dt.boolean),
        ('date_string_col', dt.string),
        ('double_col', dt.double),
        ('float_col', dt.float),
        ('int_col', dt.int32),
        ('month', dt.int64),
        ('smallint_col', dt.int16),
        ('string_col', dt.string),
        ('timestamp_col', dt.timestamp),
        ('tinyint_col', dt.int8),
        ('year', dt.int64),
    ]
    assert sch.infer(df) == ibis.schema(expected)
def test_infer_simple_dataframe(column, expected_dtype):
    """A parametrized single-column frame infers to the expected dtype."""
    frame = pd.DataFrame({'col': column})
    assert sch.infer(frame) == ibis.schema([('col', expected_dtype)])
def get_schema(self, table_name, database=None):
    """Infer the schema of the dataframe registered as *table_name*.

    The *database* argument is accepted for interface compatibility and
    is not used by this in-memory backend.
    """
    frame = self.dictionary[table_name]
    return sch.infer(frame)
def __init__(self, table, source, schema=None):
    """Wrap a SQLAlchemy table, inferring its ibis schema when needed."""
    inferred = sch.infer(table, schema=schema)
    super().__init__(table.name, inferred, source)
    # Keep the underlying SQLAlchemy table for later compilation.
    self.sqla_table = table
def test_dtype_int64():
    """int64 numpy columns map to the ibis int64 type."""
    frame = pd.DataFrame({'col': np.int64([102, 67_228_734, -0])})
    assert sch.infer(frame) == ibis.schema([('col', 'int64')])
def test_dtype_float64():
    """float64 numpy columns map to the ibis double type."""
    frame = pd.DataFrame({'col': np.float64([-3e43, 43.0, 10_000_000.0])})
    assert sch.infer(frame) == ibis.schema([('col', 'double')])
def test_dtype_uint32():
    """uint32 numpy columns map to the ibis uint32 type."""
    frame = pd.DataFrame({'col': np.uint32([100, 0, 6])})
    assert sch.infer(frame) == ibis.schema([('col', 'uint32')])
def test_dtype_uint8():
    """uint8 numpy columns map to the ibis uint8 type.

    The stray ``self`` parameter was removed: this is a module-level
    pytest function like its siblings (``test_dtype_uint16`` etc.), and
    pytest would otherwise fail looking for a ``self`` fixture.
    """
    df = pd.DataFrame({'col': np.uint8([3, 0, 16])})
    inferred = sch.infer(df)
    expected = ibis.schema([('col', 'uint8')])
    assert inferred == expected
def test_dtype_string():
    """Object columns of str values map to the ibis string type."""
    frame = pd.DataFrame({'col': ['foo', 'bar', 'hello']})
    assert sch.infer(frame) == ibis.schema([('col', 'string')])
def table(self, name, schema=None):
    """Return a table expression for the registered dataframe *name*."""
    frame = self.dictionary[name]
    inferred = sch.infer(frame, schema=schema)
    return self.table_class(name, inferred, self).to_expr()
def test_dtype_bool():
    """bool columns map to the ibis boolean type."""
    frame = pd.DataFrame({'col': [True, False, False]})
    assert sch.infer(frame) == ibis.schema([('col', 'boolean')])
def table(self, name: str, schema: sch.Schema = None):
    """Return a table expression for *name*.

    An explicit *schema* wins; otherwise any schema registered for the
    name is used; failing both, the schema is inferred from the frame.
    """
    frame = self.dictionary[name]
    fallback = schema or self.schemas.get(name, None)
    inferred = sch.infer(frame, schema=fallback)
    return self.table_class(name, inferred, self).to_expr()
def table(self, name, schema=None):
    """Return a PandasTable expression for the registered frame."""
    frame = self.dictionary[name]
    inferred = sch.infer(frame, schema=schema)
    return PandasTable(name, inferred, self).to_expr()
def test_schema_infer(col_data, schema_type):
    """Dask frames infer the same schema as their pandas source."""
    frame = dd.from_pandas(pd.DataFrame({'col': col_data}), npartitions=1)
    assert sch.infer(frame) == ibis.schema([('col', schema_type)])
def get_schema(self, name, database=None):
    """Infer the ibis schema of a BigQuery table via the proxy client."""
    table_id, dataset_id = _ensure_split(name, database)
    bq_table = self._proxy.get_table(table_id, dataset_id)
    return sch.infer(bq_table)
def test_dtype_int8():
    """int8 numpy columns map to the ibis int8 type."""
    frame = pd.DataFrame({'col': np.int8([-3, 9, 17])})
    assert sch.infer(frame) == ibis.schema([('col', 'int8')])
def table(self, name: str, schema: sch.Schema = None) -> DaskTable:
    """Return a DaskTable expression for the registered dask frame."""
    frame = self.dictionary[name]
    inferred = sch.infer(frame, schema=schema)
    return DaskTable(name, inferred, self).to_expr()
def test_dtype_int16():
    """int16 numpy columns map to the ibis int16 type."""
    frame = pd.DataFrame({'col': np.int16([-5, 0, 12])})
    assert sch.infer(frame) == ibis.schema([('col', 'int16')])
def test_infer_exhaustive_dataframe():
    """Dask schema inference covers every supported dtype at once.

    Same fixture as the pandas version, wrapped in a single-partition
    dask dataframe: all integer widths, bools (including object-dtype
    with NaNs), strings with Nones, floats with NaNs, and timestamps
    with missing values.
    """
    df = dd.from_pandas(
        pd.DataFrame({
            'bigint_col': np.array(
                [0, 10, 20, 30, 40, 50, 60, 70, 80, 90], dtype='i8'),
            'bool_col': np.array(
                [
                    True,
                    False,
                    True,
                    False,
                    True,
                    None,
                    True,
                    False,
                    True,
                    False,
                ],
                dtype=np.bool_,
            ),
            # Object-dtype booleans with NaNs must still infer boolean.
            'bool_obj_col': np.array(
                [
                    True,
                    False,
                    np.nan,
                    False,
                    True,
                    np.nan,
                    True,
                    np.nan,
                    True,
                    False,
                ],
                dtype=np.object_,
            ),
            'date_string_col': [
                '11/01/10',
                None,
                '11/01/10',
                '11/01/10',
                '11/01/10',
                '11/01/10',
                '11/01/10',
                '11/01/10',
                '11/01/10',
                '11/01/10',
            ],
            'double_col': np.array(
                [
                    0.0,
                    10.1,
                    np.nan,
                    30.299999999999997,
                    40.399999999999999,
                    50.5,
                    60.599999999999994,
                    70.700000000000003,
                    80.799999999999997,
                    90.899999999999991,
                ],
                dtype=np.float64,
            ),
            'float_col': np.array(
                [
                    np.nan,
                    1.1000000238418579,
                    2.2000000476837158,
                    3.2999999523162842,
                    4.4000000953674316,
                    5.5,
                    6.5999999046325684,
                    7.6999998092651367,
                    8.8000001907348633,
                    9.8999996185302734,
                ],
                dtype='f4',
            ),
            'int_col': np.array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9], dtype='i4'),
            'month': [11, 11, 11, 11, 2, 11, 11, 11, 11, 11],
            'smallint_col': np.array(
                [0, 1, 2, 3, 4, 5, 6, 7, 8, 9], dtype='i2'),
            'string_col': [
                '0',
                '1',
                None,
                'double , whammy',
                '4',
                '5',
                '6',
                '7',
                '8',
                '9',
            ],
            'timestamp_col': [
                pd.Timestamp('2010-11-01 00:00:00'),
                None,
                pd.Timestamp('2010-11-01 00:02:00.100000'),
                pd.Timestamp('2010-11-01 00:03:00.300000'),
                pd.Timestamp('2010-11-01 00:04:00.600000'),
                pd.Timestamp('2010-11-01 00:05:00.100000'),
                pd.Timestamp('2010-11-01 00:06:00.150000'),
                pd.Timestamp('2010-11-01 00:07:00.210000'),
                pd.Timestamp('2010-11-01 00:08:00.280000'),
                pd.Timestamp('2010-11-01 00:09:00.360000'),
            ],
            'tinyint_col': np.array(
                [0, 1, 2, 3, 4, 5, 6, 7, 8, 9], dtype='i1'),
            'year': [
                2010,
                2010,
                2010,
                2010,
                2010,
                2009,
                2009,
                2009,
                2009,
                2009,
            ],
        }),
        npartitions=1,
    )
    # Expected inferred ibis type for each column above.
    expected = [
        ('bigint_col', dt.int64),
        ('bool_col', dt.boolean),
        ('bool_obj_col', dt.boolean),
        ('date_string_col', dt.string),
        ('double_col', dt.double),
        ('float_col', dt.float),
        ('int_col', dt.int32),
        ('month', dt.int64),
        ('smallint_col', dt.int16),
        ('string_col', dt.string),
        ('timestamp_col', dt.timestamp),
        ('tinyint_col', dt.int8),
        ('year', dt.int64),
    ]
    assert sch.infer(df) == ibis.schema(expected)
def test_dtype_int32():
    """int32 numpy columns map to the ibis int32 type."""
    frame = pd.DataFrame({'col': np.int32([-12, 3, 25000])})
    assert sch.infer(frame) == ibis.schema([('col', 'int32')])