Example #1
0
def test_timestamp_with_timezone():
    """A tz-aware datetime column infers as a timestamp carrying that zone."""
    tz_name = 'US/Eastern'
    stamps = pd.date_range('20130101', periods=3, tz=tz_name)
    frame = pd.DataFrame({'A': stamps})

    inferred = sch.infer(frame)

    # Check the schema as a whole, then the single column type on its own.
    assert inferred.equals(ibis.schema([('A', "timestamp('US/Eastern')")]))
    assert inferred.types[0].equals(dt.Timestamp(tz_name))
Example #2
0
    def table(self, name, path):
        """Build a table expression for ``name`` stored in the HDF file ``path``.

        Raises ``AttributeError`` (carrying ``name``) when ``name`` is not
        among ``self.list_tables(path)``.  On success the ``name -> path``
        mapping is recorded in ``self.dictionary`` and the expression produced
        by ``self.table_class(...).to_expr()`` is returned.
        """
        available = self.list_tables(path)
        if name not in available:
            raise AttributeError(name)

        # Select with start=0/stop=0; the result is only used for inference.
        with pd.HDFStore(str(path), mode='r') as hdf:
            probe = hdf.select(name, start=0, stop=0)
            inferred = sch.infer(probe)

        expr = self.table_class(name, inferred, self).to_expr()
        self.dictionary[name] = path
        return expr
Example #3
0
def test_insert(con, temp_table_db, exhaustive_df):
    """Insert a frame in two slices and check the table returns it intact."""
    tmp_db, table_name = temp_table_db

    con.create_table(
        table_name, database=tmp_db, schema=sch.infer(exhaustive_df)
    )

    # First the leading four rows, then everything after them.
    for rows in (slice(None, 4), slice(4, None)):
        con.insert(table_name, exhaustive_df.iloc[rows], database=tmp_db)

    fetched = con.table(table_name, database=tmp_db).execute()

    # Sort on a column before comparing so row order cannot fail the check.
    result = fetched.sort_values(by='tinyint_col').reset_index(drop=True)
    assert_frame_equal(result, exhaustive_df)
Example #4
0
    def get_schema(self, table_name, database=None):
        """
        Infer the schema of a table registered on this client

        Parameters
        ----------
        table_name : string
          Key under which the table's data is stored in ``self.dictionary``
        database : string, default None
          Accepted but not used by this method

        Returns
        -------
        schema : ibis Schema
          Whatever ``sch.infer`` derives from the stored object
        """
        registered = self.dictionary[table_name]
        return sch.infer(registered)
Example #5
0
    def table(self, name, path):
        """Build a table expression for the file ``<path>/<name>.parquet``.

        Raises ``AttributeError`` (carrying ``name``) when ``name`` is not
        among ``self.list_tables(path)``.  A ``path`` of ``None`` falls back to
        ``self.root``.  The resolved file path is stored in
        ``self.dictionary[name]`` before the expression is returned.
        """
        if name not in self.list_tables(path):
            raise AttributeError(name)

        base_dir = self.root if path is None else path
        parquet_path = base_dir / "{}.parquet".format(name)

        # Only the parquet file's schema object is handed to inference.
        inferred = sch.infer(pq.ParquetFile(str(parquet_path)).schema)

        expr = self.table_class(name, inferred, self).to_expr()
        self.dictionary[name] = parquet_path
        return expr
Example #6
0
File: csv.py Project: cloudera/ibis
    def table(self, name, path=None, schema=None, **kwargs):
        """Build a table expression for ``<path>/<name>.<self.extension>``.

        Raises ``AttributeError`` (carrying ``name``) when ``name`` is not
        among ``self.list_tables(path)``.  A ``path`` of ``None`` falls back to
        ``self.root``.  A sample of the file is read with ``_read_csv`` (using
        ``schema`` if given, otherwise an empty schema, plus any ``kwargs``)
        and the table's schema is inferred from that sample.  The resolved
        file path is stored in ``self.dictionary[name]``.
        """
        if name not in self.list_tables(path):
            raise AttributeError(name)

        directory = self.root if path is None else path
        csv_path = directory / "{}.{}".format(name, self.extension)

        # Sample the file (nrows=50) rather than loading it in full.
        hint = schema or sch.schema([])
        sample = _read_csv(csv_path, schema=hint, header=0, nrows=50, **kwargs)

        # The schema inferred from the sample defines the table.
        inferred = sch.infer(sample)
        expr = self.table_class(name, inferred, self, **kwargs).to_expr()

        self.dictionary[name] = csv_path
        return expr
Example #7
0
    def get_schema(self, table_name, database=None):
        """
        Infer the schema of the session table called ``table_name``

        Parameters
        ----------
        table_name : string
          Name handed to ``self._session.table``
        database : string
          Must be None; any other value is rejected

        Returns
        -------
        schema : ibis Schema

        Raises
        ------
        com.UnsupportedArgumentError
          If ``database`` is not None
        """
        if database is None:
            return sch.infer(self._session.table(table_name))

        raise com.UnsupportedArgumentError(
            'Spark does not support database param for table')
Example #8
0
    def table(
        self,
        name: str,
        schema: sch.Schema | None = None,
    ) -> ir.TableExpr:
        """Return an ibis expression for a DataFusion table.

        Parameters
        ----------
        name
            The name of the table to retrieve
        schema
            Currently not consulted: the schema used is always the one
            inferred from the table found in the ``'public'`` database

        Returns
        -------
        TableExpr
            A table expression
        """
        public_db = self._context.catalog().database('public')
        found = public_db.table(name)
        inferred = sch.infer(found.schema)
        return self.table_class(name, inferred, self).to_expr()
Example #9
0
 def get_schema(self, name, database=None):
     """Fetch table ``name`` through ``self.client`` and infer its schema.

     ``database`` is resolved to a (project, dataset) pair by
     ``self._parse_project_and_dataset``.
     """
     project, dataset = self._parse_project_and_dataset(database)
     dataset_ref = self.client.dataset(dataset, project=project)
     remote_table = self.client.get_table(dataset_ref.table(name))
     return sch.infer(remote_table)
Example #10
0
 def get_schema(self):
     """Return the schema that ``sch.infer`` derives from ``self.df``."""
     frame = self.df
     return sch.infer(frame)
Example #11
0
def test_dtype_categorical():
    """A frame built with dtype='category' infers as ``dt.Category()``."""
    labels = ['a', 'b', 'c', 'a']
    frame = pd.DataFrame({'col': labels}, dtype='category')
    assert sch.infer(frame) == ibis.schema([('col', dt.Category())])
Example #12
0
def test_dtype_uint64():
    """An unsigned 64-bit column infers as 'uint64'."""
    values = np.uint64([666, 2, 3])
    frame = pd.DataFrame({'col': values})
    assert sch.infer(frame) == ibis.schema([('col', 'uint64')])
Example #13
0
def test_dtype_uint16():
    """An unsigned 16-bit column infers as 'uint16'."""
    values = np.uint16([5569, 1, 33])
    frame = pd.DataFrame({'col': values})
    assert sch.infer(frame) == ibis.schema([('col', 'uint16')])
Example #14
0
def test_dtype_float32():
    """A 32-bit float column infers as 'float'."""
    values = np.float32([45e-3, -0.4, 99.0])
    frame = pd.DataFrame({'col': values})
    assert sch.infer(frame) == ibis.schema([('col', 'float')])
Example #15
0
 def get_schema(self, name, database=None):
     """Fetch table ``name`` through ``self.client`` and infer its schema.

     The table id comes from ``self._fully_qualified_name(name, database)``
     and is turned into a reference with ``bq.TableReference.from_string``.
     """
     qualified = self._fully_qualified_name(name, database)
     reference = bq.TableReference.from_string(qualified)
     return sch.infer(self.client.get_table(reference))
Example #16
0
def test_schema_infer(col_data, schema_type):
    """A single column of ``col_data`` must infer as ``schema_type``."""
    frame = pd.DataFrame({'col': col_data})
    assert sch.infer(frame) == ibis.schema([('col', schema_type)])
Example #17
0
def test_infer_exhaustive_dataframe():
    """Infer a schema from a ten-row frame that has one column per dtype.

    The frame mixes fixed-width integers, 32- and 64-bit floats, native and
    object-backed booleans, strings and timestamps, several of them holding
    ``None``/NaN entries, and the inferred schema is compared with the
    expected (name, type) pairs.
    """
    timestamp_texts = [
        '2010-11-01 00:00:00',
        None,
        '2010-11-01 00:02:00.100000',
        '2010-11-01 00:03:00.300000',
        '2010-11-01 00:04:00.600000',
        '2010-11-01 00:05:00.100000',
        '2010-11-01 00:06:00.150000',
        '2010-11-01 00:07:00.210000',
        '2010-11-01 00:08:00.280000',
        '2010-11-01 00:09:00.360000',
    ]

    # Key order is kept alphabetical so it lines up with the expectation.
    frame = pd.DataFrame(
        {
            'bigint_col': np.arange(0, 100, 10, dtype='i8'),
            'bool_col': np.array(
                [True, False, True, False, True]
                + [None, True, False, True, False],
                dtype=np.bool_,
            ),
            'bool_obj_col': np.array(
                [True, False, np.nan, False, True]
                + [np.nan, True, np.nan, True, False],
                dtype=np.object_,
            ),
            'date_string_col': ['11/01/10', None] + ['11/01/10'] * 8,
            'double_col': np.array(
                [
                    0.0,
                    10.1,
                    np.nan,
                    30.299999999999997,
                    40.399999999999999,
                    50.5,
                    60.599999999999994,
                    70.700000000000003,
                    80.799999999999997,
                    90.899999999999991,
                ],
                dtype=np.float64,
            ),
            'float_col': np.array(
                [
                    np.nan,
                    1.1000000238418579,
                    2.2000000476837158,
                    3.2999999523162842,
                    4.4000000953674316,
                    5.5,
                    6.5999999046325684,
                    7.6999998092651367,
                    8.8000001907348633,
                    9.8999996185302734,
                ],
                dtype='f4',
            ),
            'int_col': np.arange(10, dtype='i4'),
            'month': [11, 11, 11, 11, 2, 11, 11, 11, 11, 11],
            'smallint_col': np.arange(10, dtype='i2'),
            'string_col': [
                '0', '1', None, 'double , whammy', '4',
                '5', '6', '7', '8', '9',
            ],
            'timestamp_col': [
                None if text is None else pd.Timestamp(text)
                for text in timestamp_texts
            ],
            'tinyint_col': np.arange(10, dtype='i1'),
            'year': [2010] * 5 + [2009] * 5,
        }
    )

    expected_fields = [
        ('bigint_col', dt.int64),
        ('bool_col', dt.boolean),
        ('bool_obj_col', dt.boolean),
        ('date_string_col', dt.string),
        ('double_col', dt.double),
        ('float_col', dt.float),
        ('int_col', dt.int32),
        ('month', dt.int64),
        ('smallint_col', dt.int16),
        ('string_col', dt.string),
        ('timestamp_col', dt.timestamp),
        ('tinyint_col', dt.int8),
        ('year', dt.int64),
    ]

    assert sch.infer(frame) == ibis.schema(expected_fields)
Example #18
0
def test_infer_simple_dataframe(column, expected_dtype):
    """A one-column frame of ``column`` must infer as ``expected_dtype``."""
    frame = pd.DataFrame({'col': column})
    inferred = sch.infer(frame)
    wanted = ibis.schema([('col', expected_dtype)])
    assert inferred == wanted
Example #19
0
 def get_schema(self, table_name, database=None):
     """Infer the schema of the object stored under ``table_name``.

     The object is looked up in ``self.dictionary``; ``database`` is accepted
     but not used.
     """
     registered = self.dictionary[table_name]
     return sch.infer(registered)
Example #20
0
 def __init__(self, table, source, schema=None):
     """Initialise from ``table``, inferring its schema via ``sch.infer``.

     ``schema`` is forwarded to ``sch.infer``.  The parent initialiser gets
     ``table.name``, the resulting schema and ``source``; ``table`` itself is
     kept on ``self.sqla_table``.
     """
     resolved_schema = sch.infer(table, schema=schema)
     super().__init__(table.name, resolved_schema, source)
     self.sqla_table = table
Example #21
0
def test_dtype_int64():
    """A signed 64-bit column infers as 'int64'."""
    values = np.int64([102, 67_228_734, -0])
    frame = pd.DataFrame({'col': values})
    assert sch.infer(frame) == ibis.schema([('col', 'int64')])
Example #22
0
def test_infer_simple_dataframe(column, expected_dtype):
    """Schema inferred from a single ``column`` matches ``expected_dtype``."""
    wrapped = pd.DataFrame({'col': column})
    actual = sch.infer(wrapped)
    target = ibis.schema([('col', expected_dtype)])
    assert actual == target
Example #23
0
def test_dtype_float64():
    """A 64-bit float column infers as 'double'."""
    values = np.float64([-3e43, 43.0, 10_000_000.0])
    frame = pd.DataFrame({'col': values})
    assert sch.infer(frame) == ibis.schema([('col', 'double')])
Example #24
0
 def get_schema(self):
     """Infer and return the schema of the object held in ``self.df``."""
     held = self.df
     return sch.infer(held)
Example #25
0
def test_dtype_uint32():
    """An unsigned 32-bit column infers as 'uint32'."""
    values = np.uint32([100, 0, 6])
    frame = pd.DataFrame({'col': values})
    assert sch.infer(frame) == ibis.schema([('col', 'uint32')])
Example #26
0
 def test_dtype_uint8(self):
     """An unsigned 8-bit column infers as 'uint8'."""
     values = np.uint8([3, 0, 16])
     frame = pd.DataFrame({'col': values})
     assert sch.infer(frame) == ibis.schema([('col', 'uint8')])
Example #27
0
def test_dtype_string():
    """A column of Python strings infers as 'string'."""
    words = ['foo', 'bar', 'hello']
    frame = pd.DataFrame({'col': words})
    assert sch.infer(frame) == ibis.schema([('col', 'string')])
Example #28
0
 def table(self, name, schema=None):
     """Return a table expression for the object registered as ``name``.

     The object is read from ``self.dictionary``; ``schema`` is forwarded to
     ``sch.infer`` and the result is wrapped with ``self.table_class``.
     """
     registered = self.dictionary[name]
     resolved = sch.infer(registered, schema=schema)
     node = self.table_class(name, resolved, self)
     return node.to_expr()
Example #29
0
def test_dtype_bool():
    """A column of Python bools infers as 'boolean'."""
    flags = [True, False, False]
    frame = pd.DataFrame({'col': flags})
    assert sch.infer(frame) == ibis.schema([('col', 'boolean')])
Example #30
0
 def __init__(self, table, source, schema=None):
     """Set up the instance around ``table``.

     The schema comes from ``sch.infer(table, schema=schema)`` and is passed,
     with ``table.name`` and ``source``, to the parent initialiser.  The
     original ``table`` object is stored as ``self.sqla_table``.
     """
     inferred = sch.infer(table, schema=schema)
     super().__init__(table.name, inferred, source)
     self.sqla_table = table
Example #31
0
 def table(self, name: str, schema: sch.Schema = None):
     """Return a table expression for the object registered as ``name``.

     When ``schema`` is falsy, the entry for ``name`` in ``self.schemas`` (or
     ``None`` if there is none) is forwarded to ``sch.infer`` instead.
     """
     registered = self.dictionary[name]
     hint = schema or self.schemas.get(name)
     resolved = sch.infer(registered, schema=hint)
     return self.table_class(name, resolved, self).to_expr()
Example #32
0
 def table(self, name, schema=None):
     """Return a ``PandasTable`` expression for the object stored as ``name``.

     The object is read from ``self.dictionary`` and ``schema`` is forwarded
     to ``sch.infer``.
     """
     registered = self.dictionary[name]
     resolved = sch.infer(registered, schema=schema)
     node = PandasTable(name, resolved, self)
     return node.to_expr()
Example #33
0
def test_schema_infer(col_data, schema_type):
    """A one-partition dask frame of ``col_data`` infers as ``schema_type``."""
    source = pd.DataFrame({'col': col_data})
    frame = dd.from_pandas(source, npartitions=1)
    assert sch.infer(frame) == ibis.schema([('col', schema_type)])
Example #34
0
File: client.py Project: shshe/ibis
 def get_schema(self, name, database=None):
     """Fetch table ``name`` through ``self._proxy`` and infer its schema.

     ``_ensure_split(name, database)`` supplies the (table id, dataset id)
     pair used for the lookup.
     """
     table_id, dataset_id = _ensure_split(name, database)
     return sch.infer(self._proxy.get_table(table_id, dataset_id))
Example #35
0
 def table(self, name, schema=None):
     """Look up ``name`` in ``self.dictionary`` and expose it as a table.

     ``schema`` is forwarded to ``sch.infer``; the result is wrapped in a
     ``PandasTable`` and returned as an expression.
     """
     stored = self.dictionary[name]
     inferred = sch.infer(stored, schema=schema)
     return PandasTable(name, inferred, self).to_expr()
Example #36
0
def test_dtype_int8():
    """A signed 8-bit column infers as 'int8'."""
    values = np.int8([-3, 9, 17])
    frame = pd.DataFrame({'col': values})
    assert sch.infer(frame) == ibis.schema([('col', 'int8')])
Example #37
0
 def table(self, name: str, schema: sch.Schema = None) -> DaskTable:
     """Return the ``to_expr()`` of a ``DaskTable`` for the object ``name``.

     The object is read from ``self.dictionary`` and ``schema`` is forwarded
     to ``sch.infer``.
     """
     registered = self.dictionary[name]
     resolved = sch.infer(registered, schema=schema)
     node = DaskTable(name, resolved, self)
     return node.to_expr()
Example #38
0
def test_dtype_int16():
    """A signed 16-bit column infers as 'int16'."""
    values = np.int16([-5, 0, 12])
    frame = pd.DataFrame({'col': values})
    assert sch.infer(frame) == ibis.schema([('col', 'int16')])
Example #39
0
def test_infer_exhaustive_dataframe():
    """Infer a schema from a one-partition dask frame covering every dtype.

    The underlying ten-row pandas frame mixes fixed-width integers, 32- and
    64-bit floats, native and object-backed booleans, strings and timestamps,
    several of them holding ``None``/NaN entries.  The inferred schema is
    compared with the expected (name, type) pairs.
    """
    timestamp_texts = [
        '2010-11-01 00:00:00',
        None,
        '2010-11-01 00:02:00.100000',
        '2010-11-01 00:03:00.300000',
        '2010-11-01 00:04:00.600000',
        '2010-11-01 00:05:00.100000',
        '2010-11-01 00:06:00.150000',
        '2010-11-01 00:07:00.210000',
        '2010-11-01 00:08:00.280000',
        '2010-11-01 00:09:00.360000',
    ]

    # Key order is kept alphabetical so it lines up with the expectation.
    source = pd.DataFrame({
        'bigint_col': np.arange(0, 100, 10, dtype='i8'),
        'bool_col': np.array(
            [True, False, True, False, True]
            + [None, True, False, True, False],
            dtype=np.bool_,
        ),
        'bool_obj_col': np.array(
            [True, False, np.nan, False, True]
            + [np.nan, True, np.nan, True, False],
            dtype=np.object_,
        ),
        'date_string_col': ['11/01/10', None] + ['11/01/10'] * 8,
        'double_col': np.array(
            [
                0.0,
                10.1,
                np.nan,
                30.299999999999997,
                40.399999999999999,
                50.5,
                60.599999999999994,
                70.700000000000003,
                80.799999999999997,
                90.899999999999991,
            ],
            dtype=np.float64,
        ),
        'float_col': np.array(
            [
                np.nan,
                1.1000000238418579,
                2.2000000476837158,
                3.2999999523162842,
                4.4000000953674316,
                5.5,
                6.5999999046325684,
                7.6999998092651367,
                8.8000001907348633,
                9.8999996185302734,
            ],
            dtype='f4',
        ),
        'int_col': np.arange(10, dtype='i4'),
        'month': [11, 11, 11, 11, 2, 11, 11, 11, 11, 11],
        'smallint_col': np.arange(10, dtype='i2'),
        'string_col': [
            '0', '1', None, 'double , whammy', '4',
            '5', '6', '7', '8', '9',
        ],
        'timestamp_col': [
            None if text is None else pd.Timestamp(text)
            for text in timestamp_texts
        ],
        'tinyint_col': np.arange(10, dtype='i1'),
        'year': [2010] * 5 + [2009] * 5,
    })
    frame = dd.from_pandas(source, npartitions=1)

    expected_fields = [
        ('bigint_col', dt.int64),
        ('bool_col', dt.boolean),
        ('bool_obj_col', dt.boolean),
        ('date_string_col', dt.string),
        ('double_col', dt.double),
        ('float_col', dt.float),
        ('int_col', dt.int32),
        ('month', dt.int64),
        ('smallint_col', dt.int16),
        ('string_col', dt.string),
        ('timestamp_col', dt.timestamp),
        ('tinyint_col', dt.int8),
        ('year', dt.int64),
    ]

    assert sch.infer(frame) == ibis.schema(expected_fields)
Example #40
0
def test_dtype_int32():
    """A signed 32-bit column infers as 'int32'."""
    values = np.int32([-12, 3, 25000])
    frame = pd.DataFrame({'col': values})
    assert sch.infer(frame) == ibis.schema([('col', 'int32')])