Example #1
def test_create_table_with_partition_column(con, temp_table_db):
    schema = ibis.schema(
        [
            ('year', 'int32'),
            ('month', 'string'),
            ('day', 'int8'),
            ('value', 'double'),
        ]
    )

    tmp_db, name = temp_table_db
    con.create_table(
        name, schema=schema, database=tmp_db, partition=['year', 'month']
    )

    # the partition columns are put at the end of the table
    ex_schema = ibis.schema(
        [
            ('day', 'int8'),
            ('value', 'double'),
            ('year', 'int32'),
            ('month', 'string'),
        ]
    )
    table_schema = con.get_schema(name, database=tmp_db)
    assert_equal(table_schema, ex_schema)

    partition_schema = con.database(tmp_db).table(name).partition_schema()

    expected = ibis.schema([('year', 'int32'), ('month', 'string')])
    assert_equal(partition_schema, expected)
Example #2
def create_parquet_tables(con):
    parquet_files = con.hdfs.ls(pjoin(ENV.test_data_dir, 'parquet'))
    schemas = {
        'functional_alltypes': ibis.schema(
            [('id', 'int32'),
             ('bool_col', 'boolean'),
             ('tinyint_col', 'int8'),
             ('smallint_col', 'int16'),
             ('int_col', 'int32'),
             ('bigint_col', 'int64'),
             ('float_col', 'float'),
             ('double_col', 'double'),
             ('date_string_col', 'string'),
             ('string_col', 'string'),
             ('timestamp_col', 'timestamp'),
             ('year', 'int32'),
             ('month', 'int32')]),
        'tpch_region': ibis.schema(
            [('r_regionkey', 'int16'),
             ('r_name', 'string'),
             ('r_comment', 'string')])}
    tables = []
    for path in parquet_files:
        head, table_name = osp.split(path)
        print('Creating {0}'.format(table_name))
        # if no schema was provided, infer it from the file
        schema = schemas.get(table_name)
        table = con.parquet_file(path, schema=schema, name=table_name,
                                 database=ENV.test_data_db, persist=True)
        tables.append(table)
    return tables
Example #3
    def test_create_table_with_partition_column(self):
        schema = ibis.schema([('year', 'int32'),
                              ('month', 'int8'),
                              ('day', 'int8'),
                              ('value', 'double')])

        name = _tmp_name()
        self.con.create_table(name, schema=schema,
                              database=self.tmp_db,
                              partition=['year', 'month'],
                              location=self._temp_location())
        self.temp_tables.append(name)

        # the partition columns are put at the end of the table
        ex_schema = ibis.schema([('day', 'int8'),
                                 ('value', 'double'),
                                 ('year', 'int32'),
                                 ('month', 'int8')])
        table_schema = self.con.get_schema(name, database=self.tmp_db)
        assert_equal(table_schema, ex_schema)

        partition_schema = self.db.table(name).partition_schema()

        expected = ibis.schema([('year', 'int32'),
                                ('month', 'int8')])
        assert_equal(partition_schema, expected)
Example #4
def create_parquet_tables(con):
    parquet_files = con.hdfs.ls(pjoin(ENV.test_data_dir, "parquet"))
    schemas = {
        "functional_alltypes": ibis.schema(
            [
                ("id", "int32"),
                ("bool_col", "boolean"),
                ("tinyint_col", "int8"),
                ("smallint_col", "int16"),
                ("int_col", "int32"),
                ("bigint_col", "int64"),
                ("float_col", "float"),
                ("double_col", "double"),
                ("date_string_col", "string"),
                ("string_col", "string"),
                ("timestamp_col", "timestamp"),
                ("year", "int32"),
                ("month", "int32"),
            ]
        ),
        "tpch_region": ibis.schema([("r_regionkey", "int16"), ("r_name", "string"), ("r_comment", "string")]),
    }

    tables = []

    for path in parquet_files:
        head, table_name = posixpath.split(path)
        print("Creating {0}".format(table_name))
        # if no schema was provided, infer it from the file
        schema = schemas.get(table_name)
        t = con.parquet_file(path, schema=schema, name=table_name, database=ENV.test_data_db, persist=True)
        tables.append(t)

    return tables
Example #5
def test_schema_subset():
    s1 = ibis.schema([('a', dt.int64), ('b', dt.int32), ('c', dt.string)])

    s2 = ibis.schema([('a', dt.int64), ('c', dt.string)])

    assert s1 > s2
    assert s2 < s1

    assert s1 >= s2
    assert s2 <= s1
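
The ordering operators above behave like subset tests on the schema's (name, type) pairs. A small hedged sketch of the boundary case, assuming equal schemas satisfy the non-strict comparisons but not the strict ones (same imports as the example above):

def test_schema_subset_equal_sketch():
    # hedged assumption: an identical schema is <= and >= but not < or >
    s1 = ibis.schema([('a', dt.int64), ('b', dt.int32), ('c', dt.string)])
    s3 = ibis.schema([('a', dt.int64), ('b', dt.int32), ('c', dt.string)])

    assert s1 >= s3 and s1 <= s3
    assert not s1 > s3 and not s1 < s3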
Example #6
    def test_create_table_with_partition_column(self):
        schema = ibis.schema([("year", "int32"), ("month", "int8"), ("day", "int8"), ("value", "double")])

        name = util.guid()
        self.con.create_table(name, schema=schema, partition=["year", "month"])
        self.temp_tables.append(name)

        # the partition column get put at the end of the table
        ex_schema = ibis.schema([("day", "int8"), ("value", "double"), ("year", "int32"), ("month", "int8")])
        table_schema = self.con.get_schema(name)
        assert_equal(table_schema, ex_schema)

        partition_schema = self.con.get_partition_schema(name)
        expected = ibis.schema([("year", "int32"), ("month", "int8")])
        assert_equal(partition_schema, expected)
Example #7
def pandas_to_ibis_schema(frame):
    # no analog for decimal in pandas
    pairs = []
    for col_name in frame:
        ibis_type = pandas_col_to_ibis_type(frame[col_name])
        pairs.append((col_name, ibis_type))
    return ibis.schema(pairs)
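
A minimal usage sketch for the helper above. The DataFrame and the expected result are illustrative; the sketch assumes pandas_col_to_ibis_type maps int64 columns to int64 and object (string) columns to string, consistent with the dtype tests elsewhere on this page:

import pandas as pd

df = pd.DataFrame({'x': [1, 2, 3], 'y': ['a', 'b', 'c']})
inferred = pandas_to_ibis_schema(df)
# assumed mapping: int64 -> int64, object of str -> string
assert inferred == ibis.schema([('x', 'int64'), ('y', 'string')])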
Example #8
def test_create_table_schema(con):
    t_name = 'mytable'

    con.drop_table(t_name, force=True)

    schema = ibis.schema(
        [
            ('a', 'float'),
            ('b', 'double'),
            ('c', 'int32'),
            ('d', 'int64'),
            ('x', 'point'),
            ('y', 'linestring'),
            ('z', 'polygon'),
            ('w', 'multipolygon'),
        ]
    )

    con.create_table(t_name, schema=schema)

    try:
        t = con.table(t_name)

        assert isinstance(t.a, ir.FloatingColumn)
        assert isinstance(t.b, ir.FloatingColumn)
        assert isinstance(t.c, ir.IntegerColumn)
        assert isinstance(t.d, ir.IntegerColumn)
        assert isinstance(t.x, ir.PointColumn)
        assert isinstance(t.y, ir.LineStringColumn)
        assert isinstance(t.z, ir.PolygonColumn)
        assert isinstance(t.w, ir.MultiPolygonColumn)
    finally:
        con.drop_table(t_name)
Example #9
def test_create_table_parquet_with_schema():
    directory = '/path/to/'

    schema = ibis.schema(
        [('foo', 'string'), ('bar', 'int8'), ('baz', 'int16')]
    )

    statement = ddl.CreateTableParquet(
        'new_table',
        directory,
        schema=schema,
        external=True,
        can_exist=True,
        database='foo',
    )

    result = statement.compile()
    expected = """\
CREATE EXTERNAL TABLE IF NOT EXISTS foo.`new_table`
(`foo` string,
 `bar` tinyint,
 `baz` smallint)
STORED AS PARQUET
LOCATION '{0}'""".format(
        directory
    )

    assert result == expected
Example #10
    def test_create_external_ddl(self):
        schema = ibis.schema(
            [('key1', 'int32'), ('key2', 'int64'), ('value1', 'double')]
        )

        stmt = ksupport.CreateTableKudu(
            'impala_name',
            'kudu_name',
            ['master1.d.com:7051', 'master2.d.com:7051'],
            schema,
            ['key1', 'key2'],
        )

        result = stmt.compile()
        expected = """\
CREATE EXTERNAL TABLE `impala_name`
(`key1` int,
 `key2` bigint,
 `value1` double)
TBLPROPERTIES (
  'kudu.key_columns'='key1, key2',
  'kudu.master_addresses'='master1.d.com:7051, master2.d.com:7051',
  'kudu.table_name'='kudu_name',
  'storage_handler'='com.cloudera.kudu.hive.KuduStorageHandler'
)"""
        assert result == expected
Example #11
def test_add_partition_string_key():
    part_schema = ibis.schema([('foo', 'int32'), ('bar', 'string')])
    stmt = ddl.AddPartition('tbl', {'foo': 5, 'bar': 'qux'}, part_schema)

    result = stmt.compile()
    expected = 'ALTER TABLE tbl ADD PARTITION (foo=5, bar="qux")'
    assert result == expected
Example #12
def test_create_table_with_location_compile():
    path = '/path/to/table'
    schema = ibis.schema(
        [('foo', 'string'), ('bar', 'int8'), ('baz', 'int16')]
    )
    statement = ddl.CreateTableWithSchema(
        'another_table',
        schema,
        can_exist=False,
        format='parquet',
        path=path,
        database='foo',
    )
    result = statement.compile()

    expected = """\
CREATE TABLE foo.`another_table`
(`foo` string,
 `bar` tinyint,
 `baz` smallint)
STORED AS PARQUET
LOCATION '{0}'""".format(
        path
    )
    assert result == expected
Example #13
    def create_table(self, name, expr=None, schema=None, database=None):
        if database is not None and database != self.engine.url.database:
            raise NotImplementedError(
                'Creating tables from a different database is not yet '
                'implemented'
            )

        if expr is None and schema is None:
            raise ValueError('You must pass either an expression or a schema')

        if expr is not None and schema is not None:
            if not expr.schema().equals(ibis.schema(schema)):
                raise TypeError(
                    'Expression schema is not equal to passed schema. '
                    'Try passing the expression without the schema'
                )
        if schema is None:
            schema = expr.schema()

        self._schemas[self._fully_qualified_name(name, database)] = schema
        t = table_from_schema(name, self.meta, schema)

        with self.begin() as bind:
            t.create(bind=bind)
            if expr is not None:
                bind.execute(
                    t.insert().from_select(list(expr.columns), expr.compile())
                )
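
The schema-mismatch guard above relies on Schema.equals, which compares column names and types. A small sketch of that check in isolation (schemas and names are illustrative):

s_expr = ibis.schema([('a', 'int64'), ('b', 'string')])
s_passed = ibis.schema([('a', 'int64'), ('b', 'string')])
assert s_expr.equals(s_passed)  # identical names and types: accepted

s_wrong = ibis.schema([('a', 'int32'), ('b', 'string')])
assert not s_expr.equals(s_wrong)  # type mismatch: TypeError raised above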
Example #14
    def test_kudu_schema_convert(self):
        spec = [
            # name, ibis type, kudu type, is_nullable, is_primary_key
            ('a', dt.Int8(False), 'int8', False, True),
            ('b', dt.Int16(False), 'int16', False, True),
            ('c', dt.Int32(False), 'int32', False, False),
            ('d', dt.Int64(True), 'int64', True, False),
            ('e', dt.String(True), 'string', True, False),
            ('f', dt.Boolean(False), 'bool', False, False),
            ('g', dt.Float(False), 'float', False, False),
            ('h', dt.Double(True), 'double', True, False),
            # TODO
            # ('i', 'binary', False, False),
            ('j', dt.Timestamp(True), 'timestamp', True, False),
        ]

        builder = kudu.schema_builder()
        primary_keys = []
        ibis_types = []
        for name, itype, type_, is_nullable, is_primary_key in spec:
            builder.add_column(name, type_, nullable=is_nullable)

            if is_primary_key:
                primary_keys.append(name)

            ibis_types.append((name, itype))

        builder.set_primary_keys(primary_keys)
        kschema = builder.build()

        ischema = ksupport.schema_kudu_to_ibis(kschema)
        expected = ibis.schema(ibis_types)

        assert_equal(ischema, expected)
Example #15
def test_add_drop_partition_owned_by_impala(hdfs, con, temp_table):
    schema = ibis.schema(
        [('foo', 'string'), ('year', 'int32'), ('month', 'int16')]
    )
    name = temp_table
    con.create_table(name, schema=schema, partition=['year', 'month'])

    table = con.table(name)

    part = {'year': 2007, 'month': 4}

    subdir = util.guid()
    basename = util.guid()
    path = '/tmp/{}/{}'.format(subdir, basename)

    hdfs.mkdir('/tmp/{}'.format(subdir))
    hdfs.chown('/tmp/{}'.format(subdir), owner='impala', group='supergroup')

    table.add_partition(part, location=path)

    assert len(table.partitions()) == 2

    table.drop_partition(part)

    assert len(table.partitions()) == 1
Example #16
def test_is_partitioned(con, temp_table):
    schema = ibis.schema(
        [('foo', 'string'), ('year', 'int32'), ('month', 'string')]
    )
    name = temp_table
    con.create_table(name, schema=schema, partition=['year', 'month'])
    assert con.table(name).is_partitioned
Example #17
def test_apply_to_schema_with_timezone():
    data = {'time': pd.date_range('2018-01-01', '2018-01-02', freq='H')}
    df = pd.DataFrame(data)
    expected = df.assign(time=df.time.astype('datetime64[ns, EST]'))
    desired_schema = ibis.schema([('time', 'timestamp("EST")')])
    result = desired_schema.apply_to(df.copy())
    tm.assert_frame_equal(expected, result)
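
The string type 'timestamp("EST")' above should be equivalent to constructing the datatype directly, as the dt.Timestamp('US/Eastern') comparison in Example #22 suggests. A hedged one-liner, assuming ibis.expr.datatypes is imported as dt:

assert ibis.schema([('time', 'timestamp("EST")')]).equals(
    ibis.schema([('time', dt.Timestamp('EST'))])
)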
Example #18
    def test_create_table_delimited(self):
        path = '/path/to/files/'
        schema = ibis.schema([('a', 'string'),
                              ('b', 'int32'),
                              ('c', 'double'),
                              ('d', 'decimal(12,2)')])

        stmt = ddl.CreateTableDelimited('new_table', path, schema,
                                        delimiter='|',
                                        escapechar='\\',
                                        lineterminator='\0',
                                        database='foo',
                                        can_exist=True)

        result = stmt.compile()
        expected = """\
CREATE EXTERNAL TABLE IF NOT EXISTS foo.`new_table`
(`a` string,
 `b` int,
 `c` double,
 `d` decimal(12,2))
ROW FORMAT DELIMITED
FIELDS TERMINATED BY '|'
ESCAPED BY '\\'
LINES TERMINATED BY '\0'
LOCATION '{0}'""".format(path)
        assert result == expected
Example #19
    def test_create_table_delimited(self):
        path = "/path/to/files/"
        schema = ibis.schema([("a", "string"), ("b", "int32"), ("c", "double"), ("d", "decimal(12,2)")])

        stmt = ddl.CreateTableDelimited(
            "new_table",
            path,
            schema,
            delimiter="|",
            escapechar="\\",
            lineterminator="\0",
            database="foo",
            can_exist=True,
        )

        result = stmt.compile()
        expected = """\
CREATE EXTERNAL TABLE IF NOT EXISTS foo.`new_table`
(`a` string,
 `b` int,
 `c` double,
 `d` decimal(12,2))
ROW FORMAT DELIMITED
FIELDS TERMINATED BY '|'
ESCAPED BY '\\'
LINES TERMINATED BY '\0'
LOCATION '{0}'""".format(
            path
        )
        assert result == expected
Example #20
def test_filter_with_analytic():
    x = ibis.table(ibis.schema([('col', 'int32')]), 'x')
    with_filter_col = x[x.columns + [ibis.null().name('filter')]]
    filtered = with_filter_col[with_filter_col['filter'].isnull()]
    subquery = filtered[filtered.columns]

    with_analytic = subquery[['col', subquery.count().name('analytic')]]
    expr = with_analytic[with_analytic.columns]

    result = ibis.impala.compile(expr)
    expected = """\
SELECT `col`, `analytic`
FROM (
  SELECT `col`, count(*) OVER () AS `analytic`
  FROM (
    SELECT `col`, `filter`
    FROM (
      SELECT *
      FROM (
        SELECT `col`, NULL AS `filter`
        FROM x
      ) t3
      WHERE `filter` IS NULL
    ) t2
  ) t1
) t0"""

    assert result == expected
Example #21
    def test_sqla_schema_conversion(self):
        typespec = [
            # name, sqlalchemy type, nullable, ibis type
            ("smallint", sat.SmallInteger, False, dt.int16),
            ("int", sat.Integer, True, dt.int32),
            ("integer", sat.INTEGER(), True, dt.int64),
            ("bigint", sat.BigInteger, False, dt.int64),
            ("real", sat.REAL, True, dt.double),
            ("bool", sat.Boolean, True, dt.boolean),
            ("timestamp", sat.DateTime, True, dt.timestamp),
        ]

        sqla_types = []
        ibis_types = []
        for name, t, nullable, ibis_type in typespec:
            sqla_type = sa.Column(name, t, nullable=nullable)
            sqla_types.append(sqla_type)
            ibis_types.append((name, ibis_type(nullable)))

        table = sa.Table("tname", self.meta, *sqla_types)

        schema = alch.schema_from_table(table)
        expected = ibis.schema(ibis_types)

        assert_equal(schema, expected)
Example #22
def test_timestamp_with_timezone():
    df = pd.DataFrame(
        {'A': pd.date_range('20130101', periods=3, tz='US/Eastern')}
    )
    schema = sch.infer(df)
    expected = ibis.schema([('A', "timestamp('US/Eastern')")])
    assert schema.equals(expected)
    assert schema.types[0].equals(dt.Timestamp('US/Eastern'))
Example #23
    def test_dtype_datetime64(self):
        df = pd.DataFrame({
            'col': [pd.Timestamp('2010-11-01 00:01:00'),
                    pd.Timestamp('2010-11-01 00:02:00.1000'),
                    pd.Timestamp('2010-11-01 00:03:00.300000')]})
        inferred = pandas_to_ibis_schema(df)
        expected = ibis.schema([('col', 'timestamp')])
        assert inferred == expected
Example #24
    def test_dtype_timedelta64(self):
        df = pd.DataFrame({
            'col': [pd.Timedelta('1 days'),
                    pd.Timedelta('-1 days 2 min 3us'),
                    pd.Timedelta('-2 days +23:57:59.999997')]})
        inferred = pandas_to_ibis_schema(df)
        expected = ibis.schema([('col', 'int64')])
        assert inferred == expected
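
The int64 result above is presumably the underlying nanosecond representation: pandas stores timedelta64[ns] as 64-bit nanosecond counts, which a quick check makes concrete:

import pandas as pd

one_day = pd.Series([pd.Timedelta('1 days')])
# 86400 seconds per day, 10**9 nanoseconds per second
assert one_day.astype('int64')[0] == 86400 * 10**9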
Example #25
    def test_query_parquet_file_like_table(self):
        hdfs_path = pjoin(self.test_data_dir, "parquet/tpch_region")

        ex_schema = ibis.schema([("r_regionkey", "int16"), ("r_name", "string"), ("r_comment", "string")])

        table = self.con.parquet_file(hdfs_path, like_table="tpch_region")

        assert_equal(table.schema(), ex_schema)
Example #26
    def test_is_partitioned(self):
        schema = ibis.schema([('foo', 'string'),
                              ('year', 'int32'),
                              ('month', 'int16')])
        name = _tmp_name()
        self.db.create_table(name, schema=schema,
                             partition=['year', 'month'])
        assert self.db.table(name).is_partitioned
Example #27
def create_parquet_tables(con, executor):
    def create_table(table_name):
        logger.info('Creating %s', table_name)
        schema = schemas.get(table_name)
        path = os.path.join(ENV.test_data_dir, 'parquet', table_name)
        table = con.parquet_file(
            path,
            schema=schema,
            name=table_name,
            database=ENV.test_data_db,
            persist=True,
        )
        return table

    parquet_files = con.hdfs.ls(os.path.join(ENV.test_data_dir, 'parquet'))
    schemas = {
        'functional_alltypes': ibis.schema(
            [
                ('id', 'int32'),
                ('bool_col', 'boolean'),
                ('tinyint_col', 'int8'),
                ('smallint_col', 'int16'),
                ('int_col', 'int32'),
                ('bigint_col', 'int64'),
                ('float_col', 'float'),
                ('double_col', 'double'),
                ('date_string_col', 'string'),
                ('string_col', 'string'),
                ('timestamp_col', 'timestamp'),
                ('year', 'int32'),
                ('month', 'int32'),
            ]
        ),
        'tpch_region': ibis.schema(
            [
                ('r_regionkey', 'int16'),
                ('r_name', 'string'),
                ('r_comment', 'string'),
            ]
        ),
    }
    return (
        executor.submit(create_table, table_name)
        for table_name in parquet_files
    )
Example #28
    def test_query_parquet_file_like_table(self):
        hdfs_path = pjoin(self.test_data_dir, 'parquet/tpch_region')

        ex_schema = ibis.schema([('r_regionkey', 'int16'),
                                 ('r_name', 'string'),
                                 ('r_comment', 'string')])

        table = self.con.parquet_file(hdfs_path, like_table='tpch_region')

        assert_equal(table.schema(), ex_schema)
Example #29
    def test_query_parquet_infer_schema(self):
        hdfs_path = pjoin(self.test_data_dir, "parquet/tpch_region")
        table = self.con.parquet_file(hdfs_path)

        # NOTE: the actual schema should have an int16, but because this is
        # inferred from a parquet file, which has no notion of int16, the
        # inferred schema will have an int32 instead.
        ex_schema = ibis.schema([("r_regionkey", "int32"), ("r_name", "string"), ("r_comment", "string")])

        assert_equal(table.schema(), ex_schema)
Example #30
    def test_create_partitioned_separate_schema(self):
        schema = ibis.schema([('day', 'int8'),
                              ('value', 'double')])
        part_schema = ibis.schema([('year', 'int32'),
                                   ('month', 'int8')])

        name = _tmp_name()
        self.con.create_table(name, schema=schema, partition=part_schema)
        self.temp_tables.append(name)

        # the partition columns are put at the end of the table
        ex_schema = ibis.schema([('day', 'int8'),
                                 ('value', 'double'),
                                 ('year', 'int32'),
                                 ('month', 'int8')])
        table_schema = self.con.get_schema(name)
        assert_equal(table_schema, ex_schema)

        partition_schema = self.con.table(name).partition_schema()
        assert_equal(partition_schema, part_schema)
Example #31
    def test_create_table_with_location(self):
        path = '/path/to/table'
        schema = ibis.schema([('foo', 'string'),
                              ('bar', 'int8'),
                              ('baz', 'int16')])
        statement = ddl.CreateTableWithSchema('another_table', schema,
                                              can_exist=False,
                                              format='parquet',
                                              path=path, database='foo')
        result = statement.compile()

        expected = """\
CREATE TABLE foo.`another_table`
(`foo` string,
 `bar` tinyint,
 `baz` smallint)
STORED AS PARQUET
LOCATION '{0}'""".format(path)
        assert result == expected
Example #32
def test_load_data_sqlalchemy(backend, con, temp_table):
    sch = ibis.schema([
        ('first_name', 'string'),
        ('last_name', 'string'),
        ('department_name', 'string'),
        ('salary', 'float64'),
    ])

    df = pd.DataFrame({
        'first_name': ['A', 'B', 'C'],
        'last_name': ['D', 'E', 'F'],
        'department_name': ['AA', 'BB', 'CC'],
        'salary': [100.0, 200.0, 300.0],
    })
    con.create_table(temp_table, schema=sch)
    con.load_data(temp_table, df, if_exists='append')
    result = con.table(temp_table).execute()

    backend.assert_frame_equal(df, result)
Example #33
    def batting(self) -> ir.TableExpr:
        schema = ibis.schema(
            [
                ('lgID', dt.string),
                ('G', dt.float64),
                ('AB', dt.float64),
                ('R', dt.float64),
                ('H', dt.float64),
                ('X2B', dt.float64),
                ('X3B', dt.float64),
                ('HR', dt.float64),
                ('RBI', dt.float64),
                ('SB', dt.float64),
                ('CS', dt.float64),
                ('BB', dt.float64),
                ('SO', dt.float64),
            ]
        )
        return self.connection.table('batting', schema=schema)
Example #34
def test_nullable_input_output(con, backend, temp_table):
    # - Impala, PySpark and Spark non-nullable issues #2138 and #2137
    if not hasattr(con, 'create_table') or not hasattr(con, 'drop_table'):
        pytest.xfail(
            '{} backend does not have create_table or drop_table '
            'methods'.format(type(backend).__name__)
        )

    sch = ibis.schema([
        ('foo', 'int64'),
        ('bar', ibis.expr.datatypes.int64(nullable=False)),
        ('baz', 'boolean*'),
    ])

    con.create_table(temp_table, schema=sch)

    t = con.table(temp_table)

    assert t.schema().types[0].nullable
    assert not t.schema().types[1].nullable
    assert t.schema().types[2].nullable
Example #35
    def test_add_drop_partition_no_location(self):
        schema = ibis.schema([('foo', 'string'), ('year', 'int32'),
                              ('month', 'int16')])
        name = _tmp_name()
        self.db.create_table(name, schema=schema, partition=['year', 'month'])

        table = self.db.table(name)

        part = {'year': 2007, 'month': 4}

        table.add_partition(part)

        assert len(table.partitions()) == 2

        table.drop_partition(part)

        assert len(table.partitions()) == 1

        table.drop()
Example #36
    def test_query_delimited_file_directory(self):
        hdfs_path = pjoin(self.test_data_dir, 'csv')

        schema = ibis.schema([('foo', 'string'),
                              ('bar', 'double'),
                              ('baz', 'int8')])
        name = 'delimited_table_test1'
        table = self.con.delimited_file(hdfs_path, schema, name=name,
                                        database=self.tmp_db,
                                        delimiter=',')
        try:
            expr = (table
                    [table.bar > 0]
                    .group_by('foo')
                    .aggregate([table.bar.sum().name('sum(bar)'),
                                table.baz.sum().name('mean(baz)')]))
            expr.execute()
        finally:
            self.con.drop_table(name, database=self.tmp_db)
Example #37
    def test_add_drop_partition(self):
        pytest.skip('HIVE-12613')
        schema = ibis.schema([('foo', 'string'), ('year', 'int32'),
                              ('month', 'int16')])
        name = _tmp_name()
        self.db.create_table(name, schema=schema, partition=['year', 'month'])

        table = self.db.table(name)

        part = {'year': 2007, 'month': 4}

        path = '/tmp/tmp-{0}'.format(util.guid())
        table.add_partition(part, location=path)

        assert len(table.partitions()) == 2

        table.drop_partition(part)

        assert len(table.partitions()) == 1
Example #38
def test_add_drop_partition_hive_bug(con, temp_table):
    schema = ibis.schema([('foo', 'string'), ('year', 'int32'),
                          ('month', 'int16')])
    name = temp_table
    con.create_table(name, schema=schema, partition=['year', 'month'])

    table = con.table(name)

    part = {'year': 2007, 'month': 4}

    path = '/tmp/{}'.format(util.guid())

    table.add_partition(part, location=path)

    assert len(table.partitions()) == 2

    table.drop_partition(part)

    assert len(table.partitions()) == 1
Example #39
def test_query_parquet_file_with_schema(con, test_data_dir):
    hdfs_path = pjoin(test_data_dir, 'parquet/tpch_region')

    schema = ibis.schema([
        ('r_regionkey', 'int16'),
        ('r_name', 'string'),
        ('r_comment', 'string'),
    ])

    table = con.parquet_file(hdfs_path, schema=schema)

    name = table.op().name

    # table exists
    con.table(name)

    expr = table.r_name.value_counts()
    expr.execute()

    assert table.count().execute() == 5
Example #40
def test_persist_parquet_file_with_name(con, test_data_dir, temp_table_db):
    import gc

    hdfs_path = pjoin(test_data_dir, 'parquet/tpch_region')

    tmp_db, name = temp_table_db
    schema = ibis.schema([
        ('r_regionkey', 'int16'),
        ('r_name', 'string'),
        ('r_comment', 'string'),
    ])
    con.parquet_file(hdfs_path,
                     schema=schema,
                     name=name,
                     database=tmp_db,
                     persist=True)
    gc.collect()

    # table still exists
    con.table(name, database=tmp_db)
Example #41
def test_mutation_fusion_no_overwrite():
    """Test fusion with chained mutation that doesn't overwrite existing
    columns.
    """
    t = ibis.table(ibis.schema([('col', 'int32')]), 't')

    result = t
    result = result.mutate(col1=t['col'] + 1)
    result = result.mutate(col2=t['col'] + 2)
    result = result.mutate(col3=t['col'] + 3)

    first_selection = result

    assert len(result.op().selections) == 4
    assert (first_selection.op().selections[1].equals(
        (t['col'] + 1).name('col1')))
    assert (first_selection.op().selections[2].equals(
        (t['col'] + 2).name('col2')))
    assert (first_selection.op().selections[3].equals(
        (t['col'] + 3).name('col3')))
Example #42
    def create_table(self, name, expr=None, schema=None, database=None):
        if database is not None and database != self.engine.url.database:
            raise NotImplementedError(
                'Creating tables from a different database is not yet '
                'implemented')

        if expr is None and schema is None:
            raise ValueError('You must pass either an expression or a schema')

        if expr is not None and schema is not None:
            if not expr.schema().equals(ibis.schema(schema)):
                raise TypeError(
                    'Expression schema is not equal to passed schema. '
                    'Try passing the expression without the schema')
        t = table_from_schema(name, self.meta, schema or expr.schema())
        with self.con.begin() as bind:
            t.create(bind=bind)
            if expr is not None:
                bind.execute(t.insert().from_select(list(expr.columns),
                                                    expr.compile()))
Example #43
def impala_create_test_database(con, env):
    con.drop_database(env.test_data_db, force=True)
    con.create_database(env.test_data_db)
    con.create_table(
        'alltypes',
        schema=ibis.schema(
            [
                ('a', 'int8'),
                ('b', 'int16'),
                ('c', 'int32'),
                ('d', 'int64'),
                ('e', 'float'),
                ('f', 'double'),
                ('g', 'string'),
                ('h', 'boolean'),
                ('i', 'timestamp'),
            ]
        ),
        database=env.test_data_db,
    )
Example #44
def test_create_table_schema(con, temp_table, properties):
    schema = ibis.schema([
        ('a', 'float'),
        ('b', 'double'),
        ('c', 'int8'),
        ('d', 'int16'),
        ('e', 'int32'),
        ('f', 'int64'),
        ('x', 'point'),
        ('y', 'linestring'),
        ('z', 'polygon'),
        ('w', 'multipolygon'),
    ])

    con.create_table(temp_table, schema=schema, **properties)

    t = con.table(temp_table)

    for k, i_type in t.schema().items():
        assert schema[k] == i_type
Example #45
def get_type(expr):
    try:
        return str(expr.type())
    except (AttributeError, NotImplementedError):
        pass

    try:
        schema = expr.schema()
    except (AttributeError, NotImplementedError):
        try:
            # As a last resort, try to get the name of the output_type class
            return expr.op().output_type().__name__
        except (AttributeError, NotImplementedError):
            return '\u2205'  # empty set character
    except com.IbisError:
        op = expr.op()
        assert isinstance(op, ops.Join)
        left_table_name = getattr(op.left.op(), 'name', None) or ops.genname()
        left_schema = op.left.schema()
        right_table_name = (
            getattr(op.right.op(), 'name', None) or ops.genname()
        )
        right_schema = op.right.schema()
        pairs = [
            ('{}.{}'.format(left_table_name, left_column), type)
            for left_column, type in left_schema.items()
        ] + [
            ('{}.{}'.format(right_table_name, right_column), type)
            for right_column, type in right_schema.items()
        ]
        schema = ibis.schema(pairs)

    return (
        ''.join(
            '<BR ALIGN="LEFT" />  <I>{}</I>: {}'.format(
                escape(name), escape(str(type))
            )
            for name, type in zip(schema.names, schema.types)
        )
        + '<BR ALIGN="LEFT" />'
    )
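
A usage sketch for the fall-through path above: a table expression has no .type(), so the AttributeError branch is taken, expr.schema() succeeds, and one <BR ALIGN="LEFT" />-separated entry per column comes back. The table here is illustrative:

t = ibis.table(ibis.schema([('col', 'int32')]), 'x')
label = get_type(t)
# expected shape (hedged):
# '<BR ALIGN="LEFT" />  <I>col</I>: int32<BR ALIGN="LEFT" />'
assert 'col' in label and 'int32' in label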
Example #46
    def test_load_data_partitioned(self):
        path = '/path/to/data'
        part = {'year': 2007, 'month': 7}
        part_schema = ibis.schema([('year', 'int32'), ('month', 'int32')])
        stmt = ddl.LoadData('functional_alltypes', path,
                            database='foo',
                            partition=part,
                            partition_schema=part_schema)

        result = stmt.compile()
        expected = """\
LOAD DATA INPATH '/path/to/data' INTO TABLE foo.`functional_alltypes`
PARTITION (year=2007, month=7)"""
        assert result == expected

        stmt.overwrite = True
        result = stmt.compile()
        expected = """\
LOAD DATA INPATH '/path/to/data' OVERWRITE INTO TABLE foo.`functional_alltypes`
PARTITION (year=2007, month=7)"""
        assert result == expected
Example #47
def test_convert_parquet(parquet_schema):
    strings = [dt.string, dt.string, dt.string]

    # uint32, int8, int16 stored as upcasted types
    types = ([
        dt.uint8,
        dt.uint16,
        dt.int64,
        dt.uint64,
        dt.int16,
        dt.int16,
        dt.int32,
        dt.int64,
        dt.float32,
        dt.float64,
        dt.boolean,
        dt.timestamp,
    ] + strings + [dt.binary, dt.int64])
    names = [
        'uint8',
        'uint16',
        'uint32',
        'uint64',
        'int8',
        'int16',
        'int32',
        'int64',
        'float32',
        'float64',
        'bool',
        'datetime',
        'str',
        'str_with_nulls',
        'empty_str',
        'bytes',
    ]
    expected = ibis.schema(zip(names, types))

    result = ibis.infer_schema(parquet_schema)
    assert result == expected
Example #48
def create_test_database(con):
    if con.exists_database(ENV.test_data_db):
        con.drop_database(ENV.test_data_db, force=True)
    con.create_database(ENV.test_data_db)
    logger.info('Created database %s', ENV.test_data_db)

    con.create_table(
        'alltypes',
        schema=ibis.schema([
            ('a', 'int8'),
            ('b', 'int16'),
            ('c', 'int32'),
            ('d', 'int64'),
            ('e', 'float'),
            ('f', 'double'),
            ('g', 'string'),
            ('h', 'boolean'),
            ('i', 'timestamp'),
        ]),
        database=ENV.test_data_db,
    )
    logger.info('Created empty table %s.`alltypes`', ENV.test_data_db)
Example #49
def test_query_delimited_file_directory(con, test_data_dir, tmp_db):
    hdfs_path = pjoin(test_data_dir, 'csv')

    schema = ibis.schema(
        [('foo', 'string'), ('bar', 'double'), ('baz', 'int8')]
    )
    name = 'delimited_table_test1'
    table = con.delimited_file(
        hdfs_path, schema, name=name, database=tmp_db, delimiter=','
    )

    expr = (
        table[table.bar > 0]
        .group_by('foo')
        .aggregate(
            [
                table.bar.sum().name('sum(bar)'),
                table.baz.sum().name('mean(baz)'),
            ]
        )
    )
    assert expr.execute() is not None
Example #50
def test_load_data_sqlalchemy(backend, con, temp_table):
    if not isinstance(con.dialect(), ibis.sql.alchemy.AlchemyDialect):
        pytest.skip('{} is not a SQL Alchemy Client.'.format(backend.name))

    sch = ibis.schema([
        ('first_name', 'string'),
        ('last_name', 'string'),
        ('department_name', 'string'),
        ('salary', 'float64'),
    ])

    df = pd.DataFrame({
        'first_name': ['A', 'B', 'C'],
        'last_name': ['D', 'E', 'F'],
        'department_name': ['AA', 'BB', 'CC'],
        'salary': [100.0, 200.0, 300.0],
    })
    con.create_table(temp_table, schema=sch)
    con.load_data(temp_table, df, if_exists='append')
    result = con.table(temp_table).execute()

    backend.assert_frame_equal(df, result)
Example #51
    def test_create_external_ddl(self):
        schema = ibis.schema([('key1', 'int32'), ('key2', 'int64'),
                              ('value1', 'double')])

        stmt = ksupport.CreateTableKudu(
            'impala_name', 'kudu_name',
            ['master1.d.com:7051', 'master2.d.com:7051'], schema,
            ['key1', 'key2'])

        result = stmt.compile()
        expected = """\
CREATE EXTERNAL TABLE `impala_name`
(`key1` int,
 `key2` bigint,
 `value1` double)
TBLPROPERTIES (
  'kudu.key_columns'='key1, key2',
  'kudu.master_addresses'='master1.d.com:7051, master2.d.com:7051',
  'kudu.table_name'='kudu_name',
  'storage_handler'='com.cloudera.kudu.hive.KuduStorageHandler'
)"""
        assert result == expected
Example #52
def test_convert_parquet(parquet_schema):
    # TODO(jreback)
    # not entirely sure this is correct
    # should these be strings in py2?
    if PY2:
        strings = [dt.binary, dt.binary, dt.binary]
    else:
        strings = [dt.string, dt.string, dt.string]

    # uint32, int8, int16 stored as upcasted types
    types = [
        dt.uint8, dt.uint16, dt.int64, dt.uint64, dt.int16, dt.int16, dt.int32,
        dt.int64, dt.float32, dt.float64, dt.boolean, dt.timestamp
    ] + strings + [dt.binary, dt.int64]
    names = [
        'uint8', 'uint16', 'uint32', 'uint64', 'int8', 'int16', 'int32',
        'int64', 'float32', 'float64', 'bool', 'datetime', 'str',
        'str_with_nulls', 'empty_str', 'bytes', '__index_level_0__'
    ]
    expected = ibis.schema(zip(names, types))

    result = ibis.infer_schema(parquet_schema)
    assert result == expected
Example #53
    def test_create_table_parquet_with_schema(self):
        directory = '/path/to/'

        schema = ibis.schema([('foo', 'string'), ('bar', 'int8'),
                              ('baz', 'int16')])

        statement = ddl.CreateTableParquet('new_table',
                                           directory,
                                           schema=schema,
                                           external=True,
                                           can_exist=True,
                                           database='foo')

        result = statement.compile()
        expected = """\
CREATE EXTERNAL TABLE IF NOT EXISTS foo.`new_table`
(`foo` string,
 `bar` tinyint,
 `baz` smallint)
STORED AS PARQUET
LOCATION '{0}'""".format(directory)

        assert result == expected
Example #54
def test_create_table_delimited():
    path = '/path/to/files/'
    schema = ibis.schema(
        [
            ('a', 'string'),
            ('b', 'int32'),
            ('c', 'double'),
            ('d', 'decimal(12, 2)'),
        ]
    )

    stmt = ddl.CreateTableDelimited(
        'new_table',
        path,
        schema,
        delimiter='|',
        escapechar='\\',
        lineterminator='\0',
        database='foo',
        can_exist=True,
    )

    result = stmt.compile()
    expected = """\
CREATE EXTERNAL TABLE IF NOT EXISTS foo.`new_table`
(`a` string,
 `b` int,
 `c` double,
 `d` decimal(12, 2))
ROW FORMAT DELIMITED
FIELDS TERMINATED BY '|'
ESCAPED BY '\\'
LINES TERMINATED BY '\0'
LOCATION '{0}'""".format(
        path
    )
    assert result == expected
Example #55
def test_sa_default_numeric_precision_and_scale(con, backend, dialects,
                                                default_precisions,
                                                default_scales):
    # TODO: find a better way to access ibis.sql.alchemy
    import ibis.sql.alchemy as alch

    dialect = dialects[backend.name]
    default_precision = default_precisions[backend.name]
    default_scale = default_scales[backend.name]

    typespec = [
        # name, sqlalchemy type, ibis type
        ('n1', dialect.NUMERIC, dt.Decimal(default_precision, default_scale)),
        ('n2', dialect.NUMERIC(5), dt.Decimal(5, default_scale)),
        ('n3', dialect.NUMERIC(None, 4), dt.Decimal(default_precision, 4)),
        ('n4', dialect.NUMERIC(10, 2), dt.Decimal(10, 2)),
    ]

    sqla_types = []
    ibis_types = []
    for name, t, ibis_type in typespec:
        sqla_type = sa.Column(name, t, nullable=True)
        sqla_types.append(sqla_type)
        ibis_types.append((name, ibis_type(nullable=True)))

    # Create a table with the numeric types.
    table_name = 'test_sa_default_param_decimal'
    engine = con.con
    table = sa.Table(table_name, sa.MetaData(bind=engine), *sqla_types)

    # Check that we can correctly recover the default precision and scale.
    schema = alch.schema_from_table(table)
    expected = ibis.schema(ibis_types)

    assert_equal(schema, expected)
    con.drop_table(table_name, force=True)
Example #56
def test_add_drop_partition_owned_by_impala(hdfs, con, temp_table):
    schema = ibis.schema([('foo', 'string'), ('year', 'int32'),
                          ('month', 'int16')])
    name = temp_table
    con.create_table(name, schema=schema, partition=['year', 'month'])

    table = con.table(name)

    part = {'year': 2007, 'month': 4}

    subdir = util.guid()
    basename = util.guid()
    path = f'/tmp/{subdir}/{basename}'

    hdfs.mkdir(f'/tmp/{subdir}')
    hdfs.chown(f'/tmp/{subdir}', owner='impala', group='supergroup')

    table.add_partition(part, location=path)

    assert len(table.partitions()) == 2

    table.drop_partition(part)

    assert len(table.partitions()) == 1
Example #57
def test_create_table_schema(con):
    t_name = 'mytable'

    con.drop_table(t_name, force=True)

    schema = ibis.schema([('a', 'float'), ('b', 'double'), ('c', 'int32'),
                          ('d', 'int64'), ('x', 'point'), ('y', 'linestring'),
                          ('z', 'polygon'), ('w', 'multipolygon')])

    con.create_table(t_name, schema=schema)

    try:
        t = con.table(t_name)

        assert isinstance(t.a, ir.FloatingColumn)
        assert isinstance(t.b, ir.FloatingColumn)
        assert isinstance(t.c, ir.IntegerColumn)
        assert isinstance(t.d, ir.IntegerColumn)
        assert isinstance(t.x, ir.PointColumn)
        assert isinstance(t.y, ir.LineStringColumn)
        assert isinstance(t.z, ir.PolygonColumn)
        assert isinstance(t.w, ir.MultiPolygonColumn)
    finally:
        con.drop_table(t_name)
Example #58
def test_read_csv(con, temp_table, filename):
    schema = ibis.schema(
        [
            ('index', 'int64'),
            ('Unnamed__0', 'int64'),
            ('id', 'int32'),
            ('bool_col', 'bool'),
            ('tinyint_col', 'int16'),
            ('smallint_col', 'int16'),
            ('int_col', 'int32'),
            ('bigint_col', 'int64'),
            ('float_col', 'float32'),
            ('double_col', 'double'),
            ('date_string_col', 'string'),
            ('string_col', 'string'),
            ('timestamp_col', 'timestamp'),
            ('year_', 'int32'),
            ('month_', 'int32'),
        ]
    )
    con.create_table(temp_table, schema=schema)

    # prepare the csv file inside the omnisci docker container;
    # if the file already exists, it will be overwritten
    con._execute(
        "COPY (SELECT * FROM functional_alltypes) TO '{}'".format(filename)
    )

    db = con.database()
    table = db.table(temp_table)
    table.read_csv(filename, header=False, quotechar='"', delimiter=",")

    df_read_csv = table.execute()
    df_expected = db.table("functional_alltypes").execute()

    pd.testing.assert_frame_equal(df_expected, df_read_csv)
Example #59
def test_sqla_schema_conversion(con):
    typespec = [
        # name, sqlalchemy type, nullable, ibis type
        ('smallint', sat.SmallInteger, False, dt.int16),
        ('int', sat.Integer, True, dt.int32),
        ('integer', sat.INTEGER(), True, dt.int32),
        ('bigint', sat.BigInteger, False, dt.int64),
        ('real', sat.REAL, True, dt.float32),
        ('bool', sat.Boolean, True, dt.bool),
        ('timestamp', sat.DateTime, True, dt.timestamp),
    ]

    sqla_types = []
    ibis_types = []
    for name, t, nullable, ibis_type in typespec:
        sqla_types.append(sa.Column(name, t, nullable=nullable))
        ibis_types.append((name, ibis_type(nullable=nullable)))

    table = sa.Table('tname', con.meta, *sqla_types)

    schema = schema_from_table(table)
    expected = ibis.schema(ibis_types)

    assert_equal(schema, expected)
Example #60
def test_query_schema(backend, con, alltypes, expr_fn, expected):
    if not hasattr(con, '_build_ast'):
        pytest.skip(
            '{} backend has no _build_ast method'.format(
                type(backend).__name__
            )
        )

    expr = expr_fn(alltypes)

    # we might need a public API for it
    ast = con._build_ast(expr, backend.make_context())
    query = con.query_class(con, ast)
    schema = query.schema()

    # clickhouse columns have been defined as non-nullable, whereas other
    # backends don't support non-nullable columns yet
    expected = ibis.schema(
        [
            (name, dtype(nullable=schema[name].nullable))
            for name, dtype in expected
        ]
    )
    assert query.schema().equals(expected)
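
The dtype(nullable=...) calls above rely on ibis datatypes being callable, returning a copy with the requested nullability; the same pattern appears in Examples #21 and #55. A minimal hedged check, assuming ibis.expr.datatypes is imported as dt:

assert not dt.int64(nullable=False).nullable
assert dt.int64(nullable=True).nullable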