Example #1
0
    def test_ctas_ddl(self):
        con = MockConnection()

        select = build_ast(con.table('test1')).queries[0]
        statement = ksupport.CTASKudu(
            'another_table',
            'kudu_name',
            ['dom.d.com:7051'],
            select,
            ['string_col'],
            external=True,
            can_exist=False,
            database='foo',
        )
        result = statement.compile()

        expected = """\
CREATE EXTERNAL TABLE foo.`another_table`
TBLPROPERTIES (
  'kudu.key_columns'='string_col',
  'kudu.master_addresses'='dom.d.com:7051',
  'kudu.table_name'='kudu_name',
  'storage_handler'='com.cloudera.kudu.hive.KuduStorageHandler'
) AS
SELECT *
FROM test1"""
        assert result == expected
Example #2
0
    def setUp(self):
        """Expose one column of each primitive type as a short alias."""
        self.con = MockConnection()
        self.table = self.con.table('functional_alltypes')

        # Integer columns of increasing width.
        self.i8 = self.table.tinyint_col
        self.i16 = self.table.smallint_col
        self.i32 = self.table.int_col
        self.i64 = self.table.bigint_col
        # Floating point, string, boolean and timestamp columns.
        self.d = self.table.double_col
        self.f = self.table.float_col
        self.s = self.table.string_col
        self.b = self.table.bool_col
        self.t = self.table.timestamp_col
        # Decimal column lives in a separate mock table.
        self.dec = self.con.table('tpch_customer').c_acctbal

        # Every alias above, in the order the tests iterate over them.
        self.all_cols = [self.i8, self.i16, self.i32, self.i64, self.d,
                         self.f, self.dec, self.s, self.b, self.t]
Example #3
0
 def setUp(self):
     """Create an eight-column test schema, its dict form, and a mock connection."""
     self.schema = [
         ('a', 'int8'), ('b', 'int16'), ('c', 'int32'), ('d', 'int64'),
         ('e', 'float'), ('f', 'double'), ('g', 'string'), ('h', 'boolean'),
     ]
     self.schema_dict = dict(self.schema)
     self.table = ibis.table(self.schema)
     self.con = MockConnection()
Example #4
0
    def test_ctas_ddl(self):
        con = MockConnection()

        select = build_ast(con.table('test1')).queries[0]
        statement = ksupport.CTASKudu(
            'another_table',
            'kudu_name',
            ['dom.d.com:7051'],
            select,
            ['string_col'],
            external=True,
            can_exist=False,
            database='foo',
        )
        result = statement.compile()

        expected = """\
CREATE EXTERNAL TABLE foo.`another_table`
TBLPROPERTIES (
  'kudu.key_columns'='string_col',
  'kudu.master_addresses'='dom.d.com:7051',
  'kudu.table_name'='kudu_name',
  'storage_handler'='com.cloudera.kudu.hive.KuduStorageHandler'
) AS
SELECT *
FROM test1"""
        assert result == expected
Example #5
0
    def setUp(self):
        """Record which alltypes columns are integer, boolean, and float typed."""
        self.con = MockConnection()
        self.table = self.con.table('alltypes')

        self.bool_cols = ['h']
        self.int_cols = ['a', 'b', 'c', 'd']
        self.float_cols = ['e', 'f']
Example #6
0
class TestInteractiveUse(unittest.TestCase):
    """Behavior of expression repr() under the 'interactive' config option."""

    def setUp(self):
        # MockConnection records each query it is asked to run in
        # `executed_queries`, so tests can assert on what was executed.
        self.con = MockConnection()

    def test_interactive_execute_on_repr(self):
        """repr() of a scalar expression triggers query execution."""
        table = self.con.table('functional_alltypes')
        expr = table.bigint_col.sum()
        with config.option_context('interactive', True):
            repr(expr)

        assert len(self.con.executed_queries) > 0

    def test_default_limit(self):
        """Interactive repr of a table appends the configured default LIMIT."""
        table = self.con.table('functional_alltypes')

        with config.option_context('interactive', True):
            repr(table)

        expected = """\
SELECT *
FROM functional_alltypes
LIMIT {0}""".format(config.options.sql.default_limit)

        assert self.con.executed_queries[0] == expected

    def test_disable_query_limit(self):
        """Setting sql.default_limit to None drops the LIMIT clause."""
        table = self.con.table('functional_alltypes')

        with config.option_context('interactive', True):
            with config.option_context('sql.default_limit', None):
                repr(table)

        expected = """\
SELECT *
FROM functional_alltypes"""

        assert self.con.executed_queries[0] == expected

    def test_interactive_non_compilable_repr_not_fail(self):
        """repr() of a non-compilable expression must not raise."""
        # #170
        table = self.con.table('functional_alltypes')

        expr = table.string_col.topk(3)

        # it works!
        with config.option_context('interactive', True):
            repr(expr)

    def test_histogram_repr_no_query_execute(self):
        """_repr() of a histogram aggregation must not execute any query."""
        t = self.con.table('functional_alltypes')
        tier = t.double_col.histogram(10).name('bucket')
        expr = t.group_by(tier).size()
        with config.option_context('interactive', True):
            expr._repr()
        assert self.con.executed_queries == []
Example #7
0
class TestInteractiveUse(unittest.TestCase):
    """Tests for repr() query execution when ibis runs in interactive mode."""

    def setUp(self):
        # The mock connection captures compiled SQL in `executed_queries`.
        self.con = MockConnection()

    def test_interactive_execute_on_repr(self):
        """A scalar reduction is executed when repr'd interactively."""
        table = self.con.table('functional_alltypes')
        expr = table.bigint_col.sum()
        with config.option_context('interactive', True):
            repr(expr)

        assert len(self.con.executed_queries) > 0

    def test_default_limit(self):
        """Table repr uses the sql.default_limit option as a LIMIT."""
        table = self.con.table('functional_alltypes')

        with config.option_context('interactive', True):
            repr(table)

        expected = """\
SELECT *
FROM functional_alltypes
LIMIT {0}""".format(config.options.sql.default_limit)

        assert self.con.executed_queries[0] == expected

    def test_disable_query_limit(self):
        """A None default limit produces an un-LIMITed query."""
        table = self.con.table('functional_alltypes')

        with config.option_context('interactive', True):
            with config.option_context('sql.default_limit', None):
                repr(table)

        expected = """\
SELECT *
FROM functional_alltypes"""

        assert self.con.executed_queries[0] == expected

    def test_interactive_non_compilable_repr_not_fail(self):
        """repr() of a non-compilable expression should not raise."""
        # #170
        table = self.con.table('functional_alltypes')

        expr = table.string_col.topk(3)

        # it works!
        with config.option_context('interactive', True):
            repr(expr)

    def test_histogram_repr_no_query_execute(self):
        """_repr() of a histogram-bucketed aggregation runs no queries."""
        t = self.con.table('functional_alltypes')
        tier = t.double_col.histogram(10).name('bucket')
        expr = t.group_by(tier).size()
        with config.option_context('interactive', True):
            expr._repr()
        assert self.con.executed_queries == []
Example #8
0
class TestDistinct(unittest.TestCase):
    """SQL generation for DISTINCT projections and COUNT(DISTINCT ...)."""

    def setUp(self):
        self.con = MockConnection()

    def test_simple_table_distinct(self):
        """Projection + distinct() emits SELECT DISTINCT over the columns."""
        t = self.con.table('functional_alltypes')

        expr = t[t.string_col, t.int_col].distinct()

        result = to_sql(expr)
        expected = """SELECT DISTINCT `string_col`, `int_col`
FROM functional_alltypes"""
        assert result == expected

    def test_array_distinct(self):
        """Column-level distinct() emits a single-column SELECT DISTINCT."""
        t = self.con.table('functional_alltypes')
        expr = t.string_col.distinct()

        result = to_sql(expr)
        expected = """SELECT DISTINCT `string_col`
FROM functional_alltypes"""
        assert result == expected

    def test_count_distinct(self):
        """nunique() compiles to COUNT(DISTINCT col) inside a grouped query."""
        t = self.con.table('functional_alltypes')

        metric = t.int_col.nunique().name('nunique')
        expr = t[t.bigint_col > 0].group_by('string_col').aggregate([metric])

        result = to_sql(expr)
        expected = """SELECT `string_col`, COUNT(DISTINCT `int_col`) AS `nunique`
FROM functional_alltypes
WHERE `bigint_col` > 0
GROUP BY 1"""
        assert result == expected

    def test_multiple_count_distinct(self):
        """Multiple COUNT(DISTINCT) metrics are emitted in one query."""
        # Impala and some other databases will not execute multiple
        # count-distincts in a single aggregation query. This error reporting
        # will be left to the database itself, for now.
        t = self.con.table('functional_alltypes')
        metrics = [
            t.int_col.nunique().name('int_card'),
            t.smallint_col.nunique().name('smallint_card')
        ]

        expr = t.group_by('string_col').aggregate(metrics)

        result = to_sql(expr)
        expected = """SELECT `string_col`, COUNT(DISTINCT `int_col`) AS `int_card`,
       COUNT(DISTINCT `smallint_col`) AS `smallint_card`
FROM functional_alltypes
GROUP BY 1"""
        assert result == expected
Example #9
0
    def setUp(self):
        """Build two projections over functional_alltypes and their UNION ALL."""
        self.con = MockConnection()
        table = self.con.table('functional_alltypes')

        # Rows with positive int_col; float_col is widened to double so the
        # two branches have matching 'value' types.
        self.t1 = table[table.int_col > 0][
            table.string_col.name('key'),
            table.float_col.cast('double').name('value'),
        ]
        # The remaining rows; double_col already has the right type.
        self.t2 = table[table.int_col <= 0][
            table.string_col.name('key'),
            table.double_col.name('value'),
        ]

        self.union1 = self.t1.union(self.t2)
Example #10
0
class TestDistinct(unittest.TestCase):
    """Compilation tests for DISTINCT and COUNT(DISTINCT ...) expressions."""

    def setUp(self):
        self.con = MockConnection()

    def test_simple_table_distinct(self):
        """A projected table's distinct() becomes SELECT DISTINCT."""
        t = self.con.table('functional_alltypes')

        expr = t[t.string_col, t.int_col].distinct()

        result = to_sql(expr)
        expected = """SELECT DISTINCT `string_col`, `int_col`
FROM functional_alltypes"""
        assert result == expected

    def test_array_distinct(self):
        """A single column's distinct() becomes SELECT DISTINCT of that column."""
        t = self.con.table('functional_alltypes')
        expr = t.string_col.distinct()

        result = to_sql(expr)
        expected = """SELECT DISTINCT `string_col`
FROM functional_alltypes"""
        assert result == expected

    def test_count_distinct(self):
        """nunique() with a filter and group-by compiles as expected."""
        t = self.con.table('functional_alltypes')

        metric = t.int_col.nunique().name('nunique')
        expr = t[t.bigint_col > 0].group_by('string_col').aggregate([metric])

        result = to_sql(expr)
        expected = """SELECT `string_col`, COUNT(DISTINCT `int_col`) AS `nunique`
FROM functional_alltypes
WHERE `bigint_col` > 0
GROUP BY 1"""
        assert result == expected

    def test_multiple_count_distinct(self):
        """Two nunique() metrics compile into one aggregation query."""
        # Impala and some other databases will not execute multiple
        # count-distincts in a single aggregation query. This error reporting
        # will be left to the database itself, for now.
        t = self.con.table('functional_alltypes')
        metrics = [t.int_col.nunique().name('int_card'),
                   t.smallint_col.nunique().name('smallint_card')]

        expr = t.group_by('string_col').aggregate(metrics)

        result = to_sql(expr)
        expected = """SELECT `string_col`, COUNT(DISTINCT `int_col`) AS `int_card`,
       COUNT(DISTINCT `smallint_col`) AS `smallint_card`
FROM functional_alltypes
GROUP BY 1"""
        assert result == expected
Example #11
0
class TestInNotIn(unittest.TestCase, ExprSQLTest):
    """IN / NOT IN translation; this dialect emits unquoted column names."""

    def setUp(self):
        self.con = MockConnection()
        self.table = self.con.table('alltypes')

    def test_field_in_literals(self):
        """Column isin/notin a list of string literals."""
        cases = [(self.table.g.isin(["foo", "bar",
                                     "baz"]), "g IN ('foo', 'bar', 'baz')"),
                 (self.table.g.notin(["foo", "bar", "baz"]),
                  "g NOT IN ('foo', 'bar', 'baz')")]
        self._check_expr_cases(cases)

    def test_literal_in_list(self):
        """Literal isin/notin a list of columns."""
        cases = [
            (ibis.literal(2).isin([self.table.a, self.table.b,
                                   self.table.c]), '2 IN (a, b, c)'),
            (ibis.literal(2).notin([self.table.a, self.table.b,
                                    self.table.c]), '2 NOT IN (a, b, c)')
        ]
        self._check_expr_cases(cases)

    def test_isin_notin_in_select(self):
        """isin/notin used as a filter predicate compiles to a WHERE clause."""
        filtered = self.table[self.table.g.isin(["foo", "bar"])]
        result = to_sql(filtered)
        expected = """SELECT *
FROM alltypes
WHERE g IN ('foo', 'bar')"""
        assert result == expected

        filtered = self.table[self.table.g.notin(["foo", "bar"])]
        result = to_sql(filtered)
        expected = """SELECT *
FROM alltypes
WHERE g NOT IN ('foo', 'bar')"""
        assert result == expected
Example #12
0
class TestInsert(unittest.TestCase):
    """DDL generation for INSERT INTO / INSERT OVERWRITE ... SELECT."""

    def setUp(self):
        self.con = MockConnection()
        self.t = self.con.table('functional_alltypes')

    def test_select_basics(self):
        """InsertSelect emits INSERT INTO, or INSERT OVERWRITE when requested."""
        name = 'testing123456'

        expr = self.t.limit(10)
        select, _ = _get_select(expr)

        stmt = ddl.InsertSelect(name, select, database='foo')
        result = stmt.compile()

        expected = """\
INSERT INTO foo.`testing123456`
SELECT *
FROM functional_alltypes
LIMIT 10"""
        assert result == expected

        # overwrite=True switches the verb to INSERT OVERWRITE.
        stmt = ddl.InsertSelect(name, select, database='foo', overwrite=True)
        result = stmt.compile()

        expected = """\
INSERT OVERWRITE foo.`testing123456`
SELECT *
FROM functional_alltypes
LIMIT 10"""
        assert result == expected

    def test_select_overwrite(self):
        # TODO: placeholder, not yet implemented
        pass
Example #13
0
    def setUp(self):
        """Alias one column of each primitive type for use in the tests."""
        self.con = MockConnection()
        self.table = self.con.table('functional_alltypes')

        t = self.table
        # Integer widths, then floating point, string, boolean, timestamp.
        self.i8, self.i16, self.i32, self.i64 = (
            t.tinyint_col, t.smallint_col, t.int_col, t.bigint_col)
        self.d, self.f = t.double_col, t.float_col
        self.s, self.b, self.t = t.string_col, t.bool_col, t.timestamp_col
        # The decimal column comes from a different mock table.
        self.dec = self.con.table('tpch_customer').c_acctbal

        self.all_cols = [self.i8, self.i16, self.i32, self.i64, self.d,
                         self.f, self.dec, self.s, self.b, self.t]
Example #14
0
class TestInNotIn(unittest.TestCase, ExprSQLTest):
    """IN / NOT IN translation; this dialect backtick-quotes column names."""

    def setUp(self):
        self.con = MockConnection()
        self.table = self.con.table("alltypes")

    def test_field_in_literals(self):
        """Column isin/notin a list of string literals."""
        cases = [
            (self.table.g.isin(["foo", "bar", "baz"]), "`g` IN ('foo', 'bar', 'baz')"),
            (self.table.g.notin(["foo", "bar", "baz"]), "`g` NOT IN ('foo', 'bar', 'baz')"),
        ]
        self._check_expr_cases(cases)

    def test_literal_in_list(self):
        """Literal isin/notin a list of columns."""
        cases = [
            (L(2).isin([self.table.a, self.table.b, self.table.c]), "2 IN (`a`, `b`, `c`)"),
            (L(2).notin([self.table.a, self.table.b, self.table.c]), "2 NOT IN (`a`, `b`, `c`)"),
        ]
        self._check_expr_cases(cases)

    def test_isin_notin_in_select(self):
        """isin/notin used as a filter compiles to a WHERE clause."""
        filtered = self.table[self.table.g.isin(["foo", "bar"])]
        result = to_sql(filtered)
        expected = """SELECT *
FROM alltypes
WHERE `g` IN ('foo', 'bar')"""
        assert result == expected

        filtered = self.table[self.table.g.notin(["foo", "bar"])]
        result = to_sql(filtered)
        expected = """SELECT *
FROM alltypes
WHERE `g` NOT IN ('foo', 'bar')"""
        assert result == expected
Example #15
0
    def setUp(self):
        """Group alltypes column names by their value type."""
        self.con = MockConnection()
        self.table = self.con.table("alltypes")

        self.float_cols = ["e", "f"]
        self.bool_cols = ["h"]
        self.int_cols = ["a", "b", "c", "d"]
Example #16
0
class TestUnions(unittest.TestCase):
    """SQL generation for UNION ALL / UNION and unions used as subqueries."""

    def setUp(self):
        self.con = MockConnection()

        table = self.con.table('functional_alltypes')

        # Two projections with matching (key, value) schemas: float_col is
        # cast to double so the union branch types line up.
        self.t1 = (table[table.int_col > 0][
            table.string_col.name('key'),
            table.float_col.cast('double').name('value')])
        self.t2 = (table[table.int_col <= 0][table.string_col.name('key'),
                                             table.double_col.name('value')])

        self.union1 = self.t1.union(self.t2)

    def test_union(self):
        """Default union compiles to UNION ALL."""
        result = to_sql(self.union1)
        expected = """\
SELECT `string_col` AS `key`, CAST(`float_col` AS double) AS `value`
FROM functional_alltypes
WHERE `int_col` > 0
UNION ALL
SELECT `string_col` AS `key`, `double_col` AS `value`
FROM functional_alltypes
WHERE `int_col` <= 0"""
        assert result == expected

    def test_union_distinct(self):
        """distinct=True compiles to plain UNION."""
        union = self.t1.union(self.t2, distinct=True)
        result = to_sql(union)
        expected = """\
SELECT `string_col` AS `key`, CAST(`float_col` AS double) AS `value`
FROM functional_alltypes
WHERE `int_col` > 0
UNION
SELECT `string_col` AS `key`, `double_col` AS `value`
FROM functional_alltypes
WHERE `int_col` <= 0"""
        assert result == expected

    def test_union_project_column(self):
        """Projecting a column out of a union wraps the union in a subquery."""
        # select a column, get a subquery
        expr = self.union1[[self.union1.key]]
        result = to_sql(expr)
        expected = """SELECT `key`
FROM (
  SELECT `string_col` AS `key`, CAST(`float_col` AS double) AS `value`
  FROM functional_alltypes
  WHERE `int_col` > 0
  UNION ALL
  SELECT `string_col` AS `key`, `double_col` AS `value`
  FROM functional_alltypes
  WHERE `int_col` <= 0
) t0"""
        assert result == expected

    def test_union_extract_with_block(self):
        # TODO: placeholder, not yet implemented
        pass

    def test_union_in_subquery(self):
        # TODO: placeholder, not yet implemented
        pass
Example #17
0
class TestInsert(unittest.TestCase):
    """Compilation of InsertSelect DDL statements."""

    def setUp(self):
        self.con = MockConnection()
        self.t = self.con.table('functional_alltypes')

    def test_select_basics(self):
        """INSERT INTO by default; INSERT OVERWRITE when overwrite=True."""
        name = 'testing123456'

        expr = self.t.limit(10)
        select, _ = _get_select(expr)

        stmt = ddl.InsertSelect(name, select, database='foo')
        result = stmt.compile()

        expected = """\
INSERT INTO foo.`testing123456`
SELECT *
FROM functional_alltypes
LIMIT 10"""
        assert result == expected

        # Same select, overwrite variant.
        stmt = ddl.InsertSelect(name, select, database='foo', overwrite=True)
        result = stmt.compile()

        expected = """\
INSERT OVERWRITE foo.`testing123456`
SELECT *
FROM functional_alltypes
LIMIT 10"""
        assert result == expected

    def test_select_overwrite(self):
        # TODO: placeholder, not yet implemented
        pass
Example #18
0
class TestAnalytics(unittest.TestCase):
    """Expression-level tests for bucket, histogram, and topk analytics."""

    def setUp(self):
        self.con = MockConnection()
        self.alltypes = self.con.table('functional_alltypes')

    def test_category_project(self):
        """A bucket expression projected into a table keeps its category type."""
        t = self.alltypes

        tier = t.double_col.bucket([0, 50, 100]).name('tier')
        expr = t[tier, t]

        assert isinstance(expr.tier, ir.CategoryArray)

    def test_bucket(self):
        """Bucket count grows with include_over / include_under flags."""
        d = self.alltypes.double_col
        bins = [0, 10, 50, 100]

        # Three edges pairs -> three buckets by default.
        expr = d.bucket(bins)
        assert isinstance(expr, ir.CategoryArray)
        assert expr.op().nbuckets == 3

        expr = d.bucket(bins, include_over=True)
        assert expr.op().nbuckets == 4

        expr = d.bucket(bins, include_over=True, include_under=True)
        assert expr.op().nbuckets == 5

    def test_bucket_error_cases(self):
        """Invalid bucket specs raise ValueError."""
        d = self.alltypes.double_col

        self.assertRaises(ValueError, d.bucket, [])
        self.assertRaises(ValueError, d.bucket, [1, 2], closed='foo')

        # it works!
        d.bucket([10], include_under=True, include_over=True)

        # A single edge needs both open-ended buckets to be valid.
        self.assertRaises(ValueError, d.bucket, [10])
        self.assertRaises(ValueError, d.bucket, [10], include_under=True)
        self.assertRaises(ValueError, d.bucket, [10], include_over=True)

    def test_histogram(self):
        """Histogram requires exactly one of nbins/binwidth, and a valid closed."""
        d = self.alltypes.double_col

        self.assertRaises(ValueError, d.histogram, nbins=10, binwidth=5)
        self.assertRaises(ValueError, d.histogram)
        self.assertRaises(ValueError, d.histogram, 10, closed='foo')

    def test_topk_analysis_bug(self):
        """topk predicate must survive filter analysis unmodified."""
        # GH #398
        airlines = ibis.table([('dest', 'string'), ('origin', 'string'),
                               ('arrdelay', 'int32')], 'airlines')

        dests = ['ORD', 'JFK', 'SFO']
        t = airlines[airlines.dest.isin(dests)]
        delay_filter = t.dest.topk(10, by=t.arrdelay.mean())
        filtered = t.filter([delay_filter])

        # predicate is unmodified by analysis
        post_pred = filtered.op().predicates[1]
        assert delay_filter.equals(post_pred)
Example #19
0
class TestAnalyticFunctions(unittest.TestCase, ExprSQLTest):
    """Translation of analytic (window) functions to SQL."""

    def setUp(self):
        self.con = MockConnection()
        self.table = self.con.table('functional_alltypes')

    def test_analytic_exprs(self):
        """row_number, lag/lead (with offset/default), first/last values."""
        t = self.table

        w = ibis.window(order_by=t.float_col)

        cases = [
            # ibis row_number() is 0-based, hence the parenthesized "- 1".
            (ibis.row_number().over(w),
             '(row_number() OVER (ORDER BY `float_col`) - 1)'),
            (t.string_col.lag(), 'lag(`string_col`)'),
            (t.string_col.lag(2), 'lag(`string_col`, 2)'),
            (t.string_col.lag(default=0), 'lag(`string_col`, 1, 0)'),
            (t.string_col.lead(), 'lead(`string_col`)'),
            (t.string_col.lead(2), 'lead(`string_col`, 2)'),
            (t.string_col.lead(default=0), 'lead(`string_col`, 1, 0)'),
            (t.double_col.first(), 'first_value(`double_col`)'),
            (t.double_col.last(), 'last_value(`double_col`)'),
            # (t.double_col.nth(4), 'first_value(lag(double_col, 4 - 1))')
        ]
        self._check_expr_cases(cases)
Example #20
0
class TestAnalyticFunctions(unittest.TestCase, ExprSQLTest):
    """Translation of analytic (window) functions to SQL strings."""

    def setUp(self):
        self.con = MockConnection()
        self.table = self.con.table('functional_alltypes')

    def test_analytic_exprs(self):
        """row_number, lag/lead variants, first_value and last_value."""
        t = self.table

        w = ibis.window(order_by=t.float_col)

        cases = [
            # ibis row_number() is 0-based; "- 1" adjusts from SQL's 1-based.
            (ibis.row_number().over(w),
             'row_number() OVER (ORDER BY `float_col`) - 1'),
            (t.string_col.lag(), 'lag(`string_col`)'),
            (t.string_col.lag(2), 'lag(`string_col`, 2)'),
            (t.string_col.lag(default=0), 'lag(`string_col`, 1, 0)'),
            (t.string_col.lead(), 'lead(`string_col`)'),
            (t.string_col.lead(2), 'lead(`string_col`, 2)'),
            (t.string_col.lead(default=0), 'lead(`string_col`, 1, 0)'),
            (t.double_col.first(), 'first_value(`double_col`)'),
            (t.double_col.last(), 'last_value(`double_col`)'),
            # (t.double_col.nth(4), 'first_value(lag(double_col, 4 - 1))')
        ]
        self._check_expr_cases(cases)
Example #21
0
    def setUp(self):
        """Create two small tables sharing a (key1, key2) composite key."""
        self.con = MockConnection()

        self.t1 = ibis.table(
            [('key1', 'string'), ('key2', 'string'), ('value1', 'double')],
            'foo',
        )
        self.t2 = ibis.table([('key1', 'string'), ('key2', 'string')], 'bar')
Example #22
0
class TestCoalesceGreaterLeast(unittest.TestCase, ExprSQLTest):
    """Translation of the variadic coalesce / greatest / least functions."""

    def setUp(self):
        self.con = MockConnection()
        self.table = self.con.table('functional_alltypes')

    def test_coalesce(self):
        """coalesce with a literal and with two columns."""
        t = self.table
        cases = [
            (ibis.coalesce(t.string_col, 'foo'),
             "coalesce(`string_col`, 'foo')"),
            (ibis.coalesce(t.int_col, t.bigint_col),
             'coalesce(`int_col`, `bigint_col`)'),
        ]
        self._check_expr_cases(cases)

    def test_greatest(self):
        """greatest with a literal and with two columns."""
        t = self.table
        cases = [
            (ibis.greatest(t.string_col, 'foo'),
             "greatest(`string_col`, 'foo')"),
            (ibis.greatest(t.int_col, t.bigint_col),
             'greatest(`int_col`, `bigint_col`)'),
        ]
        self._check_expr_cases(cases)

    def test_least(self):
        """least with a literal and with two columns."""
        t = self.table
        cases = [
            (ibis.least(t.string_col, 'foo'),
             "least(`string_col`, 'foo')"),
            (ibis.least(t.int_col, t.bigint_col),
             'least(`int_col`, `bigint_col`)'),
        ]
        self._check_expr_cases(cases)
Example #23
0
    def setUp(self):
        """Partition the alltypes column names by type for the tests below."""
        self.con = MockConnection()
        self.table = self.con.table('alltypes')

        self.int_cols = list('abcd')
        self.bool_cols = list('h')
        self.float_cols = list('ef')
Example #24
0
class TestCoalesceGreaterLeast(unittest.TestCase, ExprSQLTest):
    """SQL output for coalesce, greatest, and least expressions."""

    def setUp(self):
        self.con = MockConnection()
        self.table = self.con.table('functional_alltypes')

    def test_coalesce(self):
        """coalesce: string-literal fallback and two-column forms."""
        t = self.table
        cases = [
            (ibis.coalesce(t.string_col, 'foo'),
             "coalesce(`string_col`, 'foo')"),
            (ibis.coalesce(t.int_col, t.bigint_col),
             'coalesce(`int_col`, `bigint_col`)'),
        ]
        self._check_expr_cases(cases)

    def test_greatest(self):
        """greatest: string-literal and two-column forms."""
        t = self.table
        cases = [
            (ibis.greatest(t.string_col, 'foo'),
             "greatest(`string_col`, 'foo')"),
            (ibis.greatest(t.int_col, t.bigint_col),
             'greatest(`int_col`, `bigint_col`)'),
        ]
        self._check_expr_cases(cases)

    def test_least(self):
        """least: string-literal and two-column forms."""
        t = self.table
        cases = [
            (ibis.least(t.string_col, 'foo'),
             "least(`string_col`, 'foo')"),
            (ibis.least(t.int_col, t.bigint_col),
             'least(`int_col`, `bigint_col`)'),
        ]
        self._check_expr_cases(cases)
Example #25
0
    def test_memoize_database_table(self):
        """Repr formatting memoizes database tables: each prints exactly once."""
        con = MockConnection()
        t1 = con.table('test1')
        t2 = con.table('test2')

        filtered = t1[t1['f'] > 0]
        joined = t2.inner_join(filtered, [filtered['g'] == t2['key']])

        metrics = [
            (filtered['f'] - t2['value']).mean().name('foo'),
            filtered['f'].sum().name('bar'),
        ]
        result = joined.aggregate(metrics, by=[filtered['g'], t2['key']])

        formatted = repr(result)
        # Each base table name appears once despite multiple references.
        assert formatted.count('test1') == 1
        assert formatted.count('test2') == 1
Example #26
0
class TestDistinct(unittest.TestCase):
    """Expression-level semantics of distinct() and nunique()."""

    def setUp(self):
        self.con = MockConnection()
        self.table = self.con.table('functional_alltypes')

    def test_distinct_basic(self):
        """Table distinct() yields a Distinct table op; column distinct()
        yields a DistinctArray of the same element type."""
        expr = self.table.distinct()
        assert isinstance(expr.op(), ops.Distinct)
        assert isinstance(expr, ir.TableExpr)
        assert expr.op().table is self.table

        expr = self.table.string_col.distinct()
        assert isinstance(expr.op(), ops.DistinctArray)
        assert isinstance(expr, ir.StringArray)

    # def test_distinct_array_interactions(self):
    # TODO

    # array cardinalities / shapes are likely to be different.
    #     a = self.table.int_col.distinct()
    #     b = self.table.bigint_col

    #     self.assertRaises(ir.RelationError, a.__add__, b)

    def test_distinct_count(self):
        """distinct().count() is equivalent to nunique().name('count')."""
        result = self.table.string_col.distinct().count()
        expected = self.table.string_col.nunique().name('count')
        assert_equal(result, expected)
        assert isinstance(result.op(), ops.CountDistinct)

    def test_distinct_unnamed_array_expr(self):
        """distinct() on an unnamed derived expression can be repr'd."""
        table = ibis.table([('year', 'int32'),
                            ('month', 'int32'),
                            ('day', 'int32')], 'foo')

        # it works!
        expr = (ibis.literal('-')
                .join([table.year.cast('string'),
                       table.month.cast('string'),
                       table.day.cast('string')])
                .distinct())
        repr(expr)

    def test_distinct_count_numeric_types(self):
        """distinct().count() on a numeric column works inside group_by."""
        table = self.table
        metric = (table.bigint_col.distinct().count()
                  .name('unique_bigints'))

        table.group_by('string_col').aggregate(metric)

    def test_nunique(self):
        """nunique() builds a CountDistinct op."""
        expr = self.table.string_col.nunique()
        assert isinstance(expr.op(), ops.CountDistinct)

    def test_project_with_distinct(self):
        # TODO: placeholder, not yet implemented
        pass
Example #27
0
    def test_memoize_database_table(self):
        """Each database table should appear only once in the formatted repr."""
        connection = MockConnection()
        left = connection.table('test1')
        right = connection.table('test2')

        subset = left[left['f'] > 0]
        predicate = subset['g'] == right['key']
        joined = right.inner_join(subset, [predicate])

        foo = (subset['f'] - right['value']).mean().name('foo')
        bar = subset['f'].sum().name('bar')
        result = joined.aggregate([foo, bar], by=[subset['g'], right['key']])

        formatted = repr(result)
        # Memoization collapses repeated references to the same base table.
        assert formatted.count('test1') == 1
        assert formatted.count('test2') == 1
Example #28
0
    def setUp(self):
        """Create the two union branches and their UNION ALL expression."""
        self.con = MockConnection()
        table = self.con.table('functional_alltypes')

        key = table.string_col.name('key')
        # Cast float_col so both branches expose a double 'value' column.
        self.t1 = table[table.int_col > 0][
            key, table.float_col.cast('double').name('value')]
        self.t2 = table[table.int_col <= 0][
            key, table.double_col.name('value')]

        self.union1 = self.t1.union(self.t2)
Example #29
0
 def setUp(self):
     """Define an eight-column schema, its dict form, a table, and a connection."""
     pairs = [('a', 'int8'), ('b', 'int16'), ('c', 'int32'), ('d', 'int64'),
              ('e', 'float'), ('f', 'double'), ('g', 'string'),
              ('h', 'boolean')]
     self.schema = pairs
     self.schema_dict = dict(pairs)
     self.table = ibis.table(pairs)
     self.con = MockConnection()
Example #30
0
class TestInNotIn(unittest.TestCase, ExprSQLTest):
    """IN / NOT IN translation where the value set is rendered as a tuple."""

    def setUp(self):
        self.con = MockConnection()
        self.table = self.con.table('alltypes')

    def test_field_in_literals(self):
        """Column isin/notin a set of literals, formatted as a tuple."""
        values = ['foo', 'bar', 'baz']
        # NOTE(review): tuple(set(...)) order depends on string hashing —
        # presumably the compiler formats the set the same way, so the two
        # sides agree within one process. Confirm this is intentional.
        values_formatted = tuple(set(values))
        cases = [
            (self.table.g.isin(values),
             "`g` IN {}".format(values_formatted)),
            (self.table.g.notin(values),
             "`g` NOT IN {}".format(values_formatted))
        ]
        self._check_expr_cases(cases)

    def test_literal_in_list(self):
        """Literal isin/notin a list of columns."""
        cases = [
            (L(2).isin([self.table.a, self.table.b, self.table.c]),
             '2 IN (`a`, `b`, `c`)'),
            (L(2).notin([self.table.a, self.table.b, self.table.c]),
             '2 NOT IN (`a`, `b`, `c`)')
        ]
        self._check_expr_cases(cases)

    def test_isin_notin_in_select(self):
        """isin/notin as a filter predicate compiles to a WHERE clause."""
        values = ['foo', 'bar']
        # NOTE(review): same set-ordering caveat as test_field_in_literals.
        values_formatted = tuple(set(values))

        filtered = self.table[self.table.g.isin(values)]
        result = to_sql(filtered)
        expected = """SELECT *
FROM alltypes
WHERE `g` IN {}"""
        assert result == expected.format(values_formatted)

        filtered = self.table[self.table.g.notin(values)]
        result = to_sql(filtered)
        expected = """SELECT *
FROM alltypes
WHERE `g` NOT IN {}"""
        assert result == expected.format(values_formatted)
Example #31
0
class TestCaseExprs(unittest.TestCase, ExprSQLTest, ExprTestCases):
    """Translation of CASE expressions, if/ifelse, and null-handling functions."""

    def setUp(self):
        self.con = MockConnection()
        self.table = self.con.table('alltypes')

    def test_isnull_1_0(self):
        """isnull().ifelse(1, 0) compiles to a CASE WHEN ... IS NULL form."""
        expr = self.table.g.isnull().ifelse(1, 0)

        result = self._translate(expr)
        expected = 'CASE WHEN `g` IS NULL THEN 1 ELSE 0 END'
        assert result == expected

        # inside some other function
        result = self._translate(expr.sum())
        expected = 'sum(CASE WHEN `g` IS NULL THEN 1 ELSE 0 END)'
        assert result == expected

    def test_simple_case(self):
        """Simple (value-matching) CASE with an explicit default."""
        expr = self._case_simple_case()
        result = self._translate(expr)
        expected = """CASE `g`
  WHEN 'foo' THEN 'bar'
  WHEN 'baz' THEN 'qux'
  ELSE 'default'
END"""
        assert result == expected

    def test_search_case(self):
        """Searched CASE (boolean conditions) with NULL default."""
        expr = self._case_search_case()
        result = self._translate(expr)
        expected = """CASE
  WHEN `f` > 0 THEN `d` * 2
  WHEN `c` < 0 THEN `a` * 2
  ELSE NULL
END"""
        assert result == expected

    def test_where_use_if(self):
        """ibis.where on compatible branches translates to if(...)."""
        expr = ibis.where(self.table.f > 0, self.table.e, self.table.a)
        assert isinstance(expr, ir.FloatValue)

        result = self._translate(expr)
        expected = "if(`f` > 0, `e`, `a`)"
        assert result == expected

    def test_nullif_ifnull(self):
        """nullif and fillna on a decimal column; fillna casts the fill value."""
        table = self.con.table('tpch_lineitem')

        f = table.l_quantity

        cases = [
            (f.nullif(f == 0),
             'nullif(`l_quantity`, `l_quantity` = 0)'),
            (f.fillna(0),
             'isnull(`l_quantity`, CAST(0 AS decimal(12,2)))'),
        ]
        self._check_expr_cases(cases)

    def test_decimal_fillna_cast_arg(self):
        """Integer fill values are cast to the decimal type; floats are not."""
        table = self.con.table('tpch_lineitem')
        f = table.l_extendedprice

        cases = [
            (f.fillna(0),
             'isnull(`l_extendedprice`, CAST(0 AS decimal(12,2)))'),
            (f.fillna(0.0), 'isnull(`l_extendedprice`, 0.0)'),
        ]
        self._check_expr_cases(cases)
Example #32
0
class TestUnaryBuiltins(unittest.TestCase, ExprSQLTest):
    """SQL translation of unary and aggregate builtin functions."""

    def setUp(self):
        """Bind a mock connection and the functional_alltypes table."""
        con = MockConnection()
        self.con = con
        self.table = con.table('functional_alltypes')

    def test_numeric_unary_builtins(self):
        """Zero-argument math builtins translate to same-named SQL calls.

        Entries are either a name shared by ibis and the backend, or an
        (ibis_name, sql_name) pair when the two differ.
        """
        functions = ['abs', 'ceil', 'floor', 'exp', 'sqrt', 'sign',
                     ('log', 'ln'),
                     ('approx_median', 'appx_median'),
                     ('approx_nunique', 'ndv'),
                     'ln', 'log2', 'log10', 'nullifzero', 'zeroifnull']

        cases = []
        for entry in functions:
            if isinstance(entry, tuple):
                ibis_name, sql_name = entry
            else:
                ibis_name = sql_name = entry

            for colname in ('double_col', 'int_col'):
                expr = getattr(self.table[colname], ibis_name)()
                cases.append((expr, '{0}(`{1}`)'.format(sql_name, colname)))

        self._check_expr_cases(cases)

    def test_log_other_bases(self):
        """log(base) translates to the two-argument log()."""
        self._check_expr_cases(
            [(self.table.double_col.log(5), 'log(`double_col`, 5)')])

    def test_round(self):
        """round() with no digits, literal digits, and column digits."""
        d = self.table.double_col
        cases = [
            (d.round(), 'round(`double_col`)'),
            (d.round(0), 'round(`double_col`, 0)'),
            (d.round(2), 'round(`double_col`, 2)'),
            (d.round(self.table.tinyint_col),
             'round(`double_col`, `tinyint_col`)'),
        ]
        self._check_expr_cases(cases)

    def test_hash(self):
        """hash() produces int64 results and compiles to fnv_hash()."""
        col = self.table.int_col

        assert isinstance(col.hash(), ir.Int64Array)
        assert isinstance(col.sum().hash(), ir.Int64Scalar)

        self._check_expr_cases([(col.hash(), 'fnv_hash(`int_col`)')])

    def test_reduction_where(self):
        """Reductions with where= compile to CASE-filtered aggregate calls."""
        cond = self.table.bigint_col < 70
        c = self.table.double_col
        template = ('{0}(CASE WHEN `bigint_col` < 70 THEN `double_col` '
                    'ELSE NULL END)')

        reductions = [
            (c.sum(where=cond), 'sum'),
            (c.count(where=cond), 'count'),
            (c.mean(where=cond), 'avg'),
            (c.max(where=cond), 'max'),
            (c.min(where=cond), 'min'),
            (c.std(where=cond), 'stddev'),
            (c.std(where=cond, how='pop'), 'stddev_pop'),
            (c.var(where=cond), 'variance'),
            (c.var(where=cond, how='pop'), 'variance_pop'),
        ]
        self._check_expr_cases(
            [(expr, template.format(sql_name))
             for expr, sql_name in reductions])

    def test_reduction_invalid_where(self):
        """A non-boolean where= argument raises TypeError."""
        condbad_literal = L('T')
        c = self.table.double_col
        for reduction in (c.sum, c.count, c.mean, c.max, c.min):
            with self.assertRaises(TypeError):
                reduction(where=condbad_literal)
Example #33
0
 def setUp(self):
     """Bind a mock connection, the alltypes table, and its `i` column."""
     con = MockConnection()
     self.con = con
     self.alltypes = con.table('alltypes')
     self.col = self.alltypes.i
Example #34
0
def mockcon():
    """Construct and return a fresh MockConnection for tests."""
    con = MockConnection()
    return con
Example #35
0
 def setUp(self):
     """Create the mock backend connection used by the tests."""
     self.con = MockConnection()
Example #36
0
    def setUp(self):
        """Bind a mock connection and a filtered functional_alltypes expr."""
        con = MockConnection()
        self.con = con

        t = con.table('functional_alltypes')
        self.t = t
        self.expr = t[t.bigint_col > 0]
Example #37
0
class TestFixedOffsets(unittest.TestCase):
    """Unit conversion and arithmetic for fixed timedelta offsets (T.*)."""

    def setUp(self):
        self.con = MockConnection()
        self.table = self.con.table('alltypes')

    def test_upconvert(self):
        """Offsets convert to a coarser unit when evenly divisible."""
        cases = [
            (T.day(14), 'w', T.week(2)),
            (T.hour(72), 'd', T.day(3)),
            (T.minute(240), 'h', T.hour(4)),
            (T.second(360), 'm', T.minute(6)),
            (T.second(3 * 86400), 'd', T.day(3)),
            (T.millisecond(5000), 's', T.second(5)),
            (T.microsecond(5000000), 's', T.second(5)),
            (T.nanosecond(5000000000), 's', T.second(5)),
        ]

        for offset, unit, expected in cases:
            result = offset.to_unit(unit)
            assert result.equals(expected)

    def test_multiply(self):
        """Offsets support left and right multiplication by integers,
        including negative factors."""
        offset = T.day(2)

        assert (offset * 2).equals(T.day(4))
        assert (offset * (-2)).equals(T.day(-4))
        assert (3 * offset).equals(T.day(6))
        assert ((-3) * offset).equals(T.day(-6))

    def test_repr(self):
        """repr shows the magnitude and pluralizes the unit when != 1."""
        assert repr(T.day()) == '<Timedelta: 1 day>'
        assert repr(T.day(2)) == '<Timedelta: 2 days>'
        assert repr(T.year()) == '<Timedelta: 1 year>'
        assert repr(T.month(2)) == '<Timedelta: 2 months>'
        assert repr(T.second(40)) == '<Timedelta: 40 seconds>'

    def test_cannot_upconvert(self):
        """Converting to a coarser unit that does not divide evenly
        raises IbisError (all cases here are magnitude-1 offsets)."""
        cases = [
            (T.day(), 'w'),
            (T.hour(), 'd'),
            (T.minute(), 'h'),
            (T.second(), 'm'),
            (T.second(), 'd'),
            (T.millisecond(), 's'),
            (T.microsecond(), 's'),
            (T.nanosecond(), 's'),
        ]

        for delta, target in cases:
            self.assertRaises(IbisError, delta.to_unit, target)

    def test_downconvert_second_parts(self):
        """Conversions among second / milli / micro / nanosecond scale by
        powers of 1000."""
        K = 2

        sec = T.second(K)
        milli = T.millisecond(K)
        micro = T.microsecond(K)
        nano = T.nanosecond(K)

        cases = [(sec.to_unit('s'), T.second(K)),
                 (sec.to_unit('ms'), T.millisecond(K * 1000)),
                 (sec.to_unit('us'), T.microsecond(K * 1000000)),
                 (sec.to_unit('ns'), T.nanosecond(K * 1000000000)),
                 (milli.to_unit('ms'), T.millisecond(K)),
                 (milli.to_unit('us'), T.microsecond(K * 1000)),
                 (milli.to_unit('ns'), T.nanosecond(K * 1000000)),
                 (micro.to_unit('us'), T.microsecond(K)),
                 (micro.to_unit('ns'), T.nanosecond(K * 1000)),
                 (nano.to_unit('ns'), T.nanosecond(K))]
        self._check_cases(cases)

    def test_downconvert_hours(self):
        """Hours convert down to every finer unit."""
        K = 2
        offset = T.hour(K)

        cases = [(offset.to_unit('h'), T.hour(K)),
                 (offset.to_unit('m'), T.minute(K * 60)),
                 (offset.to_unit('s'), T.second(K * 3600)),
                 (offset.to_unit('ms'), T.millisecond(K * 3600000)),
                 (offset.to_unit('us'), T.microsecond(K * 3600000000)),
                 (offset.to_unit('ns'), T.nanosecond(K * 3600000000000))]
        self._check_cases(cases)

    def test_downconvert_day(self):
        """Weeks and days convert down to finer units."""
        K = 2

        week = T.week(K)
        day = T.day(K)

        cases = [(week.to_unit('d'), T.day(K * 7)),
                 (week.to_unit('h'), T.hour(K * 7 * 24)),
                 (day.to_unit('d'), T.day(K)),
                 (day.to_unit('h'), T.hour(K * 24)),
                 (day.to_unit('m'), T.minute(K * 1440)),
                 (day.to_unit('s'), T.second(K * 86400)),
                 (day.to_unit('ms'), T.millisecond(K * 86400000)),
                 (day.to_unit('us'), T.microsecond(K * 86400000000)),
                 (day.to_unit('ns'), T.nanosecond(K * 86400000000000))]
        self._check_cases(cases)

    def test_combine_with_different_kinds(self):
        """Adding offsets of different units produces the finest unit."""
        cases = [(T.day() + T.minute(), T.minute(1441)),
                 (T.second() + T.millisecond(10), T.millisecond(1010)),
                 (T.hour() + T.minute(5) + T.second(10), T.second(3910))]
        self._check_cases(cases)

    def test_timedelta_generic_api(self):
        """T.timedelta keyword arguments map onto the unit constructors."""
        cases = [
            (T.timedelta(weeks=2), T.week(2)),
            (T.timedelta(days=3), T.day(3)),
            (T.timedelta(hours=4), T.hour(4)),
            (T.timedelta(minutes=5), T.minute(5)),
            (T.timedelta(seconds=6), T.second(6)),
            (T.timedelta(milliseconds=7), T.millisecond(7)),
            (T.timedelta(microseconds=8), T.microsecond(8)),
            (T.timedelta(nanoseconds=9), T.nanosecond(9)),
        ]
        self._check_cases(cases)

    def _check_cases(self, cases):
        # Shared helper: each pair must compare equal via .equals().
        for x, y in cases:
            assert x.equals(y)

    def test_offset_timestamp_expr(self):
        """Adding an offset to a timestamp column (either side) yields a
        TimestampColumn backed by a TimestampDelta op."""
        c = self.table.i
        x = T.timedelta(days=1)

        expr = x + c
        assert isinstance(expr, ir.TimestampColumn)
        assert isinstance(expr.op(), ops.TimestampDelta)

        # test radd
        expr = c + x
        assert isinstance(expr, ir.TimestampColumn)
        assert isinstance(expr.op(), ops.TimestampDelta)
Example #38
0
class TestExprFormatting(unittest.TestCase):
    """Smoke and memoization tests for the expression repr formatter."""
    # Uncertain about how much we want to commit to unit tests around the
    # particulars of the output at the moment.

    def setUp(self):
        self.schema = [
            ('a', 'int8'),
            ('b', 'int16'),
            ('c', 'int32'),
            ('d', 'int64'),
            ('e', 'float'),
            ('f', 'double'),
            ('g', 'string'),
            ('h', 'boolean')
        ]
        self.schema_dict = dict(self.schema)
        self.table = ibis.table(self.schema)
        self.con = MockConnection()

    def test_format_table_column(self):
        """A column repr includes its array type."""
        # GH #507
        result = repr(self.table.f)
        assert 'Column[array(double)]' in result

    def test_format_projection(self):
        """Formatting a column of a projection does not raise."""
        # This should produce a ref to the projection
        proj = self.table[['c', 'a', 'f']]
        repr(proj['a'])

    def test_table_type_output(self):
        """A self-join repr shows both the unbound table and its
        self-reference."""
        foo = ibis.table(
            [
                ('job', 'string'),
                ('dept_id', 'string'),
                ('year', 'int32'),
                ('y', 'double')
            ], 'foo')

        expr = foo.dept_id == foo.view().dept_id
        result = repr(expr)
        assert 'SelfReference[table]' in result
        assert 'UnboundTable[table]' in result

    def test_memoize_aggregate_correctly(self):
        """Repeated references to the same table share one memoized alias."""
        table = self.table

        agg_expr = (table['c'].sum() / table['c'].mean() - 1).name('analysis')
        agg_exprs = [table['a'].sum().name('sum(a)'),
                     table['b'].mean().name('mean(b)'), agg_expr]

        result = table.aggregate(agg_exprs, by=['g'])

        formatter = ExprFormatter(result)
        formatted = formatter.get_result()

        alias = formatter.memo.get_alias(table.op())
        assert formatted.count(alias) == 7

    def test_aggregate_arg_names(self):
        """Aggregate repr labels the grouping keys and metrics."""
        # Not sure how to test this *well*

        t = self.table

        by_exprs = [t.g.name('key1'), t.f.round().name('key2')]
        agg_exprs = [t.c.sum().name('c'), t.d.mean().name('d')]

        expr = self.table.group_by(by_exprs).aggregate(agg_exprs)
        result = repr(expr)
        assert 'metrics' in result
        assert 'by' in result

    def test_format_multiple_join_with_projection(self):
        """Formatting a two-join star-schema projection does not raise."""
        # Star schema with fact table
        table = ibis.table([
            ('c', 'int32'),
            ('f', 'double'),
            ('foo_id', 'string'),
            ('bar_id', 'string'),
        ])

        table2 = ibis.table([
            ('foo_id', 'string'),
            ('value1', 'double')
        ])

        table3 = ibis.table([
            ('bar_id', 'string'),
            ('value2', 'double')
        ])

        filtered = table[table['f'] > 0]

        pred1 = table['foo_id'] == table2['foo_id']
        pred2 = filtered['bar_id'] == table3['bar_id']

        j1 = filtered.left_join(table2, [pred1])
        j2 = j1.inner_join(table3, [pred2])

        # Project out the desired fields
        view = j2[[table, table2['value1'], table3['value2']]]

        # it works!
        repr(view)

    def test_memoize_database_table(self):
        """Each database table name appears exactly once in the output,
        however many times the expression references it."""
        table = self.con.table('test1')
        table2 = self.con.table('test2')

        filter_pred = table['f'] > 0
        table3 = table[filter_pred]
        join_pred = table3['g'] == table2['key']

        joined = table2.inner_join(table3, [join_pred])

        met1 = (table3['f'] - table2['value']).mean().name('foo')
        result = joined.aggregate([met1, table3['f'].sum().name('bar')],
                                  by=[table3['g'], table2['key']])

        formatted = repr(result)
        assert formatted.count('test1') == 1
        assert formatted.count('test2') == 1

    def test_memoize_filtered_table(self):
        """A filter reused inside topk is printed only once."""
        airlines = ibis.table([('dest', 'string'),
                               ('origin', 'string'),
                               ('arrdelay', 'int32')], 'airlines')

        dests = ['ORD', 'JFK', 'SFO']
        t = airlines[airlines.dest.isin(dests)]
        delay_filter = t.dest.topk(10, by=t.arrdelay.mean())

        result = repr(delay_filter)
        assert result.count('Filter') == 1

    def test_memoize_insert_sort_key(self):
        """The source table is printed once even through mutate/sort/limit."""
        table = self.con.table('airlines')

        t = table['arrdelay', 'dest']
        expr = (t.group_by('dest')
                .mutate(dest_avg=t.arrdelay.mean(),
                        dev=t.arrdelay - t.arrdelay.mean()))

        worst = expr[expr.dev.notnull()].sort_by(ibis.desc('dev')).limit(10)

        result = repr(worst)
        assert result.count('airlines') == 1

    def test_named_value_expr_show_name(self):
        """A .name()-d expression shows its assigned name in the repr."""
        expr = self.table.f * 2
        expr2 = expr.name('baz')

        # it works!
        repr(expr)

        result2 = repr(expr2)

        # not really committing to a particular output yet
        assert 'baz' in result2

    def test_memoize_filtered_tables_in_join(self):
        """Two different filters over the same aggregate both appear."""
        # related: GH #667
        purchases = ibis.table([('region', 'string'),
                                ('kind', 'string'),
                                ('user', 'int64'),
                                ('amount', 'double')], 'purchases')

        metric = purchases.amount.sum().name('total')
        agged = (purchases.group_by(['region', 'kind'])
                 .aggregate(metric))

        left = agged[agged.kind == 'foo']
        right = agged[agged.kind == 'bar']

        cond = left.region == right.region
        joined = left.join(right, cond)

        result = repr(joined)
        assert result.count('Filter') == 2
Example #39
0
 def setUp(self):
     """Bind a mock connection and the functional_alltypes table."""
     con = MockConnection()
     self.con = con
     self.table = con.table('functional_alltypes')
Example #40
0
class TestTimestamp(unittest.TestCase):
    """Construction and typing of timestamp expressions and literals."""

    def setUp(self):
        con = MockConnection()
        self.con = con
        self.alltypes = con.table('alltypes')
        self.col = self.alltypes.i

    def test_field_select(self):
        """Column `i` of alltypes is timestamp-typed."""
        assert isinstance(self.col, ir.TimestampArray)

    def test_string_cast_to_timestamp(self):
        """String columns and string literals cast to timestamp."""
        col_cast = self.alltypes.g.cast('timestamp')
        assert isinstance(col_cast, ir.TimestampArray)

        scalar_cast = api.literal('2000-01-01').cast('timestamp')
        assert isinstance(scalar_cast, ir.TimestampScalar)

    def test_extract_fields(self):
        """Field extraction yields int32 arrays with the matching op node."""
        # type-size may be database specific
        cases = [
            ('year', ops.ExtractYear, ir.Int32Array),
            ('month', ops.ExtractMonth, ir.Int32Array),
            ('day', ops.ExtractDay, ir.Int32Array),
            ('hour', ops.ExtractHour, ir.Int32Array),
            ('minute', ops.ExtractMinute, ir.Int32Array),
            ('second', ops.ExtractSecond, ir.Int32Array),
        ]

        for field_name, expected_op, expected_type in cases:
            extracted = getattr(self.col, field_name)()
            assert isinstance(extracted, expected_type)
            assert isinstance(extracted.op(), expected_op)

    def test_extract_no_propagate_name(self):
        """Field extraction does not propagate the column name (see #146)."""
        table = self.con.table('functional_alltypes')
        expr = table.timestamp_col.hour()
        self.assertRaises(com.ExpressionError, expr.get_name)

    def test_now(self):
        """api.now() is a scalar timestamp backed by TimestampNow."""
        now_expr = api.now()
        assert isinstance(now_expr, ir.TimestampScalar)
        assert isinstance(now_expr.op(), ops.TimestampNow)

    def test_timestamp_literals(self):
        """Timestamp scalars build from pd.Timestamp and ISO strings;
        malformed strings raise ValueError."""
        ts_str = '2015-01-01 00:00:00'

        assert isinstance(ibis.literal(pd.Timestamp(ts_str)),
                          ir.TimestampScalar)
        assert isinstance(ibis.timestamp(ts_str), ir.TimestampScalar)

        self.assertRaises(ValueError, ibis.timestamp, '2015-01-01 00:71')

    def test_integer_to_timestamp(self):
        # #246
        pass

    def test_comparison_timestamp(self):
        """Comparing against a shifted aggregate yields a boolean array."""
        expr = self.col > (self.col.min() + ibis.day(3))
        assert isinstance(expr, ir.BooleanArray)

    def test_comparisons_string(self):
        """String operands are implicitly promoted to timestamp scalars;
        a reflected comparison flips into a Greater node."""
        val = '2015-01-01 00:00:00'

        op = (self.col > val).op()
        assert isinstance(op.right, ir.TimestampScalar)

        op = (val < self.col).op()
        assert isinstance(op, ops.Greater)
        assert isinstance(op.right, ir.TimestampScalar)

    def test_comparisons_pandas_timestamp(self):
        """pd.Timestamp operands are promoted to timestamp scalars."""
        op = (self.col > pd.Timestamp('2015-01-01 00:00:00')).op()
        assert isinstance(op.right, ir.TimestampScalar)
Example #41
0
class TestWrapping(unittest.TestCase):
    """UDF/UDA wrapping: registration, SQL generation, and type checking."""

    def setUp(self):
        self.con = MockConnection()
        self.table = self.con.table('functional_alltypes')

        # One column of each primitive type, plus a decimal column; the
        # ordering of all_cols is load-bearing for the slice-based
        # typecast cases below.
        self.i8 = self.table.tinyint_col
        self.i16 = self.table.smallint_col
        self.i32 = self.table.int_col
        self.i64 = self.table.bigint_col
        self.d = self.table.double_col
        self.f = self.table.float_col
        self.s = self.table.string_col
        self.b = self.table.bool_col
        self.t = self.table.timestamp_col
        self.dec = self.con.table('tpch_customer').c_acctbal
        self.all_cols = [self.i8, self.i16, self.i32, self.i64, self.d,
                         self.f, self.dec, self.s, self.b, self.t]

    def test_sql_generation(self):
        """Calling a registered scalar function generates a SELECT of it."""
        func = api.scalar_function(['string'], 'string', name='Tester')
        func.register('identity', 'udf_testing')

        result = func('hello world')
        assert result == "SELECT udf_testing.identity('hello world')"

    def test_sql_generation_from_infoclass(self):
        """A UDF wrapped via wrap_udf also reprs and compiles to SELECT."""
        func = api.wrap_udf('test.so', ['string'], 'string', 'info_test')
        repr(func)

        func.register('info_test', 'udf_testing')
        result = func('hello world')
        assert result == "SELECT udf_testing.info_test('hello world')"

    def test_udf_primitive_output_types(self):
        """A UDF returns a scalar expr for scalar args and an array expr
        for column args, for each primitive type."""
        types = [
            ('boolean', True, self.b),
            ('int8', 1, self.i8),
            ('int16', 1, self.i16),
            ('int32', 1, self.i32),
            ('int64', 1, self.i64),
            ('float', 1.0, self.f),
            ('double', 1.0, self.d),
            ('string', '1', self.s),
            ('timestamp', ibis.timestamp('1961-04-10'), self.t)
        ]
        for t, sv, av in types:
            func = self._register_udf([t], t, 'test')

            ibis_type = validate_type(t)

            expr = func(sv)
            assert type(expr) == ibis_type.scalar_type()
            expr = func(av)
            assert type(expr) == ibis_type.array_type()

    def test_uda_primitive_output_types(self):
        """A UDA returns a scalar expression for each primitive type."""
        types = [
            ('boolean', True, self.b),
            ('int8', 1, self.i8),
            ('int16', 1, self.i16),
            ('int32', 1, self.i32),
            ('int64', 1, self.i64),
            ('float', 1.0, self.f),
            ('double', 1.0, self.d),
            ('string', '1', self.s),
            ('timestamp', ibis.timestamp('1961-04-10'), self.t)
        ]
        for t, sv, av in types:
            func = self._register_uda([t], t, 'test')

            ibis_type = validate_type(t)

            expr1 = func(sv)
            expr2 = func(sv)
            assert isinstance(expr1, ibis_type.scalar_type())
            assert isinstance(expr2, ibis_type.scalar_type())

    def test_decimal(self):
        """Decimal-typed UDFs accept floats and decimal columns."""
        func = self._register_udf(['decimal(9,0)'], 'decimal(9,0)', 'test')
        expr = func(1.0)
        assert type(expr) == ir.DecimalScalar
        expr = func(self.dec)
        assert type(expr) == ir.DecimalArray

    def test_udf_invalid_typecasting(self):
        """Each case is (input type, columns implicitly castable to it,
        columns that must be rejected) — slices index into all_cols."""
        cases = [
            ('int8', self.all_cols[:1], self.all_cols[1:]),
            ('int16', self.all_cols[:2], self.all_cols[2:]),
            ('int32', self.all_cols[:3], self.all_cols[3:]),
            ('int64', self.all_cols[:4], self.all_cols[4:]),
            ('boolean', [], self.all_cols[:8] + self.all_cols[9:]),

            # allowing double here for now
            ('float', self.all_cols[:4], [self.s, self.b, self.t, self.dec]),

            ('double', self.all_cols[:4], [self.s, self.b, self.t, self.dec]),
            ('string', [], self.all_cols[:7] + self.all_cols[8:]),
            ('timestamp', [], self.all_cols[:-1]),
            ('decimal', [], self.all_cols[:4] + self.all_cols[7:])
        ]

        for t, valid_casts, invalid_casts in cases:
            func = self._register_udf([t], 'int32', 'typecast')

            for expr in valid_casts:
                func(expr)

            for expr in invalid_casts:
                self.assertRaises(IbisTypeError, func, expr)

    def test_mult_args(self):
        """Multi-argument UDFs dispatch on scalar vs column inputs."""
        func = self._register_udf(['int32', 'double', 'string',
                                   'boolean', 'timestamp'],
                                  'int64', 'mult_types')

        expr = func(self.i32, self.d, self.s, self.b, self.t)
        assert issubclass(type(expr), ir.ArrayExpr)

        expr = func(1, 1.0, 'a', True, ibis.timestamp('1961-04-10'))
        assert issubclass(type(expr), ir.ScalarExpr)

    def _register_udf(self, inputs, output, name):
        """Create and register a scalar UDF in the ibis_testing database."""
        func = api.scalar_function(inputs, output, name=name)
        func.register(name, 'ibis_testing')
        return func

    def _register_uda(self, inputs, output, name):
        """Create and register an aggregate UDA in ibis_testing."""
        func = api.aggregate_function(inputs, output, name=name)
        func.register(name, 'ibis_testing')
        return func
Example #42
0
class TestStringOps(unittest.TestCase):
    """Expression construction for string operations."""

    def setUp(self):
        con = MockConnection()
        self.con = con
        self.table = con.table('alltypes')

    def test_lower_upper(self):
        """lower()/upper() keep the string type and use the right ops."""
        lowered = self.table.g.lower()
        uppered = self.table.g.upper()

        assert isinstance(lowered, ir.StringArray)
        assert isinstance(uppered, ir.StringArray)
        assert isinstance(lowered.op(), ops.Lowercase)
        assert isinstance(uppered.op(), ops.Uppercase)

        lit = literal('FoO')
        assert isinstance(lit.lower(), ir.StringScalar)
        assert isinstance(lit.upper(), ir.StringScalar)

    def test_substr(self):
        """substr(start, length) builds a Substring with literal args."""
        result = self.table.g.substr(2, 4)
        lit_result = literal('FoO').substr(0, 2)

        assert isinstance(result, ir.StringArray)
        assert isinstance(lit_result, ir.StringScalar)

        op = result.op()
        assert isinstance(op, ops.Substring)

        start, length = op.args[1:]
        assert start.equals(literal(2))
        assert length.equals(literal(4))

    def test_left_right(self):
        """left(n) is sugar for substr(0, n); right(n) builds StrRight."""
        assert self.table.g.left(5).equals(self.table.g.substr(0, 5))

        right_op = self.table.g.right(5).op()
        assert isinstance(right_op, ops.StrRight)
        assert right_op.args[1].equals(literal(5))

    def test_length(self):
        """length() yields int32 results via a StringLength op."""
        col_len = self.table.g.length()
        lit_len = literal('FoO').length()

        assert isinstance(col_len, ir.Int32Array)
        assert isinstance(lit_len, ir.Int32Scalar)
        assert isinstance(col_len.op(), ops.StringLength)

    def test_join(self):
        """join() is array-valued with column parts, scalar with literals."""
        dash = literal('-')

        array_expr = dash.join([self.table.f.cast('string'),
                                self.table.g])
        assert isinstance(array_expr, ir.StringArray)

        scalar_expr = dash.join([literal('ab'), literal('cd')])
        assert isinstance(scalar_expr, ir.StringScalar)

    def test_contains(self):
        """contains() is sugar for find() >= 0; Python `in` is rejected."""
        expr = self.table.g.contains('foo')
        assert_equal(expr, self.table.g.find('foo') >= 0)

        self.assertRaises(Exception, lambda: 'foo' in self.table.g)

    def test_getitem_slice(self):
        """Slicing a string expression is sugar for substr()."""
        pairs = [
            (self.table.g[:3], self.table.g.substr(0, 3)),
            (self.table.g[2:6], self.table.g.substr(2, 4)),
        ]

        for sliced, expected in pairs:
            assert_equal(sliced, expected)
Example #43
0
class TestInsertLoadData(unittest.TestCase):
    """DDL generation for INSERT ... SELECT and LOAD DATA statements."""

    def setUp(self):
        self.con = MockConnection()
        self.t = self.con.table('functional_alltypes')

    def test_select_basics(self):
        """InsertSelect compiles INSERT INTO, and INSERT OVERWRITE when
        overwrite=True."""
        name = 'testing123456'

        expr = self.t.limit(10)
        select, _ = _get_select(expr)

        stmt = ddl.InsertSelect(name, select, database='foo')
        result = stmt.compile()

        expected = """\
INSERT INTO foo.`testing123456`
SELECT *
FROM functional_alltypes
LIMIT 10"""
        assert result == expected

        stmt = ddl.InsertSelect(name, select, database='foo', overwrite=True)
        result = stmt.compile()

        expected = """\
INSERT OVERWRITE foo.`testing123456`
SELECT *
FROM functional_alltypes
LIMIT 10"""
        assert result == expected

    def test_load_data_unpartitioned(self):
        """LoadData without a partition; overwrite toggles OVERWRITE."""
        path = '/path/to/data'
        stmt = ddl.LoadData('functional_alltypes', path, database='foo')

        result = stmt.compile()
        expected = ("LOAD DATA INPATH '/path/to/data' "
                    "INTO TABLE foo.`functional_alltypes`")
        assert result == expected

        stmt.overwrite = True
        result = stmt.compile()
        expected = ("LOAD DATA INPATH '/path/to/data' "
                    "OVERWRITE INTO TABLE foo.`functional_alltypes`")
        assert result == expected

    def test_load_data_partitioned(self):
        """LoadData with a partition spec appends a PARTITION clause."""
        path = '/path/to/data'
        part = {'year': 2007, 'month': 7}
        part_schema = ibis.schema([('year', 'int32'), ('month', 'int32')])
        stmt = ddl.LoadData('functional_alltypes', path,
                            database='foo',
                            partition=part,
                            partition_schema=part_schema)

        result = stmt.compile()
        expected = """\
LOAD DATA INPATH '/path/to/data' INTO TABLE foo.`functional_alltypes`
PARTITION (year=2007, month=7)"""
        assert result == expected

        stmt.overwrite = True
        result = stmt.compile()
        expected = """\
LOAD DATA INPATH '/path/to/data' OVERWRITE INTO TABLE foo.`functional_alltypes`
PARTITION (year=2007, month=7)"""
        assert result == expected

    def test_select_overwrite(self):
        # TODO: not yet implemented
        pass
Example #44
0
class TestValueExprs(unittest.TestCase, ExprSQLTest):
    """SQL translation of scalar and column value expressions."""

    def setUp(self):
        self.con = MockConnection()
        self.table = self.con.table('alltypes')

        # column-name groupings by type, used by some subclasses/helpers
        self.int_cols = ['a', 'b', 'c', 'd']
        self.bool_cols = ['h']
        self.float_cols = ['e', 'f']

    def _check_literals(self, cases):
        # Translate each raw value as a literal and compare the SQL.
        for value, expected in cases:
            lit_expr = L(value)
            result = self._translate(lit_expr)
            assert result == expected

    def test_string_literals(self):
        """String literals are single-quoted with escaped quotes."""
        cases = [
            ('simple', "'simple'"),
            ('I can\'t', "'I can\\'t'"),
            ('An "escape"', "'An \"escape\"'")
        ]

        for value, expected in cases:
            lit_expr = L(value)
            result = self._translate(lit_expr)
            assert result == expected

    def test_decimal_builtins(self):
        """precision()/scale() translate to the same-named functions."""
        t = self.con.table('tpch_lineitem')
        col = t.l_extendedprice
        cases = [
            (col.precision(), 'precision(`l_extendedprice`)'),
            (col.scale(), 'scale(`l_extendedprice`)'),
        ]
        self._check_expr_cases(cases)

    def test_number_boolean_literals(self):
        """Numbers render verbatim; booleans render as TRUE/FALSE."""
        cases = [
            (5, '5'),
            (1.5, '1.5'),
            (True, 'TRUE'),
            (False, 'FALSE')
        ]
        self._check_literals(cases)

    def test_column_ref_table_aliases(self):
        """Column references are qualified with their context aliases."""
        context = ImpalaContext()

        table1 = ibis.table([
            ('key1', 'string'),
            ('value1', 'double')
        ])

        table2 = ibis.table([
            ('key2', 'string'),
            ('value and2', 'double')
        ])

        context.set_ref(table1, 't0')
        context.set_ref(table2, 't1')

        expr = table1['value1'] - table2['value and2']

        result = self._translate(expr, context=context)
        expected = 't0.`value1` - t1.`value and2`'
        assert result == expected

    def test_column_ref_quoting(self):
        """Column names containing spaces are backtick-quoted."""
        # NOTE(review): the expected SQL is passed as a second positional
        # argument to _translate and never asserted against the result —
        # looks like a latent bug; verify _translate's signature.
        schema = [('has a space', 'double')]
        table = ibis.table(schema)
        self._translate(table['has a space'], '`has a space`')

    def test_identifier_quoting(self):
        """Reserved-word column names are backtick-quoted."""
        # NOTE(review): same unasserted second positional argument as in
        # test_column_ref_quoting — verify _translate's signature.
        schema = [('date', 'double'), ('table', 'string')]
        table = ibis.table(schema)
        self._translate(table['date'], '`date`')
        self._translate(table['table'], '`table`')

    def test_named_expressions(self):
        """Named expressions render with an AS alias (named=True)."""
        a, b, g = self.table.get_columns(['a', 'b', 'g'])

        cases = [
            (g.cast('double').name('g_dub'), 'CAST(`g` AS double) AS `g_dub`'),
            (g.name('has a space'), '`g` AS `has a space`'),
            (((a - b) * a).name('expr'), '(`a` - `b`) * `a` AS `expr`')
        ]

        return self._check_expr_cases(cases, named=True)

    def test_binary_infix_operators(self):
        """Binary operators render infix; ** becomes pow(); logical ops
        parenthesize compound operands."""
        # For each function, verify that the generated code is what we expect
        a, b, h = self.table.get_columns(['a', 'b', 'h'])
        bool_col = a > 0

        cases = [
            (a + b, '`a` + `b`'),
            (a - b, '`a` - `b`'),
            (a * b, '`a` * `b`'),
            (a / b, '`a` / `b`'),
            (a ** b, 'pow(`a`, `b`)'),
            (a < b, '`a` < `b`'),
            (a <= b, '`a` <= `b`'),
            (a > b, '`a` > `b`'),
            (a >= b, '`a` >= `b`'),
            (a == b, '`a` = `b`'),
            (a != b, '`a` != `b`'),
            (h & bool_col, '`h` AND (`a` > 0)'),
            (h | bool_col, '`h` OR (`a` > 0)'),
            # xor is brute force
            (h ^ bool_col, '(`h` OR (`a` > 0)) AND NOT (`h` AND (`a` > 0))')
        ]
        self._check_expr_cases(cases)

    def test_binary_infix_parenthesization(self):
        """Parentheses are emitted only where precedence requires them."""
        a, b, c = self.table.get_columns(['a', 'b', 'c'])

        cases = [
            ((a + b) + c, '(`a` + `b`) + `c`'),
            (a.log() + c, 'ln(`a`) + `c`'),
            (b + (-(a + c)), '`b` + (-(`a` + `c`))')
        ]

        self._check_expr_cases(cases)

    def test_between(self):
        """between() translates to SQL BETWEEN."""
        cases = [
            (self.table.f.between(0, 1), '`f` BETWEEN 0 AND 1')
        ]
        self._check_expr_cases(cases)

    def test_isnull_notnull(self):
        """isnull()/notnull() render IS NULL / IS NOT NULL."""
        cases = [
            (self.table['g'].isnull(), '`g` IS NULL'),
            (self.table['a'].notnull(), '`a` IS NOT NULL'),
            ((self.table['a'] + self.table['b']).isnull(),
             '`a` + `b` IS NULL')
        ]
        self._check_expr_cases(cases)

    def test_casts(self):
        """Casts use backend type names (int8 -> tinyint, etc.)."""
        a, d, g = self.table.get_columns(['a', 'd', 'g'])
        cases = [
            (a.cast('int16'), 'CAST(`a` AS smallint)'),
            (a.cast('int32'), 'CAST(`a` AS int)'),
            (a.cast('int64'), 'CAST(`a` AS bigint)'),
            (a.cast('float'), 'CAST(`a` AS float)'),
            (a.cast('double'), 'CAST(`a` AS double)'),
            (a.cast('string'), 'CAST(`a` AS string)'),
            (d.cast('int8'), 'CAST(`d` AS tinyint)'),
            (g.cast('double'), 'CAST(`g` AS double)'),
            (g.cast('timestamp'), 'CAST(`g` AS timestamp)')
        ]
        self._check_expr_cases(cases)

    def test_misc_conditionals(self):
        """nullif translates to the nullif() function."""
        a = self.table.a
        cases = [
            (a.nullif(0), 'nullif(`a`, 0)')
        ]
        self._check_expr_cases(cases)

    def test_decimal_casts(self):
        """Casts to parameterized decimal keep precision and scale."""
        cases = [
            (L('9.9999999').cast('decimal(38,5)'),
             "CAST('9.9999999' AS decimal(38,5))"),
            (self.table.f.cast('decimal(12,2)'), "CAST(`f` AS decimal(12,2))")
        ]
        self._check_expr_cases(cases)

    def test_negate(self):
        """Unary minus renders -x for numerics, NOT for booleans."""
        cases = [
            (-self.table['a'], '-`a`'),
            (-self.table['f'], '-`f`'),
            (-self.table['h'], 'NOT `h`')
        ]
        self._check_expr_cases(cases)

    def test_timestamp_extract_field(self):
        """Timestamp field accessors translate to extract(col, 'field')."""
        fields = ['year', 'month', 'day', 'hour', 'minute',
                  'second', 'millisecond']

        cases = [(getattr(self.table.i, field)(),
                  "extract(`i`, '{0}')".format(field))
                 for field in fields]
        self._check_expr_cases(cases)

        # integration with SQL translation
        expr = self.table[self.table.i.year().name('year'),
                          self.table.i.month().name('month'),
                          self.table.i.day().name('day')]

        result = to_sql(expr)
        expected = \
            """SELECT extract(`i`, 'year') AS `year`, extract(`i`, 'month') AS `month`,
       extract(`i`, 'day') AS `day`
FROM alltypes"""
        assert result == expected

    def test_timestamp_now(self):
        """ibis.now() translates to now()."""
        cases = [
            (ibis.now(), 'now()')
        ]
        self._check_expr_cases(cases)

    def test_timestamp_deltas(self):
        """Adding/subtracting offsets becomes <unit>s_add(col, +/-K)."""
        units = ['year', 'month', 'week', 'day',
                 'hour', 'minute', 'second',
                 'millisecond', 'microsecond']

        t = self.table.i
        f = '`i`'

        cases = []
        for unit in units:
            K = 5
            offset = getattr(ibis, unit)(K)
            template = '{0}s_add({1}, {2})'

            cases.append((t + offset, template.format(unit, f, K)))
            cases.append((t - offset, template.format(unit, f, -K)))

        self._check_expr_cases(cases)

    def test_timestamp_literals(self):
        """pd.Timestamp, datetime, and string timestamps all render as the
        same quoted literal."""
        from pandas import Timestamp

        tv1 = '2015-01-01 12:34:56'
        ex1 = ("'2015-01-01 12:34:56'")

        cases = [
            (L(Timestamp(tv1)), ex1),
            (L(Timestamp(tv1).to_pydatetime()), ex1),
            (ibis.timestamp(tv1), ex1)
        ]
        self._check_expr_cases(cases)

    def test_timestamp_from_integer(self):
        """to_timestamp converts epoch ints, scaling ms/us inputs."""
        col = self.table.c

        cases = [
            (col.to_timestamp(),
             'CAST(from_unixtime(`c`, "yyyy-MM-dd HH:mm:ss") '
             'AS timestamp)'),
            (col.to_timestamp('ms'),
             'CAST(from_unixtime(CAST(`c` / 1000 AS int), '
             '"yyyy-MM-dd HH:mm:ss") '
             'AS timestamp)'),
            (col.to_timestamp('us'),
             'CAST(from_unixtime(CAST(`c` / 1000000 AS int), '
             '"yyyy-MM-dd HH:mm:ss") '
             'AS timestamp)'),
        ]
        self._check_expr_cases(cases)

    def test_correlated_predicate_subquery(self):
        """Aliases from a parent context are visible in a subcontext."""
        t0 = self.table
        t1 = t0.view()

        expr = t0.g == t1.g

        ctx = ImpalaContext()
        ctx.make_alias(t0)

        # Grab alias from parent context
        subctx = ctx.subcontext()
        subctx.make_alias(t1)
        subctx.make_alias(t0)

        result = self._translate(expr, context=subctx)
        expected = "t0.`g` = t1.`g`"
        assert result == expected

    def test_any_all(self):
        """any()/all() compile to sum-of-boolean comparisons."""
        t = self.table

        bool_expr = t.f == 0

        cases = [
            (bool_expr.any(), 'sum(`f` = 0) > 0'),
            (-bool_expr.any(), 'sum(`f` = 0) = 0'),
            (bool_expr.all(), 'sum(`f` = 0) = count(*)'),
            (-bool_expr.all(), 'sum(`f` = 0) < count(*)'),
        ]
        self._check_expr_cases(cases)
Example #45
0
 def __init__(self):
     """Create SQLAlchemy metadata, then run MockConnection's init.

     NOTE(review): statement order kept as-is — MockConnection.__init__
     may depend on self.meta existing; confirm before reordering.
     """
     self.meta = sa.MetaData()
     MockConnection.__init__(self)
Example #46
0
class TestBucketHistogram(unittest.TestCase, ExprSQLTest):
    """SQL generation for bucket()/label(): CASE WHEN range expressions."""

    def setUp(self):
        # Mock backend; no real database is contacted.
        self.con = MockConnection()
        self.table = self.con.table('alltypes')

    def test_bucket_to_case(self):
        """Each bucket() option combination yields a specific CASE statement.

        Covers closed side ('left' default vs 'right'), close_extreme,
        include_under, and include_over, plus single-edge bucket lists.
        """
        buckets = [0, 10, 25, 50]

        # Default: left-closed bins, extreme edge closed (<= 50 on the last).
        expr1 = self.table.f.bucket(buckets)
        expected1 = """\
CASE
  WHEN (`f` >= 0) AND (`f` < 10) THEN 0
  WHEN (`f` >= 10) AND (`f` < 25) THEN 1
  WHEN (`f` >= 25) AND (`f` <= 50) THEN 2
  ELSE NULL
END"""

        # close_extreme=False: last bin is strictly < 50.
        expr2 = self.table.f.bucket(buckets, close_extreme=False)
        expected2 = """\
CASE
  WHEN (`f` >= 0) AND (`f` < 10) THEN 0
  WHEN (`f` >= 10) AND (`f` < 25) THEN 1
  WHEN (`f` >= 25) AND (`f` < 50) THEN 2
  ELSE NULL
END"""

        # closed='right': bins are right-closed; extreme edge (0) stays closed.
        expr3 = self.table.f.bucket(buckets, closed='right')
        expected3 = """\
CASE
  WHEN (`f` >= 0) AND (`f` <= 10) THEN 0
  WHEN (`f` > 10) AND (`f` <= 25) THEN 1
  WHEN (`f` > 25) AND (`f` <= 50) THEN 2
  ELSE NULL
END"""

        expr4 = self.table.f.bucket(buckets, closed='right',
                                    close_extreme=False)
        expected4 = """\
CASE
  WHEN (`f` > 0) AND (`f` <= 10) THEN 0
  WHEN (`f` > 10) AND (`f` <= 25) THEN 1
  WHEN (`f` > 25) AND (`f` <= 50) THEN 2
  ELSE NULL
END"""

        # include_under adds a leading catch-all bucket and shifts indices.
        expr5 = self.table.f.bucket(buckets, include_under=True)
        expected5 = """\
CASE
  WHEN `f` < 0 THEN 0
  WHEN (`f` >= 0) AND (`f` < 10) THEN 1
  WHEN (`f` >= 10) AND (`f` < 25) THEN 2
  WHEN (`f` >= 25) AND (`f` <= 50) THEN 3
  ELSE NULL
END"""

        expr6 = self.table.f.bucket(buckets,
                                    include_under=True,
                                    include_over=True)
        expected6 = """\
CASE
  WHEN `f` < 0 THEN 0
  WHEN (`f` >= 0) AND (`f` < 10) THEN 1
  WHEN (`f` >= 10) AND (`f` < 25) THEN 2
  WHEN (`f` >= 25) AND (`f` <= 50) THEN 3
  WHEN `f` > 50 THEN 4
  ELSE NULL
END"""

        expr7 = self.table.f.bucket(buckets,
                                    close_extreme=False,
                                    include_under=True,
                                    include_over=True)
        expected7 = """\
CASE
  WHEN `f` < 0 THEN 0
  WHEN (`f` >= 0) AND (`f` < 10) THEN 1
  WHEN (`f` >= 10) AND (`f` < 25) THEN 2
  WHEN (`f` >= 25) AND (`f` < 50) THEN 3
  WHEN `f` >= 50 THEN 4
  ELSE NULL
END"""

        expr8 = self.table.f.bucket(buckets, closed='right',
                                    close_extreme=False,
                                    include_under=True)
        expected8 = """\
CASE
  WHEN `f` <= 0 THEN 0
  WHEN (`f` > 0) AND (`f` <= 10) THEN 1
  WHEN (`f` > 10) AND (`f` <= 25) THEN 2
  WHEN (`f` > 25) AND (`f` <= 50) THEN 3
  ELSE NULL
END"""

        # Single-edge bucket list: degenerates to a two-way split.
        expr9 = self.table.f.bucket([10], closed='right',
                                    include_over=True,
                                    include_under=True)
        expected9 = """\
CASE
  WHEN `f` <= 10 THEN 0
  WHEN `f` > 10 THEN 1
  ELSE NULL
END"""

        expr10 = self.table.f.bucket([10], include_over=True,
                                     include_under=True)
        expected10 = """\
CASE
  WHEN `f` < 10 THEN 0
  WHEN `f` >= 10 THEN 1
  ELSE NULL
END"""

        cases = [
            (expr1, expected1),
            (expr2, expected2),
            (expr3, expected3),
            (expr4, expected4),
            (expr5, expected5),
            (expr6, expected6),
            (expr7, expected7),
            (expr8, expected8),
            (expr9, expected9),
            (expr10, expected10),
        ]
        self._check_expr_cases(cases)

    def test_cast_category_to_int_noop(self):
        """Casting a bucket result to int32 adds no CAST; to double it does."""
        # Because the bucket result is an integer, no explicit cast is
        # necessary
        expr = (self.table.f.bucket([10], include_over=True,
                                    include_under=True)
                .cast('int32'))

        expected = """\
CASE
  WHEN `f` < 10 THEN 0
  WHEN `f` >= 10 THEN 1
  ELSE NULL
END"""

        expr2 = (self.table.f.bucket([10], include_over=True,
                                     include_under=True)
                 .cast('double'))

        expected2 = """\
CAST(CASE
  WHEN `f` < 10 THEN 0
  WHEN `f` >= 10 THEN 1
  ELSE NULL
END AS double)"""

        self._check_expr_cases([(expr, expected),
                                (expr2, expected2)])

    def test_bucket_assign_labels(self):
        """label() maps bucket indices to strings; label count must match."""
        buckets = [0, 10, 25, 50]
        bucket = self.table.f.bucket(buckets, include_under=True)

        size = self.table.group_by(bucket.name('tier')).size()
        labelled = size.tier.label(['Under 0', '0 to 10',
                                    '10 to 25', '25 to 50'],
                                   nulls='error').name('tier2')
        expr = size[labelled, size['count']]

        expected = """\
SELECT
  CASE `tier`
    WHEN 0 THEN 'Under 0'
    WHEN 1 THEN '0 to 10'
    WHEN 2 THEN '10 to 25'
    WHEN 3 THEN '25 to 50'
    ELSE 'error'
  END AS `tier2`, `count`
FROM (
  SELECT
    CASE
      WHEN `f` < 0 THEN 0
      WHEN (`f` >= 0) AND (`f` < 10) THEN 1
      WHEN (`f` >= 10) AND (`f` < 25) THEN 2
      WHEN (`f` >= 25) AND (`f` <= 50) THEN 3
      ELSE NULL
    END AS `tier`, count(*) AS `count`
  FROM alltypes
  GROUP BY 1
) t0"""

        result = to_sql(expr)

        assert result == expected

        # Too few / too many labels for 4 buckets must raise.
        self.assertRaises(ValueError, size.tier.label, ['a', 'b', 'c'])
        self.assertRaises(ValueError, size.tier.label,
                          ['a', 'b', 'c', 'd', 'e'])
Example #47
0
 def __init__(self):
     """Create SQLAlchemy metadata, then initialize the mock connection."""
     # NOTE(review): meta is assigned before the base initializer runs —
     # presumably MockConnection.__init__ reads self.meta; confirm.
     self.meta = sa.MetaData()
     MockConnection.__init__(self)
Example #48
0
    def setUp(self):
        """Build a mock connection plus a filtered-table expression fixture."""
        self.con = MockConnection()

        table = self.con.table('functional_alltypes')
        self.t = table
        self.expr = table[table.bigint_col > 0]
Example #49
0
class TestBuiltins(unittest.TestCase):
    """Result-type checks for scalar builtins (abs, round, ceil, sign, ...)."""

    def setUp(self):
        # Mock backend; tables are schema-only stand-ins.
        self.con = MockConnection()
        self.alltypes = self.con.table('functional_alltypes')
        self.lineitem = self.con.table('tpch_lineitem')

    def test_abs(self):
        """abs() preserves the column type for every numeric and decimal column."""
        colnames = [
            'tinyint_col', 'smallint_col', 'int_col', 'bigint_col',
            'float_col', 'double_col'
        ]

        fname = 'abs'
        op = ops.Abs

        for col in colnames:
            expr = self.alltypes[col]
            self._check_unary_op(expr, fname, op, type(expr))

        expr = self.lineitem.l_extendedprice
        self._check_unary_op(expr, fname, op, type(expr))

    def test_group_concat(self):
        """group_concat defaults to ',' and accepts an explicit separator."""
        col = self.alltypes.string_col

        expr = col.group_concat()
        assert isinstance(expr.op(), ops.GroupConcat)
        arg, sep = expr.op().args
        assert sep == ','

        expr = col.group_concat('|')
        arg, sep = expr.op().args
        assert sep == '|'

    def test_zeroifnull(self):
        """zeroifnull wraps the column in a ZeroIfNull op."""
        dresult = self.alltypes.double_col.zeroifnull()
        iresult = self.alltypes.int_col.zeroifnull()

        assert type(dresult.op()) == ops.ZeroIfNull
        assert type(dresult) == ir.DoubleArray

        # BUG FIX: previously asserted `type(iresult) == type(iresult)`,
        # which is vacuously true and tested nothing. Check the op type
        # instead, mirroring the double-column assertion above.
        # NOTE(review): Impala upconverts all ints to bigint, so the
        # resulting array type is deliberately left unasserted — confirm
        # the expected integer result type and tighten this check.
        assert type(iresult.op()) == ops.ZeroIfNull

    def test_fillna(self):
        """fillna produces an IfNull op and keeps the caller's type."""
        result = self.alltypes.double_col.fillna(5)
        assert isinstance(result, ir.DoubleArray)

        assert isinstance(result.op(), ops.IfNull)

        result = self.alltypes.bool_col.fillna(True)
        assert isinstance(result, ir.BooleanArray)

        # Retains type of caller (for now)
        result = self.alltypes.int_col.fillna(self.alltypes.bigint_col)
        assert isinstance(result, ir.Int32Array)

    def test_ceil_floor(self):
        """ceil/floor of double -> int64; of decimal -> decimal (same meta)."""
        cresult = self.alltypes.double_col.ceil()
        fresult = self.alltypes.double_col.floor()
        assert isinstance(cresult, ir.Int64Array)
        assert isinstance(fresult, ir.Int64Array)
        assert type(cresult.op()) == ops.Ceil
        assert type(fresult.op()) == ops.Floor

        cresult = ibis.literal(1.2345).ceil()
        fresult = ibis.literal(1.2345).floor()
        assert isinstance(cresult, ir.Int64Scalar)
        assert isinstance(fresult, ir.Int64Scalar)

        dec_col = self.lineitem.l_extendedprice
        cresult = dec_col.ceil()
        fresult = dec_col.floor()
        assert isinstance(cresult, ir.DecimalArray)
        assert cresult.meta == dec_col.meta

        assert isinstance(fresult, ir.DecimalArray)
        assert fresult.meta == dec_col.meta

    def test_sign(self):
        """sign() returns float regardless of the input's numeric type."""
        result = self.alltypes.double_col.sign()
        assert isinstance(result, ir.FloatArray)
        assert type(result.op()) == ops.Sign

        result = ibis.literal(1.2345).sign()
        assert isinstance(result, ir.FloatScalar)

        dec_col = self.lineitem.l_extendedprice
        result = dec_col.sign()
        assert isinstance(result, ir.FloatArray)

    def test_round(self):
        """round() -> int64; round(digits) -> double; decimal stays decimal."""
        result = self.alltypes.double_col.round()
        assert isinstance(result, ir.Int64Array)
        assert result.op().args[1] is None

        result = self.alltypes.double_col.round(2)
        assert isinstance(result, ir.DoubleArray)
        assert result.op().args[1] == 2

        # Even integers are double (at least in Impala, check with other DB
        # implementations)
        result = self.alltypes.int_col.round(2)
        assert isinstance(result, ir.DoubleArray)

        dec = self.lineitem.l_extendedprice
        result = dec.round()
        assert isinstance(result, ir.DecimalArray)

        result = dec.round(2)
        assert isinstance(result, ir.DecimalArray)

        result = ibis.literal(1.2345).round()
        assert isinstance(result, ir.Int64Scalar)

    def _check_unary_op(self, expr, fname, ex_op, ex_type):
        """Apply the named unary method and verify op class and result type."""
        result = getattr(expr, fname)()
        assert type(result.op()) == ex_op
        assert type(result) == ex_type
Example #50
0
class TestStringBuiltins(unittest.TestCase, ExprSQLTest):
    """SQL generation for string functions on a string column.

    FIX: regex patterns previously used non-raw strings containing ``\\d``,
    an unrecognized escape sequence that triggers DeprecationWarning (and
    SyntaxWarning on newer Pythons). Converted to raw strings; the byte
    values are identical, so generated SQL is unchanged.
    """

    def setUp(self):
        self.con = MockConnection()
        self.table = self.con.table('functional_alltypes')

    def test_unary_ops(self):
        """One-argument string functions map to their Impala builtins."""
        s = self.table.string_col
        cases = [
            (s.lower(), 'lower(`string_col`)'),
            (s.upper(), 'upper(`string_col`)'),
            (s.reverse(), 'reverse(`string_col`)'),
            (s.strip(), 'trim(`string_col`)'),
            (s.lstrip(), 'ltrim(`string_col`)'),
            (s.rstrip(), 'rtrim(`string_col`)'),
            (s.capitalize(), 'initcap(`string_col`)'),
            (s.length(), 'length(`string_col`)'),
            (s.ascii_str(), 'ascii(`string_col`)')
        ]
        self._check_expr_cases(cases)

    def test_substr(self):
        """0-based ibis offsets become 1-based SQL offsets (+ 1)."""
        # Database numbers starting from 1
        cases = [
            (self.table.string_col.substr(2), 'substr(`string_col`, 2 + 1)'),
            (self.table.string_col.substr(0, 3),
             'substr(`string_col`, 0 + 1, 3)')
        ]
        self._check_expr_cases(cases)

    def test_strright(self):
        cases = [
            (self.table.string_col.right(4), 'strright(`string_col`, 4)')
        ]
        self._check_expr_cases(cases)

    def test_like(self):
        cases = [
            (self.table.string_col.like('foo%'), "`string_col` LIKE 'foo%'")
        ]
        self._check_expr_cases(cases)

    def test_rlike(self):
        """rlike and re_search both compile to RLIKE."""
        ex = r"`string_col` RLIKE '[\d]+'"
        cases = [
            (self.table.string_col.rlike(r'[\d]+'), ex),
            (self.table.string_col.re_search(r'[\d]+'), ex),
        ]
        self._check_expr_cases(cases)

    def test_re_extract(self):
        sql = r"regexp_extract(`string_col`, '[\d]+', 0)"
        cases = [
            (self.table.string_col.re_extract(r'[\d]+', 0), sql)
        ]
        self._check_expr_cases(cases)

    def test_re_replace(self):
        sql = r"regexp_replace(`string_col`, '[\d]+', 'aaa')"
        cases = [
            (self.table.string_col.re_replace(r'[\d]+', 'aaa'), sql)
        ]
        self._check_expr_cases(cases)

    def test_parse_url(self):
        sql = "parse_url(`string_col`, 'HOST')"
        cases = [
            (self.table.string_col.parse_url('HOST'), sql)
        ]
        self._check_expr_cases(cases)

    def test_repeat(self):
        cases = [
            (self.table.string_col.repeat(2), 'repeat(`string_col`, 2)')
        ]
        self._check_expr_cases(cases)

    def test_translate(self):
        cases = [
            (self.table.string_col.translate('a', 'b'),
             "translate(`string_col`, 'a', 'b')")
        ]
        self._check_expr_cases(cases)

    def test_find(self):
        """find() is locate() minus 1 (SQL is 1-based); start offset gets + 1."""
        s = self.table.string_col
        i1 = self.table.tinyint_col
        cases = [
            (s.find('a'), "locate('a', `string_col`) - 1"),
            (s.find('a', 2), "locate('a', `string_col`, 3) - 1"),
            (s.find('a', start=i1),
             "locate('a', `string_col`, `tinyint_col` + 1) - 1")
        ]
        self._check_expr_cases(cases)

    def test_lpad(self):
        cases = [
            (self.table.string_col.lpad(1, 'a'), "lpad(`string_col`, 1, 'a')"),
            (self.table.string_col.lpad(25), "lpad(`string_col`, 25, ' ')")
        ]
        self._check_expr_cases(cases)

    def test_rpad(self):
        cases = [
            (self.table.string_col.rpad(1, 'a'), "rpad(`string_col`, 1, 'a')"),
            (self.table.string_col.rpad(25), "rpad(`string_col`, 25, ' ')")
        ]
        self._check_expr_cases(cases)

    def test_find_in_set(self):
        """List argument is joined with commas; result is 0-based (- 1)."""
        cases = [
            (self.table.string_col.find_in_set(['a']),
             "find_in_set(`string_col`, 'a') - 1"),
            (self.table.string_col.find_in_set(['a', 'b']),
             "find_in_set(`string_col`, 'a,b') - 1")
        ]
        self._check_expr_cases(cases)

    def test_string_join(self):
        cases = [
            (L(',').join(['a', 'b']), "concat_ws(',', 'a', 'b')")
        ]
        self._check_expr_cases(cases)
Example #51
0
class TestInsertLoadData(unittest.TestCase):
    """DDL generation for INSERT ... SELECT and LOAD DATA statements."""

    def setUp(self):
        self.con = MockConnection()
        self.t = self.con.table('functional_alltypes')

    def test_select_basics(self):
        """InsertSelect compiles INSERT INTO, and INSERT OVERWRITE when asked."""
        name = 'testing123456'

        expr = self.t.limit(10)
        select, _ = _get_select(expr)

        stmt = ddl.InsertSelect(name, select, database='foo')
        result = stmt.compile()

        expected = """\
INSERT INTO foo.`testing123456`
SELECT *
FROM functional_alltypes
LIMIT 10"""
        assert result == expected

        stmt = ddl.InsertSelect(name, select, database='foo', overwrite=True)
        result = stmt.compile()

        expected = """\
INSERT OVERWRITE foo.`testing123456`
SELECT *
FROM functional_alltypes
LIMIT 10"""
        assert result == expected

    def test_load_data_unpartitioned(self):
        """LOAD DATA without partitions; overwrite toggles OVERWRITE keyword."""
        path = '/path/to/data'
        stmt = ddl.LoadData('functional_alltypes', path, database='foo')

        result = stmt.compile()
        expected = ("LOAD DATA INPATH '/path/to/data' "
                    "INTO TABLE foo.`functional_alltypes`")
        assert result == expected

        # Mutating the statement object in place re-compiles with OVERWRITE.
        stmt.overwrite = True
        result = stmt.compile()
        expected = ("LOAD DATA INPATH '/path/to/data' "
                    "OVERWRITE INTO TABLE foo.`functional_alltypes`")
        assert result == expected

    def test_load_data_partitioned(self):
        """LOAD DATA with a PARTITION clause built from a dict + schema."""
        path = '/path/to/data'
        part = {'year': 2007, 'month': 7}
        part_schema = ibis.schema([('year', 'int32'), ('month', 'int32')])
        stmt = ddl.LoadData('functional_alltypes',
                            path,
                            database='foo',
                            partition=part,
                            partition_schema=part_schema)

        result = stmt.compile()
        expected = """\
LOAD DATA INPATH '/path/to/data' INTO TABLE foo.`functional_alltypes`
PARTITION (year=2007, month=7)"""
        assert result == expected

        stmt.overwrite = True
        result = stmt.compile()
        expected = """\
LOAD DATA INPATH '/path/to/data' OVERWRITE INTO TABLE foo.`functional_alltypes`
PARTITION (year=2007, month=7)"""
        assert result == expected

    def test_select_overwrite(self):
        # TODO: not yet implemented
        pass
Example #52
0
def con():
    """Return a fresh mock backend connection (fixture-style factory)."""
    connection = MockConnection()
    return connection
Example #53
0
 def setUp(self):
     """Create a mock connection and the functional_alltypes table fixture."""
     self.con = MockConnection()
     self.t = self.con.table('functional_alltypes')
Example #54
0
class TestTimestamp(unittest.TestCase):
    """Type-level behavior of timestamp columns, literals, and comparisons."""

    def setUp(self):
        self.con = MockConnection()
        self.alltypes = self.con.table('alltypes')
        # Column 'i' is the timestamp column of the mock 'alltypes' table.
        self.col = self.alltypes.i

    def test_field_select(self):
        assert isinstance(self.col, ir.TimestampArray)

    def test_string_cast_to_timestamp(self):
        """Casting string columns and string literals yields timestamp types."""
        casted = self.alltypes.g.cast('timestamp')
        assert isinstance(casted, ir.TimestampArray)

        string = api.literal('2000-01-01')
        casted = string.cast('timestamp')
        assert isinstance(casted, ir.TimestampScalar)

    def test_extract_fields(self):
        """year/month/.../millisecond accessors produce named int32 columns."""
        # type-size may be database specific
        cases = [
            ('year', ops.ExtractYear, ir.Int32Array),
            ('month', ops.ExtractMonth, ir.Int32Array),
            ('day', ops.ExtractDay, ir.Int32Array),
            ('hour', ops.ExtractHour, ir.Int32Array),
            ('minute', ops.ExtractMinute, ir.Int32Array),
            ('second', ops.ExtractSecond, ir.Int32Array),
            ('millisecond', ops.ExtractMillisecond, ir.Int32Array),
        ]

        for attr, ex_op, ex_type in cases:
            result = getattr(self.col, attr)()
            assert result.get_name() == attr
            assert isinstance(result, ex_type)
            assert isinstance(result.op(), ex_op)

    def test_now(self):
        result = api.now()
        assert isinstance(result, ir.TimestampScalar)
        assert isinstance(result.op(), ops.TimestampNow)

    def test_timestamp_literals(self):
        """Pandas Timestamps and ISO strings build scalars; bad strings raise."""
        ts_str = '2015-01-01 00:00:00'
        val = pd.Timestamp(ts_str)

        expr = ibis.literal(val)
        assert isinstance(expr, ir.TimestampScalar)

        expr = ibis.timestamp(ts_str)
        assert isinstance(expr, ir.TimestampScalar)

        # '00:71' is not a valid minute value.
        self.assertRaises(ValueError, ibis.timestamp, '2015-01-01 00:71')

    def test_integer_to_timestamp(self):
        # #246
        pass

    def test_comparison_timestamp(self):
        expr = self.col > (self.col.min() + ibis.day(3))
        assert isinstance(expr, ir.BooleanArray)

    def test_comparisons_string(self):
        """String operands in comparisons are promoted to timestamp scalars."""
        val = '2015-01-01 00:00:00'
        expr = self.col > val
        op = expr.op()
        assert isinstance(op.right, ir.TimestampScalar)

        # Reflected comparison: `str < col` flips to Greater(col, literal).
        expr2 = val < self.col
        op = expr2.op()
        assert isinstance(op, ops.Greater)
        assert isinstance(op.right, ir.TimestampScalar)

    def test_comparisons_pandas_timestamp(self):
        val = pd.Timestamp('2015-01-01 00:00:00')
        expr = self.col > val
        op = expr.op()
        assert isinstance(op.right, ir.TimestampScalar)
Example #55
0
class TestWrapping(unittest.TestCase):
    """UDF/UDA wrapping: SQL generation, output typing, and typecast rules."""

    def setUp(self):
        self.con = MockConnection()
        self.table = self.con.table('functional_alltypes')

        # One column fixture per primitive type, plus a decimal column.
        self.i8 = self.table.tinyint_col
        self.i16 = self.table.smallint_col
        self.i32 = self.table.int_col
        self.i64 = self.table.bigint_col
        self.d = self.table.double_col
        self.f = self.table.float_col
        self.s = self.table.string_col
        self.b = self.table.bool_col
        self.t = self.table.timestamp_col
        self.dec = self.con.table('tpch_customer').c_acctbal
        # Ordering matters: typecast tests below slice this list by position.
        self.all_cols = [self.i8, self.i16, self.i32, self.i64, self.d,
                         self.f, self.dec, self.s, self.b, self.t]

    def test_sql_generation(self):
        """A registered scalar UDF compiles to `database.name(args)`."""
        func = api.scalar_function(['string'], 'string', name='Tester')
        func.register('identity', 'udf_testing')

        result = func('hello world')
        assert (ibis.impala.compile(result) ==
                "SELECT udf_testing.identity('hello world') AS `tmp`")

    def test_sql_generation_from_infoclass(self):
        """wrap_udf builds a callable wrapper from a shared-object path."""
        func = api.wrap_udf('test.so', ['string'], 'string', 'info_test')
        repr(func)

        func.register('info_test', 'udf_testing')
        result = func('hello world')
        assert (ibis.impala.compile(result) ==
                "SELECT udf_testing.info_test('hello world') AS `tmp`")

    def test_udf_primitive_output_types(self):
        """Scalar input -> scalar expr; column input -> array expr, per type."""
        types = [
            ('boolean', True, self.b),
            ('int8', 1, self.i8),
            ('int16', 1, self.i16),
            ('int32', 1, self.i32),
            ('int64', 1, self.i64),
            ('float', 1.0, self.f),
            ('double', 1.0, self.d),
            ('string', '1', self.s),
            ('timestamp', ibis.timestamp('1961-04-10'), self.t)
        ]
        for t, sv, av in types:
            func = self._register_udf([t], t, 'test')

            ibis_type = validate_type(t)

            expr = func(sv)
            assert type(expr) == type(ibis_type.scalar_type()(expr.op()))  # noqa: E501, E721
            expr = func(av)
            assert type(expr) == type(ibis_type.array_type()(expr.op()))  # noqa: E501, E721

    def test_uda_primitive_output_types(self):
        """Aggregate functions return scalar expressions for every type."""
        types = [
            ('boolean', True, self.b),
            ('int8', 1, self.i8),
            ('int16', 1, self.i16),
            ('int32', 1, self.i32),
            ('int64', 1, self.i64),
            ('float', 1.0, self.f),
            ('double', 1.0, self.d),
            ('string', '1', self.s),
            ('timestamp', ibis.timestamp('1961-04-10'), self.t)
        ]
        for t, sv, av in types:
            func = self._register_uda([t], t, 'test')

            ibis_type = validate_type(t)

            expr1 = func(sv)
            expr2 = func(sv)
            expected_type1 = type(ibis_type.scalar_type()(expr1.op()))
            expected_type2 = type(ibis_type.scalar_type()(expr2.op()))
            assert isinstance(expr1, expected_type1)
            assert isinstance(expr2, expected_type2)

    def test_decimal(self):
        func = self._register_udf(['decimal(9,0)'], 'decimal(9,0)', 'test')
        expr = func(1.0)
        assert type(expr) == ir.DecimalScalar
        expr = func(self.dec)
        assert type(expr) == ir.DecimalColumn

    def test_udf_invalid_typecasting(self):
        """Implicit-cast matrix: which columns may feed each declared input type.

        Each case slices self.all_cols into (valid, invalid) by position;
        see the ordering comment in setUp.
        """
        cases = [
            ('int8', self.all_cols[:1], self.all_cols[1:]),
            ('int16', self.all_cols[:2], self.all_cols[2:]),
            ('int32', self.all_cols[:3], self.all_cols[3:]),
            ('int64', self.all_cols[:4], self.all_cols[4:]),
            ('boolean', [], self.all_cols[:8] + self.all_cols[9:]),

            # allowing double here for now
            ('float', self.all_cols[:4], [self.s, self.b, self.t, self.dec]),

            ('double', self.all_cols[:4], [self.s, self.b, self.t, self.dec]),
            ('string', [], self.all_cols[:7] + self.all_cols[8:]),
            ('timestamp', [], self.all_cols[:-1]),
            ('decimal', [], self.all_cols[:4] + self.all_cols[7:])
        ]

        for t, valid_casts, invalid_casts in cases:
            func = self._register_udf([t], 'int32', 'typecast')

            for expr in valid_casts:
                func(expr)

            for expr in invalid_casts:
                self.assertRaises(IbisTypeError, func, expr)

    def test_mult_args(self):
        """Multi-argument UDFs accept mixed columns and mixed literals."""
        func = self._register_udf(['int32', 'double', 'string',
                                   'boolean', 'timestamp'],
                                  'int64', 'mult_types')

        expr = func(self.i32, self.d, self.s, self.b, self.t)
        assert issubclass(type(expr), ir.ColumnExpr)

        expr = func(1, 1.0, 'a', True, ibis.timestamp('1961-04-10'))
        assert issubclass(type(expr), ir.ScalarExpr)

    def _register_udf(self, inputs, output, name):
        """Create and register a scalar UDF in the ibis_testing database."""
        func = api.scalar_function(inputs, output, name=name)
        func.register(name, 'ibis_testing')
        return func

    def _register_uda(self, inputs, output, name):
        """Create and register an aggregate UDA in the ibis_testing database."""
        func = api.aggregate_function(inputs, output, name=name)
        func.register(name, 'ibis_testing')
        return func
Example #56
0
 def setUp(self):
     """Create a mock connection and the 'alltypes' table fixture."""
     self.con = MockConnection()
     self.table = self.con.table('alltypes')
Example #57
0
 def setUp(self):
     """Fixtures for a two-string-input, int64-output function signature."""
     self.con = MockConnection()
     self.name = 'test_name'
     self.inputs = ['string', 'string']
     self.output = 'int64'
Example #58
0
 def setUp(self):
     """Create a mock connection plus alltypes and lineitem table fixtures."""
     self.con = MockConnection()
     self.alltypes = self.con.table('functional_alltypes')
     self.lineitem = self.con.table('tpch_lineitem')
Example #59
0
class TestCreateTable(unittest.TestCase):
    """DDL generation for the CREATE TABLE statement family."""

    def setUp(self):
        self.con = MockConnection()

        self.t = t = self.con.table('functional_alltypes')
        self.expr = t[t.bigint_col > 0]

    def test_create_external_table_as(self):
        """CTAS with external=True emits CREATE EXTERNAL TABLE ... AS SELECT."""
        path = '/path/to/table'
        select = build_ast(self.con.table('test1')).queries[0]
        statement = ddl.CTAS('another_table',
                             select,
                             external=True,
                             can_exist=False,
                             path=path,
                             database='foo')
        result = statement.compile()

        expected = """\
CREATE EXTERNAL TABLE foo.`another_table`
STORED AS PARQUET
LOCATION '{0}'
AS
SELECT *
FROM test1""".format(path)
        assert result == expected

    def test_create_table_with_location(self):
        """Explicit schema + path produces column list and LOCATION clause."""
        path = '/path/to/table'
        schema = ibis.schema([('foo', 'string'),
                              ('bar', 'int8'),
                              ('baz', 'int16')])
        statement = ddl.CreateTableWithSchema('another_table', schema,
                                              ddl.NoFormat(),
                                              can_exist=False,
                                              path=path, database='foo')
        result = statement.compile()

        expected = """\
CREATE TABLE foo.`another_table`
(`foo` string,
 `bar` tinyint,
 `baz` smallint)
LOCATION '{0}'""".format(path)
        assert result == expected

    def test_create_table_like_parquet(self):
        """Schema inferred from an example Parquet file via LIKE PARQUET."""
        directory = '/path/to/'
        path = '/path/to/parquetfile'
        statement = ddl.CreateTableParquet('new_table',
                                           directory,
                                           example_file=path,
                                           can_exist=True,
                                           database='foo')

        result = statement.compile()
        expected = """\
CREATE EXTERNAL TABLE IF NOT EXISTS foo.`new_table`
LIKE PARQUET '{0}'
STORED AS PARQUET
LOCATION '{1}'""".format(path, directory)

        assert result == expected

    def test_create_table_parquet_like_other(self):
        """Schema copied from an existing table via LIKE db.table."""
        # alternative to "LIKE PARQUET"
        directory = '/path/to/'
        example_table = 'db.other'

        statement = ddl.CreateTableParquet('new_table',
                                           directory,
                                           example_table=example_table,
                                           can_exist=True,
                                           database='foo')

        result = statement.compile()
        expected = """\
CREATE EXTERNAL TABLE IF NOT EXISTS foo.`new_table`
LIKE {0}
STORED AS PARQUET
LOCATION '{1}'""".format(example_table, directory)

        assert result == expected

    def test_create_table_parquet_with_schema(self):
        """Explicit ibis schema rendered as a column list for Parquet."""
        directory = '/path/to/'

        schema = ibis.schema([('foo', 'string'),
                              ('bar', 'int8'),
                              ('baz', 'int16')])

        statement = ddl.CreateTableParquet('new_table',
                                           directory,
                                           schema=schema,
                                           external=True,
                                           can_exist=True,
                                           database='foo')

        result = statement.compile()
        expected = """\
CREATE EXTERNAL TABLE IF NOT EXISTS foo.`new_table`
(`foo` string,
 `bar` tinyint,
 `baz` smallint)
STORED AS PARQUET
LOCATION '{0}'""".format(directory)

        assert result == expected

    def test_create_table_delimited(self):
        """Delimited-text table: ROW FORMAT with field/escape/line chars."""
        path = '/path/to/files/'
        schema = ibis.schema([('a', 'string'),
                              ('b', 'int32'),
                              ('c', 'double'),
                              ('d', 'decimal(12,2)')])

        stmt = ddl.CreateTableDelimited('new_table', path, schema,
                                        delimiter='|',
                                        escapechar='\\',
                                        lineterminator='\0',
                                        database='foo',
                                        can_exist=True)

        result = stmt.compile()
        expected = """\
CREATE EXTERNAL TABLE IF NOT EXISTS foo.`new_table`
(`a` string,
 `b` int,
 `c` double,
 `d` decimal(12,2))
ROW FORMAT DELIMITED
FIELDS TERMINATED BY '|'
ESCAPED BY '\\'
LINES TERMINATED BY '\0'
LOCATION '{0}'""".format(path)
        assert result == expected

    def test_create_external_table_avro(self):
        """Avro table: schema dict serialized into avro.schema.literal."""
        path = '/path/to/files/'

        avro_schema = {
            'fields': [
                {'name': 'a', 'type': 'string'},
                {'name': 'b', 'type': 'int'},
                {'name': 'c', 'type': 'double'},
                {"type": "bytes",
                 "logicalType": "decimal",
                 "precision": 4,
                 "scale": 2,
                 'name': 'd'}
            ],
            'name': 'my_record',
            'type': 'record'
        }

        stmt = ddl.CreateTableAvro('new_table', path, avro_schema,
                                   database='foo', can_exist=True)

        result = stmt.compile()
        # NOTE: keys in the emitted JSON are sorted, unlike the input dict.
        expected = """\
CREATE EXTERNAL TABLE IF NOT EXISTS foo.`new_table`
STORED AS AVRO
LOCATION '%s'
TBLPROPERTIES ('avro.schema.literal'='{
  "fields": [
    {
      "name": "a",
      "type": "string"
    },
    {
      "name": "b",
      "type": "int"
    },
    {
      "name": "c",
      "type": "double"
    },
    {
      "logicalType": "decimal",
      "name": "d",
      "precision": 4,
      "scale": 2,
      "type": "bytes"
    }
  ],
  "name": "my_record",
  "type": "record"
}')""" % path
        assert result == expected

    def test_create_table_parquet(self):
        """Default CTAS: Parquet storage, database-qualified name."""
        statement = _create_table('some_table', self.expr,
                                  database='bar',
                                  can_exist=False)
        result = statement.compile()

        expected = """\
CREATE TABLE bar.`some_table`
STORED AS PARQUET
AS
SELECT *
FROM functional_alltypes
WHERE `bigint_col` > 0"""
        assert result == expected

    def test_no_overwrite(self):
        """can_exist=True adds IF NOT EXISTS."""
        statement = _create_table('tname', self.expr, can_exist=True)
        result = statement.compile()

        expected = """\
CREATE TABLE IF NOT EXISTS `tname`
STORED AS PARQUET
AS
SELECT *
FROM functional_alltypes
WHERE `bigint_col` > 0"""
        assert result == expected

    def test_avro_other_formats(self):
        """format='avro' switches storage clause; unknown formats raise."""
        statement = _create_table('tname', self.t, format='avro',
                                  can_exist=True)
        result = statement.compile()
        expected = """\
CREATE TABLE IF NOT EXISTS `tname`
STORED AS AVRO
AS
SELECT *
FROM functional_alltypes"""
        assert result == expected

        self.assertRaises(ValueError, _create_table, 'tname', self.t,
                          format='foo')

    def test_partition_by(self):
        # TODO: not yet implemented
        pass