def test_ctas_ddl(self):
    con = MockConnection()
    select = build_ast(con.table('test1')).queries[0]
    statement = ksupport.CTASKudu(
        'another_table',
        'kudu_name',
        ['dom.d.com:7051'],
        select,
        ['string_col'],
        external=True,
        can_exist=False,
        database='foo',
    )
    result = statement.compile()

    expected = """\
CREATE EXTERNAL TABLE foo.`another_table`
TBLPROPERTIES (
  'kudu.key_columns'='string_col',
  'kudu.master_addresses'='dom.d.com:7051',
  'kudu.table_name'='kudu_name',
  'storage_handler'='com.cloudera.kudu.hive.KuduStorageHandler'
) AS
SELECT *
FROM test1"""
    assert result == expected
def setUp(self):
    self.con = MockConnection()
    self.table = self.con.table('functional_alltypes')

    self.i8 = self.table.tinyint_col
    self.i16 = self.table.smallint_col
    self.i32 = self.table.int_col
    self.i64 = self.table.bigint_col
    self.d = self.table.double_col
    self.f = self.table.float_col
    self.s = self.table.string_col
    self.b = self.table.bool_col
    self.t = self.table.timestamp_col
    self.dec = self.con.table('tpch_customer').c_acctbal
    self.all_cols = [
        self.i8, self.i16, self.i32, self.i64, self.d, self.f,
        self.dec, self.s, self.b, self.t,
    ]
def setUp(self):
    self.schema = [('a', 'int8'), ('b', 'int16'), ('c', 'int32'),
                   ('d', 'int64'), ('e', 'float'), ('f', 'double'),
                   ('g', 'string'), ('h', 'boolean')]
    self.schema_dict = dict(self.schema)
    self.table = ibis.table(self.schema)
    self.con = MockConnection()
def setUp(self):
    self.con = MockConnection()
    self.table = self.con.table('alltypes')

    self.int_cols = ['a', 'b', 'c', 'd']
    self.bool_cols = ['h']
    self.float_cols = ['e', 'f']
class TestInteractiveUse(unittest.TestCase):

    def setUp(self):
        self.con = MockConnection()

    def test_interactive_execute_on_repr(self):
        table = self.con.table('functional_alltypes')
        expr = table.bigint_col.sum()
        with config.option_context('interactive', True):
            repr(expr)

        assert len(self.con.executed_queries) > 0

    def test_default_limit(self):
        table = self.con.table('functional_alltypes')

        with config.option_context('interactive', True):
            repr(table)

        expected = """\
SELECT *
FROM functional_alltypes
LIMIT {0}""".format(config.options.sql.default_limit)

        assert self.con.executed_queries[0] == expected

    def test_disable_query_limit(self):
        table = self.con.table('functional_alltypes')

        with config.option_context('interactive', True):
            with config.option_context('sql.default_limit', None):
                repr(table)

        expected = """\
SELECT *
FROM functional_alltypes"""

        assert self.con.executed_queries[0] == expected

    def test_interactive_non_compilable_repr_not_fail(self):
        # #170
        table = self.con.table('functional_alltypes')

        expr = table.string_col.topk(3)

        # it works!
        with config.option_context('interactive', True):
            repr(expr)

    def test_histogram_repr_no_query_execute(self):
        t = self.con.table('functional_alltypes')
        tier = t.double_col.histogram(10).name('bucket')
        expr = t.group_by(tier).size()
        with config.option_context('interactive', True):
            expr._repr()
        assert self.con.executed_queries == []
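# A hedged aside on the interactive option exercised above: these tests use
# config.option_context as a scoped toggle, but the same knob can also be
# set globally. Only config.option_context and config.options.sql.default_limit
# appear in this excerpt; the attribute-style global assignment below is an
# assumption inferred from them, not something the tests confirm.
config.options.interactive = True    # every subsequent repr() runs a query
config.options.interactive = False   # restore lazy, compile-only behavior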
class TestDistinct(unittest.TestCase):

    def setUp(self):
        self.con = MockConnection()

    def test_simple_table_distinct(self):
        t = self.con.table('functional_alltypes')

        expr = t[t.string_col, t.int_col].distinct()
        result = to_sql(expr)
        expected = """SELECT DISTINCT `string_col`, `int_col`
FROM functional_alltypes"""
        assert result == expected

    def test_array_distinct(self):
        t = self.con.table('functional_alltypes')
        expr = t.string_col.distinct()
        result = to_sql(expr)
        expected = """SELECT DISTINCT `string_col`
FROM functional_alltypes"""
        assert result == expected

    def test_count_distinct(self):
        t = self.con.table('functional_alltypes')

        metric = t.int_col.nunique().name('nunique')
        expr = t[t.bigint_col > 0].group_by('string_col').aggregate([metric])
        result = to_sql(expr)
        expected = """SELECT `string_col`, COUNT(DISTINCT `int_col`) AS `nunique`
FROM functional_alltypes
WHERE `bigint_col` > 0
GROUP BY 1"""
        assert result == expected

    def test_multiple_count_distinct(self):
        # Impala and some other databases will not execute multiple
        # count-distincts in a single aggregation query. This error reporting
        # will be left to the database itself, for now.
        t = self.con.table('functional_alltypes')
        metrics = [
            t.int_col.nunique().name('int_card'),
            t.smallint_col.nunique().name('smallint_card')
        ]

        expr = t.group_by('string_col').aggregate(metrics)
        result = to_sql(expr)
        expected = """SELECT `string_col`, COUNT(DISTINCT `int_col`) AS `int_card`,
       COUNT(DISTINCT `smallint_col`) AS `smallint_card`
FROM functional_alltypes
GROUP BY 1"""
        assert result == expected
def setUp(self):
    self.con = MockConnection()

    table = self.con.table('functional_alltypes')

    self.t1 = (table[table.int_col > 0]
               [table.string_col.name('key'),
                table.float_col.cast('double').name('value')])
    self.t2 = (table[table.int_col <= 0]
               [table.string_col.name('key'),
                table.double_col.name('value')])

    self.union1 = self.t1.union(self.t2)
class TestInNotIn(unittest.TestCase, ExprSQLTest):

    def setUp(self):
        self.con = MockConnection()
        self.table = self.con.table('alltypes')

    def test_field_in_literals(self):
        cases = [
            (self.table.g.isin(["foo", "bar", "baz"]),
             "g IN ('foo', 'bar', 'baz')"),
            (self.table.g.notin(["foo", "bar", "baz"]),
             "g NOT IN ('foo', 'bar', 'baz')")
        ]
        self._check_expr_cases(cases)

    def test_literal_in_list(self):
        cases = [
            (ibis.literal(2).isin([self.table.a,
                                   self.table.b,
                                   self.table.c]),
             '2 IN (a, b, c)'),
            (ibis.literal(2).notin([self.table.a,
                                    self.table.b,
                                    self.table.c]),
             '2 NOT IN (a, b, c)')
        ]
        self._check_expr_cases(cases)

    def test_isin_notin_in_select(self):
        filtered = self.table[self.table.g.isin(["foo", "bar"])]
        result = to_sql(filtered)
        expected = """SELECT *
FROM alltypes
WHERE g IN ('foo', 'bar')"""
        assert result == expected

        filtered = self.table[self.table.g.notin(["foo", "bar"])]
        result = to_sql(filtered)
        expected = """SELECT *
FROM alltypes
WHERE g NOT IN ('foo', 'bar')"""
        assert result == expected
class TestInsert(unittest.TestCase):

    def setUp(self):
        self.con = MockConnection()
        self.t = self.con.table('functional_alltypes')

    def test_select_basics(self):
        name = 'testing123456'

        expr = self.t.limit(10)
        select, _ = _get_select(expr)

        stmt = ddl.InsertSelect(name, select, database='foo')
        result = stmt.compile()

        expected = """\
INSERT INTO foo.`testing123456`
SELECT *
FROM functional_alltypes
LIMIT 10"""
        assert result == expected

        stmt = ddl.InsertSelect(name, select, database='foo',
                                overwrite=True)
        result = stmt.compile()

        expected = """\
INSERT OVERWRITE foo.`testing123456`
SELECT *
FROM functional_alltypes
LIMIT 10"""
        assert result == expected

    def test_select_overwrite(self):
        pass
class TestInNotIn(unittest.TestCase, ExprSQLTest):

    def setUp(self):
        self.con = MockConnection()
        self.table = self.con.table("alltypes")

    def test_field_in_literals(self):
        cases = [
            (self.table.g.isin(["foo", "bar", "baz"]),
             "`g` IN ('foo', 'bar', 'baz')"),
            (self.table.g.notin(["foo", "bar", "baz"]),
             "`g` NOT IN ('foo', 'bar', 'baz')"),
        ]
        self._check_expr_cases(cases)

    def test_literal_in_list(self):
        cases = [
            (L(2).isin([self.table.a, self.table.b, self.table.c]),
             "2 IN (`a`, `b`, `c`)"),
            (L(2).notin([self.table.a, self.table.b, self.table.c]),
             "2 NOT IN (`a`, `b`, `c`)"),
        ]
        self._check_expr_cases(cases)

    def test_isin_notin_in_select(self):
        filtered = self.table[self.table.g.isin(["foo", "bar"])]
        result = to_sql(filtered)
        expected = """SELECT *
FROM alltypes
WHERE `g` IN ('foo', 'bar')"""
        assert result == expected

        filtered = self.table[self.table.g.notin(["foo", "bar"])]
        result = to_sql(filtered)
        expected = """SELECT *
FROM alltypes
WHERE `g` NOT IN ('foo', 'bar')"""
        assert result == expected
def setUp(self):
    self.con = MockConnection()
    self.table = self.con.table("alltypes")

    self.int_cols = ["a", "b", "c", "d"]
    self.bool_cols = ["h"]
    self.float_cols = ["e", "f"]
class TestUnions(unittest.TestCase):

    def setUp(self):
        self.con = MockConnection()

        table = self.con.table('functional_alltypes')

        self.t1 = (table[table.int_col > 0]
                   [table.string_col.name('key'),
                    table.float_col.cast('double').name('value')])
        self.t2 = (table[table.int_col <= 0]
                   [table.string_col.name('key'),
                    table.double_col.name('value')])

        self.union1 = self.t1.union(self.t2)

    def test_union(self):
        result = to_sql(self.union1)
        expected = """\
SELECT `string_col` AS `key`, CAST(`float_col` AS double) AS `value`
FROM functional_alltypes
WHERE `int_col` > 0
UNION ALL
SELECT `string_col` AS `key`, `double_col` AS `value`
FROM functional_alltypes
WHERE `int_col` <= 0"""
        assert result == expected

    def test_union_distinct(self):
        union = self.t1.union(self.t2, distinct=True)
        result = to_sql(union)
        expected = """\
SELECT `string_col` AS `key`, CAST(`float_col` AS double) AS `value`
FROM functional_alltypes
WHERE `int_col` > 0
UNION
SELECT `string_col` AS `key`, `double_col` AS `value`
FROM functional_alltypes
WHERE `int_col` <= 0"""
        assert result == expected

    def test_union_project_column(self):
        # select a column, get a subquery
        expr = self.union1[[self.union1.key]]
        result = to_sql(expr)
        expected = """SELECT `key`
FROM (
  SELECT `string_col` AS `key`, CAST(`float_col` AS double) AS `value`
  FROM functional_alltypes
  WHERE `int_col` > 0
  UNION ALL
  SELECT `string_col` AS `key`, `double_col` AS `value`
  FROM functional_alltypes
  WHERE `int_col` <= 0
) t0"""
        assert result == expected

    def test_union_extract_with_block(self):
        pass

    def test_union_in_subquery(self):
        pass
class TestAnalytics(unittest.TestCase):

    def setUp(self):
        self.con = MockConnection()
        self.alltypes = self.con.table('functional_alltypes')

    def test_category_project(self):
        t = self.alltypes

        tier = t.double_col.bucket([0, 50, 100]).name('tier')
        expr = t[tier, t]

        assert isinstance(expr.tier, ir.CategoryArray)

    def test_bucket(self):
        d = self.alltypes.double_col
        bins = [0, 10, 50, 100]

        expr = d.bucket(bins)
        assert isinstance(expr, ir.CategoryArray)
        assert expr.op().nbuckets == 3

        expr = d.bucket(bins, include_over=True)
        assert expr.op().nbuckets == 4

        expr = d.bucket(bins, include_over=True, include_under=True)
        assert expr.op().nbuckets == 5

    def test_bucket_error_cases(self):
        d = self.alltypes.double_col

        self.assertRaises(ValueError, d.bucket, [])
        self.assertRaises(ValueError, d.bucket, [1, 2], closed='foo')

        # it works!
        d.bucket([10], include_under=True, include_over=True)

        self.assertRaises(ValueError, d.bucket, [10])
        self.assertRaises(ValueError, d.bucket, [10], include_under=True)
        self.assertRaises(ValueError, d.bucket, [10], include_over=True)

    def test_histogram(self):
        d = self.alltypes.double_col

        self.assertRaises(ValueError, d.histogram, nbins=10, binwidth=5)
        self.assertRaises(ValueError, d.histogram)
        self.assertRaises(ValueError, d.histogram, 10, closed='foo')

    def test_topk_analysis_bug(self):
        # GH #398
        airlines = ibis.table([('dest', 'string'),
                               ('origin', 'string'),
                               ('arrdelay', 'int32')], 'airlines')

        dests = ['ORD', 'JFK', 'SFO']
        t = airlines[airlines.dest.isin(dests)]
        delay_filter = t.dest.topk(10, by=t.arrdelay.mean())
        filtered = t.filter([delay_filter])

        # predicate is unmodified by analysis
        post_pred = filtered.op().predicates[1]
        assert delay_filter.equals(post_pred)
class TestAnalyticFunctions(unittest.TestCase, ExprSQLTest):

    def setUp(self):
        self.con = MockConnection()
        self.table = self.con.table('functional_alltypes')

    def test_analytic_exprs(self):
        t = self.table

        w = ibis.window(order_by=t.float_col)

        cases = [
            (ibis.row_number().over(w),
             '(row_number() OVER (ORDER BY `float_col`) - 1)'),
            (t.string_col.lag(), 'lag(`string_col`)'),
            (t.string_col.lag(2), 'lag(`string_col`, 2)'),
            (t.string_col.lag(default=0), 'lag(`string_col`, 1, 0)'),
            (t.string_col.lead(), 'lead(`string_col`)'),
            (t.string_col.lead(2), 'lead(`string_col`, 2)'),
            (t.string_col.lead(default=0), 'lead(`string_col`, 1, 0)'),
            (t.double_col.first(), 'first_value(`double_col`)'),
            (t.double_col.last(), 'last_value(`double_col`)'),
            # (t.double_col.nth(4), 'first_value(lag(double_col, 4 - 1))')
        ]
        self._check_expr_cases(cases)
def setUp(self):
    self.con = MockConnection()
    self.t1 = ibis.table([('key1', 'string'),
                          ('key2', 'string'),
                          ('value1', 'double')], 'foo')
    self.t2 = ibis.table([('key1', 'string'),
                          ('key2', 'string')], 'bar')
class TestCoalesceGreaterLeast(unittest.TestCase, ExprSQLTest):

    def setUp(self):
        self.con = MockConnection()
        self.table = self.con.table('functional_alltypes')

    def test_coalesce(self):
        t = self.table
        cases = [
            (ibis.coalesce(t.string_col, 'foo'),
             "coalesce(`string_col`, 'foo')"),
            (ibis.coalesce(t.int_col, t.bigint_col),
             'coalesce(`int_col`, `bigint_col`)'),
        ]
        self._check_expr_cases(cases)

    def test_greatest(self):
        t = self.table
        cases = [
            (ibis.greatest(t.string_col, 'foo'),
             "greatest(`string_col`, 'foo')"),
            (ibis.greatest(t.int_col, t.bigint_col),
             'greatest(`int_col`, `bigint_col`)'),
        ]
        self._check_expr_cases(cases)

    def test_least(self):
        t = self.table
        cases = [
            (ibis.least(t.string_col, 'foo'),
             "least(`string_col`, 'foo')"),
            (ibis.least(t.int_col, t.bigint_col),
             'least(`int_col`, `bigint_col`)'),
        ]
        self._check_expr_cases(cases)
def test_memoize_database_table(self):
    con = MockConnection()
    table = con.table('test1')
    table2 = con.table('test2')

    filter_pred = table['f'] > 0
    table3 = table[filter_pred]
    join_pred = table3['g'] == table2['key']

    joined = table2.inner_join(table3, [join_pred])

    met1 = (table3['f'] - table2['value']).mean().name('foo')
    result = joined.aggregate([met1, table3['f'].sum().name('bar')],
                              by=[table3['g'], table2['key']])

    formatted = repr(result)
    assert formatted.count('test1') == 1
    assert formatted.count('test2') == 1
class TestDistinct(unittest.TestCase):

    def setUp(self):
        self.con = MockConnection()
        self.table = self.con.table('functional_alltypes')

    def test_distinct_basic(self):
        expr = self.table.distinct()
        assert isinstance(expr.op(), ops.Distinct)
        assert isinstance(expr, ir.TableExpr)
        assert expr.op().table is self.table

        expr = self.table.string_col.distinct()
        assert isinstance(expr.op(), ops.DistinctArray)
        assert isinstance(expr, ir.StringArray)

    # def test_distinct_array_interactions(self):
    #     TODO
    #     array cardinalities / shapes are likely to be different.
    #     a = self.table.int_col.distinct()
    #     b = self.table.bigint_col
    #     self.assertRaises(ir.RelationError, a.__add__, b)

    def test_distinct_count(self):
        result = self.table.string_col.distinct().count()
        expected = self.table.string_col.nunique().name('count')
        assert_equal(result, expected)
        assert isinstance(result.op(), ops.CountDistinct)

    def test_distinct_unnamed_array_expr(self):
        table = ibis.table([('year', 'int32'),
                            ('month', 'int32'),
                            ('day', 'int32')], 'foo')

        # it works!
        expr = (ibis.literal('-')
                .join([table.year.cast('string'),
                       table.month.cast('string'),
                       table.day.cast('string')])
                .distinct())
        repr(expr)

    def test_distinct_count_numeric_types(self):
        table = self.table
        metric = (table.bigint_col.distinct().count()
                  .name('unique_bigints'))

        table.group_by('string_col').aggregate(metric)

    def test_nunique(self):
        expr = self.table.string_col.nunique()
        assert isinstance(expr.op(), ops.CountDistinct)

    def test_project_with_distinct(self):
        pass
def setUp(self):
    self.schema = [
        ('a', 'int8'),
        ('b', 'int16'),
        ('c', 'int32'),
        ('d', 'int64'),
        ('e', 'float'),
        ('f', 'double'),
        ('g', 'string'),
        ('h', 'boolean')
    ]
    self.schema_dict = dict(self.schema)
    self.table = ibis.table(self.schema)
    self.con = MockConnection()
class TestInNotIn(unittest.TestCase, ExprSQLTest):

    def setUp(self):
        self.con = MockConnection()
        self.table = self.con.table('alltypes')

    def test_field_in_literals(self):
        values = ['foo', 'bar', 'baz']
        values_formatted = tuple(set(values))
        cases = [
            (self.table.g.isin(values),
             "`g` IN {}".format(values_formatted)),
            (self.table.g.notin(values),
             "`g` NOT IN {}".format(values_formatted))
        ]
        self._check_expr_cases(cases)

    def test_literal_in_list(self):
        cases = [
            (L(2).isin([self.table.a, self.table.b, self.table.c]),
             '2 IN (`a`, `b`, `c`)'),
            (L(2).notin([self.table.a, self.table.b, self.table.c]),
             '2 NOT IN (`a`, `b`, `c`)')
        ]
        self._check_expr_cases(cases)

    def test_isin_notin_in_select(self):
        values = ['foo', 'bar']
        values_formatted = tuple(set(values))

        filtered = self.table[self.table.g.isin(values)]
        result = to_sql(filtered)
        expected = """SELECT *
FROM alltypes
WHERE `g` IN {}"""
        assert result == expected.format(values_formatted)

        filtered = self.table[self.table.g.notin(values)]
        result = to_sql(filtered)
        expected = """SELECT *
FROM alltypes
WHERE `g` NOT IN {}"""
        assert result == expected.format(values_formatted)
class TestCaseExprs(unittest.TestCase, ExprSQLTest, ExprTestCases):

    def setUp(self):
        self.con = MockConnection()
        self.table = self.con.table('alltypes')

    def test_isnull_1_0(self):
        expr = self.table.g.isnull().ifelse(1, 0)

        result = self._translate(expr)
        expected = 'CASE WHEN `g` IS NULL THEN 1 ELSE 0 END'
        assert result == expected

        # inside some other function
        result = self._translate(expr.sum())
        expected = 'sum(CASE WHEN `g` IS NULL THEN 1 ELSE 0 END)'
        assert result == expected

    def test_simple_case(self):
        expr = self._case_simple_case()
        result = self._translate(expr)
        expected = """CASE `g`
  WHEN 'foo' THEN 'bar'
  WHEN 'baz' THEN 'qux'
  ELSE 'default'
END"""
        assert result == expected

    def test_search_case(self):
        expr = self._case_search_case()
        result = self._translate(expr)
        expected = """CASE
  WHEN `f` > 0 THEN `d` * 2
  WHEN `c` < 0 THEN `a` * 2
  ELSE NULL
END"""
        assert result == expected

    def test_where_use_if(self):
        expr = ibis.where(self.table.f > 0, self.table.e, self.table.a)
        assert isinstance(expr, ir.FloatValue)

        result = self._translate(expr)
        expected = "if(`f` > 0, `e`, `a`)"
        assert result == expected

    def test_nullif_ifnull(self):
        table = self.con.table('tpch_lineitem')

        f = table.l_quantity

        cases = [
            (f.nullif(f == 0),
             'nullif(`l_quantity`, `l_quantity` = 0)'),
            (f.fillna(0),
             'isnull(`l_quantity`, CAST(0 AS decimal(12,2)))'),
        ]
        self._check_expr_cases(cases)

    def test_decimal_fillna_cast_arg(self):
        table = self.con.table('tpch_lineitem')
        f = table.l_extendedprice

        cases = [
            (f.fillna(0),
             'isnull(`l_extendedprice`, CAST(0 AS decimal(12,2)))'),
            (f.fillna(0.0), 'isnull(`l_extendedprice`, 0.0)'),
        ]
        self._check_expr_cases(cases)
class TestUnaryBuiltins(unittest.TestCase, ExprSQLTest):

    def setUp(self):
        self.con = MockConnection()
        self.table = self.con.table('functional_alltypes')

    def test_numeric_unary_builtins(self):
        # No argument functions
        functions = ['abs', 'ceil', 'floor', 'exp', 'sqrt', 'sign',
                     ('log', 'ln'),
                     ('approx_median', 'appx_median'),
                     ('approx_nunique', 'ndv'),
                     'ln', 'log2', 'log10',
                     'nullifzero', 'zeroifnull']

        cases = []
        for what in functions:
            if isinstance(what, tuple):
                ibis_name, sql_name = what
            else:
                ibis_name = sql_name = what

            for cname in ['double_col', 'int_col']:
                expr = getattr(self.table[cname], ibis_name)()
                cases.append((expr, '{0}({1})'.format(
                    sql_name, '`{0}`'.format(cname))))

        self._check_expr_cases(cases)

    def test_log_other_bases(self):
        cases = [
            (self.table.double_col.log(5), 'log(`double_col`, 5)')
        ]
        self._check_expr_cases(cases)

    def test_round(self):
        cases = [
            (self.table.double_col.round(), 'round(`double_col`)'),
            (self.table.double_col.round(0), 'round(`double_col`, 0)'),
            (self.table.double_col.round(2), 'round(`double_col`, 2)'),
            (self.table.double_col.round(self.table.tinyint_col),
             'round(`double_col`, `tinyint_col`)')
        ]
        self._check_expr_cases(cases)

    def test_hash(self):
        expr = self.table.int_col.hash()
        assert isinstance(expr, ir.Int64Array)
        assert isinstance(self.table.int_col.sum().hash(),
                          ir.Int64Scalar)

        cases = [
            (self.table.int_col.hash(), 'fnv_hash(`int_col`)')
        ]
        self._check_expr_cases(cases)

    def test_reduction_where(self):
        cond = self.table.bigint_col < 70
        c = self.table.double_col
        tmp = ('{0}(CASE WHEN `bigint_col` < 70 THEN `double_col` '
               'ELSE NULL END)')
        cases = [
            (c.sum(where=cond), tmp.format('sum')),
            (c.count(where=cond), tmp.format('count')),
            (c.mean(where=cond), tmp.format('avg')),
            (c.max(where=cond), tmp.format('max')),
            (c.min(where=cond), tmp.format('min')),
            (c.std(where=cond), tmp.format('stddev')),
            (c.std(where=cond, how='pop'), tmp.format('stddev_pop')),
            (c.var(where=cond), tmp.format('variance')),
            (c.var(where=cond, how='pop'), tmp.format('variance_pop')),
        ]
        self._check_expr_cases(cases)

    def test_reduction_invalid_where(self):
        condbad_literal = L('T')
        c = self.table.double_col
        for reduction in [c.sum, c.count, c.mean, c.max, c.min]:
            with self.assertRaises(TypeError):
                reduction(where=condbad_literal)
def setUp(self):
    self.con = MockConnection()
    self.alltypes = self.con.table('alltypes')
    self.col = self.alltypes.i
def mockcon():
    return MockConnection()
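# mockcon() above (and the similar con() helper later in this section) have
# the shape of pytest fixture bodies whose decorators were lost in extraction.
# A minimal sketch of the assumed declaration -- the @pytest.fixture usage is
# an assumption made for illustration, not something this excerpt confirms:
import pytest

@pytest.fixture
def mockcon():
    return MockConnection()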
def setUp(self):
    self.con = MockConnection()
def setUp(self):
    self.con = MockConnection()
    self.t = t = self.con.table('functional_alltypes')
    self.expr = t[t.bigint_col > 0]
class TestFixedOffsets(unittest.TestCase):

    def setUp(self):
        self.con = MockConnection()
        self.table = self.con.table('alltypes')

    def test_upconvert(self):
        cases = [
            (T.day(14), 'w', T.week(2)),
            (T.hour(72), 'd', T.day(3)),
            (T.minute(240), 'h', T.hour(4)),
            (T.second(360), 'm', T.minute(6)),
            (T.second(3 * 86400), 'd', T.day(3)),
            (T.millisecond(5000), 's', T.second(5)),
            (T.microsecond(5000000), 's', T.second(5)),
            (T.nanosecond(5000000000), 's', T.second(5)),
        ]

        for offset, unit, expected in cases:
            result = offset.to_unit(unit)
            assert result.equals(expected)

    def test_multiply(self):
        offset = T.day(2)

        assert (offset * 2).equals(T.day(4))
        assert (offset * (-2)).equals(T.day(-4))
        assert (3 * offset).equals(T.day(6))
        assert ((-3) * offset).equals(T.day(-6))

    def test_repr(self):
        assert repr(T.day()) == '<Timedelta: 1 day>'
        assert repr(T.day(2)) == '<Timedelta: 2 days>'
        assert repr(T.year()) == '<Timedelta: 1 year>'
        assert repr(T.month(2)) == '<Timedelta: 2 months>'
        assert repr(T.second(40)) == '<Timedelta: 40 seconds>'

    def test_cannot_upconvert(self):
        cases = [
            (T.day(), 'w'),
            (T.hour(), 'd'),
            (T.minute(), 'h'),
            (T.second(), 'm'),
            (T.second(), 'd'),
            (T.millisecond(), 's'),
            (T.microsecond(), 's'),
            (T.nanosecond(), 's'),
        ]

        for delta, target in cases:
            self.assertRaises(IbisError, delta.to_unit, target)

    def test_downconvert_second_parts(self):
        K = 2

        sec = T.second(K)
        milli = T.millisecond(K)
        micro = T.microsecond(K)
        nano = T.nanosecond(K)

        cases = [
            (sec.to_unit('s'), T.second(K)),
            (sec.to_unit('ms'), T.millisecond(K * 1000)),
            (sec.to_unit('us'), T.microsecond(K * 1000000)),
            (sec.to_unit('ns'), T.nanosecond(K * 1000000000)),

            (milli.to_unit('ms'), T.millisecond(K)),
            (milli.to_unit('us'), T.microsecond(K * 1000)),
            (milli.to_unit('ns'), T.nanosecond(K * 1000000)),

            (micro.to_unit('us'), T.microsecond(K)),
            (micro.to_unit('ns'), T.nanosecond(K * 1000)),

            (nano.to_unit('ns'), T.nanosecond(K)),
        ]
        self._check_cases(cases)

    def test_downconvert_hours(self):
        K = 2
        offset = T.hour(K)

        cases = [
            (offset.to_unit('h'), T.hour(K)),
            (offset.to_unit('m'), T.minute(K * 60)),
            (offset.to_unit('s'), T.second(K * 3600)),
            (offset.to_unit('ms'), T.millisecond(K * 3600000)),
            (offset.to_unit('us'), T.microsecond(K * 3600000000)),
            (offset.to_unit('ns'), T.nanosecond(K * 3600000000000)),
        ]
        self._check_cases(cases)

    def test_downconvert_day(self):
        K = 2

        week = T.week(K)
        day = T.day(K)

        cases = [
            (week.to_unit('d'), T.day(K * 7)),
            (week.to_unit('h'), T.hour(K * 7 * 24)),

            (day.to_unit('d'), T.day(K)),
            (day.to_unit('h'), T.hour(K * 24)),
            (day.to_unit('m'), T.minute(K * 1440)),
            (day.to_unit('s'), T.second(K * 86400)),
            (day.to_unit('ms'), T.millisecond(K * 86400000)),
            (day.to_unit('us'), T.microsecond(K * 86400000000)),
            (day.to_unit('ns'), T.nanosecond(K * 86400000000000)),
        ]
        self._check_cases(cases)

    def test_combine_with_different_kinds(self):
        cases = [
            (T.day() + T.minute(), T.minute(1441)),
            (T.second() + T.millisecond(10), T.millisecond(1010)),
            (T.hour() + T.minute(5) + T.second(10), T.second(3910)),
        ]
        self._check_cases(cases)

    def test_timedelta_generic_api(self):
        cases = [
            (T.timedelta(weeks=2), T.week(2)),
            (T.timedelta(days=3), T.day(3)),
            (T.timedelta(hours=4), T.hour(4)),
            (T.timedelta(minutes=5), T.minute(5)),
            (T.timedelta(seconds=6), T.second(6)),
            (T.timedelta(milliseconds=7), T.millisecond(7)),
            (T.timedelta(microseconds=8), T.microsecond(8)),
            (T.timedelta(nanoseconds=9), T.nanosecond(9)),
        ]
        self._check_cases(cases)

    def _check_cases(self, cases):
        for x, y in cases:
            assert x.equals(y)

    def test_offset_timestamp_expr(self):
        c = self.table.i
        x = T.timedelta(days=1)

        expr = x + c
        assert isinstance(expr, ir.TimestampColumn)
        assert isinstance(expr.op(), ops.TimestampDelta)

        # test radd
        expr = c + x
        assert isinstance(expr, ir.TimestampColumn)
        assert isinstance(expr.op(), ops.TimestampDelta)
class TestExprFormatting(unittest.TestCase):
    # Uncertain about how much we want to commit to unit tests around the
    # particulars of the output at the moment.

    def setUp(self):
        self.schema = [
            ('a', 'int8'),
            ('b', 'int16'),
            ('c', 'int32'),
            ('d', 'int64'),
            ('e', 'float'),
            ('f', 'double'),
            ('g', 'string'),
            ('h', 'boolean')
        ]
        self.schema_dict = dict(self.schema)
        self.table = ibis.table(self.schema)
        self.con = MockConnection()

    def test_format_table_column(self):
        # GH #507
        result = repr(self.table.f)
        assert 'Column[array(double)]' in result

    def test_format_projection(self):
        # This should produce a ref to the projection
        proj = self.table[['c', 'a', 'f']]
        repr(proj['a'])

    def test_table_type_output(self):
        foo = ibis.table(
            [
                ('job', 'string'),
                ('dept_id', 'string'),
                ('year', 'int32'),
                ('y', 'double')
            ], 'foo')

        expr = foo.dept_id == foo.view().dept_id
        result = repr(expr)
        assert 'SelfReference[table]' in result
        assert 'UnboundTable[table]' in result

    def test_memoize_aggregate_correctly(self):
        table = self.table

        agg_expr = (table['c'].sum() / table['c'].mean() - 1).name('analysis')
        agg_exprs = [table['a'].sum().name('sum(a)'),
                     table['b'].mean().name('mean(b)'),
                     agg_expr]

        result = table.aggregate(agg_exprs, by=['g'])

        formatter = ExprFormatter(result)
        formatted = formatter.get_result()

        alias = formatter.memo.get_alias(table.op())
        assert formatted.count(alias) == 7

    def test_aggregate_arg_names(self):
        # Not sure how to test this *well*
        t = self.table

        by_exprs = [t.g.name('key1'), t.f.round().name('key2')]
        agg_exprs = [t.c.sum().name('c'), t.d.mean().name('d')]

        expr = self.table.group_by(by_exprs).aggregate(agg_exprs)
        result = repr(expr)
        assert 'metrics' in result
        assert 'by' in result

    def test_format_multiple_join_with_projection(self):
        # Star schema with fact table
        table = ibis.table([
            ('c', 'int32'),
            ('f', 'double'),
            ('foo_id', 'string'),
            ('bar_id', 'string'),
        ])

        table2 = ibis.table([
            ('foo_id', 'string'),
            ('value1', 'double')
        ])

        table3 = ibis.table([
            ('bar_id', 'string'),
            ('value2', 'double')
        ])

        filtered = table[table['f'] > 0]

        pred1 = table['foo_id'] == table2['foo_id']
        pred2 = filtered['bar_id'] == table3['bar_id']

        j1 = filtered.left_join(table2, [pred1])
        j2 = j1.inner_join(table3, [pred2])

        # Project out the desired fields
        view = j2[[table, table2['value1'], table3['value2']]]

        # it works!
        repr(view)

    def test_memoize_database_table(self):
        table = self.con.table('test1')
        table2 = self.con.table('test2')

        filter_pred = table['f'] > 0
        table3 = table[filter_pred]
        join_pred = table3['g'] == table2['key']

        joined = table2.inner_join(table3, [join_pred])

        met1 = (table3['f'] - table2['value']).mean().name('foo')
        result = joined.aggregate([met1, table3['f'].sum().name('bar')],
                                  by=[table3['g'], table2['key']])

        formatted = repr(result)
        assert formatted.count('test1') == 1
        assert formatted.count('test2') == 1

    def test_memoize_filtered_table(self):
        airlines = ibis.table([('dest', 'string'),
                               ('origin', 'string'),
                               ('arrdelay', 'int32')], 'airlines')

        dests = ['ORD', 'JFK', 'SFO']
        t = airlines[airlines.dest.isin(dests)]
        delay_filter = t.dest.topk(10, by=t.arrdelay.mean())

        result = repr(delay_filter)
        assert result.count('Filter') == 1

    def test_memoize_insert_sort_key(self):
        table = self.con.table('airlines')

        t = table['arrdelay', 'dest']
        expr = (t.group_by('dest')
                .mutate(dest_avg=t.arrdelay.mean(),
                        dev=t.arrdelay - t.arrdelay.mean()))

        worst = expr[expr.dev.notnull()].sort_by(ibis.desc('dev')).limit(10)

        result = repr(worst)
        assert result.count('airlines') == 1

    def test_named_value_expr_show_name(self):
        expr = self.table.f * 2
        expr2 = expr.name('baz')

        # it works!
        repr(expr)
        result2 = repr(expr2)

        # not really committing to a particular output yet
        assert 'baz' in result2

    def test_memoize_filtered_tables_in_join(self):
        # related: GH #667
        purchases = ibis.table([('region', 'string'),
                                ('kind', 'string'),
                                ('user', 'int64'),
                                ('amount', 'double')], 'purchases')

        metric = purchases.amount.sum().name('total')
        agged = (purchases.group_by(['region', 'kind'])
                 .aggregate(metric))

        left = agged[agged.kind == 'foo']
        right = agged[agged.kind == 'bar']

        cond = left.region == right.region
        joined = left.join(right, cond)

        result = repr(joined)
        assert result.count('Filter') == 2
def setUp(self):
    self.con = MockConnection()
    self.table = self.con.table('functional_alltypes')
class TestTimestamp(unittest.TestCase):

    def setUp(self):
        self.con = MockConnection()
        self.alltypes = self.con.table('alltypes')
        self.col = self.alltypes.i

    def test_field_select(self):
        assert isinstance(self.col, ir.TimestampArray)

    def test_string_cast_to_timestamp(self):
        casted = self.alltypes.g.cast('timestamp')
        assert isinstance(casted, ir.TimestampArray)

        string = api.literal('2000-01-01')
        casted = string.cast('timestamp')
        assert isinstance(casted, ir.TimestampScalar)

    def test_extract_fields(self):
        # type-size may be database specific
        cases = [
            ('year', ops.ExtractYear, ir.Int32Array),
            ('month', ops.ExtractMonth, ir.Int32Array),
            ('day', ops.ExtractDay, ir.Int32Array),
            ('hour', ops.ExtractHour, ir.Int32Array),
            ('minute', ops.ExtractMinute, ir.Int32Array),
            ('second', ops.ExtractSecond, ir.Int32Array)
        ]

        for attr, ex_op, ex_type in cases:
            result = getattr(self.col, attr)()
            assert isinstance(result, ex_type)
            assert isinstance(result.op(), ex_op)

    def test_extract_no_propagate_name(self):
        # see #146
        table = self.con.table('functional_alltypes')
        expr = table.timestamp_col.hour()
        self.assertRaises(com.ExpressionError, expr.get_name)

    def test_now(self):
        result = api.now()
        assert isinstance(result, ir.TimestampScalar)
        assert isinstance(result.op(), ops.TimestampNow)

    def test_timestamp_literals(self):
        ts_str = '2015-01-01 00:00:00'
        val = pd.Timestamp(ts_str)

        expr = ibis.literal(val)
        assert isinstance(expr, ir.TimestampScalar)

        expr = ibis.timestamp(ts_str)
        assert isinstance(expr, ir.TimestampScalar)

        self.assertRaises(ValueError, ibis.timestamp, '2015-01-01 00:71')

    def test_integer_to_timestamp(self):
        # #246
        pass

    def test_comparison_timestamp(self):
        expr = self.col > (self.col.min() + ibis.day(3))
        assert isinstance(expr, ir.BooleanArray)

    def test_comparisons_string(self):
        val = '2015-01-01 00:00:00'
        expr = self.col > val
        op = expr.op()
        assert isinstance(op.right, ir.TimestampScalar)

        expr2 = val < self.col
        op = expr2.op()
        assert isinstance(op, ops.Greater)
        assert isinstance(op.right, ir.TimestampScalar)

    def test_comparisons_pandas_timestamp(self):
        val = pd.Timestamp('2015-01-01 00:00:00')
        expr = self.col > val
        op = expr.op()
        assert isinstance(op.right, ir.TimestampScalar)
class TestWrapping(unittest.TestCase):

    def setUp(self):
        self.con = MockConnection()
        self.table = self.con.table('functional_alltypes')

        self.i8 = self.table.tinyint_col
        self.i16 = self.table.smallint_col
        self.i32 = self.table.int_col
        self.i64 = self.table.bigint_col
        self.d = self.table.double_col
        self.f = self.table.float_col
        self.s = self.table.string_col
        self.b = self.table.bool_col
        self.t = self.table.timestamp_col
        self.dec = self.con.table('tpch_customer').c_acctbal
        self.all_cols = [self.i8, self.i16, self.i32, self.i64,
                         self.d, self.f, self.dec, self.s, self.b,
                         self.t]

    def test_sql_generation(self):
        func = api.scalar_function(['string'], 'string', name='Tester')
        func.register('identity', 'udf_testing')

        result = func('hello world')
        assert result == "SELECT udf_testing.identity('hello world')"

    def test_sql_generation_from_infoclass(self):
        func = api.wrap_udf('test.so', ['string'], 'string', 'info_test')
        repr(func)

        func.register('info_test', 'udf_testing')
        result = func('hello world')
        assert result == "SELECT udf_testing.info_test('hello world')"

    def test_udf_primitive_output_types(self):
        types = [
            ('boolean', True, self.b),
            ('int8', 1, self.i8),
            ('int16', 1, self.i16),
            ('int32', 1, self.i32),
            ('int64', 1, self.i64),
            ('float', 1.0, self.f),
            ('double', 1.0, self.d),
            ('string', '1', self.s),
            ('timestamp', ibis.timestamp('1961-04-10'), self.t)
        ]
        for t, sv, av in types:
            func = self._register_udf([t], t, 'test')

            ibis_type = validate_type(t)

            expr = func(sv)
            assert type(expr) == ibis_type.scalar_type()
            expr = func(av)
            assert type(expr) == ibis_type.array_type()

    def test_uda_primitive_output_types(self):
        types = [
            ('boolean', True, self.b),
            ('int8', 1, self.i8),
            ('int16', 1, self.i16),
            ('int32', 1, self.i32),
            ('int64', 1, self.i64),
            ('float', 1.0, self.f),
            ('double', 1.0, self.d),
            ('string', '1', self.s),
            ('timestamp', ibis.timestamp('1961-04-10'), self.t)
        ]
        for t, sv, av in types:
            func = self._register_uda([t], t, 'test')

            ibis_type = validate_type(t)

            expr1 = func(sv)
            expr2 = func(sv)
            assert isinstance(expr1, ibis_type.scalar_type())
            assert isinstance(expr2, ibis_type.scalar_type())

    def test_decimal(self):
        func = self._register_udf(['decimal(9,0)'], 'decimal(9,0)', 'test')
        expr = func(1.0)
        assert type(expr) == ir.DecimalScalar
        expr = func(self.dec)
        assert type(expr) == ir.DecimalArray

    def test_udf_invalid_typecasting(self):
        cases = [
            ('int8', self.all_cols[:1], self.all_cols[1:]),
            ('int16', self.all_cols[:2], self.all_cols[2:]),
            ('int32', self.all_cols[:3], self.all_cols[3:]),
            ('int64', self.all_cols[:4], self.all_cols[4:]),
            ('boolean', [], self.all_cols[:8] + self.all_cols[9:]),
            # allowing double here for now
            ('float', self.all_cols[:4], [self.s, self.b, self.t,
                                          self.dec]),
            ('double', self.all_cols[:4], [self.s, self.b, self.t,
                                           self.dec]),
            ('string', [], self.all_cols[:7] + self.all_cols[8:]),
            ('timestamp', [], self.all_cols[:-1]),
            ('decimal', [], self.all_cols[:4] + self.all_cols[7:])
        ]

        for t, valid_casts, invalid_casts in cases:
            func = self._register_udf([t], 'int32', 'typecast')

            for expr in valid_casts:
                func(expr)

            for expr in invalid_casts:
                self.assertRaises(IbisTypeError, func, expr)

    def test_mult_args(self):
        func = self._register_udf(['int32', 'double', 'string',
                                   'boolean', 'timestamp'],
                                  'int64', 'mult_types')

        expr = func(self.i32, self.d, self.s, self.b, self.t)
        assert issubclass(type(expr), ir.ArrayExpr)

        expr = func(1, 1.0, 'a', True, ibis.timestamp('1961-04-10'))
        assert issubclass(type(expr), ir.ScalarExpr)

    def _register_udf(self, inputs, output, name):
        func = api.scalar_function(inputs, output, name=name)
        func.register(name, 'ibis_testing')
        return func

    def _register_uda(self, inputs, output, name):
        func = api.aggregate_function(inputs, output, name=name)
        func.register(name, 'ibis_testing')
        return func
class TestStringOps(unittest.TestCase):

    def setUp(self):
        self.con = MockConnection()
        self.table = self.con.table('alltypes')

    def test_lower_upper(self):
        lresult = self.table.g.lower()
        uresult = self.table.g.upper()

        assert isinstance(lresult, ir.StringArray)
        assert isinstance(uresult, ir.StringArray)

        assert isinstance(lresult.op(), ops.Lowercase)
        assert isinstance(uresult.op(), ops.Uppercase)

        lit = literal('FoO')

        lresult = lit.lower()
        uresult = lit.upper()
        assert isinstance(lresult, ir.StringScalar)
        assert isinstance(uresult, ir.StringScalar)

    def test_substr(self):
        lit = literal('FoO')

        result = self.table.g.substr(2, 4)
        lit_result = lit.substr(0, 2)

        assert isinstance(result, ir.StringArray)
        assert isinstance(lit_result, ir.StringScalar)

        op = result.op()
        assert isinstance(op, ops.Substring)

        start, length = op.args[1:]

        assert start.equals(literal(2))
        assert length.equals(literal(4))

    def test_left_right(self):
        result = self.table.g.left(5)
        expected = self.table.g.substr(0, 5)
        assert result.equals(expected)

        result = self.table.g.right(5)
        op = result.op()
        assert isinstance(op, ops.StrRight)
        assert op.args[1].equals(literal(5))

    def test_length(self):
        lit = literal('FoO')

        result = self.table.g.length()
        lit_result = lit.length()

        assert isinstance(result, ir.Int32Array)
        assert isinstance(lit_result, ir.Int32Scalar)
        assert isinstance(result.op(), ops.StringLength)

    def test_join(self):
        dash = literal('-')

        expr = dash.join([self.table.f.cast('string'),
                          self.table.g])
        assert isinstance(expr, ir.StringArray)

        expr = dash.join([literal('ab'), literal('cd')])
        assert isinstance(expr, ir.StringScalar)

    def test_contains(self):
        expr = self.table.g.contains('foo')
        expected = self.table.g.find('foo') >= 0
        assert_equal(expr, expected)

        self.assertRaises(Exception, lambda: 'foo' in self.table.g)

    def test_getitem_slice(self):
        cases = [
            (self.table.g[:3], self.table.g.substr(0, 3)),
            (self.table.g[2:6], self.table.g.substr(2, 4)),
        ]

        for case, expected in cases:
            assert_equal(case, expected)
class TestInsertLoadData(unittest.TestCase):

    def setUp(self):
        self.con = MockConnection()
        self.t = self.con.table('functional_alltypes')

    def test_select_basics(self):
        name = 'testing123456'

        expr = self.t.limit(10)
        select, _ = _get_select(expr)

        stmt = ddl.InsertSelect(name, select, database='foo')
        result = stmt.compile()

        expected = """\
INSERT INTO foo.`testing123456`
SELECT *
FROM functional_alltypes
LIMIT 10"""
        assert result == expected

        stmt = ddl.InsertSelect(name, select, database='foo',
                                overwrite=True)
        result = stmt.compile()

        expected = """\
INSERT OVERWRITE foo.`testing123456`
SELECT *
FROM functional_alltypes
LIMIT 10"""
        assert result == expected

    def test_load_data_unpartitioned(self):
        path = '/path/to/data'
        stmt = ddl.LoadData('functional_alltypes', path, database='foo')

        result = stmt.compile()
        expected = ("LOAD DATA INPATH '/path/to/data' "
                    "INTO TABLE foo.`functional_alltypes`")
        assert result == expected

        stmt.overwrite = True
        result = stmt.compile()
        expected = ("LOAD DATA INPATH '/path/to/data' "
                    "OVERWRITE INTO TABLE foo.`functional_alltypes`")
        assert result == expected

    def test_load_data_partitioned(self):
        path = '/path/to/data'
        part = {'year': 2007, 'month': 7}
        part_schema = ibis.schema([('year', 'int32'),
                                   ('month', 'int32')])
        stmt = ddl.LoadData('functional_alltypes', path,
                            database='foo',
                            partition=part,
                            partition_schema=part_schema)

        result = stmt.compile()
        expected = """\
LOAD DATA INPATH '/path/to/data' INTO TABLE foo.`functional_alltypes`
PARTITION (year=2007, month=7)"""
        assert result == expected

        stmt.overwrite = True
        result = stmt.compile()
        expected = """\
LOAD DATA INPATH '/path/to/data' OVERWRITE INTO TABLE foo.`functional_alltypes`
PARTITION (year=2007, month=7)"""
        assert result == expected

    def test_select_overwrite(self):
        pass
class TestValueExprs(unittest.TestCase, ExprSQLTest):

    def setUp(self):
        self.con = MockConnection()
        self.table = self.con.table('alltypes')

        self.int_cols = ['a', 'b', 'c', 'd']
        self.bool_cols = ['h']
        self.float_cols = ['e', 'f']

    def _check_literals(self, cases):
        for value, expected in cases:
            lit_expr = L(value)
            result = self._translate(lit_expr)
            assert result == expected

    def test_string_literals(self):
        cases = [
            ('simple', "'simple'"),
            ('I can\'t', "'I can\\'t'"),
            ('An "escape"', "'An \"escape\"'")
        ]

        for value, expected in cases:
            lit_expr = L(value)
            result = self._translate(lit_expr)
            assert result == expected

    def test_decimal_builtins(self):
        t = self.con.table('tpch_lineitem')
        col = t.l_extendedprice
        cases = [
            (col.precision(), 'precision(`l_extendedprice`)'),
            (col.scale(), 'scale(`l_extendedprice`)'),
        ]
        self._check_expr_cases(cases)

    def test_number_boolean_literals(self):
        cases = [
            (5, '5'),
            (1.5, '1.5'),
            (True, 'TRUE'),
            (False, 'FALSE')
        ]
        self._check_literals(cases)

    def test_column_ref_table_aliases(self):
        context = ImpalaContext()

        table1 = ibis.table([
            ('key1', 'string'),
            ('value1', 'double')
        ])

        table2 = ibis.table([
            ('key2', 'string'),
            ('value and2', 'double')
        ])

        context.set_ref(table1, 't0')
        context.set_ref(table2, 't1')

        expr = table1['value1'] - table2['value and2']

        result = self._translate(expr, context=context)
        expected = 't0.`value1` - t1.`value and2`'
        assert result == expected

    def test_column_ref_quoting(self):
        schema = [('has a space', 'double')]
        table = ibis.table(schema)
        self._translate(table['has a space'], '`has a space`')

    def test_identifier_quoting(self):
        schema = [('date', 'double'), ('table', 'string')]
        table = ibis.table(schema)
        self._translate(table['date'], '`date`')
        self._translate(table['table'], '`table`')

    def test_named_expressions(self):
        a, b, g = self.table.get_columns(['a', 'b', 'g'])

        cases = [
            (g.cast('double').name('g_dub'),
             'CAST(`g` AS double) AS `g_dub`'),
            (g.name('has a space'), '`g` AS `has a space`'),
            (((a - b) * a).name('expr'),
             '(`a` - `b`) * `a` AS `expr`')
        ]

        return self._check_expr_cases(cases, named=True)

    def test_binary_infix_operators(self):
        # For each function, verify that the generated code is what we expect
        a, b, h = self.table.get_columns(['a', 'b', 'h'])
        bool_col = a > 0

        cases = [
            (a + b, '`a` + `b`'),
            (a - b, '`a` - `b`'),
            (a * b, '`a` * `b`'),
            (a / b, '`a` / `b`'),
            (a ** b, 'pow(`a`, `b`)'),
            (a < b, '`a` < `b`'),
            (a <= b, '`a` <= `b`'),
            (a > b, '`a` > `b`'),
            (a >= b, '`a` >= `b`'),
            (a == b, '`a` = `b`'),
            (a != b, '`a` != `b`'),
            (h & bool_col, '`h` AND (`a` > 0)'),
            (h | bool_col, '`h` OR (`a` > 0)'),
            # xor is brute force
            (h ^ bool_col, '(`h` OR (`a` > 0)) AND NOT (`h` AND (`a` > 0))')
        ]
        self._check_expr_cases(cases)

    def test_binary_infix_parenthesization(self):
        a, b, c = self.table.get_columns(['a', 'b', 'c'])

        cases = [
            ((a + b) + c, '(`a` + `b`) + `c`'),
            (a.log() + c, 'ln(`a`) + `c`'),
            (b + (-(a + c)), '`b` + (-(`a` + `c`))')
        ]
        self._check_expr_cases(cases)

    def test_between(self):
        cases = [
            (self.table.f.between(0, 1), '`f` BETWEEN 0 AND 1')
        ]
        self._check_expr_cases(cases)

    def test_isnull_notnull(self):
        cases = [
            (self.table['g'].isnull(), '`g` IS NULL'),
            (self.table['a'].notnull(), '`a` IS NOT NULL'),
            ((self.table['a'] + self.table['b']).isnull(),
             '`a` + `b` IS NULL')
        ]
        self._check_expr_cases(cases)

    def test_casts(self):
        a, d, g = self.table.get_columns(['a', 'd', 'g'])

        cases = [
            (a.cast('int16'), 'CAST(`a` AS smallint)'),
            (a.cast('int32'), 'CAST(`a` AS int)'),
            (a.cast('int64'), 'CAST(`a` AS bigint)'),
            (a.cast('float'), 'CAST(`a` AS float)'),
            (a.cast('double'), 'CAST(`a` AS double)'),
            (a.cast('string'), 'CAST(`a` AS string)'),
            (d.cast('int8'), 'CAST(`d` AS tinyint)'),
            (g.cast('double'), 'CAST(`g` AS double)'),
            (g.cast('timestamp'), 'CAST(`g` AS timestamp)')
        ]
        self._check_expr_cases(cases)

    def test_misc_conditionals(self):
        a = self.table.a
        cases = [
            (a.nullif(0), 'nullif(`a`, 0)')
        ]
        self._check_expr_cases(cases)

    def test_decimal_casts(self):
        cases = [
            (L('9.9999999').cast('decimal(38,5)'),
             "CAST('9.9999999' AS decimal(38,5))"),
            (self.table.f.cast('decimal(12,2)'),
             "CAST(`f` AS decimal(12,2))")
        ]
        self._check_expr_cases(cases)

    def test_negate(self):
        cases = [
            (-self.table['a'], '-`a`'),
            (-self.table['f'], '-`f`'),
            (-self.table['h'], 'NOT `h`')
        ]
        self._check_expr_cases(cases)

    def test_timestamp_extract_field(self):
        fields = ['year', 'month', 'day', 'hour', 'minute',
                  'second', 'millisecond']

        cases = [(getattr(self.table.i, field)(),
                  "extract(`i`, '{0}')".format(field))
                 for field in fields]
        self._check_expr_cases(cases)

        # integration with SQL translation
        expr = self.table[self.table.i.year().name('year'),
                          self.table.i.month().name('month'),
                          self.table.i.day().name('day')]

        result = to_sql(expr)
        expected = \
            """SELECT extract(`i`, 'year') AS `year`, extract(`i`, 'month') AS `month`,
       extract(`i`, 'day') AS `day`
FROM alltypes"""
        assert result == expected

    def test_timestamp_now(self):
        cases = [
            (ibis.now(), 'now()')
        ]
        self._check_expr_cases(cases)

    def test_timestamp_deltas(self):
        units = ['year', 'month', 'week', 'day',
                 'hour', 'minute', 'second',
                 'millisecond', 'microsecond']

        t = self.table.i
        f = '`i`'

        cases = []
        for unit in units:
            K = 5
            offset = getattr(ibis, unit)(K)
            template = '{0}s_add({1}, {2})'

            cases.append((t + offset, template.format(unit, f, K)))
            cases.append((t - offset, template.format(unit, f, -K)))

        self._check_expr_cases(cases)

    def test_timestamp_literals(self):
        from pandas import Timestamp

        tv1 = '2015-01-01 12:34:56'
        ex1 = ("'2015-01-01 12:34:56'")

        cases = [
            (L(Timestamp(tv1)), ex1),
            (L(Timestamp(tv1).to_pydatetime()), ex1),
            (ibis.timestamp(tv1), ex1)
        ]
        self._check_expr_cases(cases)

    def test_timestamp_from_integer(self):
        col = self.table.c

        cases = [
            (col.to_timestamp(),
             'CAST(from_unixtime(`c`, "yyyy-MM-dd HH:mm:ss") '
             'AS timestamp)'),
            (col.to_timestamp('ms'),
             'CAST(from_unixtime(CAST(`c` / 1000 AS int), '
             '"yyyy-MM-dd HH:mm:ss") '
             'AS timestamp)'),
            (col.to_timestamp('us'),
             'CAST(from_unixtime(CAST(`c` / 1000000 AS int), '
             '"yyyy-MM-dd HH:mm:ss") '
             'AS timestamp)'),
        ]
        self._check_expr_cases(cases)

    def test_correlated_predicate_subquery(self):
        t0 = self.table
        t1 = t0.view()

        expr = t0.g == t1.g

        ctx = ImpalaContext()
        ctx.make_alias(t0)

        # Grab alias from parent context
        subctx = ctx.subcontext()
        subctx.make_alias(t1)
        subctx.make_alias(t0)

        result = self._translate(expr, context=subctx)
        expected = "t0.`g` = t1.`g`"
        assert result == expected

    def test_any_all(self):
        t = self.table

        bool_expr = t.f == 0

        cases = [
            (bool_expr.any(), 'sum(`f` = 0) > 0'),
            (-bool_expr.any(), 'sum(`f` = 0) = 0'),
            (bool_expr.all(), 'sum(`f` = 0) = count(*)'),
            (-bool_expr.all(), 'sum(`f` = 0) < count(*)'),
        ]
        self._check_expr_cases(cases)
def __init__(self):
    self.meta = sa.MetaData()
    MockConnection.__init__(self)
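# The __init__ above arrives detached from its class. A minimal sketch of the
# kind of SQLAlchemy-backed mock it plausibly belongs to; the class name and
# the imports below are assumptions made for illustration:
import sqlalchemy as sa

class MockAlchemyConnection(MockConnection):
    def __init__(self):
        self.meta = sa.MetaData()       # fresh metadata registry per mock
        MockConnection.__init__(self)   # reuse the base mock's bookkeeping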
class TestBucketHistogram(unittest.TestCase, ExprSQLTest):

    def setUp(self):
        self.con = MockConnection()
        self.table = self.con.table('alltypes')

    def test_bucket_to_case(self):
        buckets = [0, 10, 25, 50]

        expr1 = self.table.f.bucket(buckets)
        expected1 = """\
CASE
  WHEN (`f` >= 0) AND (`f` < 10) THEN 0
  WHEN (`f` >= 10) AND (`f` < 25) THEN 1
  WHEN (`f` >= 25) AND (`f` <= 50) THEN 2
  ELSE NULL
END"""

        expr2 = self.table.f.bucket(buckets, close_extreme=False)
        expected2 = """\
CASE
  WHEN (`f` >= 0) AND (`f` < 10) THEN 0
  WHEN (`f` >= 10) AND (`f` < 25) THEN 1
  WHEN (`f` >= 25) AND (`f` < 50) THEN 2
  ELSE NULL
END"""

        expr3 = self.table.f.bucket(buckets, closed='right')
        expected3 = """\
CASE
  WHEN (`f` >= 0) AND (`f` <= 10) THEN 0
  WHEN (`f` > 10) AND (`f` <= 25) THEN 1
  WHEN (`f` > 25) AND (`f` <= 50) THEN 2
  ELSE NULL
END"""

        expr4 = self.table.f.bucket(buckets, closed='right',
                                    close_extreme=False)
        expected4 = """\
CASE
  WHEN (`f` > 0) AND (`f` <= 10) THEN 0
  WHEN (`f` > 10) AND (`f` <= 25) THEN 1
  WHEN (`f` > 25) AND (`f` <= 50) THEN 2
  ELSE NULL
END"""

        expr5 = self.table.f.bucket(buckets, include_under=True)
        expected5 = """\
CASE
  WHEN `f` < 0 THEN 0
  WHEN (`f` >= 0) AND (`f` < 10) THEN 1
  WHEN (`f` >= 10) AND (`f` < 25) THEN 2
  WHEN (`f` >= 25) AND (`f` <= 50) THEN 3
  ELSE NULL
END"""

        expr6 = self.table.f.bucket(buckets,
                                    include_under=True,
                                    include_over=True)
        expected6 = """\
CASE
  WHEN `f` < 0 THEN 0
  WHEN (`f` >= 0) AND (`f` < 10) THEN 1
  WHEN (`f` >= 10) AND (`f` < 25) THEN 2
  WHEN (`f` >= 25) AND (`f` <= 50) THEN 3
  WHEN `f` > 50 THEN 4
  ELSE NULL
END"""

        expr7 = self.table.f.bucket(buckets,
                                    close_extreme=False,
                                    include_under=True,
                                    include_over=True)
        expected7 = """\
CASE
  WHEN `f` < 0 THEN 0
  WHEN (`f` >= 0) AND (`f` < 10) THEN 1
  WHEN (`f` >= 10) AND (`f` < 25) THEN 2
  WHEN (`f` >= 25) AND (`f` < 50) THEN 3
  WHEN `f` >= 50 THEN 4
  ELSE NULL
END"""

        expr8 = self.table.f.bucket(buckets, closed='right',
                                    close_extreme=False,
                                    include_under=True)
        expected8 = """\
CASE
  WHEN `f` <= 0 THEN 0
  WHEN (`f` > 0) AND (`f` <= 10) THEN 1
  WHEN (`f` > 10) AND (`f` <= 25) THEN 2
  WHEN (`f` > 25) AND (`f` <= 50) THEN 3
  ELSE NULL
END"""

        expr9 = self.table.f.bucket([10], closed='right',
                                    include_over=True,
                                    include_under=True)
        expected9 = """\
CASE
  WHEN `f` <= 10 THEN 0
  WHEN `f` > 10 THEN 1
  ELSE NULL
END"""

        expr10 = self.table.f.bucket([10], include_over=True,
                                     include_under=True)
        expected10 = """\
CASE
  WHEN `f` < 10 THEN 0
  WHEN `f` >= 10 THEN 1
  ELSE NULL
END"""

        cases = [
            (expr1, expected1),
            (expr2, expected2),
            (expr3, expected3),
            (expr4, expected4),
            (expr5, expected5),
            (expr6, expected6),
            (expr7, expected7),
            (expr8, expected8),
            (expr9, expected9),
            (expr10, expected10),
        ]
        self._check_expr_cases(cases)

    def test_cast_category_to_int_noop(self):
        # Because the bucket result is an integer, no explicit cast is
        # necessary
        expr = (self.table.f.bucket([10], include_over=True,
                                    include_under=True)
                .cast('int32'))

        expected = """\
CASE
  WHEN `f` < 10 THEN 0
  WHEN `f` >= 10 THEN 1
  ELSE NULL
END"""

        expr2 = (self.table.f.bucket([10], include_over=True,
                                     include_under=True)
                 .cast('double'))

        expected2 = """\
CAST(CASE
  WHEN `f` < 10 THEN 0
  WHEN `f` >= 10 THEN 1
  ELSE NULL
END AS double)"""

        self._check_expr_cases([(expr, expected),
                                (expr2, expected2)])

    def test_bucket_assign_labels(self):
        buckets = [0, 10, 25, 50]
        bucket = self.table.f.bucket(buckets, include_under=True)

        size = self.table.group_by(bucket.name('tier')).size()
        labelled = size.tier.label(['Under 0', '0 to 10',
                                    '10 to 25', '25 to 50'],
                                   nulls='error').name('tier2')
        expr = size[labelled, size['count']]

        expected = """\
SELECT
  CASE `tier`
    WHEN 0 THEN 'Under 0'
    WHEN 1 THEN '0 to 10'
    WHEN 2 THEN '10 to 25'
    WHEN 3 THEN '25 to 50'
    ELSE 'error'
  END AS `tier2`, `count`
FROM (
  SELECT
    CASE
      WHEN `f` < 0 THEN 0
      WHEN (`f` >= 0) AND (`f` < 10) THEN 1
      WHEN (`f` >= 10) AND (`f` < 25) THEN 2
      WHEN (`f` >= 25) AND (`f` <= 50) THEN 3
      ELSE NULL
    END AS `tier`, count(*) AS `count`
  FROM alltypes
  GROUP BY 1
) t0"""

        result = to_sql(expr)
        assert result == expected

        self.assertRaises(ValueError, size.tier.label,
                          ['a', 'b', 'c'])
        self.assertRaises(ValueError, size.tier.label,
                          ['a', 'b', 'c', 'd', 'e'])
class TestBuiltins(unittest.TestCase):

    def setUp(self):
        self.con = MockConnection()
        self.alltypes = self.con.table('functional_alltypes')
        self.lineitem = self.con.table('tpch_lineitem')

    def test_abs(self):
        colnames = ['tinyint_col', 'smallint_col', 'int_col',
                    'bigint_col', 'float_col', 'double_col']

        fname = 'abs'
        op = ops.Abs

        for col in colnames:
            expr = self.alltypes[col]
            self._check_unary_op(expr, fname, op, type(expr))

        expr = self.lineitem.l_extendedprice
        self._check_unary_op(expr, fname, op, type(expr))

    def test_group_concat(self):
        col = self.alltypes.string_col

        expr = col.group_concat()
        assert isinstance(expr.op(), ops.GroupConcat)
        arg, sep = expr.op().args
        assert sep == ','

        expr = col.group_concat('|')
        arg, sep = expr.op().args
        assert sep == '|'

    def test_zeroifnull(self):
        dresult = self.alltypes.double_col.zeroifnull()
        iresult = self.alltypes.int_col.zeroifnull()

        assert type(dresult.op()) == ops.ZeroIfNull
        assert type(dresult) == ir.DoubleArray

        # Impala upconverts all ints to bigint. Hmm.
        assert type(iresult) == type(iresult)

    def test_fillna(self):
        result = self.alltypes.double_col.fillna(5)
        assert isinstance(result, ir.DoubleArray)
        assert isinstance(result.op(), ops.IfNull)

        result = self.alltypes.bool_col.fillna(True)
        assert isinstance(result, ir.BooleanArray)

        # Retains type of caller (for now)
        result = self.alltypes.int_col.fillna(self.alltypes.bigint_col)
        assert isinstance(result, ir.Int32Array)

    def test_ceil_floor(self):
        cresult = self.alltypes.double_col.ceil()
        fresult = self.alltypes.double_col.floor()
        assert isinstance(cresult, ir.Int64Array)
        assert isinstance(fresult, ir.Int64Array)
        assert type(cresult.op()) == ops.Ceil
        assert type(fresult.op()) == ops.Floor

        cresult = ibis.literal(1.2345).ceil()
        fresult = ibis.literal(1.2345).floor()
        assert isinstance(cresult, ir.Int64Scalar)
        assert isinstance(fresult, ir.Int64Scalar)

        dec_col = self.lineitem.l_extendedprice
        cresult = dec_col.ceil()
        fresult = dec_col.floor()
        assert isinstance(cresult, ir.DecimalArray)
        assert cresult.meta == dec_col.meta
        assert isinstance(fresult, ir.DecimalArray)
        assert fresult.meta == dec_col.meta

    def test_sign(self):
        result = self.alltypes.double_col.sign()
        assert isinstance(result, ir.FloatArray)
        assert type(result.op()) == ops.Sign

        result = ibis.literal(1.2345).sign()
        assert isinstance(result, ir.FloatScalar)

        dec_col = self.lineitem.l_extendedprice
        result = dec_col.sign()
        assert isinstance(result, ir.FloatArray)

    def test_round(self):
        result = self.alltypes.double_col.round()
        assert isinstance(result, ir.Int64Array)
        assert result.op().args[1] is None

        result = self.alltypes.double_col.round(2)
        assert isinstance(result, ir.DoubleArray)
        assert result.op().args[1] == 2

        # Even integers are double (at least in Impala, check with other DB
        # implementations)
        result = self.alltypes.int_col.round(2)
        assert isinstance(result, ir.DoubleArray)

        dec = self.lineitem.l_extendedprice
        result = dec.round()
        assert isinstance(result, ir.DecimalArray)

        result = dec.round(2)
        assert isinstance(result, ir.DecimalArray)

        result = ibis.literal(1.2345).round()
        assert isinstance(result, ir.Int64Scalar)

    def _check_unary_op(self, expr, fname, ex_op, ex_type):
        result = getattr(expr, fname)()
        assert type(result.op()) == ex_op
        assert type(result) == ex_type
class TestStringBuiltins(unittest.TestCase, ExprSQLTest):

    def setUp(self):
        self.con = MockConnection()
        self.table = self.con.table('functional_alltypes')

    def test_unary_ops(self):
        s = self.table.string_col
        cases = [
            (s.lower(), 'lower(`string_col`)'),
            (s.upper(), 'upper(`string_col`)'),
            (s.reverse(), 'reverse(`string_col`)'),
            (s.strip(), 'trim(`string_col`)'),
            (s.lstrip(), 'ltrim(`string_col`)'),
            (s.rstrip(), 'rtrim(`string_col`)'),
            (s.capitalize(), 'initcap(`string_col`)'),
            (s.length(), 'length(`string_col`)'),
            (s.ascii_str(), 'ascii(`string_col`)')
        ]
        self._check_expr_cases(cases)

    def test_substr(self):
        # Database numbers starting from 1
        cases = [
            (self.table.string_col.substr(2),
             'substr(`string_col`, 2 + 1)'),
            (self.table.string_col.substr(0, 3),
             'substr(`string_col`, 0 + 1, 3)')
        ]
        self._check_expr_cases(cases)

    def test_strright(self):
        cases = [
            (self.table.string_col.right(4), 'strright(`string_col`, 4)')
        ]
        self._check_expr_cases(cases)

    def test_like(self):
        cases = [
            (self.table.string_col.like('foo%'), "`string_col` LIKE 'foo%'")
        ]
        self._check_expr_cases(cases)

    def test_rlike(self):
        ex = "`string_col` RLIKE '[\d]+'"
        cases = [
            (self.table.string_col.rlike('[\d]+'), ex),
            (self.table.string_col.re_search('[\d]+'), ex),
        ]
        self._check_expr_cases(cases)

    def test_re_extract(self):
        sql = "regexp_extract(`string_col`, '[\d]+', 0)"
        cases = [
            (self.table.string_col.re_extract('[\d]+', 0), sql)
        ]
        self._check_expr_cases(cases)

    def test_re_replace(self):
        sql = "regexp_replace(`string_col`, '[\d]+', 'aaa')"
        cases = [
            (self.table.string_col.re_replace('[\d]+', 'aaa'), sql)
        ]
        self._check_expr_cases(cases)

    def test_parse_url(self):
        sql = "parse_url(`string_col`, 'HOST')"
        cases = [
            (self.table.string_col.parse_url('HOST'), sql)
        ]
        self._check_expr_cases(cases)

    def test_repeat(self):
        cases = [
            (self.table.string_col.repeat(2), 'repeat(`string_col`, 2)')
        ]
        self._check_expr_cases(cases)

    def test_translate(self):
        cases = [
            (self.table.string_col.translate('a', 'b'),
             "translate(`string_col`, 'a', 'b')")
        ]
        self._check_expr_cases(cases)

    def test_find(self):
        s = self.table.string_col
        i1 = self.table.tinyint_col
        cases = [
            (s.find('a'), "locate('a', `string_col`) - 1"),
            (s.find('a', 2), "locate('a', `string_col`, 3) - 1"),
            (s.find('a', start=i1),
             "locate('a', `string_col`, `tinyint_col` + 1) - 1")
        ]
        self._check_expr_cases(cases)

    def test_lpad(self):
        cases = [
            (self.table.string_col.lpad(1, 'a'),
             "lpad(`string_col`, 1, 'a')"),
            (self.table.string_col.lpad(25),
             "lpad(`string_col`, 25, ' ')")
        ]
        self._check_expr_cases(cases)

    def test_rpad(self):
        cases = [
            (self.table.string_col.rpad(1, 'a'),
             "rpad(`string_col`, 1, 'a')"),
            (self.table.string_col.rpad(25),
             "rpad(`string_col`, 25, ' ')")
        ]
        self._check_expr_cases(cases)

    def test_find_in_set(self):
        cases = [
            (self.table.string_col.find_in_set(['a']),
             "find_in_set(`string_col`, 'a') - 1"),
            (self.table.string_col.find_in_set(['a', 'b']),
             "find_in_set(`string_col`, 'a,b') - 1")
        ]
        self._check_expr_cases(cases)

    def test_string_join(self):
        cases = [
            (L(',').join(['a', 'b']), "concat_ws(',', 'a', 'b')")
        ]
        self._check_expr_cases(cases)
# Module-level pytest fixture (decorator restored; assumes `pytest` is
# imported at the top of the module).
@pytest.fixture
def con():
    return MockConnection()
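# Usage sketch (not from the original suite): pytest injects the
# MockConnection returned by `con` into any test that declares a matching
# parameter. The test name below is hypothetical.
#
#   def test_table_lookup(con):
#       t = con.table('functional_alltypes')
#       assert 'int_col' in t.columns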
def setUp(self):
    self.con = MockConnection()
    self.t = self.con.table('functional_alltypes')
class TestTimestamp(unittest.TestCase):

    def setUp(self):
        self.con = MockConnection()
        self.alltypes = self.con.table('alltypes')
        self.col = self.alltypes.i

    def test_field_select(self):
        assert isinstance(self.col, ir.TimestampArray)

    def test_string_cast_to_timestamp(self):
        casted = self.alltypes.g.cast('timestamp')
        assert isinstance(casted, ir.TimestampArray)

        string = api.literal('2000-01-01')
        casted = string.cast('timestamp')
        assert isinstance(casted, ir.TimestampScalar)

    def test_extract_fields(self):
        # type-size may be database specific
        cases = [
            ('year', ops.ExtractYear, ir.Int32Array),
            ('month', ops.ExtractMonth, ir.Int32Array),
            ('day', ops.ExtractDay, ir.Int32Array),
            ('hour', ops.ExtractHour, ir.Int32Array),
            ('minute', ops.ExtractMinute, ir.Int32Array),
            ('second', ops.ExtractSecond, ir.Int32Array),
            ('millisecond', ops.ExtractMillisecond, ir.Int32Array),
        ]

        for attr, ex_op, ex_type in cases:
            result = getattr(self.col, attr)()
            assert result.get_name() == attr
            assert isinstance(result, ex_type)
            assert isinstance(result.op(), ex_op)

    def test_now(self):
        result = api.now()
        assert isinstance(result, ir.TimestampScalar)
        assert isinstance(result.op(), ops.TimestampNow)

    def test_timestamp_literals(self):
        ts_str = '2015-01-01 00:00:00'
        val = pd.Timestamp(ts_str)

        expr = ibis.literal(val)
        assert isinstance(expr, ir.TimestampScalar)

        expr = ibis.timestamp(ts_str)
        assert isinstance(expr, ir.TimestampScalar)

        self.assertRaises(ValueError, ibis.timestamp, '2015-01-01 00:71')

    def test_integer_to_timestamp(self):
        # #246
        pass

    def test_comparison_timestamp(self):
        expr = self.col > (self.col.min() + ibis.day(3))
        assert isinstance(expr, ir.BooleanArray)

    def test_comparisons_string(self):
        val = '2015-01-01 00:00:00'
        expr = self.col > val
        op = expr.op()
        assert isinstance(op.right, ir.TimestampScalar)

        # The reflected comparison keeps the column on the left, so the
        # resulting op is Greater rather than Less
        expr2 = val < self.col
        op = expr2.op()
        assert isinstance(op, ops.Greater)
        assert isinstance(op.right, ir.TimestampScalar)

    def test_comparisons_pandas_timestamp(self):
        val = pd.Timestamp('2015-01-01 00:00:00')
        expr = self.col > val
        op = expr.op()
        assert isinstance(op.right, ir.TimestampScalar)
class TestWrapping(unittest.TestCase):

    def setUp(self):
        self.con = MockConnection()
        self.table = self.con.table('functional_alltypes')

        self.i8 = self.table.tinyint_col
        self.i16 = self.table.smallint_col
        self.i32 = self.table.int_col
        self.i64 = self.table.bigint_col
        self.d = self.table.double_col
        self.f = self.table.float_col
        self.s = self.table.string_col
        self.b = self.table.bool_col
        self.t = self.table.timestamp_col
        self.dec = self.con.table('tpch_customer').c_acctbal
        self.all_cols = [self.i8, self.i16, self.i32, self.i64,
                         self.d, self.f, self.dec,
                         self.s, self.b, self.t]

    def test_sql_generation(self):
        func = api.scalar_function(['string'], 'string', name='Tester')
        func.register('identity', 'udf_testing')

        result = func('hello world')
        assert (ibis.impala.compile(result) ==
                "SELECT udf_testing.identity('hello world') AS `tmp`")

    def test_sql_generation_from_infoclass(self):
        func = api.wrap_udf('test.so', ['string'], 'string', 'info_test')
        repr(func)

        func.register('info_test', 'udf_testing')
        result = func('hello world')
        assert (ibis.impala.compile(result) ==
                "SELECT udf_testing.info_test('hello world') AS `tmp`")

    def test_udf_primitive_output_types(self):
        types = [
            ('boolean', True, self.b),
            ('int8', 1, self.i8),
            ('int16', 1, self.i16),
            ('int32', 1, self.i32),
            ('int64', 1, self.i64),
            ('float', 1.0, self.f),
            ('double', 1.0, self.d),
            ('string', '1', self.s),
            ('timestamp', ibis.timestamp('1961-04-10'), self.t)
        ]
        for t, sv, av in types:
            func = self._register_udf([t], t, 'test')

            ibis_type = validate_type(t)

            expr = func(sv)
            assert type(expr) == type(ibis_type.scalar_type()(expr.op()))  # noqa: E501, E721
            expr = func(av)
            assert type(expr) == type(ibis_type.array_type()(expr.op()))  # noqa: E501, E721

    def test_uda_primitive_output_types(self):
        types = [
            ('boolean', True, self.b),
            ('int8', 1, self.i8),
            ('int16', 1, self.i16),
            ('int32', 1, self.i32),
            ('int64', 1, self.i64),
            ('float', 1.0, self.f),
            ('double', 1.0, self.d),
            ('string', '1', self.s),
            ('timestamp', ibis.timestamp('1961-04-10'), self.t)
        ]
        for t, sv, av in types:
            func = self._register_uda([t], t, 'test')

            ibis_type = validate_type(t)

            expr1 = func(sv)
            expr2 = func(sv)
            expected_type1 = type(ibis_type.scalar_type()(expr1.op()))
            expected_type2 = type(ibis_type.scalar_type()(expr2.op()))
            assert isinstance(expr1, expected_type1)
            assert isinstance(expr2, expected_type2)

    def test_decimal(self):
        func = self._register_udf(['decimal(9,0)'], 'decimal(9,0)', 'test')
        expr = func(1.0)
        assert type(expr) == ir.DecimalScalar
        expr = func(self.dec)
        assert type(expr) == ir.DecimalColumn

    def test_udf_invalid_typecasting(self):
        cases = [
            ('int8', self.all_cols[:1], self.all_cols[1:]),
            ('int16', self.all_cols[:2], self.all_cols[2:]),
            ('int32', self.all_cols[:3], self.all_cols[3:]),
            ('int64', self.all_cols[:4], self.all_cols[4:]),
            ('boolean', [], self.all_cols[:8] + self.all_cols[9:]),
            # allowing double here for now
            ('float', self.all_cols[:4],
             [self.s, self.b, self.t, self.dec]),
            ('double', self.all_cols[:4],
             [self.s, self.b, self.t, self.dec]),
            ('string', [], self.all_cols[:7] + self.all_cols[8:]),
            ('timestamp', [], self.all_cols[:-1]),
            ('decimal', [], self.all_cols[:4] + self.all_cols[7:])
        ]

        for t, valid_casts, invalid_casts in cases:
            func = self._register_udf([t], 'int32', 'typecast')

            for expr in valid_casts:
                func(expr)

            for expr in invalid_casts:
                self.assertRaises(IbisTypeError, func, expr)

    def test_mult_args(self):
        func = self._register_udf(['int32', 'double', 'string',
                                   'boolean', 'timestamp'],
                                  'int64', 'mult_types')

        expr = func(self.i32, self.d, self.s, self.b, self.t)
        assert issubclass(type(expr), ir.ColumnExpr)

        expr = func(1, 1.0, 'a', True, ibis.timestamp('1961-04-10'))
        assert issubclass(type(expr), ir.ScalarExpr)

    def _register_udf(self, inputs, output, name):
        func = api.scalar_function(inputs, output, name=name)
        func.register(name, 'ibis_testing')
        return func

    def _register_uda(self, inputs, output, name):
        func = api.aggregate_function(inputs, output, name=name)
        func.register(name, 'ibis_testing')
        return func
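# Usage sketch (assumptions flagged inline; not part of the original suite):
# the same wrap/register/call flow against a real cluster. `wrap_udf` takes
# the shared-object location, input/output types, and the compiled symbol
# name, as exercised in test_sql_generation_from_infoclass above.
#
#   func = api.wrap_udf('/udfs/libudf.so',      # assumed HDFS path
#                       ['string'], 'string',
#                       'MyUdfSymbol',          # assumed symbol name
#                       name='my_udf')
#   func.register('my_udf', 'udf_testing')      # bind to a database
#   expr = func(table.string_col)               # a string column expression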
def setUp(self):
    self.con = MockConnection()
    self.table = self.con.table('alltypes')
def setUp(self):
    self.con = MockConnection()
    self.name = 'test_name'
    self.inputs = ['string', 'string']
    self.output = 'int64'
def setUp(self):
    self.con = MockConnection()
    self.alltypes = self.con.table('functional_alltypes')
    self.lineitem = self.con.table('tpch_lineitem')
class TestCreateTable(unittest.TestCase):

    def setUp(self):
        self.con = MockConnection()

        self.t = t = self.con.table('functional_alltypes')
        self.expr = t[t.bigint_col > 0]

    def test_create_external_table_as(self):
        path = '/path/to/table'
        select = build_ast(self.con.table('test1')).queries[0]
        statement = ddl.CTAS('another_table',
                             select,
                             external=True,
                             can_exist=False,
                             path=path,
                             database='foo')
        result = statement.compile()

        expected = """\
CREATE EXTERNAL TABLE foo.`another_table`
STORED AS PARQUET
LOCATION '{0}'
AS
SELECT *
FROM test1""".format(path)
        assert result == expected

    def test_create_table_with_location(self):
        path = '/path/to/table'
        schema = ibis.schema([('foo', 'string'),
                              ('bar', 'int8'),
                              ('baz', 'int16')])
        statement = ddl.CreateTableWithSchema('another_table', schema,
                                              ddl.NoFormat(),
                                              can_exist=False,
                                              path=path,
                                              database='foo')
        result = statement.compile()

        expected = """\
CREATE TABLE foo.`another_table`
(`foo` string,
 `bar` tinyint,
 `baz` smallint)
LOCATION '{0}'""".format(path)
        assert result == expected

    def test_create_table_like_parquet(self):
        directory = '/path/to/'
        path = '/path/to/parquetfile'
        statement = ddl.CreateTableParquet('new_table',
                                           directory,
                                           example_file=path,
                                           can_exist=True,
                                           database='foo')
        result = statement.compile()

        expected = """\
CREATE EXTERNAL TABLE IF NOT EXISTS foo.`new_table`
LIKE PARQUET '{0}'
STORED AS PARQUET
LOCATION '{1}'""".format(path, directory)

        assert result == expected

    def test_create_table_parquet_like_other(self):
        # alternative to "LIKE PARQUET"
        directory = '/path/to/'
        example_table = 'db.other'

        statement = ddl.CreateTableParquet('new_table',
                                           directory,
                                           example_table=example_table,
                                           can_exist=True,
                                           database='foo')
        result = statement.compile()

        expected = """\
CREATE EXTERNAL TABLE IF NOT EXISTS foo.`new_table`
LIKE {0}
STORED AS PARQUET
LOCATION '{1}'""".format(example_table, directory)

        assert result == expected

    def test_create_table_parquet_with_schema(self):
        directory = '/path/to/'

        schema = ibis.schema([('foo', 'string'),
                              ('bar', 'int8'),
                              ('baz', 'int16')])

        statement = ddl.CreateTableParquet('new_table',
                                           directory,
                                           schema=schema,
                                           external=True,
                                           can_exist=True,
                                           database='foo')
        result = statement.compile()

        expected = """\
CREATE EXTERNAL TABLE IF NOT EXISTS foo.`new_table`
(`foo` string,
 `bar` tinyint,
 `baz` smallint)
STORED AS PARQUET
LOCATION '{0}'""".format(directory)

        assert result == expected

    def test_create_table_delimited(self):
        path = '/path/to/files/'
        schema = ibis.schema([('a', 'string'),
                              ('b', 'int32'),
                              ('c', 'double'),
                              ('d', 'decimal(12,2)')])

        stmt = ddl.CreateTableDelimited('new_table', path, schema,
                                        delimiter='|',
                                        escapechar='\\',
                                        lineterminator='\0',
                                        database='foo',
                                        can_exist=True)
        result = stmt.compile()

        expected = """\
CREATE EXTERNAL TABLE IF NOT EXISTS foo.`new_table`
(`a` string,
 `b` int,
 `c` double,
 `d` decimal(12,2))
ROW FORMAT DELIMITED
FIELDS TERMINATED BY '|'
ESCAPED BY '\\'
LINES TERMINATED BY '\0'
LOCATION '{0}'""".format(path)
        assert result == expected

    def test_create_external_table_avro(self):
        path = '/path/to/files/'

        avro_schema = {
            'fields': [
                {'name': 'a', 'type': 'string'},
                {'name': 'b', 'type': 'int'},
                {'name': 'c', 'type': 'double'},
                {"type": "bytes",
                 "logicalType": "decimal",
                 "precision": 4,
                 "scale": 2,
                 'name': 'd'}
            ],
            'name': 'my_record',
            'type': 'record'
        }

        stmt = ddl.CreateTableAvro('new_table', path, avro_schema,
                                   database='foo',
                                   can_exist=True)
        result = stmt.compile()

        expected = """\
CREATE EXTERNAL TABLE IF NOT EXISTS foo.`new_table`
STORED AS AVRO
LOCATION '%s'
TBLPROPERTIES ('avro.schema.literal'='{
  "fields": [
    {
      "name": "a",
      "type": "string"
    },
    {
      "name": "b",
      "type": "int"
    },
    {
      "name": "c",
      "type": "double"
    },
    {
      "logicalType": "decimal",
      "name": "d",
      "precision": 4,
      "scale": 2,
      "type": "bytes"
    }
  ],
  "name": "my_record",
  "type": "record"
}')""" % path

        assert result == expected

    def test_create_table_parquet(self):
        statement = _create_table('some_table', self.expr,
                                  database='bar',
                                  can_exist=False)
        result = statement.compile()

        expected = """\
CREATE TABLE bar.`some_table`
STORED AS PARQUET
AS
SELECT *
FROM functional_alltypes
WHERE `bigint_col` > 0"""
        assert result == expected

    def test_no_overwrite(self):
        statement = _create_table('tname', self.expr, can_exist=True)
        result = statement.compile()

        expected = """\
CREATE TABLE IF NOT EXISTS `tname`
STORED AS PARQUET
AS
SELECT *
FROM functional_alltypes
WHERE `bigint_col` > 0"""
        assert result == expected

    def test_avro_other_formats(self):
        statement = _create_table('tname', self.t, format='avro',
                                  can_exist=True)
        result = statement.compile()
        expected = """\
CREATE TABLE IF NOT EXISTS `tname`
STORED AS AVRO
AS
SELECT *
FROM functional_alltypes"""
        assert result == expected

        self.assertRaises(ValueError, _create_table, 'tname', self.t,
                          format='foo')

    def test_partition_by(self):
        pass