def test_kudu_schema_convert(self):
    """Build a Kudu schema column by column and check its ibis conversion."""
    # Each row: name, ibis type, kudu type name, is_nullable, is_primary_key
    columns = [
        ('a', dt.Int8(False), 'int8', False, True),
        ('b', dt.Int16(False), 'int16', False, True),
        ('c', dt.Int32(False), 'int32', False, False),
        ('d', dt.Int64(True), 'int64', True, False),
        ('e', dt.String(True), 'string', True, False),
        ('f', dt.Boolean(False), 'bool', False, False),
        ('g', dt.Float(False), 'float', False, False),
        ('h', dt.Double(True), 'double', True, False),
        # TODO: binary columns are not exercised yet
        # ('i', 'binary', False, False),
        ('j', dt.Timestamp(True), 'timestamp', True, False),
    ]

    builder = kudu.schema_builder()
    for col_name, _, kudu_type, nullable, _ in columns:
        builder.add_column(col_name, kudu_type, nullable=nullable)

    key_names = [row[0] for row in columns if row[4]]
    builder.set_primary_keys(key_names)
    kudu_schema = builder.build()

    converted = ksupport.schema_kudu_to_ibis(kudu_schema)
    wanted = ibis.schema([(row[0], row[1]) for row in columns])
    assert_equal(converted, wanted)
def test_database_layer(con, alltypes):
    """The object from ``con.database()`` mirrors the connection's tables."""
    database = con.database()
    via_attribute = database.functional_alltypes

    assert_equal(via_attribute, alltypes)
    assert database.list_tables() == con.list_tables()
def test_window_bind_to_table(t):
    """Binding a string-keyed window to a table resolves the column names."""
    unbound = ibis.window(group_by="g", order_by=ibis.desc("f"))
    bound = unbound.bind(t)

    wanted = ibis.window(group_by=t.g, order_by=ibis.desc(t.f))
    assert_equal(bound, wanted)
def test_sqla_schema_conversion(self):
    """Convert a SQLAlchemy table definition into an ibis schema."""
    # Each row: column name, SQLAlchemy type, nullable, expected ibis type
    typespec = [
        ("smallint", sat.SmallInteger, False, dt.int16),
        ("int", sat.Integer, True, dt.int32),
        ("integer", sat.INTEGER(), True, dt.int64),
        ("bigint", sat.BigInteger, False, dt.int64),
        ("real", sat.REAL, True, dt.double),
        ("bool", sat.Boolean, True, dt.boolean),
        ("timestamp", sat.DateTime, True, dt.timestamp),
    ]

    sqla_columns = [
        sa.Column(col_name, sa_type, nullable=nullable)
        for col_name, sa_type, nullable, _ in typespec
    ]
    # The expected ibis type is called with the column's nullability.
    ibis_pairs = [
        (col_name, make_type(nullable))
        for col_name, _, nullable, make_type in typespec
    ]

    table = sa.Table("tname", self.meta, *sqla_columns)
    converted = alch.schema_from_table(table)
    assert_equal(converted, ibis.schema(ibis_pairs))
def test_mutate(table):
    """``mutate`` with positional and keyword exprs equals a projection."""
    doubled_f = table.f * 2
    named_sum = (table.a + table.b).name('foo')

    result = table.mutate(named_sum, one=doubled_f, two=2)

    # Keyword arguments become named columns; the scalar 2 becomes a literal.
    wanted = table[
        table,
        named_sum,
        doubled_f.name('one'),
        ibis.literal(2).name('two'),
    ]
    assert_equal(result, wanted)
def test_create_table_with_partition_column(con, temp_table_db):
    """Create a partitioned table and check its full and partition schemas."""
    tmp_db, name = temp_table_db

    declared = ibis.schema(
        [
            ('year', 'int32'),
            ('month', 'string'),
            ('day', 'int8'),
            ('value', 'double'),
        ]
    )
    con.create_table(
        name, schema=declared, database=tmp_db, partition=['year', 'month']
    )

    # The partition columns are expected at the end of the table schema.
    wanted_table_schema = ibis.schema(
        [
            ('day', 'int8'),
            ('value', 'double'),
            ('year', 'int32'),
            ('month', 'string'),
        ]
    )
    assert_equal(con.get_schema(name, database=tmp_db), wanted_table_schema)

    created = con.database(tmp_db).table(name)
    wanted_partition_schema = ibis.schema(
        [('year', 'int32'), ('month', 'string')]
    )
    assert_equal(created.partition_schema(), wanted_partition_schema)
def test_group_by_kwargs(table):
    """Keyword arguments to ``group_by`` act as named grouping keys."""
    t = table

    via_kwargs = t.group_by(['f', t.h], z='g', z2=t.d)
    result = via_kwargs.aggregate(t.d.mean().name('foo'))

    explicit_keys = ['f', t.h, t.g.name('z'), t.d.name('z2')]
    wanted = t.group_by(explicit_keys).aggregate(t.d.mean().name('foo'))

    assert_equal(result, wanted)
def test_set_column(self):
    """``set_column`` accepts a callable that is applied to the table."""

    def double_f(tbl):
        return tbl.f * 2

    from_callable = self.table.set_column('f', double_f)
    from_expr = self.table.set_column('f', self.table.f * 2)
    assert_equal(from_callable, from_expr)
def test_add_column(self):
    """``add_column`` with a callable matches ``mutate`` with the same callable."""

    def double_f(tbl):
        return tbl.f * 2

    added = self.table.add_column(double_f, name='foo')
    mutated = self.table.mutate(foo=double_f)
    assert_equal(added, mutated)
def test_summary_expand_list(self):
    """A summary passed to ``aggregate`` is expanded into its expressions."""
    table = self.table
    summary = table.f.summary()
    concat_metric = table.g.group_concat().name('bar')

    result = table.aggregate([concat_metric, summary])

    flattened = [concat_metric] + summary.exprs()
    wanted = table.aggregate(flattened)
    assert_equal(result, wanted)
def test_rewrite_join_projection_without_other_ops(self):
    """Parent substitution lifts a projection onto the unfiltered join roots.

    Filters and other commutative table operations are dropped, and join
    predicates are "lifted" to reference the base, unmodified join roots.
    """
    # Star schema with a fact table
    fact = self.con.table('star1')
    dim2 = self.con.table('star2')
    dim3 = self.con.table('star3')

    filtered = fact[fact['f'] > 0]

    # The first predicate references the base table, the second the filter.
    pred1 = fact['foo_id'] == dim2['foo_id']
    pred2 = filtered['bar_id'] == dim3['bar_id']

    first_join = filtered.left_join(dim2, [pred1])
    second_join = first_join.inner_join(dim3, [pred2])

    # Project out the desired fields
    view = second_join[[filtered, dim2['value1'], dim3['value2']]]

    # Construct the expression we expect to obtain
    lifted_pred2 = fact['bar_id'] == dim3['bar_id']
    base_join = fact.left_join(dim2, [pred1])
    wanted_join = base_join.inner_join(dim3, [lifted_pred2])

    rewritten_op = L.substitute_parents(view).op()
    assert_equal(rewritten_op.table, wanted_join)

    # Ensure that the filtered table has been substituted with the base table
    assert rewritten_op.selections[0] is fact
def test_value_counts_convenience(self):
    """``value_counts`` equals grouping by the column and counting (#152)."""
    table = self.table
    result = table.g.value_counts()

    row_count = table.count().name('count')
    wanted = table.group_by('g').aggregate(row_count)
    assert_equal(result, wanted)
def test_unravel_compound_equijoin(table):
    """An and-ed compound join predicate equals a list of its parts.

    The ``table`` argument is not used by the body.
    """
    key_columns = [
        ('key1', 'string'),
        ('key2', 'string'),
        ('key3', 'string'),
    ]
    left = ibis.table(key_columns + [('value1', 'double')], 'foo_table')
    right = ibis.table(key_columns + [('value2', 'double')], 'bar_table')

    on_key1 = left.key1 == right.key1
    on_key2 = left.key2 == right.key2
    on_key3 = left.key3 == right.key3

    compound = on_key1 & on_key2 & on_key3
    joined = left.inner_join(right, [compound])
    wanted = left.inner_join(right, [on_key1, on_key2, on_key3])
    assert_equal(joined, wanted)
def test_create_table_with_partition_column(self):
    """Create a partitioned table and check its full and partition schemas."""
    declared = ibis.schema(
        [
            ('year', 'int32'),
            ('month', 'int8'),
            ('day', 'int8'),
            ('value', 'double'),
        ]
    )

    name = _tmp_name()
    self.con.create_table(
        name,
        schema=declared,
        database=self.tmp_db,
        partition=['year', 'month'],
        location=self._temp_location(),
    )
    # Record the table name in the instance's temp-table list.
    self.temp_tables.append(name)

    # The partition columns are expected at the end of the table schema.
    wanted_table_schema = ibis.schema(
        [
            ('day', 'int8'),
            ('value', 'double'),
            ('year', 'int32'),
            ('month', 'int8'),
        ]
    )
    actual_table_schema = self.con.get_schema(name, database=self.tmp_db)
    assert_equal(actual_table_schema, wanted_table_schema)

    actual_partition_schema = self.db.table(name).partition_schema()
    wanted_partition_schema = ibis.schema(
        [('year', 'int32'), ('month', 'int8')]
    )
    assert_equal(actual_partition_schema, wanted_partition_schema)
def test_having(table):
    """``having`` accepts a callable in place of a bound expression."""
    mutated = table.mutate(foo=table.f * 2, bar=table.e / 2)

    with_callable = mutated.group_by('foo').having(lambda x: x.foo.sum() > 10)
    with_expr = mutated.group_by('foo').having(mutated.foo.sum() > 10)

    assert_equal(with_callable.size(), with_expr.size())
def test_contains(table):
    """``contains`` equals ``find(...) >= 0``; the ``in`` operator raises."""
    column = table.g
    assert_equal(column.contains('foo'), column.find('foo') >= 0)

    # Python's ``in`` operator on the column is expected to raise TypeError.
    with pytest.raises(TypeError):
        'foo' in table.g
def test_self_join(self):
    """Join a table with a view of itself, then project and aggregate.

    Self-joins are problematic with this design because column expressions
    may reference either the left or right self. For example:

        SELECT left.key, sum(left.value - right.value) as total_deltas
        FROM table left
          INNER JOIN table right
            ON left.current_period = right.previous_period + 1
        GROUP BY 1

    One way around the self-join issue is to force the user to add prefixes
    to the joined fields, then project using those. Not that satisfying,
    though.
    """
    left = self.table
    right = self.table.view()

    mean_delta = (left['a'] - right['b']).mean().name('metric')
    join_pred = right['g'] == left['g']
    joined = left.inner_join(right, [join_pred])

    # Basic check that there are no referential problems: both sides must
    # appear as distinct references in the repr.
    joined_repr = repr(joined)
    for ref_name in ('ref_0', 'ref_1'):
        assert ref_name in joined_repr

    # Cannot be immediately materialized because of the schema overlap
    self.assertRaises(RelationError, joined.materialize)

    # Project out the left table schema
    left_only = joined[[left]]
    assert_equal(left_only.schema(), left.schema())

    # Try aggregating on top of the join
    aggregated = joined.aggregate([mean_delta], by=[left['g']])
    wanted_schema = api.Schema(['g', 'metric'], ['string', 'double'])
    assert_equal(aggregated.schema(), wanted_schema)
def test_database_layer(self):
    """The object from ``con.database()`` mirrors the connection's tables."""
    database = self.con.database()
    via_attribute = database.functional_alltypes

    assert_equal(via_attribute, self.alltypes)
    assert database.list_tables() == self.con.list_tables()
def test_set_column(table):
    """``set_column`` accepts a callable that is applied to the table."""

    def double_f(tbl):
        return tbl.f * 2

    from_callable = table.set_column('f', double_f)
    from_expr = table.set_column('f', table.f * 2)
    assert_equal(from_callable, from_expr)
def test_null(self):
    """``ibis.literal(None)`` is a null scalar equal to ``ibis.null()``."""
    from_literal = ibis.literal(None)

    assert isinstance(from_literal, ir.NullScalar)
    assert isinstance(from_literal.op(), ir.NullLiteral)

    assert_equal(from_literal, ibis.null())
def test_null():
    """``ibis.literal(None)`` is a null scalar equal to ``ibis.null()``."""
    from_literal = ibis.literal(None)

    assert isinstance(from_literal, ir.NullScalar)
    assert isinstance(from_literal.op(), ir.NullLiteral)
    # The wrapped argument carries a plain ``None`` value.
    assert from_literal._arg.value is None

    assert_equal(from_literal, ibis.null())
def test_groupby_mutate(table):
    """Grouped ``mutate`` accepts callables in place of bound expressions."""
    t = table
    grouped = t.group_by('g').order_by('f')

    with_callables = grouped.mutate(
        foo=lambda x: x.f.lag(),
        bar=lambda x: x.f.rank(),
    )
    with_exprs = grouped.mutate(foo=t.f.lag(), bar=t.f.rank())

    assert_equal(with_callables, with_exprs)
def test_replace_column(table):
    """``set_column`` on an existing column equals an in-place projection.

    The ``table`` argument is not used by the body; a fresh table is built.
    """
    tb = api.table([('a', 'int32'), ('b', 'double'), ('c', 'string')])

    b_as_int = tb.b.cast('int32')
    replaced = tb.set_column('b', b_as_int)

    # The replacement column is expected between ``a`` and ``c``.
    wanted = tb[tb.a, b_as_int.name('b'), tb.c]
    assert_equal(replaced, wanted)
def test_coalesce_instance_method(self):
    """The ``coalesce`` method matches the ``ibis.coalesce`` function."""
    table = self.table
    first = table.v7
    second = table.v5.cast('string')
    third = table.v8.cast('string')

    via_method = first.coalesce(second, third, 'foo')
    via_function = ibis.coalesce(first, second, third, 'foo')
    assert_equal(via_method, via_function)
def test_sql_with_limit(self):
    """A raw SQL query with LIMIT has the underlying table's schema."""
    query = """\
SELECT *
FROM functional_alltypes
LIMIT 10"""
    from_sql = self.con.sql(query)

    wanted_schema = self.con.get_schema('functional_alltypes')
    assert_equal(from_sql.schema(), wanted_schema)
def test_query_parquet_file_like_table(self):
    """``parquet_file`` with ``like_table`` yields the expected schema."""
    hdfs_path = pjoin(self.test_data_dir, "parquet/tpch_region")

    wanted_schema = ibis.schema(
        [
            ("r_regionkey", "int16"),
            ("r_name", "string"),
            ("r_comment", "string"),
        ]
    )

    region = self.con.parquet_file(hdfs_path, like_table="tpch_region")
    assert_equal(region.schema(), wanted_schema)
def test_mutate(self):
    """``mutate`` with positional and keyword exprs equals a projection."""
    doubled_f = self.table.f * 2
    named_sum = (self.table.a + self.table.b).name('foo')

    result = self.table.mutate(named_sum, one=doubled_f, two=2)

    # Keyword arguments become named columns; the scalar 2 becomes a literal.
    wanted = self.table[
        self.table,
        named_sum,
        doubled_f.name('one'),
        ibis.literal(2).name('two'),
    ]
    assert_equal(result, wanted)
def test_join_no_predicate_list(self):
    """A bare join predicate behaves like a one-element predicate list."""
    region = self.con.table('tpch_region')
    nation = self.con.table('tpch_nation')

    on_region_key = region.r_regionkey == nation.n_regionkey

    bare = region.inner_join(nation, on_region_key)
    listed = region.inner_join(nation, [on_region_key])
    assert_equal(bare, listed)
def test_window_bind_to_table(self):
    """Binding a string-keyed window to a table resolves the column names."""
    unbound = ibis.window(group_by='g', order_by=ibis.desc('f'))
    bound = unbound.bind(self.t)

    wanted = ibis.window(group_by=self.t.g, order_by=ibis.desc(self.t.f))
    assert_equal(bound, wanted)
def test_getitem_slice(self):
    """Slicing a string column equals the corresponding ``substr`` call."""
    column = self.table.g

    # [:3] -> start 0, length 3
    assert_equal(column[:3], column.substr(0, 3))
    # [2:6] -> start 2, length 4
    assert_equal(column[2:6], column.substr(2, 4))
def test_projection_array_expr(self):
    """Indexing by a single column equals indexing by a one-column list."""
    table = self.table
    bare = table[table.a]
    listed = table[[table.a]]
    assert_equal(bare, listed)
def test_sql_with_limit(con):
    """A raw SQL query with LIMIT has the underlying table's schema."""
    from_sql = con.sql("SELECT * FROM functional_alltypes LIMIT 10")
    wanted_schema = con.get_schema('functional_alltypes')
    assert_equal(from_sql.schema(), wanted_schema)
def test_add_column_proxies_to_mutate(table):
    """``add_column`` with a name equals ``mutate`` with that keyword."""
    added = table.add_column(ibis.now().cast('date'), name='date')
    mutated = table.mutate(date=ibis.now().cast('date'))
    assert_equal(added, mutated)
def test_get_schema(con, test_data_db):
    """``get_schema`` agrees with the schema of the table expression."""
    lineitem = con.table('tpch_lineitem')
    fetched = con.get_schema('tpch_lineitem', database=test_data_db)
    assert_equal(lineitem.schema(), fetched)
def test_projection_convenient_syntax(self):
    """Tuple-style indexing projects the same as indexing with a list."""
    table = self.table

    tuple_style = table[table, table['a'].name('foo')]
    list_style = table[[table, table['a'].name('foo')]]
    assert_equal(tuple_style, list_style)
def test_projection_self(table):
    """Indexing a table by itself equals projecting it onto itself."""
    via_getitem = table[table]
    via_method = table.projection(table)
    assert_equal(via_getitem, via_method)
def test_topk_function_late_bind(airlines):
    """``topk`` accepts a callable for ``by`` (GH #520)."""
    with_callable = airlines.dest.topk(5, by=lambda x: x.arrdelay.mean())
    with_expr = airlines.dest.topk(5, by=airlines.arrdelay.mean())

    assert_equal(with_callable.to_aggregation(), with_expr.to_aggregation())
def test_distinct_count(dtable):
    """``distinct().count()`` equals ``nunique()`` named 'count'."""
    column = dtable.string_col

    result = column.distinct().count()
    wanted = column.nunique().name('count')

    assert_equal(result, wanted)
    assert isinstance(result.op(), ops.CountDistinct)
def test_projection_convenient_syntax(table):
    """Tuple-style indexing projects the same as indexing with a list."""
    tuple_style = table[table, table['a'].name('foo')]
    list_style = table[[table, table['a'].name('foo')]]
    assert_equal(tuple_style, list_style)
def test_projection_no_list(table):
    """``select`` with a bare expression equals ``projection`` with a list."""
    doubled = (table.f * 2).name('bar')

    via_select = table.select(doubled)
    via_projection = table.projection([doubled])
    assert_equal(via_select, via_projection)
def test_get_schema(self):
    """``get_schema`` agrees with the schema of the table expression."""
    lineitem = self.con.table('tpch_lineitem')
    fetched = self.con.get_schema(
        'tpch_lineitem', database=self.test_data_db
    )
    assert_equal(lineitem.schema(), fetched)
def test_value_counts_unnamed_expr(con):
    """``value_counts`` on an unnamed expr equals naming it 'unnamed' first."""
    nation = con.table('tpch_nation')

    implicit = nation.n_name.lower().value_counts()
    explicit = nation.n_name.lower().name('unnamed').value_counts()
    assert_equal(implicit, explicit)
def test_value_counts_convenience(table):
    """``value_counts`` equals grouping by the column and counting (#152)."""
    result = table.g.value_counts()

    row_count = table.count().name('count')
    wanted = table.group_by('g').aggregate(row_count)
    assert_equal(result, wanted)
def test_lineage(companies):
    """Check ``lin.lineage`` output for a column through successive ops."""

    def check_lineage(expr, expected, same_length=True):
        # Compare the lineage of ``expr`` element-wise against ``expected``.
        results = list(lin.lineage(expr))
        if same_length:
            assert len(results) == len(expected)
        for got, want in zip(results, expected):
            assert_equal(got, want)

    # single table dependency
    funding_buckets = [
        0,
        1000000,
        10000000,
        50000000,
        100000000,
        500000000,
        1000000000,
    ]

    usd = companies.funding_total_usd
    bucket = usd.bucket(funding_buckets, include_over=True)

    mutated = companies.mutate(
        bucket=bucket, status=companies.status.fillna('Unknown')
    )

    recent_or_unknown = (
        companies.founded_at > '2010-01-01'
    ) | companies.founded_at.isnull()
    filtered = mutated[recent_or_unknown]

    grouped = filtered.group_by(['bucket', 'status']).size()

    # NOTE(review): this first case has no length check, so ``zip`` would
    # silently ignore extra or missing lineage entries — confirm intended.
    check_lineage(
        bucket,
        [bucket, companies.funding_total_usd, companies],
        same_length=False,
    )

    check_lineage(
        mutated.bucket,
        [
            mutated.bucket,
            mutated,
            bucket.name('bucket'),
            bucket,
            companies.funding_total_usd,
            companies,
        ],
    )

    check_lineage(
        filtered.bucket,
        [
            filtered.bucket,
            filtered,
            bucket.name('bucket'),
            bucket,
            companies.funding_total_usd,
            companies,
        ],
    )

    check_lineage(
        grouped.bucket,
        [
            grouped.bucket,
            grouped,
            filtered.bucket,
            filtered,
            bucket.name('bucket'),
            bucket,
            companies.funding_total_usd,
            companies,
        ],
    )
def test_projection_self(self):
    """Indexing a table by itself equals projecting it onto itself."""
    table = self.table
    via_getitem = table[table]
    via_method = table.projection(table)
    assert_equal(via_getitem, via_method)
def test_contains(self):
    """``contains`` equals a ``like`` with wildcards; ``in`` raises."""
    column = self.table.g
    assert_equal(column.contains('foo'), column.like('%foo%'))

    def use_in_operator():
        # Python's ``in`` operator on the column is expected to raise.
        return 'foo' in self.table.g

    self.assertRaises(Exception, use_in_operator)
def test_projection_array_expr(table):
    """Indexing by a single column equals indexing by a one-column list."""
    bare = table[table.a]
    listed = table[[table.a]]
    assert_equal(bare, listed)
def test_groupby_alias(table):
    """``groupby`` gives the same result as ``group_by``."""
    via_alias = table.groupby('g').size()
    via_canonical = table.group_by('g').size()
    assert_equal(via_alias, via_canonical)
def test_lineage(companies):
    """Check ``lin.lineage`` output for a column through successive ops."""

    def check_lineage(expr, expected):
        # Compare the lineage of ``expr`` element-wise against ``expected``.
        # NOTE(review): there is no length check, so ``zip`` silently
        # ignores extra or missing lineage entries — confirm intended.
        results = list(lin.lineage(expr))
        for got, want in zip(results, expected):
            assert_equal(got, want)

    # single table dependency
    funding_buckets = [
        0,
        1000000,
        10000000,
        50000000,
        100000000,
        500000000,
        1000000000,
    ]

    bucket_names = [
        '0 to 1m',
        '1m to 10m',
        '10m to 50m',
        '50m to 100m',
        '100m to 500m',
        '500m to 1b',
        'Over 1b',
    ]

    usd = companies.funding_total_usd
    bucket = usd.bucket(funding_buckets, include_over=True)

    mutated = companies.mutate(
        bucket=bucket, status=companies.status.fillna('Unknown')
    )

    recent_or_unknown = (
        companies.founded_at > '2010-01-01'
    ) | companies.founded_at.isnull()
    filtered = mutated[recent_or_unknown]

    grouped = filtered.group_by(['bucket', 'status']).size()

    # TODO(cpcloud): Should this be used?
    # The result is never read; it is still built as in the original.
    joined = grouped.mutate(  # noqa
        bucket_name=lambda x: x.bucket.label(bucket_names).fillna('Unknown')
    )

    check_lineage(bucket, [bucket, companies.funding_total_usd, companies])

    check_lineage(
        mutated.bucket,
        [
            mutated.bucket,
            mutated,
            bucket.name('bucket'),
            companies.funding_total_usd,
            companies,
        ],
    )

    check_lineage(
        filtered.bucket,
        [
            filtered.bucket,
            filtered,
            bucket.name('bucket'),
            companies.funding_total_usd,
            companies,
        ],
    )

    check_lineage(
        grouped.bucket,
        [
            grouped.bucket,
            grouped,
            filtered.bucket,
            filtered,
            bucket.name('bucket'),
            companies.funding_total_usd,
            companies,
        ],
    )
def test_filter_no_list(table):
    """``filter`` with a bare predicate equals indexing by that predicate."""
    a_above_five = table.a > 5

    via_filter = table.filter(a_above_five)
    via_getitem = table[a_above_five]
    assert_equal(via_filter, via_getitem)
def test_distinct_count(functional_alltypes):
    """``distinct().count()`` equals ``nunique()`` named 'count'."""
    column = functional_alltypes.string_col

    result = column.distinct().count()
    wanted = column.nunique().name('count')

    assert_equal(result, wanted)
    assert isinstance(result.op(), ops.CountDistinct)