def setup(self):
    """Build one unsourced and one mock-table-backed collection covering every DF type."""
    from odps.df.expr.tests.core import MockTable

    # A schema containing one column per known DF data type.
    full_schema = Schema.from_lists(types._data_types.keys(),
                                    types._data_types.values())
    self.expr = CollectionExpr(_source_data=None, _schema=full_schema)
    self.sourced_expr = CollectionExpr(
        _source_data=MockTable(client=self.odps.rest), _schema=full_schema)
def setup(self):
    """Create mock-table collections (three flat, one partitioned) for the tests."""
    datatypes = lambda *types: [validate_data_type(t) for t in types]

    base_schema = Schema.from_lists(
        ['name', 'id', 'fid', 'isMale', 'scale', 'birth'],
        datatypes('string', 'int64', 'float64', 'boolean', 'decimal', 'datetime'),
        ['ds'], datatypes('string'))

    src_table = MockTable(name='pyodps_test_expr_table', schema=base_schema)
    self.expr = CollectionExpr(_source_data=src_table,
                               _schema=Schema(columns=base_schema.columns))

    src_table1 = MockTable(name='pyodps_test_expr_table1', schema=base_schema)
    self.expr1 = CollectionExpr(_source_data=src_table1,
                                _schema=Schema(columns=base_schema.columns))

    src_table2 = MockTable(name='pyodps_test_expr_table2', schema=base_schema)
    self.expr2 = CollectionExpr(_source_data=src_table2,
                                _schema=Schema(columns=base_schema.columns))

    part_schema = Schema.from_lists(
        ['name', 'id', 'fid'], datatypes('string', 'int64', 'float64'),
        ['part1', 'part2'], datatypes('string', 'int64'))
    # NOTE(review): this mock table reuses the name 'pyodps_test_expr_table2'
    # already taken by src_table2 above -- confirm the duplication is intended.
    src_table3 = MockTable(name='pyodps_test_expr_table2', schema=part_schema)
    self.expr3 = CollectionExpr(_source_data=src_table3,
                                _schema=Schema(columns=part_schema.columns))
def setup(self):
    """Create mock collections over flat, partitioned and composite-typed schemas.

    Builds self.expr..self.expr4 backed by MockTable instances so compile-only
    tests can run without a live ODPS service.
    """
    datatypes = lambda *types: [validate_data_type(t) for t in types]

    schema = Schema.from_lists(
        ['name', 'id', 'fid', 'isMale', 'scale', 'birth'],
        datatypes('string', 'int64', 'float64', 'boolean', 'decimal', 'datetime'),
        ['ds'], datatypes('string'))

    table = MockTable(name='pyodps_test_expr_table', schema=schema)
    self.expr = CollectionExpr(_source_data=table, _schema=schema)

    table1 = MockTable(name='pyodps_test_expr_table1', schema=schema)
    self.expr1 = CollectionExpr(_source_data=table1, _schema=schema)

    table2 = MockTable(name='pyodps_test_expr_table2', schema=schema)
    self.expr2 = CollectionExpr(_source_data=table2, _schema=schema)

    # Partitioned variant of the schema for self.expr3.
    schema2 = Schema.from_lists(['name', 'id', 'fid'],
                                datatypes('string', 'int64', 'float64'),
                                ['part1', 'part2'],
                                datatypes('string', 'int64'))
    # NOTE(review): reuses the table name 'pyodps_test_expr_table2' already
    # taken above -- confirm the duplication is intended.
    table3 = MockTable(name='pyodps_test_expr_table2', schema=schema2)
    self.expr3 = CollectionExpr(_source_data=table3, _schema=schema2)

    # Schema with composite (dict/list) column types for self.expr4.
    schema3 = Schema.from_lists(
        ['id', 'name', 'relatives', 'hobbies'],
        datatypes('int64', 'string', 'dict<string, string>', 'list<string>'))
    # Bug fix: table4 was previously built with `schema` although self.expr4
    # declares `schema3`; align the mock table with its collection's schema.
    table4 = MockTable(name='pyodps_test_expr_table', schema=schema3)
    self.expr4 = CollectionExpr(_source_data=table4, _schema=schema3)

    # Show full diffs for the (potentially long) compiled-SQL comparisons.
    self.maxDiff = None
def setup(self):
    """Prepare two mock two-column collections with disjoint column names."""
    datatypes = lambda *types: [validate_data_type(t) for t in types]

    left_schema = Schema.from_lists(['name', 'id'], datatypes('string', 'int64'))
    left_table = MockTable(name='pyodps_test_expr_table', schema=left_schema)
    self.expr = CollectionExpr(_source_data=left_table, _schema=left_schema)

    right_schema = Schema.from_lists(['name2', 'id2'], datatypes('string', 'int64'))
    right_table = MockTable(name='pyodps_test_expr_table2', schema=right_schema)
    self.expr2 = CollectionExpr(_source_data=right_table, _schema=right_schema)
def setup(self):
    """Create two real ODPS tables, seed them with data and build an EngineSelecter."""
    datatypes = lambda *types: [validate_data_type(t) for t in types]

    df_schema = Schema.from_lists(
        ['name', 'id', 'fid', 'isMale', 'scale', 'birth'],
        datatypes('string', 'int64', 'float64', 'boolean', 'decimal', 'datetime'))
    self.schema = df_schema_to_odps_schema(df_schema)

    # A unique table name per run avoids collisions between concurrent tests.
    table_name = tn('pyodps_test_selecter_table_%s' % str(uuid.uuid4()).replace('-', '_'))
    self.odps.delete_table(table_name, if_exists=True)
    self.table = self.odps.create_table(name=table_name, schema=self.schema)
    self.expr = CollectionExpr(_source_data=self.table, _schema=df_schema)

    class FakeBar(object):
        # Progress-bar stand-in: every callback is a no-op.
        def update(self, *args, **kwargs):
            pass

        def inc(self, *args, **kwargs):
            pass

        def status(self, *args, **kwargs):
            pass

    self.faked_bar = FakeBar()

    data = [
        ['name1', 4, 5.3, None, None, None],
        ['name2', 2, 3.5, None, None, None],
        ['name1', 4, 4.2, None, None, None],
        ['name1', 3, 2.2, None, None, None],
        ['name1', 3, 4.1, None, None, None],
    ]

    second_schema = Schema.from_lists(['name', 'id2', 'id3'],
                                      [types.string, types.bigint, types.bigint])
    table_name = tn('pyodps_test_selecter_table2')
    self.odps.delete_table(table_name, if_exists=True)
    second_table = self.odps.create_table(name=table_name, schema=second_schema)
    self.expr2 = CollectionExpr(_source_data=second_table,
                                _schema=odps_schema_to_df_schema(second_schema))

    self._gen_data(data=data)

    data2 = [['name1', 4, -1], ['name2', 1, -2]]
    self.odps.write_table(second_table, 0, data2)

    self.selecter = EngineSelecter()
def setup(self):
    """Create a five-column test table (decimal column left out) and a Seahawks engine."""
    datatypes = lambda *types: [validate_data_type(t) for t in types]

    # Only the first five entries are kept, dropping the trailing decimal field.
    df_schema = Schema.from_lists(
        ['name', 'id', 'fid', 'isMale', 'birth', 'scale'][:5],
        datatypes('string', 'int64', 'float64', 'boolean', 'datetime', 'decimal')[:5])
    self.schema = df_schema_to_odps_schema(df_schema)

    table_name = tn('pyodps_test_%s' % str(uuid.uuid4()).replace('-', '_'))
    self.odps.delete_table(table_name, if_exists=True)
    self.table = self.odps.create_table(name=table_name, schema=self.schema)
    self.expr = CollectionExpr(_source_data=self.table, _schema=df_schema)

    self.engine = SeahawksEngine(self.odps)

    class FakeBar(object):
        # Progress-bar stand-in: every callback is a no-op.
        def update(self, *args, **kwargs):
            pass

        def inc(self, *args, **kwargs):
            pass

        def status(self, *args, **kwargs):
            pass

    self.faked_bar = FakeBar()
def testSVGFormatter(self):
    """Projections fused into a groupby stage must disappear from the DOT output."""
    src_table = MockTable(name='pyodps_test_svg', schema=self.schema,
                          _client=self.odps.rest)
    collection = CollectionExpr(_source_data=src_table, _schema=self.schema)

    # Without caching, the projection is merged into the single groupby node.
    grouped = collection.groupby('name').agg(id=collection['id'].sum())
    projected = grouped['name', grouped.id + 3]
    engine = MixedEngine(self.odps)
    dag = engine.compile(projected)
    nodes = dag.nodes()
    self.assertEqual(len(nodes), 1)
    fused = nodes[0].expr
    self.assertIsInstance(fused, GroupByCollectionExpr)
    dot = ExprExecutionGraphFormatter(dag)._to_dot()
    self.assertNotIn('Projection', dot)

    # Caching the groupby keeps the projection as a separate DAG node.
    grouped = collection.groupby('name').agg(id=collection['id'].sum()).cache()
    projected = grouped['name', grouped.id + 3]
    engine = MixedEngine(self.odps)
    dag = engine.compile(projected)
    nodes = dag.nodes()
    self.assertEqual(len(nodes), 2)
    dot = ExprExecutionGraphFormatter(dag)._to_dot()
    self.assertIn('Projection', dot)
def setup(self):
    """Build a single mock-backed collection covering the six basic DF types."""
    make_types = lambda *names: [validate_data_type(n) for n in names]
    schema = Schema.from_lists(
        ['name', 'id', 'fid', 'isMale', 'scale', 'birth'],
        make_types('string', 'int64', 'float64', 'boolean', 'decimal', 'datetime'))
    mock_table = MockTable(name='pyodps_test_expr_table', schema=schema)
    self.expr = CollectionExpr(_source_data=mock_table, _schema=schema)
def testMakeKV(self):
    """to_kv should pack each row's non-null numeric columns into a 'k=v' string."""
    from odps import types as odps_types

    data = [
        ['name1', 1.0, 3.0, None, 10.0, None, None],
        ['name1', None, 3.0, 5.1, None, None, None],
        ['name1', 7.1, None, None, None, 8.2, None],
        ['name2', None, 1.2, 1.5, None, None, None],
        ['name2', None, 1.0, None, None, None, 1.1],
    ]
    kv_cols = ['k1', 'k2', 'k3', 'k5', 'k7', 'k9']
    schema = Schema.from_lists(['name'] + kv_cols,
                               [odps_types.string] + [odps_types.double] * 6)

    table_name = tn('pyodps_test_engine_make_kv')
    self.odps.delete_table(table_name, if_exists=True)
    table = self.odps.create_table(name=table_name, schema=schema)
    expr = CollectionExpr(_source_data=table,
                          _schema=odps_schema_to_df_schema(schema))
    try:
        self.odps.write_table(table, 0, data)

        kv_expr = expr.to_kv(columns=kv_cols, kv_delim='=')
        res = self.engine.execute(kv_expr)
        result = self._get_result(res)

        # Null cells are omitted; integral doubles render without a decimal part.
        expected = [
            ['name1', 'k1=1,k2=3,k5=10'],
            ['name1', 'k2=3,k3=5.1'],
            ['name1', 'k1=7.1,k7=8.2'],
            ['name2', 'k2=1.2,k3=1.5'],
            ['name2', 'k2=1,k9=1.1'],
        ]
        self.assertListEqual(result, expected)
    finally:
        table.drop()
def setup(self):
    """Create a SQLAlchemy-backed test table and wrap it in a CollectionExpr."""
    datatypes = lambda *types: [validate_data_type(t) for t in types]
    schema = Schema.from_lists(['name', 'id', 'fid', 'isMale', 'scale', 'birth'],
                               datatypes('string', 'int64', 'float64', 'boolean', 'decimal', 'datetime'))
    self.df_schema = schema
    self.schema = df_schema_to_odps_schema(schema)
    self.df = None
    self.expr = None

    self.engine = SQLAlchemyEngine()

    import sqlalchemy
    from sqlalchemy import create_engine

    # NOTE(review): the 'postgres://' URL scheme was removed in SQLAlchemy 1.4;
    # newer versions require 'postgresql://' -- confirm the pinned version.
    self.sql_engine = engine = create_engine('postgres://localhost/pyodps')
    # Alternative backends used during development; swap in as needed.
    # self.sql_engine = engine = create_engine('mysql://localhost/pyodps')
    # self.sql_engine = engine = create_engine('sqlite://')
    self.conn = engine.connect()

    # Create the physical table from columns derived off the DF schema.
    self.metadata = metadata = sqlalchemy.MetaData(bind=engine)
    columns = df_schema_to_sqlalchemy_columns(self.df_schema, engine=self.sql_engine)
    t = sqlalchemy.Table('pyodps_test_data', metadata, *columns)
    metadata.create_all()
    self.table = t

    self.expr = CollectionExpr(_source_data=self.table, _schema=self.df_schema)

    class FakeBar(object):
        # Progress-bar stand-in with a no-op update callback.
        def update(self, *args, **kwargs):
            pass
    self.faked_bar = FakeBar()
def setup(self):
    """Create a three-column mock collection and a fresh execution context."""
    make_types = lambda *names: [validate_data_type(n) for n in names]
    schema = Schema.from_lists(['name', 'id', 'fid'],
                               make_types('string', 'int64', 'float64'))
    mock_table = MockTable(name='pyodps_test_expr_table', schema=schema)
    self.expr = CollectionExpr(_source_data=mock_table, _schema=schema)
    self.ctx = ExecuteContext()
def testScaleValue(self):
    """Verify min-max and standard scaling of the `fid` column end to end."""
    data = [
        ['name1', 4, 5.3],
        ['name2', 2, 3.5],
        ['name1', 4, 4.2],
        ['name1', 3, 2.2],
        ['name1', 3, 4.1],
    ]
    schema = Schema.from_lists(['name', 'id', 'fid'],
                               [types.string, types.bigint, types.double])
    table_name = tn('pyodps_test_engine_scale_table')
    self.odps.delete_table(table_name, if_exists=True)
    table = self.odps.create_table(name=table_name, schema=schema)
    self.odps.write_table(table_name, 0, data)
    expr_input = CollectionExpr(_source_data=table, _schema=odps_schema_to_df_schema(schema))

    # min_max_scale maps fid into [0, 1].
    expr = expr_input.min_max_scale(columns=['fid'])

    res = self.engine.execute(expr)
    result = self._get_result(res)

    expected = [['name1', 4, 1.0],
                ['name2', 2, 0.41935483870967744],
                ['name1', 4, 0.6451612903225807],
                ['name1', 3, 0.0],
                ['name1', 3, 0.6129032258064515]]

    # Row order is engine-dependent, so compare the sorted rows pairwise
    # with approximate float equality.
    result = sorted(result)
    expected = sorted(expected)
    for first, second in zip(result, expected):
        self.assertEqual(len(first), len(second))
        for it1, it2 in zip(first, second):
            self.assertAlmostEqual(it1, it2)

    # std_scale standardizes fid (zero mean, unit variance).
    expr = expr_input.std_scale(columns=['fid'])

    res = self.engine.execute(expr)
    result = self._get_result(res)

    expected = [['name1', 4, 1.4213602653434203],
                ['name2', 2, -0.3553400663358544],
                ['name1', 4, 0.3355989515394193],
                ['name1', 3, -1.6385125281042194],
                ['name1', 3, 0.23689337755723686]]

    result = sorted(result)
    expected = sorted(expected)
    for first, second in zip(result, expected):
        self.assertEqual(len(first), len(second))
        for it1, it2 in zip(first, second):
            self.assertAlmostEqual(it1, it2)
def testUnion(self):
    """distinct().union() across two tables should yield the merged row set."""
    data = [
        ['name1', 4, 5.3, None, None, None],
        ['name2', 2, 3.5, None, None, None],
        ['name1', 4, 4.2, None, None, None],
        ['name1', 3, 2.2, None, None, None],
        ['name1', 3, 4.1, None, None, None],
    ]

    schema2 = Schema.from_lists(['name', 'id2', 'id3'],
                                [types.string, types.bigint, types.bigint])
    table_name = 'pyodps_test_engine_table2'
    self.odps.delete_table(table_name, if_exists=True)
    table2 = self.odps.create_table(name=table_name, schema=schema2)
    expr2 = CollectionExpr(_source_data=table2,
                           _schema=odps_schema_to_df_schema(schema2))

    self._gen_data(data=data)

    data2 = [
        ['name3', 5, -1],
        ['name4', 6, -2]
    ]
    self.odps.write_table(table2, 0, [table2.new_record(values=d) for d in data2])

    try:
        expr = self.expr['name', 'id'].distinct().union(
            expr2[expr2.id2.rename('id'), 'name'])
        res = self.engine.execute(expr)
        result = self._get_result(res)

        expected = [
            ['name1', 4],
            ['name1', 3],
            ['name2', 2],
            ['name3', 5],
            ['name4', 6]
        ]

        # Union output order is undefined; compare sorted row sets.
        self.assertEqual(len(result), len(expected))
        for got, want in zip(sorted(result), sorted(expected)):
            self.assertEqual([to_str(t) for t in got], [to_str(t) for t in want])
    finally:
        table2.drop()
def testUnion(self):
    """distinct().union() against a second engine-backed table."""
    data = [
        ['name1', 4, 5.3, None, None, None],
        ['name2', 2, 3.5, None, None, None],
        ['name1', 4, 4.2, None, None, None],
        ['name1', 3, 2.2, None, None, None],
        ['name1', 3, 4.1, None, None, None],
    ]
    data2 = [
        ['name3', 5, -1],
        ['name4', 6, -2]
    ]

    make_types = lambda *names: [validate_data_type(n) for n in names]
    schema2 = Schema.from_lists(['name', 'id2', 'id3'],
                                make_types('string', 'int64', 'int64'))
    table_name = tn('pyodps_test_engine_table2')
    table2 = self._create_table_and_insert_data(table_name, schema2, data2)
    expr2 = CollectionExpr(_source_data=table2, _schema=schema2)

    self._gen_data(data=data)

    try:
        expr = self.expr['name', 'id'].distinct().union(
            expr2[expr2.id2.rename('id'), 'name'])
        res = self.engine.execute(expr)
        result = self._get_result(res)

        expected = [
            ['name1', 4],
            ['name1', 3],
            ['name2', 2],
            ['name3', 5],
            ['name4', 6]
        ]

        # Union output order is undefined; compare sorted row sets.
        self.assertEqual(len(result), len(expected))
        for got, want in zip(sorted(result), sorted(expected)):
            self.assertEqual([to_str(t) for t in got], [to_str(t) for t in want])
    finally:
        # Release pooled connections before dropping the backing table.
        [conn.close() for conn in _engine_to_connections.values()]
        table2.drop()
def testBizarreField(self):
    """A column whose name is not an identifier ('012') must survive apply()."""
    def my_func(row):
        # The column name starts with a digit, so access must use getattr.
        return getattr(row, '012') * 2.0

    make_types = lambda *names: [validate_data_type(n) for n in names]
    schema = Schema.from_lists(['name', 'id', 'fid', '012'],
                               make_types('string', 'int64', 'float64', 'float64'))
    mock_table = MockTable(name='pyodps_test_expr_table', schema=schema)
    expr = CollectionExpr(_source_data=mock_table, _schema=schema)

    self.engine.compile(expr.apply(my_func, axis=1, names=['out_col'], types=['float64']))
    udtf = list(self.engine._ctx._func_to_udfs.values())[0]
    udtf = get_function(udtf, UDF_CLASS_NAME)
    self.assertEqual([20, 40],
                     runners.simple_run(udtf, [('name1', 1, None, 10),
                                               ('name2', 2, None, 20)]))
def testJoin(self):
    """Check inner joins with both implicit and explicit join conditions."""
    data = [
        ['name1', 4, 5.3, None, None, None],
        ['name2', 2, 3.5, None, None, None],
        ['name1', 4, 4.2, None, None, None],
        ['name1', 3, 2.2, None, None, None],
        ['name1', 3, 4.1, None, None, None],
    ]

    schema2 = Schema.from_lists(['name', 'id2', 'id3'],
                                [types.string, types.bigint, types.bigint])
    table_name = 'pyodps_test_engine_table2'
    self.odps.delete_table(table_name, if_exists=True)
    table2 = self.odps.create_table(name=table_name, schema=schema2)
    expr2 = CollectionExpr(_source_data=table2,
                           _schema=odps_schema_to_df_schema(schema2))

    self._gen_data(data=data)

    data2 = [
        ['name1', 4, -1],
        ['name2', 1, -2]
    ]
    self.odps.write_table(table2, 0, [table2.new_record(values=d) for d in data2])

    try:
        # Implicit join condition (shared 'name' column).
        joined = self.expr.join(expr2)['name', 'id2']
        result = self._get_result(self.engine.execute(joined))
        self.assertEqual(len(result), 5)
        expected = [
            [to_str('name1'), 4],
            [to_str('name2'), 1]
        ]
        self.assertTrue(all(row in expected for row in result))

        # Explicit join on 'name' plus the id/id2 pair.
        joined = self.expr.join(expr2, on=['name', ('id', 'id2')])[self.expr.name, expr2.id2]
        result = self._get_result(self.engine.execute(joined))
        self.assertEqual(len(result), 2)
        expected = [to_str('name1'), 4]
        self.assertTrue(all(row == expected for row in result))
    finally:
        table2.drop()
def setup(self):
    """Create the shared ODPS test table, its collection and an ODPS engine."""
    make_types = lambda *names: [validate_data_type(n) for n in names]
    df_schema = Schema.from_lists(
        ['name', 'id', 'fid', 'isMale', 'scale', 'birth'],
        make_types('string', 'int64', 'float64', 'boolean', 'decimal', 'datetime'))
    self.schema = df_schema_to_odps_schema(df_schema)

    table_name = 'pyodps_test_engine_table'
    self.odps.delete_table(table_name, if_exists=True)
    self.table = self.odps.create_table(name=table_name, schema=self.schema)
    self.expr = CollectionExpr(_source_data=self.table, _schema=df_schema)

    self.engine = ODPSEngine(self.odps)

    class FakeBar(object):
        # Progress-bar stand-in with a no-op update callback.
        def update(self, *args, **kwargs):
            pass

    self.faked_bar = FakeBar()
def testJoinGroupby(self):
    """Join with a second table on 'name', then group by 'id' and sum 'fid'."""
    data = [
        ['name1', 4, 5.3, None, None],
        ['name2', 2, 3.5, None, None],
        ['name1', 4, 4.2, None, None],
        ['name1', 3, 2.2, None, None],
        ['name1', 3, 4.1, None, None],
    ]

    schema2 = Schema.from_lists(['name', 'id2', 'id3'],
                                [types.string, types.bigint, types.bigint])

    table_name = tn('pyodps_test_engine_table2')
    self.odps.delete_table(table_name, if_exists=True)
    table2 = self.odps.create_table(name=table_name, schema=schema2)
    expr2 = CollectionExpr(_source_data=table2,
                           _schema=odps_schema_to_df_schema(schema2))

    self._gen_data(data=data)

    data2 = [['name1', 4, -1], ['name2', 1, -2]]
    self.odps.write_table(table2, 0, data2)

    expr = self.expr.join(expr2, on='name')[self.expr]
    expr = expr.groupby('id').agg(expr.fid.sum())
    res = self.engine.execute(expr)
    result = self._get_result(res)

    # Locate 'id' and 'fid' positions in the source schema so the expected
    # values can be computed from the raw rows.
    id_idx = [idx for idx, col in enumerate(self.expr.schema.names)
              if col == 'id'][0]
    fid_idx = [idx for idx, col in enumerate(self.expr.schema.names)
               if col == 'fid'][0]
    # Both names in `data` appear in `data2`, so the join keeps every row and
    # the expectation reduces to a per-id sum of fid.
    expected = [[k, sum(v[fid_idx] for v in row)]
                for k, row in itertools.groupby(
                    sorted(data, key=lambda r: r[id_idx]), lambda r: r[id_idx])]
    for it in zip(sorted(expected, key=lambda it: it[0]),
                  sorted(result, key=lambda it: it[0])):
        self.assertAlmostEqual(it[0][0], it[1][0])
        self.assertAlmostEqual(it[0][1], it[1][1])
def testCallableColumn(self):
    """Columns named like collection methods should surface as CallableColumn."""
    from odps.df.expr.expressions import CallableColumn
    from odps.df.expr.collections import ProjectCollectionExpr

    schema = Schema.from_lists(['name', 'f1', 'append_id'],
                               [types.string, types.float64, types.int64])
    coll = CollectionExpr(_source_data=None, _schema=schema)

    # 'append_id' resolves to a CallableColumn, a plain column does not.
    self.assertIsInstance(coll.append_id, CallableColumn)
    self.assertNotIsInstance(coll.f1, CallableColumn)

    projected = coll[coll.name, coll.append_id]
    self.assertIsInstance(projected, ProjectCollectionExpr)
    self.assertListEqual(projected.schema.names, ['name', 'append_id'])

    # After projecting it away, the attribute is no longer a CallableColumn.
    projected = coll[coll.name, coll.f1]
    self.assertNotIsInstance(projected.append_id, CallableColumn)

    # Calling the column adds the requested column to the schema.
    appended = coll.append_id(id_col='id_col')
    self.assertIn('id_col', appended.schema)
def testJoinGroupby(self):
    """Join on 'name', group by 'id' and sum 'fid'; verify against the raw rows."""
    data = [
        ['name1', 4, 5.3, None, None, None],
        ['name2', 2, 3.5, None, None, None],
        ['name1', 4, 4.2, None, None, None],
        ['name1', 3, 2.2, None, None, None],
        ['name1', 3, 4.1, None, None, None],
    ]

    data2 = [['name1', 4, -1], ['name2', 1, -2]]

    datatypes = lambda *types: [validate_data_type(t) for t in types]
    schema2 = Schema.from_lists(['name', 'id2', 'id3'],
                                datatypes('string', 'int64', 'int64'))
    table_name = tn('pyodps_test_engine_table2')
    table2 = self._create_table_and_insert_data(table_name, schema2, data2)
    expr2 = CollectionExpr(_source_data=table2, _schema=schema2)

    self._gen_data(data=data)

    expr = self.expr.join(expr2, on='name')[self.expr]
    expr = expr.groupby('id').agg(expr.fid.sum())
    res = self.engine.execute(expr)
    result = self._get_result(res)

    # Locate 'id' and 'fid' positions in the source schema so the expected
    # values can be computed from the raw rows.
    id_idx = [idx for idx, col in enumerate(self.expr.schema.names)
              if col == 'id'][0]
    fid_idx = [idx for idx, col in enumerate(self.expr.schema.names)
               if col == 'fid'][0]
    # Both names in `data` appear in `data2`, so the join keeps every row and
    # the expectation reduces to a per-id sum of fid.
    expected = [[k, sum(v[fid_idx] for v in row)]
                for k, row in itertools.groupby(
                    sorted(data, key=lambda r: r[id_idx]), lambda r: r[id_idx])]
    for it in zip(sorted(expected, key=lambda it: it[0]),
                  sorted(result, key=lambda it: it[0])):
        self.assertAlmostEqual(it[0][0], it[1][0])
        self.assertAlmostEqual(it[0][1], it[1][1])
def testApplyMap(self):
    """applymap should wrap exactly the selected columns in MappedExpr."""
    from odps.df.expr.collections import ProjectCollectionExpr, Column
    from odps.df.expr.element import MappedExpr

    schema = Schema.from_lists(['idx', 'f1', 'f2', 'f3'],
                               [types.int64] + [types.float64] * 3)
    coll = CollectionExpr(_source_data=None, _schema=schema)

    # Supplying both columns= and excludes= is rejected.
    self.assertRaises(
        ValueError,
        lambda: coll.applymap(lambda v: v + 1, columns='idx', excludes='f1'))

    def verify(mapped, is_mapped):
        # Each field must be a MappedExpr exactly when `is_mapped` says so.
        self.assertIsInstance(mapped, ProjectCollectionExpr)
        for field in mapped._fields:
            self.assertIsInstance(field,
                                  MappedExpr if is_mapped(field.name) else Column)

    # No selector: every column is mapped.
    verify(coll.applymap(lambda v: v + 1), lambda name: True)

    # A single column name.
    verify(coll.applymap(lambda v: v + 1, columns='f1'),
           lambda name: name == 'f1')

    # A set of column names.
    map_cols = set(['f1', 'f2', 'f3'])
    verify(coll.applymap(lambda v: v + 1, columns=map_cols),
           lambda name: name in map_cols)

    # Exclusion by a single name.
    verify(coll.applymap(lambda v: v + 1, excludes='idx'),
           lambda name: name != 'idx')

    # Exclusion by a set of names.
    exc_cols = set(['idx', 'f1'])
    verify(coll.applymap(lambda v: v + 1, excludes=exc_cols),
           lambda name: name not in exc_cols)
def testJoinGroupby(self):
    """Join on 'name', group by 'id' and sum 'fid'; verify against pandas."""
    data = [
        ['name1', 4, 5.3, None, None, None],
        ['name2', 2, 3.5, None, None, None],
        ['name1', 4, 4.2, None, None, None],
        ['name1', 3, 2.2, None, None, None],
        ['name1', 3, 4.1, None, None, None],
    ]

    schema2 = Schema.from_lists(['name', 'id2', 'id3'],
                                [types.string, types.bigint, types.bigint])

    table_name = 'pyodps_test_engine_table2'
    self.odps.delete_table(table_name, if_exists=True)
    table2 = self.odps.create_table(name=table_name, schema=schema2)
    expr2 = CollectionExpr(_source_data=table2,
                           _schema=odps_schema_to_df_schema(schema2))

    self._gen_data(data=data)

    data2 = [['name1', 4, -1], ['name2', 1, -2]]
    self.odps.write_table(table2, 0, [table2.new_record(values=d) for d in data2])

    expr = self.expr.join(expr2, on='name')[self.expr]
    expr = expr.groupby('id').agg(expr.fid.sum())
    res = self.engine.execute(expr)
    result = self._get_result(res)

    import pandas as pd
    # Both names in `data` appear in `data2`, so the join keeps every row and
    # the expectation reduces to a per-id sum of fid computed with pandas.
    expected = pd.DataFrame(data, columns=self.expr.schema.names).groupby('id').agg({'fid': 'sum'})\
        .reset_index().values.tolist()
    for it in zip(sorted(expected, key=lambda it: it[0]),
                  sorted(result, key=lambda it: it[0])):
        self.assertAlmostEqual(it[0][0], it[1][0])
        self.assertAlmostEqual(it[0][1], it[1][1])
def testFilterOrder(self):
    """The divisor filter must take effect before the division it guards."""
    table_name = tn('pyodps_test_division_error')
    self.odps.delete_table(table_name, if_exists=True)
    table = self.odps.create_table(table_name, 'divided bigint, divisor bigint', lifecycle=1)

    try:
        self.odps.write_table(table_name, [[2, 0], [1, 1], [1, 2], [5, 1], [5, 0]])
        df = CollectionExpr(_source_data=table,
                            _schema=odps_schema_to_df_schema(table.schema))
        # Drop zero divisors first, then compute the quotient and filter on it.
        nonzero = df[df.divisor > 0]
        ratios = nonzero[(nonzero.divided / nonzero.divisor).rename('result'), ]
        filtered = ratios[ratios.result > 1]
        result = self._get_result(self.engine.execute(filtered))
        self.assertEqual(result, [[5, ]])
    finally:
        table.drop()
def testJoin(self):
    """Exercise inner, left, right and outer joins plus a windowed projection."""
    data = [
        ['name1', 4, 5.3, None, None, None],
        ['name2', 2, 3.5, None, None, None],
        ['name1', 4, 4.2, None, None, None],
        ['name1', 3, 2.2, None, None, None],
        ['name1', 3, 4.1, None, None, None],
    ]

    data2 = [['name1', 4, -1], ['name2', 1, -2]]

    datatypes = lambda *types: [validate_data_type(t) for t in types]
    schema2 = Schema.from_lists(['name', 'id2', 'id3'],
                                datatypes('string', 'int64', 'int64'))
    table_name = tn('pyodps_test_engine_table2')
    table2 = self._create_table_and_insert_data(table_name, schema2, data2)
    expr2 = CollectionExpr(_source_data=table2, _schema=schema2)

    self._gen_data(data=data)

    try:
        # Implicit join condition (shared 'name' column).
        expr = self.expr.join(expr2)['name', 'id2']
        res = self.engine.execute(expr)
        result = self._get_result(res)
        self.assertEqual(len(result), 5)
        expected = [[to_str('name1'), 4], [to_str('name2'), 1]]
        self.assertTrue(all(it in expected for it in result))

        # Explicit inner join on 'name' plus the id/id2 pair.
        expr = self.expr.join(expr2, on=['name', ('id', 'id2')])[self.expr.name, expr2.id2]
        res = self.engine.execute(expr)
        result = self._get_result(res)
        self.assertEqual(len(result), 2)
        expected = [to_str('name1'), 4]
        self.assertTrue(all(it == expected for it in result))

        # Left join: unmatched left rows keep a NULL id2.
        expr = self.expr.left_join(expr2, on=['name', ('id', 'id2')])[self.expr.name, expr2.id2]
        res = self.engine.execute(expr)
        result = self._get_result(res)
        expected = [['name1', 4], ['name2', None], ['name1', 4],
                    ['name1', None], ['name1', None]]
        self.assertEqual(len(result), 5)
        self.assertTrue(all(it in expected for it in result))

        # Right join: unmatched right rows keep a NULL name.
        expr = self.expr.right_join(expr2, on=['name', ('id', 'id2')])[self.expr.name, expr2.id2]
        res = self.engine.execute(expr)
        result = self._get_result(res)
        expected = [
            ['name1', 4],
            ['name1', 4],
            [None, 1],
        ]
        self.assertEqual(len(result), 3)
        self.assertTrue(all(it in expected for it in result))

        # MySQL has no FULL OUTER JOIN, so skip the outer-join case there.
        if self.sql_engine.name != 'mysql':
            expr = self.expr.outer_join(expr2, on=['name', ('id', 'id2')])[self.expr.name, expr2.id2]
            res = self.engine.execute(expr)
            result = self._get_result(res)
            expected = [
                ['name1', 4],
                ['name1', 4],
                ['name2', None],
                ['name1', None],
                ['name1', None],
                [None, 1],
            ]
            self.assertEqual(len(result), 6)
            self.assertTrue(all(it in expected for it in result))

        # Join against a cached aggregation result.
        grouped = self.expr.groupby('name').agg(
            new_id=self.expr.id.sum()).cache()
        self.engine.execute(self.expr.join(grouped, on='name'))

        # Windowed projection over the joined result (skipped on MySQL).
        if self.sql_engine.name != 'mysql':
            expr = self.expr.join(expr2, on=[
                'name', ('id', 'id2')
            ])[lambda x: x.groupby(Scalar(1)).sort('name').row_number(), ]
            self.engine.execute(expr)
    finally:
        # Release pooled connections before dropping the backing table.
        [conn.close() for conn in _engine_to_connections.values()]
        table2.drop()
def testFilterPushdownThroughMultipleProjection(self):
    """A filter on a derived column must be pushed down through two projections.

    Builds two chained projections containing `cut` expressions and checks
    that filtering on the second projection's `f_cut` column compiles to a
    single SELECT whose WHERE clause repeats the CASE expression.
    """
    schema = Schema.from_lists(list('abcde'), ['string']*5)
    table = MockTable(name='pyodps_test_expr_table3', schema=schema)
    tab = CollectionExpr(_source_data=table, _schema=odps_schema_to_df_schema(schema))

    # Thirty buckets of width 7 for the first cut ('0-7', '7-14', ...).
    labels2 = []
    bins2 = []
    for i in range(0, 30):
        a = str(7 * i) + '-' + str(7 * (i + 1))
        b = 7 * i
        bins2.append(b)
        labels2.append(a)

    p1 = tab.select(tab.a,
                    tab.c.astype('int').cut(bins2, labels=labels2, include_over=True).rename('c_cut'),
                    tab.e.astype('int').rename('e'),
                    tab.c.astype('int').rename('c'))
    p1['f'] = p1['e'] / p1['c']

    # Twenty unit-width buckets labelled '0'..'19' for the second cut.
    t = []
    l = []
    for i in range(0, 20):
        a = 1 * i
        b = str(a)
        t.append(a)
        l.append(b)

    p2 = p1.select(p1.a, p1.c_cut, p1.f.cut(bins=t, labels=l, include_over=True).rename('f_cut'))

    expected = "SELECT t1.`a`, CASE WHEN (0 < CAST(t1.`c` AS BIGINT)) " \
               "AND (CAST(t1.`c` AS BIGINT) <= 7) THEN '0-7' " \
               "WHEN (7 < CAST(t1.`c` AS BIGINT)) " \
               "AND (CAST(t1.`c` AS BIGINT) <= 14) " \
               "THEN '7-14' WHEN (14 < CAST(t1.`c` AS BIGINT)) " \
               "AND (CAST(t1.`c` AS BIGINT) <= 21) THEN '14-21' " \
               "WHEN (21 < CAST(t1.`c` AS BIGINT)) " \
               "AND (CAST(t1.`c` AS BIGINT) <= 28) " \
               "THEN '21-28' WHEN (28 < CAST(t1.`c` AS BIGINT)) " \
               "AND (CAST(t1.`c` AS BIGINT) <= 35) THEN '28-35' " \
               "WHEN (35 < CAST(t1.`c` AS BIGINT)) " \
               "AND (CAST(t1.`c` AS BIGINT) <= 42) THEN '35-42' " \
               "WHEN (42 < CAST(t1.`c` AS BIGINT)) " \
               "AND (CAST(t1.`c` AS BIGINT) <= 49) THEN '42-49' " \
               "WHEN (49 < CAST(t1.`c` AS BIGINT)) " \
               "AND (CAST(t1.`c` AS BIGINT) <= 56) " \
               "THEN '49-56' WHEN (56 < CAST(t1.`c` AS BIGINT)) " \
               "AND (CAST(t1.`c` AS BIGINT) <= 63) THEN '56-63' " \
               "WHEN (63 < CAST(t1.`c` AS BIGINT)) " \
               "AND (CAST(t1.`c` AS BIGINT) <= 70) THEN '63-70' " \
               "WHEN (70 < CAST(t1.`c` AS BIGINT)) AND (CAST(t1.`c` AS BIGINT) <= 77) " \
               "THEN '70-77' WHEN (77 < CAST(t1.`c` AS BIGINT)) " \
               "AND (CAST(t1.`c` AS BIGINT) <= 84) " \
               "THEN '77-84' WHEN (84 < CAST(t1.`c` AS BIGINT)) " \
               "AND (CAST(t1.`c` AS BIGINT) <= 91) THEN '84-91' " \
               "WHEN (91 < CAST(t1.`c` AS BIGINT)) AND (CAST(t1.`c` AS BIGINT) <= 98) " \
               "THEN '91-98' WHEN (98 < CAST(t1.`c` AS BIGINT)) " \
               "AND (CAST(t1.`c` AS BIGINT) <= 105) THEN '98-105' " \
               "WHEN (105 < CAST(t1.`c` AS BIGINT)) AND (CAST(t1.`c` AS BIGINT) <= 112) " \
               "THEN '105-112' WHEN (112 < CAST(t1.`c` AS BIGINT)) " \
               "AND (CAST(t1.`c` AS BIGINT) <= 119) THEN '112-119' " \
               "WHEN (119 < CAST(t1.`c` AS BIGINT)) AND (CAST(t1.`c` AS BIGINT) <= 126) " \
               "THEN '119-126' WHEN (126 < CAST(t1.`c` AS BIGINT)) " \
               "AND (CAST(t1.`c` AS BIGINT) <= 133) THEN '126-133' " \
               "WHEN (133 < CAST(t1.`c` AS BIGINT)) AND (CAST(t1.`c` AS BIGINT) <= 140) " \
               "THEN '133-140' WHEN (140 < CAST(t1.`c` AS BIGINT)) " \
               "AND (CAST(t1.`c` AS BIGINT) <= 147) THEN '140-147' " \
               "WHEN (147 < CAST(t1.`c` AS BIGINT)) AND (CAST(t1.`c` AS BIGINT) <= 154) " \
               "THEN '147-154' WHEN (154 < CAST(t1.`c` AS BIGINT)) " \
               "AND (CAST(t1.`c` AS BIGINT) <= 161) THEN '154-161' " \
               "WHEN (161 < CAST(t1.`c` AS BIGINT)) AND (CAST(t1.`c` AS BIGINT) <= 168) " \
               "THEN '161-168' WHEN (168 < CAST(t1.`c` AS BIGINT)) " \
               "AND (CAST(t1.`c` AS BIGINT) <= 175) THEN '168-175' " \
               "WHEN (175 < CAST(t1.`c` AS BIGINT)) AND (CAST(t1.`c` AS BIGINT) <= 182) " \
               "THEN '175-182' WHEN (182 < CAST(t1.`c` AS BIGINT)) " \
               "AND (CAST(t1.`c` AS BIGINT) <= 189) THEN '182-189' " \
               "WHEN (189 < CAST(t1.`c` AS BIGINT)) AND (CAST(t1.`c` AS BIGINT) <= 196) " \
               "THEN '189-196' WHEN (196 < CAST(t1.`c` AS BIGINT)) " \
               "AND (CAST(t1.`c` AS BIGINT) <= 203) THEN '196-203' " \
               "WHEN 203 < CAST(t1.`c` AS BIGINT) THEN '203-210' END AS `c_cut`, " \
               "CASE WHEN (0 < (CAST(t1.`e` AS BIGINT) / CAST(t1.`c` AS BIGINT))) " \
               "AND ((CAST(t1.`e` AS BIGINT) / CAST(t1.`c` AS BIGINT)) <= 1) THEN '0' " \
               "WHEN (1 < (CAST(t1.`e` AS BIGINT) / CAST(t1.`c` AS BIGINT))) " \
               "AND ((CAST(t1.`e` AS BIGINT) / CAST(t1.`c` AS BIGINT)) <= 2) " \
               "THEN '1' WHEN (2 < (CAST(t1.`e` AS BIGINT) / CAST(t1.`c` AS BIGINT))) " \
               "AND ((CAST(t1.`e` AS BIGINT) / CAST(t1.`c` AS BIGINT)) <= 3) THEN '2' " \
               "WHEN (3 < (CAST(t1.`e` AS BIGINT) / CAST(t1.`c` AS BIGINT))) " \
               "AND ((CAST(t1.`e` AS BIGINT) / CAST(t1.`c` AS BIGINT)) <= 4) " \
               "THEN '3' WHEN (4 < (CAST(t1.`e` AS BIGINT) / CAST(t1.`c` AS BIGINT))) " \
               "AND ((CAST(t1.`e` AS BIGINT) / CAST(t1.`c` AS BIGINT)) <= 5) THEN '4' " \
               "WHEN (5 < (CAST(t1.`e` AS BIGINT) / CAST(t1.`c` AS BIGINT))) " \
               "AND ((CAST(t1.`e` AS BIGINT) / CAST(t1.`c` AS BIGINT)) <= 6) THEN '5' " \
               "WHEN (6 < (CAST(t1.`e` AS BIGINT) / CAST(t1.`c` AS BIGINT))) " \
               "AND ((CAST(t1.`e` AS BIGINT) / CAST(t1.`c` AS BIGINT)) <= 7) " \
               "THEN '6' WHEN (7 < (CAST(t1.`e` AS BIGINT) / CAST(t1.`c` AS BIGINT))) " \
               "AND ((CAST(t1.`e` AS BIGINT) / CAST(t1.`c` AS BIGINT)) <= 8) THEN '7' " \
               "WHEN (8 < (CAST(t1.`e` AS BIGINT) / CAST(t1.`c` AS BIGINT))) " \
               "AND ((CAST(t1.`e` AS BIGINT) / CAST(t1.`c` AS BIGINT)) <= 9) THEN '8' " \
               "WHEN (9 < (CAST(t1.`e` AS BIGINT) / CAST(t1.`c` AS BIGINT))) " \
               "AND ((CAST(t1.`e` AS BIGINT) / CAST(t1.`c` AS BIGINT)) <= 10) " \
               "THEN '9' WHEN (10 < (CAST(t1.`e` AS BIGINT) / CAST(t1.`c` AS BIGINT))) " \
               "AND ((CAST(t1.`e` AS BIGINT) / CAST(t1.`c` AS BIGINT)) <= 11) THEN '10' " \
               "WHEN (11 < (CAST(t1.`e` AS BIGINT) / CAST(t1.`c` AS BIGINT))) " \
               "AND ((CAST(t1.`e` AS BIGINT) / CAST(t1.`c` AS BIGINT)) <= 12) " \
               "THEN '11' WHEN (12 < (CAST(t1.`e` AS BIGINT) / CAST(t1.`c` AS BIGINT))) " \
               "AND ((CAST(t1.`e` AS BIGINT) / CAST(t1.`c` AS BIGINT)) <= 13) THEN '12' " \
               "WHEN (13 < (CAST(t1.`e` AS BIGINT) / CAST(t1.`c` AS BIGINT))) " \
               "AND ((CAST(t1.`e` AS BIGINT) / CAST(t1.`c` AS BIGINT)) <= 14) THEN '13' " \
               "WHEN (14 < (CAST(t1.`e` AS BIGINT) / CAST(t1.`c` AS BIGINT))) " \
               "AND ((CAST(t1.`e` AS BIGINT) / CAST(t1.`c` AS BIGINT)) <= 15) THEN '14' " \
               "WHEN (15 < (CAST(t1.`e` AS BIGINT) / CAST(t1.`c` AS BIGINT))) " \
               "AND ((CAST(t1.`e` AS BIGINT) / CAST(t1.`c` AS BIGINT)) <= 16) THEN '15' " \
               "WHEN (16 < (CAST(t1.`e` AS BIGINT) / CAST(t1.`c` AS BIGINT))) " \
               "AND ((CAST(t1.`e` AS BIGINT) / CAST(t1.`c` AS BIGINT)) <= 17) THEN '16' " \
               "WHEN (17 < (CAST(t1.`e` AS BIGINT) / CAST(t1.`c` AS BIGINT))) " \
               "AND ((CAST(t1.`e` AS BIGINT) / CAST(t1.`c` AS BIGINT)) <= 18) " \
               "THEN '17' WHEN (18 < (CAST(t1.`e` AS BIGINT) / CAST(t1.`c` AS BIGINT))) " \
               "AND ((CAST(t1.`e` AS BIGINT) / CAST(t1.`c` AS BIGINT)) <= 19) THEN '18' " \
               "WHEN 19 < (CAST(t1.`e` AS BIGINT) / CAST(t1.`c` AS BIGINT)) THEN '19' END AS `f_cut` \n" \
               "FROM mocked_project.`pyodps_test_expr_table3` t1 \n" \
               "WHERE (CASE WHEN (0 < (CAST(t1.`e` AS BIGINT) / CAST(t1.`c` AS BIGINT))) " \
               "AND ((CAST(t1.`e` AS BIGINT) / CAST(t1.`c` AS BIGINT)) <= 1) THEN '0' " \
               "WHEN (1 < (CAST(t1.`e` AS BIGINT) / CAST(t1.`c` AS BIGINT))) " \
               "AND ((CAST(t1.`e` AS BIGINT) / CAST(t1.`c` AS BIGINT)) <= 2) " \
               "THEN '1' WHEN (2 < (CAST(t1.`e` AS BIGINT) / CAST(t1.`c` AS BIGINT))) " \
               "AND ((CAST(t1.`e` AS BIGINT) / CAST(t1.`c` AS BIGINT)) <= 3) THEN '2' " \
               "WHEN (3 < (CAST(t1.`e` AS BIGINT) / CAST(t1.`c` AS BIGINT))) " \
               "AND ((CAST(t1.`e` AS BIGINT) / CAST(t1.`c` AS BIGINT)) <= 4) THEN '3' " \
               "WHEN (4 < (CAST(t1.`e` AS BIGINT) / CAST(t1.`c` AS BIGINT))) " \
               "AND ((CAST(t1.`e` AS BIGINT) / CAST(t1.`c` AS BIGINT)) <= 5) THEN '4' " \
               "WHEN (5 < (CAST(t1.`e` AS BIGINT) / CAST(t1.`c` AS BIGINT))) " \
               "AND ((CAST(t1.`e` AS BIGINT) / CAST(t1.`c` AS BIGINT)) <= 6) THEN '5' " \
               "WHEN (6 < (CAST(t1.`e` AS BIGINT) / CAST(t1.`c` AS BIGINT))) " \
               "AND ((CAST(t1.`e` AS BIGINT) / CAST(t1.`c` AS BIGINT)) <= 7) THEN '6' " \
               "WHEN (7 < (CAST(t1.`e` AS BIGINT) / CAST(t1.`c` AS BIGINT))) " \
               "AND ((CAST(t1.`e` AS BIGINT) / CAST(t1.`c` AS BIGINT)) <= 8) THEN '7' " \
               "WHEN (8 < (CAST(t1.`e` AS BIGINT) / CAST(t1.`c` AS BIGINT))) " \
               "AND ((CAST(t1.`e` AS BIGINT) / CAST(t1.`c` AS BIGINT)) <= 9) THEN '8' " \
               "WHEN (9 < (CAST(t1.`e` AS BIGINT) / CAST(t1.`c` AS BIGINT))) " \
               "AND ((CAST(t1.`e` AS BIGINT) / CAST(t1.`c` AS BIGINT)) <= 10) THEN '9' " \
               "WHEN (10 < (CAST(t1.`e` AS BIGINT) / CAST(t1.`c` AS BIGINT))) " \
               "AND ((CAST(t1.`e` AS BIGINT) / CAST(t1.`c` AS BIGINT)) <= 11) THEN '10' " \
               "WHEN (11 < (CAST(t1.`e` AS BIGINT) / CAST(t1.`c` AS BIGINT))) " \
               "AND ((CAST(t1.`e` AS BIGINT) / CAST(t1.`c` AS BIGINT)) <= 12) THEN '11' " \
               "WHEN (12 < (CAST(t1.`e` AS BIGINT) / CAST(t1.`c` AS BIGINT))) " \
               "AND ((CAST(t1.`e` AS BIGINT) / CAST(t1.`c` AS BIGINT)) <= 13) THEN '12' " \
               "WHEN (13 < (CAST(t1.`e` AS BIGINT) / CAST(t1.`c` AS BIGINT))) " \
               "AND ((CAST(t1.`e` AS BIGINT) / CAST(t1.`c` AS BIGINT)) <= 14) THEN '13' " \
               "WHEN (14 < (CAST(t1.`e` AS BIGINT) / CAST(t1.`c` AS BIGINT))) " \
               "AND ((CAST(t1.`e` AS BIGINT) / CAST(t1.`c` AS BIGINT)) <= 15) THEN '14' " \
               "WHEN (15 < (CAST(t1.`e` AS BIGINT) / CAST(t1.`c` AS BIGINT))) " \
               "AND ((CAST(t1.`e` AS BIGINT) / CAST(t1.`c` AS BIGINT)) <= 16) THEN '15' " \
               "WHEN (16 < (CAST(t1.`e` AS BIGINT) / CAST(t1.`c` AS BIGINT))) " \
               "AND ((CAST(t1.`e` AS BIGINT) / CAST(t1.`c` AS BIGINT)) <= 17) THEN '16' " \
               "WHEN (17 < (CAST(t1.`e` AS BIGINT) / CAST(t1.`c` AS BIGINT))) " \
               "AND ((CAST(t1.`e` AS BIGINT) / CAST(t1.`c` AS BIGINT)) <= 18) THEN '17' " \
               "WHEN (18 < (CAST(t1.`e` AS BIGINT) / CAST(t1.`c` AS BIGINT))) " \
               "AND ((CAST(t1.`e` AS BIGINT) / CAST(t1.`c` AS BIGINT)) <= 19) THEN '18' " \
               "WHEN 19 < (CAST(t1.`e` AS BIGINT) / CAST(t1.`c` AS BIGINT)) THEN '19' END) == '9'"
    self.assertEqual(str(expected), str(ODPSEngine(self.odps).compile(p2[p2.f_cut == '9'], prettify=False)))
def get_table2_df(self):
    """Return a DataFrame collection expression backed by a mocked second temp table.

    The mocked table carries two string columns (``col21``, ``col22``) and is
    named by the module-level ``TEMP_TABLE_2_NAME`` constant.
    """
    # Layout of the second temporary table: two plain string columns.
    column_names = ['col21', 'col22']
    table_schema = Schema.from_lists(column_names, datatypes('string', 'string'))
    mock_table = MockTable(name=TEMP_TABLE_2_NAME, schema=table_schema)
    return CollectionExpr(_source_data=mock_table, _schema=table_schema)
def testJoin(self):
    """Integration test for DataFrame join variants (inner/left/right/outer)
    executed against a real ODPS backend via ``self.engine``.

    Creates a second physical table, writes fixture rows into it, runs each
    join flavor against ``self.expr`` and checks the materialized results,
    then drops the table in ``finally``.
    """
    # Fixture rows for the primary table (self.expr); trailing None columns
    # presumably fill out the wider schema used by self.expr — see _gen_data.
    data = [
        ['name1', 4, 5.3, None, None],
        ['name2', 2, 3.5, None, None],
        ['name1', 4, 4.2, None, None],
        ['name1', 3, 2.2, None, None],
        ['name1', 3, 4.1, None, None],
    ]
    # Second table: shares 'name' with the primary table; 'id2' is the join
    # counterpart of the primary table's 'id'.
    schema2 = Schema.from_lists(['name', 'id2', 'id3'],
                                [types.string, types.bigint, types.bigint])
    table_name = tn('pyodps_test_engine_table2')
    # Recreate the table from scratch so stale data from earlier runs
    # cannot affect the expected row counts below.
    self.odps.delete_table(table_name, if_exists=True)
    table2 = self.odps.create_table(name=table_name, schema=schema2)
    expr2 = CollectionExpr(_source_data=table2,
                           _schema=odps_schema_to_df_schema(schema2))

    self._gen_data(data=data)

    # Two rows in the right-hand table: only ('name1', 4) matches any
    # (name, id) pair on the left; ('name2', 1) matches on name only.
    data2 = [['name1', 4, -1], ['name2', 1, -2]]
    self.odps.write_table(table2, 0, data2)

    try:
        # Default join (no explicit 'on'): joins on the common column 'name',
        # so every left row pairs with the same-named right row -> 5 rows.
        expr = self.expr.join(expr2)['name', 'id2']
        res = self.engine.execute(expr)
        result = self._get_result(res)
        self.assertEqual(len(result), 5)
        expected = [[to_str('name1'), 4], [to_str('name2'), 1]]
        # Order is not guaranteed, so check membership rather than sequence.
        self.assertTrue(all(it in expected for it in result))

        # Inner join on (name, id == id2): only the two ('name1', 4) left
        # rows match.
        expr = self.expr.join(expr2, on=['name', ('id', 'id2')])[self.expr.name, expr2.id2]
        res = self.engine.execute(expr)
        result = self._get_result(res)
        self.assertEqual(len(result), 2)
        expected = [to_str('name1'), 4]
        self.assertTrue(all(it == expected for it in result))

        # Left join: all 5 left rows survive; non-matching rows get NULL id2.
        expr = self.expr.left_join(expr2, on=['name', ('id', 'id2')])[self.expr.name, expr2.id2]
        res = self.engine.execute(expr)
        result = self._get_result(res)
        expected = [['name1', 4], ['name2', None], ['name1', 4], ['name1', None], ['name1', None]]
        self.assertEqual(len(result), 5)
        self.assertTrue(all(it in expected for it in result))

        # Right join: both right rows survive; the unmatched ('name2', 1)
        # row yields NULL on the left-hand name.
        expr = self.expr.right_join(expr2, on=['name', ('id', 'id2')])[self.expr.name, expr2.id2]
        res = self.engine.execute(expr)
        result = self._get_result(res)
        expected = [
            ['name1', 4],
            ['name1', 4],
            [None, 1],
        ]
        self.assertEqual(len(result), 3)
        self.assertTrue(all(it in expected for it in result))

        # Outer join: union of left-join and right-join results -> 6 rows.
        expr = self.expr.outer_join(expr2, on=['name', ('id', 'id2')])[self.expr.name, expr2.id2]
        res = self.engine.execute(expr)
        result = self._get_result(res)
        expected = [
            ['name1', 4],
            ['name1', 4],
            ['name2', None],
            ['name1', None],
            ['name1', None],
            [None, 1],
        ]
        self.assertEqual(len(result), 6)
        self.assertTrue(all(it in expected for it in result))

        # Regression-style smoke checks: joining against a cached aggregation,
        # and projecting a windowed row_number over the join result. Only
        # execution (no exception) is verified here — no result assertions.
        grouped = self.expr.groupby('name').agg(
            new_id=self.expr.id.sum()).cache()
        self.engine.execute(self.expr.join(grouped, on='name'))

        expr = self.expr.join(expr2, on=[
            'name', ('id', 'id2')
        ])[lambda x: x.groupby(Scalar(1)).sort('name').row_number(), ]
        self.engine.execute(expr)
    finally:
        # Always drop the auxiliary table, even if an assertion fails.
        table2.drop()