def testBase(self): data = self._gen_data(10, value_range=(-1000, 1000)) expr = self.expr[self.expr.id < 10]['name', lambda x: x.id] result = self._get_result(self.engine.execute(expr).values) self.assertEqual(len([it for it in data if it[1] < 10]), len(result)) if len(result) > 0: self.assertEqual(2, len(result[0])) expr = self.expr[Scalar(3).rename('const'), self.expr.id, (self.expr.id + 1).rename('id2')] res = self.engine.execute(expr) result = self._get_result(res.values) self.assertEqual([c.name for c in res.columns], ['const', 'id', 'id2']) self.assertTrue(all(it[0] == 3 for it in result)) self.assertEqual(len(data), len(result)) self.assertEqual([it[1] + 1 for it in data], [it[2] for it in result]) expr = self.expr.sort('id')[:5] res = self.engine.execute(expr) result = self._get_result(res.values) self.assertEqual(sorted(data, key=lambda it: it[1])[:5], result) expr = self.expr.sort('id')[:5] # test do not use tunnel res = self.engine.execute(expr, use_tunnel=False) result = self._get_result(res.values) self.assertEqual(sorted(data, key=lambda it: it[1])[:5], result)
def testProjectPrune(self): expr = self.expr.select('name', 'id') new_expr = ColumnPruning(expr.to_dag()).prune() self.assertIsInstance(new_expr, ProjectCollectionExpr) self.assertIsNotNone(new_expr.input._source_data) expected = 'SELECT t1.`name`, t1.`id` \n' \ 'FROM mocked_project.`pyodps_test_expr_table` t1' self.assertEqual(expected, ODPSEngine(self.odps).compile(expr, prettify=False)) expr = self.expr[Scalar(3).rename('const'), NullScalar('string').rename('string_const'), self.expr.id] expected = 'SELECT 3 AS `const`, CAST(NULL AS STRING) AS `string_const`, t1.`id` \n' \ 'FROM mocked_project.`pyodps_test_expr_table` t1' self.assertEqual( to_str(expected), to_str(ODPSEngine(self.odps).compile(expr, prettify=False))) expr = self.expr.select( pt=BuiltinFunction('max_pt', args=(self.expr._source_data.name, ))) expected = "SELECT max_pt('pyodps_test_expr_table') AS `pt` \n" \ "FROM mocked_project.`pyodps_test_expr_table` t1" self.assertEqual( to_str(expected), to_str(ODPSEngine(self.odps).compile(expr, prettify=False)))
def testBase(self): data = self._gen_data(10, value_range=(-1000, 1000)) expr = self.expr[self.expr.id < 10]['name', lambda x: x.id] result = self._get_result(self.engine.execute(expr).values) self.assertEqual(len([it for it in data if it[1] < 10]), len(result)) if len(result) > 0: self.assertEqual(2, len(result[0])) expr = self.expr[Scalar(3).rename('const'), self.expr.id, (self.expr.id + 1).rename('id2')] res = self.engine.execute(expr) result = self._get_result(res.values) self.assertEqual([c.name for c in res.columns], ['const', 'id', 'id2']) self.assertTrue(all(it[0] == 3 for it in result)) self.assertEqual(len(data), len(result)) self.assertEqual([it[1] + 1 for it in data], [it[2] for it in result]) expr = self.expr.sort('id')[:5] res = self.engine.execute(expr) result = self._get_result(res.values) self.assertListAlmostEqual( sorted(data, key=lambda it: it[1])[:5], [r[:-1] + [r[-1].replace(tzinfo=None)] for r in result], only_float=False, delta=.001) expr = self.expr[:1].filter(lambda x: x.name == data[1][0]) res = self.engine.execute(expr) self.assertEqual(len(res), 0)
def testJoin(self): data = [ ['name1', 4, 5.3, None, None, None], ['name2', 2, 3.5, None, None, None], ['name1', 4, 4.2, None, None, None], ['name1', 3, 2.2, None, None, None], ['name1', 3, 4.1, None, None, None], ] data2 = [['name1', 4, -1], ['name2', 1, -2]] datatypes = lambda *types: [validate_data_type(t) for t in types] schema2 = Schema.from_lists(['name', 'id2', 'id3'], datatypes('string', 'int64', 'int64')) table_name = tn('pyodps_test_engine_table2') table2 = self._create_table_and_insert_data(table_name, schema2, data2) expr2 = CollectionExpr(_source_data=table2, _schema=schema2) self._gen_data(data=data) try: expr = self.expr.join(expr2)['name', 'id2'] res = self.engine.execute(expr) result = self._get_result(res) self.assertEqual(len(result), 5) expected = [[to_str('name1'), 4], [to_str('name2'), 1]] self.assertTrue(all(it in expected for it in result)) expr = self.expr.join(expr2, on=['name', ('id', 'id2')])[self.expr.name, expr2.id2] res = self.engine.execute(expr) result = self._get_result(res) self.assertEqual(len(result), 2) expected = [to_str('name1'), 4] self.assertTrue(all(it == expected for it in result)) expr = self.expr.left_join(expr2, on=['name', ('id', 'id2')])[self.expr.name, expr2.id2] res = self.engine.execute(expr) result = self._get_result(res) expected = [['name1', 4], ['name2', None], ['name1', 4], ['name1', None], ['name1', None]] self.assertEqual(len(result), 5) self.assertTrue(all(it in expected for it in result)) expr = self.expr.right_join(expr2, on=['name', ('id', 'id2')])[self.expr.name, expr2.id2] res = self.engine.execute(expr) result = self._get_result(res) expected = [ ['name1', 4], ['name1', 4], [None, 1], ] self.assertEqual(len(result), 3) self.assertTrue(all(it in expected for it in result)) if self.sql_engine.name != 'mysql': expr = self.expr.outer_join(expr2, on=['name', ('id', 'id2')])[self.expr.name, expr2.id2] res = self.engine.execute(expr) result = self._get_result(res) expected = [ ['name1', 4], ['name1', 4], ['name2', None], ['name1', None], ['name1', None], [None, 1], ] self.assertEqual(len(result), 6) self.assertTrue(all(it in expected for it in result)) grouped = self.expr.groupby('name').agg( new_id=self.expr.id.sum()).cache() self.engine.execute(self.expr.join(grouped, on='name')) if self.sql_engine.name != 'mysql': expr = self.expr.join(expr2, on=[ 'name', ('id', 'id2') ])[lambda x: x.groupby(Scalar(1)).sort('name').row_number(), ] self.engine.execute(expr) finally: [conn.close() for conn in _engine_to_connections.values()] table2.drop()
def testGroupbyAggregation(self): data = [ ['name1', 4, 5.3, None, None, None], ['name2', 2, 3.5, None, None, None], ['name1', 4, 4.2, None, None, None], ['name1', 3, 2.2, None, None, None], ['name1', 3, 4.1, None, None, None], ] self._gen_data(data=data) expr = self.expr.groupby(['name', 'id'])[lambda x: x.fid.min() * 2 < 8] \ .agg(self.expr.fid.max() + 1, new_id=self.expr.id.sum()) res = self.engine.execute(expr) result = self._get_result(res) expected = [['name1', 3, 5.1, 6], ['name2', 2, 4.5, 2]] result = sorted(result, key=lambda k: k[0]) self.assertListAlmostEqual(expected, result, only_float=False, delta=.001) expr = self.expr.name.value_counts()[:25] expected = [['name1', 4], ['name2', 1]] res = self.engine.execute(expr) result = self._get_result(res) self.assertEqual(expected, result) expr = self.expr.name.topk(25) res = self.engine.execute(expr) result = self._get_result(res) self.assertEqual(expected, result) expr = self.expr.groupby('name').count() res = self.engine.execute(expr) result = self._get_result(res) self.assertEqual([it[1:] for it in expected], result) expected = [['name1', 2], ['name2', 1]] expr = self.expr.groupby('name').id.nunique() res = self.engine.execute(expr) result = self._get_result(res) self.assertEqual([it[1:] for it in expected], result) expr = self.expr[self.expr['id'] > 2].name.value_counts()[:25] expected = [['name1', 4]] res = self.engine.execute(expr) result = self._get_result(res) self.assertEqual(expected, result) expr = self.expr.groupby('name', Scalar(1).rename('constant')) \ .agg(id=self.expr.id.sum()) expected = [['name1', 1, 14], ['name2', 1, 2]] res = self.engine.execute(expr) result = self._get_result(res) self.assertEqual(expected, result) expr = self.expr[:1] expr = expr.groupby('name').agg(expr.id.sum()) res = self.engine.execute(expr) result = self._get_result(res) expected = [['name1', 4]] self.assertEqual(expected, result) expr = self.expr.groupby('id').name.cat(sep=',') res = self.engine.execute(expr) result = self._get_result(res) expected = [['name2'], ['name1,name1'], ['name1,name1']] self.assertEqual(sorted(result), sorted(expected))
def testJoin(self): e = self.expr e1 = self.expr1 e2 = self.expr2 joined = e.join(e1, ['fid'], suffixes=('_tl', '_tr')) self.assertIsInstance(joined, JoinCollectionExpr) self.assertIsInstance(joined, InnerJoin) self.assertNotIsInstance(joined, LeftJoin) self.assertIsInstance(joined._predicate[0], Equal) self.assertEqual(joined._lhs, e) self.assertEqual(joined._rhs, e1) self.assertEqual(joined._how, 'INNER') self.assertEqual( sorted(joined._schema.names), sorted(['name_tl', 'id_tl', 'name_tr', 'id_tr', 'fid'])) self.assertEqual( sorted([t.name for t in joined._schema.types]), sorted(['string', 'int64', 'string', 'int64', 'float64'])) joined = e.inner_join(e1, ['fid', 'id']) self.assertIsInstance(joined, InnerJoin) self.assertNotIsInstance(joined, LeftJoin) predicate = reduce(operator.and_, joined._predicate) pred = predicate.args[0] self.assertIsInstance(pred, Equal) self.assertEqual(pred._lhs.name, 'fid') self.assertEqual(pred._rhs.name, 'fid') pred = predicate.args[1] self.assertIsInstance(pred, Equal) self.assertEqual(pred._lhs.name, 'id') self.assertEqual(pred._rhs.name, 'id') self.assertEqual(joined._lhs, e) self.assertEqual(joined._rhs, e1) self.assertEqual(joined._how, 'INNER') joined = e1.left_join(e, e.name == e1.name) self.assertIsInstance(joined, LeftJoin) self.assertEqual(joined._lhs, e1) self.assertEqual(joined._rhs, e) self.assertEqual(joined._how, 'LEFT OUTER') joined = e1.left_join(e, e.name == e1.name, merge_columns=True) self.assertIsInstance(joined, ProjectCollectionExpr) self.assertIsInstance(joined._input, LeftJoin) self.assertEqual(joined._input._lhs, e1) self.assertEqual(joined._input._rhs, e) self.assertEqual(joined._input._how, 'LEFT OUTER') self.assertIn('name', joined.schema.names) self.assertNotIn('id', joined.schema.names) self.assertNotIn('fid', joined.schema.names) joined = e1.right_join(e, [e.fid == e1.fid, e1.name == e.name]) self.assertIsInstance(joined, RightJoin) self.assertEqual(joined._lhs, e1) self.assertEqual(joined._rhs, e) self.assertEqual(joined._how, 'RIGHT OUTER') joined = e1.right_join(e, [e.id == e1.id]) self.assertIsInstance(joined, RightJoin) # self.assertEqual(joined._predicates, [(e.id, e1.id)]) self.assertEqual(joined._lhs, e1) self.assertEqual(joined._rhs, e) self.assertEqual(joined._how, 'RIGHT OUTER') joined = e1.outer_join(e, [('fid', 'fid'), ('name', 'name')]) self.assertIsInstance(joined, OuterJoin) self.assertEqual(joined._lhs, e1) self.assertEqual(joined._rhs, e) self.assertEqual(joined._how, 'FULL OUTER') joined = e.join(e1, ['fid', 'name'], 'OuTer') self.assertIsInstance(joined, OuterJoin) self.assertNotIsInstance(joined, InnerJoin) # self.assertEqual(joined._predicates, [(e.fid, e1.fid), (e.name, e1.name)]) self.assertEqual(joined._lhs, e) self.assertEqual(joined._rhs, e1) self.assertEqual(joined._how, 'FULL OUTER') # join + in projection e = e['fid', 'name'] joined = e.join(e1, ['fid'], 'LEFT') self.assertIsInstance(joined, LeftJoin) self.assertNotIsInstance(joined, InnerJoin) self.assertEqual(joined._lhs, e) self.assertIsInstance(joined._lhs, ProjectCollectionExpr) self.assertEqual(joined._rhs, e1) self.assertEqual(joined._how, 'LEFT OUTER') e1 = e1['fid', 'id'] joined = e.join(e1, [(e.fid, e1.fid)]) self.assertIsInstance(joined, JoinCollectionExpr) self.assertIsInstance(joined, InnerJoin) self.assertEqual(joined._lhs, e) self.assertEqual(joined._rhs, e1) self.assertEqual(joined._how, 'INNER') # projection on join e1 = self.expr1 e = self.expr joined = e.join(e1, ['fid']) project = joined[e1, e.name] self.assertIsInstance(project, ProjectCollectionExpr) self.assertSequenceEqual(project._schema.names, ['name_y', 'id_y', 'fid', 'name_x']) # on is empty, on source is eqaul, on field cannot transformed, other how self.assertRaises(ValueError, lambda: e.join(e1, [''])) self.assertRaises(ExpressionError, lambda: e.join(e1, [()])) self.assertRaises(ExpressionError, lambda: e.join(e1, e.fid == e.fid)) self.assertRaises(TypeError, lambda: e.join(e1, e.name == e1.fid)) e1 = self.expr1.select(name2=self.expr1.name, id2=self.expr1.id) joined = e.join(e1, on=(e.name == Scalar('tt') + e1.name2)) project = joined[e, e1['name2', ]] self.assertIsInstance(joined, JoinCollectionExpr) self.assertIsInstance(project, ProjectCollectionExpr) self.assertIs( next(f for f in project._fields if f.name == 'name2').input, joined) with self.assertRaises(ExpressionError): self.expr.join(self.expr1, on=self.expr.id == self.expr2.id) with self.assertRaises(ExpressionError): expr = self.expr.join(self.expr1, on='id') self.expr.join(expr, on=self.expr.id == self.expr.id) # first __setitem__, then join e = self.expr e1 = self.expr1['name', self.expr1.fid.rename('fid2')] e1['fid2'] = e1.fid2.astype('string') joined = e.join(e1, on='name') self.assertIsInstance(joined, JoinCollectionExpr) self.assertListEqual(joined.dtypes.names, ['name', 'id', 'fid', 'fid2']) self.assertEqual(joined.fid2.dtype.name, 'string')
def testGroupbyAggregation(self): data = [ ['name1', 4, 5.3, None, None], ['name2', 2, 3.5, None, None], ['name1', 4, 4.2, None, None], ['name1', 3, 2.2, None, None], ['name1', 3, 4.1, None, None], ] self._gen_data(data=data) field = self.expr.groupby('name').sort(['id', -self.expr.fid]).row_number() expr = self.expr['name', 'id', 'fid', field] res = self.engine.execute(expr) result = self._get_result(res) expected = [ ['name1', 3, 4.1, 1], ['name1', 3, 2.2, 2], ['name1', 4, 5.3, 3], ['name1', 4, 4.2, 4], ['name2', 2, 3.5, 1], ] result = sorted(result, key=lambda k: (k[0], k[1], -k[2])) self.assertEqual(expected, result) expr = self.expr.name.value_counts(dropna=True)[:25] expected = [['name1', 4], ['name2', 1]] res = self.engine.execute(expr) result = self._get_result(res) self.assertEqual(expected, result) expr = self.expr.name.topk(25) res = self.engine.execute(expr) result = self._get_result(res) self.assertEqual(expected, result) expr = self.expr.groupby('name').count() res = self.engine.execute(expr) result = self._get_result(res) self.assertEqual(sorted([it[1:] for it in expected]), sorted(result)) expected = [['name1', 2], ['name2', 1]] expr = self.expr.groupby('name').id.nunique() res = self.engine.execute(expr) result = self._get_result(res) self.assertEqual([it[1:] for it in expected], result) expr = self.expr[self.expr['id'] > 2].name.value_counts()[:25] expected = [['name1', 4]] res = self.engine.execute(expr) result = self._get_result(res) self.assertEqual(expected, result) expr = self.expr.groupby('name', Scalar(1).rename('constant')) \ .agg(id=self.expr.id.sum()) expected = [['name1', 1, 14], ['name2', 1, 2]] res = self.engine.execute(expr) result = self._get_result(res) self.assertEqual(expected, result) expr = self.expr[:1] expr = expr.groupby('name').agg(expr.id.sum()) res = self.engine.execute(expr) result = self._get_result(res) expected = [['name1', 4]] self.assertEqual(expected, result)
def testPivotTable(self): data = [['name1', 1, 1.0, True, None], ['name1', 1, 5.0, True, None], ['name1', 2, 2.0, True, None], ['name2', 1, 3.0, False, None], ['name2', 3, 4.0, False, None]] self._gen_data(data=data) expr = self.expr expr1 = expr.pivot_table(rows='name', values='fid') res = self.engine.execute(expr1) result = self._get_result(res) expected = [ ['name1', 8.0 / 3], ['name2', 3.5], ] self.assertListAlmostEqual(sorted(result), sorted(expected), only_float=False) expr2 = expr.pivot_table(rows='name', values='fid', aggfunc=['mean', 'sum']) res = self.engine.execute(expr2) result = self._get_result(res) expected = [ ['name1', 8.0 / 3, 8.0], ['name2', 3.5, 7.0], ] self.assertEqual(res.schema.names, ['name', 'fid_mean', 'fid_sum']) self.assertListAlmostEqual(sorted(result), sorted(expected), only_float=False) expr5 = expr.pivot_table(rows='id', values='fid', columns='name', aggfunc=['mean', 'sum']) expr6 = expr5['name1_fid_mean', expr5.groupby(Scalar(1)).sort('name1_fid_mean'). name1_fid_mean.astype('float').cumsum()] k = lambda x: list(0 if it is None else it for it in x) expected = [[2, 2], [3, 5], [None, 5]] res = self.engine.execute(expr6) result = self._get_result(res) self.assertEqual(sorted(result, key=k), sorted(expected, key=k)) expr3 = expr.pivot_table(rows='id', values='fid', columns='name', fill_value=0).distinct() res = self.engine.execute(expr3) result = self._get_result(res) expected = [ [2, 0, 2.0], [3, 4.0, 0], [1, 3.0, 3.0], ] self.assertEqual(res.schema.names, ['id', 'name2_fid_mean', 'name1_fid_mean']) self.assertEqual(result, expected) expr7 = expr.pivot_table(rows='id', values='fid', columns='name', aggfunc=['mean', 'sum']).cache() self.assertEqual(len(self.engine.execute(expr7)), 3) expr8 = self.expr.pivot_table(rows='id', values='fid', columns='name') self.assertEqual(len(self.engine.execute(expr8)), 3) self.assertNotIsInstance(expr8.schema, DynamicSchema) expr9 = (expr8['name1_fid_mean'] - expr8['name2_fid_mean']).rename('substract') self.assertEqual(len(self.engine.execute(expr9)), 3) expr10 = expr8.distinct() self.assertEqual(len(self.engine.execute(expr10)), 3)
def testJoin(self): data = [ ['name1', 4, 5.3, None, None], ['name2', 2, 3.5, None, None], ['name1', 4, 4.2, None, None], ['name1', 3, 2.2, None, None], ['name1', 3, 4.1, None, None], ] schema2 = Schema.from_lists(['name', 'id2', 'id3'], [types.string, types.bigint, types.bigint]) table_name = tn('pyodps_test_engine_table2') self.odps.delete_table(table_name, if_exists=True) table2 = self.odps.create_table(name=table_name, schema=schema2) expr2 = CollectionExpr(_source_data=table2, _schema=odps_schema_to_df_schema(schema2)) self._gen_data(data=data) data2 = [['name1', 4, -1], ['name2', 1, -2]] self.odps.write_table(table2, 0, data2) try: expr = self.expr.join(expr2)['name', 'id2'] res = self.engine.execute(expr) result = self._get_result(res) self.assertEqual(len(result), 5) expected = [[to_str('name1'), 4], [to_str('name2'), 1]] self.assertTrue(all(it in expected for it in result)) expr = self.expr.join(expr2, on=['name', ('id', 'id2')])[self.expr.name, expr2.id2] res = self.engine.execute(expr) result = self._get_result(res) self.assertEqual(len(result), 2) expected = [to_str('name1'), 4] self.assertTrue(all(it == expected for it in result)) expr = self.expr.left_join(expr2, on=['name', ('id', 'id2')])[self.expr.name, expr2.id2] res = self.engine.execute(expr) result = self._get_result(res) expected = [['name1', 4], ['name2', None], ['name1', 4], ['name1', None], ['name1', None]] self.assertEqual(len(result), 5) self.assertTrue(all(it in expected for it in result)) expr = self.expr.right_join(expr2, on=['name', ('id', 'id2')])[self.expr.name, expr2.id2] res = self.engine.execute(expr) result = self._get_result(res) expected = [ ['name1', 4], ['name1', 4], [None, 1], ] self.assertEqual(len(result), 3) self.assertTrue(all(it in expected for it in result)) expr = self.expr.outer_join(expr2, on=['name', ('id', 'id2')])[self.expr.name, expr2.id2] res = self.engine.execute(expr) result = self._get_result(res) expected = [ ['name1', 4], ['name1', 4], ['name2', None], ['name1', None], ['name1', None], [None, 1], ] self.assertEqual(len(result), 6) self.assertTrue(all(it in expected for it in result)) grouped = self.expr.groupby('name').agg( new_id=self.expr.id.sum()).cache() self.engine.execute(self.expr.join(grouped, on='name')) expr = self.expr.join(expr2, on=[ 'name', ('id', 'id2') ])[lambda x: x.groupby(Scalar(1)).sort('name').row_number(), ] self.engine.execute(expr) finally: table2.drop()