class Test(TestBase):
    """Tests for compiling and executing DataFrame expressions with MixedEngine."""

    def setup(self):
        """Build the fixtures shared by the tests.

        Creates a mock-table-backed DataFrame (``self.tb``), a pandas-backed
        DataFrame (``self.pd``), their join on ``name`` (``self.expr``) and the
        ``MixedEngine`` under test (``self.engine``).
        """
        column_names = ["name", "id", "fid", "isMale", "scale", "birth"]
        column_types = [
            validate_data_type(t)
            for t in ("string", "bigint", "double", "boolean", "decimal", "datetime")
        ]
        schema = Schema.from_lists(column_names, column_types)
        table = MockTable(name="pyodps_test_expr_table", schema=schema)
        self.tb = DataFrame(table)

        # Imported locally, as in the original, so pandas is only needed once
        # the fixture is actually built.
        import pandas as pd

        df = pd.DataFrame(
            [["name1", 2, 3.14], ["name2", 100, 2.7]],
            columns=["name", "id", "fid"],
        )
        self.pd = DataFrame(df)

        self.expr = self.tb.join(self.pd, on="name")

        self.engine = MixedEngine(self.odps)

    def testMixedCompile(self):
        """Compiling the table/pandas join yields a two-node DAG."""
        dag = self.engine.compile(self.expr)

        self.assertEqual(len(dag._graph), 2)

        topos = dag.topological_sort()
        root_node, expr_node = topos[0], topos[1]
        root = root_node.expr
        expr = expr_node.expr

        self.assertTrue(expr.is_ancestor(root))
        # The DAG is keyed by node id; the root node must point at the join node.
        self.assertIn(id(expr_node), dag._graph[id(root_node)])
        self.assertEqual(len(available_engines(expr.data_source())), 1)

    def testCacheCompile(self):
        """Each ``cache()`` call splits the expression into its own DAG node."""
        expr = self.tb["name", "id"].cache()
        expr = expr.groupby("name").agg(expr.id.mean()).cache()
        expr = expr.distinct()

        dag = self.engine.compile(expr)

        self.assertEqual(len(dag._graph), 3)

        topos = dag.topological_sort()
        project_node, groupby_node, distinct_node = topos[0], topos[1], topos[2]
        distincted = distinct_node.expr

        # Expected chain: projection -> group-by -> distinct.
        self.assertIn(id(groupby_node), dag._graph[id(project_node)])
        self.assertIn(id(distinct_node), dag._graph[id(groupby_node)])
        self.assertIsInstance(distincted, DistinctCollectionExpr)

    def testDep(self):
        """A pivot table with ``columns`` compiles to two nodes joined by one edge."""
        expr = self.tb.pivot_table(rows="id", columns="name", values="fid")

        dag = self.engine.compile(expr)

        self.assertEqual(len(dag._graph), 2)
        self.assertEqual(sum(len(v) for v in dag._graph.values()), 1)

    def testPivotTable(self):
        """Execute several ``pivot_table`` variants against a real table.

        The table is created for this test and always dropped afterwards.
        """
        data = [
            ["name1", 1, 1.0, True],
            ["name1", 1, 5.0, True],
            ["name1", 2, 2.0, True],
            ["name2", 1, 3.0, False],
            ["name2", 3, 4.0, False],
        ]

        table_name = tn("pyodps_test_mixed_engine_pivot_table")
        self.odps.delete_table(table_name, if_exists=True)
        table = self.odps.create_table(
            name=table_name,
            schema=Schema.from_lists(
                ["name", "id", "fid", "ismale"],
                ["string", "bigint", "double", "boolean"],
            ),
        )

        def none_as_zero(row):
            # Sort key only: None does not order against numbers on Python 3,
            # so treat it as 0 when ordering rows.
            return [0 if item is None else item for item in row]

        try:
            # Built inside the ``try`` so the table is dropped even if wrapping
            # it in a DataFrame fails.
            expr = DataFrame(table)
            self.odps.write_table(table, 0, data)

            # Default aggregation (mean) without pivot columns.
            expr1 = expr.pivot_table(rows="name", values="fid")
            res = self.engine.execute(expr1)
            result = self._get_result(res)

            expected = [["name1", 8.0 / 3], ["name2", 3.5]]
            self.assertEqual(sorted(result), sorted(expected))

            # Several aggregations without pivot columns.
            expr2 = expr.pivot_table(rows="name", values="fid", aggfunc=["mean", "sum"])
            res = self.engine.execute(expr2)
            result = self._get_result(res)

            expected = [["name1", 8.0 / 3, 8.0], ["name2", 3.5, 7.0]]
            self.assertEqual(res.schema.names, ["name", "fid_mean", "fid_sum"])
            self.assertEqual(sorted(result), sorted(expected))

            # Window function (cumsum) computed on top of a pivoted column.
            expr5 = expr.pivot_table(rows="id", values="fid", columns="name", aggfunc=["mean", "sum"])
            expr6 = expr5[
                "name1_fid_mean",
                expr5.groupby(Scalar(1)).sort("name1_fid_mean").name1_fid_mean.astype("float").cumsum(),
            ]

            # TODO: fix this situation, act different compared to pandas
            expected = [[2, 2], [3, 5], [None, None]]
            res = self.engine.execute(expr6)
            result = self._get_result(res)
            self.assertEqual(sorted(result, key=none_as_zero), sorted(expected, key=none_as_zero))

            # Pivot columns with a fill value, followed by distinct.
            expr3 = expr.pivot_table(rows="id", values="fid", columns="name", fill_value=0).distinct()
            res = self.engine.execute(expr3)
            result = self._get_result(res)

            expected = [[1, 3.0, 3.0], [2, 2.0, 0], [3, 0, 4.0]]
            self.assertEqual(res.schema.names, ["id", "name1_fid_mean", "name2_fid_mean"])
            self.assertEqual(result, expected)

            # User-defined aggregation that sums values into a one-slot buffer.
            class Agg(object):
                def buffer(self):
                    return [0]

                def __call__(self, buffer, val):
                    buffer[0] += val

                def merge(self, buffer, pbuffer):
                    buffer[0] += pbuffer[0]

                def getvalue(self, buffer):
                    return buffer[0]

            # OrderedDict keeps the output column order deterministic:
            # all ``my_sum`` columns first, then all ``mean`` columns.
            aggfuncs = OrderedDict([("my_sum", Agg), ("mean", "mean")])
            expr4 = expr.pivot_table(rows="id", values="fid", columns="name", fill_value=0, aggfunc=aggfuncs)
            res = self.engine.execute(expr4)
            result = self._get_result(res)

            expected = [[1, 6.0, 3.0, 3.0, 3.0], [2, 2.0, 0, 2.0, 0], [3, 0, 4.0, 0, 4.0]]
            self.assertEqual(
                res.schema.names,
                ["id", "name1_fid_my_sum", "name2_fid_my_sum", "name1_fid_mean", "name2_fid_mean"],
            )
            self.assertEqual(result, expected)
        finally:
            table.drop()