class Test(TestBase):
    """Compilation tests that run expressions through ``MixedEngine.compile``."""

    def setup(self):
        """Create the fixtures shared by the tests.

        Sets ``self.tb`` (a DataFrame over a mock table), ``self.pd`` (a
        DataFrame over an in-memory pandas frame), ``self.expr`` (their join
        on ``name``) and ``self.engine`` (a ``MixedEngine`` built from
        ``self.odps``).
        """
        column_names = ['name', 'id', 'fid', 'isMale', 'scale', 'birth']
        type_names = ('string', 'bigint', 'double', 'boolean', 'decimal',
                      'datetime')
        column_types = [validate_data_type(type_name)
                        for type_name in type_names]
        schema = Schema.from_lists(column_names, column_types)
        mock_table = MockTable(name='pyodps_test_expr_table', schema=schema)
        self.tb = DataFrame(mock_table)

        # Imported locally, as in the original fixture.
        import pandas as pd

        rows = [['name1', 2, 3.14], ['name2', 100, 2.7]]
        pandas_frame = pd.DataFrame(rows, columns=['name', 'id', 'fid'])
        self.pd = DataFrame(pandas_frame)

        self.expr = self.tb.join(self.pd, on='name')

        self.engine = MixedEngine(self.odps)

    def testMixedCompile(self):
        """Compile the table/pandas join and inspect the two resulting nodes."""
        dag = self.engine.compile(self.expr)

        self.assertEqual(len(dag._graph), 2)

        ordered = dag.topological_sort()
        root_node = ordered[0]
        expr_node = ordered[1]
        upstream = root_node.expr
        downstream = expr_node.expr

        self.assertTrue(downstream.is_ancestor(upstream))
        # ``dag._graph`` is looked up by ``id(node)``; the second node must be
        # listed under the first one.
        successors_of_root = dag._graph[id(root_node)]
        self.assertIn(id(expr_node), successors_of_root)
        engines = available_engines(downstream.data_source())
        self.assertEqual(len(engines), 1)

    def testCacheCompile(self):
        """Compile a projection -> aggregation -> distinct chain with two caches."""
        projected = self.tb['name', 'id'].cache()
        aggregated = projected.groupby('name').agg(projected.id.mean()).cache()
        expr = aggregated.distinct()

        dag = self.engine.compile(expr)

        self.assertEqual(len(dag._graph), 3)

        ordered = dag.topological_sort()
        project_node = ordered[0]
        groupby_node = ordered[1]
        distinct_node = ordered[2]

        # Each node must be listed under the node that precedes it.
        self.assertIn(id(groupby_node), dag._graph[id(project_node)])
        self.assertIn(id(distinct_node), dag._graph[id(groupby_node)])
        self.assertIsInstance(distinct_node.expr, DistinctCollectionExpr)

    def testDep(self):
        """Compile a pivot table with ``columns`` and count DAG entries and links."""
        expr = self.tb.pivot_table(rows='id', columns='name', values='fid')

        dag = self.engine.compile(expr)

        self.assertEqual(len(dag._graph), 2)
        link_count = sum(len(targets) for targets in dag._graph.values())
        self.assertEqual(link_count, 1)
Exemple #2
0
    def testPivotTable(self):
        """Execute several ``pivot_table`` expressions and check their results.

        Creates a temporary table, writes five rows into it, then runs pivot
        tables with the default aggregation, with a list of aggregations, with
        ``columns`` plus ``fill_value``, and with a custom aggregator class.
        The table is dropped at the end whether or not the assertions pass.
        """
        data = [['name1', 1, 1.0, True], ['name1', 1, 5.0, True],
                ['name1', 2, 2.0, True], ['name2', 1, 3.0, False],
                ['name2', 3, 4.0, False]]

        table_name = tn('pyodps_test_mixed_engine_pivot_table')
        self.odps.delete_table(table_name, if_exists=True)
        table = self.odps.create_table(
            name=table_name,
            schema=Schema.from_lists(
                ['name', 'id', 'fid', 'ismale'],
                ['string', 'bigint', 'double', 'boolean']))
        try:
            # Built inside the ``try`` so the table is dropped even if
            # wrapping it in a DataFrame fails.
            expr = DataFrame(table)
            self.odps.write_table(table, 0, data)

            # Default aggregation of ``fid`` per ``name``.
            expr1 = expr.pivot_table(rows='name', values='fid')
            res = self.engine.execute(expr1)
            result = self._get_result(res)

            expected = [
                ['name1', 8.0 / 3],
                ['name2', 3.5],
            ]
            self.assertEqual(sorted(result), sorted(expected))

            # Two aggregations at once.
            expr2 = expr.pivot_table(rows='name',
                                     values='fid',
                                     aggfunc=['mean', 'sum'])
            res = self.engine.execute(expr2)
            result = self._get_result(res)

            expected = [
                ['name1', 8.0 / 3, 8.0],
                ['name2', 3.5, 7.0],
            ]
            self.assertEqual(res.schema.names, ['name', 'fid_mean', 'fid_sum'])
            self.assertEqual(sorted(result), sorted(expected))

            # Pivot on ``name`` with missing cells filled with 0.
            expr3 = expr.pivot_table(rows='id',
                                     values='fid',
                                     columns='name',
                                     fill_value=0).distinct()
            res = self.engine.execute(expr3)
            result = self._get_result(res)

            expected = [[1, 3.0, 3.0], [2, 2.0, 0], [3, 0, 4.0]]

            self.assertEqual(res.schema.names,
                             ['id', 'name1_fid_mean', 'name2_fid_mean'])
            # The expression applies no sort, so compare rows regardless of
            # order, as the assertions above already do.
            self.assertEqual(sorted(result), sorted(expected))

            class Agg(object):
                """Custom aggregator that accumulates a running sum."""

                def buffer(self):
                    # One-element list used as the mutable accumulator.
                    return [0]

                def __call__(self, buffer, val):
                    buffer[0] += val

                def merge(self, buffer, pbuffer):
                    # Fold another accumulator into this one.
                    buffer[0] += pbuffer[0]

                def getvalue(self, buffer):
                    return buffer[0]

            # OrderedDict keeps 'my_sum' ahead of 'mean', matching the column
            # order asserted below.
            aggfuncs = OrderedDict([('my_sum', Agg), ('mean', 'mean')])
            expr4 = expr.pivot_table(rows='id',
                                     values='fid',
                                     columns='name',
                                     fill_value=0,
                                     aggfunc=aggfuncs)
            res = self.engine.execute(expr4)
            result = self._get_result(res)

            expected = [[1, 6.0, 3.0, 3.0, 3.0], [2, 2.0, 0, 2.0, 0],
                        [3, 0, 4.0, 0, 4.0]]

            self.assertEqual(res.schema.names, [
                'id', 'name1_fid_my_sum', 'name2_fid_my_sum', 'name1_fid_mean',
                'name2_fid_mean'
            ])
            # Order-insensitive for the same reason as above.
            self.assertEqual(sorted(result), sorted(expected))
        finally:
            table.drop()
    def testPivotTable(self):
        """Execute several ``pivot_table`` expressions and check their results.

        Creates a temporary table, writes five rows into it, then runs pivot
        tables with the default aggregation, with a list of aggregations, with
        a cumulative sum over a pivoted column, with ``columns`` plus
        ``fill_value``, and with a custom aggregator class. The table is
        dropped at the end whether or not the assertions pass.
        """
        data = [
            ["name1", 1, 1.0, True],
            ["name1", 1, 5.0, True],
            ["name1", 2, 2.0, True],
            ["name2", 1, 3.0, False],
            ["name2", 3, 4.0, False],
        ]

        table_name = tn("pyodps_test_mixed_engine_pivot_table")
        self.odps.delete_table(table_name, if_exists=True)
        table = self.odps.create_table(
            name=table_name,
            schema=Schema.from_lists(["name", "id", "fid", "ismale"], ["string", "bigint", "double", "boolean"]),
        )
        try:
            # Built inside the ``try`` so the table is dropped even if
            # wrapping it in a DataFrame fails.
            expr = DataFrame(table)
            self.odps.write_table(table, 0, data)

            # Default aggregation of ``fid`` per ``name``.
            expr1 = expr.pivot_table(rows="name", values="fid")
            res = self.engine.execute(expr1)
            result = self._get_result(res)

            expected = [["name1", 8.0 / 3], ["name2", 3.5]]
            self.assertEqual(sorted(result), sorted(expected))

            # Two aggregations at once.
            expr2 = expr.pivot_table(rows="name", values="fid", aggfunc=["mean", "sum"])
            res = self.engine.execute(expr2)
            result = self._get_result(res)

            expected = [["name1", 8.0 / 3, 8.0], ["name2", 3.5, 7.0]]
            self.assertEqual(res.schema.names, ["name", "fid_mean", "fid_sum"])
            self.assertEqual(sorted(result), sorted(expected))

            # Select a pivoted column together with its cumulative sum.
            expr5 = expr.pivot_table(rows="id", values="fid", columns="name", aggfunc=["mean", "sum"])
            expr6 = expr5[
                "name1_fid_mean",
                expr5.groupby(Scalar(1)).sort("name1_fid_mean").name1_fid_mean.astype("float").cumsum(),
            ]

            def none_as_zero(row):
                # Sort key only: swap None for 0 so rows holding None can be
                # ordered alongside numeric rows.
                return [0 if item is None else item for item in row]

            # TODO: fix this situation, act different compared to pandas
            expected = [[2, 2], [3, 5], [None, None]]
            res = self.engine.execute(expr6)
            result = self._get_result(res)
            self.assertEqual(sorted(result, key=none_as_zero), sorted(expected, key=none_as_zero))

            # Pivot on ``name`` with missing cells filled with 0.
            expr3 = expr.pivot_table(rows="id", values="fid", columns="name", fill_value=0).distinct()
            res = self.engine.execute(expr3)
            result = self._get_result(res)

            expected = [[1, 3.0, 3.0], [2, 2.0, 0], [3, 0, 4.0]]

            self.assertEqual(res.schema.names, ["id", "name1_fid_mean", "name2_fid_mean"])
            # The expression applies no sort, so compare rows regardless of
            # order, as the assertions above already do.
            self.assertEqual(sorted(result), sorted(expected))

            class Agg(object):
                """Custom aggregator that accumulates a running sum."""

                def buffer(self):
                    # One-element list used as the mutable accumulator.
                    return [0]

                def __call__(self, buffer, val):
                    buffer[0] += val

                def merge(self, buffer, pbuffer):
                    # Fold another accumulator into this one.
                    buffer[0] += pbuffer[0]

                def getvalue(self, buffer):
                    return buffer[0]

            # OrderedDict keeps "my_sum" ahead of "mean", matching the column
            # order asserted below.
            aggfuncs = OrderedDict([("my_sum", Agg), ("mean", "mean")])
            expr4 = expr.pivot_table(rows="id", values="fid", columns="name", fill_value=0, aggfunc=aggfuncs)
            res = self.engine.execute(expr4)
            result = self._get_result(res)

            expected = [[1, 6.0, 3.0, 3.0, 3.0], [2, 2.0, 0, 2.0, 0], [3, 0, 4.0, 0, 4.0]]

            self.assertEqual(
                res.schema.names, ["id", "name1_fid_my_sum", "name2_fid_my_sum", "name1_fid_mean", "name2_fid_mean"]
            )
            # Order-insensitive for the same reason as above.
            self.assertEqual(sorted(result), sorted(expected))
        finally:
            table.drop()