Esempio n. 1
0
 def setup(self):
     from odps.df.expr.tests.core import MockTable
     schema = Schema.from_lists(types._data_types.keys(),
                                types._data_types.values())
     self.expr = CollectionExpr(_source_data=None, _schema=schema)
     self.sourced_expr = CollectionExpr(
         _source_data=MockTable(client=self.odps.rest), _schema=schema)
    def setup(self):
        datatypes = lambda *types: [validate_data_type(t) for t in types]
        schema = Schema.from_lists(
            ['name', 'id', 'fid', 'isMale', 'scale', 'birth'],
            datatypes('string', 'int64', 'float64', 'boolean', 'decimal',
                      'datetime'), ['ds'], datatypes('string'))
        table = MockTable(name='pyodps_test_expr_table', schema=schema)
        self.expr = CollectionExpr(_source_data=table,
                                   _schema=Schema(columns=schema.columns))

        table1 = MockTable(name='pyodps_test_expr_table1', schema=schema)
        self.expr1 = CollectionExpr(_source_data=table1,
                                    _schema=Schema(columns=schema.columns))

        table2 = MockTable(name='pyodps_test_expr_table2', schema=schema)
        self.expr2 = CollectionExpr(_source_data=table2,
                                    _schema=Schema(columns=schema.columns))

        schema2 = Schema.from_lists(['name', 'id', 'fid'],
                                    datatypes('string', 'int64', 'float64'),
                                    ['part1', 'part2'],
                                    datatypes('string', 'int64'))
        table3 = MockTable(name='pyodps_test_expr_table2', schema=schema2)
        self.expr3 = CollectionExpr(_source_data=table3,
                                    _schema=Schema(columns=schema2.columns))
Esempio n. 3
0
    def setup(self):
        datatypes = lambda *types: [validate_data_type(t) for t in types]
        schema = Schema.from_lists(
            ['name', 'id', 'fid', 'isMale', 'scale', 'birth'],
            datatypes('string', 'int64', 'float64', 'boolean', 'decimal',
                      'datetime'), ['ds'], datatypes('string'))
        table = MockTable(name='pyodps_test_expr_table', schema=schema)
        self.expr = CollectionExpr(_source_data=table, _schema=schema)

        table1 = MockTable(name='pyodps_test_expr_table1', schema=schema)
        self.expr1 = CollectionExpr(_source_data=table1, _schema=schema)

        table2 = MockTable(name='pyodps_test_expr_table2', schema=schema)
        self.expr2 = CollectionExpr(_source_data=table2, _schema=schema)

        schema2 = Schema.from_lists(['name', 'id', 'fid'],
                                    datatypes('string', 'int64', 'float64'),
                                    ['part1', 'part2'],
                                    datatypes('string', 'int64'))
        table3 = MockTable(name='pyodps_test_expr_table2', schema=schema2)
        self.expr3 = CollectionExpr(_source_data=table3, _schema=schema2)

        schema3 = Schema.from_lists(['id', 'name', 'relatives', 'hobbies'],
                                    datatypes('int64', 'string',
                                              'dict<string, string>',
                                              'list<string>'))
        table4 = MockTable(name='pyodps_test_expr_table', schema=schema)
        self.expr4 = CollectionExpr(_source_data=table4, _schema=schema3)

        self.maxDiff = None
    def setup(self):
        datatypes = lambda *types: [validate_data_type(t) for t in types]
        schema = Schema.from_lists(['name', 'id'], datatypes('string', 'int64'))
        table = MockTable(name='pyodps_test_expr_table', schema=schema)

        self.expr = CollectionExpr(_source_data=table, _schema=schema)

        schema2 = Schema.from_lists(['name2', 'id2'], datatypes('string', 'int64'))
        table2 = MockTable(name='pyodps_test_expr_table2', schema=schema2)
        self.expr2 = CollectionExpr(_source_data=table2, _schema=schema2)
    def setup(self):
        datatypes = lambda *types: [validate_data_type(t) for t in types]
        schema = Schema.from_lists(
            ['name', 'id', 'fid', 'isMale', 'scale', 'birth'],
            datatypes('string', 'int64', 'float64', 'boolean', 'decimal',
                      'datetime'))
        self.schema = df_schema_to_odps_schema(schema)
        table_name = tn('pyodps_test_selecter_table_%s' %
                        str(uuid.uuid4()).replace('-', '_'))
        self.odps.delete_table(table_name, if_exists=True)
        self.table = self.odps.create_table(name=table_name,
                                            schema=self.schema)
        self.expr = CollectionExpr(_source_data=self.table, _schema=schema)

        class FakeBar(object):
            def update(self, *args, **kwargs):
                pass

            def inc(self, *args, **kwargs):
                pass

            def status(self, *args, **kwargs):
                pass

        self.faked_bar = FakeBar()

        data = [
            ['name1', 4, 5.3, None, None, None],
            ['name2', 2, 3.5, None, None, None],
            ['name1', 4, 4.2, None, None, None],
            ['name1', 3, 2.2, None, None, None],
            ['name1', 3, 4.1, None, None, None],
        ]

        schema2 = Schema.from_lists(['name', 'id2', 'id3'],
                                    [types.string, types.bigint, types.bigint])

        table_name = tn('pyodps_test_selecter_table2')
        self.odps.delete_table(table_name, if_exists=True)
        table2 = self.odps.create_table(name=table_name, schema=schema2)
        self.expr2 = CollectionExpr(_source_data=table2,
                                    _schema=odps_schema_to_df_schema(schema2))

        self._gen_data(data=data)

        data2 = [['name1', 4, -1], ['name2', 1, -2]]

        self.odps.write_table(table2, 0, data2)

        self.selecter = EngineSelecter()
    def setup(self):
        datatypes = lambda *types: [validate_data_type(t) for t in types]
        schema = Schema.from_lists(
            ['name', 'id', 'fid', 'isMale', 'birth', 'scale'][:5],
            datatypes('string', 'int64', 'float64', 'boolean', 'datetime',
                      'decimal')[:5])
        self.schema = df_schema_to_odps_schema(schema)
        table_name = tn('pyodps_test_%s' % str(uuid.uuid4()).replace('-', '_'))
        self.odps.delete_table(table_name, if_exists=True)
        self.table = self.odps.create_table(name=table_name,
                                            schema=self.schema)
        self.expr = CollectionExpr(_source_data=self.table, _schema=schema)

        self.engine = SeahawksEngine(self.odps)

        class FakeBar(object):
            def update(self, *args, **kwargs):
                pass

            def inc(self, *args, **kwargs):
                pass

            def status(self, *args, **kwargs):
                pass

        self.faked_bar = FakeBar()
    def testSVGFormatter(self):
        t = MockTable(name='pyodps_test_svg', schema=self.schema, _client=self.odps.rest)
        expr = CollectionExpr(_source_data=t, _schema=self.schema)

        expr1 = expr.groupby('name').agg(id=expr['id'].sum())
        expr2 = expr1['name', expr1.id + 3]

        engine = MixedEngine(self.odps)
        dag = engine.compile(expr2)
        nodes = dag.nodes()
        self.assertEqual(len(nodes), 1)
        expr3 = nodes[0].expr
        self.assertIsInstance(expr3, GroupByCollectionExpr)
        dot = ExprExecutionGraphFormatter(dag)._to_dot()
        self.assertNotIn('Projection', dot)

        expr1 = expr.groupby('name').agg(id=expr['id'].sum()).cache()
        expr2 = expr1['name', expr1.id + 3]

        engine = MixedEngine(self.odps)
        dag = engine.compile(expr2)
        nodes = dag.nodes()
        self.assertEqual(len(nodes), 2)
        dot = ExprExecutionGraphFormatter(dag)._to_dot()
        self.assertIn('Projection', dot)
    def setup(self):
        datatypes = lambda *types: [validate_data_type(t) for t in types]
        schema = Schema.from_lists(['name', 'id', 'fid', 'isMale', 'scale', 'birth'],
                                   datatypes('string', 'int64', 'float64', 'boolean', 'decimal', 'datetime'))
        table = MockTable(name='pyodps_test_expr_table', schema=schema)

        self.expr = CollectionExpr(_source_data=table, _schema=schema)
    def testMakeKV(self):
        from odps import types as odps_types
        data = [
            ['name1', 1.0, 3.0, None, 10.0, None, None],
            ['name1', None, 3.0, 5.1, None, None, None],
            ['name1', 7.1, None, None, None, 8.2, None],
            ['name2', None, 1.2, 1.5, None, None, None],
            ['name2', None, 1.0, None, None, None, 1.1],
        ]
        kv_cols = ['k1', 'k2', 'k3', 'k5', 'k7', 'k9']
        schema = Schema.from_lists(['name'] + kv_cols, [odps_types.string] +
                                   [odps_types.double] * 6)
        table_name = tn('pyodps_test_engine_make_kv')
        self.odps.delete_table(table_name, if_exists=True)
        table = self.odps.create_table(name=table_name, schema=schema)
        expr = CollectionExpr(_source_data=table,
                              _schema=odps_schema_to_df_schema(schema))
        try:
            self.odps.write_table(table, 0, data)
            expr1 = expr.to_kv(columns=kv_cols, kv_delim='=')

            res = self.engine.execute(expr1)
            result = self._get_result(res)

            expected = [
                ['name1', 'k1=1,k2=3,k5=10'],
                ['name1', 'k2=3,k3=5.1'],
                ['name1', 'k1=7.1,k7=8.2'],
                ['name2', 'k2=1.2,k3=1.5'],
                ['name2', 'k2=1,k9=1.1'],
            ]

            self.assertListEqual(result, expected)
        finally:
            table.drop()
Esempio n. 10
0
    def setup(self):
        datatypes = lambda *types: [validate_data_type(t) for t in types]
        schema = Schema.from_lists(['name', 'id', 'fid', 'isMale', 'scale', 'birth'],
                                   datatypes('string', 'int64', 'float64', 'boolean', 'decimal', 'datetime'))
        self.df_schema = schema
        self.schema = df_schema_to_odps_schema(schema)
        self.df = None
        self.expr = None

        self.engine = SQLAlchemyEngine()

        import sqlalchemy
        from sqlalchemy import create_engine

        self.sql_engine = engine = create_engine('postgres://localhost/pyodps')
        # self.sql_engine = engine = create_engine('mysql://localhost/pyodps')
        # self.sql_engine = engine = create_engine('sqlite://')
        self.conn = engine.connect()

        self.metadata = metadata = sqlalchemy.MetaData(bind=engine)
        columns = df_schema_to_sqlalchemy_columns(self.df_schema, engine=self.sql_engine)
        t = sqlalchemy.Table('pyodps_test_data', metadata, *columns)

        metadata.create_all()

        self.table = t
        self.expr = CollectionExpr(_source_data=self.table, _schema=self.df_schema)

        class FakeBar(object):
            def update(self, *args, **kwargs):
                pass
        self.faked_bar = FakeBar()
    def setup(self):
        datatypes = lambda *types: [validate_data_type(t) for t in types]
        schema = Schema.from_lists(['name', 'id', 'fid'],
                                   datatypes('string', 'int64', 'float64'))
        table = MockTable(name='pyodps_test_expr_table', schema=schema)

        self.expr = CollectionExpr(_source_data=table, _schema=schema)
        self.ctx = ExecuteContext()
    def testScaleValue(self):
        data = [
            ['name1', 4, 5.3],
            ['name2', 2, 3.5],
            ['name1', 4, 4.2],
            ['name1', 3, 2.2],
            ['name1', 3, 4.1],
        ]
        schema = Schema.from_lists(['name', 'id', 'fid'],
                                   [types.string, types.bigint, types.double])
        table_name = tn('pyodps_test_engine_scale_table')
        self.odps.delete_table(table_name, if_exists=True)
        table = self.odps.create_table(name=table_name, schema=schema)
        self.odps.write_table(table_name, 0, data)
        expr_input = CollectionExpr(_source_data=table,
                                    _schema=odps_schema_to_df_schema(schema))

        expr = expr_input.min_max_scale(columns=['fid'])

        res = self.engine.execute(expr)
        result = self._get_result(res)

        expected = [['name1', 4, 1.0], ['name2', 2, 0.41935483870967744],
                    ['name1', 4, 0.6451612903225807], ['name1', 3, 0.0],
                    ['name1', 3, 0.6129032258064515]]

        result = sorted(result)
        expected = sorted(expected)

        for first, second in zip(result, expected):
            self.assertEqual(len(first), len(second))
            for it1, it2 in zip(first, second):
                self.assertAlmostEqual(it1, it2)

        expr = expr_input.std_scale(columns=['fid'])

        res = self.engine.execute(expr)
        result = self._get_result(res)

        expected = [['name1', 4, 1.4213602653434203],
                    ['name2', 2, -0.3553400663358544],
                    ['name1', 4, 0.3355989515394193],
                    ['name1', 3, -1.6385125281042194],
                    ['name1', 3, 0.23689337755723686]]

        result = sorted(result)
        expected = sorted(expected)

        for first, second in zip(result, expected):
            self.assertEqual(len(first), len(second))
            for it1, it2 in zip(first, second):
                self.assertAlmostEqual(it1, it2)
    def testUnion(self):
        data = [
            ['name1', 4, 5.3, None, None, None],
            ['name2', 2, 3.5, None, None, None],
            ['name1', 4, 4.2, None, None, None],
            ['name1', 3, 2.2, None, None, None],
            ['name1', 3, 4.1, None, None, None],
        ]

        schema2 = Schema.from_lists(['name', 'id2', 'id3'],
                                    [types.string, types.bigint, types.bigint])
        table_name = 'pyodps_test_engine_table2'
        self.odps.delete_table(table_name, if_exists=True)
        table2 = self.odps.create_table(name=table_name, schema=schema2)
        expr2 = CollectionExpr(_source_data=table2, _schema=odps_schema_to_df_schema(schema2))

        self._gen_data(data=data)

        data2 = [
            ['name3', 5, -1],
            ['name4', 6, -2]
        ]

        self.odps.write_table(table2, 0, [table2.new_record(values=d) for d in data2])

        try:
            expr = self.expr['name', 'id'].distinct().union(expr2[expr2.id2.rename('id'), 'name'])

            res = self.engine.execute(expr)
            result = self._get_result(res)

            expected = [
                ['name1', 4],
                ['name1', 3],
                ['name2', 2],
                ['name3', 5],
                ['name4', 6]
            ]

            result = sorted(result)
            expected = sorted(expected)

            self.assertEqual(len(result), len(expected))
            for e, r in zip(result, expected):
                self.assertEqual([to_str(t) for t in e],
                                 [to_str(t) for t in r])

        finally:
            table2.drop()
Esempio n. 14
0
    def testUnion(self):
        data = [
            ['name1', 4, 5.3, None, None, None],
            ['name2', 2, 3.5, None, None, None],
            ['name1', 4, 4.2, None, None, None],
            ['name1', 3, 2.2, None, None, None],
            ['name1', 3, 4.1, None, None, None],
        ]

        data2 = [
            ['name3', 5, -1],
            ['name4', 6, -2]
        ]

        datatypes = lambda *types: [validate_data_type(t) for t in types]
        schema2 = Schema.from_lists(['name', 'id2', 'id3'],
                                    datatypes('string', 'int64', 'int64'))
        table_name = tn('pyodps_test_engine_table2')
        table2 = self._create_table_and_insert_data(table_name, schema2, data2)
        expr2 = CollectionExpr(_source_data=table2, _schema=schema2)

        self._gen_data(data=data)

        try:
            expr = self.expr['name', 'id'].distinct().union(expr2[expr2.id2.rename('id'), 'name'])

            res = self.engine.execute(expr)
            result = self._get_result(res)

            expected = [
                ['name1', 4],
                ['name1', 3],
                ['name2', 2],
                ['name3', 5],
                ['name4', 6]
            ]

            result = sorted(result)
            expected = sorted(expected)

            self.assertEqual(len(result), len(expected))
            for e, r in zip(result, expected):
                self.assertEqual([to_str(t) for t in e],
                                 [to_str(t) for t in r])

        finally:
            [conn.close() for conn in _engine_to_connections.values()]
            table2.drop()
    def testBizarreField(self):
        def my_func(row):
            return getattr(row, '012') * 2.0

        datatypes = lambda *types: [validate_data_type(t) for t in types]
        schema = Schema.from_lists(['name', 'id', 'fid', '012'],
                                   datatypes('string', 'int64', 'float64', 'float64'))

        table = MockTable(name='pyodps_test_expr_table', schema=schema)
        expr = CollectionExpr(_source_data=table, _schema=schema)

        self.engine.compile(expr.apply(my_func, axis=1, names=['out_col'], types=['float64']))
        udtf = list(self.engine._ctx._func_to_udfs.values())[0]
        udtf = get_function(udtf, UDF_CLASS_NAME)
        self.assertEqual([20, 40],
                         runners.simple_run(udtf, [('name1', 1, None, 10), ('name2', 2, None, 20)]))
    def testJoin(self):
        data = [
            ['name1', 4, 5.3, None, None, None],
            ['name2', 2, 3.5, None, None, None],
            ['name1', 4, 4.2, None, None, None],
            ['name1', 3, 2.2, None, None, None],
            ['name1', 3, 4.1, None, None, None],
        ]

        schema2 = Schema.from_lists(['name', 'id2', 'id3'],
                                    [types.string, types.bigint, types.bigint])
        table_name = 'pyodps_test_engine_table2'
        self.odps.delete_table(table_name, if_exists=True)
        table2 = self.odps.create_table(name=table_name, schema=schema2)
        expr2 = CollectionExpr(_source_data=table2, _schema=odps_schema_to_df_schema(schema2))

        self._gen_data(data=data)

        data2 = [
            ['name1', 4, -1],
            ['name2', 1, -2]
        ]

        self.odps.write_table(table2, 0, [table2.new_record(values=d) for d in data2])

        try:
            expr = self.expr.join(expr2)['name', 'id2']

            res = self.engine.execute(expr)
            result = self._get_result(res)

            self.assertEqual(len(result), 5)
            expected = [
                [to_str('name1'), 4],
                [to_str('name2'), 1]
            ]
            self.assertTrue(all(it in expected for it in result))

            expr = self.expr.join(expr2, on=['name', ('id', 'id2')])[self.expr.name, expr2.id2]
            res = self.engine.execute(expr)
            result = self._get_result(res)
            self.assertEqual(len(result), 2)
            expected = [to_str('name1'), 4]
            self.assertTrue(all(it == expected for it in result))

        finally:
            table2.drop()
    def setup(self):
        datatypes = lambda *types: [validate_data_type(t) for t in types]
        schema = Schema.from_lists(['name', 'id', 'fid', 'isMale', 'scale', 'birth'],
                                   datatypes('string', 'int64', 'float64', 'boolean', 'decimal', 'datetime'))
        self.schema = df_schema_to_odps_schema(schema)
        table_name = 'pyodps_test_engine_table'
        self.odps.delete_table(table_name, if_exists=True)
        self.table = self.odps.create_table(
                name='pyodps_test_engine_table', schema=self.schema)
        self.expr = CollectionExpr(_source_data=self.table, _schema=schema)

        self.engine = ODPSEngine(self.odps)

        class FakeBar(object):
            def update(self, *args, **kwargs):
                pass
        self.faked_bar = FakeBar()
    def testJoinGroupby(self):
        data = [
            ['name1', 4, 5.3, None, None],
            ['name2', 2, 3.5, None, None],
            ['name1', 4, 4.2, None, None],
            ['name1', 3, 2.2, None, None],
            ['name1', 3, 4.1, None, None],
        ]

        schema2 = Schema.from_lists(['name', 'id2', 'id3'],
                                    [types.string, types.bigint, types.bigint])

        table_name = tn('pyodps_test_engine_table2')
        self.odps.delete_table(table_name, if_exists=True)
        table2 = self.odps.create_table(name=table_name, schema=schema2)
        expr2 = CollectionExpr(_source_data=table2,
                               _schema=odps_schema_to_df_schema(schema2))

        self._gen_data(data=data)

        data2 = [['name1', 4, -1], ['name2', 1, -2]]

        self.odps.write_table(table2, 0, data2)

        expr = self.expr.join(expr2, on='name')[self.expr]
        expr = expr.groupby('id').agg(expr.fid.sum())

        res = self.engine.execute(expr)
        result = self._get_result(res)

        id_idx = [
            idx for idx, col in enumerate(self.expr.schema.names)
            if col == 'id'
        ][0]
        fid_idx = [
            idx for idx, col in enumerate(self.expr.schema.names)
            if col == 'fid'
        ][0]
        expected = [[k, sum(
            v[fid_idx] for v in row)] for k, row in itertools.groupby(
                sorted(data, key=lambda r: r[id_idx]), lambda r: r[id_idx])]
        for it in zip(sorted(expected, key=lambda it: it[0]),
                      sorted(result, key=lambda it: it[0])):
            self.assertAlmostEqual(it[0][0], it[1][0])
            self.assertAlmostEqual(it[0][1], it[1][1])
Esempio n. 19
0
    def testCallableColumn(self):
        from odps.df.expr.expressions import CallableColumn
        from odps.df.expr.collections import ProjectCollectionExpr

        schema = Schema.from_lists(['name', 'f1', 'append_id'],
                                   [types.string, types.float64, types.int64])
        expr = CollectionExpr(_source_data=None, _schema=schema)
        self.assertIsInstance(expr.append_id, CallableColumn)
        self.assertNotIsInstance(expr.f1, CallableColumn)

        projected = expr[expr.name, expr.append_id]
        self.assertIsInstance(projected, ProjectCollectionExpr)
        self.assertListEqual(projected.schema.names, ['name', 'append_id'])

        projected = expr[expr.name, expr.f1]
        self.assertNotIsInstance(projected.append_id, CallableColumn)

        appended = expr.append_id(id_col='id_col')
        self.assertIn('id_col', appended.schema)
Esempio n. 20
0
    def testJoinGroupby(self):
        data = [
            ['name1', 4, 5.3, None, None, None],
            ['name2', 2, 3.5, None, None, None],
            ['name1', 4, 4.2, None, None, None],
            ['name1', 3, 2.2, None, None, None],
            ['name1', 3, 4.1, None, None, None],
        ]

        data2 = [['name1', 4, -1], ['name2', 1, -2]]

        datatypes = lambda *types: [validate_data_type(t) for t in types]
        schema2 = Schema.from_lists(['name', 'id2', 'id3'],
                                    datatypes('string', 'int64', 'int64'))
        table_name = tn('pyodps_test_engine_table2')
        table2 = self._create_table_and_insert_data(table_name, schema2, data2)
        expr2 = CollectionExpr(_source_data=table2, _schema=schema2)

        self._gen_data(data=data)

        expr = self.expr.join(expr2, on='name')[self.expr]
        expr = expr.groupby('id').agg(expr.fid.sum())

        res = self.engine.execute(expr)
        result = self._get_result(res)

        id_idx = [
            idx for idx, col in enumerate(self.expr.schema.names)
            if col == 'id'
        ][0]
        fid_idx = [
            idx for idx, col in enumerate(self.expr.schema.names)
            if col == 'fid'
        ][0]
        expected = [[k, sum(
            v[fid_idx] for v in row)] for k, row in itertools.groupby(
                sorted(data, key=lambda r: r[id_idx]), lambda r: r[id_idx])]
        for it in zip(sorted(expected, key=lambda it: it[0]),
                      sorted(result, key=lambda it: it[0])):
            self.assertAlmostEqual(it[0][0], it[1][0])
            self.assertAlmostEqual(it[0][1], it[1][1])
Esempio n. 21
0
    def testApplyMap(self):
        from odps.df.expr.collections import ProjectCollectionExpr, Column
        from odps.df.expr.element import MappedExpr

        schema = Schema.from_lists(['idx', 'f1', 'f2', 'f3'],
                                   [types.int64] + [types.float64] * 3)
        expr = CollectionExpr(_source_data=None, _schema=schema)

        self.assertRaises(
            ValueError, lambda: expr.applymap(
                lambda v: v + 1, columns='idx', excludes='f1'))

        mapped = expr.applymap(lambda v: v + 1)
        self.assertIsInstance(mapped, ProjectCollectionExpr)
        for c in mapped._fields:
            self.assertIsInstance(c, MappedExpr)

        mapped = expr.applymap(lambda v: v + 1, columns='f1')
        self.assertIsInstance(mapped, ProjectCollectionExpr)
        for c in mapped._fields:
            self.assertIsInstance(c, MappedExpr if c.name == 'f1' else Column)

        map_cols = set(['f1', 'f2', 'f3'])
        mapped = expr.applymap(lambda v: v + 1, columns=map_cols)
        self.assertIsInstance(mapped, ProjectCollectionExpr)
        for c in mapped._fields:
            self.assertIsInstance(c,
                                  MappedExpr if c.name in map_cols else Column)

        mapped = expr.applymap(lambda v: v + 1, excludes='idx')
        self.assertIsInstance(mapped, ProjectCollectionExpr)
        for c in mapped._fields:
            self.assertIsInstance(c, Column if c.name == 'idx' else MappedExpr)

        exc_cols = set(['idx', 'f1'])
        mapped = expr.applymap(lambda v: v + 1, excludes=exc_cols)
        self.assertIsInstance(mapped, ProjectCollectionExpr)
        for c in mapped._fields:
            self.assertIsInstance(c,
                                  Column if c.name in exc_cols else MappedExpr)
    def testJoinGroupby(self):
        data = [
            ['name1', 4, 5.3, None, None, None],
            ['name2', 2, 3.5, None, None, None],
            ['name1', 4, 4.2, None, None, None],
            ['name1', 3, 2.2, None, None, None],
            ['name1', 3, 4.1, None, None, None],
        ]

        schema2 = Schema.from_lists(['name', 'id2', 'id3'],
                                    [types.string, types.bigint, types.bigint])

        table_name = 'pyodps_test_engine_table2'
        self.odps.delete_table(table_name, if_exists=True)
        table2 = self.odps.create_table(name=table_name, schema=schema2)
        expr2 = CollectionExpr(_source_data=table2,
                               _schema=odps_schema_to_df_schema(schema2))

        self._gen_data(data=data)

        data2 = [['name1', 4, -1], ['name2', 1, -2]]

        self.odps.write_table(table2, 0,
                              [table2.new_record(values=d) for d in data2])

        expr = self.expr.join(expr2, on='name')[self.expr]
        expr = expr.groupby('id').agg(expr.fid.sum())

        res = self.engine.execute(expr)
        result = self._get_result(res)

        import pandas as pd
        expected = pd.DataFrame(data, columns=self.expr.schema.names).groupby('id').agg({'fid': 'sum'})\
            .reset_index().values.tolist()
        for it in zip(sorted(expected, key=lambda it: it[0]),
                      sorted(result, key=lambda it: it[0])):
            self.assertAlmostEqual(it[0][0], it[1][0])
            self.assertAlmostEqual(it[0][1], it[1][1])
    def testFilterOrder(self):
        table_name = tn('pyodps_test_division_error')
        self.odps.delete_table(table_name, if_exists=True)
        table = self.odps.create_table(table_name,
                                       'divided bigint, divisor bigint',
                                       lifecycle=1)

        try:
            self.odps.write_table(table_name,
                                  [[2, 0], [1, 1], [1, 2], [5, 1], [5, 0]])
            df = CollectionExpr(_source_data=table,
                                _schema=odps_schema_to_df_schema(table.schema))
            fdf = df[df.divisor > 0]
            ddf = fdf[(fdf.divided / fdf.divisor).rename('result'), ]
            expr = ddf[ddf.result > 1]

            res = self.engine.execute(expr)
            result = self._get_result(res)
            self.assertEqual(result, [[
                5,
            ]])
        finally:
            table.drop()
Esempio n. 24
0
    def testJoin(self):
        data = [
            ['name1', 4, 5.3, None, None, None],
            ['name2', 2, 3.5, None, None, None],
            ['name1', 4, 4.2, None, None, None],
            ['name1', 3, 2.2, None, None, None],
            ['name1', 3, 4.1, None, None, None],
        ]

        data2 = [['name1', 4, -1], ['name2', 1, -2]]

        datatypes = lambda *types: [validate_data_type(t) for t in types]
        schema2 = Schema.from_lists(['name', 'id2', 'id3'],
                                    datatypes('string', 'int64', 'int64'))
        table_name = tn('pyodps_test_engine_table2')
        table2 = self._create_table_and_insert_data(table_name, schema2, data2)
        expr2 = CollectionExpr(_source_data=table2, _schema=schema2)

        self._gen_data(data=data)

        try:
            expr = self.expr.join(expr2)['name', 'id2']

            res = self.engine.execute(expr)
            result = self._get_result(res)

            self.assertEqual(len(result), 5)
            expected = [[to_str('name1'), 4], [to_str('name2'), 1]]
            self.assertTrue(all(it in expected for it in result))

            expr = self.expr.join(expr2, on=['name',
                                             ('id', 'id2')])[self.expr.name,
                                                             expr2.id2]
            res = self.engine.execute(expr)
            result = self._get_result(res)
            self.assertEqual(len(result), 2)
            expected = [to_str('name1'), 4]
            self.assertTrue(all(it == expected for it in result))

            expr = self.expr.left_join(expr2,
                                       on=['name',
                                           ('id', 'id2')])[self.expr.name,
                                                           expr2.id2]
            res = self.engine.execute(expr)
            result = self._get_result(res)
            expected = [['name1', 4], ['name2', None], ['name1', 4],
                        ['name1', None], ['name1', None]]
            self.assertEqual(len(result), 5)
            self.assertTrue(all(it in expected for it in result))

            expr = self.expr.right_join(expr2,
                                        on=['name',
                                            ('id', 'id2')])[self.expr.name,
                                                            expr2.id2]
            res = self.engine.execute(expr)
            result = self._get_result(res)
            expected = [
                ['name1', 4],
                ['name1', 4],
                [None, 1],
            ]
            self.assertEqual(len(result), 3)
            self.assertTrue(all(it in expected for it in result))

            if self.sql_engine.name != 'mysql':
                expr = self.expr.outer_join(expr2,
                                            on=['name',
                                                ('id', 'id2')])[self.expr.name,
                                                                expr2.id2]
                res = self.engine.execute(expr)
                result = self._get_result(res)
                expected = [
                    ['name1', 4],
                    ['name1', 4],
                    ['name2', None],
                    ['name1', None],
                    ['name1', None],
                    [None, 1],
                ]
                self.assertEqual(len(result), 6)
                self.assertTrue(all(it in expected for it in result))

            grouped = self.expr.groupby('name').agg(
                new_id=self.expr.id.sum()).cache()
            self.engine.execute(self.expr.join(grouped, on='name'))

            if self.sql_engine.name != 'mysql':
                expr = self.expr.join(expr2, on=[
                    'name', ('id', 'id2')
                ])[lambda x: x.groupby(Scalar(1)).sort('name').row_number(), ]
                self.engine.execute(expr)
        finally:
            [conn.close() for conn in _engine_to_connections.values()]
            table2.drop()
    def testFilterPushdownThroughMultipleProjection(self):
        schema = Schema.from_lists(list('abcde'), ['string']*5)
        table = MockTable(name='pyodps_test_expr_table3', schema=schema)
        tab = CollectionExpr(_source_data=table, _schema=odps_schema_to_df_schema(schema))

        labels2 = []
        bins2 = []
        for i in range(0, 30):
            a = str(7 * i) + '-' + str(7 * (i + 1))
            b = 7 * i
            bins2.append(b)
            labels2.append(a)

        p1 = tab.select(tab.a,
                        tab.c.astype('int').cut(bins2, labels=labels2, include_over=True).rename('c_cut'),
                        tab.e.astype('int').rename('e'),
                        tab.c.astype('int').rename('c'))
        p1['f'] = p1['e'] / p1['c']
        t = []
        l = []
        for i in range(0, 20):
            a = 1 * i
            b = str(a)
            t.append(a)
            l.append(b)
        p2 = p1.select(p1.a, p1.c_cut, p1.f.cut(bins=t, labels=l, include_over=True).rename('f_cut'))

        expected = "SELECT t1.`a`, CASE WHEN (0 < CAST(t1.`c` AS BIGINT)) " \
                   "AND (CAST(t1.`c` AS BIGINT) <= 7) THEN '0-7' " \
                   "WHEN (7 < CAST(t1.`c` AS BIGINT)) " \
                   "AND (CAST(t1.`c` AS BIGINT) <= 14) " \
                   "THEN '7-14' WHEN (14 < CAST(t1.`c` AS BIGINT)) " \
                   "AND (CAST(t1.`c` AS BIGINT) <= 21) THEN '14-21' " \
                   "WHEN (21 < CAST(t1.`c` AS BIGINT)) " \
                   "AND (CAST(t1.`c` AS BIGINT) <= 28) " \
                   "THEN '21-28' WHEN (28 < CAST(t1.`c` AS BIGINT)) " \
                   "AND (CAST(t1.`c` AS BIGINT) <= 35) THEN '28-35' " \
                   "WHEN (35 < CAST(t1.`c` AS BIGINT)) " \
                   "AND (CAST(t1.`c` AS BIGINT) <= 42) THEN '35-42' " \
                   "WHEN (42 < CAST(t1.`c` AS BIGINT)) " \
                   "AND (CAST(t1.`c` AS BIGINT) <= 49) THEN '42-49' " \
                   "WHEN (49 < CAST(t1.`c` AS BIGINT)) " \
                   "AND (CAST(t1.`c` AS BIGINT) <= 56) " \
                   "THEN '49-56' WHEN (56 < CAST(t1.`c` AS BIGINT)) " \
                   "AND (CAST(t1.`c` AS BIGINT) <= 63) THEN '56-63' " \
                   "WHEN (63 < CAST(t1.`c` AS BIGINT)) " \
                   "AND (CAST(t1.`c` AS BIGINT) <= 70) THEN '63-70' " \
                   "WHEN (70 < CAST(t1.`c` AS BIGINT)) AND (CAST(t1.`c` AS BIGINT) <= 77) " \
                   "THEN '70-77' WHEN (77 < CAST(t1.`c` AS BIGINT)) " \
                   "AND (CAST(t1.`c` AS BIGINT) <= 84) " \
                   "THEN '77-84' WHEN (84 < CAST(t1.`c` AS BIGINT)) " \
                   "AND (CAST(t1.`c` AS BIGINT) <= 91) THEN '84-91' " \
                   "WHEN (91 < CAST(t1.`c` AS BIGINT)) AND (CAST(t1.`c` AS BIGINT) <= 98) " \
                   "THEN '91-98' WHEN (98 < CAST(t1.`c` AS BIGINT)) " \
                   "AND (CAST(t1.`c` AS BIGINT) <= 105) THEN '98-105' " \
                   "WHEN (105 < CAST(t1.`c` AS BIGINT)) AND (CAST(t1.`c` AS BIGINT) <= 112) " \
                   "THEN '105-112' WHEN (112 < CAST(t1.`c` AS BIGINT)) " \
                   "AND (CAST(t1.`c` AS BIGINT) <= 119) THEN '112-119' " \
                   "WHEN (119 < CAST(t1.`c` AS BIGINT)) AND (CAST(t1.`c` AS BIGINT) <= 126) " \
                   "THEN '119-126' WHEN (126 < CAST(t1.`c` AS BIGINT)) " \
                   "AND (CAST(t1.`c` AS BIGINT) <= 133) THEN '126-133' " \
                   "WHEN (133 < CAST(t1.`c` AS BIGINT)) AND (CAST(t1.`c` AS BIGINT) <= 140) " \
                   "THEN '133-140' WHEN (140 < CAST(t1.`c` AS BIGINT)) " \
                   "AND (CAST(t1.`c` AS BIGINT) <= 147) THEN '140-147' " \
                   "WHEN (147 < CAST(t1.`c` AS BIGINT)) AND (CAST(t1.`c` AS BIGINT) <= 154) " \
                   "THEN '147-154' WHEN (154 < CAST(t1.`c` AS BIGINT)) " \
                   "AND (CAST(t1.`c` AS BIGINT) <= 161) THEN '154-161' " \
                   "WHEN (161 < CAST(t1.`c` AS BIGINT)) AND (CAST(t1.`c` AS BIGINT) <= 168) " \
                   "THEN '161-168' WHEN (168 < CAST(t1.`c` AS BIGINT)) " \
                   "AND (CAST(t1.`c` AS BIGINT) <= 175) THEN '168-175' " \
                   "WHEN (175 < CAST(t1.`c` AS BIGINT)) AND (CAST(t1.`c` AS BIGINT) <= 182) " \
                   "THEN '175-182' WHEN (182 < CAST(t1.`c` AS BIGINT)) " \
                   "AND (CAST(t1.`c` AS BIGINT) <= 189) THEN '182-189' " \
                   "WHEN (189 < CAST(t1.`c` AS BIGINT)) AND (CAST(t1.`c` AS BIGINT) <= 196) " \
                   "THEN '189-196' WHEN (196 < CAST(t1.`c` AS BIGINT)) " \
                   "AND (CAST(t1.`c` AS BIGINT) <= 203) THEN '196-203' " \
                   "WHEN 203 < CAST(t1.`c` AS BIGINT) THEN '203-210' END AS `c_cut`, " \
                   "CASE WHEN (0 < (CAST(t1.`e` AS BIGINT) / CAST(t1.`c` AS BIGINT))) " \
                   "AND ((CAST(t1.`e` AS BIGINT) / CAST(t1.`c` AS BIGINT)) <= 1) THEN '0' " \
                   "WHEN (1 < (CAST(t1.`e` AS BIGINT) / CAST(t1.`c` AS BIGINT))) " \
                   "AND ((CAST(t1.`e` AS BIGINT) / CAST(t1.`c` AS BIGINT)) <= 2) " \
                   "THEN '1' WHEN (2 < (CAST(t1.`e` AS BIGINT) / CAST(t1.`c` AS BIGINT))) " \
                   "AND ((CAST(t1.`e` AS BIGINT) / CAST(t1.`c` AS BIGINT)) <= 3) THEN '2' " \
                   "WHEN (3 < (CAST(t1.`e` AS BIGINT) / CAST(t1.`c` AS BIGINT))) " \
                   "AND ((CAST(t1.`e` AS BIGINT) / CAST(t1.`c` AS BIGINT)) <= 4) " \
                   "THEN '3' WHEN (4 < (CAST(t1.`e` AS BIGINT) / CAST(t1.`c` AS BIGINT))) " \
                   "AND ((CAST(t1.`e` AS BIGINT) / CAST(t1.`c` AS BIGINT)) <= 5) THEN '4' " \
                   "WHEN (5 < (CAST(t1.`e` AS BIGINT) / CAST(t1.`c` AS BIGINT))) " \
                   "AND ((CAST(t1.`e` AS BIGINT) / CAST(t1.`c` AS BIGINT)) <= 6) THEN '5' " \
                   "WHEN (6 < (CAST(t1.`e` AS BIGINT) / CAST(t1.`c` AS BIGINT))) " \
                   "AND ((CAST(t1.`e` AS BIGINT) / CAST(t1.`c` AS BIGINT)) <= 7) " \
                   "THEN '6' WHEN (7 < (CAST(t1.`e` AS BIGINT) / CAST(t1.`c` AS BIGINT))) " \
                   "AND ((CAST(t1.`e` AS BIGINT) / CAST(t1.`c` AS BIGINT)) <= 8) THEN '7' " \
                   "WHEN (8 < (CAST(t1.`e` AS BIGINT) / CAST(t1.`c` AS BIGINT))) " \
                   "AND ((CAST(t1.`e` AS BIGINT) / CAST(t1.`c` AS BIGINT)) <= 9) THEN '8' " \
                   "WHEN (9 < (CAST(t1.`e` AS BIGINT) / CAST(t1.`c` AS BIGINT))) " \
                   "AND ((CAST(t1.`e` AS BIGINT) / CAST(t1.`c` AS BIGINT)) <= 10) " \
                   "THEN '9' WHEN (10 < (CAST(t1.`e` AS BIGINT) / CAST(t1.`c` AS BIGINT))) " \
                   "AND ((CAST(t1.`e` AS BIGINT) / CAST(t1.`c` AS BIGINT)) <= 11) THEN '10' " \
                   "WHEN (11 < (CAST(t1.`e` AS BIGINT) / CAST(t1.`c` AS BIGINT))) " \
                   "AND ((CAST(t1.`e` AS BIGINT) / CAST(t1.`c` AS BIGINT)) <= 12) " \
                   "THEN '11' WHEN (12 < (CAST(t1.`e` AS BIGINT) / CAST(t1.`c` AS BIGINT))) " \
                   "AND ((CAST(t1.`e` AS BIGINT) / CAST(t1.`c` AS BIGINT)) <= 13) THEN '12' " \
                   "WHEN (13 < (CAST(t1.`e` AS BIGINT) / CAST(t1.`c` AS BIGINT))) " \
                   "AND ((CAST(t1.`e` AS BIGINT) / CAST(t1.`c` AS BIGINT)) <= 14) THEN '13' " \
                   "WHEN (14 < (CAST(t1.`e` AS BIGINT) / CAST(t1.`c` AS BIGINT))) " \
                   "AND ((CAST(t1.`e` AS BIGINT) / CAST(t1.`c` AS BIGINT)) <= 15) THEN '14' " \
                   "WHEN (15 < (CAST(t1.`e` AS BIGINT) / CAST(t1.`c` AS BIGINT))) " \
                   "AND ((CAST(t1.`e` AS BIGINT) / CAST(t1.`c` AS BIGINT)) <= 16) THEN '15' " \
                   "WHEN (16 < (CAST(t1.`e` AS BIGINT) / CAST(t1.`c` AS BIGINT))) " \
                   "AND ((CAST(t1.`e` AS BIGINT) / CAST(t1.`c` AS BIGINT)) <= 17) THEN '16' " \
                   "WHEN (17 < (CAST(t1.`e` AS BIGINT) / CAST(t1.`c` AS BIGINT))) " \
                   "AND ((CAST(t1.`e` AS BIGINT) / CAST(t1.`c` AS BIGINT)) <= 18) " \
                   "THEN '17' WHEN (18 < (CAST(t1.`e` AS BIGINT) / CAST(t1.`c` AS BIGINT))) " \
                   "AND ((CAST(t1.`e` AS BIGINT) / CAST(t1.`c` AS BIGINT)) <= 19) THEN '18' " \
                   "WHEN 19 < (CAST(t1.`e` AS BIGINT) / CAST(t1.`c` AS BIGINT)) THEN '19' END AS `f_cut` \n" \
                   "FROM mocked_project.`pyodps_test_expr_table3` t1 \n" \
                   "WHERE (CASE WHEN (0 < (CAST(t1.`e` AS BIGINT) / CAST(t1.`c` AS BIGINT))) " \
                   "AND ((CAST(t1.`e` AS BIGINT) / CAST(t1.`c` AS BIGINT)) <= 1) THEN '0' " \
                   "WHEN (1 < (CAST(t1.`e` AS BIGINT) / CAST(t1.`c` AS BIGINT))) " \
                   "AND ((CAST(t1.`e` AS BIGINT) / CAST(t1.`c` AS BIGINT)) <= 2) " \
                   "THEN '1' WHEN (2 < (CAST(t1.`e` AS BIGINT) / CAST(t1.`c` AS BIGINT))) " \
                   "AND ((CAST(t1.`e` AS BIGINT) / CAST(t1.`c` AS BIGINT)) <= 3) THEN '2' " \
                   "WHEN (3 < (CAST(t1.`e` AS BIGINT) / CAST(t1.`c` AS BIGINT))) " \
                   "AND ((CAST(t1.`e` AS BIGINT) / CAST(t1.`c` AS BIGINT)) <= 4) THEN '3' " \
                   "WHEN (4 < (CAST(t1.`e` AS BIGINT) / CAST(t1.`c` AS BIGINT))) " \
                   "AND ((CAST(t1.`e` AS BIGINT) / CAST(t1.`c` AS BIGINT)) <= 5) THEN '4' " \
                   "WHEN (5 < (CAST(t1.`e` AS BIGINT) / CAST(t1.`c` AS BIGINT))) " \
                   "AND ((CAST(t1.`e` AS BIGINT) / CAST(t1.`c` AS BIGINT)) <= 6) THEN '5' " \
                   "WHEN (6 < (CAST(t1.`e` AS BIGINT) / CAST(t1.`c` AS BIGINT))) " \
                   "AND ((CAST(t1.`e` AS BIGINT) / CAST(t1.`c` AS BIGINT)) <= 7) THEN '6' " \
                   "WHEN (7 < (CAST(t1.`e` AS BIGINT) / CAST(t1.`c` AS BIGINT))) " \
                   "AND ((CAST(t1.`e` AS BIGINT) / CAST(t1.`c` AS BIGINT)) <= 8) THEN '7' " \
                   "WHEN (8 < (CAST(t1.`e` AS BIGINT) / CAST(t1.`c` AS BIGINT))) " \
                   "AND ((CAST(t1.`e` AS BIGINT) / CAST(t1.`c` AS BIGINT)) <= 9) THEN '8' " \
                   "WHEN (9 < (CAST(t1.`e` AS BIGINT) / CAST(t1.`c` AS BIGINT))) " \
                   "AND ((CAST(t1.`e` AS BIGINT) / CAST(t1.`c` AS BIGINT)) <= 10) THEN '9' " \
                   "WHEN (10 < (CAST(t1.`e` AS BIGINT) / CAST(t1.`c` AS BIGINT))) " \
                   "AND ((CAST(t1.`e` AS BIGINT) / CAST(t1.`c` AS BIGINT)) <= 11) THEN '10' " \
                   "WHEN (11 < (CAST(t1.`e` AS BIGINT) / CAST(t1.`c` AS BIGINT))) " \
                   "AND ((CAST(t1.`e` AS BIGINT) / CAST(t1.`c` AS BIGINT)) <= 12) THEN '11' " \
                   "WHEN (12 < (CAST(t1.`e` AS BIGINT) / CAST(t1.`c` AS BIGINT))) " \
                   "AND ((CAST(t1.`e` AS BIGINT) / CAST(t1.`c` AS BIGINT)) <= 13) THEN '12' " \
                   "WHEN (13 < (CAST(t1.`e` AS BIGINT) / CAST(t1.`c` AS BIGINT))) " \
                   "AND ((CAST(t1.`e` AS BIGINT) / CAST(t1.`c` AS BIGINT)) <= 14) THEN '13' " \
                   "WHEN (14 < (CAST(t1.`e` AS BIGINT) / CAST(t1.`c` AS BIGINT))) " \
                   "AND ((CAST(t1.`e` AS BIGINT) / CAST(t1.`c` AS BIGINT)) <= 15) THEN '14' " \
                   "WHEN (15 < (CAST(t1.`e` AS BIGINT) / CAST(t1.`c` AS BIGINT))) " \
                   "AND ((CAST(t1.`e` AS BIGINT) / CAST(t1.`c` AS BIGINT)) <= 16) THEN '15' " \
                   "WHEN (16 < (CAST(t1.`e` AS BIGINT) / CAST(t1.`c` AS BIGINT))) " \
                   "AND ((CAST(t1.`e` AS BIGINT) / CAST(t1.`c` AS BIGINT)) <= 17) THEN '16' " \
                   "WHEN (17 < (CAST(t1.`e` AS BIGINT) / CAST(t1.`c` AS BIGINT))) " \
                   "AND ((CAST(t1.`e` AS BIGINT) / CAST(t1.`c` AS BIGINT)) <= 18) THEN '17' " \
                   "WHEN (18 < (CAST(t1.`e` AS BIGINT) / CAST(t1.`c` AS BIGINT))) " \
                   "AND ((CAST(t1.`e` AS BIGINT) / CAST(t1.`c` AS BIGINT)) <= 19) THEN '18' " \
                   "WHEN 19 < (CAST(t1.`e` AS BIGINT) / CAST(t1.`c` AS BIGINT)) THEN '19' END) == '9'"

        self.assertEqual(str(expected), str(ODPSEngine(self.odps).compile(p2[p2.f_cut == '9'], prettify=False)))
Esempio n. 26
0
 def get_table2_df(self):
     schema = Schema.from_lists(['col21', 'col22'],
                                datatypes('string', 'string'))
     table = MockTable(name=TEMP_TABLE_2_NAME, schema=schema)
     return CollectionExpr(_source_data=table, _schema=schema)
    def testJoin(self):
        data = [
            ['name1', 4, 5.3, None, None],
            ['name2', 2, 3.5, None, None],
            ['name1', 4, 4.2, None, None],
            ['name1', 3, 2.2, None, None],
            ['name1', 3, 4.1, None, None],
        ]

        schema2 = Schema.from_lists(['name', 'id2', 'id3'],
                                    [types.string, types.bigint, types.bigint])
        table_name = tn('pyodps_test_engine_table2')
        self.odps.delete_table(table_name, if_exists=True)
        table2 = self.odps.create_table(name=table_name, schema=schema2)
        expr2 = CollectionExpr(_source_data=table2,
                               _schema=odps_schema_to_df_schema(schema2))

        self._gen_data(data=data)

        data2 = [['name1', 4, -1], ['name2', 1, -2]]

        self.odps.write_table(table2, 0, data2)

        try:
            expr = self.expr.join(expr2)['name', 'id2']

            res = self.engine.execute(expr)
            result = self._get_result(res)

            self.assertEqual(len(result), 5)
            expected = [[to_str('name1'), 4], [to_str('name2'), 1]]
            self.assertTrue(all(it in expected for it in result))

            expr = self.expr.join(expr2, on=['name',
                                             ('id', 'id2')])[self.expr.name,
                                                             expr2.id2]
            res = self.engine.execute(expr)
            result = self._get_result(res)
            self.assertEqual(len(result), 2)
            expected = [to_str('name1'), 4]
            self.assertTrue(all(it == expected for it in result))

            expr = self.expr.left_join(expr2,
                                       on=['name',
                                           ('id', 'id2')])[self.expr.name,
                                                           expr2.id2]
            res = self.engine.execute(expr)
            result = self._get_result(res)
            expected = [['name1', 4], ['name2', None], ['name1', 4],
                        ['name1', None], ['name1', None]]
            self.assertEqual(len(result), 5)
            self.assertTrue(all(it in expected for it in result))

            expr = self.expr.right_join(expr2,
                                        on=['name',
                                            ('id', 'id2')])[self.expr.name,
                                                            expr2.id2]
            res = self.engine.execute(expr)
            result = self._get_result(res)
            expected = [
                ['name1', 4],
                ['name1', 4],
                [None, 1],
            ]
            self.assertEqual(len(result), 3)
            self.assertTrue(all(it in expected for it in result))

            expr = self.expr.outer_join(expr2,
                                        on=['name',
                                            ('id', 'id2')])[self.expr.name,
                                                            expr2.id2]
            res = self.engine.execute(expr)
            result = self._get_result(res)
            expected = [
                ['name1', 4],
                ['name1', 4],
                ['name2', None],
                ['name1', None],
                ['name1', None],
                [None, 1],
            ]
            self.assertEqual(len(result), 6)
            self.assertTrue(all(it in expected for it in result))

            grouped = self.expr.groupby('name').agg(
                new_id=self.expr.id.sum()).cache()
            self.engine.execute(self.expr.join(grouped, on='name'))

            expr = self.expr.join(expr2, on=[
                'name', ('id', 'id2')
            ])[lambda x: x.groupby(Scalar(1)).sort('name').row_number(), ]
            self.engine.execute(expr)
        finally:
            table2.drop()