def setup(self):
        """Create a uniquely named ODPS table plus the Seahawks engine fixtures.

        Six column names and six type names are declared, but both lists are
        sliced to their first five entries before the schema is built, so the
        trailing 'scale' name and 'decimal' type are not part of the schema.
        """
        def to_types(*type_names):
            return [validate_data_type(type_name) for type_name in type_names]

        column_names = ['name', 'id', 'fid', 'isMale', 'birth', 'scale'][:5]
        column_types = to_types('string', 'int64', 'float64', 'boolean',
                                'datetime', 'decimal')[:5]
        schema = Schema.from_lists(column_names, column_types)
        self.schema = df_schema_to_odps_schema(schema)

        # A uuid4-based suffix makes the table name different on every call.
        unique_suffix = str(uuid.uuid4()).replace('-', '_')
        table_name = tn('pyodps_test_%s' % unique_suffix)
        self.odps.delete_table(table_name, if_exists=True)
        self.table = self.odps.create_table(name=table_name, schema=self.schema)
        self.expr = CollectionExpr(_source_data=self.table, _schema=schema)

        self.engine = SeahawksEngine(self.odps)

        class FakeBar(object):
            """Progress-bar stand-in whose methods accept anything and do nothing."""

            def update(self, *args, **kwargs):
                return None

            def inc(self, *args, **kwargs):
                return None

            def status(self, *args, **kwargs):
                return None

        self.faked_bar = FakeBar()
    def setup(self):
        """Store a three-column mock table expression and a fresh ExecuteContext."""
        column_names = ["name", "id", "fid"]
        column_types = [
            validate_data_type(type_name) for type_name in ("string", "int64", "float64")
        ]
        schema = Schema.from_lists(column_names, column_types)

        self.expr = CollectionExpr(
            _source_data=MockTable(name="pyodps_test_expr_table", schema=schema),
            _schema=schema,
        )
        self.ctx = ExecuteContext()
Example #3
0
    def setup(self):
        """Expose a mock table with int, string, dict and list columns as ``self.expr``."""
        column_names = ['id', 'name', 'relatives', 'hobbies']
        type_names = ('int64', 'string', 'dict<string, string>', 'list<string>')
        schema = Schema.from_lists(column_names,
                                   list(map(validate_data_type, type_names)))
        mock_table = MockTable(name='pyodps_test_expr_table', schema=schema)

        self.expr = CollectionExpr(_source_data=mock_table, _schema=schema)
Example #4
0
    def setup(self):
        """Create the SQLAlchemy engine, backing table and expression fixtures.

        Builds the six-column DataFrame schema, opens a connection to the
        local ``pyodps`` PostgreSQL database, creates the ``pyodps_test_data``
        table from that schema and wraps the table in a ``CollectionExpr``.
        Also stores a no-op progress bar in ``self.faked_bar``.
        """
        type_names = ('string', 'int64', 'float64', 'boolean', 'decimal', 'datetime')
        schema = Schema.from_lists(
            ['name', 'id', 'fid', 'isMale', 'scale', 'birth'],
            [validate_data_type(type_name) for type_name in type_names])
        self.df_schema = schema
        self.schema = df_schema_to_odps_schema(schema)
        self.df = None
        # Reset first so a failure below never leaves a stale expression behind.
        self.expr = None

        self.engine = SQLAlchemyEngine()

        import sqlalchemy
        from sqlalchemy import create_engine

        # 'postgresql' is the dialect name SQLAlchemy registers; the legacy
        # 'postgres' alias is rejected by SQLAlchemy 1.4 and later, while
        # 'postgresql' is accepted by old and new releases alike.
        self.sql_engine = engine = create_engine('postgresql://localhost/pyodps')
        # Alternative backends, kept for manual switching:
        # self.sql_engine = engine = create_engine('mysql://localhost/pyodps')
        # self.sql_engine = engine = create_engine('sqlite://')
        self.conn = engine.connect()

        self.metadata = metadata = sqlalchemy.MetaData(bind=engine)
        columns = df_schema_to_sqlalchemy_columns(self.df_schema, engine=self.sql_engine)
        self.table = sqlalchemy.Table('pyodps_test_data', metadata, *columns)

        # Emits CREATE TABLE for every table registered on the metadata.
        metadata.create_all()

        self.expr = CollectionExpr(_source_data=self.table, _schema=self.df_schema)

        class FakeBar(object):
            """Progress-bar stand-in whose ``update`` ignores its arguments."""

            def update(self, *args, **kwargs):
                pass

        self.faked_bar = FakeBar()
Example #5
0
    def setup(self):
        """Build the mock-table expressions ``expr`` to ``expr4`` and unset ``maxDiff``.

        NOTE(review): ``table4`` is created with ``schema`` while ``expr4`` is
        given ``schema3``, and ``table3`` reuses the name of ``table2``; both
        are reproduced as found - confirm they are intentional.
        """
        def to_types(*type_names):
            return [validate_data_type(type_name) for type_name in type_names]

        def mock_expr(table_name, table_schema, expr_schema):
            # One mock table per expression, created right before the expression.
            mock_table = MockTable(name=table_name, schema=table_schema)
            return CollectionExpr(_source_data=mock_table, _schema=expr_schema)

        schema = Schema.from_lists(
            ['name', 'id', 'fid', 'isMale', 'scale', 'birth'],
            to_types('string', 'int64', 'float64', 'boolean', 'decimal', 'datetime'),
            ['ds'],
            to_types('string'))
        self.expr = mock_expr('pyodps_test_expr_table', schema, schema)
        self.expr1 = mock_expr('pyodps_test_expr_table1', schema, schema)
        self.expr2 = mock_expr('pyodps_test_expr_table2', schema, schema)

        schema2 = Schema.from_lists(
            ['name', 'id', 'fid'],
            to_types('string', 'int64', 'float64'),
            ['part1', 'part2'],
            to_types('string', 'int64'))
        self.expr3 = mock_expr('pyodps_test_expr_table2', schema2, schema2)

        schema3 = Schema.from_lists(
            ['id', 'name', 'relatives', 'hobbies'],
            to_types('int64', 'string', 'dict<string, string>', 'list<string>'))
        self.expr4 = mock_expr('pyodps_test_expr_table', schema, schema3)

        self.maxDiff = None
    def setup(self):
        """Wrap a six-column mock table in a ``CollectionExpr`` stored on ``self.expr``."""
        type_names = ('string', 'int64', 'float64', 'boolean', 'decimal', 'datetime')
        schema = Schema.from_lists(
            ['name', 'id', 'fid', 'isMale', 'scale', 'birth'],
            [validate_data_type(type_name) for type_name in type_names])

        self.expr = CollectionExpr(
            _source_data=MockTable(name='pyodps_test_expr_table', schema=schema),
            _schema=schema)
    def setup(self):
        """Create ``expr``, ``expr1``, ``expr2`` and ``expr3`` over mock tables.

        Every expression receives a new ``Schema`` constructed from the
        ``columns`` attribute of its table's schema rather than the table
        schema object itself.
        """
        def to_types(*type_names):
            return [validate_data_type(type_name) for type_name in type_names]

        def expr_from_columns(table_name, table_schema):
            mock_table = MockTable(name=table_name, schema=table_schema)
            return CollectionExpr(_source_data=mock_table,
                                  _schema=Schema(columns=table_schema.columns))

        schema = Schema.from_lists(
            ['name', 'id', 'fid', 'isMale', 'scale', 'birth'],
            to_types('string', 'int64', 'float64', 'boolean', 'decimal', 'datetime'),
            ['ds'],
            to_types('string'))
        self.expr = expr_from_columns('pyodps_test_expr_table', schema)
        self.expr1 = expr_from_columns('pyodps_test_expr_table1', schema)
        self.expr2 = expr_from_columns('pyodps_test_expr_table2', schema)

        schema2 = Schema.from_lists(
            ['name', 'id', 'fid'],
            to_types('string', 'int64', 'float64'),
            ['part1', 'part2'],
            to_types('string', 'int64'))
        self.expr3 = expr_from_columns('pyodps_test_expr_table2', schema2)
    def setup(self):
        """Create ``self.expr`` over a mock table declared as (name, type) pairs."""
        column_specs = [
            ('name', 'string'),
            ('id', 'int64'),
            ('fid', 'float64'),
            ('isMale', 'boolean'),
            ('scale', 'decimal'),
            ('birth', 'datetime'),
        ]
        column_names = [column_name for column_name, _ in column_specs]
        column_types = [validate_data_type(type_name) for _, type_name in column_specs]
        schema = Schema.from_lists(column_names, column_types)
        mock_table = MockTable(name='pyodps_test_expr_table', schema=schema)

        self.expr = CollectionExpr(_source_data=mock_table, _schema=schema)
    def setup(self):
        """Create a name/id/fid mock table expression and an ``ExecuteContext``."""
        type_names = ('string', 'int64', 'float64')
        schema = Schema.from_lists(
            ['name', 'id', 'fid'],
            list(map(validate_data_type, type_names)))
        mock_table = MockTable(name='pyodps_test_expr_table', schema=schema)

        self.expr = CollectionExpr(_source_data=mock_table, _schema=schema)
        self.ctx = ExecuteContext()
Example #10
0
    def testListMethods(self):
        """Check node class, sequence class and dtype for list-column operations."""
        item_expr = self.expr.hobbies[0]
        for expected_cls in (ListDictGetItem, StringSequenceExpr):
            self.assertIsInstance(item_expr, expected_cls)
        self.assertEqual(item_expr.dtype, validate_data_type('string'))

        length_expr = self.expr.hobbies.len()
        for expected_cls in (ListDictLength, Int64SequenceExpr):
            self.assertIsInstance(length_expr, expected_cls)

        sorted_expr = self.expr.hobbies.sort()
        for expected_cls in (ListSort, ListSequenceExpr):
            self.assertIsInstance(sorted_expr, expected_cls)
        self.assertEqual(sorted_expr.dtype, validate_data_type('list<string>'))

        contains_expr = self.expr.hobbies.contains('yacht')
        for expected_cls in (ListContains, BooleanSequenceExpr):
            self.assertIsInstance(contains_expr, expected_cls)
    def setup(self):
        """Store a ``CollectionExpr`` over a six-column mock table on ``self.expr``."""
        column_names = ["name", "id", "fid", "isMale", "scale", "birth"]
        type_names = ("string", "int64", "float64", "boolean", "decimal", "datetime")
        schema = Schema.from_lists(
            column_names,
            [validate_data_type(type_name) for type_name in type_names],
        )

        self.expr = CollectionExpr(
            _source_data=MockTable(name="pyodps_test_expr_table", schema=schema),
            _schema=schema,
        )
Example #12
0
    def testToList(self):
        """Check ``tolist`` on the ``int64`` column and on the whole collection."""
        column_list = self.expr.int64.tolist()
        self.assertIsInstance(column_list, ListScalar)
        self.assertEqual(column_list.dtype,
                         types.validate_data_type('list<int64>'))

        summary = self.expr.tolist()
        self.assertIsInstance(summary, Summary)
        self.assertLessEqual(len(summary.fields), len(types._data_types))
        # Every summary field must be a ToList node.
        unexpected_nodes = [node for node in summary.fields
                            if not isinstance(node, ToList)]
        self.assertTrue(not unexpected_nodes)
    def setup(self):
        """Create two mock-table expressions: ``expr`` (name, id) and ``expr2`` (name2, id2)."""
        def string_int_schema(*column_names):
            column_types = [validate_data_type(type_name)
                            for type_name in ('string', 'int64')]
            return Schema.from_lists(list(column_names), column_types)

        schema = string_int_schema('name', 'id')
        first_table = MockTable(name='pyodps_test_expr_table', schema=schema)
        self.expr = CollectionExpr(_source_data=first_table, _schema=schema)

        schema2 = string_int_schema('name2', 'id2')
        second_table = MockTable(name='pyodps_test_expr_table2', schema=schema2)
        self.expr2 = CollectionExpr(_source_data=second_table, _schema=schema2)
    def setup(self):
        """Populate ``self.expr`` and ``self.expr2`` with two-column mock table expressions."""
        fixtures = (
            ("expr", "pyodps_test_expr_table", ["name", "id"]),
            ("expr2", "pyodps_test_expr_table2", ["name2", "id2"]),
        )
        for attr_name, table_name, column_names in fixtures:
            column_types = [validate_data_type(type_name) for type_name in ("string", "int64")]
            schema = Schema.from_lists(column_names, column_types)
            mock_table = MockTable(name=table_name, schema=schema)
            setattr(self, attr_name, CollectionExpr(_source_data=mock_table, _schema=schema))
Example #15
0
    def testDictMethods(self):
        """Check node class, sequence class and dtype for dict-column operations."""
        item_expr = self.expr.relatives['abc']
        for expected_cls in (ListDictGetItem, StringSequenceExpr):
            self.assertIsInstance(item_expr, expected_cls)
        self.assertEqual(item_expr.dtype, validate_data_type('string'))

        length_expr = self.expr.relatives.len()
        for expected_cls in (ListDictLength, Int64SequenceExpr):
            self.assertIsInstance(length_expr, expected_cls)

        keys_expr = self.expr.relatives.keys()
        for expected_cls in (DictKeys, ListSequenceExpr):
            self.assertIsInstance(keys_expr, expected_cls)
        self.assertEqual(keys_expr.dtype, validate_data_type('list<string>'))

        values_expr = self.expr.relatives.values()
        for expected_cls in (DictValues, ListSequenceExpr):
            self.assertIsInstance(values_expr, expected_cls)
        self.assertEqual(values_expr.dtype, validate_data_type('list<string>'))
    def setup(self):
        """Create a name/id/fid mock table expression and an ``ODPSEngine`` on ``self.odps``."""
        column_types = [validate_data_type(type_name)
                        for type_name in ('string', 'int64', 'float64')]
        schema = Schema.from_lists(['name', 'id', 'fid'], column_types)

        self.expr = CollectionExpr(
            _source_data=MockTable(name='pyodps_test_expr_table', schema=schema),
            _schema=schema)

        self.engine = ODPSEngine(self.odps)
    def setup(self):
        """Create ``self.expr`` and ``self.expr2`` over two string/int64 mock tables."""
        def build_expr(table_name, column_names):
            column_types = list(map(validate_data_type, ('string', 'int64')))
            table_schema = Schema.from_lists(column_names, column_types)
            mock_table = MockTable(name=table_name, schema=table_schema)
            return CollectionExpr(_source_data=mock_table, _schema=table_schema)

        self.expr = build_expr('pyodps_test_expr_table', ['name', 'id'])
        self.expr2 = build_expr('pyodps_test_expr_table2', ['name2', 'id2'])
Example #18
0
    def setup(self):
        """Build a pandas-backed ``DataFrame`` expression from generated rows.

        Stores the ODPS schema, the rows returned by ``self._gen_data``, the
        pandas frame built from them and the resulting expression on ``self``.
        """
        column_names = ['name', 'category', 'id', 'fid', 'isMale', 'scale', 'birth']
        type_names = ('string', 'string', 'int64', 'float64', 'boolean',
                      'decimal', 'datetime')
        schema = Schema.from_lists(
            column_names,
            [validate_data_type(type_name) for type_name in type_names])
        self.schema = df_schema_to_odps_schema(schema)

        import pandas as pd

        value_bounds = (-1000, 1000)
        self.data = self._gen_data(20, value_range=value_bounds)
        self.df = pd.DataFrame(self.data, columns=schema.names)
        self.expr = DataFrame(self.df, schema=schema)
Example #19
0
    def testBuilders(self):
        """Check the node classes and inferred dtypes of ``make_list`` and ``make_dict``."""
        # make_list from scalars only.
        scalar_list = make_list(1, 2, 3, 4)
        for expected_cls in (ListBuilder, ListScalar):
            self.assertIsInstance(scalar_list, expected_cls)
        self.assertEqual(scalar_list.dtype, validate_data_type('list<int32>'))

        # make_list mixing scalars with a column.
        sequence_list = make_list(1, 2, 3, self.expr.id)
        for expected_cls in (ListBuilder, ListSequenceExpr):
            self.assertIsInstance(sequence_list, expected_cls)
        self.assertEqual(sequence_list.dtype, validate_data_type('list<int64>'))

        # A string among integers is rejected, with or without an explicit type.
        self.assertRaises(TypeError, make_list, 1, 2, 'str', type='int32')
        self.assertRaises(TypeError, make_list, 1, 2, 'str')

        typed_list = make_list(1, 2, 3, 4, type='int64')
        self.assertEqual(typed_list.dtype, validate_data_type('list<int64>'))

        inferred_cases = (
            ((1.1, 2.2, 3.3, 4.4), 'list<float64>'),
            ((1, 2, 3, 65535), 'list<int32>'),
            ((1, 2, 3, compat.long_type(12345678910)), 'list<int64>'),
            ((1, 2, 3, 3.5), 'list<float64>'),
        )
        for items, expected_type_name in inferred_cases:
            built = make_list(*items)
            self.assertEqual(built.dtype, validate_data_type(expected_type_name))

        # make_dict needs an even number of arguments.
        self.assertRaises(ValueError, make_dict, 1, 2, 3)

        scalar_dict = make_dict(1, 2, 3, 4)
        for expected_cls in (DictBuilder, DictScalar):
            self.assertIsInstance(scalar_dict, expected_cls)
        self.assertEqual(scalar_dict.dtype, validate_data_type('dict<int32,int32>'))

        typed_dict = make_dict(1, 2, 3, 4, key_type='int16', value_type='int64')
        for expected_cls in (DictBuilder, DictScalar):
            self.assertIsInstance(typed_dict, expected_cls)
        self.assertEqual(typed_dict.dtype, validate_data_type('dict<int16,int64>'))

        sequence_dict = make_dict(1, 2, 3, self.expr.id)
        for expected_cls in (DictBuilder, DictSequenceExpr):
            self.assertIsInstance(sequence_dict, expected_cls)
        self.assertEqual(sequence_dict.dtype, validate_data_type('dict<int32,int64>'))
    def setup(self):
        """Create two ODPS tables with their expressions and an ``EngineSelecter``.

        The first table gets a uuid-suffixed name and the six-column schema;
        the second uses a fixed name and a (name, id2, id3) schema. Sample rows
        are handed to ``self._gen_data`` and written to the second table.
        """
        def to_types(*type_names):
            return [validate_data_type(type_name) for type_name in type_names]

        schema = Schema.from_lists(
            ['name', 'id', 'fid', 'isMale', 'scale', 'birth'],
            to_types('string', 'int64', 'float64', 'boolean', 'decimal', 'datetime'))
        self.schema = df_schema_to_odps_schema(schema)

        unique_suffix = str(uuid.uuid4()).replace('-', '_')
        table_name = tn('pyodps_test_selecter_table_%s' % unique_suffix)
        self.odps.delete_table(table_name, if_exists=True)
        self.table = self.odps.create_table(name=table_name, schema=self.schema)
        self.expr = CollectionExpr(_source_data=self.table, _schema=schema)

        class FakeBar(object):
            """Progress-bar stand-in whose methods accept anything and do nothing."""

            def update(self, *args, **kwargs):
                return None

            def inc(self, *args, **kwargs):
                return None

            def status(self, *args, **kwargs):
                return None

        self.faked_bar = FakeBar()

        data = [
            ['name1', 4, 5.3, None, None, None],
            ['name2', 2, 3.5, None, None, None],
            ['name1', 4, 4.2, None, None, None],
            ['name1', 3, 2.2, None, None, None],
            ['name1', 3, 4.1, None, None, None],
        ]

        second_column_types = [types.string, types.bigint, types.bigint]
        schema2 = Schema.from_lists(['name', 'id2', 'id3'], second_column_types)

        table_name = tn('pyodps_test_selecter_table2')
        self.odps.delete_table(table_name, if_exists=True)
        table2 = self.odps.create_table(name=table_name, schema=schema2)
        self.expr2 = CollectionExpr(
            _source_data=table2,
            _schema=odps_schema_to_df_schema(schema2))

        self._gen_data(data=data)

        data2 = [
            ['name1', 4, -1],
            ['name2', 1, -2],
        ]
        self.odps.write_table(table2, 0, data2)

        self.selecter = EngineSelecter()
Example #21
0
    def testUnion(self):
        """Union distinct (name, id) rows with a second table and compare sorted rows."""
        data = [
            ['name1', 4, 5.3, None, None, None],
            ['name2', 2, 3.5, None, None, None],
            ['name1', 4, 4.2, None, None, None],
            ['name1', 3, 2.2, None, None, None],
            ['name1', 3, 4.1, None, None, None],
        ]
        data2 = [['name3', 5, -1], ['name4', 6, -2]]

        schema2 = Schema.from_lists(
            ['name', 'id2', 'id3'],
            [validate_data_type(type_name)
             for type_name in ('string', 'int64', 'int64')])
        table_name = tn('pyodps_test_engine_table2')
        table2 = self._create_table_and_insert_data(table_name, schema2, data2)
        expr2 = CollectionExpr(_source_data=table2, _schema=schema2)

        self._gen_data(data=data)

        try:
            left_side = self.expr['name', 'id'].distinct()
            right_side = expr2[expr2.id2.rename('id'), 'name']
            union_expr = left_side.union(right_side)

            result = sorted(self._get_result(self.engine.execute(union_expr)))
            expected = sorted([
                ['name1', 4],
                ['name1', 3],
                ['name2', 2],
                ['name3', 5],
                ['name4', 6],
            ])

            self.assertEqual(len(result), len(expected))
            for actual_row, expected_row in zip(result, expected):
                self.assertEqual([to_str(value) for value in actual_row],
                                 [to_str(value) for value in expected_row])
        finally:
            # Close every cached connection before dropping the helper table.
            for conn in _engine_to_connections.values():
                conn.close()
            table2.drop()
    def testBizarreField(self):
        """Compile a row-wise ``apply`` reading a column named '012' and run the UDTF."""
        def my_func(row):
            return getattr(row, '012') * 2.0

        field_names = ['name', 'id', 'fid', '012']
        field_types = [validate_data_type(type_name)
                       for type_name in ('string', 'int64', 'float64', 'float64')]
        schema = Schema.from_lists(field_names, field_types)

        mock_table = MockTable(name='pyodps_test_expr_table', schema=schema)
        collection = CollectionExpr(_source_data=mock_table, _schema=schema)

        applied = collection.apply(my_func, axis=1, names=['out_col'],
                                   types=['float64'])
        self.engine.compile(applied)

        # Take the first UDF registered on the engine context after compiling.
        first_udf = list(self.engine._ctx._func_to_udfs.values())[0]
        udtf = get_function(first_udf, UDF_CLASS_NAME)

        input_rows = [('name1', 1, None, 10), ('name2', 2, None, 20)]
        self.assertEqual([20, 40], runners.simple_run(udtf, input_rows))
    def setup(self):
        """Recreate the ``pyodps_test_engine_table`` table and build the ODPS engine fixtures."""
        type_names = ('string', 'int64', 'float64', 'boolean', 'decimal', 'datetime')
        schema = Schema.from_lists(
            ['name', 'id', 'fid', 'isMale', 'scale', 'birth'],
            [validate_data_type(type_name) for type_name in type_names])
        self.schema = df_schema_to_odps_schema(schema)

        # Drop any existing table of the same name before creating it again.
        table_name = 'pyodps_test_engine_table'
        self.odps.delete_table(table_name, if_exists=True)
        self.table = self.odps.create_table(name=table_name, schema=self.schema)
        self.expr = CollectionExpr(_source_data=self.table, _schema=schema)

        self.engine = ODPSEngine(self.odps)

        class FakeBar(object):
            """Progress-bar stand-in whose ``update`` ignores its arguments."""

            def update(self, *args, **kwargs):
                return None

        self.faked_bar = FakeBar()
Example #24
0
    def setup(self):
        """Create two ``DynamicCollectionExpr`` fixtures over dynamic-schema mock tables.

        The second dynamic schema is built with ``default_type=types.string``;
        the first is built without a ``default_type``.
        """
        def to_types(*type_names):
            return [validate_data_type(type_name) for type_name in type_names]

        static_schema = Schema.from_lists(
            ['name', 'id', 'fid', 'isMale', 'scale', 'birth'],
            to_types('string', 'int64', 'float64', 'boolean', 'decimal', 'datetime'))
        schema = DynamicSchema.from_schema(static_schema)
        table = MockTable(name='pyodps_test_expr_table', schema=schema)

        static_schema2 = Schema.from_lists(
            ['name2', 'id', 'fid2'],
            to_types('string', 'int64', 'float64'))
        schema2 = DynamicSchema.from_schema(static_schema2,
                                            default_type=types.string)
        table2 = MockTable(name='pyodps_test_expr_tabl2', schema=schema2)

        self.expr = DynamicCollectionExpr(_source_data=table, _schema=schema)
        self.expr2 = DynamicCollectionExpr(_source_data=table2, _schema=schema2)
    def setup(self):
        """Create an empty pandas-backed expression plus pandas and ODPS engines."""
        type_names = ('string', 'int64', 'float64', 'boolean', 'decimal', 'datetime')
        schema = Schema.from_lists(
            ['name', 'id', 'fid', 'isMale', 'scale', 'birth'],
            list(map(validate_data_type, type_names)))
        self.schema = df_schema_to_odps_schema(schema)

        import pandas as pd

        # The frame carries the schema's column names but no rows.
        self.df = pd.DataFrame(None, columns=schema.names)
        self.expr = CollectionExpr(_source_data=self.df, _schema=schema)

        self.engine = PandasEngine(self.odps)
        self.odps_engine = ODPSEngine(self.odps)

        class FakeBar(object):
            """Progress-bar stand-in whose ``update`` ignores its arguments."""

            def update(self, *args, **kwargs):
                return None

        self.faked_bar = FakeBar()
    def setup(self):
        """Drop and recreate ``pyodps_test_engine_table``, then build engine fixtures."""
        column_specs = [
            ('name', 'string'),
            ('id', 'int64'),
            ('fid', 'float64'),
            ('isMale', 'boolean'),
            ('scale', 'decimal'),
            ('birth', 'datetime'),
        ]
        schema = Schema.from_lists(
            [column_name for column_name, _ in column_specs],
            [validate_data_type(type_name) for _, type_name in column_specs])
        self.schema = df_schema_to_odps_schema(schema)

        table_name = 'pyodps_test_engine_table'
        self.odps.delete_table(table_name, if_exists=True)
        self.table = self.odps.create_table(name=table_name, schema=self.schema)
        self.expr = CollectionExpr(_source_data=self.table, _schema=schema)

        self.engine = ODPSEngine(self.odps)

        class FakeBar(object):
            """Progress-bar stand-in whose ``update`` ignores its arguments."""

            def update(self, *args, **kwargs):
                return None

        self.faked_bar = FakeBar()
    def setup(self):
        """Create ``expr`` to ``expr3`` over mock tables, each with a schema rebuilt from ``columns``."""
        def to_types(*type_names):
            return [validate_data_type(type_name) for type_name in type_names]

        def make_expr(table_name, table_schema):
            # The expression schema is a new Schema made from table_schema.columns.
            mock_table = MockTable(name=table_name, schema=table_schema)
            expr_schema = Schema(columns=table_schema.columns)
            return CollectionExpr(_source_data=mock_table, _schema=expr_schema)

        schema = Schema.from_lists(
            ['name', 'id', 'fid', 'isMale', 'scale', 'birth'],
            to_types('string', 'int64', 'float64', 'boolean', 'decimal', 'datetime'),
            ['ds'], to_types('string'))
        self.expr = make_expr('pyodps_test_expr_table', schema)
        self.expr1 = make_expr('pyodps_test_expr_table1', schema)
        self.expr2 = make_expr('pyodps_test_expr_table2', schema)

        schema2 = Schema.from_lists(
            ['name', 'id', 'fid'], to_types('string', 'int64', 'float64'),
            ['part1', 'part2'], to_types('string', 'int64'))
        self.expr3 = make_expr('pyodps_test_expr_table2', schema2)
    def setup(self):
        """Create ``self.expr`` and ``self.expr2`` as dynamic expressions over mock tables."""
        first_types = [
            validate_data_type(type_name)
            for type_name in ("string", "int64", "float64", "boolean", "decimal", "datetime")
        ]
        first_static = Schema.from_lists(
            ["name", "id", "fid", "isMale", "scale", "birth"], first_types
        )
        schema = DynamicSchema.from_schema(first_static)
        table = MockTable(name="pyodps_test_expr_table", schema=schema)

        second_types = [
            validate_data_type(type_name) for type_name in ("string", "int64", "float64")
        ]
        second_static = Schema.from_lists(["name2", "id", "fid2"], second_types)
        # Only the second schema is given an explicit default type.
        schema2 = DynamicSchema.from_schema(second_static, default_type=types.string)
        table2 = MockTable(name="pyodps_test_expr_tabl2", schema=schema2)

        self.expr = DynamicCollectionExpr(_source_data=table, _schema=schema)
        self.expr2 = DynamicCollectionExpr(_source_data=table2, _schema=schema2)
Example #29
0
    def testExplode(self):
        """Check function name, output names and output types of ``explode``."""
        collection = self.expr

        # List column, default output name.
        exploded = collection.hobbies.explode()
        self.assertIsInstance(exploded, RowAppliedCollectionExpr)
        self.assertIs(exploded.input, collection)
        self.assertEqual(exploded._func, 'EXPLODE')
        self.assertEqual(exploded.dtypes.names, [collection.hobbies.name])
        self.assertEqual(exploded.dtypes.types,
                         [collection.hobbies.dtype.value_type])

        # List column, explicit single name.
        renamed = collection.hobbies.explode('exploded')
        self.assertEqual(renamed.dtypes.names, ['exploded'])

        # Two names for a single output column are rejected.
        self.assertRaises(ValueError, collection.hobbies.explode, ['abc', 'def'])

        # List column with positions.
        with_pos = collection.hobbies.explode(pos=True)
        self.assertIsInstance(with_pos, RowAppliedCollectionExpr)
        self.assertIs(with_pos.input, collection)
        self.assertEqual(with_pos._func, 'POSEXPLODE')
        self.assertEqual(
            with_pos.dtypes.names,
            [collection.hobbies.name + '_pos', collection.hobbies.name])
        self.assertEqual(
            with_pos.dtypes.types,
            [validate_data_type('int64'), collection.hobbies.dtype.value_type])

        named_pos = collection.hobbies.explode(['pos', 'exploded'], pos=True)
        self.assertEqual(named_pos.dtypes.names, ['pos', 'exploded'])

        prefixed_pos = collection.hobbies.explode('exploded', pos=True)
        self.assertEqual(prefixed_pos.dtypes.names, ['exploded_pos', 'exploded'])

        # Dict column yields a key column and a value column.
        dict_exploded = collection.relatives.explode()
        self.assertIsInstance(dict_exploded, RowAppliedCollectionExpr)
        self.assertIs(dict_exploded.input, collection)
        self.assertEqual(dict_exploded._func, 'EXPLODE')
        self.assertEqual(
            dict_exploded.dtypes.names,
            [collection.relatives.name + '_key',
             collection.relatives.name + '_value'])
        self.assertEqual(
            dict_exploded.dtypes.types,
            [collection.relatives.dtype.key_type,
             collection.relatives.dtype.value_type])

        dict_named = collection.relatives.explode(['k', 'v'])
        self.assertEqual(dict_named.dtypes.names, ['k', 'v'])

        # A single name is rejected for a dict column, with or without pos.
        self.assertRaises(ValueError, collection.relatives.explode, ['abc'])
        self.assertRaises(ValueError, collection.relatives.explode, ['abc'],
                          pos=True)
    def setup(self):
        """Set up an empty pandas frame expression with pandas and ODPS engines."""
        def to_types(*type_names):
            return [validate_data_type(type_name) for type_name in type_names]

        column_names = ['name', 'id', 'fid', 'isMale', 'scale', 'birth']
        schema = Schema.from_lists(
            column_names,
            to_types('string', 'int64', 'float64', 'boolean', 'decimal', 'datetime'))
        self.schema = df_schema_to_odps_schema(schema)

        import pandas as pd

        # Column labels come from the schema; no data rows are supplied.
        self.df = pd.DataFrame(None, columns=schema.names)
        self.expr = CollectionExpr(_source_data=self.df, _schema=schema)

        self.engine = PandasEngine(self.odps)
        self.odps_engine = ODPSEngine(self.odps)

        class FakeBar(object):
            """Progress-bar stand-in whose ``update`` ignores its arguments."""

            def update(self, *args, **kwargs):
                return None

        self.faked_bar = FakeBar()
Example #31
0
    def testJoinGroupby(self):
        """Join on name, sum ``fid`` per ``id`` and compare with a locally computed sum."""
        data = [
            ['name1', 4, 5.3, None, None, None],
            ['name2', 2, 3.5, None, None, None],
            ['name1', 4, 4.2, None, None, None],
            ['name1', 3, 2.2, None, None, None],
            ['name1', 3, 4.1, None, None, None],
        ]
        data2 = [
            ['name1', 4, -1],
            ['name2', 1, -2],
        ]

        schema2 = Schema.from_lists(
            ['name', 'id2', 'id3'],
            [validate_data_type(type_name)
             for type_name in ('string', 'int64', 'int64')])
        table_name = tn('pyodps_test_engine_table2')
        table2 = self._create_table_and_insert_data(table_name, schema2, data2)
        expr2 = CollectionExpr(_source_data=table2, _schema=schema2)

        self._gen_data(data=data)

        joined = self.expr.join(expr2, on='name')[self.expr]
        grouped = joined.groupby('id').agg(joined.fid.sum())

        result = self._get_result(self.engine.execute(grouped))

        def column_index(column_name):
            # Position of the first schema column carrying this name.
            matches = [idx for idx, col in enumerate(self.expr.schema.names)
                       if col == column_name]
            return matches[0]

        id_idx = column_index('id')
        fid_idx = column_index('fid')

        def row_id(row):
            return row[id_idx]

        # groupby needs its input sorted by the grouping key.
        expected = []
        for key, rows in itertools.groupby(sorted(data, key=row_id), row_id):
            expected.append([key, sum(row[fid_idx] for row in rows)])

        def first_item(row):
            return row[0]

        pairs = zip(sorted(expected, key=first_item),
                    sorted(result, key=first_item))
        for expected_row, actual_row in pairs:
            self.assertAlmostEqual(expected_row[0], actual_row[0])
            self.assertAlmostEqual(expected_row[1], actual_row[1])
Example #32
0
# limitations under the License.

import functools

from odps.df.expr.expressions import CollectionExpr
from odps.df.expr.tests.core import MockTable
from odps.df.types import validate_data_type
from odps.ml.expr.op import *
from odps.ml.tests.base import MLTestBase, tn
from odps.ml.utils import KVConfig
from odps.models.table import TableSchema as Schema

# Names of the two temporary tables, passed through ``tn``.
TEMP_TABLE_1_NAME = tn('pyodps_test_ops_test_table1')
TEMP_TABLE_2_NAME = tn('pyodps_test_ops_test_table2')


def datatypes(*types):
    """Apply ``validate_data_type`` to every argument and return the results as a list."""
    return [validate_data_type(t) for t in types]


class TestOp(MLTestBase):
    def testBaseMethods(self):
        fields = [
            MLField('f%02d' % fid, 'string', FieldRole.FEATURE)
            for fid in range(5)
        ]
        fields_set_singleton = list(
            DFOperation._set_singleton_role(fields, {'f00': FieldRole.WEIGHT}))
        self.assertSetEqual(fields_set_singleton[0].role,
                            set([FieldRole.FEATURE, FieldRole.WEIGHT]))

        fields_set_singleton2 = list(
            DFOperation._set_singleton_role(fields_set_singleton,
 def setup(self):
     """Store a name/id/fid schema on ``self.schema``."""
     column_names = ['name', 'id', 'fid']
     column_types = [validate_data_type(type_name)
                     for type_name in ('string', 'int64', 'float64')]
     self.schema = Schema.from_lists(column_names, column_types)
 def setup(self):
     """Build the three-column schema used by the tests and keep it on ``self.schema``."""
     type_names = ("string", "int64", "float64")
     self.schema = Schema.from_lists(
         ["name", "id", "fid"],
         list(map(validate_data_type, type_names)),
     )
Example #35
0
    def testJoin(self):
        """Run inner, left, right and outer joins through the engine and check the rows.

        Outer joins and the row-number projection are skipped when the SQL
        engine's name is 'mysql'. Cached connections are closed and the helper
        table is dropped afterwards.
        """
        data = [
            ['name1', 4, 5.3, None, None, None],
            ['name2', 2, 3.5, None, None, None],
            ['name1', 4, 4.2, None, None, None],
            ['name1', 3, 2.2, None, None, None],
            ['name1', 3, 4.1, None, None, None],
        ]
        data2 = [
            ['name1', 4, -1],
            ['name2', 1, -2],
        ]

        schema2 = Schema.from_lists(
            ['name', 'id2', 'id3'],
            [validate_data_type(type_name)
             for type_name in ('string', 'int64', 'int64')])
        table_name = tn('pyodps_test_engine_table2')
        table2 = self._create_table_and_insert_data(table_name, schema2, data2)
        expr2 = CollectionExpr(_source_data=table2, _schema=schema2)

        self._gen_data(data=data)

        def fetch(expression):
            # Execute on the engine under test and convert to plain rows.
            return self._get_result(self.engine.execute(expression))

        try:
            # Join without explicit keys.
            result = fetch(self.expr.join(expr2)['name', 'id2'])
            self.assertEqual(len(result), 5)
            expected = [[to_str('name1'), 4], [to_str('name2'), 1]]
            self.assertTrue(all(row in expected for row in result))

            # Inner join on name and id == id2.
            joined = self.expr.join(expr2, on=['name', ('id', 'id2')])
            result = fetch(joined[self.expr.name, expr2.id2])
            self.assertEqual(len(result), 2)
            expected = [to_str('name1'), 4]
            self.assertTrue(all(row == expected for row in result))

            # Left join keeps all five left rows.
            joined = self.expr.left_join(expr2, on=['name', ('id', 'id2')])
            result = fetch(joined[self.expr.name, expr2.id2])
            expected = [
                ['name1', 4],
                ['name2', None],
                ['name1', 4],
                ['name1', None],
                ['name1', None],
            ]
            self.assertEqual(len(result), 5)
            self.assertTrue(all(row in expected for row in result))

            # Right join.
            joined = self.expr.right_join(expr2, on=['name', ('id', 'id2')])
            result = fetch(joined[self.expr.name, expr2.id2])
            expected = [
                ['name1', 4],
                ['name1', 4],
                [None, 1],
            ]
            self.assertEqual(len(result), 3)
            self.assertTrue(all(row in expected for row in result))

            if self.sql_engine.name != 'mysql':
                joined = self.expr.outer_join(expr2, on=['name', ('id', 'id2')])
                result = fetch(joined[self.expr.name, expr2.id2])
                expected = [
                    ['name1', 4],
                    ['name1', 4],
                    ['name2', None],
                    ['name1', None],
                    ['name1', None],
                    [None, 1],
                ]
                self.assertEqual(len(result), 6)
                self.assertTrue(all(row in expected for row in result))

            # Joining against a cached aggregate only has to execute.
            grouped = self.expr.groupby('name').agg(
                new_id=self.expr.id.sum()).cache()
            self.engine.execute(self.expr.join(grouped, on='name'))

            if self.sql_engine.name != 'mysql':
                joined = self.expr.join(expr2, on=['name', ('id', 'id2')])
                expr = joined[
                    (lambda x: x.groupby(Scalar(1)).sort('name').row_number(),)
                ]
                self.engine.execute(expr)
        finally:
            for conn in _engine_to_connections.values():
                conn.close()
            table2.drop()
 def setup(self):
     """Create the name/id/fid schema and assign it to ``self.schema``."""
     def to_types(*type_names):
         return [validate_data_type(type_name) for type_name in type_names]

     self.schema = Schema.from_lists(
         ['name', 'id', 'fid'],
         to_types('string', 'int64', 'float64'))