def setup(self):
    """Create a five-column ODPS table and a Seahawks engine over it.

    Note: the trailing ``[:5]`` slices deliberately drop the sixth
    column ('scale'/'decimal') from both name and type lists.
    """
    def _types(*names):
        return [validate_data_type(n) for n in names]

    schema = Schema.from_lists(
        ['name', 'id', 'fid', 'isMale', 'birth', 'scale'][:5],
        _types('string', 'int64', 'float64', 'boolean', 'datetime', 'decimal')[:5])
    self.schema = df_schema_to_odps_schema(schema)

    table_name = tn('pyodps_test_%s' % str(uuid.uuid4()).replace('-', '_'))
    self.odps.delete_table(table_name, if_exists=True)
    self.table = self.odps.create_table(name=table_name, schema=self.schema)

    self.expr = CollectionExpr(_source_data=self.table, _schema=schema)
    self.engine = SeahawksEngine(self.odps)

    class FakeBar(object):
        # Stand-in progress bar: every callback is a no-op.
        def update(self, *args, **kwargs):
            pass

        def inc(self, *args, **kwargs):
            pass

        def status(self, *args, **kwargs):
            pass

    self.faked_bar = FakeBar()
def setup(self):
    """Build a three-column mock collection and a fresh execute context."""
    def _types(*names):
        return [validate_data_type(n) for n in names]

    schema = Schema.from_lists(["name", "id", "fid"],
                               _types("string", "int64", "float64"))
    mock_table = MockTable(name="pyodps_test_expr_table", schema=schema)
    self.expr = CollectionExpr(_source_data=mock_table, _schema=schema)
    self.ctx = ExecuteContext()
def setup(self):
    """Mock collection with complex (dict/list) column types."""
    def _types(*names):
        return [validate_data_type(n) for n in names]

    schema = Schema.from_lists(
        ['id', 'name', 'relatives', 'hobbies'],
        _types('int64', 'string', 'dict<string, string>', 'list<string>'))
    mock_table = MockTable(name='pyodps_test_expr_table', schema=schema)
    self.expr = CollectionExpr(_source_data=mock_table, _schema=schema)
def setup(self):
    """Create a SQLAlchemy-backed test table mirroring the DataFrame schema.

    Connects to a local database, creates ``pyodps_test_data`` with
    columns derived from the DataFrame schema, and wraps the table as a
    ``CollectionExpr`` for the SQLAlchemy engine tests.
    """
    datatypes = lambda *types: [validate_data_type(t) for t in types]
    schema = Schema.from_lists(
        ['name', 'id', 'fid', 'isMale', 'scale', 'birth'],
        datatypes('string', 'int64', 'float64', 'boolean', 'decimal', 'datetime'))
    self.df_schema = schema
    self.schema = df_schema_to_odps_schema(schema)
    self.df = None
    self.expr = None
    self.engine = SQLAlchemyEngine()

    import sqlalchemy
    from sqlalchemy import create_engine

    # 'postgresql://' is the canonical dialect name; the bare
    # 'postgres://' alias was removed in SQLAlchemy 1.4.
    self.sql_engine = engine = create_engine('postgresql://localhost/pyodps')
    # self.sql_engine = engine = create_engine('mysql://localhost/pyodps')
    # self.sql_engine = engine = create_engine('sqlite://')
    self.conn = engine.connect()

    self.metadata = metadata = sqlalchemy.MetaData(bind=engine)
    columns = df_schema_to_sqlalchemy_columns(self.df_schema, engine=self.sql_engine)
    t = sqlalchemy.Table('pyodps_test_data', metadata, *columns)
    metadata.create_all()
    self.table = t

    self.expr = CollectionExpr(_source_data=self.table, _schema=self.df_schema)

    class FakeBar(object):
        # Stub progress bar so engine code can report progress harmlessly.
        def update(self, *args, **kwargs):
            pass

    self.faked_bar = FakeBar()
def setup(self):
    """Build several mock collections: partitioned, plain and complex-typed."""
    datatypes = lambda *types: [validate_data_type(t) for t in types]
    schema = Schema.from_lists(
        ['name', 'id', 'fid', 'isMale', 'scale', 'birth'],
        datatypes('string', 'int64', 'float64', 'boolean', 'decimal', 'datetime'),
        ['ds'], datatypes('string'))

    table = MockTable(name='pyodps_test_expr_table', schema=schema)
    self.expr = CollectionExpr(_source_data=table, _schema=schema)

    table1 = MockTable(name='pyodps_test_expr_table1', schema=schema)
    self.expr1 = CollectionExpr(_source_data=table1, _schema=schema)

    table2 = MockTable(name='pyodps_test_expr_table2', schema=schema)
    self.expr2 = CollectionExpr(_source_data=table2, _schema=schema)

    schema2 = Schema.from_lists(['name', 'id', 'fid'],
                                datatypes('string', 'int64', 'float64'),
                                ['part1', 'part2'], datatypes('string', 'int64'))
    table3 = MockTable(name='pyodps_test_expr_table2', schema=schema2)
    self.expr3 = CollectionExpr(_source_data=table3, _schema=schema2)

    schema3 = Schema.from_lists(
        ['id', 'name', 'relatives', 'hobbies'],
        datatypes('int64', 'string', 'dict<string, string>', 'list<string>'))
    # Fix: the mock table must carry the same schema as the expression
    # built on it; the original passed `schema` here while expr4 uses
    # schema3, leaving the table and expression schemas inconsistent.
    table4 = MockTable(name='pyodps_test_expr_table', schema=schema3)
    self.expr4 = CollectionExpr(_source_data=table4, _schema=schema3)

    self.maxDiff = None
def setup(self):
    """Single six-column mock collection for expression tests."""
    def _types(*names):
        return [validate_data_type(n) for n in names]

    schema = Schema.from_lists(
        ['name', 'id', 'fid', 'isMale', 'scale', 'birth'],
        _types('string', 'int64', 'float64', 'boolean', 'decimal', 'datetime'))
    mock_table = MockTable(name='pyodps_test_expr_table', schema=schema)
    self.expr = CollectionExpr(_source_data=mock_table, _schema=schema)
def setup(self):
    """Partitioned mock tables; each expr is built on Schema(columns=...)
    so the expression schema carries only the listed columns — presumably
    excluding the partition fields (TODO confirm against Schema)."""
    def _types(*names):
        return [validate_data_type(n) for n in names]

    schema = Schema.from_lists(
        ['name', 'id', 'fid', 'isMale', 'scale', 'birth'],
        _types('string', 'int64', 'float64', 'boolean', 'decimal', 'datetime'),
        ['ds'], _types('string'))

    table = MockTable(name='pyodps_test_expr_table', schema=schema)
    self.expr = CollectionExpr(_source_data=table,
                               _schema=Schema(columns=schema.columns))
    table1 = MockTable(name='pyodps_test_expr_table1', schema=schema)
    self.expr1 = CollectionExpr(_source_data=table1,
                                _schema=Schema(columns=schema.columns))
    table2 = MockTable(name='pyodps_test_expr_table2', schema=schema)
    self.expr2 = CollectionExpr(_source_data=table2,
                                _schema=Schema(columns=schema.columns))

    schema2 = Schema.from_lists(['name', 'id', 'fid'],
                                _types('string', 'int64', 'float64'),
                                ['part1', 'part2'], _types('string', 'int64'))
    table3 = MockTable(name='pyodps_test_expr_table2', schema=schema2)
    self.expr3 = CollectionExpr(_source_data=table3,
                                _schema=Schema(columns=schema2.columns))
def setup(self):
    """Three-column mock collection plus a fresh execute context."""
    def _types(*names):
        return [validate_data_type(n) for n in names]

    schema = Schema.from_lists(['name', 'id', 'fid'],
                               _types('string', 'int64', 'float64'))
    source = MockTable(name='pyodps_test_expr_table', schema=schema)
    self.expr = CollectionExpr(_source_data=source, _schema=schema)
    self.ctx = ExecuteContext()
def testListMethods(self):
    """Type propagation for list-typed sequence operations."""
    item = self.expr.hobbies[0]
    self.assertIsInstance(item, ListDictGetItem)
    self.assertIsInstance(item, StringSequenceExpr)
    self.assertEqual(item.dtype, validate_data_type('string'))

    length = self.expr.hobbies.len()
    self.assertIsInstance(length, ListDictLength)
    self.assertIsInstance(length, Int64SequenceExpr)

    sorted_expr = self.expr.hobbies.sort()
    self.assertIsInstance(sorted_expr, ListSort)
    self.assertIsInstance(sorted_expr, ListSequenceExpr)
    self.assertEqual(sorted_expr.dtype, validate_data_type('list<string>'))

    contains_expr = self.expr.hobbies.contains('yacht')
    self.assertIsInstance(contains_expr, ListContains)
    self.assertIsInstance(contains_expr, BooleanSequenceExpr)
def setup(self):
    """Six-column mock collection for expression construction tests."""
    def _types(*names):
        return [validate_data_type(n) for n in names]

    schema = Schema.from_lists(
        ["name", "id", "fid", "isMale", "scale", "birth"],
        _types("string", "int64", "float64", "boolean", "decimal", "datetime"),
    )
    mock_table = MockTable(name="pyodps_test_expr_table", schema=schema)
    self.expr = CollectionExpr(_source_data=mock_table, _schema=schema)
def testToList(self):
    """``tolist`` on one column yields a ListScalar; on the whole
    collection, a Summary whose fields are all ToList nodes."""
    column_expr = self.expr.int64.tolist()
    self.assertIsInstance(column_expr, ListScalar)
    self.assertEqual(column_expr.dtype, types.validate_data_type('list<int64>'))

    summary = self.expr.tolist()
    self.assertIsInstance(summary, Summary)
    self.assertLessEqual(len(summary.fields), len(types._data_types))
    self.assertTrue(all(isinstance(node, ToList) for node in summary.fields))
def setup(self):
    """Two small two-column mock collections with disjoint column names."""
    def _types(*names):
        return [validate_data_type(n) for n in names]

    schema = Schema.from_lists(['name', 'id'], _types('string', 'int64'))
    table = MockTable(name='pyodps_test_expr_table', schema=schema)
    self.expr = CollectionExpr(_source_data=table, _schema=schema)

    schema2 = Schema.from_lists(['name2', 'id2'], _types('string', 'int64'))
    table2 = MockTable(name='pyodps_test_expr_table2', schema=schema2)
    self.expr2 = CollectionExpr(_source_data=table2, _schema=schema2)
def setup(self):
    """Two two-column mock collections with distinct column names."""
    def _types(*names):
        return [validate_data_type(n) for n in names]

    first_schema = Schema.from_lists(["name", "id"], _types("string", "int64"))
    first_table = MockTable(name="pyodps_test_expr_table", schema=first_schema)
    self.expr = CollectionExpr(_source_data=first_table, _schema=first_schema)

    second_schema = Schema.from_lists(["name2", "id2"], _types("string", "int64"))
    second_table = MockTable(name="pyodps_test_expr_table2", schema=second_schema)
    self.expr2 = CollectionExpr(_source_data=second_table, _schema=second_schema)
def testDictMethods(self):
    """Type propagation for dict-typed sequence operations."""
    item = self.expr.relatives['abc']
    self.assertIsInstance(item, ListDictGetItem)
    self.assertIsInstance(item, StringSequenceExpr)
    self.assertEqual(item.dtype, validate_data_type('string'))

    length = self.expr.relatives.len()
    self.assertIsInstance(length, ListDictLength)
    self.assertIsInstance(length, Int64SequenceExpr)

    keys_expr = self.expr.relatives.keys()
    self.assertIsInstance(keys_expr, DictKeys)
    self.assertIsInstance(keys_expr, ListSequenceExpr)
    self.assertEqual(keys_expr.dtype, validate_data_type('list<string>'))

    values_expr = self.expr.relatives.values()
    self.assertIsInstance(values_expr, DictValues)
    self.assertIsInstance(values_expr, ListSequenceExpr)
    self.assertEqual(values_expr.dtype, validate_data_type('list<string>'))
def setup(self):
    """Three-column mock collection plus an ODPS engine."""
    def _types(*names):
        return [validate_data_type(n) for n in names]

    schema = Schema.from_lists(['name', 'id', 'fid'],
                               _types('string', 'int64', 'float64'))
    mock_table = MockTable(name='pyodps_test_expr_table', schema=schema)
    self.expr = CollectionExpr(_source_data=mock_table, _schema=schema)
    self.engine = ODPSEngine(self.odps)
def setup(self):
    """Generate 20 rows of data and wrap them as a pandas-backed DataFrame."""
    def _types(*names):
        return [validate_data_type(n) for n in names]

    schema = Schema.from_lists(
        ['name', 'category', 'id', 'fid', 'isMale', 'scale', 'birth'],
        _types('string', 'string', 'int64', 'float64', 'boolean', 'decimal', 'datetime'))
    self.schema = df_schema_to_odps_schema(schema)

    import pandas as pd

    self.data = self._gen_data(20, value_range=(-1000, 1000))
    self.df = pd.DataFrame(self.data, columns=schema.names)
    self.expr = DataFrame(self.df, schema=schema)
def testBuilders(self):
    """Type inference and validation for make_list / make_dict builders."""
    small_list = make_list(1, 2, 3, 4)
    self.assertIsInstance(small_list, ListBuilder)
    self.assertIsInstance(small_list, ListScalar)
    self.assertEqual(small_list.dtype, validate_data_type('list<int32>'))

    seq_list = make_list(1, 2, 3, self.expr.id)
    self.assertIsInstance(seq_list, ListBuilder)
    self.assertIsInstance(seq_list, ListSequenceExpr)
    self.assertEqual(seq_list.dtype, validate_data_type('list<int64>'))

    # Incompatible element types are rejected, with or without an
    # explicit element type.
    self.assertRaises(TypeError, make_list, 1, 2, 'str', type='int32')
    self.assertRaises(TypeError, make_list, 1, 2, 'str')

    typed_list = make_list(1, 2, 3, 4, type='int64')
    self.assertEqual(typed_list.dtype, validate_data_type('list<int64>'))

    float_list = make_list(1.1, 2.2, 3.3, 4.4)
    self.assertEqual(float_list.dtype, validate_data_type('list<float64>'))

    int32_list = make_list(1, 2, 3, 65535)
    self.assertEqual(int32_list.dtype, validate_data_type('list<int32>'))

    long_list = make_list(1, 2, 3, compat.long_type(12345678910))
    self.assertEqual(long_list.dtype, validate_data_type('list<int64>'))

    mixed_list = make_list(1, 2, 3, 3.5)
    self.assertEqual(mixed_list.dtype, validate_data_type('list<float64>'))

    # make_dict needs an even number of positional args (key/value pairs).
    self.assertRaises(ValueError, make_dict, 1, 2, 3)

    plain_dict = make_dict(1, 2, 3, 4)
    self.assertIsInstance(plain_dict, DictBuilder)
    self.assertIsInstance(plain_dict, DictScalar)
    self.assertEqual(plain_dict.dtype, validate_data_type('dict<int32,int32>'))

    typed_dict = make_dict(1, 2, 3, 4, key_type='int16', value_type='int64')
    self.assertIsInstance(typed_dict, DictBuilder)
    self.assertIsInstance(typed_dict, DictScalar)
    self.assertEqual(typed_dict.dtype, validate_data_type('dict<int16,int64>'))

    seq_dict = make_dict(1, 2, 3, self.expr.id)
    self.assertIsInstance(seq_dict, DictBuilder)
    self.assertIsInstance(seq_dict, DictSequenceExpr)
    self.assertEqual(seq_dict.dtype, validate_data_type('dict<int32,int64>'))
def setup(self):
    """Create two populated ODPS tables and an EngineSelecter for the tests."""
    def _types(*names):
        return [validate_data_type(n) for n in names]

    schema = Schema.from_lists(
        ['name', 'id', 'fid', 'isMale', 'scale', 'birth'],
        _types('string', 'int64', 'float64', 'boolean', 'decimal', 'datetime'))
    self.schema = df_schema_to_odps_schema(schema)

    table_name = tn('pyodps_test_selecter_table_%s' % str(uuid.uuid4()).replace('-', '_'))
    self.odps.delete_table(table_name, if_exists=True)
    self.table = self.odps.create_table(name=table_name, schema=self.schema)
    self.expr = CollectionExpr(_source_data=self.table, _schema=schema)

    class FakeBar(object):
        # Progress-bar stand-in: every callback is a no-op.
        def update(self, *args, **kwargs):
            pass

        def inc(self, *args, **kwargs):
            pass

        def status(self, *args, **kwargs):
            pass

    self.faked_bar = FakeBar()

    data = [
        ['name1', 4, 5.3, None, None, None],
        ['name2', 2, 3.5, None, None, None],
        ['name1', 4, 4.2, None, None, None],
        ['name1', 3, 2.2, None, None, None],
        ['name1', 3, 4.1, None, None, None],
    ]

    schema2 = Schema.from_lists(['name', 'id2', 'id3'],
                                [types.string, types.bigint, types.bigint])
    table_name = tn('pyodps_test_selecter_table2')
    self.odps.delete_table(table_name, if_exists=True)
    table2 = self.odps.create_table(name=table_name, schema=schema2)
    self.expr2 = CollectionExpr(_source_data=table2,
                                _schema=odps_schema_to_df_schema(schema2))

    self._gen_data(data=data)

    data2 = [['name1', 4, -1], ['name2', 1, -2]]
    self.odps.write_table(table2, 0, data2)

    self.selecter = EngineSelecter()
def testUnion(self):
    """Union of a distinct projection with a renamed column from a second table."""
    data = [
        ['name1', 4, 5.3, None, None, None],
        ['name2', 2, 3.5, None, None, None],
        ['name1', 4, 4.2, None, None, None],
        ['name1', 3, 2.2, None, None, None],
        ['name1', 3, 4.1, None, None, None],
    ]
    data2 = [
        ['name3', 5, -1],
        ['name4', 6, -2]
    ]

    datatypes = lambda *types: [validate_data_type(t) for t in types]
    schema2 = Schema.from_lists(['name', 'id2', 'id3'],
                                datatypes('string', 'int64', 'int64'))
    table_name = tn('pyodps_test_engine_table2')
    table2 = self._create_table_and_insert_data(table_name, schema2, data2)
    expr2 = CollectionExpr(_source_data=table2, _schema=schema2)

    self._gen_data(data=data)

    try:
        # id2 is renamed to 'id' so both branches share the same schema.
        expr = self.expr['name', 'id'].distinct().union(
            expr2[expr2.id2.rename('id'), 'name'])

        res = self.engine.execute(expr)
        result = self._get_result(res)

        expected = [
            ['name1', 4],
            ['name1', 3],
            ['name2', 2],
            ['name3', 5],
            ['name4', 6]
        ]

        result = sorted(result)
        expected = sorted(expected)
        self.assertEqual(len(result), len(expected))
        for e, r in zip(result, expected):
            self.assertEqual([to_str(t) for t in e], [to_str(t) for t in r])
    finally:
        # Use a plain loop for the side effect instead of a throwaway
        # list comprehension.
        for conn in _engine_to_connections.values():
            conn.close()
        table2.drop()
def testBizarreField(self):
    """A column whose name is not a valid identifier ('012') is still
    reachable via getattr inside a row-applied UDF."""
    def my_func(row):
        return getattr(row, '012') * 2.0

    def _types(*names):
        return [validate_data_type(n) for n in names]

    schema = Schema.from_lists(['name', 'id', 'fid', '012'],
                               _types('string', 'int64', 'float64', 'float64'))
    mock_table = MockTable(name='pyodps_test_expr_table', schema=schema)
    expr = CollectionExpr(_source_data=mock_table, _schema=schema)

    self.engine.compile(
        expr.apply(my_func, axis=1, names=['out_col'], types=['float64']))
    udtf = list(self.engine._ctx._func_to_udfs.values())[0]
    udtf = get_function(udtf, UDF_CLASS_NAME)
    self.assertEqual(
        [20, 40],
        runners.simple_run(udtf, [('name1', 1, None, 10), ('name2', 2, None, 20)]))
def setup(self):
    """Create the shared ODPS test table and engine used by these tests."""
    datatypes = lambda *types: [validate_data_type(t) for t in types]
    schema = Schema.from_lists(
        ['name', 'id', 'fid', 'isMale', 'scale', 'birth'],
        datatypes('string', 'int64', 'float64', 'boolean', 'decimal', 'datetime'))
    self.schema = df_schema_to_odps_schema(schema)

    table_name = 'pyodps_test_engine_table'
    self.odps.delete_table(table_name, if_exists=True)
    # Reuse the variable instead of repeating the table-name literal.
    self.table = self.odps.create_table(name=table_name, schema=self.schema)
    self.expr = CollectionExpr(_source_data=self.table, _schema=schema)

    self.engine = ODPSEngine(self.odps)

    class FakeBar(object):
        # No-op progress bar used in place of the interactive one.
        def update(self, *args, **kwargs):
            pass

    self.faked_bar = FakeBar()
def setup(self):
    """Build two dynamic-schema mock collections, the second with a
    string default type."""
    def _types(*names):
        return [validate_data_type(n) for n in names]

    schema = DynamicSchema.from_schema(
        Schema.from_lists(
            ['name', 'id', 'fid', 'isMale', 'scale', 'birth'],
            _types('string', 'int64', 'float64', 'boolean', 'decimal', 'datetime')))
    table = MockTable(name='pyodps_test_expr_table', schema=schema)

    schema2 = DynamicSchema.from_schema(
        Schema.from_lists(['name2', 'id', 'fid2'],
                          _types('string', 'int64', 'float64')),
        default_type=types.string)
    table2 = MockTable(name='pyodps_test_expr_tabl2', schema=schema2)

    self.expr = DynamicCollectionExpr(_source_data=table, _schema=schema)
    self.expr2 = DynamicCollectionExpr(_source_data=table2, _schema=schema2)
def setup(self):
    """Empty pandas-backed collection plus pandas and ODPS engines."""
    def _types(*names):
        return [validate_data_type(n) for n in names]

    schema = Schema.from_lists(
        ['name', 'id', 'fid', 'isMale', 'scale', 'birth'],
        _types('string', 'int64', 'float64', 'boolean', 'decimal', 'datetime'))
    self.schema = df_schema_to_odps_schema(schema)

    import pandas as pd

    self.df = pd.DataFrame(None, columns=schema.names)
    self.expr = CollectionExpr(_source_data=self.df, _schema=schema)

    self.engine = PandasEngine(self.odps)
    self.odps_engine = ODPSEngine(self.odps)

    class FakeBar(object):
        # No-op progress reporting.
        def update(self, *args, **kwargs):
            pass

    self.faked_bar = FakeBar()
def setup(self):
    """Partitioned mock tables; expressions are built over
    Schema(columns=...) so they carry only the listed columns —
    presumably excluding partition fields (TODO confirm)."""
    def _dt(*names):
        return [validate_data_type(n) for n in names]

    base_schema = Schema.from_lists(
        ['name', 'id', 'fid', 'isMale', 'scale', 'birth'],
        _dt('string', 'int64', 'float64', 'boolean', 'decimal', 'datetime'),
        ['ds'], _dt('string'))

    for idx, tbl_name in enumerate(
            ['pyodps_test_expr_table', 'pyodps_test_expr_table1',
             'pyodps_test_expr_table2']):
        mock = MockTable(name=tbl_name, schema=base_schema)
        expr = CollectionExpr(_source_data=mock,
                              _schema=Schema(columns=base_schema.columns))
        setattr(self, 'expr' if idx == 0 else 'expr%d' % idx, expr)

    part_schema = Schema.from_lists(['name', 'id', 'fid'],
                                    _dt('string', 'int64', 'float64'),
                                    ['part1', 'part2'], _dt('string', 'int64'))
    part_table = MockTable(name='pyodps_test_expr_table2', schema=part_schema)
    self.expr3 = CollectionExpr(_source_data=part_table,
                                _schema=Schema(columns=part_schema.columns))
def setup(self):
    """Two dynamic-schema mock collections; the second falls back to a
    string default type for unknown columns."""
    def _types(*names):
        return [validate_data_type(n) for n in names]

    full_schema = DynamicSchema.from_schema(
        Schema.from_lists(
            ["name", "id", "fid", "isMale", "scale", "birth"],
            _types("string", "int64", "float64", "boolean", "decimal", "datetime"),
        )
    )
    full_table = MockTable(name="pyodps_test_expr_table", schema=full_schema)
    self.expr = DynamicCollectionExpr(_source_data=full_table, _schema=full_schema)

    partial_schema = DynamicSchema.from_schema(
        Schema.from_lists(["name2", "id", "fid2"],
                          _types("string", "int64", "float64")),
        default_type=types.string,
    )
    partial_table = MockTable(name="pyodps_test_expr_tabl2", schema=partial_schema)
    self.expr2 = DynamicCollectionExpr(_source_data=partial_table, _schema=partial_schema)
def testExplode(self):
    """Explode over list and dict columns, with and without positions."""
    exploded = self.expr.hobbies.explode()
    self.assertIsInstance(exploded, RowAppliedCollectionExpr)
    self.assertIs(exploded.input, self.expr)
    self.assertEqual(exploded._func, 'EXPLODE')
    self.assertEqual(exploded.dtypes.names, [self.expr.hobbies.name])
    self.assertEqual(exploded.dtypes.types, [self.expr.hobbies.dtype.value_type])

    named = self.expr.hobbies.explode('exploded')
    self.assertEqual(named.dtypes.names, ['exploded'])

    # A list column explodes to exactly one output column.
    self.assertRaises(ValueError, self.expr.hobbies.explode, ['abc', 'def'])

    with_pos = self.expr.hobbies.explode(pos=True)
    self.assertIsInstance(with_pos, RowAppliedCollectionExpr)
    self.assertIs(with_pos.input, self.expr)
    self.assertEqual(with_pos._func, 'POSEXPLODE')
    self.assertEqual(with_pos.dtypes.names,
                     [self.expr.hobbies.name + '_pos', self.expr.hobbies.name])
    self.assertEqual(with_pos.dtypes.types,
                     [validate_data_type('int64'),
                      self.expr.hobbies.dtype.value_type])

    named_pos = self.expr.hobbies.explode(['pos', 'exploded'], pos=True)
    self.assertEqual(named_pos.dtypes.names, ['pos', 'exploded'])

    single_named_pos = self.expr.hobbies.explode('exploded', pos=True)
    self.assertEqual(single_named_pos.dtypes.names, ['exploded_pos', 'exploded'])

    dict_exploded = self.expr.relatives.explode()
    self.assertIsInstance(dict_exploded, RowAppliedCollectionExpr)
    self.assertIs(dict_exploded.input, self.expr)
    self.assertEqual(dict_exploded._func, 'EXPLODE')
    self.assertEqual(dict_exploded.dtypes.names,
                     [self.expr.relatives.name + '_key',
                      self.expr.relatives.name + '_value'])
    self.assertEqual(dict_exploded.dtypes.types,
                     [self.expr.relatives.dtype.key_type,
                      self.expr.relatives.dtype.value_type])

    kv_named = self.expr.relatives.explode(['k', 'v'])
    self.assertEqual(kv_named.dtypes.names, ['k', 'v'])

    # A dict column explodes to a key/value pair, so one name is invalid.
    self.assertRaises(ValueError, self.expr.relatives.explode, ['abc'])
    self.assertRaises(ValueError, self.expr.relatives.explode, ['abc'], pos=True)
def setup(self):
    """Wrap an empty pandas DataFrame as the test collection and create
    both the pandas and the ODPS execution engines."""
    def _dtypes(*names):
        return [validate_data_type(n) for n in names]

    df_schema = Schema.from_lists(
        ['name', 'id', 'fid', 'isMale', 'scale', 'birth'],
        _dtypes('string', 'int64', 'float64', 'boolean', 'decimal', 'datetime'))
    self.schema = df_schema_to_odps_schema(df_schema)

    import pandas as pd

    self.df = pd.DataFrame(None, columns=df_schema.names)
    self.expr = CollectionExpr(_source_data=self.df, _schema=df_schema)
    self.engine = PandasEngine(self.odps)
    self.odps_engine = ODPSEngine(self.odps)

    class FakeBar(object):
        # Silent stand-in for the progress bar.
        def update(self, *args, **kwargs):
            pass

    self.faked_bar = FakeBar()
def testJoinGroupby(self):
    """Join on 'name' then group by 'id', checking the summed fid per group."""
    data = [
        ['name1', 4, 5.3, None, None, None],
        ['name2', 2, 3.5, None, None, None],
        ['name1', 4, 4.2, None, None, None],
        ['name1', 3, 2.2, None, None, None],
        ['name1', 3, 4.1, None, None, None],
    ]
    data2 = [['name1', 4, -1], ['name2', 1, -2]]

    datatypes = lambda *types: [validate_data_type(t) for t in types]
    schema2 = Schema.from_lists(['name', 'id2', 'id3'],
                                datatypes('string', 'int64', 'int64'))
    table_name = tn('pyodps_test_engine_table2')
    table2 = self._create_table_and_insert_data(table_name, schema2, data2)
    expr2 = CollectionExpr(_source_data=table2, _schema=schema2)

    self._gen_data(data=data)

    expr = self.expr.join(expr2, on='name')[self.expr]
    expr = expr.groupby('id').agg(expr.fid.sum())

    res = self.engine.execute(expr)
    result = self._get_result(res)

    # list.index is clearer and cheaper than a filtering comprehension
    # followed by [0].
    id_idx = self.expr.schema.names.index('id')
    fid_idx = self.expr.schema.names.index('fid')
    expected = [[k, sum(v[fid_idx] for v in row)]
                for k, row in itertools.groupby(
                    sorted(data, key=lambda r: r[id_idx]),
                    lambda r: r[id_idx])]
    for it in zip(sorted(expected, key=lambda it: it[0]),
                  sorted(result, key=lambda it: it[0])):
        self.assertAlmostEqual(it[0][0], it[1][0])
        self.assertAlmostEqual(it[0][1], it[1][1])
# limitations under the License. import functools from odps.df.expr.expressions import CollectionExpr from odps.df.expr.tests.core import MockTable from odps.df.types import validate_data_type from odps.ml.expr.op import * from odps.ml.tests.base import MLTestBase, tn from odps.ml.utils import KVConfig from odps.models.table import TableSchema as Schema TEMP_TABLE_1_NAME = tn('pyodps_test_ops_test_table1') TEMP_TABLE_2_NAME = tn('pyodps_test_ops_test_table2') datatypes = lambda *types: [validate_data_type(t) for t in types] class TestOp(MLTestBase): def testBaseMethods(self): fields = [ MLField('f%02d' % fid, 'string', FieldRole.FEATURE) for fid in range(5) ] fields_set_singleton = list( DFOperation._set_singleton_role(fields, {'f00': FieldRole.WEIGHT})) self.assertSetEqual(fields_set_singleton[0].role, set([FieldRole.FEATURE, FieldRole.WEIGHT])) fields_set_singleton2 = list( DFOperation._set_singleton_role(fields_set_singleton,
def setup(self):
    """Only a simple three-column schema is needed by these tests."""
    def _types(*names):
        return [validate_data_type(n) for n in names]

    self.schema = Schema.from_lists(['name', 'id', 'fid'],
                                    _types('string', 'int64', 'float64'))
def setup(self):
    """Provide the shared three-column schema used throughout the class."""
    column_names = ["name", "id", "fid"]
    column_types = [validate_data_type(t)
                    for t in ("string", "int64", "float64")]
    self.schema = Schema.from_lists(column_names, column_types)
def testJoin(self):
    """Inner, left, right and outer joins executed through the SQL engine."""
    data = [
        ['name1', 4, 5.3, None, None, None],
        ['name2', 2, 3.5, None, None, None],
        ['name1', 4, 4.2, None, None, None],
        ['name1', 3, 2.2, None, None, None],
        ['name1', 3, 4.1, None, None, None],
    ]
    data2 = [['name1', 4, -1], ['name2', 1, -2]]

    datatypes = lambda *types: [validate_data_type(t) for t in types]
    schema2 = Schema.from_lists(['name', 'id2', 'id3'],
                                datatypes('string', 'int64', 'int64'))
    table_name = tn('pyodps_test_engine_table2')
    table2 = self._create_table_and_insert_data(table_name, schema2, data2)
    expr2 = CollectionExpr(_source_data=table2, _schema=schema2)

    self._gen_data(data=data)

    try:
        # Implicit join on the shared 'name' column.
        expr = self.expr.join(expr2)['name', 'id2']
        res = self.engine.execute(expr)
        result = self._get_result(res)
        self.assertEqual(len(result), 5)
        expected = [[to_str('name1'), 4], [to_str('name2'), 1]]
        self.assertTrue(all(it in expected for it in result))

        # Explicit join keys: name plus id == id2.
        expr = self.expr.join(expr2, on=['name', ('id', 'id2')])[self.expr.name, expr2.id2]
        res = self.engine.execute(expr)
        result = self._get_result(res)
        self.assertEqual(len(result), 2)
        expected = [to_str('name1'), 4]
        self.assertTrue(all(it == expected for it in result))

        expr = self.expr.left_join(expr2, on=['name', ('id', 'id2')])[self.expr.name, expr2.id2]
        res = self.engine.execute(expr)
        result = self._get_result(res)
        expected = [['name1', 4], ['name2', None], ['name1', 4],
                    ['name1', None], ['name1', None]]
        self.assertEqual(len(result), 5)
        self.assertTrue(all(it in expected for it in result))

        expr = self.expr.right_join(expr2, on=['name', ('id', 'id2')])[self.expr.name, expr2.id2]
        res = self.engine.execute(expr)
        result = self._get_result(res)
        expected = [
            ['name1', 4],
            ['name1', 4],
            [None, 1],
        ]
        self.assertEqual(len(result), 3)
        self.assertTrue(all(it in expected for it in result))

        # Outer join is skipped on MySQL — presumably because the backend
        # lacks FULL OUTER JOIN support; confirm against the engine.
        if self.sql_engine.name != 'mysql':
            expr = self.expr.outer_join(expr2, on=['name', ('id', 'id2')])[self.expr.name, expr2.id2]
            res = self.engine.execute(expr)
            result = self._get_result(res)
            expected = [
                ['name1', 4],
                ['name1', 4],
                ['name2', None],
                ['name1', None],
                ['name1', None],
                [None, 1],
            ]
            self.assertEqual(len(result), 6)
            self.assertTrue(all(it in expected for it in result))

        grouped = self.expr.groupby('name').agg(
            new_id=self.expr.id.sum()).cache()
        self.engine.execute(self.expr.join(grouped, on='name'))

        if self.sql_engine.name != 'mysql':
            expr = self.expr.join(expr2, on=[
                'name', ('id', 'id2')
            ])[lambda x: x.groupby(Scalar(1)).sort('name').row_number(), ]
            self.engine.execute(expr)
    finally:
        # Plain loop for the side effect instead of a throwaway
        # list comprehension.
        for conn in _engine_to_connections.values():
            conn.close()
        table2.drop()