def setup(self):
    """Build the fixtures shared by the engine-selecter tests.

    Creates two ODPS tables: a randomized-name 6-column source table
    (exposed as ``self.expr``) filled via ``self._gen_data``, and a fixed
    3-column table (``self.expr2``) written directly.  Also installs a
    no-op progress bar and the ``EngineSelecter`` under test.
    """
    def datatypes(*names):
        # Named helper instead of a lambda bound to a variable (PEP 8 E731).
        return [validate_data_type(t) for t in names]

    schema = Schema.from_lists(
        ['name', 'id', 'fid', 'isMale', 'scale', 'birth'],
        datatypes('string', 'int64', 'float64', 'boolean', 'decimal', 'datetime'))
    self.schema = df_schema_to_odps_schema(schema)

    # Randomized table name so concurrent test runs do not clash.
    table_name = tn('pyodps_test_selecter_table_%s' % str(uuid.uuid4()).replace('-', '_'))
    self.odps.delete_table(table_name, if_exists=True)
    self.table = self.odps.create_table(name=table_name, schema=self.schema)
    self.expr = CollectionExpr(_source_data=self.table, _schema=schema)

    class FakeBar(object):
        """No-op progress-bar stub accepted wherever a real bar is expected."""

        def update(self, *args, **kwargs):
            pass

        def inc(self, *args, **kwargs):
            pass

        def status(self, *args, **kwargs):
            pass

    self.faked_bar = FakeBar()

    data = [
        ['name1', 4, 5.3, None, None, None],
        ['name2', 2, 3.5, None, None, None],
        ['name1', 4, 4.2, None, None, None],
        ['name1', 3, 2.2, None, None, None],
        ['name1', 3, 4.1, None, None, None],
    ]

    schema2 = Schema.from_lists(['name', 'id2', 'id3'],
                                [types.string, types.bigint, types.bigint])
    table_name = tn('pyodps_test_selecter_table2')
    self.odps.delete_table(table_name, if_exists=True)
    table2 = self.odps.create_table(name=table_name, schema=schema2)
    self.expr2 = CollectionExpr(_source_data=table2,
                                _schema=odps_schema_to_df_schema(schema2))

    self._gen_data(data=data)

    data2 = [['name1', 4, -1], ['name2', 1, -2]]
    self.odps.write_table(table2, 0, data2)

    self.selecter = EngineSelecter()
def setup(self):
    """Build the fixtures shared by the Seahawks engine tests.

    Creates a randomized-name 5-column source table (``self.expr``), the
    ``SeahawksEngine`` under test, and a no-op progress bar stub.
    """
    def datatypes(*names):
        # Named helper instead of a lambda bound to a variable (PEP 8 E731).
        return [validate_data_type(t) for t in names]

    # NOTE(review): the original built 6-element lists and sliced off the
    # trailing decimal 'scale' column with [:5]; listing the five kept
    # columns directly is equivalent (presumably decimal is unsupported
    # by this backend — confirm).
    schema = Schema.from_lists(
        ['name', 'id', 'fid', 'isMale', 'birth'],
        datatypes('string', 'int64', 'float64', 'boolean', 'datetime'))
    self.schema = df_schema_to_odps_schema(schema)

    # Randomized table name so concurrent test runs do not clash.
    table_name = tn('pyodps_test_%s' % str(uuid.uuid4()).replace('-', '_'))
    self.odps.delete_table(table_name, if_exists=True)
    self.table = self.odps.create_table(name=table_name, schema=self.schema)
    self.expr = CollectionExpr(_source_data=self.table, _schema=schema)

    self.engine = SeahawksEngine(self.odps)

    class FakeBar(object):
        """No-op progress-bar stub accepted wherever a real bar is expected."""

        def update(self, *args, **kwargs):
            pass

        def inc(self, *args, **kwargs):
            pass

        def status(self, *args, **kwargs):
            pass

    self.faked_bar = FakeBar()
def testMakeKV(self):
    """``to_kv`` folds the six double-typed ``k*`` columns into one string
    column of comma-separated ``key=value`` pairs.

    Per the expected rows below: ``None`` cells are omitted, and integral
    floats are rendered without a trailing ``.0`` (1.0 -> ``k1=1``).
    """
    from odps import types as odps_types
    # One row per output line; columns after 'name' map to kv_cols in order.
    data = [
        ['name1', 1.0, 3.0, None, 10.0, None, None],
        ['name1', None, 3.0, 5.1, None, None, None],
        ['name1', 7.1, None, None, None, 8.2, None],
        ['name2', None, 1.2, 1.5, None, None, None],
        ['name2', None, 1.0, None, None, None, 1.1],
    ]
    kv_cols = ['k1', 'k2', 'k3', 'k5', 'k7', 'k9']
    schema = Schema.from_lists(['name'] + kv_cols,
                               [odps_types.string] + [odps_types.double] * 6)
    table_name = tn('pyodps_test_engine_make_kv')
    self.odps.delete_table(table_name, if_exists=True)
    table = self.odps.create_table(name=table_name, schema=schema)
    expr = CollectionExpr(_source_data=table,
                          _schema=odps_schema_to_df_schema(schema))
    try:
        self.odps.write_table(table, 0, data)
        expr1 = expr.to_kv(columns=kv_cols, kv_delim='=')
        res = self.engine.execute(expr1)
        result = self._get_result(res)
        expected = [
            ['name1', 'k1=1,k2=3,k5=10'],
            ['name1', 'k2=3,k3=5.1'],
            ['name1', 'k1=7.1,k7=8.2'],
            ['name2', 'k2=1.2,k3=1.5'],
            ['name2', 'k2=1,k9=1.1'],
        ]
        self.assertListEqual(result, expected)
    finally:
        # Always drop the scratch table, even on assertion failure.
        table.drop()
def testAXFException(self):
    """Executing the source of a partition-persisted frame through the
    engine is expected to raise ``sqlalchemy.exc.DatabaseError``."""
    import sqlalchemy

    rows = [
        ['name1', 4, 5.3, None, None],
        ['name2', 2, 3.5, None, None],
        ['name1', 4, 4.2, None, None],
        ['name1', 3, 2.2, None, None],
        ['name1', 3, 4.1, None, None],
    ]
    self._gen_data(data=rows)

    table_name = tn('pyodps_test_engine_axf_seahawks_table')
    try:
        # Pre-create a table partitioned on 'ds', then persist into ds=today.
        partitioned_schema = Schema.from_lists(
            self.schema.names, self.schema.types, ['ds'], ['string'])
        self.odps.create_table(table_name, partitioned_schema)
        persisted = self.engine.persist(
            self.expr, table_name, partition='ds=today', create_partition=True)
        with self.assertRaises(sqlalchemy.exc.DatabaseError):
            self.engine.execute(persisted.input)
    finally:
        self.odps.delete_table(table_name, if_exists=True)
def testScaleValue(self):
    """``min_max_scale`` and ``std_scale`` on the ``fid`` column must
    reproduce the precomputed scaled values (compared with
    ``assertAlmostEqual`` to tolerate float rounding).

    Fixes over the original: the scratch table is now dropped in a
    ``finally`` block (consistent with the sibling tests), and the
    duplicated row-comparison loops are factored into one helper that
    also checks the row counts match before zipping.
    """
    data = [
        ['name1', 4, 5.3],
        ['name2', 2, 3.5],
        ['name1', 4, 4.2],
        ['name1', 3, 2.2],
        ['name1', 3, 4.1],
    ]
    schema = Schema.from_lists(['name', 'id', 'fid'],
                               [types.string, types.bigint, types.double])
    table_name = tn('pyodps_test_engine_scale_table')
    self.odps.delete_table(table_name, if_exists=True)
    table = self.odps.create_table(name=table_name, schema=schema)

    def assert_rows_almost_equal(result, expected):
        # Order-insensitive comparison: sort both sides, then compare
        # cell-by-cell with assertAlmostEqual (exact for equal strings).
        self.assertEqual(len(result), len(expected))
        for first, second in zip(sorted(result), sorted(expected)):
            self.assertEqual(len(first), len(second))
            for it1, it2 in zip(first, second):
                self.assertAlmostEqual(it1, it2)

    try:
        self.odps.write_table(table_name, 0, data)
        expr_input = CollectionExpr(_source_data=table,
                                    _schema=odps_schema_to_df_schema(schema))

        expr = expr_input.min_max_scale(columns=['fid'])
        res = self.engine.execute(expr)
        result = self._get_result(res)
        expected = [['name1', 4, 1.0],
                    ['name2', 2, 0.41935483870967744],
                    ['name1', 4, 0.6451612903225807],
                    ['name1', 3, 0.0],
                    ['name1', 3, 0.6129032258064515]]
        assert_rows_almost_equal(result, expected)

        expr = expr_input.std_scale(columns=['fid'])
        res = self.engine.execute(expr)
        result = self._get_result(res)
        expected = [['name1', 4, 1.4213602653434203],
                    ['name2', 2, -0.3553400663358544],
                    ['name1', 4, 0.3355989515394193],
                    ['name1', 3, -1.6385125281042194],
                    ['name1', 3, 0.23689337755723686]]
        assert_rows_almost_equal(result, expected)
    finally:
        table.drop()
def testPersist(self):
    """Persist the source expression into a table in three ways: plain,
    into a single pre-created partition, and dynamically partitioned."""
    data = [
        ['name1', 4, 5.3, None, None],
        ['name2', 2, 3.5, None, None],
        ['name1', 4, 4.2, None, None],
        ['name1', 3, 2.2, None, None],
        ['name1', 3, 4.1, None, None],
    ]
    self._gen_data(data=data)

    table_name = tn('pyodps_test_engine_persist_seahawks_table')

    # 1. Plain persist: reading the persisted frame round-trips all rows.
    try:
        df = self.engine.persist(self.expr, table_name)
        res = self.engine.execute(df)
        result = self._get_result(res)
        self.assertEqual(len(result), 5)
        self.assertEqual(data, result)
    finally:
        self.odps.delete_table(table_name, if_exists=True)

    # 2. Persist into partition ds=today of a pre-created partitioned
    #    table; the partition value comes back as the trailing column,
    #    hence the r[:-1] slice when comparing.
    try:
        schema = Schema.from_lists(self.schema.names, self.schema.types,
                                   ['ds'], ['string'])
        self.odps.create_table(table_name, schema)
        df = self.engine.persist(self.expr, table_name,
                                 partition='ds=today', create_partition=True)
        res = self.engine.execute(df)
        result = self._get_result(res)
        self.assertEqual(len(result), 5)
        self.assertEqual(data, [r[:-1] for r in result])
    finally:
        self.odps.delete_table(table_name, if_exists=True)

    # 3. Dynamic partitioning on 'name': expect one partition per distinct
    #    name, holding 4 rows for name1 and 1 row for name2.
    try:
        self.engine.persist(self.expr, table_name, partitions=['name'])
        t = self.odps.get_table(table_name)
        self.assertEqual(2, len(list(t.partitions)))
        with t.open_reader(partition='name=name1', reopen=True) as r:
            self.assertEqual(4, r.count)
        with t.open_reader(partition='name=name2', reopen=True) as r:
            self.assertEqual(1, r.count)
    finally:
        self.odps.delete_table(table_name, if_exists=True)
def testUnion(self):
    """Union of distinct (name, id) pairs from the source table with the
    second table's (id2 renamed to id, name) projection."""
    data = [
        ['name1', 4, 5.3, None, None, None],
        ['name2', 2, 3.5, None, None, None],
        ['name1', 4, 4.2, None, None, None],
        ['name1', 3, 2.2, None, None, None],
        ['name1', 3, 4.1, None, None, None],
    ]
    data2 = [
        ['name3', 5, -1],
        ['name4', 6, -2]
    ]
    datatypes = lambda *types: [validate_data_type(t) for t in types]
    schema2 = Schema.from_lists(['name', 'id2', 'id3'],
                                datatypes('string', 'int64', 'int64'))
    table_name = tn('pyodps_test_engine_table2')
    table2 = self._create_table_and_insert_data(table_name, schema2, data2)
    expr2 = CollectionExpr(_source_data=table2, _schema=schema2)

    self._gen_data(data=data)
    try:
        # Column order in the right branch is (id, name); union aligns
        # by schema, so both sides contribute (name, id) rows.
        expr = self.expr['name', 'id'].distinct().union(
            expr2[expr2.id2.rename('id'), 'name'])

        res = self.engine.execute(expr)
        result = self._get_result(res)

        expected = [
            ['name1', 4],
            ['name1', 3],
            ['name2', 2],
            ['name3', 5],
            ['name4', 6]
        ]

        # Order-insensitive comparison: sort both sides first.
        result = sorted(result)
        expected = sorted(expected)
        self.assertEqual(len(result), len(expected))
        for e, r in zip(result, expected):
            self.assertEqual([to_str(t) for t in e],
                             [to_str(t) for t in r])
    finally:
        # Close any cached backend connections before dropping the table.
        [conn.close() for conn in _engine_to_connections.values()]
        table2.drop()
def testPersist(self):
    """Persist the source expression into a table in three ways: plain
    (read back via to_pandas), into a single pre-created partition (read
    back through the ODPS engine), and dynamically partitioned."""
    data = [
        ['name1', 4, 5.3, None, None, None],
        ['name2', 2, 3.5, None, None, None],
        ['name1', 4, 4.2, None, None, None],
        ['name1', 3, 2.2, None, None, None],
        ['name1', 3, 4.1, None, None, None],
    ]
    self._gen_data(data=data)

    table_name = tn('pyodps_test_engine_persist_table')

    # 1. Plain persist: the persisted frame round-trips all rows.
    try:
        df = self.engine.persist(self.expr, table_name)
        res = df.to_pandas()
        result = self._get_result(res)
        self.assertEqual(len(result), 5)
        self.assertEqual(data, result)
    finally:
        self.odps.delete_table(table_name, if_exists=True)

    # 2. Persist into partition ds=today of a pre-created partitioned
    #    table; the partition value comes back as the trailing column,
    #    hence the d[:-1] slice when comparing.
    try:
        schema = Schema.from_lists(self.schema.names, self.schema.types,
                                   ['ds'], ['string'])
        self.odps.create_table(table_name, schema)
        df = self.engine.persist(self.expr, table_name,
                                 partition='ds=today', create_partition=True)
        res = self.odps_engine.execute(df)
        result = self._get_result(res)
        self.assertEqual(len(result), 5)
        self.assertEqual(data, [d[:-1] for d in result])
    finally:
        self.odps.delete_table(table_name, if_exists=True)

    # 3. Dynamic partitioning on 'name': expect one partition per distinct
    #    name, holding 4 rows for name1 and 1 row for name2.
    try:
        self.engine.persist(self.expr, table_name, partitions=['name'])
        t = self.odps.get_table(table_name)
        self.assertEqual(2, len(list(t.partitions)))
        with t.open_reader(partition='name=name1', reopen=True) as r:
            self.assertEqual(4, r.count)
        with t.open_reader(partition='name=name2', reopen=True) as r:
            self.assertEqual(1, r.count)
    finally:
        self.odps.delete_table(table_name, if_exists=True)
def testJoinGroupby(self):
    """Join on 'name' (projecting only left columns), then group by id
    and sum fid; since every left name matches a right row, the result
    equals grouping the raw left data directly."""
    data = [
        ['name1', 4, 5.3, None, None],
        ['name2', 2, 3.5, None, None],
        ['name1', 4, 4.2, None, None],
        ['name1', 3, 2.2, None, None],
        ['name1', 3, 4.1, None, None],
    ]

    schema2 = Schema.from_lists(['name', 'id2', 'id3'],
                                [types.string, types.bigint, types.bigint])

    table_name = tn('pyodps_test_engine_table2')
    self.odps.delete_table(table_name, if_exists=True)
    # NOTE(review): table2 is never dropped afterwards; consider a
    # try/finally cleanup like the sibling tests use.
    table2 = self.odps.create_table(name=table_name, schema=schema2)
    expr2 = CollectionExpr(_source_data=table2,
                           _schema=odps_schema_to_df_schema(schema2))

    self._gen_data(data=data)

    data2 = [['name1', 4, -1], ['name2', 1, -2]]
    self.odps.write_table(table2, 0, data2)

    expr = self.expr.join(expr2, on='name')[self.expr]
    expr = expr.groupby('id').agg(expr.fid.sum())

    res = self.engine.execute(expr)
    result = self._get_result(res)

    # Locate the 'id' and 'fid' column positions in the left schema.
    id_idx = [
        idx for idx, col in enumerate(self.expr.schema.names)
        if col == 'id'
    ][0]
    fid_idx = [
        idx for idx, col in enumerate(self.expr.schema.names)
        if col == 'fid'
    ][0]
    # Recompute the expected (id, sum(fid)) pairs from the raw data;
    # groupby requires pre-sorting by the same key.
    expected = [[k, sum(
        v[fid_idx] for v in row)] for k, row in itertools.groupby(
        sorted(data, key=lambda r: r[id_idx]), lambda r: r[id_idx])]
    for it in zip(sorted(expected, key=lambda it: it[0]),
                  sorted(result, key=lambda it: it[0])):
        self.assertAlmostEqual(it[0][0], it[1][0])
        self.assertAlmostEqual(it[0][1], it[1][1])
def testUnion(self):
    """Union of distinct (name, id) pairs from the source table with the
    second table's (id2 renamed to id, name) projection."""
    data = [
        ['name1', 4, 5.3, None, None],
        ['name2', 2, 3.5, None, None],
        ['name1', 4, 4.2, None, None],
        ['name1', 3, 2.2, None, None],
        ['name1', 3, 4.1, None, None],
    ]

    schema2 = Schema.from_lists(['name', 'id2', 'id3'],
                                [types.string, types.bigint, types.bigint])

    table_name = tn('pyodps_test_engine_table2')
    self.odps.delete_table(table_name, if_exists=True)
    table2 = self.odps.create_table(name=table_name, schema=schema2)
    expr2 = CollectionExpr(_source_data=table2,
                           _schema=odps_schema_to_df_schema(schema2))

    self._gen_data(data=data)

    data2 = [['name3', 5, -1], ['name4', 6, -2]]
    self.odps.write_table(table2, 0, data2)

    try:
        # Column order in the right branch is (id, name); union aligns
        # by schema, so both sides contribute (name, id) rows.
        expr = self.expr['name', 'id'].distinct().union(
            expr2[expr2.id2.rename('id'), 'name'])

        res = self.engine.execute(expr)
        result = self._get_result(res)

        expected = [['name1', 4], ['name1', 3], ['name2', 2],
                    ['name3', 5], ['name4', 6]]

        # Order-insensitive comparison: sort both sides first.
        result = sorted(result)
        expected = sorted(expected)
        self.assertEqual(len(result), len(expected))
        for e, r in zip(result, expected):
            self.assertEqual([to_str(t) for t in e],
                             [to_str(t) for t in r])
    finally:
        table2.drop()
def testJoinGroupby(self):
    """Join on 'name' (projecting only left columns), then group by id
    and sum fid; since every left name matches a right row, the result
    equals grouping the raw left data directly."""
    data = [
        ['name1', 4, 5.3, None, None, None],
        ['name2', 2, 3.5, None, None, None],
        ['name1', 4, 4.2, None, None, None],
        ['name1', 3, 2.2, None, None, None],
        ['name1', 3, 4.1, None, None, None],
    ]

    data2 = [['name1', 4, -1], ['name2', 1, -2]]

    datatypes = lambda *types: [validate_data_type(t) for t in types]
    schema2 = Schema.from_lists(['name', 'id2', 'id3'],
                                datatypes('string', 'int64', 'int64'))
    table_name = tn('pyodps_test_engine_table2')
    # NOTE(review): table2 is never dropped afterwards; consider a
    # try/finally cleanup like the sibling tests use.
    table2 = self._create_table_and_insert_data(table_name, schema2, data2)
    expr2 = CollectionExpr(_source_data=table2, _schema=schema2)

    self._gen_data(data=data)

    expr = self.expr.join(expr2, on='name')[self.expr]
    expr = expr.groupby('id').agg(expr.fid.sum())

    res = self.engine.execute(expr)
    result = self._get_result(res)

    # Locate the 'id' and 'fid' column positions in the left schema.
    id_idx = [
        idx for idx, col in enumerate(self.expr.schema.names)
        if col == 'id'
    ][0]
    fid_idx = [
        idx for idx, col in enumerate(self.expr.schema.names)
        if col == 'fid'
    ][0]
    # Recompute the expected (id, sum(fid)) pairs from the raw data;
    # groupby requires pre-sorting by the same key.
    expected = [[k, sum(
        v[fid_idx] for v in row)] for k, row in itertools.groupby(
        sorted(data, key=lambda r: r[id_idx]), lambda r: r[id_idx])]
    for it in zip(sorted(expected, key=lambda it: it[0]),
                  sorted(result, key=lambda it: it[0])):
        self.assertAlmostEqual(it[0][0], it[1][0])
        self.assertAlmostEqual(it[0][1], it[1][1])
def testFilterOrder(self):
    """Regression test: the result > 1 filter is applied after the
    divisor > 0 guard, so rows with a zero divisor never reach the
    division and only [5] (5/1) survives."""
    table_name = tn('pyodps_test_division_error')
    self.odps.delete_table(table_name, if_exists=True)
    table = self.odps.create_table(
        table_name, 'divided bigint, divisor bigint', lifecycle=1)
    try:
        rows = [[2, 0], [1, 1], [1, 2], [5, 1], [5, 0]]
        self.odps.write_table(table_name, rows)

        source = CollectionExpr(
            _source_data=table,
            _schema=odps_schema_to_df_schema(table.schema))
        guarded = source[source.divisor > 0]
        quotient = guarded[(guarded.divided / guarded.divisor).rename('result'), ]
        filtered = quotient[quotient.result > 1]

        outcome = self._get_result(self.engine.execute(filtered))
        self.assertEqual(outcome, [[
            5,
        ]])
    finally:
        table.drop()
def testJoin(self):
    """Exercise inner, keyed-inner, left, right and outer joins between
    the source table and a second (name, id2, id3) table, plus a cached
    groupby join and a windowed projection on a join."""
    data = [
        ['name1', 4, 5.3, None, None, None],
        ['name2', 2, 3.5, None, None, None],
        ['name1', 4, 4.2, None, None, None],
        ['name1', 3, 2.2, None, None, None],
        ['name1', 3, 4.1, None, None, None],
    ]

    data2 = [['name1', 4, -1], ['name2', 1, -2]]

    datatypes = lambda *types: [validate_data_type(t) for t in types]
    schema2 = Schema.from_lists(['name', 'id2', 'id3'],
                                datatypes('string', 'int64', 'int64'))
    table_name = tn('pyodps_test_engine_table2')
    table2 = self._create_table_and_insert_data(table_name, schema2, data2)
    expr2 = CollectionExpr(_source_data=table2, _schema=schema2)

    self._gen_data(data=data)

    try:
        # Implicit join on the shared 'name' column: 5 rows, each matching
        # one of the two right-hand rows.
        expr = self.expr.join(expr2)['name', 'id2']
        res = self.engine.execute(expr)
        result = self._get_result(res)
        self.assertEqual(len(result), 5)
        expected = [[to_str('name1'), 4], [to_str('name2'), 1]]
        self.assertTrue(all(it in expected for it in result))

        # Inner join on name AND id == id2: only the two ('name1', 4) rows.
        expr = self.expr.join(expr2,
                              on=['name', ('id', 'id2')])[self.expr.name, expr2.id2]
        res = self.engine.execute(expr)
        result = self._get_result(res)
        self.assertEqual(len(result), 2)
        expected = [to_str('name1'), 4]
        self.assertTrue(all(it == expected for it in result))

        # Left join: all 5 left rows survive; unmatched ones get None id2.
        expr = self.expr.left_join(expr2,
                                   on=['name', ('id', 'id2')])[self.expr.name, expr2.id2]
        res = self.engine.execute(expr)
        result = self._get_result(res)
        expected = [['name1', 4], ['name2', None], ['name1', 4],
                    ['name1', None], ['name1', None]]
        self.assertEqual(len(result), 5)
        self.assertTrue(all(it in expected for it in result))

        # Right join: both matches of ('name1', 4) plus the unmatched
        # right-hand row with a None left name.
        expr = self.expr.right_join(expr2,
                                    on=['name', ('id', 'id2')])[self.expr.name, expr2.id2]
        res = self.engine.execute(expr)
        result = self._get_result(res)
        expected = [
            ['name1', 4],
            ['name1', 4],
            [None, 1],
        ]
        self.assertEqual(len(result), 3)
        self.assertTrue(all(it in expected for it in result))

        # Outer join: skipped on the MySQL backend (presumably not
        # supported there — confirm).
        if self.sql_engine.name != 'mysql':
            expr = self.expr.outer_join(expr2,
                                        on=['name', ('id', 'id2')])[self.expr.name, expr2.id2]
            res = self.engine.execute(expr)
            result = self._get_result(res)
            expected = [
                ['name1', 4],
                ['name1', 4],
                ['name2', None],
                ['name1', None],
                ['name1', None],
                [None, 1],
            ]
            self.assertEqual(len(result), 6)
            self.assertTrue(all(it in expected for it in result))

        # Joining against a cached groupby result must also execute.
        grouped = self.expr.groupby('name').agg(
            new_id=self.expr.id.sum()).cache()
        self.engine.execute(self.expr.join(grouped, on='name'))

        # Window function over a join, also skipped on MySQL.
        if self.sql_engine.name != 'mysql':
            expr = self.expr.join(expr2, on=[
                'name', ('id', 'id2')
            ])[lambda x: x.groupby(Scalar(1)).sort('name').row_number(), ]
            self.engine.execute(expr)
    finally:
        # Close any cached backend connections before dropping the table.
        [conn.close() for conn in _engine_to_connections.values()]
        table2.drop()
def testJoin(self):
    """Exercise inner, keyed-inner, left, right and outer joins between
    the source table and a second (name, id2, id3) table, plus a cached
    groupby join and a windowed projection on a join."""
    data = [
        ['name1', 4, 5.3, None, None],
        ['name2', 2, 3.5, None, None],
        ['name1', 4, 4.2, None, None],
        ['name1', 3, 2.2, None, None],
        ['name1', 3, 4.1, None, None],
    ]

    schema2 = Schema.from_lists(['name', 'id2', 'id3'],
                                [types.string, types.bigint, types.bigint])

    table_name = tn('pyodps_test_engine_table2')
    self.odps.delete_table(table_name, if_exists=True)
    table2 = self.odps.create_table(name=table_name, schema=schema2)
    expr2 = CollectionExpr(_source_data=table2,
                           _schema=odps_schema_to_df_schema(schema2))

    self._gen_data(data=data)

    data2 = [['name1', 4, -1], ['name2', 1, -2]]
    self.odps.write_table(table2, 0, data2)

    try:
        # Implicit join on the shared 'name' column: 5 rows, each matching
        # one of the two right-hand rows.
        expr = self.expr.join(expr2)['name', 'id2']
        res = self.engine.execute(expr)
        result = self._get_result(res)
        self.assertEqual(len(result), 5)
        expected = [[to_str('name1'), 4], [to_str('name2'), 1]]
        self.assertTrue(all(it in expected for it in result))

        # Inner join on name AND id == id2: only the two ('name1', 4) rows.
        expr = self.expr.join(expr2,
                              on=['name', ('id', 'id2')])[self.expr.name, expr2.id2]
        res = self.engine.execute(expr)
        result = self._get_result(res)
        self.assertEqual(len(result), 2)
        expected = [to_str('name1'), 4]
        self.assertTrue(all(it == expected for it in result))

        # Left join: all 5 left rows survive; unmatched ones get None id2.
        expr = self.expr.left_join(expr2,
                                   on=['name', ('id', 'id2')])[self.expr.name, expr2.id2]
        res = self.engine.execute(expr)
        result = self._get_result(res)
        expected = [['name1', 4], ['name2', None], ['name1', 4],
                    ['name1', None], ['name1', None]]
        self.assertEqual(len(result), 5)
        self.assertTrue(all(it in expected for it in result))

        # Right join: both matches of ('name1', 4) plus the unmatched
        # right-hand row with a None left name.
        expr = self.expr.right_join(expr2,
                                    on=['name', ('id', 'id2')])[self.expr.name, expr2.id2]
        res = self.engine.execute(expr)
        result = self._get_result(res)
        expected = [
            ['name1', 4],
            ['name1', 4],
            [None, 1],
        ]
        self.assertEqual(len(result), 3)
        self.assertTrue(all(it in expected for it in result))

        # Outer join: union of the left- and right-join expectations.
        expr = self.expr.outer_join(expr2,
                                    on=['name', ('id', 'id2')])[self.expr.name, expr2.id2]
        res = self.engine.execute(expr)
        result = self._get_result(res)
        expected = [
            ['name1', 4],
            ['name1', 4],
            ['name2', None],
            ['name1', None],
            ['name1', None],
            [None, 1],
        ]
        self.assertEqual(len(result), 6)
        self.assertTrue(all(it in expected for it in result))

        # Joining against a cached groupby result must also execute.
        grouped = self.expr.groupby('name').agg(
            new_id=self.expr.id.sum()).cache()
        self.engine.execute(self.expr.join(grouped, on='name'))

        # Window function (row_number over a constant group) on a join.
        expr = self.expr.join(expr2, on=[
            'name', ('id', 'id2')
        ])[lambda x: x.groupby(Scalar(1)).sort('name').row_number(), ]
        self.engine.execute(expr)
    finally:
        table2.drop()