def testCachePersist(self):
    """Persist a cached join into a partition of a pre-created table.

    The left side is the shared ODPS DataFrame passed through a pass-through
    UDF and cached; the right side is a freshly written table.  The joined
    result is persisted into ``ds=today`` and must contain two rows.
    """
    expr = self.odps_df

    data2 = [["name1", 3.2], ["name3", 2.4]]
    table_name = tn("pyodps_test_mixed_engine_cp_table2")
    self.odps.delete_table(table_name, if_exists=True)
    table2 = self.odps.create_table(
        name=table_name,
        schema=Schema.from_lists(["name", "fid"], ["string", "double"]),
    )
    expr2 = DataFrame(table2)
    self.odps.write_table(table2, 0, data2)

    @output(expr.schema.names, expr.schema.types)
    def h(row):
        yield row

    left = expr.filter(expr.id > 0).apply(h, axis=1).cache()
    right = expr2.filter(expr2.fid > 0)
    joined = left.join(right, on=["name", right.fid < 4])["id", "fid"].cache()

    output_table = tn("pyodps_test_mixed_engine_cp_output_table")
    self.odps.delete_table(output_table, if_exists=True)
    schema = Schema.from_lists(["id", "fid"], ["bigint", "double"], ["ds"], ["string"])
    output_t = self.odps.create_table(output_table, schema, if_not_exists=True)
    try:
        t = joined.persist(output_table, partition="ds=today", create_partition=True)
        self.assertEqual(len(t.execute()), 2)
    finally:
        # Drop the output table even when persisting or the assertion fails.
        output_t.drop()
def testChineseSchema(self):
    """Check lookup and equality on a schema with non-ASCII column names.

    Unicode and plain string literals are deliberately mixed between the
    construction and the lookups.
    """
    schema = Schema.from_lists([u'用户'], ['string'], ['分区'], ['bigint'])

    self.assertIn('用户', schema)
    self.assertEqual(schema.get_column('用户').type.name, 'string')
    self.assertEqual(schema.get_partition(u'分区').type.name, 'bigint')
    self.assertEqual(schema['用户'].type.name, 'string')
    self.assertEqual(schema[u'分区'].type.name, 'bigint')

    # Same schema, with the unicode/plain literal choice swapped.
    swapped = Schema.from_lists(['用户'], ['string'], [u'分区'], ['bigint'])
    self.assertEqual(schema, swapped)
def setup(self):
    """Attach two mock-table-backed collection expressions to the test case."""

    def as_df_types(*type_names):
        # Run every type name through validate_data_type.
        return [validate_data_type(name) for name in type_names]

    schema = Schema.from_lists(["name", "id"], as_df_types("string", "int64"))
    table = MockTable(name="pyodps_test_expr_table", schema=schema)
    self.expr = CollectionExpr(_source_data=table, _schema=schema)

    schema2 = Schema.from_lists(["name2", "id2"], as_df_types("string", "int64"))
    table2 = MockTable(name="pyodps_test_expr_table2", schema=schema2)
    self.expr2 = CollectionExpr(_source_data=table2, _schema=schema2)
def setup(self):
    """Attach two mock-table-backed collection expressions to the test case."""

    def as_df_types(*type_names):
        # Run every type name through validate_data_type.
        return [validate_data_type(name) for name in type_names]

    schema = Schema.from_lists(['name', 'id'], as_df_types('string', 'int64'))
    table = MockTable(name='pyodps_test_expr_table', schema=schema)
    self.expr = CollectionExpr(_source_data=table, _schema=schema)

    schema2 = Schema.from_lists(['name2', 'id2'], as_df_types('string', 'int64'))
    table2 = MockTable(name='pyodps_test_expr_table2', schema=schema2)
    self.expr2 = CollectionExpr(_source_data=table2, _schema=schema2)
def setup(self):
    """Create a plain and a partitioned mock table, each wrapped in a CollectionExpr.

    Both mock tables get the configured REST client assigned to ``_client``.
    """
    # Plain table: three data columns, no partitions.
    schema = Schema.from_lists(
        ['name', 'id', 'fid'],
        [types.string, types.int64, types.float64],
    )
    table = MockTable(name='pyodps_test_expr_table', schema=schema)
    table._client = self.config.odps.rest
    self.expr = CollectionExpr(_source_data=table, _schema=schema)

    # Partitioned table: same data columns plus two partition columns.
    schema2 = Schema.from_lists(
        ['name', 'id', 'fid'],
        [types.string, types.int64, types.float64],
        ['part1', 'part2'],
        [types.string, types.int64],
    )
    table2 = MockTable(name='pyodps_test_expr_table2', schema=schema2)
    table2._client = self.config.odps.rest
    self.expr2 = CollectionExpr(_source_data=table2, _schema=schema2)
def testTableResource(self):
    """Create a table resource, then point it at two successive partitions.

    After every change the resource must still be a ``TableResource`` bound
    to the test table and must be the object returned by ``get_resource``.
    """
    test_table_name = tn('pyodps_t_tmp_resource_table')
    schema = Schema.from_lists(['id', 'name'], ['string', 'string'])
    self.odps.delete_table(test_table_name, if_exists=True)
    self.odps.create_table(test_table_name, schema)

    resource_name = tn('pyodps_t_tmp_table_resource')
    try:
        self.odps.delete_resource(resource_name)
    except errors.NoSuchObject:
        pass

    res = self.odps.create_resource(resource_name, 'table', table_name=test_table_name)
    self.assertIsInstance(res, TableResource)
    self.assertEqual(res.get_source_table().name, test_table_name)
    self.assertIsNone(res.get_source_table_partition())
    self.assertIs(res, self.odps.get_resource(resource_name))

    del res.parent[resource_name]  # delete from cache

    self.assertIsNot(res, self.odps.get_resource(resource_name))
    res = self.odps.get_resource(resource_name)
    self.assertIsInstance(res, TableResource)
    self.assertEqual(res.get_source_table().name, test_table_name)
    self.assertIsNone(res.get_source_table_partition())

    # Recreate the table with partition columns.
    test_table_name = tn('pyodps_t_tmp_resource_table')
    schema = Schema.from_lists(['id', 'name'], ['string', 'string'],
                               ['pt', 'sec'], ['string', 'bigint'])
    self.odps.delete_table(test_table_name, if_exists=True)
    table = self.odps.create_table(test_table_name, schema)
    resource_name = tn('pyodps_t_tmp_table_resource')

    for test_table_partition in ('pt=test,sec=1', 'pt=test,sec=2'):
        table.create_partition(test_table_partition)
        res = res.update(partition=test_table_partition)
        self.assertIsInstance(res, TableResource)
        self.assertEqual(res.get_source_table().name, test_table_name)
        self.assertEqual(str(res.get_source_table_partition()),
                         str(types.PartitionSpec(test_table_partition)))
        self.assertIs(res, self.odps.get_resource(resource_name))

    self.odps.delete_resource(resource_name)
    self.odps.delete_table(test_table_name)
def testBloomFilter(self):
    """Bloom-filter the main expression by the first name of a pandas collection.

    The filter source only contains 'name1', so no 'name2' row may remain.
    """
    rows = [
        ['name1', 4, 5.3, None, None, None],
        ['name2', 2, 3.5, None, None, None],
        ['name1', 4, 4.2, None, None, None],
        ['name1', 3, 2.2, None, None, None],
        ['name1', 3, 4.1, None, None, None],
    ]
    filter_rows = [
        ['name1'],
        ['name3'],
    ]

    self._gen_data(data=rows)

    schema2 = Schema.from_lists(['name', ], [types.string])

    import pandas as pd
    filter_frame = pd.DataFrame(filter_rows, columns=schema2.names)
    expr2 = CollectionExpr(_source_data=filter_frame, _schema=schema2)

    expr = self.expr.bloom_filter('name', expr2[:1].name, capacity=10)

    res = self.engine.execute(expr)
    result = self._get_result(res)

    self.assertTrue(all(r[0] != 'name2' for r in result))
def testJoinGroupby(self):
    """Join with a pandas-backed collection, then sum ``fid`` per ``id``.

    The expected values are computed with pandas on the same input rows.
    """
    rows = [
        ['name1', 4, 5.3, None, None, None],
        ['name2', 2, 3.5, None, None, None],
        ['name1', 4, 4.2, None, None, None],
        ['name1', 3, 2.2, None, None, None],
        ['name1', 3, 4.1, None, None, None],
    ]

    schema2 = Schema.from_lists(['name', 'id2', 'id3'],
                                [types.string, types.int64, types.int64])

    self._gen_data(data=rows)

    other_rows = [
        ['name1', 4, -1],
        ['name2', 1, -2],
    ]

    import pandas as pd
    other_frame = pd.DataFrame(other_rows, columns=schema2.names)
    expr2 = CollectionExpr(_source_data=other_frame, _schema=schema2)

    expr = self.expr.join(expr2, on='name')[self.expr]
    expr = expr.groupby('id').agg(expr.fid.sum())

    res = self.engine.execute(expr)
    result = self._get_result(res)

    source_frame = pd.DataFrame(rows, columns=self.expr.schema.names)
    expected = source_frame.groupby('id').agg({'fid': 'sum'})
    self.assertEqual(expected.reset_index().values.tolist(), result)
def setup(self):
    """Attach a mock-table-backed expression and a fresh ExecuteContext."""

    def as_df_types(*type_names):
        # Run every type name through validate_data_type.
        return [validate_data_type(name) for name in type_names]

    schema = Schema.from_lists(
        ["name", "id", "fid"],
        as_df_types("string", "int64", "float64"),
    )
    table = MockTable(name="pyodps_test_expr_table", schema=schema)

    self.expr = CollectionExpr(_source_data=table, _schema=schema)
    self.ctx = ExecuteContext()
def testRoomStores(self):
    """Store a table in a room and read the same rows back through it.

    The room is a subclass with a no-op ``_init`` working in a temporary
    directory, which is removed at the end.
    """

    class FakeRoom(Room):
        def _init(self):
            return

    room = FakeRoom("__test")
    room._room_dir = tempfile.mkdtemp()

    try:
        schema = Schema.from_lists(["name", "id"], ["string", "bigint"])
        table_name = "pyodps_test_room_stores"
        self.odps.delete_table(table_name, if_exists=True)
        created = self.odps.create_table(table_name, schema)
        data = [["name1", 1], ["name2", 2]]
        with created.open_writer() as writer:
            writer.write(data)

        # Release the created handle and fetch the table again by name.
        del created
        fetched = self.odps.get_table(table_name)
        self.assertEqual(fetched.schema.names, ["name", "id"])

        try:
            room.store("table", fetched)

            stored = room["table"]
            self.assertEqual(stored.name, table_name)

            with stored.open_reader() as reader:
                values = [record.values for record in reader]
                self.assertEqual(data, values)
        finally:
            fetched.drop()
    finally:
        shutil.rmtree(room._room_dir)
def testNullableRecord(self):
    """A record built from all-``None`` values keeps ``None`` for every column type."""
    column_names = ['col%s'%i for i in range(8)]
    column_types = [
        'bigint', 'double', 'string', 'datetime',
        'boolean', 'decimal', 'array<string>', 'map<string,bigint>',
    ]
    schema = Schema.from_lists(column_names, column_types)

    record = Record(schema=schema, values=[None]*8)
    self.assertSequenceEqual(record.values, [None]*8)
def testReadMapArraySQLInstance(self):
    """Read map and array columns back through a SQL instance reader.

    Rows are written to a temporary table, selected with SQL, and compared
    (ordered by ``idx``) with the data that was written.  The table is
    dropped even when the comparison fails.
    """
    test_table = tn('pyodps_t_tmp_read_map_array_sql_instance')
    self.odps.delete_table(test_table, if_exists=True)
    table = self.odps.create_table(
        test_table,
        schema=Schema.from_lists(
            ['idx', 'map_col', 'array_col'],
            ['bigint', odps_types.Map(odps_types.string, odps_types.string),
             odps_types.Array(odps_types.string)],
        )
    )

    try:
        data = [
            [0, {'key1': 'value1', 'key2': 'value2'}, ['item1', 'item2', 'item3']],
            [1, {'key3': 'value3', 'key4': 'value4'}, ['item4', 'item5']],
        ]
        self.odps.write_table(test_table, data)

        instance = self.odps.execute_sql('select * from %s' % test_table)
        with instance.open_reader(table.schema) as reader:
            read_data = [list(r.values) for r in reader]
            read_data = sorted(read_data, key=lambda r: r[0])
            expected_data = sorted(data, key=lambda r: r[0])

            self.assertSequenceEqual(read_data, expected_data)
    finally:
        # Previously skipped whenever an earlier statement raised.
        table.drop()
def setup(self):
    """Prepare an ODPS-backed and a pandas-backed DataFrame plus both engines."""
    import pandas as pd

    column_names = ['name', 'id']
    column_types = ['string', 'bigint']

    odps_rows = [
        ['name1', 1],
        ['name2', 2],
        ['name1', 3],
    ]
    pandas_rows = [
        ['name1', 5],
        ['name2', 6],
    ]

    # Recreate the ODPS table and fill it with the ODPS-side rows.
    table_name = tn('pyodps_df_mixed')
    self.odps.delete_table(table_name, if_exists=True)
    self.t = self.odps.create_table(
        table_name, Schema.from_lists(column_names, column_types))
    with self.t.open_writer() as writer:
        writer.write([self.t.new_record(row) for row in odps_rows])

    self.odps_df = DataFrame(self.t)
    self.pd_df = DataFrame(pd.DataFrame(pandas_rows, columns=column_names))

    self.engine = MixedEngine(self.odps)
    self.pd_engine = PandasEngine(self.odps)
def _create_table(self, table_name):
    """Drop ``table_name`` if it exists and recreate it with eight typed columns.

    Returns whatever ``self.odps.create_table`` returns.
    """
    columns = [
        ('id', 'string'),
        ('int_num', 'bigint'),
        ('float_num', 'double'),
        ('dt', 'datetime'),
        ('bool', 'boolean'),
        ('dec', 'decimal'),
        ('arr', 'array<string>'),
        ('m', 'map<string,bigint>'),
    ]
    field_names = [name for name, _ in columns]
    field_types = [type_name for _, type_name in columns]

    self.odps.delete_table(table_name, if_exists=True)
    schema = Schema.from_lists(field_names, field_types)
    return self.odps.create_table(table_name, schema=schema)
def testCreateDeleteTable(self):
    """Create and delete a partitioned table via the project and the entry object."""
    test_table_name = tn("pyodps_t_tmp_create_table")
    schema = Schema.from_lists(["id", "name"], ["bigint", "string"], ["ds"], ["string"])

    project_tables = self.odps._project.tables

    # Round 1: create through the project's table collection.
    project_tables.delete(test_table_name, if_exists=True)
    self.assertFalse(self.odps.exist_table(test_table_name))

    table = project_tables.create(test_table_name, schema, lifecycle=10)

    # ``owner`` is not set on the raw attribute, but the property yields a value.
    self.assertIsNone(table._getattr("owner"))
    self.assertIsNotNone(table.owner)

    self.assertEqual(table.name, test_table_name)
    self.assertEqual(table.schema, schema)
    self.assertEqual(table.lifecycle, 10)

    project_tables.delete(test_table_name, if_exists=True)
    self.assertFalse(self.odps.exist_table(test_table_name))

    # Round 2: create through the ODPS entry object with shard options.
    table = self.odps.create_table(test_table_name, schema, shard_num=10, hub_lifecycle=5)
    self.assertEqual(table.name, test_table_name)
    self.assertEqual(table.schema, schema)
    self.assertNotEqual(table.lifecycle, 10)
    self.assertEqual(table.shard.shard_num, 10)

    self.odps.delete_table(test_table_name, if_exists=True)
    self.assertFalse(self.odps.exist_table(test_table_name))
def testCreateDeleteTable(self):
    """Create and delete a partitioned table via the project and the entry object."""
    test_table_name = tn('pyodps_t_tmp_create_table')
    schema = Schema.from_lists(['id', 'name'], ['bigint', 'string'], ['ds', ], ['string',])

    project_tables = self.odps._project.tables

    # Round 1: create through the project's table collection.
    project_tables.delete(test_table_name, if_exists=True)
    self.assertFalse(self.odps.exist_table(test_table_name))

    table = project_tables.create(test_table_name, schema, lifecycle=10)
    self.assertEqual(table.name, test_table_name)
    self.assertEqual(table.schema, schema)
    self.assertEqual(table.lifecycle, 10)

    project_tables.delete(test_table_name, if_exists=True)
    self.assertFalse(self.odps.exist_table(test_table_name))

    # Round 2: create through the ODPS entry object with shard options.
    table = self.odps.create_table(test_table_name, schema, shard_num=10, hub_lifecycle=5)
    self.assertEqual(table.name, test_table_name)
    self.assertEqual(table.schema, schema)
    self.assertNotEqual(table.lifecycle, 10)
    self.assertEqual(table.shard.shard_num, 10)

    self.odps.delete_table(test_table_name, if_exists=True)
    self.assertFalse(self.odps.exist_table(test_table_name))
def setup(self):
    """Attach a six-column mock-table-backed collection expression."""

    def as_df_types(*type_names):
        # Run every type name through validate_data_type.
        return [validate_data_type(name) for name in type_names]

    column_names = ['name', 'id', 'fid', 'isMale', 'scale', 'birth']
    column_types = as_df_types('string', 'int64', 'float64', 'boolean', 'decimal', 'datetime')
    schema = Schema.from_lists(column_names, column_types)

    table = MockTable(name='pyodps_test_expr_table', schema=schema)
    self.expr = CollectionExpr(_source_data=table, _schema=schema)
def setUp(self):
    """Run the base set-up, start a profiler and build the shared schema.

    Every column of ``self.SCHEMA`` is named after its own type.
    """
    TestBase.setUp(self)

    self.pr = cProfile.Profile()
    self.pr.enable()

    type_names = ['bigint', 'double', 'datetime', 'boolean', 'string', 'decimal']
    # Column names equal the type names; pass a separate list for them.
    column_names = list(type_names)
    self.SCHEMA = Schema.from_lists(column_names, type_names)
def testReadWriteTable(self):
    """Write records through the entry object and read them back three ways.

    Covers ``read_table``, ``read_table`` with ``step=2`` and ``table.head``.
    """
    test_table_name = tn('pyodps_t_tmp_read_write_table')
    schema = Schema.from_lists(['id', 'name', 'right'], ['bigint', 'string', 'boolean'])

    self.odps.delete_table(test_table_name, if_exists=True)
    self.assertFalse(self.odps.exist_table(test_table_name))

    table = self.odps.create_table(test_table_name, schema)
    data = [[111, 'aaa', True],
            [222, 'bbb', False],
            [333, 'ccc', True],
            [444, '中文', False]]
    length = len(data)
    records = [Record(schema=schema, values=values) for values in data]

    # Expected rows, with the name column passed through to_str.
    texted_data = [[row[0], to_str(row[1]), row[2]] for row in data]

    def values_of(read_records):
        return [record.values for record in read_records]

    self.odps.write_table(table, 0, records)
    self.assertSequenceEqual(texted_data, values_of(self.odps.read_table(table, length)))
    self.assertSequenceEqual(texted_data[::2],
                             values_of(self.odps.read_table(table, length, step=2)))

    self.assertSequenceEqual(texted_data, values_of(table.head(length)))

    self.odps.delete_table(test_table_name)
    self.assertFalse(self.odps.exist_table(test_table_name))
def testListInstancesInPage(self):
    """Run a query through a sleeping UDF and inspect the running instance.

    Checks that the instance is reported as RUNNING, shows up when iterating
    running instances, and exposes tasks, warnings, workers and worker logs.
    The instance is stopped and the resource, function and table are dropped
    afterwards.
    """
    test_table = tn('pyodps_t_tmp_list_instances_in_page')

    # The UDF sleeps inside ``evaluate`` and logs to stdout around the sleep.
    delay_udf = textwrap.dedent("""
    from odps.udf import annotate
    import sys
    import time

    @annotate("bigint->bigint")
    class Delayer(object):
        def evaluate(self, arg0):
            print('Start Logging')
            sys.stdout.flush()
            time.sleep(45)
            print('End Logging')
            sys.stdout.flush()
            return arg0
    """)
    resource_name = tn('test_delayer_function_resource')
    function_name = tn('test_delayer_function')

    if self.odps.exist_resource(resource_name + '.py'):
        self.odps.delete_resource(resource_name + '.py')
    res = self.odps.create_resource(resource_name + '.py', 'py', file_obj=delay_udf)

    if self.odps.exist_function(function_name):
        self.odps.delete_function(function_name)
    fun = self.odps.create_function(function_name, class_type=resource_name + '.Delayer',
                                    resources=[res, ])

    data = [[random.randint(0, 1000)] for _ in compat.irange(100)]
    self.odps.delete_table(test_table, if_exists=True)
    t = self.odps.create_table(test_table, Schema.from_lists(['num'], ['bigint']))
    self.odps.write_table(t, data)

    instance = self.odps.run_sql("select sum({0}(num)), 1 + '1' as warn_col from {1} group by num"
                                 .format(function_name, test_table))

    try:
        self.assertEqual(instance.status, Instance.Status.RUNNING)
        self.assertIn(instance.id, [it.id for it in self.odps.get_project().instances.iterate(
            status=Instance.Status.RUNNING,
            from_time=datetime.now()-timedelta(days=2),
            end_time=datetime.now()+timedelta(days=1), max_items=20)])

        self.waitContainerFilled(lambda: instance.tasks)
        task = instance.tasks[0]
        task.put_info('testInfo', 'TestInfo')
        self.assertIsNotNone(task.warnings)

        self.waitContainerFilled(lambda: task.workers, 30)
        self.assertIsNotNone(task.workers[0].get_log('stdout'))
    finally:
        try:
            instance.stop()
        except Exception:
            # Stopping stays best-effort, but KeyboardInterrupt/SystemExit
            # are no longer swallowed as they were by the bare ``except``.
            pass
        res.drop()
        fun.drop()
        t.drop()
def testRecordMultiFields(self):
    """Index a record by several names at once; unknown names raise AttributeError."""
    schema = Schema.from_lists(['col1', 'col2'], ['string', 'bigint'])
    record = Record(values=[1, 2], schema=schema)

    # The string column returns its value as '1'.
    self.assertEqual(record['col1', 'col2'], ['1', 2])

    with self.assertRaises(AttributeError):
        record['col3']
    with self.assertRaises(AttributeError):
        record['col3', ]
def testGetAttrs(self):
    """``get_attrs`` on ``expr.id + 1`` yields the expected attribute names in order."""
    schema = Schema.from_lists(['name', 'id'], [types.string, types.int64])
    mock_table = MockTable(name='pyodps_test_expr_table', schema=schema)
    collection = CollectionExpr(_source_data=mock_table, _schema=schema)

    expected = (
        '_lhs', '_rhs', '_data_type', '_source_data_type', '_name',
        '_source_name', '_engine', '_cache_data', '_need_cache', '_cached_args',
    )
    actual = get_attrs(collection.id + 1)
    self.assertSequenceEqual(expected, actual)
def testCreateTableWithChineseColumn(self):
    """A table created with Chinese column names reports the same column names."""
    test_table_name = tn("pyodps_t_tmp_create_table_with_chinese_columns")
    schema = Schema.from_lists(["序列", "值"], ["bigint", "string"], ["ds"], ["string"])

    self.odps.delete_table(test_table_name, if_exists=True)
    table = self.odps.create_table(test_table_name, schema)

    created_names = [col.name for col in table.schema.columns]
    requested_names = [col.name for col in schema.columns]
    self.assertSequenceEqual(created_names, requested_names)
def setup(self):
    """Attach two dynamic collection expressions backed by mock tables.

    The second dynamic schema is created with ``types.string`` as default type.
    """

    def as_df_types(*type_names):
        # Run every type name through validate_data_type.
        return [validate_data_type(name) for name in type_names]

    static_schema = Schema.from_lists(
        ["name", "id", "fid", "isMale", "scale", "birth"],
        as_df_types("string", "int64", "float64", "boolean", "decimal", "datetime"),
    )
    schema = DynamicSchema.from_schema(static_schema)
    table = MockTable(name="pyodps_test_expr_table", schema=schema)

    static_schema2 = Schema.from_lists(
        ["name2", "id", "fid2"],
        as_df_types("string", "int64", "float64"),
    )
    schema2 = DynamicSchema.from_schema(static_schema2, default_type=types.string)
    table2 = MockTable(name="pyodps_test_expr_tabl2", schema=schema2)

    self.expr = DynamicCollectionExpr(_source_data=table, _schema=schema)
    self.expr2 = DynamicCollectionExpr(_source_data=table2, _schema=schema2)
def setup(self):
    """Recreate the DataFrame test table and fill it with three rows."""
    test_table_name = tn('pyodps_test_dataframe')
    schema = Schema.from_lists(['id', 'name'], ['bigint', 'string'])

    self.odps.delete_table(test_table_name, if_exists=True)
    self.table = self.odps.create_table(test_table_name, schema)

    rows = [[1, 'name1'], [2, 'name2'], [3, 'name3']]
    with self.table.open_writer() as writer:
        writer.write(rows)
def setup(self):
    """Attach a six-column mock-table-backed collection expression."""

    def as_df_types(*type_names):
        # Run every type name through validate_data_type.
        return [validate_data_type(name) for name in type_names]

    column_names = ["name", "id", "fid", "isMale", "scale", "birth"]
    column_types = as_df_types("string", "int64", "float64", "boolean", "decimal", "datetime")
    schema = Schema.from_lists(column_names, column_types)

    table = MockTable(name="pyodps_test_expr_table", schema=schema)
    self.expr = CollectionExpr(_source_data=table, _schema=schema)
def testCreateTableWithChineseColumn(self):
    """A table created with Chinese column names reports the same column names."""
    test_table_name = tn('pyodps_t_tmp_create_table_with_chinese_columns')
    schema = Schema.from_lists(['序列', '值'], ['bigint', 'string'], ['ds', ], ['string',])

    self.odps.delete_table(test_table_name, if_exists=True)
    table = self.odps.create_table(test_table_name, schema)

    created_names = [col.name for col in table.schema.columns]
    requested_names = [col.name for col in schema.columns]
    self.assertSequenceEqual(created_names, requested_names)
def setup(self):
    """Attach four expressions over partitioned mock tables.

    Each expression's schema is rebuilt from the table schema's ``columns``
    only, while the mock table itself keeps the full schema.
    """

    def as_df_types(*type_names):
        # Run every type name through validate_data_type.
        return [validate_data_type(name) for name in type_names]

    def expr_over(table_name, table_schema):
        # Mock table with the full schema, expression with its columns only.
        mock_table = MockTable(name=table_name, schema=table_schema)
        return CollectionExpr(_source_data=mock_table,
                              _schema=Schema(columns=table_schema.columns))

    schema = Schema.from_lists(
        ['name', 'id', 'fid', 'isMale', 'scale', 'birth'],
        as_df_types('string', 'int64', 'float64', 'boolean', 'decimal', 'datetime'),
        ['ds'],
        as_df_types('string'),
    )
    self.expr = expr_over('pyodps_test_expr_table', schema)
    self.expr1 = expr_over('pyodps_test_expr_table1', schema)
    self.expr2 = expr_over('pyodps_test_expr_table2', schema)

    schema2 = Schema.from_lists(
        ['name', 'id', 'fid'],
        as_df_types('string', 'int64', 'float64'),
        ['part1', 'part2'],
        as_df_types('string', 'int64'),
    )
    self.expr3 = expr_over('pyodps_test_expr_table2', schema2)
def testPivot(self):
    """Exercise ``pivot`` on an ODPS table through the mixed engine.

    Covers single and multiple value columns, selecting a missing pivoted
    column (ValueError), projecting pivoted columns, deriving a new column
    and joining the pivot result with another DataFrame.
    """
    data = [
        ["name1", 1, 1.0, True],
        ["name1", 2, 2.0, True],
        ["name2", 1, 3.0, False],
        ["name2", 3, 4.0, False],
    ]

    table_name = tn("pyodps_test_mixed_engine_pivot")
    self.odps.delete_table(table_name, if_exists=True)
    table = self.odps.create_table(
        name=table_name,
        schema=Schema.from_lists(
            ["name", "id", "fid", "ismale"],
            ["string", "bigint", "double", "boolean"],
        ),
    )
    expr = DataFrame(table)

    try:
        self.odps.write_table(table, 0, data)

        # Single value column, de-duplicated.
        expr1 = expr.pivot(rows="id", columns="name", values="fid").distinct()
        res = self.engine.execute(expr1)
        result = self._get_result(res)

        expected = [[1, 1.0, 3.0], [2, 2.0, None], [3, None, 4.0]]
        self.assertEqual(sorted(result), sorted(expected))

        # Two value columns.
        expr2 = expr.pivot(rows="id", columns="name", values=["fid", "ismale"])
        res = self.engine.execute(expr2)
        result = self._get_result(res)

        expected = [
            [1, 1.0, 3.0, True, False],
            [2, 2.0, None, True, None],
            [3, None, 4.0, None, False],
        ]
        self.assertEqual(sorted(result), sorted(expected))

        # Selecting a column that the pivot does not produce.
        expr3 = expr.pivot(rows="id", columns="name", values="fid")["name3"]
        with self.assertRaises(ValueError) as cm:
            self.engine.execute(expr3)
        self.assertIn("name3", str(cm.exception))

        # Projection of pivoted columns.
        expr4 = expr.pivot(rows="id", columns="name", values="fid")["id", "name1"]
        res = self.engine.execute(expr4)
        result = self._get_result(res)

        expected = [[1, 1.0], [2, 2.0], [3, None]]
        self.assertEqual(sorted(result), sorted(expected))

        # Extra column derived from a pivoted column.
        expr5 = expr.pivot(rows="id", columns="name", values="fid")
        expr5 = expr5[expr5, (expr5["name1"].astype("int") + 1).rename("new_name")]
        res = self.engine.execute(expr5)
        result = self._get_result(res)

        expected = [[1, 1.0, 3.0, 2.0], [2, 2.0, None, 3.0], [3, None, 4.0, None]]
        self.assertEqual(sorted(result), sorted(expected))

        # Join of the pivot result with the shared ODPS DataFrame.
        expr6 = expr.pivot(rows="id", columns="name", values="fid")
        expr6 = expr6.join(self.odps_df, on="id")[expr6, "name"]
        res = self.engine.execute(expr6)
        result = self._get_result(res)

        expected = [
            [1, 1.0, 3.0, "name1"],
            [2, 2.0, None, "name2"],
            [3, None, 4.0, "name1"],
        ]
        self.assertEqual(sorted(result), sorted(expected))
    finally:
        table.drop()
def setup(self):
    """Attach a mock-table-backed expression and an ODPS engine."""

    def as_df_types(*type_names):
        # Run every type name through validate_data_type.
        return [validate_data_type(name) for name in type_names]

    schema = Schema.from_lists(
        ['name', 'id', 'fid'],
        as_df_types('string', 'int64', 'float64'),
    )
    table = MockTable(name='pyodps_test_expr_table', schema=schema)

    self.expr = CollectionExpr(_source_data=table, _schema=schema)
    self.engine = ODPSEngine(self.odps)
def testToPandas(self):
    """Convert an ODPS-backed DataFrame and one of its columns with ``to_pandas``.

    Covers the plain, wrapped, asynchronous and delayed variants, plus an
    asynchronous division by zero whose future must raise ``ODPSError``.
    """
    table_name = tn('pyodps_test_mixed_engine_to_pandas')
    self.odps.delete_table(table_name, if_exists=True)

    column_names = ['col%s' % i for i in range(7)]
    column_types = [
        'bigint', 'double', 'string', 'datetime', 'boolean', 'decimal',
        'datetime'
    ]
    table2 = self.odps.create_table(
        name=table_name,
        schema=Schema.from_lists(column_names, column_types))
    expr2 = DataFrame(table2)

    data2 = [[
        1234567, 3.14, 'test', datetime(2016, 6, 1), True, Decimal('3.14'),
        None
    ]]
    self.odps.write_table(table2, 0, data2)

    row = data2[0]
    first_cell = [data2[0][0]]

    # Whole collection: plain and wrapped.
    pd_df = expr2.to_pandas()
    self.assertSequenceEqual(row, pd_df.iloc[0].tolist())

    wrapped_pd_df = expr2.to_pandas(wrap=True)
    self.assertSequenceEqual(row, list(next(wrapped_pd_df.execute())))

    # Single column: plain and wrapped.
    pd_df_col = expr2.col0.to_pandas()
    self.assertSequenceEqual(first_cell, pd_df_col.tolist())

    wrapped_pd_df_col = expr2.col0.to_pandas(wrap=True)
    self.assertSequenceEqual(first_cell, list(next(wrapped_pd_df_col.execute())))

    # Asynchronous variants return futures.
    pd_df_future = expr2.to_pandas(async_=True)
    self.assertSequenceEqual(row, pd_df_future.result().iloc[0].tolist())

    wrapped_pd_df_future = expr2.to_pandas(async_=True, wrap=True)
    self.assertSequenceEqual(
        row, list(next(wrapped_pd_df_future.result().execute())))

    # Delayed variant: the future resolves after the delay is executed.
    delay = Delay()
    pd_df_future = expr2.to_pandas(delay=delay)
    delay.execute()
    self.assertSequenceEqual(row, pd_df_future.result().iloc[0].tolist())

    exc_future = (expr2.col0 / 0).to_pandas(async_=True)
    self.assertRaises(ODPSError, exc_future.result)
def testLimitedInstanceTunnel(self):
    """Check the reader obtained when the instance tunnel raises various errors.

    For each mocked error, ``open_reader`` with the tunnel forced must raise
    it, while the default ``open_reader`` must still yield a reader.  The
    first three errors give a reader with a ``raw`` attribute; the permission
    error gives one without it.
    """
    test_table = tn('pyodps_t_tmp_limit_instance_tunnel')
    self.odps.delete_table(test_table, if_exists=True)
    table = self.odps.create_table(test_table,
                                   schema=Schema.from_lists(['size'], ['bigint']),
                                   if_not_exists=True)
    self.odps.write_table(table, 0, [table.new_record([1]), table.new_record([2])])
    self.odps.write_table(table, [
        table.new_record([3]),
    ])

    instance = self.odps.execute_sql('select * from %s' % test_table)
    instance = TunnelLimitedInstance(client=instance._client,
                                     parent=instance.parent,
                                     name=instance.id)

    raw_reader_cases = [
        (errors.InvalidArgument, 'Mock fallback error'),
        (requests.Timeout, 'Mock timeout'),
        (errors.InstanceTypeNotSupported, 'Mock instance not supported'),
    ]
    for exc_type, message in raw_reader_cases:
        TunnelLimitedInstance._exc = exc_type(message)
        self.assertRaises(exc_type, instance.open_reader, tunnel=True)
        with instance.open_reader() as reader:
            self.assertTrue(hasattr(reader, 'raw'))

    TunnelLimitedInstance._exc = errors.NoPermission(
        'Mock permission error')
    self.assertRaises(errors.NoPermission, instance.open_reader, limit=False)
    with instance.open_reader() as reader:
        self.assertFalse(hasattr(reader, 'raw'))
def testReadWritePartitionTable(self):
    """Write to two partitions of a table and read each one back.

    Also checks that a writer opened with ``commit=False`` leaves exactly one
    upload id on the table, and that reopening a writer on the same partition
    keeps that id.  The table is dropped even when a check fails.
    """
    test_table_name = tn('pyodps_t_tmp_read_write_partition_table')
    schema = Schema.from_lists(['id', 'name'], ['bigint', 'string'], ['pt'], ['string'])

    self.odps.delete_table(test_table_name, if_exists=True)
    self.assertFalse(self.odps.exist_table(test_table_name))

    table = self.odps.create_table(test_table_name, schema)
    try:
        table._upload_ids = dict()

        pt1 = 'pt=20151122'
        pt2 = 'pt=20151123'
        table.create_partition(pt1)
        table.create_partition(pt2)

        with table.open_reader(pt1) as reader:
            self.assertEqual(len(list(reader)), 0)

        with table.open_writer(pt1, commit=False) as writer:
            record = table.new_record([1, 'name1'])
            writer.write(record)

            record = table.new_record()
            record[0] = 3
            record[1] = 'name3'
            writer.write(record)

        self.assertEqual(len(table._upload_ids), 1)
        upload_id = list(table._upload_ids.values())[0]
        with table.open_writer(pt1):
            self.assertEqual(len(table._upload_ids), 1)
            self.assertEqual(upload_id, list(table._upload_ids.values())[0])

        with table.open_writer(pt2) as writer:
            writer.write([2, 'name2'])

        with table.open_reader(pt1, reopen=True) as reader:
            records = list(reader)
            self.assertEqual(len(records), 2)
            self.assertEqual(sum(r[0] for r in records), 4)

        with table.open_reader(pt2, reopen=True) as reader:
            records = list(reader)
            self.assertEqual(len(records), 1)
            self.assertEqual(sum(r[0] for r in records), 2)
    finally:
        # Previously skipped whenever an earlier statement raised.
        table.drop()
def setup(self):
    """Recreate the engine test table and attach expression, engine and a stub bar.

    The DataFrame schema is converted with ``df_schema_to_odps_schema`` for
    the real table, while the expression keeps the DataFrame schema.
    """

    def as_df_types(*type_names):
        # Run every type name through validate_data_type.
        return [validate_data_type(name) for name in type_names]

    class FakeBar(object):
        def update(self, *args, **kwargs):
            pass

    schema = Schema.from_lists(
        ['name', 'id', 'fid', 'isMale', 'scale', 'birth'],
        as_df_types('string', 'int64', 'float64', 'boolean', 'decimal', 'datetime'),
    )
    self.schema = df_schema_to_odps_schema(schema)

    table_name = 'pyodps_test_engine_table'
    self.odps.delete_table(table_name, if_exists=True)
    self.table = self.odps.create_table(name=table_name, schema=self.schema)

    self.expr = CollectionExpr(_source_data=self.table, _schema=schema)
    self.engine = ODPSEngine(self.odps)

    self.faked_bar = FakeBar()
def testCreateDeleteTable(self):
    """Create and delete a partitioned table from a Schema, a string pair and the entry object."""
    test_table_name = tn('pyodps_t_tmp_create_table')
    schema = Schema.from_lists(['id', 'name'], ['bigint', 'string'], [
        'ds',
    ], [
        'string',
    ])

    project_tables = self.odps._project.tables

    # Round 1: Schema object through the project's table collection.
    project_tables.delete(test_table_name, if_exists=True)
    self.assertFalse(self.odps.exist_table(test_table_name))

    table = project_tables.create(test_table_name, schema, lifecycle=10)

    # ``owner`` is not set on the raw attribute, but the property yields a value.
    self.assertIsNone(table._getattr('owner'))
    self.assertIsNotNone(table.owner)

    self.assertEqual(table.name, test_table_name)
    self.assertEqual(table.schema, schema)
    self.assertEqual(table.lifecycle, 10)

    project_tables.delete(test_table_name, if_exists=True)
    self.assertFalse(self.odps.exist_table(test_table_name))

    # Round 2: the same schema given as (columns, partitions) strings.
    str_schema = ('id bigint, name string', 'ds string')
    table = project_tables.create(test_table_name, str_schema, lifecycle=10)

    self.assertEqual(table.name, test_table_name)
    self.assertEqual(table.schema, schema)
    self.assertEqual(table.lifecycle, 10)

    project_tables.delete(test_table_name, if_exists=True)
    self.assertFalse(self.odps.exist_table(test_table_name))

    # Round 3: create through the ODPS entry object with shard options.
    table = self.odps.create_table(test_table_name,
                                   schema,
                                   shard_num=10,
                                   hub_lifecycle=5)
    self.assertEqual(table.name, test_table_name)
    self.assertEqual(table.schema, schema)
    self.assertNotEqual(table.lifecycle, 10)
    self.assertEqual(table.shard.shard_num, 10)

    self.odps.delete_table(test_table_name, if_exists=True)
    self.assertFalse(self.odps.exist_table(test_table_name))
def testBizarreField(self):
    """Compile a row-wise ``apply`` that reads a field named ``'012'`` and run it.

    The generated UDTF source is executed locally and fed two rows; the
    output must be the ``'012'`` value doubled.
    """
    def my_func(row):
        # '012' is not a valid identifier, so the field is read via getattr.
        return getattr(row, '012') * 2.0

    datatypes = lambda *types: [validate_data_type(t) for t in types]
    schema = Schema.from_lists(['name', 'id', 'fid', '012'],
                               datatypes('string', 'int64', 'float64', 'float64'))

    table = MockTable(name='pyodps_test_expr_table', schema=schema)
    expr = CollectionExpr(_source_data=table, _schema=schema)

    self.engine.compile(expr.apply(my_func, axis=1, names=['out_col'], types=['float64']))
    # Take the first generated UDF source registered on the engine context.
    udtf = list(self.engine._ctx._func_to_udfs.values())[0]
    # Execute the source, then fetch the class it defines under UDF_CLASS_NAME.
    # NOTE(review): depends on exec writing into the mapping locals() returns.
    six.exec_(udtf, globals(), locals())
    udtf = locals()[UDF_CLASS_NAME]
    self.assertEqual([20, 40],
                     runners.simple_run(udtf, [('name1', 1, None, 10), ('name2', 2, None, 20)]))
def testStringAsBinary(self):
    """With ``string_as_binary`` on, string columns of a record hold ``bytes``.

    The option is switched back off afterwards regardless of the outcome.
    """
    try:
        options.tunnel.string_as_binary = True

        schema = Schema.from_lists(['col1', 'col2'], ['string', 'bigint'])
        record = Record(values=[1, 2], schema=schema)
        self.assertEqual(record['col1', 'col2'], [b'1', 2])
        self.assertIsInstance(record[0], bytes)

        # Assigning either text or bytes ends up stored as bytes.
        for assigned in (u'junk', b'junk'):
            record[0] = assigned
            self.assertEqual(record[0], b'junk')
            self.assertIsInstance(record[0], bytes)
    finally:
        options.tunnel.string_as_binary = False
def _initialize_table(self):
    """Bind ``self._odps_table`` to the target table, creating it if needed.

    If the table already exists in ``self._project`` it is fetched.
    Otherwise it is created there from ``self._columns`` and
    ``self._column_types``, with an extra string partition column named
    ``worker``.

    Raises:
        ValueError: the table does not exist and ``self._columns`` or
            ``self._column_types`` is ``None``.
    """
    client = self._odps_client

    if client.exist_table(self._table, self._project):
        self._odps_table = client.get_table(self._table, self._project)
        return

    if self._columns is None or self._column_types is None:
        raise ValueError(
            "columns and column_types need to be "
            "specified for non-existing table."
        )

    schema = Schema.from_lists(
        self._columns, self._column_types, ["worker"], ["string"]
    )
    # Create in the same project that was checked above; omitting the
    # project would target the client's default project instead.
    self._odps_table = client.create_table(
        self._table, schema, project=self._project
    )
def testReadMapArraySQLInstance(self):
    """Read map and array columns back from a SQL instance, without and with tunnel.

    Rows are written to a temporary table, selected with SQL, and compared
    (ordered by ``idx``) with the data that was written, once per reader
    mode.  The table is dropped even when a comparison fails.
    """
    test_table = tn('pyodps_t_tmp_read_map_array_sql_instance')
    self.odps.delete_table(test_table, if_exists=True)
    table = self.odps.create_table(
        test_table,
        schema=Schema.from_lists(
            ['idx', 'map_col', 'array_col'],
            [
                'bigint',
                odps_types.Map(odps_types.string, odps_types.string),
                odps_types.Array(odps_types.string)
            ],
        ))

    try:
        data = [
            [0, {'key1': 'value1', 'key2': 'value2'}, ['item1', 'item2', 'item3']],
            [1, {'key3': 'value3', 'key4': 'value4'}, ['item4', 'item5']],
        ]
        self.odps.write_table(test_table, data)

        inst = self.odps.execute_sql('select * from %s' % test_table)
        expected_data = sorted(data, key=lambda r: r[0])

        # Same check for the non-tunnel reader first, then the tunnel reader.
        for use_tunnel in (False, True):
            with inst.open_reader(table.schema, use_tunnel=use_tunnel) as reader:
                read_data = [list(r.values) for r in reader]
                read_data = sorted(read_data, key=lambda r: r[0])

                self.assertSequenceEqual(read_data, expected_data)
    finally:
        # Previously skipped whenever an earlier statement raised.
        table.drop()
def testJoinGroupby(self):
    """Join with a second ODPS table, then sum ``fid`` per ``id``.

    The expected sums are computed locally with ``itertools.groupby`` and
    compared pairwise with ``assertAlmostEqual``.
    """
    data = [
        ['name1', 4, 5.3, None, None],
        ['name2', 2, 3.5, None, None],
        ['name1', 4, 4.2, None, None],
        ['name1', 3, 2.2, None, None],
        ['name1', 3, 4.1, None, None],
    ]

    schema2 = Schema.from_lists(['name', 'id2', 'id3'],
                                [types.string, types.bigint, types.bigint])

    table_name = tn('pyodps_test_engine_table2')
    self.odps.delete_table(table_name, if_exists=True)
    table2 = self.odps.create_table(name=table_name, schema=schema2)
    expr2 = CollectionExpr(_source_data=table2,
                           _schema=odps_schema_to_df_schema(schema2))

    self._gen_data(data=data)

    data2 = [['name1', 4, -1], ['name2', 1, -2]]
    self.odps.write_table(table2, 0, data2)

    expr = self.expr.join(expr2, on='name')[self.expr]
    expr = expr.groupby('id').agg(expr.fid.sum())

    res = self.engine.execute(expr)
    result = self._get_result(res)

    def position_of(column):
        # Index of the first schema column with the given name.
        return [
            idx for idx, col in enumerate(self.expr.schema.names)
            if col == column
        ][0]

    id_idx = position_of('id')
    fid_idx = position_of('fid')

    def row_id(row):
        return row[id_idx]

    expected = [
        [key, sum(row[fid_idx] for row in group)]
        for key, group in itertools.groupby(sorted(data, key=row_id), row_id)
    ]

    ordered_expected = sorted(expected, key=lambda it: it[0])
    ordered_result = sorted(result, key=lambda it: it[0])
    for want, got in zip(ordered_expected, ordered_result):
        self.assertAlmostEqual(want[0], got[0])
        self.assertAlmostEqual(want[1], got[1])
def testJoin(self):
    """Join the generated data with a second ODPS table.

    Checks the row count and contents of a join without explicit keys,
    then of a join on ``name`` plus the ``id``/``id2`` pair.  The second
    table is dropped afterwards regardless of the outcome.
    """
    data = [
        ['name1', 4, 5.3, None, None, None],
        ['name2', 2, 3.5, None, None, None],
        ['name1', 4, 4.2, None, None, None],
        ['name1', 3, 2.2, None, None, None],
        ['name1', 3, 4.1, None, None, None],
    ]
    data2 = [['name1', 4, -1], ['name2', 1, -2]]

    schema2 = Schema.from_lists(
        ['name', 'id2', 'id3'],
        [types.string, types.bigint, types.bigint])

    table_name = 'pyodps_test_engine_table2'
    self.odps.delete_table(table_name, if_exists=True)
    table2 = self.odps.create_table(name=table_name, schema=schema2)
    expr2 = CollectionExpr(
        _source_data=table2, _schema=odps_schema_to_df_schema(schema2))

    self._gen_data(data=data)

    records2 = [table2.new_record(values=row) for row in data2]
    self.odps.write_table(table2, 0, records2)

    try:
        # Join without an explicit ``on`` argument.
        plain_join = self.expr.join(expr2)['name', 'id2']
        result = self._get_result(self.engine.execute(plain_join))

        self.assertEqual(len(result), 5)
        expected = [[to_str('name1'), 4], [to_str('name2'), 1]]
        self.assertTrue(all(row in expected for row in result))

        # Join on ``name`` and on the ``id``/``id2`` column pair.
        keyed_join = self.expr.join(
            expr2, on=['name', ('id', 'id2')])[self.expr.name, expr2.id2]
        result = self._get_result(self.engine.execute(keyed_join))

        self.assertEqual(len(result), 2)
        expected = [to_str('name1'), 4]
        self.assertTrue(all(row == expected for row in result))
    finally:
        table2.drop()
def testReadChineseSQLInstance(self):
    """Read Chinese string values back through a SQL instance reader.

    Writes two rows whose ``name`` column holds Chinese text, selects that
    column with ``execute_sql`` and compares the sorted values.
    """
    test_table = tn('pyodps_t_tmp_read_chn_sql_instance')
    self.odps.delete_table(test_table, if_exists=True)

    schema = Schema.from_lists(['size', 'name'], ['bigint', 'string'])
    table = self.odps.create_table(
        test_table, schema=schema, if_not_exists=True)

    data = [[1, '中文'], [2, '测试数据']]
    records = [table.new_record(row) for row in data]
    self.odps.write_table(table, 0, records)

    instance = self.odps.execute_sql('select name from %s' % test_table)
    with instance.open_reader() as reader:
        fetched = sorted(to_str(rec[0]) for rec in reader)
        wanted = sorted(to_str(row[1]) for row in data)
        self.assertSequenceEqual(fetched, wanted)

    table.drop()
def setup(self):
    """Prepare a table-backed and a pandas-backed DataFrame plus their join.

    Sets ``self.tb`` (DataFrame over a ``MockTable``), ``self.pd``
    (DataFrame over an in-memory pandas frame), ``self.expr`` (their join
    on ``name``) and ``self.engine`` (a ``MixedEngine``).
    """
    column_names = ['name', 'id', 'fid', 'isMale', 'scale', 'birth']
    type_names = ('string', 'bigint', 'double', 'boolean', 'decimal',
                  'datetime')
    column_types = [validate_data_type(t) for t in type_names]
    schema = Schema.from_lists(column_names, column_types)

    table = MockTable(name='pyodps_test_expr_table', schema=schema)
    self.tb = DataFrame(table)

    import pandas as pd

    local_frame = pd.DataFrame(
        [['name1', 2, 3.14], ['name2', 100, 2.7]],
        columns=['name', 'id', 'fid'])
    self.pd = DataFrame(local_frame)

    self.expr = self.tb.join(self.pd, on='name')
    self.engine = MixedEngine(self.odps)
def testUnion(self):
    """Union distinct ``name``/``id`` pairs with rows from a second table.

    The second table's ``id2`` column is renamed to ``id`` before the
    union.  Both result and expectation are sorted and compared row by
    row after ``to_str`` conversion; the second table is always dropped.
    """
    data = [
        ['name1', 4, 5.3, None, None, None],
        ['name2', 2, 3.5, None, None, None],
        ['name1', 4, 4.2, None, None, None],
        ['name1', 3, 2.2, None, None, None],
        ['name1', 3, 4.1, None, None, None],
    ]
    data2 = [['name3', 5, -1], ['name4', 6, -2]]

    schema2 = Schema.from_lists(
        ['name', 'id2', 'id3'],
        [types.string, types.bigint, types.bigint])

    table_name = 'pyodps_test_engine_table2'
    self.odps.delete_table(table_name, if_exists=True)
    table2 = self.odps.create_table(name=table_name, schema=schema2)
    expr2 = CollectionExpr(
        _source_data=table2, _schema=odps_schema_to_df_schema(schema2))

    self._gen_data(data=data)

    records2 = [table2.new_record(values=row) for row in data2]
    self.odps.write_table(table2, 0, records2)

    try:
        left = self.expr['name', 'id'].distinct()
        right = expr2[expr2.id2.rename('id'), 'name']
        result = self._get_result(self.engine.execute(left.union(right)))

        expected = [['name1', 4], ['name1', 3], ['name2', 2],
                    ['name3', 5], ['name4', 6]]

        result = sorted(result)
        expected = sorted(expected)

        self.assertEqual(len(result), len(expected))
        for got_row, want_row in zip(result, expected):
            self.assertEqual([to_str(cell) for cell in got_row],
                             [to_str(cell) for cell in want_row])
    finally:
        table2.drop()
def testCallableColumn(self):
    """Check how a column named ``append_id`` is exposed on a collection.

    Asserts that ``expr.append_id`` is a ``CallableColumn`` while a
    regular column is not, that it can be projected like any column, that
    a projection lacking the column does not expose a ``CallableColumn``
    under that name, and that calling it adds the requested id column.
    """
    from odps.df.expr.expressions import CallableColumn
    from odps.df.expr.collections import ProjectCollectionExpr

    column_names = ['name', 'f1', 'append_id']
    column_types = [types.string, types.float64, types.int64]
    schema = Schema.from_lists(column_names, column_types)
    expr = CollectionExpr(_source_data=None, _schema=schema)

    self.assertIsInstance(expr.append_id, CallableColumn)
    self.assertNotIsInstance(expr.f1, CallableColumn)

    with_append_id = expr[expr.name, expr.append_id]
    self.assertIsInstance(with_append_id, ProjectCollectionExpr)
    self.assertListEqual(with_append_id.schema.names, ['name', 'append_id'])

    without_append_id = expr[expr.name, expr.f1]
    self.assertNotIsInstance(without_append_id.append_id, CallableColumn)

    appended = expr.append_id(id_col='id_col')
    self.assertIn('id_col', appended.schema)
def testFillna(self):
    """Run ``fillna`` and a row-wise ``apply`` on a partition DataFrame.

    The test has no assertions; it only checks that ``head()`` runs after
    ``fillna`` and again after adding a column computed with ``apply``.
    """
    test_table_name = tn('pyodps_test_dataframe_fillna')
    self.odps.delete_table(test_table_name, if_exists=True)

    schema = Schema.from_lists(
        ['val1', 'val2', 'val3', 'val4'], ['bigint'] * 4,
        ['name'], ['string'])
    table = self.odps.create_table(test_table_name, schema)
    table.create_partition('name=a')

    df = DataFrame(table.get_partition('name=a'))

    # Keep the first three columns and fill only the first two of them.
    columns = df.columns[:3]
    filled = df[columns].fillna(0, subset=columns[:2])
    filled.head()

    def sum_val(row):
        return sum(row)

    filled['new_field'] = filled.apply(
        sum_val, axis=1, reduce=True, rtype='int')
    filled.head()
def _gen_table(self, partition=None, partition_type=None, partition_val=None, size=100):
    """Create the ``pyodps_test_tunnel`` table and generate random rows for it.

    Column types are six fixed scalar types plus one random array type and
    one random map type, in shuffled order; column names are abbreviations
    of the type names.

    :param partition: partition column name, or None for no partition column
    :param partition_type: partition column type, or None
    :param partition_val: when truthy, the partition ``partition=partition_val``
        is created; when both it and ``partition`` are not None, the value is
        appended to every generated row
    :param size: number of rows to generate
    :return: tuple ``(table, data)`` where ``data`` is a list of row lists
    """
    def gen_name(name):
        # Drop any '<...>' type parameters, then keep the first four
        # characters of long names or the first two of short ones.
        base = name.split('<', 1)[0] if '<' in name else name
        return base[:4] if len(base) > 4 else base[:2]

    test_table_name = 'pyodps_test_tunnel'

    col_types = ['bigint', 'string', 'double', 'datetime', 'boolean', 'decimal']
    col_types.append(self._gen_random_array_type().name)
    col_types.append(self._gen_random_map_type().name)
    random.shuffle(col_types)

    col_names = [gen_name(type_name) for type_name in col_types]

    self.odps.delete_table(test_table_name, if_exists=True)

    partition_names = [partition, ] if partition else None
    partition_types = [partition_type, ] if partition_type else None

    schema = Schema.from_lists(
        col_names, col_types,
        partition_names=partition_names, partition_types=partition_types)
    table = self.odps.create_table(test_table_name, schema)

    if partition_val:
        table.create_partition('%s=%s' % (partition, partition_val))

    with_partition_value = partition is not None and partition_val is not None

    data = []
    for _ in range(size):
        row = []
        for type_name in col_types:
            kind = type_name.split('<', 1)[0]
            generator = getattr(self, '_gen_random_' + kind)
            # Composite generators receive the full type string.
            if kind in ('map', 'array'):
                row.append(generator(type_name))
            else:
                row.append(generator())
        if with_partition_value:
            row.append(partition_val)
        data.append(row)

    return table, data
def setup(self):
    """Prepare an empty pandas-backed collection and both engines.

    Sets ``self.schema`` (the ODPS form of the DataFrame schema),
    ``self.df`` (an empty pandas frame with the schema's column names),
    ``self.expr``, ``self.engine`` (``PandasEngine``), ``self.odps_engine``
    (``ODPSEngine``) and ``self.faked_bar`` (an object whose ``update``
    accepts any arguments and does nothing).
    """
    column_names = ['name', 'id', 'fid', 'isMale', 'scale', 'birth']
    type_names = ('string', 'int64', 'float64', 'boolean', 'decimal',
                  'datetime')
    column_types = [validate_data_type(t) for t in type_names]
    schema = Schema.from_lists(column_names, column_types)

    self.schema = df_schema_to_odps_schema(schema)

    import pandas as pd

    self.df = pd.DataFrame(None, columns=schema.names)
    self.expr = CollectionExpr(_source_data=self.df, _schema=schema)

    self.engine = PandasEngine(self.odps)
    self.odps_engine = ODPSEngine(self.odps)

    class FakeBar(object):
        # No-op stand-in exposing only ``update``.
        def update(self, *args, **kwargs):
            pass

    self.faked_bar = FakeBar()
def testTableIO(self):
    """Write records through a record writer and read them back.

    Ten copies of one record are written under the label returned by the
    ``write_label`` call; after a short pause the records under the label
    returned by ``read_label`` are read and compared.  The writer and the
    reader are closed even when a write or the assertion fails.
    """
    schema = Schema.from_lists(
        ['key', 'value', 'double', 'datetime', 'boolean'],
        ['bigint', 'string', 'double', 'datetime', 'boolean'])

    label = self.client.sync_call('test', 'write_label')
    print('Write label: ' + label)

    # Microseconds are cleared before the value is stored in the record.
    cur_time = datetime.datetime.now().replace(microsecond=0)
    rec = Record(schema=schema, values=[10, 'abcd', 1.56, cur_time, False])

    writer = self.client.create_record_writer(label, schema)
    try:
        for _ in range(10):
            writer.write(rec)
    finally:
        writer.close()

    time.sleep(3)

    # NOTE(review): the write side uses ``self.client`` while the read side
    # uses the name ``channel_client`` -- confirm this is intentional.
    label = channel_client.sync_call('test', 'read_label')
    print('Read label: ' + label)

    reader = channel_client.create_record_reader(label, schema)
    try:
        records = list(reader)
        # NOTE(review): 10 records are written above but 20 are expected
        # here -- presumably the remote side duplicates them; confirm.
        self.assertListEqual(records, [rec] * 20)
    finally:
        reader.close()
def testSimpleArrayReadWriteTable(self):
    """Write one row given as a plain list into a partition and read it back.

    The value is checked both by position and by attribute name.
    """
    test_table_name = tn('pyodps_t_tmp_simpe_read_write_table')
    schema = Schema.from_lists(['num'], ['string'], ['pt'], ['string'])

    self.odps.delete_table(test_table_name, if_exists=True)
    table = self.odps.create_table(test_table_name, schema)

    partition = 'pt=20151122'
    table.create_partition(partition)

    with table.open_writer(partition) as writer:
        writer.write(['1', ])

    with table.open_reader(partition) as reader:
        self.assertEqual(reader.count, 1)
        first = next(reader)
        self.assertEqual(first[0], '1')
        self.assertEqual(first.num, '1')

    table.drop()
def testPandasCompilation(self):
    """Compile a two-column projection and inspect the resulting DAG.

    Expects four graph nodes whose topological order starts with the
    source collection, followed by two columns and the projection.
    """
    import pandas as pd
    import numpy as np

    col_names = list('abc')
    df = pd.DataFrame(np.arange(9).reshape(3, 3), columns=col_names)

    schema = Schema.from_lists(list('abc'), [types.int8] * 3)
    source = CollectionExpr(_source_data=df, _schema=schema)
    projection = source['a', 'b']

    dag = PandasCompiler().compile(projection)
    self.assertEqual(len(dag._graph), 4)

    topos = dag.topological_sort()
    expected_kinds = (CollectionExpr, Column, Column, ProjectCollectionExpr)
    for position, kind in enumerate(expected_kinds):
        self.assertIsInstance(topos[position][0], kind)
def testJoinGroupby(self):
    """Join with a helper-created table on ``name``, then sum ``fid`` per ``id``.

    The engine result is compared (almost-equal) against sums computed
    locally from ``data`` with ``itertools.groupby``.
    """
    data = [
        ['name1', 4, 5.3, None, None, None],
        ['name2', 2, 3.5, None, None, None],
        ['name1', 4, 4.2, None, None, None],
        ['name1', 3, 2.2, None, None, None],
        ['name1', 3, 4.1, None, None, None],
    ]
    data2 = [['name1', 4, -1], ['name2', 1, -2]]

    type_names = ('string', 'int64', 'int64')
    schema2 = Schema.from_lists(
        ['name', 'id2', 'id3'],
        [validate_data_type(t) for t in type_names])

    table_name = tn('pyodps_test_engine_table2')
    table2 = self._create_table_and_insert_data(table_name, schema2, data2)
    expr2 = CollectionExpr(_source_data=table2, _schema=schema2)

    self._gen_data(data=data)

    joined = self.expr.join(expr2, on='name')[self.expr]
    aggregated = joined.groupby('id').agg(joined.fid.sum())

    result = self._get_result(self.engine.execute(aggregated))

    def column_position(wanted):
        # Position of the first schema column carrying the given name.
        return [
            pos for pos, col in enumerate(self.expr.schema.names)
            if col == wanted
        ][0]

    id_idx = column_position('id')
    fid_idx = column_position('fid')

    def id_of(row):
        return row[id_idx]

    # groupby only merges adjacent rows, so sort by the grouping key first.
    expected = []
    for key, group in itertools.groupby(sorted(data, key=id_of), id_of):
        expected.append([key, sum(row[fid_idx] for row in group)])

    def first_item(row):
        return row[0]

    pairs = zip(sorted(expected, key=first_item),
                sorted(result, key=first_item))
    for want, got in pairs:
        self.assertAlmostEqual(want[0], got[0])
        self.assertAlmostEqual(want[1], got[1])
def testUnion(self):
    """Union distinct ``name``/``id`` pairs with rows from a second table.

    The second table's ``id2`` column is renamed to ``id`` before the
    union.  Both result and expectation are sorted and compared row by
    row after ``to_str`` conversion.

    Cleanup closes every connection in ``_engine_to_connections`` and then
    drops the second table; the drop runs even if closing a connection
    raises.
    """
    data = [
        ['name1', 4, 5.3, None, None, None],
        ['name2', 2, 3.5, None, None, None],
        ['name1', 4, 4.2, None, None, None],
        ['name1', 3, 2.2, None, None, None],
        ['name1', 3, 4.1, None, None, None],
    ]
    data2 = [['name3', 5, -1], ['name4', 6, -2]]

    datatypes = lambda *types: [validate_data_type(t) for t in types]
    schema2 = Schema.from_lists(['name', 'id2', 'id3'],
                                datatypes('string', 'int64', 'int64'))

    table_name = tn('pyodps_test_engine_table2')
    table2 = self._create_table_and_insert_data(table_name, schema2, data2)
    expr2 = CollectionExpr(_source_data=table2, _schema=schema2)

    self._gen_data(data=data)

    try:
        expr = self.expr['name', 'id'].distinct().union(
            expr2[expr2.id2.rename('id'), 'name'])

        res = self.engine.execute(expr)
        result = self._get_result(res)

        expected = [['name1', 4], ['name1', 3], ['name2', 2],
                    ['name3', 5], ['name4', 6]]

        result = sorted(result)
        expected = sorted(expected)

        self.assertEqual(len(result), len(expected))
        for e, r in zip(result, expected):
            self.assertEqual([to_str(t) for t in e],
                             [to_str(t) for t in r])
    finally:
        try:
            # Plain loop: the calls are made for their side effect only.
            for conn in _engine_to_connections.values():
                conn.close()
        finally:
            # Connections are closed first, as in the original ordering.
            table2.drop()
def testLargeColumnsFormatter(self):
    """Compare pandas and non-pandas rendering of a very wide result.

    The schema is ``self.schema`` repeated ten times with an index suffix
    on every column name.  ``repr`` and ``_repr_html_`` of both
    ``ResultFrame`` variants must match after ``to_str`` conversion.
    """
    wide_names = [
        name + str(i)
        for i in range(10)
        for name in self.schema.names
    ]
    wide_types = self.schema.types * 10
    schema = Schema.from_lists(wide_names, wide_types)

    def gen_row():
        # One wide row is ten consecutive random rows laid end to end.
        chunks = [self._random_values().values for _ in range(10)]
        return list(itertools.chain(*chunks))

    data = [
        Record(schema=df_schema_to_odps_schema(schema), values=gen_row())
        for _ in range(10)
    ]

    pandas_frame = ResultFrame(data=data, schema=schema, pandas=True)
    plain_frame = ResultFrame(data=data, schema=schema, pandas=False)

    self.assertEqual(to_str(repr(pandas_frame)), to_str(repr(plain_frame)))
    self.assertEqual(to_str(pandas_frame._repr_html_()),
                     to_str(plain_frame._repr_html_()))
def testReadSQLWrite(self):
    """Copy the rows of a SQL query result into a second table.

    Three single-column rows are written to the first table in two calls;
    every record read from ``select *`` is re-created for and written to
    the second table.  Both tables are dropped afterwards.
    """
    test_table = tn('pyodps_t_tmp_read_sql_instance_write')
    self.odps.delete_table(test_table, if_exists=True)
    table = self.odps.create_table(
        test_table,
        schema=Schema.from_lists(['size'], ['bigint']),
        if_not_exists=True)

    first_batch = [table.new_record([value]) for value in (1, 2)]
    self.odps.write_table(table, 0, first_batch)
    # Second call passes the records without a block id.
    self.odps.write_table(table, [table.new_record([3]), ])

    test_table2 = tn('pyodps_t_tmp_read_sql_instance_write2')
    self.odps.delete_table(test_table2, if_exists=True)
    table2 = self.odps.create_table(test_table2, table.schema)

    try:
        instance = self.odps.execute_sql('select * from %s' % test_table)
        with instance.open_reader() as reader:
            with table2.open_writer() as writer:
                for source_record in reader:
                    copied = table2.new_record(source_record.values)
                    writer.write(copied)
    finally:
        table.drop()
        table2.drop()
def testApplyMap(self):
    """Check which projected fields ``applymap`` turns into mapped expressions.

    Passing both ``columns`` and ``excludes`` must raise ``ValueError``.
    Otherwise the result is a projection in which the selected fields are
    ``MappedExpr`` and the remaining ones stay plain ``Column``.
    """
    from odps.df.expr.collections import ProjectCollectionExpr, Column
    from odps.df.expr.element import MappedExpr

    schema = Schema.from_lists(
        ['idx', 'f1', 'f2', 'f3'],
        [types.int64] + [types.float64] * 3)
    expr = CollectionExpr(_source_data=None, _schema=schema)

    def check_fields(mapped, should_be_mapped):
        # ``should_be_mapped`` decides per field name which type to expect.
        self.assertIsInstance(mapped, ProjectCollectionExpr)
        for field in mapped._fields:
            wanted = MappedExpr if should_be_mapped(field.name) else Column
            self.assertIsInstance(field, wanted)

    self.assertRaises(
        ValueError,
        lambda: expr.applymap(lambda v: v + 1, columns='idx', excludes='f1'))

    # No selection: every field is mapped.
    check_fields(expr.applymap(lambda v: v + 1), lambda name: True)

    # Single column given by name.
    check_fields(expr.applymap(lambda v: v + 1, columns='f1'),
                 lambda name: name == 'f1')

    # Several columns given as a set.
    map_cols = {'f1', 'f2', 'f3'}
    check_fields(expr.applymap(lambda v: v + 1, columns=map_cols),
                 lambda name: name in map_cols)

    # Single excluded column given by name.
    check_fields(expr.applymap(lambda v: v + 1, excludes='idx'),
                 lambda name: name != 'idx')

    # Several excluded columns given as a set.
    exc_cols = {'idx', 'f1'}
    check_fields(expr.applymap(lambda v: v + 1, excludes=exc_cols),
                 lambda name: name not in exc_cols)
def testCreateDataFrameFromPartition(self):
    """Build DataFrames from a table partition in two ways.

    Three rows are written to partition ``ds=today``.  A DataFrame created
    via ``DataFrame(partition)`` and one created via ``partition.to_df()``
    must both count three rows, and the latter's ``data`` must refer back
    to the same table and partition spec.  The table is always dropped.
    """
    from odps.types import PartitionSpec

    test_table_name = tn('pyodps_test_dataframe_partition')
    schema = Schema.from_lists(
        ['id', 'name'], ['bigint', 'string'], ['ds'], ['string'])

    self.odps.delete_table(test_table_name, if_exists=True)
    table = self.odps.create_table(test_table_name, schema)

    rows = [[1, 'name1'], [2, 'name2'], [3, 'name3']]
    with table.open_writer('ds=today', create_partition=True) as writer:
        writer.write(rows)

    try:
        from_constructor = DataFrame(table.get_partition('ds=today'))
        self.assertEqual(from_constructor.count().execute(), 3)

        from_to_df = table.get_partition('ds=today').to_df()
        source = from_to_df.data

        self.assertIs(source.table, table)
        self.assertEqual(source.partition_spec, PartitionSpec('ds=today'))
        self.assertEqual(from_to_df.count().execute(), 3)
    finally:
        table.drop()
def testListInstancesInPage(self):
    """Check that a freshly started SQL instance shows up as running.

    A table with 10000 random numbers is created and an aggregation is
    started with ``run_sql``.  The instance must report RUNNING status and
    its id must appear among the project's running instances listed for
    the window from two days ago to one day ahead (at most 20 items).

    Cleanup: the instance is stopped on a best-effort basis and the table
    is dropped, also when writing the data or starting the SQL fails.
    """
    test_table = tn('pyodps_t_tmp_list_instances_in_page')

    data = [[random.randint(0, 1000)] for _ in compat.irange(10000)]

    self.odps.delete_table(test_table, if_exists=True)
    t = self.odps.create_table(test_table, Schema.from_lists(['num'], ['bigint']))
    try:
        self.odps.write_table(t, data)

        instance = self.odps.run_sql(
            'select sum(num) from {0} group by num'.format(test_table))

        try:
            self.assertEqual(instance.status, Instance.Status.RUNNING)

            running_ids = [
                it.id for it in self.odps.get_project().instances.iterate(
                    status=Instance.Status.RUNNING,
                    from_time=datetime.now() - timedelta(days=2),
                    end_time=datetime.now() + timedelta(days=1),
                    max_items=20)
            ]
            self.assertIn(instance.id, running_ids)
        finally:
            try:
                instance.stop()
            except Exception:
                # Best effort: a failure to stop must not hide the test
                # outcome.  KeyboardInterrupt/SystemExit still propagate.
                pass
    finally:
        t.drop()
def testJoin(self):
    """Join the generated data with a pandas-backed second collection.

    Checks the row count and contents of a join without explicit keys,
    then of a join on ``name`` plus the ``id``/``id2`` pair.
    """
    data = [
        ['name1', 4, 5.3, None, None, None],
        ['name2', 2, 3.5, None, None, None],
        ['name1', 4, 4.2, None, None, None],
        ['name1', 3, 2.2, None, None, None],
        ['name1', 3, 4.1, None, None, None],
    ]
    data2 = [['name1', 4, -1], ['name2', 1, -2]]

    schema2 = Schema.from_lists(
        ['name', 'id2', 'id3'],
        [types.string, types.int64, types.int64])

    self._gen_data(data=data)

    import pandas as pd

    frame2 = pd.DataFrame(data2, columns=schema2.names)
    expr2 = CollectionExpr(_source_data=frame2, _schema=schema2)

    # Join without an explicit ``on`` argument.
    plain_join = self.expr.join(expr2)['name', 'id2']
    result = self._get_result(self.engine.execute(plain_join))

    self.assertEqual(len(result), 5)
    expected = [[to_str('name1'), 4], [to_str('name2'), 1]]
    self.assertTrue(all(row in expected for row in result))

    # Join on ``name`` and on the ``id``/``id2`` column pair.
    keyed_join = self.expr.join(
        expr2, on=['name', ('id', 'id2')])[self.expr.name, expr2.id2]
    result = self._get_result(self.engine.execute(keyed_join))

    self.assertEqual(len(result), 2)
    expected = [to_str('name1'), 4]
    self.assertTrue(all(row == expected for row in result))
def testJoinGroupby(self):
    """Join with a second ODPS table on ``name``, then sum ``fid`` per ``id``.

    The engine result is compared (almost-equal) against the same
    aggregation computed locally with pandas.
    """
    data = [
        ['name1', 4, 5.3, None, None, None],
        ['name2', 2, 3.5, None, None, None],
        ['name1', 4, 4.2, None, None, None],
        ['name1', 3, 2.2, None, None, None],
        ['name1', 3, 4.1, None, None, None],
    ]
    data2 = [['name1', 4, -1], ['name2', 1, -2]]

    schema2 = Schema.from_lists(
        ['name', 'id2', 'id3'],
        [types.string, types.bigint, types.bigint])

    table_name = 'pyodps_test_engine_table2'
    self.odps.delete_table(table_name, if_exists=True)
    table2 = self.odps.create_table(name=table_name, schema=schema2)
    expr2 = CollectionExpr(
        _source_data=table2, _schema=odps_schema_to_df_schema(schema2))

    self._gen_data(data=data)

    records2 = [table2.new_record(values=row) for row in data2]
    self.odps.write_table(table2, 0, records2)

    joined = self.expr.join(expr2, on='name')[self.expr]
    aggregated = joined.groupby('id').agg(joined.fid.sum())

    result = self._get_result(self.engine.execute(aggregated))

    import pandas as pd

    # Reference aggregation over the raw input rows.
    local_frame = pd.DataFrame(data, columns=self.expr.schema.names)
    local_sums = local_frame.groupby('id').agg({'fid': 'sum'})
    expected = local_sums.reset_index().values.tolist()

    def first_item(row):
        return row[0]

    pairs = zip(sorted(expected, key=first_item),
                sorted(result, key=first_item))
    for want, got in pairs:
        self.assertAlmostEqual(want[0], got[0])
        self.assertAlmostEqual(want[1], got[1])