def testTokenizedSignServerAccount(self):
    server = SignServer(token=str(uuid.uuid4()))
    server.accounts[self.odps.account.access_id] = self.odps.account.secret_access_key
    try:
        server.start(('127.0.0.1', 0))
        account = SignServerAccount(self.odps.account.access_id, server.server.server_address)
        odps = ODPS(None, None, self.odps.project, self.odps.endpoint, account=account)
        self.assertRaises(
            SignServerError,
            lambda: odps.delete_table(tn('test_sign_account_table'), if_exists=True))

        account = SignServerAccount(self.odps.account.access_id,
                                    server.server.server_address, token=server.token)
        odps = ODPS(None, None, self.odps.project, self.odps.endpoint, account=account)
        odps.delete_table(tn('test_sign_account_table'), if_exists=True)

        t = odps.create_table(tn('test_sign_account_table'), 'col string', lifecycle=1)
        self.assertTrue(odps.exist_table(tn('test_sign_account_table')))
        # `async` is a reserved word in Python 3.7+; the keyword argument is `async_`
        t.drop(async_=True)
    finally:
        server.stop()

def testPandasPersist(self):
    import pandas as pd
    import numpy as np

    tmp_table_name = tn('pyodps_test_mixed_persist')
    self.odps.delete_table(tmp_table_name, if_exists=True)
    t = self.odps.create_table(
        tmp_table_name, ('a bigint, b bigint, c bigint', 'ds string'))
    t.create_partition('ds=today')
    try:
        pd_df = pd.DataFrame(np.arange(9).reshape(3, 3), columns=list('abc'))
        df = DataFrame(pd_df).persist(tmp_table_name, partition='ds=today', odps=self.odps)
        self.assertPandasEqual(df[list('abc')].to_pandas(), pd_df)
    finally:
        self.odps.delete_table(tmp_table_name)

    self.odps.to_global()

    tmp_table_name = tn('pyodps_test_mixed_persist2')
    self.odps.delete_table(tmp_table_name, if_exists=True)
    try:
        pd_df = pd.DataFrame(np.arange(9).reshape(3, 3), columns=list('abc'))
        df = DataFrame(pd_df).persist(tmp_table_name)
        self.assertPandasEqual(df.to_pandas(), pd_df)
    finally:
        self.odps.delete_table(tmp_table_name)

def testVolumeArchiveResource(self):
    volume_name = tn('pyodps_t_tmp_resource_archive_volume')
    resource_name = tn('pyodps_t_tmp_volume_archive_resource') + '.zip'
    partition_name = 'test_partition'
    file_name = 'test_file.zip'
    try:
        self.odps.delete_volume(volume_name)
    except errors.ODPSError:
        pass
    try:
        self.odps.delete_resource(resource_name)
    except errors.ODPSError:
        pass

    file_io = six.BytesIO()
    zfile = zipfile.ZipFile(file_io, 'a', zipfile.ZIP_DEFLATED, False)
    zfile.writestr('file1.txt', FILE_CONTENT)
    zfile.writestr('file2.txt', OVERWRITE_FILE_CONTENT)
    zfile.close()

    self.odps.create_parted_volume(volume_name)
    with self.odps.open_volume_writer(volume_name, partition_name) as writer:
        writer.write(file_name, file_io.getvalue())

    volume_file = self.odps.get_volume_partition(volume_name, partition_name).files[file_name]
    self.odps.create_resource(resource_name, 'volumearchive', volume_file=volume_file)
    res = self.odps.get_resource(resource_name)
    self.assertIsInstance(res, VolumeArchiveResource)
    self.assertEqual(res.type, Resource.Type.VOLUMEARCHIVE)
    self.assertEqual(res.volume_path, volume_file.path)
    self.odps.delete_resource(resource_name)

def testRunScript(self):
    import pandas as pd
    from io import BytesIO
    from odps.utils import to_binary

    client = self.odps.create_mars_cluster(1, 4, 8, name=str(uuid.uuid4()))
    try:
        mars_source_table_name = tn('mars_script_datasource')
        mars_des_table_name = tn('mars_script_datastore')
        self._create_table(mars_source_table_name)
        self.odps.delete_table(mars_des_table_name, if_exists=True)
        data = self._gen_data()
        self.odps.write_table(mars_source_table_name, data)

        code = BytesIO(to_binary(script.format(
            mars_source_table_name, self.odps.endpoint,
            mars_des_table_name, self.odps.endpoint)))
        self.odps.run_script_in_mars(code, runtime_endpoint=self.odps.endpoint)

        result = self.odps.get_table(mars_des_table_name).to_df().to_pandas()
        expected = self.odps.get_table(mars_source_table_name).to_df().to_pandas()
        pd.testing.assert_frame_equal(result, expected)
    finally:
        client.stop_server()

def testSQLCostInstance(self):
    test_table = tn('pyodps_t_tmp_sql_cost_instance')
    self.odps.delete_table(test_table, if_exists=True)
    table = self.odps.create_table(
        test_table,
        schema=Schema.from_lists(['size'], ['bigint']),
        if_not_exists=True)
    self.odps.write_table(table, [[1], [2], [3]])
    sql_cost = self.odps.execute_sql_cost('select * from %s' % test_table)
    self.assertIsInstance(sql_cost, Instance.SQLCost)
    self.assertEqual(sql_cost.udf_num, 0)
    self.assertEqual(sql_cost.complexity, 1.0)
    self.assertGreaterEqual(sql_cost.input_size, 480)

    test_table = tn('pyodps_t_tmp_sql_cost_odps2_instance')
    self.odps.delete_table(test_table, if_exists=True)
    table = self.odps.create_table(
        test_table,
        schema=Schema.from_lists(['size'], ['tinyint']),
        if_not_exists=True)
    self.odps.write_table(table, [[1], [2], [3]])
    sql_cost = self.odps.execute_sql_cost('select * from %s' % test_table)
    self.assertIsInstance(sql_cost, Instance.SQLCost)
    self.assertEqual(sql_cost.udf_num, 0)
    self.assertEqual(sql_cost.complexity, 1.0)
    self.assertGreaterEqual(sql_cost.input_size, 480)

def testRunMarsJob(self):
    import pandas as pd

    odps_entry = self.odps
    mars_source_table_name = tn('mars_script_datasource')
    mars_des_table_name = tn('mars_script_datastore')
    self._create_table(mars_source_table_name)
    self.odps.delete_table(mars_des_table_name, if_exists=True)
    data = self._gen_data()
    self.odps.write_table(mars_source_table_name, data)

    def func(s_name, d_name):
        from odps.accounts import BearerTokenAccount

        df = odps_entry.to_mars_dataframe(
            s_name, runtime_endpoint=odps_entry.endpoint).to_pandas()
        odps_entry.persist_mars_dataframe(
            df, d_name, unknown_as_string=True,
            runtime_endpoint=odps_entry.endpoint)

    self.odps.run_mars_job(func, args=(mars_source_table_name, mars_des_table_name),
                           name=str(uuid.uuid4()), worker_cpu=4, worker_mem=8)

    result = self.odps.get_table(mars_des_table_name).to_df().to_pandas()
    expected = self.odps.get_table(mars_source_table_name).to_df().to_pandas()
    pd.testing.assert_frame_equal(result, expected)

def testVolumeFileResource(self):
    volume_name = tn('pyodps_t_tmp_resource_file_volume')
    resource_name = tn('pyodps_t_tmp_volume_file_resource')
    partition_name = 'test_partition'
    file_name = 'test_file.txt'
    try:
        self.odps.delete_volume(volume_name)
    except errors.ODPSError:
        pass
    try:
        self.odps.delete_resource(resource_name)
    except errors.ODPSError:
        pass

    self.odps.create_parted_volume(volume_name)
    with self.odps.open_volume_writer(volume_name, partition_name) as writer:
        writer.write(file_name, FILE_CONTENT)

    volume_file = self.odps.get_volume_partition(volume_name, partition_name).files[file_name]
    self.odps.create_resource(resource_name, 'volumefile', volume_file=volume_file)
    res = self.odps.get_resource(resource_name)
    self.assertIsInstance(res, VolumeFileResource)
    self.assertEqual(res.type, Resource.Type.VOLUMEFILE)
    self.assertEqual(res.volume_path, volume_file.path)
    self.odps.delete_resource(resource_name)

def testReadSQLWrite(self):
    test_table = tn('pyodps_t_tmp_read_sql_instance_write')
    self.odps.delete_table(test_table, if_exists=True)
    table = self.odps.create_table(
        test_table,
        schema=Schema.from_lists(['size'], ['bigint']),
        if_not_exists=True)
    self.odps.write_table(table, 0, [table.new_record([1]), table.new_record([2])])
    self.odps.write_table(table, [table.new_record([3]), ])

    test_table2 = tn('pyodps_t_tmp_read_sql_instance_write2')
    self.odps.delete_table(test_table2, if_exists=True)
    table2 = self.odps.create_table(test_table2, table.schema)

    try:
        with self.odps.execute_sql('select * from %s' % test_table).open_reader() as reader:
            with table2.open_writer() as writer:
                for record in reader:
                    writer.write(table2.new_record(record.values))
    finally:
        table.drop()
        table2.drop()

def testExecuteSql(self):
    FakeShell = namedtuple('FakeShell', 'user_ns')

    magic_class = ODPSSql(FakeShell(user_ns={}))
    magic_class._odps = self.odps

    test_table_name = tn('pyodps_t_test_sql_magic')
    test_content = [['line1'], ['line2']]
    self.odps.delete_table(test_table_name, if_exists=True)
    self.odps.create_table(test_table_name, 'col string', lifecycle=1)
    self.odps.write_table(test_table_name, test_content)

    options.use_instance_tunnel = False
    result = magic_class.execute('select * from %s' % test_table_name)
    self.assertListEqual(self._get_result(result), test_content)

    options.use_instance_tunnel = True
    result = magic_class.execute('select * from %s' % test_table_name)
    self.assertListEqual(self._get_result(result), test_content)

    result = magic_class.execute('show tables')
    self.assertTrue(len(result) > 0)

    table_name = tn('pyodps_test_magics_create_table_result')
    magic_class.execute('create table %s (col string) lifecycle 1' % table_name)
    magic_class.execute('drop table %s' % table_name)

def test_plenty_create(self):
    del_insts = [
        self.odps.run_sql('drop table {0}'.format(tn('tmp_pyodps_create_temp_%d' % n)))
        for n in range(10)
    ]
    for inst in del_insts:
        inst.wait_for_completion()

    script = PLENTY_CREATE_CODE.format(odps_info=self._get_odps_json(),
                                       import_paths=json.dumps(sys.path))
    script_name = tempfile.gettempdir() + os.sep + 'tmp_' + str(os.getpid()) + '_plenty_script.py'
    # the with-block closes the file; no explicit close() is needed
    with open(script_name, 'w') as script_file:
        script_file.write(script)

    env = copy.deepcopy(os.environ)
    env.update({'WAIT_CLEANUP': '1'})
    subprocess.call([sys.executable, script_name], close_fds=True, env=env)

    sleep(5)
    trial = 4
    case = lambda: all(not self.odps.exist_table(tn('tmp_pyodps_create_temp_%d' % tid))
                       for tid in range(10))
    while not case():
        trial -= 1
        sleep(5)
        if trial == 0:
            assert case()

def testCreateMarsCluster(self):
    import pandas as pd

    mars_source_table_name = tn('mars_datasource')
    mars_des_table_name = tn('mars_datastore')
    self._create_table(mars_source_table_name)
    self.odps.delete_table(mars_des_table_name, if_exists=True)
    data = self._gen_data()
    self.odps.write_table(mars_source_table_name, data)

    client = self.odps.create_mars_cluster(1, 4, 8, name=str(uuid.uuid4()))
    try:
        self.assertFalse(client._with_notebook)

        df = self.odps.to_mars_dataframe(
            mars_source_table_name, runtime_endpoint=self.odps.endpoint)
        df_head = df.head(2)
        self.odps.persist_mars_dataframe(
            df_head, mars_des_table_name, unknown_as_string=True,
            runtime_endpoint=self.odps.endpoint)

        des = self.odps.to_mars_dataframe(
            mars_des_table_name, runtime_endpoint=self.odps.endpoint)
        expected = self.odps.get_table(mars_source_table_name).to_df().to_pandas()
        result = des.to_pandas()
        pd.testing.assert_frame_equal(expected.head(2), result)

        self.odps.delete_table(mars_source_table_name)
        self.odps.delete_table(mars_des_table_name)
    finally:
        client.stop_server()

def testCachePersist(self):
    expr = self.odps_df

    data2 = [["name1", 3.2], ["name3", 2.4]]

    table_name = tn("pyodps_test_mixed_engine_cp_table2")
    self.odps.delete_table(table_name, if_exists=True)
    table2 = self.odps.create_table(
        name=table_name,
        schema=Schema.from_lists(["name", "fid"], ["string", "double"])
    )
    expr2 = DataFrame(table2)
    self.odps.write_table(table2, 0, data2)

    @output(expr.schema.names, expr.schema.types)
    def h(row):
        yield row

    l = expr.filter(expr.id > 0).apply(h, axis=1).cache()
    r = expr2.filter(expr2.fid > 0)
    joined = l.join(r, on=["name", r.fid < 4])["id", "fid"].cache()

    output_table = tn("pyodps_test_mixed_engine_cp_output_table")
    self.odps.delete_table(output_table, if_exists=True)
    schema = Schema.from_lists(["id", "fid"], ["bigint", "double"], ["ds"], ["string"])
    output_t = self.odps.create_table(output_table, schema, if_not_exists=True)

    t = joined.persist(output_table, partition="ds=today", create_partition=True)
    self.assertEqual(len(t.execute()), 2)

    output_t.drop()

def testViewTable(self):
    import pandas as pd

    mars_source_table_name = tn('mars_view_datasource')
    self.odps.delete_table(mars_source_table_name, if_exists=True)
    self.odps.create_table(mars_source_table_name, schema='col1 int, col2 string')
    self.odps.write_table(mars_source_table_name, [[1, 'test1'], [2, 'test2']])

    mars_view_table_name = tn('mars_view_table')
    self.odps.execute_sql('DROP VIEW IF EXISTS {}'.format(mars_view_table_name))
    sql = 'create view {} (view_col1, view_col2) as select * from {}'.format(
        mars_view_table_name, mars_source_table_name)
    self.odps.execute_sql(sql)

    client = self.odps.create_mars_cluster(1, 4, 8, name=str(uuid.uuid4()))
    try:
        df = self.odps.to_mars_dataframe(
            mars_view_table_name, runtime_endpoint=self.odps.endpoint)
        result = df.execute().to_pandas()
        expected = pd.DataFrame({
            'view_col1': [1, 2],
            'view_col2': ['test1', 'test2'],
        })
        pd.testing.assert_frame_equal(result, expected)
    finally:
        client.stop_server()

def testCreateDeleteUpdateFunction(self):
    test_resource_name = tn('pyodps_t_tmp_test_function_resource') + '.py'
    # renamed from test_function_name2: this variable holds a resource name, not a function name
    test_resource_name2 = tn('pyodps_t_tmp_test_function_resource2') + '.py'
    test_function_name = tn('pyodps_t_tmp_test_function')

    try:
        self.odps.delete_resource(test_resource_name)
    except errors.NoSuchObject:
        pass
    try:
        self.odps.delete_function(test_function_name)
    except errors.NoSuchObject:
        pass
    try:
        self.odps.delete_resource(test_resource_name2)
    except errors.NoSuchObject:
        pass

    test_resource = self.odps.create_resource(
        test_resource_name, 'py', file_obj=FILE_CONTENT)
    test_function = self.odps.create_function(
        test_function_name,
        class_type=test_resource_name.split('.', 1)[0] + '.MyPlus',
        resources=[test_resource, ])

    self.assertIsNotNone(test_function.name)
    self.assertIsNotNone(test_function.owner)
    self.assertIsNotNone(test_function.creation_time)
    self.assertIsNotNone(test_function.class_type)
    self.assertEqual(len(test_function.resources), 1)

    with self.odps.open_resource(name=test_resource_name, mode='r') as fp:
        self.assertEqual(to_str(fp.read()), to_str(FILE_CONTENT))

    secondary_user = self.config.get('test', 'secondary_user')
    self.assertNotEqual(test_function.owner, secondary_user)

    test_resource2 = self.odps.create_resource(
        test_resource_name2, 'file', file_obj='Hello World')
    test_function.resources.append(test_resource2)
    test_function.owner = secondary_user
    test_function.update()

    test_function_id = id(test_function)
    del test_function.project.functions[test_function.name]

    test_function = self.odps.get_function(test_function_name)
    self.assertNotEqual(test_function_id, id(test_function))
    self.assertEqual(len(test_function.resources), 2)
    self.assertEqual(test_function.owner, secondary_user)

    test_resource.drop()
    test_resource2.drop()
    test_function.drop()

def testListInstancesInPage(self):
    test_table = tn('pyodps_t_tmp_list_instances_in_page')

    delay_udf = textwrap.dedent("""
    from odps.udf import annotate
    import sys
    import time

    @annotate("bigint->bigint")
    class Delayer(object):
        def evaluate(self, arg0):
            print('Start Logging')
            sys.stdout.flush()
            time.sleep(45)
            print('End Logging')
            sys.stdout.flush()
            return arg0
    """)
    resource_name = tn('test_delayer_function_resource')
    function_name = tn('test_delayer_function')

    if self.odps.exist_resource(resource_name + '.py'):
        self.odps.delete_resource(resource_name + '.py')
    res = self.odps.create_resource(resource_name + '.py', 'py', file_obj=delay_udf)

    if self.odps.exist_function(function_name):
        self.odps.delete_function(function_name)
    fun = self.odps.create_function(
        function_name, class_type=resource_name + '.Delayer', resources=[res, ])

    data = [[random.randint(0, 1000)] for _ in compat.irange(100)]
    self.odps.delete_table(test_table, if_exists=True)
    t = self.odps.create_table(test_table, Schema.from_lists(['num'], ['bigint']))
    self.odps.write_table(t, data)

    instance = self.odps.run_sql(
        "select sum({0}(num)), 1 + '1' as warn_col from {1} group by num".format(
            function_name, test_table))

    try:
        self.assertEqual(instance.status, Instance.Status.RUNNING)
        self.assertIn(instance.id, [
            it.id for it in self.odps.get_project().instances.iterate(
                status=Instance.Status.RUNNING,
                from_time=datetime.now() - timedelta(days=2),
                end_time=datetime.now() + timedelta(days=1),
                max_items=20)
        ])

        self.waitContainerFilled(lambda: instance.tasks)
        task = instance.tasks[0]
        task.put_info('testInfo', 'TestInfo')
        self.assertIsNotNone(task.warnings)

        self.waitContainerFilled(lambda: task.workers, 30)
        self.assertIsNotNone(task.workers[0].get_log('stdout'))
    finally:
        try:
            instance.stop()
        except:
            pass
        res.drop()
        fun.drop()
        t.drop()

def testTableResource(self):
    test_table_name = tn('pyodps_t_tmp_resource_table')
    schema = Schema.from_lists(['id', 'name'], ['string', 'string'])
    self.odps.delete_table(test_table_name, if_exists=True)
    self.odps.create_table(test_table_name, schema)

    resource_name = tn('pyodps_t_tmp_table_resource')
    try:
        self.odps.delete_resource(resource_name)
    except errors.NoSuchObject:
        pass
    res = self.odps.create_resource(resource_name, 'table', table_name=test_table_name)
    self.assertIsInstance(res, TableResource)
    self.assertEqual(res.get_source_table().name, test_table_name)
    self.assertIsNone(res.get_source_table_partition())
    self.assertIs(res, self.odps.get_resource(resource_name))

    del res.parent[resource_name]  # delete from cache
    self.assertIsNot(res, self.odps.get_resource(resource_name))

    res = self.odps.get_resource(resource_name)
    self.assertIsInstance(res, TableResource)
    self.assertEqual(res.get_source_table().name, test_table_name)
    self.assertIsNone(res.get_source_table_partition())

    test_table_partition = 'pt=test,sec=1'
    schema = Schema.from_lists(['id', 'name'], ['string', 'string'],
                               ['pt', 'sec'], ['string', 'bigint'])
    self.odps.delete_table(test_table_name, if_exists=True)
    table = self.odps.create_table(test_table_name, schema)
    table.create_partition(test_table_partition)

    res = res.update(partition=test_table_partition)
    self.assertIsInstance(res, TableResource)
    self.assertEqual(res.get_source_table().name, test_table_name)
    self.assertEqual(str(res.get_source_table_partition()),
                     str(types.PartitionSpec(test_table_partition)))
    self.assertIs(res, self.odps.get_resource(resource_name))

    test_table_partition = 'pt=test,sec=2'
    table.create_partition(test_table_partition)
    res = res.update(partition=test_table_partition)
    self.assertIsInstance(res, TableResource)
    self.assertEqual(res.get_source_table().name, test_table_name)
    self.assertEqual(str(res.get_source_table_partition()),
                     str(types.PartitionSpec(test_table_partition)))
    self.assertIs(res, self.odps.get_resource(resource_name))

    self.odps.delete_resource(resource_name)
    self.odps.delete_table(test_table_name)

def testHorzConcat(self):
    options.ml.dry_run = False

    table_name = tn('test_horz_concat_table2_xxx_yyy')
    self.odps.delete_table(table_name, if_exists=True)

    result_table_name = tn('test_horz_concat_result')
    self.odps.delete_table(result_table_name, if_exists=True)

    self.odps_df[self.odps_df.name, (self.odps_df.id * 2).rename('ren_id')].persist(table_name)
    df2 = self.odps.get_table(table_name).to_df()
    df2 = df2[:3]

    expr = self.odps_df.concat(df2.ren_id, axis=1)
    expr.persist(result_table_name, lifecycle=1)

def testBearerTokenAccount(self):
    self.odps.delete_table(tn('test_bearer_token_account_table'), if_exists=True)
    t = self.odps.create_table(tn('test_bearer_token_account_table'), 'col string', lifecycle=1)
    with t.open_writer() as writer:
        records = [['val1'], ['val2'], ['val3']]
        writer.write(records)

    inst = self.odps.execute_sql(
        'select count(*) from {0}'.format(tn('test_bearer_token_account_table')), async_=True)
    inst.wait_for_success()
    task_name = inst.get_task_names()[0]

    logview_address = inst.get_logview_address()
    token = logview_address[logview_address.find('token=') + len('token='):]
    bearer_token_account = BearerTokenAccount(token=token)
    bearer_token_odps = ODPS(None, None, self.odps.project, self.odps.endpoint,
                             account=bearer_token_account)
    bearer_token_instance = bearer_token_odps.get_instance(inst.id)

    self.assertEqual(inst.get_task_result(task_name),
                     bearer_token_instance.get_task_result(task_name))
    self.assertEqual(inst.get_task_summary(task_name),
                     bearer_token_instance.get_task_summary(task_name))

    with self.assertRaises(errors.NoPermission):
        bearer_token_odps.create_table(
            tn('test_bearer_token_account_table_test1'), 'col string', lifecycle=1)

    fake_token_account = BearerTokenAccount(token='fake-token')
    bearer_token_odps = ODPS(None, None, self.odps.project, self.odps.endpoint,
                             account=fake_token_account)
    with self.assertRaises(errors.ODPSError):
        bearer_token_odps.create_table(
            tn('test_bearer_token_account_table_test2'), 'col string', lifecycle=1)

def testSubPartitions(self):
    test_table_name = tn('pyodps_t_tmp_sub_partitions_table')

    root_partition = 'type=test'
    sub_partitions = ['s=%s' % i for i in range(3)]
    schema = Schema.from_lists(['id', ], ['string', ],
                               ['type', 's'], ['string', 'string'])

    self.odps.delete_table(test_table_name, if_exists=True)
    table = self.odps.create_table(test_table_name, schema)

    partitions = [root_partition + ',' + p for p in sub_partitions]
    partitions.append('type=test2,s=0')
    for partition in partitions:
        table.create_partition(partition)

    self.assertEqual(
        sorted([str(types.PartitionSpec(p)) for p in partitions]),
        sorted([str(p.partition_spec) for p in table.partitions]))

    self.assertEqual(len(list(table.iterate_partitions(root_partition))), 3)

    table.delete_partition(partitions[0])
    self.assertEqual(
        sorted([str(types.PartitionSpec(p)) for p in partitions[1:]]),
        sorted([str(p.partition_spec) for p in table.partitions]))

    self.odps.delete_table(test_table_name)

def testArrowTunnelMultipleParts(self):
    import pandas as pd

    mars_source_table_name = tn('mars_arrow_tunnel_datasource_mpart')
    self.odps.delete_table(mars_source_table_name, if_exists=True)
    table = self.odps.create_table(
        mars_source_table_name,
        schema=('col1 int, col2 string', 'pt string'), lifecycle=1)
    for pid in range(5):
        pt = table.create_partition('pt=test_part%d' % pid)
        with pt.open_writer() as writer:
            writer.write([[1 + pid * 2, 'test1'], [2 + pid * 2, 'test2']])

    r = self.odps.to_mars_dataframe(
        mars_source_table_name, append_partitions=True,
        add_offset=True).execute().to_pandas()
    expected = table.to_df().to_pandas()
    pd.testing.assert_frame_equal(r, expected)

    r = self.odps.to_mars_dataframe(
        mars_source_table_name, partition='pt>test_part1',
        append_partitions=True, add_offset=True).execute().to_pandas()
    expected = table.to_df().to_pandas().query('pt>"test_part1"').reset_index(drop=True)
    pd.testing.assert_frame_equal(r, expected)

def testReadMapArraySQLInstance(self):
    test_table = tn('pyodps_t_tmp_read_map_array_sql_instance')
    self.odps.delete_table(test_table, if_exists=True)
    table = self.odps.create_table(
        test_table,
        schema=Schema.from_lists(
            ['idx', 'map_col', 'array_col'],
            ['bigint', odps_types.Map(odps_types.string, odps_types.string),
             odps_types.Array(odps_types.string)],
        )
    )
    data = [
        [0, {'key1': 'value1', 'key2': 'value2'}, ['item1', 'item2', 'item3']],
        [1, {'key3': 'value3', 'key4': 'value4'}, ['item4', 'item5']],
    ]
    self.odps.write_table(test_table, data)

    with self.odps.execute_sql(
            'select * from %s' % test_table).open_reader(table.schema) as reader:
        read_data = [list(r.values) for r in reader]
        read_data = sorted(read_data, key=lambda r: r[0])
        expected_data = sorted(data, key=lambda r: r[0])
        self.assertSequenceEqual(read_data, expected_data)

    table.drop()

def testPartitions(self):
    test_table_name = tn('pyodps_t_tmp_partitions_table')
    partitions = ['s=%s' % i for i in range(3)]
    schema = Schema.from_lists(['id', ], ['string', ], ['s', ], ['string', ])

    self.odps.delete_table(test_table_name, if_exists=True)
    table = self.odps.create_table(test_table_name, schema)

    for partition in partitions:
        table.create_partition(partition)

    self.assertEqual(
        sorted([str(types.PartitionSpec(p)) for p in partitions]),
        sorted([str(p.partition_spec) for p in table.partitions]))

    table.get_partition(partitions[0]).drop()
    self.assertEqual(
        sorted([str(types.PartitionSpec(p)) for p in partitions[1:]]),
        sorted([str(p.partition_spec) for p in table.partitions]))

    p = next(table.partitions)
    self.assertGreater(len(p.columns), 0)
    p.reload()
    self.assertGreater(len(p.columns), 0)

    self.odps.delete_table(test_table_name)

def testReadBinarySQLInstance(self):
    try:
        options.tunnel.string_as_binary = True
        test_table = tn('pyodps_t_tmp_read_binary_sql_instance')
        self.odps.delete_table(test_table, if_exists=True)
        table = self.odps.create_table(
            test_table,
            schema=Schema.from_lists(['size', 'name'], ['bigint', 'string']),
            if_not_exists=True)

        data = [
            [1, u'中'.encode('utf-8') + b'\\\\n\\\n' + u'文'.encode('utf-8') + b' ,\r\xe9'],
            [2, u'测试'.encode('utf-8') + b'\x00\x01\x02' + u'数据'.encode('utf-8') + b'\xe9'],
        ]
        self.odps.write_table(table, 0, [table.new_record(it) for it in data])

        with self.odps.execute_sql(
                'select name from %s' % test_table).open_reader(tunnel=False) as reader:
            read_data = sorted([r[0] for r in reader])
            expected_data = sorted([r[1] for r in data])
            self.assertSequenceEqual(read_data, expected_data)

        table.drop()
    finally:
        options.tunnel.string_as_binary = False

def testCreateInstance(self):
    test_table = tn('pyodps_t_tmp_create_instance')

    task = SQLTask(query='drop table if exists %s' % test_table)
    instance = self.odps._project.instances.create(task=task)
    instance.wait_for_completion()
    self.assertTrue(instance.is_successful())
    self.assertFalse(self.odps.exist_table(test_table))

    task = SQLTask(query='create table %s(id string);' % test_table)
    instance = self.odps._project.instances.create(task=task)
    instance.wait_for_completion()
    self.assertTrue(instance.is_successful())
    self.assertTrue(self.odps.exist_table(test_table))

    instance = self.odps.execute_sql('drop table %s' % test_table)
    self.assertTrue(instance.is_successful())
    self.assertFalse(self.odps.exist_table(test_table))

    tasks = instance.get_tasks()
    self.assertTrue(any(map(lambda task: isinstance(task, SQLTask), tasks)))

    for name in instance.get_task_names():
        self.assertIsNotNone(instance.get_task_detail(name))
        self.assertIsNotNone(instance.get_task_detail2(name))

    # test stop
    self.assertRaises(errors.InvalidStateSetting, instance.stop)

def testRecordReadWriteTable(self):
    test_table_name = tn('pyodps_t_tmp_read_write_table')
    schema = Schema.from_lists(['id', 'name', 'right'], ['bigint', 'string', 'boolean'])

    self.odps.delete_table(test_table_name, if_exists=True)
    self.assertFalse(self.odps.exist_table(test_table_name))

    table = self.odps.create_table(test_table_name, schema)
    data = [[111, 'aaa', True],
            [222, 'bbb', False],
            [333, 'ccc', True],
            [5940813139082772990, '中文', False]]
    length = len(data)
    records = [Record(schema=schema, values=values) for values in data]

    texted_data = [[it[0], to_str(it[1]), it[2]] for it in data]

    self.odps.write_table(table, 0, records)
    self.assertSequenceEqual(
        texted_data,
        [record.values for record in self.odps.read_table(table, length)])
    self.assertSequenceEqual(
        texted_data[::2],
        [record.values for record in self.odps.read_table(table, length, step=2)])

    self.assertSequenceEqual(
        texted_data, [record.values for record in table.head(length)])

    table.truncate()
    self.assertEqual([], list(self.odps.read_table(table)))

    self.odps.delete_table(test_table_name)
    self.assertFalse(self.odps.exist_table(test_table_name))

def testReadWriteTable(self):
    test_table_name = tn('pyodps_t_tmp_read_write_table')
    schema = Schema.from_lists(['id', 'name', 'right'], ['bigint', 'string', 'boolean'])

    self.odps.delete_table(test_table_name, if_exists=True)
    self.assertFalse(self.odps.exist_table(test_table_name))

    table = self.odps.create_table(test_table_name, schema)
    data = [[111, 'aaa', True],
            [222, 'bbb', False],
            [333, 'ccc', True],
            [444, '中文', False]]
    length = len(data)
    records = [Record(schema=schema, values=values) for values in data]

    texted_data = [[it[0], to_str(it[1]), it[2]] for it in data]

    self.odps.write_table(table, 0, records)
    self.assertSequenceEqual(
        texted_data,
        [record.values for record in self.odps.read_table(table, length)])
    self.assertSequenceEqual(
        texted_data[::2],
        [record.values for record in self.odps.read_table(table, length, step=2)])

    self.assertSequenceEqual(
        texted_data, [record.values for record in table.head(length)])

    self.odps.delete_table(test_table_name)
    self.assertFalse(self.odps.exist_table(test_table_name))

def testCreateDeleteTable(self):
    test_table_name = tn("pyodps_t_tmp_create_table")
    schema = Schema.from_lists(["id", "name"], ["bigint", "string"], ["ds"], ["string"])

    tables = self.odps._project.tables

    tables.delete(test_table_name, if_exists=True)
    self.assertFalse(self.odps.exist_table(test_table_name))

    table = tables.create(test_table_name, schema, lifecycle=10)
    self.assertIsNone(table._getattr("owner"))
    self.assertIsNotNone(table.owner)

    self.assertEqual(table.name, test_table_name)
    self.assertEqual(table.schema, schema)
    self.assertEqual(table.lifecycle, 10)

    tables.delete(test_table_name, if_exists=True)
    self.assertFalse(self.odps.exist_table(test_table_name))

    table = self.odps.create_table(test_table_name, schema, shard_num=10, hub_lifecycle=5)
    self.assertEqual(table.name, test_table_name)
    self.assertEqual(table.schema, schema)
    self.assertNotEqual(table.lifecycle, 10)
    self.assertEqual(table.shard.shard_num, 10)

    self.odps.delete_table(test_table_name, if_exists=True)
    self.assertFalse(self.odps.exist_table(test_table_name))

def testPersistExecute(self):
    delay = Delay()
    filtered = self.df[self.df.id > 0].cache()

    persist_table_name = tn('pyodps_test_delay_persist')
    schema = Schema.from_lists(['id', 'name', 'value'],
                               ['bigint', 'string', 'bigint'],
                               ['pt', 'ds'], ['string', 'string'])
    self.odps.delete_table(persist_table_name, if_exists=True)
    self.odps.create_table(persist_table_name, schema)

    future1 = filtered[filtered.value > 2].persist(
        persist_table_name, partition='pt=a,ds=d1', delay=delay)
    future2 = filtered[filtered.value < 2].persist(
        persist_table_name, partition='pt=a,ds=d2', delay=delay)
    delay.execute()
    df1 = future1.result()
    df2 = future2.result()

    self.assertEqual([c.lhs.name for c in df1.predicate.children()], ['pt', 'ds'])
    result1 = self._get_result(df1.execute())
    self.assertEqual([r[:-2] for r in result1], [d for d in self.data if d[2] > 2])

    self.assertEqual([c.lhs.name for c in df2.predicate.children()], ['pt', 'ds'])
    result2 = self._get_result(df2.execute())
    self.assertEqual([r[:-2] for r in result2], [d for d in self.data if d[2] < 2])

def testArrowTunnel(self):
    import pandas as pd
    import numpy as np
    import mars.dataframe as md

    mars_des_table_name = tn('mars_arrow_tunnel_datastore')
    self.odps.delete_table(mars_des_table_name, if_exists=True)

    data = pd.DataFrame({
        'col1': np.random.rand(1000, ),
        'col2': np.random.randint(0, 100, (1000, )),
        'col3': np.random.choice(['a', 'b', 'c'], size=(1000, )),
    })
    df = md.DataFrame(data, chunk_size=300)
    self.odps.persist_mars_dataframe(df, mars_des_table_name, unknown_as_string=True)

    expected = self.odps.get_table(mars_des_table_name).to_df().to_pandas()
    pd.testing.assert_frame_equal(
        expected.sort_values('col1').reset_index(drop=True),
        data.sort_values('col1').reset_index(drop=True))

    r = self.odps.to_mars_dataframe(mars_des_table_name, chunk_size=200).execute().to_pandas()
    expected = self.odps.get_table(mars_des_table_name).to_df().to_pandas()
    pd.testing.assert_frame_equal(r.reset_index(drop=True), expected.reset_index(drop=True))

def setup(self):
    import pandas as pd

    odps_data = [
        ['name1', 1],
        ['name2', 2],
        ['name1', 3],
    ]

    pd_data = [
        ['name1', 5],
        ['name2', 6],
    ]

    names = ['name', 'id']
    types = ['string', 'bigint']

    table = tn('pyodps_df_mixed')
    self.odps.delete_table(table, if_exists=True)
    self.t = self.odps.create_table(table, Schema.from_lists(names, types))
    with self.t.open_writer() as w:
        w.write([self.t.new_record(r) for r in odps_data])

    self.odps_df = DataFrame(self.t)
    self.pd_df = DataFrame(pd.DataFrame(pd_data, columns=names))

    self.engine = MixedEngine(self.odps)
    self.pd_engine = PandasEngine(self.odps)

def testPandasPersistODPS2(self):
    import pandas as pd
    import numpy as np

    data_int8 = np.random.randint(0, 10, (1, ), dtype=np.int8)
    data_int16 = np.random.randint(0, 10, (1, ), dtype=np.int16)
    data_int32 = np.random.randint(0, 10, (1, ), dtype=np.int32)
    data_int64 = np.random.randint(0, 10, (1, ), dtype=np.int64)
    data_float32 = np.random.random((1, )).astype(np.float32)
    data_float64 = np.random.random((1, )).astype(np.float64)

    df = DataFrame(pd.DataFrame(OrderedDict([
        ('data_int8', data_int8), ('data_int16', data_int16),
        ('data_int32', data_int32), ('data_int64', data_int64),
        ('data_float32', data_float32), ('data_float64', data_float64),
    ])))
    tmp_table_name = tn('pyodps_test_mixed_persist_odps2_types')

    self.odps.delete_table(tmp_table_name, if_exists=True)
    df.persist(tmp_table_name, lifecycle=1, drop_table=True, odps=self.odps)

    t = self.odps.get_table(tmp_table_name)
    expected_types = [odps_types.tinyint, odps_types.smallint,
                      odps_types.int_, odps_types.bigint,
                      odps_types.float_, odps_types.double]
    self.assertEqual(expected_types, t.schema.types)

def testArrayReadWriteTable(self):
    test_table_name = tn('pyodps_t_tmp_read_write_table')
    schema = Schema.from_lists(['id', 'name', 'right'], ['bigint', 'string', 'boolean'])

    self.odps.delete_table(test_table_name, if_exists=True)
    self.assertFalse(self.odps.exist_table(test_table_name))

    table = self.odps.create_table(test_table_name, schema)
    data = [[111, 'aaa', True],
            [222, 'bbb', False],
            [333, 'ccc', True],
            [444, '中文', False]]
    length = len(data)
    texted_data = [[it[0], to_str(it[1]), it[2]] for it in data]

    self.odps.write_table(table, 0, data)
    self.assertSequenceEqual(
        texted_data,
        [record.values for record in self.odps.read_table(table, length)])
    self.assertSequenceEqual(
        texted_data[::2],
        [record.values for record in self.odps.read_table(table, length, step=2)])

    self.assertSequenceEqual(
        texted_data, [record.values for record in table.head(length)])

    self.odps.delete_table(test_table_name)
    self.assertFalse(self.odps.exist_table(test_table_name))

def testFullPartitionedTable(self):
    import pandas as pd

    mars_source_table_name = tn('mars_cupid_datasource_mpart')
    self.odps.delete_table(mars_source_table_name, if_exists=True)
    table = self.odps.create_table(
        mars_source_table_name,
        schema=('col1 int, col2 string', 'pt1 string, pt2 string'), lifecycle=1)
    for pid in range(5):
        pt = table.create_partition('pt1=test_part%d,pt2=test_part%d' % (pid, pid))
        with pt.open_writer() as writer:
            writer.write([[1 + pid * 2, 'test1'], [2 + pid * 2, 'test2']])

    client = self.odps.create_mars_cluster(1, 4, 8, name=str(uuid.uuid4()))
    try:
        df = self.odps.to_mars_dataframe(
            mars_source_table_name, runtime_endpoint=self.odps.endpoint,
            append_partitions=True, add_offset=True)
        result = df.execute().to_pandas()
        expected = table.to_df().to_pandas()
        pd.testing.assert_frame_equal(result, expected)
    finally:
        client.stop_server()

def testSqlToDataFrame(self):
    import pandas as pd

    mars_source_table_name = tn('mars_sql_datasource')
    self._create_table(mars_source_table_name)
    data = self._gen_data()
    self.odps.write_table(mars_source_table_name, data)

    client = self.odps.create_mars_cluster(2, 4, 8, name=str(uuid.uuid4()))
    try:
        sql = 'select count(1) as count from {}'.format(mars_source_table_name)
        df = self.odps.sql_to_mars_dataframe(sql)
        r = df.execute().to_pandas()
        pd.testing.assert_frame_equal(r, pd.DataFrame([4], columns=['count']))

        sql = """
        SELECT
          t1.`id`,
          MAX(t1.`int_num`) AS `int_num_max`,
          MAX(t1.`float_num`) AS `float_num_max`
        FROM cupid_test_release.`{}` t1
        GROUP BY t1.`id`
        """.format(mars_source_table_name)
        df2 = self.odps.sql_to_mars_dataframe(sql)
        r2 = df2.execute().to_pandas()
        expected = self.odps.execute_sql(sql).open_reader().to_pandas()
        pd.testing.assert_frame_equal(r2, expected)
    finally:
        client.stop_server()

def setup(self):
    import pandas as pd

    odps_data = [
        ['name1', 1],
        ['name2', 2],
        ['name1', 3],
    ]

    pd_data = [['name1', 5], ['name2', 6]]

    names = ['name', 'id']
    types = ['string', 'bigint']

    table = tn('pyodps_df_mixed_%d' % os.getpid())
    if self.odps.exist_table(table):
        self.t = self.odps.get_table(table)
    else:
        self.t = self.odps.create_table(table, Schema.from_lists(names, types), lifecycle=1)
        # write fixture rows only when the table is newly created
        with self.t.open_writer() as w:
            w.write([self.t.new_record(r) for r in odps_data])

    self.odps_df = DataFrame(self.t)
    self.pd_df = DataFrame(pd.DataFrame(pd_data, columns=names))

    self.engine = MixedEngine(self.odps)
    self.pd_engine = PandasEngine(self.odps)

def testCreateTableWithChineseColumn(self):
    test_table_name = tn("pyodps_t_tmp_create_table_with_chinese_columns")
    schema = Schema.from_lists(["序列", "值"], ["bigint", "string"], ["ds"], ["string"])

    self.odps.delete_table(test_table_name, if_exists=True)

    table = self.odps.create_table(test_table_name, schema)
    self.assertSequenceEqual([col.name for col in table.schema.columns],
                             [col.name for col in schema.columns])

def setup(self):
    test_table_name = tn('pyodps_test_dataframe')
    schema = Schema.from_lists(['id', 'name'], ['bigint', 'string'])

    self.odps.delete_table(test_table_name, if_exists=True)
    self.table = self.odps.create_table(test_table_name, schema)

    with self.table.open_writer() as w:
        w.write([[1, 'name1'], [2, 'name2'], [3, 'name3']])

def testPivot(self):
    data = [["name1", 1, 1.0, True], ["name1", 2, 2.0, True],
            ["name2", 1, 3.0, False], ["name2", 3, 4.0, False]]

    table_name = tn("pyodps_test_mixed_engine_pivot")
    self.odps.delete_table(table_name, if_exists=True)
    table = self.odps.create_table(
        name=table_name,
        schema=Schema.from_lists(["name", "id", "fid", "ismale"],
                                 ["string", "bigint", "double", "boolean"]),
    )
    expr = DataFrame(table)
    try:
        self.odps.write_table(table, 0, data)

        expr1 = expr.pivot(rows="id", columns="name", values="fid").distinct()
        res = self.engine.execute(expr1)
        result = self._get_result(res)

        expected = [[1, 1.0, 3.0], [2, 2.0, None], [3, None, 4.0]]
        self.assertEqual(sorted(result), sorted(expected))

        expr2 = expr.pivot(rows="id", columns="name", values=["fid", "ismale"])
        res = self.engine.execute(expr2)
        result = self._get_result(res)

        expected = [[1, 1.0, 3.0, True, False],
                    [2, 2.0, None, True, None],
                    [3, None, 4.0, None, False]]
        self.assertEqual(sorted(result), sorted(expected))

        expr3 = expr.pivot(rows="id", columns="name", values="fid")["name3"]
        with self.assertRaises(ValueError) as cm:
            self.engine.execute(expr3)
        self.assertIn("name3", str(cm.exception))

        expr4 = expr.pivot(rows="id", columns="name", values="fid")["id", "name1"]
        res = self.engine.execute(expr4)
        result = self._get_result(res)

        expected = [[1, 1.0], [2, 2.0], [3, None]]
        self.assertEqual(sorted(result), sorted(expected))

        expr5 = expr.pivot(rows="id", columns="name", values="fid")
        expr5 = expr5[expr5, (expr5["name1"].astype("int") + 1).rename("new_name")]
        res = self.engine.execute(expr5)
        result = self._get_result(res)

        expected = [[1, 1.0, 3.0, 2.0],
                    [2, 2.0, None, 3.0],
                    [3, None, 4.0, None]]
        self.assertEqual(sorted(result), sorted(expected))

        expr6 = expr.pivot(rows="id", columns="name", values="fid")
        expr6 = expr6.join(self.odps_df, on="id")[expr6, "name"]
        res = self.engine.execute(expr6)
        result = self._get_result(res)

        expected = [[1, 1.0, 3.0, "name1"],
                    [2, 2.0, None, "name2"],
                    [3, None, 4.0, "name1"]]
        self.assertEqual(sorted(result), sorted(expected))
    finally:
        table.drop()

def testHeadAndTail(self):
    res = self.odps_df.head(2)
    self.assertEqual(len(res), 2)

    df = self.odps_df[self.odps_df["name"] == "name1"]
    res = df.head(1)
    self.assertEqual(len(res), 1)
    self.assertIsNotNone(df._cache_data)

    res = self.odps_df.tail(2)
    self.assertEqual(len(res), 2)
    self.assertTrue(all(it > 1 for it in res.values["id"]))

    self.assertEqual(len(self.odps_df.name.head(2)), 2)
    self.assertEqual(len(self.odps_df.name.tail(2)), 2)

    res = self.pd_df.head(1)
    self.assertEqual(len(res), 1)

    df = self.pd_df[self.pd_df["name"] == "name1"]
    res = df.head(1)
    self.assertEqual(len(res), 1)
    self.assertIsNotNone(df._cache_data)

    res = self.pd_df.tail(1)
    self.assertEqual(len(res), 1)
    self.assertEqual(res.values["id"][0], 6)

    self.assertEqual(len(self.pd_df.name.head(1)), 1)
    self.assertEqual(len(self.pd_df.name.tail(1)), 1)

    class TunnelOnlyODPSEngine(ODPSEngine):
        def execute(self, expr, **kw):
            expr = self._pre_process(expr)
            head = kw.get("head")
            return self._handle_cases(expr, head=head)

    engine = MixedEngine(self.odps)
    engine._odpssql_engine = TunnelOnlyODPSEngine(self.odps, global_optimize=False)

    res = engine.execute(self.odps_df["id"], head=3)
    self.assertIsNotNone(res)
    self.assertEqual(sum(res.values["id"]), 6)

    table_name = tn("pyodps_df_mixed2")
    self.odps.delete_table(table_name, if_exists=True)
    table = next(self.odps_df.data_source())
    table2 = self.odps.create_table(table_name, table.schema)

    try:
        res = DataFrame(table2).head(10)
        self.assertEqual(len(res), 0)
    finally:
        table2.drop()

def testBigintPartitionedCache(self):
    table = tn("pyodps_test_bigint_partitioned_cache")
    self.odps.delete_table(table, if_exists=True)
    expr = self.odps_df.persist(table, partitions=["id"])

    @output(["id", "name"], ["int", "string"])
    def handle(row):
        return row.id + 1, row.name

    expr = expr["tt" + expr.name, expr.id].cache()
    new_expr = expr.map_reduce(mapper=handle)

    res = self.engine.execute(new_expr)
    self.assertEqual(len(res), 3)

def testPandasPersist(self):
    import pandas as pd
    import numpy as np

    self.odps.to_global()

    tmp_table_name = tn('pyodps_test_mixed_persist')
    self.odps.delete_table(tmp_table_name, if_exists=True)

    pd_df = pd.DataFrame(np.arange(9).reshape(3, 3), columns=list('abc'))
    df = DataFrame(pd_df).persist(tmp_table_name)
    self.assertPandasEqual(df.to_pandas(), pd_df)

    self.odps.delete_table(tmp_table_name)

def testExtractKV(self):
    data = [
        ["name1", "k1=1,k2=3,k5=10", "1=5,3=7,2=1"],
        ["name1", "", "3=1,4=2"],
        ["name1", "k1=7.1,k7=8.2", "1=1,5=6"],
        ["name2", "k2=1.2,k3=1.5", None],
        ["name2", "k9=1.1,k2=1", "4=2"],
    ]

    table_name = tn("pyodps_test_mixed_engine_extract_kv")
    self.odps.delete_table(table_name, if_exists=True)
    table = self.odps.create_table(
        name=table_name,
        schema=Schema.from_lists(["name", "kv", "kv2"], ["string", "string", "string"])
    )
    expr = DataFrame(table)
    try:
        self.odps.write_table(table, 0, data)

        expr1 = expr.extract_kv(columns=["kv", "kv2"], kv_delim="=")
        res = self.engine.execute(expr1)
        result = self._get_result(res)

        expected_cols = [
            "name",
            "kv_k1", "kv_k2", "kv_k3", "kv_k5", "kv_k7", "kv_k9",
            "kv2_1", "kv2_2", "kv2_3", "kv2_4", "kv2_5",
        ]
        expected = [
            ["name1", 1.0, 3.0, None, 10.0, None, None, 5.0, 1.0, 7.0, None, None],
            ["name1", None, None, None, None, None, None, None, None, 1.0, 2.0, None],
            ["name1", 7.1, None, None, None, 8.2, None, 1.0, None, None, None, 6.0],
            ["name2", None, 1.2, 1.5, None, None, None, None, None, None, None, None],
            ["name2", None, 1.0, None, None, None, 1.1, None, None, None, 2.0, None],
        ]

        self.assertListEqual([c.name for c in res.columns], expected_cols)
        self.assertEqual(result, expected)
    finally:
        table.drop()

def testReadWritePartitionTable(self):
    test_table_name = tn('pyodps_t_tmp_read_write_partition_table')
    schema = Schema.from_lists(['id', 'name'], ['bigint', 'string'], ['pt'], ['string'])

    self.odps.delete_table(test_table_name, if_exists=True)
    self.assertFalse(self.odps.exist_table(test_table_name))

    table = self.odps.create_table(test_table_name, schema)

    pt1 = 'pt=20151122'
    pt2 = 'pt=20151123'
    table.create_partition(pt1)
    table.create_partition(pt2)

    with table.open_writer(pt1, commit=False) as writer:
        record = table.new_record()
        record[0] = 1
        record[1] = 'name1'
        writer.write(record)

        record = table.new_record()
        record[0] = 3
        record[1] = 'name3'
        writer.write(record)

    self.assertEqual(len(table._upload_ids), 1)
    upload_id = list(table._upload_ids.values())[0]
    with table.open_writer(pt1):
        self.assertEqual(len(table._upload_ids), 1)
        self.assertEqual(upload_id, list(table._upload_ids.values())[0])

    with table.open_writer(pt2) as writer:
        record = table.new_record()
        record[0] = 2
        record[1] = 'name2'
        writer.write(record)

    with table.open_reader(pt1) as reader:
        records = list(reader)
        self.assertEqual(len(records), 2)
        self.assertEqual(sum(r[0] for r in records), 4)

    with table.open_reader(pt2) as reader:
        records = list(reader)
        self.assertEqual(len(records), 1)
        self.assertEqual(sum(r[0] for r in records), 2)

    table.drop()

def testReadWritePartitionTable(self):
    test_table_name = tn("pyodps_t_tmp_read_write_partition_table")
    schema = Schema.from_lists(["id", "name"], ["bigint", "string"], ["pt"], ["string"])

    self.odps.delete_table(test_table_name, if_exists=True)
    self.assertFalse(self.odps.exist_table(test_table_name))

    table = self.odps.create_table(test_table_name, schema)
    table._upload_ids = dict()

    pt1 = "pt=20151122"
    pt2 = "pt=20151123"
    table.create_partition(pt1)
    table.create_partition(pt2)

    with table.open_reader(pt1) as reader:
        self.assertEqual(len(list(reader)), 0)

    with table.open_writer(pt1, commit=False) as writer:
        record = table.new_record([1, "name1"])
        writer.write(record)

        record = table.new_record()
        record[0] = 3
        record[1] = "name3"
        writer.write(record)

    self.assertEqual(len(table._upload_ids), 1)
    upload_id = list(table._upload_ids.values())[0]
    with table.open_writer(pt1):
        self.assertEqual(len(table._upload_ids), 1)
        self.assertEqual(upload_id, list(table._upload_ids.values())[0])

    with table.open_writer(pt2) as writer:
        writer.write([2, "name2"])

    with table.open_reader(pt1, reopen=True) as reader:
        records = list(reader)
        self.assertEqual(len(records), 2)
        self.assertEqual(sum(r[0] for r in records), 4)

    with table.open_reader(pt2, reopen=True) as reader:
        records = list(reader)
        self.assertEqual(len(records), 1)
        self.assertEqual(sum(r[0] for r in records), 2)

    table.drop()

def testMapReduceWithResource(self):
    pd_df2 = self.odps_df.to_pandas(wrap=True)

    @output(["name", "id"], ["string", "int"])
    def reducer(resources):
        d = dict()
        for r in resources[0]:
            if r.name in d:
                d[r.name] += r.id
            else:
                d[r.name] = r.id

        def inner(keys):
            def h(row, done):
                if row.name in d:
                    d[row.name] += row.id
                else:
                    d[row.name] = row.id

                if done:
                    yield row.name, d[row.name]
            return h
        return inner

    expr = pd_df2.map_reduce(reducer=reducer, reducer_resources=[self.pd_df], group="name")
    result = expr.execute()
    self.assertEqual(result.values["id"].sum(), 17)

    odps_df2 = self.pd_df.persist(tn("pyodps_df_mixed2"), odps=self.odps)
    try:
        expr = self.odps_df.map_reduce(reducer=reducer,
                                       reducer_resources=[odps_df2], group="name")
        result = expr.execute()
        self.assertEqual(result.values["id"].sum(), 17)

        expr = self.odps_df.map_reduce(reducer=reducer,
                                       reducer_resources=[self.pd_df], group="name")
        result = expr.execute()
        self.assertEqual(result.values["id"].sum(), 17)

        expr = pd_df2.map_reduce(reducer=reducer,
                                 reducer_resources=[odps_df2], group="name")
        result = expr.execute()
        self.assertEqual(result.values["id"].sum(), 17)
    finally:
        next(odps_df2.data_source()).drop()

def testReadNonAsciiSQLInstance(self):
    test_table = tn('pyodps_t_tmp_read_non_ascii_sql_instance')
    self.odps.delete_table(test_table, if_exists=True)
    table = self.odps.create_table(
        test_table,
        schema=Schema.from_lists(['size', 'name'], ['bigint', 'string']),
        if_not_exists=True)

    data = [[1, '中\\\\n\\\n文 ,\r '], [2, '测试\x00\x01\x02数据']]
    self.odps.write_table(table, 0, [table.new_record(it) for it in data])

    with self.odps.execute_sql('select name from %s' % test_table).open_reader() as reader:
        read_data = sorted([to_str(r[0]) for r in reader])
        expected_data = sorted([to_str(r[1]) for r in data])
        self.assertSequenceEqual(read_data, expected_data)

    table.drop()

def testToPandas(self):
    table_name = tn('pyodps_test_mixed_engine_to_pandas')
    self.odps.delete_table(table_name, if_exists=True)
    table2 = self.odps.create_table(
        name=table_name,
        schema=Schema.from_lists(
            ['col%s' % i for i in range(7)],
            ['bigint', 'double', 'string', 'datetime', 'boolean', 'decimal', 'datetime']))
    expr2 = DataFrame(table2)

    data2 = [
        [1234567, 3.14, 'test', datetime(2016, 6, 1), True, Decimal('3.14'), None],
    ]
    self.odps.write_table(table2, 0, data2)

    pd_df = expr2.to_pandas()
    # DataFrame.ix was removed in pandas 1.0; iloc fetches the same row by position
    self.assertSequenceEqual(data2[0], pd_df.iloc[0].tolist())

    wrapped_pd_df = expr2.to_pandas(wrap=True)
    self.assertSequenceEqual(data2[0], list(next(wrapped_pd_df.execute())))

def testReadSQLInstance(self):
    test_table = tn('pyodps_t_tmp_read_sql_instance')
    self.odps.delete_table(test_table, if_exists=True)
    table = self.odps.create_table(
        test_table,
        schema=Schema.from_lists(['size'], ['bigint']),
        if_not_exists=True)
    self.odps.write_table(table, 0, [table.new_record([1]), table.new_record([2])])
    self.odps.write_table(table, [table.new_record([3]), ])

    instance = self.odps.execute_sql('select * from %s' % test_table)
    with instance.open_reader(table.schema) as reader:
        self.assertEqual(len(list(reader[::2])), 2)
    with instance.open_reader(table.schema) as reader:
        self.assertEqual(len(list(reader[1::2])), 1)

    hints = {'odps.sql.mapper.split.size': 16}
    instance = self.odps.run_sql('select sum(size) as count from %s' % test_table, hints=hints)

    while len(instance.get_task_names()) == 0 or \
            compat.lvalues(instance.get_task_statuses())[0].status == Instance.Task.TaskStatus.WAITING:
        continue

    while True:
        progress = instance.get_task_progress(instance.get_task_names()[0])
        if len(progress.stages) == 0:
            continue
        self.assertGreater(len(progress.get_stage_progress_formatted_string().split()), 2)
        break

    instance.wait_for_success()

    with instance.open_reader(Schema.from_lists(['count'], ['bigint'])) as reader:
        records = list(reader)
        self.assertEqual(len(records), 1)
        self.assertEqual(records[0]['count'], 6)

    with instance.open_reader() as reader:
        records = list(reader)
        self.assertEqual(len(records), 1)
        self.assertEqual(records[0]['count'], '6')

    table.drop()

def testBloomFilter(self):
    import numpy as np

    data2 = [["name1"], ["name3"]]

    table_name = tn("pyodps_test_mixed_engine_bf_table2")
    self.odps.delete_table(table_name, if_exists=True)
    table2 = self.odps.create_table(
        name=table_name,
        schema=Schema.from_lists(["name"], ["string"]))
    expr2 = DataFrame(table2)

    self.odps.write_table(table2, 0, data2)

    try:
        expr = self.odps_df.bloom_filter("name", expr2[:1].name, capacity=10)
        res = self.engine.execute(expr)
        self.assertTrue(np.all(res["name"] != "name2"))
    finally:
        table2.drop()

def testSimpleArrayReadWriteTable(self):
    test_table_name = tn("pyodps_t_tmp_simpe_read_write_table")
    schema = Schema.from_lists(["num"], ["string"], ["pt"], ["string"])

    self.odps.delete_table(test_table_name, if_exists=True)
    table = self.odps.create_table(test_table_name, schema)
    partition = "pt=20151122"
    table.create_partition(partition)

    with table.open_writer(partition) as writer:
        writer.write(["1"])

    with table.open_reader(partition) as reader:
        self.assertEqual(reader.count, 1)
        record = next(reader)
        self.assertEqual(record[0], "1")
        self.assertEqual(record.num, "1")

    table.drop()