Ejemplo n.º 1
0
    def testTokenizedSignServerAccount(self):
        """Exercise ``SignServerAccount`` against a token-protected ``SignServer``.

        A local ``SignServer`` is started with a random token and the test
        account's credentials. An account built without the token is expected
        to raise ``SignServerError``; an account built with the server's token
        is then used to delete, create, check and drop a table. The server is
        stopped in all cases.
        """
        table_name = tn('test_sign_account_table')

        server = SignServer(token=str(uuid.uuid4()))
        server.accounts[
            self.odps.account.access_id] = self.odps.account.secret_access_key
        try:
            server.start(('127.0.0.1', 0))

            # Account without the token: the signing request must be refused.
            account = SignServerAccount(self.odps.account.access_id,
                                        server.server.server_address)
            odps = ODPS(None,
                        None,
                        self.odps.project,
                        self.odps.endpoint,
                        account=account)
            with self.assertRaises(SignServerError):
                odps.delete_table(table_name, if_exists=True)

            # Account carrying the server's token: normal operations work.
            account = SignServerAccount(self.odps.account.access_id,
                                        server.server.server_address,
                                        token=server.token)
            odps = ODPS(None,
                        None,
                        self.odps.project,
                        self.odps.endpoint,
                        account=account)
            odps.delete_table(table_name, if_exists=True)
            t = odps.create_table(table_name, 'col string', lifecycle=1)
            self.assertTrue(odps.exist_table(table_name))
            # ``async`` is a reserved word since Python 3.7, so passing it as
            # a keyword is a SyntaxError; ``async_`` is the spelling used
            # elsewhere in these tests.
            t.drop(async_=True)
        finally:
            server.stop()
    def testPandasPersist(self):
        """Persist a pandas-backed DataFrame to ODPS tables and read it back.

        First persists into the ``ds=today`` partition of a pre-created table
        with an explicit ``odps`` entry, then calls ``self.odps.to_global()``
        and persists into a second table without passing an entry. Both
        tables are deleted afterwards.
        """
        import numpy as np
        import pandas as pd

        columns = ['a', 'b', 'c']

        # Case 1: partitioned target table, explicit ODPS entry.
        partitioned_name = tn('pyodps_test_mixed_persist')
        self.odps.delete_table(partitioned_name, if_exists=True)
        partitioned_table = self.odps.create_table(
            partitioned_name, ('a bigint, b bigint, c bigint', 'ds string'))
        partitioned_table.create_partition('ds=today')
        try:
            local_frame = pd.DataFrame(np.arange(9).reshape(3, 3),
                                       columns=list('abc'))
            persisted = DataFrame(local_frame).persist(partitioned_name,
                                                       partition='ds=today',
                                                       odps=self.odps)
            round_trip = persisted[columns].to_pandas()
            self.assertPandasEqual(round_trip, local_frame)
        finally:
            self.odps.delete_table(partitioned_name)

        self.odps.to_global()

        # Case 2: no entry passed to persist().
        plain_name = tn('pyodps_test_mixed_persist2')
        self.odps.delete_table(plain_name, if_exists=True)
        try:
            local_frame = pd.DataFrame(np.arange(9).reshape(3, 3),
                                       columns=list('abc'))
            persisted = DataFrame(local_frame).persist(plain_name)
            self.assertPandasEqual(persisted.to_pandas(), local_frame)
        finally:
            self.odps.delete_table(plain_name)
    def testVolumeArchiveResource(self):
        """Create a ``volumearchive`` resource from a zip file stored in a volume.

        Builds an in-memory zip with two members, writes it into a partition
        of a freshly created parted volume, registers it as a resource and
        checks the fetched resource's class, type and volume path before
        deleting the resource.
        """
        volume_name = tn('pyodps_t_tmp_resource_archive_volume')
        resource_name = tn('pyodps_t_tmp_volume_archive_resource') + '.zip'
        partition_name = 'test_partition'
        file_name = 'test_file.zip'

        # Best-effort removal of anything left by a previous run.
        leftovers = (
            (self.odps.delete_volume, volume_name),
            (self.odps.delete_resource, resource_name),
        )
        for remove, target in leftovers:
            try:
                remove(target)
            except errors.ODPSError:
                pass

        buffer = six.BytesIO()
        archive = zipfile.ZipFile(buffer, 'a', zipfile.ZIP_DEFLATED, False)
        members = (
            ('file1.txt', FILE_CONTENT),
            ('file2.txt', OVERWRITE_FILE_CONTENT),
        )
        for member_name, payload in members:
            archive.writestr(member_name, payload)
        archive.close()

        self.odps.create_parted_volume(volume_name)
        with self.odps.open_volume_writer(volume_name, partition_name) as writer:
            writer.write(file_name, buffer.getvalue())

        partition = self.odps.get_volume_partition(volume_name, partition_name)
        volume_file = partition.files[file_name]
        self.odps.create_resource(resource_name, 'volumearchive', volume_file=volume_file)

        fetched = self.odps.get_resource(resource_name)
        self.assertIsInstance(fetched, VolumeArchiveResource)
        self.assertEqual(fetched.type, Resource.Type.VOLUMEARCHIVE)
        self.assertEqual(fetched.volume_path, volume_file.path)
        self.odps.delete_resource(resource_name)
Ejemplo n.º 4
0
    def testVolumeArchiveResource(self):
        """Check that a zip file in a volume can back a ``volumearchive`` resource.

        The zip is assembled in memory, uploaded to a partition of a new
        parted volume and registered as a resource. The resource is then
        fetched and its class, type and ``volume_path`` are verified.
        """
        vol = tn('pyodps_t_tmp_resource_archive_volume')
        res_name = tn('pyodps_t_tmp_volume_archive_resource') + '.zip'
        part = 'test_partition'
        zip_name = 'test_file.zip'

        # Ignore ODPS errors here: the objects may simply not exist yet.
        try:
            self.odps.delete_volume(vol)
        except errors.ODPSError:
            pass
        try:
            self.odps.delete_resource(res_name)
        except errors.ODPSError:
            pass

        zip_buffer = six.BytesIO()
        zip_writer = zipfile.ZipFile(zip_buffer, 'a', zipfile.ZIP_DEFLATED,
                                     False)
        zip_writer.writestr('file1.txt', FILE_CONTENT)
        zip_writer.writestr('file2.txt', OVERWRITE_FILE_CONTENT)
        zip_writer.close()

        self.odps.create_parted_volume(vol)
        with self.odps.open_volume_writer(vol, part) as volume_writer:
            zip_bytes = zip_buffer.getvalue()
            volume_writer.write(zip_name, zip_bytes)

        uploaded = self.odps.get_volume_partition(vol, part).files[zip_name]
        self.odps.create_resource(res_name,
                                  'volumearchive',
                                  volume_file=uploaded)

        resource = self.odps.get_resource(res_name)
        self.assertIsInstance(resource, VolumeArchiveResource)
        self.assertEqual(resource.type, Resource.Type.VOLUMEARCHIVE)
        self.assertEqual(resource.volume_path, uploaded.path)
        self.odps.delete_resource(res_name)
    def testRunScript(self):
        """Run the module-level ``script`` template on a Mars cluster.

        The template is formatted with the source table, destination table
        and endpoint, executed through ``run_script_in_mars``, and the
        destination table is compared with the source table. The cluster is
        stopped in all cases.
        """
        import pandas as pd
        from io import BytesIO
        from odps.utils import to_binary

        cluster = self.odps.create_mars_cluster(1, 4, 8, name=str(uuid.uuid4()))
        try:
            source_name = tn('mars_script_datasource')
            dest_name = tn('mars_script_datastore')
            self._create_table(source_name)
            self.odps.delete_table(dest_name, if_exists=True)
            rows = self._gen_data()
            self.odps.write_table(source_name, rows)

            rendered = script.format(source_name, self.odps.endpoint,
                                     dest_name, self.odps.endpoint)
            code = BytesIO(to_binary(rendered))

            self.odps.run_script_in_mars(code,
                                         runtime_endpoint=self.odps.endpoint)

            dest_table = self.odps.get_table(dest_name)
            result = dest_table.to_df().to_pandas()
            source_table = self.odps.get_table(source_name)
            expected = source_table.to_df().to_pandas()
            pd.testing.assert_frame_equal(result, expected)
        finally:
            cluster.stop_server()
    def testSQLCostInstance(self):
        """Check ``execute_sql_cost`` results for a bigint and a tinyint table.

        For each column type a single-column table is recreated, filled with
        three rows, and the cost estimate of ``select *`` is checked for its
        class, UDF count, complexity and a minimum input size.
        """
        cases = (
            ('pyodps_t_tmp_sql_cost_instance', 'bigint'),
            ('pyodps_t_tmp_sql_cost_odps2_instance', 'tinyint'),
        )
        for raw_name, column_type in cases:
            test_table = tn(raw_name)
            self.odps.delete_table(test_table, if_exists=True)
            column_schema = Schema.from_lists(['size'], [column_type])
            table = self.odps.create_table(test_table,
                                           schema=column_schema,
                                           if_not_exists=True)
            self.odps.write_table(table, [[1], [2], [3]])

            query = 'select * from %s' % test_table
            sql_cost = self.odps.execute_sql_cost(query)
            self.assertIsInstance(sql_cost, Instance.SQLCost)
            self.assertEqual(sql_cost.udf_num, 0)
            self.assertEqual(sql_cost.complexity, 1.0)
            self.assertGreaterEqual(sql_cost.input_size, 480)
    def testRunMarsJob(self):
        """Run a copy job through ``run_mars_job`` and compare the two tables.

        A source table is created and filled, a nested function that reads
        the source into pandas and persists it to the destination table is
        submitted with ``run_mars_job``, and the destination table is then
        compared with the source table.
        """
        import pandas as pd

        # Bound to a local so that the nested function closes over the entry
        # object rather than over ``self``.
        odps_entry = self.odps
        mars_source_table_name = tn('mars_script_datasource')
        mars_des_table_name = tn('mars_script_datastore')
        self._create_table(mars_source_table_name)
        self.odps.delete_table(mars_des_table_name, if_exists=True)
        data = self._gen_data()
        self.odps.write_table(mars_source_table_name, data)

        def func(s_name, d_name):
            # NOTE(review): this import is not referenced in the function
            # body; presumably kept for its import side effect where the job
            # runs -- confirm before removing.
            from odps.accounts import BearerTokenAccount

            # Read the source table and materialize it as a pandas object.
            df = odps_entry.to_mars_dataframe(
                s_name, runtime_endpoint=odps_entry.endpoint).to_pandas()
            # Write the data back out to the destination table.
            odps_entry.persist_mars_dataframe(
                df,
                d_name,
                unknown_as_string=True,
                runtime_endpoint=odps_entry.endpoint)

        # A random job name is generated for every run.
        self.odps.run_mars_job(func,
                               args=(mars_source_table_name,
                                     mars_des_table_name),
                               name=str(uuid.uuid4()),
                               worker_cpu=4,
                               worker_mem=8)

        result = self.odps.get_table(mars_des_table_name).to_df().to_pandas()
        expected = self.odps.get_table(
            mars_source_table_name).to_df().to_pandas()
        pd.testing.assert_frame_equal(result, expected)
    def testVolumeFileResource(self):
        """Create a ``volumefile`` resource backed by a file in a parted volume.

        Writes ``FILE_CONTENT`` into a partition of a new parted volume,
        registers that file as a resource and verifies the fetched resource's
        class, type and volume path before deleting the resource.
        """
        volume_name = tn('pyodps_t_tmp_resource_file_volume')
        resource_name = tn('pyodps_t_tmp_volume_file_resource')
        partition_name = 'test_partition'
        file_name = 'test_file.txt'

        # Best-effort removal of anything left by a previous run.
        leftovers = (
            (self.odps.delete_volume, volume_name),
            (self.odps.delete_resource, resource_name),
        )
        for remove, target in leftovers:
            try:
                remove(target)
            except errors.ODPSError:
                pass

        self.odps.create_parted_volume(volume_name)
        with self.odps.open_volume_writer(volume_name, partition_name) as writer:
            writer.write(file_name, FILE_CONTENT)

        partition = self.odps.get_volume_partition(volume_name, partition_name)
        volume_file = partition.files[file_name]
        self.odps.create_resource(resource_name, 'volumefile', volume_file=volume_file)

        fetched = self.odps.get_resource(resource_name)
        self.assertIsInstance(fetched, VolumeFileResource)
        self.assertEqual(fetched.type, Resource.Type.VOLUMEFILE)
        self.assertEqual(fetched.volume_path, volume_file.path)
        self.odps.delete_resource(resource_name)
    def testReadSQLWrite(self):
        """Copy the result of a SQL query into a second table record by record.

        A source table is filled with three records in two ``write_table``
        calls, a second table with the same schema is created, and every
        record read from ``select *`` is written into it. Both tables are
        dropped afterwards.
        """
        source_name = tn('pyodps_t_tmp_read_sql_instance_write')
        self.odps.delete_table(source_name, if_exists=True)
        source_schema = Schema.from_lists(['size'], ['bigint'])
        source = self.odps.create_table(source_name,
                                        schema=source_schema,
                                        if_not_exists=True)

        first_batch = [source.new_record([1]), source.new_record([2])]
        self.odps.write_table(source, 0, first_batch)
        second_batch = [source.new_record([3])]
        self.odps.write_table(source, second_batch)

        target_name = tn('pyodps_t_tmp_read_sql_instance_write2')
        self.odps.delete_table(target_name, if_exists=True)
        target = self.odps.create_table(target_name, source.schema)

        try:
            instance = self.odps.execute_sql('select * from %s' % source_name)
            with instance.open_reader() as reader:
                with target.open_writer() as writer:
                    for row in reader:
                        copied = target.new_record(row.values)
                        writer.write(copied)
        finally:
            source.drop()
            target.drop()
Ejemplo n.º 10
0
    def testExecuteSql(self):
        """Run SQL through the ``ODPSSql`` magic with and without instance tunnel.

        A two-row table is created and queried once with
        ``options.use_instance_tunnel`` disabled and once with it enabled;
        both results must equal the written rows. ``show tables`` and a
        create/drop pair are then executed through the same magic object.

        The global ``options.use_instance_tunnel`` value is restored when the
        test finishes so that the setting does not leak into other tests.
        """
        FakeShell = namedtuple('FakeShell', 'user_ns')

        magic_class = ODPSSql(FakeShell(user_ns={}))
        magic_class._odps = self.odps

        test_table_name = tn('pyodps_t_test_sql_magic')
        test_content = [['line1'], ['line2']]
        self.odps.delete_table(test_table_name, if_exists=True)
        self.odps.create_table(test_table_name, 'col string', lifecycle=1)
        self.odps.write_table(test_table_name, test_content)

        query = 'select * from %s' % test_table_name
        saved_use_instance_tunnel = options.use_instance_tunnel
        try:
            options.use_instance_tunnel = False
            result = magic_class.execute(query)
            self.assertListEqual(self._get_result(result), test_content)

            options.use_instance_tunnel = True
            result = magic_class.execute(query)
            self.assertListEqual(self._get_result(result), test_content)

            # The remaining statements keep running with the tunnel enabled,
            # exactly as they did before the restore was added.
            result = magic_class.execute('show tables')
            self.assertGreater(len(result), 0)

            table_name = tn('pyodps_test_magics_create_table_result')
            magic_class.execute('create table %s (col string) lifecycle 1' %
                                table_name)
            magic_class.execute('drop table %s' % table_name)
        finally:
            options.use_instance_tunnel = saved_use_instance_tunnel
    def test_plenty_create(self):
        """Run ``PLENTY_CREATE_CODE`` in a subprocess and check its tables vanish.

        Ten ``tmp_pyodps_create_temp_<n>`` tables are dropped up front, the
        script rendered from ``PLENTY_CREATE_CODE`` is written to the temp
        directory and executed with ``WAIT_CLEANUP=1`` in its environment,
        and the test then polls until none of the ten tables exists. After
        the initial wait it allows four more 5-second waits before the final
        assertion.
        """
        temp_tables = [tn('tmp_pyodps_create_temp_%d' % n) for n in range(10)]

        # Submit all drops first, then wait, so they are all in flight at once.
        del_insts = [
            self.odps.run_sql('drop table {0}'.format(name))
            for name in temp_tables
        ]
        for inst in del_insts:
            inst.wait_for_completion()

        script = PLENTY_CREATE_CODE.format(odps_info=self._get_odps_json(),
                                           import_paths=json.dumps(sys.path))

        script_name = os.path.join(
            tempfile.gettempdir(),
            'tmp_' + str(os.getpid()) + '_plenty_script.py')
        # The ``with`` block closes the file; no explicit close() is needed.
        with open(script_name, 'w') as script_file:
            script_file.write(script)

        # os.environ.copy() returns a plain dict, so updating it cannot touch
        # this process's own environment.
        env = os.environ.copy()
        env.update({'WAIT_CLEANUP': '1'})
        subprocess.call([sys.executable, script_name], close_fds=True, env=env)

        def all_dropped():
            return all(not self.odps.exist_table(name) for name in temp_tables)

        sleep(5)
        for _ in range(4):
            if all_dropped():
                break
            sleep(5)
        else:
            # Every retry was used up: one final check decides the outcome.
            self.assertTrue(all_dropped())
Ejemplo n.º 12
0
    def testVolumeFileResource(self):
        """Check that a file stored in a volume can back a ``volumefile`` resource.

        ``FILE_CONTENT`` is uploaded to a partition of a new parted volume and
        registered as a resource. The resource is then fetched and its class,
        type and ``volume_path`` are verified.
        """
        vol = tn('pyodps_t_tmp_resource_file_volume')
        res_name = tn('pyodps_t_tmp_volume_file_resource')
        part = 'test_partition'
        txt_name = 'test_file.txt'

        # Ignore ODPS errors here: the objects may simply not exist yet.
        try:
            self.odps.delete_volume(vol)
        except errors.ODPSError:
            pass
        try:
            self.odps.delete_resource(res_name)
        except errors.ODPSError:
            pass

        self.odps.create_parted_volume(vol)
        with self.odps.open_volume_writer(vol, part) as volume_writer:
            volume_writer.write(txt_name, FILE_CONTENT)

        uploaded = self.odps.get_volume_partition(vol, part).files[txt_name]
        self.odps.create_resource(res_name,
                                  'volumefile',
                                  volume_file=uploaded)

        resource = self.odps.get_resource(res_name)
        self.assertIsInstance(resource, VolumeFileResource)
        self.assertEqual(resource.type, Resource.Type.VOLUMEFILE)
        self.assertEqual(resource.volume_path, uploaded.path)
        self.odps.delete_resource(res_name)
    def testCreateMarsCluster(self):
        """Round-trip the first two rows of a table through a Mars cluster.

        A source table is created and filled, a cluster is started, the head
        of the source is persisted to a destination table, and the
        destination is compared with the first two source rows. Both tables
        are deleted on success; the cluster is stopped in all cases.
        """
        import pandas as pd

        source_name = tn('mars_datasource')
        dest_name = tn('mars_datastore')
        self._create_table(source_name)
        self.odps.delete_table(dest_name, if_exists=True)
        rows = self._gen_data()
        self.odps.write_table(source_name, rows)

        cluster = self.odps.create_mars_cluster(1, 4, 8, name=str(uuid.uuid4()))
        try:
            self.assertFalse(cluster._with_notebook)

            source_df = self.odps.to_mars_dataframe(
                source_name, runtime_endpoint=self.odps.endpoint)
            first_two = source_df.head(2)
            self.odps.persist_mars_dataframe(
                first_two,
                dest_name,
                unknown_as_string=True,
                runtime_endpoint=self.odps.endpoint)

            dest_df = self.odps.to_mars_dataframe(
                dest_name, runtime_endpoint=self.odps.endpoint)

            source_table = self.odps.get_table(source_name)
            expected = source_table.to_df().to_pandas()
            result = dest_df.to_pandas()
            pd.testing.assert_frame_equal(expected.head(2), result)

            self.odps.delete_table(source_name)
            self.odps.delete_table(dest_name)
        finally:
            cluster.stop_server()
    def testCachePersist(self):
        """Persist a join of two cached expressions into a partitioned table.

        The left side is ``self.odps_df`` filtered and passed through an
        identity ``apply``; the right side is a freshly written two-row
        table. The cached join is persisted into partition ``ds=today`` and
        the persisted result is expected to contain two rows.
        """
        expr = self.odps_df

        right_rows = [["name1", 3.2], ["name3", 2.4]]

        right_name = tn("pyodps_test_mixed_engine_cp_table2")
        self.odps.delete_table(right_name, if_exists=True)
        right_schema = Schema.from_lists(["name", "fid"], ["string", "double"])
        right_table = self.odps.create_table(name=right_name, schema=right_schema)
        expr2 = DataFrame(right_table)
        self.odps.write_table(right_table, 0, right_rows)

        @output(expr.schema.names, expr.schema.types)
        def h(row):
            yield row

        left = expr.filter(expr.id > 0).apply(h, axis=1).cache()
        right = expr2.filter(expr2.fid > 0)
        joined_all = left.join(right, on=["name", right.fid < 4])
        joined = joined_all["id", "fid"].cache()

        output_table = tn("pyodps_test_mixed_engine_cp_output_table")
        self.odps.delete_table(output_table, if_exists=True)
        output_schema = Schema.from_lists(
            ["id", "fid"], ["bigint", "double"], ["ds"], ["string"])
        output_t = self.odps.create_table(output_table, output_schema, if_not_exists=True)

        persisted = joined.persist(output_table, partition="ds=today", create_partition=True)
        self.assertEqual(len(persisted.execute()), 2)

        output_t.drop()
    def testViewTable(self):
        """Read a view through a Mars dataframe and compare with expected rows.

        A two-row source table and a view renaming its columns are created,
        a Mars cluster is started, and the executed dataframe built from the
        view is compared with a hand-written pandas frame. The cluster is
        stopped in all cases.
        """
        import pandas as pd

        source_name = tn('mars_view_datasource')
        self.odps.delete_table(source_name, if_exists=True)
        self.odps.create_table(source_name, schema='col1 int, col2 string')
        source_rows = [[1, 'test1'], [2, 'test2']]
        self.odps.write_table(source_name, source_rows)

        view_name = tn('mars_view_table')
        drop_view_sql = 'DROP VIEW IF EXISTS {}'.format(view_name)
        self.odps.execute_sql(drop_view_sql)
        create_view_sql = (
            'create view {} (view_col1, view_col2) as select * from {}'.format(
                view_name, source_name))
        self.odps.execute_sql(create_view_sql)

        cluster = self.odps.create_mars_cluster(1, 4, 8, name=str(uuid.uuid4()))
        try:
            view_df = self.odps.to_mars_dataframe(
                view_name, runtime_endpoint=self.odps.endpoint)
            result = view_df.execute().to_pandas()
            expected = pd.DataFrame({
                'view_col1': [1, 2],
                'view_col2': ['test1', 'test2'],
            })
            pd.testing.assert_frame_equal(result, expected)
        finally:
            cluster.stop_server()
Ejemplo n.º 16
0
    def testCachePersist(self):
        """Join two cached expressions and persist the result to a partition.

        ``self.odps_df`` (filtered, then mapped through an identity
        ``apply``) is joined with a newly written two-row table. The selected
        ``id``/``fid`` columns are cached, persisted to ``ds=today`` of an
        output table, and the persisted expression must yield two rows.
        """
        expr = self.odps_df

        rows_for_second_table = [['name1', 3.2], ['name3', 2.4]]

        second_name = tn('pyodps_test_mixed_engine_cp_table2')
        self.odps.delete_table(second_name, if_exists=True)
        second_schema = Schema.from_lists(['name', 'fid'],
                                          ['string', 'double'])
        second_table = self.odps.create_table(name=second_name,
                                              schema=second_schema)
        expr2 = DataFrame(second_table)
        self.odps.write_table(second_table, 0, rows_for_second_table)

        @output(expr.schema.names, expr.schema.types)
        def h(row):
            yield row

        lhs = expr.filter(expr.id > 0).apply(h, axis=1).cache()
        rhs = expr2.filter(expr2.fid > 0)
        join_conditions = ['name', rhs.fid < 4]
        joined = lhs.join(rhs, on=join_conditions)['id', 'fid'].cache()

        output_table = tn('pyodps_test_mixed_engine_cp_output_table')
        self.odps.delete_table(output_table, if_exists=True)
        partitioned_schema = Schema.from_lists(['id', 'fid'],
                                               ['bigint', 'double'],
                                               ['ds'],
                                               ['string'])
        output_t = self.odps.create_table(output_table,
                                          partitioned_schema,
                                          if_not_exists=True)

        persisted = joined.persist(output_table,
                                   partition='ds=today',
                                   create_partition=True)
        row_count = len(persisted.execute())
        self.assertEqual(row_count, 2)

        output_t.drop()
    def testCreateDeleteUpdateFunction(self):
        """Create a function, update its resources and owner, then drop it.

        A ``py`` resource and a function using it are created and their
        basic attributes checked. A second resource is appended, the owner is
        switched to the configured secondary user and the function updated.
        After the function is evicted from the project's function container,
        a re-fetch must return a different object that reflects the update.
        """
        test_resource_name = tn('pyodps_t_tmp_test_function_resource') + '.py'
        test_function_name2 = tn(
            'pyodps_t_tmp_test_function_resource2') + '.py'
        test_function_name = tn('pyodps_t_tmp_test_function')

        # Remove leftovers from earlier runs; a missing object is fine.
        leftovers = (
            (self.odps.delete_resource, test_resource_name),
            (self.odps.delete_function, test_function_name),
            (self.odps.delete_resource, test_function_name2),
        )
        for remove, target in leftovers:
            try:
                remove(target)
            except errors.NoSuchObject:
                pass

        test_resource = self.odps.create_resource(test_resource_name,
                                                  'py',
                                                  file_obj=FILE_CONTENT)

        module_name = test_resource_name.split('.', 1)[0]
        test_function = self.odps.create_function(
            test_function_name,
            class_type=module_name + '.MyPlus',
            resources=[test_resource])

        self.assertIsNotNone(test_function.name)
        self.assertIsNotNone(test_function.owner)
        self.assertIsNotNone(test_function.creation_time)
        self.assertIsNotNone(test_function.class_type)
        self.assertEqual(len(test_function.resources), 1)

        with self.odps.open_resource(name=test_resource_name, mode='r') as fp:
            stored_text = to_str(fp.read())
            self.assertEqual(stored_text, to_str(FILE_CONTENT))

        secondary_user = self.config.get('test', 'secondary_user')
        self.assertNotEqual(test_function.owner, secondary_user)

        test_resource2 = self.odps.create_resource(test_function_name2,
                                                   'file',
                                                   file_obj='Hello World')
        test_function.resources.append(test_resource2)
        test_function.owner = secondary_user
        test_function.update()

        # Evict the function from its container so the next get_function()
        # has to build a new object.
        original_id = id(test_function)
        del test_function.project.functions[test_function.name]
        reloaded = self.odps.get_function(test_function_name)
        self.assertNotEqual(original_id, id(reloaded))
        self.assertEqual(len(reloaded.resources), 2)
        self.assertEqual(reloaded.owner, secondary_user)

        test_resource.drop()
        test_resource2.drop()
        reloaded.drop()
Ejemplo n.º 18
0
    def testListInstancesInPage(self):
        """Check that a running instance is listed and exposes task details.

        A UDF that sleeps for 45 seconds is registered so that the SQL
        instance stays in the RUNNING state long enough to be found through
        ``instances.iterate``. While it runs, the test also checks the first
        task's info, warnings and worker log. The instance is stopped
        (best effort) and the resource, function and table are dropped in
        ``finally``.
        """
        test_table = tn('pyodps_t_tmp_list_instances_in_page')

        delay_udf = textwrap.dedent("""
        from odps.udf import annotate
        import sys
        import time

        @annotate("bigint->bigint")
        class Delayer(object):
           def evaluate(self, arg0):
               print('Start Logging')
               sys.stdout.flush()
               time.sleep(45)
               print('End Logging')
               sys.stdout.flush()
               return arg0
        """)
        resource_name = tn('test_delayer_function_resource')
        function_name = tn('test_delayer_function')

        if self.odps.exist_resource(resource_name + '.py'):
            self.odps.delete_resource(resource_name + '.py')
        res = self.odps.create_resource(resource_name + '.py', 'py', file_obj=delay_udf)

        if self.odps.exist_function(function_name):
            self.odps.delete_function(function_name)
        fun = self.odps.create_function(function_name, class_type=resource_name + '.Delayer', resources=[res, ])

        data = [[random.randint(0, 1000)] for _ in compat.irange(100)]
        self.odps.delete_table(test_table, if_exists=True)
        t = self.odps.create_table(test_table, Schema.from_lists(['num'], ['bigint']))
        self.odps.write_table(t, data)

        instance = self.odps.run_sql("select sum({0}(num)), 1 + '1' as warn_col from {1} group by num"
                                     .format(function_name, test_table))

        try:
            self.assertEqual(instance.status, Instance.Status.RUNNING)
            self.assertIn(instance.id, [it.id for it in self.odps.get_project().instances.iterate(
                status=Instance.Status.RUNNING,
                from_time=datetime.now()-timedelta(days=2),
                end_time=datetime.now()+timedelta(days=1), max_items=20)])

            self.waitContainerFilled(lambda: instance.tasks)
            task = instance.tasks[0]
            task.put_info('testInfo', 'TestInfo')
            self.assertIsNotNone(task.warnings)

            self.waitContainerFilled(lambda: task.workers, 30)
            self.assertIsNotNone(task.workers[0].get_log('stdout'))
        finally:
            try:
                instance.stop()
            except Exception:
                # Stopping is best effort. Catching Exception rather than
                # using a bare ``except`` lets KeyboardInterrupt and
                # SystemExit propagate.
                pass
            res.drop()
            fun.drop()
            t.drop()
Ejemplo n.º 19
0
    def testTableResource(self):
        """Create a table resource and repoint it at table partitions.

        The resource is first created over an unpartitioned table and checked
        both through the cached object and after evicting it from its parent
        container. The table is then recreated with partitions and the
        resource is updated twice, to ``sec=1`` and then ``sec=2``.
        """
        def check_source(resource, table_name, partition=None):
            # Shared assertions on a table resource's source table/partition.
            self.assertIsInstance(resource, TableResource)
            self.assertEqual(resource.get_source_table().name, table_name)
            if partition is None:
                self.assertIsNone(resource.get_source_table_partition())
            else:
                self.assertEqual(str(resource.get_source_table_partition()),
                                 str(types.PartitionSpec(partition)))

        test_table_name = tn('pyodps_t_tmp_resource_table')
        schema = Schema.from_lists(['id', 'name'], ['string', 'string'])
        self.odps.delete_table(test_table_name, if_exists=True)
        self.odps.create_table(test_table_name, schema)

        resource_name = tn('pyodps_t_tmp_table_resource')
        try:
            self.odps.delete_resource(resource_name)
        except errors.NoSuchObject:
            pass
        res = self.odps.create_resource(resource_name,
                                        'table',
                                        table_name=test_table_name)
        check_source(res, test_table_name)
        self.assertIs(res, self.odps.get_resource(resource_name))

        del res.parent[resource_name]  # delete from cache

        self.assertIsNot(res, self.odps.get_resource(resource_name))
        res = self.odps.get_resource(resource_name)
        check_source(res, test_table_name)

        # Recreate the table, this time with two partition columns.
        test_table_name = tn('pyodps_t_tmp_resource_table')
        test_table_partition = 'pt=test,sec=1'
        schema = Schema.from_lists(['id', 'name'], ['string', 'string'],
                                   ['pt', 'sec'], ['string', 'bigint'])
        self.odps.delete_table(test_table_name, if_exists=True)
        table = self.odps.create_table(test_table_name, schema)
        table.create_partition(test_table_partition)

        resource_name = tn('pyodps_t_tmp_table_resource')
        res = res.update(partition=test_table_partition)
        check_source(res, test_table_name, test_table_partition)
        self.assertIs(res, self.odps.get_resource(resource_name))

        test_table_partition = 'pt=test,sec=2'
        table.create_partition(test_table_partition)
        res = res.update(partition=test_table_partition)
        check_source(res, test_table_name, test_table_partition)
        self.assertIs(res, self.odps.get_resource(resource_name))

        self.odps.delete_resource(resource_name)
        self.odps.delete_table(test_table_name)
    def testTableResource(self):
        """Exercise creation, cache eviction and partition updates of a table resource.

        Phase 1 uses an unpartitioned table: the created resource must point
        at it, and a re-fetch after evicting the cached object must give a
        different but equivalent object. Phase 2 recreates the table with
        partitions and updates the resource to two successive partitions.
        """
        # Phase 1: unpartitioned source table.
        table_name = tn('pyodps_t_tmp_resource_table')
        plain_schema = Schema.from_lists(['id', 'name'], ['string', 'string'])
        self.odps.delete_table(table_name, if_exists=True)
        self.odps.create_table(table_name, plain_schema)

        resource_name = tn('pyodps_t_tmp_table_resource')
        try:
            self.odps.delete_resource(resource_name)
        except errors.NoSuchObject:
            pass
        resource = self.odps.create_resource(resource_name, 'table', table_name=table_name)
        self.assertIsInstance(resource, TableResource)
        self.assertEqual(resource.get_source_table().name, table_name)
        self.assertIsNone(resource.get_source_table_partition())
        self.assertIs(resource, self.odps.get_resource(resource_name))

        del resource.parent[resource_name]  # delete from cache

        self.assertIsNot(resource, self.odps.get_resource(resource_name))
        resource = self.odps.get_resource(resource_name)
        self.assertIsInstance(resource, TableResource)
        self.assertEqual(resource.get_source_table().name, table_name)
        self.assertIsNone(resource.get_source_table_partition())

        # Phase 2: same table name, recreated with partition columns.
        table_name = tn('pyodps_t_tmp_resource_table')
        partition_spec = 'pt=test,sec=1'
        partitioned_schema = Schema.from_lists(
            ['id', 'name'], ['string', 'string'], ['pt', 'sec'], ['string', 'bigint'])
        self.odps.delete_table(table_name, if_exists=True)
        source_table = self.odps.create_table(table_name, partitioned_schema)
        source_table.create_partition(partition_spec)

        resource_name = tn('pyodps_t_tmp_table_resource')
        resource = resource.update(partition=partition_spec)
        self.assertIsInstance(resource, TableResource)
        self.assertEqual(resource.get_source_table().name, table_name)
        actual_spec = str(resource.get_source_table_partition())
        self.assertEqual(actual_spec, str(types.PartitionSpec(partition_spec)))
        self.assertIs(resource, self.odps.get_resource(resource_name))

        partition_spec = 'pt=test,sec=2'
        source_table.create_partition(partition_spec)
        resource = resource.update(partition=partition_spec)
        self.assertIsInstance(resource, TableResource)
        self.assertEqual(resource.get_source_table().name, table_name)
        actual_spec = str(resource.get_source_table_partition())
        self.assertEqual(actual_spec, str(types.PartitionSpec(partition_spec)))
        self.assertIs(resource, self.odps.get_resource(resource_name))

        self.odps.delete_resource(resource_name)
        self.odps.delete_table(table_name)
Ejemplo n.º 21
0
    def testHorzConcat(self):
        """Concatenate a column from a second table onto ``self.odps_df``.

        A helper table holding ``name`` and ``id * 2`` (renamed ``ren_id``)
        is persisted, its first three rows are taken, and ``ren_id`` is
        concatenated along axis 1 onto the base frame. The result is
        persisted with a lifecycle of 1. ``options.ml.dry_run`` is switched
        off first.
        """
        options.ml.dry_run = False

        helper_name = tn('test_horz_concat_table2_xxx_yyy')
        self.odps.delete_table(helper_name, if_exists=True)

        result_name = tn('test_horz_concat_result')
        self.odps.delete_table(result_name, if_exists=True)

        doubled_id = (self.odps_df.id * 2).rename('ren_id')
        self.odps_df[self.odps_df.name, doubled_id].persist(helper_name)

        helper_df = self.odps.get_table(helper_name).to_df()
        first_three = helper_df[:3]
        combined = self.odps_df.concat(first_three.ren_id, axis=1)
        combined.persist(result_name, lifecycle=1)
Ejemplo n.º 22
0
    def testBearerTokenAccount(self):
        """Access an instance through a bearer token taken from its logview URL.

        A counting query is run on a three-row table and the text after
        ``token=`` in the instance's logview address is used to build a
        ``BearerTokenAccount``. That account must see the same task result
        and summary, must get ``NoPermission`` when creating a table, and an
        account with a fake token must get an ``ODPSError``.
        """
        source_name = tn('test_bearer_token_account_table')
        self.odps.delete_table(source_name, if_exists=True)
        source = self.odps.create_table(source_name,
                                        'col string',
                                        lifecycle=1)
        with source.open_writer() as writer:
            rows = [['val1'], ['val2'], ['val3']]
            writer.write(rows)

        count_sql = 'select count(*) from {0}'.format(
            tn('test_bearer_token_account_table'))
        inst = self.odps.execute_sql(count_sql, async_=True)
        inst.wait_for_success()
        task_name = inst.get_task_names()[0]

        # Everything after the 'token=' marker in the logview URL is the token.
        logview_address = inst.get_logview_address()
        marker = 'token='
        token_start = logview_address.find(marker) + len(marker)
        token = logview_address[token_start:]

        token_account = BearerTokenAccount(token=token)
        token_odps = ODPS(None,
                          None,
                          self.odps.project,
                          self.odps.endpoint,
                          account=token_account)
        token_instance = token_odps.get_instance(inst.id)

        self.assertEqual(inst.get_task_result(task_name),
                         token_instance.get_task_result(task_name))
        self.assertEqual(inst.get_task_summary(task_name),
                         token_instance.get_task_summary(task_name))

        # The logview token is expected not to allow creating tables.
        with self.assertRaises(errors.NoPermission):
            token_odps.create_table(
                tn('test_bearer_token_account_table_test1'),
                'col string',
                lifecycle=1)

        fake_account = BearerTokenAccount(token='fake-token')
        fake_odps = ODPS(None,
                         None,
                         self.odps.project,
                         self.odps.endpoint,
                         account=fake_account)

        with self.assertRaises(errors.ODPSError):
            fake_odps.create_table(
                tn('test_bearer_token_account_table_test2'),
                'col string',
                lifecycle=1)
    def testSubPartitions(self):
        """Create, list, iterate and delete two-level partitions of a table."""
        table_name = tn('pyodps_t_tmp_sub_partitions_table')
        root_spec = 'type=test'
        schema = Schema.from_lists(['id'], ['string'],
                                   ['type', 's'], ['string', 'string'])

        self.odps.delete_table(table_name, if_exists=True)
        table = self.odps.create_table(table_name, schema)

        # Three sub-partitions below ``type=test`` plus one under another root.
        specs = [root_spec + ',' + ('s=%s' % idx) for idx in range(3)]
        specs.append('type=test2,s=0')
        for spec in specs:
            table.create_partition(spec)

        wanted = sorted(str(types.PartitionSpec(spec)) for spec in specs)
        found = sorted(str(part.partition_spec) for part in table.partitions)
        self.assertEqual(wanted, found)

        under_root = list(table.iterate_partitions(root_spec))
        self.assertEqual(len(under_root), 3)

        # After removing the first partition only the rest may be listed.
        table.delete_partition(specs[0])
        wanted = sorted(str(types.PartitionSpec(spec)) for spec in specs[1:])
        found = sorted(str(part.partition_spec) for part in table.partitions)
        self.assertEqual(wanted, found)

        self.odps.delete_table(table_name)
    def testArrowTunnelMultipleParts(self):
        """Load a five-partition table into a Mars DataFrame, with and without a partition filter."""
        import pandas as pd

        source_name = tn('mars_arrow_tunnel_datasource_mpart')
        self.odps.delete_table(source_name, if_exists=True)
        table_schema = ('col1 int, col2 string', 'pt string')
        table = self.odps.create_table(source_name,
                                       schema=table_schema,
                                       lifecycle=1)

        # Two rows per partition; ``col1`` values depend on the partition index.
        for idx in range(5):
            part = table.create_partition('pt=test_part%d' % idx)
            with part.open_writer() as writer:
                rows = [[1 + idx * 2, 'test1'], [2 + idx * 2, 'test2']]
                writer.write(rows)

        # Whole table.
        mars_df = self.odps.to_mars_dataframe(source_name,
                                              append_partitions=True,
                                              add_offset=True)
        actual = mars_df.execute().to_pandas()
        wanted = table.to_df().to_pandas()
        pd.testing.assert_frame_equal(actual, wanted)

        # Only the partitions matching the ``pt>test_part1`` filter.
        mars_df = self.odps.to_mars_dataframe(source_name,
                                              partition='pt>test_part1',
                                              append_partitions=True,
                                              add_offset=True)
        actual = mars_df.execute().to_pandas()
        full_frame = table.to_df().to_pandas()
        wanted = full_frame.query('pt>"test_part1"').reset_index(drop=True)
        pd.testing.assert_frame_equal(actual, wanted)
Ejemplo n.º 25
0
    def testReadMapArraySQLInstance(self):
        """Write map/array columns and read them back through a SQL instance reader."""
        table_name = tn('pyodps_t_tmp_read_map_array_sql_instance')
        self.odps.delete_table(table_name, if_exists=True)

        column_names = ['idx', 'map_col', 'array_col']
        column_types = [
            'bigint',
            odps_types.Map(odps_types.string, odps_types.string),
            odps_types.Array(odps_types.string),
        ]
        table = self.odps.create_table(
            table_name, schema=Schema.from_lists(column_names, column_types))

        rows = [
            [0, {'key1': 'value1', 'key2': 'value2'}, ['item1', 'item2', 'item3']],
            [1, {'key3': 'value3', 'key4': 'value4'}, ['item4', 'item5']],
        ]
        self.odps.write_table(table_name, rows)

        def by_index(row):
            # Rows are compared after ordering them by their ``idx`` value.
            return row[0]

        instance = self.odps.execute_sql('select * from %s' % table_name)
        with instance.open_reader(table.schema) as reader:
            fetched = sorted((list(rec.values) for rec in reader), key=by_index)
            wanted = sorted(rows, key=by_index)

            self.assertSequenceEqual(fetched, wanted)

        table.drop()
    def testPartitions(self):
        """Create, list, drop and reload single-level partitions of a table."""
        table_name = tn('pyodps_t_tmp_partitions_table')
        specs = ['s=%s' % idx for idx in range(3)]
        schema = Schema.from_lists(['id'], ['string'], ['s'], ['string'])

        self.odps.delete_table(table_name, if_exists=True)
        table = self.odps.create_table(table_name, schema)
        for spec in specs:
            table.create_partition(spec)

        wanted = sorted(str(types.PartitionSpec(spec)) for spec in specs)
        found = sorted(str(part.partition_spec) for part in table.partitions)
        self.assertEqual(wanted, found)

        # Drop the first partition through the partition object itself.
        table.get_partition(specs[0]).drop()
        wanted = sorted(str(types.PartitionSpec(spec)) for spec in specs[1:])
        found = sorted(str(part.partition_spec) for part in table.partitions)
        self.assertEqual(wanted, found)

        # Columns must be present both before and after an explicit reload.
        first_part = next(table.partitions)
        self.assertGreater(len(first_part.columns), 0)
        first_part.reload()
        self.assertGreater(len(first_part.columns), 0)

        self.odps.delete_table(table_name)
Ejemplo n.º 27
0
    def testReadBinarySQLInstance(self):
        """Read raw byte strings back through a non-tunnel SQL instance reader.

        ``options.tunnel.string_as_binary`` is switched on for the duration of
        the test and restored to whatever value it had before, so the test
        does not clobber a globally configured setting.
        """
        previous_setting = options.tunnel.string_as_binary
        try:
            options.tunnel.string_as_binary = True
            test_table = tn('pyodps_t_tmp_read_binary_sql_instance')
            self.odps.delete_table(test_table, if_exists=True)
            table = self.odps.create_table(
                test_table,
                schema=Schema.from_lists(['size', 'name'],
                                         ['bigint', 'string']),
                if_not_exists=True)

            # Payloads mix UTF-8 text with backslashes, control characters and
            # bytes that are not valid UTF-8 on their own (``\xe9``).
            data = [
                [
                    1, u'中'.encode('utf-8') + b'\\\\n\\\n' + u'文'.encode('utf-8') +
                    b' ,\r\xe9'
                ],
                [
                    2, u'测试'.encode('utf-8') + b'\x00\x01\x02' +
                    u'数据'.encode('utf-8') + b'\xe9'
                ],
            ]
            self.odps.write_table(table, 0,
                                  [table.new_record(it) for it in data])

            with self.odps.execute_sql(
                    'select name from %s' %
                    test_table).open_reader(tunnel=False) as reader:
                read_data = sorted([r[0] for r in reader])
                expected_data = sorted([r[1] for r in data])

                self.assertSequenceEqual(read_data, expected_data)

            table.drop()
        finally:
            # Restore the caller's configuration rather than forcing False.
            options.tunnel.string_as_binary = previous_setting
    def testCreateInstance(self):
        """Run SQL tasks as instances and inspect the resulting instance."""
        table_name = tn('pyodps_t_tmp_create_instance')

        def run_task(query):
            # Submit ``query`` as a SQLTask and block until the instance ends.
            sql_task = SQLTask(query=query)
            created = self.odps._project.instances.create(task=sql_task)
            created.wait_for_completion()
            return created

        instance = run_task('drop table if exists %s' % table_name)
        self.assertTrue(instance.is_successful())
        self.assertFalse(self.odps.exist_table(table_name))

        instance = run_task('create table %s(id string);' % table_name)
        self.assertTrue(instance.is_successful())
        self.assertTrue(self.odps.exist_table(table_name))

        instance = self.odps.execute_sql('drop table %s' % table_name)
        self.assertTrue(instance.is_successful())
        self.assertFalse(self.odps.exist_table(table_name))

        instance_tasks = instance.get_tasks()
        self.assertTrue(
            any(isinstance(item, SQLTask) for item in instance_tasks))

        for task_name in instance.get_task_names():
            self.assertIsNotNone(instance.get_task_detail(task_name))
            self.assertIsNotNone(instance.get_task_detail2(task_name))

        # test stop
        self.assertRaises(errors.InvalidStateSetting, instance.stop)
Ejemplo n.º 29
0
    def testCreateInstance(self):
        """Create instances from SQL tasks and check their state and task details."""
        target = tn('pyodps_t_tmp_create_instance')

        # Drop any leftover table through an explicitly created instance.
        drop_task = SQLTask(query='drop table if exists %s' % target)
        inst = self.odps._project.instances.create(task=drop_task)
        inst.wait_for_completion()
        self.assertTrue(inst.is_successful())
        self.assertFalse(self.odps.exist_table(target))

        # Create the table the same way.
        create_task = SQLTask(query='create table %s(id string);' % target)
        inst = self.odps._project.instances.create(task=create_task)
        inst.wait_for_completion()
        self.assertTrue(inst.is_successful())
        self.assertTrue(self.odps.exist_table(target))

        # Drop it again, this time through ``execute_sql``.
        inst = self.odps.execute_sql('drop table %s' % target)
        self.assertTrue(inst.is_successful())
        self.assertFalse(self.odps.exist_table(target))

        has_sql_task = any(
            isinstance(entry, SQLTask) for entry in inst.get_tasks())
        self.assertTrue(has_sql_task)

        for task_name in inst.get_task_names():
            self.assertIsNotNone(inst.get_task_detail(task_name))
            self.assertIsNotNone(inst.get_task_detail2(task_name))

        # test stop
        self.assertRaises(errors.InvalidStateSetting, inst.stop)
Ejemplo n.º 30
0
    def testRecordReadWriteTable(self):
        """Write ``Record`` objects to a table, read them back, then truncate."""
        table_name = tn('pyodps_t_tmp_read_write_table')
        schema = Schema.from_lists(['id', 'name', 'right'],
                                   ['bigint', 'string', 'boolean'])

        self.odps.delete_table(table_name, if_exists=True)
        self.assertFalse(self.odps.exist_table(table_name))

        table = self.odps.create_table(table_name, schema)
        rows = [
            [111, 'aaa', True],
            [222, 'bbb', False],
            [333, 'ccc', True],
            [5940813139082772990, '中文', False],
        ]
        row_count = len(rows)
        records = [Record(schema=schema, values=row) for row in rows]

        # Expected values with the string column passed through ``to_str``.
        wanted = [[row[0], to_str(row[1]), row[2]] for row in rows]

        self.odps.write_table(table, 0, records)

        everything = [
            rec.values for rec in self.odps.read_table(table, row_count)
        ]
        self.assertSequenceEqual(wanted, everything)

        every_second = [
            rec.values
            for rec in self.odps.read_table(table, row_count, step=2)
        ]
        self.assertSequenceEqual(wanted[::2], every_second)

        head_rows = [rec.values for rec in table.head(row_count)]
        self.assertSequenceEqual(wanted, head_rows)

        table.truncate()
        self.assertEqual([], list(self.odps.read_table(table)))

        self.odps.delete_table(table_name)
        self.assertFalse(self.odps.exist_table(table_name))
Ejemplo n.º 31
0
    def testReadWriteTable(self):
        """Write ``Record`` objects to a table and read them back in several ways."""
        table_name = tn('pyodps_t_tmp_read_write_table')
        schema = Schema.from_lists(['id', 'name', 'right'],
                                   ['bigint', 'string', 'boolean'])

        self.odps.delete_table(table_name, if_exists=True)
        self.assertFalse(self.odps.exist_table(table_name))

        table = self.odps.create_table(table_name, schema)
        rows = [[111, 'aaa', True], [222, 'bbb', False],
                [333, 'ccc', True], [444, '中文', False]]
        row_count = len(rows)
        records = [Record(schema=schema, values=row) for row in rows]

        # Expected values with the string column passed through ``to_str``.
        wanted = [[row[0], to_str(row[1]), row[2]] for row in rows]

        self.odps.write_table(table, 0, records)

        everything = [
            rec.values for rec in self.odps.read_table(table, row_count)
        ]
        self.assertSequenceEqual(wanted, everything)

        every_second = [
            rec.values
            for rec in self.odps.read_table(table, row_count, step=2)
        ]
        self.assertSequenceEqual(wanted[::2], every_second)

        head_rows = [rec.values for rec in table.head(row_count)]
        self.assertSequenceEqual(wanted, head_rows)

        self.odps.delete_table(table_name)
        self.assertFalse(self.odps.exist_table(table_name))
Ejemplo n.º 32
0
    def testCreateDeleteTable(self):
        """Create and delete a partitioned table via the tables collection and the ODPS entry."""
        name = tn("pyodps_t_tmp_create_table")
        schema = Schema.from_lists(["id", "name"], ["bigint", "string"],
                                   ["ds"], ["string"])

        project_tables = self.odps._project.tables

        # First round: tables collection, explicit lifecycle.
        project_tables.delete(name, if_exists=True)
        self.assertFalse(self.odps.exist_table(name))

        created = project_tables.create(name, schema, lifecycle=10)

        # The raw attribute is expected to be unset on the fresh object while
        # regular attribute access yields a value.
        self.assertIsNone(created._getattr("owner"))
        self.assertIsNotNone(created.owner)

        self.assertEqual(created.name, name)
        self.assertEqual(created.schema, schema)
        self.assertEqual(created.lifecycle, 10)

        project_tables.delete(name, if_exists=True)
        self.assertFalse(self.odps.exist_table(name))

        # Second round: ODPS entry, shard settings instead of a lifecycle.
        created = self.odps.create_table(name, schema,
                                         shard_num=10, hub_lifecycle=5)
        self.assertEqual(created.name, name)
        self.assertEqual(created.schema, schema)
        self.assertNotEqual(created.lifecycle, 10)
        self.assertEqual(created.shard.shard_num, 10)

        self.odps.delete_table(name, if_exists=True)
        self.assertFalse(self.odps.exist_table(name))
    def testPersistExecute(self):
        """Persist two filtered frames into different partitions through one ``Delay``."""
        delay = Delay()
        positive = self.df[self.df.id > 0].cache()

        target_name = tn('pyodps_test_delay_persist')
        target_schema = Schema.from_lists(['id', 'name', 'value'],
                                          ['bigint', 'string', 'bigint'],
                                          ['pt', 'ds'], ['string', 'string'])
        self.odps.delete_table(target_name, if_exists=True)
        self.odps.create_table(target_name, target_schema)

        # Both persists are only registered here; ``delay.execute()`` runs them.
        high_future = positive[positive.value > 2].persist(
            target_name, partition='pt=a,ds=d1', delay=delay)
        low_future = positive[positive.value < 2].persist(
            target_name, partition='pt=a,ds=d2', delay=delay)

        delay.execute()
        high_df = high_future.result()
        low_df = low_future.result()

        def predicate_fields(frame):
            # Names of the left-hand sides of the frame's predicate children.
            return [child.lhs.name for child in frame.predicate.children()]

        self.assertEqual(predicate_fields(high_df), ['pt', 'ds'])
        high_rows = self._get_result(high_df.execute())
        # The last two values of each row are stripped before comparing with
        # the source data.
        self.assertEqual([row[:-2] for row in high_rows],
                         [item for item in self.data if item[2] > 2])

        self.assertEqual(predicate_fields(low_df), ['pt', 'ds'])
        low_rows = self._get_result(low_df.execute())
        self.assertEqual([row[:-2] for row in low_rows],
                         [item for item in self.data if item[2] < 2])
Ejemplo n.º 34
0
    def testArrowTunnel(self):
        """Round-trip a random frame: persist a Mars DataFrame, then load it back."""
        import pandas as pd
        import numpy as np
        import mars.dataframe as md

        target_name = tn('mars_arrow_tunnel_datastore')
        self.odps.delete_table(target_name, if_exists=True)

        row_count = 1000
        source = pd.DataFrame({
            'col1': np.random.rand(row_count, ),
            'col2': np.random.randint(0, 100, (row_count, )),
            'col3': np.random.choice(['a', 'b', 'c'], size=(row_count, )),
        })

        def normalised(frame):
            # Order by ``col1`` and renumber rows so frames can be compared.
            return frame.sort_values('col1').reset_index(drop=True)

        # Write path.
        mars_df = md.DataFrame(source, chunk_size=300)
        self.odps.persist_mars_dataframe(mars_df,
                                         target_name,
                                         unknown_as_string=True)
        stored = self.odps.get_table(target_name).to_df().to_pandas()
        pd.testing.assert_frame_equal(normalised(stored), normalised(source))

        # Read path.
        loaded = self.odps.to_mars_dataframe(
            target_name, chunk_size=200).execute().to_pandas()
        stored = self.odps.get_table(target_name).to_df().to_pandas()
        pd.testing.assert_frame_equal(loaded.reset_index(drop=True),
                                      stored.reset_index(drop=True))
Ejemplo n.º 35
0
    def testCreateDeleteTable(self):
        """Create and delete a partitioned table via the tables collection and the ODPS entry."""
        name = tn('pyodps_t_tmp_create_table')
        schema = Schema.from_lists(['id', 'name'], ['bigint', 'string'],
                                   ['ds'], ['string'])

        project_tables = self.odps._project.tables

        # First round: tables collection, explicit lifecycle.
        project_tables.delete(name, if_exists=True)
        self.assertFalse(self.odps.exist_table(name))

        created = project_tables.create(name, schema, lifecycle=10)
        self.assertEqual(created.name, name)
        self.assertEqual(created.schema, schema)
        self.assertEqual(created.lifecycle, 10)

        project_tables.delete(name, if_exists=True)
        self.assertFalse(self.odps.exist_table(name))

        # Second round: ODPS entry, shard settings instead of a lifecycle.
        created = self.odps.create_table(name, schema,
                                         shard_num=10, hub_lifecycle=5)
        self.assertEqual(created.name, name)
        self.assertEqual(created.schema, schema)
        self.assertNotEqual(created.lifecycle, 10)
        self.assertEqual(created.shard.shard_num, 10)

        self.odps.delete_table(name, if_exists=True)
        self.assertFalse(self.odps.exist_table(name))
    def setup(self):
        """Build the ODPS-backed and pandas-backed frames plus both engines."""
        import pandas as pd

        column_names = ['name', 'id']
        column_types = ['string', 'bigint']

        remote_rows = [['name1', 1], ['name2', 2], ['name1', 3]]
        local_rows = [['name1', 5], ['name2', 6]]

        # The backing table is recreated from scratch on every call.
        table_name = tn('pyodps_df_mixed')
        self.odps.delete_table(table_name, if_exists=True)
        table_schema = Schema.from_lists(column_names, column_types)
        self.t = self.odps.create_table(table_name, table_schema)
        with self.t.open_writer() as writer:
            writer.write([self.t.new_record(row) for row in remote_rows])

        self.odps_df = DataFrame(self.t)
        self.pd_df = DataFrame(pd.DataFrame(local_rows, columns=column_names))

        self.engine = MixedEngine(self.odps)
        self.pd_engine = PandasEngine(self.odps)
Ejemplo n.º 37
0
    def testPandasPersistODPS2(self):
        """Persist a pandas frame of sized numeric dtypes and check the resulting column types."""
        import pandas as pd
        import numpy as np

        shape = (1, )
        columns = OrderedDict()
        columns['data_int8'] = np.random.randint(0, 10, shape, dtype=np.int8)
        columns['data_int16'] = np.random.randint(0, 10, shape, dtype=np.int16)
        columns['data_int32'] = np.random.randint(0, 10, shape, dtype=np.int32)
        columns['data_int64'] = np.random.randint(0, 10, shape, dtype=np.int64)
        columns['data_float32'] = np.random.random(shape).astype(np.float32)
        columns['data_float64'] = np.random.random(shape).astype(np.float64)

        frame = DataFrame(pd.DataFrame(columns))
        target_name = tn('pyodps_test_mixed_persist_odps2_types')

        self.odps.delete_table(target_name, if_exists=True)
        frame.persist(target_name,
                      lifecycle=1,
                      drop_table=True,
                      odps=self.odps)

        stored = self.odps.get_table(target_name)
        # One expected ODPS type per column, in column order.
        wanted_types = [
            odps_types.tinyint,
            odps_types.smallint,
            odps_types.int_,
            odps_types.bigint,
            odps_types.float_,
            odps_types.double,
        ]
        self.assertEqual(wanted_types, stored.schema.types)
Ejemplo n.º 38
0
    def testArrayReadWriteTable(self):
        """Write plain lists (no ``Record`` wrapping) to a table and read them back."""
        table_name = tn('pyodps_t_tmp_read_write_table')
        schema = Schema.from_lists(['id', 'name', 'right'],
                                   ['bigint', 'string', 'boolean'])

        self.odps.delete_table(table_name, if_exists=True)
        self.assertFalse(self.odps.exist_table(table_name))

        table = self.odps.create_table(table_name, schema)
        rows = [
            [111, 'aaa', True],
            [222, 'bbb', False],
            [333, 'ccc', True],
            [444, '中文', False],
        ]
        row_count = len(rows)

        # Expected values with the string column passed through ``to_str``.
        wanted = [[row[0], to_str(row[1]), row[2]] for row in rows]

        self.odps.write_table(table, 0, rows)

        everything = [
            rec.values for rec in self.odps.read_table(table, row_count)
        ]
        self.assertSequenceEqual(wanted, everything)

        every_second = [
            rec.values
            for rec in self.odps.read_table(table, row_count, step=2)
        ]
        self.assertSequenceEqual(wanted[::2], every_second)

        head_rows = [rec.values for rec in table.head(row_count)]
        self.assertSequenceEqual(wanted, head_rows)

        self.odps.delete_table(table_name)
        self.assertFalse(self.odps.exist_table(table_name))
    def testFullPartitionedTable(self):
        """Load a two-level partitioned table into a Mars DataFrame on a Mars cluster."""
        import pandas as pd

        source_name = tn('mars_cupid_datasource_mpart')
        self.odps.delete_table(source_name, if_exists=True)
        table_schema = ('col1 int, col2 string', 'pt1 string, pt2 string')
        table = self.odps.create_table(source_name,
                                       schema=table_schema,
                                       lifecycle=1)

        # Two rows per partition; ``col1`` values depend on the partition index.
        for idx in range(5):
            spec = 'pt1=test_part%d,pt2=test_part%d' % (idx, idx)
            part = table.create_partition(spec)
            with part.open_writer() as writer:
                rows = [[1 + idx * 2, 'test1'], [2 + idx * 2, 'test2']]
                writer.write(rows)

        cluster = self.odps.create_mars_cluster(1, 4, 8, name=str(uuid.uuid4()))
        try:
            mars_df = self.odps.to_mars_dataframe(
                source_name,
                runtime_endpoint=self.odps.endpoint,
                append_partitions=True,
                add_offset=True)
            actual = mars_df.execute().to_pandas()
            wanted = table.to_df().to_pandas()
            pd.testing.assert_frame_equal(actual, wanted)
        finally:
            # The cluster is stopped whether or not the comparison succeeded.
            cluster.stop_server()
    def testSqlToDataFrame(self):
        """Turn SQL queries into Mars DataFrames and compare them with direct results.

        The aggregate query qualifies the table with the project of the
        current ODPS entry instead of a fixed project name, so the test does
        not depend on being run inside one particular project.
        """
        import pandas as pd
        mars_source_table_name = tn('mars_sql_datasource')
        self._create_table(mars_source_table_name)
        data = self._gen_data()
        self.odps.write_table(mars_source_table_name, data)

        client = self.odps.create_mars_cluster(2, 4, 8, name=str(uuid.uuid4()))
        try:
            # Plain row count.
            sql = 'select count(1) as count from {}'.format(
                mars_source_table_name)
            df = self.odps.sql_to_mars_dataframe(sql)
            r = df.execute().to_pandas()
            pd.testing.assert_frame_equal(r,
                                          pd.DataFrame([4], columns=['count']))

            # Grouped aggregation on a project-qualified table name.
            sql = """
            SELECT
            t1.`id`,
            MAX(t1.`int_num`) AS `int_num_max`,
            MAX(t1.`float_num`) AS `float_num_max`
            FROM {}.`{}` t1
            GROUP BY
            t1.`id`
            """.format(self.odps.project, mars_source_table_name)
            df2 = self.odps.sql_to_mars_dataframe(sql)
            r2 = df2.execute().to_pandas()
            expected = self.odps.execute_sql(sql).open_reader().to_pandas()
            pd.testing.assert_frame_equal(r2, expected)

        finally:
            # The cluster is stopped whether or not the comparisons succeeded.
            client.stop_server()
    def setup(self):
        """Build the ODPS-backed and pandas-backed frames plus both engines.

        The backing table name contains the current process id; an existing
        table of that name is reused instead of being recreated.
        """
        import pandas as pd

        column_names = ['name', 'id']
        column_types = ['string', 'bigint']

        remote_rows = [['name1', 1], ['name2', 2], ['name1', 3]]
        local_rows = [['name1', 5], ['name2', 6]]

        table_name = tn('pyodps_df_mixed_%d' % os.getpid())
        if not self.odps.exist_table(table_name):
            # First use in this process: create the table and fill it.
            table_schema = Schema.from_lists(column_names, column_types)
            self.t = self.odps.create_table(table_name,
                                            table_schema,
                                            lifecycle=1)
            with self.t.open_writer() as writer:
                writer.write([self.t.new_record(row) for row in remote_rows])
        else:
            self.t = self.odps.get_table(table_name)

        self.odps_df = DataFrame(self.t)
        self.pd_df = DataFrame(pd.DataFrame(local_rows, columns=column_names))

        self.engine = MixedEngine(self.odps)
        self.pd_engine = PandasEngine(self.odps)
Ejemplo n.º 42
0
    def testCreateTableWithChineseColumn(self):
        """Column names written in Chinese must come back unchanged after table creation."""
        name = tn("pyodps_t_tmp_create_table_with_chinese_columns")
        schema = Schema.from_lists(["序列", "值"], ["bigint", "string"],
                                   ["ds"], ["string"])

        self.odps.delete_table(name, if_exists=True)

        created = self.odps.create_table(name, schema)
        actual_names = [column.name for column in created.schema.columns]
        wanted_names = [column.name for column in schema.columns]
        self.assertSequenceEqual(actual_names, wanted_names)
Ejemplo n.º 43
0
    def testCreateTableWithChineseColumn(self):
        """Column names written in Chinese must come back unchanged after table creation."""
        name = tn('pyodps_t_tmp_create_table_with_chinese_columns')
        schema = Schema.from_lists(['序列', '值'], ['bigint', 'string'],
                                   ['ds'], ['string'])

        self.odps.delete_table(name, if_exists=True)

        created = self.odps.create_table(name, schema)
        actual_names = [column.name for column in created.schema.columns]
        wanted_names = [column.name for column in schema.columns]
        self.assertSequenceEqual(actual_names, wanted_names)
Ejemplo n.º 44
0
    def setup(self):
        """Recreate the ``pyodps_test_dataframe`` table and fill it with three rows."""
        name = tn('pyodps_test_dataframe')
        table_schema = Schema.from_lists(['id', 'name'], ['bigint', 'string'])

        self.odps.delete_table(name, if_exists=True)
        self.table = self.odps.create_table(name, table_schema)

        rows = [[1, 'name1'], [2, 'name2'], [3, 'name3']]
        with self.table.open_writer() as writer:
            writer.write(rows)
    def testPivot(self):
        """Run several ``pivot`` expressions through the mixed engine and check their rows."""
        rows = [
            ["name1", 1, 1.0, True],
            ["name1", 2, 2.0, True],
            ["name2", 1, 3.0, False],
            ["name2", 3, 4.0, False],
        ]

        table_name = tn("pyodps_test_mixed_engine_pivot")
        self.odps.delete_table(table_name, if_exists=True)
        table = self.odps.create_table(
            name=table_name,
            schema=Schema.from_lists(["name", "id", "fid", "ismale"], ["string", "bigint", "double", "boolean"]),
        )
        expr = DataFrame(table)

        def run(expression):
            # Execute on the mixed engine and unwrap the rows.
            return self._get_result(self.engine.execute(expression))

        try:
            self.odps.write_table(table, 0, rows)

            # Single value column, de-duplicated.
            single = expr.pivot(rows="id", columns="name", values="fid").distinct()
            wanted = [[1, 1.0, 3.0], [2, 2.0, None], [3, None, 4.0]]
            self.assertEqual(sorted(run(single)), sorted(wanted))

            # Two value columns.
            multi = expr.pivot(rows="id", columns="name", values=["fid", "ismale"])
            wanted = [[1, 1.0, 3.0, True, False], [2, 2.0, None, True, None], [3, None, 4.0, None, False]]
            self.assertEqual(sorted(run(multi)), sorted(wanted))

            # Selecting a column that the pivot does not produce must fail
            # with a message naming it.
            missing = expr.pivot(rows="id", columns="name", values="fid")["name3"]
            with self.assertRaises(ValueError) as cm:
                self.engine.execute(missing)
            self.assertIn("name3", str(cm.exception))

            # Projection of the pivoted result.
            projected = expr.pivot(rows="id", columns="name", values="fid")["id", "name1"]
            wanted = [[1, 1.0], [2, 2.0], [3, None]]
            self.assertEqual(sorted(run(projected)), sorted(wanted))

            # Pivoted result plus a column computed from it.
            pivoted = expr.pivot(rows="id", columns="name", values="fid")
            extended = pivoted[pivoted, (pivoted["name1"].astype("int") + 1).rename("new_name")]
            wanted = [[1, 1.0, 3.0, 2.0], [2, 2.0, None, 3.0], [3, None, 4.0, None]]
            self.assertEqual(sorted(run(extended)), sorted(wanted))

            # Pivoted result joined with the shared ODPS frame on ``id``.
            pivoted = expr.pivot(rows="id", columns="name", values="fid")
            joined = pivoted.join(self.odps_df, on="id")[pivoted, "name"]
            wanted = [[1, 1.0, 3.0, "name1"], [2, 2.0, None, "name2"], [3, None, 4.0, "name1"]]
            self.assertEqual(sorted(run(joined)), sorted(wanted))
        finally:
            table.drop()
Ejemplo n.º 46
0
    def testReadSQLWrite(self):
        """Copy the rows returned by a SQL instance reader into a second table."""
        source_name = tn('pyodps_t_tmp_read_sql_instance_write')
        self.odps.delete_table(source_name, if_exists=True)
        source_schema = Schema.from_lists(['size'], ['bigint'])
        source = self.odps.create_table(
            source_name, schema=source_schema, if_not_exists=True)

        # Two writes: one with an explicit block id, one without.
        first_batch = [source.new_record([1]), source.new_record([2])]
        self.odps.write_table(source, 0, first_batch)
        self.odps.write_table(source, [source.new_record([3])])

        target_name = tn('pyodps_t_tmp_read_sql_instance_write2')
        self.odps.delete_table(target_name, if_exists=True)
        target = self.odps.create_table(target_name, source.schema)

        try:
            instance = self.odps.execute_sql('select * from %s' % source_name)
            with instance.open_reader() as reader:
                with target.open_writer() as writer:
                    for rec in reader:
                        writer.write(target.new_record(rec.values))
        finally:
            source.drop()
            target.drop()
    def testHeadAndTail(self):
        """Check ``head``/``tail`` on ODPS and pandas frames, columns, and a tunnel-only engine."""
        # --- ODPS-backed frame ---
        rows = self.odps_df.head(2)
        self.assertEqual(len(rows), 2)

        odps_filtered = self.odps_df[self.odps_df["name"] == "name1"]
        rows = odps_filtered.head(1)
        self.assertEqual(len(rows), 1)
        self.assertIsNotNone(odps_filtered._cache_data)

        rows = self.odps_df.tail(2)
        self.assertEqual(len(rows), 2)
        self.assertTrue(all(value > 1 for value in rows.values["id"]))

        self.assertEqual(len(self.odps_df.name.head(2)), 2)
        self.assertEqual(len(self.odps_df.name.tail(2)), 2)

        # --- pandas-backed frame ---
        rows = self.pd_df.head(1)
        self.assertEqual(len(rows), 1)

        pd_filtered = self.pd_df[self.pd_df["name"] == "name1"]
        rows = pd_filtered.head(1)
        self.assertEqual(len(rows), 1)
        self.assertIsNotNone(pd_filtered._cache_data)

        rows = self.pd_df.tail(1)
        self.assertEqual(len(rows), 1)
        self.assertEqual(rows.values["id"][0], 6)

        self.assertEqual(len(self.pd_df.name.head(1)), 1)
        self.assertEqual(len(self.pd_df.name.tail(1)), 1)

        # --- engine whose execute() only goes through _handle_cases ---
        class TunnelOnlyODPSEngine(ODPSEngine):
            def execute(self, expr, **kw):
                expr = self._pre_process(expr)
                head = kw.get("head")
                return self._handle_cases(expr, head=head)

        mixed = MixedEngine(self.odps)
        mixed._odpssql_engine = TunnelOnlyODPSEngine(self.odps, global_optimize=False)

        rows = mixed.execute(self.odps_df["id"], head=3)
        self.assertIsNotNone(rows)
        self.assertEqual(sum(rows.values["id"]), 6)

        # --- head() on a freshly created table yields no rows ---
        empty_name = tn("pyodps_df_mixed2")
        self.odps.delete_table(empty_name, if_exists=True)
        source_table = next(self.odps_df.data_source())
        empty_table = self.odps.create_table(empty_name, source_table.schema)
        try:
            rows = DataFrame(empty_table).head(10)
            self.assertEqual(len(rows), 0)
        finally:
            empty_table.drop()
    def test_plenty_create(self):
        """Check that temp tables created by a child script get cleaned up.

        Ten ``tmp_pyodps_create_temp_<n>`` tables are dropped up front, a
        script rendered from ``PLENTY_CREATE_CODE`` is run in a separate
        Python process with ``WAIT_CLEANUP=1``, and the test then polls until
        none of the ten tables exists any more.

        Raises:
            AssertionError: if any of the tables still exists after the last
                poll.
        """
        table_names = [tn('tmp_pyodps_create_temp_%d' % n) for n in range(10)]

        # Submit every drop first and wait afterwards so they can overlap.
        del_insts = [self.odps.run_sql('drop table {0}'.format(name)) for name in table_names]
        for inst in del_insts:
            inst.wait_for_completion()

        script = PLENTY_CREATE_CODE.format(odps_info=self._get_odps_json(), import_paths=json.dumps(sys.path))

        script_name = os.path.join(tempfile.gettempdir(), 'tmp_' + str(os.getpid()) + '_plenty_script.py')
        try:
            with open(script_name, 'w') as script_file:
                script_file.write(script)

            # A plain dict copy: writing to a copy of the os.environ mapping
            # object itself would also export the variable into this process.
            env = dict(os.environ)
            env['WAIT_CLEANUP'] = '1'
            subprocess.call([sys.executable, script_name], close_fds=True, env=env)
        finally:
            # Do not leave the generated script behind in the temp directory.
            if os.path.exists(script_name):
                os.remove(script_name)

        def tables_gone():
            return all(not self.odps.exist_table(name) for name in table_names)

        # Poll every 5 seconds: four checks in the loop plus a final asserted
        # one, so the wait is always bounded.
        sleep(5)
        for _ in range(4):
            if tables_gone():
                break
            sleep(5)
        else:
            assert tables_gone()
    def testBigintPartitionedCache(self):
        """Run a mapper over a cached projection of a table partitioned by ``id``.

        ``self.odps_df`` is persisted with ``partitions=["id"]``, a projection
        of it is cached, and ``map_reduce`` with a mapper that increments
        ``id`` is executed; the result is expected to contain 3 rows.
        """
        table = tn("pyodps_test_bigint_partitioned_cache")
        self.odps.delete_table(table, if_exists=True)
        try:
            expr = self.odps_df.persist(table, partitions=["id"])

            @output(["id", "name"], ["int", "string"])
            def handle(row):
                return row.id + 1, row.name

            expr = expr["tt" + expr.name, expr.id].cache()
            new_expr = expr.map_reduce(mapper=handle)

            res = self.engine.execute(new_expr)
            self.assertEqual(len(res), 3)
        finally:
            # The persisted table used to be left behind; remove it even when
            # an assertion or the execution itself fails.
            self.odps.delete_table(table, if_exists=True)
    def testPandasPersist(self):
        """Persist a pandas-backed DataFrame to a table and read it back.

        A 3x3 integer frame with columns ``a``, ``b``, ``c`` is persisted and
        the result of ``to_pandas()`` on the persisted frame must equal the
        original pandas frame.
        """
        import numpy as np
        import pandas as pd

        # persist() below is called without an explicit odps entry object.
        self.odps.to_global()

        tmp_table_name = tn('pyodps_test_mixed_persist')
        self.odps.delete_table(tmp_table_name, if_exists=True)

        pd_df = pd.DataFrame(np.arange(9).reshape(3, 3), columns=list('abc'))
        try:
            df = DataFrame(pd_df).persist(tmp_table_name)
            self.assertPandasEqual(df.to_pandas(), pd_df)
        finally:
            # Clean up even if persist() or the comparison fails.
            self.odps.delete_table(tmp_table_name, if_exists=True)
    def testExtractKV(self):
        """Expand two ``key=value`` string columns into one column per key.

        Rows are written to a three-string-column table, ``extract_kv`` is
        applied to ``kv`` and ``kv2`` with ``=`` as the key/value delimiter,
        and both the resulting column names and the row values are compared
        with the expected wide layout (``None`` where a key is absent).
        """
        rows = [
            ["name1", "k1=1,k2=3,k5=10", "1=5,3=7,2=1"],
            ["name1", "", "3=1,4=2"],
            ["name1", "k1=7.1,k7=8.2", "1=1,5=6"],
            ["name2", "k2=1.2,k3=1.5", None],
            ["name2", "k9=1.1,k2=1", "4=2"],
        ]

        table_name = tn("pyodps_test_mixed_engine_extract_kv")
        self.odps.delete_table(table_name, if_exists=True)
        kv_schema = Schema.from_lists(["name", "kv", "kv2"], ["string", "string", "string"])
        table = self.odps.create_table(name=table_name, schema=kv_schema)
        source = DataFrame(table)
        try:
            self.odps.write_table(table, 0, rows)

            extracted = self.engine.execute(source.extract_kv(columns=["kv", "kv2"], kv_delim="="))
            actual_rows = self._get_result(extracted)

            # One output column per distinct key, prefixed by its source column.
            expected_cols = [
                "name",
                "kv_k1", "kv_k2", "kv_k3", "kv_k5", "kv_k7", "kv_k9",
                "kv2_1", "kv2_2", "kv2_3", "kv2_4", "kv2_5",
            ]
            expected = [
                ["name1", 1.0, 3.0, None, 10.0, None, None, 5.0, 1.0, 7.0, None, None],
                ["name1", None, None, None, None, None, None, None, None, 1.0, 2.0, None],
                ["name1", 7.1, None, None, None, 8.2, None, 1.0, None, None, None, 6.0],
                ["name2", None, 1.2, 1.5, None, None, None, None, None, None, None, None],
                ["name2", None, 1.0, None, None, None, 1.1, None, None, None, 2.0, None],
            ]

            actual_cols = [col.name for col in extracted.columns]
            self.assertListEqual(actual_cols, expected_cols)
            self.assertEqual(actual_rows, expected)
        finally:
            table.drop()
Ejemplo n.º 52
0
    def testReadWritePartitionTable(self):
        """Write to and read from two partitions of a partitioned table.

        Two records are written to ``pt=20151122`` through a writer opened
        with ``commit=False``; the test then checks that ``table._upload_ids``
        holds exactly one entry and that reopening a writer on the same
        partition keeps that same upload id. One record is written to
        ``pt=20151123``, and both partitions are read back and verified by
        row count and by the sum of the first column.
        """
        test_table_name = tn('pyodps_t_tmp_read_write_partition_table')
        schema = Schema.from_lists(['id', 'name'], ['bigint', 'string'], ['pt'], ['string'])

        self.odps.delete_table(test_table_name, if_exists=True)
        self.assertFalse(self.odps.exist_table(test_table_name))

        table = self.odps.create_table(test_table_name, schema)
        try:
            pt1 = 'pt=20151122'
            pt2 = 'pt=20151123'
            table.create_partition(pt1)
            table.create_partition(pt2)

            with table.open_writer(pt1, commit=False) as writer:
                record = table.new_record()
                record[0] = 1
                record[1] = 'name1'
                writer.write(record)

                record = table.new_record()
                record[0] = 3
                record[1] = 'name3'
                writer.write(record)

            self.assertEqual(len(table._upload_ids), 1)
            upload_id = list(table._upload_ids.values())[0]
            # Reopening the writer on the same partition must not add or
            # replace the tracked upload id.
            with table.open_writer(pt1):
                self.assertEqual(len(table._upload_ids), 1)
                self.assertEqual(upload_id, list(table._upload_ids.values())[0])

            with table.open_writer(pt2) as writer:
                record = table.new_record()
                record[0] = 2
                record[1] = 'name2'
                writer.write(record)

            with table.open_reader(pt1) as reader:
                records = list(reader)
                self.assertEqual(len(records), 2)
                self.assertEqual(sum(r[0] for r in records), 4)

            with table.open_reader(pt2) as reader:
                records = list(reader)
                self.assertEqual(len(records), 1)
                self.assertEqual(sum(r[0] for r in records), 2)
        finally:
            # Drop the table even when an assertion above fails.
            table.drop()
Ejemplo n.º 53
0
    def testReadWritePartitionTable(self):
        """Write to and read from two partitions of a partitioned table.

        The first partition is read while still empty, then two records are
        written to it through a writer opened with ``commit=False``. The test
        checks that ``table._upload_ids`` (reset to an empty dict beforehand)
        holds exactly one entry and that reopening a writer on the same
        partition keeps that same upload id. One record is written to the
        second partition as a plain list, and both partitions are read back
        with ``reopen=True`` and verified by row count and first-column sum.
        """
        test_table_name = tn("pyodps_t_tmp_read_write_partition_table")
        schema = Schema.from_lists(["id", "name"], ["bigint", "string"], ["pt"], ["string"])

        self.odps.delete_table(test_table_name, if_exists=True)
        self.assertFalse(self.odps.exist_table(test_table_name))

        table = self.odps.create_table(test_table_name, schema)
        try:
            # Start from a known-empty upload id map so the count check below
            # only sees ids created by this test.
            table._upload_ids = dict()

            pt1 = "pt=20151122"
            pt2 = "pt=20151123"
            table.create_partition(pt1)
            table.create_partition(pt2)

            with table.open_reader(pt1) as reader:
                self.assertEqual(len(list(reader)), 0)

            with table.open_writer(pt1, commit=False) as writer:
                record = table.new_record([1, "name1"])
                writer.write(record)

                record = table.new_record()
                record[0] = 3
                record[1] = "name3"
                writer.write(record)

            self.assertEqual(len(table._upload_ids), 1)
            upload_id = list(table._upload_ids.values())[0]
            with table.open_writer(pt1):
                self.assertEqual(len(table._upload_ids), 1)
                self.assertEqual(upload_id, list(table._upload_ids.values())[0])

            with table.open_writer(pt2) as writer:
                writer.write([2, "name2"])

            with table.open_reader(pt1, reopen=True) as reader:
                records = list(reader)
                self.assertEqual(len(records), 2)
                self.assertEqual(sum(r[0] for r in records), 4)

            with table.open_reader(pt2, reopen=True) as reader:
                records = list(reader)
                self.assertEqual(len(records), 1)
                self.assertEqual(sum(r[0] for r in records), 2)
        finally:
            # Drop the table even when an assertion above fails.
            table.drop()
    def testMapReduceWithResource(self):
        """Run a reducer that takes another collection as a resource.

        The reducer seeds per-name totals of ``id`` from its first resource,
        adds the ``id`` of every incoming row, and emits ``(name, total)``
        when ``done`` is set. All four pairings of pandas-backed and
        ODPS-backed source/resource are executed, and each must yield ids
        summing to 17.
        """
        wrapped_pd_df = self.odps_df.to_pandas(wrap=True)

        @output(["name", "id"], ["string", "int"])
        def reducer(resources):
            totals = dict()
            # Seed the totals from the resource collection.
            for rec in resources[0]:
                totals[rec.name] = (totals[rec.name] + rec.id) if rec.name in totals else rec.id

            def inner(keys):
                def accumulate(row, done):
                    totals[row.name] = (totals[row.name] + row.id) if row.name in totals else row.id

                    if done:
                        yield row.name, totals[row.name]

                return accumulate

            return inner

        def check(source, resource):
            reduced = source.map_reduce(reducer=reducer, reducer_resources=[resource], group="name")
            outcome = reduced.execute()
            self.assertEqual(outcome.values["id"].sum(), 17)

        check(wrapped_pd_df, self.pd_df)

        persisted = self.pd_df.persist(tn("pyodps_df_mixed2"), odps=self.odps)
        try:
            check(self.odps_df, persisted)
            check(self.odps_df, self.pd_df)
            check(wrapped_pd_df, persisted)
        finally:
            next(persisted.data_source()).drop()
Ejemplo n.º 55
0
    def testReadNonAsciiSQLInstance(self):
        """Read non-ASCII and control-character strings back through SQL.

        Two rows whose ``name`` values contain Chinese characters, backslash
        sequences, newlines, carriage returns and ``\\x00``-``\\x02`` bytes are
        written to a table; a ``select name`` instance reader must return the
        same strings (compared after sorting).
        """
        test_table = tn('pyodps_t_tmp_read_non_ascii_sql_instance')
        self.odps.delete_table(test_table, if_exists=True)
        table = self.odps.create_table(
            test_table,
            schema=Schema.from_lists(['size', 'name'], ['bigint', 'string']), if_not_exists=True)

        try:
            data = [[1, '中\\\\n\\\n文 ,\r '], [2, '测试\x00\x01\x02数据']]
            self.odps.write_table(
                table, 0, [table.new_record(it) for it in data])

            with self.odps.execute_sql('select name from %s' % test_table).open_reader() as reader:
                read_data = sorted([to_str(r[0]) for r in reader])
                expected_data = sorted([to_str(r[1]) for r in data])

                self.assertSequenceEqual(read_data, expected_data)
        finally:
            # Drop the table even when the write, query or comparison fails.
            table.drop()
Ejemplo n.º 56
0
    def testToPandas(self):
        """Convert a one-row table with seven typed columns to pandas.

        The row is checked twice: once via the plain pandas frame returned by
        ``to_pandas()`` and once via the first record of the wrapped frame
        returned by ``to_pandas(wrap=True)``.
        """
        table_name = tn('pyodps_test_mixed_engine_to_pandas')
        self.odps.delete_table(table_name, if_exists=True)
        table2 = self.odps.create_table(name=table_name,
                                        schema=Schema.from_lists(['col%s' % i for i in range(7)],
                                                                 ['bigint', 'double', 'string', 'datetime',
                                                                  'boolean', 'decimal', 'datetime']))
        try:
            expr2 = DataFrame(table2)

            data2 = [
                [1234567, 3.14, 'test', datetime(2016, 6, 1), True, Decimal('3.14'), None]
            ]
            self.odps.write_table(table2, 0, data2)

            pd_df = expr2.to_pandas()
            # ``.ix`` has been removed from pandas; ``.iloc[0]`` selects the
            # same first row by position.
            self.assertSequenceEqual(data2[0], pd_df.iloc[0].tolist())

            wrapped_pd_df = expr2.to_pandas(wrap=True)
            self.assertSequenceEqual(data2[0], list(next(wrapped_pd_df.execute())))
        finally:
            # The table used to be left behind after the test.
            table2.drop()
Ejemplo n.º 57
0
    def testReadSQLInstance(self):
        """Read SQL instance results with and without an explicit schema.

        Three rows are written to a single-``bigint``-column table. A
        ``select *`` instance is read through sliced readers, then a
        ``sum(size)`` instance is started with ``run_sql``, its task progress
        is inspected while polling, and its single result row is read both
        with a ``bigint`` schema (value ``6``) and without one (value ``'6'``).
        """
        test_table = tn('pyodps_t_tmp_read_sql_instance')
        self.odps.delete_table(test_table, if_exists=True)
        table = self.odps.create_table(
            test_table, schema=Schema.from_lists(['size'], ['bigint']), if_not_exists=True)
        try:
            self.odps.write_table(
                table, 0, [table.new_record([1]), table.new_record([2])])
            self.odps.write_table(table, [table.new_record([3]), ])

            instance = self.odps.execute_sql('select * from %s' % test_table)
            with instance.open_reader(table.schema) as reader:
                self.assertEqual(len(list(reader[::2])), 2)
            with instance.open_reader(table.schema) as reader:
                self.assertEqual(len(list(reader[1::2])), 1)

            hints = {'odps.sql.mapper.split.size': 16}
            instance = self.odps.run_sql('select sum(size) as count from %s' % test_table, hints=hints)

            # Wait until the instance reports a task that has left WAITING.
            # NOTE(review): both polling loops below spin without sleeping.
            while len(instance.get_task_names()) == 0 or \
                    compat.lvalues(instance.get_task_statuses())[0].status == Instance.Task.TaskStatus.WAITING:
                continue

            # Wait until at least one stage is reported, then check that the
            # formatted progress string has more than two whitespace-separated parts.
            while True:
                progress = instance.get_task_progress(instance.get_task_names()[0])
                if len(progress.stages) == 0:
                    continue
                self.assertGreater(len(progress.get_stage_progress_formatted_string().split()), 2)
                break

            instance.wait_for_success()

            with instance.open_reader(Schema.from_lists(['count'], ['bigint'])) as reader:
                records = list(reader)
                self.assertEqual(len(records), 1)
                self.assertEqual(records[0]['count'], 6)

            # Without a schema the value comes back as a string.
            with instance.open_reader() as reader:
                records = list(reader)
                self.assertEqual(len(records), 1)
                self.assertEqual(records[0]['count'], '6')
        finally:
            # Drop the table even when a step above fails.
            table.drop()
    def testBloomFilter(self):
        """Filter ``self.odps_df`` by a bloom filter built from another table.

        A one-column table holding ``name1`` and ``name3`` is created; the
        filter is built from the ``name`` column of its first-row slice with
        ``capacity=10``, and the executed result must contain no ``name2``.
        """
        import numpy as np

        data2 = [["name1"], ["name3"]]

        table_name = tn("pyodps_test_mixed_engine_bf_table2")
        self.odps.delete_table(table_name, if_exists=True)
        table2 = self.odps.create_table(name=table_name, schema=Schema.from_lists(["name"], ["string"]))

        try:
            expr2 = DataFrame(table2)

            # Written inside the try block so that a failed write still drops
            # the freshly created table.
            self.odps.write_table(table2, 0, data2)

            expr = self.odps_df.bloom_filter("name", expr2[:1].name, capacity=10)

            res = self.engine.execute(expr)

            self.assertTrue(np.all(res["name"] != "name2"))
        finally:
            table2.drop()
Ejemplo n.º 59
0
    def testSimpleArrayReadWriteTable(self):
        """Write one record given as a plain list and read it back.

        The record is written to partition ``pt=20151122`` of a table with a
        single string column ``num``; the reader must report a count of 1 and
        expose the value both by index and by attribute name.
        """
        test_table_name = tn("pyodps_t_tmp_simpe_read_write_table")
        schema = Schema.from_lists(["num"], ["string"], ["pt"], ["string"])

        self.odps.delete_table(test_table_name, if_exists=True)

        table = self.odps.create_table(test_table_name, schema)
        try:
            partition = "pt=20151122"
            table.create_partition(partition)

            with table.open_writer(partition) as writer:
                writer.write(["1"])

            with table.open_reader(partition) as reader:
                self.assertEqual(reader.count, 1)
                record = next(reader)
                self.assertEqual(record[0], "1")
                self.assertEqual(record.num, "1")
        finally:
            # Drop the table even when an assertion above fails.
            table.drop()
    def setup(self):
        """Create the fixtures shared by the tests.

        Sets ``self.t`` (a table recreated with three ``name``/``id`` rows),
        ``self.odps_df`` (a DataFrame over that table), ``self.pd_df`` (a
        DataFrame over a two-row pandas frame with the same columns), and the
        ``self.engine`` / ``self.pd_engine`` engine instances.
        """
        import pandas as pd

        columns = ["name", "id"]
        column_types = ["string", "bigint"]
        odps_rows = [["name1", 1], ["name2", 2], ["name1", 3]]
        pandas_rows = [["name1", 5], ["name2", 6]]

        # Recreate the backing table from scratch and load the ODPS rows.
        table_name = tn("pyodps_df_mixed")
        self.odps.delete_table(table_name, if_exists=True)
        self.t = self.odps.create_table(table_name, Schema.from_lists(columns, column_types))
        with self.t.open_writer() as writer:
            writer.write([self.t.new_record(row) for row in odps_rows])

        self.odps_df = DataFrame(self.t)
        self.pd_df = DataFrame(pd.DataFrame(pandas_rows, columns=columns))

        self.engine = MixedEngine(self.odps)
        self.pd_engine = PandasEngine(self.odps)