Code example #1
File: db_config.py Project: ztplant/GSPython
class OdpsConn(object):
    """
    ODPS connection
    """
    def __init__(self, project):
        self.access_id = ODPSCONF.key
        self.access_key = ODPSCONF.sec
        self.project = project

        self.odps = None

    def __enter__(self):
        try:
            self.odps = ODPS(self.access_id, self.access_key, self.project)
        except Exception as exc:
            raise ValueError(str(exc))
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        self.odps = None

    def get_table_count_and_names(self):
        """
        Get the number of tables in the project and their names
        :return:
        """
        tables = self.odps.list_tables()
        names = [table.name for table in tables]
        count = len(names)
        return count, names

    def get_table_schema(self, tname):
        """
        Get the table's column names
        :return: 
        """
        table = self.odps.get_table(tname)
        _sa = table.schema
        _columns = _sa.columns
        schema = [item.name for item in _columns]
        return schema

    def execute_sql(self, sql):
        rest = []
        with self.odps.execute_sql(sql).open_reader() as reader:
            for record in reader:
                rest.append(record.values)
        return rest

    def get_table_last_update_time(self, tname):
        t = self.odps.get_table(tname)
        last_update_time = t.last_modified_time if t else None
        return last_update_time

    def count_table(self, table):
        sql = 'select count(1) from %s' % table
        with self.odps.execute_sql(sql).open_reader() as reader:
            return reader[0].values[0]
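
A minimal usage sketch for the OdpsConn wrapper above (hypothetical: assumes ODPSCONF holds valid credentials and that 'my_project' names an accessible project):

# Hypothetical usage of the context manager defined above.
with OdpsConn('my_project') as conn:
    count, names = conn.get_table_count_and_names()
    print('project has %d tables' % count)
    if names:
        print(conn.get_table_schema(names[0]))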
Code example #2
    def _execute_in_cupid(cls, ctx, op):
        import os

        import pandas as pd
        from odps import ODPS
        from odps.accounts import BearerTokenAccount

        cupid_client = CupidServiceClient()
        to_store_data = ctx[op.inputs[0].key]

        bearer_token = cupid_client.get_bearer_token()
        account = BearerTokenAccount(bearer_token)
        project = os.environ.get('ODPS_PROJECT_NAME', None)
        odps_params = op.odps_params.copy()
        if project:
            odps_params['project'] = project
        endpoint = os.environ.get(
            'ODPS_RUNTIME_ENDPOINT') or odps_params['endpoint']
        o = ODPS(None,
                 None,
                 account=account,
                 project=odps_params['project'],
                 endpoint=endpoint)
        odps_schema = o.get_table(op.table_name).schema
        project_name, table_name = op.table_name.split('.')

        writer_config = dict(_table_name=table_name,
                             _project_name=project_name,
                             _table_schema=odps_schema,
                             _partition_spec=op.partition_spec,
                             _block_id=op.block_id,
                             _handle=op.cupid_handle)
        cupid_client.write_table_data(writer_config, to_store_data,
                                      op.write_batch_size)
        ctx[op.outputs[0].key] = pd.DataFrame()
Code example #3
    def tile(cls, op):
        from odps import ODPS
        from odps.accounts import BearerTokenAccount
        from cupid import CupidSession, context

        bearer_token = context().get_bearer_token()
        account = BearerTokenAccount(bearer_token)
        o = ODPS(None, None, account=account, **op.odps_params)
        cupid_session = CupidSession(o)

        data_src = o.get_table(op.table_name)

        logger.debug('Start creating upload session from cupid.')
        upload_session = cupid_session.create_upload_session(data_src)

        input_df = op.inputs[0]

        out_chunks = []
        out_chunk_shape = (0,) * len(input_df.shape)
        blocks = {}
        for chunk in input_df.chunks:
            block_id = str(int(time.time())) + '_' + str(uuid.uuid4()).replace('-', '')
            chunk_op = DataFrameWriteTableSplit(dtypes=op.dtypes, table_name=op.table_name,
                                                partition_spec=op.partition_spec,
                                                cupid_handle=to_str(upload_session.handle),
                                                block_id=block_id, write_batch_size=op.write_batch_size)
            out_chunk = chunk_op.new_chunk([chunk], shape=out_chunk_shape, index=chunk.index, dtypes=chunk.dtypes)
            out_chunks.append(out_chunk)
            blocks[block_id] = op.partition_spec

        # build commit tree
        combine_size = 8
        chunks = out_chunks
        while len(chunks) >= combine_size:
            new_chunks = []
            for i in range(0, len(chunks), combine_size):
                chks = chunks[i: i + combine_size]
                if len(chks) == 1:
                    chk = chks[0]
                else:
                    chk_op = DataFrameWriteTableCommit(dtypes=op.dtypes, is_terminal=False)
                    chk = chk_op.new_chunk(chks, shape=out_chunk_shape, dtypes=op.dtypes)
                new_chunks.append(chk)
            chunks = new_chunks

        assert len(chunks) < combine_size

        commit_table_op = DataFrameWriteTableCommit(dtypes=op.dtypes, table_name=op.table_name, blocks=blocks,
                                                    cupid_handle=to_str(upload_session.handle),
                                                    overwrite=op.overwrite, odps_params=op.odps_params,
                                                    is_terminal=True)
        commit_table_chunk = commit_table_op.new_chunk(chunks, shape=out_chunk_shape, dtypes=op.dtypes)

        out_df = op.outputs[0]
        new_op = op.copy()
        return new_op.new_dataframes(op.inputs, shape=out_df.shape,
                                     dtypes=out_df.dtypes, chunks=[commit_table_chunk],
                                     nsplits=((0,),) * len(out_chunk_shape))
Code example #4
 def _get_table_schema(self):
     odps_client = ODPS(
         access_id=self._kwargs["access_id"],
         secret_access_key=self._kwargs["access_key"],
         project=self._kwargs["project"],
         endpoint=self._kwargs.get("endpoint"),
     )
     odps_table = odps_client.get_table(self._kwargs["table"])
     return odps_table.schema
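
A sketch of the self._kwargs mapping this helper assumes; the keys are inferred from the accesses above, the values are placeholders:

# Hypothetical configuration consumed by _get_table_schema via self._kwargs.
kwargs = {
    'access_id': 'my_access_id',
    'access_key': 'my_access_key',
    'project': 'my_project',
    'endpoint': 'http://service.cn.maxcompute.aliyun.com/api',
    'table': 'my_table',
}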
Code example #5
    def _execute_arrow_tunnel(cls, ctx, op):
        from odps import ODPS
        from odps.tunnel import TableTunnel
        import pyarrow as pa
        import pandas as pd

        project = os.environ.get('ODPS_PROJECT_NAME', None)
        odps_params = op.odps_params.copy()
        if project:
            odps_params['project'] = project
        endpoint = os.environ.get(
            'ODPS_RUNTIME_ENDPOINT') or odps_params['endpoint']
        o = ODPS(odps_params['access_id'],
                 odps_params['secret_access_key'],
                 project=odps_params['project'],
                 endpoint=endpoint)

        t = o.get_table(op.table_name)
        tunnel = TableTunnel(o, project=t.project)
        retry_times = options.retry_times

        retries = 0
        while True:
            try:
                if op.partition_spec is not None:
                    upload_session = tunnel.create_upload_session(
                        t.name, partition_spec=op.partition_spec)
                else:
                    upload_session = tunnel.create_upload_session(t.name)
                break
            except:
                if retries >= retry_times:
                    raise
                retries += 1
                time.sleep(1)

        logger.debug('Start writing table %s split index: %s', op.table_name,
                     op.inputs[0].index)

        retries = 0
        while True:
            try:
                writer = upload_session.open_arrow_writer(0)
                arrow_rb = pa.RecordBatch.from_pandas(ctx[op.inputs[0].key])
                writer.write(arrow_rb)
                writer.close()
                break
            except:
                if retries >= retry_times:
                    raise
                retries += 1
                time.sleep(1)

        upload_session.commit([0])
        logger.debug('Finish writing table %s split index: %s', op.table_name,
                     op.inputs[0].index)

        ctx[op.outputs[0].key] = pd.DataFrame()
Code example #6
    def _execute_arrow_tunnel(cls, ctx, op):
        from odps import ODPS
        from odps.tunnel import TableTunnel

        project = os.environ.get('ODPS_PROJECT_NAME', None)
        odps_params = op.odps_params.copy()
        if project:
            odps_params['project'] = project
        endpoint = os.environ.get(
            'ODPS_RUNTIME_ENDPOINT') or odps_params['endpoint']
        o = ODPS(odps_params['access_id'],
                 odps_params['secret_access_key'],
                 project=odps_params['project'],
                 endpoint=endpoint)

        t = o.get_table(op.table_name)
        tunnel = TableTunnel(o, project=t.project)

        if op.partition_spec is not None:
            download_session = tunnel.create_download_session(
                t.name, partition_spec=op.partition_spec)
        else:
            download_session = tunnel.create_download_session(t.name)
        logger.debug('Start reading table %s(%s) split from %s to %s',
                     op.table_name, op.partition_spec, op.start_index,
                     op.end_index)
        if op.nrows is None:
            count = op.end_index - op.start_index
        else:
            count = op.nrows

        with download_session.open_arrow_reader(op.start_index,
                                                count,
                                                columns=op.columns) as reader:
            table = reader.read()

        table = cls._append_partition_values(table, op)
        if op.string_as_binary:
            table = cls._cast_string_to_binary(table)
        data = arrow_table_to_pandas_dataframe(
            table, use_arrow_dtype=op.use_arrow_dtype)

        data = cls._align_columns(data, op.outputs[0].dtypes)

        logger.debug('Finish reading table %s(%s) split from %s to %s',
                     op.table_name, op.partition_spec, op.start_index,
                     op.end_index)
        ctx[op.outputs[0].key] = data
Code example #7
def _handle_enum_table_partitions(sock):
    try:
        cmd_len, = struct.unpack('<I', sock.recv(4))
        # dict with odps_params, table_name, partition
        task_config = pickle.loads(sock.recv(cmd_len))

        from odps import ODPS
        from odps.accounts import BearerTokenAccount
        from cupid import context

        cupid_ctx = context()

        odps_params = task_config['odps_params']
        bearer_token = cupid_ctx.get_bearer_token()
        account = BearerTokenAccount(bearer_token)
        project = os.environ.get('ODPS_PROJECT_NAME',
                                 None) or odps_params['project']
        endpoint = os.environ.get(
            'ODPS_RUNTIME_ENDPOINT') or odps_params['endpoint']
        o = ODPS(None,
                 None,
                 account=account,
                 project=project,
                 endpoint=endpoint)

        table = o.get_table(task_config['table_name'])
        partition_desc = task_config.get('partition')
        if not table.schema.partitions:
            _write_request_result(sock, result=None)
        elif partition_desc:
            if check_partition_exist(table, partition_desc):
                _write_request_result(sock, result=[partition_desc])
            else:
                parts = filter_partitions(o, list(table.partitions),
                                          partition_desc)
                _write_request_result(
                    sock, result=[str(pt.partition_spec) for pt in parts])
        else:
            _write_request_result(
                sock,
                result=[str(pt.partition_spec) for pt in table.partitions])
    except:
        logger.exception('Failed to create download session')
        _write_request_result(sock, False, exc_info=sys.exc_info())
Code example #8
def _handle_create_table_upload_session(sock):
    try:
        cmd_len, = struct.unpack('<I', sock.recv(4))
        # dict with odps_params, table_name
        session_config = pickle.loads(sock.recv(cmd_len))

        from odps import ODPS
        from odps.accounts import BearerTokenAccount
        from cupid import CupidSession, context
        from cupid.runtime import RuntimeContext

        if not RuntimeContext.is_context_ready():
            raise SystemError(
                'No Mars cluster found, please create via `o.create_mars_cluster`.'
            )
        cupid_ctx = context()

        odps_params = session_config['odps_params']
        bearer_token = cupid_ctx.get_bearer_token()
        account = BearerTokenAccount(bearer_token)
        project = os.environ.get('ODPS_PROJECT_NAME',
                                 None) or odps_params['project']
        endpoint = os.environ.get(
            'ODPS_RUNTIME_ENDPOINT') or odps_params['endpoint']
        o = ODPS(None,
                 None,
                 account=account,
                 project=project,
                 endpoint=endpoint)
        cupid_session = CupidSession(o)

        data_src = o.get_table(session_config['table_name'])

        logger.debug('Start creating upload session from cupid.')
        upload_session = cupid_session.create_upload_session(data_src)

        ret_data = {
            'handle': upload_session.handle,
        }
        _write_request_result(sock, result=ret_data)
    except:
        logger.exception('Failed to create upload session')
        _write_request_result(sock, False, exc_info=sys.exc_info())
Code example #9
def uploadexcel(input_file, output_table_n='default'):
    odps = ODPS(ACCESS_KEY_ID,
                ACCESS_KEY_SECRET,
                PROJECT,
                endpoint='http://service.odps.aliyun.com/api')

    project = odps.get_project()  # get the default project
    print(project)
    ds = datetime.datetime.now().strftime('%Y%m%d')
    print(ds)

    wb = openpyxl.load_workbook(filename=input_file,read_only=True)
    ws = wb.active
    print(datetime.datetime.now())

    output_table = odps.get_table(output_table_n)
    if output_table.exist_partition('ds=' + ds):
        output_table.delete_partition('ds=' + ds)
    output_table.create_partition('ds=' + ds, if_not_exists=True)

    tunnel = TableTunnel(odps)
    upload_session = tunnel.create_upload_session(output_table.name, partition_spec='ds=' + ds)
    print('output into', output_table_n, 'partition ds=', ds, ':\n', output_table.schema)
    index=0
    with upload_session.open_record_writer(0) as writer:
        for row in ws.rows:
            records = output_table.new_record()
            i=0
            for cell in row:
                if cell.value is None:
                    records[i] = None
                else:
                    records[i] = str(cell.value).encode('utf-8', "replace")
                i=i+1
            writer.write(records)
            index=index+1
            if index % 1000 == 0:
                print(index)
    upload_session.commit(0)

    print('===========')
    print(datetime.datetime.now())
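
A hedged invocation sketch for uploadexcel, assuming ACCESS_KEY_ID, ACCESS_KEY_SECRET and PROJECT are defined at module level; the file and table names below are placeholders:

# Hypothetical call; spreadsheet path and table name are placeholders.
if __name__ == '__main__':
    uploadexcel('daily_report.xlsx', output_table_n='ods_daily_report')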
Code example #10
def data_to_local():
    # quant_financial_report_balance_sheet 资产负债表
    o = ODPS('***', '***', 'gravity_quant')
    balance_sheet = DataFrame(
        o.get_table('quant_financial_report_balance_sheet'))
    balance_sheet = balance_sheet.to_pandas()
    balance_sheet.to_csv('financial_report_balance_sheet.csv', index=False)
    # quant_financial_report_profitloss 利润表
    o = ODPS('***', '***', 'gravity_quant')
    profitloss_sheet = DataFrame(
        o.get_table('quant_financial_report_profitloss'))
    profitloss_sheet = profitloss_sheet.to_pandas()
    profitloss_sheet.to_csv('quant_financial_report_profitloss.csv',
                            index=False)
    # quant_financial_report_cashflows_statement 现金流表
    o = ODPS('***', '***', 'gravity_quant')
    cashflows_sheet = DataFrame(
        o.get_table('quant_financial_report_cashflows_statement'))
    cashflows_sheet = cashflows_sheet.to_pandas()
    cashflows_sheet.to_csv('quant_financial_report_cashflows_statement.csv',
                           index=False)
    # quant_financial_analysis_report 财务分析表
    o = ODPS('***', '***', 'gravity_quant')
    analysis_sheet = DataFrame(o.get_table('quant_financial_analysis_report'))
    analysis_sheet = analysis_sheet.to_pandas()
    analysis_sheet.to_csv('quant_financial_analysis_report.csv', index=False)
    # market_values_end 市值数据
    o = ODPS('***', '***', 'gravity_quant')
    market_value = DataFrame(o.get_table('tdl_huangjin_market_values_all'))
    market_value = market_value.to_pandas()
    print(market_value.shape)
    market_value.drop_duplicates(subset=['code', 'pt'],
                                 keep='first',
                                 inplace=True)
    market_value = market_value.dropna(subset=['code', 'pt', 'total_value'])
    market_value.to_csv('market_values_end.csv', index=False)
    # 行业信息数据
    o = ODPS('***', '***', 'gravity_quant')
    quant_stocks_industry_info = DataFrame(
        o.get_table('quant_stocks_industry_info'))
    quant_stocks_industry_info = quant_stocks_industry_info.to_pandas()
    a = quant_stocks_industry_info['industry_sw'].str.split(
        '-', expand=True).add_prefix('level_')
    stocks_info = pd.concat([quant_stocks_industry_info, a], axis=1)
    stocks_info.to_csv('stocks_info.csv', index=False)
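
The same table-to-CSV pattern repeats once per table above; a sketch of an equivalent helper, assuming the same placeholder credentials and the odps.df.DataFrame import used by the original:

def dump_tables_to_csv(tables, project='gravity_quant'):
    # tables: mapping of ODPS table name -> output CSV file name
    o = ODPS('***', '***', project)
    for table_name, csv_name in tables.items():
        DataFrame(o.get_table(table_name)).to_pandas().to_csv(csv_name, index=False)

# e.g. dump_tables_to_csv({'quant_financial_report_balance_sheet':
#                          'financial_report_balance_sheet.csv'})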
Code example #11
def run(args):
    # build an odps instance
    o = ODPS(args.odps_access_id,
             args.odps_access_key,
             args.odps_project,
             endpoint=args.odps_endpoint)

    input_table_project = args.odps_project
    input_table_name = args.input_table_name
    if '.' in input_table_name:
        input_table_project = args.input_table_name.split(".")[0]
        input_table_name = args.input_table_name.split(".")[1]

    # download data from odps
    input_table = o.get_table(input_table_name, project=input_table_project)
    data = input_table.to_df().to_pandas()

    # sample data
    new_data = data.sample(args.sample_row_count)

    # create output table and upload data to odps
    o.delete_table(args.output_table_name, if_exists=True)
    output_table_project = args.odps_project
    output_table_name = args.output_table_name
    if '.' in output_table_name:
        output_table_project = args.output_table_name.split(".")[0]
        output_table_name = args.output_table_name.split(".")[1]

    table = o.create_table(output_table_name,
                           input_table.schema,
                           project=output_table_project,
                           if_not_exists=False,
                           lifecycle=3)
    o.write_table(output_table_name,
                  new_data.values.tolist(),
                  project=output_table_project)
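
run(args) expects an argparse-style namespace; a minimal sketch of the assumed arguments, with names inferred from the attribute accesses above and a hypothetical default for the sample size:

import argparse

parser = argparse.ArgumentParser(description='Sample rows from an ODPS table')
parser.add_argument('--odps_access_id', required=True)
parser.add_argument('--odps_access_key', required=True)
parser.add_argument('--odps_project', required=True)
parser.add_argument('--odps_endpoint', required=True)
parser.add_argument('--input_table_name', required=True)
parser.add_argument('--output_table_name', required=True)
parser.add_argument('--sample_row_count', type=int, default=1000)
# run(parser.parse_args())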
Code example #12
File: Exportdata.py Project: lijixin43/python_code
from odps import ODPS
from odps.df import DataFrame
import pandas as pd
import xlwt
"""
your-access-id: the account's AccessKey ID.
your-secret-access-key: the account's AccessKey Secret.
your-default-project: the name of the project (workspace) to use.
your-end-point: the endpoint of the region where the MaxCompute service is located.
"""
o = ODPS('**your-access-id**',
         '**your-secret-access-key**',
         '**your-default-project**',
         endpoint='**your-end-point**')
t = o.get_table('entry_range_e_info_result')

datas1 = DataFrame(t)

a = []
reader = t.open_reader()
count = reader.count
print(count)

# iterate over rows and columns
for record in reader[0:count]:
    one = []
    for col in t.schema.names:
        one.append(record[col])
    a.append(one)
data1 = pd.DataFrame(a, columns=datas1.schema.names)
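
The snippet stops after building data1; a minimal export sketch, assuming a pandas version that can still write .xls files through the xlwt engine imported above (the output file name is a placeholder):

# Hypothetical export step for the DataFrame built above.
data1.to_excel('entry_range_e_info_result.xls', index=False)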
Code example #13
File: odps_io_test.py Project: zuodh/elasticdl
class ODPSIOTest(unittest.TestCase):
    def setUp(self):
        self._project = os.environ[ODPSConfig.PROJECT_NAME]
        self._access_id = os.environ[ODPSConfig.ACCESS_ID]
        self._access_key = os.environ[ODPSConfig.ACCESS_KEY]
        self._endpoint = os.environ.get(ODPSConfig.ENDPOINT)
        self._test_read_table = "test_odps_reader_%d_%d" % (
            int(time.time()),
            random.randint(1, 101),
        )
        self._test_write_table = "test_odps_writer_%d_%d" % (
            int(time.time()),
            random.randint(1, 101),
        )
        self._odps_client = ODPS(self._access_id, self._access_key,
                                 self._project, self._endpoint)
        create_iris_odps_table(self._odps_client, self._project,
                               self._test_read_table)

    def test_read_to_iterator(self):
        reader = ODPSReader(
            self._project,
            self._access_id,
            self._access_key,
            self._endpoint,
            self._test_read_table,
            None,
            4,
            None,
        )
        records_iter = reader.to_iterator(1, 0, 50, 2, False, None)
        records = list(records_iter)
        self.assertEqual(len(records), 6,
                         "Unexpected number of batches: %d" % len(records))
        flattened_records = [record for batch in records for record in batch]
        self.assertEqual(
            len(flattened_records),
            220,
            "Unexpected number of total records: %d" % len(flattened_records),
        )

    def test_write_odps_to_recordio_shards_from_iterator(self):
        reader = ODPSReader(
            self._project,
            self._access_id,
            self._access_key,
            self._endpoint,
            self._test_read_table,
            None,
            4,
            None,
        )
        records_iter = reader.to_iterator(1, 0, 50, 2, False, None)
        with tempfile.TemporaryDirectory() as output_dir:
            write_recordio_shards_from_iterator(
                records_iter,
                ["f" + str(i) for i in range(5)],
                output_dir,
                records_per_shard=50,
            )
            self.assertEqual(len(os.listdir(output_dir)), 5)

    def test_write_from_iterator(self):
        columns = ["num", "num2"]
        column_types = ["bigint", "double"]

        # If the table doesn't exist yet
        writer = ODPSWriter(
            self._project,
            self._access_id,
            self._access_key,
            self._endpoint,
            self._test_write_table,
            columns,
            column_types,
        )
        writer.from_iterator(iter([[1, 0.5], [2, 0.6]]), 2)
        table = self._odps_client.get_table(self._test_write_table,
                                            self._project)
        self.assertEqual(table.schema.names, columns)
        self.assertEqual(table.schema.types, column_types)
        self.assertEqual(table.to_df().count(), 1)

        # If the table already exists
        writer = ODPSWriter(
            self._project,
            self._access_id,
            self._access_key,
            self._endpoint,
            self._test_write_table,
        )
        writer.from_iterator(iter([[1, 0.5], [2, 0.6]]), 2)
        table = self._odps_client.get_table(self._test_write_table,
                                            self._project)
        self.assertEqual(table.schema.names, columns)
        self.assertEqual(table.schema.types, column_types)
        self.assertEqual(table.to_df().count(), 2)

    def tearDown(self):
        self._odps_client.delete_table(self._test_write_table,
                                       self._project,
                                       if_exists=True)
        self._odps_client.delete_table(self._test_read_table,
                                       self._project,
                                       if_exists=True)
Code example #14
    def _execute_arrow_tunnel(cls, ctx, op):
        from odps import ODPS
        from odps.tunnel import TableTunnel

        out = op.outputs[0]

        if op.table_name is None:
            # is empty table
            ctx[out.key] = cls._build_empty_df(out)
            return

        project = os.environ.get('ODPS_PROJECT_NAME', None)
        odps_params = op.odps_params.copy()
        if project:
            odps_params['project'] = project
        endpoint = os.environ.get(
            'ODPS_RUNTIME_ENDPOINT') or odps_params['endpoint']
        o = ODPS(odps_params['access_id'],
                 odps_params['secret_access_key'],
                 project=odps_params['project'],
                 endpoint=endpoint)

        t = o.get_table(op.table_name)
        tunnel = TableTunnel(o, project=t.project)
        retry_times = op.retry_times or options.retry_times

        retries = 0
        while True:
            try:
                if op.partition_spec is not None:
                    download_session = tunnel.create_download_session(
                        t.name, partition_spec=op.partition_spec)
                else:
                    download_session = tunnel.create_download_session(t.name)
                break
            except:
                if retries >= retry_times:
                    raise
                retries += 1
                time.sleep(1)

        logger.debug('Start reading table %s(%s) split from %s to %s',
                     op.table_name, op.partition_spec, op.start_index,
                     op.end_index)
        if op.nrows is None:
            count = op.end_index - op.start_index
        else:
            count = op.nrows

        retries = 0
        while True:
            try:
                with download_session.open_arrow_reader(
                        op.start_index, count, columns=op.columns) as reader:
                    table = reader.read()
                break
            except:
                if retries >= retry_times:
                    raise
                retries += 1
                time.sleep(1)

        table = cls._append_partition_values(table, op)
        if op.string_as_binary:
            table = cls._cast_string_to_binary(table)
        data = arrow_table_to_pandas_dataframe(
            table, use_arrow_dtype=op.use_arrow_dtype)
        data = cls._align_output_data(op, data)

        logger.debug('Finish reading table %s(%s) split from %s to %s',
                     op.table_name, op.partition_spec, op.start_index,
                     op.end_index)
        ctx[op.outputs[0].key] = data
Code example #15
File: test01.py Project: 15602035939/PythonStudy
# -*- coding: utf-8 -*-
"""
Created on Sat Sep  2 20:07:27 2017

@author: shuai.qian
"""
import matplotlib.pyplot as plt
from odps.df import DataFrame
from odps import ODPS

o = ODPS('',project='', endpoint='')
t = DataFrame(o.get_table('tmp_ods_mc_testing_dlt'))
print("=================================> START <==================================")
#print(t.dtypes)
#print(t["class"].head(5))
t.groupby('class').agg(count = t['class'].count())

# %matplotlib inline

t['class'].value_counts().plot(kind = 'bar', x = 'class', xlabel = 'cnt' )

tmp = list(range(0, 10, 2))
tmp.pop(1)
Code example #16
    def tile(cls, op):
        import numpy as np
        import pandas as pd
        from odps import ODPS
        from odps.accounts import BearerTokenAccount
        from cupid import CupidSession, context

        bearer_token = context().get_bearer_token()
        account = BearerTokenAccount(bearer_token)
        project = os.environ.get('ODPS_PROJECT_NAME', None)
        odps_params = op.odps_params.copy()
        if project:
            odps_params['project'] = project
        o = ODPS(None, None, account=account, **odps_params)
        cupid_session = CupidSession(o)

        df = op.outputs[0]
        split_size = df.extra_params.chunk_bytes or CHUNK_LIMIT

        data_src = o.get_table(op.table_name)
        if op.partition is not None:
            data_src = data_src.get_partition(op.partition)

        logger.debug('Start creating download session from cupid.')
        while True:
            try:
                download_session = cupid_session.create_download_session(
                    data_src, split_size=split_size, columns=op.columns)
                break
            except CupidError:
                logger.debug(
                    'The number of splits exceeds 100000, split_size is {}'.
                    format(split_size))
                if split_size >= MAX_CHUNK_SIZE:
                    raise
                else:
                    split_size *= 2

        logger.debug('%s table splits have been created.',
                     str(len(download_session.splits)))

        out_chunks = []
        # Ignore add_offset at this time.
        op._add_offset = False

        for idx, split in enumerate(download_session.splits):
            chunk_op = DataFrameReadTableSplit(
                cupid_handle=to_str(split.handle),
                split_index=split.split_index,
                split_file_start=split.split_file_start,
                split_file_end=split.split_file_end,
                schema_file_start=split.schema_file_start,
                schema_file_end=split.schema_file_end,
                add_offset=op.add_offset,
                dtypes=op.dtypes,
                sparse=op.sparse)
            # the chunk shape is unknown
            index_value = parse_index(pd.RangeIndex(0))
            columns_value = parse_index(df.dtypes.index, store_data=True)
            out_chunk = chunk_op.new_chunk(None,
                                           shape=(np.nan, df.shape[1]),
                                           dtypes=op.dtypes,
                                           index_value=index_value,
                                           columns_value=columns_value,
                                           index=(idx, 0))
            out_chunks.append(out_chunk)

        if op.add_offset:
            out_chunks = standardize_range_index(out_chunks)

        new_op = op.copy()
        nsplits = ((np.nan, ) * len(out_chunks), (df.shape[1], ))
        return new_op.new_dataframes(None,
                                     shape=df.shape,
                                     dtypes=op.dtypes,
                                     index_value=df.index_value,
                                     columns_value=df.columns_value,
                                     chunks=out_chunks,
                                     nsplits=nsplits)
Code example #17
    def _tile_cupid(cls, op):
        from odps import ODPS
        from odps.accounts import BearerTokenAccount
        from cupid import CupidSession, context
        from cupid.runtime import RuntimeContext

        if not RuntimeContext.is_context_ready():
            raise SystemError(
                'No Mars cluster found, please create via `o.create_mars_cluster`.'
            )
        cupid_ctx = context()

        bearer_token = cupid_ctx.get_bearer_token()
        account = BearerTokenAccount(bearer_token)
        project = os.environ.get('ODPS_PROJECT_NAME', None)
        odps_params = op.odps_params.copy()
        if project:
            odps_params['project'] = project
        endpoint = os.environ.get(
            'ODPS_RUNTIME_ENDPOINT') or odps_params['endpoint']
        o = ODPS(None,
                 None,
                 account=account,
                 project=odps_params['project'],
                 endpoint=endpoint)
        cupid_session = CupidSession(o)

        data_src = o.get_table(op.table_name)

        logger.debug('Start creating upload session from cupid.')
        upload_session = cupid_session.create_upload_session(data_src)

        input_df = build_concatenated_rows_frame(op.inputs[0])
        out_df = op.outputs[0]

        out_chunks = []
        out_chunk_shape = (0, ) * len(input_df.shape)
        blocks = {}
        for chunk in input_df.chunks:
            block_id = str(int(time.time())) + '_' + str(uuid.uuid4()).replace(
                '-', '')
            chunk_op = DataFrameWriteTableSplit(
                dtypes=op.dtypes,
                table_name=op.table_name,
                unknown_as_string=op.unknown_as_string,
                partition_spec=op.partition_spec,
                cupid_handle=to_str(upload_session.handle),
                block_id=block_id,
                write_batch_size=op.write_batch_size)
            out_chunk = chunk_op.new_chunk([chunk],
                                           shape=out_chunk_shape,
                                           index=chunk.index,
                                           index_value=out_df.index_value,
                                           dtypes=chunk.dtypes)
            out_chunks.append(out_chunk)
            blocks[block_id] = op.partition_spec

        # build commit tree
        combine_size = 8
        chunks = out_chunks
        while len(chunks) >= combine_size:
            new_chunks = []
            for i in range(0, len(chunks), combine_size):
                chks = chunks[i:i + combine_size]
                if len(chks) == 1:
                    chk = chks[0]
                else:
                    chk_op = DataFrameWriteTableCommit(dtypes=op.dtypes,
                                                       is_terminal=False)
                    chk = chk_op.new_chunk(chks,
                                           shape=out_chunk_shape,
                                           index_value=out_df.index_value,
                                           dtypes=op.dtypes)
                new_chunks.append(chk)
            chunks = new_chunks

        assert len(chunks) < combine_size

        commit_table_op = DataFrameWriteTableCommit(dtypes=op.dtypes,
                                                    table_name=op.table_name,
                                                    blocks=blocks,
                                                    cupid_handle=to_str(
                                                        upload_session.handle),
                                                    overwrite=op.overwrite,
                                                    odps_params=op.odps_params,
                                                    is_terminal=True)
        commit_table_chunk = commit_table_op.new_chunk(
            chunks,
            shape=out_chunk_shape,
            dtypes=op.dtypes,
            index_value=out_df.index_value)

        new_op = op.copy()
        return new_op.new_dataframes(op.inputs,
                                     shape=out_df.shape,
                                     index_value=out_df.index_value,
                                     dtypes=out_df.dtypes,
                                     columns_value=out_df.columns_value,
                                     chunks=[commit_table_chunk],
                                     nsplits=((0, ), ) * len(out_chunk_shape))
Code example #18
"""
    Test connecting to ODPS from Python
"""
from odps import ODPS
o = ODPS('LTAIFwE1F5V5Fucy','c1Daaf3vFHwu3PhLBK99iHvH2AqWC4','zhangp123',
         endpoint='http://service.cn.maxcompute.aliyun.com/api')
dual = o.get_table('20171210v7')

print(dual.name)
print(dual.schema)
print(dual.head(10))
# dual.drop()



Code example #19
class ODPSIOTest(unittest.TestCase):
    def setUp(self):
        self._project = os.environ[ODPSConfig.PROJECT_NAME]
        self._access_id = os.environ[ODPSConfig.ACCESS_ID]
        self._access_key = os.environ[ODPSConfig.ACCESS_KEY]
        self._endpoint = os.environ[ODPSConfig.ENDPOINT]
        self._test_read_table = "chicago_taxi_train_data"
        self._test_write_table = "test_odps_writer_%d_%d" % (
            int(time.time()),
            random.randint(1, 101),
        )
        self._odps_client = ODPS(self._access_id, self._access_key,
                                 self._project, self._endpoint)

    def test_read_to_iterator(self):
        reader = ODPSReader(
            self._project,
            self._access_id,
            self._access_key,
            self._endpoint,
            self._test_read_table,
            None,
            4,
            None,
        )
        records_iter = reader.to_iterator(1, 0, 200, 2, False, None)
        for batch in records_iter:
            self.assertEqual(len(batch), 200,
                             "incompatible size: %d" % len(batch))

    def test_write_odps_to_recordio_shards_from_iterator(self):
        reader = ODPSReader(
            self._project,
            self._access_id,
            self._access_key,
            self._endpoint,
            self._test_read_table,
            None,
            4,
            None,
        )
        records_iter = reader.to_iterator(1, 0, 200, 2, False, None)
        with tempfile.TemporaryDirectory() as output_dir:
            write_recordio_shards_from_iterator(
                records_iter,
                ["f" + str(i) for i in range(18)],
                output_dir,
                records_per_shard=200,
            )
            self.assertEqual(len(os.listdir(output_dir)), 100)

    def test_write_from_iterator(self):
        columns = ["num", "num2"]
        column_types = ["bigint", "double"]

        # If the table doesn't exist yet
        writer = ODPSWriter(
            self._project,
            self._access_id,
            self._access_key,
            self._endpoint,
            self._test_write_table,
            columns,
            column_types,
        )
        writer.from_iterator(iter([[1, 0.5], [2, 0.6]]), 2)
        table = self._odps_client.get_table(self._test_write_table,
                                            self._project)
        self.assertEqual(table.schema.names, columns)
        self.assertEqual(table.schema.types, column_types)
        self.assertEqual(table.to_df().count(), 1)

        # If the table already exists
        writer = ODPSWriter(
            self._project,
            self._access_id,
            self._access_key,
            self._endpoint,
            self._test_write_table,
        )
        writer.from_iterator(iter([[1, 0.5], [2, 0.6]]), 2)
        table = self._odps_client.get_table(self._test_write_table,
                                            self._project)
        self.assertEqual(table.schema.names, columns)
        self.assertEqual(table.schema.types, column_types)
        self.assertEqual(table.to_df().count(), 2)

    def tearDown(self):
        self._odps_client.delete_table(self._test_write_table,
                                       self._project,
                                       if_exists=True)
Code example #20
File: ceshi.py Project: smartdongwei/pythonstudy
            requests11 = makeRequests(Lib2Odps, file_arrs)
            [pool11.putRequest(req) for req in requests11]
            pool11.wait()
            logging.info("All files uploaded")
        except:
            logging.exception("Error during upload")


if __name__ == "__main__":
    partition = time.strftime("%Y%m%d", time.localtime())
    logging.basicConfig(
        filename=runLog + 'dongtaiRL' + partition + ".log",
        filemode='a',
        level=logging.NOTSET,
        format="%(asctime)s - %(levelname)s: %(message)s")  # log output format
    table_obj = cOdps.get_table(ExpressArr[1], 'kunlun')
    try:
        # check whether this partition is already in the list
        if partition not in date_arr:
            print('Partition %s' % partition)
            table_obj.create_partition('dt=' + partition,
                                       if_not_exists=True)  # create the partition
            date_arr.append(partition)
            print(table_obj)
    except:
        print('Failed to create partition')

    try:

        fileReal = os.path.split(os.path.realpath(
            sys.argv[0]))[0]  # get the directory containing this script
Code example #21
File: odps_writer.py Project: duyiqi17/PaddleRec
schema = Schema(
    columns=columns)  # schema = Schema(columns=columns, partitions=partitions)

table_name = 'wide_and_deep'

print(schema.columns)


def create_table():
    table = o.create_table(table_name, schema, if_not_exists=True)


#create_table()

table = o.get_table(table_name)  #.to_df()
print(table.to_df())


def write_data():
    records = []

    # prepare data
    input_file = './part-0'
    with open(input_file, 'r') as f:
        for line in f:
            example = []

            features = line.rstrip('\n').split('\t')
            label = int(features[0])
            example.append(label)
Code example #22
    3. Usage: python script.py <source_project_name> <target_project_name>
'''

s = ODPS('',
         '',
         '%s' % sys.argv[1],
         endpoint='http://service.cn.maxcompute.aliyun.com/api')
d = ODPS('',
         '',
         '%s' % sys.argv[2],
         endpoint='http://service.cn.maxcompute.aliyun.com/api')

print("######################################################################")

for table in s.list_tables():
    t1 = s.get_table(table.name)
    if d.exist_table(table.name):
        t2 = d.get_table(table.name)
    else:
        print("表%s 在目标项目%s中不存在 跳过校验" % (table.name, sys.argv[2]))
        continue

    if table.schema.partitions:  # check whether the table is partitioned
        #print 'Table %s is partitioned.' %table.name
        for partition in table.partitions:
            #print partition.name
            with t1.open_reader(partition='%s' % partition.name) as reader:
                count1 = reader.count
                #print "表名:%s\t分区:%s\t数据量:%s" %(table.name,partition.name,count1)
            if t2.exist_partition(partition.name):
                with t2.open_reader(partition='%s' %
Code example #23
File: changkou.py Project: smartdongwei/pythonstudy
        #txt_csv(txtName)

    #for csvName in os.listdir(csvFile):
        #txt_upload_odps(csvName)

    #for jpgName in os.listdir(photoPath):
    allJpgList = [photoPath + name for name in os.listdir(photoPath)]
    try:  # upload step
        pool11 = tp.ThreadPool(10)
        requests11 = makeRequests(jpg_Upload, allJpgList)
        [pool11.putRequest(req) for req in requests11]
        pool11.wait()
    except:
        traceback.print_exc()



if __name__ == '__main__':
    tableName = ''
    partiton = time.strftime("%Y%m%d", time.localtime())
    table_obj = cOdps.get_table(tableName, 'kunlun')  # create the ODPS connection
    cEndpoint = ''
    cAuth = oss2.Auth()
    cBucket = oss2.Bucket(cAuth, cEndpoint, '')
    try:
        table_obj.create_partition('dt=' + partiton, if_not_exists=True)  # create the partition
    except:
        print('Failed to create partition')

    main()
Code example #24
    def tile(cls, op):
        import numpy as np
        import pandas as pd
        from odps import ODPS
        from odps.accounts import BearerTokenAccount
        from cupid import CupidSession, context
        from mars.context import get_context

        cupid_ctx = context()
        if cupid_ctx is None:
            raise SystemError(
                'No Mars cluster found, please create via `o.create_mars_cluster`.'
            )

        bearer_token = cupid_ctx.get_bearer_token()
        account = BearerTokenAccount(bearer_token)
        project = os.environ.get('ODPS_PROJECT_NAME', None)
        odps_params = op.odps_params.copy()
        if project:
            odps_params['project'] = project
        o = ODPS(None, None, account=account, **odps_params)
        cupid_session = CupidSession(o)

        mars_context = get_context()

        df = op.outputs[0]
        split_size = df.extra_params.chunk_bytes or READ_CHUNK_LIMIT

        data_src = o.get_table(op.table_name)
        if op.partition is not None:
            data_src = data_src.get_partition(op.partition)

        try:
            data_store_size = data_src.size
        except ODPSError:
            # fail to get data size, just ignore
            pass
        else:
            if data_store_size < split_size and mars_context is not None:
                # get worker counts
                worker_count = max(len(mars_context.get_worker_addresses()), 1)
                # data is too small, split as many as number of cores
                split_size = data_store_size // worker_count
                # at least 1M
                split_size = max(split_size, 1 * 1024**2)
                logger.debug(
                    'Input data size is too small, split_size is {}'.format(
                        split_size))

        logger.debug(
            'Start creating download session of table {} from cupid.'.format(
                op.table_name))
        while True:
            try:
                download_session = cupid_session.create_download_session(
                    data_src, split_size=split_size, columns=op.columns)
                break
            except CupidError:
                logger.debug(
                    'The number of splits exceeds 100000, split_size is {}'.
                    format(split_size))
                if split_size >= MAX_CHUNK_SIZE:
                    raise
                else:
                    split_size *= 2

        logger.debug('%s table splits have been created.',
                     str(len(download_session.splits)))

        if np.isnan(df.shape[0]):
            est_chunk_rows = [None] * len(download_session.splits)
        else:
            sp_file_sizes = np.array([
                sp.split_file_end - sp.split_file_start
                for sp in download_session.splits
            ])
            total_size = sp_file_sizes.sum()
            est_chunk_rows = sp_file_sizes * df.shape[0] // total_size

        logger.warning('Estimated chunk rows: %r', est_chunk_rows)

        out_chunks = []
        # Ignore add_offset at this time.
        op._add_offset = False

        if len(download_session.splits) == 0:
            logger.debug('Table {} has no data'.format(op.table_name))
            chunk_op = DataFrameReadTableSplit()
            index_value = parse_index(pd.RangeIndex(0))
            columns_value = parse_index(df.dtypes.index, store_data=True)
            out_chunk = chunk_op.new_chunk(None,
                                           shape=(np.nan, df.shape[1]),
                                           dtypes=op.dtypes,
                                           index_value=index_value,
                                           columns_value=columns_value,
                                           index=(0, 0))
            out_chunks = [out_chunk]
        else:
            for idx, split in enumerate(download_session.splits):
                chunk_op = DataFrameReadTableSplit(
                    cupid_handle=to_str(split.handle),
                    split_index=split.split_index,
                    split_file_start=split.split_file_start,
                    split_file_end=split.split_file_end,
                    schema_file_start=split.schema_file_start,
                    schema_file_end=split.schema_file_end,
                    add_offset=op.add_offset,
                    dtypes=op.dtypes,
                    sparse=op.sparse,
                    split_size=split_size,
                    use_arrow_dtype=op.use_arrow_dtype,
                    estimate_rows=est_chunk_rows[idx])
                # the chunk shape is unknown
                index_value = parse_index(pd.RangeIndex(0))
                columns_value = parse_index(df.dtypes.index, store_data=True)
                out_chunk = chunk_op.new_chunk(None,
                                               shape=(np.nan, df.shape[1]),
                                               dtypes=op.dtypes,
                                               index_value=index_value,
                                               columns_value=columns_value,
                                               index=(idx, 0))
                out_chunks.append(out_chunk)

        if op.add_offset:
            out_chunks = standardize_range_index(out_chunks)

        new_op = op.copy()
        nsplits = ((np.nan, ) * len(out_chunks), (df.shape[1], ))
        return new_op.new_dataframes(None,
                                     shape=df.shape,
                                     dtypes=op.dtypes,
                                     index_value=df.index_value,
                                     columns_value=df.columns_value,
                                     chunks=out_chunks,
                                     nsplits=nsplits)
Code example #25
File: 913Upload.py Project: smartdongwei/pythonstudy
def rename(files,scan_path):
    for filename in files:
        portion = os.path.splitext(filename)
        if portion[1]==".bcp":
            newname=portion[0]+".txt"
            print(newname)
            print(files)
            os.chdir(scan_path)
            os.rename(filename,newname)



# main function
if __name__ == '__main__':
    ftp = ftpconnect("","","")
    downloadfile(ftp,"/data1/913data/")
    for i in range(3):
        data_arr = []
        table_obj = cOdps.get_table(ExpressArr[i][1], 'kunlun_dev')
        partition = time.strftime("%Y%m%d", time.localtime())
        try:
            if partition not in data_arr:
                table_obj.create_partition('dt=' + partition,if_not_exists=True)
                data_arr.append(partition)
        except:
            print('%s: failed to create partition' % ExpressArr[0][1])
    files = os.listdir(scan_path)
    rename(files,scan_path)
    express_handle(ExpressArr)

Code example #26
File: maxcompute.py Project: zlb1028/sqlflow
class MaxComputeConnection(Connection):
    """MaxCompute connection, this class uses ODPS object to establish
    connection with maxcompute

    Args:
        conn_uri: uri in format:
        maxcompute://access_id:access_key@service.com/api?curr_project=test_ci&scheme=http
    """
    def __init__(self, conn_uri):
        super(MaxComputeConnection, self).__init__(conn_uri)
        user, pwd, endpoint, proj = MaxComputeConnection.get_uri_parts(
            conn_uri)
        self.driver = "maxcompute"
        self.params["database"] = proj
        self.endpoint = endpoint
        self._conn = ODPS(user, pwd, project=proj, endpoint=endpoint)

    @staticmethod
    def get_uri_parts(uri):
        """Get username, password, endpoint, projectfrom given uri

        Args:
            uri: a valid maxcompute connection uri

        Returns:
            A tuple (username, password, endpoint, project)
        """
        uripts = urlparse(uri)
        params = parse_qs(uripts.query)
        # compose an endpoint, only keep the host and path and replace scheme
        endpoint = uripts._replace(scheme=params.get("scheme", ["http"])[0],
                                   query="",
                                   netloc=uripts.hostname)
        endpoint = endpoint.geturl()
        return (uripts.username, uripts.password, endpoint,
                params.get("curr_project", [""])[0])

    def _get_result_set(self, statement):
        try:
            instance = self._conn.execute_sql(statement)
            return MaxComputeResultSet(instance)
        except Exception as e:
            raise e

    def close(self):
        if self._conn:
            self._conn = None

    def get_table_schema(self, table_name):
        schema = self._conn.get_table(table_name).schema
        return [(c.name, str(c.type).upper()) for c in schema.columns]

    def persist_table(self, table):
        sql = "ALTER TABLE %s DISABLE LIFECYCLE;" % table
        self.execute(sql)

    def write_table(self,
                    table_name,
                    rows,
                    compress_option=COMPRESS_ODPS_ZLIB):
        """Append rows to given table, this is a driver specific api

        Args:
            table_name: the table to write
            rows: list of rows, each row is a data tuple,
                like [(1,True,"ok"),(2,False,"bad")]
            compress_options: the compress options defined in
                tunnel.CompressOption.CompressAlgorithm
        """
        self._conn.write_table(table_name,
                               rows,
                               compress_option=compress_option)
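
A hedged usage sketch for MaxComputeConnection, assuming the surrounding sqlflow Connection base class is importable; the URI below is a placeholder in the format documented in the docstring:

# Hypothetical URI; credentials, host and project are placeholders.
uri = ('maxcompute://my_access_id:my_access_key'
       '@service.cn.maxcompute.aliyun.com/api?curr_project=my_project&scheme=http')
conn = MaxComputeConnection(uri)
print(conn.get_table_schema('my_table'))  # [(column_name, 'TYPE'), ...]
conn.write_table('my_table', [(1, True, 'ok'), (2, False, 'bad')])
conn.close()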
Code example #27
class ODPSWriter(object):
    def __init__(
        self,
        project,
        access_id,
        access_key,
        endpoint,
        table,
        columns=None,
        column_types=None,
        options=None,
    ):
        """
        Constructs a `ODPSWriter` instance.

        Args:
            project: Name of the ODPS project.
            access_id: ODPS user access ID.
            access_key: ODPS user access key.
            endpoint: ODPS cluster endpoint.
            table: ODPS table name.
            columns: The list of column names in the table,
                which will be inferred if the table exists.
            column_types: The list of column types in the table,
                which will be inferred if the table exists.
            options: Other options passed to ODPS context.
        """
        super(ODPSWriter, self).__init__()

        if table.find(".") > 0:
            project, table = table.split(".")
        if options is None:
            options = {}
        self._project = project
        self._access_id = access_id
        self._access_key = access_key
        self._endpoint = endpoint
        self._table = table
        self._columns = columns
        self._column_types = column_types
        self._odps_table = None
        _configure_odps_options(self._endpoint, options)
        self._odps_client = ODPS(self._access_id, self._access_key,
                                 self._project, self._endpoint)

    def _initialize_table(self):
        if self._odps_client.exist_table(self._table, self._project):
            self._odps_table = self._odps_client.get_table(
                self._table, self._project)
        else:
            if self._columns is None or self._column_types is None:
                raise ValueError("columns and column_types need to be "
                                 "specified for non-existing table.")
            schema = Schema.from_lists(self._columns, self._column_types,
                                       ["worker"], ["string"])
            self._odps_table = self._odps_client.create_table(
                self._table, schema)

    def from_iterator(self, records_iter, worker_index):
        if self._odps_table is None:
            self._initialize_table()
        with self._odps_table.open_writer(partition="worker=" +
                                          str(worker_index),
                                          create_partition=True) as writer:
            for records in records_iter:
                writer.write(records)
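
A minimal usage sketch for ODPSWriter, mirroring the test cases in examples #13 and #19; credentials, endpoint and table name are placeholders, and the _configure_odps_options helper referenced in __init__ is assumed to be importable:

writer = ODPSWriter(
    'my_project',
    'my_access_id',
    'my_access_key',
    'http://service.cn.maxcompute.aliyun.com/api',
    'test_odps_writer_table',
    columns=['num', 'num2'],
    column_types=['bigint', 'double'],
)
writer.from_iterator(iter([[1, 0.5], [2, 0.6]]), worker_index=0)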
Code example #28
from odps import ODPS
o = ODPS('accesskey',
         'xxxxxx',
         project='DWLabClone_67226234odps',
         endpoint='http://service.cn-shanghai.maxcompute.aliyun.com/api')

project = o.get_project()

print(project)
for table in o.list_tables():
    print(table)

t = o.get_table('ods_user_info_d')
for record in t.head(3):
    print(record)

print(next(t.partitions))

with t.open_reader(partition="dt=20210706") as reader:
    count = reader.count
    print(count)
    for record in reader[5:7]:
        print(record)
        print(type(record))
        print(record['uid'])
        print(record['age_range'])

print('*' * 100)
t_rpt = o.get_table('rpt_user_info_d')
# for record in t_rpt.head(3):
#     print(record)
Code example #29
# encoding=utf-8
from odps import ODPS
import pymysql

odps = ODPS('LTAI2wkz5kLt3205','RfTGzh2dfoBljs3ZZKwfpYw6OK9KYX','ofo')
project = odps.get_project()
print(project)

print(project.__getstate__())
t = odps.get_table('ofo_t_puser_partition')


with odps.execute_sql('select id from ofo_t_puser_partition where oauth in (1)').open_reader() as reader:
    
    # Structured results; sample output:
    #
    #     odps.Record {
    #       id  90637
    #     }
    #     odps.Record {
    #       id  90640
    #     }
    #
    for record in reader:  # iterate over structured records
        print(record)

    # raw SQL result
    print(reader)
Code example #30
    def _tile_tunnel(cls, op):
        from odps import ODPS

        project = os.environ.get('ODPS_PROJECT_NAME', None)
        odps_params = op.odps_params.copy()
        if project:
            odps_params['project'] = project
        endpoint = os.environ.get(
            'ODPS_RUNTIME_ENDPOINT') or odps_params['endpoint']
        o = ODPS(odps_params['access_id'],
                 odps_params['secret_access_key'],
                 project=odps_params['project'],
                 endpoint=endpoint)

        table_obj = o.get_table(op.table_name)
        if not table_obj.schema.partitions:
            data_srcs = [table_obj]
        elif op.partition is not None and check_partition_exist(
                table_obj, op.partition):
            data_srcs = [table_obj.get_partition(op.partition)]
        else:
            data_srcs = list(table_obj.partitions)
            if op.partition is not None:
                data_srcs = filter_partitions(o, data_srcs, op.partition)

        out_chunks = []
        row_nsplits = []
        index_start = 0
        df = op.outputs[0]

        out_dtypes = df.dtypes
        out_shape = df.shape
        out_columns_value = df.columns_value
        if op.columns is not None:
            out_dtypes = out_dtypes[op.columns]
            out_shape = (df.shape[0], len(op.columns))
            out_columns_value = parse_index(out_dtypes.index, store_data=True)

        if len(data_srcs) == 0:
            # no partitions are selected
            chunk_op = DataFrameReadTableSplit()
            index_value = parse_index(pd.RangeIndex(0))
            columns_value = parse_index(out_dtypes.index, store_data=True)
            out_chunk = chunk_op.new_chunk(None,
                                           shape=(0, out_shape[1]),
                                           dtypes=op.dtypes,
                                           index_value=index_value,
                                           columns_value=columns_value,
                                           index=(index_start, 0))
            out_chunks.append(out_chunk)
        else:
            retry_times = op.retry_times or options.retry_times
            for data_src in data_srcs:
                data_store_size = data_src.size

                retries = 0
                while True:
                    try:
                        with data_src.open_reader() as reader:
                            record_count = reader.count
                        break
                    except:
                        if retries >= retry_times:
                            raise
                        retries += 1
                        time.sleep(1)
                if data_store_size == 0:
                    # empty table
                    chunk_op = DataFrameReadTableSplit()
                    index_value = parse_index(pd.RangeIndex(0))
                    columns_value = parse_index(out_dtypes.index,
                                                store_data=True)
                    out_chunk = chunk_op.new_chunk(None,
                                                   shape=(0, out_shape[1]),
                                                   dtypes=op.dtypes,
                                                   index_value=index_value,
                                                   columns_value=columns_value,
                                                   index=(index_start, 0))
                    out_chunks.append(out_chunk)
                    index_start += 1
                    continue
                chunk_size = df.extra_params.chunk_size

                partition_spec = str(data_src.partition_spec) \
                    if getattr(data_src, 'partition_spec', None) else None

                if chunk_size is None:
                    chunk_bytes = df.extra_params.chunk_bytes or CHUNK_BYTES_LIMIT
                    chunk_count = data_store_size // chunk_bytes + (
                        data_store_size % chunk_bytes != 0)
                    chunk_size = ceildiv(record_count, chunk_count)
                    split_size = chunk_bytes
                else:
                    chunk_count = ceildiv(record_count, chunk_size)
                    split_size = data_store_size // chunk_count

                for i in range(chunk_count):
                    start_index = chunk_size * i
                    end_index = min(chunk_size * (i + 1), record_count)
                    row_size = end_index - start_index
                    chunk_op = DataFrameReadTableSplit(
                        table_name=op.table_name,
                        partition_spec=partition_spec,
                        start_index=start_index,
                        end_index=end_index,
                        nrows=op.nrows,
                        odps_params=op.odps_params,
                        columns=op.columns,
                        incremental_index=op.incremental_index,
                        dtypes=out_dtypes,
                        sparse=op.sparse,
                        split_size=split_size,
                        use_arrow_dtype=op.use_arrow_dtype,
                        estimate_rows=row_size,
                        append_partitions=op.append_partitions,
                        memory_scale=op.memory_scale,
                        retry_times=op.retry_times,
                        extra_params=op.extra_params)
                    index_value = parse_index(
                        pd.RangeIndex(start_index, end_index))
                    columns_value = parse_index(out_dtypes.index,
                                                store_data=True)
                    out_chunk = chunk_op.new_chunk(None,
                                                   shape=(row_size,
                                                          out_shape[1]),
                                                   dtypes=out_dtypes,
                                                   index_value=index_value,
                                                   columns_value=columns_value,
                                                   index=(index_start + i, 0))
                    row_nsplits.append(row_size)
                    out_chunks.append(out_chunk)

                index_start += chunk_count

        if op.incremental_index and _NEED_STANDARDIZE:
            out_chunks = standardize_range_index(out_chunks)

        new_op = op.copy()
        nsplits = (tuple(row_nsplits), (out_shape[1], ))
        return new_op.new_dataframes(None,
                                     shape=out_shape,
                                     dtypes=op.dtypes,
                                     index_value=df.index_value,
                                     columns_value=out_columns_value,
                                     chunks=out_chunks,
                                     nsplits=nsplits)