class OdpsConn(object):
    """ODPS connection wrapper."""

    def __init__(self, project):
        self.access_id = ODPSCONF.key
        self.access_key = ODPSCONF.sec
        self.project = project
        self.odps = None

    def __enter__(self):
        try:
            self.odps = ODPS(self.access_id, self.access_key, self.project)
        except Exception as exc:
            raise ValueError(str(exc))
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        # nothing to close explicitly; drop the client reference
        self.odps = None

    def get_table_count_and_names(self):
        """Return the number of tables in the project and their names."""
        tables = self.odps.list_tables()
        names = [table.name for table in tables]
        count = len(names)
        return count, names

    def get_table_schema(self, tname):
        """Return the column names of a table."""
        table = self.odps.get_table(tname)
        columns = table.schema.columns
        return [item.name for item in columns]

    def execute_sql(self, sql):
        rest = []
        with self.odps.execute_sql(sql).open_reader() as reader:
            for record in reader:
                rest.append(record.values)
        return rest

    def get_table_last_update_time(self, tname):
        t = self.odps.get_table(tname)
        return t.last_modified_time if t else None

    def count_table(self, table):
        sql = 'select count(1) from %s' % table
        with self.odps.execute_sql(sql).open_reader() as reader:
            return reader[0].values[0]
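# A minimal usage sketch for the context manager above; 'my_project' and the
# calls below are illustrative, and ODPSCONF is assumed to hold valid credentials.
if __name__ == '__main__':
    with OdpsConn('my_project') as conn:
        count, names = conn.get_table_count_and_names()
        print(count, names[:5])
        if names:
            print(conn.get_table_schema(names[0]))
            print(conn.count_table(names[0]))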
def _execute_in_cupid(cls, ctx, op):
    import os

    import pandas as pd
    from odps import ODPS
    from odps.accounts import BearerTokenAccount

    cupid_client = CupidServiceClient()
    to_store_data = ctx[op.inputs[0].key]

    bearer_token = cupid_client.get_bearer_token()
    account = BearerTokenAccount(bearer_token)
    project = os.environ.get('ODPS_PROJECT_NAME', None)
    odps_params = op.odps_params.copy()
    if project:
        odps_params['project'] = project
    endpoint = os.environ.get(
        'ODPS_RUNTIME_ENDPOINT') or odps_params['endpoint']
    o = ODPS(None, None, account=account, project=odps_params['project'],
             endpoint=endpoint)

    odps_schema = o.get_table(op.table_name).schema
    project_name, table_name = op.table_name.split('.')
    writer_config = dict(_table_name=table_name, _project_name=project_name,
                         _table_schema=odps_schema,
                         _partition_spec=op.partition_spec,
                         _block_id=op.block_id, _handle=op.cupid_handle)
    cupid_client.write_table_data(writer_config, to_store_data,
                                  op.write_batch_size)
    ctx[op.outputs[0].key] = pd.DataFrame()
def tile(cls, op):
    from odps import ODPS
    from odps.accounts import BearerTokenAccount
    from cupid import CupidSession, context

    bearer_token = context().get_bearer_token()
    account = BearerTokenAccount(bearer_token)
    o = ODPS(None, None, account=account, **op.odps_params)
    cupid_session = CupidSession(o)

    data_src = o.get_table(op.table_name)

    logger.debug('Start creating upload session from cupid.')
    upload_session = cupid_session.create_upload_session(data_src)

    input_df = op.inputs[0]

    out_chunks = []
    out_chunk_shape = (0,) * len(input_df.shape)
    blocks = {}
    for chunk in input_df.chunks:
        block_id = str(int(time.time())) + '_' + str(uuid.uuid4()).replace('-', '')
        chunk_op = DataFrameWriteTableSplit(
            dtypes=op.dtypes, table_name=op.table_name,
            partition_spec=op.partition_spec,
            cupid_handle=to_str(upload_session.handle),
            block_id=block_id, write_batch_size=op.write_batch_size)
        out_chunk = chunk_op.new_chunk([chunk], shape=out_chunk_shape,
                                       index=chunk.index, dtypes=chunk.dtypes)
        out_chunks.append(out_chunk)
        blocks[block_id] = op.partition_spec

    # build commit tree; keep combining until fewer than `combine_size`
    # chunks remain, which is what the assertion below requires
    combine_size = 8
    chunks = out_chunks
    while len(chunks) >= combine_size:
        new_chunks = []
        for i in range(0, len(chunks), combine_size):
            chks = chunks[i: i + combine_size]
            if len(chks) == 1:
                chk = chks[0]
            else:
                chk_op = DataFrameWriteTableCommit(dtypes=op.dtypes, is_terminal=False)
                chk = chk_op.new_chunk(chks, shape=out_chunk_shape, dtypes=op.dtypes)
            new_chunks.append(chk)
        chunks = new_chunks

    assert len(chunks) < combine_size

    commit_table_op = DataFrameWriteTableCommit(
        dtypes=op.dtypes, table_name=op.table_name, blocks=blocks,
        cupid_handle=to_str(upload_session.handle), overwrite=op.overwrite,
        odps_params=op.odps_params, is_terminal=True)
    commit_table_chunk = commit_table_op.new_chunk(chunks, shape=out_chunk_shape,
                                                   dtypes=op.dtypes)

    out_df = op.outputs[0]
    new_op = op.copy()
    return new_op.new_dataframes(op.inputs, shape=out_df.shape, dtypes=out_df.dtypes,
                                 chunks=[commit_table_chunk],
                                 nsplits=((0,),) * len(out_chunk_shape))
def _get_table_schema(self):
    odps_client = ODPS(
        access_id=self._kwargs["access_id"],
        secret_access_key=self._kwargs["access_key"],
        project=self._kwargs["project"],
        endpoint=self._kwargs.get("endpoint"),
    )
    odps_table = odps_client.get_table(self._kwargs["table"])
    return odps_table.schema
def _execute_arrow_tunnel(cls, ctx, op):
    from odps import ODPS
    from odps.tunnel import TableTunnel
    import pyarrow as pa
    import pandas as pd

    project = os.environ.get('ODPS_PROJECT_NAME', None)
    odps_params = op.odps_params.copy()
    if project:
        odps_params['project'] = project
    endpoint = os.environ.get(
        'ODPS_RUNTIME_ENDPOINT') or odps_params['endpoint']
    o = ODPS(odps_params['access_id'], odps_params['secret_access_key'],
             project=odps_params['project'], endpoint=endpoint)

    t = o.get_table(op.table_name)
    tunnel = TableTunnel(o, project=t.project)
    retry_times = options.retry_times
    retries = 0
    while True:
        try:
            if op.partition_spec is not None:
                upload_session = tunnel.create_upload_session(
                    t.name, partition_spec=op.partition_spec)
            else:
                upload_session = tunnel.create_upload_session(t.name)
            break
        except:
            if retries >= retry_times:
                raise
            # count this attempt so the retry budget is finite
            retries += 1
            time.sleep(1)

    logger.debug('Start writing table %s split index: %s',
                 op.table_name, op.inputs[0].index)
    retries = 0
    while True:
        try:
            writer = upload_session.open_arrow_writer(0)
            arrow_rb = pa.RecordBatch.from_pandas(ctx[op.inputs[0].key])
            writer.write(arrow_rb)
            writer.close()
            break
        except:
            if retries >= retry_times:
                raise
            retries += 1
            time.sleep(1)

    upload_session.commit([0])
    logger.debug('Finish writing table %s split index: %s',
                 op.table_name, op.inputs[0].index)
    ctx[op.outputs[0].key] = pd.DataFrame()
def _execute_arrow_tunnel(cls, ctx, op):
    from odps import ODPS
    from odps.tunnel import TableTunnel

    project = os.environ.get('ODPS_PROJECT_NAME', None)
    odps_params = op.odps_params.copy()
    if project:
        odps_params['project'] = project
    endpoint = os.environ.get(
        'ODPS_RUNTIME_ENDPOINT') or odps_params['endpoint']
    o = ODPS(odps_params['access_id'], odps_params['secret_access_key'],
             project=odps_params['project'], endpoint=endpoint)

    t = o.get_table(op.table_name)
    tunnel = TableTunnel(o, project=t.project)
    if op.partition_spec is not None:
        download_session = tunnel.create_download_session(
            t.name, partition_spec=op.partition_spec)
    else:
        download_session = tunnel.create_download_session(t.name)

    logger.debug('Start reading table %s(%s) split from %s to %s',
                 op.table_name, op.partition_spec,
                 op.start_index, op.end_index)
    if op.nrows is None:
        count = op.end_index - op.start_index
    else:
        count = op.nrows
    with download_session.open_arrow_reader(op.start_index, count,
                                            columns=op.columns) as reader:
        table = reader.read()

    table = cls._append_partition_values(table, op)
    if op.string_as_binary:
        table = cls._cast_string_to_binary(table)
    data = arrow_table_to_pandas_dataframe(
        table, use_arrow_dtype=op.use_arrow_dtype)
    data = cls._align_columns(data, op.outputs[0].dtypes)

    logger.debug('Finish reading table %s(%s) split from %s to %s',
                 op.table_name, op.partition_spec,
                 op.start_index, op.end_index)
    ctx[op.outputs[0].key] = data
def _handle_enum_table_partitions(sock):
    try:
        cmd_len, = struct.unpack('<I', sock.recv(4))
        # dict with odps_params, table_name, partition
        task_config = pickle.loads(sock.recv(cmd_len))

        from odps import ODPS
        from odps.accounts import BearerTokenAccount
        from cupid import context

        cupid_ctx = context()

        odps_params = task_config['odps_params']
        bearer_token = cupid_ctx.get_bearer_token()
        account = BearerTokenAccount(bearer_token)
        project = os.environ.get('ODPS_PROJECT_NAME', None) or odps_params['project']
        endpoint = os.environ.get(
            'ODPS_RUNTIME_ENDPOINT') or odps_params['endpoint']
        o = ODPS(None, None, account=account, project=project, endpoint=endpoint)

        table = o.get_table(task_config['table_name'])
        partition_desc = task_config.get('partition')
        if not table.schema.partitions:
            _write_request_result(sock, result=None)
        elif partition_desc:
            if check_partition_exist(table, partition_desc):
                _write_request_result(sock, result=[partition_desc])
            else:
                parts = filter_partitions(o, list(table.partitions), partition_desc)
                _write_request_result(
                    sock, result=[str(pt.partition_spec) for pt in parts])
        else:
            _write_request_result(
                sock, result=[str(pt.partition_spec) for pt in table.partitions])
    except:
        logger.exception('Failed to enumerate table partitions')
        _write_request_result(sock, False, exc_info=sys.exc_info())
def _handle_create_table_upload_session(sock):
    try:
        cmd_len, = struct.unpack('<I', sock.recv(4))
        # dict with odps_params, table_name
        session_config = pickle.loads(sock.recv(cmd_len))

        from odps import ODPS
        from odps.accounts import BearerTokenAccount
        from cupid import CupidSession, context
        from cupid.runtime import RuntimeContext

        if not RuntimeContext.is_context_ready():
            raise SystemError(
                'No Mars cluster found, please create via `o.create_mars_cluster`.'
            )
        cupid_ctx = context()

        odps_params = session_config['odps_params']
        bearer_token = cupid_ctx.get_bearer_token()
        account = BearerTokenAccount(bearer_token)
        project = os.environ.get('ODPS_PROJECT_NAME', None) or odps_params['project']
        endpoint = os.environ.get(
            'ODPS_RUNTIME_ENDPOINT') or odps_params['endpoint']
        o = ODPS(None, None, account=account, project=project, endpoint=endpoint)
        cupid_session = CupidSession(o)

        data_src = o.get_table(session_config['table_name'])

        logger.debug('Start creating upload session from cupid.')
        upload_session = cupid_session.create_upload_session(data_src)

        ret_data = {
            'handle': upload_session.handle,
        }
        _write_request_result(sock, result=ret_data)
    except:
        logger.exception('Failed to create upload session')
        _write_request_result(sock, False, exc_info=sys.exc_info())
def uploadexcel(input_file, output_table_n='default'):
    odps = ODPS(ACCESS_KEY_ID, ACCESS_KEY_SECRET, PROJECT,
                endpoint='http://service.odps.aliyun.com/api')
    project = odps.get_project()  # get the default project
    print(project)
    ds = datetime.datetime.now().strftime('%Y%m%d')
    print(ds)
    wb = openpyxl.load_workbook(filename=input_file, read_only=True)
    ws = wb.active
    print(datetime.datetime.now())

    output_table = odps.get_table(output_table_n)
    if output_table.exist_partition('ds=' + ds):
        output_table.delete_partition('ds=' + ds)
    output_table.create_partition('ds=' + ds, if_not_exists=True)

    tunnel = TableTunnel(odps)
    upload_session = tunnel.create_upload_session(output_table.name,
                                                  partition_spec='ds=' + ds)
    print('output into', output_table_n, 'partition ds=', ds, ':\n',
          output_table.schema)

    index = 0
    with upload_session.open_record_writer(0) as writer:
        for row in ws.rows:
            records = output_table.new_record()
            i = 0
            for cell in row:
                # empty cells carry a value of None
                if cell.value is None:
                    records[i] = None
                else:
                    records[i] = str(cell.value).encode('utf-8', "replace")
                i = i + 1
            writer.write(records)
            index = index + 1
            if index % 1000 == 0:
                print(index)
    upload_session.commit(0)
    print('===========')
    print(datetime.datetime.now())
def data_to_local():
    # quant_financial_report_balance_sheet: balance sheet
    o = ODPS('***', '***', 'gravity_quant')
    balance_sheet = DataFrame(
        o.get_table('quant_financial_report_balance_sheet'))
    balance_sheet = balance_sheet.to_pandas()
    balance_sheet.to_csv('financial_report_balance_sheet.csv', index=False)

    # quant_financial_report_profitloss: income statement
    o = ODPS('***', '***', 'gravity_quant')
    profitloss_sheet = DataFrame(
        o.get_table('quant_financial_report_profitloss'))
    profitloss_sheet = profitloss_sheet.to_pandas()
    profitloss_sheet.to_csv('quant_financial_report_profitloss.csv', index=False)

    # quant_financial_report_cashflows_statement: cash flow statement
    o = ODPS('***', '***', 'gravity_quant')
    cashflows_sheet = DataFrame(
        o.get_table('quant_financial_report_cashflows_statement'))
    cashflows_sheet = cashflows_sheet.to_pandas()
    cashflows_sheet.to_csv('quant_financial_report_cashflows_statement.csv',
                           index=False)

    # quant_financial_analysis_report: financial analysis report
    o = ODPS('***', '***', 'gravity_quant')
    analysis_sheet = DataFrame(o.get_table('quant_financial_analysis_report'))
    analysis_sheet = analysis_sheet.to_pandas()
    analysis_sheet.to_csv('quant_financial_analysis_report.csv', index=False)

    # market_values_end: market value data
    o = ODPS('***', '***', 'gravity_quant')
    market_value = DataFrame(o.get_table('tdl_huangjin_market_values_all'))
    market_value = market_value.to_pandas()
    print(market_value.shape)
    market_value.drop_duplicates(subset=['code', 'pt'], keep='first',
                                 inplace=True)
    market_value = market_value.dropna(subset=['code', 'pt', 'total_value'])
    market_value.to_csv('market_values_end.csv', index=False)

    # industry information data
    o = ODPS('***', '***', 'gravity_quant')
    quant_stocks_industry_info = DataFrame(
        o.get_table('quant_stocks_industry_info'))
    quant_stocks_industry_info = quant_stocks_industry_info.to_pandas()
    a = quant_stocks_industry_info['industry_sw'].str.split(
        '-', expand=True).add_prefix('level_')
    stocks_info = pd.concat([quant_stocks_industry_info, a], axis=1)
    stocks_info.to_csv('stocks_info.csv', index=False)
def run(args):
    # build an odps instance
    o = ODPS(args.odps_access_id, args.odps_access_key, args.odps_project,
             endpoint=args.odps_endpoint)

    input_table_project = args.odps_project
    input_table_name = args.input_table_name
    if '.' in input_table_name:
        input_table_project = args.input_table_name.split(".")[0]
        input_table_name = args.input_table_name.split(".")[1]

    # download data from odps
    input_table = o.get_table(input_table_name, project=input_table_project)
    data = input_table.to_df().to_pandas()

    # sample data
    new_data = data.sample(args.sample_row_count)

    # create output table and upload data to odps
    o.delete_table(args.output_table_name, if_exists=True)
    output_table_project = args.odps_project
    output_table_name = args.output_table_name
    if '.' in output_table_name:
        output_table_project = args.output_table_name.split(".")[0]
        output_table_name = args.output_table_name.split(".")[1]
    table = o.create_table(output_table_name, input_table.schema,
                           project=output_table_project, if_not_exists=False,
                           lifecycle=3)
    o.write_table(output_table_name, new_data.values.tolist(),
                  project=output_table_project)
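# A minimal sketch of how run() might be invoked from the command line; the
# flag names mirror the attributes used above, but this CLI layout is an
# assumption, not the original project's entry point.
if __name__ == '__main__':
    import argparse

    parser = argparse.ArgumentParser(description='Sample rows from an ODPS table')
    parser.add_argument('--odps_access_id', required=True)
    parser.add_argument('--odps_access_key', required=True)
    parser.add_argument('--odps_project', required=True)
    parser.add_argument('--odps_endpoint', required=True)
    parser.add_argument('--input_table_name', required=True)
    parser.add_argument('--output_table_name', required=True)
    parser.add_argument('--sample_row_count', type=int, default=1000)
    run(parser.parse_args())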
from odps import ODPS
from odps.df import DataFrame
import pandas as pd
import xlwt

"""
your-access-id: the AccessKey ID of the account.
your-secret-access-key: the AccessKey Secret of the account.
your-default-project: the name of the project to use.
your-end-point: the endpoint of the region where the MaxCompute service is located.
"""
o = ODPS('**your-access-id**', '**your-secret-access-key**',
         '**your-default-project**', endpoint='**your-end-point**')

t = o.get_table('entry_range_e_info_result')
datas1 = DataFrame(t)

a = []
reader = t.open_reader()
count = reader.count
print(count)
# iterate over rows and columns
for record in reader[0:count]:
    one = []
    for col in t.schema.names:
        one.append(record[col])
    a.append(one)

data1 = pd.DataFrame(a, columns=datas1.schema.names)
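# The xlwt import above suggests the rows are meant to end up in an Excel
# file; one possible way to finish the job is shown below. The output file
# name is illustrative, and writing .xls via pandas relies on an xlwt-capable
# pandas version.
data1.to_excel('entry_range_e_info_result.xls', index=False)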
class ODPSIOTest(unittest.TestCase): def setUp(self): self._project = os.environ[ODPSConfig.PROJECT_NAME] self._access_id = os.environ[ODPSConfig.ACCESS_ID] self._access_key = os.environ[ODPSConfig.ACCESS_KEY] self._endpoint = os.environ.get(ODPSConfig.ENDPOINT) self._test_read_table = "test_odps_reader_%d_%d" % ( int(time.time()), random.randint(1, 101), ) self._test_write_table = "test_odps_writer_%d_%d" % ( int(time.time()), random.randint(1, 101), ) self._odps_client = ODPS(self._access_id, self._access_key, self._project, self._endpoint) create_iris_odps_table(self._odps_client, self._project, self._test_read_table) def test_read_to_iterator(self): reader = ODPSReader( self._project, self._access_id, self._access_key, self._endpoint, self._test_read_table, None, 4, None, ) records_iter = reader.to_iterator(1, 0, 50, 2, False, None) records = list(records_iter) self.assertEqual(len(records), 6, "Unexpected number of batches: %d" % len(records)) flattened_records = [record for batch in records for record in batch] self.assertEqual( len(flattened_records), 220, "Unexpected number of total records: %d" % len(flattened_records), ) def test_write_odps_to_recordio_shards_from_iterator(self): reader = ODPSReader( self._project, self._access_id, self._access_key, self._endpoint, self._test_read_table, None, 4, None, ) records_iter = reader.to_iterator(1, 0, 50, 2, False, None) with tempfile.TemporaryDirectory() as output_dir: write_recordio_shards_from_iterator( records_iter, ["f" + str(i) for i in range(5)], output_dir, records_per_shard=50, ) self.assertEqual(len(os.listdir(output_dir)), 5) def test_write_from_iterator(self): columns = ["num", "num2"] column_types = ["bigint", "double"] # If the table doesn't exist yet writer = ODPSWriter( self._project, self._access_id, self._access_key, self._endpoint, self._test_write_table, columns, column_types, ) writer.from_iterator(iter([[1, 0.5], [2, 0.6]]), 2) table = self._odps_client.get_table(self._test_write_table, self._project) self.assertEqual(table.schema.names, columns) self.assertEqual(table.schema.types, column_types) self.assertEqual(table.to_df().count(), 1) # If the table already exists writer = ODPSWriter( self._project, self._access_id, self._access_key, self._endpoint, self._test_write_table, ) writer.from_iterator(iter([[1, 0.5], [2, 0.6]]), 2) table = self._odps_client.get_table(self._test_write_table, self._project) self.assertEqual(table.schema.names, columns) self.assertEqual(table.schema.types, column_types) self.assertEqual(table.to_df().count(), 2) def tearDown(self): self._odps_client.delete_table(self._test_write_table, self._project, if_exists=True) self._odps_client.delete_table(self._test_read_table, self._project, if_exists=True)
def _execute_arrow_tunnel(cls, ctx, op): from odps import ODPS from odps.tunnel import TableTunnel out = op.outputs[0] if op.table_name is None: # is empty table ctx[out.key] = cls._build_empty_df(out) return project = os.environ.get('ODPS_PROJECT_NAME', None) odps_params = op.odps_params.copy() if project: odps_params['project'] = project endpoint = os.environ.get( 'ODPS_RUNTIME_ENDPOINT') or odps_params['endpoint'] o = ODPS(odps_params['access_id'], odps_params['secret_access_key'], project=odps_params['project'], endpoint=endpoint) t = o.get_table(op.table_name) tunnel = TableTunnel(o, project=t.project) retry_times = op.retry_times or options.retry_times retries = 0 while True: try: if op.partition_spec is not None: download_session = tunnel.create_download_session( t.name, partition_spec=op.partition_spec) else: download_session = tunnel.create_download_session(t.name) break except: if retries >= retry_times: raise retries += 1 time.sleep(1) logger.debug('Start reading table %s(%s) split from %s to %s', op.table_name, op.partition_spec, op.start_index, op.end_index) if op.nrows is None: count = op.end_index - op.start_index else: count = op.nrows retries = 0 while True: try: with download_session.open_arrow_reader( op.start_index, count, columns=op.columns) as reader: table = reader.read() break except: if retries >= retry_times: raise retries += 1 time.sleep(1) table = cls._append_partition_values(table, op) if op.string_as_binary: table = cls._cast_string_to_binary(table) data = arrow_table_to_pandas_dataframe( table, use_arrow_dtype=op.use_arrow_dtype) data = cls._align_output_data(op, data) logger.debug('Finish reading table %s(%s) split from %s to %s', op.table_name, op.partition_spec, op.start_index, op.end_index) ctx[op.outputs[0].key] = data
# -*- coding: utf-8 -*-
"""
Created on Sat Sep 2 20:07:27 2017

@author: shuai.qian
"""
import matplotlib.pyplot as plt
from odps.df import DataFrame
from odps import ODPS

o = ODPS('', project='', endpoint='')
t = DataFrame(o.get_table('tmp_ods_mc_testing_dlt'))

print("=================================> START <==================================")
# print(t.dtypes)
# print(t["class"].head(5))

t.groupby('class').agg(count=t['class'].count())

# %matplotlib inline
t['class'].value_counts().plot(kind='bar', x='class', xlabel='cnt')

tmp = list(range(0, 10, 2))  # range objects have no pop(); materialize a list first
tmp.pop(1)
def tile(cls, op): import numpy as np import pandas as pd from odps import ODPS from odps.accounts import BearerTokenAccount from cupid import CupidSession, context bearer_token = context().get_bearer_token() account = BearerTokenAccount(bearer_token) project = os.environ.get('ODPS_PROJECT_NAME', None) odps_params = op.odps_params.copy() if project: odps_params['project'] = project o = ODPS(None, None, account=account, **odps_params) cupid_session = CupidSession(o) df = op.outputs[0] split_size = df.extra_params.chunk_bytes or CHUNK_LIMIT data_src = o.get_table(op.table_name) if op.partition is not None: data_src = data_src.get_partition(op.partition) logger.debug('Start creating download session from cupid.') while True: try: download_session = cupid_session.create_download_session( data_src, split_size=split_size, columns=op.columns) break except CupidError: logger.debug( 'The number of splits exceeds 100000, split_size is {}'. format(split_size)) if split_size >= MAX_CHUNK_SIZE: raise else: split_size *= 2 logger.debug('%s table splits have been created.', str(len(download_session.splits))) out_chunks = [] # Ignore add_offset at this time. op._add_offset = False for idx, split in enumerate(download_session.splits): chunk_op = DataFrameReadTableSplit( cupid_handle=to_str(split.handle), split_index=split.split_index, split_file_start=split.split_file_start, split_file_end=split.split_file_end, schema_file_start=split.schema_file_start, schema_file_end=split.schema_file_end, add_offset=op.add_offset, dtypes=op.dtypes, sparse=op.sparse) # the chunk shape is unknown index_value = parse_index(pd.RangeIndex(0)) columns_value = parse_index(df.dtypes.index, store_data=True) out_chunk = chunk_op.new_chunk(None, shape=(np.nan, df.shape[1]), dtypes=op.dtypes, index_value=index_value, columns_value=columns_value, index=(idx, 0)) out_chunks.append(out_chunk) if op.add_offset: out_chunks = standardize_range_index(out_chunks) new_op = op.copy() nsplits = ((np.nan, ) * len(out_chunks), (df.shape[1], )) return new_op.new_dataframes(None, shape=df.shape, dtypes=op.dtypes, index_value=df.index_value, columns_value=df.columns_value, chunks=out_chunks, nsplits=nsplits)
def _tile_cupid(cls, op): from odps import ODPS from odps.accounts import BearerTokenAccount from cupid import CupidSession, context from cupid.runtime import RuntimeContext if not RuntimeContext.is_context_ready(): raise SystemError( 'No Mars cluster found, please create via `o.create_mars_cluster`.' ) cupid_ctx = context() bearer_token = cupid_ctx.get_bearer_token() account = BearerTokenAccount(bearer_token) project = os.environ.get('ODPS_PROJECT_NAME', None) odps_params = op.odps_params.copy() if project: odps_params['project'] = project endpoint = os.environ.get( 'ODPS_RUNTIME_ENDPOINT') or odps_params['endpoint'] o = ODPS(None, None, account=account, project=odps_params['project'], endpoint=endpoint) cupid_session = CupidSession(o) data_src = o.get_table(op.table_name) logger.debug('Start creating upload session from cupid.') upload_session = cupid_session.create_upload_session(data_src) input_df = build_concatenated_rows_frame(op.inputs[0]) out_df = op.outputs[0] out_chunks = [] out_chunk_shape = (0, ) * len(input_df.shape) blocks = {} for chunk in input_df.chunks: block_id = str(int(time.time())) + '_' + str(uuid.uuid4()).replace( '-', '') chunk_op = DataFrameWriteTableSplit( dtypes=op.dtypes, table_name=op.table_name, unknown_as_string=op.unknown_as_string, partition_spec=op.partition_spec, cupid_handle=to_str(upload_session.handle), block_id=block_id, write_batch_size=op.write_batch_size) out_chunk = chunk_op.new_chunk([chunk], shape=out_chunk_shape, index=chunk.index, index_value=out_df.index_value, dtypes=chunk.dtypes) out_chunks.append(out_chunk) blocks[block_id] = op.partition_spec # build commit tree combine_size = 8 chunks = out_chunks while len(chunks) >= combine_size: new_chunks = [] for i in range(0, len(chunks), combine_size): chks = chunks[i:i + combine_size] if len(chks) == 1: chk = chks[0] else: chk_op = DataFrameWriteTableCommit(dtypes=op.dtypes, is_terminal=False) chk = chk_op.new_chunk(chks, shape=out_chunk_shape, index_value=out_df.index_value, dtypes=op.dtypes) new_chunks.append(chk) chunks = new_chunks assert len(chunks) < combine_size commit_table_op = DataFrameWriteTableCommit(dtypes=op.dtypes, table_name=op.table_name, blocks=blocks, cupid_handle=to_str( upload_session.handle), overwrite=op.overwrite, odps_params=op.odps_params, is_terminal=True) commit_table_chunk = commit_table_op.new_chunk( chunks, shape=out_chunk_shape, dtypes=op.dtypes, index_value=out_df.index_value) new_op = op.copy() return new_op.new_dataframes(op.inputs, shape=out_df.shape, index_value=out_df.index_value, dtypes=out_df.dtypes, columns_value=out_df.columns_value, chunks=[commit_table_chunk], nsplits=((0, ), ) * len(out_chunk_shape))
""" 测试python连接到odps """ from odps import ODPS o = ODPS('LTAIFwE1F5V5Fucy','c1Daaf3vFHwu3PhLBK99iHvH2AqWC4','zhangp123', endpoint='http://service.cn.maxcompute.aliyun.com/api') dual = o.get_table('20171210v7') print(dual.name) print(dual.schema) print(dual.head(10)) # dual.drop()
class ODPSIOTest(unittest.TestCase): def setUp(self): self._project = os.environ[ODPSConfig.PROJECT_NAME] self._access_id = os.environ[ODPSConfig.ACCESS_ID] self._access_key = os.environ[ODPSConfig.ACCESS_KEY] self._endpoint = os.environ[ODPSConfig.ENDPOINT] self._test_read_table = "chicago_taxi_train_data" self._test_write_table = "test_odps_writer_%d_%d" % ( int(time.time()), random.randint(1, 101), ) self._odps_client = ODPS(self._access_id, self._access_key, self._project, self._endpoint) def test_read_to_iterator(self): reader = ODPSReader( self._project, self._access_id, self._access_key, self._endpoint, self._test_read_table, None, 4, None, ) records_iter = reader.to_iterator(1, 0, 200, 2, False, None) for batch in records_iter: self.assertEqual(len(batch), 200, "incompatible size: %d" % len(batch)) def test_write_odps_to_recordio_shards_from_iterator(self): reader = ODPSReader( self._project, self._access_id, self._access_key, self._endpoint, self._test_read_table, None, 4, None, ) records_iter = reader.to_iterator(1, 0, 200, 2, False, None) with tempfile.TemporaryDirectory() as output_dir: write_recordio_shards_from_iterator( records_iter, ["f" + str(i) for i in range(18)], output_dir, records_per_shard=200, ) self.assertEqual(len(os.listdir(output_dir)), 100) def test_write_from_iterator(self): columns = ["num", "num2"] column_types = ["bigint", "double"] # If the table doesn't exist yet writer = ODPSWriter( self._project, self._access_id, self._access_key, self._endpoint, self._test_write_table, columns, column_types, ) writer.from_iterator(iter([[1, 0.5], [2, 0.6]]), 2) table = self._odps_client.get_table(self._test_write_table, self._project) self.assertEqual(table.schema.names, columns) self.assertEqual(table.schema.types, column_types) self.assertEqual(table.to_df().count(), 1) # If the table already exists writer = ODPSWriter( self._project, self._access_id, self._access_key, self._endpoint, self._test_write_table, ) writer.from_iterator(iter([[1, 0.5], [2, 0.6]]), 2) table = self._odps_client.get_table(self._test_write_table, self._project) self.assertEqual(table.schema.names, columns) self.assertEqual(table.schema.types, column_types) self.assertEqual(table.to_df().count(), 2) def tearDown(self): self._odps_client.delete_table(self._test_write_table, self._project, if_exists=True)
        requests11 = makeRequests(Lib2Odps, file_arrs)
        [pool11.putRequest(req) for req in requests11]
        pool11.wait()
        logging.info("all file uploads finished")
    except:
        logging.exception("error occurred during upload")


if __name__ == "__main__":
    partition = time.strftime("%Y%m%d", time.localtime())
    logging.basicConfig(
        filename=runLog + 'dongtaiRL' + partition + ".log",
        filemode='a',
        level=logging.NOTSET,
        format="%(asctime)s - %(levelname)s: %(message)s")  # log output format
    table_obj = cOdps.get_table(ExpressArr[1], 'kunlun')
    try:
        # skip if the partition is already recorded in the list
        if partition not in date_arr:
            print('partition %s' % partition)
            table_obj.create_partition('dt=' + partition,
                                       if_not_exists=True)  # create the partition
            date_arr.append(partition)
            print(table_obj)
    except:
        print('failed to create partition')
    try:
        fileReal = os.path.split(os.path.realpath(
            sys.argv[0]))[0]  # directory where this script resides
schema = Schema(columns=columns)
# schema = Schema(columns=columns, partitions=partitions)
table_name = 'wide_and_deep'
print(schema.columns)


def create_table():
    table = o.create_table(table_name, schema, if_not_exists=True)


# create_table()
table = o.get_table(table_name)  # .to_df()
print(table.to_df())


def write_data():
    records = []
    # prepare data
    input_file = './part-0'
    with open(input_file, 'r') as f:
        for line in f:
            example = []
            features = line.rstrip('\n').split('\t')
            label = int(features[0])
            example.append(label)
3. Usage: python script.py <source_project_name> <target_project_name>
'''
s = ODPS('', '', '%s' % sys.argv[1],
         endpoint='http://service.cn.maxcompute.aliyun.com/api')
d = ODPS('', '', '%s' % sys.argv[2],
         endpoint='http://service.cn.maxcompute.aliyun.com/api')
print("######################################################################")
for table in s.list_tables():
    t1 = s.get_table(table.name)
    if d.exist_table(table.name):
        t2 = d.get_table(table.name)
    else:
        print("table %s does not exist in target project %s, skipping check"
              % (table.name, sys.argv[2]))
        continue
    if table.schema.partitions:  # check whether the table is partitioned
        # print 'Table %s is partitioned.' % table.name
        for partition in table.partitions:
            # print partition.name
            with t1.open_reader(partition='%s' % partition.name) as reader:
                count1 = reader.count
            # print "table: %s\tpartition: %s\trow count: %s" % (table.name, partition.name, count1)
            if t2.exist_partition(partition.name):
                with t2.open_reader(partition='%s' %
    # txt_csv(txtName)
    # for csvName in os.listdir(csvFile):
    #     txt_upload_odps(csvName)
    # for jpgName in os.listdir(photoPath):
    allJpgList = [photoPath + name for name in os.listdir(photoPath)]
    try:
        # thread pool used for uploading
        pool11 = tp.ThreadPool(10)
        requests11 = makeRequests(jpg_Upload, allJpgList)
        [pool11.putRequest(req) for req in requests11]
        pool11.wait()
    except:
        traceback.print_exc()


if __name__ == '__main__':
    tableName = ''
    partiton = time.strftime("%Y%m%d", time.localtime())
    table_obj = cOdps.get_table(tableName, 'kunlun')  # get the ODPS table handle
    cEndpoint = cAuth = oss2.Auth()
    cBucket = oss2.Bucket(cAuth, cEndpoint, '')
    try:
        table_obj.create_partition('dt=' + partiton,
                                   if_not_exists=True)  # create the partition
    except:
        print('failed to create partition')
    main()
def tile(cls, op): import numpy as np import pandas as pd from odps import ODPS from odps.accounts import BearerTokenAccount from cupid import CupidSession, context from mars.context import get_context cupid_ctx = context() if cupid_ctx is None: raise SystemError( 'No Mars cluster found, please create via `o.create_mars_cluster`.' ) bearer_token = cupid_ctx.get_bearer_token() account = BearerTokenAccount(bearer_token) project = os.environ.get('ODPS_PROJECT_NAME', None) odps_params = op.odps_params.copy() if project: odps_params['project'] = project o = ODPS(None, None, account=account, **odps_params) cupid_session = CupidSession(o) mars_context = get_context() df = op.outputs[0] split_size = df.extra_params.chunk_bytes or READ_CHUNK_LIMIT data_src = o.get_table(op.table_name) if op.partition is not None: data_src = data_src.get_partition(op.partition) try: data_store_size = data_src.size except ODPSError: # fail to get data size, just ignore pass else: if data_store_size < split_size and mars_context is not None: # get worker counts worker_count = max(len(mars_context.get_worker_addresses()), 1) # data is too small, split as many as number of cores split_size = data_store_size // worker_count # at least 1M split_size = max(split_size, 1 * 1024**2) logger.debug( 'Input data size is too small, split_size is {}'.format( split_size)) logger.debug( 'Start creating download session of table {} from cupid.'.format( op.table_name)) while True: try: download_session = cupid_session.create_download_session( data_src, split_size=split_size, columns=op.columns) break except CupidError: logger.debug( 'The number of splits exceeds 100000, split_size is {}'. format(split_size)) if split_size >= MAX_CHUNK_SIZE: raise else: split_size *= 2 logger.debug('%s table splits have been created.', str(len(download_session.splits))) if np.isnan(df.shape[0]): est_chunk_rows = [None] * len(download_session.splits) else: sp_file_sizes = np.array([ sp.split_file_end - sp.split_file_start for sp in download_session.splits ]) total_size = sp_file_sizes.sum() est_chunk_rows = sp_file_sizes * df.shape[0] // total_size logger.warning('Estimated chunk rows: %r', est_chunk_rows) out_chunks = [] # Ignore add_offset at this time. 
op._add_offset = False if len(download_session.splits) == 0: logger.debug('Table {} has no data'.format(op.table_name)) chunk_op = DataFrameReadTableSplit() index_value = parse_index(pd.RangeIndex(0)) columns_value = parse_index(df.dtypes.index, store_data=True) out_chunk = chunk_op.new_chunk(None, shape=(np.nan, df.shape[1]), dtypes=op.dtypes, index_value=index_value, columns_value=columns_value, index=(0, 0)) out_chunks = [out_chunk] else: for idx, split in enumerate(download_session.splits): chunk_op = DataFrameReadTableSplit( cupid_handle=to_str(split.handle), split_index=split.split_index, split_file_start=split.split_file_start, split_file_end=split.split_file_end, schema_file_start=split.schema_file_start, schema_file_end=split.schema_file_end, add_offset=op.add_offset, dtypes=op.dtypes, sparse=op.sparse, split_size=split_size, use_arrow_dtype=op.use_arrow_dtype, estimate_rows=est_chunk_rows[idx]) # the chunk shape is unknown index_value = parse_index(pd.RangeIndex(0)) columns_value = parse_index(df.dtypes.index, store_data=True) out_chunk = chunk_op.new_chunk(None, shape=(np.nan, df.shape[1]), dtypes=op.dtypes, index_value=index_value, columns_value=columns_value, index=(idx, 0)) out_chunks.append(out_chunk) if op.add_offset: out_chunks = standardize_range_index(out_chunks) new_op = op.copy() nsplits = ((np.nan, ) * len(out_chunks), (df.shape[1], )) return new_op.new_dataframes(None, shape=df.shape, dtypes=op.dtypes, index_value=df.index_value, columns_value=df.columns_value, chunks=out_chunks, nsplits=nsplits)
def rename(files, scan_path):
    for filename in files:
        portion = os.path.splitext(filename)
        if portion[1] == ".bcp":
            newname = portion[0] + ".txt"
            print(newname)
            print(files)
            os.chdir(scan_path)
            os.rename(filename, newname)


# main entry point
if __name__ == '__main__':
    ftp = ftpconnect("", "", "")
    downloadfile(ftp, "/data1/913data/")
    for i in range(3):
        data_arr = []
        table_obj = cOdps.get_table(ExpressArr[i][1], 'kunlun_dev')
        partition = time.strftime("%Y%m%d", time.localtime())
        try:
            if partition not in data_arr:
                table_obj.create_partition('dt=' + partition, if_not_exists=True)
                data_arr.append(partition)
        except:
            print('%s failed to create partition' % ExpressArr[0][1])
    files = os.listdir(scan_path)
    rename(files, scan_path)
    express_handle(ExpressArr)
class MaxComputeConnection(Connection):
    """MaxCompute connection; this class uses an ODPS object to establish
    a connection with MaxCompute.

    Args:
        conn_uri: uri in the format:
            maxcompute://access_id:access_key@<endpoint-host>/api?curr_project=test_ci&scheme=http
    """

    def __init__(self, conn_uri):
        super(MaxComputeConnection, self).__init__(conn_uri)
        user, pwd, endpoint, proj = MaxComputeConnection.get_uri_parts(
            conn_uri)
        self.driver = "maxcompute"
        self.params["database"] = proj
        self.endpoint = endpoint
        self._conn = ODPS(user, pwd, project=proj, endpoint=endpoint)

    @staticmethod
    def get_uri_parts(uri):
        """Get username, password, endpoint, project from the given uri

        Args:
            uri: a valid maxcompute connection uri

        Returns:
            A tuple (username, password, endpoint, project)
        """
        uripts = urlparse(uri)
        params = parse_qs(uripts.query)
        # compose an endpoint, only keep the host and path and replace scheme
        endpoint = uripts._replace(scheme=params.get("scheme", ["http"])[0],
                                   query="",
                                   netloc=uripts.hostname)
        endpoint = endpoint.geturl()
        return (uripts.username, uripts.password, endpoint,
                params.get("curr_project", [""])[0])

    def _get_result_set(self, statement):
        try:
            instance = self._conn.execute_sql(statement)
            return MaxComputeResultSet(instance)
        except Exception as e:
            raise e

    def close(self):
        if self._conn:
            self._conn = None

    def get_table_schema(self, table_name):
        schema = self._conn.get_table(table_name).schema
        return [(c.name, str(c.type).upper()) for c in schema.columns]

    def persist_table(self, table):
        sql = "ALTER TABLE %s DISABLE LIFECYCLE;" % table
        self.execute(sql)

    def write_table(self, table_name, rows,
                    compress_option=COMPRESS_ODPS_ZLIB):
        """Append rows to the given table; this is a driver specific api

        Args:
            table_name: the table to write to
            rows: list of rows, each row is a data tuple,
                like [(1,True,"ok"),(2,False,"bad")]
            compress_option: the compress option defined in
                tunnel.CompressOption.CompressAlgorithm
        """
        self._conn.write_table(table_name, rows,
                               compress_option=compress_option)
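# A minimal usage sketch; the URI is illustrative (host, project and
# credentials are placeholders, not values from the original project).
if __name__ == "__main__":
    conn = MaxComputeConnection(
        "maxcompute://my_id:my_key@service.example.com/api"
        "?curr_project=my_project&scheme=http")
    print(conn.get_table_schema("my_table"))
    conn.close()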
class ODPSWriter(object):
    def __init__(
        self,
        project,
        access_id,
        access_key,
        endpoint,
        table,
        columns=None,
        column_types=None,
        options=None,
    ):
        """
        Constructs an `ODPSWriter` instance.

        Args:
            project: Name of the ODPS project.
            access_id: ODPS user access ID.
            access_key: ODPS user access key.
            endpoint: ODPS cluster endpoint.
            table: ODPS table name.
            columns: The list of column names in the table,
                which will be inferred if the table exists.
            column_types: The list of column types in the table,
                which will be inferred if the table exists.
            options: Other options passed to ODPS context.
        """
        super(ODPSWriter, self).__init__()

        if table.find(".") > 0:
            project, table = table.split(".")
        if options is None:
            options = {}
        self._project = project
        self._access_id = access_id
        self._access_key = access_key
        self._endpoint = endpoint
        self._table = table
        self._columns = columns
        self._column_types = column_types
        self._odps_table = None
        _configure_odps_options(self._endpoint, options)
        self._odps_client = ODPS(self._access_id, self._access_key,
                                 self._project, self._endpoint)

    def _initialize_table(self):
        if self._odps_client.exist_table(self._table, self._project):
            self._odps_table = self._odps_client.get_table(
                self._table, self._project)
        else:
            if self._columns is None or self._column_types is None:
                raise ValueError("columns and column_types need to be "
                                 "specified for non-existing table.")
            schema = Schema.from_lists(self._columns, self._column_types,
                                       ["worker"], ["string"])
            self._odps_table = self._odps_client.create_table(
                self._table, schema)

    def from_iterator(self, records_iter, worker_index):
        if self._odps_table is None:
            self._initialize_table()
        with self._odps_table.open_writer(partition="worker=" + str(worker_index),
                                          create_partition=True) as writer:
            for records in records_iter:
                writer.write(records)
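# A minimal usage sketch, mirroring the unit tests elsewhere in this section;
# the project, credentials, endpoint and table name are placeholders.
if __name__ == "__main__":
    writer = ODPSWriter(
        "my_project",
        "my_access_id",
        "my_access_key",
        "http://service.odps.aliyun.com/api",
        "test_output_table",
        columns=["num", "num2"],
        column_types=["bigint", "double"],
    )
    # each element yielded by the iterator is one row of values
    writer.from_iterator(iter([[1, 0.5], [2, 0.6]]), worker_index=0)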
from odps import ODPS

o = ODPS('accesskey', 'xxxxxx', project='DWLabClone_67226234odps',
         endpoint='http://service.cn-shanghai.maxcompute.aliyun.com/api')

project = o.get_project()
print(project)

for table in o.list_tables():
    print(table)

t = o.get_table('ods_user_info_d')
for record in t.head(3):
    print(record)

print(next(t.partitions))

with t.open_reader(partition="dt=20210706") as reader:
    count = reader.count
    print(count)
    for record in reader[5:7]:
        print(record)
        print(type(record))
        print(record['uid'])
        print(record['age_range'])

print('*' * 100)
t_rpt = o.get_table('rpt_user_info_d')
# for record in t_rpt.head(3):
#     print(record)
# encoding=utf-8
from odps import ODPS
import pymysql

odps = ODPS('LTAI2wkz5kLt3205', 'RfTGzh2dfoBljs3ZZKwfpYw6OK9KYX', 'ofo')
project = odps.get_project()
print(project)
print(project.__getstate__())

t = odps.get_table('ofo_t_puser_partition')
with odps.execute_sql(
        'select id from ofo_t_puser_partition where oauth in (1)').open_reader() as reader:
    # structured results, e.g.: odps.Record { id 90637 } odps.Record { id 90640 }
    for record in reader:
        print(record)
    # raw sql result
    print(reader)
def _tile_tunnel(cls, op): from odps import ODPS project = os.environ.get('ODPS_PROJECT_NAME', None) odps_params = op.odps_params.copy() if project: odps_params['project'] = project endpoint = os.environ.get( 'ODPS_RUNTIME_ENDPOINT') or odps_params['endpoint'] o = ODPS(odps_params['access_id'], odps_params['secret_access_key'], project=odps_params['project'], endpoint=endpoint) table_obj = o.get_table(op.table_name) if not table_obj.schema.partitions: data_srcs = [table_obj] elif op.partition is not None and check_partition_exist( table_obj, op.partition): data_srcs = [table_obj.get_partition(op.partition)] else: data_srcs = list(table_obj.partitions) if op.partition is not None: data_srcs = filter_partitions(o, data_srcs, op.partition) out_chunks = [] row_nsplits = [] index_start = 0 df = op.outputs[0] out_dtypes = df.dtypes out_shape = df.shape out_columns_value = df.columns_value if op.columns is not None: out_dtypes = out_dtypes[op.columns] out_shape = (df.shape[0], len(op.columns)) out_columns_value = parse_index(out_dtypes.index, store_data=True) if len(data_srcs) == 0: # no partitions are selected chunk_op = DataFrameReadTableSplit() index_value = parse_index(pd.RangeIndex(0)) columns_value = parse_index(out_dtypes.index, store_data=True) out_chunk = chunk_op.new_chunk(None, shape=(0, out_shape[1]), dtypes=op.dtypes, index_value=index_value, columns_value=columns_value, index=(index_start, 0)) out_chunks.append(out_chunk) else: retry_times = op.retry_times or options.retry_times for data_src in data_srcs: data_store_size = data_src.size retries = 0 while True: try: with data_src.open_reader() as reader: record_count = reader.count break except: if retries >= retry_times: raise retries += 1 time.sleep(1) if data_store_size == 0: # empty table chunk_op = DataFrameReadTableSplit() index_value = parse_index(pd.RangeIndex(0)) columns_value = parse_index(out_dtypes.index, store_data=True) out_chunk = chunk_op.new_chunk(None, shape=(0, out_shape[1]), dtypes=op.dtypes, index_value=index_value, columns_value=columns_value, index=(index_start, 0)) out_chunks.append(out_chunk) index_start += 1 continue chunk_size = df.extra_params.chunk_size partition_spec = str(data_src.partition_spec) \ if getattr(data_src, 'partition_spec', None) else None if chunk_size is None: chunk_bytes = df.extra_params.chunk_bytes or CHUNK_BYTES_LIMIT chunk_count = data_store_size // chunk_bytes + ( data_store_size % chunk_bytes != 0) chunk_size = ceildiv(record_count, chunk_count) split_size = chunk_bytes else: chunk_count = ceildiv(record_count, chunk_size) split_size = data_store_size // chunk_count for i in range(chunk_count): start_index = chunk_size * i end_index = min(chunk_size * (i + 1), record_count) row_size = end_index - start_index chunk_op = DataFrameReadTableSplit( table_name=op.table_name, partition_spec=partition_spec, start_index=start_index, end_index=end_index, nrows=op.nrows, odps_params=op.odps_params, columns=op.columns, incremental_index=op.incremental_index, dtypes=out_dtypes, sparse=op.sparse, split_size=split_size, use_arrow_dtype=op.use_arrow_dtype, estimate_rows=row_size, append_partitions=op.append_partitions, memory_scale=op.memory_scale, retry_times=op.retry_times, extra_params=op.extra_params) index_value = parse_index( pd.RangeIndex(start_index, end_index)) columns_value = parse_index(out_dtypes.index, store_data=True) out_chunk = chunk_op.new_chunk(None, shape=(row_size, out_shape[1]), dtypes=out_dtypes, index_value=index_value, columns_value=columns_value, index=(index_start + i, 
0)) row_nsplits.append(row_size) out_chunks.append(out_chunk) index_start += chunk_count if op.incremental_index and _NEED_STANDARDIZE: out_chunks = standardize_range_index(out_chunks) new_op = op.copy() nsplits = (tuple(row_nsplits), (out_shape[1], )) return new_op.new_dataframes(None, shape=out_shape, dtypes=op.dtypes, index_value=df.index_value, columns_value=out_columns_value, chunks=out_chunks, nsplits=nsplits)