def check_instance_idle(self):
    last_active_time = self._last_activity_time
    has_running = False
    for ref in self._session_refs.values():
        for info in ref.get_graph_infos().values():
            if info.get('end_time') is None:
                has_running = True
                break
            else:
                last_active_time = max(info['end_time'], last_active_time)
        if has_running:
            break
    if not has_running and last_active_time < time.time() - self._idle_timeout:
        # timeout: we need to kill the instance
        from odps import ODPS
        from odps.accounts import BearerTokenAccount
        from cupid.runtime import context

        logger.warning('Timeout met, killing the instance now.')
        bearer_token = context().get_bearer_token()
        account = BearerTokenAccount(bearer_token)
        project = os.environ['ODPS_PROJECT_NAME']
        endpoint = os.environ['ODPS_RUNTIME_ENDPOINT']
        o = ODPS(None, None, account=account, project=project, endpoint=endpoint)
        o.stop_instance(os.environ['MARS_K8S_POD_NAMESPACE'])
    else:
        self.ref().check_instance_idle(_delay=10, _tell=True, _wait=False)
def _handle_terminate_instance(sock):
    from cupid.runtime import context, RuntimeContext
    from odps import ODPS
    from odps.accounts import BearerTokenAccount

    try:
        cmd_len, = struct.unpack('<I', sock.recv(4))
        # dict with key `instance_id`
        cmd_body = pickle.loads(sock.recv(cmd_len))
        instance_id = cmd_body['instance_id']

        if not RuntimeContext.is_context_ready():
            logger.warning('Cupid context not ready')
        else:
            bearer_token = context().get_bearer_token()
            account = BearerTokenAccount(bearer_token)
            project = os.environ['ODPS_PROJECT_NAME']
            endpoint = os.environ['ODPS_RUNTIME_ENDPOINT']
            o = ODPS(None, None, account=account, project=project, endpoint=endpoint)
            o.stop_instance(instance_id)
    except:
        logger.exception('Failed to terminate instance')
        _write_request_result(sock, False, exc_info=sys.exc_info())
def load2excel(filepath, begin, end):
    o = ODPS(ACCESS_KEY_ID, ACCESS_KEY_SECRET, PROJECT,
             endpoint='http://service.odps.aliyun.com/api')
    print(o)
    sql = "select * from stat_table where ds>={begin} and ds<={end}".format(
        begin=begin, end=end)
    with o.execute_sql(sql).open_reader() as reader:
        print(sql)
        print(reader.count)
        table_title = reader._schema.names
        print(table_title)
        workbook = openpyxl.Workbook()
        worksheet = workbook.create_sheet('Sheet')
        for i in range(len(table_title)):
            worksheet.cell(1, i + 1, table_title[i])
        write_count = 2
        for r in reader:
            for i in range(len(r.values)):
                # write the row's values into the current line, one per column
                worksheet.cell(write_count, i + 1, r.values[i])
            write_count = write_count + 1
    workbook.save(filename=filepath)
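# A minimal usage sketch for load2excel above (assumes ACCESS_KEY_ID,
# ACCESS_KEY_SECRET and PROJECT are defined at module level, as the function
# expects; the ds partition bounds are placeholder values):
if __name__ == '__main__':
    load2excel('stat_table.xlsx', 20230101, 20230131)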
def setUp(self):
    self.project = os.environ[MaxComputeConfig.PROJECT_NAME]
    access_id = os.environ[MaxComputeConfig.ACCESS_ID]
    access_key = os.environ[MaxComputeConfig.ACCESS_KEY]
    endpoint = os.environ.get(MaxComputeConfig.ENDPOINT)
    tunnel_endpoint = os.environ.get(MaxComputeConfig.TUNNEL_ENDPOINT, None)
    self.test_table = "test_odps_data_reader_%d_%d" % (
        int(time.time()),
        random.randint(1, 101),
    )
    self.odps_client = ODPS(access_id, access_key, self.project, endpoint)
    create_iris_odps_table(self.odps_client, self.project, self.test_table)
    self.records_per_task = 50

    self.reader = ODPSDataReader(
        project=self.project,
        access_id=access_id,
        access_key=access_key,
        endpoint=endpoint,
        table=self.test_table,
        tunnel_endpoint=tunnel_endpoint,
        num_processes=1,
        records_per_task=self.records_per_task,
    )
def check_instance_idle(self):
    from cupid.runtime import context

    has_running, active_time_from_service = self._get_service_activity_info()
    if active_time_from_service != self._last_active_time_from_service:
        self._last_active_time = active_time_from_service
        self._last_active_time_from_service = active_time_from_service
    elif has_running:
        self._last_active_time = time.time()

    if self._last_active_time < time.time() - self._idle_timeout:
        # timeout: we need to kill the instance
        from odps import ODPS
        from odps.accounts import BearerTokenAccount

        logger.warning('Timeout met, killing the instance now.')
        bearer_token = context().get_bearer_token()
        account = BearerTokenAccount(bearer_token)
        project = os.environ['ODPS_PROJECT_NAME']
        endpoint = os.environ['ODPS_RUNTIME_ENDPOINT']
        o = ODPS(None, None, account=account, project=project, endpoint=endpoint)
        o.stop_instance(os.environ['MARS_K8S_POD_NAMESPACE'])
    else:
        kv_store = context().kv_store()
        kv_store[CUPID_LAST_IDLE_TIME_KEY] = str(self._last_active_time)
        self.ref().check_instance_idle(_delay=10, _tell=True, _wait=False)
def _execute_in_cupid(cls, ctx, op):
    import os

    import pandas as pd
    from odps import ODPS
    from odps.accounts import BearerTokenAccount

    cupid_client = CupidServiceClient()
    to_store_data = ctx[op.inputs[0].key]

    bearer_token = cupid_client.get_bearer_token()
    account = BearerTokenAccount(bearer_token)
    project = os.environ.get('ODPS_PROJECT_NAME', None)
    odps_params = op.odps_params.copy()
    if project:
        odps_params['project'] = project
    endpoint = os.environ.get('ODPS_RUNTIME_ENDPOINT') or odps_params['endpoint']
    o = ODPS(None, None, account=account, project=odps_params['project'],
             endpoint=endpoint)

    odps_schema = o.get_table(op.table_name).schema
    project_name, table_name = op.table_name.split('.')

    writer_config = dict(
        _table_name=table_name,
        _project_name=project_name,
        _table_schema=odps_schema,
        _partition_spec=op.partition_spec,
        _block_id=op.block_id,
        _handle=op.cupid_handle,
    )
    cupid_client.write_table_data(writer_config, to_store_data, op.write_batch_size)
    ctx[op.outputs[0].key] = pd.DataFrame()
def get_lt_etid():
    logging.info('12. Fetching the remaining enterprises')
    print(utils.current_time(), 'establishing ODPS connection...')
    o = ODPS('LTAIzEuNzcL6qJJ8', 'eUAgj9ijhWCvOQ3w5Uv3FkwhNxvPF2', 'database_test',
             'http://service.odps.aliyun.com/api')
    print(utils.current_time(), 'running query...')
    pt = time.strftime('%Y%m%d', time.localtime(int(time.time() - 86400)))
    res = o.execute_sql(
        "select distinct etid from et_jobs where pt='{}' and isheadhunter=1".format(pt))
    print(utils.current_time(), 'processing query results...')
    etid_set = set()
    conn = utils.get_local_db()
    addtime = int(time.time())
    cnt = 0
    with res.open_reader() as reader:
        print(utils.current_time(), '{} records to process in total!'.format(reader.count))
        for record in reader:
            etid_set.add((record['etid'],))
            if len(etid_set) >= 1000:
                conn.executemany(
                    "insert into et_info_status(etid,addtime) values(%s,{}) "
                    "on duplicate key update etid=values(etid), addtime=values(addtime)"
                    .format(addtime), list(etid_set))
                cnt += 1000
                print(utils.current_time(), '{} records written so far!'.format(cnt))
                etid_set.clear()
    if len(etid_set) > 0:
        conn.executemany(
            "insert into et_info_status(etid,addtime) values(%s,{}) "
            "on duplicate key update etid=values(etid), addtime=values(addtime)"
            .format(addtime), list(etid_set))
        cnt += len(etid_set)
        print(utils.current_time(), '{} records written so far!'.format(cnt))
    conn.close()
    return reader.count
def __init__(self, conn_uri):
    super(MaxComputeConnection, self).__init__(conn_uri)
    user, pwd, endpoint, proj = MaxComputeConnection.get_uri_parts(conn_uri)
    self.driver = "maxcompute"
    self.params["database"] = proj
    self.endpoint = endpoint
    self._conn = ODPS(user, pwd, project=proj, endpoint=endpoint)
class OdpsConn(object):
    """ODPS connection helper."""

    def __init__(self, project):
        self.access_id = ODPSCONF.key
        self.access_key = ODPSCONF.sec
        self.project = project
        self.odps = None

    def __enter__(self):
        try:
            self.odps = ODPS(self.access_id, self.access_key, self.project)
        except Exception as exc:
            raise ValueError(str(exc))
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        del self

    def get_table_count_and_names(self):
        """Get the number of tables in a project along with their names."""
        tables = self.odps.list_tables()
        names = [table.name for table in tables]
        count = len(names)
        return count, names

    def get_table_schema(self, tname):
        """Get the column names of a table."""
        table = self.odps.get_table(tname)
        _sa = table.schema
        _columns = _sa.columns
        schema = [item.name for item in _columns]
        return schema

    def execute_sql(self, sql):
        rest = []
        with self.odps.execute_sql(sql).open_reader() as reader:
            for record in reader:
                rest.append(record.values)
        return rest

    def get_table_last_update_time(self, tname):
        t = self.odps.get_table(tname)
        last_update_time = t.last_modified_time if t else None
        return last_update_time

    def count_table(self, table):
        sql = 'select count(1) from %s' % table
        with self.odps.execute_sql(sql).open_reader() as reader:
            return reader[0].values[0]
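# A minimal usage sketch for OdpsConn above (assumes ODPSCONF carries valid
# credentials; 'my_project' is a placeholder). The connection is created in
# __enter__, so all calls happen inside the `with` block:
with OdpsConn('my_project') as conn:
    count, names = conn.get_table_count_and_names()
    print('project has {} tables'.format(count))
    if names:
        print(conn.get_table_schema(names[0]))
        print(conn.get_table_last_update_time(names[0]))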
def tile(cls, op):
    from odps import ODPS
    from odps.accounts import BearerTokenAccount
    from cupid import CupidSession, context

    bearer_token = context().get_bearer_token()
    account = BearerTokenAccount(bearer_token)
    o = ODPS(None, None, account=account, **op.odps_params)
    cupid_session = CupidSession(o)

    data_src = o.get_table(op.table_name)

    logger.debug('Start creating upload session from cupid.')
    upload_session = cupid_session.create_upload_session(data_src)

    input_df = op.inputs[0]

    out_chunks = []
    out_chunk_shape = (0,) * len(input_df.shape)
    blocks = {}
    for chunk in input_df.chunks:
        block_id = str(int(time.time())) + '_' + str(uuid.uuid4()).replace('-', '')
        chunk_op = DataFrameWriteTableSplit(
            dtypes=op.dtypes, table_name=op.table_name,
            partition_spec=op.partition_spec,
            cupid_handle=to_str(upload_session.handle),
            block_id=block_id, write_batch_size=op.write_batch_size)
        out_chunk = chunk_op.new_chunk([chunk], shape=out_chunk_shape,
                                       index=chunk.index, dtypes=chunk.dtypes)
        out_chunks.append(out_chunk)
        blocks[block_id] = op.partition_spec

    # build commit tree
    combine_size = 8
    chunks = out_chunks
    while len(chunks) > combine_size:
        new_chunks = []
        for i in range(0, len(chunks), combine_size):
            chks = chunks[i: i + combine_size]
            if len(chks) == 1:
                chk = chks[0]
            else:
                chk_op = DataFrameWriteTableCommit(dtypes=op.dtypes, is_terminal=False)
                chk = chk_op.new_chunk(chks, shape=out_chunk_shape, dtypes=op.dtypes)
            new_chunks.append(chk)
        chunks = new_chunks

    # the loop exits with at most combine_size chunks, possibly exactly that many
    assert len(chunks) <= combine_size

    commit_table_op = DataFrameWriteTableCommit(
        dtypes=op.dtypes, table_name=op.table_name, blocks=blocks,
        cupid_handle=to_str(upload_session.handle),
        overwrite=op.overwrite, odps_params=op.odps_params, is_terminal=True)
    commit_table_chunk = commit_table_op.new_chunk(chunks, shape=out_chunk_shape,
                                                   dtypes=op.dtypes)

    out_df = op.outputs[0]
    new_op = op.copy()
    return new_op.new_dataframes(op.inputs, shape=out_df.shape, dtypes=out_df.dtypes,
                                 chunks=[commit_table_chunk],
                                 nsplits=((0,),) * len(out_chunk_shape))
def _get_table_schema(self):
    odps_client = ODPS(
        access_id=self._kwargs["access_id"],
        secret_access_key=self._kwargs["access_key"],
        project=self._kwargs["project"],
        endpoint=self._kwargs.get("endpoint"),
    )
    odps_table = odps_client.get_table(self._kwargs["table"])
    return odps_table.schema
def _execute_arrow_tunnel(cls, ctx, op):
    from odps import ODPS
    from odps.tunnel import TableTunnel
    import pyarrow as pa
    import pandas as pd

    project = os.environ.get('ODPS_PROJECT_NAME', None)
    odps_params = op.odps_params.copy()
    if project:
        odps_params['project'] = project
    endpoint = os.environ.get('ODPS_RUNTIME_ENDPOINT') or odps_params['endpoint']
    o = ODPS(odps_params['access_id'], odps_params['secret_access_key'],
             project=odps_params['project'], endpoint=endpoint)

    t = o.get_table(op.table_name)
    tunnel = TableTunnel(o, project=t.project)
    retry_times = options.retry_times
    retries = 0
    while True:
        try:
            if op.partition_spec is not None:
                upload_session = tunnel.create_upload_session(
                    t.name, partition_spec=op.partition_spec)
            else:
                upload_session = tunnel.create_upload_session(t.name)
            break
        except:
            if retries >= retry_times:
                raise
            retries += 1  # advance the counter so the retry loop can terminate
            time.sleep(1)

    logger.debug('Start writing table %s split index: %s',
                 op.table_name, op.inputs[0].index)
    retries = 0
    while True:
        try:
            writer = upload_session.open_arrow_writer(0)
            arrow_rb = pa.RecordBatch.from_pandas(ctx[op.inputs[0].key])
            writer.write(arrow_rb)
            writer.close()
            break
        except:
            if retries >= retry_times:
                raise
            retries += 1
            time.sleep(1)

    upload_session.commit([0])
    logger.debug('Finish writing table %s split index: %s',
                 op.table_name, op.inputs[0].index)
    ctx[op.outputs[0].key] = pd.DataFrame()
def testTokenizedSignServerAccount(self):
    server = SignServer(token=str(uuid.uuid4()))
    server.accounts[self.odps.account.access_id] = self.odps.account.secret_access_key
    try:
        server.start(('127.0.0.1', 0))
        account = SignServerAccount(self.odps.account.access_id,
                                    server.server.server_address)
        odps = ODPS(None, None, self.odps.project, self.odps.endpoint,
                    account=account)
        self.assertRaises(
            SignServerError,
            lambda: odps.delete_table(tn('test_sign_account_table'), if_exists=True))

        account = SignServerAccount(self.odps.account.access_id,
                                    server.server.server_address, token=server.token)
        odps = ODPS(None, None, self.odps.project, self.odps.endpoint,
                    account=account)
        odps.delete_table(tn('test_sign_account_table'), if_exists=True)
        t = odps.create_table(tn('test_sign_account_table'), 'col string', lifecycle=1)
        self.assertTrue(odps.exist_table(tn('test_sign_account_table')))
        t.drop(async_=True)
    finally:
        server.stop()
def __init__(self, conn_uri):
    super().__init__(conn_uri)
    self.params["database"] = self.params["curr_project"]
    # compose an endpoint: keep only the host and path and replace the scheme
    endpoint = self.uripts._replace(scheme=self.params["scheme"],
                                    query="",
                                    netloc=self.uripts.hostname)
    self._conn = ODPS(self.uripts.username,
                      self.uripts.password,
                      project=self.params["database"],
                      endpoint=endpoint.geturl())
def setUp(self):
    self._project = os.environ[ODPSConfig.PROJECT_NAME]
    self._access_id = os.environ[ODPSConfig.ACCESS_ID]
    self._access_key = os.environ[ODPSConfig.ACCESS_KEY]
    self._endpoint = os.environ[ODPSConfig.ENDPOINT]
    self._test_read_table = "chicago_taxi_train_data"
    self._test_write_table = "test_odps_writer_%d_%d" % (
        int(time.time()),
        random.randint(1, 101),
    )
    self._odps_client = ODPS(self._access_id, self._access_key, self._project,
                             self._endpoint)
def __init__(
    self,
    project,
    access_id,
    access_key,
    endpoint,
    table,
    partition=None,
    num_processes=None,
    options=None,
    transform_fn=None,
    columns=None,
):
    """
    Constructs a `ODPSReader` instance.

    Args:
        project: Name of the ODPS project.
        access_id: ODPS user access ID.
        access_key: ODPS user access key.
        endpoint: ODPS cluster endpoint.
        table: ODPS table name.
        partition: ODPS table's partition.
        options: Other options passed to ODPS context.
        num_processes: Number of parallel processes on this worker.
            If `None`, use the number of cores.
        transform_fn: Customized transform function.
        columns: List of table column names.
    """
    super(ODPSReader, self).__init__()

    if table.find(".") > 0:
        project, table = table.split(".")
    if options is None:
        options = {}
    self._project = project
    self._access_id = access_id
    self._access_key = access_key
    self._endpoint = endpoint
    self._table = table
    self._partition = partition
    self._num_processes = num_processes
    _configure_odps_options(self._endpoint, options)
    self._odps_table = ODPS(
        self._access_id,
        self._access_key,
        self._project,
        self._endpoint,
    ).get_table(self._table)
    self._transform_fn = transform_fn
    self._columns = columns
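# A hedged construction example for ODPSReader above. The environment
# variable names and table/partition values are placeholders, not part of
# the original code. Note that a "project.table" table name overrides the
# project argument, as handled in __init__:
reader = ODPSReader(
    project=os.environ.get('ODPS_PROJECT_NAME'),
    access_id=os.environ.get('ODPS_ACCESS_ID'),
    access_key=os.environ.get('ODPS_ACCESS_KEY'),
    endpoint=os.environ.get('ODPS_ENDPOINT'),
    table='my_project.my_table',  # the project part wins over the argument above
    partition='ds=20230101',
    num_processes=2,
    columns=['col1', 'col2'],
)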
def _set_odps(self):
    if self._odps is not None:
        return

    if options.access_id is not None and \
            options.access_key is not None and \
            options.default_project is not None:
        self._odps = ODPS(
            options.access_id, options.access_key, options.default_project,
            endpoint=options.end_point, tunnel_endpoint=options.tunnel_endpoint
        )
    else:
        self._odps = enter().odps
def download_infos(tablename, storename, keys):
    o = ODPS("LTAIWt3hG5GvYBhX", "RriedkAIENmPvXvRmQcy9wRqOYx3QV",
             'graph_embedding_intern_dev',
             endpoint='http://service-corp.odps.aliyun-inc.com/api')
    project = o.get_project()
    csv_file = open(storename, mode='w')
    writer = csv.writer(csv_file, delimiter='\t')
    tunnel = TableTunnel(o)
    download_session = tunnel.create_download_session(tablename)
    with download_session.open_record_reader(0, download_session.count) as reader:
        for record in reader:
            info = [record[key] for key in keys]
            writer.writerow(info)
    print("complete storing {}".format(storename))
def _execute_arrow_tunnel(cls, ctx, op):
    from odps import ODPS
    from odps.tunnel import TableTunnel

    project = os.environ.get('ODPS_PROJECT_NAME', None)
    odps_params = op.odps_params.copy()
    if project:
        odps_params['project'] = project
    endpoint = os.environ.get('ODPS_RUNTIME_ENDPOINT') or odps_params['endpoint']
    o = ODPS(odps_params['access_id'], odps_params['secret_access_key'],
             project=odps_params['project'], endpoint=endpoint)

    t = o.get_table(op.table_name)
    tunnel = TableTunnel(o, project=t.project)

    if op.partition_spec is not None:
        download_session = tunnel.create_download_session(
            t.name, partition_spec=op.partition_spec)
    else:
        download_session = tunnel.create_download_session(t.name)
    logger.debug('Start reading table %s(%s) split from %s to %s',
                 op.table_name, op.partition_spec, op.start_index, op.end_index)
    if op.nrows is None:
        count = op.end_index - op.start_index
    else:
        count = op.nrows

    with download_session.open_arrow_reader(op.start_index, count,
                                            columns=op.columns) as reader:
        table = reader.read()

    table = cls._append_partition_values(table, op)
    if op.string_as_binary:
        table = cls._cast_string_to_binary(table)
    data = arrow_table_to_pandas_dataframe(table, use_arrow_dtype=op.use_arrow_dtype)
    data = cls._align_columns(data, op.outputs[0].dtypes)

    logger.debug('Finish reading table %s(%s) split from %s to %s',
                 op.table_name, op.partition_spec, op.start_index, op.end_index)
    ctx[op.outputs[0].key] = data
def setUp(self):
    self._project = os.environ[ODPSConfig.PROJECT_NAME]
    self._access_id = os.environ[ODPSConfig.ACCESS_ID]
    self._access_key = os.environ[ODPSConfig.ACCESS_KEY]
    self._endpoint = os.environ.get(ODPSConfig.ENDPOINT)
    self._test_read_table = "test_odps_reader_%d_%d" % (
        int(time.time()),
        random.randint(1, 101),
    )
    self._test_write_table = "test_odps_writer_%d_%d" % (
        int(time.time()),
        random.randint(1, 101),
    )
    self._odps_client = ODPS(self._access_id, self._access_key, self._project,
                             self._endpoint)
    self.create_iris_odps_table()
def execute(cls, ctx, op):
    import pandas as pd
    from odps import ODPS
    from odps.accounts import BearerTokenAccount
    from cupid import CupidSession, context
    from cupid.io.table import CupidTableUploadSession

    if op.is_terminal:
        bearer_token = context().get_bearer_token()
        account = BearerTokenAccount(bearer_token)
        project = os.environ.get('ODPS_PROJECT_NAME', None)
        odps_params = op.odps_params.copy()
        if project:
            odps_params['project'] = project
        endpoint = os.environ.get('ODPS_RUNTIME_ENDPOINT') or odps_params['endpoint']
        o = ODPS(None, None, account=account, project=odps_params['project'],
                 endpoint=endpoint)
        cupid_session = CupidSession(o)

        project_name, table_name = op.table_name.split('.')
        upload_session = CupidTableUploadSession(
            session=cupid_session, table_name=table_name,
            project_name=project_name,
            handle=op.cupid_handle, blocks=op.blocks)
        upload_session.commit(overwrite=op.overwrite)

    ctx[op.outputs[0].key] = pd.DataFrame()
class MaxComputeConnection(Connection):
    """MaxCompute connection, this class uses an ODPS object to establish
    a connection with MaxCompute.

    Args:
        conn_uri: uri in format:
        maxcompute://access_id:[email protected]/api?curr_project=test_ci&scheme=http
    """

    def __init__(self, conn_uri):
        super().__init__(conn_uri)
        self.params["database"] = self.params["curr_project"]
        # compose an endpoint: keep only the host and path and replace the scheme
        endpoint = self.uripts._replace(scheme=self.params["scheme"],
                                        query="",
                                        netloc=self.uripts.hostname)
        self._conn = ODPS(self.uripts.username,
                          self.uripts.password,
                          project=self.params["database"],
                          endpoint=endpoint.geturl())

    def _parse_uri(self):
        return super()._parse_uri()

    def _get_result_set(self, statement):
        try:
            instance = self._conn.execute_sql(statement)
            return MaxComputeResultSet(instance)
        except Exception as e:
            return MaxComputeResultSet(None, str(e))

    def close(self):
        if self._conn:
            self._conn = None
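# A minimal sketch of using MaxComputeConnection above, following the URI
# format documented in its docstring (the access id/key and project here are
# placeholders; _get_result_set is the internal query entry point shown above):
conn = MaxComputeConnection(
    'maxcompute://my_access_id:[email protected]/api'
    '?curr_project=test_ci&scheme=http')
result_set = conn._get_result_set('SELECT 1;')
conn.close()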
def _build_locals_in_odps(cls, ctx, op):
    from odps import ODPS

    sess = ctx.get_current_session().as_default()
    local = {'session': sess}
    if op.data is not None:
        local.update(op.data)
    o = ODPS.from_environments()
    return dict(o=o, session=sess, odps=o)
def createTimeButtons(self, event=None):
    start_date_select = self.cal1.selection_get().strftime("%Y%m%d")
    end_date_select = self.cal2.selection_get().strftime("%Y%m%d")
    cat1_select = self.genreCombo.get()
    cat2_select = [self.Listbox1.get(v) for v in self.Listbox1.curselection()]
    cat3_select = [self.Listbox2.get(a) for a in self.Listbox2.curselection()]
    file_name = str(self.file_name.get())
    print(start_date_select)
    print(end_date_select)
    print(cat1_select)
    print(cat2_select)
    print(cat3_select)
    print(file_name)

    # Web Hooks
    o = ODPS('', '', '', endpoint='')
    options.tunnel.endpoint = '**API URL**'

    query = """SELECT DISTINCT buyer_id
            ,email_address
            ,phone_number
            ,venture_category1_name_en
            ,venture_category2_name_en
            ,venture_category3_name_en
        FROM Table Name AS t
        WHERE venture = ""
        AND order_status_esm NOT IN ('invalid')
        AND category1_name_en in ('""" + cat1_select + """')
        AND category2_name_en in (""" + "'" + "','".join(cat2_select) + "'" + """)
        AND category3_name_en in (""" + "'" + "','".join(cat3_select) + "'" + """)
        AND TO_CHAR(t.order_create_date,'yyyymmdd') BETWEEN '""" + start_date_select + """' AND '""" + end_date_select + """'
    """

    # Execute the query and load the result into a dataframe
    df = o.execute_sql(query).open_reader().to_result_frame().to_pandas()
    df.columns = df.columns.str.upper()
    date_string = arrow.now().format('YYYYMMDD') + ' ' + file_name
    df.to_csv(str(str(date_string) + '.csv'), index=False)
    print(query)
def testBearerTokenAccount(self):
    self.odps.delete_table(tn('test_bearer_token_account_table'), if_exists=True)
    t = self.odps.create_table(tn('test_bearer_token_account_table'), 'col string',
                               lifecycle=1)
    with t.open_writer() as writer:
        records = [['val1'], ['val2'], ['val3']]
        writer.write(records)

    inst = self.odps.execute_sql(
        'select count(*) from {0}'.format(tn('test_bearer_token_account_table')),
        async_=True)
    inst.wait_for_success()
    task_name = inst.get_task_names()[0]

    logview_address = inst.get_logview_address()
    token = logview_address[logview_address.find('token=') + len('token='):]
    bearer_token_account = BearerTokenAccount(token=token)
    bearer_token_odps = ODPS(None, None, self.odps.project, self.odps.endpoint,
                             account=bearer_token_account)
    bearer_token_instance = bearer_token_odps.get_instance(inst.id)

    self.assertEqual(inst.get_task_result(task_name),
                     bearer_token_instance.get_task_result(task_name))
    self.assertEqual(inst.get_task_summary(task_name),
                     bearer_token_instance.get_task_summary(task_name))

    with self.assertRaises(errors.NoPermission):
        bearer_token_odps.create_table(
            tn('test_bearer_token_account_table_test1'), 'col string', lifecycle=1)

    fake_token_account = BearerTokenAccount(token='fake-token')
    bearer_token_odps = ODPS(None, None, self.odps.project, self.odps.endpoint,
                             account=fake_token_account)

    with self.assertRaises(errors.ODPSError):
        bearer_token_odps.create_table(
            tn('test_bearer_token_account_table_test2'), 'col string', lifecycle=1)
def _handle_enum_table_partitions(sock):
    try:
        cmd_len, = struct.unpack('<I', sock.recv(4))
        # dict with odps_params, table_name, partition
        task_config = pickle.loads(sock.recv(cmd_len))

        from odps import ODPS
        from odps.accounts import BearerTokenAccount
        from cupid import context

        cupid_ctx = context()

        odps_params = task_config['odps_params']
        bearer_token = cupid_ctx.get_bearer_token()
        account = BearerTokenAccount(bearer_token)
        project = os.environ.get('ODPS_PROJECT_NAME', None) or odps_params['project']
        endpoint = os.environ.get('ODPS_RUNTIME_ENDPOINT') or odps_params['endpoint']
        o = ODPS(None, None, account=account, project=project, endpoint=endpoint)

        table = o.get_table(task_config['table_name'])
        partition_desc = task_config.get('partition')
        if not table.schema.partitions:
            _write_request_result(sock, result=None)
        elif partition_desc:
            if check_partition_exist(table, partition_desc):
                _write_request_result(sock, result=[partition_desc])
            else:
                parts = filter_partitions(o, list(table.partitions), partition_desc)
                _write_request_result(
                    sock, result=[str(pt.partition_spec) for pt in parts])
        else:
            _write_request_result(
                sock, result=[str(pt.partition_spec) for pt in table.partitions])
    except:
        logger.exception('Failed to enumerate table partitions')
        _write_request_result(sock, False, exc_info=sys.exc_info())
def __init__(
    self,
    project,
    access_id,
    access_key,
    endpoint,
    table,
    columns=None,
    column_types=None,
    options=None,
):
    """
    Constructs a `ODPSWriter` instance.

    Args:
        project: Name of the ODPS project.
        access_id: ODPS user access ID.
        access_key: ODPS user access key.
        endpoint: ODPS cluster endpoint.
        table: ODPS table name.
        columns: The list of column names in the table,
            which will be inferred if the table exists.
        column_types: The list of column types in the table,
            which will be inferred if the table exists.
        options: Other options passed to ODPS context.
    """
    super(ODPSWriter, self).__init__()

    if table.find(".") > 0:
        project, table = table.split(".")
    if options is None:
        options = {}
    self._project = project
    self._access_id = access_id
    self._access_key = access_key
    self._endpoint = endpoint
    self._table = table
    self._columns = columns
    self._column_types = column_types
    self._odps_table = None
    _configure_odps_options(self._endpoint, options)
    self._odps_client = ODPS(
        self._access_id, self._access_key, self._project, self._endpoint
    )
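# A hedged construction example for ODPSWriter above (the environment
# variable names and table name are placeholders, not part of the original
# code). Per the docstring, columns and column_types may be omitted and
# inferred when the table already exists:
writer = ODPSWriter(
    project=os.environ.get('ODPS_PROJECT_NAME'),
    access_id=os.environ.get('ODPS_ACCESS_ID'),
    access_key=os.environ.get('ODPS_ACCESS_KEY'),
    endpoint=os.environ.get('ODPS_ENDPOINT'),
    table='my_write_table',
    columns=['feature', 'label'],
    column_types=['string', 'bigint'],
)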
def _handle_create_table_upload_session(sock):
    try:
        cmd_len, = struct.unpack('<I', sock.recv(4))
        # dict with odps_params, table_name
        session_config = pickle.loads(sock.recv(cmd_len))

        from odps import ODPS
        from odps.accounts import BearerTokenAccount
        from cupid import CupidSession, context
        from cupid.runtime import RuntimeContext

        if not RuntimeContext.is_context_ready():
            raise SystemError(
                'No Mars cluster found, please create via `o.create_mars_cluster`.')
        cupid_ctx = context()

        odps_params = session_config['odps_params']
        bearer_token = cupid_ctx.get_bearer_token()
        account = BearerTokenAccount(bearer_token)
        project = os.environ.get('ODPS_PROJECT_NAME', None) or odps_params['project']
        endpoint = os.environ.get('ODPS_RUNTIME_ENDPOINT') or odps_params['endpoint']
        o = ODPS(None, None, account=account, project=project, endpoint=endpoint)
        cupid_session = CupidSession(o)

        data_src = o.get_table(session_config['table_name'])

        logger.debug('Start creating upload session from cupid.')
        upload_session = cupid_session.create_upload_session(data_src)

        ret_data = {
            'handle': upload_session.handle,
        }
        _write_request_result(sock, result=ret_data)
    except:
        logger.exception('Failed to create upload session')
        _write_request_result(sock, False, exc_info=sys.exc_info())
def uploadexcel(input_file, output_table_n='defult'):
    odps = ODPS(ACCESS_KEY_ID, ACCESS_KEY_SECRET, PROJECT,
                endpoint='http://service.odps.aliyun.com/api')
    project = odps.get_project()  # fetch the default project
    print(project)
    ds = datetime.datetime.now().strftime('%Y%m%d')
    print(ds)
    wb = openpyxl.load_workbook(filename=input_file, read_only=True)
    ws = wb.active
    print(datetime.datetime.now())
    output_table = odps.get_table(output_table_n)
    if output_table.exist_partition('ds=' + ds):
        output_table.delete_partition('ds=' + ds)
    output_table.create_partition('ds=' + ds, if_not_exists=True)
    tunnel = TableTunnel(odps)
    upload_session = tunnel.create_upload_session(output_table.name,
                                                  partition_spec='ds=' + ds)
    print('output into', output_table_n, 'partition ds=', ds, ':\n',
          output_table.schema)
    index = 0
    with upload_session.open_record_writer(0) as writer:
        for row in ws.rows:
            records = output_table.new_record()
            i = 0
            for cell in row:
                if cell is None:
                    records[i] = None
                else:
                    records[i] = str(cell.value).encode('utf-8', "replace")
                i = i + 1
            writer.write(records)
            index = index + 1
            if index % 1000 == 0:
                print(index)
    upload_session.commit(0)
    print('===========')
    print(datetime.datetime.now())
def get_connection(self, db_name=None):
    if self.conn:
        return self.conn
    db_name = db_name if db_name else self.instance.db_name
    if db_name is None:
        raise ValueError("db_name must not be empty")
    self.conn = ODPS(self.user, self.password, project=db_name, endpoint=self.host)
    return self.conn
class ODPSSql(Magics):
    _odps = None

    def _set_odps(self):
        if self._odps is not None:
            return

        if options.access_id is not None and \
                options.access_key is not None and \
                options.default_project is not None:
            self._odps = ODPS(
                options.access_id, options.access_key, options.default_project,
                endpoint=options.end_point, tunnel_endpoint=options.tunnel_endpoint
            )
        else:
            self._odps = enter().odps

    @line_magic('enter')
    def enter(self, line):
        room = line.strip()
        if room:
            r = enter(room)
            self._odps = r.odps
        else:
            r = enter()
            self._odps = r.odps

        if 'o' not in self.shell.user_ns:
            self.shell.user_ns['o'] = self._odps

        return r

    @line_magic('setup')
    def setup(self, line):
        args = line.strip().split()
        name, args = args[0], args[1:]
        setup(*args, room=name)
        html_notify('setup succeeded')

    @line_magic('teardown')
    def teardown(self, line):
        name = line.strip()
        teardown(name)
        html_notify('teardown succeeded')

    @line_magic('list_rooms')
    def list_rooms(self, line):
        return list_rooms()

    @line_magic('stores')
    def list_stores(self, line):
        line = line.strip()
        if line:
            room = enter(line)
        else:
            room = enter()
        return room.display()

    def _get_task_percent(self, instance, task_name):
        progress = instance.get_task_progress(task_name)

        if len(progress.stages) > 0:
            all_percent = sum((float(stage.terminated_workers) / stage.total_workers)
                              for stage in progress.stages if stage.total_workers > 0)
            return all_percent / len(progress.stages)
        else:
            return 0

    def _to_stdout(cls, msg):
        print(msg)

    @line_magic('set')
    def set_hint(self, line):
        if '=' not in line:
            raise ValueError('Hint for sql is not allowed')

        key, val = line.strip().strip(';').split('=', 1)
        key, val = key.strip(), val.strip()

        settings = options.sql.settings
        if settings is None:
            options.sql.settings = {key: val}
        else:
            options.sql.settings[key] = val

    @line_cell_magic('sql')
    def execute(self, line, cell=''):
        self._set_odps()

        content = line + '\n' + cell
        content = content.strip()

        sql = None
        hints = dict()

        splits = content.split(';')
        for s in splits:
            stripped = s.strip()
            if stripped.lower().startswith('set '):
                hint = stripped.split(' ', 1)[1]
                k, v = hint.split('=', 1)
                k, v = k.strip(), v.strip()
                hints[k] = v
            elif len(stripped) == 0:
                continue
            else:
                if sql is None:
                    sql = s
                else:
                    sql = '%s;%s' % (sql, s)

        # replace user-defined parameters
        sql = replace_sql_parameters(sql, self.shell.user_ns)

        if sql:
            bar = init_progress_bar()

            instance = self._odps.run_sql(sql, hints=hints)
            if options.verbose:
                stdout = options.verbose_log or self._to_stdout
                stdout('Instance ID: ' + instance.id)
                stdout(' Log view: ' + instance.get_logview_address())

            percent = 0
            while not instance.is_terminated():
                task_names = instance.get_task_names()
                last_percent = percent
                if len(task_names) > 0:
                    percent = sum(self._get_task_percent(instance, name)
                                  for name in task_names) / len(task_names)
                else:
                    percent = 0
                percent = min(1, max(percent, last_percent))
                bar.update(percent)
                time.sleep(1)

            instance.wait_for_success()
            bar.update(1)

            try:
                with instance.open_reader() as reader:
                    try:
                        import pandas as pd
                        from pandas.parser import CParserError

                        try:
                            res = pd.read_csv(StringIO(reader.raw))
                        except (ValueError, CParserError):
                            res = reader.raw
                    except ImportError:
                        try:
                            res = ResultFrame(list(reader), columns=reader._columns)
                        except TypeError:
                            res = reader.raw
                html_notify('SQL execution succeeded')
                return res
            finally:
                bar.close()

    @line_magic('persist')
    def persist(self, line):
        try:
            import pandas as pd
            has_pandas = True
        except ImportError:
            has_pandas = False

        self._set_odps()

        line = line.strip().strip(';')
        frame_name, table_name = line.split(None, 1)

        if '.' in table_name:
            project_name, table_name = tuple(table_name.split('.', 1))
        else:
            project_name = None

        frame = self.shell.user_ns[frame_name]
        if self._odps.exist_table(table_name, project=project_name):
            raise TypeError('%s already exists' % table_name)

        if isinstance(frame, DataFrame):
            frame.persist(name=table_name, project=project_name, notify=False)
        elif has_pandas and isinstance(frame, pd.DataFrame):
            frame = DataFrame(frame)
            frame.persist(name=table_name, project=project_name, notify=False)
        html_notify('Persist succeeded')
class ODPSSql(Magics):
    _odps = None

    def _set_odps(self):
        if self._odps is not None:
            return

        if options.access_id is not None and \
                options.access_key is not None and \
                options.default_project is not None:
            self._odps = ODPS(
                options.access_id,
                options.access_key,
                options.default_project,
                endpoint=options.end_point,
                tunnel_endpoint=options.tunnel_endpoint,
            )
        else:
            self._odps = enter().odps

    @line_magic("enter")
    def enter(self, line):
        room = line.strip()
        if room:
            r = enter(room)
            self._odps = r.odps
        else:
            r = enter()
            self._odps = r.odps
        return r

    @line_magic("setup")
    def setup(self, line):
        args = line.strip().split()
        name, args = args[0], args[1:]
        setup(*args, room=name)

    @line_magic("teardown")
    def teardown(self, line):
        name = line.strip()
        teardown(name)

    @line_magic("list_rooms")
    def list_rooms(self, line):
        return list_rooms()

    @line_magic("stores")
    def list_stores(self, line):
        line = line.strip()
        if line:
            room = enter(line)
        else:
            room = enter()
        return room.display()

    def _get_task_percent(self, instance, task_name):
        progress = instance.get_task_progress(task_name)

        if len(progress.stages) > 0:
            all_percent = sum(
                (float(stage.terminated_workers) / stage.total_workers)
                for stage in progress.stages if stage.total_workers > 0
            )
            return all_percent / len(progress.stages)
        else:
            return 0

    @line_cell_magic("sql")
    def execute(self, line, cell=""):
        self._set_odps()

        sql = line + "\n" + cell
        sql = sql.strip()

        if sql:
            bar = init_progress_bar()

            instance = self._odps.run_sql(sql)

            percent = 0
            while not instance.is_terminated():
                task_names = instance.get_task_names()
                last_percent = percent
                if len(task_names) > 0:
                    percent = sum(self._get_task_percent(instance, name)
                                  for name in task_names) / len(task_names)
                else:
                    percent = 0
                percent = min(1, max(percent, last_percent))
                bar.update(percent)
                time.sleep(1)

            instance.wait_for_success()
            bar.update(1)

            try:
                with instance.open_reader() as reader:
                    try:
                        import pandas as pd

                        try:
                            return pd.read_csv(StringIO(reader.raw))
                        except ValueError:
                            return reader.raw
                    except ImportError:
                        return ResultFrame(list(reader), columns=reader._columns)
            finally:
                bar.close()

    @line_magic("persist")
    def persist(self, line):
        import pandas as pd

        self._set_odps()

        line = line.strip().strip(";")
        frame_name, table_name = line.split(None, 1)

        if "." in table_name:
            project_name, table_name = tuple(table_name.split(".", 1))
        else:
            project_name = None

        frame = self.shell.user_ns[frame_name]
        if not isinstance(frame, pd.DataFrame):
            raise TypeError("%s is not a Pandas DataFrame" % frame_name)

        columns = list(frame.columns)
        types = [np_to_odps_types.get(tp, odps_types.string) for tp in frame.dtypes]

        if self._odps.exist_table(table_name, project=project_name):
            raise TypeError("%s already exists" % table_name)

        tb = self._odps.create_table(table_name, Schema.from_lists(columns, types))

        def gen(df):
            size = len(df)
            bar = init_progress_bar(size)

            try:
                c = itertools.count()
                for row in df.values:
                    i = next(c)
                    if i % 50 == 0:
                        bar.update(min(i, size))

                    yield tb.new_record(list(row))

                bar.update(size)
            finally:
                bar.close()

        with tb.open_writer() as writer:
            writer.write(gen(frame))