def testBearerTokenAccount(self):
    """Verify that a bearer token lifted from a logview URL grants read
    access to an instance's results but not table creation, and that a
    bogus token is rejected."""
    table_name = tn('test_bearer_token_account_table')
    self.odps.delete_table(table_name, if_exists=True)
    table = self.odps.create_table(table_name, 'col string', lifecycle=1)
    with table.open_writer() as writer:
        writer.write([['val1'], ['val2'], ['val3']])

    instance = self.odps.execute_sql(
        'select count(*) from {0}'.format(table_name), async_=True)
    instance.wait_for_success()
    first_task = instance.get_task_names()[0]

    # The logview URL embeds the bearer token right after "token=".
    logview_address = instance.get_logview_address()
    token = logview_address[logview_address.find('token=') + len('token='):]

    token_account = BearerTokenAccount(token=token)
    token_odps = ODPS(None, None, self.odps.project, self.odps.endpoint,
                      account=token_account)
    token_instance = token_odps.get_instance(instance.id)

    # The token must allow reading the instance's result and summary...
    self.assertEqual(instance.get_task_result(first_task),
                     token_instance.get_task_result(first_task))
    self.assertEqual(instance.get_task_summary(first_task),
                     token_instance.get_task_summary(first_task))

    # ...but not creating new tables.
    with self.assertRaises(errors.NoPermission):
        token_odps.create_table(
            tn('test_bearer_token_account_table_test1'),
            'col string', lifecycle=1)

    # A made-up token must fail with a generic ODPS error.
    fake_account = BearerTokenAccount(token='fake-token')
    fake_odps = ODPS(None, None, self.odps.project, self.odps.endpoint,
                     account=fake_account)
    with self.assertRaises(errors.ODPSError):
        fake_odps.create_table(
            tn('test_bearer_token_account_table_test2'),
            'col string', lifecycle=1)
def testTokenizedSignServerAccount(self):
    """Check that a token-protected SignServer refuses unauthenticated
    clients and serves properly tokenized ones.

    Bug fix: the original called ``t.drop(async=True)`` — ``async`` is a
    reserved keyword on Python 3.7+, so this line was a SyntaxError. The
    correct keyword argument (used by the sibling test) is ``async_``.
    """
    server = SignServer(token=str(uuid.uuid4()))
    server.accounts[
        self.odps.account.access_id] = self.odps.account.secret_access_key
    try:
        server.start(('127.0.0.1', 0))

        # Without the token, the sign server must refuse to sign requests.
        account = SignServerAccount(self.odps.account.access_id,
                                    server.server.server_address)
        odps = ODPS(None, None, self.odps.project, self.odps.endpoint,
                    account=account)
        self.assertRaises(
            SignServerError,
            lambda: odps.delete_table(tn('test_sign_account_table'),
                                      if_exists=True))

        # With the token, normal operations succeed.
        account = SignServerAccount(self.odps.account.access_id,
                                    server.server.server_address,
                                    token=server.token)
        odps = ODPS(None, None, self.odps.project, self.odps.endpoint,
                    account=account)
        odps.delete_table(tn('test_sign_account_table'), if_exists=True)
        t = odps.create_table(tn('test_sign_account_table'), 'col string',
                              lifecycle=1)
        self.assertTrue(odps.exist_table(tn('test_sign_account_table')))
        t.drop(async_=True)  # was async=True: reserved keyword in py3
    finally:
        server.stop()
def testSignServerAccount(self):
    """Exercise a token-less SignServer: any client holding a registered
    access id can have its requests signed."""
    server = SignServer()
    server.accounts[
        self.odps.account.access_id] = self.odps.account.secret_access_key
    try:
        server.start(('127.0.0.1', 0))
        signed_account = SignServerAccount(
            self.odps.account.access_id, server.server.server_address)
        signed_odps = ODPS(None, None, self.odps.project, self.odps.endpoint,
                           account=signed_account)
        signed_odps.delete_table(tn('test_sign_account_table'), if_exists=True)
        created = signed_odps.create_table(
            tn('test_sign_account_table'), 'col string', lifecycle=1)
        self.assertTrue(signed_odps.exist_table(tn('test_sign_account_table')))
        created.drop(async_=True)
    finally:
        server.stop()
def run(args):
    """Download an ODPS table, draw a random sample of its rows, and write
    the sample into a freshly (re)created output table.

    Table names may be given as "project.table" to override the default
    project taken from the CLI arguments.
    """
    # Build an ODPS client from the CLI credentials.
    o = ODPS(args.odps_access_id, args.odps_access_key, args.odps_project,
             endpoint=args.odps_endpoint)

    in_project, in_name = args.odps_project, args.input_table_name
    if '.' in in_name:
        pieces = args.input_table_name.split(".")
        in_project, in_name = pieces[0], pieces[1]

    # Pull the input table into pandas and sample it.
    input_table = o.get_table(in_name, project=in_project)
    sampled = input_table.to_df().to_pandas().sample(args.sample_row_count)

    # Recreate the output table with the same schema and upload the rows.
    o.delete_table(args.output_table_name, if_exists=True)
    out_project, out_name = args.odps_project, args.output_table_name
    if '.' in out_name:
        pieces = args.output_table_name.split(".")
        out_project, out_name = pieces[0], pieces[1]
    o.create_table(out_name, input_table.schema, project=out_project,
                   if_not_exists=False, lifecycle=3)
    o.write_table(out_name, sampled.values.tolist(), project=out_project)
class ODPSWriter(object):
    """Writes record batches into an ODPS table, one partition per worker.

    The table is bound lazily: on first write it is either fetched (when it
    already exists) or created from the supplied column definitions.
    """

    def __init__(self, project, access_id, access_key, endpoint, table,
                 columns=None, column_types=None, options=None):
        """
        Constructs a `ODPSWriter` instance.

        Args:
            project: Name of the ODPS project.
            access_id: ODPS user access ID.
            access_key: ODPS user access key.
            endpoint: ODPS cluster endpoint.
            table: ODPS table name; "project.table" overrides `project`.
            columns: Column names; inferred when the table already exists.
            column_types: Column types; inferred when the table already
                exists.
            options: Other options passed to the ODPS context.
        """
        super(ODPSWriter, self).__init__()

        # A dotted table name carries its own project.
        if table.find(".") > 0:
            project, table = table.split(".")

        self._project = project
        self._access_id = access_id
        self._access_key = access_key
        self._endpoint = endpoint
        self._table = table
        self._columns = columns
        self._column_types = column_types
        self._odps_table = None

        _configure_odps_options(
            self._endpoint, options if options is not None else {})
        self._odps_client = ODPS(self._access_id, self._access_key,
                                 self._project, self._endpoint)

    def _initialize_table(self):
        """Bind to the existing table, or create it from the given schema."""
        if self._odps_client.exist_table(self._table, self._project):
            self._odps_table = self._odps_client.get_table(
                self._table, self._project)
            return

        if self._columns is None or self._column_types is None:
            raise ValueError("columns and column_types need to be "
                             "specified for non-existing table.")
        # A "worker" string partition keys rows by the writing worker.
        schema = Schema.from_lists(self._columns, self._column_types,
                                   ["worker"], ["string"])
        self._odps_table = self._odps_client.create_table(self._table, schema)

    def from_iterator(self, records_iter, worker_index):
        """Stream record batches into partition worker=<worker_index>."""
        if self._odps_table is None:
            self._initialize_table()

        partition_spec = "worker=" + str(worker_index)
        with self._odps_table.open_writer(partition=partition_spec,
                                          create_partition=True) as writer:
            for batch in records_iter:
                writer.write(batch)
class ODPSSql(Magics):
    """IPython magics for running ODPS SQL and persisting pandas frames.

    Bug fix: ``persist`` raised ``TypeError("%s already exists")`` without
    interpolating the table name; the format argument is now supplied.
    """

    # Lazily-initialized shared ODPS client.
    _odps = None

    def _set_odps(self):
        """Bind an ODPS client from global options, or fall back to the
        default room's client."""
        if self._odps is not None:
            return

        if options.access_id is not None and \
                options.access_key is not None and \
                options.default_project is not None:
            self._odps = ODPS(
                options.access_id, options.access_key,
                options.default_project,
                endpoint=options.end_point,
                tunnel_endpoint=options.tunnel_endpoint,
            )
        else:
            self._odps = enter().odps

    @line_magic("enter")
    def enter(self, line):
        """%enter [room]: enter a room and bind its ODPS client."""
        room = line.strip()
        if room:
            r = enter(room)
            self._odps = r.odps
        else:
            r = enter()
            self._odps = r.odps
        return r

    @line_magic("setup")
    def setup(self, line):
        """%setup <room> <args...>: configure a room."""
        args = line.strip().split()
        name, args = args[0], args[1:]
        setup(*args, room=name)

    @line_magic("teardown")
    def teardown(self, line):
        """%teardown <room>: dispose of a room."""
        name = line.strip()
        teardown(name)

    @line_magic("list_rooms")
    def list_rooms(self, line):
        """%list_rooms: list all configured rooms."""
        return list_rooms()

    @line_magic("stores")
    def list_stores(self, line):
        """%stores [room]: display the stores of a room."""
        line = line.strip()
        if line:
            room = enter(line)
        else:
            room = enter()
        return room.display()

    def _get_task_percent(self, instance, task_name):
        """Average progress (0..1) across the stages of one task; 0 when
        the task reports no stages yet."""
        progress = instance.get_task_progress(task_name)
        if len(progress.stages) > 0:
            all_percent = sum(
                (float(stage.terminated_workers) / stage.total_workers)
                for stage in progress.stages if stage.total_workers > 0)
            return all_percent / len(progress.stages)
        else:
            return 0

    @line_cell_magic("sql")
    def execute(self, line, cell=""):
        """%sql / %%sql: run a SQL statement with a progress bar, returning
        a pandas DataFrame when pandas is importable, else a ResultFrame."""
        self._set_odps()

        sql = line + "\n" + cell
        sql = sql.strip()

        if sql:
            bar = init_progress_bar()

            instance = self._odps.run_sql(sql)
            percent = 0
            while not instance.is_terminated():
                task_names = instance.get_task_names()
                last_percent = percent
                if len(task_names) > 0:
                    percent = sum(
                        self._get_task_percent(instance, name)
                        for name in task_names) / len(task_names)
                else:
                    percent = 0
                # Keep the reported progress monotonic and capped at 1.
                percent = min(1, max(percent, last_percent))
                bar.update(percent)
                time.sleep(1)

            instance.wait_for_success()
            bar.update(1)

            try:
                with instance.open_reader() as reader:
                    try:
                        import pandas as pd

                        try:
                            return pd.read_csv(StringIO(reader.raw))
                        except ValueError:
                            # Raw payload is not CSV; hand it back verbatim.
                            return reader.raw
                    except ImportError:
                        return ResultFrame(list(reader),
                                           columns=reader._columns)
            finally:
                bar.close()

    @line_magic("persist")
    def persist(self, line):
        """%persist <frame> <table>: write a pandas DataFrame into a new
        ODPS table (optionally "project.table")."""
        import pandas as pd

        self._set_odps()

        line = line.strip().strip(";")
        frame_name, table_name = line.split(None, 1)

        if "." in table_name:
            project_name, table_name = tuple(table_name.split(".", 1))
        else:
            project_name = None

        frame = self.shell.user_ns[frame_name]
        if not isinstance(frame, pd.DataFrame):
            raise TypeError("%s is not a Pandas DataFrame" % frame_name)

        columns = list(frame.columns)
        types = [np_to_odps_types.get(tp, odps_types.string)
                 for tp in frame.dtypes]

        if self._odps.exist_table(table_name, project=project_name):
            # Bug fix: original omitted the format argument, so the message
            # always read literally "%s already exists".
            raise TypeError("%s already exists" % table_name)

        tb = self._odps.create_table(table_name,
                                     Schema.from_lists(columns, types))

        def gen(df):
            """Yield table records row by row, updating a progress bar."""
            size = len(df)
            bar = init_progress_bar(size)
            try:
                c = itertools.count()
                for row in df.values:
                    i = next(c)
                    if i % 50 == 0:
                        bar.update(min(i, size))
                    yield tb.new_record(list(row))
                bar.update(size)
            finally:
                bar.close()

        with tb.open_writer() as writer:
            writer.write(gen(frame))
class myOdps:
    """Thin convenience wrapper around an ODPS connection.

    Bug fixes versus the original:
    - ``uploadCSV``'s non-partition branch iterated ``table_columns[:-1]``,
      silently dropping the last data column (there is no partition column
      to skip in that branch).
    - ``creat_table`` used a bare ``except:``; narrowed to ``Exception``.
    - ``downloaAndUp`` leaked its file handle on error; now closed in
      ``finally``.
    """

    def __init__(self, access_id, secret_access_key, project):
        # Initialize an ODPS connection object.
        self.odps = ODPS(access_id=access_id,
                         secret_access_key=secret_access_key,
                         project=project,
                         end_point="http://service.odps.aliyun.com/api")

    def get_all_tabel(self):
        """Return the names of all tables in the project."""
        table_name = []
        for table in self.odps.list_tables():
            table_name.append(table.name)
        return table_name

    def creat_table(self, table_name, columns=None, if_not_exists=True):
        """Create a table, falling back to fetching it when creation fails.

        Args:
            table_name: table name.
            columns: e.g. ('num bigint, num2 double', 'pt string') —
                field and partition definitions (or a Schema object).
            if_not_exists: only create when the table does not exist.

        Returns:
            The table object.
        """
        try:
            return self.odps.create_table(table_name, columns,
                                          if_not_exists=if_not_exists)
        except Exception:
            # Best-effort fallback: the table (or a conflicting one)
            # already exists — return whatever is there.
            return self.odps.get_table(table_name)

    def get_a_table(self, table_name):
        """Fetch a table object by name."""
        return self.odps.get_table(table_name)

    def drop_a_table(self, table_name):
        """Delete a table; returns the deletion result."""
        return self.odps.delete_table(table_name)

    def get_partitions(self, table):
        """Return all partition names of the given table object."""
        partitions = []
        for partition in table.partitions:
            partitions.append(partition.name)
        return partitions

    # ============= data upload ============

    def uploadCSV(self, csvFilename, tableName, sep=",", pt=None):
        """Upload a local CSV (which must have a header row) into ODPS,
        creating the table as all-string columns.

        :param csvFilename: path of the local csv; must have a header row
        :param tableName: target table name in ODPS
        :param sep: csv delimiter
        :param pt: partition value to write into, or None for no partition
        """
        print("start upload ...\n")
        df = pd.read_csv(csvFilename, sep=sep)
        shape0 = df.shape[0]
        columns = [
            Column(name=f"{x}", type='string', comment='the column')
            for x in df.columns
        ]
        if pt:
            partitions = [
                Partition(name='pt', type='string', comment='the partition')
            ]
            schema = Schema(columns=columns, partitions=partitions)
            table = self.creat_table(tableName, schema)
            table.create_partition(f"pt={pt}", if_not_exists=True)
            # schema.columns ends with the 'pt' partition column; it is
            # filled manually below, hence the [:-1] slice.
            table_columns = [i.name for i in table.schema.columns]
            with table.open_writer(partition=f"pt={pt}") as writer:
                for index in df.index:
                    print(f"{index+1}/{shape0} in {tableName} ...")
                    item_dict = dict(df.loc[index])
                    item = []
                    for field in table_columns[:-1]:
                        item.append(item_dict.get(field, ''))
                    item.append(pt)
                    writer.write(item)
        else:
            schema = Schema(columns=columns)
            table = self.creat_table(tableName, schema)
            table_columns = [i.name for i in table.schema.columns]
            with table.open_writer(partition=None) as writer:
                for index in df.index:
                    print(f"{index+1}/{shape0} in {tableName} ...")
                    item_dict = dict(df.loc[index])
                    item = []
                    # Bug fix: originally iterated table_columns[:-1],
                    # dropping the last data column — no partition column
                    # exists in this branch, so all columns are written.
                    for field in table_columns:
                        item.append(item_dict.get(field, ''))
                    writer.write(item)
        print("\n\n upload finish ...")

    def downloaAndUp(self, csvFilename, tableName, sep=",", urlIndex=1,
                     pt=None):
        """Upload a CSV while replacing the URL column (at ``urlIndex``)
        of each row with the downloaded content of that URL.

        :param csvFilename: path of the local csv; must have a header row
        :param tableName: target table name in ODPS
        :param sep: csv delimiter
        :param urlIndex: index of the url field within each row
        :param pt: partition value to write into, or None for no partition
        """
        print("start upload ...\n")
        f = open(csvFilename, encoding='utf-8')
        try:
            # First line is the header; it defines the column names.
            first_line = f.readlines(1)[0].strip('\n').split(sep)
            columns = [
                Column(name=f"{x}", type='string', comment='the column')
                for x in first_line
            ]
            if pt:
                partitions = [
                    Partition(name='pt', type='string',
                              comment='the partition')
                ]
                schema = Schema(columns=columns, partitions=partitions)
                table = self.creat_table(tableName, schema)
                table.create_partition(f"pt={pt}", if_not_exists=True)
                with table.open_writer(partition=f"pt={pt}") as writer:
                    for index, line in enumerate(f):
                        print(f"{index} in {tableName} ...")
                        item = line.strip('\n').split(sep)
                        item.append(pt)
                        resp = download(item[urlIndex])
                        data = resp.text
                        # ~8 MB cap — presumably the ODPS string field
                        # limit; oversized payloads keep the raw URL.
                        if sys.getsizeof(data) <= 8 * 1024 * 1000:
                            item[urlIndex] = data
                        else:
                            print(f"failed in {item[0]}")
                        writer.write(item)
            else:
                schema = Schema(columns=columns)
                table = self.creat_table(tableName, schema)
                with table.open_writer(partition=None) as writer:
                    for index, line in enumerate(f):
                        print(f"{index} in {tableName} ...")
                        item = line.strip('\n').split(sep)
                        resp = download(item[urlIndex])
                        data = resp.text
                        if sys.getsizeof(data) <= 8 * 1024 * 1000:
                            item[urlIndex] = data
                        else:
                            print(f"failed in {item[0]}")
                        writer.write(item)
            print("\n\n upload finish ...")
        finally:
            # Bug fix: close the file even when an upload step raises.
            f.close()

    # =========== run sql =========

    def select_sql(self, sql):
        """Run a SQL query; returns the result reader (iterable).

        NOTE(review): the reader is returned after its ``with`` block has
        exited, which may leave it closed — confirm against callers.
        """
        with self.odps.execute_sql(sql).open_reader() as reader:
            return reader
class OdpsHelper:
    """Helper for moving data between pandas DataFrames and ODPS tables.

    Bug fixes versus the original:
    - ``__init__`` stored ``self.varbose`` while the methods read
      ``self.verbose`` (AttributeError); both names are now set to stay
      backward compatible.
    - ``generate_odps_columns_from_pandas_df`` keyed ``type_dict`` on the
      column *name* rather than its dtype, so every column mapped to
      STRING; it now keys on ``str(column_types[c_name])``. Its log print
      had mismatched quotes (SyntaxError) and is reconstructed.
    - ``read_odps_to_pandas_df`` bound ``t = Table`` (the class) instead
      of the argument, and ``if self.varbose print(...)`` was missing a
      colon (SyntaxError).
    - Redundant ``writer.close()`` inside the ``with`` block removed.
    """

    def __init__(self, access_id="", secret_access_key="", project="",
                 endpoint="", verbose=1, link_now=1):
        self.access_id = access_id
        self.secret_access_key = secret_access_key
        self.project = project
        self.endpoint = endpoint
        # Keep the original (misspelled) attribute for backward compat.
        self.verbose = self.varbose = verbose
        self.odps_instance = None
        if link_now:
            self.link()

    def link(self):
        """Open the ODPS connection and cache it on the instance."""
        self.odps_instance = ODPS(access_id=self.access_id,
                                  secret_access_key=self.secret_access_key,
                                  project=self.project,
                                  endpoint=self.endpoint)
        return self.odps_instance

    def generate_odps_columns_from_pandas_df(self, df):
        """Map a DataFrame's dtypes to lower-cased ODPS Column objects."""
        type_dict = {"int64": 'BIGINT', 'float64': 'DOUBLE',
                     'object': 'STRING'}
        column_types = df.dtypes
        odps_columns = []
        for c_name in column_types.index:
            # Bug fix: look up the dtype, not the column name.
            temp_type = type_dict.get(str(column_types[c_name]), 'STRING')
            if self.verbose:
                print("Transfer from pandas columns to odps columns:",
                      c_name + '->' + c_name.lower() + ':'
                      + str(column_types[c_name]) + '->' + temp_type)
            odps_columns.append(Column(name=c_name.lower(), type=temp_type))
        return odps_columns

    def generate_table(self, table_name, columns, if_not_exists=True):
        """Create a table from Column objects and return the table object.

        NOTE(review): the table is created with a lower-cased name but
        fetched with the original casing — presumably ODPS names are
        case-insensitive; confirm.
        """
        schema = Schema(columns=columns)
        self.odps_instance.create_table(table_name.lower(), schema,
                                        if_not_exists=if_not_exists)
        t = self.odps_instance.get_table(table_name)
        return t

    def write_odps_from_pandas_df(self, table, df):
        """Write a DataFrame into a table (name or Table object),
        reordering columns to match the table schema."""
        if isinstance(table, str):
            t = self.generate_table(
                table, self.generate_odps_columns_from_pandas_df(df))
        elif isinstance(table, Table):
            t = table
        else:
            raise Exception("Unsupported data type")
        df_renamed = df.rename(str.lower, axis=1)
        columns_for_sort = [x.name for x in t.schema.columns]
        df_final = df_renamed.loc[:, columns_for_sort]
        odps_data = np.array(df_final).tolist()
        with t.open_writer() as writer:
            writer.write(odps_data)

    def write_odps_from_csv(self, table_name, csv_path):
        """Load a CSV with pandas and upload it to the named table."""
        df = pd.read_csv(csv_path)
        self.write_odps_from_pandas_df(table_name, df)

    def read_odps_to_pandas_df(self, table, top_n=-1):
        """Read a table (name or Table object) into a pandas DataFrame,
        optionally stopping after ``top_n`` rows."""
        if isinstance(table, str):
            t = self.odps_instance.get_table(table)
        elif isinstance(table, Table):
            # Bug fix: original bound the Table class, not the argument.
            t = table
        else:
            raise Exception("Unsupported data type")
        log.info("============== Table info:\n" + str(t))
        table_name = t.name
        columns = []
        with t.open_reader() as reader:
            count = reader.count
            log.info("=========== row count:" + str(count))
            # NOTE(review): assumes iterating a record yields (name, value)
            # pairs so [:, 0] extracts the column names — confirm.
            for row in reader[:1]:
                columns = np.array(row)[:, 0]
        data = []
        counter = 0
        step_count = 100  # NOTE(review): unused; possibly a progress step
        counters = 0
        step_length = count
        if self.verbose:
            print("========= start reading table:", table_name,
                  'total:', count, 'step length', step_length)
            print("========= read top:", top_n)
        start_time = time.time()
        columns_len = len(columns)
        for row in t.open_reader():
            temp_data = []
            for j in range(columns_len):
                temp_data.append(row[j])
            data.append(temp_data)
            counter += 1
            if top_n > 0:
                if (counters + counter) == top_n:
                    break
            if counter == step_length:
                counters += counter
                step_now = round(counters * 100 / count, 1)
                if self.verbose:
                    print("100%")
        print("============= end reading table.Elasped minutes:",
              round((time.time() - start_time) / 60, 2))
        result = pd.DataFrame(data, columns=columns)
        return result