class OdpsConn(object):
    """Context-managed wrapper around an ODPS project connection.

    Credentials are taken from the module-level ``ODPSCONF`` object. The
    underlying ``ODPS`` client is only created when the ``with`` block is
    entered, and is stored on ``self.odps``.
    """

    def __init__(self, project):
        """Store the credentials and project name; no connection is made yet.

        :param project: project name passed to ``ODPS(...)`` on ``__enter__``.
        """
        self.access_id = ODPSCONF.key
        self.access_key = ODPSCONF.sec
        self.project = project
        self.odps = None

    def __enter__(self):
        """Create the ODPS client and return this wrapper.

        :raises ValueError: if constructing the client raises any exception;
            the original error text becomes the ``ValueError`` message.
        """
        try:
            self.odps = ODPS(self.access_id, self.access_key, self.project)
        except Exception as exc:
            # ``exc.message`` exists only on Python 2; ``str(exc)`` works on
            # both major versions and keeps the intended ValueError contract.
            raise ValueError(str(exc))
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        """Leave the ``with`` block without suppressing exceptions."""
        # The previous ``del self`` only unbound the local name and released
        # nothing; ``self.odps`` is intentionally left in place as before.
        return None

    def get_table_count_and_names(self):
        """Return the number of tables in the project and their names.

        :return: ``(count, names)`` where ``names`` is a list of table names.
        """
        names = [table.name for table in self.odps.list_tables()]
        return len(names), names

    def get_table_schema(self, tname):
        """Return the column names of a table.

        :param tname: table name.
        :return: list of column names taken from ``table.schema.columns``.
        """
        table = self.odps.get_table(tname)
        return [column.name for column in table.schema.columns]

    def execute_sql(self, sql):
        """Run a SQL statement and collect the result rows.

        :param sql: SQL text passed to ``odps.execute_sql``.
        :return: list with the ``values`` of every record read.
        """
        with self.odps.execute_sql(sql).open_reader() as reader:
            return [record.values for record in reader]

    def get_table_last_update_time(self, tname):
        """Return ``last_modified_time`` of a table.

        :param tname: table name.
        :return: the table's ``last_modified_time``, or ``None`` when the
            object returned by ``get_table`` is falsy.
        """
        t = self.odps.get_table(tname)
        return t.last_modified_time if t else None

    def count_table(self, table):
        """Return the row count of a table via ``select count(1)``.

        :param table: table name. It is interpolated directly into the SQL
            text, so only trusted identifiers should be passed.
        :return: first value of the first result record.
        """
        sql = 'select count(1) from %s' % table
        with self.odps.execute_sql(sql).open_reader() as reader:
            return reader[0].values[0]
2、源端的表在目标端不存在则不校验该表,源端的分区在目标端不存在则不校验该分区 3、用法:python 脚本.py 源项目名称 目标项目名称 ''' s = ODPS('', '', '%s' % sys.argv[1], endpoint='http://service.cn.maxcompute.aliyun.com/api') d = ODPS('', '', '%s' % sys.argv[2], endpoint='http://service.cn.maxcompute.aliyun.com/api') print("######################################################################") for table in s.list_tables(): t1 = s.get_table(table.name) if d.exist_table(table.name): t2 = d.get_table(table.name) else: print("表%s 在目标项目%s中不存在 跳过校验" % (table.name, sys.argv[2])) continue if table.schema.partitions: #判断该表是否为分区表 #print 'Table %s is partitioned.' %table.name for partition in table.partitions: #print partition.name with t1.open_reader(partition='%s' % partition.name) as reader: count1 = reader.count #print "表名:%s\t分区:%s\t数据量:%s" %(table.name,partition.name,count1) if t2.exist_partition(partition.name):
# coding=utf-8
"""Append the DDL of every table in the ``sync_data`` project to a text file."""
__author__ = 'zhangteng'

from odps import ODPS
from config import ODPSCONF

O = ODPS(ODPSCONF.key, ODPSCONF.sec, 'sync_data',
         endpoint='http://service.odps.aliyun.com/api')

# ``with`` closes (and flushes) the output file even when an ODPS call raises
# part-way through the listing; the manual open()/close() pair did not, and
# lost whatever text was still buffered in memory.
with open('odps_ddl_1208.txt', 'a') as f:
    # Tables are numbered from 1, matching the original counter.
    for i, t in enumerate(O.list_tables(), 1):
        print(i, t.name)
        # Header line: "<index>.\t<table name>\t<table comment>".
        f.write("\n\n%s.\t%s\t%s\n" % (i, t.name, t.comment))
        # The generated DDL carries the placeholder 'table_name'; substitute
        # the real table name.
        f.write(t.schema.get_table_ddl().replace('table_name', t.name))
"""Walk-through of a few basic PyODPS calls against a demo project."""
from odps import ODPS

o = ODPS(
    'accesskey',
    'xxxxxx',
    project='DWLabClone_67226234odps',
    endpoint='http://service.cn-shanghai.maxcompute.aliyun.com/api',
)

# Show the project object returned for this connection.
project = o.get_project()
print(project)

# Print every table listed for the project.
for table in o.list_tables():
    print(table)

t = o.get_table('ods_user_info_d')

# Print the records returned by head(3).
for record in t.head(3):
    print(record)

print(next(t.partitions))

# Read from one fixed partition: print its record count, then inspect the
# records in the slice [5:7].
with t.open_reader(partition="dt=20210706") as reader:
    count = reader.count
    print(count)
    for record in reader[5:7]:
        print(record)
        print(type(record))
        for field in ('uid', 'age_range'):
            print(record[field])

print('*' * 100)

t_rpt = o.get_table('rpt_user_info_d')
# for record in t_rpt.head(3):
#     print(record)
class myOdps:
    """Convenience wrapper around a PyODPS ``ODPS`` client.

    Provides helpers to list, create, fetch and drop tables, to upload CSV
    data into a table (optionally into a ``pt`` partition), and to run SQL.
    """

    def __init__(self, access_id, secret_access_key, project):
        """Create the underlying ODPS connection object.

        :param access_id: access id passed to ``ODPS``.
        :param secret_access_key: secret key passed to ``ODPS``.
        :param project: project name passed to ``ODPS``.
        """
        # The keyword is ``endpoint`` (the original ``end_point`` is not an
        # ODPS constructor argument).
        self.odps = ODPS(access_id=access_id,
                         secret_access_key=secret_access_key,
                         project=project,
                         endpoint="http://service.odps.aliyun.com/api")

    def get_all_tabel(self):
        """Return the names of all tables listed by the connection."""
        return [table.name for table in self.odps.list_tables()]

    def creat_table(self, table_name, columns=None, if_not_exists=True):
        """Create a table, falling back to fetching it when creation fails.

        :param table_name: table name.
        :param columns: schema passed to ``create_table``; e.g. a tuple such
            as ``('num bigint, num2 double', 'pt string')`` (columns,
            partitions) or a ``Schema`` object.
        :param if_not_exists: forwarded to ``create_table``.
        :return: the table object.
        """
        try:
            return self.odps.create_table(table_name, columns,
                                          if_not_exists=if_not_exists)
        except Exception:
            # Best-effort fallback kept from the original, but no longer a
            # bare ``except`` that would also swallow KeyboardInterrupt /
            # SystemExit.
            return self.odps.get_table(table_name)

    def get_a_table(self, table_name):
        """Return the table object for ``table_name``."""
        return self.odps.get_table(table_name)

    def drop_a_table(self, table_name):
        """Delete ``table_name`` and return the result of ``delete_table``."""
        return self.odps.delete_table(table_name)

    def get_partitions(self, table):
        """Return the names of all partitions of a table object.

        :param table: table object exposing an iterable ``partitions``.
        :return: list of partition names.
        """
        return [partition.name for partition in table.partitions]

    # ============= data upload ============

    def _prepare_table(self, tableName, columns, pt):
        """Create (or fetch) the target table; return ``(table, partition_spec)``.

        When ``pt`` is truthy the table gets a string partition column ``pt``
        and the partition ``pt=<pt>`` is created if missing; otherwise the
        table is non-partitioned and ``partition_spec`` is ``None``.
        """
        if pt:
            partitions = [
                Partition(name='pt', type='string', comment='the partition')
            ]
            schema = Schema(columns=columns, partitions=partitions)
            table = self.creat_table(tableName, schema)
            partition_spec = f"pt={pt}"
            table.create_partition(partition_spec, if_not_exists=True)
            return table, partition_spec
        schema = Schema(columns=columns)
        return self.creat_table(tableName, schema), None

    def uploadCSV(self, csvFilename, tableName, sep=",", pt=None):
        """Upload a local CSV file (with a header row) into an ODPS table.

        Every CSV column becomes a ``string`` column of the table.

        :param csvFilename: path of the local CSV file; must have a header.
        :param tableName: name of the target table.
        :param sep: CSV separator.
        :param pt: partition value; when truthy, rows are written to the
            partition ``pt=<pt>``, otherwise to a non-partitioned table.
        """
        print("start upload ...\n")
        df = pd.read_csv(csvFilename, sep=sep)
        shape0 = df.shape[0]
        columns = [
            Column(name=f"{x}", type='string', comment='the column')
            for x in df.columns
        ]
        table, partition_spec = self._prepare_table(tableName, columns, pt)
        data_fields = [i.name for i in table.schema.columns]
        if pt:
            # For the partitioned table the last schema column is the
            # partition column; its value is appended separately below.
            data_fields = data_fields[:-1]
        # The non-partitioned branch previously also sliced off the last
        # column, silently dropping real data; all columns are kept now.
        with table.open_writer(partition=partition_spec) as writer:
            for index in df.index:
                print(f"{index+1}/{shape0} in {tableName} ...")
                item_dict = dict(df.loc[index])
                item = [item_dict.get(field, '') for field in data_fields]
                if pt:
                    item.append(pt)
                writer.write(item)
        print("\n\n upload finish ...")

    def downloaAndUp(self, csvFilename, tableName, sep=",", urlIndex=1, pt=None):
        """Upload a CSV file, replacing each row's URL field by its content.

        For every data row the field at ``urlIndex`` is fetched with
        ``download``; when the downloaded text is small enough it replaces
        the URL in the row, otherwise the row is written with the URL kept
        and a failure line is printed.

        :param csvFilename: path of the local CSV file; must have a header.
        :param tableName: name of the target table.
        :param sep: CSV separator.
        :param urlIndex: position of the URL field within a row.
        :param pt: partition value; when truthy, rows are written to the
            partition ``pt=<pt>``.
        """
        print("start upload ...\n")
        # ``with`` closes the CSV file even when a download or write fails.
        with open(csvFilename, encoding='utf-8') as f:
            first_line = f.readlines(1)[0].strip('\n').split(sep)
            columns = [
                Column(name=f"{x}", type='string', comment='the column')
                for x in first_line
            ]
            table, partition_spec = self._prepare_table(tableName, columns, pt)
            with table.open_writer(partition=partition_spec) as writer:
                for index, line in enumerate(f):
                    print(f"{index} in {tableName} ...")
                    item = line.strip('\n').split(sep)
                    if pt:
                        # Appended before indexing, as in the original, so a
                        # negative ``urlIndex`` addresses the same element.
                        item.append(pt)
                    resp = download(item[urlIndex])
                    data = resp.text
                    if sys.getsizeof(data) <= 8 * 1024 * 1000:
                        item[urlIndex] = data
                    else:
                        print(f"failed in {item[0]}")
                    writer.write(item)
            print("\n\n upload finish ...")

    # =========== run sql =========

    def select_sql(self, sql):
        """Run a query and return its result reader.

        :param sql: SQL text passed to ``odps.execute_sql``.
        :return: the reader opened on the query result (iterable of records).
            The reader is handed over open; the previous version returned it
            from inside a ``with`` block, i.e. after it had been closed.
            The caller is responsible for closing it.
        """
        return self.odps.execute_sql(sql).open_reader()
from odps.df import DataFrame
from odps.types import Schema, Record

# Demo script: connect to a project, list its tables, then run one SQL
# statement synchronously and print every result record.
# NOTE(review): the access id is empty and the secret key is hard-coded in the
# source below — credentials should not live in version control; confirm and
# move them to configuration.
odps = ODPS('', 'PtMa1T01Nq0y2da8SBl0FRMmgxjE8X', 'GyyStatistical', endpoint='https://service.odps.aliyun.com/api')
# Fetch a specific project
project = odps.get_project('GyyStatistical')
# Fetch the default project
# project = odps.get_project()
# List all tables in the project
print('----列出项目下所有的表 start----')
for table in odps.list_tables():
    print(table)
print('----列出项目下所有的表 end----\n')

# Synchronous execution: execute_sql returns an instance whose reader is
# iterated record by record.
print('----同步方式 执行SQL语句 start----')
instance = odps.execute_sql('select * from ots_arealist1')
with instance.open_reader() as reader:
    for record in reader:
        print(record)
        print(type(record))
print('----同步方式 执行SQL语句 end----\n')

# Asynchronous variant (disabled):
# print('----异步方式 执行SQL语句 start----')
# instance = odps.run_sql('select * from ots_arealist')
# instance.wait_for_success()
# with instance.open_reader() as reader: