Beispiel #1
0
class OdpsConn(object):
    """Context-managed wrapper around an ODPS client bound to one project.

    Credentials are read from ``ODPSCONF`` (``key`` / ``sec``). The client
    itself is only created when the ``with`` block is entered.
    """

    def __init__(self, project):
        """Remember the credentials and project name; no connection is made yet.

        :param project: name of the ODPS project to connect to
        """
        self.access_id = ODPSCONF.key
        self.access_key = ODPSCONF.sec
        self.project = project

        # Populated by __enter__.
        self.odps = None

    def __enter__(self):
        """Create the ODPS client and return this wrapper.

        :raises ValueError: if constructing the client fails for any reason
        """
        try:
            self.odps = ODPS(self.access_id, self.access_key, self.project)
        except Exception as exc:
            # Exceptions have no ``message`` attribute on Python 3; str(exc)
            # yields the error text on both Python 2 and Python 3.
            raise ValueError(str(exc))
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        """Leave the ``with`` block; exceptions raised inside it propagate."""
        # Nothing is released explicitly here. Returning False tells Python
        # not to suppress an in-flight exception.
        return False

    def get_table_count_and_names(self):
        """Return the number of tables in the project and their names.

        :return: tuple ``(count, names)`` where ``names`` is a list of table names
        """
        names = [table.name for table in self.odps.list_tables()]
        return len(names), names

    def get_table_schema(self, tname):
        """Return the column names of a table.

        :param tname: table name
        :return: list of column names taken from the table's schema
        """
        table = self.odps.get_table(tname)
        return [column.name for column in table.schema.columns]

    def execute_sql(self, sql):
        """Run a SQL statement and collect the result rows.

        :param sql: SQL text to execute
        :return: list containing ``record.values`` for every record read
        """
        with self.odps.execute_sql(sql).open_reader() as reader:
            return [record.values for record in reader]

    def get_table_last_update_time(self, tname):
        """Return the table's ``last_modified_time``, or None if the table is falsy.

        :param tname: table name
        """
        t = self.odps.get_table(tname)
        return t.last_modified_time if t else None

    def count_table(self, table):
        """Return the row count of a table via ``select count(1)``.

        :param table: table name
        :return: first value of the first result record
        """
        # NOTE: the table name is interpolated into the SQL text as-is, so it
        # must come from a trusted source.
        sql = 'select count(1) from %s' % table
        with self.odps.execute_sql(sql).open_reader() as reader:
            return reader[0].values[0]
Beispiel #2
0
	2、源端的表在目标端不存在则不校验该表,源端的分区在目标端不存在则不校验该分区
	3、用法:python 脚本.py 源项目名称 目标项目名称
'''

# Client for the source project; its name is the first command-line argument.
# The credential arguments are empty strings in this script.
s = ODPS(
    '',
    '',
    project='%s' % sys.argv[1],
    endpoint='http://service.cn.maxcompute.aliyun.com/api',
)

# Client for the target project; its name is the second command-line argument.
d = ODPS(
    '',
    '',
    project='%s' % sys.argv[2],
    endpoint='http://service.cn.maxcompute.aliyun.com/api',
)

# Visual separator before the per-table output.
print("######################################################################")

# Walk every table of the source project; tables that do not exist in the
# target project are reported and skipped.
for table in s.list_tables():
    t1 = s.get_table(table.name)
    if d.exist_table(table.name):
        t2 = d.get_table(table.name)
    else:
        print("表%s 在目标项目%s中不存在 跳过校验" % (table.name, sys.argv[2]))
        continue

    if table.schema.partitions:  # check whether this table is partitioned
        for partition in table.partitions:
            # Row count of this partition in the source table.
            with t1.open_reader(partition='%s' % partition.name) as reader:
                count1 = reader.count
            # Check whether the target table has the same partition.
            if t2.exist_partition(partition.name):
Beispiel #3
0
# coding=utf-8
__author__ = 'zhangteng'

from odps import ODPS
from config import ODPSCONF

# Client for the 'sync_data' project, using credentials from ODPSCONF.
O = ODPS(ODPSCONF.key,
         ODPSCONF.sec,
         'sync_data',
         endpoint='http://service.odps.aliyun.com/api')

i = 1
s = ''

# Append the DDL of every table to the output file. The `with` block makes
# sure the file is closed even if listing tables or generating a DDL raises.
with open('odps_ddl_1208.txt', 'a') as f:
    for t in O.list_tables():
        print(i, t.name)
        # Header line: running number, table name and table comment.
        s += "\n\n%s.\t%s\t%s\n" % (i, t.name, t.comment)
        # Substitute the real table name for the 'table_name' text in the DDL.
        s += t.schema.get_table_ddl().replace('table_name', t.name)
        i += 1
        # Flush the accumulated text whenever the counter reaches a multiple of 5.
        if i % 5 == 0:
            f.write(s)
            s = ''

    # Write whatever is left after the last flush.
    f.write(s)
Beispiel #4
0
from odps import ODPS
# Connect to the project with the credentials given below.
o = ODPS(
    'accesskey',
    'xxxxxx',
    project='DWLabClone_67226234odps',
    endpoint='http://service.cn-shanghai.maxcompute.aliyun.com/api',
)

# Show the project object, then every table listed by the client.
project = o.get_project()
print(project)
for tbl in o.list_tables():
    print(tbl)

# Print the records returned by head(3) for ods_user_info_d.
t = o.get_table('ods_user_info_d')
for row in t.head(3):
    print(row)

# Print the first item yielded by the table's partitions.
first_partition = next(t.partitions)
print(first_partition)

# Read partition dt=20210706: print its record count, then for the slice
# reader[5:7] print each record, its type and two of its fields.
with t.open_reader(partition="dt=20210706") as reader:
    count = reader.count
    print(count)
    for row in reader[5:7]:
        print(row)
        print(type(row))
        print(row['uid'])
        print(row['age_range'])

print('*' * 100)
t_rpt = o.get_table('rpt_user_info_d')
# for record in t_rpt.head(3):
#     print(record)
Beispiel #5
0
class myOdps:
    """Convenience wrapper around an ODPS client: tables, CSV upload and SQL."""

    # Create the underlying ODPS client
    def __init__(self, access_id, secret_access_key, project):
        """Build the ODPS client for ``project`` with the given credentials."""
        # The keyword is spelled ``endpoint``, matching how ODPS is called
        # everywhere else; ``end_point`` is not the endpoint argument.
        self.odps = ODPS(access_id=access_id,
                         secret_access_key=secret_access_key,
                         project=project,
                         endpoint="http://service.odps.aliyun.com/api")

    # List all table names
    def get_all_tabel(self):
        """Return the names of all tables listed by the client."""
        return [table.name for table in self.odps.list_tables()]

    # Create a table
    def creat_table(self, table_name, columns=None, if_not_exists=True):
        """Create a table, falling back to fetching it if creation fails.

        :param table_name: name of the table
        :param columns: schema passed straight to ``create_table``, e.g. a
            ``('num bigint, num2 double', 'pt string')`` tuple of columns
            and partitions, or a Schema object
        :param if_not_exists: only create the table when it does not exist
        :return: the table object
        """
        try:
            return self.odps.create_table(table_name,
                                          columns,
                                          if_not_exists=if_not_exists)
        except Exception:
            # Best-effort fallback: any creation error leads to fetching the
            # table instead. ``Exception`` (rather than a bare ``except``)
            # keeps KeyboardInterrupt / SystemExit from being swallowed.
            return self.odps.get_table(table_name)

    # Fetch a table by name
    def get_a_table(self, table_name):
        """Return the table object for ``table_name``."""
        return self.odps.get_table(table_name)

    # Drop a table
    def drop_a_table(self, table_name):
        """Delete ``table_name`` and return the result of the delete call."""
        return self.odps.delete_table(table_name)

    # List all partitions of a table
    def get_partitions(self, table):
        """Return the names of all partitions of the given table object."""
        return [partition.name for partition in table.partitions]

    # ============= data upload ============

    # Upload a CSV to ODPS and create the table; the CSV must have a header

    def uploadCSV(self, csvFilename, tableName, sep=",", pt=None):
        """Upload a local CSV file into an ODPS table with all-string columns.

        :param csvFilename: path of the local CSV file; it must have a header row
        :param tableName: name of the ODPS table to create / write to
        :param sep: CSV field separator
        :param pt: partition value; when truthy the table gets a string
            partition column ``pt`` and rows are written to ``pt=<value>``
        """
        print("start upload ...\n")
        df = pd.read_csv(csvFilename, sep=sep)
        shape0 = df.shape[0]
        columns = [
            Column(name=f"{x}", type='string', comment='the column')
            for x in df.columns
        ]

        if pt:
            partitions = [
                Partition(name='pt', type='string', comment='the partition')
            ]
            schema = Schema(columns=columns, partitions=partitions)
            table = self.creat_table(tableName, schema)
            table.create_partition(f"pt={pt}", if_not_exists=True)
            # The last entry of schema.columns is assumed to be the `pt`
            # partition column (its value is appended separately below)
            # -- TODO confirm against the PyODPS schema documentation.
            data_fields = [i.name for i in table.schema.columns][:-1]
            partition_spec = f"pt={pt}"
        else:
            schema = Schema(columns=columns)
            table = self.creat_table(tableName, schema)
            # No partition column here, so every schema column is a data
            # field; slicing off the last one would drop real data.
            data_fields = [i.name for i in table.schema.columns]
            partition_spec = None

        with table.open_writer(partition=partition_spec) as writer:
            for index in df.index:
                print(f"{index+1}/{shape0} in {tableName}  ...")
                item_dict = dict(df.loc[index])
                # Fields missing from the CSV row are written as ''.
                item = [item_dict.get(field, '') for field in data_fields]
                if pt:
                    item.append(pt)
                writer.write(item)
        print("\n\n upload finish ...")

    # While uploading, download the content behind each row's URL field and
    # upload the row with that content in place of the URL
    def downloaAndUp(self,
                     csvFilename,
                     tableName,
                     sep=",",
                     urlIndex=1,
                     pt=None):
        """Upload a CSV whose rows carry a URL, replacing the URL by its content.

        :param csvFilename: path of the local CSV file; it must have a header row
        :param tableName: name of the ODPS table to create / write to
        :param sep: CSV field separator
        :param urlIndex: position of the URL field within each row
        :param pt: partition value; when truthy the table gets a string
            partition column ``pt`` and rows are written to ``pt=<value>``
        """
        print("start upload ...\n")
        # `with` closes the CSV even if a download or a write raises.
        with open(csvFilename, encoding='utf-8') as f:
            # The first line is the header; the loop below continues with
            # the remaining lines of the same file object.
            first_line = f.readlines(1)[0].strip('\n').split(sep)
            columns = [
                Column(name=f"{x}", type='string', comment='the column')
                for x in first_line
            ]

            if pt:
                partitions = [
                    Partition(name='pt', type='string',
                              comment='the partition')
                ]
                schema = Schema(columns=columns, partitions=partitions)
                table = self.creat_table(tableName, schema)
                table.create_partition(f"pt={pt}", if_not_exists=True)
                partition_spec = f"pt={pt}"
            else:
                schema = Schema(columns=columns)
                table = self.creat_table(tableName, schema)
                partition_spec = None

            with table.open_writer(partition=partition_spec) as writer:
                for index, line in enumerate(f):
                    # The two progress messages differ only in spacing; both
                    # are kept exactly as they were emitted before.
                    if pt:
                        print(f"{index} in {tableName}  ...")
                    else:
                        print(f"{index}  in {tableName}  ...")
                    item = line.strip('\n').split(sep)
                    if pt:
                        # Appended before the URL lookup so that indexing
                        # sees the same list as the partitioned path always did.
                        item.append(pt)
                    resp = download(item[urlIndex])
                    data = resp.text
                    # Size cap based on the in-memory size of the str object;
                    # when exceeded the URL is left in place and the row is
                    # still written.
                    if sys.getsizeof(data) <= 8 * 1024 * 1000:
                        item[urlIndex] = data
                    else:
                        print(f"failed in {item[0]}")
                    writer.write(item)
        print("\n\n upload finish ...")

    # =========== run SQL =========
    # SQL query
    def select_sql(self, sql):
        """Execute ``sql`` and return the reader over its result."""
        # NOTE(review): the reader is returned after its `with` block has
        # exited -- confirm it is still usable by callers at that point.
        with self.odps.execute_sql(sql).open_reader() as reader:
            return reader
Beispiel #6
0
from odps.df import DataFrame
from odps.types import Schema, Record

# NOTE(review): a secret key is hard-coded below -- it should be loaded from
# configuration rather than kept in source control.
odps = ODPS(
    '',
    'PtMa1T01Nq0y2da8SBl0FRMmgxjE8X',
    project='GyyStatistical',
    endpoint='https://service.odps.aliyun.com/api',
)

# Fetch a specific project by name
project = odps.get_project('GyyStatistical')
# Fetch the default project instead
# project = odps.get_project()

# List every table in the project
print('----列出项目下所有的表 start----')
for tbl in odps.list_tables():
    print(tbl)
print('----列出项目下所有的表 end----\n')

# Run a SQL statement with execute_sql and print each record with its type
print('----同步方式 执行SQL语句 start----')
instance = odps.execute_sql('select * from ots_arealist1')
with instance.open_reader() as reader:
    for row in reader:
        print(row)
        print(type(row))
print('----同步方式 执行SQL语句 end----\n')

# print('----异步方式 执行SQL语句 start----')
# instance = odps.run_sql('select * from ots_arealist')
# instance.wait_for_success()
# with instance.open_reader() as reader: