Example #1
    def testBearerTokenAccount(self):
        self.odps.delete_table(tn('test_bearer_token_account_table'),
                               if_exists=True)
        t = self.odps.create_table(tn('test_bearer_token_account_table'),
                                   'col string',
                                   lifecycle=1)
        with t.open_writer() as writer:
            records = [['val1'], ['val2'], ['val3']]
            writer.write(records)

        inst = self.odps.execute_sql('select count(*) from {0}'.format(
            tn('test_bearer_token_account_table')),
                                     async_=True)
        inst.wait_for_success()
        task_name = inst.get_task_names()[0]

        logview_address = inst.get_logview_address()
        token = logview_address[logview_address.find('token=') +
                                len('token='):]
        bearer_token_account = BearerTokenAccount(token=token)
        bearer_token_odps = ODPS(None,
                                 None,
                                 self.odps.project,
                                 self.odps.endpoint,
                                 account=bearer_token_account)
        bearer_token_instance = bearer_token_odps.get_instance(inst.id)

        self.assertEqual(inst.get_task_result(task_name),
                         bearer_token_instance.get_task_result(task_name))
        self.assertEqual(inst.get_task_summary(task_name),
                         bearer_token_instance.get_task_summary(task_name))

        with self.assertRaises(errors.NoPermission):
            bearer_token_odps.create_table(
                tn('test_bearer_token_account_table_test1'),
                'col string',
                lifecycle=1)

        fake_token_account = BearerTokenAccount(token='fake-token')
        bearer_token_odps = ODPS(None,
                                 None,
                                 self.odps.project,
                                 self.odps.endpoint,
                                 account=fake_token_account)

        with self.assertRaises(errors.ODPSError):
            bearer_token_odps.create_table(
                tn('test_bearer_token_account_table_test2'),
                'col string',
                lifecycle=1)
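
Outside the test harness, the bearer-token flow above can be condensed as in the sketch below; "odps" stands for an already-authenticated ODPS client and is a placeholder name.

inst = odps.execute_sql('select 1', async_=True)
inst.wait_for_success()
# the logview URL carries the bearer token as its token= query parameter
token = inst.get_logview_address().split('token=', 1)[1]
bearer_odps = ODPS(None, None, odps.project, odps.endpoint,
                   account=BearerTokenAccount(token=token))
# the bearer-token client can read the instance it was issued for
print(bearer_odps.get_instance(inst.id).get_task_names())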
Example #2
    def testTokenizedSignServerAccount(self):
        server = SignServer(token=str(uuid.uuid4()))
        server.accounts[
            self.odps.account.access_id] = self.odps.account.secret_access_key
        try:
            server.start(('127.0.0.1', 0))
            account = SignServerAccount(self.odps.account.access_id,
                                        server.server.server_address)
            odps = ODPS(None,
                        None,
                        self.odps.project,
                        self.odps.endpoint,
                        account=account)
            self.assertRaises(
                SignServerError,
                lambda: odps.delete_table(tn('test_sign_account_table'),
                                          if_exists=True))

            account = SignServerAccount(self.odps.account.access_id,
                                        server.server.server_address,
                                        token=server.token)
            odps = ODPS(None,
                        None,
                        self.odps.project,
                        self.odps.endpoint,
                        account=account)
            odps.delete_table(tn('test_sign_account_table'), if_exists=True)
            t = odps.create_table(tn('test_sign_account_table'),
                                  'col string',
                                  lifecycle=1)
            self.assertTrue(odps.exist_table(tn('test_sign_account_table')))
            t.drop(async_=True)
        finally:
            server.stop()
Example #3
    def testSignServerAccount(self):
        server = SignServer()
        server.accounts[
            self.odps.account.access_id] = self.odps.account.secret_access_key
        try:
            server.start(('127.0.0.1', 0))
            account = SignServerAccount(self.odps.account.access_id,
                                        server.server.server_address)
            odps = ODPS(None,
                        None,
                        self.odps.project,
                        self.odps.endpoint,
                        account=account)
            odps.delete_table(tn('test_sign_account_table'), if_exists=True)
            t = odps.create_table(tn('test_sign_account_table'),
                                  'col string',
                                  lifecycle=1)
            self.assertTrue(odps.exist_table(tn('test_sign_account_table')))
            t.drop(async_=True)
        finally:
            server.stop()
Example #4
def run(args):
    # build an odps instance
    o = ODPS(args.odps_access_id,
             args.odps_access_key,
             args.odps_project,
             endpoint=args.odps_endpoint)

    input_table_project = args.odps_project
    input_table_name = args.input_table_name
    if '.' in input_table_name:
        input_table_project = args.input_table_name.split(".")[0]
        input_table_name = args.input_table_name.split(".")[1]

    # download data from odps
    input_table = o.get_table(input_table_name, project=input_table_project)
    data = input_table.to_df().to_pandas()

    # sample data
    new_data = data.sample(args.sample_row_count)

    # create output table and upload data to odps
    o.delete_table(args.output_table_name, if_exists=True)
    output_table_project = args.odps_project
    output_table_name = args.output_table_name
    if '.' in output_table_name:
        output_table_project = args.output_table_name.split(".")[0]
        output_table_name = args.output_table_name.split(".")[1]

    table = o.create_table(output_table_name,
                           input_table.schema,
                           project=output_table_project,
                           if_not_exists=False,
                           lifecycle=3)
    o.write_table(output_table_name,
                  new_data.values.tolist(),
                  project=output_table_project)
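
The run() function above expects an args namespace carrying ODPS credentials, table names, and a sample size. A minimal sketch of an argparse entry point that could supply it follows; the flag names simply mirror the attributes read in run() and are assumptions, not part of the original example.

import argparse

def parse_args():
    # hypothetical flags; the names mirror the attributes run() reads
    parser = argparse.ArgumentParser(description="Sample rows from an ODPS table")
    parser.add_argument("--odps_access_id", required=True)
    parser.add_argument("--odps_access_key", required=True)
    parser.add_argument("--odps_project", required=True)
    parser.add_argument("--odps_endpoint", required=True)
    parser.add_argument("--input_table_name", required=True)
    parser.add_argument("--output_table_name", required=True)
    parser.add_argument("--sample_row_count", type=int, default=1000)
    return parser.parse_args()

if __name__ == "__main__":
    run(parse_args())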
Example #5
class ODPSWriter(object):
    def __init__(
        self,
        project,
        access_id,
        access_key,
        endpoint,
        table,
        columns=None,
        column_types=None,
        options=None,
    ):
        """
        Constructs an `ODPSWriter` instance.

        Args:
            project: Name of the ODPS project.
            access_id: ODPS user access ID.
            access_key: ODPS user access key.
            endpoint: ODPS cluster endpoint.
            table: ODPS table name.
            columns: The list of column names in the table,
                which will be inferred if the table exists.
            column_types: The list of column types in the table,
                which will be inferred if the table exists.
            options: Other options passed to ODPS context.
        """
        super(ODPSWriter, self).__init__()

        if table.find(".") > 0:
            project, table = table.split(".")
        if options is None:
            options = {}
        self._project = project
        self._access_id = access_id
        self._access_key = access_key
        self._endpoint = endpoint
        self._table = table
        self._columns = columns
        self._column_types = column_types
        self._odps_table = None
        _configure_odps_options(self._endpoint, options)
        self._odps_client = ODPS(self._access_id, self._access_key,
                                 self._project, self._endpoint)

    def _initialize_table(self):
        if self._odps_client.exist_table(self._table, self._project):
            self._odps_table = self._odps_client.get_table(
                self._table, self._project)
        else:
            if self._columns is None or self._column_types is None:
                raise ValueError("columns and column_types need to be "
                                 "specified for non-existing table.")
            schema = Schema.from_lists(self._columns, self._column_types,
                                       ["worker"], ["string"])
            self._odps_table = self._odps_client.create_table(
                self._table, schema)

    def from_iterator(self, records_iter, worker_index):
        if self._odps_table is None:
            self._initialize_table()
        with self._odps_table.open_writer(partition="worker=" +
                                          str(worker_index),
                                          create_partition=True) as writer:
            for records in records_iter:
                writer.write(records)
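
For reference, a minimal usage sketch of the ODPSWriter class above; the credentials, endpoint, table name, and record values are placeholders, and the module-level helpers used by the constructor (such as _configure_odps_options) are assumed to be importable as in the original code.

writer = ODPSWriter(
    project="my_project",            # placeholder project name
    access_id="my_access_id",        # placeholder credentials
    access_key="my_access_key",
    endpoint="http://service.odps.example.com/api",  # placeholder endpoint
    table="sample_table",
    columns=["feature", "label"],
    column_types=["string", "bigint"],
)
# each item yielded by the iterator is a batch of records passed to writer.write();
# worker_index selects the "worker" partition the rows are written into
writer.from_iterator(iter([[["a", 1], ["b", 2]]]), worker_index=0)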
Example #6
class ODPSSql(Magics):

    _odps = None

    def _set_odps(self):
        if self._odps is not None:
            return

        if options.access_id is not None and options.access_key is not None and options.default_project is not None:
            self._odps = ODPS(
                options.access_id,
                options.access_key,
                options.default_project,
                endpoint=options.end_point,
                tunnel_endpoint=options.tunnel_endpoint,
            )
        else:
            self._odps = enter().odps

    @line_magic("enter")
    def enter(self, line):
        room = line.strip()
        if room:
            r = enter(room)
            self._odps = r.odps
        else:
            r = enter()
            self._odps = r.odps

        return r

    @line_magic("setup")
    def setup(self, line):
        args = line.strip().split()
        name, args = args[0], args[1:]
        setup(*args, room=name)

    @line_magic("teardown")
    def teardown(self, line):
        name = line.strip()
        teardown(name)

    @line_magic("list_rooms")
    def list_rooms(self, line):
        return list_rooms()

    @line_magic("stores")
    def list_stores(self, line):
        line = line.strip()

        if line:
            room = enter(line)
        else:
            room = enter()

        return room.display()

    def _get_task_percent(self, instance, task_name):
        progress = instance.get_task_progress(task_name)

        if len(progress.stages) > 0:
            all_percent = sum(
                (float(stage.terminated_workers) / stage.total_workers)
                for stage in progress.stages
                if stage.total_workers > 0
            )
            return all_percent / len(progress.stages)
        else:
            return 0

    @line_cell_magic("sql")
    def execute(self, line, cell=""):
        self._set_odps()

        sql = line + "\n" + cell
        sql = sql.strip()

        if sql:
            bar = init_progress_bar()

            instance = self._odps.run_sql(sql)

            percent = 0
            while not instance.is_terminated():
                task_names = instance.get_task_names()
                last_percent = percent
                if len(task_names) > 0:
                    percent = sum(self._get_task_percent(instance, name) for name in task_names) / len(task_names)
                else:
                    percent = 0
                percent = min(1, max(percent, last_percent))
                bar.update(percent)

                time.sleep(1)

            instance.wait_for_success()
            bar.update(1)

            try:
                with instance.open_reader() as reader:
                    try:
                        import pandas as pd

                        try:
                            return pd.read_csv(StringIO(reader.raw))
                        except ValueError:
                            return reader.raw
                    except ImportError:
                        return ResultFrame(list(reader), columns=reader._columns)
            finally:
                bar.close()

    @line_magic("persist")
    def persist(self, line):
        import pandas as pd

        self._set_odps()

        line = line.strip().strip(";")

        frame_name, table_name = line.split(None, 1)

        if "." in table_name:
            project_name, table_name = tuple(table_name.split(".", 1))
        else:
            project_name = None

        frame = self.shell.user_ns[frame_name]
        if not isinstance(frame, pd.DataFrame):
            raise TypeError("%s is not a Pandas DataFrame" % frame_name)

        columns = list(frame.columns)
        types = [np_to_odps_types.get(tp, odps_types.string) for tp in frame.dtypes]

        if self._odps.exist_table(table_name, project=project_name):
            raise TypeError("%s already exists")

        tb = self._odps.create_table(table_name, Schema.from_lists(columns, types))

        def gen(df):
            size = len(df)

            bar = init_progress_bar(size)

            try:
                c = itertools.count()
                for row in df.values:
                    i = next(c)
                    if i % 50 == 0:
                        bar.update(min(i, size))

                    yield tb.new_record(list(row))

                bar.update(size)
            finally:
                bar.close()

        with tb.open_writer() as writer:
            writer.write(gen(frame))
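
The ODPSSql magics only take effect once the class is registered with an IPython shell. A minimal sketch, assuming the class is exposed through a standard load_ipython_extension hook:

def load_ipython_extension(ipython):
    # register the Magics subclass defined above with the running shell
    ipython.register_magics(ODPSSql)

# after loading the extension in a notebook, the magics become available, e.g.:
#   %enter my_room
#   %sql select 1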
Example #7
class myOdps:

    # Initialize an ODPS connection object
    def __init__(self, access_id, secret_access_key, project):
        self.odps = ODPS(access_id=access_id,
                         secret_access_key=secret_access_key,
                         project=project,
                         endpoint="http://service.odps.aliyun.com/api")

    # Get all table names
    def get_all_tabel(self):
        # return: a list of all table names
        table_name = []
        for table in self.odps.list_tables():
            table_name.append(table.name)
        return table_name

    # Create a table
    def creat_table(self, table_name, columns=None, if_not_exists=True):
        # table_name: table name
        # columns: a tuple of column and partition definitions,
        #          e.g. ('num bigint, num2 double', 'pt string')
        # if_not_exists: True means the table is only created when it does not exist
        # lifecycle: 28, lifecycle of the table in days
        # return: the table object
        try:
            return self.odps.create_table(table_name,
                                          columns,
                                          if_not_exists=if_not_exists)
        except Exception:
            return self.odps.get_table(table_name)

    # Get a table directly by its name
    def get_a_table(self, table_name):
        # table_name: table name
        # return: the table object
        return self.odps.get_table(table_name)

    # Drop a table
    def drop_a_table(self, table_name):
        # table_name: table name
        # return: the result of dropping the table
        return self.odps.delete_table(table_name)

    # Get all partitions of a table
    def get_partitions(self, table):
        # table: the table object
        # return: all partitions of the table
        partitions = []
        for partition in table.partitions:
            partitions.append(partition.name)
        return partitions

    # ============= Data upload ============

    # Upload a CSV to ODPS and create the table; the CSV must have a header row

    def uploadCSV(self, csvFilename, tableName, sep=",", pt=None):
        """
        :param csvFilename: path of the local CSV file, which must have a header row
        :param tableName: name of the target table in ODPS
        :param sep: delimiter of the CSV file
        :param pt: partition value; a partition is created when this is given
        """
        print("start upload ...\n")
        df = pd.read_csv(csvFilename, sep=sep)
        shape0 = df.shape[0]
        columns = [
            Column(name=f"{x}", type='string', comment='the column')
            for x in df.columns
        ]

        if pt:
            partitions = [
                Partition(name='pt', type='string', comment='the partition')
            ]
            schema = Schema(columns=columns, partitions=partitions)
            table = self.creat_table(tableName, schema)
            table.create_partition(f"pt={pt}", if_not_exists=True)
            table_columns = [i.name for i in table.schema.columns]
            with table.open_writer(partition=f"pt={pt}") as writer:
                for index in df.index:
                    print(f"{index+1}/{shape0} in {tableName}  ...")
                    item_dict = dict(df.loc[index])
                    item = []
                    for field in table_columns[:-1]:
                        item.append(item_dict.get(field, ''))
                    item.append(pt)
                    writer.write(item)
        else:
            schema = Schema(columns=columns)
            table = self.creat_table(tableName, schema)
            table_columns = [i.name for i in table.schema.columns]
            with table.open_writer(partition=None) as writer:
                for index in df.index:
                    print(f"{index+1}/{shape0} in {tableName}  ...")
                    item_dict = dict(df.loc[index])
                    item = []
                    for field in table_columns:
                        item.append(item_dict.get(field, ''))
                    writer.write(item)
        print("\n\n upload finish ...")

    # Download while uploading: the field at the given index of each row is a URL;
    # its content is fetched first and the completed row is then uploaded
    def downloaAndUp(self,
                     csvFilename,
                     tableName,
                     sep=",",
                     urlIndex=1,
                     pt=None):
        """
        :param csvFilename: path of the local CSV file, which must have a header row
        :param tableName: name of the target table in ODPS
        :param sep: delimiter of the CSV file
        :param urlIndex: index of the URL field
        """
        print("start upload ...\n")
        f = open(csvFilename, encoding='utf-8')
        first_line = f.readlines(1)[0].strip('\n').split(sep)
        columns = [
            Column(name=f"{x}", type='string', comment='the column')
            for x in first_line
        ]

        if pt:
            partitions = [
                Partition(name='pt', type='string', comment='the partition')
            ]
            schema = Schema(columns=columns, partitions=partitions)
            table = self.creat_table(tableName, schema)
            table.create_partition(f"pt={pt}", if_not_exists=True)
            with table.open_writer(partition=f"pt={pt}") as writer:
                for index, line in enumerate(f):
                    print(f"{index} in {tableName}  ...")
                    item = line.strip('\n').split(sep)
                    item.append(pt)
                    resp = download(item[urlIndex])
                    data = resp.text
                    if sys.getsizeof(data) <= 8 * 1024 * 1000:
                        item[urlIndex] = data
                    else:
                        print(f"failed in {item[0]}")
                    writer.write(item)
        else:
            schema = Schema(columns=columns)
            table = self.creat_table(tableName, schema)
            with table.open_writer(partition=None) as writer:
                for index, line in enumerate(f):
                    print(f"{index}  in {tableName}  ...")
                    item = line.strip('\n').split(sep)
                    resp = download(item[urlIndex])
                    data = resp.text
                    if sys.getsizeof(data) <= 8 * 1024 * 1000:
                        item[urlIndex] = data
                    else:
                        print(f"failed in {item[0]}")
                    writer.write(item)
        print("\n\n upload finish ...")
        f.close()

    # =========== Run SQL =========
    # SQL query
    def select_sql(self, sql):
        # return: a list of the query result records
        # (materialized inside the with block, since the reader is closed on exit)
        with self.odps.execute_sql(sql).open_reader() as reader:
            return list(reader)
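
A short usage sketch of the myOdps helper above; the credentials, file path, partition value, and table name are placeholders.

client = myOdps("my_access_id", "my_secret_key", "my_project")
print(client.get_all_tabel())                    # list existing table names
# upload a local CSV into a partitioned table (partition pt='20240101')
client.uploadCSV("local_data.csv", "csv_upload_demo", sep=",", pt="20240101")
for record in client.select_sql("select count(*) from csv_upload_demo"):
    print(record)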
Example #8
class OdpsHelper:
    def __init__(self, access_id="", secret_access_key="", project="",
                 endpoint="", verbose=1, link_now=1):
        self.access_id = access_id
        self.secret_access_key = secret_access_key
        self.project = project
        self.endpoint = endpoint
        self.verbose = verbose
        self.odps_instance = None
        if link_now:
            self.link()

    def link(self):
        self.odps_instance = ODPS(access_id=self.access_id,
                                  secret_access_key=self.secret_access_key,
                                  project=self.project,
                                  endpoint=self.endpoint)
        return self.odps_instance

    def generate_odps_columns_from_pandas_df(self, df):
        type_dict = {"int64": 'BIGINT', 'float64': 'DOUBLE', 'object': 'STRING'}
        column_types = df.dtypes
        odps_columns = []
        for c_name in column_types.index:
            # map the pandas dtype (not the column name) to an ODPS type
            temp_type = type_dict.get(str(column_types[c_name]), 'STRING')
            if self.verbose:
                print("Transfer from pandas columns to odps columns:",
                      c_name + '->' + c_name.lower() + ': ' +
                      str(column_types[c_name]) + '->' + temp_type)
            # append unconditionally; only the log message depends on verbose
            odps_columns.append(Column(name=c_name.lower(), type=temp_type))
        return odps_columns

    def generate_table(self, table_name, columns, if_not_exists=True):
        schema = Schema(columns=columns)
        self.odps_instance.create_table(table_name.lower(), schema,
                                        if_not_exists=if_not_exists)
        t = self.odps_instance.get_table(table_name)
        return t

    def write_odps_from_pandas_df(self, table, df):
        if isinstance(table, str):
            t = self.generate_table(table,
                                    self.generate_odps_columns_from_pandas_df(df))
        elif isinstance(table, Table):
            t = table
        else:
            raise Exception("Unsupported data type")
        df_renamed = df.rename(str.lower, axis=1)
        columns_for_sort = [x.name for x in t.schema.columns]
        df_final = df_renamed.loc[:, columns_for_sort]
        odps_data = np.array(df_final).tolist()
        # the with block closes the writer on exit, so no explicit close is needed
        with t.open_writer() as writer:
            writer.write(odps_data)

    def write_odps_from_csv(self, table_name, csv_path):
        df = pd.read_csv(csv_path)
        self.write_odps_from_pandas_df(table_name, df)

    def read_odps_to_pandas_df(self, table, top_n=-1):
        if isinstance(table, str):
            t = self.odps_instance.get_table(table)
        elif isinstance(table, Table):
            t = table
        else:
            raise Exception("Unsupported data type")

        log.info("============== Table info:\n" + str(t))
        table_name = t.name
        columns = []
        with t.open_reader() as reader:
            count = reader.count
            log.info("=========== row count:" + str(count))
            for row in reader[:1]:
                columns = np.array(row)[:, 0]
        data = []
        counter = 0
        if self.verbose:
            print("========= start reading table:", table_name, 'total:', count)
            print("========= read top:", top_n)
        start_time = time.time()
        columns_len = len(columns)
        for row in t.open_reader():
            temp_data = []
            for j in range(columns_len):
                temp_data.append(row[j])
            data.append(temp_data)
            counter += 1
            # stop early once top_n rows have been collected
            if 0 < top_n <= counter:
                break
        if self.verbose:
            print("============= end reading table. Elapsed minutes:",
                  round((time.time() - start_time) / 60, 2))
        result = pd.DataFrame(data, columns=columns)
        return result
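
Finally, a round-trip sketch for the OdpsHelper class above; the credentials, endpoint, and table name are placeholders, and the surrounding module is assumed to import pandas as pd, numpy as np, Column, Schema, Table, and a log object as the methods require.

helper = OdpsHelper(access_id="my_access_id",
                    secret_access_key="my_secret_key",
                    project="my_project",
                    endpoint="http://service.odps.example.com/api")
df = pd.DataFrame({"id": [1, 2, 3], "name": ["a", "b", "c"]})
helper.write_odps_from_pandas_df("helper_demo_table", df)    # create table and write rows
df_back = helper.read_odps_to_pandas_df("helper_demo_table", top_n=10)
print(df_back.head())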