Ejemplo n.º 1
0
    def testTokenizedSignServerAccount(self):
        """Check that a token-protected sign server rejects token-less clients.

        A ``SignServer`` guarded by a random UUID token is started on
        127.0.0.1. A ``SignServerAccount`` built without the token must make
        the request fail with ``SignServerError``; one built with the
        server's token must be able to create a table and drop it again.
        """
        server = SignServer(token=str(uuid.uuid4()))
        # Register the test account's credentials with the sign server.
        server.accounts[
            self.odps.account.access_id] = self.odps.account.secret_access_key
        try:
            # Port 0 is requested; the address actually bound is read back
            # from ``server.server.server_address`` below.
            server.start(('127.0.0.1', 0))

            # Without the token the request must be refused.
            account = SignServerAccount(self.odps.account.access_id,
                                        server.server.server_address)
            odps = ODPS(None,
                        None,
                        self.odps.project,
                        self.odps.endpoint,
                        account=account)
            self.assertRaises(
                SignServerError,
                lambda: odps.delete_table(tn('test_sign_account_table'),
                                          if_exists=True))

            # With the matching token the full table round trip must work.
            account = SignServerAccount(self.odps.account.access_id,
                                        server.server.server_address,
                                        token=server.token)
            odps = ODPS(None,
                        None,
                        self.odps.project,
                        self.odps.endpoint,
                        account=account)
            odps.delete_table(tn('test_sign_account_table'), if_exists=True)
            t = odps.create_table(tn('test_sign_account_table'),
                                  'col string',
                                  lifecycle=1)
            self.assertTrue(odps.exist_table(tn('test_sign_account_table')))
            # ``async`` is a reserved word since Python 3.7, so the keyword
            # argument has to be spelled ``async_``.
            t.drop(async_=True)
        finally:
            # Always shut the local server down, even when an assertion fails.
            server.stop()
Ejemplo n.º 2
0
 def testSignServerAccount(self):
     """Run a table create/exists/drop round trip through a sign server.

     A token-less ``SignServer`` holding the test account's secret key is
     started on 127.0.0.1, and an ``ODPS`` client authenticated through a
     ``SignServerAccount`` pointing at it is used for the table operations.
     """
     sign_server = SignServer()
     access_id = self.odps.account.access_id
     sign_server.accounts[access_id] = self.odps.account.secret_access_key
     try:
         sign_server.start(('127.0.0.1', 0))

         # The account talks to whatever address the server actually bound.
         signed_account = SignServerAccount(
             access_id, sign_server.server.server_address)
         client = ODPS(
             None,
             None,
             self.odps.project,
             self.odps.endpoint,
             account=signed_account,
         )

         # Start from a clean slate, then create, verify and drop the table.
         client.delete_table(tn('test_sign_account_table'), if_exists=True)
         created = client.create_table(
             tn('test_sign_account_table'), 'col string', lifecycle=1)
         table_exists = client.exist_table(tn('test_sign_account_table'))
         self.assertTrue(table_exists)
         created.drop(async_=True)
     finally:
         sign_server.stop()
Ejemplo n.º 3
0
class ODPSWriter(object):
    """Writes records into an ODPS table, using one partition per worker."""

    def __init__(
        self,
        project,
        access_id,
        access_key,
        endpoint,
        table,
        columns=None,
        column_types=None,
        options=None,
    ):
        """
        Build a writer bound to a single ODPS table.

        Args:
            project: Name of the ODPS project. Replaced by the prefix of
                `table` when `table` is given as "project.table".
            access_id: ODPS user access ID.
            access_key: ODPS user access key.
            endpoint: ODPS cluster endpoint.
            table: ODPS table name, optionally prefixed with "project.".
            columns: The list of column names in the table; only needed
                when the table does not exist yet.
            column_types: The list of column types in the table; only
                needed when the table does not exist yet.
            options: Other options passed to ODPS context.
        """
        super(ODPSWriter, self).__init__()

        # A dot after the first character means a "project.table" name.
        has_project_prefix = table.find(".") > 0
        if has_project_prefix:
            project, table = table.split(".")

        self._project = project
        self._access_id = access_id
        self._access_key = access_key
        self._endpoint = endpoint
        self._table = table
        self._columns = columns
        self._column_types = column_types
        # The table handle is resolved on the first write, not here.
        self._odps_table = None

        _configure_odps_options(
            self._endpoint, {} if options is None else options)
        self._odps_client = ODPS(
            self._access_id,
            self._access_key,
            self._project,
            self._endpoint,
        )

    def _initialize_table(self):
        """Bind `self._odps_table`, creating the table when it is missing.

        Raises:
            ValueError: the table does not exist and `columns` or
                `column_types` was not supplied.
        """
        client = self._odps_client

        if client.exist_table(self._table, self._project):
            self._odps_table = client.get_table(self._table, self._project)
            return

        if self._columns is None or self._column_types is None:
            raise ValueError("columns and column_types need to be "
                             "specified for non-existing table.")

        # New tables get a string partition column named "worker".
        table_schema = Schema.from_lists(
            self._columns, self._column_types, ["worker"], ["string"])
        self._odps_table = client.create_table(self._table, table_schema)

    def from_iterator(self, records_iter, worker_index):
        """Write every item of `records_iter` into the worker's partition.

        Args:
            records_iter: Iterable whose items are each handed to the
                table writer's `write` method.
            worker_index: Value used to build the "worker=<index>"
                partition, which is created if it does not exist.
        """
        if self._odps_table is None:
            self._initialize_table()

        partition_spec = "worker=" + str(worker_index)
        with self._odps_table.open_writer(
                partition=partition_spec, create_partition=True) as sink:
            for batch in records_iter:
                sink.write(batch)
Ejemplo n.º 4
0
'''

# Source-side client: the project name is taken from the first command-line
# argument. The first two positional arguments (access id and access key in
# the ODPS constructor) are empty strings in this snippet.
s = ODPS('',
         '',
         '%s' % sys.argv[1],
         endpoint='http://service.cn.maxcompute.aliyun.com/api')
# Target-side client: same endpoint, project name taken from the second
# command-line argument.
d = ODPS('',
         '',
         '%s' % sys.argv[2],
         endpoint='http://service.cn.maxcompute.aliyun.com/api')

# Visual separator printed before the per-table output.
print("######################################################################")

for table in s.list_tables():
    t1 = s.get_table(table.name)
    if d.exist_table(table.name):
        t2 = d.get_table(table.name)
    else:
        print("表%s 在目标项目%s中不存在 跳过校验" % (table.name, sys.argv[2]))
        continue

    if table.schema.partitions:  #判断该表是否为分区表
        #print 'Table %s is partitioned.' %table.name
        for partition in table.partitions:
            #print partition.name
            with t1.open_reader(partition='%s' % partition.name) as reader:
                count1 = reader.count
                #print "表名:%s\t分区:%s\t数据量:%s" %(table.name,partition.name,count1)
            if t2.exist_partition(partition.name):
                with t2.open_reader(partition='%s' %
                                    partition.name) as reader2:
Ejemplo n.º 5
0
class ODPSSql(Magics):
    """Line and cell magics for driving ODPS from an interactive shell.

    The magics cover entering, setting up and tearing down rooms, setting
    SQL hints, running SQL with a progress bar, and persisting data frames
    as tables.
    """

    # ODPS entry object used by the magics; filled lazily by ``_set_odps``
    # or replaced by the ``enter`` magic.
    _odps = None

    def _set_odps(self):
        """Make sure ``self._odps`` is populated.

        Uses ``options.access_id``, ``options.access_key`` and
        ``options.default_project`` when all three are set; otherwise falls
        back to the ``odps`` attribute of whatever ``enter()`` returns.
        """
        if self._odps is not None:
            return

        if options.access_id is not None and \
                    options.access_key is not None and \
                    options.default_project is not None:
            self._odps = ODPS(
                options.access_id, options.access_key, options.default_project,
                endpoint=options.end_point, tunnel_endpoint=options.tunnel_endpoint
            )
        else:
            self._odps = enter().odps

    @line_magic('enter')
    def enter(self, line):
        """Enter the room named on the line, or the default one when empty.

        The room's ODPS object becomes the one used by the other magics and
        is also published in the user namespace as ``o`` unless that name is
        already taken.

        Returns:
            The object returned by the module-level ``enter`` helper.
        """
        room = line.strip()
        # Inside a method body ``enter`` resolves to the module-level helper,
        # not to this method.
        r = enter(room) if room else enter()
        self._odps = r.odps

        if 'o' not in self.shell.user_ns:
            self.shell.user_ns['o'] = self._odps

        return r

    @line_magic('setup')
    def setup(self, line):
        """Set up a room: the first word is its name, the rest are passed on.

        Raises:
            IndexError: the line is empty.
        """
        args = line.strip().split()
        name, args = args[0], args[1:]
        setup(*args, room=name)
        html_notify('setup succeeded')

    @line_magic('teardown')
    def teardown(self, line):
        """Tear down the room named on the line."""
        name = line.strip()
        teardown(name)
        html_notify('teardown succeeded')

    @line_magic('list_rooms')
    def list_rooms(self, line):
        """Return the result of the module-level ``list_rooms`` helper."""
        return list_rooms()

    @line_magic('stores')
    def list_stores(self, line):
        """Return ``display()`` of the named room (or the default room)."""
        line = line.strip()
        room = enter(line) if line else enter()
        return room.display()

    def _get_task_percent(self, instance, task_name):
        """Return the finished-worker ratio of a task, averaged over stages.

        Stages reporting zero total workers add nothing to the sum but are
        still counted in the divisor. Returns 0 when there are no stages.
        """
        progress = instance.get_task_progress(task_name)

        if len(progress.stages) > 0:
            all_percent = sum((float(stage.terminated_workers) / stage.total_workers)
                              for stage in progress.stages if stage.total_workers > 0)
            return all_percent / len(progress.stages)
        else:
            return 0

    def _to_stdout(self, msg):
        """Default verbose sink: print the message."""
        print(msg)

    @line_magic('set')
    def set_hint(self, line):
        """Store a ``key=value`` pair in ``options.sql.settings``.

        Raises:
            ValueError: the line contains no ``=``.
        """
        if '=' not in line:
            raise ValueError('Hint for sql is not allowed')

        key, val = line.strip().strip(';').split('=', 1)
        key, val = key.strip(), val.strip()

        settings = options.sql.settings
        if settings is None:
            options.sql.settings = {key: val}
        else:
            options.sql.settings[key] = val

    @line_cell_magic('sql')
    def execute(self, line, cell=''):
        """Run the SQL given on the magic line and/or in the cell body.

        The text is split on ``;``. Pieces starting with ``set `` are turned
        into hints, empty pieces are dropped, and the remaining pieces are
        re-joined with ``;`` and submitted as one statement. While the
        instance runs, its progress is polled once per second and shown in a
        progress bar; the displayed value never decreases and is capped at 1.

        Returns:
            ``None`` when no SQL is left after removing hints. Otherwise a
            pandas ``DataFrame`` when pandas is importable and the raw result
            parses as CSV, a ``ResultFrame`` when pandas is unavailable, or
            the raw result when neither conversion works.
        """
        self._set_odps()

        content = line + '\n' + cell
        content = content.strip()

        sql = None
        hints = dict()

        splits = content.split(';')
        for s in splits:
            stripped = s.strip()
            if stripped.lower().startswith('set '):
                hint = stripped.split(' ', 1)[1]
                k, v = hint.split('=', 1)
                k, v = k.strip(), v.strip()
                hints[k] = v
            elif len(stripped) == 0:
                continue
            else:
                if sql is None:
                    sql = s
                else:
                    sql = '%s;%s' % (sql, s)

        if sql is None:
            # Nothing but hints or blank input: there is no statement to run,
            # and no text to hand to the parameter substitution helper.
            return

        # replace user defined parameters
        sql = replace_sql_parameters(sql, self.shell.user_ns)

        if sql:
            bar = init_progress_bar()

            instance = self._odps.run_sql(sql, hints=hints)
            if options.verbose:
                stdout = options.verbose_log or self._to_stdout
                stdout('Instance ID: ' + instance.id)
                stdout('  Log view: ' + instance.get_logview_address())

            percent = 0
            while not instance.is_terminated():
                task_names = instance.get_task_names()
                last_percent = percent
                if len(task_names) > 0:
                    percent = sum(self._get_task_percent(instance, name)
                                  for name in task_names) / len(task_names)
                else:
                    percent = 0
                # Never let the bar move backwards or exceed 100%.
                percent = min(1, max(percent, last_percent))
                bar.update(percent)

                time.sleep(1)

            instance.wait_for_success()
            bar.update(1)

            try:
                with instance.open_reader() as reader:
                    try:
                        import pandas as pd
                        try:
                            # Current location of the parser exception.
                            from pandas.errors import ParserError
                        except ImportError:
                            # Older pandas releases only expose it here.
                            from pandas.parser import CParserError as ParserError

                        try:
                            res = pd.read_csv(StringIO(reader.raw))
                        except (ValueError, ParserError):
                            res = reader.raw
                    except ImportError:
                        # pandas is not usable: fall back to ResultFrame.
                        try:
                            res = ResultFrame(list(reader), columns=reader._columns)
                        except TypeError:
                            res = reader.raw

                html_notify('SQL execution succeeded')
                return res
            finally:
                bar.close()

    @line_magic('persist')
    def persist(self, line):
        """Persist a frame from the user namespace as a new table.

        The line has the form ``<frame_name> <table_name>``, where the table
        name may be prefixed with ``project.``.

        Raises:
            ValueError: the line does not contain two parts.
            KeyError: no variable called ``frame_name`` exists.
            TypeError: the table already exists, or the variable is neither
                a ``DataFrame`` nor (when pandas is installed) a pandas
                ``DataFrame``.
        """
        try:
            import pandas as pd
            has_pandas = True
        except ImportError:
            has_pandas = False

        self._set_odps()

        line = line.strip().strip(';')

        frame_name, table_name = line.split(None, 1)

        if '.' in table_name:
            project_name, table_name = tuple(table_name.split('.', 1))
        else:
            project_name = None

        frame = self.shell.user_ns[frame_name]
        if self._odps.exist_table(table_name, project=project_name):
            raise TypeError('%s already exists' % table_name)

        if isinstance(frame, DataFrame):
            frame.persist(name=table_name, project=project_name, notify=False)
        elif has_pandas and isinstance(frame, pd.DataFrame):
            frame = DataFrame(frame)
            frame.persist(name=table_name, project=project_name, notify=False)
        else:
            # Do not report success when nothing was written.
            raise TypeError('%s is not a DataFrame' % frame_name)
        html_notify('Persist succeeded')
Ejemplo n.º 6
0
class ODPSSql(Magics):
    """Line and cell magics for driving ODPS from an interactive shell.

    The magics cover entering, setting up and tearing down rooms, running
    SQL with a progress bar, and persisting pandas data frames as tables.
    """

    # ODPS entry object used by the magics; filled lazily by ``_set_odps``
    # or replaced by the ``enter`` magic.
    _odps = None

    def _set_odps(self):
        """Make sure ``self._odps`` is populated.

        Uses ``options.access_id``, ``options.access_key`` and
        ``options.default_project`` when all three are set; otherwise falls
        back to the ``odps`` attribute of whatever ``enter()`` returns.
        """
        if self._odps is not None:
            return

        if options.access_id is not None and options.access_key is not None and options.default_project is not None:
            self._odps = ODPS(
                options.access_id,
                options.access_key,
                options.default_project,
                endpoint=options.end_point,
                tunnel_endpoint=options.tunnel_endpoint,
            )
        else:
            self._odps = enter().odps

    @line_magic("enter")
    def enter(self, line):
        """Enter the room named on the line, or the default one when empty.

        Returns:
            The object returned by the module-level ``enter`` helper; its
            ``odps`` attribute becomes the client used by the other magics.
        """
        room = line.strip()
        # Inside a method body ``enter`` resolves to the module-level helper,
        # not to this method.
        r = enter(room) if room else enter()
        self._odps = r.odps

        return r

    @line_magic("setup")
    def setup(self, line):
        """Set up a room: the first word is its name, the rest are passed on.

        Raises:
            IndexError: the line is empty.
        """
        args = line.strip().split()
        name, args = args[0], args[1:]
        setup(*args, room=name)

    @line_magic("teardown")
    def teardown(self, line):
        """Tear down the room named on the line."""
        name = line.strip()
        teardown(name)

    @line_magic("list_rooms")
    def list_rooms(self, line):
        """Return the result of the module-level ``list_rooms`` helper."""
        return list_rooms()

    @line_magic("stores")
    def list_stores(self, line):
        """Return ``display()`` of the named room (or the default room)."""
        line = line.strip()
        room = enter(line) if line else enter()
        return room.display()

    def _get_task_percent(self, instance, task_name):
        """Return the finished-worker ratio of a task, averaged over stages.

        Stages reporting zero total workers add nothing to the sum but are
        still counted in the divisor. Returns 0 when there are no stages.
        """
        progress = instance.get_task_progress(task_name)

        if len(progress.stages) > 0:
            all_percent = sum(
                (float(stage.terminated_workers) / stage.total_workers)
                for stage in progress.stages
                if stage.total_workers > 0
            )
            return all_percent / len(progress.stages)
        else:
            return 0

    @line_cell_magic("sql")
    def execute(self, line, cell=""):
        """Run the SQL given on the magic line and/or in the cell body.

        While the instance runs, its progress is polled once per second and
        shown in a progress bar; the displayed value never decreases and is
        capped at 1.

        Returns:
            ``None`` for blank input. Otherwise a pandas ``DataFrame`` when
            pandas is importable and the raw result parses as CSV, the raw
            result when parsing raises ``ValueError``, or a ``ResultFrame``
            when pandas cannot be imported.
        """
        self._set_odps()

        sql = line + "\n" + cell
        sql = sql.strip()

        if sql:
            bar = init_progress_bar()

            instance = self._odps.run_sql(sql)

            percent = 0
            while not instance.is_terminated():
                task_names = instance.get_task_names()
                last_percent = percent
                if len(task_names) > 0:
                    percent = sum(self._get_task_percent(instance, name) for name in task_names) / len(task_names)
                else:
                    percent = 0
                # Never let the bar move backwards or exceed 100%.
                percent = min(1, max(percent, last_percent))
                bar.update(percent)

                time.sleep(1)

            instance.wait_for_success()
            bar.update(1)

            try:
                with instance.open_reader() as reader:
                    try:
                        import pandas as pd

                        try:
                            return pd.read_csv(StringIO(reader.raw))
                        except ValueError:
                            return reader.raw
                    except ImportError:
                        return ResultFrame(list(reader), columns=reader._columns)
            finally:
                bar.close()

    @line_magic("persist")
    def persist(self, line):
        """Write a pandas ``DataFrame`` from the user namespace to a new table.

        The line has the form ``<frame_name> <table_name>``, where the table
        name may be prefixed with ``project.``. Column types are looked up
        in ``np_to_odps_types`` and default to string.

        Raises:
            ValueError: the line does not contain two parts.
            KeyError: no variable called ``frame_name`` exists.
            TypeError: the variable is not a pandas ``DataFrame``, or the
                table already exists.
        """
        import pandas as pd

        self._set_odps()

        line = line.strip().strip(";")

        frame_name, table_name = line.split(None, 1)

        if "." in table_name:
            project_name, table_name = tuple(table_name.split(".", 1))
        else:
            project_name = None

        frame = self.shell.user_ns[frame_name]
        if not isinstance(frame, pd.DataFrame):
            raise TypeError("%s is not a Pandas DataFrame" % frame_name)

        columns = list(frame.columns)
        types = [np_to_odps_types.get(tp, odps_types.string) for tp in frame.dtypes]

        if self._odps.exist_table(table_name, project=project_name):
            # Interpolate the table name; the bare format string used to be
            # raised verbatim.
            raise TypeError("%s already exists" % table_name)

        tb = self._odps.create_table(table_name, Schema.from_lists(columns, types))

        def gen(df):
            """Yield one table record per frame row, updating a progress bar."""
            size = len(df)

            bar = init_progress_bar(size)

            try:
                for i, row in enumerate(df.values):
                    # Refresh the bar every 50 rows only.
                    if i % 50 == 0:
                        bar.update(min(i, size))

                    yield tb.new_record(list(row))

                bar.update(size)
            finally:
                bar.close()

        with tb.open_writer() as writer:
            writer.write(gen(frame))
Ejemplo n.º 7
0
class ODPSSql(Magics):
    """Line and cell magics for driving ODPS from an interactive shell.

    The magics cover entering, setting up and tearing down rooms, setting
    SQL hints, running SQL with a progress bar, and persisting data frames
    as tables.
    """

    # ODPS entry object used by the magics; filled lazily by ``_set_odps``
    # or replaced by the ``enter`` magic.
    _odps = None

    def _set_odps(self):
        """Make sure ``self._odps`` is populated.

        Uses ``options.access_id``, ``options.access_key`` and
        ``options.default_project`` when all three are set; otherwise falls
        back to the ``odps`` attribute of whatever ``enter()`` returns.
        """
        if self._odps is not None:
            return

        if options.access_id is not None and \
                    options.access_key is not None and \
                    options.default_project is not None:
            self._odps = ODPS(
                options.access_id, options.access_key, options.default_project,
                endpoint=options.end_point, tunnel_endpoint=options.tunnel_endpoint
            )
        else:
            self._odps = enter().odps

    @line_magic('enter')
    def enter(self, line):
        """Enter the room named on the line, or the default one when empty.

        The room's ODPS object becomes the one used by the other magics and
        is also published in the user namespace as ``o`` unless that name is
        already taken.

        Returns:
            The object returned by the module-level ``enter`` helper.
        """
        room = line.strip()
        # Inside a method body ``enter`` resolves to the module-level helper,
        # not to this method.
        r = enter(room) if room else enter()
        self._odps = r.odps

        if 'o' not in self.shell.user_ns:
            self.shell.user_ns['o'] = self._odps

        return r

    @line_magic('setup')
    def setup(self, line):
        """Set up a room: the first word is its name, the rest are passed on.

        Raises:
            IndexError: the line is empty.
        """
        args = line.strip().split()
        name, args = args[0], args[1:]
        setup(*args, room=name)

    @line_magic('teardown')
    def teardown(self, line):
        """Tear down the room named on the line."""
        name = line.strip()
        teardown(name)

    @line_magic('list_rooms')
    def list_rooms(self, line):
        """Return the result of the module-level ``list_rooms`` helper."""
        return list_rooms()

    @line_magic('stores')
    def list_stores(self, line):
        """Return ``display()`` of the named room (or the default room)."""
        line = line.strip()
        room = enter(line) if line else enter()
        return room.display()

    def _get_task_percent(self, instance, task_name):
        """Return the finished-worker ratio of a task, averaged over stages.

        Stages reporting zero total workers add nothing to the sum but are
        still counted in the divisor. Returns 0 when there are no stages.
        """
        progress = instance.get_task_progress(task_name)

        if len(progress.stages) > 0:
            all_percent = sum((float(stage.terminated_workers) / stage.total_workers)
                              for stage in progress.stages if stage.total_workers > 0)
            return all_percent / len(progress.stages)
        else:
            return 0

    def _to_stdout(self, msg):
        """Default verbose sink: print the message."""
        print(msg)

    @line_magic('set')
    def set_hint(self, line):
        """Store a ``key=value`` pair in ``options.sql.settings``.

        Raises:
            ValueError: the line contains no ``=``.
        """
        if '=' not in line:
            raise ValueError('Hint for sql is not allowed')

        key, val = line.strip().strip(';').split('=', 1)
        key, val = key.strip(), val.strip()

        settings = options.sql.settings
        if settings is None:
            options.sql.settings = {key: val}
        else:
            options.sql.settings[key] = val

    @line_cell_magic('sql')
    def execute(self, line, cell=''):
        """Run the SQL given on the magic line and/or in the cell body.

        The text is split on ``;``. Pieces starting with ``set `` are turned
        into hints, empty pieces are dropped, and the remaining pieces are
        re-joined with ``;`` and submitted as one statement. While the
        instance runs, its progress is polled once per second and shown in a
        progress bar; the displayed value never decreases and is capped at 1.

        Returns:
            ``None`` when no SQL is left after removing hints. Otherwise a
            pandas ``DataFrame`` when pandas is importable and the raw result
            parses as CSV, a ``ResultFrame`` when pandas is unavailable, or
            the raw result when neither conversion works.
        """
        self._set_odps()

        content = line + '\n' + cell
        content = content.strip()

        sql = None
        hints = dict()

        splits = content.split(';')
        for s in splits:
            stripped = s.strip()
            if stripped.lower().startswith('set '):
                hint = stripped.split(' ', 1)[1]
                k, v = hint.split('=', 1)
                k, v = k.strip(), v.strip()
                hints[k] = v
            elif len(stripped) == 0:
                continue
            else:
                if sql is None:
                    sql = s
                else:
                    sql = '%s;%s' % (sql, s)

        if sql is None:
            # Nothing but hints or blank input: there is no statement to run,
            # and no text to hand to the parameter substitution helper.
            return

        # replace user defined parameters
        sql = replace_sql_parameters(sql, self.shell.user_ns)

        if sql:
            bar = init_progress_bar()

            instance = self._odps.run_sql(sql, hints=hints)
            if options.verbose:
                stdout = options.verbose_log or self._to_stdout
                stdout('Instance ID: ' + instance.id)
                stdout('  Log view: ' + instance.get_logview_address())

            percent = 0
            while not instance.is_terminated():
                task_names = instance.get_task_names()
                last_percent = percent
                if len(task_names) > 0:
                    percent = sum(self._get_task_percent(instance, name)
                                  for name in task_names) / len(task_names)
                else:
                    percent = 0
                # Never let the bar move backwards or exceed 100%.
                percent = min(1, max(percent, last_percent))
                bar.update(percent)

                time.sleep(1)

            instance.wait_for_success()
            bar.update(1)

            try:
                with instance.open_reader() as reader:
                    try:
                        import pandas as pd
                        try:
                            # Current location of the parser exception.
                            from pandas.errors import ParserError
                        except ImportError:
                            # Older pandas releases only expose it here.
                            from pandas.parser import CParserError as ParserError

                        try:
                            return pd.read_csv(StringIO(reader.raw))
                        except (ValueError, ParserError):
                            return reader.raw
                    except ImportError:
                        # pandas is not usable: fall back to ResultFrame.
                        try:
                            return ResultFrame(list(reader), columns=reader._columns)
                        except TypeError:
                            return reader.raw
            finally:
                bar.close()

    @line_magic('persist')
    def persist(self, line):
        """Persist a frame from the user namespace as a new table.

        The line has the form ``<frame_name> <table_name>``, where the table
        name may be prefixed with ``project.``.

        Raises:
            ValueError: the line does not contain two parts.
            KeyError: no variable called ``frame_name`` exists.
            TypeError: the table already exists, or the variable is neither
                a ``DataFrame`` nor (when pandas is installed) a pandas
                ``DataFrame``.
        """
        try:
            import pandas as pd
            has_pandas = True
        except ImportError:
            has_pandas = False

        self._set_odps()

        line = line.strip().strip(';')

        frame_name, table_name = line.split(None, 1)

        if '.' in table_name:
            project_name, table_name = tuple(table_name.split('.', 1))
        else:
            project_name = None

        frame = self.shell.user_ns[frame_name]
        if self._odps.exist_table(table_name, project=project_name):
            raise TypeError('%s already exists' % table_name)

        if isinstance(frame, DataFrame):
            frame.persist(name=table_name, project=project_name)
        elif has_pandas and isinstance(frame, pd.DataFrame):
            frame = DataFrame(frame)
            frame.persist(name=table_name, project=project_name)
        else:
            # Do not return silently when nothing was written.
            raise TypeError('%s is not a DataFrame' % frame_name)