Exemple #1
0
    def _single_pk_iterate(self):
        """Compare the next batch of rows of a table keyed by one primary-key column.

        The source pool is queried with the stored ``last_max`` of the key and
        ``Const.STEPS`` as parameters. When rows come back, ``last_max`` is
        advanced to the first column of the last row, the destination pool is
        queried with the previous and the new ``last_max`` as bounds, and both
        result sets are handed to ``_cmp_md5``.

        Returns:
            A ``(row_count, mismatches)`` tuple: the number of source rows
            fetched and the value returned by ``_cmp_md5`` (an empty list when
            no source row was fetched).
        """
        pk = self.pks[0]
        lower_bound = pk['last_max']
        template_args = {
            'table': wrap(self.table),
            'id': wrap(pk['name']),
            'md5_concat': self.md5_concat,
        }

        # Starting after this batch's minimum id (exclusive), fetch the next
        # STEPS records in order.
        src_rows = query(
            Const.SRC_POOL,
            SQL_SINGLE_PK_GREATER_LIMIT_ITERATE.format(**template_args),
            lower_bound,
            Const.STEPS)

        fetched = len(src_rows)
        if fetched == 0:
            return fetched, []

        # Remember the largest id seen in this batch for the next iteration.
        upper_bound = src_rows[-1][0]
        pk['last_max'] = upper_bound

        # Query the destination table over the same id range.
        dest_rows = query(
            Const.DEST_POOL,
            SQL_SINGLE_PK_RANGE_ITERATE.format(**template_args),
            lower_bound,
            upper_bound)

        # Compare the md5 values of the records.
        return fetched, self._cmp_md5(src_rows, dest_rows)
Exemple #2
0
    def _double_check_log_dingding(self, not_matches):
        if not not_matches:
            return

        if self.pk_type not in [
                PK_AUTO_INCREMENT, PK_VARCHAR_INT, PK_COMPOSITE
        ]:
            return

        errors = []
        sql = SQL_BY_ID.format(table=wrap(self.table),
                               pk_fields=self.pk_fields,
                               md5_concat=self.md5_concat,
                               value_params=', '.join(
                                   ['%s' for i in range(len(self.pks))]))

        for record_tup in not_matches:
            src_result_tuples = query(Const.SRC_POOL, sql, *record_tup[0:-1])
            dest_result_tuples = query(Const.DEST_POOL, sql, *record_tup[0:-1])

            error_dict = {
                'db_type': Const.DB_TYPE,
                'database': Const.DATABASE,
                'table': self.table,
                'pk_fields': self.pk_fields
            }

            if not src_result_tuples:
                if dest_result_tuples:
                    # str(dest_result_tuples[0]) 避免数据库字段对应类转换为 json 异常
                    error_dict.update({
                        'error': Const.NO_ID_IN_SRC,
                        'dest': str(dest_result_tuples[0])
                    })
                    errors.append(error_dict)
            elif not dest_result_tuples:
                error_dict.update({
                    'error': Const.NO_ID_IN_DEST,
                    'src': str(src_result_tuples[0])
                })
                errors.append(error_dict)
            elif src_result_tuples[0][-1] != dest_result_tuples[0][-1]:
                error_dict.update({
                    'error': Const.MD5_NOT_EQUAL,
                    'src': str(src_result_tuples[0]),
                    'dest': str(dest_result_tuples[0])
                })
                errors.append(error_dict)

        if errors:
            result_logger.info(
                json.dumps(errors, ensure_ascii=False, cls=JsonEncoderX))
            dingding_inconsistency('data inconsistency', errors)
Exemple #3
0
    def _composite_pk_iterate(self):
        """Compare the next batch of rows of a table with a composite primary key.

        The stored ``last_max`` values of the key columns form the lower bound
        of the batch. The source pool is queried with those bounds plus
        ``Const.STEPS``; when rows come back, each ``last_max`` is advanced to
        the matching column of the last row, the destination pool is queried
        with the old bounds followed by the new ones, and both result sets are
        handed to ``_cmp_md5``.

        Returns:
            A ``(row_count, mismatches)`` tuple: the number of source rows
            fetched and the value returned by ``_cmp_md5`` (an empty list when
            no source row was fetched).
        """
        # Lower bounds of this batch, captured before last_max is advanced.
        pk0_begin = self.pks[0]['last_max']
        pk1_begin = self.pks[1]['last_max']

        if len(self.pks) == 2:
            # (A, B) > (x, y) is equivalent to A>x OR (A=x AND B>y).
            # Measured: the simple row-comparison form also uses the index in
            # MySQL, but it is slower than the expanded form used here.
            # NOTE(review): the expanded predicates have a top-level OR and no
            # outer parentheses; confirm the SQL templates wrap them.
            names = {
                'pk0': wrap(self.pks[0]['name']),
                'pk1': wrap(self.pks[1]['name']),
            }
            begin_where = "{pk0}>%s OR ({pk0}=%s AND {pk1}>%s)".format(**names)
            end_where = "{pk0}<%s OR ({pk0}=%s AND {pk1}<=%s)".format(**names)
            pk2_begin = None
        else:
            # (A, B, C) > (x, y, z) is equivalent to
            # A>x OR (A=x AND (B>y OR (B=y AND C>z))).
            # Per the original author, tables landing here hold fewer than
            # 10000 rows, so the simple row-comparison form is used.
            begin_where = '({pk_fields}) > (%s, %s, %s)'.format(
                pk_fields=self.pk_fields)
            end_where = '({pk_fields}) <= (%s, %s, %s)'.format(
                pk_fields=self.pk_fields)
            pk2_begin = self.pks[2]['last_max']

        lower_bounds = (pk0_begin, pk1_begin, pk2_begin)

        # Starting after the lower bound (exclusive), fetch the next STEPS
        # records in order. _build_args gets a fresh list on every call.
        src_args = self._build_args(list(lower_bounds))
        src_args.append(Const.STEPS)
        src_rows = query(
            Const.SRC_POOL,
            SQL_COMPOSITE_PK_GREATER_LIMIT_ITERATE.format(
                table=wrap(self.table),
                md5_concat=self.md5_concat,
                pk_fields=self.pk_fields,
                begin_where=begin_where),
            *src_args)

        fetched = len(src_rows)
        if fetched == 0:
            return fetched, []

        # Advance every key column to the value found in the last row; the
        # final column of the row (md5_concat) is left out.
        for idx, value in enumerate(src_rows[-1][:-1]):
            self.pks[idx]['last_max'] = value

        # Query the destination table over the same key range: old bounds
        # first, then the freshly advanced ones.
        dest_sql = SQL_COMPOSITE_PK_RANGE_ITERATE.format(
            table=wrap(self.table),
            md5_concat=self.md5_concat,
            pk_fields=self.pk_fields,
            begin_where=begin_where,
            end_where=end_where)
        range_start = self._build_args(list(lower_bounds))
        range_end = self._build_args([pk['last_max'] for pk in self.pks])
        dest_rows = query(Const.DEST_POOL, dest_sql,
                          *(range_start + range_end))

        # Compare the md5 values of the records.
        return fetched, self._cmp_md5(src_rows, dest_rows)
Exemple #4
0
    def _select_all_iterate(self):
        """Compare a table in a single pass using one unparameterised query.

        The same statement, built from ``SQL_SELECT_ALL_ITERATE``, is run
        against the source pool and, if the source returned any rows, against
        the destination pool. Both result sets are then handed to
        ``_cmp_md5``.

        Returns:
            A ``(row_count, mismatches)`` tuple: the number of source rows
            fetched and the value returned by ``_cmp_md5`` (an empty list when
            no source row was fetched).
        """
        statement = SQL_SELECT_ALL_ITERATE.format(
            table=wrap(self.table), md5_concat=self.md5_concat)

        src_rows = query(Const.SRC_POOL, statement)
        fetched = len(src_rows)
        if fetched == 0:
            return fetched, []

        # Query the destination table, then compare the md5 values.
        dest_rows = query(Const.DEST_POOL, statement)
        return fetched, self._cmp_md5(src_rows, dest_rows)
Exemple #5
0
    def __init__(self, table, pks, md5_concat):
        """Set up the comparison state for one table.

        Args:
            table: Table name.
            pks: Primary-key column descriptors, a list of dicts of the form
                ``{'name':, 'data_type':, 'last_max':}``, or ``None``.
            md5_concat: The md5(concat(all columns)) expression string.
        """
        self.table = table  # table name
        self.pks = pks  # primary-key columns [{'name':, 'data_type':, 'last_max':}]
        # Identity check against the None singleton (PEP 8); an equality
        # check could be hijacked by a custom __eq__.
        if pks is None:
            self.pk_fields = None
        else:
            self.pk_fields = ','.join(wrap(pk['name']) for pk in pks)
        self.md5_concat = md5_concat  # md5(concat(all columns)) string

        self.pk_type = None  # primary-key type, e.g. PK_AUTO_INCREMENT, PK_COMPOSITE ...

        self.table_structure_matched = None  # whether the table structures match

        self.count = 0  # total number of records in the table
        self.got_stage = 0  # records fetched in the current execution stage
        self.got_already = 0  # records fetched so far
        self.finished = False  # whether the iterate-and-compare task is done

        self.create_time_millis = now_time_millis()  # creation timestamp (ms)
        self.update_time_millis = None  # last-update timestamp (ms)
Exemple #6
0
 def set_count(self):
     """Store the value returned by the ``SQL_COUNT`` query in ``self.count``.

     The query runs against the source pool for ``self.table``. The first
     column of the first result row is stored, and the figure is logged.
     """
     count_sql = SQL_COUNT.format(table=wrap(self.table))
     first_row = query(Const.SRC_POOL, count_sql)[0]
     self.count = first_row[0]
     logger.info("===== %s, db=%s table=%s, count=%s", Const.DB_TYPE,
                 Const.DATABASE, self.table, self.count)