def _single_pk_iterate(self):
    """Compare the next batch of rows for a table with a single-column key.

    Reads up to ``Const.STEPS`` rows from the source pool, starting after
    the key value stored in ``self.pks[0]['last_max']`` (exclusive, per the
    original author's note), then reads the destination rows for the same
    key range and compares the per-row md5 values via ``self._cmp_md5``.

    Side effect: ``self.pks[0]['last_max']`` is advanced to the key of the
    last source row fetched.

    Returns:
        A ``(fetched, mismatches)`` tuple: the number of source rows read in
        this step and whatever ``self._cmp_md5`` reported (an empty list
        when no source rows were read).
    """
    pk = self.pks[0]
    lower_bound = pk['last_max']
    table_name = wrap(self.table)
    pk_name = wrap(pk['name'])

    # Source side: the next batch after the previous maximum key.
    src_rows = query(
        Const.SRC_POOL,
        SQL_SINGLE_PK_GREATER_LIMIT_ITERATE.format(
            table=table_name, id=pk_name, md5_concat=self.md5_concat),
        lower_bound,
        Const.STEPS)
    fetched = len(src_rows)
    if fetched == 0:
        return fetched, []

    # Remember the largest key seen in this batch (first column of the last
    # row) so the next call continues from there.
    upper_bound = src_rows[-1][0]
    pk['last_max'] = upper_bound

    # Destination side: the rows in the same key interval.
    dest_rows = query(
        Const.DEST_POOL,
        SQL_SINGLE_PK_RANGE_ITERATE.format(
            table=table_name, id=pk_name, md5_concat=self.md5_concat),
        lower_bound,
        upper_bound)

    # Compare the md5 values of both sides.
    return fetched, self._cmp_md5(src_rows, dest_rows)
def _double_check_log_dingding(self, not_matches):
    """Re-query each mismatching record by key and report real differences.

    Does nothing when ``not_matches`` is empty or when ``self.pk_type`` is
    not one of ``PK_AUTO_INCREMENT``, ``PK_VARCHAR_INT`` or
    ``PK_COMPOSITE``.

    For every record, all elements except the last are passed as query
    parameters to ``SQL_BY_ID`` against both the source and destination
    pools. A record is reported when it exists on only one side, or when
    the last column of the first row differs between the two sides.

    Collected errors are written to ``result_logger`` as JSON and passed to
    ``dingding_inconsistency``.

    Args:
        not_matches: iterable of records; the last element of each record
            is excluded from the lookup parameters.
    """
    supported_pk_types = (PK_AUTO_INCREMENT, PK_VARCHAR_INT, PK_COMPOSITE)
    if not not_matches or self.pk_type not in supported_pk_types:
        return

    # One "%s" placeholder per key column.
    placeholders = ', '.join(['%s'] * len(self.pks))
    lookup_sql = SQL_BY_ID.format(
        table=wrap(self.table),
        pk_fields=self.pk_fields,
        md5_concat=self.md5_concat,
        value_params=placeholders)

    errors = []
    for record_tup in not_matches:
        pk_values = record_tup[0:-1]
        src_rows = query(Const.SRC_POOL, lookup_sql, *pk_values)
        dest_rows = query(Const.DEST_POOL, lookup_sql, *pk_values)

        if not src_rows and not dest_rows:
            # Present on neither side any more: nothing to report.
            continue

        # Rows are passed through str() to avoid JSON conversion errors for
        # database-specific field classes.
        if not src_rows:
            detail = {
                'error': Const.NO_ID_IN_SRC,
                'dest': str(dest_rows[0])
            }
        elif not dest_rows:
            detail = {
                'error': Const.NO_ID_IN_DEST,
                'src': str(src_rows[0])
            }
        elif src_rows[0][-1] != dest_rows[0][-1]:
            detail = {
                'error': Const.MD5_NOT_EQUAL,
                'src': str(src_rows[0]),
                'dest': str(dest_rows[0])
            }
        else:
            # Both sides agree on the re-check.
            continue

        entry = {
            'db_type': Const.DB_TYPE,
            'database': Const.DATABASE,
            'table': self.table,
            'pk_fields': self.pk_fields
        }
        entry.update(detail)
        errors.append(entry)

    if errors:
        result_logger.info(
            json.dumps(errors, ensure_ascii=False, cls=JsonEncoderX))
        dingding_inconsistency('data inconsistency', errors)
def _composite_pk_iterate(self):
    """Compare the next batch of rows for a table with a composite key.

    Reads up to ``Const.STEPS`` source rows that sort after the stored
    ``last_max`` key values, advances every ``self.pks[i]['last_max']`` to
    the key of the last row fetched, reads the destination rows between the
    old and the new key position, and compares md5 values via
    ``self._cmp_md5``.

    Returns:
        A ``(fetched, mismatches)`` tuple: the number of source rows read in
        this step and whatever ``self._cmp_md5`` reported (an empty list
        when no source rows were read).
    """
    # Key position where this step starts.
    start0 = self.pks[0]['last_max']
    start1 = self.pks[1]['last_max']
    start2 = None

    if len(self.pks) == 2:
        # (A, B) > (x, y) is equivalent to A>x OR (A=x AND B>y).
        # Original author's measurement: the simple row-comparison form
        # also uses the index in MySQL, but is slower than the expanded
        # form used here.
        col0 = wrap(self.pks[0]['name'])
        col1 = wrap(self.pks[1]['name'])
        begin_where = "{pk0}>%s OR ({pk0}=%s AND {pk1}>%s)".format(
            pk0=col0, pk1=col1)
        end_where = "{pk0}<%s OR ({pk0}=%s AND {pk1}<=%s)".format(
            pk0=col0, pk1=col1)
    else:
        # (A, B, C) > (x, y, z) is equivalent to
        # A>x OR (A=x AND (B>y OR (B=y AND C>z))).
        # Original author's note: all tables with "> 3" key columns hold
        # fewer than 10000 rows, so the simple row-comparison form is used.
        # NOTE(review): this branch hard-codes three placeholders and reads
        # self.pks[2], so it appears to fit exactly three key columns;
        # confirm no table with more key columns reaches this path.
        begin_where = '({pk_fields}) > (%s, %s, %s)'.format(
            pk_fields=self.pk_fields)
        end_where = '({pk_fields}) <= (%s, %s, %s)'.format(
            pk_fields=self.pk_fields)
        start2 = self.pks[2]['last_max']

    # Source side: the next batch after the starting key position.
    # A fresh list is handed to _build_args on every call.
    src_args = self._build_args([start0, start1, start2])
    src_args.append(Const.STEPS)
    src_rows = query(
        Const.SRC_POOL,
        SQL_COMPOSITE_PK_GREATER_LIMIT_ITERATE.format(
            table=wrap(self.table),
            md5_concat=self.md5_concat,
            pk_fields=self.pk_fields,
            begin_where=begin_where),
        *src_args)
    fetched = len(src_rows)
    if fetched == 0:
        return fetched, []

    # Store the key of the last fetched row as the new maximum; the final
    # column (md5_concat) is not part of the key and is skipped.
    last_row = src_rows[-1]
    for idx in range(len(last_row) - 1):
        self.pks[idx]['last_max'] = last_row[idx]

    # Destination side: the rows between the old and the new key position.
    dest_sql = SQL_COMPOSITE_PK_RANGE_ITERATE.format(
        table=wrap(self.table),
        md5_concat=self.md5_concat,
        pk_fields=self.pk_fields,
        begin_where=begin_where,
        end_where=end_where)
    lower_args = self._build_args([start0, start1, start2])
    upper_args = self._build_args([pk['last_max'] for pk in self.pks])
    dest_rows = query(Const.DEST_POOL, dest_sql, *(lower_args + upper_args))

    # Compare the md5 values of both sides.
    return fetched, self._cmp_md5(src_rows, dest_rows)
def _select_all_iterate(self):
    """Compare a whole table in one step, without key-based paging.

    Runs the same ``SQL_SELECT_ALL_ITERATE`` statement against the source
    pool and, if the source returned any rows, against the destination
    pool, then compares md5 values via ``self._cmp_md5``.

    Returns:
        A ``(fetched, mismatches)`` tuple: the number of source rows read
        and whatever ``self._cmp_md5`` reported (an empty list when the
        source returned no rows).
    """
    select_sql = SQL_SELECT_ALL_ITERATE.format(
        table=wrap(self.table), md5_concat=self.md5_concat)

    src_rows = query(Const.SRC_POOL, select_sql)
    fetched = len(src_rows)
    if fetched == 0:
        # Nothing on the source side: the destination is not queried.
        return fetched, []

    # Query the destination table and compare the md5 values.
    dest_rows = query(Const.DEST_POOL, select_sql)
    return fetched, self._cmp_md5(src_rows, dest_rows)
def __init__(self, table, pks, md5_concat):
    """Initialise the comparison state for one table.

    Args:
        table: table name.
        pks: primary-key column descriptors, a list of dicts shaped like
            ``{'name': ..., 'data_type': ..., 'last_max': ...}``, or
            ``None``.
        md5_concat: the ``md5(concat(<all columns>))`` SQL expression
            string.
    """
    self.table = table  # table name
    self.pks = pks  # key columns [{'name':, 'data_type':, 'last_max':}]
    # Comma-separated, wrapped key column names; None when there are no
    # key descriptors. Identity comparison is the correct test for None.
    self.pk_fields = (
        None if pks is None
        else ','.join(wrap(pk['name']) for pk in pks))
    self.md5_concat = md5_concat  # md5(concat(all columns)) expression
    self.pk_type = None  # key type, e.g. PK_AUTO_INCREMENT, PK_COMPOSITE
    self.table_structure_matched = None  # whether table structures match
    self.count = 0  # total number of rows in the table
    self.got_stage = 0  # rows fetched in the current stage
    self.got_already = 0  # rows fetched so far
    self.finished = False  # whether the iterate-and-compare task is done
    self.create_time_millis = now_time_millis()  # creation timestamp (ms)
    self.update_time_millis = None  # last update timestamp (ms)
def set_count(self):
    """Query the source pool for the table's row count and store it.

    Sets ``self.count`` to the first column of the first row returned by
    ``SQL_COUNT`` and logs the value together with the database type,
    database name and table name.
    """
    count_sql = SQL_COUNT.format(table=wrap(self.table))
    first_row = query(Const.SRC_POOL, count_sql)[0]
    self.count = first_row[0]
    logger.info(
        "===== %s, db=%s table=%s, count=%s",
        Const.DB_TYPE,
        Const.DATABASE,
        self.table,
        self.count)