def update_img_info(img_name):
    # Update the image's info: mark it as manually taken down for low quality
    conn = base_data_final_pool.connection()
    cursor = conn.cursor()
    cursor.execute('''SELECT `info`
    FROM poi_images
    WHERE file_name = %s;''', (img_name,))
    info = cursor.fetchone()[0]
    cursor.close()
    conn.close()
    if info:
        _i = json.loads(info)
    else:
        _i = {}
    # delete_reason: "low image quality, manually removed"
    _i['delete_reason'] = "图片质量低,手动取消"
    conn = base_data_final_pool.connection()
    cursor = conn.cursor()
    cursor.execute('''UPDATE poi_images
    SET `use` = 0, info = %s
    WHERE file_name = %s;''', (json.dumps(_i), img_name))
    conn.commit()
    cursor.close()
    conn.close()
    logger.info("[new info][info: {}]".format(json.dumps(_i)))
def get_file(sid):
    sql = '''SELECT file_name, pic_size, info
    FROM poi_images
    WHERE source = 'qyer'
      AND sid = '{}'
      AND `use` = 1;'''.format(sid)
    conn = base_data_final_pool.connection()
    cursor = conn.cursor(cursor=pymysql.cursors.DictCursor)
    cursor.execute(sql)
    file_pic_size = {}
    for line in cursor.fetchall():
        # pic_size is stored as a "(w, h)" string; ast.literal_eval would be
        # a safer parser than eval here
        w, h = eval(line['pic_size'])
        _j_data = json.loads(line['info'])
        # down_reason: "more than 90 images, take part of them offline"
        _j_data['down_reason'] = "图片数多余 90 张,下掉一部分"
        file_pic_size[(json.dumps(_j_data), line['file_name'])] = int(w) * int(h)
    cursor.close()
    conn.close()
    # Keep the 90 largest images by pixel area; everything past index 90 in
    # size-descending order gets taken down
    down_imgs = list(
        map(lambda x: x[0],
            sorted(file_pic_size.items(), key=lambda x: x[1],
                   reverse=True)))[90:]
    update_sql(sid=sid, data=down_imgs)
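# A minimal, self-contained sketch of the "keep the 90 largest" selection in
# get_file() above, using literal data instead of DB rows; the threshold of 3
# stands in for 90 purely for illustration, and _demo_overflow_selection is a
# hypothetical name, not part of the project.
def _demo_overflow_selection(keep=3):
    file_pic_size = {
        ('{}', 'a.jpg'): 1200 * 800,
        ('{}', 'b.jpg'): 640 * 480,
        ('{}', 'c.jpg'): 4000 * 3000,
        ('{}', 'd.jpg'): 100 * 100,
    }
    # Sort by pixel area, largest first, and keep only the overflow
    down_imgs = list(
        map(lambda x: x[0],
            sorted(file_pic_size.items(), key=lambda x: x[1],
                   reverse=True)))[keep:]
    # down_imgs == [('{}', 'd.jpg')]: the smallest image is taken offline,
    # already shaped as (info_json, file_name) pairs for update_sql()'s
    # "SET info = %s ... WHERE file_name = %s" executemany() call
    return down_imgs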
def init_all_seek_dict():
    local_conn = base_data_final_pool.connection()
    local_cursor = local_conn.cursor()
    local_cursor.execute('''SELECT *
    FROM data_insert_seek;''')
    global all_seek_dict
    all_seek_dict = {k: v for k, v in local_cursor.fetchall()}
    local_cursor.close()
    local_conn.close()
def insert_data(data):
    update_sql = '''INSERT IGNORE INTO poi_images
    (file_name, source, sid, url, pic_size, bucket_name, url_md5, pic_md5, `use`)
    VALUES (%(file_name)s, 'online', %(sid)s, %(url)s, %(pic_size)s,
            %(bucket_name)s, %(url_md5)s, %(pic_md5)s, %(use)s);'''
    conn = base_data_final_pool.connection()
    cursor = conn.cursor()
    _res = cursor.executemany(update_sql, data)
    conn.commit()
    cursor.close()
    conn.close()
    logger.debug("[move data][total: {}][execute: {}]".format(len(data), _res))
def create_table():
    final_conn = base_data_final_pool.connection()
    final_cursor = final_conn.cursor()
    for k, v in final_table.items():
        real_path = os.path.split(os.path.realpath(__file__))[0]
        sql_path = os.path.join(real_path, 'sql', v)
        final_sql = open(sql_path).read()
        table_name = "{}_final".format(k)
        final_cursor.execute(final_sql % (table_name,))
        logger.debug('[create table][name: {}]'.format(table_name))
    final_cursor.close()
    final_conn.close()
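# The files under sql/ are plain CREATE TABLE templates consumed via old-style
# "%" interpolation, so each needs exactly one "%s" placeholder for the table
# name. A hypothetical template (the column list is illustrative only, not the
# project's actual schema):
#
#   CREATE TABLE IF NOT EXISTS `%s` (
#       `id`          VARCHAR(64) NOT NULL,
#       `source`      VARCHAR(32) NOT NULL,
#       `update_time` TIMESTAMP   NOT NULL DEFAULT CURRENT_TIMESTAMP
#                                 ON UPDATE CURRENT_TIMESTAMP,
#       PRIMARY KEY (`id`, `source`)
#   ) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4;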
def update_seek_table(table_name, update_time):
    local_conn = base_data_final_pool.connection()
    local_cursor = local_conn.cursor()
    local_cursor.execute('''REPLACE INTO data_insert_seek
    VALUES (%s, %s);''', (table_name, update_time))
    logger.debug("[update seek table][table_name: {}][update_time: {}]".format(
        table_name, update_time))
    global all_seek_dict
    all_seek_dict[table_name] = update_time
    local_conn.commit()
    local_cursor.close()
    local_conn.close()
def update_sql(sid, data):
    sql = '''UPDATE poi_images
    SET `use` = 0, info = %s
    WHERE source = 'qyer'
      AND sid = '{}'
      AND file_name = %s;'''.format(sid)
    conn = base_data_final_pool.connection()
    cursor = conn.cursor()
    _res = cursor.executemany(sql, data)
    conn.commit()
    cursor.close()
    conn.close()
    logger.info("[sid: {}][total: {}][execute: {}]".format(
        sid, len(data), _res))
def insert_db(table_name, data):
    global offset
    conn = base_data_final_pool.connection()
    cursor = conn.cursor()
    insert_into = "REPLACE INTO first_images (`source`, `source_id`, `first_img`) VALUES (%s, %s, %s)"
    res = cursor.executemany(insert_into, data)
    conn.commit()
    cursor.close()
    conn.close()
    logger.debug(
        "[insert data][table_name: {}][offset: {}][total: {}][insert: {}]".format(
            table_name, offset, len(data), res))
def update_db(data):
    # down_reason: "suspected avatar, taken offline"
    sql = '''UPDATE poi_images
    SET `use` = 0,
        info = JSON_SET(CASE WHEN info IS NULL THEN '{}' ELSE info END,
                        '$.down_reason', '疑似头像,下线')
    WHERE file_name = %s;'''
    conn = base_data_final_pool.connection()
    cursor = conn.cursor()
    _res = cursor.executemany(sql, data)
    conn.commit()
    cursor.close()
    conn.close()
    print("[total: {}][insert: {}]".format(len(data), _res))
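# executemany() with a single %s placeholder expects a sequence of row
# sequences, i.e. one 1-tuple per file name; a hypothetical call would be
# update_db([('img_001.jpg',), ('img_002.jpg',)]). The
# JSON_SET(CASE WHEN info IS NULL THEN '{}' ... ) wrapper first substitutes an
# empty JSON object for a NULL info column, so rows that never had metadata
# still get $.down_reason set in a single statement.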
def create_table(image_type):
    final_conn = base_data_final_pool.connection()
    final_cursor = final_conn.cursor()
    sql_name = final_table.get(image_type, None)
    if sql_name is None:
        raise TypeError("[Unknown View Type: {}]".format(image_type))
    final_sql = open(
        '/search/hourong/PycharmProjects/PoiCommonScript/serviceplatform_data/sql/{}'
        .format(sql_name)).read()
    table_name = "{}_images".format(image_type_dict[image_type])
    final_cursor.execute(final_sql % (table_name,))
    logger.debug('[create table][name: {}]'.format(table_name))
    final_cursor.close()
    final_conn.close()
def create_table(image_type):
    # Variant of create_table() that resolves the sql/ directory relative to
    # this file instead of using a hard-coded absolute path
    final_conn = base_data_final_pool.connection()
    final_cursor = final_conn.cursor()
    sql_name = final_table.get(image_type, None)
    if sql_name is None:
        raise TypeError("[Unknown View Type: {}]".format(image_type))
    real_path = os.path.split(os.path.realpath(__file__))[0]
    sql_path = os.path.join(real_path, 'sql', sql_name)
    final_sql = open(sql_path).read()
    table_name = "{}_images".format(image_type_dict[image_type])
    final_cursor.execute(final_sql % (table_name,))
    logger.debug('[create table][name: {}]'.format(table_name))
    final_cursor.close()
    final_conn.close()
def update_sql(data):
    # down_reason: "DB-scan md5 mismatch, filtered"
    sql = '''UPDATE hotel_images
    SET info = JSON_SET(CASE WHEN info IS NULL THEN '{{}}' ELSE info END,
                        '$.down_reason', '扫库 md5 不对应,过滤')
    WHERE pic_md5 IN ({});'''.format(
        ','.join(map(lambda x: "'{}'".format(x), data)))
    conn = base_data_final_pool.connection()
    cursor = conn.cursor()
    try:
        _res = cursor.execute(sql)
    except Exception:
        print(sql)
        raise  # re-raise the original exception, keeping its traceback
    conn.commit()
    cursor.close()
    conn.close()
    logger.info("[total: {}][execute: {}]".format(len(data), _res))
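# The IN clause above is assembled by string formatting; a parameterized
# sketch of the same statement (assuming the same pymysql cursor) generates
# one %s per value instead, which sidesteps quoting problems:
#
#   placeholders = ','.join(['%s'] * len(data))
#   sql = '''UPDATE hotel_images SET info = ...
#   WHERE pic_md5 IN ({});'''.format(placeholders)
#   cursor.execute(sql, tuple(data))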
def load_data(limit=400):
    local_conn = base_data_final_pool.connection()
    local_cursor = local_conn.cursor()
    local_cursor.execute('''SELECT TABLE_NAME
    FROM information_schema.TABLES
    WHERE TABLE_SCHEMA = '{}';'''.format(final_database))
    # Tables must be processed in tag order
    table_list = list(
        sorted(filter(lambda x: len(x.split('_')) in (3, 4),
                      map(lambda x: x[0], local_cursor.fetchall())),
               key=lambda x: x.split('_')[-1]))
    local_cursor.close()
    for each_table in table_list:
        each_table_key_list = each_table.split('_')
        if len(each_table_key_list) == 3:
            if each_table_key_list[0] not in ('attr', 'rest', 'total', 'hotel'):
                logger.debug('[skip table][name: {}]'.format(each_table))
                continue
            if each_table_key_list[-1] < '20170929a':
                logger.debug('[skip table][name: {}]'.format(each_table))
                continue
            # Derive the type and tag from the table name
            try:
                _type, _, _tag = each_table.split('_')
            except Exception:
                logger.error('[Unknown View Final: {}]'.format(each_table))
                continue
            # Build the destination table name
            to_table_name = "{}_final".format(_type)
            _type = "{}_detail".format(_type)
        elif len(each_table_key_list) == 4:
            if each_table_key_list[1] != 'images':
                logger.debug('[skip table][name: {}]'.format(each_table))
                continue
            try:
                _type, _, _, _tag = each_table.split('_')
            except Exception:
                logger.error('[Unknown View Final: {}]'.format(each_table))
                continue
            # Build the destination table name
            if _type == 'hotel':
                to_table_name = "hotel_images"
            elif _type == 'poi':
                to_table_name = "poi_images"
            else:
                raise TypeError('Unknown Type: {}'.format(_type))
            _type = "{}_images".format(_type)
        else:
            continue
        u_time = get_seek(table_name=each_table)
        start = time.time()
        # Start merging the data
        local_cursor = local_conn.cursor()
        update_time_sql = '''SELECT {0}
        FROM {1}
        WHERE {0} >= '{2}'
        ORDER BY {0}
        LIMIT {3};'''.format(time_key[_type], each_table, u_time, limit)
        line_count = local_cursor.execute(update_time_sql)
        logger.debug('sql: %s\nselect_count: %s' %
                     (update_time_sql, str(local_cursor.rowcount)))
        if line_count == 0:
            # No data left, so the rest of the processing can be skipped
            continue
        # Get the latest update time, used as the seek value for the next run
        final_update_time = max(map(lambda x: x[0], local_cursor.fetchall()))
        logger.debug('each_table: %s final_update_time: %s' %
                     (each_table, str(final_update_time)))
        local_cursor.close()
        # Replace into the final data table
        local_cursor = local_conn.cursor()
        query_sql_list = []
        if to_table_name == 'hotel_images':
            query_sql = '''REPLACE INTO {0} (source, source_id, pic_url, pic_md5, part,
                hotel_id, status, update_date, size, flag, file_md5)
            SELECT source, source_id, pic_url, pic_md5, part,
                hotel_id, status, update_date, size, flag, file_md5
            FROM {1}
            WHERE update_date >= '{2}'
            ORDER BY update_date
            LIMIT {3};'''.format(to_table_name, each_table, u_time, limit)
            query_sql_list.append(query_sql)
        elif to_table_name == 'poi_images':
            query_sql = '''REPLACE INTO {0} (file_name, source, sid, url, pic_size,
                bucket_name, url_md5, pic_md5, `use`, part, date)
            SELECT file_name, source, sid, url, pic_size,
                bucket_name, url_md5, pic_md5, `use`, part, date
            FROM {1}
            WHERE date >= '{2}'
            ORDER BY date
            LIMIT {3};'''.format(to_table_name, each_table, u_time, limit)
            query_sql_list.append(query_sql)
        elif u_time != '':
            query_sql = '''REPLACE INTO {1}
            SELECT *
            FROM {2}
            WHERE {0} >= '{3}'
            ORDER BY {0}
            LIMIT {4};'''.format(time_key[_type], to_table_name, each_table,
                                 u_time, limit)
            query_sql_list.append(query_sql)
            if to_table_name == 'attr_final':
                query_sql = '''REPLACE INTO poi_merge.attr
                SELECT *
                FROM {2}
                WHERE {0} >= '{3}'
                ORDER BY {0}
                LIMIT {4};'''.format(time_key[_type], to_table_name,
                                     each_table, u_time, limit)
                query_sql_list.append(query_sql)
            elif to_table_name == 'total_final':
                query_sql = '''REPLACE INTO poi_merge.attr
                SELECT id, source, name, name_en, alias, map_info, city_id,
                    source_city_id, address, star, recommend_lv, pv,
                    plantocounts, beentocounts, overall_rank, ranking, grade,
                    grade_distrib, commentcounts, tips, tagid, related_pois,
                    nomissed, keyword, cateid, url, phone, site, imgurl,
                    commenturl, introduction, '', opentime, price,
                    recommended_time, wayto, 0, 0, insert_time
                FROM {2}
                WHERE {0} > '{3}'
                ORDER BY {0}
                LIMIT {4};'''.format(time_key[_type], to_table_name,
                                     each_table, u_time, limit)
                query_sql_list.append(query_sql)
            # elif to_table_name == 'rest_final':
            #     query_sql = '''REPLACE INTO poi_merge.rest SELECT *
            #     FROM {2}
            #     WHERE {0} >= '{3}'
            #     ORDER BY {0}
            #     LIMIT {4};'''.format(time_key[_type], to_table_name, each_table, u_time, limit)
        else:
            raise TypeError(
                "Unknown Type [u_time: {}][to_table_name: {}]".format(
                    u_time, to_table_name))
        for _each_query_sql in query_sql_list:
            is_replace = True
            try:
                replace_count = local_cursor.execute(_each_query_sql)
            except pymysql.err.IntegrityError as integrity_err:
                _args = integrity_err.args
                if 'Duplicate entry' in _args[1]:
                    # On a duplicate entry, fall back to INSERT IGNORE
                    # (REPLACE INTO raises a duplicate error here; the cause
                    # is not yet understood)
                    is_replace = False
                    _each_query_sql = _each_query_sql.replace(
                        'REPLACE INTO', 'INSERT IGNORE INTO')
                    replace_count = local_cursor.execute(_each_query_sql)
                else:
                    logger.exception(
                        msg="[table_name: {}][error_sql: {}]".format(
                            each_table, _each_query_sql),
                        exc_info=integrity_err)
                    continue
            except Exception as e:
                logger.exception(
                    msg="[table_name: {}][error_sql: {}]".format(
                        each_table, _each_query_sql),
                    exc_info=e)
                continue
            logger.debug(
                "[insert data][to: {}][from: {}][update_time: {}]"
                "[final_update_time: {}][limit: {}][line_count: {}]"
                "[{}: {}][takes: {}]".format(
                    to_table_name if 'poi_merge' not in _each_query_sql
                    else 'poi_merge.attr' if 'poi_merge.attr' in _each_query_sql
                    else 'poi_merge.rest',
                    each_table, u_time, final_update_time, limit, line_count,
                    'replace_count' if is_replace else 'insert_ignore_count',
                    replace_count, time.time() - start))
        local_conn.commit()
        local_cursor.close()
        update_seek_table(each_table, final_update_time)
    local_conn.close()
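# load_data() together with get_seek()/update_seek_table() implements a
# checkpoint ("seek") pattern: the last update time seen per source table is
# persisted, and the next run resumes from it; REPLACE INTO keeps the >=
# overlap idempotent. A self-contained sketch of the idea, with a plain dict
# standing in for the data_insert_seek table (_demo_seek_pattern is a
# hypothetical name, not part of the project):
def _demo_seek_pattern(rows_by_table, seek, limit=2):
    for table, rows in rows_by_table.items():
        u_time = seek.get(table, '')
        # Mirror "WHERE {time_key} >= u_time ORDER BY {time_key} LIMIT n"
        batch = sorted(r for r in rows if r >= u_time)[:limit]
        if not batch:
            continue
        seek[table] = max(batch)  # checkpoint for the next run
    return seek

# _demo_seek_pattern({'attr_final_20170929a':
#                     ['2017-10-01', '2017-10-02', '2017-10-03']}, {}, limit=2)
# -> {'attr_final_20170929a': '2017-10-02'}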
def detectOriData(need_detection_table_name=None):
    city_map_info_dict = get_city_map()
    dt = datetime.datetime.now()
    # Fetch the table list from base data final
    _conn = base_data_final_pool.connection()
    _cursor = _conn.cursor()
    _cursor.execute('''SELECT TABLE_NAME
    FROM information_schema.TABLES
    WHERE TABLE_SCHEMA = 'BaseDataFinal';''')
    # Each table entry carries its database connection info
    table_list = list(
        map(
            lambda x: (x[0], {
                'host': ori_ip,
                'user': ori_user,
                'passwd': ori_password,
                'db': ori_db_name
            }, 'ota'), _cursor.fetchall()))
    _cursor.close()
    _conn.close()
    # hotel-api data stats are paused for now
    # _conn = hotel_api_pool.connection()
    # _cursor = _conn.cursor()
    # _cursor.execute('''SELECT TABLE_NAME
    # FROM information_schema.TABLES
    # WHERE TABLE_SCHEMA = 'hotel_api';''')
    # for i in groupby(
    #         list(map(lambda x: x[0], _cursor.fetchall())),
    #         lambda x: '_'.join(x.split('_')[:2])):
    #     table_names = sorted(filter(lambda x: len(x.split('_')) >= 3 and 'bak' not in x, i[1]), reverse=True)[:7]
    #     if not table_names:
    #         continue
    #     for t_name in table_names:
    #         table_list.append((t_name, hotel_api_config, 'api'))
    # _cursor.close()
    # _conn.close()
    report_data = []
    tasks_data = []
    for cand_table, conn_config, table_type in table_list:
        error_count = {}
        source_count = defaultdict(int)
        error_dict = defaultdict(int)
        if need_detection_table_name is not None \
                and cand_table != need_detection_table_name:
            print('===== skip table {}'.format(cand_table))
            continue
        if table_type == 'ota':
            # For ota tables the fields are resolved as follows
            cand_list = cand_table.split('_')
            # Validate with the BaseDataFinal data; table names look like
            # attr_final_20170929a, so skip any name that does not split
            # into 3 parts
            if len(cand_list) != 3:
                continue
            task_type, _, task_tag = cand_list
            # Skip anything other than the four crawl task types
            if task_type not in ('attr', 'rest', 'hotel', 'total'):
                continue
            logger.debug('[Begin Detect][table: {}]'.format(cand_table))
            if task_type == 'hotel':
                # Hotels
                sql = '''SELECT hotel_name, hotel_name_en, source, source_id,
                    city_id, map_info, grade
                FROM {};'''.format(cand_table)
            elif task_type in ('attr', 'shop', 'rest', 'total'):
                # Attractions, shopping and restaurants (currently used by
                # daodao), plus all POIs (used by qyer)
                sql = '''SELECT name, name_en, source, id, city_id, map_info,
                    grade, address
                FROM {};'''.format(cand_table)
            else:
                # Unknown type; skip for now
                continue
        elif table_type == 'api':
            _test_list = cand_table.split('_')
            if _test_list[0] == 'hotelinfo' and len(_test_list) >= 5:
                if not (_test_list[2].isdigit() and _test_list[3].isdigit()
                        and _test_list[4].isdigit()):
                    continue
                sql = '''SELECT hotel_name, hotel_name_en, source, source_id,
                    city_id, map_info, grade
                FROM {};'''.format(cand_table)
                task_type = 'hotel_api'
                task_tag = "{}-{}-{}".format(_test_list[2], _test_list[3],
                                             _test_list[4])
            else:
                logger.info("[don't know this table][table_name: {}]".format(
                    cand_table))
                continue
        else:
            logger.info("[unknown table type][type: {}]".format(table_type))
            continue
        # Fetch the data iteratively
        datas = MysqlSource(db_config=conn_config, table_or_query=sql,
                            size=10000, is_table=False)
        # Coordinates seen so far, used to detect duplicates
        map_info_set = defaultdict(set)
        # Duplicate coordinates; also used to add len(duplicate_map_info_set)
        # at the end so the first occurrence of each map_info is not lost
        duplicate_map_info_set = defaultdict(set)
        total = 0
        success = 0
        for data in datas:
            # Whether this row is correct; defaults to True and is flipped to
            # False by the checks below
            right = True
            total += 1
            if total % 10000 == 0:
                logger.debug("[table data detect][table: {}][count: {}]".format(
                    cand_table, total))
            word_list = []
            for word in data:
                if word is None:
                    word_list.append('')
                else:
                    word_list.append(word)
            name = word_list[0]
            name_en = word_list[1]
            source = word_list[2]
            sid = word_list[3]
            try:
                cid = str(int(word_list[4]))
            except Exception:
                cid = word_list[4]
            map_info = word_list[5]
            grade = word_list[6]
            # Hotel rows select only 7 columns, so guard the address access
            address = word_list[7] if len(word_list) > 7 else ''
            # Per-source row count for this table
            source_count[source] += 1
            # # todo read the full list of crawl sources from the db
            # A wrong source is a major error caused by a crawler bug and
            # should trigger an alert email
            # if source not in ('agoda', 'booking', 'ctrip', 'elong ', 'expedia', 'hotels'):
            #     error_dict['数据源错误'] += 1  # source error
            #     right = False
            if '' == name and '' == name_en:
                # missing both name and name_en
                error_dict[(source, '无 name、name_en')] += 1
                right = False
            if '' != name and '' != name_en and is_contain_ch(name_en):
                if is_full_contain_ch(name_en):
                    if not is_contain_ch(name):
                        # Chinese and English names swapped
                        error_dict[(source, "中英文名字相反")] += 1
                        right = False
            if name.strip().lower() != name_en.strip().lower() \
                    and is_contain_ch(name) \
                    and not is_contain_ch(name_en) \
                    and len(name_en.split(' ')) >= 2 \
                    and name_en in name:
                # Chinese name contains the English name
                error_dict[(source, "中文名中含有英文名")] += 1
                right = False
            if 'NULL' == map_info:
                # bad coordinates (NULL)
                error_dict[(source, '坐标错误(NULL)')] += 1
                right = False
            elif not map_info_legal(map_info):
                # bad coordinates (empty or malformed, NULL excluded)
                error_dict[(source, '坐标错误(坐标为空或坐标格式错误,除去NULL)')] += 1
                if address:
                    tasks_data.append(
                        supplement_field(cand_table, sid, source, address))
                right = False
            else:
                # Duplicate-coordinate check
                if map_info in map_info_set[(source, cid)]:
                    error_dict[(source, "经纬度重复")] += 1  # duplicate coordinates
                    if error_dict[(source, "经纬度重复")] == 1:
                        # On the first duplicate, add one more so the first
                        # occurrence is counted as well; later hits count
                        # normally
                        error_dict[(source, "经纬度重复")] += 1
                    duplicate_map_info_set[source].add(map_info)
                    if address:
                        tasks_data.append(
                            supplement_field(cand_table, sid, source, address))
                    right = False
                # map_info is valid at this point; record it
                map_info_set[(source, cid)].add(map_info)
                # When the city's coordinates are valid, compute the distances
                city_map_info = city_map_info_dict.get(cid, None)
                if map_info_legal(city_map_info):
                    cand_dist = getDistByMap(city_map_info, map_info)
                    # distance with the lng/lat fields swapped
                    cand_reverse_dist = getDistByMap(
                        city_map_info,
                        ','.join(map_info.strip(',').split(',')[::-1]))
                    if cand_dist and cand_reverse_dist:
                        if cand_dist >= filter_dist:
                            right = False
                            # coordinates too far from the owning city
                            error_dict[(source, '坐标与所属城市距离过远')] += 1
                            if cand_reverse_dist <= filter_dist:
                                # swapped coordinates would fit the city
                                error_dict[(source, "距离过远坐标翻转后属于所属城市")] += 1
                            else:
                                distance_set.add(sid)
                            if address:
                                tasks_data.append(
                                    supplement_field(cand_table, sid, source,
                                                     address))
            try:
                grade_f = float(grade)
                if grade_f > 10:
                    # abnormal static grade (above 10)
                    error_dict[(source, '静态评分异常(评分高于10分)')] += 1
                    right = False
            except Exception:
                pass
            success += 1 if right else 0
        # Add len(duplicate_map_info_set) at the end so the first occurrence
        # of each duplicated map_info is not lost
        # todo duplicate map info fix source
        # error_dict[(source, '经纬度重复')] += len(duplicate_map_info_set)
        # Generate duplicate-coordinate tasks; currently qyer only
        if task_type in ('total', 'attr'):
            for source, each_duplicate_map_info in duplicate_map_info_set.items():
                # get each detail table name
                detail_table = '_'.join(
                    ['detail', task_type, source, task_tag])
                insert_error_map_info_task(
                    duplicate_map_info_set=each_duplicate_map_info,
                    task_table=detail_table,
                    task_type=task_type)
        logger.debug(
            "[table detected: {}][total: {}][error: {}][succeed: {}]".format(
                cand_table, total, error_count, success))
        for each_source, _c in source_count.items():
            report_data.append({
                'tag': task_tag,
                'source': each_source,
                'type': task_type,
                'error_type': '全量',  # full row count
                'num': _c,
                'date': datetime.datetime.strftime(dt, '%Y%m%d'),
                'hour': datetime.datetime.strftime(dt, '%H'),
                'datetime': datetime.datetime.strftime(dt, '%Y%m%d%H00')
            })
        for s_err_type, num in error_dict.items():
            _source, _err_type = s_err_type
            report_data.append({
                'tag': task_tag,
                'source': _source,
                'type': task_type,
                'error_type': _err_type,
                'num': num,
                'date': datetime.datetime.strftime(dt, '%Y%m%d'),
                'hour': datetime.datetime.strftime(dt, '%H'),
                'datetime': datetime.datetime.strftime(dt, '%Y%m%d%H00')
            })
    db = dataset.connect(
        'mysql+pymysql://mioji_admin:[email protected]/Report?charset=utf8')
    crawl_report_table = db['serviceplatform_crawl_report_summary']
    for each_data in report_data:
        try:
            crawl_report_table.upsert(
                each_data,
                keys=['tag', 'source', 'type', 'error_type', 'date'],
                ensure=None)
            logger.debug("[table_data: {}]".format(each_data))
        except Exception as exc:
            logger.exception(msg="[update report table error]", exc_info=exc)
    db.commit()
    logger.debug('Done')
    return report_data, tasks_data
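# map_info_legal(), getDistByMap() and filter_dist are helpers defined
# elsewhere in the project. A plausible sketch of what a coordinate validator
# like map_info_legal() checks (hypothetical, for illustration only): a
# non-empty "lng,lat" string whose parts parse as floats within range.
def _map_info_legal_sketch(map_info):
    if not map_info or map_info == 'NULL':
        return False
    parts = map_info.strip(',').split(',')
    if len(parts) != 2:
        return False
    try:
        lng, lat = float(parts[0]), float(parts[1])
    except ValueError:
        return False
    return -180 <= lng <= 180 and -90 <= lat <= 90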
def get_img(s_sid_set, poi_type, old_img='', old_first_img='', is_official=False):
    """
    Get img str by using source and sid set
    :param is_official: is official or not
    :param old_img: old img list, all img split with |
    :param old_first_img: old first img, use old sorting
    :param poi_type: poi type, e.g. attr, rest, shop
    :param s_sid_set: source and sid set
    :return: tuple (new_img str, new_first_img str)
    """
    if not s_sid_set or is_official:
        return old_img, old_first_img
    conn = base_data_final_pool.connection()
    cursor = conn.cursor()
    query_sql = '''SELECT file_name, bucket_name, pic_size, pic_md5, `use`, info, url
    FROM poi_images
    WHERE (`source`, `sid`) IN ({});'''.format(
        ','.join(map(lambda x: "('{}', '{}')".format(x[0], x[1]), s_sid_set)))
    _res = cursor.execute(query_sql)
    if not _res:
        return old_img, old_first_img
    max_size = -1
    max_size_img = ''
    file2phash = dict()
    pic_total = set()
    p_hash_dict = defaultdict(list)
    for file_name, bucket_name, pic_size, pic_md5, use, info, url in cursor.fetchall():
        if poi_type == 'shop' and bucket_name not in (
                'attr_bucket', 'shop_bucket', 'mioji-attr', 'mioji-shop'):
            # shopping img upload to mioji-attr or mioji-shop
            continue
        elif poi_type != 'shop' and poi_type not in bucket_name:
            # rest img upload to mioji-rest
            # attr img upload to mioji-attr
            continue
        # Track every candidate file name; used later to tell whether a
        # filtered image was added manually
        pic_total.add(file_name)
        # Broken image, must be filtered
        if r.get('error_img_{}'.format(file_name)) == '1':
            continue
        # pHash filter
        if url in ('', 'NULL', None):
            # Image tagged by product staff; never filter, use directly
            file2phash[file_name] = 'USE'
            p_hash_dict["USE"].append((file_name, -1))
            continue
        elif not info:
            # Crawled image without a pHash; filter it out
            continue
        else:
            p_hash = json.loads(info)['p_hash']
        # img can be used
        # An empty pic_size usually means a manually tagged image
        if not is_legal(pic_size):
            if file_name not in old_img:
                continue
            elif str(use) != '1':
                continue
            else:
                # Old, manually tagged image; must not be filtered
                file2phash[file_name] = 'USE'
                p_hash_dict["USE"].append((file_name, -1))
                continue
        # get max size
        h, w = literal_eval(pic_size)
        h = int(h)
        w = int(w)
        size = h * w
        if size > max_size:
            max_size = size
            max_size_img = file_name
        # use 1
        if str(use) == '1':
            # Filter rules
            # pixel
            if size < 200000:
                continue
            # scale
            # min scale
            scale = w / h
            if scale < 0.9:
                if w < 500:
                    continue
            # max scale
            if scale > 2.5:
                continue
            p_hash_dict[p_hash].append((file_name, size))
    cursor.close()
    conn.close()
    if poi_type in ('attr', 'shop'):
        # Fetch face-detection results
        _conn = poi_face_detect_pool.connection()
        _cursor = _conn.cursor()
        query_sql = '''SELECT pic_name
        FROM PoiPictureInformation
        WHERE is_available = 0
          AND poi_id IN ({});'''.format(
            ', '.join(
                map(
                    lambda x: "'{}'".format(
                        '###'.join(x) if x[0] != 'online' else x[1]),
                    s_sid_set)))
        _cursor.execute(query_sql)
        face_detected = set(x[0].split('/')[-1] for x in _cursor.fetchall())
        _cursor.close()
        _conn.close()
    else:
        face_detected = set()
    # Manually added images
    human_pic = p_hash_dict["USE"]
    # Machine images: keep the largest image per pHash
    final_pic_dict = {}
    for k, v in p_hash_dict.items():
        pic_res = sorted(v, key=lambda x: x[1], reverse=True)
        if pic_res:
            final_pic_dict[pic_res[0][0]] = k
    old_img_list = old_img.split('|')
    new_img_list = []
    # Add images in the old display order, de-duplicating as we go
    for _old_file_name in old_img_list:
        # Manually added images have no md5, so they bypass the md5 filters
        if (_old_file_name not in pic_total) or (_old_file_name in human_pic):
            if is_legal(_old_file_name):
                if _old_file_name not in face_detected:
                    if _old_file_name not in new_img_list:
                        # Manually added images bypass every filter rule
                        new_img_list.append(_old_file_name)
        elif _old_file_name in final_pic_dict:
            if is_legal(_old_file_name):
                # Face-detection filter
                if _old_file_name not in face_detected:
                    if _old_file_name not in new_img_list:
                        new_img_list.append(_old_file_name)
    # Append images the old list did not contain, keeping their order
    for k, v in final_pic_dict.items():
        if is_legal(v):
            # Face-detection filter
            if k not in face_detected:
                if v not in new_img_list:
                    new_img_list.append(k)
    if old_first_img:
        if old_first_img in new_img_list:
            # The old first image is still online; keep it first
            new_first_img = old_first_img
            # Remove first_img from the new list, then re-insert it at the head
            new_img_list.remove(old_first_img)
            new_img_list.insert(0, old_first_img)
        else:
            # Otherwise use the new first image
            if new_img_list:
                new_first_img = new_img_list[0]
            else:
                new_first_img = ''
    else:
        if new_img_list:
            new_first_img = new_img_list[0]
        else:
            new_first_img = ''
    if new_first_img == '':
        new_img = new_first_img = max_size_img
    else:
        # De-duplicate the image sequence without changing its order
        final_new_img_list = list(set(new_img_list))
        final_new_img_list.sort(key=new_img_list.index)
        new_img = '|'.join(filter(lambda x: is_legal(x), final_new_img_list))
    return new_img, new_first_img
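# The final de-duplication in get_img() keeps the original order by sorting
# the set() result on list.index. A minimal illustration:
#
#   new_img_list = ['a.jpg', 'b.jpg', 'a.jpg', 'c.jpg']
#   final_new_img_list = list(set(new_img_list))
#   final_new_img_list.sort(key=new_img_list.index)
#   # final_new_img_list == ['a.jpg', 'b.jpg', 'c.jpg']
#
# On Python 3.7+ an equivalent single pass would be
# list(dict.fromkeys(new_img_list)), since dicts preserve insertion order.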