Example 1
def update_img_info(img_name):
    # Update the image's info field
    conn = base_data_final_pool.connection()
    cursor = conn.cursor()
    cursor.execute('''SELECT `info`
FROM poi_images
WHERE file_name = %s;''', (img_name,))
    info = cursor.fetchone()[0]
    cursor.close()
    conn.close()

    if info:
        _i = json.loads(info)
    else:
        _i = {}

    _i['delete_reason'] = "图片质量低,手动取消"  # "low image quality, manually taken down"

    conn = base_data_final_pool.connection()
    cursor = conn.cursor()
    cursor.execute('''UPDATE poi_images
SET `use` = 0, info = %s
WHERE file_name = %s;''', (json.dumps(_i), img_name,))
    conn.commit()
    cursor.close()
    conn.close()

    logger.info("[new info][info: {}]".format(json.dumps(_i)))
Example 2
def get_file(sid):
    sql = '''SELECT
  file_name,
  pic_size,
  info
FROM poi_images
WHERE source = 'qyer' AND sid = '{}' AND `use` = 1;'''.format(sid)
    conn = base_data_final_pool.connection()
    cursor = conn.cursor(cursor=pymysql.cursors.DictCursor)
    cursor.execute(sql)
    file_pic_size = {}
    for line in cursor.fetchall():
        w, h = eval(line['pic_size'])  # pic_size is a "(w, h)" string; ast.literal_eval would be safer
        _j_data = json.loads(line['info'])
        _j_data['down_reason'] = "图片数多余 90 张,下掉一部分"  # "more than 90 images; take some down"
        file_pic_size[(json.dumps(_j_data),
                       line['file_name'])] = int(w) * int(h)
    cursor.close()
    conn.close()

    down_imgs = list(
        map(lambda x: x[0],
            sorted(file_pic_size.items(), key=lambda x: x[1],
                   reverse=True)))[90:]
    update_sql(sid=sid, data=down_imgs)
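get_file keys a dict by (new info JSON, file name) and values it by pixel area, sorts by area descending, and marks everything past the first 90 for take-down. A minimal sketch of that sort-and-slice pattern on made-up data:

# Hedged sketch: keep the 90 largest images, take down the rest.
areas = {'a.jpg': 4000000, 'b.jpg': 9000000, 'c.jpg': 1000000}
ranked = sorted(areas.items(), key=lambda kv: kv[1], reverse=True)
to_take_down = [name for name, _area in ranked[90:]]  # empty here; real POIs have > 90 images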
Example 3
def init_all_seek_dict():
    local_conn = base_data_final_pool.connection()
    local_cursor = local_conn.cursor()
    local_cursor.execute('''SELECT *
FROM data_insert_seek;''')
    global all_seek_dict
    # Cache table_name -> last update time from data_insert_seek
    all_seek_dict = {k: v for k, v in local_cursor.fetchall()}
    local_cursor.close()
    local_conn.close()
Example 4
def insert_data(data):
    update_sql = '''INSERT IGNORE INTO poi_images (file_name, source, sid, url, pic_size, bucket_name, url_md5, pic_md5, `use`)
VALUES (%(file_name)s, 'online', %(sid)s, %(url)s, %(pic_size)s, %(bucket_name)s, %(url_md5)s, %(pic_md5)s, %(use)s);'''
    conn = base_data_final_pool.connection()
    cursor = conn.cursor()
    _res = cursor.executemany(update_sql, data)
    conn.commit()
    cursor.close()
    conn.close()
    logger.debug("[move data][total: {}][execute: {}]".format(len(data), _res))
Example 5
def create_table():
    final_conn = base_data_final_pool.connection()
    final_cursor = final_conn.cursor()
    for k, v in final_table.items():
        real_path = os.path.split(os.path.realpath(__file__))[0]
        sql_path = os.path.join(real_path, 'sql', v)
        with open(sql_path) as f:
            final_sql = f.read()
        table_name = "{}_final".format(k)
        final_cursor.execute(final_sql % (table_name, ))
        logger.debug('[create table][name: {}]'.format(table_name))
    final_cursor.close()
    final_conn.close()
Example 6
def update_seek_table(table_name, update_time):
    local_conn = base_data_final_pool.connection()
    local_cursor = local_conn.cursor()
    local_cursor.execute('''REPLACE INTO data_insert_seek VALUES (%s, %s);''',
                         (table_name, update_time))
    logger.debug("[update seek table][table_name: {}][update_time: {}]".format(
        table_name, update_time))
    global all_seek_dict
    all_seek_dict[table_name] = update_time
    local_conn.commit()
    local_cursor.close()
    local_conn.close()
Example 7
def update_sql(sid, data):
    sql = '''UPDATE poi_images
SET `use` = 0, info = %s
WHERE source = 'qyer' AND sid = '{}' AND file_name = %s;'''.format(sid)
    conn = base_data_final_pool.connection()
    cursor = conn.cursor()
    _res = cursor.executemany(sql, data)
    conn.commit()
    cursor.close()
    conn.close()
    logger.info("[sid: {}][total: {}][execute: {}]".format(
        sid, len(data), _res))
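Note the mixed templating: sid is interpolated with str.format while info and file_name are bound as %s parameters, so data must be a sequence of (info_json, file_name) pairs — exactly the dict keys get_file builds in Example 2. An illustrative call (values made up):

rows = [('{"down_reason": "..."}', 'abc123.jpg'),
        ('{"down_reason": "..."}', 'def456.jpg')]
# update_sql(sid='10086', data=rows)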
Example 8
def insert_db(table_name, data):
    global offset  # module-level scan offset, used here only for logging
    conn = base_data_final_pool.connection()
    cursor = conn.cursor()
    insert_into = "REPLACE INTO first_images (`source`, `source_id`, `first_img`) VALUES (%s, %s, %s)"
    res = cursor.executemany(insert_into, data)
    conn.commit()
    cursor.close()
    conn.close()
    logger.debug(
        "[insert data][table_name: {}][offset: {}][total: {}][insert: {}]".
        format(table_name, offset, len(data), res))
Example 9
def update_db(data):
    # '疑似头像,下线' = "suspected avatar; taken offline". The CASE WHEN guard seeds an
    # empty JSON object when info is NULL, since JSON_SET(NULL, ...) would return NULL.
    sql = '''UPDATE poi_images
SET `use` = 0, info = JSON_SET(CASE WHEN info IS NULL THEN '{}' ELSE info END,
                               '$.down_reason', '疑似头像,下线')
WHERE file_name = %s;'''
    conn = base_data_final_pool.connection()
    cursor = conn.cursor()
    _res = cursor.executemany(sql, data)
    conn.commit()
    cursor.close()
    conn.close()
    print("[total: {}][insert: {}]".format(len(data), _res))
Example 10
def create_table(image_type):
    final_conn = base_data_final_pool.connection()
    final_cursor = final_conn.cursor()
    sql_name = final_table.get(image_type, None)
    if sql_name is None:
        raise TypeError("[Unknown View Type: {}]".format(image_type))
    with open('/search/hourong/PycharmProjects/PoiCommonScript/serviceplatform_data/sql/{}'
              .format(sql_name)) as f:
        final_sql = f.read()
    table_name = "{}_images".format(image_type_dict[image_type])
    final_cursor.execute(final_sql % (table_name, ))
    logger.debug('[create table][name: {}]'.format(table_name))
    final_cursor.close()
    final_conn.close()
Example 11
def create_table(image_type):
    final_conn = base_data_final_pool.connection()
    final_cursor = final_conn.cursor()
    sql_name = final_table.get(image_type, None)
    if sql_name is None:
        raise TypeError("[Unknown View Type: {}]".format(image_type))

    real_path = os.path.split(os.path.realpath(__file__))[0]
    sql_path = os.path.join(real_path, 'sql', sql_name)
    with open(sql_path) as f:
        final_sql = f.read()
    table_name = "{}_images".format(image_type_dict[image_type])
    final_cursor.execute(final_sql % (table_name, ))
    logger.debug('[create table][name: {}]'.format(table_name))
    final_cursor.close()
    final_conn.close()
Example 12
def update_sql(data):
    # '扫库 md5 不对应,过滤' = "md5 mismatch found while scanning the table; filtered"
    sql = '''UPDATE hotel_images
SET info = JSON_SET(CASE WHEN info IS NULL THEN '{{}}' ELSE info END,
                    '$.down_reason', '扫库 md5 不对应,过滤')
WHERE pic_md5 IN ({});'''.format(','.join(map(lambda x: "'{}'".format(x), data)))
    conn = base_data_final_pool.connection()
    cursor = conn.cursor()
    try:
        _res = cursor.execute(sql)
    except Exception:
        print(sql)
        raise  # re-raise the original error instead of a bare Exception()
    conn.commit()
    cursor.close()
    conn.close()
    logger.info("[total: {}][execute: {}]".format(len(data), _res))
Example 13
def load_data(limit=400):
    local_conn = base_data_final_pool.connection()
    local_cursor = local_conn.cursor()
    local_cursor.execute('''SELECT TABLE_NAME
        FROM information_schema.TABLES
        WHERE TABLE_SCHEMA = '{}';'''.format(final_database))

    # Process tables strictly in tag order
    table_list = list(
        sorted(filter(lambda x: len(x.split('_')) in (3, 4),
                      map(lambda x: x[0], local_cursor.fetchall())),
               key=lambda x: x.split('_')[-1]))
    local_cursor.close()

    for each_table in table_list:
        each_table_key_list = each_table.split('_')
        if len(each_table_key_list) == 3:
            if each_table_key_list[0] not in ('attr', 'rest', 'total',
                                              'hotel'):
                logger.debug('[skip table][name: {}]'.format(each_table))
                continue
            if each_table_key_list[-1] < '20170929a':
                logger.debug('[skip table][name: {}]'.format(each_table))
                continue

            # Get the type and tag from the table name
            try:
                _type, _, _tag = each_table.split('_')
            except Exception:
                logger.error('[Unknown View Final: {}]'.format(each_table))
                continue

            # Build the destination table name
            to_table_name = "{}_final".format(_type)
            _type = "{}_detail".format(_type)
        elif len(each_table_key_list) == 4:
            if each_table_key_list[1] != 'images':
                logger.debug('[skip table][name: {}]'.format(each_table))
                continue
            try:
                _type, _, _, _tag = each_table.split('_')
            except Exception:
                logger.error('[Unknown View Final: {}]'.format(each_table))
                continue
            # Build the destination table name
            if _type == 'hotel':
                to_table_name = "hotel_images"
            elif _type == 'poi':
                to_table_name = "poi_images"
            else:
                raise TypeError('Unknown Type: {}'.format(_type))

            _type = "{}_images".format(_type)
        else:
            continue

        u_time = get_seek(table_name=each_table)
        start = time.time()

        # Start merging data
        local_cursor = local_conn.cursor()
        update_time_sql = '''SELECT {0}
        FROM {1}
        WHERE {0} >= '{2}'
        ORDER BY {0}
        LIMIT {3};'''.format(time_key[_type], each_table, u_time, limit)
        line_count = local_cursor.execute(update_time_sql)

        logger.debug('sql: %s\nselect_count: %s' %
                     (update_time_sql, str(local_cursor.rowcount)))

        if line_count == 0:
            # No rows left; skip the remaining processing
            continue
        # get final update time for inserting db next time
        final_update_time = max(map(lambda x: x[0], local_cursor.fetchall()))
        logger.debug('each_table: %s  final_update_time: %s' %
                     (each_table, str(final_update_time)))
        local_cursor.close()

        # replace into final data
        local_cursor = local_conn.cursor()
        query_sql_list = []
        if to_table_name == 'hotel_images':
            query_sql = '''REPLACE INTO {0} (source, source_id, pic_url, pic_md5, part, hotel_id, status, update_date, size, flag, file_md5)
  SELECT
    source,
    source_id,
    pic_url,
    pic_md5,
    part,
    hotel_id,
    status,
    update_date,
    size,
    flag,
    file_md5
  FROM
    {1}
  WHERE update_date >= '{2}'
  ORDER BY update_date
  LIMIT {3};'''.format(to_table_name, each_table, u_time, limit)
            query_sql_list.append(query_sql)
        elif to_table_name == 'poi_images':
            query_sql = '''REPLACE INTO {0}
            (file_name, source, sid, url, pic_size, bucket_name, url_md5, pic_md5, `use`, part, date)
              SELECT
                file_name,
                source,
                sid,
                url,
                pic_size,
                bucket_name,
                url_md5,
                pic_md5,
                `use`,
                part,
                date
              FROM
                {1}
              WHERE date >= '{2}'
              ORDER BY date
              LIMIT {3};'''.format(to_table_name, each_table, u_time, limit)
            query_sql_list.append(query_sql)
        elif u_time != '':
            query_sql = '''REPLACE INTO {1} SELECT *
            FROM {2}
            WHERE {0} >= '{3}'
            ORDER BY {0}
            LIMIT {4};'''.format(time_key[_type], to_table_name, each_table,
                                 u_time, limit)
            query_sql_list.append(query_sql)
            if to_table_name == 'attr_final':
                query_sql = '''REPLACE INTO poi_merge.attr SELECT *
                            FROM {2}
                            WHERE {0} >= '{3}'
                            ORDER BY {0}
                            LIMIT {4};'''.format(time_key[_type],
                                                 to_table_name, each_table,
                                                 u_time, limit)
                query_sql_list.append(query_sql)
            elif to_table_name == 'total_final':
                query_sql = '''REPLACE INTO poi_merge.attr
  SELECT
    id,
    source,
    name,
    name_en,
    alias,
    map_info,
    city_id,
    source_city_id,
    address,
    star,
    recommend_lv,
    pv,
    plantocounts,
    beentocounts,
    overall_rank,
    ranking,
    grade,
    grade_distrib,
    commentcounts,
    tips,
    tagid,
    related_pois,
    nomissed,
    keyword,
    cateid,
    url,
    phone,
    site,
    imgurl,
    commenturl,
    introduction,
    '',
    opentime,
    price,
    recommended_time,
    wayto,
    0,
    0,
    insert_time
  FROM {2}
  WHERE {0} > '{3}'
  ORDER BY {0}
  LIMIT {4};'''.format(time_key[_type], to_table_name, each_table, u_time,
                       limit)
                query_sql_list.append(query_sql)
                # elif to_table_name == 'rest_final':
                #     query_sql = '''REPLACE INTO poi_merge.rest SELECT *
                #                 FROM {2}
                #                 WHERE {0} >= '{3}'
                #                 ORDER BY {0}
                #                 LIMIT {4};'''.format(time_key[_type], to_table_name, each_table, u_time, limit)

        else:
            raise TypeError(
                "Unknown Type [u_time: {}][to_table_name: {}]".format(
                    u_time, to_table_name))

        for _each_query_sql in query_sql_list:
            is_replace = True
            try:
                replace_count = local_cursor.execute(_each_query_sql)
            except pymysql.err.IntegrityError as integrity_err:
                _args = integrity_err.args
                if 'Duplicate entry' in _args[1]:
                    # On a duplicate entry, retry with INSERT IGNORE (REPLACE INTO
                    # raises a duplicate error here; root cause unknown)
                    is_replace = False
                    _each_query_sql = _each_query_sql.replace(
                        'REPLACE INTO', 'INSERT IGNORE INTO')
                    replace_count = local_cursor.execute(_each_query_sql)
                else:
                    logger.exception(
                        msg="[table_name: {}][error_sql: {}]".format(
                            each_table, _each_query_sql),
                        exc_info=integrity_err)
                    continue
            except Exception as e:
                logger.exception(msg="[table_name: {}][error_sql: {}]".format(
                    each_table, _each_query_sql),
                                 exc_info=e)
                continue
            logger.debug(
                "[insert data][to: {}][from: {}][update_time: {}][final_update_time: {}][limit: {}][line_count: {}]["
                "{}: {}][takes: {}]".format(
                    to_table_name if 'poi_merge' not in _each_query_sql else
                    'poi_merge.attr' if 'poi_merge.attr' in _each_query_sql
                    else 'poi_merge.rest', each_table, u_time,
                    final_update_time, limit, line_count,
                    'replace_count' if is_replace else 'insert_ignore_count',
                    replace_count,
                    time.time() - start))
        local_conn.commit()
        local_cursor.close()

        update_seek_table(each_table, final_update_time)
    local_conn.close()
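The IntegrityError handler above is the heart of the load's retry logic: a REPLACE INTO that unexpectedly reports a duplicate entry is re-run as INSERT IGNORE. The pattern in isolation (a hedged sketch; execute_with_fallback is not in the original code):

import pymysql

def execute_with_fallback(cursor, sql):
    # Returns (affected_rows, used_replace); retries REPLACE INTO as INSERT IGNORE
    # when MySQL raises a duplicate-entry IntegrityError.
    try:
        return cursor.execute(sql), True
    except pymysql.err.IntegrityError as err:
        if 'Duplicate entry' in err.args[1]:
            fallback_sql = sql.replace('REPLACE INTO', 'INSERT IGNORE INTO')
            return cursor.execute(fallback_sql), False
        raise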
Example 14
def detectOriData(need_detection_table_name=None):
    city_map_info_dict = get_city_map()
    dt = datetime.datetime.now()

    # Fetch the table list from base data final
    _conn = base_data_final_pool.connection()
    _cursor = _conn.cursor()
    _cursor.execute('''SELECT TABLE_NAME
FROM information_schema.TABLES
WHERE TABLE_SCHEMA = 'BaseDataFinal';''')
    # Each table entry carries its database connection info
    table_list = list(
        map(
            lambda x: (x[0], {
                'host': ori_ip,
                'user': ori_user,
                'passwd': ori_password,
                'db': ori_db_name
            }, 'ota'), _cursor.fetchall()))
    _cursor.close()
    _conn.close()

    # hotel-api stats collection is paused for now
    #     _conn = hotel_api_pool.connection()
    #     _cursor = _conn.cursor()
    #     _cursor.execute('''SELECT TABLE_NAME
    # FROM information_schema.TABLES
    # WHERE TABLE_SCHEMA = 'hotel_api';''')
    #     for i in groupby(
    #             list(map(lambda x: x[0], _cursor.fetchall())),
    #             lambda x: '_'.join(x.split('_')[:2])
    #     ):
    #         table_names = sorted(filter(lambda x: len(x.split('_')) >= 3 and 'bak' not in x, i[1]), reverse=True)[:7]
    #         if not table_names:
    #             continue
    #
    #         for t_name in table_names:
    #             table_list.append((t_name, hotel_api_config, 'api'))
    #
    #     _cursor.close()
    #     _conn.close()

    report_data = []
    tasks_data = []
    for cand_table, conn_config, table_type in table_list:
        source_count = defaultdict(int)
        error_dict = defaultdict(int)

        if need_detection_table_name is not None and cand_table != need_detection_table_name:
            print('=====  skipping table {}'.format(cand_table))
            continue

        if table_type == 'ota':
            # For 'ota' tables, fields are looked up as follows
            cand_list = cand_table.split('_')
            # Validate against BaseDataFinal; skip tables whose name does not split
            # into exactly 3 parts (names look like attr_final_20170929a)
            if len(cand_list) != 3:
                continue

            task_type, _, task_tag = cand_list

            # Skip crawl task types other than these four
            if task_type not in ('attr', 'rest', 'hotel', 'total'):
                continue

            logger.debug(('[Begin Detect][table: {}]'.format(cand_table)))

            if task_type == 'hotel':
                # Hotel type
                sql = '''SELECT
          hotel_name,
          hotel_name_en,
          source,
          source_id,
          city_id,
          map_info,
          grade
        FROM {};'''.format(cand_table)

            elif task_type in ('attr', 'shop', 'rest', 'total'):
                # Attractions, shopping, restaurants (currently daodao), and all POIs (qyer)
                sql = '''SELECT
          name,
          name_en,
          source,
          id,
          city_id,
          map_info,
          grade,
          address
        FROM {};'''.format(cand_table)
            else:
                # Unknown type; skip for now
                continue
        elif table_type == 'api':
            _test_list = cand_table.split('_')
            if _test_list[0] == 'hotelinfo' and len(_test_list) >= 5:
                if not (_test_list[2].isdigit() and _test_list[3].isdigit()
                        and _test_list[4].isdigit()):
                    continue
                sql = '''SELECT
                          hotel_name,
                          hotel_name_en,
                          source,
                          source_id,
                          city_id,
                          map_info,
                          grade
                        FROM {};'''.format(cand_table)
                task_type = 'hotel_api'
                task_tag = "{}-{}-{}".format(_test_list[2], _test_list[3],
                                             _test_list[4])
            else:
                logger.info("[don't known this table][table_name: {}]".format(
                    cand_table))
                continue
        else:
            logger.info("[unknown table type][type: {}]".format(table_type))
            continue

        # Fetch the data iteratively
        datas = MysqlSource(db_config=conn_config,
                            table_or_query=sql,
                            size=10000,
                            is_table=False)

        # Seen-coordinate sets, used to detect duplicates
        map_info_set = defaultdict(set)

        # Sets of duplicated coordinates, used to extract the duplicates; adding
        # len(duplicate_map_info_set) at the end keeps the first occurrence of each
        # map_info from being lost in the returned counts
        duplicate_map_info_set = defaultdict(set)

        total = 0
        success = 0
        for data in datas:
            # Whether this row is valid; defaults to True, flipped to False below
            right = True

            total += 1

            if total % 10000 == 0:
                logger.debug(
                    "[table data detect][table: {}][count: {}]".format(
                        cand_table, total))
            word_list = []

            for word in data:
                if word is None:
                    word_list.append('')
                else:
                    word_list.append(word)

            name = word_list[0]
            name_en = word_list[1]
            source = word_list[2]
            sid = word_list[3]
            try:
                cid = str(int(word_list[4]))
            except Exception:
                cid = word_list[4]
            map_info = word_list[5]
            grade = word_list[6]
            address = word_list[7]

            # Update the per-source row count for this table
            source_count[source] += 1

            # # todo: read the full list of crawl sources from the database
            # A wrong source is a severe error caused by a crawler bug and should
            # trigger an alert email
            # if source not in ('agoda', 'booking', 'ctrip', 'elong ', 'expedia', 'hotels'):
            #     error_dict['数据源错误'] += 1
            #     right = False

            if '' == name and '' == name_en:
                error_dict[(source, '无 name、name_en')] += 1  # missing both names
                right = False

            if '' != name and '' != name_en and is_contain_ch(name_en):
                if is_full_contain_ch(name_en):
                    if not is_contain_ch(name):
                        error_dict[(source, "中英文名字相反")] += 1
                        right = False

            if name.strip().lower() != name_en.strip().lower() \
                    and is_contain_ch(name) \
                    and not is_contain_ch(name_en) \
                    and len(name_en.split(' ')) >= 2 \
                    and name_en in name:
                error_dict[(source, "中文名中含有英文名")] += 1
                right = False

            if 'NULL' == map_info:
                error_dict[(source, '坐标错误(NULL)')] += 1  # coordinates are the string 'NULL'
                right = False
            elif not map_info_legal(map_info):
                error_dict[(source, '坐标错误(坐标为空或坐标格式错误,除去NULL)')] += 1  # empty or malformed coordinates
                if address:
                    tasks_data.append(
                        supplement_field(cand_table, sid, source, address))
                right = False
            else:
                # Duplicate-coordinate check
                if map_info in map_info_set[(source, cid)]:
                    error_dict[(source, "经纬度重复")] += 1
                    if error_dict[(source, "经纬度重复")] == 1:
                        # 当此经纬度出现 1 次时,经纬度重复加 2 ,之后正常
                        error_dict[(source, "经纬度重复")] += 1
                    duplicate_map_info_set[source].add(map_info)
                    if address:
                        tasks_data.append(
                            supplement_field(cand_table, sid, source, address))
                    right = False

                # map_info is valid here; record it in the seen set
                map_info_set[(source, cid)].add(map_info)

                # When the city's coordinates are valid, compute the distances
                city_map_info = city_map_info_dict.get(cid, None)
                if map_info_legal(city_map_info):

                    cand_dist = getDistByMap(city_map_info, map_info)
                    # Swap "lng,lat" back to "lat,lng" before measuring again
                    cand_reverse_dist = getDistByMap(
                        city_map_info, ','.join(map_info.split(',')[::-1]))

                    if cand_dist and cand_reverse_dist:
                        if cand_dist >= filter_dist:
                            right = False
                            error_dict[(source, '坐标与所属城市距离过远')] += 1  # too far from its city
                            if cand_reverse_dist <= filter_dist:
                                error_dict[(source, "距离过远坐标翻转后属于所属城市")] += 1
                            else:
                                distance_set.add(sid)
                            if address:
                                tasks_data.append(
                                    supplement_field(cand_table, sid, source,
                                                     address))

            try:
                grade_f = float(grade)
                if grade_f > 10:
                    error_dict[(source, '静态评分异常(评分高于10分)')] += 1  # static grade above 10
                    right = False
            except (TypeError, ValueError):
                # grade is missing or non-numeric
                pass

            success += 1 if right else 0

        # Adding len(duplicate_map_info_set) at the end would keep the first occurrence
        # of each map_info from being lost in the duplicate count
        # todo duplicate map info fix source
        # error_dict[(source, '经纬度重复')] += len(duplicate_map_info_set)

        # Build duplicate-coordinate tasks; currently qyer only
        if task_type in ('total', 'attr'):
            for source, each_duplicate_map_info in duplicate_map_info_set.items():
                # get each detail table name
                detail_table = '_'.join(
                    ['detail', task_type, source, task_tag])
                insert_error_map_info_task(
                    duplicate_map_info_set=each_duplicate_map_info,
                    task_table=detail_table,
                    task_type=task_type)

        logger.debug(
            "[table detected: {}][total: {}][error: {}][succeed: {}]".format(
                cand_table, total, dict(error_dict), success))

        for each_source, _c in source_count.items():
            report_data.append({
                'tag': task_tag,
                'source': each_source,
                'type': task_type,
                'error_type': '全量',  # '全量' = full/total row count
                'num': _c,
                'date': datetime.datetime.strftime(dt, '%Y%m%d'),
                'hour': datetime.datetime.strftime(dt, '%H'),
                'datetime': datetime.datetime.strftime(dt, '%Y%m%d%H00')
            })

        for s_err_type, num in error_dict.items():
            _source, _err_type = s_err_type
            report_data.append({
                'tag': task_tag,
                'source': _source,
                'type': task_type,
                'error_type': _err_type,
                'num': num,
                'date': datetime.datetime.strftime(dt, '%Y%m%d'),
                'hour': datetime.datetime.strftime(dt, '%H'),
                'datetime': datetime.datetime.strftime(dt, '%Y%m%d%H00')
            })

    db = dataset.connect(
        'mysql+pymysql://mioji_admin:[email protected]/Report?charset=utf8'
    )
    crawl_report_table = db['serviceplatform_crawl_report_summary']
    for each_data in report_data:
        try:
            crawl_report_table.upsert(
                each_data,
                keys=['tag', 'source', 'type', 'error_type', 'date'],
                ensure=None)
            logger.debug("[table_data: {}]".format(each_data))
        except Exception as exc:
            logger.exception(msg="[update report table error]", exc_info=exc)
    db.commit()
    logger.debug('Done')
    return report_data, tasks_data
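A point is flagged as having swapped coordinates when it sits far from its city but lands inside it once "lng,lat" is flipped back to "lat,lng". A self-contained sketch of that test (haversine_km stands in for getDistByMap, and the threshold is illustrative):

import math

def haversine_km(a, b):
    # a, b are "lat,lng" strings; returns the great-circle distance in km.
    lat1, lng1 = map(float, a.split(','))
    lat2, lng2 = map(float, b.split(','))
    dp = math.radians(lat2 - lat1)
    dl = math.radians(lng2 - lng1)
    h = (math.sin(dp / 2) ** 2 +
         math.cos(math.radians(lat1)) * math.cos(math.radians(lat2)) * math.sin(dl / 2) ** 2)
    return 2 * 6371 * math.asin(math.sqrt(h))

city = '39.9042,116.4074'   # Beijing
poi = '116.4074,39.9042'    # stored as "lng,lat" by mistake
filter_dist = 200           # km, illustrative threshold
swapped = ','.join(poi.split(',')[::-1])
if haversine_km(city, poi) >= filter_dist and haversine_km(city, swapped) <= filter_dist:
    print('coordinates look swapped')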
Example 15
def get_img(s_sid_set, poi_type, old_img='', old_first_img='', is_official=False):
    """
    Get img str by using source and sid set
    :param is_official: is official or not
    :param old_img: old img list, all img split with |
    :param old_first_img:  old first img, use old sorting
    :param poi_type: poi type, Eg: attr rest shop
    :param s_sid_set: source and sid set
    :return: tuple (new_img str, new_first_img str)
    """
    if not s_sid_set or is_official:
        return old_img, old_first_img

    conn = base_data_final_pool.connection()
    cursor = conn.cursor()
    query_sql = '''SELECT
  file_name,
  bucket_name,
  pic_size,
  pic_md5,
  `use`,
  info,
  url
FROM poi_images
WHERE (`source`, `sid`) IN ({});'''.format(','.join(map(lambda x: "('{}', '{}')".format(x[0], x[1]), s_sid_set)))
    _res = cursor.execute(query_sql)
    if not _res:
        return old_img, old_first_img

    max_size = -1
    max_size_img = ''
    file2phash = dict()
    pic_total = set()
    p_hash_dict = defaultdict(list)
    for file_name, bucket_name, pic_size, pic_md5, use, info, url in cursor.fetchall():
        if poi_type == 'shop' and bucket_name not in ('attr_bucket', 'shop_bucket', 'mioji-attr', 'mioji-shop'):
            # shopping img upload to mioji-attr or mioji-shop
            continue
        elif poi_type != 'shop' and poi_type not in bucket_name:
            # rest img upload to mioji-rest
            # attr img upload to mioji-attr
            continue

        # Build pic_total, used to tell whether a filtered image was manually added
        pic_total.add(file_name)

        # Broken image; must be filtered out
        if r.get('error_img_{}'.format(file_name)) == '1':
            continue

        # pHash filter
        if url in ('', 'NULL', None):
            # Image tagged by the product team; never filter it, use it directly
            file2phash[file_name] = 'USE'
            p_hash_dict["USE"].append((file_name, -1))
            continue
        elif not info:
            # Crawled image without a pHash; filter it out
            continue
        else:
            p_hash = json.loads(info)['p_hash']

        # img can be used
        # An empty pic_size usually means a manually tagged image
        if not is_legal(pic_size):
            if file_name not in old_img:
                continue
            elif str(use) != '1':
                continue
            else:
                # Old, manually tagged image; must not be filtered
                file2phash[file_name] = 'USE'
                p_hash_dict["USE"].append((file_name, -1))
                continue

        # get max size
        h, w = literal_eval(pic_size)
        h = int(h)
        w = int(w)
        size = h * w
        if size > max_size:
            max_size = size
            max_size_img = file_name

        # use == 1
        if str(use) == '1':
            # Filtering rules
            # pixel count
            if size < 200000:
                continue

            # scale
            # min scale
            scale = w / h
            if scale < 0.9:
                if w < 500:
                    continue

            # max scale
            if scale > 2.5:
                continue

            p_hash_dict[p_hash].append((file_name, size))

    cursor.close()
    conn.close()

    if poi_type in ('attr', 'shop'):
        # Fetch face-detection results
        _conn = poi_face_detect_pool.connection()
        _cursor = _conn.cursor()
        query_sql = '''SELECT pic_name
FROM PoiPictureInformation
WHERE is_available=0 AND poi_id IN ({});'''.format(
            ', '.join(
                map(
                    lambda x: "'{}'".format(
                        '###'.join(x) if x[0] != 'online' else x[1]),
                    s_sid_set
                )
            )
        )
        _cursor.execute(query_sql)
        face_detected = set([x[0].split('/')[-1] for x in _cursor.fetchall()])
        _cursor.close()
        _conn.close()
    else:
        face_detected = set()

    # Manually added images (extract file names; p_hash_dict stores (file_name, size) tuples)
    human_pic = {f for f, _ in p_hash_dict["USE"]}

    # Machine-crawled images: keep the largest image per pHash
    final_pic_dict = {}
    for k, v in p_hash_dict.items():
        pic_res = sorted(v, key=lambda x: x[1], reverse=True)
        if pic_res:
            final_pic_dict[pic_res[0][0]] = k

    old_img_list = old_img.split('|')

    new_img_list = []
    # Add images in the old display order, de-duplicated
    for _old_file_name in old_img_list:
        # Manually added images are kept; with no md5 they bypass the md5 filter
        if (_old_file_name not in pic_total) or (_old_file_name in human_pic):
            # If the entry is valid
            if is_legal(_old_file_name):
                if _old_file_name not in face_detected:
                    if _old_file_name not in new_img_list:
                        # Manually added images bypass every filtering rule
                        new_img_list.append(_old_file_name)

        elif _old_file_name in final_pic_dict:
            if is_legal(_old_file_name):
                # Face-detection filter
                if _old_file_name not in face_detected:
                    if _old_file_name not in new_img_list:
                        new_img_list.append(_old_file_name)

    # Append machine-picked images that were not in the old list, in order
    # (final_pic_dict maps file_name -> pHash, so the checks use the key k)
    for k, v in final_pic_dict.items():
        if is_legal(k):
            # Face-detection filter
            if k not in face_detected:
                if k not in new_img_list:
                    new_img_list.append(k)

    if old_first_img:
        if old_first_img in new_img_list:
            # If the old first image was not taken down, keep it as the first image
            new_first_img = old_first_img
            # Remove first_img from the new list
            new_img_list.remove(old_first_img)
            # Re-insert first_img at the head of the list
            new_img_list.insert(0, old_first_img)
        else:
            # Otherwise fall back to a new first image
            if new_img_list:
                new_first_img = new_img_list[0]
            else:
                new_first_img = ''
    else:
        if new_img_list:
            new_first_img = new_img_list[0]
        else:
            new_first_img = ''

    if new_first_img == '':
        new_img = new_first_img = max_size_img
    else:
        # De-duplicate the image list without changing its order
        final_new_img_list = list(set(new_img_list))
        final_new_img_list.sort(key=new_img_list.index)

        new_img = '|'.join(filter(lambda x: is_legal(x), final_new_img_list))

    return new_img, new_first_img
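The pHash grouping in get_img keeps only the largest image per perceptual hash. That dedup step in isolation (a hedged sketch with made-up data):

from collections import defaultdict

p_hash_dict = defaultdict(list)
for name, p_hash, area in [('a.jpg', 'f0f0', 1000000),
                           ('b.jpg', 'f0f0', 4000000),
                           ('c.jpg', '1234', 2000000)]:
    p_hash_dict[p_hash].append((name, area))

# One winner per hash: the image with the largest pixel area.
final_pic_dict = {max(v, key=lambda x: x[1])[0]: k
                  for k, v in p_hash_dict.items()}
# -> {'b.jpg': 'f0f0', 'c.jpg': '1234'}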