Ejemplo n.º 1
0
def task():
    """Match qyer outlets to chat_shopping rows and update the matches.

    Builds two lookup tables from ``chat_shopping`` (exact ``name`` -> id and
    ``get_similar_word(name_en)`` -> id), then walks every row of
    ``qyer_outlets_new`` that has a usable ``city_id``. A row matches on its
    name first and on its normalised English name second; unmatched rows are
    skipped. For each match the final display name is chosen by ``get_name``
    from the qyer names and the names returned by ``get_shop_info``, and the
    result is passed to ``update_outlets`` and printed.

    Cursors and connections are released in ``finally`` blocks so that a
    failure in any helper does not leak a pooled connection.
    """
    name_dict = {}
    en_dict = {}
    query_sql = '''SELECT
  id,
  name,
  name_en
FROM chat_shopping;'''
    conn = poi_ori_pool.connection()
    try:
        cursor = conn.cursor(cursor=DictCursor)
        try:
            cursor.execute(query_sql)
            for line in cursor.fetchall():
                miaoji_id = line['id']
                # Later rows overwrite earlier ones that share a name.
                name_dict[line['name']] = miaoji_id
                en_dict[get_similar_word(line['name_en'])] = miaoji_id
        finally:
            cursor.close()
    finally:
        conn.close()

    query_sql = '''SELECT *
FROM qyer_outlets_new
WHERE city_id IS NOT NULL AND city_id != 'NULL';'''
    conn = poi_ori_pool.connection()
    try:
        cursor = conn.cursor(cursor=DictCursor)
        try:
            cursor.execute(query_sql)
            for line in cursor.fetchall():
                name = line['name']
                name_en = get_similar_word(line['name_en'])

                # Name match takes priority over English-name match; empty
                # source values never count as a match.
                if (name in name_dict) and line['name'] != '':
                    uid = name_dict[name]
                elif (name_en in en_dict) and line['name_en'] != '':
                    uid = en_dict[name_en]
                else:
                    continue

                u_name, u_name_en, o_name, o_name_en = get_shop_info(uid)
                final_name = get_name([
                    (line['name'], 'qyer'),
                    (line['name_en'], 'qyer'),
                    (u_name, 'daodao'),
                    (u_name_en, 'daodao'),
                    (o_name, 'online'),
                    (o_name_en, 'online'),
                ])
                update_outlets(cid=line['city_id'], name=final_name, uid=uid)
                print(uid, final_name, line['city_id'])
        finally:
            cursor.close()
    finally:
        conn.close()
Ejemplo n.º 2
0
def similar_dict():
    """Build per-city name lookup tables from the ``attr_unid`` table.

    Returns:
        A ``(name_dict, en_dict, city_id_list)`` tuple where

        * ``name_dict`` maps ``city_id + '|_|_|' + name`` to the row id,
        * ``en_dict`` maps ``city_id + '|_|_|' + get_similar_word(name_en)``
          to the row id,
        * ``city_id_list`` holds the ``city_id`` of every row, in query
          order (duplicates included).

        Rows are read in ascending ``id`` order and only the first row seen
        for a given key is kept, so each key maps to the smallest id.

    The cursor and connection are closed in ``finally`` blocks; the original
    code never released them.
    """
    name_dict = {}
    en_dict = {}

    sql = '''SELECT
  id,
  name,
  name_en,
  city_id
FROM attr_unid
ORDER BY id;'''

    city_id_list = []

    conn = pymysql.connect(**attr_merge_conf)
    try:
        cursor = conn.cursor(cursor=DictCursor)
        try:
            cursor.execute(sql)
            for line in cursor.fetchall():
                mid = line['id']
                name = line['name']
                name_en = get_similar_word(line['name_en'])
                city_id = line['city_id']
                city_id_list.append(city_id)
                # NOTE(review): the concatenation assumes city_id, name and
                # the normalised name_en are all strings — confirm schema.
                name_key = city_id + '|_|_|' + name
                en_key = city_id + '|_|_|' + name_en
                # First writer wins: keep the id of the earliest row.
                if name_key not in name_dict:
                    name_dict[name_key] = mid
                if en_key not in en_dict:
                    en_dict[en_key] = mid
        finally:
            cursor.close()
    finally:
        conn.close()

    return name_dict, en_dict, city_id_list
Ejemplo n.º 3
0
def task():
    """Write qyer_outlets rows that match chat_shopping to /tmp/outlets.csv.

    Lookup tables are built from ``chat_shopping``: exact ``name``,
    ``get_similar_word(name_en)`` and ``get_modify_url(website_url)``, each
    mapping to the row id. Every ``qyer_outlets`` row is then tested against
    them in that priority order (name, English name, site); the first hit
    produces one output row recording which criterion matched, the matched
    key and the matched id. Rows with no hit are dropped, and duplicate
    output rows are removed before writing.

    The output file is written inside a ``with`` block so that it is flushed
    and closed even if writing fails.
    """
    name_dict = {}
    en_dict = {}
    site_dict = {}
    sql = 'select id,name,name_en,website_url from chat_shopping'
    for line in db_114_35_shop.QueryBySQL(sql):
        miaoji_id = line['id']
        name = line['name']
        name_en = line['name_en']
        site = get_modify_url(line['website_url'])
        site_dict[site] = miaoji_id
        name_dict[name] = miaoji_id
        en_dict[get_similar_word(name_en)] = miaoji_id

    rows = []
    for line in db.QueryBySQL('select id,name,name_en,site from qyer_outlets'):
        source_id = line['id']
        name = line['name']
        name_en = get_similar_word(line['name_en'])
        site = get_modify_url(line['site'])

        # (criterion, matched key, matched id) for the first criterion that
        # hits; empty source values never count as a match.
        if (name in name_dict) and line['name'] != '':
            match = ('name', name, name_dict[name])
        elif (name_en in en_dict) and line['name_en'] != '':
            match = ('name_en', name_en, en_dict[name_en])
        elif (site in site_dict) and line['site'] != '':
            match = ('site', site, site_dict[site])
        else:
            continue

        rows.append(
            (source_id, line['name'], line['name_en'], line['site']) + match)

    import csv
    with open('/tmp/outlets.csv', 'w') as f:
        writer = csv.writer(f)
        writer.writerow(['qyer_id', '名称', '英文名', '官网', '匹配条件', '匹配项', 'ID'])
        # set() removes duplicate rows; output order is therefore arbitrary.
        for i in set(rows):
            writer.writerow(i)
Ejemplo n.º 4
0
def task(task_source):
    """Merge attractions of one source into the unified id space and store them.

    For every city from ``get_task_city()`` the attractions returned by
    ``get_attr_info(task_source, city_id)`` are converted to simplified
    Chinese, matched against the per-city lookup tables from
    ``similar_dict()`` (by name first, then by normalised English name), and
    given either the matched id or a fresh one from ``get_new_miaoji_id()``.
    Records whose ``(source, source_id)`` is already registered for that id
    are skipped. The rest are buffered and flushed through ``insert_db`` in
    batches of 1000, with a final flush at the end.

    Args:
        task_source: Source identifier; passed to ``get_attr_info`` and
            stored with every record.

    Notes:
        * ``id_keys`` is indexed with ids that may be brand new, so it is
          presumably a ``defaultdict(set)``-like mapping — TODO confirm
          against ``get_id_keys``.
        * ``inner_source_merge_id`` is not defined in this function; it is
          expected to be a module-level set — TODO confirm.
    """
    id_keys = get_id_keys()
    name_dict, en_dict, _city_id_list = similar_dict()

    count = 0
    data = []
    for each_city in get_task_city():
        city_id = each_city['city_id']
        city_name = each_city['city_name']
        city_map_info = each_city['map_info']
        country_name = each_city['country_name']

        print('#' * 100)
        print("Now City cid: {}, country: {}, name: {}".format(
            city_id, country_name, city_name))
        attr_info = get_attr_info(task_source, city_id)

        for each_attr_info_src in attr_info:
            each_attr_info = {}
            # Convert string fields to simplified Chinese.
            for e_k in each_attr_info_src:
                if isinstance(each_attr_info_src[e_k], str):
                    each_attr_info[e_k] = tradition2simple(
                        each_attr_info_src[e_k]).decode()
                else:
                    each_attr_info[e_k] = each_attr_info_src[e_k]

            source = task_source
            source_id = each_attr_info['id']
            name = each_attr_info['name']
            name_en = get_similar_word(each_attr_info['name_en'] or '')
            name_key = city_id + '|_|_|' + (name or '')
            name_en_key = city_id + '|_|_|' + (name_en or '')

            has_name = (each_attr_info['name'] != ''
                        and each_attr_info['name'] is not None)
            has_name_en = (each_attr_info['name_en'] != ''
                           and each_attr_info['name_en'] is not None)

            # A key may be found in either table, so read the id from the
            # table that actually holds it. Looking only in one table would
            # yield '' for a cross-table hit and merge unrelated records
            # under the empty id.
            if has_name and (name_key in name_dict or name_key in en_dict):
                if name_key in name_dict:
                    miaoji_id = name_dict[name_key]
                else:
                    miaoji_id = en_dict[name_key]
            elif has_name_en and (name_en_key in en_dict
                                  or name_en_key in name_dict):
                if name_en_key in en_dict:
                    miaoji_id = en_dict[name_en_key]
                else:
                    miaoji_id = name_dict[name_en_key]
            else:
                miaoji_id = get_new_miaoji_id()

            # Already-merged check and same-source merge detection.
            if (source, source_id) in id_keys[miaoji_id]:
                # Already merged: skip storing it.
                # This could later be changed to update the stored content.
                continue
            else:
                if source in id_keys[miaoji_id]:
                    # Same-source merge: another record of this source
                    # already maps to this id.
                    inner_source_merge_id.add(miaoji_id)

                    # Duplicates the registration below on purpose: the
                    # same-source merge log must show the set including
                    # this record.
                    id_keys[miaoji_id].add(source)
                    id_keys[miaoji_id].add((source, source_id))
                    print("同源融合,mid: {0}, id_set: {1}".format(
                        miaoji_id, id_keys[miaoji_id]))
                else:
                    # Cross-source merge: proceed normally.
                    pass

            # Register this record for later same-source merge checks.
            id_keys[miaoji_id].add(source)
            id_keys[miaoji_id].add((source, source_id))

            count += 1
            each_data = (miaoji_id, city_id, city_name, country_name,
                         city_map_info, source, source_id,
                         each_attr_info['name'], each_attr_info['name_en'],
                         each_attr_info['map_info'], each_attr_info['grade'],
                         each_attr_info['star'], each_attr_info['ranking'],
                         each_attr_info['address'], each_attr_info['url'],
                         'meizhilv')

            # Add keys so later records can merge with this one.
            name_dict[name_key] = miaoji_id
            en_dict[name_en_key] = miaoji_id

            # Buffer the record.
            data.append(each_data)

            # Flush the buffer every 1000 accepted records.
            if count % 1000 == 0:
                insert_db(data)
                data = []
                print(count)
    print(insert_db(data))
    print(count)

    # Print the ids that had same-source merges.
    print(inner_source_merge_id)