def get_shop_info(uid): conn = poi_ori_pool.connection() cursor = conn.cursor() cursor.execute('''SELECT name, name_en FROM chat_shopping WHERE id=%s;''', (uid,)) _res_1 = cursor.fetchone() cursor.close() conn.close() conn = base_data_pool.connection() cursor = conn.cursor() _res = cursor.execute('''SELECT name, name_en FROM chat_shopping WHERE tag_id = 9 AND id=%s;''', (uid,)) if _res: _res_2 = cursor.fetchone() else: _res_2 = ('', '') cursor.close() conn.close() _res = list() _res.extend(_res_1) _res.extend(_res_2) return _res
def get_city_map(): conn = base_data_pool.connection() cursor = conn.cursor() sql = "select id,map_info from city;" cursor.execute(sql) datas = cursor.fetchall() cid2map = {} for data in datas: if None in data: continue cid = data[0] map_info = data[1] try: map_info_list = map_info.strip().split(',') lat = float(map_info_list[0]) lgt = float(map_info_list[1]) except Exception as e: continue cid2map[cid] = map_info cursor.close() conn.close() logger.debug('[cid size: {}]'.format(len(cid2map))) return cid2map
def get_tagid_dict(_poi_type): _dict = {} if _poi_type == 'attr': sql = '''SELECT tag, tag_en, original_tag FROM chat_attraction_tagS ORDER BY id;''' elif _poi_type == 'rest': sql = 'select tag,tag_en,original_tag from chat_restaurant_tagS' elif _poi_type == 'shop': sql = '''SELECT tag, tag_en, original_tag FROM chat_shopping_tagS ORDER BY id;''' else: raise TypeError("Unknown Type: {}".format(_poi_type)) conn = base_data_pool.connection() cursor = conn.cursor(cursor=DictCursor) cursor.execute(sql) for line in cursor.fetchall(): tag = line['tag'] tag_en = line['tag_en'] original_tag = line['original_tag'] _tags_set = set() for each_tag in original_tag.split('|'): if is_legal(each_tag): _tags_set.add(each_tag) _dict[tuple(_tags_set)] = (tag, tag_en) return _dict
def insert_city_data(): sql = '''SELECT id, name, name_en, map_info FROM city;''' conn = base_data_pool.connection() cursor = conn.cursor(cursor=DictCursor) cursor.execute(sql) cursor.close() conn.close() _count = 0 _total = 0 for each in cursor.fetchall(): _total += 1 try: map_info = each['map_info'] lng, lat = map_info.split(',') each["loc"] = { "type": "Point", "coordinates": [float(lng), float(lat)] } except Exception as e: logger.exception(msg="[map info error]", exc_info=e) continue city_collections.update({"id": each["id"]}, each, upsert=True) _count += 1 logger.debug("[insert_data][table: city][total: {}][count: {}]".format( _total, _count))
def get_c_info(): conn = base_data_pool.connection() cursor = conn.cursor() cursor.execute('''SELECT id, map_info FROM city WHERE map_info IS NOT NULL AND map_info != 'NULL';''') res = {line[0]: line[1] for line in cursor.fetchall()} cursor.close() conn.close() return res
def prepare_city_info(): conn = base_data_pool.connection() cursor = conn.cursor() cursor.execute('''SELECT id, CASE WHEN grade != -1 THEN grade ELSE 100 END AS grade FROM city;''') _res = {line[0]: line[1] for line in cursor.fetchall()} cursor.close() conn.close() return _res
def tag2id(): conn = base_data_pool.connection() sql = '''SELECT id, tag FROM {};'''.format(tag_s) cursor = conn.cursor() cursor.execute(sql) _dict = {} for _id, _tag_name in cursor.fetchall(): _dict[_tag_name] = str(_id) cursor.close() conn.close() return _dict
def init_id2tag(): conn = base_data_pool.connection() sql = '''SELECT id, Stag FROM {};'''.format(tag_b) cursor = conn.cursor() cursor.execute(sql) _dict = defaultdict(set) for _id, _l_s_tag in cursor.fetchall(): for each in _l_s_tag.split('|'): if is_legal(each): _dict[_id].add(each) cursor.close() conn.close() return _dict
def poi_ori(poi_type): already_merged_city = init_already_merged_city(poi_type=poi_type) conn = base_data_pool.connection() cursor = conn.cursor() cursor.execute('''SELECT id FROM city;''') cids = list(map(lambda x: x[0], cursor.fetchall())) cursor.close() conn.close() for cid in cids: if cid in already_merged_city: continue start = time.time() logger.info('[start][cid: {}]'.format(cid)) pool.apply_async(poi_merge, args=(cid, poi_type)) logger.info('[end][cid: {}][takes: {}]'.format(cid, time.time() - start)) pool.join()
def prepare_poi_info(): _dict = {} conn = base_data_pool.connection() cursor = conn.cursor() _sql = '''SELECT id, city_id FROM chat_attraction;''' cursor.execute(_sql) _count = 0 for line in cursor.fetchall(): _count += 1 if _count % 10000 == 0: logger.debug("[prepare_pic_task: {}]".format(_count)) _dict[line[0]] = line[1] cursor.close() conn.close() logger.debug("[prepare_pic_task: {}]".format(_count)) return _dict
def init_city_dict(): conn = base_data_pool.connection() cursor = conn.cursor() cursor.execute('''SELECT * FROM (SELECT id, city.name AS name, city.name_en AS name_en, country.name AS country, CASE WHEN grade = -1 THEN 100 ELSE grade END AS grade FROM city JOIN country ON city.country_id = country.mid ORDER BY grade) t ORDER BY t.grade;''') for line in cursor.fetchall(): # id, name, name_en, country, grade yield line cursor.close() conn.close()
def city_task(): # private city 30km conn = private_data_test_pool.connection() cursor = conn.cursor(cursor=DictCursor) cursor.execute('''SELECT id, name, name_en, map_info FROM city;''') for line in cursor.fetchall(): if is_map_info_legal(line['map_info']): line['search_kilometer'] = 30 line['is_private_city'] = True yield line line['search_kilometer'] = 50 line['is_private_city'] = True yield line cursor.close() conn.close() conn = base_data_pool.connection() cursor = conn.cursor(cursor=DictCursor) cursor.execute('''SELECT id, name, name_en, map_info FROM city;''') for line in cursor.fetchall(): if is_map_info_legal(line['map_info']): line['search_kilometer'] = 30 line['is_private_city'] = False yield line line['search_kilometer'] = 50 line['is_private_city'] = False yield line cursor.close() conn.close()
def filter_data_already_online(_type, _mioji_id, error): if _type == 'attr': table_name = 'chat_attraction' elif _type == 'rest': table_name = 'chat_restaurant' elif _type == 'shop': table_name = 'chat_shopping' else: raise TypeError("Unknown Type: {}".format(_type)) _status = None try: conn = base_data_pool.connection() cursor = conn.cursor() sql = '''SELECT status_test FROM {} WHERE id='{}';'''.format(table_name, _mioji_id) _res = cursor.execute(sql) if not _res: return False _status = cursor.fetchone()[0] logger.debug("[type: {}][status: {}]".format(_type, _status)) cursor.close() conn.close() except Exception as exc: logger.exception(msg="[get online poi status error]", exc_info=exc) if not _status: return False try: conn = poi_ori_pool.connection() cursor = conn.cursor() sql = '''INSERT IGNORE INTO filter_data_already_online (type, mioji_id, error, status) VALUES (%s, %s, %s, %s);''' cursor.execute(sql, (_type, _mioji_id, error, _status)) conn.commit() cursor.close() conn.close() except Exception as exc: logger.exception(msg="[insert filter data already online error]", exc_info=exc)
def init_task(poi_type, process_num): # init city_list city_list = [[] for i in range(process_num)] already_merged_city = init_already_merged_city(poi_type="{}_data".format(poi_type)) conn = base_data_pool.connection() cursor = conn.cursor() cursor.execute('''SELECT id FROM city;''') cids = list(map(lambda x: x[0], cursor.fetchall())) cursor.close() conn.close() _count = 0 for cid in cids: if str(cid) in already_merged_city: continue _count += 1 start = time.time() logger.info('[start][cid: {}]'.format(cid)) city_list[_count % process_num].append(cid) logger.info('[end][cid: {}][takes: {}]'.format(cid, time.time() - start)) for each_cids in city_list: os.system("nohup python3 multi_city_insert_db.py {} {} &".format(poi_type, ' '.join(each_cids)))
def get_max_id(): global poi_type global online_table_name global data_source_table conn = base_data_pool.connection() cursor = conn.cursor() cursor.execute('''SELECT max(id) FROM {};'''.format(online_table_name)) _id_online = cursor.fetchone()[0] conn.close() conn = poi_ori_pool.connection() cursor = conn.cursor() cursor.execute('''SELECT max(id) FROM {}_unid;'''.format(poi_type)) _id_merged = cursor.fetchone()[0] conn.close() if _id_merged and _id_online: return max(_id_online, _id_merged) elif _id_merged: return _id_merged elif _id_online: return _id_online else: raise Exception("无法获取最大 id ,不能进行融合")
#!/usr/bin/env python # -*- coding: utf-8 -*- # @Time : 2017/11/23 上午11:29 # @Author : Hou Rong # @Site : # @File : poi_city_mapping.py # @Software: PyCharm import re import pandas import dataset from service_platform_conn_pool import base_data_pool, source_info_str, source_info_pool, fetchall from my_logger import get_logger logger = get_logger("city_mapping") conn = base_data_pool.connection() cursor = conn.cursor() cursor.execute('''SELECT city.id AS city_id, country.mid AS country_id, city.name AS city_name, country.name AS country_name FROM city JOIN country ON city.country_id = country.mid;''') city_info = {line[0]: line for line in cursor.fetchall()} conn.close() def hotels_get_geo_id_by_dest_id(dest_id): sql = '''SELECT sid FROM ota_location
def insert_poi_unid(merged_dict, cid_or_geohash): global white_list global online_table_name global data_source_table start = time.time() # get city country name map_info _dev_conn = base_data_pool.connection() _dev_cursor = _dev_conn.cursor() _dev_cursor.execute('''SELECT city.id AS cid, city.name AS city_name, country.name AS country_name, city.map_info AS map_info FROM city JOIN country ON city.country_id = country.mid WHERE city.id = {};'''.format(cid_or_geohash)) cid, city_name, country, city_map_info = _dev_cursor.fetchone() _dev_cursor.close() _dev_conn.close() # 去除 total 的写法,费劲但是提升速度不明显,使用 total 后 5.9 秒获取巴黎全部信息,直接获取 6.9 秒,相差可以接受 # init id list # online_ids = set() # data_ids = set() # for _, s_sid_set in merged_dict.items(): # for source, sid in s_sid_set: # if source == 'online': # online_ids.add(sid) # else: # data_ids.add((source, sid)) # get data total # get online data name name_en map_info grade star ranking address url total_data = {} _dev_conn = base_data_pool.connection() _dev_cursor = _dev_conn.cursor() try: _t = time.time() sql = '''SELECT id, name, name_en, map_info, grade, -1, ranking, address, '' FROM {} WHERE city_id='{}';'''.format(online_table_name, cid_or_geohash) _dev_cursor.execute(sql) logger.debug('[query][sql: {}][takes: {}]'.format( sql, time.time() - _t)) except Exception as exc: logger.exception("[sql exc][sql: {}]".format(sql), exc_info=exc) for line in _dev_cursor.fetchall(): total_data[('online', line[0])] = line[1:] _dev_cursor.close() _dev_conn.close() # get poi name name_en map_info grade star ranking address url _data_conn = poi_ori_pool.connection() _data_cursor = _data_conn.cursor() try: _t = time.time() sql = '''SELECT source, id, name, name_en, map_info, grade, star, ranking, address, url FROM {} WHERE city_id='{}';'''.format(data_source_table, cid_or_geohash) _data_cursor.execute(sql) logger.debug('[query][sql: {}][takes: {}]'.format( sql, time.time() - _t)) except Exception as exc: logger.exception("[sql exc][sql: {}]".format(sql), exc_info=exc) for line in _data_cursor.fetchall(): total_data[(line[0], line[1])] = line[2:] _data_cursor.close() _data_conn.close() # init white list total data if white_list: _s_sid = [] for _each in white_list: _s_sid.extend(_each) _ori_conn = poi_ori_pool.connection() _ori_cursor = _ori_conn.cursor() try: _t = time.time() query_sql = '''SELECT source, id, name, name_en, map_info, grade, star, ranking, address, url FROM {} WHERE (source, id) IN ({});'''.format( data_source_table, ','.join(map(lambda x: "('{}', '{}')".format(*x), _s_sid))) _ori_cursor.execute(query_sql) logger.debug('[query][sql: {}][takes: {}]'.format( sql, time.time() - _t)) except Exception as exc: logger.exception("[sql exc][sql: {}]".format(sql), exc_info=exc) for line in _ori_cursor.fetchall(): total_data[(line[0], line[1])] = line[2:] _ori_cursor.close() _ori_conn.close() data = [] for uid, s_sid_set in merged_dict.items(): for source, sid in s_sid_set: # name name_en map_info grade star ranking address url name, name_en, map_info, grade, star, ranking, address, url = total_data[ (source, sid)] if not is_legal(name): name = '' if not is_legal(name_en): name_en = '' if not is_legal(grade): grade = -1.0 if not is_legal(star): star = -1.0 if not is_legal(ranking): ranking = -1.0 if not is_legal(address): address = '' if not is_legal(url): url = '' data.append( (uid, cid, city_name, country, city_map_info, source, sid, name, name_en, map_info, grade, star, ranking, address, url)) # data = [] # _dev_conn = base_data_pool.connection() # _dev_cursor = _dev_conn.cursor() # _data_conn = poi_ori_pool.connection() # _data_cursor = _data_conn.cursor() # for uid, s_sid_set in merged_dict.items(): # for source, sid in s_sid_set: # if source == 'online': # _dev_cursor.execute('''SELECT # name, # name_en, # map_info, # grade, # -1, # ranking, # address, # '' # FROM chat_attraction # WHERE id = '{}';'''.format(sid)) # try: # name, name_en, map_info, grade, star, ranking, address, url = _dev_cursor.fetchone() # except Exception as exc: # logger.exception("[error sql query][source: {}][sid: {}]".format(source, sid), exc_info=exc) # continue # else: # _data_cursor.execute('''SELECT # CASE WHEN name NOT IN ('NULL', '', NULL) # THEN name # ELSE '' END, # CASE WHEN name_en NOT IN ('NULL', '', NULL) # THEN name_en # ELSE '' END, # map_info, # CASE WHEN grade NOT IN ('NULL', '', NULL) # THEN grade # ELSE -1.0 END AS grade, # CASE WHEN star NOT IN ('NULL', '', NULL) # THEN star # ELSE -1.0 END AS star, # CASE WHEN ranking NOT IN ('NULL', '', NULL) # THEN ranking # ELSE -1.0 END AS ranking, # CASE WHEN address NOT IN ('NULL', '', NULL) # THEN address # ELSE '' END, # CASE WHEN url NOT IN ('null', '', NULL) # THEN url # ELSE '' END # FROM attr # WHERE source = '{}' AND id = '{}';'''.format(source, sid)) # try: # name, name_en, map_info, grade, star, ranking, address, url = _data_cursor.fetchone() # except Exception as exc: # logger.exception("[error sql query][source: {}][sid: {}]".format(source, sid), exc_info=exc) # continue # # data.append((uid, cid, city_name, country, city_map_info, source, sid, name, name_en, map_info, grade, # star, ranking, address, url)) # _dev_cursor.close() # _data_cursor.close() # _dev_conn.close() # _data_conn.close() _final_conn = poi_ori_pool.connection() _final_cursor = _final_conn.cursor() # for d in data: try: _t = time.time() sql = '''REPLACE INTO {}_unid (id, city_id, city_name, country_name, city_map_info, source, source_id, name, name_en, map_info, grade, star, ranking, address, url) VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s);'''.format( poi_type) _final_cursor.executemany(sql, data) logger.debug('[query][sql: {}][takes: {}]'.format( sql, time.time() - _t)) except Exception as exc: logger.exception("[insert unid table error]", exc_info=exc) _final_conn.commit() _final_cursor.close() _final_conn.close() logger.info( "[finish prepare data][city: {}][line_count: {}][takes: {}]".format( cid_or_geohash, len(data), time.time() - start))
def get_poi_dict(city_id): """ get all attraction info in the city :param city_id: city mioji id :return: city info dict """ global white_list_data _poi_dict = defaultdict(dict) _online_official_data = {} _online_nonofficial_data = {} # get whole data process conn = data_process_pool.connection() cursor = conn.cursor(cursor=DictCursor) sql = '''SELECT * FROM {} WHERE city_id='{}';'''.format(data_process_table_name, city_id) _t = time.time() cursor.execute(sql) logger.debug('[query][sql: {}][takes: {}]'.format(sql, time.time() - _t)) for each in cursor.fetchall(): if int(each["official"]) == 0: _online_nonofficial_data[each['id']] = each else: _online_official_data[each['id']] = each cursor.close() conn.close() # get whole base data, and update data process result conn = base_data_pool.connection() cursor = conn.cursor(cursor=DictCursor) sql = '''SELECT * FROM {} WHERE city_id='{}';'''.format(data_process_table_name, city_id) _t = time.time() cursor.execute(sql) logger.debug('[query][sql: {}][takes: {}]'.format(sql, time.time() - _t)) for each in cursor.fetchall(): each.pop('tag_id') if int(each["official"]) == 0: _online_nonofficial_data[each['id']].update(each) else: _online_official_data[each['id']].update(each) cursor.close() conn.close() # get whole source data conn = poi_ori_pool.connection() cursor = conn.cursor(cursor=DictCursor) sql = "select * from {} where city_id='{}'".format(poi_type, city_id) _t = time.time() cursor.execute(sql) logger.debug('[query][sql: {}][takes: {}]'.format(sql, time.time() - _t)) for line in cursor.fetchall(): source = line['source'] source_id = line['id'] _poi_dict[(source, source_id)] = line cursor.close() conn.close() for k, v in white_list_data.items(): _poi_dict[k] = v return _poi_dict, _online_official_data, _online_nonofficial_data