def fetch_stores(data): url = data['url'] try: body = cm.get_data(url) except Exception: cm.dump('Error in fetching stores: %s' % url, log_name) return [] store_list = [] for m in re.finditer(ur'<div class="searchResult[^"]*"', body): if 'intro' in m.group(): continue sub = cm.extract_closure(body[m.start():], ur'<div\b', ur'</div>')[0] m1 = re.search(ur'<div id=[^<>]+>(.+?)</div>', sub) if m1 is None: continue entry = cm.init_store_entry(data['brand_id'], data['brandname_e'], data['brandname_c']) entry[cm.country_e] = data['country'] entry[cm.city_e] = data['city'] addr_list = [ tmp.strip() for tmp in cm.reformat_addr(m1.group(1)).split(',') ] tel = cm.extract_tel(addr_list[-1]) if tel != '': entry[cm.tel] = tel del addr_list[-1] else: m1 = re.search(ur'Tel:([^<>]+)', sub) if m1 is not None: entry[cm.tel] = cm.extract_tel(m1.group(1)) entry[cm.addr_e] = ', '.join(addr_list) m1 = re.search(ur"show_map\('(-?\d+\.\d+)'\s*,\s*'(-?\d+\.\d+)'", sub) if m1 is not None: entry[cm.lat] = string.atof(m1.group(1)) entry[cm.lng] = string.atof(m1.group(2)) start = sub.find(ur'Opening hours:') if start != -1: entry[cm.hours] = cm.extract_closure(sub[start:], ur'<p>', ur'</p>')[0].strip() ret = None if entry[cm.lat] != '' and entry[cm.lng] != '': ret = gs.geocode(latlng='%f,%f' % (entry[cm.lat], entry[cm.lng])) if ret is None: tmp = [tmp1.strip() for tmp1 in entry[cm.addr_e].split(',')] if 'Max Mara' in tmp[0]: del tmp[0] if len(tmp) > 0: ret = gs.geocode(', '.join(tmp)) if ret is not None: city = '' province = '' country = '' zip_code = '' tmp = ret[0]['address_components'] for v in tmp: if 'locality' in v['types']: city = v['long_name'].strip().upper() elif 'administrative_area_level_1' in v['types']: province = v['long_name'].strip().upper() elif 'country' in v['types']: country = v['long_name'].strip().upper() elif 'postal_code' in v['types']: zip_code = v['long_name'].strip() entry[cm.country_e] = country entry[cm.province_e] = province entry[cm.city_e] = city 
entry[cm.zip_code] = zip_code gs.field_sense(entry) cm.dump( '(%s / %d) Found store: %s, %s (%s, %s)' % (data['brandname_e'], data['brand_id'], entry[cm.name_e], entry[cm.addr_e], entry[cm.country_e], entry[cm.continent_e]), log_name) db.insert_record(entry, 'stores') store_list.append(entry)
def sense_cities(lower_bound='a', upper_bound='b'):
    """Normalize the city fields of the rows in the ``stores`` table.

    For every distinct (city_e, province_e, country_e) triple whose city_e
    lies strictly between ``lower_bound`` and ``upper_bound``, the function
    geocodes the city (first from the clustered store coordinates, then from
    textual addresses), records the mapping from the raw triple to a
    standardized one in ``data/city_std.dat`` and updates ``is_geocoded``:

    * 4 -- resolved from latitude/longitude
    * 5 -- resolved from an address
    * 6 -- could not be resolved
    * 7 -- the triple was already present in the mapping file

    :param lower_bound: exclusive lower bound for ``city_e``.
    :param upper_bound: exclusive upper bound for ``city_e``.
    """
    # NOTE(review): the original indentation of this function was lost; the
    # nesting below is reconstructed from the syntax and the comments.

    # Administrative levels that may stand in for the city, most specific
    # first.
    city_keys = ('locality', 'sublocality', 'administrative_area_level_3',
                 'administrative_area_level_2')

    def pick_city(admin_info):
        """Return the first available city-like component, or None."""
        for key in city_keys:
            if key in admin_info:
                return admin_info[key]
        return None

    def get_unique_latlng(latlng_list, tol_lat=0.5, tol_lng=1):
        """Drop outlying points and return the concentrated coordinate.

        :param latlng_list: iterable of (lat, lng) pairs.
        :param tol_lat: tolerance for the latitude.
        :param tol_lng: tolerance for the longitude.
        :return: ``(lat, lng)``; either member is None when no concentrated
            value could be determined.
        """

        def get_avg(l):
            return float(sum(l)) / len(l) if len(l) > 0 else None

        def func(vals, tol):
            vals = list(vals)
            avg = None
            while True:
                avg = get_avg(vals)
                # Fix: compare against None so that an average of exactly
                # 0.0 is not mistaken for "no data".
                if avg is None:
                    break
                # Index of the value farthest from the mean; ties resolve to
                # the highest index, as the original stable sort did.
                worst = max(xrange(len(vals)),
                            key=lambda idx: (abs(vals[idx] - avg), idx))
                if abs(vals[worst] - avg) < tol:
                    break
                elif len(vals) == 2:
                    # With only two points that are far apart the method
                    # cannot decide which one is right.
                    avg = None
                    break
                else:
                    del vals[worst]
            return avg

        lat = func((tmp[0] for tmp in latlng_list), tol_lat)
        lng = func((tmp[1] for tmp in latlng_list), tol_lng)
        return (lat, lng)

    def register_city(geocoded_info):
        """Record the standardized form of the current city in city_std.

        Reads ``sig``, ``city_e`` and ``country_e`` of the triple currently
        being processed from the enclosing scope.

        :param geocoded_info: list of geocoding results.
        :return: True when a usable result was found and registered.
        """
        candidate_geo = None
        candidate_raw = None
        for geo_info in geocoded_info:
            admin_info = geo_info['administrative_info']
            if 'country' not in admin_info:
                common.dump(u'Country info does not exist: %s' % admin_info)
                continue

            city = pick_city(admin_info)
            if city is None:
                common.dump(u'City info does not exist: %s' % admin_info)
                continue

            tmp_geo = {'city_e': city, 'country_e': admin_info['country']}
            if 'administrative_area_level_1' in admin_info:
                tmp_geo['region_e'] = admin_info['administrative_area_level_1']
            else:
                tmp_geo['region_e'] = ''
            tmp_geo['formatted_address'] = geo_info['formatted_address']

            # The first structurally valid result is the fallback candidate.
            if not candidate_geo:
                candidate_geo = tmp_geo
                candidate_raw = geo_info

            # Consistency check: the country or the city must agree with the
            # stored values.
            # NOTE(review): nesting reconstructed from the original comment
            # ("country or city must match"): a result is rejected only when
            # both differ -- confirm against the upstream file.
            ret1 = gs.look_up(country_e, 1)
            ret2 = gs.look_up(admin_info['country'], 1)
            if (ret1['name_e'] if ret1 else country_e) != (
                    ret2['name_e'] if ret2 else admin_info['country']):
                common.dump(u'Countries does not match.', log_name)
                ret3 = gs.look_up(city_e, 1)
                ret4 = gs.look_up(city, 1)
                if (ret3['name_e'] if ret3 else city_e) != (
                        ret4['name_e'] if ret4 else city):
                    common.dump(u'Cities does not match.', log_name)
                    continue

            # Reaching this point means geo_info passed the checks above.
            candidate_geo = tmp_geo
            candidate_raw = geo_info
            break

        if not candidate_geo:
            return False

        # Register the standardized city information.
        std_info = candidate_geo

        # Fetch the Chinese names.
        std_info['country_c'] = ''
        std_info['region_c'] = ''
        std_info['city_c'] = ''
        geocoded_info_zh = gs.geocode(
            addr=candidate_geo['formatted_address'], lang='zh')
        if geocoded_info_zh:
            admin_info_zh = geocoded_info_zh[0]['administrative_info']
            if 'country' in admin_info_zh:
                std_info['country_c'] = admin_info_zh['country']
            city_c = pick_city(admin_info_zh)
            if city_c is not None:
                std_info['city_c'] = city_c
            if 'administrative_area_level_1' in admin_info_zh:
                std_info['region_c'] = admin_info_zh[
                    'administrative_area_level_1']

        std_sig = u'|'.join(
            (std_info['city_e'], std_info['region_e'], std_info['country_e']))

        # Fix: the original tested the literal 'std_sig' (never a key), so an
        # existing canonical record was overwritten on every call, and it
        # stored the last loop item rather than the chosen candidate.
        canonical = city_std.setdefault(std_sig, {})
        if 'std_info' not in canonical:
            canonical['std_info'] = std_info
            canonical['geo_info'] = candidate_raw
        if sig != std_sig:
            city_std[sig] = {'std_sig': std_sig}
        common.dump(u'%s => %s' % (sig, std_sig))
        return True

    def mark_geocoded(status, key):
        """Set is_geocoded=status on every store row matching ``key``."""
        # NOTE(review): the statement is assembled by string formatting with
        # only single quotes escaped; switch to parameterized queries if the
        # StoresDb API supports them.
        args = [status]
        args.extend(part.replace("'", r"\'") for part in key)
        db.execute(tpl_geocoded % tuple(args))

    city_std = {}
    log_name = u'sense_cities.log'
    try:
        with open('data/city_std.dat', 'r') as f:
            # Standardization mapping of the cities:
            # {'city|region|country': {'std_info': {'city': ..., 'region': ...,
            #  'country': ...}, 'geo_result': result}}
            city_std = json.loads(f.read())
    except (IOError, ValueError):
        # Fix: an empty or corrupt file no longer aborts the run.
        common.dump(u'Failed to load data/city_std.dat', log_name)

    db = common.StoresDb()
    db.connect_db(host='localhost', port=3306, user='******', passwd='123456', db='brand_stores')
    tpl_entity = "SELECT DISTINCT city_e, province_e, country_e FROM stores WHERE city_e>'%s' AND city_e<'%s' AND (is_geocoded<4 OR is_geocoded>7) ORDER BY city_e, province_e, country_e LIMIT 99999"
    # Alternative query that revisits only the previously failed rows:
    # tpl_entity = "SELECT DISTINCT city_e, province_e, country_e FROM stores WHERE city_e>'%s' AND city_e<'%s' AND is_geocoded=6 ORDER BY city_e, province_e, country_e LIMIT 99999"
    tpl_pos = "SELECT lat, lng, addr_e, idstores FROM stores WHERE city_e='%s' AND province_e='%s' AND country_e='%s' LIMIT 99999"
    tpl_geocoded = "UPDATE stores SET is_geocoded=%d WHERE city_e='%s' AND province_e='%s' AND country_e='%s'"

    statement = tpl_entity % (lower_bound, upper_bound)
    common.dump(u"Processing cities from '%s' to '%s'..." % (lower_bound, upper_bound), log_name)
    for item in db.query_all(statement):
        try:
            sig = u'|'.join(item[i] for i in xrange(3))
            if sig in city_std:
                common.dump(u'Geo item %s already processed.' % sig, log_name)
                mark_geocoded(7, (item[i] for i in xrange(3)))
                continue

            common.dump(u'Processing %s...' % sig, log_name)
            city_e, province_e, country_e = item
            geo_success = False
            statement = tpl_pos % tuple(
                tmp.replace("'", r"\'") for tmp in item)
            query_result = db.query_all(statement)

            # First attempt: use the stores' coordinates.
            latlng_list = []
            for lat, lng, addr, idstores in query_result:
                if not lat or not lng or lat == '' or lng == '':
                    continue
                latlng_list.append(tuple(map(string.atof, (lat, lng))))

            lat, lng = get_unique_latlng(latlng_list)
            # Fix: a coordinate of exactly 0.0 is valid, only None is not.
            if lat is not None and lng is not None:
                tmp = gs.geocode(latlng='%f,%f' % (lat, lng))
                if tmp:
                    geo_success = register_city(tmp)

            if geo_success:
                # Resolved from the coordinates.
                mark_geocoded(4, item)
            else:
                # Second attempt: use the textual addresses.
                for lat, lng, addr, idstores in query_result:
                    tmp = gs.geocode(u'%s,%s,%s' % (city_e, province_e, country_e))
                    if not tmp:
                        continue
                    geo_success = register_city(tmp)
                    if geo_success:
                        break
                    tmp = gs.geocode(addr)
                    if not tmp:
                        continue
                    geo_success = register_city(tmp)
                    if geo_success:
                        break
                # 5: resolved from an address, 6: could not be resolved.
                mark_geocoded(5 if geo_success else 6, item)

            # Persist the mapping after every processed triple.
            with open(u'data/city_std.dat', 'w') as f:
                f.write(json.dumps(city_std).encode('utf-8'))
        except Exception:
            # Best effort: log the failure and move on to the next triple.
            common.dump(traceback.format_exc(), log_name)

    common.dump(u'Done!', log_name)
def register_city(geocoded_info):
    """Record the standardized form of the current city in ``city_std``.

    The names ``sig``, ``city_e``, ``country_e``, ``city_std`` and
    ``log_name`` are not defined in this block; they must be resolvable from
    the enclosing or module scope when the function is called.

    :param geocoded_info: list of geocoding results, each providing
        ``administrative_info`` and ``formatted_address``.
    :return: True when a usable result was found and registered, else False.
    """
    # NOTE(review): the original indentation of this function was lost; the
    # nesting below is reconstructed from the syntax and the comments.

    # Administrative levels that may stand in for the city, most specific
    # first.
    city_keys = ('locality', 'sublocality', 'administrative_area_level_3',
                 'administrative_area_level_2')

    def pick_city(admin_info):
        """Return the first available city-like component, or None."""
        for key in city_keys:
            if key in admin_info:
                return admin_info[key]
        return None

    candidate_geo = None
    candidate_raw = None
    for geo_info in geocoded_info:
        admin_info = geo_info['administrative_info']
        if 'country' not in admin_info:
            common.dump(u'Country info does not exist: %s' % admin_info)
            continue

        city = pick_city(admin_info)
        if city is None:
            common.dump(u'City info does not exist: %s' % admin_info)
            continue

        tmp_geo = {'city_e': city, 'country_e': admin_info['country']}
        if 'administrative_area_level_1' in admin_info:
            tmp_geo['region_e'] = admin_info['administrative_area_level_1']
        else:
            tmp_geo['region_e'] = ''
        tmp_geo['formatted_address'] = geo_info['formatted_address']

        # The first structurally valid result is the fallback candidate.
        if not candidate_geo:
            candidate_geo = tmp_geo
            candidate_raw = geo_info

        # Consistency check: the country or the city must agree with the
        # stored values.
        # NOTE(review): nesting reconstructed from the original comment
        # ("country or city must match"): a result is rejected only when both
        # differ -- confirm against the upstream file.
        ret1 = gs.look_up(country_e, 1)
        ret2 = gs.look_up(admin_info['country'], 1)
        if (ret1['name_e'] if ret1 else country_e) != (
                ret2['name_e'] if ret2 else admin_info['country']):
            common.dump(u'Countries does not match.', log_name)
            ret3 = gs.look_up(city_e, 1)
            ret4 = gs.look_up(city, 1)
            if (ret3['name_e'] if ret3 else city_e) != (
                    ret4['name_e'] if ret4 else city):
                common.dump(u'Cities does not match.', log_name)
                continue

        # Reaching this point means geo_info passed the checks above.
        candidate_geo = tmp_geo
        candidate_raw = geo_info
        break

    if not candidate_geo:
        return False

    # Register the standardized city information.
    std_info = candidate_geo

    # Fetch the Chinese names.
    std_info['country_c'] = ''
    std_info['region_c'] = ''
    std_info['city_c'] = ''
    geocoded_info_zh = gs.geocode(addr=candidate_geo['formatted_address'],
                                  lang='zh')
    if geocoded_info_zh:
        admin_info_zh = geocoded_info_zh[0]['administrative_info']
        if 'country' in admin_info_zh:
            std_info['country_c'] = admin_info_zh['country']
        city_c = pick_city(admin_info_zh)
        if city_c is not None:
            std_info['city_c'] = city_c
        if 'administrative_area_level_1' in admin_info_zh:
            std_info['region_c'] = admin_info_zh['administrative_area_level_1']

    std_sig = u'|'.join(
        (std_info['city_e'], std_info['region_e'], std_info['country_e']))

    # Fix: the original tested the literal 'std_sig' (never a key), so an
    # existing canonical record was overwritten on every call, and it stored
    # the last loop item rather than the chosen candidate.
    canonical = city_std.setdefault(std_sig, {})
    if 'std_info' not in canonical:
        canonical['std_info'] = std_info
        canonical['geo_info'] = candidate_raw
    if sig != std_sig:
        city_std[sig] = {'std_sig': std_sig}
    common.dump(u'%s => %s' % (sig, std_sig))
    return True
entry[cm.city_e], entry[cm.city_c] = city_e, city_c entry[cm.name_e]=name entry[cm.addr_e]=name gs.field_sense(entry) ret = gs.addr_sense(entry[cm.addr_e]) if ret[0] is not None and entry[cm.country_e] == '': entry[cm.country_e] = ret[0] if ret[1] is not None and entry[cm.province_e] == '': entry[cm.province_e] = ret[1] if ret[2] is not None and entry[cm.city_e] == '': entry[cm.city_e] = ret[2] gs.field_sense(entry) if entry[cm.country_e] == '' or entry[cm.city_e] == '': ret = gs.geocode(', '.join((entry[cm.name_e], entry[cm.city_c], entry[cm.country_c]))) if not ret: ret = gs.geocode(', '.join((entry[cm.city_c], entry[cm.country_c]))) if ret: city = '' province = '' country = '' zip_code = '' tmp = ret[0]['address_components'] for v in tmp: if 'locality' in v['types']: city = v['long_name'].strip().upper() elif 'administrative_area_level_1' in v['types']: province = v['long_name'].strip().upper() elif 'country' in v['types']: country = v['long_name'].strip().upper()
type_list = [] for item in pq(body)('#map-panel ul li'): if item.text: val = cm.html2plain(item.text).strip() if val != '': type_list.append(val) entry[cm.store_type] = ', '.join(type_list) tmp = pq(body)('#map-panel iframe[src!=""]') if len(tmp) > 0: # map_url = tmp[0].attrib['src'] m = re.search(ur'daddr=([^&]+)', tmp[0].attrib['src']) if m: map_url = 'http://maps.googleapis.com/maps/api/geocode/json?address=%s&sensor=false' % m.group(1) ret = gs.geocode(url=map_url) if ret: city = '' province = '' country = '' zip_code = '' tmp = ret[0]['address_components'] for v in tmp: if 'locality' in v['types']: city = v['long_name'].strip().upper() elif 'administrative_area_level_1' in v['types']: province = v['long_name'].strip().upper() elif 'country' in v['types']: country = v['long_name'].strip().upper() elif 'postal_code' in v['types']: zip_code = v['long_name'].strip()
entry[cm.lng] = string.atof(m2.group(2)) gs.field_sense(entry) ret = gs.addr_sense(entry[cm.addr_e]) if ret[0] is not None and entry[cm.country_e] == '': entry[cm.country_e] = ret[0] if ret[1] is not None and entry[cm.province_e] == '': entry[cm.province_e] = ret[1] if ret[2] is not None and entry[cm.city_e] == '': entry[cm.city_e] = ret[2] gs.field_sense(entry) if entry[cm.city_e] == '' or entry[cm.country_e] == '': ret = None if entry[cm.lat] != '' and entry[cm.lng] != '': ret = gs.geocode(latlng='%f,%f' % (entry[cm.lat], entry[cm.lng])) if ret is None: ret = gs.geocode(', '.join((entry[cm.addr_e], data['zone']))) if ret is not None: city = '' province = '' country = '' zip_code = '' tmp = ret[0]['address_components'] for v in tmp: if 'locality' in v['types']: city = v['long_name'].strip().upper() elif 'administrative_area_level_1' in v['types']: province = v['long_name'].strip().upper() elif 'country' in v['types']: country = v['long_name'].strip().upper()
gs.field_sense(entry) ret = gs.addr_sense(entry[cm.addr_e]) if ret[0] is not None and entry[cm.country_e] == '': entry[cm.country_e] = ret[0] if ret[1] is not None and entry[cm.province_e] == '': entry[cm.province_e] = ret[1] if ret[2] is not None and entry[cm.city_e] == '': entry[cm.city_e] = ret[2] gs.field_sense(entry) if (entry[cm.country_e] == '' or entry[cm.city_e] == ''): ret = None location_valid = True if entry[cm.lat] != '' and entry[cm.lng] != '': ret = gs.geocode(latlng='%f,%f' % (entry[cm.lat], entry[cm.lng])) if ret is None: location_valid = False ret = gs.geocode('%s, %s, %s' % (entry[cm.addr_e], entry[cm.city_e], entry[cm.country_e])) if ret is not None: city = '' province = '' country = '' zip_code = '' tmp = ret[0]['address_components'] for v in tmp: if 'locality' in v['types']: city = v['long_name'].strip().upper() elif 'administrative_area_level_1' in v['types']: province = v['long_name'].strip().upper()
def sense_cities(lower_bound='a', upper_bound='b'):
    """Normalize the city fields of the rows in the ``stores`` table.

    For every distinct (city_e, province_e, country_e) triple whose city_e
    lies strictly between ``lower_bound`` and ``upper_bound``, the function
    geocodes the city (first from the clustered store coordinates, then from
    textual addresses), records the mapping from the raw triple to a
    standardized one in ``data/city_std.dat`` and updates ``is_geocoded``:

    * 4 -- resolved from latitude/longitude
    * 5 -- resolved from an address
    * 6 -- could not be resolved
    * 7 -- the triple was already present in the mapping file

    :param lower_bound: exclusive lower bound for ``city_e``.
    :param upper_bound: exclusive upper bound for ``city_e``.
    """
    # NOTE(review): the original indentation of this function was lost; the
    # nesting below is reconstructed from the syntax and the comments.

    # Administrative levels that may stand in for the city, most specific
    # first.
    city_keys = ('locality', 'sublocality', 'administrative_area_level_3',
                 'administrative_area_level_2')

    def pick_city(admin_info):
        """Return the first available city-like component, or None."""
        for key in city_keys:
            if key in admin_info:
                return admin_info[key]
        return None

    def get_unique_latlng(latlng_list, tol_lat=0.5, tol_lng=1):
        """Drop outlying points and return the concentrated coordinate.

        :param latlng_list: iterable of (lat, lng) pairs.
        :param tol_lat: tolerance for the latitude.
        :param tol_lng: tolerance for the longitude.
        :return: ``(lat, lng)``; either member is None when no concentrated
            value could be determined.
        """

        def get_avg(l):
            return float(sum(l)) / len(l) if len(l) > 0 else None

        def func(vals, tol):
            vals = list(vals)
            avg = None
            while True:
                avg = get_avg(vals)
                # Fix: compare against None so that an average of exactly
                # 0.0 is not mistaken for "no data".
                if avg is None:
                    break
                # Index of the value farthest from the mean; ties resolve to
                # the highest index, as the original stable sort did.
                worst = max(xrange(len(vals)),
                            key=lambda idx: (abs(vals[idx] - avg), idx))
                if abs(vals[worst] - avg) < tol:
                    break
                elif len(vals) == 2:
                    # With only two points that are far apart the method
                    # cannot decide which one is right.
                    avg = None
                    break
                else:
                    del vals[worst]
            return avg

        lat = func((tmp[0] for tmp in latlng_list), tol_lat)
        lng = func((tmp[1] for tmp in latlng_list), tol_lng)
        return (lat, lng)

    def register_city(geocoded_info):
        """Record the standardized form of the current city in city_std.

        Reads ``sig``, ``city_e`` and ``country_e`` of the triple currently
        being processed from the enclosing scope.

        :param geocoded_info: list of geocoding results.
        :return: True when a usable result was found and registered.
        """
        candidate_geo = None
        candidate_raw = None
        for geo_info in geocoded_info:
            admin_info = geo_info['administrative_info']
            if 'country' not in admin_info:
                common.dump(u'Country info does not exist: %s' % admin_info)
                continue

            city = pick_city(admin_info)
            if city is None:
                common.dump(u'City info does not exist: %s' % admin_info)
                continue

            tmp_geo = {'city_e': city, 'country_e': admin_info['country']}
            if 'administrative_area_level_1' in admin_info:
                tmp_geo['region_e'] = admin_info['administrative_area_level_1']
            else:
                tmp_geo['region_e'] = ''
            tmp_geo['formatted_address'] = geo_info['formatted_address']

            # The first structurally valid result is the fallback candidate.
            if not candidate_geo:
                candidate_geo = tmp_geo
                candidate_raw = geo_info

            # Consistency check: the country or the city must agree with the
            # stored values.
            # NOTE(review): nesting reconstructed from the original comment
            # ("country or city must match"): a result is rejected only when
            # both differ -- confirm against the upstream file.
            ret1 = gs.look_up(country_e, 1)
            ret2 = gs.look_up(admin_info['country'], 1)
            if (ret1['name_e'] if ret1 else country_e) != (
                    ret2['name_e'] if ret2 else admin_info['country']):
                common.dump(u'Countries does not match.', log_name)
                ret3 = gs.look_up(city_e, 1)
                ret4 = gs.look_up(city, 1)
                if (ret3['name_e'] if ret3 else city_e) != (
                        ret4['name_e'] if ret4 else city):
                    common.dump(u'Cities does not match.', log_name)
                    continue

            # Reaching this point means geo_info passed the checks above.
            candidate_geo = tmp_geo
            candidate_raw = geo_info
            break

        if not candidate_geo:
            return False

        # Register the standardized city information.
        std_info = candidate_geo

        # Fetch the Chinese names.
        std_info['country_c'] = ''
        std_info['region_c'] = ''
        std_info['city_c'] = ''
        geocoded_info_zh = gs.geocode(
            addr=candidate_geo['formatted_address'], lang='zh')
        if geocoded_info_zh:
            admin_info_zh = geocoded_info_zh[0]['administrative_info']
            if 'country' in admin_info_zh:
                std_info['country_c'] = admin_info_zh['country']
            city_c = pick_city(admin_info_zh)
            if city_c is not None:
                std_info['city_c'] = city_c
            if 'administrative_area_level_1' in admin_info_zh:
                std_info['region_c'] = admin_info_zh[
                    'administrative_area_level_1']

        std_sig = u'|'.join(
            (std_info['city_e'], std_info['region_e'], std_info['country_e']))

        # Fix: the original tested the literal 'std_sig' (never a key), so an
        # existing canonical record was overwritten on every call, and it
        # stored the last loop item rather than the chosen candidate.
        canonical = city_std.setdefault(std_sig, {})
        if 'std_info' not in canonical:
            canonical['std_info'] = std_info
            canonical['geo_info'] = candidate_raw
        if sig != std_sig:
            city_std[sig] = {'std_sig': std_sig}
        common.dump(u'%s => %s' % (sig, std_sig))
        return True

    def mark_geocoded(status, key):
        """Set is_geocoded=status on every store row matching ``key``."""
        # NOTE(review): the statement is assembled by string formatting with
        # only single quotes escaped; switch to parameterized queries if the
        # StoresDb API supports them.
        args = [status]
        args.extend(part.replace("'", r"\'") for part in key)
        db.execute(tpl_geocoded % tuple(args))

    city_std = {}
    log_name = u'sense_cities.log'
    try:
        with open('data/city_std.dat', 'r') as f:
            # Standardization mapping of the cities:
            # {'city|region|country': {'std_info': {'city': ..., 'region': ...,
            #  'country': ...}, 'geo_result': result}}
            city_std = json.loads(f.read())
    except (IOError, ValueError):
        # Fix: an empty or corrupt file no longer aborts the run.
        common.dump(u'Failed to load data/city_std.dat', log_name)

    db = common.StoresDb()
    db.connect_db(host='localhost', port=3306, user='******', passwd='123456', db='brand_stores')
    tpl_entity = "SELECT DISTINCT city_e, province_e, country_e FROM stores WHERE city_e>'%s' AND city_e<'%s' AND (is_geocoded<4 OR is_geocoded>7) ORDER BY city_e, province_e, country_e LIMIT 99999"
    # Alternative query that revisits only the previously failed rows:
    # tpl_entity = "SELECT DISTINCT city_e, province_e, country_e FROM stores WHERE city_e>'%s' AND city_e<'%s' AND is_geocoded=6 ORDER BY city_e, province_e, country_e LIMIT 99999"
    tpl_pos = "SELECT lat, lng, addr_e, idstores FROM stores WHERE city_e='%s' AND province_e='%s' AND country_e='%s' LIMIT 99999"
    tpl_geocoded = "UPDATE stores SET is_geocoded=%d WHERE city_e='%s' AND province_e='%s' AND country_e='%s'"

    statement = tpl_entity % (lower_bound, upper_bound)
    common.dump(
        u"Processing cities from '%s' to '%s'..." % (lower_bound, upper_bound),
        log_name)
    for item in db.query_all(statement):
        try:
            sig = u'|'.join(item[i] for i in xrange(3))
            if sig in city_std:
                common.dump(u'Geo item %s already processed.' % sig, log_name)
                mark_geocoded(7, (item[i] for i in xrange(3)))
                continue

            common.dump(u'Processing %s...' % sig, log_name)
            city_e, province_e, country_e = item
            geo_success = False
            statement = tpl_pos % tuple(
                tmp.replace("'", r"\'") for tmp in item)
            query_result = db.query_all(statement)

            # First attempt: use the stores' coordinates.
            latlng_list = []
            for lat, lng, addr, idstores in query_result:
                if not lat or not lng or lat == '' or lng == '':
                    continue
                latlng_list.append(tuple(map(string.atof, (lat, lng))))

            lat, lng = get_unique_latlng(latlng_list)
            # Fix: a coordinate of exactly 0.0 is valid, only None is not.
            if lat is not None and lng is not None:
                tmp = gs.geocode(latlng='%f,%f' % (lat, lng))
                if tmp:
                    geo_success = register_city(tmp)

            if geo_success:
                # Resolved from the coordinates.
                mark_geocoded(4, item)
            else:
                # Second attempt: use the textual addresses.
                for lat, lng, addr, idstores in query_result:
                    tmp = gs.geocode(u'%s,%s,%s' % (city_e, province_e, country_e))
                    if not tmp:
                        continue
                    geo_success = register_city(tmp)
                    if geo_success:
                        break
                    tmp = gs.geocode(addr)
                    if not tmp:
                        continue
                    geo_success = register_city(tmp)
                    if geo_success:
                        break
                # 5: resolved from an address, 6: could not be resolved.
                mark_geocoded(5 if geo_success else 6, item)

            # Persist the mapping after every processed triple.
            with open(u'data/city_std.dat', 'w') as f:
                f.write(json.dumps(city_std).encode('utf-8'))
        except Exception:
            # Best effort: log the failure and move on to the next triple.
            common.dump(traceback.format_exc(), log_name)

    common.dump(u'Done!', log_name)
def register_city(geocoded_info):
    """Record the standardized form of the current city in ``city_std``.

    The names ``sig``, ``city_e``, ``country_e``, ``city_std`` and
    ``log_name`` are not defined in this block; they must be resolvable from
    the enclosing or module scope when the function is called.

    :param geocoded_info: list of geocoding results, each providing
        ``administrative_info`` and ``formatted_address``.
    :return: True when a usable result was found and registered, else False.
    """
    # NOTE(review): the original indentation of this function was lost; the
    # nesting below is reconstructed from the syntax and the comments.

    # Administrative levels that may stand in for the city, most specific
    # first.
    city_keys = ('locality', 'sublocality', 'administrative_area_level_3',
                 'administrative_area_level_2')

    def pick_city(admin_info):
        """Return the first available city-like component, or None."""
        for key in city_keys:
            if key in admin_info:
                return admin_info[key]
        return None

    candidate_geo = None
    candidate_raw = None
    for geo_info in geocoded_info:
        admin_info = geo_info['administrative_info']
        if 'country' not in admin_info:
            common.dump(u'Country info does not exist: %s' % admin_info)
            continue

        city = pick_city(admin_info)
        if city is None:
            common.dump(u'City info does not exist: %s' % admin_info)
            continue

        tmp_geo = {'city_e': city, 'country_e': admin_info['country']}
        if 'administrative_area_level_1' in admin_info:
            tmp_geo['region_e'] = admin_info['administrative_area_level_1']
        else:
            tmp_geo['region_e'] = ''
        tmp_geo['formatted_address'] = geo_info['formatted_address']

        # The first structurally valid result is the fallback candidate.
        if not candidate_geo:
            candidate_geo = tmp_geo
            candidate_raw = geo_info

        # Consistency check: the country or the city must agree with the
        # stored values.
        # NOTE(review): nesting reconstructed from the original comment
        # ("country or city must match"): a result is rejected only when both
        # differ -- confirm against the upstream file.
        ret1 = gs.look_up(country_e, 1)
        ret2 = gs.look_up(admin_info['country'], 1)
        if (ret1['name_e'] if ret1 else country_e) != (
                ret2['name_e'] if ret2 else admin_info['country']):
            common.dump(u'Countries does not match.', log_name)
            ret3 = gs.look_up(city_e, 1)
            ret4 = gs.look_up(city, 1)
            if (ret3['name_e'] if ret3 else city_e) != (
                    ret4['name_e'] if ret4 else city):
                common.dump(u'Cities does not match.', log_name)
                continue

        # Reaching this point means geo_info passed the checks above.
        candidate_geo = tmp_geo
        candidate_raw = geo_info
        break

    if not candidate_geo:
        return False

    # Register the standardized city information.
    std_info = candidate_geo

    # Fetch the Chinese names.
    std_info['country_c'] = ''
    std_info['region_c'] = ''
    std_info['city_c'] = ''
    geocoded_info_zh = gs.geocode(addr=candidate_geo['formatted_address'],
                                  lang='zh')
    if geocoded_info_zh:
        admin_info_zh = geocoded_info_zh[0]['administrative_info']
        if 'country' in admin_info_zh:
            std_info['country_c'] = admin_info_zh['country']
        city_c = pick_city(admin_info_zh)
        if city_c is not None:
            std_info['city_c'] = city_c
        if 'administrative_area_level_1' in admin_info_zh:
            std_info['region_c'] = admin_info_zh['administrative_area_level_1']

    std_sig = u'|'.join(
        (std_info['city_e'], std_info['region_e'], std_info['country_e']))

    # Fix: the original tested the literal 'std_sig' (never a key), so an
    # existing canonical record was overwritten on every call, and it stored
    # the last loop item rather than the chosen candidate.
    canonical = city_std.setdefault(std_sig, {})
    if 'std_info' not in canonical:
        canonical['std_info'] = std_info
        canonical['geo_info'] = candidate_raw
    if sig != std_sig:
        city_std[sig] = {'std_sig': std_sig}
    common.dump(u'%s => %s' % (sig, std_sig))
    return True
def fetch_stores(data): url = data['url'] try: body = cm.get_data(url) except Exception: cm.dump('Error in fetching stores: %s' % url, log_name) return [] store_list = [] for m in re.finditer(ur'<div\s+class\s*=\s*"storeItem"', body): entry = cm.init_store_entry(data['brand_id'], data['brandname_e'], data['brandname_c']) sub = cm.extract_closure(body[m.end():], ur'<div\b', ur'</div>')[0] m1 = re.search(ur'<div class="bubbleInfo">(.+?)</div>', sub) if m1 is not None: entry[cm.addr_e] = cm.reformat_addr(m1.group(1)) m1 = re.search(ur'lat="(-?\d+\.\d+)"', sub) if m1 is not None: entry[cm.lat] = string.atof(m1.group(1)) m1 = re.search(ur'lng="(-?\d+\.\d+)"', sub) if m1 is not None: entry[cm.lng] = string.atof(m1.group(1)) m1 = re.search(ur'<span>\s*Tel:\s*([^<>]+)</span>', sub) if m1 is not None: entry[cm.tel] = m1.group(1).strip() m1 = re.search(ur'http://maps\.google\.com/maps\?q=([^&"]+)', sub) if m1 is None: continue ret = gs.geocode(latlng=m1.group(1)) if ret is None: tmp = [tmp1.strip() for tmp1 in entry[cm.addr_e].split(',')] if 'MAX' in tmp[0]: del tmp[0] if cm.extract_tel(tmp[-1])!='': del tmp[-1] if len(tmp) > 0: ret = gs.geocode(', '.join(tmp)) if ret is not None: city = '' province = '' country = '' zip_code = '' tmp = ret[0]['address_components'] for v in tmp: if 'locality' in v['types']: city = v['long_name'].strip().upper() elif 'administrative_area_level_1' in v['types']: province = v['long_name'].strip().upper() elif 'country' in v['types']: country = v['long_name'].strip().upper() elif 'postal_code' in v['types']: zip_code = v['long_name'].strip() entry[cm.country_e] = country entry[cm.province_e] = province entry[cm.city_e] = city entry[cm.zip_code] = zip_code gs.field_sense(entry) cm.dump('(%s / %d) Found store: %s, %s (%s, %s)' % (data['brandname_e'], data['brand_id'], entry[cm.name_e], entry[cm.addr_e], entry[cm.country_e], entry[cm.continent_e]), log_name) db.insert_record(entry, 'stores') store_list.append(entry) else: cm.dump('Error in fetching 
stores: latlng=%s, addr=%s' % (m1.group(1), entry[cm.addr_e]), log_name) continue
entry = fetch_contact_info(data, entry, s['id']) gs.field_sense(entry) ret = gs.addr_sense(entry[cm.addr_e]) if ret[0] is not None and entry[cm.country_e] == '': entry[cm.country_e] = ret[0] if ret[1] is not None and entry[cm.province_e] == '': entry[cm.province_e] = ret[1] if ret[2] is not None and entry[cm.city_e] == '': entry[cm.city_e] = ret[2] gs.field_sense(entry) if entry[cm.country_e] == '' or entry[cm.city_e] == '': ret = None if entry[cm.lat] != '' and entry[cm.lng] != '': ret = gs.geocode(latlng='%f,%f' % (entry[cm.lat], entry[cm.lng])) if ret is None: ret = gs.geocode(', '.join((entry[cm.addr_e], s['city']))) if ret is not None: city = '' province = '' country = '' zip_code = '' tmp = ret[0]['address_components'] for v in tmp: if 'locality' in v['types']: city = v['long_name'].strip().upper() elif 'administrative_area_level_1' in v['types']: province = v['long_name'].strip().upper() elif 'country' in v['types']:
entry = fetch_contact_info(data, entry, s["id"]) gs.field_sense(entry) ret = gs.addr_sense(entry[cm.addr_e]) if ret[0] is not None and entry[cm.country_e] == "": entry[cm.country_e] = ret[0] if ret[1] is not None and entry[cm.province_e] == "": entry[cm.province_e] = ret[1] if ret[2] is not None and entry[cm.city_e] == "": entry[cm.city_e] = ret[2] gs.field_sense(entry) if entry[cm.country_e] == "" or entry[cm.city_e] == "": ret = None if entry[cm.lat] != "" and entry[cm.lng] != "": ret = gs.geocode(latlng="%f,%f" % (entry[cm.lat], entry[cm.lng])) if ret is None: ret = gs.geocode(", ".join((entry[cm.addr_e], s["city"]))) if ret is not None: city = "" province = "" country = "" zip_code = "" tmp = ret[0]["address_components"] for v in tmp: if "locality" in v["types"]: city = v["long_name"].strip().upper() elif "administrative_area_level_1" in v["types"]: province = v["long_name"].strip().upper() elif "country" in v["types"]:
sub = cm.extract_closure(body[start:], ur'<div\b', ur'</div>')[0] m = re.search(ur'<div class="box-adress-store">(.+?)</div>', sub, re.S) if m is None: cm.dump('Error in fetching stores: %s' % url, log_name) return [] entry[cm.addr_e] = cm.reformat_addr(m.group(1)) m = re.search(ur'<h4>(.+?)</h4>', sub) if m is not None and 't:' in m.group(1).lower(): entry[cm.tel] = cm.extract_tel(m.group(1)) m = re.search(ur'<div class="box-open-store">(.+?)</div>', body, re.S) if m is not None: entry[cm.hours] = cm.reformat_addr(m.group(1)) ret = None if entry[cm.lat] != '' and entry[cm.lng] != '': ret = gs.geocode(latlng='%f,%f' % (entry[cm.lat], entry[cm.lng])) if ret is None: ret = gs.geocode(', '.join((entry[cm.addr_e], data['zone']))) if ret is not None: city = '' province = '' country = '' zip_code = '' tmp = ret[0]['address_components'] for v in tmp: if 'locality' in v['types']: city = v['long_name'].strip().upper() elif 'administrative_area_level_1' in v['types']: province = v['long_name'].strip().upper() elif 'country' in v['types']: country = v['long_name'].strip().upper()
gs.field_sense(entry) ret = gs.addr_sense(entry[cm.addr_e]) if ret[0] is not None and entry[cm.country_e] == "": entry[cm.country_e] = ret[0] if ret[1] is not None and entry[cm.province_e] == "": entry[cm.province_e] = ret[1] if ret[2] is not None and entry[cm.city_e] == "": entry[cm.city_e] = ret[2] gs.field_sense(entry) if entry[cm.country_e] == "" or entry[cm.city_e] == "": ret = None location_valid = True if entry[cm.lat] != "" and entry[cm.lng] != "": ret = gs.geocode(latlng="%f,%f" % (entry[cm.lat], entry[cm.lng])) if ret is None: location_valid = False ret = gs.geocode("%s, %s, %s" % (entry[cm.addr_e], entry[cm.city_e], entry[cm.country_e])) if ret is not None: city = "" province = "" country = "" zip_code = "" tmp = ret[0]["address_components"] for v in tmp: if "locality" in v["types"]: city = v["long_name"].strip().upper() elif "administrative_area_level_1" in v["types"]: province = v["long_name"].strip().upper()
entry[cm.lng] = string.atof(m1.group(2)) gs.field_sense(entry) ret = gs.addr_sense(entry[cm.addr_e]) if ret[0] is not None and entry[cm.country_e] == '': entry[cm.country_e] = ret[0] if ret[1] is not None and entry[cm.province_e] == '': entry[cm.province_e] = ret[1] if ret[2] is not None and entry[cm.city_e] == '': entry[cm.city_e] = ret[2] gs.field_sense(entry) if entry[cm.country_e] == '' or entry[cm.city_e] == '': ret = None if entry[cm.lat] != '' and entry[cm.lng] != '': ret = gs.geocode(latlng='%f,%f' % (entry[cm.lat], entry[cm.lng])) if ret is None: ret = gs.geocode(entry[cm.addr_e]) if ret is not None: city = '' province = '' country = '' zip_code = '' tmp = ret[0]['address_components'] for v in tmp: if 'locality' in v['types']: city = v['long_name'].strip().upper() elif 'administrative_area_level_1' in v['types']: province = v['long_name'].strip().upper() elif 'country' in v['types']:
entry[cm.city_e], entry[cm.city_c] = city_e, city_c entry[cm.name_e] = name entry[cm.addr_e] = name gs.field_sense(entry) ret = gs.addr_sense(entry[cm.addr_e]) if ret[0] is not None and entry[cm.country_e] == '': entry[cm.country_e] = ret[0] if ret[1] is not None and entry[cm.province_e] == '': entry[cm.province_e] = ret[1] if ret[2] is not None and entry[cm.city_e] == '': entry[cm.city_e] = ret[2] gs.field_sense(entry) if entry[cm.country_e] == '' or entry[cm.city_e] == '': ret = gs.geocode(', '.join( (entry[cm.name_e], entry[cm.city_c], entry[cm.country_c]))) if not ret: ret = gs.geocode(', '.join( (entry[cm.city_c], entry[cm.country_c]))) if ret: city = '' province = '' country = '' zip_code = '' tmp = ret[0]['address_components'] for v in tmp: if 'locality' in v['types']: city = v['long_name'].strip().upper() elif 'administrative_area_level_1' in v['types']: province = v['long_name'].strip().upper() elif 'country' in v['types']:
gs.field_sense(entry) ret = gs.addr_sense(entry[cm.addr_e]) if ret[0] is not None and entry[cm.country_e] == '': entry[cm.country_e] = ret[0] if ret[1] is not None and entry[cm.province_e] == '': entry[cm.province_e] = ret[1] if ret[2] is not None and entry[cm.city_e] == '': entry[cm.city_e] = ret[2] gs.field_sense(entry) if entry[cm.country_e] == '' or entry[cm.city_e] == '': ret = None location_valid = True if entry[cm.lat] != '' and entry[cm.lng] != '': ret = gs.geocode(latlng='%f,%f' % (entry[cm.lat], entry[cm.lng])) if ret is None: location_valid = False ret = gs.geocode(', '.join( (entry[cm.addr_e], entry[cm.country_e]))) if ret is not None: city = '' province = '' country = '' zip_code = '' tmp = ret[0]['address_components'] for v in tmp: if 'locality' in v['types']: city = v['long_name'].strip().upper() elif 'administrative_area_level_1' in v['types']:
gs.field_sense(entry) ret = gs.addr_sense(entry[cm.addr_e]) if ret[0] is not None and entry[cm.country_e] == '': entry[cm.country_e] = ret[0] if ret[1] is not None and entry[cm.province_e] == '': entry[cm.province_e] = ret[1] if ret[2] is not None and entry[cm.city_e] == '': entry[cm.city_e] = ret[2] gs.field_sense(entry) if entry[cm.country_e] == '' or entry[cm.city_e] == '': ret = None location_valid = True if entry[cm.lat] != '' and entry[cm.lng] != '': ret = gs.geocode(latlng='%f,%f' % (entry[cm.lat], entry[cm.lng])) if ret is None: location_valid = False ret = gs.geocode(', '.join((entry[cm.addr_e], entry[cm.country_e]))) if ret is not None: city = '' province = '' country = '' zip_code = '' tmp = ret[0]['address_components'] for v in tmp: if 'locality' in v['types']: city = v['long_name'].strip().upper() elif 'administrative_area_level_1' in v['types']: province = v['long_name'].strip().upper()
def fetch_stores(data): url = data['url'] try: body = cm.get_data(url) except Exception: cm.dump('Error in fetching stores: %s' % url, log_name) return [] m = re.search(ur'var\s+geoShops\s*=', body) if m is None: cm.dump('Error in fetching stores: %s' % url, log_name) return [] tmp = cm.extract_closure(body[m.end():], ur'\[', ur'\]')[0] raw = json.loads(re.sub(ur'(?<!")(city|address|lat|lng)(?!")', ur'"\1"', tmp)) store_list = [] for s in raw: entry = cm.init_store_entry(data['brand_id'], data['brandname_e'], data['brandname_c']) entry[cm.city_e] = s['city'].strip().upper() if s['lat'] is not None and s['lat'] != '': entry[cm.lat] = string.atof(s['lat']) if s['lng'] is not None and s['lng'] != '': entry[cm.lng] = string.atof(s['lng']) addr = cm.reformat_addr(s['address']) pat = re.compile(ur'ph[\.:](.*)$', re.I) m = re.search(pat, addr) if m is not None: entry[cm.tel] = m.group(1).strip() entry[cm.addr_e] = re.sub(pat, '', addr).strip() addr1 = re.sub(ur'[\u2e80-\u9fff]+', '', '%s, %s' % (addr, s['city'])).strip() ret = gs.geocode(addr1, '%f,%f' % (entry[cm.lat], entry[cm.lng])) if ret is None: ret = gs.geocode(addr1) if ret is None: ret = gs.geocode(latlng='%f,%f' % (entry[cm.lat], entry[cm.lng])) if ret is not None: city = '' province = '' country = '' zip_code = '' tmp = ret[0]['address_components'] for v in tmp: if 'locality' in v['types']: city = v['long_name'].strip().upper() elif 'administrative_area_level_1' in v['types']: province = v['long_name'].strip().upper() elif 'country' in v['types']: country = v['long_name'].strip().upper() elif 'postal_code' in v['types']: zip_code = v['long_name'].strip() entry[cm.country_e] = country entry[cm.province_e] = province entry[cm.city_e] = city entry[cm.zip_code] = zip_code else: ret = gs.addr_sense(addr1) if ret[0] is not None: entry[cm.country_e] = ret[0] if ret[1] is not None: entry[cm.province_e] = ret[1] if ret[2] is not None: entry[cm.city_e] = ret[2] gs.field_sense(entry) cm.dump('(%s / %d) Found store: %s, %s 
(%s, %s)' % (data['brandname_e'], data['brand_id'], entry[cm.name_e], entry[cm.addr_e], entry[cm.country_e], entry[cm.continent_e]), log_name) db.insert_record(entry, 'stores') store_list.append(entry) return store_list
gs.field_sense(entry) ret = gs.addr_sense(entry[cm.addr_e]) if ret[0] is not None and entry[cm.country_e] == '': entry[cm.country_e] = ret[0] if ret[1] is not None and entry[cm.province_e] == '': entry[cm.province_e] = ret[1] if ret[2] is not None and entry[cm.city_e] == '': entry[cm.city_e] = ret[2] gs.field_sense(entry) if entry[cm.country_e] == '' or entry[cm.city_e] == '': ret = None location_valid = True if entry[cm.lat] != '' and entry[cm.lng] != '': ret = gs.geocode(latlng='%f,%f' % (entry[cm.lat], entry[cm.lng])) if ret is None: location_valid = False ret = gs.geocode('%s, %s, %s' % (entry[cm.addr_e], entry[cm.city_e], entry[cm.country_e])) if ret is not None: city = '' province = '' country = '' zip_code = '' tmp = ret[0]['address_components'] for v in tmp: if 'locality' in v['types']: city = v['long_name'].strip().upper()
def fetch_stores(data): url = data['url'] try: body = cm.get_data(url) except Exception: cm.dump('Error in fetching stores: %s' % url, log_name) return [] store_list = [] for m in re.finditer(ur'<div class="searchResult[^"]*"', body): if 'intro' in m.group(): continue sub = cm.extract_closure(body[m.start():], ur'<div\b', ur'</div>')[0] m1 = re.search(ur'<div id=[^<>]+>(.+?)</div>', sub) if m1 is None: continue entry = cm.init_store_entry(data['brand_id'], data['brandname_e'], data['brandname_c']) entry[cm.country_e] = data['country'] entry[cm.city_e] = data['city'] addr_list = [tmp.strip() for tmp in cm.reformat_addr(m1.group(1)).split(',')] tel = cm.extract_tel(addr_list[-1]) if tel != '': entry[cm.tel] = tel del addr_list[-1] else: m1 = re.search(ur'Tel:([^<>]+)', sub) if m1 is not None: entry[cm.tel] = cm.extract_tel(m1.group(1)) entry[cm.addr_e] = ', '.join(addr_list) m1 = re.search(ur"show_map\('(-?\d+\.\d+)'\s*,\s*'(-?\d+\.\d+)'", sub) if m1 is not None: entry[cm.lat] = string.atof(m1.group(1)) entry[cm.lng] = string.atof(m1.group(2)) start = sub.find(ur'Opening hours:') if start != -1: entry[cm.hours] = cm.extract_closure(sub[start:], ur'<p>', ur'</p>')[0].strip() ret = None if entry[cm.lat]!='' and entry[cm.lng]!='': ret = gs.geocode(latlng='%f,%f' % (entry[cm.lat], entry[cm.lng])) if ret is None: tmp = [tmp1.strip() for tmp1 in entry[cm.addr_e].split(',')] if 'Max Mara' in tmp[0]: del tmp[0] if len(tmp) > 0: ret = gs.geocode(', '.join(tmp)) if ret is not None: city = '' province = '' country = '' zip_code = '' tmp = ret[0]['address_components'] for v in tmp: if 'locality' in v['types']: city = v['long_name'].strip().upper() elif 'administrative_area_level_1' in v['types']: province = v['long_name'].strip().upper() elif 'country' in v['types']: country = v['long_name'].strip().upper() elif 'postal_code' in v['types']: zip_code = v['long_name'].strip() entry[cm.country_e] = country entry[cm.province_e] = province entry[cm.city_e] = city entry[cm.zip_code] = 
zip_code gs.field_sense(entry) cm.dump('(%s / %d) Found store: %s, %s (%s, %s)' % (data['brandname_e'], data['brand_id'], entry[cm.name_e], entry[cm.addr_e], entry[cm.country_e], entry[cm.continent_e]), log_name) db.insert_record(entry, 'stores') store_list.append(entry)