def fetch_cities(data):
    """Collect the stores of every expanded city item found in ``data['html']``.

    Each ``<li class="expanded">`` header supplies a city title. The title is
    upper-cased and split on single spaces; when the last term is Chinese it
    is stored as ``city_c`` and the remaining terms form ``city_e``. When
    ``country_e`` is ``'USA'`` and the title looks like ``XX - NAME``, the
    two-letter prefix goes to ``province_e`` and the rest to ``city_e``.

    :param data: dict read for ``'html'`` and ``'country_e'``. It is copied
        for every city, so the caller's dict is not modified here.
    :return: concatenation of the lists returned by ``fetch_stores`` for
        each city.
    """
    # Compiled once; the pattern text is unchanged from the original literals.
    header_pat = re.compile(
        u'<li class="expanded"><a href=".*?">(.+?)</a><br\\s*?/>')
    state_pat = re.compile(u'([A-Z]{2})\\s*-\\s*(.+)')

    html = data['html']
    store_list = []
    while True:
        m = header_pat.search(html)
        if m is None:
            break
        html = html[m.start():]
        sub, start, end = cm.extract_closure(html, u'<li\\b', '</li>')
        if end == 0:
            # With end == 0 the slice html[end:] would leave the text
            # unchanged and the same header would match again forever.
            # Step over this header instead and keep scanning.
            html = html[len(m.group(0)):]
            continue
        html = html[end:]

        d = data.copy()
        # Keep only the item's inner markup: drop the matched header at the
        # front and the closing tag at the back.
        d['html'] = sub[len(m.group(0)):-len('</li>')]

        terms = m.group(1).strip().upper().split(' ')
        if len(terms) > 1 and cm.is_chinese(terms[-1]):
            d['city_c'] = terms[-1].strip()
            terms = terms[:-1]
        d['city_e'] = ' '.join(terms)

        if d['country_e'] == 'USA':
            m1 = state_pat.search(d['city_e'])
            if m1:
                d['city_e'] = m1.group(2).strip()
                d['province_e'] = m1.group(1).strip()

        print('Processing %s' % d['city_e'])
        store_list.extend(fetch_stores(d))
    return store_list
def fetch_cities(data):
    """Collect the stores of every expanded city item found in ``data['html']``.

    Each ``<li class="expanded">`` header supplies a city title. The title is
    upper-cased and split on single spaces; when the last term is Chinese it
    is stored as ``city_c`` and the remaining terms form ``city_e``. When
    ``country_e`` is ``'USA'`` and the title looks like ``XX - NAME``, the
    two-letter prefix goes to ``province_e`` and the rest to ``city_e``.

    :param data: dict read for ``'html'`` and ``'country_e'``. It is copied
        for every city, so the caller's dict is not modified here.
    :return: concatenation of the lists returned by ``fetch_stores`` for
        each city.
    """
    # Compiled once; the pattern text is unchanged from the original literals.
    header_pat = re.compile(
        u'<li class="expanded"><a href=".*?">(.+?)</a><br\\s*?/>')
    state_pat = re.compile(u'([A-Z]{2})\\s*-\\s*(.+)')

    html = data['html']
    store_list = []
    while True:
        m = header_pat.search(html)
        if m is None:
            break
        html = html[m.start():]
        sub, start, end = cm.extract_closure(html, u'<li\\b', '</li>')
        if end == 0:
            # With end == 0 the slice html[end:] would leave the text
            # unchanged and the same header would match again forever.
            # Step over this header instead and keep scanning.
            html = html[len(m.group(0)):]
            continue
        html = html[end:]

        d = data.copy()
        # Keep only the item's inner markup: drop the matched header at the
        # front and the closing tag at the back.
        d['html'] = sub[len(m.group(0)):-len('</li>')]

        terms = m.group(1).strip().upper().split(' ')
        if len(terms) > 1 and cm.is_chinese(terms[-1]):
            d['city_c'] = terms[-1].strip()
            terms = terms[:-1]
        d['city_e'] = ' '.join(terms)

        if d['country_e'] == 'USA':
            m1 = state_pat.search(d['city_e'])
            if m1:
                d['city_e'] = m1.group(2).strip()
                d['province_e'] = m1.group(1).strip()

        print('Processing %s' % d['city_e'])
        store_list.extend(fetch_stores(d))
    return store_list
def fetch_stores(data):
    """Fetch the store records of one country and insert them into ``stores``.

    The endpoint at ``data['post_url']`` is queried with the country id and
    retail type; the reply is decoded with ``unicode_escape`` and parsed as
    JSON. Every record becomes a store entry that is passed through
    ``gs.field_sense``, printed, and inserted with ``db.insert_record``.

    :param data: dict read for ``'post_url'``, ``'country_id'``,
        ``'retail_type'``, ``'brand_id'``, ``'brandname_e'`` and
        ``'brandname_c'``.
    :return: list of the inserted entries; an empty list when the request
        or the JSON parsing raises.
    """
    url = data['post_url']
    # Example request bodies:
    #   country_id=108&retail_city=&retail_type=retail
    #   country_id=99&retail_city=&retail_type=service
    try:
        payload = {
            'country_id': data['country_id'],
            'retail_city': '',
            'retail_type': data['retail_type'],
        }
        raw = cm.post_data(url, payload)
        js = json.loads(raw.decode('unicode_escape'))
    except Exception:
        print('Error occured in getting country list: %s' % url)
        cm.dump({
            'level': 1,
            'time': cm.format_time(),
            'data': {'url': url},
            'brand_id': data['brand_id'],
        })
        return []

    store_list = []
    for s in js:
        entry = cm.init_store_entry(
            data['brand_id'], data['brandname_e'], data['brandname_c'])

        # The name goes to the Chinese or the English slot by script.
        title = s['retail_name'].strip()
        name_key = cm.name_c if cm.is_chinese(title) else cm.name_e
        entry[name_key] = title

        entry[cm.addr_e] = s['retail_gmap'].strip()
        entry[cm.zip_code] = s['retail_zipcode'].strip()
        entry[cm.city_e] = s['retail_city'].strip().upper()

        # E-mail and website may be null in the reply.
        email = s['retail_email']
        if email is not None:
            entry[cm.email] = email.strip()
        website = s['retail_website']
        if website is not None:
            entry[cm.url] = website.strip()

        is_retail = data['retail_type'] == 'retail'
        entry[cm.store_class] = 'Retail' if is_retail else 'Service Center'

        entry[cm.country_e] = s['country_name'].strip().upper()
        entry[cm.continent_e] = s['continent_name'].strip().upper()
        gs.field_sense(entry)

        summary = (
            data['brandname_e'], data['brand_id'],
            entry[cm.name_e], entry[cm.addr_e],
            entry[cm.country_e], entry[cm.continent_e],
        )
        print('(%s / %d) Found store: %s, %s (%s, %s)' % summary)

        store_list.append(entry)
        db.insert_record(entry, 'stores')
    return store_list
def fetch_stores(data):
    """Fetch the store records of one country and insert them into ``stores``.

    The endpoint at ``data['post_url']`` is queried with the country id and
    retail type; the reply is decoded with ``unicode_escape`` and parsed as
    JSON. Every record becomes a store entry that is passed through
    ``gs.field_sense``, printed, and inserted with ``db.insert_record``.

    :param data: dict read for ``'post_url'``, ``'country_id'``,
        ``'retail_type'``, ``'brand_id'``, ``'brandname_e'`` and
        ``'brandname_c'``.
    :return: list of the inserted entries; an empty list when the request
        or the JSON parsing raises.
    """
    url = data['post_url']
    # Example request bodies:
    #   country_id=108&retail_city=&retail_type=retail
    #   country_id=99&retail_city=&retail_type=service
    try:
        payload = {
            'country_id': data['country_id'],
            'retail_city': '',
            'retail_type': data['retail_type'],
        }
        raw = cm.post_data(url, payload)
        js = json.loads(raw.decode('unicode_escape'))
    except Exception:
        print('Error occured in getting country list: %s' % url)
        cm.dump({
            'level': 1,
            'time': cm.format_time(),
            'data': {'url': url},
            'brand_id': data['brand_id'],
        })
        return []

    store_list = []
    for s in js:
        entry = cm.init_store_entry(
            data['brand_id'], data['brandname_e'], data['brandname_c'])

        # The name goes to the Chinese or the English slot by script.
        title = s['retail_name'].strip()
        name_key = cm.name_c if cm.is_chinese(title) else cm.name_e
        entry[name_key] = title

        entry[cm.addr_e] = s['retail_gmap'].strip()
        entry[cm.zip_code] = s['retail_zipcode'].strip()
        entry[cm.city_e] = s['retail_city'].strip().upper()

        # E-mail and website may be null in the reply.
        email = s['retail_email']
        if email is not None:
            entry[cm.email] = email.strip()
        website = s['retail_website']
        if website is not None:
            entry[cm.url] = website.strip()

        is_retail = data['retail_type'] == 'retail'
        entry[cm.store_class] = 'Retail' if is_retail else 'Service Center'

        entry[cm.country_e] = s['country_name'].strip().upper()
        entry[cm.continent_e] = s['continent_name'].strip().upper()
        gs.field_sense(entry)

        summary = (
            data['brandname_e'], data['brand_id'],
            entry[cm.name_e], entry[cm.addr_e],
            entry[cm.country_e], entry[cm.continent_e],
        )
        print('(%s / %d) Found store: %s, %s (%s, %s)' % summary)

        store_list.append(entry)
        db.insert_record(entry, 'stores')
    return store_list
def fetch_stores(data):
    """Download one city page and insert the stores embedded in its script.

    The page at ``data['url']`` is fetched. An "Object moved" page is
    followed by rewriting ``data['url']`` and delegating to
    ``fetch_countries``. Otherwise the JSON array that follows
    ``var data =`` is parsed and every element becomes a store entry.

    :param data: dict read for ``'url'``, ``'host'``, ``'brand_id'``,
        ``'brandname_e'``, ``'brandname_c'``, ``'city_e'``, ``'country_e'``
        and ``'province_e'``. ``data['url']`` is overwritten on a redirect.
    :return: list of the inserted entries (or whatever ``fetch_countries``
        returns on a redirect); an empty list when the download fails or
        no data array is found.
    """
    print('(%s/%d) Found city: %s' % (
        data['brandname_e'], data['brand_id'], data['city_e']))

    url = data['url']
    try:
        html = cm.get_data(url)
    except Exception:
        print('Error occured: %s' % url)
        cm.dump({
            'level': 0,
            'time': cm.format_time(),
            'data': {'url': url},
            'brand_id': data['brand_id'],
        })
        return []

    # Handle redirects.
    moved = re.search('<h2>Object moved to <a href="(.+?)">', html)
    if moved is not None:
        data['url'] = data['host'] + moved.group(1)
        return fetch_countries(data)

    marker = re.search(r'var\s+data\s*=\s*', html)
    if marker is None:
        return []
    sub, start, end = cm.extract_closure(html[marker.end():], r'\[', r'\]')
    if end == 0:
        return []

    # Label prefixes stripped from the phone and fax fields.
    tel_prefix = re.compile(u'tel[\\.: ]*', re.I)
    fax_prefix = re.compile(u'fax[\\.: ]*', re.I)

    store_list = []
    for s in json.loads(sub):
        entry = cm.init_store_entry(
            data['brand_id'], data['brandname_e'], data['brandname_c'])

        title = s['Name']
        name_key = cm.name_c if cm.is_chinese(title) else cm.name_e
        entry[name_key] = title

        entry[cm.addr_e] = cm.html2plain(s['Street'])
        entry[cm.city_e] = cm.extract_city(data['city_e'])[0]
        entry[cm.country_e] = data['country_e']
        entry[cm.province_e] = data['province_e']
        entry[cm.tel] = tel_prefix.sub('', s['Phone']).strip()
        entry[cm.fax] = fax_prefix.sub('', s['Fax']).strip()
        entry[cm.email] = s['Email'].strip()
        entry[cm.url] = s['Website'].strip()

        coord = s['LatLng']
        if coord is not None and len(coord) >= 2:
            lat, lng = coord[0], coord[1]
            if lat is not None:
                entry[cm.lat] = string.atof(lat)
            if lng is not None:
                entry[cm.lng] = string.atof(lng)

        gs.field_sense(entry)
        # Fill province/city from the address only where still empty.
        ret = gs.addr_sense(entry[cm.addr_e], entry[cm.country_e])
        if ret[1] is not None and entry[cm.province_e] == '':
            entry[cm.province_e] = ret[1]
        if ret[2] is not None and entry[cm.city_e] == '':
            entry[cm.city_e] = ret[2]
        gs.field_sense(entry)

        summary = (
            data['brandname_e'], data['brand_id'],
            entry[cm.name_e], entry[cm.addr_e],
            entry[cm.country_e], entry[cm.continent_e],
        )
        print('(%s/%d) Found store: %s, %s (%s, %s)' % summary)

        store_list.append(entry)
        db.insert_record(entry, 'stores')
    return store_list
tmp = cm.reformat_addr(m) terms = tmp.split(',') ret = gs.look_up(terms[-1], 1) if ret is not None: # t2 = cm.geo_translate(terms[-1]) # if len(t2) != 0: # 这是一个国家 # 把最后的国家项分离出来 street_addr = ', '.join(terms[:-1]) entry[cm.addr_e] = cm.reformat_addr(street_addr) entry[cm.country_c] = ret['name_c'] entry[cm.country_e] = ret['name_e'] entry[cm.continent_c] = ret['continent']['name_c'] entry[cm.continent_e] = ret['continent']['name_e'] else: if cm.is_chinese(tmp): entry[cm.addr_c] = tmp else: entry[cm.addr_e] = tmp else: street_addr = ', '.join([street_addr, zip_code, city]) entry[cm.addr_e] = cm.reformat_addr(street_addr) ret = gs.look_up(country, 1) if ret is None: # t2 = cm.geo_translate(country) # if len(t2) == 0: entry[cm.country_c] = country else: entry[cm.country_c] = ret['name_c'] entry[cm.country_e] = ret['name_e'] entry[cm.continent_c] = ret['continent']['name_c']
def fetch(level=1, data=None, user='******', passwd=''):
    """Download all stores of the brand and write them to the ``stores`` table.

    Rows for the module-level ``brand_id`` are deleted first. The store map
    is then requested from ``url_init``; while a response carries a truthy
    ``'more'`` marker, further pages are requested from ``url_more`` with
    ``offset`` set to the number of stores seen so far.

    :param level: not used by this function.
    :param data: ignored; it is overwritten with the fixed query below.
    :param user: database user passed to ``connect_db``.
    :param passwd: database password passed to ``connect_db``.
    :return: list of the inserted store entries; an empty list when the
        initial request fails.
    """
    db = common.StoresDb()
    db.connect_db(user=user, passwd=passwd)
    # try/finally releases the connection on the early error return and on
    # any exception raised while paging, not only on the success path.
    try:
        db.execute(u'DELETE FROM %s WHERE brand_id=%d' % ('stores', brand_id))

        # Presumably a near-global bounding box plus filter flags; confirm
        # against the remote API.
        data = {
            's': -89, 'w': -179, 'n': 89, 'e': 179,
            'chinese': 0, 'repair': 1, 'store': 1,
        }
        try:
            html = common.get_data(url_init, data)
        except Exception:
            print('Error occured in getting the list of countries: %s'
                  % url_init)
            dump_data = {
                'level': 1,
                'time': common.format_time(),
                'data': {'data': url_init},
                'brand_id': brand_id,
            }
            common.dump(dump_data)
            return []

        store_list = []
        store_map = json.loads(html)
        tot = 0
        while True:
            # 'lists' arrives either as a dict of id -> store or as a list
            # of stores; normalise to a dict and pick up the 'more' marker.
            tmp = store_map['lists']
            flag = False
            if isinstance(tmp, dict):
                raw_stores = tmp
                for k in tmp:
                    if 'more' in tmp[k]:
                        flag = tmp[k]['more']
                        break
            else:
                raw_stores = {}
                for item in tmp:
                    if 'more' in item:
                        flag = item['more']
                    else:
                        raw_stores[item['nid']] = item

            # Turn each raw record into a store entry.
            for k in raw_stores:
                s = raw_stores[k]
                if 'more' in s:
                    flag = s['more']
                    continue

                entry = common.init_store_entry(
                    brand_id, brandname_e, brandname_c)

                if s['country'] is not None:
                    country_c = s['country'].strip().upper()
                    ret = gs.look_up(country_c, 1)
                    if ret is not None:
                        entry[common.country_e] = ret['name_e']
                        entry[common.country_c] = ret['name_c']
                    elif common.is_chinese(country_c):
                        entry[common.country_c] = country_c
                    else:
                        entry[common.country_e] = country_c

                if s['address'] is not None:
                    addr = common.reformat_addr(s['address'])
                    if common.is_chinese(addr):
                        entry[common.addr_c] = addr
                    else:
                        entry[common.addr_e] = addr

                city = s['city']
                if city is not None:
                    city = city.strip().upper()
                    ret = gs.look_up(city, 3)
                    if ret is not None:
                        entry[common.city_c] = ret['name_c']
                        entry[common.city_e] = ret['name_e']
                    elif common.is_chinese(city):
                        entry[common.city_c] = city
                    else:
                        entry[common.city_e] = city
                    entry[common.city_e] = common.extract_city(
                        entry[common.city_e])[0]

                # Fields copied verbatim when present.
                for src, dst in (('email', common.email),
                                 ('fax', common.fax),
                                 ('phone', common.tel),
                                 ('postal_code', common.zip_code),
                                 ('operating_hours', common.hours)):
                    if s[src] is not None:
                        entry[dst] = s[src]

                # float() is what the deprecated string.atof delegates to.
                if s['latitude'] is not None:
                    entry[common.lat] = float(s['latitude'])
                if s['longitude'] is not None:
                    entry[common.lng] = float(s['longitude'])

                if s['title'] is not None:
                    name = s['title']
                    if common.is_chinese(name):
                        entry[common.name_c] = name
                    else:
                        entry[common.name_e] = name

                if s['url'] is not None:
                    entry[common.url] = host + s['url']

                gs.field_sense(entry)
                print('%s: Found store: %s, %s (%s, %s)' % (
                    brandname_e, entry[common.name_e], entry[common.addr_e],
                    entry[common.country_e], entry[common.continent_e]))
                db.insert_record(entry, 'stores')
                store_list.append(entry)

            if not flag:
                tot += len(store_map['lists'])
                break
            # One element of 'lists' is the 'more' marker, not a store.
            tot += len(store_map['lists']) - 1
            data['offset'] = tot
            store_map = json.loads(common.get_data(url_more, data))

        print('Found a total of %d stores.' % tot)
        return store_list
    finally:
        db.disconnect_db()
def fetch(level=1, data=None, user='******', passwd=''):
    """Download all stores of the brand and write them to the ``stores`` table.

    Rows for the module-level ``brand_id`` are deleted first. The store map
    is then requested from ``url_init``; while a response carries a truthy
    ``'more'`` marker, further pages are requested from ``url_more`` with
    ``offset`` set to the number of stores seen so far.

    :param level: not used by this function.
    :param data: ignored; it is overwritten with the fixed query below.
    :param user: database user passed to ``connect_db``.
    :param passwd: database password passed to ``connect_db``.
    :return: list of the inserted store entries; an empty list when the
        initial request fails.
    """
    db = common.StoresDb()
    db.connect_db(user=user, passwd=passwd)
    # try/finally releases the connection on the early error return and on
    # any exception raised while paging, not only on the success path.
    try:
        db.execute(u'DELETE FROM %s WHERE brand_id=%d' % ('stores', brand_id))

        # Presumably a near-global bounding box plus filter flags; confirm
        # against the remote API.
        data = {
            's': -89, 'w': -179, 'n': 89, 'e': 179,
            'chinese': 0, 'repair': 1, 'store': 1,
        }
        try:
            html = common.get_data(url_init, data)
        except Exception:
            print('Error occured in getting the list of countries: %s'
                  % url_init)
            dump_data = {
                'level': 1,
                'time': common.format_time(),
                'data': {'data': url_init},
                'brand_id': brand_id,
            }
            common.dump(dump_data)
            return []

        store_list = []
        store_map = json.loads(html)
        tot = 0
        while True:
            # 'lists' arrives either as a dict of id -> store or as a list
            # of stores; normalise to a dict and pick up the 'more' marker.
            tmp = store_map['lists']
            flag = False
            if isinstance(tmp, dict):
                raw_stores = tmp
                for k in tmp:
                    if 'more' in tmp[k]:
                        flag = tmp[k]['more']
                        break
            else:
                raw_stores = {}
                for item in tmp:
                    if 'more' in item:
                        flag = item['more']
                    else:
                        raw_stores[item['nid']] = item

            # Turn each raw record into a store entry.
            for k in raw_stores:
                s = raw_stores[k]
                if 'more' in s:
                    flag = s['more']
                    continue

                entry = common.init_store_entry(
                    brand_id, brandname_e, brandname_c)

                if s['country'] is not None:
                    country_c = s['country'].strip().upper()
                    ret = gs.look_up(country_c, 1)
                    if ret is not None:
                        entry[common.country_e] = ret['name_e']
                        entry[common.country_c] = ret['name_c']
                    elif common.is_chinese(country_c):
                        entry[common.country_c] = country_c
                    else:
                        entry[common.country_e] = country_c

                if s['address'] is not None:
                    addr = common.reformat_addr(s['address'])
                    if common.is_chinese(addr):
                        entry[common.addr_c] = addr
                    else:
                        entry[common.addr_e] = addr

                city = s['city']
                if city is not None:
                    city = city.strip().upper()
                    ret = gs.look_up(city, 3)
                    if ret is not None:
                        entry[common.city_c] = ret['name_c']
                        entry[common.city_e] = ret['name_e']
                    elif common.is_chinese(city):
                        entry[common.city_c] = city
                    else:
                        entry[common.city_e] = city
                    entry[common.city_e] = common.extract_city(
                        entry[common.city_e])[0]

                # Fields copied verbatim when present.
                for src, dst in (('email', common.email),
                                 ('fax', common.fax),
                                 ('phone', common.tel),
                                 ('postal_code', common.zip_code),
                                 ('operating_hours', common.hours)):
                    if s[src] is not None:
                        entry[dst] = s[src]

                # float() is what the deprecated string.atof delegates to.
                if s['latitude'] is not None:
                    entry[common.lat] = float(s['latitude'])
                if s['longitude'] is not None:
                    entry[common.lng] = float(s['longitude'])

                if s['title'] is not None:
                    name = s['title']
                    if common.is_chinese(name):
                        entry[common.name_c] = name
                    else:
                        entry[common.name_e] = name

                if s['url'] is not None:
                    entry[common.url] = host + s['url']

                gs.field_sense(entry)
                print('%s: Found store: %s, %s (%s, %s)' % (
                    brandname_e, entry[common.name_e], entry[common.addr_e],
                    entry[common.country_e], entry[common.continent_e]))
                db.insert_record(entry, 'stores')
                store_list.append(entry)

            if not flag:
                tot += len(store_map['lists'])
                break
            # One element of 'lists' is the 'more' marker, not a store.
            tot += len(store_map['lists']) - 1
            data['offset'] = tot
            store_map = json.loads(common.get_data(url_more, data))

        print('Found a total of %d stores.' % tot)
        return store_list
    finally:
        db.disconnect_db()
def fetch_stores(data):
    """Download one city page and insert the stores embedded in its script.

    The page at ``data['url']`` is fetched. An "Object moved" page is
    followed by rewriting ``data['url']`` and delegating to
    ``fetch_countries``. Otherwise the JSON array that follows
    ``var data =`` is parsed and every element becomes a store entry.

    :param data: dict read for ``'url'``, ``'host'``, ``'brand_id'``,
        ``'brandname_e'``, ``'brandname_c'``, ``'city_e'``, ``'country_e'``
        and ``'province_e'``. ``data['url']`` is overwritten on a redirect.
    :return: list of the inserted entries (or whatever ``fetch_countries``
        returns on a redirect); an empty list when the download fails or
        no data array is found.
    """
    print('(%s/%d) Found city: %s' % (
        data['brandname_e'], data['brand_id'], data['city_e']))

    url = data['url']
    try:
        html = cm.get_data(url)
    except Exception:
        print('Error occured: %s' % url)
        cm.dump({
            'level': 0,
            'time': cm.format_time(),
            'data': {'url': url},
            'brand_id': data['brand_id'],
        })
        return []

    # Handle redirects.
    moved = re.search('<h2>Object moved to <a href="(.+?)">', html)
    if moved is not None:
        data['url'] = data['host'] + moved.group(1)
        return fetch_countries(data)

    marker = re.search(r'var\s+data\s*=\s*', html)
    if marker is None:
        return []
    sub, start, end = cm.extract_closure(html[marker.end():], r'\[', r'\]')
    if end == 0:
        return []

    # Label prefixes stripped from the phone and fax fields.
    tel_prefix = re.compile(u'tel[\\.: ]*', re.I)
    fax_prefix = re.compile(u'fax[\\.: ]*', re.I)

    store_list = []
    for s in json.loads(sub):
        entry = cm.init_store_entry(
            data['brand_id'], data['brandname_e'], data['brandname_c'])

        title = s['Name']
        name_key = cm.name_c if cm.is_chinese(title) else cm.name_e
        entry[name_key] = title

        entry[cm.addr_e] = cm.html2plain(s['Street'])
        entry[cm.city_e] = cm.extract_city(data['city_e'])[0]
        entry[cm.country_e] = data['country_e']
        entry[cm.province_e] = data['province_e']
        entry[cm.tel] = tel_prefix.sub('', s['Phone']).strip()
        entry[cm.fax] = fax_prefix.sub('', s['Fax']).strip()
        entry[cm.email] = s['Email'].strip()
        entry[cm.url] = s['Website'].strip()

        coord = s['LatLng']
        if coord is not None and len(coord) >= 2:
            lat, lng = coord[0], coord[1]
            if lat is not None:
                entry[cm.lat] = string.atof(lat)
            if lng is not None:
                entry[cm.lng] = string.atof(lng)

        gs.field_sense(entry)
        # Fill province/city from the address only where still empty.
        ret = gs.addr_sense(entry[cm.addr_e], entry[cm.country_e])
        if ret[1] is not None and entry[cm.province_e] == '':
            entry[cm.province_e] = ret[1]
        if ret[2] is not None and entry[cm.city_e] == '':
            entry[cm.city_e] = ret[2]
        gs.field_sense(entry)

        summary = (
            data['brandname_e'], data['brand_id'],
            entry[cm.name_e], entry[cm.addr_e],
            entry[cm.country_e], entry[cm.continent_e],
        )
        print('(%s/%d) Found store: %s, %s (%s, %s)' % summary)

        store_list.append(entry)
        db.insert_record(entry, 'stores')
    return store_list