def fetch_stores(data): url = data['url'] try: body = cm.get_data(url) except Exception: cm.dump('Error in fetching stores: %s' % url, log_name) return [] store_list = [] for m in re.finditer(ur'<item id="\d+">', body): sub = cm.extract_closure(body[m.start():], ur'<item\b', ur'</item>')[0] entry = cm.init_store_entry(data['brand_id'], data['brandname_e'], data['brandname_c']) m1 = re.search(ur'<country>([^<>]+)</country>', sub) if m1 is not None: tmp = m1.group(1).split('/') for v in tmp: ret = gs.look_up(v.strip().upper(), 1) if ret is not None: entry[cm.country_e] = ret['name_e'] break m1 = re.search(ur'<city>([^<>]+)</city>', sub) if m1 is not None: val = cm.reformat_addr(m1.group(1)) if entry[cm.country_e] == 'UNITED STATES': tmp_list = tuple(tmp.strip() for tmp in cm.reformat_addr(val).strip(',')) if len(tmp_list) == 2: if re.search('[A-Z]{2}', tmp_list[1]): entry[cm.province_e] = tmp_list[1] entry[cm.city_e] = cm.extract_city(m1.group(1))[0] m1 = re.search(ur'<brands>([^<>]+)</brands>', sub) if m1 is not None: tmp = m1.group(1).split('/') brand_list = [] for v in tmp: if v.strip() != '': brand_list.append(v) entry[cm.store_type] = ', '.join(brand_map[key] for key in brand_list) m1 = re.search(ur'<name>([^<>]+)</name>', sub) if m1 is not None: entry[cm.name_e] = m1.group(1).strip() m1 = re.search(ur'<address>([^<>]+)</address>', sub) if m1 is not None: entry[cm.addr_e] = cm.reformat_addr(m1.group(1)) m1 = re.search(ur'<tel>([^<>]+)</tel>', sub) if m1 is not None: entry[cm.tel] = m1.group(1).strip() m1 = re.search(ur'sll=(-?\d+\.\d+),(-?\d+\.\d+)', sub) if m1 is not None: entry[cm.lat] = string.atof(m1.group(1)) entry[cm.lng] = string.atof(m1.group(2)) gs.field_sense(entry) ret = gs.addr_sense(entry[cm.addr_e], entry[cm.country_e]) if ret[1] is not None: entry[cm.province_e] = ret[1] gs.field_sense(entry) cm.dump('(%s / %d) Found store: %s, %s (%s, %s)' % (data['brandname_e'], data['brand_id'], entry[cm.name_e], entry[cm.addr_e], entry[cm.country_e], 
entry[cm.continent_e]), log_name) db.insert_record(entry, 'stores') store_list.append(entry)
def get_detailed_store(html, store_cat): store_list = [] start = 0 while True: sub_html, start, end = common.extract_closure(html, ur"<li\b", ur"</li>") if end == 0: break # 得到单个门店的页面代码 html = html[end:] entry = common.init_store_entry(brand_id, brandname_e, brandname_c) m = re.findall(ur'<div class="store-title -h3a">(.+?)</div>', sub_html) if len(m) > 0: entry[common.name_e] = common.reformat_addr(m[0]) m = re.findall(ur'<div class="store-address">(.+?)</div>', sub_html, re.S) if len(m) > 0: addr = common.reformat_addr(m[0]) # 最后一行是否为电话号码? terms = addr.split(", ") tel = common.extract_tel(terms[-1]) if tel != "": addr = ", ".join(terms[:-1]) entry[common.tel] = tel entry[common.addr_e] = addr # 获得门店类型 # store_type = [store_cat] type_html, type_start, type_end = common.extract_closure(sub_html, ur'<ul class="service-list">', ur"</ul>") if type_end != 0: store_type = [m for m in re.findall(ur'<li class="service-item">(.+?)</li>', type_html)] store_type.insert(0, store_cat) entry[common.store_type] = ", ".join(store_type) else: entry[common.store_type] = store_cat # 获得经纬度 m = re.findall(ur'data-latitude="(-?\d+\.\d+)"', sub_html) if len(m) > 0: entry[common.lat] = string.atof(m[0]) m = re.findall(ur'data-longitude="(-?\d+\.\d+)"', sub_html) if len(m) > 0: entry[common.lng] = string.atof(m[0]) entry[common.city_e] = common.extract_city(data[common.city_e])[0] entry[common.country_e] = common.reformat_addr(data[common.country_e]).strip().upper() gs.field_sense(entry) print "%s: Found store: %s, %s (%s, %s, %s)" % ( brandname_e, entry[common.name_e], entry[common.addr_e], entry[common.city_e], entry[common.country_e], entry[common.continent_e], ) db.insert_record(entry, "stores") store_list.append(entry) return store_list
def fetch_stores(data): param = {'action': 'getStoresFromAjax', 'country': data['country_code'], 'region': data['city'], 'collection': ''} url = data['url'] try: body = cm.post_data(url, param) except Exception: cm.dump('Error in fetching stores: %s, %s' % (url, param), log_name) return [] store_list = [] for m1 in re.finditer(ur'<div class="shop-type-container">', body): sub = cm.extract_closure(body[m1.start():], ur'<div\b', ur'</div>')[0] store_class = '' m2 = re.search(ur'<div class="shop-type-title">(.+?)</div>', sub, re.S) if m2 is not None: store_class = cm.reformat_addr(m2.group(1)) for m2 in re.finditer(ur'<div class="shop"', sub): store_sub = cm.extract_closure(sub[m2.start():], ur'<div\b', ur'</div>')[0] entry = cm.init_store_entry(data['brand_id'], data['brandname_e'], data['brandname_c']) entry[cm.store_class] = store_class entry[cm.country_e] = data['country_code'] entry[cm.city_e] = cm.extract_city(data['city'])[0] m3 = re.search(ur'loadStore\((\d+)\s*,\s*(-?\d+\.\d+)\s*,\s*(-?\d+\.\d+)\)', store_sub) if m3 is not None: data['store_id'] = string.atoi(m3.group(1)) entry[cm.lat] = string.atof(m3.group(2)) entry[cm.lng] = string.atof(m3.group(3)) entry[cm.store_type] = ', '.join(get_detail(data)) m3 = re.search(ur'<div class="shop-name shop-details shop-main-name">([^<>]+)</div>', store_sub) if m3 is not None: entry[cm.name_e] = m3.group(1).strip() addr_list = [] m3 = re.search(ur'<div class="shop-street shop-details">([^<>]+)</div>', store_sub) if m3 is not None: addr_list.append(cm.reformat_addr(m3.group(1))) m3 = re.search(ur'<div class="shop-city shop-details">([^<>]+)</div>', store_sub) if m3 is not None: tmp = cm.reformat_addr(m3.group(1)) m3 = re.search(ur'(\d{4,})', tmp) if m3 is not None: entry[cm.zip_code] = m3.group(1).strip() addr_list.append(tmp) entry[cm.addr_e] = ', '.join(addr_list) ret = gs.addr_sense(entry[cm.addr_e], entry[cm.country_e]) if ret[1] is not None and entry[cm.province_e] == '': entry[cm.province_e] = ret[1] 
gs.field_sense(entry) cm.dump('(%s / %d) Found store: %s, %s (%s, %s, %s)' % (data['brandname_e'], data['brand_id'], entry[cm.name_e], entry[cm.addr_e], entry[cm.city_e], entry[cm.country_e], entry[cm.continent_e]), log_name) db.insert_record(entry, 'stores') store_list.append(entry)
def get_store_details(data): url = data['url'] try: html = cm.get_data(url) except Exception: print 'Error occured: %s' % url dump_data = {'level': 1, 'time': cm.format_time(), 'data': {'url': url}, 'brand_id': brand_id} cm.dump(dump_data) return [] entry = cm.init_store_entry(brand_id, brandname_e, brandname_c) entry[cm.name_e] = data['name'] entry[cm.url] = data['url'] start = html.find(ur'<div class="storelocator-breadcrumbs">') if start == -1: return [] sub, start, end = cm.extract_closure(html[start:], ur'<ul>', ur'</ul>') if end == 0: return [] # 最后一个<li>...</li> m = re.findall(ur'<li>(.+?)</li>', sub, re.S) if len(m) > 0: entry[cm.addr_e] = cm.reformat_addr(m[-1]) # 经纬度 m = re.findall(ur'position: new google.maps.LatLng\((-?\d+\.\d+).*?(-?\d+\.\d+)\)', html) if len(m) > 0: cm.update_entry(entry, {cm.lat: string.atof(m[0][0]), cm.lng: string.atof(m[0][1])}) m = re.search(ur'<div class="contact right">(.+?)</div>', html, re.S) if m is not None: contact_sub = m.group(1) pat_tel = re.compile(ur'<p class="phone">(.+?)</p>') m1 = re.search(pat_tel, contact_sub) if m1: entry[cm.tel] = cm.extract_tel(m1.group(1)) contact_sub = re.sub(pat_tel, '', contact_sub) hours_list=[tmp.strip() for tmp in cm.reformat_addr(contact_sub).split(',')] if 'opening hours' in hours_list[0].lower(): del hours_list[0] entry[cm.hours] = ', '.join(hours_list) # Geo country = data['country'] city = data['city'] cm.update_entry(entry, {cm.country_e: country, cm.city_e: city}) gs.field_sense(entry) ret = gs.addr_sense(entry[cm.addr_e], entry[cm.country_e]) if ret[1] is not None and entry[cm.province_e] == '': entry[cm.province_e] = ret[1] if ret[2] is not None and entry[cm.city_e] == '': entry[cm.city_e] = ret[2] gs.field_sense(entry) print '(%s / %d) Found store: %s, %s (%s, %s)' % ( brandname_e, brand_id, entry[cm.name_e], entry[cm.addr_e], entry[cm.country_e], entry[cm.continent_e]) db.insert_record(entry, 'stores') return entry
def fetch_stores(data): url = data['url'] try: body = cm.get_data(url) except Exception: cm.dump('Error in fetching stores: %s' % url, log_name) return [] store_list = [] for m1 in re.finditer(ur'<lignecountry\s+titre\s*=\s*"([^"]+)"', body): country = m1.group(1).strip().upper() if country == 'U.S.A.': country = 'US' sub_country = cm.extract_closure(body[m1.start():], ur'<lignecountry\b', ur'</lignecountry>')[0] for m2 in re.finditer(ur'<lignecity\s+titre\s*=\s*"([^"]+)"', sub_country): city = m2.group(1).strip().upper() sub_city = cm.extract_closure(sub_country[m2.start():], ur'<lignecity\b', ur'</lignecity>')[0] m3 = re.search(ur'<!\[CDATA\[(.+?)\]\]>', sub_city, re.S) if m3 is None: continue sub_city = m3.group(1) store_subs = re.split(ur'<\s*h2\s*>\s*LANVIN BOUTIQUE\s*<\s*/h2\s*>', sub_city) for s in store_subs: if s.strip() == '': continue m4 = re.search(ur'<p>(.+?)</p>', s, re.S) if m4 is None: continue entry = cm.init_store_entry(data['brand_id'], data['brandname_e'], data['brandname_c']) entry[cm.country_e] = country entry[cm.city_e] = city s = m4.group(1) m4 = re.search(ur'(.+?)\n\s*\n', s, re.S) if m4 is not None: entry[cm.addr_e] = cm.reformat_addr(m4.group(1)) m4 = re.search(ur'Phone:(.+?)\n\s*\n', s, re.S) if m4 is not None: entry[cm.tel] = cm.reformat_addr(m4.group(1).strip()) m4 = re.search(ur'Boutique Hours:(.+?)\n\s*\n', s, re.S) if m4 is not None: entry[cm.hours] = cm.reformat_addr(m4.group(1).strip()) m4 = re.search(ur'Products available:(.+?)\n\s*\n', s, re.S) if m4 is not None: entry[cm.store_type] = m4.group(1).strip() m4 = re.search(ur'Email:\s*<a href="mailto:([^"]+)">', s) if m4 is not None: entry[cm.email] = m4.group(1).strip() gs.field_sense(entry) ret = gs.addr_sense(entry[cm.addr_e], entry[cm.country_e]) if ret[1] is not None and entry[cm.province_e] == '': entry[cm.province_e] = ret[1] gs.field_sense(entry) cm.dump('(%s / %d) Found store: %s, %s (%s, %s)' % (data['brandname_e'], data['brand_id'], entry[cm.name_e], entry[cm.addr_e], 
entry[cm.country_e], entry[cm.continent_e]), log_name) db.insert_record(entry, 'stores') store_list.append(entry)
def fetch_stores(data): url = data['url'] param = {'country_id': data['country_code'], 'city': '', 'label_id': '', 'lang': 'en'} try: body = cm.get_data(url, param) except Exception: cm.dump('Error in fetching stores: %s, %s' % (url, param), log_name) return [] start = body.find(ur'<stores>') if start == -1: cm.dump('Error in fetching stores: %s, %s' % (url, param), log_name) return [] body = cm.extract_closure(body[start:], ur'<stores>', ur'</stores>')[0] store_list=[] for m in re.findall(ur'<store\b[^<>]+>(.+?)</store>', body): entry = cm.init_store_entry(data['brand_id'], data['brandname_e'], data['brandname_c']) entry[cm.country_e] = data['country_code'] m1 = re.search(ur'<name>(.+?)</name>', m) if m1 is not None: entry[cm.name_e] = cm.reformat_addr(m1.group(1).strip()) m1 = re.search(ur'<address>(.+?)</address>', m) if m1 is not None: entry[cm.addr_e] = cm.reformat_addr(m1.group(1).strip()) m1 = re.search(ur'<city>(.+)</city>', m) if m1 is not None: entry[cm.city_e] = cm.extract_city(m1.group(1))[0] m1 = re.search(ur'<zip>(.+?)</zip>', m) if m1 is not None: entry[cm.zip_code] = m1.group(1).strip() m1 = re.search(ur'<tel>(.+?)</tel>', m) if m1 is not None: entry[cm.tel] = m1.group(1).strip() m1 = re.search(ur'<fax>(.+?)</fax>', m) if m1 is not None: entry[cm.fax] = m1.group(1).strip() m1 = re.search(ur'<email>(.+?)</email>', m) if m1 is not None: entry[cm.email] = m1.group(1).strip() m1 = re.search(ur'<link>(.+?)</link>', m) if m1 is not None: entry[cm.url] = m1.group(1).strip() gs.field_sense(entry) ret = gs.addr_sense(entry[cm.addr_e], entry[cm.country_e]) if ret[1] is not None and entry[cm.province_e] == '': entry[cm.province_e] = ret[1] if ret[2] is not None and entry[cm.city_e] == '': entry[cm.city_e] = ret[2] gs.field_sense(entry) cm.dump('(%s / %d) Found store: %s, %s (%s, %s)' % (data['brandname_e'], data['brand_id'], entry[cm.name_e], entry[cm.addr_e], entry[cm.country_e], entry[cm.continent_e]), log_name) db.insert_record(entry, 'stores') 
store_list.append(entry)
def fetch_stores(data): url = data['store_url'] param = {'store_country': data['country_code'], 'store_city': data['city_code']} try: body = cm.post_data(url, param) except Exception: cm.dump('Error in fetching stores: %s, %s' % (url, param), log_name) return [] store_list = [] for s in re.findall(ur'<marker\b([^<>]+)/\s*>', body): entry = cm.init_store_entry(data['brand_id'], data['brandname_e'], data['brandname_c']) m = re.search(ur'store_name="([^"]+)"', s) if m is not None: entry[cm.name_e] = cm.reformat_addr(m.group(1)) entry[cm.country_e] = data['country_code'] entry[cm.city_e] = data['city'] addr_list = [] for key in ['store_mall_name', 'store_address', 'store_zip_code']: m = re.search(ur'%s="([^"]+)"' % key, s) if m is not None: tmp = cm.reformat_addr(m.group(1)) if tmp != '': addr_list.append(tmp) entry[cm.addr_e] = ', '.join(addr_list) m = re.search(ur'store_zip_code="([^"]+)"', s) if m is not None: entry[cm.zip_code] = m.group(1).strip() m = re.search(ur'store_telephone="([^"]+)"', s) if m is not None: entry[cm.tel] = m.group(1).strip() m = re.search(ur'store_fax="([^"]+)"', s) if m is not None: entry[cm.fax] = m.group(1).strip() m = re.search(ur'store_email="([^"]+)"', s) if m is not None: entry[cm.email] = m.group(1).strip() m = re.search(ur'store_latitude="([^"]+)"', s) if m is not None: entry[cm.lat] = string.atof(m.group(1).strip()) m = re.search(ur'store_longitude="([^"]+)"', s) if m is not None: entry[cm.lng] = string.atof(m.group(1).strip()) gs.field_sense(entry) ret = gs.addr_sense(entry[cm.addr_e], entry[cm.country_e]) if ret[1] is not None and entry[cm.province_e] == '': entry[cm.province_e] = ret[1] gs.field_sense(entry) cm.dump('(%s / %d) Found store: %s, %s (%s, %s)' % (data['brandname_e'], data['brand_id'], entry[cm.name_e], entry[cm.addr_e], entry[cm.country_e], entry[cm.continent_e]), log_name) db.insert_record(entry, 'stores') store_list.append(entry)
def fetch_stores(data): """ 获得门店的详细信息 :rtype : [entries] :param data: """ try: html = cm.get_data(data['url']) except Exception: print 'Error occured: %s' % url dump_data = {'level': 1, 'time': cm.format_time(), 'data': {'url': url}, 'brand_id': brand_id} cm.dump(dump_data) return [] entries = [] start = html.find(u'<ul class="store-list">') if start == -1: return entries start += len(u'<ul class="store-list">') end = html.find(u'</ul>', start) html = html[start:end] for m1 in re.findall(ur'<li class="(.*?)">(.*?)</li>', html, re.S): store = cm.init_store_entry(brand_id, brandname_e, brandname_c) store[cm.store_type] = m1[0] sub_html = m1[1] m2 = re.findall(ur'<h3 class="store-name">(.*?)</h3>', sub_html) if len(m2) > 0: store[cm.name_e] = cm.reformat_addr(m2[0]) m2 = re.findall(ur'<p class="store-address">(.*?)</p>', sub_html, re.S) if len(m2) > 0: store[cm.addr_e] = cm.reformat_addr(m2[0]) cm.update_entry(store, {cm.continent_e: data[cm.continent_e].strip().upper(), cm.country_e: data[cm.country_e].strip().upper(), cm.city_e: data[cm.city_e].strip().upper()}) entry = store gs.field_sense(entry) ret = gs.addr_sense(entry[cm.addr_e]) if ret[0] is not None and entry[cm.country_e] == '': entry[cm.country_e] = ret[0] if ret[1] is not None and entry[cm.province_e] == '': entry[cm.province_e] = ret[1] if ret[2] is not None and entry[cm.city_e] == '': entry[cm.city_e] = ret[2] gs.field_sense(entry) entry[cm.city_e] = cm.extract_city(entry[cm.city_e])[0] print '%s: Found store: %s, %s (%s, %s)' % ( brandname_e, store[cm.name_e], store[cm.addr_e], store[cm.country_e], store[cm.continent_e]) db.insert_record(store, 'stores') entries.append(store)
def fetch_stores(data): entry = cm.init_store_entry(data['brand_id'], data['brandname_e'], data['brandname_c']) code = data['country_code'] if gs.look_up(code, 1) is None: entry[cm.country_e] = cm.html2plain(data['country']).strip().upper() else: entry[cm.country_e] = code entry[cm.name_e] = data['store_name'] entry[cm.city_e] = cm.extract_city(data['city'])[0] entry[cm.lat] = data['lat'] if data['lat'] is not None else '' entry[cm.lng] = data['lng'] if data['lng'] is not None else '' m = re.search(ur'data-boutique\s*=\s*"%s"' % data['store_id'], data['content']) sub = data['content'][m.end():] m1 = re.search(ur'<li class="isDistributeur[^<>]+>(.+?)</li>', sub) if m1 is not None: entry[cm.store_class] = cm.reformat_addr(m1.group(1)) m1 = re.search(ur'<li class="place-title[^<>]+>(.+?)</li>', sub, re.S) if m1 is not None: entry[cm.addr_e] = cm.reformat_addr(m1.group(1)) m1 = re.search(ur'<li class="contacts[^<>]+>(.+?)</li>', sub, re.S) if m1 is not None: m2 = re.search(ur'<a class="popupLaunch" href="([^"]+)"', m1.group(1)) if m2: entry = fetch_details(data, m2.group(1), entry) m2 = re.search(ur'<p>(.+?)</p>', m1.group(1), re.S) if m2: ct_list = tuple(tmp.strip() for tmp in cm.reformat_addr(m2.group(1)).split(',')) entry[cm.tel] = cm.extract_tel(ct_list[0]) if len(ct_list) > 1: entry[cm.email] = ct_list[1].strip() gs.field_sense(entry) ret = gs.addr_sense(entry[cm.addr_e], entry[cm.country_e]) if ret[1] is not None and entry[cm.province_e] == '': entry[cm.province_e] = ret[1] if ret[2] is not None and entry[cm.city_e] == '': entry[cm.city_e] = ret[2] gs.field_sense(entry) cm.dump('(%s / %d) Found store: %s, %s (%s, %s)' % (data['brandname_e'], data['brand_id'], entry[cm.name_e], entry[cm.addr_e], entry[cm.country_e], entry[cm.continent_e]), log_name) db.insert_record(entry, 'stores') return tuple(entry)
def fetch_details(data): url = data[cm.url] try: html = cm.get_data(url) except Exception: print 'Error occured: %s' % url dump_data = {'level': 0, 'time': cm.format_time(), 'data': {'url': url}, 'brand_id': data['brand_id']} cm.dump(dump_data) return [] entry = cm.init_store_entry(data['brand_id'], data['brandname_e'], data['brandname_c']) entry[cm.name_e] = data[cm.name_e] start = html.find(ur'<div class="field-address">') if start == -1: return [] sub, start, end = cm.extract_closure(html[start:], ur'<div\b', ur'</div>') if end == 0: return [] m1 = re.search(ur'<div class="locality">(.+?)</div>', sub) if m1 is not None: entry[cm.city_e] = cm.extract_city(m1.group(1))[0] m1 = re.search(ur'<div class="postal-code">(.+?)</div>', sub) if m1 is not None: entry[cm.zip_code] = m1.group(1).strip() entry[cm.country_e] = data[cm.country_e] pat = re.compile(ur'<[^<>]+?>', re.S) entry[cm.addr_e] = cm.reformat_addr(re.sub(pat, u'\r\n', sub)) m1 = re.search(ur'<div class="field-telephone"><a href=".+?" class="tel">(.+?)</a></div>', html) if m1 is not None: entry[cm.tel] = m1.group(1).strip() m1 = re.search(ur'<div class="field-opening-hours">\s*<p>(.+?)</p>\s*</div>', html, re.S) if m1 is not None: entry[cm.hours] = cm.reformat_addr(m1.group(1)) m1 = re.search(ur'"coordinates":\[(-?\d+\.\d{4,})\s*,\s*(-?\d+\.\d{4,})\]', html) if m1 is not None: lat = string.atof(m1.group(1)) lng = string.atof(m1.group(2)) cm.update_entry(entry, {cm.lat: lat, cm.lng: lng}) entry[cm.continent_e] = data[cm.continent_e] gs.field_sense(entry) print '(%s / %d) Found store: %s, %s (%s, %s)' % (data['brandname_e'], data['brand_id'], entry[cm.name_e], entry[cm.addr_e], entry[cm.country_e], entry[cm.continent_e]) db.insert_record(entry, 'stores') return [entry]
def get_store_details(data): url = data['url'] try: html = cm.post_data(url, {'country': data['country_id'], 'city': data['city_id'], 'recordid': data['store_id']}) except Exception: print 'Error occured: %s' % url dump_data = {'level': 1, 'time': cm.format_time(), 'data': {'url': url}, 'brand_id': brand_id} cm.dump(dump_data) return [] entry = cm.init_store_entry(brand_id, brandname_e, brandname_c) info = json.loads(html)['elements'] addr = cm.reformat_addr(info['address'].replace('\\', '').replace('<p>', ',').replace('</p>', ',')) # 第一行为商店名称 terms = addr.split(',') if len(terms) > 0: entry[cm.name_e] = cm.reformat_addr(terms[0]) entry[cm.addr_e] = addr gmap_url = info['gmap'] m = re.findall(ur'(-?\d+\.\d+),(-?\d+\.\d+)', gmap_url) if len(m) > 0: cm.update_entry(entry, {cm.lat: string.atof(m[0][0]), cm.lng: string.atof(m[0][1])}) entry[cm.url] = info['shareurl'].replace('\\', '') entry[cm.hours] = info['openingtimes'] entry[cm.comments] = info['other'] # Geo country = data['country'] city = data['city'] cm.update_entry(entry, {cm.country_e: country, cm.city_e: city}) entry[cm.city_e] = cm.extract_city(entry[cm.city_e])[0] gs.field_sense(entry) ret = gs.addr_sense(entry[cm.addr_e], entry[cm.country_e]) if ret[1] is not None and entry[cm.province_e] == '': entry[cm.province_e] = ret[1] if ret[2] is not None and entry[cm.city_e] == '': entry[cm.city_e] = ret[2] gs.field_sense(entry) print '(%s / %d) Found store: %s, %s (%s, %s)' % ( brandname_e, brand_id, entry[cm.name_e], entry[cm.addr_e], entry[cm.country_e], entry[cm.continent_e]) db.insert_record(entry, 'stores') return entry
def fetch_stores(data): url = data['home_url'] try: body = cm.get_data(url) except Exception: print 'Error occured: %s' % url dump_data = {'level': 0, 'time': cm.format_time(), 'data': {'url': url}, 'brand_id': data['brand_id']} cm.dump(dump_data) return [] raw = json.loads(body)['results'] store_list = [] for key in raw: store = raw[key] entry = cm.init_store_entry(data['brand_id'], data['brandname_e'], data['brandname_c']) entry[cm.name_e] = store['post_title'] entry[cm.url] = store['post_permalink'].replace(u'\\', '') entry[cm.country_e] = store['country'].strip().upper() entry[cm.city_e] = store['city'].strip().upper() if '_yoox_store_latlong' in store: m = re.findall(ur'-?\d+\.\d+', store['_yoox_store_latlong']) if len(m) == 2: entry[cm.lat] = string.atof(m[0]) entry[cm.lng] = string.atof(m[1]) if 'store_phone' in store: entry[cm.tel] = store['store_phone'].replace('P:', '').replace('T:', '') \ .replace('P', '').replace('T', '').strip() if 'store_email' in store: entry[cm.email] = store['store_email'] if 'store_fax' in store: entry[cm.fax] = store['store_fax'].replace('F:', '').replace('F', '').strip() if 'store_hours' in store: entry[cm.hours] = cm.reformat_addr(store['store_hours']) if 'store_address' in store: entry[cm.addr_e] = cm.reformat_addr(store['store_address']) if 'women' in store and 'men' in store: entry[cm.store_type] = 'Women: %s, men: %s' % (', '.join(store['women']), ', '.join(store['men'])) gs.field_sense(entry) print '(%s / %d) Found store: %s, %s (%s, %s)' % (data['brandname_e'], data['brand_id'], entry[cm.name_e], entry[cm.addr_e], entry[cm.country_e], entry[cm.continent_e]) db.insert_record(entry, 'stores') store_list.append(entry) return store_list
def fetch_store_details(data): url = data['url'] try: body = cm.get_data(url) except Exception: print 'Error occured: %s' % url dump_data = {'level': 0, 'time': cm.format_time(), 'data': {'url': url}, 'brand_id': data['brand_id']} cm.dump(dump_data) return [] entry = cm.init_store_entry(data['brand_id'], data['brandname_e'], data['brandname_c']) ret = gs.look_up(data['country'], 1) if ret is not None: entry[cm.country_e] = ret['name_e'] m = re.search(ur'<span class="type">Address</span>\s*<p>(.+?)</p>', body, re.S) if m is not None: addr = cm.reformat_addr(m.group(1)) country, province, city = gs.addr_sense(addr) if country is not None and entry[cm.country_e] == '': entry[cm.country_e] = country if province is not None: entry[cm.province_e] = province if city is not None: entry[cm.city_e] = city entry[cm.addr_e] = addr m = re.search(ur'<span class="type">Phone</span>\s*<p>(.+?)</p>', body, re.S) if m is not None: entry[cm.tel] = m.group(1) m = re.search(ur'<span class="type">Opening hours</span>\s*<p>(.+?)</p>', body, re.S) if m is not None: entry[cm.hours] = cm.reformat_addr(m.group(1)) m = re.search(ur'<span class="type">You can find</span>\s*<p>(.+?)</p>', body, re.S) if m is not None: entry[cm.store_type] = cm.reformat_addr(m.group(1)) m = re.search(ur'google.maps.LatLng\(\s*(-?\d+\.\d+)\s*,\s*(-?\d+\.\d+)', body, re.S) entry[cm.lat]=string.atof(m.group(1)) entry[cm.lng]=string.atof(m.group(2)) gs.field_sense(entry) print '(%s / %d) Found store: %s, %s (%s, %s)' % (data['brandname_e'], data['brand_id'], entry[cm.name_e], entry[cm.addr_e], entry[cm.country_e], entry[cm.continent_e]) db.insert_record(entry, 'stores') return [entry]
def fetch_stores(data): url = data['url'] try: body = cm.post_data(url, {'rsp': 'json', 'country': data['country_code']}) except Exception: print 'Error occured: %s' % url dump_data = {'level': 0, 'time': cm.format_time(), 'data': {'url': url}, 'brand_id': data['brand_id']} cm.dump(dump_data) return [] raw = json.loads(body) store_list = [] for s in raw['stores']: entry = cm.init_store_entry(data['brand_id'], data['brandname_e'], data['brandname_c']) entry[cm.name_e] = cm.html2plain(s['name']).strip() addr_list = [] for key in ['address1', 'address2']: if s[key].strip() != '': addr_list.append(cm.reformat_addr(s[key])) entry[cm.addr_e] = ' '.join(addr_list) # r=s['region'].strip().upper() # m = re.search(ur'\b([A-Z]{2})\b', r) # if data[cm.country_e]=='UNITED STATES' and m is not None: # # 美国 # ret = gs.look_up(m.group(1), 2) # if ret is not None: # r = ret['name_e'] # entry[cm.province_e] = r entry[cm.city_e] = cm.extract_city(s['city'])[0] entry[cm.zip_code] = s['zip'].strip() entry[cm.country_e] = data[cm.country_e] entry[cm.lat] = string.atof(s['lat']) entry[cm.lng] = string.atof(s['lng']) entry[cm.tel] = s['phone'].strip() entry[cm.fax] = s['fax'].strip() entry[cm.email] = s['emailaddress'].strip() entry[cm.url] = s['website'].strip() days = ['Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat', 'Sun'] opening = [] if 'openingHours' in s and s['openingHours'] is not None: for m in re.finditer(ur'i:(\d);s:\d+:\\?"([^\\"]+?)\\?"', s['openingHours']): opening.append('%s: %s' % (days[string.atoi(m.group(1))], m.group(2).strip())) entry[cm.hours] = ', '.join(opening) gs.field_sense(entry) ret = gs.addr_sense(entry[cm.addr_e], entry[cm.country_e]) if ret[1] is not None and entry[cm.province_e] == '': entry[cm.province_e] = ret[1] if ret[2] is not None and entry[cm.city_e] == '': entry[cm.city_e] = ret[2] gs.field_sense(entry) print '(%s / %d) Found store: %s, %s (%s, %s, %s)' % (data['brandname_e'], data['brand_id'], entry[cm.name_e], entry[cm.addr_e], entry[cm.city_e], 
entry[cm.country_e], entry[cm.continent_e]) store_list.append(entry) db.insert_record(entry, 'stores')
def fetch_stores(data): url = data['url'] try: html = cm.get_data(url) except Exception: print 'Error occured: %s' % url dump_data = {'level': 0, 'time': cm.format_time(), 'data': {'url': url}, 'brand_id': data['brand_id']} cm.dump(dump_data) return [] store_list = [] for m in re.findall(ur'var markerContent\s*?=\s*?"(.+?)".+?' ur'createMarker\(.+?new google.maps.LatLng\((-?\d+\.\d+),(-?\d+\.\d+)\)', html, re.S): entry = cm.init_store_entry(data['brand_id'], data['brandname_e'], data['brandname_c']) lat, lng = map(string.atof, [m[1], m[2]]) cm.update_entry(entry, {cm.lat: lat, cm.lng: lng}) sub = m[0].strip() m1 = re.search(ur'<b>(.+?)</b>', sub) if m1 is None: continue entry[cm.name_c] = m1.group(1) sub = sub.replace(m1.group(0), '') m1=re.search(ur'聯系電話(?::|:)(.+?)<', sub) if m1 is not None: entry[cm.tel]=m1.group(1) sub=sub.replace(m1.group(0), '<') sub = re.sub(ur'<img\b.*?/>', '', sub) entry[cm.addr_c] = cm.reformat_addr(sub) print '(%s/%d) Found store: %s, %s (%s, %s)' % (data['brandname_e'], data['brand_id'], entry[cm.name_c], entry[cm.addr_e], entry[cm.country_e], entry[cm.continent_e]) store_list.append(entry) db.insert_record(entry, 'stores')
def fetch_store_details(data): url = data['url'] try: body = cm.post_data(url, {'cCode': data['country_code'], 'city': data['city'], 'postsearch': 1}) except Exception: print 'Error occured: %s' % url dump_data = {'level': 0, 'time': cm.format_time(), 'data': {'url': url}, 'brand_id': data['brand_id']} cm.dump(dump_data) return [] start = body.find('<div class="store_locator') if start == -1: print 'Failed processing %s' % url return [] sub, start, end = cm.extract_closure(body[start:], ur'<div\b', ur'</div>') entry = cm.init_store_entry(data['brand_id'], data['brandname_e'], data['brandname_c']) m = re.search(ur'<p><span class="bold">Address</span>(.+?)</p>', sub, re.S) if m is not None: addr_list = cm.reformat_addr(m.group(1)).split(', ') ret = cm.extract_tel(addr_list[-1]) if ret != '': entry[cm.tel] = ret del addr_list[-1] entry[cm.addr_e] = ', '.join(addr_list) addr_text=sub[m.end():] m = re.search(ur'<div class="title locator">', addr_text) if m is not None: tmp = cm.extract_closure(addr_text[m.start():], ur'<div\b', ur'</div>')[0] m1 = re.search(ur'<h2>(.+?)</h2>', tmp, re.S) if m1 is not None: entry[cm.addr_e] = cm.reformat_addr(m1.group(1)) m = re.search(ur'google.maps.LatLng\(\s*(-?\d+\.\d+),\s*(-?\d+\.\d+)', body, re.S) if m is not None: entry[cm.lat] = string.atof(m.group(1)) entry[cm.lng] = string.atof(m.group(2)) entry[cm.country_e] = data['country_code'] entry[cm.city_e] = data['city'] gs.field_sense(entry) print '(%s / %d) Found store: %s, %s (%s, %s)' % (data['brandname_e'], data['brand_id'], entry[cm.name_e], entry[cm.addr_e], entry[cm.country_e], entry[cm.continent_e]) db.insert_record(entry, 'stores') return [entry]
def fetch_stores(data):
    """
    Download the JSON store list of one country and insert every element of ``rawPos``.

    The request URL is ``data["host"]`` followed by ``data["country_url"]`` formatted with
    ``data["country_id"]``.

    :param data: dict providing ``host``, ``country_url``, ``country_id``, ``brand_id``,
                 ``brandname_e`` and ``brandname_c``.
    :return: list of the inserted store entries; an empty list when the download fails.
    """
    url = data["host"] + data["country_url"] % data["country_id"]
    try:
        body = cm.get_data(url)
    except Exception:
        cm.dump("Error in fetching countries: %s" % url, log_name)
        return []

    found = []
    for s in json.loads(body)["rawPos"]:
        entry = cm.init_store_entry(data["brand_id"], data["brandname_e"], data["brandname_c"])

        # address1 .. address4, keeping only the non-empty ones
        address_lines = [cm.html2plain(s["address%d" % idx]).strip() for idx in xrange(1, 5)]
        entry[cm.addr_e] = ", ".join([line for line in address_lines if line != ""])

        entry[cm.city_e] = cm.extract_city(s["city"]["name"])[0]
        entry[cm.country_e] = s["country"]["countryCode"]
        entry[cm.email] = s["email"]
        entry[cm.fax] = s["fax"]

        # Empty coordinate strings are left unset.
        if s["latitude"] != "":
            entry[cm.lat] = string.atof(s["latitude"])
        if s["longitude"] != "":
            entry[cm.lng] = string.atof(s["longitude"])

        entry[cm.hours] = cm.reformat_addr(s["openingSchedule"])

        phones = [s[key].strip() for key in ["phone1", "phone2"] if s[key].strip() != ""]
        entry[cm.tel] = ", ".join(phones)

        entry[cm.zip_code] = s["postalCode"]
        entry[cm.name_e] = s["shopName"]

        gs.field_sense(entry)
        sensed = gs.addr_sense(entry[cm.addr_e], entry[cm.country_e])
        if sensed[1] is not None and entry[cm.province_e] == "":
            entry[cm.province_e] = sensed[1]
        if sensed[2] is not None and entry[cm.city_e] == "":
            entry[cm.city_e] = sensed[2]
        gs.field_sense(entry)

        message = "(%s / %d) Found store: %s, %s (%s, %s)" % (
            data["brandname_e"],
            data["brand_id"],
            entry[cm.name_e],
            entry[cm.addr_e],
            entry[cm.country_e],
            entry[cm.continent_e],
        )
        cm.dump(message, log_name)
        db.insert_record(entry, "stores")
        found.append(entry)

    return found
def fetch_stores(data): # country=Greece&city=ATHENS&adutl=+01&kids=+02&undercolor=+06&togetmap=mapdata url = data['data_url'] param = {'country': data['country'], 'city': data['city'], 'adutl': ' 01', 'kids': ' 02', 'undercolor': ' 06', 'togetmap': 'mapdata'} try: body = cm.get_data(url, param) except Exception: cm.dump('Error in fetching stores: %s, %s' % (url, param), 'benetton_log.txt', False) dump_data = {'level': 0, 'time': cm.format_time(), 'data': {'url': url}, 'brand_id': data['brand_id']} cm.dump(dump_data) return [] store_list = [] for m in re.findall(ur'<marker (.+?)>', body): entry = cm.init_store_entry(data['brand_id'], data['brandname_e'], data['brandname_c']) m1 = re.search(ur'name=\\"(.+?)\\"', m) if m1 is not None: entry[cm.name_e] = cm.html2plain(m1.group(1).strip().replace(u'\\', '')) m1 = re.search(ur'address=\\"(.+?)\\"', m) if m1 is not None: addr = cm.reformat_addr(cm.html2plain(m1.group(1)).replace(u'\\', '')) tel = cm.extract_tel(addr) if tel != '': entry[cm.tel] = tel addr = addr.replace(tel, '') entry[cm.addr_e] = cm.reformat_addr(addr) m1 = re.search(ur'lat=\\"(.+?)\\"', m) if m1 is not None: entry[cm.lat] = string.atof(m1.group(1)) m1 = re.search(ur'lng=\\"(.+?)\\"', m) if m1 is not None: entry[cm.lng] = string.atof(m1.group(1)) entry[cm.country_e] = data['country'].strip().upper() entry[cm.city_e] = cm.extract_city(data['city'])[0] gs.field_sense(entry) cm.dump('(%s / %d) Found store: %s, %s (%s, %s)' % (data['brandname_e'], data['brand_id'], entry[cm.name_e], entry[cm.addr_e], entry[cm.country_e], entry[cm.continent_e]), 'benetton_log.txt', False) db.insert_record(entry, 'stores') store_list.append(entry)
def f(m): store_name = m[0].strip() addr_str = m[1].strip() spl = addr_str.split('<br/>') store_type = cm.html2plain(spl[0].strip()) store_addr = spl[1].strip() hour_idx = 2 store_tel = '' for i in xrange(2, len(spl)): # If this is not a phone number: tel = cm.extract_tel(spl[i]) if tel == '': store_addr += ', ' + spl[i] hour_idx = i + 1 else: store_tel = spl[i].strip() hour_idx = i + 1 break if hour_idx < len(spl): store_hour = cm.html2plain(', '.join(spl[hour_idx:])).strip() else: store_hour = '' # store_addr = cm.reformat_addr('\r\n'.join([val.strip() for val in spl[1:-3]])) store_addr = cm.reformat_addr(store_addr) store_entry = cm.init_store_entry(brand_id, brandname_e, brandname_c) cm.update_entry(store_entry, {cm.continent_e: opt[cm.continent_e].strip().upper(), cm.city_e: opt[cm.city_e].strip().upper(), cm.country_e: opt[cm.country_e].strip().upper(), cm.name_e: cm.name_e, cm.addr_e: store_addr, cm.store_type: store_type, cm.hours: store_hour, cm.tel: store_tel}) if opt.has_key(cm.province_e): store_entry[cm.province_e] = opt[cm.province_e] else: store_entry[cm.province_e] = '' store_entry[cm.city_e] = cm.extract_city(store_entry[cm.city_e])[0] gs.field_sense(store_entry) ret = gs.addr_sense(store_entry[cm.addr_e], store_entry[cm.country_e]) if ret[1] is not None and store_entry[cm.province_e] == '': store_entry[cm.province_e] = ret[1] if ret[2] is not None and store_entry[cm.city_e] == '': store_entry[cm.city_e] = ret[2] gs.field_sense(store_entry) print '%s Found store: %s, %s (%s, %s)' % ( brandname_e, store_entry[cm.name_e], store_entry[cm.addr_e], store_entry[cm.country_e], store_entry[cm.continent_e]) db.insert_record(store_entry, 'stores') return store_entry
def fetch(level=1, data=None, user='******', passwd=''): db = cm.StoresDb() db.connect_db(user=user, passwd=passwd) db.execute(u'DELETE FROM %s WHERE brand_id=%d' % ('stores', brand_id)) try: html = cm.get_data(url) except Exception: print 'Error occured: %s' % url dump_data = {'level': 1, 'time': cm.format_time(), 'data': {'url': url}, 'brand_id': brand_id} cm.dump(dump_data) return [] js = json.loads(html) store_list = [] for s in js['data']['list']: entry = cm.init_store_entry(brand_id, brandname_e, brandname_c) cm.update_entry(entry, {cm.lat: string.atof(s['geo']['lat']), cm.lng: string.atof(s['geo']['lng'])}) entry[cm.name_e] = s['contact']['title'] entry[cm.addr_e] = cm.reformat_addr(s['contact']['address']) entry[cm.tel] = s['contact']['phone'] entry[cm.fax] = s['contact']['fax'] entry[cm.hours] = cm.reformat_addr(s['contact']['hours']) entry[cm.store_type]=s['contact']['selling'] entry[cm.url]=host+s['link'] gs.update_city_map(s['city'], s['country'], s['continent']) cm.update_entry(entry,{cm.continent_e:s['continent'], cm.country_e:s['country'], cm.city_e:s['city']}) gs.field_sense(entry) print '(%s / %d) Found store: %s, %s (%s, %s)' % ( brandname_e, brand_id, entry[cm.name_e], entry[cm.addr_e], entry[cm.country_e], entry[cm.continent_e]) db.insert_record(entry, 'stores') store_list.append(entry) db.disconnect_db() gs.commit_maps(1) gs.commit_maps(3) return store_list
def fetch_store_details(data): url = data['host'] + data['url'] try: body = cm.get_data(url) except Exception: cm.dump('Error in fetching store details: %s' % url, log_name) return [] entry = cm.init_store_entry(data['brand_id'], data['brandname_e'], data['brandname_c']) start = body.find(ur'<h3>available in store</h3>') if start != -1: type_sub = cm.extract_closure(body[start:], ur'<ul\b', ur'</ul>')[0] entry[cm.store_type] = ', '.join( cm.html2plain(tmp).strip() for tmp in re.findall(ur'<li[^<>]*>(.+?)</li>', type_sub, re.S)) start = body.find(ur"<div class='gmap_info_box'") if start == -1: cm.dump('Error in fetching store details: %s' % url, log_name) return [] body = cm.extract_closure(body[start:], ur'<div\b', ur'</div>')[0] raw = json.loads(cm.extract_closure(body, ur'\{', ur'\}')[0])['table'] entry[cm.name_e] = cm.html2plain(raw['name']) entry[cm.city_e] = data['city'].strip().upper() entry[cm.country_e] = data['country'].strip().upper() # entry[cm.store_type] = data['store_type'] entry[cm.addr_e] = cm.reformat_addr(raw['address']) m = re.search(re.compile(ur'phone:(.*?)fax:(.*?)', re.I | re.S), raw['phone']) if m is not None: entry[cm.tel] = m.group(1).strip() entry[cm.fax] = m.group(2).strip() else: m = re.search(re.compile(ur'phone:(.*?)', re.I | re.S), raw['phone']) if m is not None: entry[cm.tel] = m.group(1).strip() m = re.search(re.compile(ur'fax:(.*?)', re.I | re.S), raw['phone']) if m is not None: entry[cm.fax] = m.group(1).strip() entry[cm.hours] = raw['hours'] if raw['lat'] is not None and raw['lat'] != '': entry[cm.lat] = string.atof(raw['lat']) if raw['lng'] is not None and raw['lng'] != '': entry[cm.lat] = string.atof(raw['lng']) gs.field_sense(entry) ret = gs.addr_sense(entry[cm.addr_e], entry[cm.country_e]) if ret[1] is not None: entry[cm.province_e] = ret[1] gs.field_sense(entry) cm.dump('(%s / %d) Found store: %s, %s (%s, %s)' % (data['brandname_e'], data['brand_id'], entry[cm.name_e], entry[cm.addr_e], entry[cm.country_e], 
entry[cm.continent_e]), log_name) db.insert_record(entry, 'stores') return [entry]
def fetch_stores(data):
    """Fetch all result pages of stores for one country and insert them.

    Pages are requested one by one (``p`` = 1, 2, ...) until the
    ``TotalPages`` value reported by the service is exceeded.  The first
    response seen for a country also fills the module-level ``region_map``
    with that country's ``RegionId -> Name`` mapping.

    :param data: dict with ``data_url``, ``country_code``, ``brand_id``,
        ``brandname_e`` and ``brandname_c``.
    :return: list of inserted entries; ``()`` if any page request fails.
    """
    url = data['data_url']
    param = {'output': 'json', 'country': data['country_code'], 'brand': 'dkny'}
    page = 0
    tot_page = -1  # unknown until the first response has been parsed
    store_list = []
    while True:
        page += 1
        if tot_page != -1 and page > tot_page:
            break
        param['p'] = page
        try:
            body = cm.get_data(url, param)
        except Exception:
            cm.dump('Error in fetching stores: %s, %s' % (url, param), log_name)
            return ()

        raw = json.loads(body)
        tot_page = raw['Stores']['TotalPages']
        if data['country_code'] not in region_map:
            # Build the state/region lookup for this country.
            region_map[data['country_code']] = dict((item['RegionId'], item['Name'])
                                                    for item in raw['Regions'])

        for s in raw['Stores']['Items']:
            entry = cm.init_store_entry(data['brand_id'], data['brandname_e'], data['brandname_c'])
            entry[cm.country_e] = data['country_code'].upper()
            entry[cm.city_e] = cm.extract_city(s['City'])[0]
            entry[cm.name_e] = cm.html2plain(s['Name']).strip()
            entry[cm.addr_e] = cm.reformat_addr(s['Address'])
            entry[cm.tel] = s['Phone'].strip() if s['Phone'] else ''
            entry[cm.fax] = s['Fax'].strip() if s['Fax'] else ''
            entry[cm.email] = s['Email'].strip() if s['Email'] else ''
            entry[cm.lat] = s['Latitude'] if s['Latitude'] else ''
            entry[cm.lng] = s['Longitude'] if s['Longitude'] else ''

            region_id = s['RegionId']
            if region_id in region_map[data['country_code']]:
                entry[cm.province_e] = cm.html2plain(
                    region_map[data['country_code']][region_id]).strip().upper()

            gs.field_sense(entry)
            ret = gs.addr_sense(entry[cm.addr_e], entry[cm.country_e])
            # Only fill province / city from the address when still blank.
            if ret[1] is not None and entry[cm.province_e] == '':
                entry[cm.province_e] = ret[1]
            if ret[2] is not None and entry[cm.city_e] == '':
                entry[cm.city_e] = ret[2]
            gs.field_sense(entry)

            cm.dump('(%s / %d) Found store: %s, %s (%s, %s)' % (data['brandname_e'], data['brand_id'],
                                                                entry[cm.name_e], entry[cm.addr_e],
                                                                entry[cm.country_e],
                                                                entry[cm.continent_e]), log_name)
            db.insert_record(entry, 'stores')
            store_list.append(entry)

    # The collected entries were previously dropped (implicit ``None``).
    return store_list
def fetch_stores(data): url = data['post_shops'] param = {'city': data['city_e'], 'paulandjoe_women': 0, 'paulandjoe_man': 0, 'paulandjoe_sister': 0, 'paulandjoe_little': 0, 'paulandjoe_beauty': 0} try: html = cm.post_data(url, param) except Exception: print 'Error occured: %s' % url dump_data = {'level': 0, 'time': cm.format_time(), 'data': {'url': url}, 'brand_id': data['brand_id']} cm.dump(dump_data) return [] store_list = [] try: for store in (pq(tmp) for tmp in pq(html)('ul')): try: entry = cm.init_store_entry(data['brand_id'], data['brandname_e'], data['brandname_c']) entry[cm.name_e] = cm.html2plain(store('li.first')[0].text).strip() entry[cm.country_e] = data[cm.country_e] entry[cm.city_e] = data[cm.city_e] addr_list = [] for term in (cm.reformat_addr(unicode(pq(tmp))) for tmp in store('li[class!="first"]')): if term != '': addr_list.append(term) tel = cm.extract_tel(addr_list[-1]) if tel != '': entry[cm.tel] = tel del addr_list[-1] entry[cm.addr_e] = ', '.join(addr_list) gs.field_sense(entry) ret = gs.addr_sense(entry[cm.addr_e]) if ret[0] is not None and entry[cm.country_e] == '': entry[cm.country_e] = ret[0] if ret[1] is not None and entry[cm.province_e] == '': entry[cm.province_e] = ret[1] if ret[2] is not None and entry[cm.city_e] == '': entry[cm.city_e] = ret[2] gs.field_sense(entry) print '(%s/%d) Found store: %s, %s (%s, %s)' % (data['brandname_e'], data['brand_id'], entry[cm.name_e], entry[cm.addr_e], entry[cm.country_e], entry[cm.continent_e]) store_list.append(entry) db.insert_record(entry, 'stores') except (IndexError, TypeError) as e: cm.dump(u'Error in parsing %s, %s' % (url, param), log_name) print traceback.format_exc() continue except Exception, e: print traceback.format_exc()
def parse_store(data, body=None):
    # Parse a store page.  When ``body`` is not supplied it is downloaded by
    # POSTing to data['url']; a failed download is logged and yields [].
    # NOTE(review): the visible body stops after ``latlng_map`` is built and
    # never uses it -- the rest of the function appears to be missing.
    if body is None:
        url = data['url']
        try:
            body = cm.post_data(url)
        except Exception:
            cm.dump('Error in fetching stores: %s' % url, log_name)
            return []
    start = body.find(ur'jQuery.extend(Drupal.settings,')
    # Maps a reformatted item[3] value to its coordinates
    # (presumably item[3] is the address text -- confirm against the payload).
    latlng_map = {}
    if start != -1:
        # The first {...} closure after the marker is decoded as JSON; each
        # ``latlons`` item supplies lat at index 0 and lng at index 1.
        for item in json.loads(cm.extract_closure(body[start:], ur'\{', ur'\}')[0])['getlocations']['key_1']['latlons']:
            latlng_map[cm.reformat_addr(item[3])] = {'lat': string.atof(item[0]), 'lng': string.atof(item[1])}
def fetch_stores(data): url = data['store_url'] try: body = cm.get_data(url, {'country': data['country'], 'city': data['city']}) except Exception: print 'Error occured: %s' % url dump_data = {'level': 0, 'time': cm.format_time(), 'data': {'url': url}, 'brand_id': data['brand_id']} cm.dump(dump_data) return [] raw = json.loads(body) store_list = [] for item in raw['items']: entry = cm.init_store_entry(data['brand_id'], data['brandname_e'], data['brandname_c']) entry[cm.country_e] = data['country'].strip().upper() tmp = cm.extract_city(data['city'])[0] if entry[cm.country_e] == 'USA': entry[cm.province_e] = tmp else: entry[cm.city_e] = tmp gs.field_sense(entry) addr = cm.reformat_addr(item['address'].replace(u'\\', '')) addr_list = [tmp.strip() for tmp in addr.split(',')] tel = cm.extract_tel(addr_list[-1]) if tel !='': entry[cm.tel]=tel del addr_list[-1] entry[cm.addr_e]=', '.join(addr_list) entry[cm.store_type] = item['shop_type'] gs.field_sense(entry) ret = gs.addr_sense(entry[cm.addr_e]) if ret[0] is not None and entry[cm.country_e] == '': entry[cm.country_e] = ret[0] if ret[1] is not None and entry[cm.province_e] == '': entry[cm.province_e] = ret[1] if ret[2] is not None and entry[cm.city_e] == '': entry[cm.city_e] = ret[2] gs.field_sense(entry) print '(%s / %d) Found store: %s, %s (%s, %s)' % (data['brandname_e'], data['brand_id'], entry[cm.name_e], entry[cm.addr_e], entry[cm.country_e], entry[cm.continent_e]) db.insert_record(entry, 'stores') store_list.append(entry) return store_list
def fetch_stores(data):
    # Fetch the store list for one country code / city and parse each
    # ``<div class="store-info">`` block into a store entry.
    # Returns [] when the request fails.
    # NOTE(review): the visible body stops after the zip-code extraction;
    # ``store_list`` and the three ``pat_*`` patterns are never used in what
    # is shown, so the rest of the function appears to be missing.
    url = data['host'] + data['store_url']
    param = {'CC': data['country_code'], 'City': data['city']}
    try:
        body = cm.get_data(url, param)
    except Exception:
        cm.dump('Error in fetching stores: %s, %s' % (url, param), log_name)
        return []
    store_list = []
    # pat_tel = re.compile(ur'tel:\s*', re.I)
    # pat_fax = re.compile(ur'fax:\s*', re.I)
    # pat_email = re.compile(ur'email:\s*', re.I)
    # Each pattern captures the text after its label up to the next
    # tel/fax/email label or the end of the string.
    pat_tel = re.compile(ur'tel:\s*(.+?)(?=(?:tel|fax|email|$))', re.I | re.S)
    pat_fax = re.compile(ur'fax:\s*(.+?)(?=(?:tel|fax|email|$))', re.I | re.S)
    pat_email = re.compile(ur'email:\s*(.+?)(?=(?:tel|fax|email|$))', re.I | re.S)
    for m in re.finditer(ur'<div class="store-info">', body):
        entry = cm.init_store_entry(data['brand_id'], data['brandname_e'], data['brandname_c'])
        entry[cm.country_e] = data['country_code']
        entry[cm.city_e] = cm.extract_city(data['city'])[0]
        sub = cm.extract_closure(body[m.start():], ur'<div\b', ur'</div>')[0]
        m1 = re.search(ur'<h2 class="store-name[^"]*">(.+?)</h2>', sub)
        if m1 is not None:
            entry[cm.name_e] = cm.reformat_addr(m1.group(1))
            # The store class is set to the same text as the name.
            entry[cm.store_class] = entry[cm.name_e]
        m1 = re.search(ur'<dt class="address"', sub)
        if m1 is not None:
            tmp = cm.reformat_addr(cm.extract_closure(sub[m1.end():], ur'<dd>', ur'</dd>')[0])
            entry[cm.addr_e] = tmp
            # NOTE(review): len(tmp) is the length of the address string, not
            # the number of comma-separated parts, so [-2] below can raise
            # IndexError for an address without a comma -- confirm intent.
            if len(tmp) > 1:
                m1 = re.search(ur'[\d\-]{4,}', tmp.split(',')[-2])
                # Accept the candidate only if it contains at least four digits.
                if m1 is not None and len(re.findall(ur'\d', m1.group())) >= 4:
                    entry[cm.zip_code] = m1.group().strip()
def fetch_stores(data): url = data['store_url'] param = {'myid': data['key'], 'idioma': 'in'} try: body = cm.get_data(url, param) except Exception: cm.dump('Error in fetching countries: %s, %s' % (url, param), log_name) return [] store_list = [] for s in json.loads(body): entry = cm.init_store_entry(data['brand_id'], data['brandname_e'], data['brandname_c']) entry[cm.country_e] = data['country'] entry[cm.city_e] = cm.extract_city(data['city'])[0] entry[cm.name_e] = cm.reformat_addr(s['title']) m = re.search(ur'(.+?)-\s*<', s['key']) addr_list = [entry[cm.name_e]] if m is not None: m1 = re.search(ur'-+', m.group(1)) if m1 is not None: tmp = [m.group(1)[:m1.start()], m.group(1)[m1.end():]] else: tmp = [m.group(1)] if len(tmp) > 1: entry[cm.tel] = cm.extract_tel(tmp[1]) m1 = re.search(ur'\d{4,}', tmp[0]) if m1 is not None: entry[cm.zip_code] = m1.group() addr_list.append(tmp[0].strip()) entry[cm.addr_e] = ', '.join(addr_list) gs.field_sense(entry) ret = gs.addr_sense(entry[cm.addr_e]) if ret[0] is not None and entry[cm.country_e] == '': entry[cm.country_e] = ret[0] if ret[1] is not None and entry[cm.province_e] == '': entry[cm.province_e] = ret[1] if ret[2] is not None and entry[cm.city_e] == '': entry[cm.city_e] = ret[2] gs.field_sense(entry) cm.dump('(%s / %d) Found store: %s, %s (%s, %s)' % (data['brandname_e'], data['brand_id'], entry[cm.name_e], entry[cm.addr_e], entry[cm.country_e], entry[cm.continent_e]), log_name) db.insert_record(entry, 'stores') store_list.append(entry) return store_list
def fetch_store_details(data):
    # Fetch the page "<data['url']>/<data['store_id']>" and parse name,
    # address, contact details and opening hours into a store entry.
    # Returns [] when the request fails.
    # NOTE(review): the visible body stops after the opening hours are set;
    # the entry is neither stored nor returned in what is shown, so the rest
    # of the function appears to be missing.
    url = '%s/%d' % (data['url'], data['store_id'])
    try:
        body = cm.get_data(url)
    except Exception:
        cm.dump('Error in fetching countries: %s' % url, log_name)
        return []
    entry = cm.init_store_entry(data['brand_id'], data['brandname_e'], data['brandname_c'])
    m = re.search(ur'<h1 class="with-back-option">\s*([^<>]+)\s*[<>]', body)
    if m is not None:
        entry[cm.name_e] = m.group(1).strip()
    start = body.find(ur'<div class="store-details">')
    if start != -1:
        sub = cm.extract_closure(body[start:], ur'<div\b', ur'</div>')[0]
        # The first <p> inside the details div holds the address spans.
        addr = cm.extract_closure(sub, ur'<p\b', ur'</p>')[0]
        m = re.search(ur'<span class="locality">([^<>]+?)</span>', addr)
        if m is not None:
            # Only the text before the first comma is used as the city.
            entry[cm.city_e] = m.group(1).split(',')[0].strip().upper()
        m = re.search(ur'<span class="postal-code">([^<>]+?)</span>', addr)
        if m is not None:
            entry[cm.zip_code] = m.group(1).strip()
        m = re.search(ur'<span class="country-name">([^<>]+?)</span>', addr)
        if m is not None:
            entry[cm.country_e] = m.group(1).strip().upper()
        entry[cm.addr_e] = cm.reformat_addr(addr)
    start = body.find(ur'<div class="contact">')
    if start != -1:
        sub = cm.extract_closure(body[start:], ur'<div\b', ur'</div>')[0]
        m = re.search(ur'<span class="tel">(.+?)</span>', sub)
        if m is not None:
            entry[cm.tel] = m.group(1).strip()
        m = re.search(ur'<span class="fax">(.+?)</span>', sub)
        if m is not None:
            entry[cm.fax] = m.group(1).strip()
        m = re.search(ur'<a href="mailto:([^"]+)">Email</a>', sub)
        if m is not None:
            entry[cm.email] = m.group(1).strip()
    start = body.find(ur'<h3>Opening hours</h3>')
    if start != -1:
        tmp = []
        sub = cm.extract_closure(body[start:], ur'<table>', ur'</table>')[0]
        # Every header/data cell of the table, in document order, joined by spaces.
        for m in re.findall(ur'<t[hd][^<>]*>([^<>]+)</t[hd]>', sub):
            tmp.append(m)
        entry[cm.hours] = ' '.join(tmp)
def fetch_stores(data): """ 获得商店信息 :param data: """ url = data["post_url"] try: html = cm.post_data(url, {"pid": data["city_id"], "lang": "en", "action": "popola_box_DX"}) except Exception: print "Error occured in getting city list: %s" % url dump_data = {"level": 2, "time": cm.format_time(), "data": {"url": url}, "brand_id": data["brand_id"]} cm.dump(dump_data) return [] store_list = [] for m in re.finditer(ur'<a href="(.+?)".*?>', html): entry = cm.init_store_entry(data["brand_id"], data["brandname_e"], data["brandname_c"]) entry[cm.url] = m.group(1) store_html, start, end = cm.extract_closure(html[m.start() :], ur"<a href", ur"</a>") if end == 0: continue m1 = re.findall(ur'<h3 class="titleShop">(.+?)</h3>', store_html, re.S) if len(m1) > 0: entry[cm.name_e] = m1[0].strip() m1 = re.findall(ur"<p\b.*?>(.+?)(?:</p>|</div>)", store_html, re.S) if len(m1) > 0: terms = cm.reformat_addr(m1[0]).split(",") tel = cm.extract_tel(terms[-1]) if tel != "": terms = terms[:-1] entry[cm.tel] = tel entry[cm.addr_e] = ", ".join([v.strip() for v in terms]) entry["country_e"] = data["country_e"] entry["city_e"] = data["city_e"] gs.field_sense(entry) print "(%s / %d) Found store: %s, %s (%s, %s)" % ( data["brandname_e"], data["brand_id"], entry[cm.name_e], entry[cm.addr_e], entry[cm.country_e], entry[cm.continent_e], ) store_list.append(entry) db.insert_record(entry, "stores")
def fetch_stores(data):
    """
    Parse the store list contained in ``data['html']``.

    NOTE(review): the visible body stops inside the address-term loop;
    ``store_list`` and ``new_terms`` are never filled in what is shown, so
    the rest of the function appears to be missing.
    :param data:
    """
    html = data['html']
    store_list = []
    while True:
        entry = cm.init_store_entry(data['brand_id'], data['brandname_e'], data['brandname_c'])
        m = re.search(ur'<li class="leaf end"><div><u>(.+?)</u>', html)
        if m is None:
            # No further store items: stop scanning.
            break
        html = html[m.start():]
        entry[cm.name_e] = m.group(1)
        sub, start, end = cm.extract_closure(html, ur'<li\b', '</li>')
        # Continue the next search after this <li> element.
        html = html[end:]
        # The page fragment of a single store
        sub = sub[len(m.group(0)):-len('</li>')]
        m = re.search(ur'<a href="(http.+?)"', sub)
        if m is not None:
            entry[cm.url] = m.group(1)
        m = re.search(ur'<a href="mailto:(.+?)"', sub)
        if m is not None:
            entry[cm.email] = m.group(1)
        # The address is the text before the first <a ...> or </div>.
        m = re.search(ur'(?:<a\b|</div>)', sub)
        if m is not None:
            addr = sub[:m.start()]
        else:
            addr = sub
        # Parse the address field
        addr = cm.reformat_addr(addr)
        terms = addr.split(',')
        new_terms = []
        for t in terms:
            if re.search(ur'phone', t, re.IGNORECASE) is not None:
                entry[cm.tel] = cm.extract_tel(t)
            elif re.search(ur'fax', t, re.IGNORECASE) is not None:
                entry[cm.fax] = cm.extract_tel(t)
            elif data['city_e'] in t.strip().upper():
                # Zip code
                m = re.search(ur'\d+', t)
                if m is not None:
                    entry[cm.zip_code] = m.group(0)
if entry[cm.lat] == 0 and entry[cm.lng] == 0: entry[cm.lat], entry[cm.lng] = '', '' item = pq(store) tmp = item('h1') entry[cm.name_e] = cm.html2plain( tmp[0].text).strip() if len(tmp) > 0 and tmp[0].text else '' tmp = item('dd.location') tmp = tmp[0].text if len(tmp) > 0 and tmp[0].text else '' entry[cm.city_e] = cm.extract_city(tmp)[0] tmp = item('dd.street') tmp = tmp[0].text if len(tmp) > 0 and tmp[0].text else '' entry[cm.addr_e] = cm.reformat_addr(tmp) tmp = item('dd.phone') tmp = tmp[0].text if len(tmp) > 0 and tmp[0].text else '' entry[cm.tel] = tmp.strip() tmp = item('dd.hours') tmp = tmp[0].text if len(tmp) > 0 and tmp[0].text else '' entry[cm.hours] = tmp.strip() tmp = item('dd.products') tmp = tmp[0].text if len(tmp) > 0 and tmp[0].text else '' entry[cm.store_type] = tmp.strip() gs.field_sense(entry) ret = gs.addr_sense(entry[cm.addr_e])
def fetch(level=1, data=None, host='localhost', port=3306, user='******', passwd='123456'): tot = 0 start = 0 store_list = [] data = {'q': '*:*', 'pt': '0,0', 'd': 100000, 'start': 0, 'rows': 100} # data = {'q': '*:*', 'pt': '36.778261,-119.417932', 'd': 50, 'start': 0, 'rows': 100} db = cm.StoresDb() db.connect_db(host=host, port=port, user=user, passwd=passwd, db='brand_stores') db.execute(u'DELETE FROM %s WHERE brand_id=%d' % ('stores', brand_id)) while True: cm.dump('Fetching from %d' % start, 'triumph_log.txt') try: data['start'] = start html = cm.get_data(url, data) raw_list = json.loads(html) if tot == 0: tot = raw_list['response']['numFound'] cm.dump('Found: %d' % tot, 'triumph_log.txt') raw_list = raw_list['response']['docs'] except Exception: cm.dump('Error occured while fetching from %d' % data['start'], 'triumph_log.txt') dump_data = {'level': 1, 'time': cm.format_time(), 'data': {'url': url}, 'brand_id': brand_id} cm.dump(dump_data) return [] idx = 0 if len(raw_list) < data['rows'] and start + len(raw_list) < tot: cm.dump('Cooling down...', 'triumph_log.txt') time.sleep(5) continue for v in raw_list: entry = cm.init_store_entry(brand_id, brandname_e, brandname_c) cm.update_entry(entry, {cm.store_type: v['class'], cm.zip_code: v['zip'], cm.tel: v['phone'], cm.fax: v['fax'], cm.url: v['web'], cm.email: v['email'], cm.hours: v['opening_hours']}) entry[cm.name_e] = cm.reformat_addr(v['name']) entry[cm.city_e], tmp = cm.extract_city(v['city']) if not re.search(ur'\d', entry[cm.zip_code]) and tmp != '': entry[cm.zip_code] = tmp if v['location'] != '': terms = v['location'].split(',') cm.update_entry(entry, {cm.lat: string.atof(terms[0]), cm.lng: string.atof(terms[1])}) addr = v['address'] if v['address2'] != '': addr += ', ' + v['address2'] entry[cm.addr_e] = cm.reformat_addr(addr) ret = gs.look_up(v['country'], 1) if ret is not None: entry[cm.country_e] = ret['name_e'] else: cm.dump('Error in looking up country %s' % v['country'], 'triumph_log.txt') 
gs.field_sense(entry) cm.dump('(%s / %d) Found store at %d: %s, %s (%s, %s, %s)' % ( brandname_e, brand_id, start + idx, entry[cm.name_e], entry[cm.addr_e], entry[cm.city_e], entry[cm.country_e], entry[cm.continent_e]), 'triumph_log.txt') store_list.append(entry) db.insert_record(entry, 'stores') idx += 1 if tot - start <= len(raw_list): break else: start += len(raw_list)
def fetch_stores(data): url = data['store_url'] param = { 'store_country': data['country_code'], 'store_city': data['city_code'] } try: body = cm.post_data(url, param) except Exception: cm.dump('Error in fetching stores: %s, %s' % (url, param), log_name) return [] store_list = [] for s in re.findall(ur'<marker\b([^<>]+)/\s*>', body): entry = cm.init_store_entry(data['brand_id'], data['brandname_e'], data['brandname_c']) m = re.search(ur'store_name="([^"]+)"', s) if m is not None: entry[cm.name_e] = cm.reformat_addr(m.group(1)) entry[cm.country_e] = data['country_code'] entry[cm.city_e] = data['city'] addr_list = [] for key in ['store_mall_name', 'store_address', 'store_zip_code']: m = re.search(ur'%s="([^"]+)"' % key, s) if m is not None: tmp = cm.reformat_addr(m.group(1)) if tmp != '': addr_list.append(tmp) entry[cm.addr_e] = ', '.join(addr_list) m = re.search(ur'store_zip_code="([^"]+)"', s) if m is not None: entry[cm.zip_code] = m.group(1).strip() m = re.search(ur'store_telephone="([^"]+)"', s) if m is not None: entry[cm.tel] = m.group(1).strip() m = re.search(ur'store_fax="([^"]+)"', s) if m is not None: entry[cm.fax] = m.group(1).strip() m = re.search(ur'store_email="([^"]+)"', s) if m is not None: entry[cm.email] = m.group(1).strip() m = re.search(ur'store_latitude="([^"]+)"', s) if m is not None: entry[cm.lat] = string.atof(m.group(1).strip()) m = re.search(ur'store_longitude="([^"]+)"', s) if m is not None: entry[cm.lng] = string.atof(m.group(1).strip()) gs.field_sense(entry) ret = gs.addr_sense(entry[cm.addr_e], entry[cm.country_e]) if ret[1] is not None and entry[cm.province_e] == '': entry[cm.province_e] = ret[1] gs.field_sense(entry) cm.dump( '(%s / %d) Found store: %s, %s (%s, %s)' % (data['brandname_e'], data['brand_id'], entry[cm.name_e], entry[cm.addr_e], entry[cm.country_e], entry[cm.continent_e]), log_name) db.insert_record(entry, 'stores') store_list.append(entry)
def fetch_store_details(data): url = data['url'] try: body = cm.post_data(url, { 'cCode': data['country_code'], 'city': data['city'], 'postsearch': 1 }) except Exception: print 'Error occured: %s' % url dump_data = { 'level': 0, 'time': cm.format_time(), 'data': { 'url': url }, 'brand_id': data['brand_id'] } cm.dump(dump_data) return [] start = body.find('<div class="store_locator') if start == -1: print 'Failed processing %s' % url return [] sub, start, end = cm.extract_closure(body[start:], ur'<div\b', ur'</div>') entry = cm.init_store_entry(data['brand_id'], data['brandname_e'], data['brandname_c']) m = re.search(ur'<p><span class="bold">Address</span>(.+?)</p>', sub, re.S) if m is not None: addr_list = cm.reformat_addr(m.group(1)).split(', ') ret = cm.extract_tel(addr_list[-1]) if ret != '': entry[cm.tel] = ret del addr_list[-1] entry[cm.addr_e] = ', '.join(addr_list) addr_text = sub[m.end():] m = re.search(ur'<div class="title locator">', addr_text) if m is not None: tmp = cm.extract_closure(addr_text[m.start():], ur'<div\b', ur'</div>')[0] m1 = re.search(ur'<h2>(.+?)</h2>', tmp, re.S) if m1 is not None: entry[cm.addr_e] = cm.reformat_addr(m1.group(1)) m = re.search(ur'google.maps.LatLng\(\s*(-?\d+\.\d+),\s*(-?\d+\.\d+)', body, re.S) if m is not None: entry[cm.lat] = string.atof(m.group(1)) entry[cm.lng] = string.atof(m.group(2)) entry[cm.country_e] = data['country_code'] entry[cm.city_e] = data['city'] gs.field_sense(entry) print '(%s / %d) Found store: %s, %s (%s, %s)' % ( data['brandname_e'], data['brand_id'], entry[cm.name_e], entry[cm.addr_e], entry[cm.country_e], entry[cm.continent_e]) db.insert_record(entry, 'stores') return [entry]
def fetch_stores(data):
    """Fetch the JSON store list at ``data['url']`` and insert every store.

    For stores whose country is ``US`` the ``city`` value is split on commas
    into city and province.  Opening hours are collected from the seven
    weekday keys and the store type from the ``menswear`` / ``womenswear`` /
    ``kidswear`` flags.

    :param data: dict with ``url``, ``brand_id``, ``brandname_e`` and
        ``brandname_c``.
    :return: list of inserted entries; ``[]`` if the request fails.
    """
    url = data['url']
    try:
        body = cm.get_data(url)
    except Exception:
        cm.dump('Error in fetching stores: %s' % url, log_name)
        return []

    raw = json.loads(body)
    store_list = []
    for s in raw:
        entry = cm.init_store_entry(data['brand_id'], data['brandname_e'], data['brandname_c'])
        entry[cm.name_e] = s['storename']
        entry[cm.addr_e] = cm.reformat_addr(
            ', '.join([s['building'].replace(u'operated by ', u''), s['street'].strip()]))

        # ``country`` may be None (it is checked here), so normalise it once
        # and reuse it; the US test below used to call .strip() on the raw
        # value and crashed for a store with a city but no country.
        country = s['country'].strip() if s['country'] is not None else None
        if country is not None:
            entry[cm.country_e] = country.upper()

        if s['city'] is not None:
            if country == u'US':
                tmp = s['city'].split(',')
                entry[cm.city_e] = tmp[0].strip().upper()
                if len(tmp) > 1:
                    entry[cm.province_e] = tmp[1].strip().upper()
            else:
                entry[cm.city_e] = s['city'].strip().upper()
            entry[cm.city_e] = cm.extract_city(entry[cm.city_e])[0]

        if s['zip'] is not None:
            entry[cm.zip_code] = s['zip'].strip()
        if s['phone'] is not None:
            entry[cm.tel] = s['phone'].strip()
        if s['storeemail'] is not None:
            entry[cm.email] = s['storeemail'].strip()
        # A "link" containing '@' is not stored as the URL.
        if s['storelink'] is not None and u'@' not in s['storelink']:
            entry[cm.url] = s['storelink'].strip()
        if s['storetype'] is not None:
            entry[cm.store_class] = s['storetype'].strip()

        hours = []
        for item in ['monday', 'tuesday', 'wednesday', 'thursday', 'friday', 'saturday', 'sunday']:
            if s[item] is not None:
                hours.append('%s: %s' % (item, s[item]))
        entry[cm.hours] = ', '.join(hours)

        styles = []
        for item in ['menswear', 'womenswear', 'kidswear']:
            if s[item] == '1':
                styles.append(item)
        entry[cm.store_type] = ', '.join(styles)

        if s['latitude'] is not None and s['latitude'].strip() != '':
            entry[cm.lat] = string.atof(s['latitude'])
        if s['longitude'] is not None and s['longitude'].strip() != '':
            entry[cm.lng] = string.atof(s['longitude'])

        gs.field_sense(entry)
        ret = gs.addr_sense(entry[cm.addr_e], entry[cm.country_e])
        # Only fill the province from the address when still blank.
        if ret[1] is not None and entry[cm.province_e] == '':
            entry[cm.province_e] = ret[1]
        gs.field_sense(entry)

        cm.dump('(%s / %d) Found store: %s, %s (%s, %s)' %
                (data['brandname_e'], data['brand_id'], entry[cm.name_e], entry[cm.addr_e],
                 entry[cm.country_e], entry[cm.continent_e]), log_name)
        db.insert_record(entry, 'stores')
        store_list.append(entry)
    return store_list
# cm.dump('Error in fetching stores: %s' % url, log_name) return () m = re.search(ur'<div class="col">\s*<h3>Boutique</h3>\s*<div class="content">(.+?)</div>', body, re.S) if not m: return () sub = m.group(1) entry = cm.init_store_entry(data['brand_id'], data['brandname_e'], data['brandname_c']) md5 = hashlib.md5() md5.update(url) entry[cm.native_id] = md5.hexdigest() entry[cm.country_e] = data['country_code'] if entry[cm.country_e] == 'US': tmp_list = tuple(tmp.strip() for tmp in cm.reformat_addr(data['city']).strip(',')) if len(tmp_list) == 2: if re.search('[A-Z]{2}', tmp_list[1]) or tmp_list[1] == 'D.C.': entry[cm.province_e] = tmp_list[1] entry[cm.city_e] = cm.extract_city(data['city'])[0] entry[cm.province_e] = data['state'] if data['state_code'] else '' sub_list = re.findall(ur'<p>(.+?)</p>', m.group(1), re.S) if len(sub_list) < 2: return () title_list = tuple(tmp.strip() for tmp in cm.reformat_addr(sub_list[0]).split(',')) entry[cm.name_e] = title_list[0] if len(title_list) > 1: entry[cm.store_class] = title_list[1] entry[cm.addr_e] = cm.reformat_addr(sub_list[1])
store_list = [] for store in tree.iter('poi'): entry = cm.init_store_entry(data['brand_id'], data['brandname_e'], data['brandname_c']) val = store.getiterator('uid')[0].text if val in store_map: continue store_map[val] = entry val = store.getiterator('name')[0].text entry[cm.name_e] = cm.html2plain(val).strip() if val else '' addr_list = [] for idx in xrange(1, 3): val = store.getiterator('address%d' % idx)[0].text if val: val = cm.reformat_addr(val) if val != '': addr_list.append(val) entry[cm.addr_e] = ', '.join(addr_list) val = store.getiterator('city')[0].text entry[cm.city_e] = cm.extract_city(val)[0] if val else '' val = store.getiterator('province')[0].text entry[cm.province_e] = cm.html2plain(val).strip().upper() if val else '' if entry[cm.province_e] == '': val = store.getiterator('state')[0].text entry[cm.province_e] = cm.html2plain(val).strip().upper() if val else '' val = store.getiterator('country')[0].text entry[cm.country_e] = val.strip().upper() if val else '' val = store.getiterator('email')[0].text
def fetch_stores(data): url = '%s%d/' % (data['store_url'], data['city_id']) try: html = cm.get_data(url) except Exception: print 'Error occured: %s' % url dump_data = { 'level': 0, 'time': cm.format_time(), 'data': { 'url': url }, 'brand_id': data['brand_id'] } cm.dump(dump_data) return [] store_list = [] for m in re.finditer(ur'<div class="store">', html): store_sub, ss, se = cm.extract_closure(html[m.start():], ur'<div\b', ur'</div') if set == 0: continue entry = cm.init_store_entry(data['brand_id'], data['brandname_e'], data['brandname_c']) start = store_sub.find('<div class="store_name">') if start == -1: continue sub, start, end = cm.extract_closure(store_sub[start:], ur'<div\b', ur'</div>') if end == 0: continue m1 = re.search(ur'<p>(.+?)</p>', sub) if m1 is not None: entry[cm.name_e] = m1.group(1).strip() start = store_sub.find('<div class="store_address">') if start == -1: continue sub, start, end = cm.extract_closure(store_sub[start:], ur'<div\b', ur'</div>') if end == 0: continue m1 = re.search(ur'<p>(.+?)</p>', sub, re.S) if m1 is not None: addr_list = cm.reformat_addr(m1.group(1)).split(',') tmp = [] tel_pat = re.compile(ur'^tel[\.: ]+', re.I) for term in addr_list: if re.search(tel_pat, term.strip()) is not None: term = re.sub(tel_pat, '', term.strip()) entry[cm.tel] = cm.extract_tel(term) else: tmp.append(term.strip()) entry[cm.addr_e] = ', '.join(tmp) m1 = re.search(ur'<input\s.+?name="latitude"\s+value="(.+?)"\s*/>', store_sub) if m1 is not None: entry[cm.lat] = string.atof(m1.group(1)) m1 = re.search(ur'<input\s.+?name="longitude"\s+value="(.+?)"\s*/>', store_sub) if m1 is not None: entry[cm.lng] = string.atof(m1.group(1)) entry[cm.country_e] = data[cm.country_e] entry[cm.continent_e] = data[cm.continent_e] entry[cm.city_e] = data[cm.city_e] gs.field_sense(entry) print '(%s / %d) Found store: %s, %s (%s, %s)' % ( data['brandname_e'], data['brand_id'], entry[cm.name_e], entry[cm.addr_e], entry[cm.country_e], entry[cm.continent_e]) 
store_list.append(entry) db.insert_record(entry, 'stores')
def get_store_details(data): url = data['url'] try: html = cm.post_data( url, { 'country': data['country_id'], 'city': data['city_id'], 'recordid': data['store_id'] }) except Exception: print 'Error occured: %s' % url dump_data = { 'level': 1, 'time': cm.format_time(), 'data': { 'url': url }, 'brand_id': brand_id } cm.dump(dump_data) return [] entry = cm.init_store_entry(brand_id, brandname_e, brandname_c) info = json.loads(html)['elements'] addr = cm.reformat_addr(info['address'].replace('\\', '').replace( '<p>', ',').replace('</p>', ',')) # 第一行为商店名称 terms = addr.split(',') if len(terms) > 0: entry[cm.name_e] = cm.reformat_addr(terms[0]) entry[cm.addr_e] = addr gmap_url = info['gmap'] m = re.findall(ur'(-?\d+\.\d+),(-?\d+\.\d+)', gmap_url) if len(m) > 0: cm.update_entry(entry, { cm.lat: string.atof(m[0][0]), cm.lng: string.atof(m[0][1]) }) entry[cm.url] = info['shareurl'].replace('\\', '') entry[cm.hours] = info['openingtimes'] entry[cm.comments] = info['other'] # Geo country = data['country'] city = data['city'] cm.update_entry(entry, {cm.country_e: country, cm.city_e: city}) entry[cm.city_e] = cm.extract_city(entry[cm.city_e])[0] gs.field_sense(entry) ret = gs.addr_sense(entry[cm.addr_e], entry[cm.country_e]) if ret[1] is not None and entry[cm.province_e] == '': entry[cm.province_e] = ret[1] if ret[2] is not None and entry[cm.city_e] == '': entry[cm.city_e] = ret[2] gs.field_sense(entry) print '(%s / %d) Found store: %s, %s (%s, %s)' % ( brandname_e, brand_id, entry[cm.name_e], entry[cm.addr_e], entry[cm.country_e], entry[cm.continent_e]) db.insert_record(entry, 'stores') return entry
entry[cm.name_e] = cm.html2plain(m.group(1)).strip() m = re.search(ur'<brands>(.+?)</brands>', s) if m is not None: brand_list = [] for m1 in re.findall(ur'<brand>(.+?)</brand>', m.group(1)): brand_list.append(m1) entry[cm.store_type] = ', '.join(brand_list) m = re.search(ur'<city>(.+?)</city>', s) if m is not None: entry[cm.city_e] = cm.html2plain(m.group(1)).strip().upper() m = re.search(ur'<address>(.+?)</address>', s) if m is not None: entry[cm.addr_e] = cm.reformat_addr(m.group(1)).strip() m = re.search(ur'<phone>(.+?)</phone>', s) if m is not None: entry[cm.tel] = m.group(1).strip() m = re.search( ur'<(?:latitude|latitiude)>(.+?)</(?:latitude|latitiude)>', s) if m is not None: entry[cm.lat] = string.atof(m.group(1)) m = re.search(ur'<longitude>(.+?)</longitude>', s) if m is not None: entry[cm.lng] = string.atof(m.group(1)) gs.field_sense(entry) ret = gs.addr_sense(entry[cm.addr_e], entry[cm.country_e])
try: body = cm.get_data(url) except Exception, e: cm.dump('Error in fetching stores: %s' % url, log_name) return () m = re.search(ur'<div class="contact-info">(.+?)</div>', body, re.S) if m is None: return s entry = s.copy() pat_tel = re.compile(ur'\s*Numéro de téléphone\s*[:\.]\s*') pat_fax = re.compile(ur'\s*Numéro de fax\s*[:\.]\s*') pat_email = re.compile(ur'\s*Adresse électronique\s*[:\.]\s*') for term in [ tmp.strip() for tmp in cm.reformat_addr(m.group(1)).split(',') ]: if re.search(pat_tel, term): entry[cm.tel] = re.sub(pat_tel, '', term).strip() if re.search(pat_fax, term): entry[cm.fax] = re.sub(pat_fax, '', term).strip() if re.search(pat_email, term): entry[cm.email] = re.sub(pat_email, '', term).strip() return entry def fetch_stores(data): url = data['url'] try: body = cm.get_data(url) except Exception, e:
def fetch_stores(data): url = data['data_url'] logger = logging.getLogger('firenzeLogger') param = {'storeId': 10551, 'catalogId': 10051, 'countryTab': 'in', 'countryCode': data['country_code']} if data['country_code'] == 'US': param['radius'] = 20 param['state'] = data['state_code'] try: body = cm.get_data(url, param)['body'] except Exception as e: logger.error('Error in fetching stores: %s, %s' % (url, param)) return () store_list = [] for item in (pq(tmp) for tmp in pq(body)('div.vcard')): entry = cm.init_store_entry(data['brand_id'], data['brandname_e'], data['brandname_c']) entry[cm.country_e] = data['country_code'] tmp = item('div.resultsHeader a b') if len(tmp) > 0 and tmp[0].text: entry[cm.name_e] = cm.html2plain(tmp[0].text) tmp = item('div.adr') if len(tmp) > 0: tmp = pq(tmp[0]) entry[cm.addr_e] = cm.reformat_addr(unicode(tmp)) tmp1 = tmp('.locality') if len(tmp1) > 0 and tmp1[0].text: entry[cm.city_e] = cm.extract_city(tmp1[0].text)[0] tmp1 = tmp('.region') if len(tmp1) > 0 and tmp1[0].text: entry[cm.province_e] = cm.html2plain(tmp1[0].text).strip().upper() tmp1 = tmp('.postal-code') if len(tmp1) > 0 and tmp1[0].text: entry[cm.zip_code] = tmp1[0].text tmp = item('div.tel') if len(tmp) > 0: entry[cm.tel] = tmp[0].text if tmp[0].text else '' tmp = item('div.store_hours') if len(tmp) > 0: entry[cm.hours] = cm.reformat_addr(unicode(pq(tmp[0]))) tmp = item('#map') if len(tmp) > 0: m = re.search(ur'Lat=(-?\d+\.\d+)', unicode(pq(tmp[0]))) if m: entry[cm.lat] = string.atof(m.group(1)) m = re.search(ur'Lng=(-?\d+\.\d+)', unicode(pq(tmp[0]))) if m: entry[cm.lng] = string.atof(m.group(1)) gs.field_sense(entry) ret = gs.addr_sense(entry[cm.addr_e], entry[cm.country_e]) if ret[1] is not None and entry[cm.province_e] == '': entry[cm.province_e] = ret[1] if ret[2] is not None and entry[cm.city_e] == '': entry[cm.city_e] = ret[2] gs.field_sense(entry) # cm.dump('(%s / %d) Found store: %s, %s (%s, %s)' % (data['brandname_e'], data['brand_id'], # entry[cm.name_e], 
entry[cm.addr_e], entry[cm.country_e], # # entry[cm.continent_e]), log_name) # cm.insert_record(data['database'], entry, 'stores') store_list.append(entry) return tuple(store_list)
def fetch_stores(data): url = data['url'] try: body = cm.get_data(url) except Exception: cm.dump('Error in fetching stores: %s' % url, log_name) return [] store_list = [] for m in re.finditer(ur'<div\s+class\s*=\s*"storeItem"', body): entry = cm.init_store_entry(data['brand_id'], data['brandname_e'], data['brandname_c']) sub = cm.extract_closure(body[m.end():], ur'<div\b', ur'</div>')[0] m1 = re.search(ur'<div class="bubbleInfo">(.+?)</div>', sub) if m1 is not None: entry[cm.addr_e] = cm.reformat_addr(m1.group(1)) m1 = re.search(ur'lat="(-?\d+\.\d+)"', sub) if m1 is not None: entry[cm.lat] = string.atof(m1.group(1)) m1 = re.search(ur'lng="(-?\d+\.\d+)"', sub) if m1 is not None: entry[cm.lng] = string.atof(m1.group(1)) m1 = re.search(ur'<span>\s*Tel:\s*([^<>]+)</span>', sub) if m1 is not None: entry[cm.tel] = m1.group(1).strip() m1 = re.search(ur'http://maps\.google\.com/maps\?q=([^&"]+)', sub) if m1 is None: continue ret = gs.geocode(latlng=m1.group(1)) if ret is None: tmp = [tmp1.strip() for tmp1 in entry[cm.addr_e].split(',')] if 'MAX' in tmp[0]: del tmp[0] if cm.extract_tel(tmp[-1])!='': del tmp[-1] if len(tmp) > 0: ret = gs.geocode(', '.join(tmp)) if ret is not None: city = '' province = '' country = '' zip_code = '' tmp = ret[0]['address_components'] for v in tmp: if 'locality' in v['types']: city = v['long_name'].strip().upper() elif 'administrative_area_level_1' in v['types']: province = v['long_name'].strip().upper() elif 'country' in v['types']: country = v['long_name'].strip().upper() elif 'postal_code' in v['types']: zip_code = v['long_name'].strip() entry[cm.country_e] = country entry[cm.province_e] = province entry[cm.city_e] = city entry[cm.zip_code] = zip_code gs.field_sense(entry) cm.dump('(%s / %d) Found store: %s, %s (%s, %s)' % (data['brandname_e'], data['brand_id'], entry[cm.name_e], entry[cm.addr_e], entry[cm.country_e], entry[cm.continent_e]), log_name) db.insert_record(entry, 'stores') store_list.append(entry) else: cm.dump('Error in fetching 
stores: latlng=%s, addr=%s' % (m1.group(1), entry[cm.addr_e]), log_name) continue
try: details = pq(pq(body)('.store-details')[0]) entry = cm.init_store_entry(data['brand_id'], data['brandname_e'], data['brandname_c']) entry[cm.country_e] = data['country'] entry[cm.province_e] = data['state'] if data['state'] else '' entry[cm.url] = url entry[cm.name_e] = data['store_name'] entry[cm.city_e] = data['city'] if data['city'] else '' if data['addr']: entry[cm.addr_e] = data['addr'] else: entry[cm.addr_e] = cm.reformat_addr(unicode(pq(details('p')[0]))) if data['tel']: entry[cm.tel] = data['tel'] else: tmp = details('p')[1].text pat = re.compile(ur'(phone|tel|telephone)\s*[\.: ]?\s*', re.I) if re.search(pat, tmp): entry[cm.tel] = re.sub(pat, '', tmp).strip() sub = unicode(details) start = sub.find(u'Regular Store Hours') if start != -1: m = re.search(ur'<p>(.+?)<ul', sub[start:], re.S) if m: entry[cm.hours] = cm.reformat_addr(m.group(1))
entry[cm.tel] = s['phone'] if s['phone'] else '' entry[cm.url] = ( data['host'] + s['storeDetailUrl']) if s['storeDetailUrl'] else '' hour_list = [] try: body = cm.get_data(entry[cm.url], client='iPad') html = pq(body) for sub in (pq(tmp) for tmp in html( 'table.storeDetailed-horaires-content tr')): tmp = sub('td[class!="hours"]') if len(tmp) == 0: continue val1 = cm.reformat_addr(tmp[0].text).strip() tmp = sub('td.hours') if len(tmp) == 0: continue val2 = cm.reformat_addr(tmp[0].text).strip() if val1 == '' or val2 == '': continue hour_list.append('%s %s' % (val1, val2)) tmp = html('div.storeDetailed-horaires-content') if len(tmp) > 0: hour_list.append('Closing days: ' + cm.reformat_addr(tmp[0].text).strip()) entry[cm.hours] = ', '.join(hour_list) except Exception as e: print traceback.format_exc()
def fetch(level=1, data=None, user='******', passwd=''): db = common.StoresDb() db.connect_db(user=user, passwd=passwd) db.execute(u'DELETE FROM %s WHERE brand_id=%d' % ('stores', brand_id)) data = { 's': -89, 'w': -179, 'n': 89, 'e': 179, 'chinese': 0, 'repair': 1, 'store': 1 } try: html = common.get_data(url_init, data) except Exception: print 'Error occured in getting the list of countries: %s' % url_init dump_data = { 'level': 1, 'time': common.format_time(), 'data': { 'data': url_init }, 'brand_id': brand_id } common.dump(dump_data) return [] store_list = [] store_map = json.loads(html) tot = 0 while True: # 得到{'uid':entry}的字典 tmp = store_map['lists'] # 是否有'more' flag = False if 'has_key' not in dir(tmp): raw_stores = {} for item in tmp: if 'more' in item: flag = item['more'] else: raw_stores[item['nid']] = item else: raw_stores = tmp for k in tmp: if 'more' in tmp[k]: flag = tmp[k]['more'] break # 分析raw_stores for k in raw_stores: s = raw_stores[k] if 'more' in s: flag = s['more'] else: entry = common.init_store_entry(brand_id, brandname_e, brandname_c) if s['country'] is not None: country_c = s['country'].strip().upper() ret = gs.look_up(country_c, 1) if ret is not None: entry[common.country_e] = ret['name_e'] entry[common.country_c] = ret['name_c'] else: if common.is_cjk(country_c): entry[common.country_c] = country_c else: entry[common.country_e] = country_c if s['address'] is not None: addr = common.reformat_addr(s['address']) if common.is_cjk(addr): entry[common.addr_c] = addr else: entry[common.addr_e] = addr city = s['city'] if city is not None: city = city.strip().upper() ret = gs.look_up(city, 3) if ret is not None: entry[common.city_c] = ret['name_c'] entry[common.city_e] = ret['name_e'] else: if common.is_cjk(city): entry[common.city_c] = city else: entry[common.city_e] = city entry[common.city_e] = common.extract_city( entry[common.city_e])[0] if s['email'] is not None: entry[common.email] = s['email'] if s['fax'] is not None: entry[common.fax] = 
s['fax'] if s['latitude'] is not None: entry[common.lat] = string.atof(s['latitude']) if s['longitude'] is not None: entry[common.lng] = string.atof(s['longitude']) if s['phone'] is not None: entry[common.tel] = s['phone'] if s['postal_code'] is not None: entry[common.zip_code] = s['postal_code'] if s['title'] is not None: name = s['title'] if common.is_cjk(name): entry[common.name_c] = name else: entry[common.name_e] = name if s['operating_hours'] is not None: entry[common.hours] = s['operating_hours'] if s['url'] is not None: entry[common.url] = host + s['url'] gs.field_sense(entry) print '%s: Found store: %s, %s (%s, %s)' % ( brandname_e, entry[common.name_e], entry[common.addr_e], entry[common.country_e], entry[common.continent_e]) db.insert_record(entry, 'stores') store_list.append(entry) if flag: tot += len(store_map['lists']) - 1 data['offset'] = tot store_map = json.loads(common.get_data(url_more, data)) continue else: tot += len(store_map['lists']) break print 'Found a total of %d stores.' % tot db.disconnect_db() return store_list
body = re.sub(ur'GetLocalLevisCallback\(', '', body)[:-1] for s in json.loads(body)['d']['results']: try: entry = cm.init_store_entry(data['brand_id'], data['brandname_e'], data['brandname_c']) uid = s['__metadata']['uri'] if uid in store_map: cm.dump(u'%s already exists.' % uid, log_name) continue entry[cm.country_e] = cm.html2plain( s['CountryRegion']).strip().upper() entry[cm.native_id] = uid entry[cm.city_e] = cm.extract_city(s['Locality'])[0] entry[cm.addr_e] = cm.reformat_addr(s['AddressLine']) entry[cm.zip_code] = s['PostalCode'] entry[cm.tel] = s['Phone'] entry[cm.name_e] = cm.html2plain( s['BranchName']).strip() if s['BranchName'] else '' try: entry[cm.lat] = string.atof( s['Latitude']) if s['Latitude'] != '' else '' except (ValueError, KeyError, TypeError) as e: cm.dump('Error in fetching lat: %s' % str(e), log_name) try: entry[cm.lng] = string.atof( s['Longitude']) if s['Longitude'] != '' else '' except (ValueError, KeyError, TypeError) as e:
def fetch_stores(data): url = data['host'] + 'after-sales-services/boutique-finder' param = { 'productOffer': 'All', 'city': data['city_id'], 'boutiqueType': 'All', 'country': data['country_id'] } if data['state'] is not None: param['prefecture'] = data['state']['state_id'] page = 0 totStore = -1 store_list = [] while True: if totStore != -1 and len(store_list) >= totStore: break else: page += 1 param['numPageToGet'] = page try: body = cm.get_data(url, param) except Exception, e: # cm.dump('Error in fetching stores: %s, %s' % (url, param), log_name) break m = re.search(ur'<list id="WS_boutique_list" nbBoutique="(\d+)">', body) if m is None: # cm.dump('Error in fetching stores: %s, %s' % (url, param), log_name) break totStore = string.atoi(m.group(1)) sub = cm.extract_closure(body[m.start():], ur'<list\b', ur'</list>')[0] for m in re.finditer(ur'<list id="WS_boutique_\d+">', sub): store_sub = cm.extract_closure(sub[m.start():], ur'<list\b', ur'</list>')[0] entry = cm.init_store_entry(data['brand_id'], data['brandname_e'], data['brandname_c']) entry[cm.city_e] = cm.extract_city(data['city_name'])[0] entry[cm.country_e] = data['country_name'] if data['state'] is not None: entry[cm.province_e] = data['state']['state_name'] m1 = re.search(ur'productOffers="([^"]+)"', store_sub) if m1 is not None: entry[cm.store_type] = m1.group(1).strip() m1 = re.search(ur'boutiqueType="([^"]+)">', store_sub) if m1 is not None: entry[cm.store_class] = m1.group(1).strip() m1 = re.search(ur"<p class='boutique_title'>(.+?)</p>", store_sub) if m1 is not None: entry[cm.name_e] = m1.group(1).strip() m1 = re.search( ur'<object type="text" id="WS_boutique_detail[^"]+">(.+?)</object>', store_sub, re.S) if m1 is not None: m2 = re.search(ur'<p [^<>]*>(.+?)</p>', m1.group(1), re.S) if m2 is not None: addr_list = [] for term in (tmp.strip() for tmp in cm.reformat_addr( m2.group(1)).split(',')): pat_tel = re.compile(ur'phone:\s*', re.I) pat_fax = re.compile(ur'fax:\s*', re.I) pat_email = re.compile( 
r'(?:[a-z0-9!#$%&\'*+/=?^_`{|}~-]+(?:\.[a-z0-9!#$%&\'*+/=?^_`{|}~-]+)*|"(?:[\x01-\x08\x0b\x0c\x0e-\x1f\x21\x23-\x5b\x5d-\x7f]|\\[\x01-\x09\x0b\x0c\x0e-\x7f])*")@(?:(?:[a-z0-9](?:[a-z0-9-]*[a-z0-9])?\.)+[a-z0-9](?:[a-z0-9-]*[a-z0-9])?|\[(?:(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\.){3}(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?|[a-z0-9-]*[a-z0-9]:(?:[\x01-\x08\x0b\x0c\x0e-\x1f\x21-\x5a\x53-\x7f]|\\[\x01-\x09\x0b\x0c\x0e-\x7f])+)\])' ) if re.search(pat_tel, term) is not None: entry[cm.tel] = re.sub(pat_tel, '', term).strip() elif re.search(pat_fax, term) is not None: entry[cm.fax] = re.sub(pat_fax, '', term).strip() elif re.search(pat_email, term) is not None: entry[cm.email] = re.search(pat_email, term).group() else: addr_list.append(term) entry[cm.addr_e] = ', '.join(addr_list) gs.field_sense(entry) ret = gs.addr_sense(entry[cm.addr_e], entry[cm.country_e]) if ret[1] is not None and entry[cm.province_e] == '': entry[cm.province_e] = ret[1] if ret[2] is not None and entry[cm.city_e] == '': entry[cm.city_e] = ret[2] gs.field_sense(entry) logger.info( '(%s / %d) Found store: %s, %s (%s, %s)' % (data['brandname_e'], data['brand_id'], entry[cm.name_e], entry[cm.addr_e], entry[cm.country_e], entry[cm.continent_e])) # db.insert_record(entry, 'stores') store_list.append(entry)
entry[cm.zip_code]) except (IndexError, TypeError): pass try: tmp = store('td.address span.tel')[0].text entry[cm.tel] = tmp if tmp else '' except IndexError: pass try: tmp = store('td.address span.fax')[0].text entry[cm.fax] = tmp if tmp else '' except IndexError: pass hours_list = [] for item in (cm.reformat_addr(unicode(pq(tmp))) for tmp in store('td.opening table tr')): if 'opening times' in item.lower(): continue hours_list.append(re.sub(ur':\s*,\s*', ': ', item)) entry[cm.hours] = ', '.join(hours_list) gs.field_sense(entry) if entry[cm.addr_e]: ret = gs.addr_sense(entry[cm.addr_e], entry[cm.country_e]) if ret[1] is not None and entry[cm.province_e] == '': entry[cm.province_e] = ret[1] if ret[2] is not None and entry[cm.city_e] == '': entry[cm.city_e] = ret[2] gs.field_sense(entry)
start = body.find(ur'<div class="storeLocation">') if start == -1: cm.dump('Error in fetching store details: %s' % url, log_name) return () sub = cm.extract_closure(body[start:], ur'<div\b', ur'</div>')[0] entry = cm.init_store_entry(data['brand_id'], data['brandname_e'], data['brandname_c']) entry[cm.country_e] = data['country'] entry[cm.url] = url m = re.search(ur'<h2 class="title"\s*>([^<>]+)</h2>', sub) entry[cm.name_e] = cm.html2plain(m.group(1)).strip() if m else '' m = re.search(ur'<address class="address"\s*>(.+?)</address>', sub, re.S) entry[cm.addr_e] = cm.reformat_addr(m.group(1)) if m else '' m = re.search(ur'<p>(.+)</p>', sub[m.end():], re.S) contact_list = [] pat_tel = re.compile(ur'phone[\s\d]*[:\.]\s*', re.I) pat_fax = re.compile(ur'fax[\s\d]*[:\.]\s*', re.I) pat_email = re.compile(ur'email[\s\d]*[:\.]\s*', re.I) for term in (tmp.strip() for tmp in cm.reformat_addr(m.group(1)).split(',')): if re.search(pat_tel, term): entry[cm.tel] = re.sub(pat_tel, '', term).strip() elif re.search(pat_fax, term): entry[cm.fax] = re.sub(pat_fax, '', term).strip() elif re.search(pat_email, term): entry[cm.email] = re.sub(pat_email, '', term).strip()
store_list = [] for s in re.findall(ur'<div class="store_wrapper">(.+?)</div>', body, re.S): entry = cm.init_store_entry(data['brand_id'], data['brandname_e'], data['brandname_c']) entry[cm.country_e] = data['country'] m = re.search(ur'<h2>(.+?)</h2>', s) if m is not None: entry[cm.name_e] = cm.html2plain(m.group(1)) m = re.search(ur'<p>(.+?)</p>', s, re.S) if m is not None: addr_list = [ tmp.strip() for tmp in cm.reformat_addr(m.group(1)).split(',') ] tel = cm.extract_tel( re.sub(re.compile('^\s*t\s*(\.|:)\s*', re.I), '', addr_list[-1])) if tel != '': if entry[cm.country_e] == 'CHINA': if len(re.findall(r'\d', tel)) > 6: entry[cm.tel] = tel del addr_list[-1] else: entry[cm.tel] = tel del addr_list[-1] entry[cm.addr_e] = ', '.join(addr_list) gs.field_sense(entry)
def fetch_stores(data):
    """Fetch the dealers of one city and insert them into the ``stores`` table.

    The service at ``data['data_url']`` is queried with language, country,
    region and city ids; every ``dealer`` element of the answer is one store.
    The module-level ``id_set`` maps dealer ids already seen to their entries:
    a new id produces a fresh entry, while a known id is only revisited when
    ``data['country_code']`` is 'CN', in which case the Chinese name, address,
    province and city are added to the existing entry and its row is
    rewritten.

    :param data: dict with 'data_url', 'lang', 'country_id', 'region_id',
        'city_id', 'brand_id', 'brandname_e', 'brandname_c', 'country_code',
        'region_name' and 'city_name'.
    :return: list of the newly created entries; ``[]`` if the request fails.
    """
    url = data['data_url']
    param = {'lang': data['lang'], 'country': data['country_id'], 'region': data['region_id'], 'city': data['city_id']}
    try:
        body = cm.get_data(url, param)
    except Exception:
        # The failure is dumped twice: as a text line to the brand log and
        # as a structured record.
        cm.dump('Error in fetching stores: %s, %s' % (url, param), 'tudor_log.txt')
        dump_data = {'level': 0, 'time': cm.format_time(), 'data': {'url': url}, 'brand_id': data['brand_id']}
        cm.dump(dump_data)
        return []

    store_list = []
    for store in (pq(tmp) for tmp in pq(body.encode('utf-8'))('dealer')):
        try:
            entry = cm.init_store_entry(data['brand_id'], data['brandname_e'], data['brandname_c'])
            entry[cm.country_e] = data['country_code']
            entry[cm.province_e] = data['region_name'].replace('PROVINCE', '').strip()
            entry[cm.city_e] = data['city_name']

            store_id = store[0].attrib['id']
            if store_id in id_set:
                # Known dealer: other countries skip it, CN adds the Chinese
                # fields to the entry recorded earlier.
                if data['country_code'] == 'CN':
                    entry = id_set[store_id]
                    entry[cm.name_c] = cm.reformat_addr(store('name')[0].text).strip()
                    tmp = store('address')
                    entry[cm.addr_c] = cm.reformat_addr(tmp[0].text) if len(tmp) > 0 and tmp[0].text else ''
                    entry[cm.province_c] = data['region_name']
                    entry[cm.city_c] = data['city_name']
                    # Replace the stored row with the enriched entry.
                    # NOTE(review): native_id comes from the remote document
                    # and is interpolated into the SQL text -- confirm
                    # whether db.execute supports bound parameters.
                    db.execute(u'DELETE FROM stores WHERE brand_id=%d AND native_id="%s"' % (
                        data['brand_id'], entry[cm.native_id]))
                    db.insert_record(entry, 'stores')
            else:
                entry[cm.native_id] = store_id
                entry[cm.name_e] = cm.reformat_addr(store('name')[0].text).strip()
                tmp = store('address')
                entry[cm.addr_e] = cm.reformat_addr(tmp[0].text) if len(tmp) > 0 and tmp[0].text else ''
                tmp = store('phone1')
                entry[cm.tel] = tmp[0].text.strip() if len(tmp) > 0 and tmp[0].text else ''
                tmp = store('fax1')
                entry[cm.fax] = tmp[0].text.strip() if len(tmp) > 0 and tmp[0].text else ''

                # A coordinate that fails to parse is logged and the store
                # is still processed.
                tmp = store('latitude')
                try:
                    entry[cm.lat] = string.atof(tmp[0].text) if len(tmp) > 0 and tmp[0].text else ''
                except (ValueError, KeyError, TypeError) as e:
                    cm.dump('Error in fetching lat: %s' % str(e), log_name)
                tmp = store('longitude')
                try:
                    entry[cm.lng] = string.atof(tmp[0].text) if len(tmp) > 0 and tmp[0].text else ''
                except (ValueError, KeyError, TypeError) as e:
                    cm.dump('Error in fetching lng: %s' % str(e), log_name)

                gs.field_sense(entry)
                # Fill country/province/city from the address only where
                # they are still empty.
                ret = gs.addr_sense(entry[cm.addr_e])
                if ret[0] is not None and entry[cm.country_e] == '':
                    entry[cm.country_e] = ret[0]
                if ret[1] is not None and entry[cm.province_e] == '':
                    entry[cm.province_e] = ret[1]
                if ret[2] is not None and entry[cm.city_e] == '':
                    entry[cm.city_e] = ret[2]
                gs.field_sense(entry)

                cm.dump('(%s / %d) Found store: %s, %s (%s, %s)' % (data['brandname_e'], data['brand_id'],
                                                                    entry[cm.name_e], entry[cm.addr_e],
                                                                    entry[cm.country_e],
                                                                    entry[cm.continent_e]), 'tudor_log.txt')
                db.insert_record(entry, 'stores')
                # Remember the entry so that a later pass can recognise it.
                id_set[store_id] = entry
                store_list.append(entry)
        except (IndexError, TypeError) as e:
            # A malformed dealer element is reported and skipped.
            print traceback.format_exc()
            continue

    return store_list
entry[cm.city_e] = ret[2] gs.field_sense(entry) logger.info('(%s / %d) Found store: %s, %s (%s, %s, %s)' % (data['brandname_e'], data['brand_id'], entry[cm.name_e], entry[cm.addr_e], entry[cm.city_e], entry[cm.country_e], entry[cm.continent_e])) cm.insert_record(db, entry, 'spider_stores.stores') store_list.append(entry) elif data['m'] == 5: for country in (pq(tmp) for tmp in pq(body)('table[cellpadding="6"]')): country_e = cm.html2plain(country('td[style="color:#FFF;"]')[0].text).strip().upper() country_e = 'UAE' if 'arab emirates' in country_e.lower() else country_e for store in country('td[valign="top"]'): if 'bgcolor' in store.attrib: continue addr_raw = cm.reformat_addr(unicode(pq(store))) if addr_raw == '': continue addr_list = [tmp.strip() for tmp in addr_raw.split(',')] entry = cm.init_store_entry(data['brand_id'], data['brandname_e'], data['brandname_c']) entry[cm.name_e] = addr_list[0] entry[cm.country_e] = country_e del addr_list[0] if country_e in ('HONG KONG', 'JAPAN', 'UAE') or ( country_e == 'THAILAND' and 'ext.' in addr_list[-1]): entry[cm.tel] = addr_list[-1] del addr_list[-1] entry[cm.addr_e] = ', '.join(addr_list) gs.field_sense(entry) ret = gs.addr_sense(entry[cm.addr_e], entry[cm.country_e])
def fetch_stores(data): """ 获得门店的详细信息 :rtype : [entries] :param data: """ try: html = cm.get_data(data['url']) except Exception: print 'Error occured: %s' % url dump_data = { 'level': 1, 'time': cm.format_time(), 'data': { 'url': url }, 'brand_id': brand_id } cm.dump(dump_data) return [] entries = [] start = html.find(u'<ul class="store-list">') if start == -1: return entries start += len(u'<ul class="store-list">') end = html.find(u'</ul>', start) html = html[start:end] for m1 in re.findall(ur'<li class="(.*?)">(.*?)</li>', html, re.S): store = cm.init_store_entry(brand_id, brandname_e, brandname_c) store[cm.store_type] = m1[0] sub_html = m1[1] m2 = re.findall(ur'<h3 class="store-name">(.*?)</h3>', sub_html) if len(m2) > 0: store[cm.name_e] = cm.reformat_addr(m2[0]) m2 = re.findall(ur'<p class="store-address">(.*?)</p>', sub_html, re.S) if len(m2) > 0: store[cm.addr_e] = cm.reformat_addr(m2[0]) cm.update_entry( store, { cm.continent_e: data[cm.continent_e].strip().upper(), cm.country_e: data[cm.country_e].strip().upper(), cm.city_e: data[cm.city_e].strip().upper() }) entry = store gs.field_sense(entry) ret = gs.addr_sense(entry[cm.addr_e]) if ret[0] is not None and entry[cm.country_e] == '': entry[cm.country_e] = ret[0] if ret[1] is not None and entry[cm.province_e] == '': entry[cm.province_e] = ret[1] if ret[2] is not None and entry[cm.city_e] == '': entry[cm.city_e] = ret[2] gs.field_sense(entry) entry[cm.city_e] = cm.extract_city(entry[cm.city_e])[0] print '%s: Found store: %s, %s (%s, %s)' % ( brandname_e, store[cm.name_e], store[cm.addr_e], store[cm.country_e], store[cm.continent_e]) db.insert_record(store, 'stores') entries.append(store)
def fetch_stores(data): url = data['url'] try: html, cookie_map = cm.get_data_cookie(url) except Exception: print 'Error occured in getting country list: %s' % url dump_data = { 'level': 1, 'time': cm.format_time(), 'data': { 'url': url }, 'brand_id': data['brand_id'] } cm.dump(dump_data) return [] print 'SLEEPING>>>>' time.sleep(5) m = re.search( 'http://www.ninewest.com/on/demandware.store/Sites-ninewest-Site/default/Stores-Find/C\d{10}', html) if m is None: return [] url = m.group(0) cookie_map_new = {} for key in cookie_map: if 'dwpersonalization_' in key or key == 'sr_token': continue cookie_map_new[key] = cookie_map[key] cookie_map_new['invited_visitor_22225'] = '1' cookie_map = cookie_map_new try: html = cm.post_data(url, { 'dwfrm_storelocator_startaddress': 'kingman', 'dwfrm_storelocator_maxDistance': 30.00, 'dwfrm_storelocator_outlet': 'true', 'dwfrm_storelocator_retail': 'true', 'dwfrm_storelocator_optical': 'true', 'dwfrm_storelocator_eyewear': 'true', 'dwfrm_storelocator_apparel': 'true', 'dwfrm_storelocator_attire': 'true', 'dwfrm_storelocator_department': 'true', 'dwfrm_storelocator_IsMensFootwear': 'true', 'dwfrm_storelocator_IsRRR': 'true', 'dwfrm_storelocator_IsRRNY': 'true', 'dwfrm_storelocator_IsRRS': 'true', 'dwfrm_storelocator_wholesale': 'true', 'dwfrm_storelocator_bba': 'true', 'dwfrm_storelocator_ba': 'true', 'dwfrm_storelocator_search.x': 0, 'dwfrm_storelocator_search.y': 0, 'dwfrm_storelocator_countryCode': 'US', 'dwfrm_storelocator_postalCode': '67068', 'dwfrm_storelocator_distanceUnit': 'mi', 'dwfrm_storelocator_long': -98.117208, 'dwfrm_storelocator_lat': 37.647131, }, cookie=cookie_map) except Exception: print 'Error occured in getting country list: %s' % url dump_data = { 'level': 1, 'time': cm.format_time(), 'data': { 'url': url }, 'brand_id': data['brand_id'] } cm.dump(dump_data) return [] store_list = [] for m1 in re.finditer(ur'<div class="storeColumnOne">', html): sub, start, end = cm.extract_closure(html[m1.start():], 
ur'<div\b', ur'</div>') if end == 0: continue entry = cm.init_store_entry(data['brand_id'], data['brandname_e'], data['brandname_c']) m2 = re.search(ur'<div class="storename">([^<>]+)</div>', sub) if m2 is not None: entry[cm.name_e] = m2.group(1).strip() addr_list = [ m2 for m2 in re.findall( ur'<div class="adddressline">([^<>]+)</div>', sub) ] entry[cm.addr_e] = ', '.join(addr_list) m2 = re.search(ur'<div class="citystatezip">([^<>]+)</div>', sub) if m2 is not None: tmp = cm.reformat_addr(m2.group(1)) terms = re.split('[, ]+', tmp) if len(terms) < 3: entry[cm.addr_e] = tmp else: ret = gs.look_up(terms[0], 3) if ret is not None: entry[cm.city_e] = ret['name_e'] else: entry[cm.city_e] = terms[0].strip().upper() ret = gs.look_up(terms[1], 2) if ret is not None: entry[cm.province_e] = ret['name_e'] else: entry[cm.province_e] = terms[0].strip().upper() if re.match('\s*\d{5,}\s*', terms[2]) is not None: entry[cm.zip_code] = terms[2].strip() m2 = re.search(ur'<div class="storephone">([^<>]+)</div>', sub) if m2 is not None: entry[cm.tel] = m2.group(1) cm.update_entry(entry, { 'country_e': 'UNITED STATES', 'continent_e': 'NORTH AMERICA' }) gs.field_sense(entry) print '(%s / %d) Found store: %s, %s (%s, %s)' % ( data['brandname_e'], data['brand_id'], entry[cm.name_e], entry[cm.addr_e], entry[cm.country_e], entry[cm.continent_e]) store_list.append(entry) db.insert_record(entry, 'stores')
# entry[cm.city_e] = data['city'] store_sub = cm.extract_closure(sub[m.start():], ur'<tr\b', ur'</tr')[0] pat = re.compile(ur'<strong>([^<>]+)</strong>') m1 = re.search(pat, store_sub) if m1 is not None: entry[cm.name_e] = cm.html2plain(m1.group(1)).strip() store_sub = re.sub(pat, '', store_sub) start = store_sub.find(ur'<table>') if start != -1: addr_list = [] pat_tel = re.compile(ur'phone\s*[:\.]', re.I) pat_fax = re.compile(ur'fax\s*[:\.]', re.I) for term in [ tmp.strip() for tmp in cm.reformat_addr(store_sub[:start]).split(',') ]: if term == '': continue elif re.search(pat_tel, term): entry[cm.tel] = re.sub(pat_tel, '', term).strip() elif re.search(pat_fax, term): entry[cm.fax] = re.sub(pat_fax, '', term).strip() else: addr_list.append(term) entry[cm.addr_e] = ', '.join(addr_list) m1 = re.search(ur'href="([^"]+id=\d+)"', store_sub) if m1 is None or m1.group(1) in store_map: continue
def fetch_stores(db, data, logger): brand_id, brand_name, url = (data[key] for key in ('brand_id', 'brandname_c', 'url')) # try: body = cm.get_data(url) q = pq(body) # except Exception, e: # logger.error(unicode.format(u'Error in fetching contents for {0}', url)) # return () m1 = re.search(ur'var\s+markers\s*=\s*\[', body) if not m1: logger.error( unicode.format(u'Error in finding stores for {0}:{1}', brand_id, brand_name)) return () body = body[m1.end() - 1:] m2 = re.search(ur'\]\s*;', body) if not m2: logger.error( unicode.format(u'Error in finding stores for {0}:{1}', brand_id, brand_name)) return () raw = json.loads(body[:m2.end() - 1]) store_list = [] for s in raw: entry = cm.init_store_entry(brand_id, brand_name, data['brandname_c']) # try: try: entry[cm.lat], entry[cm.lng] = (float(s['location'][idx]) for idx in (0, 1)) except (KeyError, IndexError, ValueError, TypeError): pass s = s['content'] try: entry[cm.name_e] = cm.html2plain(s['title']).strip() except (KeyError, TypeError): pass tmp_list = s['analytics_label'].split('-') entry[cm.country_e] = tmp_list[0] entry[cm.city_e] = cm.extract_city(tmp_list[1])[0] try: entry[cm.addr_e] = cm.reformat_addr(s['address']).strip() except (KeyError, TypeError): pass try: entry[cm.fax] = s['fax'].strip() except (KeyError, TypeError): pass try: entry[cm.tel] = s['phone'].strip() except (KeyError, TypeError): pass try: entry[cm.email] = s['mail'].strip() except (KeyError, TypeError): pass try: entry[ cm. 
url] = u'http://en.longchamp.com/store/map' + s['url'].strip() except (KeyError, TypeError): pass try: entry[cm.zip_code] = cm.html2plain(s['zipcode_town']).replace( tmp_list[1], '').strip() except (KeyError, TypeError): pass gs.field_sense(entry) ret = gs.addr_sense(entry[cm.addr_e], entry[cm.country_e]) if ret[1] is not None and entry[cm.province_e] == '': entry[cm.province_e] = ret[1] if ret[2] is not None and entry[cm.city_e] == '': entry[cm.city_e] = ret[2] gs.field_sense(entry) logger.info( unicode.format( u'{0}:{1} FOUND STORE: {2}, {3}, ({4}, {5}, {6})', data['brand_id'], data['brandname_e'], *(entry[key] for key in (cm.name_e, cm.addr_e, cm.city_e, cm.country_e, cm.continent_e)))) cm.insert_record(db, entry, 'spider_stores.stores') store_list.append(entry) return tuple(store_list)
url = data['url'] param = {'br': '_1', 'ca': '_R', 'wr': 'HC', 'cn': u'中国', 'cr': data['province'], 'cy': data['city']} try: body = cm.get_data(url, param) except Exception, e: cm.dump('Error in fetching cities: %s, %s' % (url, param), log_name) return [] else: body = data['body'] store_list = [] city = data['city'] if city == '': m = re.search(ur'<span id="m_sthead"\s*>(.+?)</span>', body) if m is not None: city = cm.reformat_addr(m.group(1)) city = city.replace(u'市', u'').strip() for m in re.finditer(ur'<span id="m_stname"[^<>]*>(.+?)</span>', body): entry = cm.init_store_entry(data['brand_id'], data['brandname_e'], data['brandname_c']) entry[cm.country_e] = data['country'] entry[cm.province_c] = data['province'] ret = gs.look_up(data['province'], 2) if ret is not None: entry[cm.province_e] = ret['name_e'] entry[cm.city_c] = city ret = gs.look_up(city, 3) if ret is not None: entry[cm.city_e] = ret['name_e'] entry[cm.name_e] = cm.reformat_addr(m.group(1))
def get_stores(data): url = data['url'] print 'Trying to get stores for %s' % data['name'] try: html = common.get_data(url) except Exception: print 'Error occured: %s' % url dump_data = { 'level': 1, 'time': common.format_time(), 'data': { 'url': url }, 'brand_id': brand_id } common.dump(dump_data) return [] start = 0 store_list = [] while True: start = html.find('<li class="info-store clearfix">', start) if start == -1: break end = html.find('<li class="info-store clearfix">', start + 1) sub_html = html[start:end] start = end entry = common.init_store_entry(brand_id, brandname_e, brandname_c) for m in re.findall(r'<h1><a href="(.*?)">(.*?)</a>', sub_html): entry[common.url] = host + m[0] entry[common.name_e] = common.html2plain(m[1].strip()) break for m in re.findall( r'<span style="display:none" class="ll">\s*(-?\d+\.\d+),\s*(-?\d+\.\d+)\s*</span>', sub_html): common.update_entry(entry, { common.lat: string.atof(m[0]), common.lng: string.atof(m[1]) }) break for m in re.findall(r'<span class="map-address">(.*?)</span>', sub_html): entry[common.addr_e] = common.reformat_addr(m) break for m in re.findall(r'<span class="type">phone:</span>(.*?)<br />', sub_html): entry[common.tel] = m.strip() break for m in re.findall(r'<a class="email" href="mailto:(.*?@.*?)">', sub_html): entry[common.email] = m.strip() break opening_s = sub_html.find('<ul class="opening-hours') if opening_s != -1: opening_e = sub_html.find('</ul>', opening_s) o_str = sub_html[opening_s:opening_e] entry[common.hours] = ', '.join( [m for m in re.findall(r'<li>(.+?)</li>', o_str)]) brand_s = sub_html.find('<ul class="brands clearfix">') if brand_s != -1: brand_e = sub_html.find('</ul>', brand_s) b_str = sub_html[brand_s:brand_e] entry[common.store_type] = ', '.join([ common.html2plain(m) for m in re.findall(r'<li><a href=".*?">(.+?)</a></li>', b_str) ]) # Geo if 'state' in data: entry[common.province_e] = data['state'] country_e = data['name'].strip().upper() entry[common.country_e] = country_e 
gs.field_sense(entry) print '%s Found store: %s, %s (%s, %s)' % ( brandname_e, entry[common.name_e], entry[common.addr_e], entry[common.country_e], entry[common.continent_e]) db.insert_record(entry, 'stores') store_list.append(entry) return store_list
m = re.search( ur'json_init_map\s*=\s*\["(-?\d+\.?\d*)"\s*,\s*"(-?\d+\.?\d*)"', body) if m is not None: entry[cm.lat] = string.atof(m.group(1)) entry[cm.lng] = string.atof(m.group(2)) start = body.find(ur'<div class="box-testuale-right">') if start == -1: cm.dump('Error in fetching stores: %s' % url, log_name) return [] sub = cm.extract_closure(body[start:], ur'<div\b', ur'</div>')[0] m = re.search(ur'<div class="box-adress-store">(.+?)</div>', sub, re.S) if m is None: cm.dump('Error in fetching stores: %s' % url, log_name) return [] entry[cm.addr_e] = cm.reformat_addr(m.group(1)) m = re.search(ur'<h4>(.+?)</h4>', sub) if m is not None and 't:' in m.group(1).lower(): entry[cm.tel] = cm.extract_tel(m.group(1)) m = re.search(ur'<div class="box-open-store">(.+?)</div>', body, re.S) if m is not None: entry[cm.hours] = cm.reformat_addr(m.group(1)) ret = None if entry[cm.lat] != '' and entry[cm.lng] != '': ret = gs.geocode(latlng='%f,%f' % (entry[cm.lat], entry[cm.lng])) if ret is None: ret = gs.geocode(', '.join((entry[cm.addr_e], data['zone']))) if ret is not None: city = '' province = ''