def fetch_stores(data): """ 商店列表 :param data: """ html = data['html'] store_list = [] while True: entry = cm.init_store_entry(data['brand_id'], data['brandname_e'], data['brandname_c']) m = re.search(ur'<li class="leaf end"><div><u>(.+?)</u>', html) if m is None: break html = html[m.start():] entry[cm.name_e] = m.group(1) sub, start, end = cm.extract_closure(html, ur'<li\b', '</li>') html = html[end:] # 单个商店的页面 sub = sub[len(m.group(0)):-len('</li>')] m = re.search(ur'<a href="(http.+?)"', sub) if m is not None: entry[cm.url] = m.group(1) m = re.search(ur'<a href="mailto:(.+?)"', sub) if m is not None: entry[cm.email] = m.group(1) m = re.search(ur'(?:<a\b|</div>)', sub) if m is not None: addr = sub[:m.start()] else: addr = sub # 解析地址栏 addr = cm.reformat_addr(addr) terms = addr.split(',') new_terms = [] for t in terms: if re.search(ur'phone', t, re.IGNORECASE) is not None: entry[cm.tel] = cm.extract_tel(t) elif re.search(ur'fax', t, re.IGNORECASE) is not None: entry[cm.fax] = cm.extract_tel(t) elif data['city_e'] in t.strip().upper(): # 邮编 m = re.search(ur'\d+', t) if m is not None: entry[cm.zip_code] = m.group(0)
def fetch_stores(data): url = data['store_url'] try: body = cm.get_data(url, { 'country': data['country'], 'city': data['city'] }) except Exception: print 'Error occured: %s' % url dump_data = { 'level': 0, 'time': cm.format_time(), 'data': { 'url': url }, 'brand_id': data['brand_id'] } cm.dump(dump_data) return [] raw = json.loads(body) store_list = [] for item in raw['items']: entry = cm.init_store_entry(data['brand_id'], data['brandname_e'], data['brandname_c']) entry[cm.country_e] = data['country'].strip().upper() tmp = cm.extract_city(data['city'])[0] if entry[cm.country_e] == 'USA': entry[cm.province_e] = tmp else: entry[cm.city_e] = tmp gs.field_sense(entry) addr = cm.reformat_addr(item['address'].replace(u'\\', '')) addr_list = [tmp.strip() for tmp in addr.split(',')] tel = cm.extract_tel(addr_list[-1]) if tel != '': entry[cm.tel] = tel del addr_list[-1] entry[cm.addr_e] = ', '.join(addr_list) entry[cm.store_type] = item['shop_type'] gs.field_sense(entry) ret = gs.addr_sense(entry[cm.addr_e]) if ret[0] is not None and entry[cm.country_e] == '': entry[cm.country_e] = ret[0] if ret[1] is not None and entry[cm.province_e] == '': entry[cm.province_e] = ret[1] if ret[2] is not None and entry[cm.city_e] == '': entry[cm.city_e] = ret[2] gs.field_sense(entry) print '(%s / %d) Found store: %s, %s (%s, %s)' % ( data['brandname_e'], data['brand_id'], entry[cm.name_e], entry[cm.addr_e], entry[cm.country_e], entry[cm.continent_e]) db.insert_record(entry, 'stores') store_list.append(entry) return store_list
def get_detailed_store(html, store_cat): store_list = [] start = 0 while True: sub_html, start, end = common.extract_closure(html, ur"<li\b", ur"</li>") if end == 0: break # 得到单个门店的页面代码 html = html[end:] entry = common.init_store_entry(brand_id, brandname_e, brandname_c) m = re.findall(ur'<div class="store-title -h3a">(.+?)</div>', sub_html) if len(m) > 0: entry[common.name_e] = common.reformat_addr(m[0]) m = re.findall(ur'<div class="store-address">(.+?)</div>', sub_html, re.S) if len(m) > 0: addr = common.reformat_addr(m[0]) # 最后一行是否为电话号码? terms = addr.split(", ") tel = common.extract_tel(terms[-1]) if tel != "": addr = ", ".join(terms[:-1]) entry[common.tel] = tel entry[common.addr_e] = addr # 获得门店类型 # store_type = [store_cat] type_html, type_start, type_end = common.extract_closure(sub_html, ur'<ul class="service-list">', ur"</ul>") if type_end != 0: store_type = [m for m in re.findall(ur'<li class="service-item">(.+?)</li>', type_html)] store_type.insert(0, store_cat) entry[common.store_type] = ", ".join(store_type) else: entry[common.store_type] = store_cat # 获得经纬度 m = re.findall(ur'data-latitude="(-?\d+\.\d+)"', sub_html) if len(m) > 0: entry[common.lat] = string.atof(m[0]) m = re.findall(ur'data-longitude="(-?\d+\.\d+)"', sub_html) if len(m) > 0: entry[common.lng] = string.atof(m[0]) entry[common.city_e] = common.extract_city(data[common.city_e])[0] entry[common.country_e] = common.reformat_addr(data[common.country_e]).strip().upper() gs.field_sense(entry) print "%s: Found store: %s, %s (%s, %s, %s)" % ( brandname_e, entry[common.name_e], entry[common.addr_e], entry[common.city_e], entry[common.country_e], entry[common.continent_e], ) db.insert_record(entry, "stores") store_list.append(entry) return store_list
def get_store_details(data): url = data['url'] try: html = cm.get_data(url) except Exception: print 'Error occured: %s' % url dump_data = {'level': 1, 'time': cm.format_time(), 'data': {'url': url}, 'brand_id': brand_id} cm.dump(dump_data) return [] entry = cm.init_store_entry(brand_id, brandname_e, brandname_c) entry[cm.name_e] = data['name'] entry[cm.url] = data['url'] start = html.find(ur'<div class="storelocator-breadcrumbs">') if start == -1: return [] sub, start, end = cm.extract_closure(html[start:], ur'<ul>', ur'</ul>') if end == 0: return [] # 最后一个<li>...</li> m = re.findall(ur'<li>(.+?)</li>', sub, re.S) if len(m) > 0: entry[cm.addr_e] = cm.reformat_addr(m[-1]) # 经纬度 m = re.findall(ur'position: new google.maps.LatLng\((-?\d+\.\d+).*?(-?\d+\.\d+)\)', html) if len(m) > 0: cm.update_entry(entry, {cm.lat: string.atof(m[0][0]), cm.lng: string.atof(m[0][1])}) m = re.search(ur'<div class="contact right">(.+?)</div>', html, re.S) if m is not None: contact_sub = m.group(1) pat_tel = re.compile(ur'<p class="phone">(.+?)</p>') m1 = re.search(pat_tel, contact_sub) if m1: entry[cm.tel] = cm.extract_tel(m1.group(1)) contact_sub = re.sub(pat_tel, '', contact_sub) hours_list=[tmp.strip() for tmp in cm.reformat_addr(contact_sub).split(',')] if 'opening hours' in hours_list[0].lower(): del hours_list[0] entry[cm.hours] = ', '.join(hours_list) # Geo country = data['country'] city = data['city'] cm.update_entry(entry, {cm.country_e: country, cm.city_e: city}) gs.field_sense(entry) ret = gs.addr_sense(entry[cm.addr_e], entry[cm.country_e]) if ret[1] is not None and entry[cm.province_e] == '': entry[cm.province_e] = ret[1] if ret[2] is not None and entry[cm.city_e] == '': entry[cm.city_e] = ret[2] gs.field_sense(entry) print '(%s / %d) Found store: %s, %s (%s, %s)' % ( brandname_e, brand_id, entry[cm.name_e], entry[cm.addr_e], entry[cm.country_e], entry[cm.continent_e]) db.insert_record(entry, 'stores') return entry
def fetch_stores(db, data, logger): """ 获得商店信息 :param data: """ url = data['post_url'] try: html = cm.post_data(url, { 'pid': data['city_id'], 'lang': 'en', 'action': 'popola_box_DX' }) if html.strip() == u'': logger.error( unicode.format(u'Failed to fetch stores for city {0}', data['city_id'])) return [] body = pq(html) except Exception as e: print 'Error occured in getting city list: %s' % url dump_data = { 'level': 2, 'time': cm.format_time(), 'data': { 'url': url }, 'brand_id': data['brand_id'] } cm.dump(dump_data) return [] store_list = [] for item in (pq(temp) for temp in body('a[href]')): entry = cm.init_store_entry(data['brand_id'], data['brandname_e'], data['brandname_c']) entry[cm.url] = item[0].attrib['href'] entry[cm.name_e] = item('h3.titleShop')[0].text.strip() # terms = cm.reformat_addr(item('div.txtBoxSingleStore p.lineHeight14')[0].text).split(',') terms = cm.reformat_addr( unicode(item('div.txtBoxSingleStore p.lineHeight14'))).split(',') tel = cm.extract_tel(terms[-1]) if tel != '': terms = terms[:-1] entry[cm.tel] = tel entry[cm.addr_e] = u', '.join([v.strip() for v in terms]) entry['country_e'] = data['country_e'] entry['city_e'] = data['city_e'] gs.field_sense(entry) logger.info( '(%s / %d) Found store: %s, %s (%s, %s)' % (data['brandname_e'], data['brand_id'], entry[cm.name_e], entry[cm.addr_e], entry[cm.country_e], entry[cm.continent_e])) store_list.append(entry) cm.insert_record(db, entry, 'spider_stores.stores') return store_list
def fetch_uk(body, data): start = body.find(u'<div class="fableft">') if start == -1: print "Error in finding %s stores" % data["name"] return [] body, start, end = cm.extract_closure(body[start:], ur"<div\b", ur"</div>") if end == 0: print "Error in finding %s stores" % data["name"] return [] store_list = [] for m in re.findall(ur"<div>\s*(.+?)\s*</div>", body, re.S): entry = cm.init_store_entry(data["brand_id"], data["brandname_e"], data["brandname_c"]) entry[cm.country_e] = data["name"] addr_list = re.findall(ur"<p>\s*(.+?)\s*</p>", m) tel = cm.extract_tel(addr_list[-1]) if tel != "": entry[cm.tel] = tel del addr_list[-1] if data["name"] == "AUSTRALIA": country, province, city = gs.addr_sense(", ".join(addr_list), data["name"]) if city is not None: entry[cm.city_e] = city if province is not None: entry[cm.province_e] = province else: city = addr_list[-2].strip().upper() entry[cm.city_e] = city ret = gs.look_up(city, 3) if ret is not None and ret["country"]["name_e"] == gs.look_up("UK", 1)["name_e"]: entry[cm.city_e] = ret["name_e"] entry[cm.zip_code] = addr_list[-1].strip().upper() entry[cm.addr_e] = ", ".join(addr_list) entry[cm.city_e] = cm.extract_city(entry[cm.city_e])[0] gs.field_sense(entry) ret = gs.addr_sense(entry[cm.addr_e], entry[cm.country_e]) if ret[1] is not None and entry[cm.province_e] == "": entry[cm.province_e] = ret[1] if ret[2] is not None and entry[cm.city_e] == "": entry[cm.city_e] = ret[2] gs.field_sense(entry) print "(%s / %d) Found store: %s, %s (%s, %s, %s)" % ( data["brandname_e"], data["brand_id"], entry[cm.name_e], entry[cm.addr_e], entry[cm.city_e], entry[cm.country_e], entry[cm.continent_e], ) db.insert_record(entry, "stores") store_list.append(entry)
def get_detailed_store(html, store_cat): store_list = [] start = 0 while True: sub_html, start, end = common.extract_closure(html, ur'<li\b', ur'</li>') if end == 0: break # 得到单个门店的页面代码 html = html[end:] entry = common.init_store_entry(brand_id, brandname_e, brandname_c) m = re.findall(ur'<div class="store-title -h3a">(.+?)</div>', sub_html) if len(m) > 0: entry[common.name_e] = common.reformat_addr(m[0]) m = re.findall(ur'<div class="store-address">(.+?)</div>', sub_html, re.S) if len(m) > 0: addr = common.reformat_addr(m[0]) # 最后一行是否为电话号码? terms = addr.split(', ') tel = common.extract_tel(terms[-1]) if tel != '': addr = ', '.join(terms[:-1]) entry[common.tel] = tel entry[common.addr_e] = addr # 获得门店类型 # store_type = [store_cat] type_html, type_start, type_end = common.extract_closure(sub_html, ur'<ul class="service-list">', ur'</ul>') if type_end != 0: store_type = [m for m in re.findall(ur'<li class="service-item">(.+?)</li>', type_html)] store_type.insert(0, store_cat) entry[common.store_type] = ', '.join(store_type) else: entry[common.store_type] = store_cat # 获得经纬度 m = re.findall(ur'data-latitude="(-?\d+\.\d+)"', sub_html) if len(m) > 0: entry[common.lat] = string.atof(m[0]) m = re.findall(ur'data-longitude="(-?\d+\.\d+)"', sub_html) if len(m) > 0: entry[common.lng] = string.atof(m[0]) entry[common.city_e] = common.extract_city(data[common.city_e])[0] entry[common.country_e] = common.reformat_addr(data[common.country_e]).strip().upper() gs.field_sense(entry) print '%s: Found store: %s, %s (%s, %s, %s)' % ( brandname_e, entry[common.name_e], entry[common.addr_e], entry[common.city_e], entry[common.country_e], entry[common.continent_e]) db.insert_record(entry, 'stores') store_list.append(entry) return store_list
def fetch_stores(data): """ 获得商店信息 :param data: """ url = data['post_url'] try: html = cm.post_data(url, { 'pid': data['city_id'], 'lang': 'en', 'action': 'popola_box_DX' }) except Exception: print 'Error occured in getting city list: %s' % url dump_data = { 'level': 2, 'time': cm.format_time(), 'data': { 'url': url }, 'brand_id': data['brand_id'] } cm.dump(dump_data) return [] store_list = [] for m in re.finditer(ur'<a href="(.+?)".*?>', html): entry = cm.init_store_entry(data['brand_id'], data['brandname_e'], data['brandname_c']) entry[cm.url] = m.group(1) store_html, start, end = cm.extract_closure(html[m.start():], ur'<a href', ur'</a>') if end == 0: continue m1 = re.findall(ur'<h3 class="titleShop">(.+?)</h3>', store_html, re.S) if len(m1) > 0: entry[cm.name_e] = m1[0].strip() m1 = re.findall(ur'<p\b.*?>(.+?)(?:</p>|</div>)', store_html, re.S) if len(m1) > 0: terms = cm.reformat_addr(m1[0]).split(',') tel = cm.extract_tel(terms[-1]) if tel != '': terms = terms[:-1] entry[cm.tel] = tel entry[cm.addr_e] = ', '.join([v.strip() for v in terms]) entry['country_e'] = data['country_e'] entry['city_e'] = data['city_e'] gs.field_sense(entry) print '(%s / %d) Found store: %s, %s (%s, %s)' % ( data['brandname_e'], data['brand_id'], entry[cm.name_e], entry[cm.addr_e], entry[cm.country_e], entry[cm.continent_e]) store_list.append(entry) db.insert_record(entry, 'stores')
def f(m): store_name = m[0].strip() addr_str = m[1].strip() spl = addr_str.split('<br/>') store_type = cm.html2plain(spl[0].strip()) store_addr = spl[1].strip() hour_idx = 2 store_tel = '' for i in xrange(2, len(spl)): # If this is not a phone number: tel = cm.extract_tel(spl[i]) if tel == '': store_addr += ', ' + spl[i] hour_idx = i + 1 else: store_tel = spl[i].strip() hour_idx = i + 1 break if hour_idx < len(spl): store_hour = cm.html2plain(', '.join(spl[hour_idx:])).strip() else: store_hour = '' # store_addr = cm.reformat_addr('\r\n'.join([val.strip() for val in spl[1:-3]])) store_addr = cm.reformat_addr(store_addr) store_entry = cm.init_store_entry(brand_id, brandname_e, brandname_c) cm.update_entry(store_entry, {cm.continent_e: opt[cm.continent_e].strip().upper(), cm.city_e: opt[cm.city_e].strip().upper(), cm.country_e: opt[cm.country_e].strip().upper(), cm.name_e: cm.name_e, cm.addr_e: store_addr, cm.store_type: store_type, cm.hours: store_hour, cm.tel: store_tel}) if opt.has_key(cm.province_e): store_entry[cm.province_e] = opt[cm.province_e] else: store_entry[cm.province_e] = '' store_entry[cm.city_e] = cm.extract_city(store_entry[cm.city_e])[0] gs.field_sense(store_entry) ret = gs.addr_sense(store_entry[cm.addr_e], store_entry[cm.country_e]) if ret[1] is not None and store_entry[cm.province_e] == '': store_entry[cm.province_e] = ret[1] if ret[2] is not None and store_entry[cm.city_e] == '': store_entry[cm.city_e] = ret[2] gs.field_sense(store_entry) print '%s Found store: %s, %s (%s, %s)' % ( brandname_e, store_entry[cm.name_e], store_entry[cm.addr_e], store_entry[cm.country_e], store_entry[cm.continent_e]) db.insert_record(store_entry, 'stores') return store_entry
def proc_store(sub, data): entry = cm.init_store_entry(data['brand_id'], data['brandname_e'], data['brandname_c']) entry[cm.country_e] = data['country'] m1 = re.search(ur'<strong class="name" itemprop="name">([^<>]+)</strong>', sub) if m1 is not None: entry[cm.store_class] = m1.group(1).strip() m1 = re.search(ur'<span itemprop="address"', sub) if m1 is not None: addr_sub = cm.extract_closure(sub[m1.start():], ur'<span\b', ur'</span>')[0] m2 = re.search(ur'<span itemprop="postal-code">([^<>]+)</span>', addr_sub, re.S) if m2 is not None: entry[cm.zip_code] = m2.group(1).strip() m2 = re.search(ur'<span itemprop="locality">([^<>]+)</span>', addr_sub, re.S) if m2 is not None: entry[cm.city_e] = cm.html2plain(m2.group(1)).strip().upper() entry[cm.addr_e] = cm.reformat_addr(addr_sub) m2 = re.search(ur'<span itemprop="tel">([^<>]+)</span>', sub, re.S) if m2 is not None: entry[cm.tel] = m2.group(1).strip() m2 = re.search(ur'Fax\b(.+?)</p>', sub) if m2 is not None: entry[cm.fax] = cm.extract_tel(m2.group(1)) m2 = re.search( ur'<a href="([^"]+)"[^<>]+itemprop="url"\s*>\s*Find on a map\s*</a>', sub) if m2 is not None: geo_url = data['host'] + urllib.quote(m2.group(1).encode('utf-8')) param = { 'brepairs': True, 'restrictedtemplate': 2, 'bretailers': True, 'bshops': True, 'brepairs': True } try: geo_body = cm.get_data(geo_url, param) m3 = re.search( ur'maps\.google\.com/maps\?daddr\s*=\s*(-?\d+\.\d+)\s*,\s*(-?\d+\.\d+)', geo_body) if m3 is not None: entry[cm.lat] = string.atof(m3.group(1)) entry[cm.lng] = string.atof(m3.group(2)) except Exception, e: cm.dump('Error in fetching geo info: %s, %s' % (geo_url, param), log_name)
def fetch_stores(data): url = data['post_shops'] param = {'city': data['city_e'], 'paulandjoe_women': 0, 'paulandjoe_man': 0, 'paulandjoe_sister': 0, 'paulandjoe_little': 0, 'paulandjoe_beauty': 0} try: html = cm.post_data(url, param) except Exception: print 'Error occured: %s' % url dump_data = {'level': 0, 'time': cm.format_time(), 'data': {'url': url}, 'brand_id': data['brand_id']} cm.dump(dump_data) return [] store_list = [] try: for store in (pq(tmp) for tmp in pq(html)('ul')): try: entry = cm.init_store_entry(data['brand_id'], data['brandname_e'], data['brandname_c']) entry[cm.name_e] = cm.html2plain(store('li.first')[0].text).strip() entry[cm.country_e] = data[cm.country_e] entry[cm.city_e] = data[cm.city_e] addr_list = [] for term in (cm.reformat_addr(unicode(pq(tmp))) for tmp in store('li[class!="first"]')): if term != '': addr_list.append(term) tel = cm.extract_tel(addr_list[-1]) if tel != '': entry[cm.tel] = tel del addr_list[-1] entry[cm.addr_e] = ', '.join(addr_list) gs.field_sense(entry) ret = gs.addr_sense(entry[cm.addr_e]) if ret[0] is not None and entry[cm.country_e] == '': entry[cm.country_e] = ret[0] if ret[1] is not None and entry[cm.province_e] == '': entry[cm.province_e] = ret[1] if ret[2] is not None and entry[cm.city_e] == '': entry[cm.city_e] = ret[2] gs.field_sense(entry) print '(%s/%d) Found store: %s, %s (%s, %s)' % (data['brandname_e'], data['brand_id'], entry[cm.name_e], entry[cm.addr_e], entry[cm.country_e], entry[cm.continent_e]) store_list.append(entry) db.insert_record(entry, 'stores') except (IndexError, TypeError) as e: cm.dump(u'Error in parsing %s, %s' % (url, param), log_name) print traceback.format_exc() continue except Exception, e: print traceback.format_exc()
def fetch_uk(body, data): start = body.find(u'<div class="fableft">') if start == -1: print 'Error in finding %s stores' % data['name'] return [] body, start, end = cm.extract_closure(body[start:], ur'<div\b', ur'</div>') if end == 0: print 'Error in finding %s stores' % data['name'] return [] store_list = [] for m in re.findall(ur'<div>\s*(.+?)\s*</div>', body, re.S): entry = cm.init_store_entry(data['brand_id'], data['brandname_e'], data['brandname_c']) entry[cm.country_e] = data['name'] addr_list = re.findall(ur'<p>\s*(.+?)\s*</p>', m) tel = cm.extract_tel(addr_list[-1]) if tel != '': entry[cm.tel] = tel del addr_list[-1] if data['name'] == 'AUSTRALIA': country, province, city = gs.addr_sense(', '.join(addr_list), data['name']) if city is not None: entry[cm.city_e] = city if province is not None: entry[cm.province_e] = province else: city = addr_list[-2].strip().upper() entry[cm.city_e] = city ret = gs.look_up(city, 3) if ret is not None and ret['country']['name_e'] == gs.look_up('UK', 1)['name_e']: entry[cm.city_e] = ret['name_e'] entry[cm.zip_code] = addr_list[-1].strip().upper() entry[cm.addr_e] = ', '.join(addr_list) entry[cm.city_e] = cm.extract_city(entry[cm.city_e])[0] gs.field_sense(entry) ret = gs.addr_sense(entry[cm.addr_e], entry[cm.country_e]) if ret[1] is not None and entry[cm.province_e] == '': entry[cm.province_e] = ret[1] if ret[2] is not None and entry[cm.city_e] == '': entry[cm.city_e] = ret[2] gs.field_sense(entry) print '(%s / %d) Found store: %s, %s (%s, %s, %s)' % (data['brandname_e'], data['brand_id'], entry[cm.name_e], entry[cm.addr_e], entry[cm.city_e], entry[cm.country_e], entry[cm.continent_e]) db.insert_record(entry, 'stores') store_list.append(entry)
def fetch_stores(data): url = data['store_url'] param = {'myid': data['key'], 'idioma': 'in'} try: body = cm.get_data(url, param) except Exception: cm.dump('Error in fetching countries: %s, %s' % (url, param), log_name) return [] store_list = [] for s in json.loads(body): entry = cm.init_store_entry(data['brand_id'], data['brandname_e'], data['brandname_c']) entry[cm.country_e] = data['country'] entry[cm.city_e] = cm.extract_city(data['city'])[0] entry[cm.name_e] = cm.reformat_addr(s['title']) m = re.search(ur'(.+?)-\s*<', s['key']) addr_list = [entry[cm.name_e]] if m is not None: m1 = re.search(ur'-+', m.group(1)) if m1 is not None: tmp = [m.group(1)[:m1.start()], m.group(1)[m1.end():]] else: tmp = [m.group(1)] if len(tmp) > 1: entry[cm.tel] = cm.extract_tel(tmp[1]) m1 = re.search(ur'\d{4,}', tmp[0]) if m1 is not None: entry[cm.zip_code] = m1.group() addr_list.append(tmp[0].strip()) entry[cm.addr_e] = ', '.join(addr_list) gs.field_sense(entry) ret = gs.addr_sense(entry[cm.addr_e]) if ret[0] is not None and entry[cm.country_e] == '': entry[cm.country_e] = ret[0] if ret[1] is not None and entry[cm.province_e] == '': entry[cm.province_e] = ret[1] if ret[2] is not None and entry[cm.city_e] == '': entry[cm.city_e] = ret[2] gs.field_sense(entry) cm.dump( '(%s / %d) Found store: %s, %s (%s, %s)' % (data['brandname_e'], data['brand_id'], entry[cm.name_e], entry[cm.addr_e], entry[cm.country_e], entry[cm.continent_e]), log_name) db.insert_record(entry, 'stores') store_list.append(entry) return store_list
def fetch_stores(data): entry = cm.init_store_entry(data['brand_id'], data['brandname_e'], data['brandname_c']) code = data['country_code'] if gs.look_up(code, 1) is None: entry[cm.country_e] = cm.html2plain(data['country']).strip().upper() else: entry[cm.country_e] = code entry[cm.name_e] = data['store_name'] entry[cm.city_e] = cm.extract_city(data['city'])[0] entry[cm.lat] = data['lat'] if data['lat'] is not None else '' entry[cm.lng] = data['lng'] if data['lng'] is not None else '' m = re.search(ur'data-boutique\s*=\s*"%s"' % data['store_id'], data['content']) sub = data['content'][m.end():] m1 = re.search(ur'<li class="isDistributeur[^<>]+>(.+?)</li>', sub) if m1 is not None: entry[cm.store_class] = cm.reformat_addr(m1.group(1)) m1 = re.search(ur'<li class="place-title[^<>]+>(.+?)</li>', sub, re.S) if m1 is not None: entry[cm.addr_e] = cm.reformat_addr(m1.group(1)) m1 = re.search(ur'<li class="contacts[^<>]+>(.+?)</li>', sub, re.S) if m1 is not None: m2 = re.search(ur'<a class="popupLaunch" href="([^"]+)"', m1.group(1)) if m2: entry = fetch_details(data, m2.group(1), entry) m2 = re.search(ur'<p>(.+?)</p>', m1.group(1), re.S) if m2: ct_list = tuple(tmp.strip() for tmp in cm.reformat_addr(m2.group(1)).split(',')) entry[cm.tel] = cm.extract_tel(ct_list[0]) if len(ct_list) > 1: entry[cm.email] = ct_list[1].strip() gs.field_sense(entry) ret = gs.addr_sense(entry[cm.addr_e], entry[cm.country_e]) if ret[1] is not None and entry[cm.province_e] == '': entry[cm.province_e] = ret[1] if ret[2] is not None and entry[cm.city_e] == '': entry[cm.city_e] = ret[2] gs.field_sense(entry) cm.dump('(%s / %d) Found store: %s, %s (%s, %s)' % (data['brandname_e'], data['brand_id'], entry[cm.name_e], entry[cm.addr_e], entry[cm.country_e], entry[cm.continent_e]), log_name) db.insert_record(entry, 'stores') return tuple(entry)
def func(item):
    """Turn one parsed store element into an entry and insert it.

    The entry's native id is the MD5 hex digest of the page ``url`` plus
    either the map link (when the element has one) or the address.
    Elements whose id is already in ``data['store_list']`` are skipped.

    ``data``, ``url``, ``db`` and ``logger`` are read from the surrounding
    scope.

    :param item: query object for one store element (called with CSS
        selectors such as ``'h6'`` and ``'p'``).
    :return: the inserted entry, or ``None`` for an already known store.
    """
    entry = cm.init_store_entry(data['brand_id'], data['brandname_e'],
                                data['brandname_c'])
    entry[cm.name_e] = cm.html2plain(item('h6')[0].text).strip()

    addr_html = unicode(pq(item('p')[0]))
    addr_parts = [piece.strip() for piece in cm.reformat_addr(addr_html).split(',')]
    # The last piece is dropped from the address when a phone number can be
    # extracted from it.
    tel = cm.extract_tel(addr_parts[-1])
    if tel != '':
        entry[cm.tel] = tel
        addr_parts.pop()
    entry[cm.addr_e] = ', '.join(addr_parts)

    # Fingerprint
    digest = hashlib.md5()
    digest.update(url)
    map_links = item('a.track_map[href]')
    if len(map_links) == 0:
        digest.update(entry[cm.addr_e])
    else:
        map_ref = map_links[0].attrib['href']
        digest.update(map_ref)
        # Keep the map's q= query, with '+' turned back into spaces.
        query = re.search(r'q=([^;]+?)&', cm.html2plain(map_ref))
        if query:
            entry['geo_query_param'] = query.group(1).replace('+', ' ')
    entry[cm.native_id] = digest.hexdigest()
    if entry[cm.native_id] in data['store_list']:
        return

    entry[cm.country_e] = data['country']
    gs.field_sense(entry)
    # Fill only the geo fields that are still empty.
    sensed = gs.addr_sense(entry[cm.addr_e])
    for idx, key in enumerate((cm.country_e, cm.province_e, cm.city_e)):
        if sensed[idx] is not None and entry[key] == '':
            entry[key] = sensed[idx]
    gs.field_sense(entry)

    logger.info('(%s / %d) Found store: %s, %s (%s, %s)' % (
        data['brandname_e'], data['brand_id'], entry[cm.name_e],
        entry[cm.addr_e], entry[cm.country_e], entry[cm.continent_e]))
    cm.insert_record(db, entry, data['table'])
    return entry
def fetch_stores(data): url = data['store_url'] try: body = cm.get_data(url, {'country': data['country'], 'city': data['city']}) except Exception: print 'Error occured: %s' % url dump_data = {'level': 0, 'time': cm.format_time(), 'data': {'url': url}, 'brand_id': data['brand_id']} cm.dump(dump_data) return [] raw = json.loads(body) store_list = [] for item in raw['items']: entry = cm.init_store_entry(data['brand_id'], data['brandname_e'], data['brandname_c']) entry[cm.country_e] = data['country'].strip().upper() tmp = cm.extract_city(data['city'])[0] if entry[cm.country_e] == 'USA': entry[cm.province_e] = tmp else: entry[cm.city_e] = tmp gs.field_sense(entry) addr = cm.reformat_addr(item['address'].replace(u'\\', '')) addr_list = [tmp.strip() for tmp in addr.split(',')] tel = cm.extract_tel(addr_list[-1]) if tel !='': entry[cm.tel]=tel del addr_list[-1] entry[cm.addr_e]=', '.join(addr_list) entry[cm.store_type] = item['shop_type'] gs.field_sense(entry) ret = gs.addr_sense(entry[cm.addr_e]) if ret[0] is not None and entry[cm.country_e] == '': entry[cm.country_e] = ret[0] if ret[1] is not None and entry[cm.province_e] == '': entry[cm.province_e] = ret[1] if ret[2] is not None and entry[cm.city_e] == '': entry[cm.city_e] = ret[2] gs.field_sense(entry) print '(%s / %d) Found store: %s, %s (%s, %s)' % (data['brandname_e'], data['brand_id'], entry[cm.name_e], entry[cm.addr_e], entry[cm.country_e], entry[cm.continent_e]) db.insert_record(entry, 'stores') store_list.append(entry) return store_list
def fetch_stores(data): url = data['store_url'] param = {'myid': data['key'], 'idioma': 'in'} try: body = cm.get_data(url, param) except Exception: cm.dump('Error in fetching countries: %s, %s' % (url, param), log_name) return [] store_list = [] for s in json.loads(body): entry = cm.init_store_entry(data['brand_id'], data['brandname_e'], data['brandname_c']) entry[cm.country_e] = data['country'] entry[cm.city_e] = cm.extract_city(data['city'])[0] entry[cm.name_e] = cm.reformat_addr(s['title']) m = re.search(ur'(.+?)-\s*<', s['key']) addr_list = [entry[cm.name_e]] if m is not None: m1 = re.search(ur'-+', m.group(1)) if m1 is not None: tmp = [m.group(1)[:m1.start()], m.group(1)[m1.end():]] else: tmp = [m.group(1)] if len(tmp) > 1: entry[cm.tel] = cm.extract_tel(tmp[1]) m1 = re.search(ur'\d{4,}', tmp[0]) if m1 is not None: entry[cm.zip_code] = m1.group() addr_list.append(tmp[0].strip()) entry[cm.addr_e] = ', '.join(addr_list) gs.field_sense(entry) ret = gs.addr_sense(entry[cm.addr_e]) if ret[0] is not None and entry[cm.country_e] == '': entry[cm.country_e] = ret[0] if ret[1] is not None and entry[cm.province_e] == '': entry[cm.province_e] = ret[1] if ret[2] is not None and entry[cm.city_e] == '': entry[cm.city_e] = ret[2] gs.field_sense(entry) cm.dump('(%s / %d) Found store: %s, %s (%s, %s)' % (data['brandname_e'], data['brand_id'], entry[cm.name_e], entry[cm.addr_e], entry[cm.country_e], entry[cm.continent_e]), log_name) db.insert_record(entry, 'stores') store_list.append(entry) return store_list
def fetch_stores(data): """ 获得商店信息 :param data: """ url = data["post_url"] try: html = cm.post_data(url, {"pid": data["city_id"], "lang": "en", "action": "popola_box_DX"}) except Exception: print "Error occured in getting city list: %s" % url dump_data = {"level": 2, "time": cm.format_time(), "data": {"url": url}, "brand_id": data["brand_id"]} cm.dump(dump_data) return [] store_list = [] for m in re.finditer(ur'<a href="(.+?)".*?>', html): entry = cm.init_store_entry(data["brand_id"], data["brandname_e"], data["brandname_c"]) entry[cm.url] = m.group(1) store_html, start, end = cm.extract_closure(html[m.start() :], ur"<a href", ur"</a>") if end == 0: continue m1 = re.findall(ur'<h3 class="titleShop">(.+?)</h3>', store_html, re.S) if len(m1) > 0: entry[cm.name_e] = m1[0].strip() m1 = re.findall(ur"<p\b.*?>(.+?)(?:</p>|</div>)", store_html, re.S) if len(m1) > 0: terms = cm.reformat_addr(m1[0]).split(",") tel = cm.extract_tel(terms[-1]) if tel != "": terms = terms[:-1] entry[cm.tel] = tel entry[cm.addr_e] = ", ".join([v.strip() for v in terms]) entry["country_e"] = data["country_e"] entry["city_e"] = data["city_e"] gs.field_sense(entry) print "(%s / %d) Found store: %s, %s (%s, %s)" % ( data["brandname_e"], data["brand_id"], entry[cm.name_e], entry[cm.addr_e], entry[cm.country_e], entry[cm.continent_e], ) store_list.append(entry) db.insert_record(entry, "stores")
def fetch_store_details(data): url = data['url'] try: body = cm.post_data(url, {'cCode': data['country_code'], 'city': data['city'], 'postsearch': 1}) except Exception: print 'Error occured: %s' % url dump_data = {'level': 0, 'time': cm.format_time(), 'data': {'url': url}, 'brand_id': data['brand_id']} cm.dump(dump_data) return [] start = body.find('<div class="store_locator') if start == -1: print 'Failed processing %s' % url return [] sub, start, end = cm.extract_closure(body[start:], ur'<div\b', ur'</div>') entry = cm.init_store_entry(data['brand_id'], data['brandname_e'], data['brandname_c']) m = re.search(ur'<p><span class="bold">Address</span>(.+?)</p>', sub, re.S) if m is not None: addr_list = cm.reformat_addr(m.group(1)).split(', ') ret = cm.extract_tel(addr_list[-1]) if ret != '': entry[cm.tel] = ret del addr_list[-1] entry[cm.addr_e] = ', '.join(addr_list) addr_text=sub[m.end():] m = re.search(ur'<div class="title locator">', addr_text) if m is not None: tmp = cm.extract_closure(addr_text[m.start():], ur'<div\b', ur'</div>')[0] m1 = re.search(ur'<h2>(.+?)</h2>', tmp, re.S) if m1 is not None: entry[cm.addr_e] = cm.reformat_addr(m1.group(1)) m = re.search(ur'google.maps.LatLng\(\s*(-?\d+\.\d+),\s*(-?\d+\.\d+)', body, re.S) if m is not None: entry[cm.lat] = string.atof(m.group(1)) entry[cm.lng] = string.atof(m.group(2)) entry[cm.country_e] = data['country_code'] entry[cm.city_e] = data['city'] gs.field_sense(entry) print '(%s / %d) Found store: %s, %s (%s, %s)' % (data['brandname_e'], data['brand_id'], entry[cm.name_e], entry[cm.addr_e], entry[cm.country_e], entry[cm.continent_e]) db.insert_record(entry, 'stores') return [entry]
def fetch_stores(data): # country=Greece&city=ATHENS&adutl=+01&kids=+02&undercolor=+06&togetmap=mapdata url = data['data_url'] param = {'country': data['country'], 'city': data['city'], 'adutl': ' 01', 'kids': ' 02', 'undercolor': ' 06', 'togetmap': 'mapdata'} try: body = cm.get_data(url, param) except Exception: cm.dump('Error in fetching stores: %s, %s' % (url, param), 'benetton_log.txt', False) dump_data = {'level': 0, 'time': cm.format_time(), 'data': {'url': url}, 'brand_id': data['brand_id']} cm.dump(dump_data) return [] store_list = [] for m in re.findall(ur'<marker (.+?)>', body): entry = cm.init_store_entry(data['brand_id'], data['brandname_e'], data['brandname_c']) m1 = re.search(ur'name=\\"(.+?)\\"', m) if m1 is not None: entry[cm.name_e] = cm.html2plain(m1.group(1).strip().replace(u'\\', '')) m1 = re.search(ur'address=\\"(.+?)\\"', m) if m1 is not None: addr = cm.reformat_addr(cm.html2plain(m1.group(1)).replace(u'\\', '')) tel = cm.extract_tel(addr) if tel != '': entry[cm.tel] = tel addr = addr.replace(tel, '') entry[cm.addr_e] = cm.reformat_addr(addr) m1 = re.search(ur'lat=\\"(.+?)\\"', m) if m1 is not None: entry[cm.lat] = string.atof(m1.group(1)) m1 = re.search(ur'lng=\\"(.+?)\\"', m) if m1 is not None: entry[cm.lng] = string.atof(m1.group(1)) entry[cm.country_e] = data['country'].strip().upper() entry[cm.city_e] = cm.extract_city(data['city'])[0] gs.field_sense(entry) cm.dump('(%s / %d) Found store: %s, %s (%s, %s)' % (data['brandname_e'], data['brand_id'], entry[cm.name_e], entry[cm.addr_e], entry[cm.country_e], entry[cm.continent_e]), 'benetton_log.txt', False) db.insert_record(entry, 'stores') store_list.append(entry)
def proc_store(sub, data): entry = cm.init_store_entry(data['brand_id'], data['brandname_e'], data['brandname_c']) entry[cm.country_e] = data['country'] m1 = re.search(ur'<strong class="name" itemprop="name">([^<>]+)</strong>', sub) if m1 is not None: entry[cm.store_class] = m1.group(1).strip() m1 = re.search(ur'<span itemprop="address"', sub) if m1 is not None: addr_sub = cm.extract_closure(sub[m1.start():], ur'<span\b', ur'</span>')[0] m2 = re.search(ur'<span itemprop="postal-code">([^<>]+)</span>', addr_sub, re.S) if m2 is not None: entry[cm.zip_code] = m2.group(1).strip() m2 = re.search(ur'<span itemprop="locality">([^<>]+)</span>', addr_sub, re.S) if m2 is not None: entry[cm.city_e] = cm.html2plain(m2.group(1)).strip().upper() entry[cm.addr_e] = cm.reformat_addr(addr_sub) m2 = re.search(ur'<span itemprop="tel">([^<>]+)</span>', sub, re.S) if m2 is not None: entry[cm.tel] = m2.group(1).strip() m2 = re.search(ur'Fax\b(.+?)</p>', sub) if m2 is not None: entry[cm.fax] = cm.extract_tel(m2.group(1)) m2 = re.search(ur'<a href="([^"]+)"[^<>]+itemprop="url"\s*>\s*Find on a map\s*</a>', sub) if m2 is not None: geo_url = data['host'] + urllib.quote(m2.group(1).encode('utf-8')) param = {'brepairs': True, 'restrictedtemplate': 2, 'bretailers': True, 'bshops': True, 'brepairs': True} try: geo_body = cm.get_data(geo_url, param) m3 = re.search(ur'maps\.google\.com/maps\?daddr\s*=\s*(-?\d+\.\d+)\s*,\s*(-?\d+\.\d+)', geo_body) if m3 is not None: entry[cm.lat] = string.atof(m3.group(1)) entry[cm.lng] = string.atof(m3.group(2)) except Exception, e: cm.dump('Error in fetching geo info: %s, %s' % (geo_url, param), log_name)
cm.dump('Error in fetching stores: %s' % url, log_name) return [] store_list = [] for s in re.findall(ur'<div class="store_wrapper">(.+?)</div>', body, re.S): entry = cm.init_store_entry(data['brand_id'], data['brandname_e'], data['brandname_c']) entry[cm.country_e] = data['country'] m = re.search(ur'<h2>(.+?)</h2>', s) if m is not None: entry[cm.name_e] = cm.html2plain(m.group(1)) m = re.search(ur'<p>(.+?)</p>', s, re.S) if m is not None: addr_list = [tmp.strip() for tmp in cm.reformat_addr(m.group(1)).split(',')] tel = cm.extract_tel(re.sub(re.compile('^\s*t\s*(\.|:)\s*', re.I), '', addr_list[-1])) if tel != '': if entry[cm.country_e] == 'CHINA': if len(re.findall(r'\d', tel)) > 6: entry[cm.tel] = tel del addr_list[-1] else: entry[cm.tel] = tel del addr_list[-1] entry[cm.addr_e] = ', '.join(addr_list) gs.field_sense(entry) ret = gs.addr_sense(entry[cm.addr_e], entry[cm.country_e]) if ret[1] is not None and entry[cm.province_e] == '': entry[cm.province_e] = ret[1] if ret[2] is not None and entry[cm.city_e] == '':
entry[cm.lat] = string.atof(m.group(1)) entry[cm.lng] = string.atof(m.group(2)) start = body.find(ur'<div class="box-testuale-right">') if start == -1: cm.dump('Error in fetching stores: %s' % url, log_name) return [] sub = cm.extract_closure(body[start:], ur'<div\b', ur'</div>')[0] m = re.search(ur'<div class="box-adress-store">(.+?)</div>', sub, re.S) if m is None: cm.dump('Error in fetching stores: %s' % url, log_name) return [] entry[cm.addr_e] = cm.reformat_addr(m.group(1)) m = re.search(ur'<h4>(.+?)</h4>', sub) if m is not None and 't:' in m.group(1).lower(): entry[cm.tel] = cm.extract_tel(m.group(1)) m = re.search(ur'<div class="box-open-store">(.+?)</div>', body, re.S) if m is not None: entry[cm.hours] = cm.reformat_addr(m.group(1)) ret = None if entry[cm.lat] != '' and entry[cm.lng] != '': ret = gs.geocode(latlng='%f,%f' % (entry[cm.lat], entry[cm.lng])) if ret is None: ret = gs.geocode(', '.join((entry[cm.addr_e], data['zone']))) if ret is not None: city = '' province = '' country = '' zip_code = '' tmp = ret[0]['address_components']
entry[cm.lng] = string.atof(s['latlong']['lng']) break store_sub = cm.extract_closure(sub[m.start():], ur'<li\b', ur'</li>')[0] m1 = re.search(ur'<div class="storelocator-item-title">([^<>]+)</div>', store_sub) if m1 is not None: entry[cm.name_e] = cm.html2plain(m1.group(1)).strip() m1 = re.search(ur'<div class="storelocator-item-address">([^<>]+)</div>', store_sub) if m1 is not None: entry[cm.addr_e] = cm.reformat_addr(m1.group(1)).strip() m1 = re.search(ur'<div class="storelocator-item-phone">([^<>]+)</div>', store_sub) if m1 is not None: entry[cm.tel] = cm.extract_tel(m1.group(1)) m1 = re.search(ur'<div class="storelocator-item-fax">([^<>]+)</div>', store_sub) if m1 is not None: entry[cm.fax] = cm.extract_tel(m1.group(1)) m1 = re.search(ur'<div class="storelocator-item-email">([^<>]+)</div>', store_sub) if m1 is not None: entry[cm.email] = cm.extract_email(m1.group(1)) m1 = re.search(ur'<div class="storelocator-item-hours">([^<>]+)</div>', store_sub) if m1 is not None: entry[cm.hours] = m1.group(1).strip() gs.field_sense(entry) ret = gs.addr_sense(entry[cm.addr_e], entry[cm.country_e])
for i in xrange(len(tmp) - 1): sub_list.append({ 'content': body1[tmp[i]['idx2']:tmp[i + 1]['idx1']], 'name': tmp[i]['name'] }) for sub in sub_list: for m in re.findall(ur'<p>(.+?)</p>', sub['content'], re.S): entry = cm.init_store_entry(data['brand_id'], data['brandname_e'], data['brandname_c']) entry[cm.country_e] = 'UNITED KINGDOM' entry[cm.city_e] = sub['name'] addr_list = cm.reformat_addr(m).split(', ') entry[cm.addr_e] = ', '.join(addr_list[:-1]) entry[cm.tel] = cm.extract_tel(addr_list[-1]) gs.field_sense(entry) print '(%s / %d) Found store: %s, %s (%s, %s)' % ( data['brandname_e'], data['brand_id'], entry[cm.name_e], entry[cm.addr_e], entry[cm.country_e], entry[cm.continent_e]) db.insert_record(entry, 'stores') entry = cm.init_store_entry(data['brand_id'], data['brandname_e'], data['brandname_c']) entry[cm.country_e] = 'UNITED KINGDOM' entry[cm.city_e] = u'EDINBURGH' entry[cm.addr_e] = u'OCEAN DRIVE, LEITH, EDINBURGH' entry[cm.tel] = u'0131 554 8622' for m in re.findall(ur'<p>(.+?)</p>', body3, re.S): entry = cm.init_store_entry(data['brand_id'], data['brandname_e'],
cm.dump('Error in fetching countries: %s' % url, log_name) return [] body = cm.extract_closure(body[start + 6:], ur'\[', ur'\]')[0] store_list = [] for m in re.finditer(ur'<div class="store ', body): s = cm.extract_closure(body[m.end():], ur'<div\b', ur'</div>')[0] entry = cm.init_store_entry(data['brand_id'], data['brandname_e'], data['brandname_c']) m1 = re.search(ur'<h6>([^<>]+)</h6>', s) if m1 is not None: entry[cm.name_e] = m1.group(1).strip() addr_sub = cm.extract_closure(s, ur'<p>', ur'</p>')[0] addr_list = [term.strip() for term in cm.reformat_addr(addr_sub).split(',')] tel = cm.extract_tel(addr_list[-1]) if tel != '': entry[cm.tel] = tel del addr_list[-1] entry[cm.addr_e] = ', '.join(addr_list) m1 = re.search(ur'll=(-?\d+\.\d+),(-?\d+\.\d+)', addr_sub) if m1 is not None: entry[cm.lat] = string.atof(m1.group(1)) entry[cm.lng] = string.atof(m1.group(2)) gs.field_sense(entry) ret = gs.addr_sense(entry[cm.addr_e]) if ret[0] is not None and entry[cm.country_e] == '': entry[cm.country_e] = ret[0] if ret[1] is not None and entry[cm.province_e] == '':
cm.dump('Error in parsing %s' % m.group(1), log_name) continue sub1 = cm.extract_closure(sub[start:], ur'<div\b', ur'</div>')[0] entry = cm.init_store_entry(data['brand_id'], data['brandname_e'], data['brandname_c']) entry[cm.store_type] = store_type m1 = re.search(ur'<p class="store-item-name">(.+?)</p>', sub1, re.S) if m1 is not None: entry[cm.name_e] = cm.reformat_addr(m1.group(1)) m1 = re.search(ur'<p class="store-item-adress">(.+?)</p>', sub1, re.S) if m1 is not None: entry[cm.addr_e] = cm.reformat_addr(m1.group(1)) entry[cm.tel] = cm.extract_tel(sub1) ret = gs.look_up(data['country_code'], 1) if ret is not None: entry[cm.country_e] = ret['name_e'] ret = gs.addr_sense(entry[cm.addr_e], entry[cm.country_e]) if ret[1] is not None: entry[cm.province_e] = ret[1] if ret[2] is not None: entry[cm.city_e] = ret[2] else: entry[cm.city_e] = data['city'].strip().upper() if entry[cm.name_e] in latlng_map: tmp = latlng_map[entry[cm.name_e]] entry[cm.lat] = tmp['lat'] entry[cm.lng] = tmp['lng']
store_list = [] for m in re.finditer(ur'<div class="store ', body): s = cm.extract_closure(body[m.end():], ur'<div\b', ur'</div>')[0] entry = cm.init_store_entry(data['brand_id'], data['brandname_e'], data['brandname_c']) m1 = re.search(ur'<h6>([^<>]+)</h6>', s) if m1 is not None: entry[cm.name_e] = m1.group(1).strip() addr_sub = cm.extract_closure(s, ur'<p>', ur'</p>')[0] addr_list = [ term.strip() for term in cm.reformat_addr(addr_sub).split(',') ] tel = cm.extract_tel(addr_list[-1]) if tel != '': entry[cm.tel] = tel del addr_list[-1] entry[cm.addr_e] = ', '.join(addr_list) m1 = re.search(ur'll=(-?\d+\.\d+),(-?\d+\.\d+)', addr_sub) if m1 is not None: entry[cm.lat] = string.atof(m1.group(1)) entry[cm.lng] = string.atof(m1.group(2)) gs.field_sense(entry) ret = gs.addr_sense(entry[cm.addr_e]) if ret[0] is not None and entry[cm.country_e] == '': entry[cm.country_e] = ret[0] if ret[1] is not None and entry[cm.province_e] == '':
m1 = re.search(ur'<div class="storelocator-item-title">([^<>]+)</div>', store_sub) if m1 is not None: entry[cm.name_e] = cm.html2plain(m1.group(1)).strip() m1 = re.search( ur'<div class="storelocator-item-address">([^<>]+)</div>', store_sub) if m1 is not None: entry[cm.addr_e] = cm.reformat_addr(m1.group(1)).strip() m1 = re.search(ur'<div class="storelocator-item-phone">([^<>]+)</div>', store_sub) if m1 is not None: entry[cm.tel] = cm.extract_tel(m1.group(1)) m1 = re.search(ur'<div class="storelocator-item-fax">([^<>]+)</div>', store_sub) if m1 is not None: entry[cm.fax] = cm.extract_tel(m1.group(1)) m1 = re.search(ur'<div class="storelocator-item-email">([^<>]+)</div>', store_sub) if m1 is not None: entry[cm.email] = cm.extract_email(m1.group(1)) m1 = re.search(ur'<div class="storelocator-item-hours">([^<>]+)</div>', store_sub) if m1 is not None: entry[cm.hours] = m1.group(1).strip()
if start == -1: cm.dump('Error in parsing %s' % m.group(1), log_name) continue sub1 = cm.extract_closure(sub[start:], ur'<div\b', ur'</div>')[0] entry = cm.init_store_entry(data['brand_id'], data['brandname_e'], data['brandname_c']) entry[cm.store_type] = store_type m1 = re.search(ur'<p class="store-item-name">(.+?)</p>', sub1, re.S) if m1 is not None: entry[cm.name_e] = cm.reformat_addr(m1.group(1)) m1 = re.search(ur'<p class="store-item-adress">(.+?)</p>', sub1, re.S) if m1 is not None: entry[cm.addr_e] = cm.reformat_addr(m1.group(1)) entry[cm.tel] = cm.extract_tel(sub1) ret = gs.look_up(data['country_code'], 1) if ret is not None: entry[cm.country_e] = ret['name_e'] ret = gs.addr_sense(entry[cm.addr_e], entry[cm.country_e]) if ret[1] is not None: entry[cm.province_e] = ret[1] if ret[2] is not None: entry[cm.city_e] = ret[2] else: entry[cm.city_e] = data['city'].strip().upper() if entry[cm.name_e] in latlng_map: tmp = latlng_map[entry[cm.name_e]] entry[cm.lat] = tmp['lat'] entry[cm.lng] = tmp['lng']
def fetch_stores(data): """ country_id: country_id """ country = data['country'] country_id = data['country_id'] city = data['city'] city_id = data['city_id'] try: html = cm.post_data(url, {'country_id': country_id, 'city_id': city_id}) except Exception: print 'Error occured: %s' % url dump_data = {'level': 2, 'time': cm.format_time(), 'data': data, 'brand_id': brand_id} cm.dump(dump_data) return [] start = html.find('class="boutique_store"') if start == -1: return [] end = html.find('</ul>', start) html = html[start:end] # <li><h6>Paris</h6><p>36 Avenue Montaigne<br />+33 1 47 20 04 45<br />France</p></li> stores = [] for m in re.finditer(r'<li><h6>([^<>]+)</h6><p>(.*?)</p></li>', html): store_item = cm.init_store_entry(brand_id, brandname_e, brandname_c) # city = m.group(1) content = m.group(2) + r'<br />' addr = '' idx = 0 for m1 in re.finditer(r'(.*?)<br\s*?/>', content): idx += 1 # 第一个为门店名称 if idx == 1 and re.match(ur'.*?\d+', m1.group(1)) is None: store_item[cm.name_e] = cm.reformat_addr(m1.group(1)) addr += m1.group(1) + '\r\n' else: # 是否为电话? tel_str = cm.extract_tel(m1.group(1)) if tel_str != '': store_item[cm.tel] = tel_str else: addr += m1.group(1) + '\r\n' store_item[cm.addr_e] = cm.reformat_addr(addr) store_item[cm.city_e] = city store_item[cm.country_e] = country gs.field_sense(store_item) # term = cm.geo_translate(country) # if len(term) == 0: # print 'Error in geo translating: %s' % country # else: # store_item[cm.continent_c] = term[cm.continent_c] # store_item[cm.continent_e] = term[cm.continent_e] # store_item[cm.country_c] = term[cm.country_c] # store_item[cm.country_e] = term[cm.country_e] # store_item[cm.brandname_e] = brandname_e # store_item[cm.brandname_c] = brandname_c # cm.chn_check(store_item) print '%s: Found store: %s, %s (%s, %s)' % ( brandname_e, store_item[cm.name_e], store_item[cm.addr_e], store_item[cm.country_e], store_item[cm.continent_e]) db.insert_record(store_item, 'stores') stores.append(store_item)
def fetch_stores(data): url = data['url'] try: body = cm.get_data(url) except Exception: cm.dump('Error in fetching stores: %s' % url, log_name) return [] store_list = [] for m in re.finditer(ur'<div\s+class\s*=\s*"storeItem"', body): entry = cm.init_store_entry(data['brand_id'], data['brandname_e'], data['brandname_c']) sub = cm.extract_closure(body[m.end():], ur'<div\b', ur'</div>')[0] m1 = re.search(ur'<div class="bubbleInfo">(.+?)</div>', sub) if m1 is not None: entry[cm.addr_e] = cm.reformat_addr(m1.group(1)) m1 = re.search(ur'lat="(-?\d+\.\d+)"', sub) if m1 is not None: entry[cm.lat] = string.atof(m1.group(1)) m1 = re.search(ur'lng="(-?\d+\.\d+)"', sub) if m1 is not None: entry[cm.lng] = string.atof(m1.group(1)) m1 = re.search(ur'<span>\s*Tel:\s*([^<>]+)</span>', sub) if m1 is not None: entry[cm.tel] = m1.group(1).strip() m1 = re.search(ur'http://maps\.google\.com/maps\?q=([^&"]+)', sub) if m1 is None: continue ret = gs.geocode(latlng=m1.group(1)) if ret is None: tmp = [tmp1.strip() for tmp1 in entry[cm.addr_e].split(',')] if 'MAX' in tmp[0]: del tmp[0] if cm.extract_tel(tmp[-1])!='': del tmp[-1] if len(tmp) > 0: ret = gs.geocode(', '.join(tmp)) if ret is not None: city = '' province = '' country = '' zip_code = '' tmp = ret[0]['address_components'] for v in tmp: if 'locality' in v['types']: city = v['long_name'].strip().upper() elif 'administrative_area_level_1' in v['types']: province = v['long_name'].strip().upper() elif 'country' in v['types']: country = v['long_name'].strip().upper() elif 'postal_code' in v['types']: zip_code = v['long_name'].strip() entry[cm.country_e] = country entry[cm.province_e] = province entry[cm.city_e] = city entry[cm.zip_code] = zip_code gs.field_sense(entry) cm.dump('(%s / %d) Found store: %s, %s (%s, %s)' % (data['brandname_e'], data['brand_id'], entry[cm.name_e], entry[cm.addr_e], entry[cm.country_e], entry[cm.continent_e]), log_name) db.insert_record(entry, 'stores') store_list.append(entry) else: cm.dump('Error in fetching 
stores: latlng=%s, addr=%s' % (m1.group(1), entry[cm.addr_e]), log_name) continue
if m is None: cm.dump('Error in fetching stores: %s' % url, log_name) return [] entry = cm.init_store_entry(data['brand_id'], data['brandname_e'], data['brandname_c']) addr_sub, info_sub = m.group(1).split('Practical Info') m = re.search(ur'<h2>(.+?)</h2>', addr_sub) if m is not None: entry[cm.name_e] = cm.html2plain(m.group(1)) addr_list = [] for term in re.findall(ur'<p>(.+?)</p>', addr_sub): tmp = cm.reformat_addr(term) if 'tel' in tmp.lower(): tel = cm.extract_tel(tmp) if tel != '': entry[cm.tel] = tel elif 'fax' in tmp.lower(): fax = cm.extract_tel(tmp) if fax != '': entry[cm.fax] = fax elif tmp != '': addr_list.append(tmp) entry[cm.addr_e] = ', '.join(addr_list) for term in (tmp.strip() for tmp in cm.reformat_addr(info_sub).split(',')): if '@' in term and '.' in term: entry[cm.email] = term elif 'www.' in term or '.com' in term or '.cn' in term: entry[cm.url] = term
for m in re.finditer(ur'<h3>\s*(.+?)\s*</h3>', body1): tmp.append({'idx1': m.start(), 'idx2': m.end(), 'name': m.group(1).strip().upper()}) tmp.append({'idx1': -1}) sub_list = [] for i in xrange(len(tmp) - 1): sub_list.append({'content': body1[tmp[i]['idx2']:tmp[i + 1]['idx1']], 'name': tmp[i]['name']}) for sub in sub_list: for m in re.findall(ur'<p>(.+?)</p>', sub['content'], re.S): entry = cm.init_store_entry(data['brand_id'], data['brandname_e'], data['brandname_c']) entry[cm.country_e] = 'UNITED KINGDOM' entry[cm.city_e] = sub['name'] addr_list = cm.reformat_addr(m).split(', ') entry[cm.addr_e] = ', '.join(addr_list[:-1]) entry[cm.tel] = cm.extract_tel(addr_list[-1]) gs.field_sense(entry) print '(%s / %d) Found store: %s, %s (%s, %s)' % (data['brandname_e'], data['brand_id'], entry[cm.name_e], entry[cm.addr_e], entry[cm.country_e], entry[cm.continent_e]) db.insert_record(entry, 'stores') entry = cm.init_store_entry(data['brand_id'], data['brandname_e'], data['brandname_c']) entry[cm.country_e] = 'UNITED KINGDOM' entry[cm.city_e] = u'EDINBURGH' entry[cm.addr_e] = u'OCEAN DRIVE, LEITH, EDINBURGH' entry[cm.tel] = u'0131 554 8622' for m in re.findall(ur'<p>(.+?)</p>', body3, re.S): entry = cm.init_store_entry(data['brand_id'], data['brandname_e'], data['brandname_c']) entry[cm.country_e] = 'UNITED KINGDOM'
m = re.search(ur'<span class="street-address">(.+?)</span>', sub, re.S) if m is not None: entry[cm.addr_e] = cm.reformat_addr(m.group(1)) m = re.search(ur'<span class="postal-code">([^<>]+)</span>', sub, re.S) if m is not None: entry[cm.zip_code] = m.group(1).strip() m = re.search(ur'maps\.google\.com/\?q=(-?\d+\.?\d*),(-?\d+\.?\d*)', sub) if m is not None: entry[cm.lat] = string.atof(m.group(1)) entry[cm.lng] = string.atof(m.group(2)) for m in re.findall(ur'<div class="tel">(.+?)</div>', sub): if 'voice' in m: entry[cm.tel] = cm.extract_tel(cm.reformat_addr(m).replace('t.', '')) elif 'fax' in m: entry[cm.fax] = cm.extract_tel(cm.reformat_addr(m).replace('f.', '')) gs.field_sense(entry) ret = gs.addr_sense(entry[cm.addr_e], entry[cm.country_e]) if ret[1] is not None and entry[cm.province_e] == '': entry[cm.province_e] = ret[1] if ret[2] is not None and entry[cm.city_e] == '': entry[cm.city_e] = ret[2] gs.field_sense(entry) cm.dump('(%s / %d) Found store: %s, %s (%s, %s)' % (data['brandname_e'], data['brand_id'], entry[cm.name_e], entry[cm.addr_e], entry[cm.country_e], entry[cm.continent_e]), log_name) db.insert_record(entry, 'stores')
def f(m): store_name = m[0].strip() addr_str = m[1].strip() spl = addr_str.split('<br/>') store_type = cm.html2plain(spl[0].strip()) store_addr = spl[1].strip() hour_idx = 2 store_tel = '' for i in xrange(2, len(spl)): # If this is not a phone number: tel = cm.extract_tel(spl[i]) if tel == '': store_addr += ', ' + spl[i] hour_idx = i + 1 else: store_tel = spl[i].strip() hour_idx = i + 1 break if hour_idx < len(spl): store_hour = cm.html2plain(', '.join(spl[hour_idx:])).strip() else: store_hour = '' # store_addr = cm.reformat_addr('\r\n'.join([val.strip() for val in spl[1:-3]])) store_addr = cm.reformat_addr(store_addr) store_entry = cm.init_store_entry(brand_id, brandname_e, brandname_c) cm.update_entry( store_entry, { cm.continent_e: opt[cm.continent_e].strip().upper(), cm.city_e: opt[cm.city_e].strip().upper(), cm.country_e: opt[cm.country_e].strip().upper(), cm.name_e: cm.name_e, cm.addr_e: store_addr, cm.store_type: store_type, cm.hours: store_hour, cm.tel: store_tel }) if opt.has_key(cm.province_e): store_entry[cm.province_e] = opt[cm.province_e] else: store_entry[cm.province_e] = '' store_entry[cm.city_e] = cm.extract_city(store_entry[cm.city_e])[0] gs.field_sense(store_entry) ret = gs.addr_sense(store_entry[cm.addr_e], store_entry[cm.country_e]) if ret[1] is not None and store_entry[cm.province_e] == '': store_entry[cm.province_e] = ret[1] if ret[2] is not None and store_entry[cm.city_e] == '': store_entry[cm.city_e] = ret[2] gs.field_sense(store_entry) print '%s Found store: %s, %s (%s, %s)' % ( brandname_e, store_entry[cm.name_e], store_entry[cm.addr_e], store_entry[cm.country_e], store_entry[cm.continent_e]) db.insert_record(store_entry, 'stores') return store_entry
def fetch_stores(data): url = '%s%d/' % (data['store_url'], data['city_id']) try: html = cm.get_data(url) except Exception: print 'Error occured: %s' % url dump_data = {'level': 0, 'time': cm.format_time(), 'data': {'url': url}, 'brand_id': data['brand_id']} cm.dump(dump_data) return [] store_list = [] for m in re.finditer(ur'<div class="store">', html): store_sub, ss, se = cm.extract_closure(html[m.start():], ur'<div\b', ur'</div') if set == 0: continue entry = cm.init_store_entry(data['brand_id'], data['brandname_e'], data['brandname_c']) start = store_sub.find('<div class="store_name">') if start == -1: continue sub, start, end = cm.extract_closure(store_sub[start:], ur'<div\b', ur'</div>') if end == 0: continue m1 = re.search(ur'<p>(.+?)</p>', sub) if m1 is not None: entry[cm.name_e] = m1.group(1).strip() start = store_sub.find('<div class="store_address">') if start == -1: continue sub, start, end = cm.extract_closure(store_sub[start:], ur'<div\b', ur'</div>') if end == 0: continue m1 = re.search(ur'<p>(.+?)</p>', sub, re.S) if m1 is not None: addr_list = cm.reformat_addr(m1.group(1)).split(',') tmp = [] tel_pat = re.compile(ur'^tel[\.: ]+', re.I) for term in addr_list: if re.search(tel_pat, term.strip()) is not None: term = re.sub(tel_pat, '', term.strip()) entry[cm.tel] = cm.extract_tel(term) else: tmp.append(term.strip()) entry[cm.addr_e] = ', '.join(tmp) m1 = re.search(ur'<input\s.+?name="latitude"\s+value="(.+?)"\s*/>', store_sub) if m1 is not None: entry[cm.lat] = string.atof(m1.group(1)) m1 = re.search(ur'<input\s.+?name="longitude"\s+value="(.+?)"\s*/>', store_sub) if m1 is not None: entry[cm.lng] = string.atof(m1.group(1)) entry[cm.country_e] = data[cm.country_e] entry[cm.continent_e] = data[cm.continent_e] entry[cm.city_e] = data[cm.city_e] gs.field_sense(entry) print '(%s / %d) Found store: %s, %s (%s, %s)' % (data['brandname_e'], data['brand_id'], entry[cm.name_e], entry[cm.addr_e], entry[cm.country_e], entry[cm.continent_e]) 
store_list.append(entry) db.insert_record(entry, 'stores')
entry[cm.country_e] = data['country'] entry[cm.province_c] = data['province'] ret = gs.look_up(data['province'], 2) if ret is not None: entry[cm.province_e] = ret['name_e'] entry[cm.city_c] = city ret = gs.look_up(city, 3) if ret is not None: entry[cm.city_e] = ret['name_e'] entry[cm.name_e] = cm.reformat_addr(m.group(1)) m1 = re.search(ur'<span id="m_stlist"[^<>]*>(.+?)</span>', body[m.end():]) if m1 is not None: addr_list = cm.reformat_addr(m1.group(1)).split(',') tel = cm.extract_tel(addr_list[-1]).strip() if tel != '': del addr_list[-1] entry[cm.tel] = tel entry[cm.addr_e] = ', '.join([tmp.strip() for tmp in addr_list]) gs.field_sense(entry) ret = gs.addr_sense(entry[cm.addr_e], entry[cm.country_e]) if ret[1] is not None and entry[cm.province_e] == '': entry[cm.province_e] = ret[1] if ret[2] is not None and entry[cm.city_e] == '': entry[cm.city_e] = ret[2] gs.field_sense(entry) cm.dump('(%s / %d) Found store: %s, %s (%s, %s)' % (data['brandname_e'], data['brand_id'], entry[cm.name_e], entry[cm.addr_e], entry[cm.country_e],
def fetch_stores(data): url = data['url'] try: body = cm.get_data(url) except Exception: cm.dump('Error in fetching stores: %s' % url, log_name) return [] store_list = [] for m in re.finditer(ur'<div class="searchResult[^"]*"', body): if 'intro' in m.group(): continue sub = cm.extract_closure(body[m.start():], ur'<div\b', ur'</div>')[0] m1 = re.search(ur'<div id=[^<>]+>(.+?)</div>', sub) if m1 is None: continue entry = cm.init_store_entry(data['brand_id'], data['brandname_e'], data['brandname_c']) entry[cm.country_e] = data['country'] entry[cm.city_e] = data['city'] addr_list = [tmp.strip() for tmp in cm.reformat_addr(m1.group(1)).split(',')] tel = cm.extract_tel(addr_list[-1]) if tel != '': entry[cm.tel] = tel del addr_list[-1] else: m1 = re.search(ur'Tel:([^<>]+)', sub) if m1 is not None: entry[cm.tel] = cm.extract_tel(m1.group(1)) entry[cm.addr_e] = ', '.join(addr_list) m1 = re.search(ur"show_map\('(-?\d+\.\d+)'\s*,\s*'(-?\d+\.\d+)'", sub) if m1 is not None: entry[cm.lat] = string.atof(m1.group(1)) entry[cm.lng] = string.atof(m1.group(2)) start = sub.find(ur'Opening hours:') if start != -1: entry[cm.hours] = cm.extract_closure(sub[start:], ur'<p>', ur'</p>')[0].strip() ret = None if entry[cm.lat]!='' and entry[cm.lng]!='': ret = gs.geocode(latlng='%f,%f' % (entry[cm.lat], entry[cm.lng])) if ret is None: tmp = [tmp1.strip() for tmp1 in entry[cm.addr_e].split(',')] if 'Max Mara' in tmp[0]: del tmp[0] if len(tmp) > 0: ret = gs.geocode(', '.join(tmp)) if ret is not None: city = '' province = '' country = '' zip_code = '' tmp = ret[0]['address_components'] for v in tmp: if 'locality' in v['types']: city = v['long_name'].strip().upper() elif 'administrative_area_level_1' in v['types']: province = v['long_name'].strip().upper() elif 'country' in v['types']: country = v['long_name'].strip().upper() elif 'postal_code' in v['types']: zip_code = v['long_name'].strip() entry[cm.country_e] = country entry[cm.province_e] = province entry[cm.city_e] = city entry[cm.zip_code] = 
zip_code gs.field_sense(entry) cm.dump('(%s / %d) Found store: %s, %s (%s, %s)' % (data['brandname_e'], data['brand_id'], entry[cm.name_e], entry[cm.addr_e], entry[cm.country_e], entry[cm.continent_e]), log_name) db.insert_record(entry, 'stores') store_list.append(entry)
for m in re.findall(ur'<p class="boutique-info-cadre-tel">(.*)</p>', s, re.S): if len(m.strip()) == 0: break for m1 in re.findall(ur'<span itemprop="telephone">(.*?)</span>', m): if len(m1.strip()) > 0: entry[cm.tel] = m1.strip() break for m1 in re.findall(ur'<span itemprop="faxNumber">(.*?)</span>', m): if len(m1.strip()) > 0: entry[cm.fax] = m1.strip() break if entry[cm.tel] == '' and entry[cm.fax] == '': entry[cm.tel] = cm.extract_tel(m.strip()) for m in re.findall( ur'<p class="boutique-info-cadre-horaires">(.*?)</p>', s, re.S): if len(m.strip()) > 0: entry[cm.hours] = m.strip() break for m in re.findall( ur'<p class="boutique-info-cadre-adresse".*?>(.*?)</p>', s, re.S): if len(m.strip()) == 0: break street_addr = '' zip_code = '' city = '' country = ''
def fetch_store_details(data): url = data['url'] try: body = cm.post_data(url, { 'cCode': data['country_code'], 'city': data['city'], 'postsearch': 1 }) except Exception: print 'Error occured: %s' % url dump_data = { 'level': 0, 'time': cm.format_time(), 'data': { 'url': url }, 'brand_id': data['brand_id'] } cm.dump(dump_data) return [] start = body.find('<div class="store_locator') if start == -1: print 'Failed processing %s' % url return [] sub, start, end = cm.extract_closure(body[start:], ur'<div\b', ur'</div>') entry = cm.init_store_entry(data['brand_id'], data['brandname_e'], data['brandname_c']) m = re.search(ur'<p><span class="bold">Address</span>(.+?)</p>', sub, re.S) if m is not None: addr_list = cm.reformat_addr(m.group(1)).split(', ') ret = cm.extract_tel(addr_list[-1]) if ret != '': entry[cm.tel] = ret del addr_list[-1] entry[cm.addr_e] = ', '.join(addr_list) addr_text = sub[m.end():] m = re.search(ur'<div class="title locator">', addr_text) if m is not None: tmp = cm.extract_closure(addr_text[m.start():], ur'<div\b', ur'</div>')[0] m1 = re.search(ur'<h2>(.+?)</h2>', tmp, re.S) if m1 is not None: entry[cm.addr_e] = cm.reformat_addr(m1.group(1)) m = re.search(ur'google.maps.LatLng\(\s*(-?\d+\.\d+),\s*(-?\d+\.\d+)', body, re.S) if m is not None: entry[cm.lat] = string.atof(m.group(1)) entry[cm.lng] = string.atof(m.group(2)) entry[cm.country_e] = data['country_code'] entry[cm.city_e] = data['city'] gs.field_sense(entry) print '(%s / %d) Found store: %s, %s (%s, %s)' % ( data['brandname_e'], data['brand_id'], entry[cm.name_e], entry[cm.addr_e], entry[cm.country_e], entry[cm.continent_e]) db.insert_record(entry, 'stores') return [entry]
def fetch_stores(data): url = '%s%d/' % (data['store_url'], data['city_id']) try: html = cm.get_data(url) except Exception: print 'Error occured: %s' % url dump_data = { 'level': 0, 'time': cm.format_time(), 'data': { 'url': url }, 'brand_id': data['brand_id'] } cm.dump(dump_data) return [] store_list = [] for m in re.finditer(ur'<div class="store">', html): store_sub, ss, se = cm.extract_closure(html[m.start():], ur'<div\b', ur'</div') if set == 0: continue entry = cm.init_store_entry(data['brand_id'], data['brandname_e'], data['brandname_c']) start = store_sub.find('<div class="store_name">') if start == -1: continue sub, start, end = cm.extract_closure(store_sub[start:], ur'<div\b', ur'</div>') if end == 0: continue m1 = re.search(ur'<p>(.+?)</p>', sub) if m1 is not None: entry[cm.name_e] = m1.group(1).strip() start = store_sub.find('<div class="store_address">') if start == -1: continue sub, start, end = cm.extract_closure(store_sub[start:], ur'<div\b', ur'</div>') if end == 0: continue m1 = re.search(ur'<p>(.+?)</p>', sub, re.S) if m1 is not None: addr_list = cm.reformat_addr(m1.group(1)).split(',') tmp = [] tel_pat = re.compile(ur'^tel[\.: ]+', re.I) for term in addr_list: if re.search(tel_pat, term.strip()) is not None: term = re.sub(tel_pat, '', term.strip()) entry[cm.tel] = cm.extract_tel(term) else: tmp.append(term.strip()) entry[cm.addr_e] = ', '.join(tmp) m1 = re.search(ur'<input\s.+?name="latitude"\s+value="(.+?)"\s*/>', store_sub) if m1 is not None: entry[cm.lat] = string.atof(m1.group(1)) m1 = re.search(ur'<input\s.+?name="longitude"\s+value="(.+?)"\s*/>', store_sub) if m1 is not None: entry[cm.lng] = string.atof(m1.group(1)) entry[cm.country_e] = data[cm.country_e] entry[cm.continent_e] = data[cm.continent_e] entry[cm.city_e] = data[cm.city_e] gs.field_sense(entry) print '(%s / %d) Found store: %s, %s (%s, %s)' % ( data['brandname_e'], data['brand_id'], entry[cm.name_e], entry[cm.addr_e], entry[cm.country_e], entry[cm.continent_e]) 
store_list.append(entry) db.insert_record(entry, 'stores')
def fetch_stores(data): """ country_id: country_id """ country = data['country'] country_id = data['country_id'] city = data['city'] city_id = data['city_id'] try: html = cm.post_data(url, { 'country_id': country_id, 'city_id': city_id }) except Exception: print 'Error occured: %s' % url dump_data = { 'level': 2, 'time': cm.format_time(), 'data': data, 'brand_id': brand_id } cm.dump(dump_data) return [] start = html.find('class="boutique_store"') if start == -1: return [] end = html.find('</ul>', start) html = html[start:end] # <li><h6>Paris</h6><p>36 Avenue Montaigne<br />+33 1 47 20 04 45<br />France</p></li> stores = [] for m in re.finditer(r'<li><h6>([^<>]+)</h6><p>(.*?)</p></li>', html): store_item = cm.init_store_entry(brand_id, brandname_e, brandname_c) # city = m.group(1) content = m.group(2) + r'<br />' addr = '' idx = 0 for m1 in re.finditer(r'(.*?)<br\s*?/>', content): idx += 1 # 第一个为门店名称 if idx == 1 and re.match(ur'.*?\d+', m1.group(1)) is None: store_item[cm.name_e] = cm.reformat_addr(m1.group(1)) addr += m1.group(1) + '\r\n' else: # 是否为电话? tel_str = cm.extract_tel(m1.group(1)) if tel_str != '': store_item[cm.tel] = tel_str else: addr += m1.group(1) + '\r\n' store_item[cm.addr_e] = cm.reformat_addr(addr) store_item[cm.city_e] = city store_item[cm.country_e] = country gs.field_sense(store_item) # term = cm.geo_translate(country) # if len(term) == 0: # print 'Error in geo translating: %s' % country # else: # store_item[cm.continent_c] = term[cm.continent_c] # store_item[cm.continent_e] = term[cm.continent_e] # store_item[cm.country_c] = term[cm.country_c] # store_item[cm.country_e] = term[cm.country_e] # store_item[cm.brandname_e] = brandname_e # store_item[cm.brandname_c] = brandname_c # cm.chn_check(store_item) print '%s: Found store: %s, %s (%s, %s)' % ( brandname_e, store_item[cm.name_e], store_item[cm.addr_e], store_item[cm.country_e], store_item[cm.continent_e]) db.insert_record(store_item, 'stores') stores.append(store_item)
entry[cm.country_e] = data["country"] entry[cm.province_c] = data["province"] ret = gs.look_up(data["province"], 2) if ret is not None: entry[cm.province_e] = ret["name_e"] entry[cm.city_c] = city ret = gs.look_up(city, 3) if ret is not None: entry[cm.city_e] = ret["name_e"] entry[cm.name_e] = cm.reformat_addr(m.group(1)) m1 = re.search(ur'<span id="m_stlist"[^<>]*>(.+?)</span>', body[m.end() :]) if m1 is not None: addr_list = cm.reformat_addr(m1.group(1)).split(",") tel = cm.extract_tel(addr_list[-1]).strip() if tel != "": del addr_list[-1] entry[cm.tel] = tel entry[cm.addr_e] = ", ".join([tmp.strip() for tmp in addr_list]) gs.field_sense(entry) ret = gs.addr_sense(entry[cm.addr_e], entry[cm.country_e]) if ret[1] is not None and entry[cm.province_e] == "": entry[cm.province_e] = ret[1] if ret[2] is not None and entry[cm.city_e] == "": entry[cm.city_e] = ret[2] gs.field_sense(entry) cm.dump( "(%s / %d) Found store: %s, %s (%s, %s)"
m3 = re.search(ur'<div id="mallhotel">([^<>]+)', body) val = cm.html2plain(m3.group(1)).strip() if m3 else '' if val != '': addr_list.append(val) m3 = re.search(ur'<div id="address1">([^<>]+)', body) val = cm.html2plain(m3.group(1)).strip() if m3 else '' if val != '': addr_list.append(val) m3 = re.search(ur'<div id="address2">([^<>]+)', body) val = cm.html2plain(m3.group(1)).strip() if m3 else '' if val != '': addr_list.append(val) entry[cm.addr_e] = ', '.join(addr_list) m = re.search(ur'<div id="phone">([^<>]+)</div>', body) entry[cm.tel] = cm.extract_tel(m.group(1)) if m else '' m = re.search(ur'<div id="fax">([^<>]+)</div>', body) entry[cm.fax] = cm.extract_tel(m.group(1)) if m else '' m = re.search(ur'<div id="email">([^<>]+)</div>', body) entry[cm.email] = m.group(1).strip() if m else '' m = re.search(ur'<div id="opening">', body) if m: hours_list = [] for m in re.findall(ur'<li>([^<>]+)', cm.extract_closure(body[m.start():], ur'<div\b', ur'</div>')[0]): if m.strip() != '': hours_list.append(m.strip()) entry[cm.hours] = ', '.join(hours_list) m = re.search(ur'<div id="products">', body) if m:
def fetch_stores(data): url = data['url'] try: body = cm.get_data(url) except Exception: cm.dump('Error in fetching stores: %s' % url, log_name) return [] store_list = [] for m in re.finditer(ur'<div class="searchResult[^"]*"', body): if 'intro' in m.group(): continue sub = cm.extract_closure(body[m.start():], ur'<div\b', ur'</div>')[0] m1 = re.search(ur'<div id=[^<>]+>(.+?)</div>', sub) if m1 is None: continue entry = cm.init_store_entry(data['brand_id'], data['brandname_e'], data['brandname_c']) entry[cm.country_e] = data['country'] entry[cm.city_e] = data['city'] addr_list = [ tmp.strip() for tmp in cm.reformat_addr(m1.group(1)).split(',') ] tel = cm.extract_tel(addr_list[-1]) if tel != '': entry[cm.tel] = tel del addr_list[-1] else: m1 = re.search(ur'Tel:([^<>]+)', sub) if m1 is not None: entry[cm.tel] = cm.extract_tel(m1.group(1)) entry[cm.addr_e] = ', '.join(addr_list) m1 = re.search(ur"show_map\('(-?\d+\.\d+)'\s*,\s*'(-?\d+\.\d+)'", sub) if m1 is not None: entry[cm.lat] = string.atof(m1.group(1)) entry[cm.lng] = string.atof(m1.group(2)) start = sub.find(ur'Opening hours:') if start != -1: entry[cm.hours] = cm.extract_closure(sub[start:], ur'<p>', ur'</p>')[0].strip() ret = None if entry[cm.lat] != '' and entry[cm.lng] != '': ret = gs.geocode(latlng='%f,%f' % (entry[cm.lat], entry[cm.lng])) if ret is None: tmp = [tmp1.strip() for tmp1 in entry[cm.addr_e].split(',')] if 'Max Mara' in tmp[0]: del tmp[0] if len(tmp) > 0: ret = gs.geocode(', '.join(tmp)) if ret is not None: city = '' province = '' country = '' zip_code = '' tmp = ret[0]['address_components'] for v in tmp: if 'locality' in v['types']: city = v['long_name'].strip().upper() elif 'administrative_area_level_1' in v['types']: province = v['long_name'].strip().upper() elif 'country' in v['types']: country = v['long_name'].strip().upper() elif 'postal_code' in v['types']: zip_code = v['long_name'].strip() entry[cm.country_e] = country entry[cm.province_e] = province entry[cm.city_e] = city 
entry[cm.zip_code] = zip_code gs.field_sense(entry) cm.dump( '(%s / %d) Found store: %s, %s (%s, %s)' % (data['brandname_e'], data['brand_id'], entry[cm.name_e], entry[cm.addr_e], entry[cm.country_e], entry[cm.continent_e]), log_name) db.insert_record(entry, 'stores') store_list.append(entry)
store_list = [] for m in sub_list: city_id = m['city_id'] sub_html = m['html'] entry = common.init_store_entry(brand_id, brandname_e, brandname_c) for m1 in re.findall(ur'<div class="store-desc">(.+?)</div>', sub_html, re.S): entry[common.name_e] = common.reformat_addr(m1) break for m1 in re.findall(ur'<div class="store-terminal">(.+?)</div>', sub_html, re.S): entry[common.addr_e] = common.reformat_addr(m1) break for m1 in re.findall(ur'<div class="store-tel">(.+?)</div>', sub_html, re.S): entry[common.tel] = common.extract_tel(m1) break for m1 in re.findall(ur'<div class="store-opening-hour">\s*?(?:Opening Hours:)?(.+?)</div>', sub_html, re.S): entry[common.hours] = common.reformat_addr(m1) break m1 = re.findall(ur'href="/(.+?)" title="View on map"', sub_html) if len(m1) > 0: entry[common.url] = host + '/' + m1[0] lat, lng = get_coordinates(entry[common.url]) common.update_entry(entry, {common.lat: lat, common.lng: lng}) # geo city_e = cities[city_id]['name'].strip()
if len(m.strip()) >= 0: entry[cm.store_type] = m.strip() break for m in re.findall(ur'<p class="boutique-info-cadre-tel">(.*)</p>', s, re.S): if len(m.strip()) == 0: break for m1 in re.findall(ur'<span itemprop="telephone">(.*?)</span>', m): if len(m1.strip()) > 0: entry[cm.tel] = m1.strip() break for m1 in re.findall(ur'<span itemprop="faxNumber">(.*?)</span>', m): if len(m1.strip()) > 0: entry[cm.fax] = m1.strip() break if entry[cm.tel] == '' and entry[cm.fax] == '': entry[cm.tel] = cm.extract_tel(m.strip()) for m in re.findall(ur'<p class="boutique-info-cadre-horaires">(.*?)</p>', s, re.S): if len(m.strip()) > 0: entry[cm.hours] = m.strip() break for m in re.findall(ur'<p class="boutique-info-cadre-adresse".*?>(.*?)</p>', s, re.S): if len(m.strip()) == 0: break street_addr = '' zip_code = '' city = '' country = '' for m1 in re.findall(ur'<span itemprop="streetAddress">(.*?)</span>', m, re.S): if len(m1.strip()) > 0: street_addr = cm.reformat_addr(m1) break
def get_store_details(data): url = data['url'] try: html = cm.get_data(url) except Exception: print 'Error occured: %s' % url dump_data = { 'level': 1, 'time': cm.format_time(), 'data': { 'url': url }, 'brand_id': brand_id } cm.dump(dump_data) return [] entry = cm.init_store_entry(brand_id, brandname_e, brandname_c) entry[cm.name_e] = data['name'] entry[cm.url] = data['url'] start = html.find(ur'<div class="storelocator-breadcrumbs">') if start == -1: return [] sub, start, end = cm.extract_closure(html[start:], ur'<ul>', ur'</ul>') if end == 0: return [] # 最后一个<li>...</li> m = re.findall(ur'<li>(.+?)</li>', sub, re.S) if len(m) > 0: entry[cm.addr_e] = cm.reformat_addr(m[-1]) # 经纬度 m = re.findall( ur'position: new google.maps.LatLng\((-?\d+\.\d+).*?(-?\d+\.\d+)\)', html) if len(m) > 0: cm.update_entry(entry, { cm.lat: string.atof(m[0][0]), cm.lng: string.atof(m[0][1]) }) m = re.search(ur'<div class="contact right">(.+?)</div>', html, re.S) if m is not None: contact_sub = m.group(1) pat_tel = re.compile(ur'<p class="phone">(.+?)</p>') m1 = re.search(pat_tel, contact_sub) if m1: entry[cm.tel] = cm.extract_tel(m1.group(1)) contact_sub = re.sub(pat_tel, '', contact_sub) hours_list = [ tmp.strip() for tmp in cm.reformat_addr(contact_sub).split(',') ] if 'opening hours' in hours_list[0].lower(): del hours_list[0] entry[cm.hours] = ', '.join(hours_list) # Geo country = data['country'] city = data['city'] cm.update_entry(entry, {cm.country_e: country, cm.city_e: city}) gs.field_sense(entry) ret = gs.addr_sense(entry[cm.addr_e], entry[cm.country_e]) if ret[1] is not None and entry[cm.province_e] == '': entry[cm.province_e] = ret[1] if ret[2] is not None and entry[cm.city_e] == '': entry[cm.city_e] = ret[2] gs.field_sense(entry) print '(%s / %d) Found store: %s, %s (%s, %s)' % ( brandname_e, brand_id, entry[cm.name_e], entry[cm.addr_e], entry[cm.country_e], entry[cm.continent_e]) db.insert_record(entry, 'stores') return entry
m = re.search(ur'<div id="coordonnees"[^<>]*>(.+?)</div>', body, re.S) if m is None: cm.dump('Error in fetching stores: %s' % url, log_name) return [] entry = cm.init_store_entry(data['brand_id'], data['brandname_e'], data['brandname_c']) addr_sub, info_sub = m.group(1).split('Practical Info') m = re.search(ur'<h2>(.+?)</h2>', addr_sub) if m is not None: entry[cm.name_e] = cm.html2plain(m.group(1)) addr_list = [] for term in re.findall(ur'<p>(.+?)</p>', addr_sub): tmp = cm.reformat_addr(term) if 'tel' in tmp.lower(): tel = cm.extract_tel(tmp) if tel != '': entry[cm.tel] = tel elif 'fax' in tmp.lower(): fax = cm.extract_tel(tmp) if fax != '': entry[cm.fax] = fax elif tmp != '': addr_list.append(tmp) entry[cm.addr_e] = ', '.join(addr_list) for term in (tmp.strip() for tmp in cm.reformat_addr(info_sub).split(',')): if '@' in term and '.' in term: entry[cm.email] = term elif 'www.' in term or '.com' in term or '.cn' in term: entry[cm.url] = term