def fetch_stores(data): url = data['url'] try: html = cm.get_data(url) except Exception: print 'Error occured: %s' % url dump_data = {'level': 0, 'time': cm.format_time(), 'data': {'url': url}, 'brand_id': data['brand_id']} cm.dump(dump_data) return [] store_list = [] for m in re.findall(ur'var markerContent\s*?=\s*?"(.+?)".+?' ur'createMarker\(.+?new google.maps.LatLng\((-?\d+\.\d+),(-?\d+\.\d+)\)', html, re.S): entry = cm.init_store_entry(data['brand_id'], data['brandname_e'], data['brandname_c']) lat, lng = map(string.atof, [m[1], m[2]]) cm.update_entry(entry, {cm.lat: lat, cm.lng: lng}) sub = m[0].strip() m1 = re.search(ur'<b>(.+?)</b>', sub) if m1 is None: continue entry[cm.name_c] = m1.group(1) sub = sub.replace(m1.group(0), '') m1=re.search(ur'聯系電話(?::|:)(.+?)<', sub) if m1 is not None: entry[cm.tel]=m1.group(1) sub=sub.replace(m1.group(0), '<') sub = re.sub(ur'<img\b.*?/>', '', sub) entry[cm.addr_c] = cm.reformat_addr(sub) print '(%s/%d) Found store: %s, %s (%s, %s)' % (data['brandname_e'], data['brand_id'], entry[cm.name_c], entry[cm.addr_e], entry[cm.country_e], entry[cm.continent_e]) store_list.append(entry) db.insert_record(entry, 'stores')
def fetch_cn(data): url = data['url'] try: html = cm.get_data(url) except Exception: print 'Error occured: %s' % url dump_data = { 'level': 0, 'time': cm.format_time(), 'data': { 'url': url }, 'brand_id': data['brand_id'] } cm.dump(dump_data) return [] start = html.find('arrData = [') if start == -1: return [] sub, start, end = cm.extract_closure(html[start:], ur'\[', ur'\]') raw_list = json.loads(sub) store_list = [] for v1 in raw_list: # 省 province = v1[0].strip() for v2 in v1[1]: # 市 city = v2[0].strip() for v3 in v2[1]: # 商店 entry = cm.init_store_entry(data['brand_id'], data['brandname_e'], data['brandname_c']) terms = v3.split(';') if len(terms) < 2: continue entry['name_c'] = terms[0].strip() entry['addr_e'] = terms[1].strip() cm.update_entry( entry, { cm.city_c: city, cm.province_c: province, cm.country_c: u'中国', cm.country_e: u'CHINA', cm.continent_c: u'亚洲', cm.continent_e: u'ASIA' }) print '(%s/%d) Found store: %s, %s (%s, %s, %s)' % ( data['brandname_e'], data['brand_id'], entry[cm.name_c], entry[cm.addr_e], entry[cm.city_e], entry[cm.country_e], entry[cm.continent_e]) store_list.append(entry) db.insert_record(entry, 'stores') return store_list
def get_store_details(data): url = data['url'] try: html = cm.get_data(url) except Exception: print 'Error occured: %s' % url dump_data = {'level': 1, 'time': cm.format_time(), 'data': {'url': url}, 'brand_id': brand_id} cm.dump(dump_data) return [] entry = cm.init_store_entry(brand_id, brandname_e, brandname_c) entry[cm.name_e] = data['name'] entry[cm.url] = data['url'] start = html.find(ur'<div class="storelocator-breadcrumbs">') if start == -1: return [] sub, start, end = cm.extract_closure(html[start:], ur'<ul>', ur'</ul>') if end == 0: return [] # 最后一个<li>...</li> m = re.findall(ur'<li>(.+?)</li>', sub, re.S) if len(m) > 0: entry[cm.addr_e] = cm.reformat_addr(m[-1]) # 经纬度 m = re.findall(ur'position: new google.maps.LatLng\((-?\d+\.\d+).*?(-?\d+\.\d+)\)', html) if len(m) > 0: cm.update_entry(entry, {cm.lat: string.atof(m[0][0]), cm.lng: string.atof(m[0][1])}) m = re.search(ur'<div class="contact right">(.+?)</div>', html, re.S) if m is not None: contact_sub = m.group(1) pat_tel = re.compile(ur'<p class="phone">(.+?)</p>') m1 = re.search(pat_tel, contact_sub) if m1: entry[cm.tel] = cm.extract_tel(m1.group(1)) contact_sub = re.sub(pat_tel, '', contact_sub) hours_list=[tmp.strip() for tmp in cm.reformat_addr(contact_sub).split(',')] if 'opening hours' in hours_list[0].lower(): del hours_list[0] entry[cm.hours] = ', '.join(hours_list) # Geo country = data['country'] city = data['city'] cm.update_entry(entry, {cm.country_e: country, cm.city_e: city}) gs.field_sense(entry) ret = gs.addr_sense(entry[cm.addr_e], entry[cm.country_e]) if ret[1] is not None and entry[cm.province_e] == '': entry[cm.province_e] = ret[1] if ret[2] is not None and entry[cm.city_e] == '': entry[cm.city_e] = ret[2] gs.field_sense(entry) print '(%s / %d) Found store: %s, %s (%s, %s)' % ( brandname_e, brand_id, entry[cm.name_e], entry[cm.addr_e], entry[cm.country_e], entry[cm.continent_e]) db.insert_record(entry, 'stores') return entry
def fetch(level=1, data=None, user='******', passwd=''): db = cm.StoresDb() db.connect_db(user=user, passwd=passwd) db.execute(u'DELETE FROM %s WHERE brand_id=%d' % ('stores', brand_id)) try: html = cm.get_data(url) except Exception: print 'Error occured: %s' % url dump_data = { 'level': 1, 'time': cm.format_time(), 'data': { 'url': url }, 'brand_id': brand_id } cm.dump(dump_data) return [] js = json.loads(html) store_list = [] for s in js['data']['list']: entry = cm.init_store_entry(brand_id, brandname_e, brandname_c) cm.update_entry( entry, { cm.lat: string.atof(s['geo']['lat']), cm.lng: string.atof(s['geo']['lng']) }) entry[cm.name_e] = s['contact']['title'] entry[cm.addr_e] = cm.reformat_addr(s['contact']['address']) entry[cm.tel] = s['contact']['phone'] entry[cm.fax] = s['contact']['fax'] entry[cm.hours] = cm.reformat_addr(s['contact']['hours']) entry[cm.store_type] = s['contact']['selling'] entry[cm.url] = host + s['link'] gs.update_city_map(s['city'], s['country'], s['continent']) cm.update_entry( entry, { cm.continent_e: s['continent'], cm.country_e: s['country'], cm.city_e: s['city'] }) gs.field_sense(entry) print '(%s / %d) Found store: %s, %s (%s, %s)' % ( brandname_e, brand_id, entry[cm.name_e], entry[cm.addr_e], entry[cm.country_e], entry[cm.continent_e]) db.insert_record(entry, 'stores') store_list.append(entry) db.disconnect_db() gs.commit_maps(1) gs.commit_maps(3) return store_list
def f(m): store_name = m[0].strip() addr_str = m[1].strip() spl = addr_str.split('<br/>') store_type = cm.html2plain(spl[0].strip()) store_addr = spl[1].strip() hour_idx = 2 store_tel = '' for i in xrange(2, len(spl)): # If this is not a phone number: tel = cm.extract_tel(spl[i]) if tel == '': store_addr += ', ' + spl[i] hour_idx = i + 1 else: store_tel = spl[i].strip() hour_idx = i + 1 break if hour_idx < len(spl): store_hour = cm.html2plain(', '.join(spl[hour_idx:])).strip() else: store_hour = '' # store_addr = cm.reformat_addr('\r\n'.join([val.strip() for val in spl[1:-3]])) store_addr = cm.reformat_addr(store_addr) store_entry = cm.init_store_entry(brand_id, brandname_e, brandname_c) cm.update_entry(store_entry, {cm.continent_e: opt[cm.continent_e].strip().upper(), cm.city_e: opt[cm.city_e].strip().upper(), cm.country_e: opt[cm.country_e].strip().upper(), cm.name_e: cm.name_e, cm.addr_e: store_addr, cm.store_type: store_type, cm.hours: store_hour, cm.tel: store_tel}) if opt.has_key(cm.province_e): store_entry[cm.province_e] = opt[cm.province_e] else: store_entry[cm.province_e] = '' store_entry[cm.city_e] = cm.extract_city(store_entry[cm.city_e])[0] gs.field_sense(store_entry) ret = gs.addr_sense(store_entry[cm.addr_e], store_entry[cm.country_e]) if ret[1] is not None and store_entry[cm.province_e] == '': store_entry[cm.province_e] = ret[1] if ret[2] is not None and store_entry[cm.city_e] == '': store_entry[cm.city_e] = ret[2] gs.field_sense(store_entry) print '%s Found store: %s, %s (%s, %s)' % ( brandname_e, store_entry[cm.name_e], store_entry[cm.addr_e], store_entry[cm.country_e], store_entry[cm.continent_e]) db.insert_record(store_entry, 'stores') return store_entry
def fetch_stores(data): """ 获得门店的详细信息 :rtype : [entries] :param data: """ try: html = cm.get_data(data['url']) except Exception: print 'Error occured: %s' % url dump_data = {'level': 1, 'time': cm.format_time(), 'data': {'url': url}, 'brand_id': brand_id} cm.dump(dump_data) return [] entries = [] start = html.find(u'<ul class="store-list">') if start == -1: return entries start += len(u'<ul class="store-list">') end = html.find(u'</ul>', start) html = html[start:end] for m1 in re.findall(ur'<li class="(.*?)">(.*?)</li>', html, re.S): store = cm.init_store_entry(brand_id, brandname_e, brandname_c) store[cm.store_type] = m1[0] sub_html = m1[1] m2 = re.findall(ur'<h3 class="store-name">(.*?)</h3>', sub_html) if len(m2) > 0: store[cm.name_e] = cm.reformat_addr(m2[0]) m2 = re.findall(ur'<p class="store-address">(.*?)</p>', sub_html, re.S) if len(m2) > 0: store[cm.addr_e] = cm.reformat_addr(m2[0]) cm.update_entry(store, {cm.continent_e: data[cm.continent_e].strip().upper(), cm.country_e: data[cm.country_e].strip().upper(), cm.city_e: data[cm.city_e].strip().upper()}) entry = store gs.field_sense(entry) ret = gs.addr_sense(entry[cm.addr_e]) if ret[0] is not None and entry[cm.country_e] == '': entry[cm.country_e] = ret[0] if ret[1] is not None and entry[cm.province_e] == '': entry[cm.province_e] = ret[1] if ret[2] is not None and entry[cm.city_e] == '': entry[cm.city_e] = ret[2] gs.field_sense(entry) entry[cm.city_e] = cm.extract_city(entry[cm.city_e])[0] print '%s: Found store: %s, %s (%s, %s)' % ( brandname_e, store[cm.name_e], store[cm.addr_e], store[cm.country_e], store[cm.continent_e]) db.insert_record(store, 'stores') entries.append(store)
def fetch_details(data): url = data[cm.url] try: html = cm.get_data(url) except Exception: print 'Error occured: %s' % url dump_data = {'level': 0, 'time': cm.format_time(), 'data': {'url': url}, 'brand_id': data['brand_id']} cm.dump(dump_data) return [] entry = cm.init_store_entry(data['brand_id'], data['brandname_e'], data['brandname_c']) entry[cm.name_e] = data[cm.name_e] start = html.find(ur'<div class="field-address">') if start == -1: return [] sub, start, end = cm.extract_closure(html[start:], ur'<div\b', ur'</div>') if end == 0: return [] m1 = re.search(ur'<div class="locality">(.+?)</div>', sub) if m1 is not None: entry[cm.city_e] = cm.extract_city(m1.group(1))[0] m1 = re.search(ur'<div class="postal-code">(.+?)</div>', sub) if m1 is not None: entry[cm.zip_code] = m1.group(1).strip() entry[cm.country_e] = data[cm.country_e] pat = re.compile(ur'<[^<>]+?>', re.S) entry[cm.addr_e] = cm.reformat_addr(re.sub(pat, u'\r\n', sub)) m1 = re.search(ur'<div class="field-telephone"><a href=".+?" class="tel">(.+?)</a></div>', html) if m1 is not None: entry[cm.tel] = m1.group(1).strip() m1 = re.search(ur'<div class="field-opening-hours">\s*<p>(.+?)</p>\s*</div>', html, re.S) if m1 is not None: entry[cm.hours] = cm.reformat_addr(m1.group(1)) m1 = re.search(ur'"coordinates":\[(-?\d+\.\d{4,})\s*,\s*(-?\d+\.\d{4,})\]', html) if m1 is not None: lat = string.atof(m1.group(1)) lng = string.atof(m1.group(2)) cm.update_entry(entry, {cm.lat: lat, cm.lng: lng}) entry[cm.continent_e] = data[cm.continent_e] gs.field_sense(entry) print '(%s / %d) Found store: %s, %s (%s, %s)' % (data['brandname_e'], data['brand_id'], entry[cm.name_e], entry[cm.addr_e], entry[cm.country_e], entry[cm.continent_e]) db.insert_record(entry, 'stores') return [entry]
def get_store_details(data): url = data['url'] try: html = cm.post_data(url, {'country': data['country_id'], 'city': data['city_id'], 'recordid': data['store_id']}) except Exception: print 'Error occured: %s' % url dump_data = {'level': 1, 'time': cm.format_time(), 'data': {'url': url}, 'brand_id': brand_id} cm.dump(dump_data) return [] entry = cm.init_store_entry(brand_id, brandname_e, brandname_c) info = json.loads(html)['elements'] addr = cm.reformat_addr(info['address'].replace('\\', '').replace('<p>', ',').replace('</p>', ',')) # 第一行为商店名称 terms = addr.split(',') if len(terms) > 0: entry[cm.name_e] = cm.reformat_addr(terms[0]) entry[cm.addr_e] = addr gmap_url = info['gmap'] m = re.findall(ur'(-?\d+\.\d+),(-?\d+\.\d+)', gmap_url) if len(m) > 0: cm.update_entry(entry, {cm.lat: string.atof(m[0][0]), cm.lng: string.atof(m[0][1])}) entry[cm.url] = info['shareurl'].replace('\\', '') entry[cm.hours] = info['openingtimes'] entry[cm.comments] = info['other'] # Geo country = data['country'] city = data['city'] cm.update_entry(entry, {cm.country_e: country, cm.city_e: city}) entry[cm.city_e] = cm.extract_city(entry[cm.city_e])[0] gs.field_sense(entry) ret = gs.addr_sense(entry[cm.addr_e], entry[cm.country_e]) if ret[1] is not None and entry[cm.province_e] == '': entry[cm.province_e] = ret[1] if ret[2] is not None and entry[cm.city_e] == '': entry[cm.city_e] = ret[2] gs.field_sense(entry) print '(%s / %d) Found store: %s, %s (%s, %s)' % ( brandname_e, brand_id, entry[cm.name_e], entry[cm.addr_e], entry[cm.country_e], entry[cm.continent_e]) db.insert_record(entry, 'stores') return entry
def fetch_stores(data): url = data['url'] try: html = cm.get_data(url) except Exception: print 'Error occured: %s' % url dump_data = { 'level': 0, 'time': cm.format_time(), 'data': { 'url': url }, 'brand_id': data['brand_id'] } cm.dump(dump_data) return [] store_list = [] for m in re.findall( ur'var markerContent\s*?=\s*?"(.+?)".+?' ur'createMarker\(.+?new google.maps.LatLng\((-?\d+\.\d+),(-?\d+\.\d+)\)', html, re.S): entry = cm.init_store_entry(data['brand_id'], data['brandname_e'], data['brandname_c']) lat, lng = map(string.atof, [m[1], m[2]]) cm.update_entry(entry, {cm.lat: lat, cm.lng: lng}) sub = m[0].strip() m1 = re.search(ur'<b>(.+?)</b>', sub) if m1 is None: continue entry[cm.name_c] = m1.group(1) sub = sub.replace(m1.group(0), '') m1 = re.search(ur'聯系電話(?::|:)(.+?)<', sub) if m1 is not None: entry[cm.tel] = m1.group(1) sub = sub.replace(m1.group(0), '<') sub = re.sub(ur'<img\b.*?/>', '', sub) entry[cm.addr_c] = cm.reformat_addr(sub) print '(%s/%d) Found store: %s, %s (%s, %s)' % ( data['brandname_e'], data['brand_id'], entry[cm.name_c], entry[cm.addr_e], entry[cm.country_e], entry[cm.continent_e]) store_list.append(entry) db.insert_record(entry, 'stores')
def fetch(level=1, data=None, user='******', passwd=''): db = cm.StoresDb() db.connect_db(user=user, passwd=passwd) db.execute(u'DELETE FROM %s WHERE brand_id=%d' % ('stores', brand_id)) try: html = cm.get_data(url) except Exception: print 'Error occured: %s' % url dump_data = {'level': 1, 'time': cm.format_time(), 'data': {'url': url}, 'brand_id': brand_id} cm.dump(dump_data) return [] js = json.loads(html) store_list = [] for s in js['data']['list']: entry = cm.init_store_entry(brand_id, brandname_e, brandname_c) cm.update_entry(entry, {cm.lat: string.atof(s['geo']['lat']), cm.lng: string.atof(s['geo']['lng'])}) entry[cm.name_e] = s['contact']['title'] entry[cm.addr_e] = cm.reformat_addr(s['contact']['address']) entry[cm.tel] = s['contact']['phone'] entry[cm.fax] = s['contact']['fax'] entry[cm.hours] = cm.reformat_addr(s['contact']['hours']) entry[cm.store_type]=s['contact']['selling'] entry[cm.url]=host+s['link'] gs.update_city_map(s['city'], s['country'], s['continent']) cm.update_entry(entry,{cm.continent_e:s['continent'], cm.country_e:s['country'], cm.city_e:s['city']}) gs.field_sense(entry) print '(%s / %d) Found store: %s, %s (%s, %s)' % ( brandname_e, brand_id, entry[cm.name_e], entry[cm.addr_e], entry[cm.country_e], entry[cm.continent_e]) db.insert_record(entry, 'stores') store_list.append(entry) db.disconnect_db() gs.commit_maps(1) gs.commit_maps(3) return store_list
def fetch_cn(data): url = data['url'] try: html = cm.get_data(url) except Exception: print 'Error occured: %s' % url dump_data = {'level': 0, 'time': cm.format_time(), 'data': {'url': url}, 'brand_id': data['brand_id']} cm.dump(dump_data) return [] start = html.find('arrData = [') if start == -1: return [] sub, start, end=cm.extract_closure(html[start:], ur'\[', ur'\]') raw_list=json.loads(sub) store_list=[] for v1 in raw_list: # 省 province = v1[0].strip() for v2 in v1[1]: # 市 city = v2[0].strip() for v3 in v2[1]: # 商店 entry = cm.init_store_entry(data['brand_id'], data['brandname_e'], data['brandname_c']) terms=v3.split(';') if len(terms)<2: continue entry['name_c']=terms[0].strip() entry['addr_e']=terms[1].strip() cm.update_entry(entry, {cm.city_c:city, cm.province_c:province, cm.country_c:u'中国', cm.country_e:u'CHINA', cm.continent_c:u'亚洲', cm.continent_e:u'ASIA'}) print '(%s/%d) Found store: %s, %s (%s, %s, %s)' % (data['brandname_e'], data['brand_id'], entry[cm.name_c], entry[cm.addr_e], entry[cm.city_e], entry[cm.country_e], entry[cm.continent_e]) store_list.append(entry) db.insert_record(entry, 'stores') return store_list
def get_stores(url, data): """ 从json对象中获得商店信息 """ opener = urllib2.build_opener() opener.addheaders = [ ("User-Agent", "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko)" "Chrome/27.0.1453.94 Safari/537.36"), ('Accept', '*/*'), ('X-Requested-With', 'XMLHttpRequest'), ('Connection', 'keep-alive') ] response = opener.open(url) html = response.read().encode('utf-8') jsonobj = json.loads(html) stores = jsonobj[u'Stores'][u'Items'] region_list = jsonobj['Regions'] region_id = jsonobj['Region'] region = '' if len(region_list) > 0 and region_id != 0: for val in region_list: if val['RegionId'] == region_id: region = val['Name'] break country = jsonobj['CurrentCountry']['Name'] store_list = [] for s in stores: # print('Found store: %s, %s. Tel: %s, lat=%s, lng=%s' % ( # s['Name'], s['Address'], s['Phone'], s['Latitude'], s['Longitude'])) store_type = [''] # Some stores may have varioius store types if len(s['StoreTypes']) > 0: store_type = list(val['Name'] for val in s['StoreTypes']) if s['Url'] is not None: url = s['Url'] else: url = '' if s['ZipCode'] is not None and not s['ZipCode'].__eq__(''): zip = s['ZipCode'] else: zip = '' local_addr = s['Address'] if local_addr[-1] == '.': local_addr = local_addr[:-1] if not zip.__eq__(''): addr = u'%s, %s, %s' % (local_addr, s['City'], zip) else: addr = u'%s, %s' % (local_addr, s['City']) if region.__eq__(''): addr = u'%s, %s' % (addr, country) else: addr = u'%s, %s, %s' % (addr, region, country) for t in store_type: entry = cm.init_store_entry(brand_id, brandname_e, brandname_c) cm.update_entry( entry, { 'addr_e': addr, 'country_e': country, 'city_e': s['City'], 'comments': s['Comments'], 'province_e': region.strip().upper(), 'zip': zip, 'email': s['Email'], 'fax': s['Fax'], 'lat': s['Latitude'], 'lng': s['Longitude'], 'name_e': s['Name'], 'tel': s['Phone'], 'store_type': t, 'url': url }) gs.field_sense(entry) print '%s Found store: %s, %s (%s, %s)' % ( brandname_e, entry[cm.name_e], entry[cm.addr_e], 
entry[cm.country_e], entry[cm.continent_e]) db.insert_record(entry, 'stores') store_list.append(entry) return store_list
def get_stores(data): url = data['url'] print 'Trying to get stores for %s' % data['name'] try: html = common.get_data(url) except Exception: print 'Error occured: %s' % url dump_data = { 'level': 1, 'time': common.format_time(), 'data': { 'url': url }, 'brand_id': brand_id } common.dump(dump_data) return [] start = 0 store_list = [] while True: start = html.find('<li class="info-store clearfix">', start) if start == -1: break end = html.find('<li class="info-store clearfix">', start + 1) sub_html = html[start:end] start = end entry = common.init_store_entry(brand_id, brandname_e, brandname_c) for m in re.findall(r'<h1><a href="(.*?)">(.*?)</a>', sub_html): entry[common.url] = host + m[0] entry[common.name_e] = common.html2plain(m[1].strip()) break for m in re.findall( r'<span style="display:none" class="ll">\s*(-?\d+\.\d+),\s*(-?\d+\.\d+)\s*</span>', sub_html): common.update_entry(entry, { common.lat: string.atof(m[0]), common.lng: string.atof(m[1]) }) break for m in re.findall(r'<span class="map-address">(.*?)</span>', sub_html): entry[common.addr_e] = common.reformat_addr(m) break for m in re.findall(r'<span class="type">phone:</span>(.*?)<br />', sub_html): entry[common.tel] = m.strip() break for m in re.findall(r'<a class="email" href="mailto:(.*?@.*?)">', sub_html): entry[common.email] = m.strip() break opening_s = sub_html.find('<ul class="opening-hours') if opening_s != -1: opening_e = sub_html.find('</ul>', opening_s) o_str = sub_html[opening_s:opening_e] entry[common.hours] = ', '.join( [m for m in re.findall(r'<li>(.+?)</li>', o_str)]) brand_s = sub_html.find('<ul class="brands clearfix">') if brand_s != -1: brand_e = sub_html.find('</ul>', brand_s) b_str = sub_html[brand_s:brand_e] entry[common.store_type] = ', '.join([ common.html2plain(m) for m in re.findall(r'<li><a href=".*?">(.+?)</a></li>', b_str) ]) # Geo if 'state' in data: entry[common.province_e] = data['state'] country_e = data['name'].strip().upper() entry[common.country_e] = country_e 
gs.field_sense(entry) print '%s Found store: %s, %s (%s, %s)' % ( brandname_e, entry[common.name_e], entry[common.addr_e], entry[common.country_e], entry[common.continent_e]) db.insert_record(entry, 'stores') store_list.append(entry) return store_list
def fetch(level=1, data=None, user='******', passwd=''): """ :param level: :param data: :param user: :param passwd: :return: """ try: if data is None: data = {'url': url} html = common.get_data(data['url']) except Exception: print 'Error occured in getting data: %s' % url dump_data = { 'level': 1, 'time': common.format_time(), 'data': { 'data': url }, 'brand_id': brand_id } common.dump(dump_data) return [] db = common.StoresDb() db.connect_db(user=user, passwd=passwd) db.execute(u'DELETE FROM %s WHERE brand_id=%d' % ('stores', brand_id)) sub_pat = re.compile(ur'<!--.*?-->', re.S) html = re.sub(sub_pat, '', html) split_pos = [ m.start() for m in re.finditer(ur'<p><span class="contactboldtitle">', html) ] split_pos.append(-1) sub_list = [] for i in xrange(len(split_pos) - 1): sub_list.append(html[split_pos[i]:split_pos[i + 1]]) store_list = [] for sub_html in sub_list: entry = common.init_store_entry(brand_id, brandname_e, brandname_c) m = re.findall(ur'<span class="contactboldtitle">(.+?)</span>', sub_html) if len(m) > 0: entry[common.name_l] = m[0] m = re.findall(ur'<span class="storethinlines">(.+?)(?:</span>|</p>)', sub_html, re.S) if len(m) >= 2: addr = common.reformat_addr(m[0]) entry[common.addr_l] = addr # 城市,国家和邮编 addr_splits = addr.split(', ') ret = gs.look_up(addr_splits[-1], 1) if ret is None: print 'Error in geo translating: %s' % addr_splits[-1] else: entry[common.country_e] = ret['name_e'] m1 = re.findall(ur'(.+?)(\d{3}-\d{4})', addr_splits[-2]) if len(m1) > 0: common.update_entry( entry, { common.city_e: common.extract_city(m1[0][0])[0], common.zip_code: m1[0][1] }) # 联系方式 tmp = m[1] m1 = re.findall(ur'[\d\-]{5,}', tmp) if len(m1) > 0: entry[common.tel] = m1[0] m1 = re.findall(ur'href="mailto:(.+?@.+?)"', tmp) if len(m1) > 0: entry[common.email] = m1[0].strip() gs.field_sense(entry) print '%s: Found store: %s, %s (%s, %s)' % ( brandname_e, entry[common.name_l], entry[common.addr_l], entry[common.country_e], entry[common.continent_e]) 
db.insert_record(entry, 'stores') store_list.append(entry) db.disconnect_db()
def fetch_stores(data): url = data['url'] try: html = cm.get_data(url) except Exception: print 'Error occured: %s' % url dump_data = { 'level': 0, 'time': cm.format_time(), 'data': { 'url': url }, 'brand_id': data['brand_id'] } cm.dump(dump_data) return [] province_list = [{ cm.province_c: m[1].strip().upper(), cm.url: m[0].strip() } for m in re.findall( ur'<li><a href="#(fragment-\d+)"><span>(.+?)</span></a></li>', html)] comment_pat = re.compile(ur'<!--.*?-->', re.S) store_list = [] for p in province_list: start = html.find('<div id="%s">' % p[cm.url]) if start == -1: continue p_sub, start, end = cm.extract_closure(html[start:], ur'<tbody>', ur'</tbody>') p_sub = re.sub(comment_pat, '', p_sub) city_c = '' city_e = '' while True: s_sub, start, end = cm.extract_closure(p_sub, ur'<tr>', ur'</tr>') if end == 0: break p_sub = p_sub[end:] if u'城市' in s_sub and u'店铺名称' in s_sub: continue term_list = re.findall(ur'<td.*?>(.+?)</td>', s_sub) if len(term_list) < 3: continue entry = cm.init_store_entry(data['brand_id'], data['brandname_e'], data['brandname_c']) if len(term_list) == 4: city_c = term_list[0].strip() ret = gs.look_up(city_c, 3) if ret is not None: city_e = ret['name_e'] city_c = ret['name_c'] offset = 1 else: offset = 0 entry[cm.name_c] = cm.html2plain(term_list[offset + 0]).strip() entry[cm.tel] = cm.html2plain(term_list[offset + 1]).strip() entry[cm.addr_e] = cm.reformat_addr(term_list[offset + 2]).strip() entry[cm.country_e] = 'CHINA' entry[cm.continent_e] = 'ASIA' p_name_c = p[cm.province_c] p_name_e = '' ret = gs.look_up(p_name_c, 2) if ret is not None: p_name_c = ret['name_c'] p_name_e = ret['name_e'] cm.update_entry( entry, { cm.province_e: p_name_e, cm.province_c: p_name_c, cm.city_e: city_e, cm.city_c: city_c }) entry[cm.city_e] = cm.extract_city(entry[cm.city_e])[0] gs.field_sense(entry) ret = gs.addr_sense(entry[cm.addr_e], entry[cm.country_e]) if ret[1] is not None and entry[cm.province_e] == '': entry[cm.province_e] = ret[1] if ret[2] is not None 
and entry[cm.city_e] == '': entry[cm.city_e] = ret[2] gs.field_sense(entry) print '(%s / %d) Found store: %s, %s (%s, %s)' % ( data['brandname_e'], data['brand_id'], entry[cm.name_e], entry[cm.addr_e], entry[cm.country_e], entry[cm.continent_e]) store_list.append(entry) db.insert_record(entry, 'stores') return store_list
dump_data = {'level': 2, 'time': cm.format_time(), 'data': data, 'brand_id': brand_id} cm.dump(dump_data) return [] # 可能有多个门店,拆分 sub_html = [] for m in re.finditer(ur'<li\s+class\s*=\s*"boutique-info-cadre-\d+"\s*>', html): start = m.start() + len(m.group()) end = html.find('</li>', start) sub_html.append(html[start:end]) stores = [] # 针对每个门店: for s in sub_html: entry = cm.init_store_entry(brand_id, brandname_e, brandname_c) cm.update_entry(entry, {cm.url: url, cm.name_e: data['name'], cm.lat: data['lat'], cm.lng: data['lng'], cm.store_type: data['type']}) for m in re.findall(ur'<p class="boutique-info-cadre-titre">(.*?)</p>', s): if len(m.strip()) >= 0: entry[cm.store_type] = m.strip() break for m in re.findall(ur'<p class="boutique-info-cadre-tel">(.*)</p>', s, re.S): if len(m.strip()) == 0: break for m1 in re.findall(ur'<span itemprop="telephone">(.*?)</span>', m): if len(m1.strip()) > 0: entry[cm.tel] = m1.strip() break for m1 in re.findall(ur'<span itemprop="faxNumber">(.*?)</span>', m): if len(m1.strip()) > 0: entry[cm.fax] = m1.strip() break
def fetch_stores(data): url = data['store_url'] try: body = cm.post_data(url, {'continent': data['continent'], 'country': data['country'], 'city': data['city'], 'send': 1, 'page': 0}) except Exception: print 'Error occured: %s' % url dump_data = {'level': 0, 'time': cm.format_time(), 'data': {'url': url}, 'brand_id': data['brand_id']} cm.dump(dump_data) return [] store_list = [] for m in re.finditer(ur'<div class="shop">', body): entry = cm.init_store_entry(data['brand_id'], data['brandname_e'], data['brandname_c']) sub, start, end = cm.extract_closure(body[m.end():], ur'<div\b', ur'</div>') if end == 0: continue m1 = re.search(ur'<h3>\s*(.+?)\s*</h3>', sub, re.S) if m1 is not None: entry[cm.name_e] = m1.group(1) m1 = re.search(ur'<p[^>]*>(.+?)</p>', sub, re.S) if m1 is not None: entry[cm.store_type] = re.sub(re.compile(ur'\s*\+\s*', re.S), ', ', m1.group(1).strip()) addr_sub, start, end = cm.extract_closure(sub, ur'<ul\b', ur'</ul>') if end != 0: tmp = re.findall(ur'<li>\s*(.+?)\s*</li>', addr_sub) addr_list = [] if len(tmp) >= 3: entry[cm.tel] = tmp[-1].strip() del tmp[-1] for term in tmp: term = cm.html2plain(term).strip() if term != '': addr_list.append(term) entry[cm.addr_e] = ', '.join(addr_list) start = sub.lower().find(ur'opening hours') if start != -1: opening_sub, start, end = cm.extract_closure(sub[start:], ur'<ul\b', ur'</ul>') tmp = re.findall(ur'<li>\s*(.+?)\s*</li>', opening_sub) opening_list = [] for term in tmp: term = cm.html2plain(term).strip() if term != '': opening_list.append(term) entry[cm.hours] = ', '.join(opening_list) cm.update_entry(entry, {cm.continent_e: data['continent'].strip().upper(), cm.country_e: data['country'].strip().upper()}) entry[cm.city_e] = cm.extract_city(data['city'])[0] gs.field_sense(entry) ret = gs.addr_sense(entry[cm.addr_e], entry[cm.country_e]) if ret[1] is not None and entry[cm.province_e] == '': entry[cm.province_e] = ret[1] if ret[2] is not None and entry[cm.city_e] == '': entry[cm.city_e] = ret[2] 
gs.field_sense(entry) print '(%s / %d) Found store: %s, %s (%s, %s)' % (data['brandname_e'], data['brand_id'], entry[cm.name_e], entry[cm.addr_e], entry[cm.country_e], entry[cm.continent_e]) store_list.append(entry) db.insert_record(entry, 'stores')
def fetch_stores(data): """ 获得门店的详细信息 :rtype : [entries] :param data: """ try: html = cm.get_data(data['url']) except Exception: print 'Error occured: %s' % url dump_data = { 'level': 1, 'time': cm.format_time(), 'data': { 'url': url }, 'brand_id': brand_id } cm.dump(dump_data) return [] entries = [] start = html.find(u'<ul class="store-list">') if start == -1: return entries start += len(u'<ul class="store-list">') end = html.find(u'</ul>', start) html = html[start:end] for m1 in re.findall(ur'<li class="(.*?)">(.*?)</li>', html, re.S): store = cm.init_store_entry(brand_id, brandname_e, brandname_c) store[cm.store_type] = m1[0] sub_html = m1[1] m2 = re.findall(ur'<h3 class="store-name">(.*?)</h3>', sub_html) if len(m2) > 0: store[cm.name_e] = cm.reformat_addr(m2[0]) m2 = re.findall(ur'<p class="store-address">(.*?)</p>', sub_html, re.S) if len(m2) > 0: store[cm.addr_e] = cm.reformat_addr(m2[0]) cm.update_entry( store, { cm.continent_e: data[cm.continent_e].strip().upper(), cm.country_e: data[cm.country_e].strip().upper(), cm.city_e: data[cm.city_e].strip().upper() }) entry = store gs.field_sense(entry) ret = gs.addr_sense(entry[cm.addr_e]) if ret[0] is not None and entry[cm.country_e] == '': entry[cm.country_e] = ret[0] if ret[1] is not None and entry[cm.province_e] == '': entry[cm.province_e] = ret[1] if ret[2] is not None and entry[cm.city_e] == '': entry[cm.city_e] = ret[2] gs.field_sense(entry) entry[cm.city_e] = cm.extract_city(entry[cm.city_e])[0] print '%s: Found store: %s, %s (%s, %s)' % ( brandname_e, store[cm.name_e], store[cm.addr_e], store[cm.country_e], store[cm.continent_e]) db.insert_record(store, 'stores') entries.append(store)
# 可能有多个门店,拆分 sub_html = [] for m in re.finditer(ur'<li\s+class\s*=\s*"boutique-info-cadre-\d+"\s*>', html): start = m.start() + len(m.group()) end = html.find('</li>', start) sub_html.append(html[start:end]) stores = [] # 针对每个门店: for s in sub_html: entry = cm.init_store_entry(brand_id, brandname_e, brandname_c) cm.update_entry( entry, { cm.url: url, cm.name_e: data['name'], cm.lat: data['lat'], cm.lng: data['lng'], cm.store_type: data['type'] }) for m in re.findall(ur'<p class="boutique-info-cadre-titre">(.*?)</p>', s): if len(m.strip()) >= 0: entry[cm.store_type] = m.strip() break for m in re.findall(ur'<p class="boutique-info-cadre-tel">(.*)</p>', s, re.S): if len(m.strip()) == 0: break for m1 in re.findall(ur'<span itemprop="telephone">(.*?)</span>', m): if len(m1.strip()) > 0:
def field_sense(entry):
    """
    Normalise the geographic fields of ``entry`` in place.

    City, province, country and continent names are replaced by the values
    ``look_up`` returns for them, and an empty ``cm.zip_code`` is filled from
    a postal-code-like number found in ``entry[cm.addr_e]``.

    The second argument of ``look_up`` appears to select the level being
    searched (0 continent, 1 country, 2 province, 3 city), judging by the
    calls below -- TODO confirm against ``look_up``.

    :param entry: store entry dict; modified in place.
    :return: None.
    """
    # Geo
    country = entry[cm.country_e]
    city = entry[cm.city_e]
    ret = look_up(city, 3)
    ret1 = look_up(country, 1)
    if ret1 is not None:
        country = ret1['name_e']
    # Accept the city record only when it belongs to the entry's country.
    if ret is not None and ret['country']['name_e'] == country:
        entry[cm.city_e] = ret['name_e']
        entry[cm.city_c] = ret['name_c']
        prov = ret['province']
        if prov != '':
            ret1 = look_up(prov['name_e'], 2)
            if ret1 is not None:
                entry[cm.province_e] = ret1['name_e']
                entry[cm.province_c] = ret1['name_c']

    province = entry[cm.province_e]
    ret = look_up(province, 2)
    if ret is not None:
        entry[cm.province_e] = ret['name_e']
        entry[cm.province_c] = ret['name_c']

    ret = look_up(country, 1)
    if ret is not None:
        cm.update_entry(entry, {cm.country_e: ret['name_e'], cm.country_c: ret['name_c']})
        # NOTE(review): ret1 is used without a None check; this assumes the
        # country's continent is always known to look_up.
        ret1 = look_up(ret['continent']['name_e'], 0)
        cm.update_entry(entry, {cm.continent_e: ret1['name_e'], cm.continent_c: ret1['name_c']})

        if entry[cm.zip_code] == '':
            m = None
            if ret['name_e'] == look_up(u'CHINA', 1)['name_e']:
                # Chinese postal code
                m = re.match(ur'.*\b(\d{6})\b', entry[cm.addr_e])
            elif ret['name_e'] == look_up(u'UNITED STATES', 1)['name_e']:
                # US postal code
                m = re.match(ur'.*\b(\d{5})\b', entry[cm.addr_e])
            elif ret['name_e'] == look_up(u'JAPAN', 1)['name_e']:
                # Japanese postal code
                m = re.match(ur'.*\b(\d{3}\-\d{4})\b', entry[cm.addr_e])
            if m is not None:
                entry[cm.zip_code] = m.group(1)

    cm.chn_check(entry)

    if entry[cm.zip_code] == '':
        # A number sitting next to a city or state name is probably a postal code.
        m = re.match(ur'.*\s+(\d{5,})\b', entry[cm.addr_e])
        if m is not None:
            # First look at the word that follows the number.
            tmp = entry[cm.addr_e][m.end() + 1:]
            terms = re.findall(ur'\b(\S+?)\b', tmp)
            if len(terms) > 0:
                if look_up(terms[0], 2) is not None or look_up(terms[0], 3) is not None:
                    entry[cm.zip_code] = m.group(1)
            else:
                # NOTE(review): the nesting of this else is inferred from a
                # whitespace-collapsed source -- confirm it belongs to the
                # "no following word" case.
                # Nothing follows the number: scan the reversed text before it
                # and test the nearest preceding word (un-reversed) instead.
                tmp = entry[cm.addr_e][m.end() - len(m.group(1)) - 1::-1]
                terms = re.findall(ur'\b(\S+?)\b', tmp)
                if len(terms) > 0:
                    if look_up(terms[0][::-1], 2) is not None or look_up(terms[0][::-1], 3) is not None:
                        entry[cm.zip_code] = m.group(1)
def fetch(level=1, data=None, user='******', passwd=''): """ :param level: :param data: :param user: :param passwd: :return: """ try: if data is None: data = {'url': url} html = common.get_data(data['url']) except Exception: print 'Error occured in getting data: %s' % url dump_data = {'level': 1, 'time': common.format_time(), 'data': {'data': url}, 'brand_id': brand_id} common.dump(dump_data) return [] db = common.StoresDb() db.connect_db(user=user, passwd=passwd) db.execute(u'DELETE FROM %s WHERE brand_id=%d' % ('stores', brand_id)) sub_pat = re.compile(ur'<!--.*?-->', re.S) html = re.sub(sub_pat, '', html) split_pos = [m.start() for m in re.finditer(ur'<p><span class="contactboldtitle">', html)] split_pos.append(-1) sub_list = [] for i in xrange(len(split_pos) - 1): sub_list.append(html[split_pos[i]:split_pos[i + 1]]) store_list = [] for sub_html in sub_list: entry = common.init_store_entry(brand_id, brandname_e, brandname_c) m = re.findall(ur'<span class="contactboldtitle">(.+?)</span>', sub_html) if len(m) > 0: entry[common.name_l] = m[0] m = re.findall(ur'<span class="storethinlines">(.+?)(?:</span>|</p>)', sub_html, re.S) if len(m) >= 2: addr = common.reformat_addr(m[0]) entry[common.addr_l] = addr # 城市,国家和邮编 addr_splits = addr.split(', ') ret = gs.look_up(addr_splits[-1], 1) if ret is None: print 'Error in geo translating: %s' % addr_splits[-1] else: entry[common.country_e]=ret['name_e'] m1 = re.findall(ur'(.+?)(\d{3}-\d{4})', addr_splits[-2]) if len(m1) > 0: common.update_entry(entry, {common.city_e: common.extract_city(m1[0][0])[0], common.zip_code: m1[0][1]}) # 联系方式 tmp = m[1] m1 = re.findall(ur'[\d\-]{5,}', tmp) if len(m1) > 0: entry[common.tel] = m1[0] m1 = re.findall(ur'href="mailto:(.+?@.+?)"', tmp) if len(m1) > 0: entry[common.email] = m1[0].strip() gs.field_sense(entry) print '%s: Found store: %s, %s (%s, %s)' % ( brandname_e, entry[common.name_l], entry[common.addr_l], entry[common.country_e], entry[common.continent_e]) db.insert_record(entry, 
'stores') store_list.append(entry) db.disconnect_db()
def f(m): store_name = m[0].strip() addr_str = m[1].strip() spl = addr_str.split('<br/>') store_type = cm.html2plain(spl[0].strip()) store_addr = spl[1].strip() hour_idx = 2 store_tel = '' for i in xrange(2, len(spl)): # If this is not a phone number: tel = cm.extract_tel(spl[i]) if tel == '': store_addr += ', ' + spl[i] hour_idx = i + 1 else: store_tel = spl[i].strip() hour_idx = i + 1 break if hour_idx < len(spl): store_hour = cm.html2plain(', '.join(spl[hour_idx:])).strip() else: store_hour = '' # store_addr = cm.reformat_addr('\r\n'.join([val.strip() for val in spl[1:-3]])) store_addr = cm.reformat_addr(store_addr) store_entry = cm.init_store_entry(brand_id, brandname_e, brandname_c) cm.update_entry( store_entry, { cm.continent_e: opt[cm.continent_e].strip().upper(), cm.city_e: opt[cm.city_e].strip().upper(), cm.country_e: opt[cm.country_e].strip().upper(), cm.name_e: cm.name_e, cm.addr_e: store_addr, cm.store_type: store_type, cm.hours: store_hour, cm.tel: store_tel }) if opt.has_key(cm.province_e): store_entry[cm.province_e] = opt[cm.province_e] else: store_entry[cm.province_e] = '' store_entry[cm.city_e] = cm.extract_city(store_entry[cm.city_e])[0] gs.field_sense(store_entry) ret = gs.addr_sense(store_entry[cm.addr_e], store_entry[cm.country_e]) if ret[1] is not None and store_entry[cm.province_e] == '': store_entry[cm.province_e] = ret[1] if ret[2] is not None and store_entry[cm.city_e] == '': store_entry[cm.city_e] = ret[2] gs.field_sense(store_entry) print '%s Found store: %s, %s (%s, %s)' % ( brandname_e, store_entry[cm.name_e], store_entry[cm.addr_e], store_entry[cm.country_e], store_entry[cm.continent_e]) db.insert_record(store_entry, 'stores') return store_entry
def fetch_stores(data): url = data['url'] try: html, cookie_map = cm.get_data_cookie(url) except Exception: print 'Error occured in getting country list: %s' % url dump_data = { 'level': 1, 'time': cm.format_time(), 'data': { 'url': url }, 'brand_id': data['brand_id'] } cm.dump(dump_data) return [] print 'SLEEPING>>>>' time.sleep(5) m = re.search( 'http://www.ninewest.com/on/demandware.store/Sites-ninewest-Site/default/Stores-Find/C\d{10}', html) if m is None: return [] url = m.group(0) cookie_map_new = {} for key in cookie_map: if 'dwpersonalization_' in key or key == 'sr_token': continue cookie_map_new[key] = cookie_map[key] cookie_map_new['invited_visitor_22225'] = '1' cookie_map = cookie_map_new try: html = cm.post_data(url, { 'dwfrm_storelocator_startaddress': 'kingman', 'dwfrm_storelocator_maxDistance': 30.00, 'dwfrm_storelocator_outlet': 'true', 'dwfrm_storelocator_retail': 'true', 'dwfrm_storelocator_optical': 'true', 'dwfrm_storelocator_eyewear': 'true', 'dwfrm_storelocator_apparel': 'true', 'dwfrm_storelocator_attire': 'true', 'dwfrm_storelocator_department': 'true', 'dwfrm_storelocator_IsMensFootwear': 'true', 'dwfrm_storelocator_IsRRR': 'true', 'dwfrm_storelocator_IsRRNY': 'true', 'dwfrm_storelocator_IsRRS': 'true', 'dwfrm_storelocator_wholesale': 'true', 'dwfrm_storelocator_bba': 'true', 'dwfrm_storelocator_ba': 'true', 'dwfrm_storelocator_search.x': 0, 'dwfrm_storelocator_search.y': 0, 'dwfrm_storelocator_countryCode': 'US', 'dwfrm_storelocator_postalCode': '67068', 'dwfrm_storelocator_distanceUnit': 'mi', 'dwfrm_storelocator_long': -98.117208, 'dwfrm_storelocator_lat': 37.647131, }, cookie=cookie_map) except Exception: print 'Error occured in getting country list: %s' % url dump_data = { 'level': 1, 'time': cm.format_time(), 'data': { 'url': url }, 'brand_id': data['brand_id'] } cm.dump(dump_data) return [] store_list = [] for m1 in re.finditer(ur'<div class="storeColumnOne">', html): sub, start, end = cm.extract_closure(html[m1.start():], 
ur'<div\b', ur'</div>') if end == 0: continue entry = cm.init_store_entry(data['brand_id'], data['brandname_e'], data['brandname_c']) m2 = re.search(ur'<div class="storename">([^<>]+)</div>', sub) if m2 is not None: entry[cm.name_e] = m2.group(1).strip() addr_list = [ m2 for m2 in re.findall( ur'<div class="adddressline">([^<>]+)</div>', sub) ] entry[cm.addr_e] = ', '.join(addr_list) m2 = re.search(ur'<div class="citystatezip">([^<>]+)</div>', sub) if m2 is not None: tmp = cm.reformat_addr(m2.group(1)) terms = re.split('[, ]+', tmp) if len(terms) < 3: entry[cm.addr_e] = tmp else: ret = gs.look_up(terms[0], 3) if ret is not None: entry[cm.city_e] = ret['name_e'] else: entry[cm.city_e] = terms[0].strip().upper() ret = gs.look_up(terms[1], 2) if ret is not None: entry[cm.province_e] = ret['name_e'] else: entry[cm.province_e] = terms[0].strip().upper() if re.match('\s*\d{5,}\s*', terms[2]) is not None: entry[cm.zip_code] = terms[2].strip() m2 = re.search(ur'<div class="storephone">([^<>]+)</div>', sub) if m2 is not None: entry[cm.tel] = m2.group(1) cm.update_entry(entry, { 'country_e': 'UNITED STATES', 'continent_e': 'NORTH AMERICA' }) gs.field_sense(entry) print '(%s / %d) Found store: %s, %s (%s, %s)' % ( data['brandname_e'], data['brand_id'], entry[cm.name_e], entry[cm.addr_e], entry[cm.country_e], entry[cm.continent_e]) store_list.append(entry) db.insert_record(entry, 'stores')
def get_stores(data): url = data["url"] print "Trying to get stores for %s" % data["name"] try: html = common.get_data(url) except Exception: print "Error occured: %s" % url dump_data = {"level": 1, "time": common.format_time(), "data": {"url": url}, "brand_id": brand_id} common.dump(dump_data) return [] start = 0 store_list = [] while True: start = html.find('<li class="info-store clearfix">', start) if start == -1: break end = html.find('<li class="info-store clearfix">', start + 1) sub_html = html[start:end] start = end entry = common.init_store_entry(brand_id, brandname_e, brandname_c) for m in re.findall(r'<h1><a href="(.*?)">(.*?)</a>', sub_html): entry[common.url] = host + m[0] entry[common.name_e] = common.html2plain(m[1].strip()) break for m in re.findall( r'<span style="display:none" class="ll">\s*(-?\d+\.\d+),\s*(-?\d+\.\d+)\s*</span>', sub_html ): common.update_entry(entry, {common.lat: string.atof(m[0]), common.lng: string.atof(m[1])}) break for m in re.findall(r'<span class="map-address">(.*?)</span>', sub_html): entry[common.addr_e] = common.reformat_addr(m) break for m in re.findall(r'<span class="type">phone:</span>(.*?)<br />', sub_html): entry[common.tel] = m.strip() break for m in re.findall(r'<a class="email" href="mailto:(.*?@.*?)">', sub_html): entry[common.email] = m.strip() break opening_s = sub_html.find('<ul class="opening-hours') if opening_s != -1: opening_e = sub_html.find("</ul>", opening_s) o_str = sub_html[opening_s:opening_e] entry[common.hours] = ", ".join([m for m in re.findall(r"<li>(.+?)</li>", o_str)]) brand_s = sub_html.find('<ul class="brands clearfix">') if brand_s != -1: brand_e = sub_html.find("</ul>", brand_s) b_str = sub_html[brand_s:brand_e] entry[common.store_type] = ", ".join( [common.html2plain(m) for m in re.findall(r'<li><a href=".*?">(.+?)</a></li>', b_str)] ) # Geo if "state" in data: entry[common.province_e] = data["state"] country_e = data["name"].strip().upper() entry[common.country_e] = country_e 
gs.field_sense(entry) print "%s Found store: %s, %s (%s, %s)" % ( brandname_e, entry[common.name_e], entry[common.addr_e], entry[common.country_e], entry[common.continent_e], ) db.insert_record(entry, "stores") store_list.append(entry) return store_list
def get_store_details(data): url = data['url'] try: html = cm.post_data( url, { 'country': data['country_id'], 'city': data['city_id'], 'recordid': data['store_id'] }) except Exception: print 'Error occured: %s' % url dump_data = { 'level': 1, 'time': cm.format_time(), 'data': { 'url': url }, 'brand_id': brand_id } cm.dump(dump_data) return [] entry = cm.init_store_entry(brand_id, brandname_e, brandname_c) info = json.loads(html)['elements'] addr = cm.reformat_addr(info['address'].replace('\\', '').replace( '<p>', ',').replace('</p>', ',')) # 第一行为商店名称 terms = addr.split(',') if len(terms) > 0: entry[cm.name_e] = cm.reformat_addr(terms[0]) entry[cm.addr_e] = addr gmap_url = info['gmap'] m = re.findall(ur'(-?\d+\.\d+),(-?\d+\.\d+)', gmap_url) if len(m) > 0: cm.update_entry(entry, { cm.lat: string.atof(m[0][0]), cm.lng: string.atof(m[0][1]) }) entry[cm.url] = info['shareurl'].replace('\\', '') entry[cm.hours] = info['openingtimes'] entry[cm.comments] = info['other'] # Geo country = data['country'] city = data['city'] cm.update_entry(entry, {cm.country_e: country, cm.city_e: city}) entry[cm.city_e] = cm.extract_city(entry[cm.city_e])[0] gs.field_sense(entry) ret = gs.addr_sense(entry[cm.addr_e], entry[cm.country_e]) if ret[1] is not None and entry[cm.province_e] == '': entry[cm.province_e] = ret[1] if ret[2] is not None and entry[cm.city_e] == '': entry[cm.city_e] = ret[2] gs.field_sense(entry) print '(%s / %d) Found store: %s, %s (%s, %s)' % ( brandname_e, brand_id, entry[cm.name_e], entry[cm.addr_e], entry[cm.country_e], entry[cm.continent_e]) db.insert_record(entry, 'stores') return entry
def get_store_details(data): url = data['url'] try: html = cm.get_data(url) except Exception: print 'Error occured: %s' % url dump_data = { 'level': 1, 'time': cm.format_time(), 'data': { 'url': url }, 'brand_id': brand_id } cm.dump(dump_data) return [] entry = cm.init_store_entry(brand_id, brandname_e, brandname_c) entry[cm.name_e] = data['name'] entry[cm.url] = data['url'] start = html.find(ur'<div class="storelocator-breadcrumbs">') if start == -1: return [] sub, start, end = cm.extract_closure(html[start:], ur'<ul>', ur'</ul>') if end == 0: return [] # 最后一个<li>...</li> m = re.findall(ur'<li>(.+?)</li>', sub, re.S) if len(m) > 0: entry[cm.addr_e] = cm.reformat_addr(m[-1]) # 经纬度 m = re.findall( ur'position: new google.maps.LatLng\((-?\d+\.\d+).*?(-?\d+\.\d+)\)', html) if len(m) > 0: cm.update_entry(entry, { cm.lat: string.atof(m[0][0]), cm.lng: string.atof(m[0][1]) }) m = re.search(ur'<div class="contact right">(.+?)</div>', html, re.S) if m is not None: contact_sub = m.group(1) pat_tel = re.compile(ur'<p class="phone">(.+?)</p>') m1 = re.search(pat_tel, contact_sub) if m1: entry[cm.tel] = cm.extract_tel(m1.group(1)) contact_sub = re.sub(pat_tel, '', contact_sub) hours_list = [ tmp.strip() for tmp in cm.reformat_addr(contact_sub).split(',') ] if 'opening hours' in hours_list[0].lower(): del hours_list[0] entry[cm.hours] = ', '.join(hours_list) # Geo country = data['country'] city = data['city'] cm.update_entry(entry, {cm.country_e: country, cm.city_e: city}) gs.field_sense(entry) ret = gs.addr_sense(entry[cm.addr_e], entry[cm.country_e]) if ret[1] is not None and entry[cm.province_e] == '': entry[cm.province_e] = ret[1] if ret[2] is not None and entry[cm.city_e] == '': entry[cm.city_e] = ret[2] gs.field_sense(entry) print '(%s / %d) Found store: %s, %s (%s, %s)' % ( brandname_e, brand_id, entry[cm.name_e], entry[cm.addr_e], entry[cm.country_e], entry[cm.continent_e]) db.insert_record(entry, 'stores') return entry
def fetch(level=1, data=None, host='localhost', port=3306, user='******', passwd='123456'): tot = 0 start = 0 store_list = [] data = {'q': '*:*', 'pt': '0,0', 'd': 100000, 'start': 0, 'rows': 100} # data = {'q': '*:*', 'pt': '36.778261,-119.417932', 'd': 50, 'start': 0, 'rows': 100} db = cm.StoresDb() db.connect_db(host=host, port=port, user=user, passwd=passwd, db='brand_stores') db.execute(u'DELETE FROM %s WHERE brand_id=%d' % ('stores', brand_id)) while True: cm.dump('Fetching from %d' % start, 'triumph_log.txt') try: data['start'] = start html = cm.get_data(url, data) raw_list = json.loads(html) if tot == 0: tot = raw_list['response']['numFound'] cm.dump('Found: %d' % tot, 'triumph_log.txt') raw_list = raw_list['response']['docs'] except Exception: cm.dump('Error occured while fetching from %d' % data['start'], 'triumph_log.txt') dump_data = {'level': 1, 'time': cm.format_time(), 'data': {'url': url}, 'brand_id': brand_id} cm.dump(dump_data) return [] idx = 0 if len(raw_list) < data['rows'] and start + len(raw_list) < tot: cm.dump('Cooling down...', 'triumph_log.txt') time.sleep(5) continue for v in raw_list: entry = cm.init_store_entry(brand_id, brandname_e, brandname_c) cm.update_entry(entry, {cm.store_type: v['class'], cm.zip_code: v['zip'], cm.tel: v['phone'], cm.fax: v['fax'], cm.url: v['web'], cm.email: v['email'], cm.hours: v['opening_hours']}) entry[cm.name_e] = cm.reformat_addr(v['name']) entry[cm.city_e], tmp = cm.extract_city(v['city']) if not re.search(ur'\d', entry[cm.zip_code]) and tmp != '': entry[cm.zip_code] = tmp if v['location'] != '': terms = v['location'].split(',') cm.update_entry(entry, {cm.lat: string.atof(terms[0]), cm.lng: string.atof(terms[1])}) addr = v['address'] if v['address2'] != '': addr += ', ' + v['address2'] entry[cm.addr_e] = cm.reformat_addr(addr) ret = gs.look_up(v['country'], 1) if ret is not None: entry[cm.country_e] = ret['name_e'] else: cm.dump('Error in looking up country %s' % v['country'], 'triumph_log.txt') 
gs.field_sense(entry) cm.dump('(%s / %d) Found store at %d: %s, %s (%s, %s, %s)' % ( brandname_e, brand_id, start + idx, entry[cm.name_e], entry[cm.addr_e], entry[cm.city_e], entry[cm.country_e], entry[cm.continent_e]), 'triumph_log.txt') store_list.append(entry) db.insert_record(entry, 'stores') idx += 1 if tot - start <= len(raw_list): break else: start += len(raw_list)
def get_stores(url, data): """ 从json对象中获得商店信息 """ opener = urllib2.build_opener() opener.addheaders = [("User-Agent", "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko)" "Chrome/27.0.1453.94 Safari/537.36"), ('Accept', '*/*'), ('X-Requested-With', 'XMLHttpRequest'), ('Connection', 'keep-alive')] response = opener.open(url) html = response.read().encode('utf-8') jsonobj = json.loads(html) stores = jsonobj[u'Stores'][u'Items'] region_list = jsonobj['Regions'] region_id = jsonobj['Region'] region = '' if len(region_list) > 0 and region_id != 0: for val in region_list: if val['RegionId'] == region_id: region = val['Name'] break country = jsonobj['CurrentCountry']['Name'] store_list = [] for s in stores: # print('Found store: %s, %s. Tel: %s, lat=%s, lng=%s' % ( # s['Name'], s['Address'], s['Phone'], s['Latitude'], s['Longitude'])) store_type = [''] # Some stores may have varioius store types if len(s['StoreTypes']) > 0: store_type = list(val['Name'] for val in s['StoreTypes']) if s['Url'] is not None: url = s['Url'] else: url = '' if s['ZipCode'] is not None and not s['ZipCode'].__eq__(''): zip = s['ZipCode'] else: zip = '' local_addr = s['Address'] if local_addr[-1] == '.': local_addr = local_addr[:-1] if not zip.__eq__(''): addr = u'%s, %s, %s' % (local_addr, s['City'], zip) else: addr = u'%s, %s' % (local_addr, s['City']) if region.__eq__(''): addr = u'%s, %s' % (addr, country) else: addr = u'%s, %s, %s' % (addr, region, country) for t in store_type: entry = cm.init_store_entry(brand_id, brandname_e, brandname_c) cm.update_entry(entry, {'addr_e': addr, 'country_e': country, 'city_e': s['City'], 'comments': s['Comments'], 'province_e': region.strip().upper(), 'zip': zip, 'email': s['Email'], 'fax': s['Fax'], 'lat': s['Latitude'], 'lng': s['Longitude'], 'name_e': s['Name'], 'tel': s['Phone'], 'store_type': t, 'url': url}) gs.field_sense(entry) print '%s Found store: %s, %s (%s, %s)' % ( brandname_e, entry[cm.name_e], entry[cm.addr_e], 
entry[cm.country_e], entry[cm.continent_e]) db.insert_record(entry, 'stores') store_list.append(entry) return store_list
def fetch_stores(data): url = data['url'] try: html, cookie_map = cm.get_data_cookie(url) except Exception: print 'Error occured in getting country list: %s' % url dump_data = {'level': 1, 'time': cm.format_time(), 'data': {'url': url}, 'brand_id': data['brand_id']} cm.dump(dump_data) return [] print 'SLEEPING>>>>' time.sleep(5) m = re.search('http://www.ninewest.com/on/demandware.store/Sites-ninewest-Site/default/Stores-Find/C\d{10}', html) if m is None: return [] url = m.group(0) cookie_map_new = {} for key in cookie_map: if 'dwpersonalization_' in key or key == 'sr_token': continue cookie_map_new[key] = cookie_map[key] cookie_map_new['invited_visitor_22225'] = '1' cookie_map = cookie_map_new try: html = cm.post_data(url, {'dwfrm_storelocator_startaddress': 'kingman', 'dwfrm_storelocator_maxDistance': 30.00, 'dwfrm_storelocator_outlet': 'true', 'dwfrm_storelocator_retail': 'true', 'dwfrm_storelocator_optical': 'true', 'dwfrm_storelocator_eyewear': 'true', 'dwfrm_storelocator_apparel': 'true', 'dwfrm_storelocator_attire': 'true', 'dwfrm_storelocator_department': 'true', 'dwfrm_storelocator_IsMensFootwear': 'true', 'dwfrm_storelocator_IsRRR': 'true', 'dwfrm_storelocator_IsRRNY': 'true', 'dwfrm_storelocator_IsRRS': 'true', 'dwfrm_storelocator_wholesale': 'true', 'dwfrm_storelocator_bba': 'true', 'dwfrm_storelocator_ba': 'true', 'dwfrm_storelocator_search.x': 0, 'dwfrm_storelocator_search.y': 0, 'dwfrm_storelocator_countryCode': 'US', 'dwfrm_storelocator_postalCode': '67068', 'dwfrm_storelocator_distanceUnit': 'mi', 'dwfrm_storelocator_long': -98.117208, 'dwfrm_storelocator_lat': 37.647131,}, cookie=cookie_map) except Exception: print 'Error occured in getting country list: %s' % url dump_data = {'level': 1, 'time': cm.format_time(), 'data': {'url': url}, 'brand_id': data['brand_id']} cm.dump(dump_data) return [] store_list = [] for m1 in re.finditer(ur'<div class="storeColumnOne">', html): sub, start, end = cm.extract_closure(html[m1.start():], ur'<div\b', 
ur'</div>') if end == 0: continue entry = cm.init_store_entry(data['brand_id'], data['brandname_e'], data['brandname_c']) m2 = re.search(ur'<div class="storename">([^<>]+)</div>', sub) if m2 is not None: entry[cm.name_e] = m2.group(1).strip() addr_list = [m2 for m2 in re.findall(ur'<div class="adddressline">([^<>]+)</div>', sub)] entry[cm.addr_e] = ', '.join(addr_list) m2 = re.search(ur'<div class="citystatezip">([^<>]+)</div>', sub) if m2 is not None: tmp = cm.reformat_addr(m2.group(1)) terms = re.split('[, ]+', tmp) if len(terms) < 3: entry[cm.addr_e] = tmp else: ret = gs.look_up(terms[0], 3) if ret is not None: entry[cm.city_e] = ret['name_e'] else: entry[cm.city_e] = terms[0].strip().upper() ret = gs.look_up(terms[1], 2) if ret is not None: entry[cm.province_e] = ret['name_e'] else: entry[cm.province_e] = terms[0].strip().upper() if re.match('\s*\d{5,}\s*', terms[2]) is not None: entry[cm.zip_code] = terms[2].strip() m2 = re.search(ur'<div class="storephone">([^<>]+)</div>', sub) if m2 is not None: entry[cm.tel] = m2.group(1) cm.update_entry(entry, {'country_e': 'UNITED STATES', 'continent_e': 'NORTH AMERICA'}) gs.field_sense(entry) print '(%s / %d) Found store: %s, %s (%s, %s)' % (data['brandname_e'], data['brand_id'], entry[cm.name_e], entry[cm.addr_e], entry[cm.country_e], entry[cm.continent_e]) store_list.append(entry) db.insert_record(entry, 'stores')
def fetch_stores(data): url = data['store_url'] try: body = cm.post_data( url, { 'continent': data['continent'], 'country': data['country'], 'city': data['city'], 'send': 1, 'page': 0 }) except Exception: print 'Error occured: %s' % url dump_data = { 'level': 0, 'time': cm.format_time(), 'data': { 'url': url }, 'brand_id': data['brand_id'] } cm.dump(dump_data) return [] store_list = [] for m in re.finditer(ur'<div class="shop">', body): entry = cm.init_store_entry(data['brand_id'], data['brandname_e'], data['brandname_c']) sub, start, end = cm.extract_closure(body[m.end():], ur'<div\b', ur'</div>') if end == 0: continue m1 = re.search(ur'<h3>\s*(.+?)\s*</h3>', sub, re.S) if m1 is not None: entry[cm.name_e] = m1.group(1) m1 = re.search(ur'<p[^>]*>(.+?)</p>', sub, re.S) if m1 is not None: entry[cm.store_type] = re.sub(re.compile(ur'\s*\+\s*', re.S), ', ', m1.group(1).strip()) addr_sub, start, end = cm.extract_closure(sub, ur'<ul\b', ur'</ul>') if end != 0: tmp = re.findall(ur'<li>\s*(.+?)\s*</li>', addr_sub) addr_list = [] if len(tmp) >= 3: entry[cm.tel] = tmp[-1].strip() del tmp[-1] for term in tmp: term = cm.html2plain(term).strip() if term != '': addr_list.append(term) entry[cm.addr_e] = ', '.join(addr_list) start = sub.lower().find(ur'opening hours') if start != -1: opening_sub, start, end = cm.extract_closure( sub[start:], ur'<ul\b', ur'</ul>') tmp = re.findall(ur'<li>\s*(.+?)\s*</li>', opening_sub) opening_list = [] for term in tmp: term = cm.html2plain(term).strip() if term != '': opening_list.append(term) entry[cm.hours] = ', '.join(opening_list) cm.update_entry( entry, { cm.continent_e: data['continent'].strip().upper(), cm.country_e: data['country'].strip().upper() }) entry[cm.city_e] = cm.extract_city(data['city'])[0] gs.field_sense(entry) ret = gs.addr_sense(entry[cm.addr_e], entry[cm.country_e]) if ret[1] is not None and entry[cm.province_e] == '': entry[cm.province_e] = ret[1] if ret[2] is not None and entry[cm.city_e] == '': entry[cm.city_e] = ret[2] 
gs.field_sense(entry) print '(%s / %d) Found store: %s, %s (%s, %s)' % ( data['brandname_e'], data['brand_id'], entry[cm.name_e], entry[cm.addr_e], entry[cm.country_e], entry[cm.continent_e]) store_list.append(entry) db.insert_record(entry, 'stores')
def fetch_stores(data): url = data['url'] try: html = cm.get_data(url) except Exception: print 'Error occured: %s' % url dump_data = {'level': 0, 'time': cm.format_time(), 'data': {'url': url}, 'brand_id': data['brand_id']} cm.dump(dump_data) return [] province_list = [{cm.province_c: m[1].strip().upper(), cm.url: m[0].strip()} for m in re.findall(ur'<li><a href="#(fragment-\d+)"><span>(.+?)</span></a></li>', html)] comment_pat = re.compile(ur'<!--.*?-->', re.S) store_list = [] for p in province_list: start = html.find('<div id="%s">' % p[cm.url]) if start == -1: continue p_sub, start, end = cm.extract_closure(html[start:], ur'<tbody>', ur'</tbody>') p_sub = re.sub(comment_pat, '', p_sub) city_c = '' city_e = '' while True: s_sub, start, end = cm.extract_closure(p_sub, ur'<tr>', ur'</tr>') if end == 0: break p_sub = p_sub[end:] if u'城市' in s_sub and u'店铺名称' in s_sub: continue term_list = re.findall(ur'<td.*?>(.+?)</td>', s_sub) if len(term_list) < 3: continue entry = cm.init_store_entry(data['brand_id'], data['brandname_e'], data['brandname_c']) if len(term_list) == 4: city_c = term_list[0].strip() ret = gs.look_up(city_c, 3) if ret is not None: city_e = ret['name_e'] city_c = ret['name_c'] offset = 1 else: offset = 0 entry[cm.name_c] = cm.html2plain(term_list[offset + 0]).strip() entry[cm.tel] = cm.html2plain(term_list[offset + 1]).strip() entry[cm.addr_e] = cm.reformat_addr(term_list[offset + 2]).strip() entry[cm.country_e] = 'CHINA' entry[cm.continent_e] = 'ASIA' p_name_c = p[cm.province_c] p_name_e = '' ret = gs.look_up(p_name_c, 2) if ret is not None: p_name_c = ret['name_c'] p_name_e = ret['name_e'] cm.update_entry(entry, {cm.province_e: p_name_e, cm.province_c: p_name_c, cm.city_e: city_e, cm.city_c: city_c}) entry[cm.city_e] = cm.extract_city(entry[cm.city_e])[0] gs.field_sense(entry) ret = gs.addr_sense(entry[cm.addr_e], entry[cm.country_e]) if ret[1] is not None and entry[cm.province_e] == '': entry[cm.province_e] = ret[1] if ret[2] is not None and 
entry[cm.city_e] == '': entry[cm.city_e] = ret[2] gs.field_sense(entry) print '(%s / %d) Found store: %s, %s (%s, %s)' % (data['brandname_e'], data['brand_id'], entry[cm.name_e], entry[cm.addr_e], entry[cm.country_e], entry[cm.continent_e]) store_list.append(entry) db.insert_record(entry, 'stores') return store_list
break for m1 in re.findall(ur'<div class="store-tel">(.+?)</div>', sub_html, re.S): entry[common.tel] = common.extract_tel(m1) break for m1 in re.findall(ur'<div class="store-opening-hour">\s*?(?:Opening Hours:)?(.+?)</div>', sub_html, re.S): entry[common.hours] = common.reformat_addr(m1) break m1 = re.findall(ur'href="/(.+?)" title="View on map"', sub_html) if len(m1) > 0: entry[common.url] = host + '/' + m1[0] lat, lng = get_coordinates(entry[common.url]) common.update_entry(entry, {common.lat: lat, common.lng: lng}) # geo city_e = cities[city_id]['name'].strip() country_e = cities[city_id]['country']['name'].strip().upper() continent_e = cities[city_id]['country']['continent'].strip().upper() common.update_entry(entry, {common.city_e: common.extract_city(city_e)[0], common.country_e: country_e, common.continent_e: continent_e}) gs.field_sense(entry) # ret = common.geo_translate(country_e.strip()) # if len(ret) > 0: # common.update_entry(entry, {common.continent_c: ret[common.continent_c], # common.continent_e: ret[common.continent_e], # common.country_c: ret[common.country_c], # common.country_e: ret[common.country_e]})
def field_sense(entry): # Geo country = entry[cm.country_e] city = entry[cm.city_e] ret = look_up(city, 3) ret1 = look_up(country, 1) if ret1 is not None: country = ret1['name_e'] if ret is not None and ret['country']['name_e'] == country: entry[cm.city_e] = ret['name_e'] entry[cm.city_c] = ret['name_c'] prov = ret['province'] if prov != '': ret1 = look_up(prov['name_e'], 2) if ret1 is not None: entry[cm.province_e] = ret1['name_e'] entry[cm.province_c] = ret1['name_c'] province = entry[cm.province_e] ret = look_up(province, 2) if ret is not None: entry[cm.province_e] = ret['name_e'] entry[cm.province_c] = ret['name_c'] ret = look_up(country, 1) if ret is not None: cm.update_entry(entry, { cm.country_e: ret['name_e'], cm.country_c: ret['name_c'] }) ret1 = look_up(ret['continent']['name_e'], 0) cm.update_entry(entry, { cm.continent_e: ret1['name_e'], cm.continent_c: ret1['name_c'] }) if entry[cm.zip_code] == '': m = None if ret['name_e'] == look_up(u'CHINA', 1)['name_e']: # 中国邮编 m = re.match(ur'.*\b(\d{6})\b', entry[cm.addr_e]) elif ret['name_e'] == look_up(u'UNITED STATES', 1)['name_e']: # 美国邮编 m = re.match(ur'.*\b(\d{5})\b', entry[cm.addr_e]) elif ret['name_e'] == look_up(u'JAPAN', 1)['name_e']: # 日本邮编 m = re.match(ur'.*\b(\d{3}\-\d{4})\b', entry[cm.addr_e]) if m is not None: entry[cm.zip_code] = m.group(1) cm.chn_check(entry) if entry[cm.zip_code] == '': # 数字和城市,州一起,可能为邮编 m = re.match(ur'.*\s+(\d{5,})\b', entry[cm.addr_e]) if m is not None: tmp = entry[cm.addr_e][m.end() + 1:] terms = re.findall(ur'\b(\S+?)\b', tmp) if len(terms) > 0: if look_up(terms[0], 2) is not None or look_up(terms[0], 3) is not None: entry[cm.zip_code] = m.group(1) else: tmp = entry[cm.addr_e][m.end() - len(m.group(1)) - 1::-1] terms = re.findall(ur'\b(\S+?)\b', tmp) if len(terms) > 0: if look_up(terms[0][::-1], 2) is not None or look_up( terms[0][::-1], 3) is not None: entry[cm.zip_code] = m.group(1)