def fetch_store_detail(s, data, isOfficial=False):
    """Build one store entry from the raw store dict `s` plus crawl context `data`.

    :param s: raw store record (keys: name, city, address, email, phone, fax, lat, lng)
    :param data: crawl context with brand ids/names and fallback country/city
    :param isOfficial: selects 'Official Retailer' vs plain 'Retailer' store class
    :return: the populated entry dict
    """
    entry = cm.init_store_entry(data['brand_id'], data['brandname_e'], data['brandname_c'])
    entry[cm.name_e] = cm.html2plain(s['name']).strip()
    entry[cm.country_e] = data['country']
    # Prefer the city from the store record; fall back to the crawl context's city.
    val = cm.html2plain(s['city']).strip().upper()
    entry[cm.city_e] = cm.extract_city(val if val and val != '' else data['city'])[0]
    entry[cm.addr_e] = cm.html2plain(s['address']).strip()
    entry[cm.email] = s['email'].strip()
    entry[cm.tel] = s['phone'].strip()
    entry[cm.fax] = s['fax'].strip()
    entry[cm.store_class] = 'Official Retailer' if isOfficial else 'Retailer'
    # Coordinates are best-effort: malformed/missing values are logged, not fatal.
    try:
        entry[cm.lat] = string.atof(s['lat']) if s['lat'] != '' else ''
    except (ValueError, KeyError, TypeError) as e:
        cm.dump('Error in fetching lat: %s' % str(e), log_name)
    try:
        entry[cm.lng] = string.atof(s['lng']) if s['lng'] != '' else ''
    except (ValueError, KeyError, TypeError) as e:
        cm.dump('Error in fetching lng: %s' % str(e), log_name)
    gs.field_sense(entry)
    # Let the address parser fill in province/city only where still empty.
    ret = gs.addr_sense(entry[cm.addr_e], entry[cm.country_e])
    if ret[1] is not None and entry[cm.province_e] == '':
        entry[cm.province_e] = ret[1]
    if ret[2] is not None and entry[cm.city_e] == '':
        entry[cm.city_e] = ret[2]
    gs.field_sense(entry)
    return entry
def fetch_store_details(data): url = data['host'] + data['url'] try: body = cm.get_data(url) except Exception: cm.dump('Error in fetching store details: %s' % url, log_name) return [] entry = cm.init_store_entry(data['brand_id'], data['brandname_e'], data['brandname_c']) start = body.find(ur'<h3>available in store</h3>') if start != -1: type_sub = cm.extract_closure(body[start:], ur'<ul\b', ur'</ul>')[0] entry[cm.store_type] = ', '.join( cm.html2plain(tmp).strip() for tmp in re.findall(ur'<li[^<>]*>(.+?)</li>', type_sub, re.S)) start = body.find(ur"<div class='gmap_info_box'") if start == -1: cm.dump('Error in fetching store details: %s' % url, log_name) return [] body = cm.extract_closure(body[start:], ur'<div\b', ur'</div>')[0] raw = json.loads(cm.extract_closure(body, ur'\{', ur'\}')[0])['table'] entry[cm.name_e] = cm.html2plain(raw['name']) entry[cm.city_e] = data['city'].strip().upper() entry[cm.country_e] = data['country'].strip().upper() # entry[cm.store_type] = data['store_type'] entry[cm.addr_e] = cm.reformat_addr(raw['address']) m = re.search(re.compile(ur'phone:(.*?)fax:(.*?)', re.I | re.S), raw['phone']) if m is not None: entry[cm.tel] = m.group(1).strip() entry[cm.fax] = m.group(2).strip() else: m = re.search(re.compile(ur'phone:(.*?)', re.I | re.S), raw['phone']) if m is not None: entry[cm.tel] = m.group(1).strip() m = re.search(re.compile(ur'fax:(.*?)', re.I | re.S), raw['phone']) if m is not None: entry[cm.fax] = m.group(1).strip() entry[cm.hours] = raw['hours'] if raw['lat'] is not None and raw['lat'] != '': entry[cm.lat] = string.atof(raw['lat']) if raw['lng'] is not None and raw['lng'] != '': entry[cm.lat] = string.atof(raw['lng']) gs.field_sense(entry) ret = gs.addr_sense(entry[cm.addr_e], entry[cm.country_e]) if ret[1] is not None: entry[cm.province_e] = ret[1] gs.field_sense(entry) cm.dump( '(%s / %d) Found store: %s, %s (%s, %s)' % (data['brandname_e'], data['brand_id'], entry[cm.name_e], entry[cm.addr_e], entry[cm.country_e], 
entry[cm.continent_e]), log_name) db.insert_record(entry, 'stores') return [entry]
def func(s):
    """Process one raw store record `s`: build an entry, resolve its country,
    insert it, and register its native id in data['store_list'].

    Returns None; skips records already seen or whose country cannot be resolved.
    """
    entry = cm.init_store_entry(data['brand_id'], data['brandname_e'], data['brandname_c'])
    entry[cm.native_id] = int(s['id'])
    # Skip stores already processed in this run.
    if entry[cm.native_id] in data['store_list']:
        return
    entry[cm.lat] = float(s['latitude'])
    entry[cm.lng] = float(s['longitude'])
    entry[cm.email] = s['email']
    entry[cm.fax] = s['fax']
    entry[cm.store_class] = ' | '.join((str.format('ISCHANEL:{0}', 'YES' if s['ischanel'] != 0 else 'NO'),
                                        s['postypename']))
    # openinghours may be missing/not iterable; in that case leave hours unset.
    try:
        entry[cm.hours] = ' | '.join(
            map(lambda val: ':'.join((val['day'], val['opening'] if 'opening' in val else '')),
                s['openinghours']))
    except TypeError as e:
        pass
    entry[cm.tel] = s['phone']
    # Only the first translation block is used for the textual fields.
    trans = s['translations'][0]
    entry[cm.addr_e] = cm.html2plain(
        ', '.join(filter(lambda val: val, (trans[key] for key in ('address1', 'address2')))))
    entry[cm.city_e] = cm.html2plain(trans['cityname'].strip().upper())
    entry[cm.name_e] = cm.html2plain(trans['name'])
    entry[cm.province_e] = cm.html2plain(trans['statename']).strip().upper()
    entry[cm.store_type] = ', '.join(temp['name'] for temp in trans['products'])
    entry[cm.url] = s['website']
    entry[cm.zip_code] = s['zipcode']
    country_id = s['country_id'] if 'country_id' in s else None
    if country_id and country_id in data['country_map']:
        # Country already resolved for this id earlier in the run.
        entry[cm.country_e] = data['country_map'][country_id]
    else:
        # Resolve the country via reverse geocoding; fall back to asking the
        # operator interactively on the console.
        ret = gs.geocode2(latlng=str.format('{0},{1}', entry[cm.lat], entry[cm.lng]), logger=logger)
        country_e = None
        if len(ret) > 0:
            for item in ret[0]['address_components']:
                if 'country' in item['types']:
                    country_e = item['long_name'].strip().upper()
                    break
        if not country_e:
            country_e = raw_input(unicode.format(u'INPUT THE COUNTRY NAME FOR {0} AT {1}, {2}',
                                                 entry[cm.city_e], entry[cm.lat],
                                                 entry[cm.lng])).decode('utf-8')
        if not country_e:
            # Country could not be determined; give up on this record.
            return
        entry[cm.country_e] = country_e
        if country_id:
            # Cache the resolution so later stores with this id skip geocoding.
            data['country_map'][country_id] = country_e
    logger.info(('(%s / %d) Found store: %s, %s (%s, %s)' % (data['brandname_e'], data['brand_id'],
                                                             entry[cm.name_e], entry[cm.addr_e],
                                                             entry[cm.country_e], entry[cm.continent_e])))
    cm.insert_record(db, entry, data['table'])
    data['store_list'].add(entry[cm.native_id])
def f(m): store_name = m[0].strip() addr_str = m[1].strip() spl = addr_str.split('<br/>') store_type = cm.html2plain(spl[0].strip()) store_addr = spl[1].strip() hour_idx = 2 store_tel = '' for i in xrange(2, len(spl)): # If this is not a phone number: tel = cm.extract_tel(spl[i]) if tel == '': store_addr += ', ' + spl[i] hour_idx = i + 1 else: store_tel = spl[i].strip() hour_idx = i + 1 break if hour_idx < len(spl): store_hour = cm.html2plain(', '.join(spl[hour_idx:])).strip() else: store_hour = '' # store_addr = cm.reformat_addr('\r\n'.join([val.strip() for val in spl[1:-3]])) store_addr = cm.reformat_addr(store_addr) store_entry = cm.init_store_entry(brand_id, brandname_e, brandname_c) cm.update_entry(store_entry, {cm.continent_e: opt[cm.continent_e].strip().upper(), cm.city_e: opt[cm.city_e].strip().upper(), cm.country_e: opt[cm.country_e].strip().upper(), cm.name_e: cm.name_e, cm.addr_e: store_addr, cm.store_type: store_type, cm.hours: store_hour, cm.tel: store_tel}) if opt.has_key(cm.province_e): store_entry[cm.province_e] = opt[cm.province_e] else: store_entry[cm.province_e] = '' store_entry[cm.city_e] = cm.extract_city(store_entry[cm.city_e])[0] gs.field_sense(store_entry) ret = gs.addr_sense(store_entry[cm.addr_e], store_entry[cm.country_e]) if ret[1] is not None and store_entry[cm.province_e] == '': store_entry[cm.province_e] = ret[1] if ret[2] is not None and store_entry[cm.city_e] == '': store_entry[cm.city_e] = ret[2] gs.field_sense(store_entry) print '%s Found store: %s, %s (%s, %s)' % ( brandname_e, store_entry[cm.name_e], store_entry[cm.addr_e], store_entry[cm.country_e], store_entry[cm.continent_e]) db.insert_record(store_entry, 'stores') return store_entry
def fetch_store_details(data): url = data['host'] + data['url'] try: body = cm.get_data(url) except Exception: cm.dump('Error in fetching store details: %s' % url, log_name) return [] entry = cm.init_store_entry(data['brand_id'], data['brandname_e'], data['brandname_c']) start = body.find(ur'<h3>available in store</h3>') if start != -1: type_sub = cm.extract_closure(body[start:], ur'<ul\b', ur'</ul>')[0] entry[cm.store_type] = ', '.join( cm.html2plain(tmp).strip() for tmp in re.findall(ur'<li[^<>]*>(.+?)</li>', type_sub, re.S)) start = body.find(ur"<div class='gmap_info_box'") if start == -1: cm.dump('Error in fetching store details: %s' % url, log_name) return [] body = cm.extract_closure(body[start:], ur'<div\b', ur'</div>')[0] raw = json.loads(cm.extract_closure(body, ur'\{', ur'\}')[0])['table'] entry[cm.name_e] = cm.html2plain(raw['name']) entry[cm.city_e] = data['city'].strip().upper() entry[cm.country_e] = data['country'].strip().upper() # entry[cm.store_type] = data['store_type'] entry[cm.addr_e] = cm.reformat_addr(raw['address']) m = re.search(re.compile(ur'phone:(.*?)fax:(.*?)', re.I | re.S), raw['phone']) if m is not None: entry[cm.tel] = m.group(1).strip() entry[cm.fax] = m.group(2).strip() else: m = re.search(re.compile(ur'phone:(.*?)', re.I | re.S), raw['phone']) if m is not None: entry[cm.tel] = m.group(1).strip() m = re.search(re.compile(ur'fax:(.*?)', re.I | re.S), raw['phone']) if m is not None: entry[cm.fax] = m.group(1).strip() entry[cm.hours] = raw['hours'] if raw['lat'] is not None and raw['lat'] != '': entry[cm.lat] = string.atof(raw['lat']) if raw['lng'] is not None and raw['lng'] != '': entry[cm.lat] = string.atof(raw['lng']) gs.field_sense(entry) ret = gs.addr_sense(entry[cm.addr_e], entry[cm.country_e]) if ret[1] is not None: entry[cm.province_e] = ret[1] gs.field_sense(entry) cm.dump('(%s / %d) Found store: %s, %s (%s, %s)' % (data['brandname_e'], data['brand_id'], entry[cm.name_e], entry[cm.addr_e], entry[cm.country_e], 
entry[cm.continent_e]), log_name) db.insert_record(entry, 'stores') return [entry]
def fetch_hk(data): loc_list = ('Hong Kong', 'Kowloon', 'Macau', 'New Territories') url = 'http://levi.com.hk/hk/storelocator' store_list = [] for loc in loc_list: param = {'loc': loc} try: body = cm.get_data(url, param) except Exception, e: cm.dump('Error in fetching stores: %s' % param, log_name) continue start = body.find(ur'<div id="addWrapper">') if start == -1: cm.dump('Error in fetching stores: %s' % param, log_name) continue sub = cm.extract_closure(body[start:], ur'<ul>', ur'</ul>')[0] for s in re.findall(ur'<li>(.+?)</li>', sub, re.S): entry = cm.init_store_entry(data['brand_id'], data['brandname_e'], data['brandname_c']) entry[cm.country_e] = 'MACAU' if loc == 'Macau' else 'HONG KONG' entry[cm.city_e] = entry[cm.country_e] m = re.search(ur'<div id="addStore">([^<>]+)', s) entry[cm.addr_e] = cm.html2plain(m.group(1)) if m else '' m = re.search(ur'<div id="addAddress">([^<>]+)', s) tmp = cm.html2plain(m.group(1)) pat = re.compile(ur'business hours?\s*[:\.]?\s*', re.I) if re.search(pat, tmp): entry[cm.hours] = re.sub(pat, '', tmp).strip() m = re.search(ur'<div id="addPhone">([^<>]+)', s) tmp = cm.html2plain(m.group(1)) pat = re.compile(ur'(tel|phone|telephone)?\s*[:\.]?\s*', re.I) if re.search(pat, tmp): entry[cm.tel] = re.sub(pat, '', tmp).strip() gs.field_sense(entry) ret = gs.addr_sense(entry[cm.addr_e], entry[cm.country_e]) if ret[1] is not None and entry[cm.province_e] == '': entry[cm.province_e] = ret[1] if ret[2] is not None and entry[cm.city_e] == '': entry[cm.city_e] = ret[2] gs.field_sense(entry) cm.dump( '(%s / %d) Found store: %s, %s (%s, %s)' % (data['brandname_e'], data['brand_id'], entry[cm.name_e], entry[cm.addr_e], entry[cm.country_e], entry[cm.continent_e]), log_name) db.insert_record(entry, 'stores') store_list.append(entry)
def fetch_stores(data): url = data['data_url'] param = {'output': 'json', 'country': data['country_code'], 'brand': 'dkny'} page = 0 tot_page = -1 store_list = [] while True: page += 1 if tot_page != -1 and page > tot_page: break param['p'] = page try: body = cm.get_data(url, param) except Exception, e: cm.dump('Error in fetching stores: %s, %s' % (url, param), log_name) return () raw = json.loads(body) tot_page = raw['Stores']['TotalPages'] if data['country_code'] not in region_map: # 构造州列表 region_map[data['country_code']] = dict((item['RegionId'], item['Name']) for item in raw['Regions']) for s in raw['Stores']['Items']: entry = cm.init_store_entry(data['brand_id'], data['brandname_e'], data['brandname_c']) entry[cm.country_e] = data['country_code'].upper() entry[cm.city_e] = cm.extract_city(s['City'])[0] entry[cm.name_e] = cm.html2plain(s['Name']).strip() entry[cm.addr_e] = cm.reformat_addr(s['Address']) entry[cm.tel] = s['Phone'].strip() if s['Phone'] else '' entry[cm.fax] = s['Fax'].strip() if s['Fax'] else '' entry[cm.email] = s['Email'].strip() if s['Email'] else '' entry[cm.lat] = s['Latitude'] if s['Latitude'] else '' entry[cm.lng] = s['Longitude'] if s['Longitude'] else '' region_id = s['RegionId'] if region_id in region_map[data['country_code']]: entry[cm.province_e] = cm.html2plain(region_map[data['country_code']][region_id]).strip().upper() gs.field_sense(entry) ret = gs.addr_sense(entry[cm.addr_e], entry[cm.country_e]) if ret[1] is not None and entry[cm.province_e] == '': entry[cm.province_e] = ret[1] if ret[2] is not None and entry[cm.city_e] == '': entry[cm.city_e] = ret[2] gs.field_sense(entry) cm.dump('(%s / %d) Found store: %s, %s (%s, %s)' % (data['brandname_e'], data['brand_id'], entry[cm.name_e], entry[cm.addr_e], entry[cm.country_e], entry[cm.continent_e]), log_name) db.insert_record(entry, 'stores') store_list.append(entry)
def fetch_hk(data): loc_list = ('Hong Kong', 'Kowloon', 'Macau', 'New Territories') url = 'http://levi.com.hk/hk/storelocator' store_list = [] for loc in loc_list: param = {'loc': loc} try: body = cm.get_data(url, param) except Exception, e: cm.dump('Error in fetching stores: %s' % param, log_name) continue start = body.find(ur'<div id="addWrapper">') if start == -1: cm.dump('Error in fetching stores: %s' % param, log_name) continue sub = cm.extract_closure(body[start:], ur'<ul>', ur'</ul>')[0] for s in re.findall(ur'<li>(.+?)</li>', sub, re.S): entry = cm.init_store_entry(data['brand_id'], data['brandname_e'], data['brandname_c']) entry[cm.country_e] = 'MACAU' if loc == 'Macau' else 'HONG KONG' entry[cm.city_e] = entry[cm.country_e] m = re.search(ur'<div id="addStore">([^<>]+)', s) entry[cm.addr_e] = cm.html2plain(m.group(1)) if m else '' m = re.search(ur'<div id="addAddress">([^<>]+)', s) tmp = cm.html2plain(m.group(1)) pat = re.compile(ur'business hours?\s*[:\.]?\s*', re.I) if re.search(pat, tmp): entry[cm.hours] = re.sub(pat, '', tmp).strip() m = re.search(ur'<div id="addPhone">([^<>]+)', s) tmp = cm.html2plain(m.group(1)) pat = re.compile(ur'(tel|phone|telephone)?\s*[:\.]?\s*', re.I) if re.search(pat, tmp): entry[cm.tel] = re.sub(pat, '', tmp).strip() gs.field_sense(entry) ret = gs.addr_sense(entry[cm.addr_e], entry[cm.country_e]) if ret[1] is not None and entry[cm.province_e] == '': entry[cm.province_e] = ret[1] if ret[2] is not None and entry[cm.city_e] == '': entry[cm.city_e] = ret[2] gs.field_sense(entry) cm.dump('(%s / %d) Found store: %s, %s (%s, %s)' % (data['brandname_e'], data['brand_id'], entry[cm.name_e], entry[cm.addr_e], entry[cm.country_e], entry[cm.continent_e]), log_name) db.insert_record(entry, 'stores') store_list.append(entry)
def func(item):
    """Parse one PyQuery store item into an entry keyed by an md5 fingerprint.

    The fingerprint combines the page url with either the map link href or the
    address, so re-crawled stores are deduplicated via data['store_list'].
    Returns the inserted entry, or None for duplicates.
    """
    entry = cm.init_store_entry(data['brand_id'], data['brandname_e'], data['brandname_c'])
    entry[cm.name_e] = cm.html2plain(item('h6')[0].text).strip()
    addr_sub = unicode(pq(item('p')[0]))
    addr_list = [
        term.strip() for term in cm.reformat_addr(addr_sub).split(',')
    ]
    # If the last address segment is a phone number, move it to the tel field.
    tel = cm.extract_tel(addr_list[-1])
    if tel != '':
        entry[cm.tel] = tel
        del addr_list[-1]
    entry[cm.addr_e] = ', '.join(addr_list)
    temp = item('a.track_map[href]')
    # Fingerprint: url + map href when available, otherwise url + address.
    m = hashlib.md5()
    m.update(url)
    if len(temp) > 0:
        map_ref = temp[0].attrib['href']
        m.update(map_ref)
        # Keep the map query ('q=' parameter) for later geocoding.
        m_query = re.search(r'q=([^;]+?)&', cm.html2plain(map_ref))
        if m_query:
            query_parm = m_query.group(1).replace('+', ' ')
            entry['geo_query_param'] = query_parm
    else:
        m.update(entry[cm.addr_e])
    fingerprint = m.hexdigest()
    entry[cm.native_id] = fingerprint
    # Skip stores already processed in this run.
    if entry[cm.native_id] in data['store_list']:
        return
    entry[cm.country_e] = data['country']
    gs.field_sense(entry)
    # Let the address parser fill country/province/city only where still empty.
    ret = gs.addr_sense(entry[cm.addr_e])
    if ret[0] is not None and entry[cm.country_e] == '':
        entry[cm.country_e] = ret[0]
    if ret[1] is not None and entry[cm.province_e] == '':
        entry[cm.province_e] = ret[1]
    if ret[2] is not None and entry[cm.city_e] == '':
        entry[cm.city_e] = ret[2]
    gs.field_sense(entry)
    logger.info(('(%s / %d) Found store: %s, %s (%s, %s)' % (data['brandname_e'], data['brand_id'],
                                                             entry[cm.name_e], entry[cm.addr_e],
                                                             entry[cm.country_e],
                                                             entry[cm.continent_e])))
    cm.insert_record(db, entry, data['table'])
    return entry
def fetch_store_details(data):
    """Fetch one Oasis store's detail JSON and insert the resulting entry.

    Returns [entry], or [] when the endpoint cannot be reached.
    """
    # Detail endpoint example:
    # http://maps.oasis-stores.com/index-v2.php?coutnryISO=GB&brand=oasis&lat=51.42014&lng=-0.20954
    url = data['store_url']
    code = data['country_code']  # NOTE(review): unused local
    city = data['city_e']  # NOTE(review): unused local
    try:
        html = cm.get_data(url, {
            'latitude': data['lat'],
            'longitude': data['lng'],
            'brand': 'oasis'
        })
    except Exception:
        print 'Error occured: %s' % url
        dump_data = {
            'level': 0,
            'time': cm.format_time(),
            'data': {
                'url': url
            },
            'brand_id': data['brand_id']
        }
        cm.dump(dump_data)
        return []
    raw = json.loads(html)
    entry = cm.init_store_entry(data['brand_id'], data['brandname_e'], data['brandname_c'])
    entry[cm.name_e] = raw['name']
    # Join the non-empty address1..address3 fields.
    addr_list = []
    for i in xrange(1, 4):
        tmp = cm.html2plain(raw['address%d' % i]).strip()
        if tmp != '':
            addr_list.append(tmp)
    entry[cm.addr_e] = ', '.join(addr_list)
    # Province: later non-empty candidates (state, county) override earlier ones.
    state = raw['countryRegion']
    if state is not None and state.strip() != '':
        entry[cm.province_e] = state.strip().upper()
    state = raw['state']
    if state is not None and state.strip() != '':
        entry[cm.province_e] = state.strip().upper()
    state = raw['county']
    if state is not None and state.strip() != '':
        entry[cm.province_e] = state.strip().upper()
    entry[cm.zip_code] = raw['postcode']
    entry[cm.country_e] = data['country_e']
    entry[cm.city_e] = cm.extract_city(data['city_e'])[0]
    entry[cm.lat] = string.atof(data['lat'])
    entry[cm.lng] = string.atof(data['lng'])
    entry[cm.tel] = raw['phone']
    entry[cm.email] = raw['email']
    # Weekly opening hours come as one field per weekday.
    tmp = ['mon', 'tue', 'wed', 'thu', 'fri', 'sat', 'sun']
    entry[cm.hours] = ', '.join([raw[d + '_open_times'] for d in tmp])
    gs.field_sense(entry)
    print '(%s / %d) Found store: %s, %s (%s, %s)' % (
        data['brandname_e'], data['brand_id'], entry[cm.name_e], entry[cm.addr_e],
        entry[cm.country_e], entry[cm.continent_e])
    db.insert_record(entry, 'stores')
    return [entry]
def fetch_countries(data):
    """Collect per-country crawl contexts from the country-chooser page.

    Follows an 'Object moved' redirect when present. Returns a list of copies
    of `data` with country_e/province_e/url filled in; [] on failure.
    """
    url = data['url']
    try:
        html = cm.get_data(url)
    except Exception:
        print 'Error occured: %s' % url
        dump_data = {'level': 0, 'time': cm.format_time(), 'data': {'url': url},
                     'brand_id': data['brand_id']}
        cm.dump(dump_data)
        return []
    # Handle the redirect page (ASP.NET "Object moved" response).
    m = re.search('<h2>Object moved to <a href="(.+?)">', html)
    if m is not None:
        data['url'] = data['host'] + m.group(1)
        return fetch_countries(data)
    m = re.search('<span class="country">Choose a country</span>', html)
    if m is None:
        return []
    sub, start, end = cm.extract_closure(html[m.end():], r'<ul\b', r'</ul>')
    if end == 0:
        return []
    country_list = []
    for m in re.findall('<li><a .*?href="(.+?)">(.+?)</a></li>', sub):
        d = data.copy()
        country_e = cm.html2plain(m[1]).strip().upper()
        # Normalize the country name through the geo lookup table when known.
        ret = gs.look_up(country_e, 1)
        if ret is not None:
            country_e = ret['name_e']
        d['country_e'] = country_e
        d['province_e'] = ''
        d['url'] = data['host'] + m[0]
        country_list.append(d)
    return country_list
def func(item):
    """Turn one country link element into a crawl-context dict based on `data`."""
    node_id = item.attrib['id']
    d = data.copy()
    d.update({
        'node_id': node_id,
        # The node id doubles as the country id.
        'country_id': node_id,
        'country': cm.html2plain(item.text).strip().upper(),
        'url': d['host'] + item.attrib['href'],
    })
    return d
def fetch_stores(data): # <h2 property="dc:title" url = data[cm.url] try: html = cm.get_data(url) except Exception: print 'Error occured: %s' % url dump_data = {'level': 0, 'time': cm.format_time(), 'data': {'url': url}, 'brand_id': data['brand_id']} cm.dump(dump_data) return [] store_list = [] for m in re.finditer(ur'<h2 property="dc:title"', html): end = html.find('</header>', m.start()) if end == -1: continue sub = html[m.start():end] m1 = re.search(ur'<a href="(.+?)">(.+?)</a></h2>', sub) if m1 is None: print 'Error: no more details for %s' % url continue d = data.copy() d[cm.url] = data['host'] + m1.group(1) d[cm.name_e] = cm.html2plain(m1.group(2)).strip() store_list.append(d)
def fetch_states(data): global national_added url = data['url'] try: body = cm.get_data(url) except Exception: cm.dump('Error in fetching states: %s' % url, log_name) return [] national_added = False m = re.search(ur'Choose a (state|region|province)', body) if m is None: d = data.copy() d['state'] = '' return [d] body = cm.extract_closure(body[m.start():], ur'<ul>', ur'</ul>')[0] results = [] for m in re.findall(ur'<a href="([^"]+)">([^<>]+)</a>', body): d = data.copy() d['url'] = data['host'] + m[0] d['state'] = cm.html2plain(m[1]).strip().upper() results.append(d)
def fetch_countries(data): url = data['url'] try: html = cm.get_data(url) except Exception: print 'Error occured: %s' % url dump_data = {'level': 0, 'time': cm.format_time(), 'data': {'url': url}, 'brand_id': data['brand_id']} cm.dump(dump_data) return [] # 处理重定向 m = re.search('<h2>Object moved to <a href="(.+?)">', html) if m is not None: data['url'] = data['host'] + m.group(1) return fetch_countries(data) m = re.search('<span class="country">Choose a country</span>', html) if m is None: return [] sub, start, end = cm.extract_closure(html[m.end():], r'<ul\b', r'</ul>') if end == 0: return [] country_list = [] for m in re.findall('<li><a .*?href="(.+?)">(.+?)</a></li>', sub): d = data.copy() country_e = cm.html2plain(m[1]).strip().upper() ret = gs.look_up(country_e, 1) if ret is not None: country_e=ret['name_e'] d['country_e'] = country_e d['province_e'] = '' d['url'] = data['host'] + m[0] country_list.append(d) return country_list
def fetch_stores(data): url = data['url'] try: body = cm.post_data(url, {'rsp': 'json', 'country': data['country_code']}) except Exception: print 'Error occured: %s' % url dump_data = {'level': 0, 'time': cm.format_time(), 'data': {'url': url}, 'brand_id': data['brand_id']} cm.dump(dump_data) return [] raw = json.loads(body) store_list = [] for s in raw['stores']: entry = cm.init_store_entry(data['brand_id'], data['brandname_e'], data['brandname_c']) entry[cm.name_e] = cm.html2plain(s['name']).strip() addr_list = [] for key in ['address1', 'address2']: if s[key].strip() != '': addr_list.append(cm.reformat_addr(s[key])) entry[cm.addr_e] = ' '.join(addr_list) # r=s['region'].strip().upper() # m = re.search(ur'\b([A-Z]{2})\b', r) # if data[cm.country_e]=='UNITED STATES' and m is not None: # # 美国 # ret = gs.look_up(m.group(1), 2) # if ret is not None: # r = ret['name_e'] # entry[cm.province_e] = r entry[cm.city_e] = cm.extract_city(s['city'])[0] entry[cm.zip_code] = s['zip'].strip() entry[cm.country_e] = data[cm.country_e] entry[cm.lat] = string.atof(s['lat']) entry[cm.lng] = string.atof(s['lng']) entry[cm.tel] = s['phone'].strip() entry[cm.fax] = s['fax'].strip() entry[cm.email] = s['emailaddress'].strip() entry[cm.url] = s['website'].strip() days = ['Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat', 'Sun'] opening = [] if 'openingHours' in s and s['openingHours'] is not None: for m in re.finditer(ur'i:(\d);s:\d+:\\?"([^\\"]+?)\\?"', s['openingHours']): opening.append('%s: %s' % (days[string.atoi(m.group(1))], m.group(2).strip())) entry[cm.hours] = ', '.join(opening) gs.field_sense(entry) ret = gs.addr_sense(entry[cm.addr_e], entry[cm.country_e]) if ret[1] is not None and entry[cm.province_e] == '': entry[cm.province_e] = ret[1] if ret[2] is not None and entry[cm.city_e] == '': entry[cm.city_e] = ret[2] gs.field_sense(entry) print '(%s / %d) Found store: %s, %s (%s, %s, %s)' % (data['brandname_e'], data['brand_id'], entry[cm.name_e], entry[cm.addr_e], entry[cm.city_e], 
entry[cm.country_e], entry[cm.continent_e]) store_list.append(entry) db.insert_record(entry, 'stores')
def fetch_store_list(url): """ 获得门店的列表 :rtype : 门店列表。格式:[{'name':**, 'lat':**, 'lng':**, 'type':**, 'url':**}] :param url: """ try: html = cm.get_data(url) except Exception: print 'Error occured: %s' % url dump_data = { 'level': 1, 'time': cm.format_time(), 'data': { 'data': url }, 'brand_id': brand_id } cm.dump(dump_data) return [] # 开始解析工作 # 查找数据部分,位于var items和var\s\w+之间 start = html.find('var items') if start == -1: return {} start += len('var items') end = html.find('var ', start) html = html[start:end] stores = [] pattern = ur'\[(.+?)\]' store_list = [] for m in re.findall(pattern, html, re.S): store_entry = {} m_list = re.findall(ur"'(.*)'", m) try: store_entry['name'] = cm.html2plain(m_list[0].strip()) store_entry['type'] = m_list[2].strip() store_entry['url'] = m_list[4].strip() except IndexError: print 'Index error: %s' % m # 去掉引号之间的内容,准备查找经纬度信息 m_list = re.findall(ur'(-?\d+\.\d+)', re.subn(ur"'(.*)'", '', m)[0]) try: lat = string.atof(m_list[0]) lng = string.atof(m_list[1]) store_entry['lat'] = lat store_entry['lng'] = lng except (IndexError, ValueError): print 'Index error in getting coordinates: %s' % m # test # if 'hong-kong' in store_entry['url'] or 'taichung' in store_entry['url']: if len(store_entry.keys()) > 0: store_list.append(store_entry) return store_list
def process_text(self, val): val = cm.html2plain(val.strip()) # <br/>换成换行符 val = re.sub(ur'<\s*br\s*/?\s*>', u'\n', val) # 去掉多余的标签 val = re.sub(ur'<[^<>]*?>', u'', val) return val
def fetch_stores(data):
    """Fetch all stores for one country id from the 'rawPos' JSON endpoint,
    insert them, and return the list of entries ([] on fetch error)."""
    url = data["host"] + data["country_url"] % data["country_id"]
    try:
        body = cm.get_data(url)
    except Exception:
        cm.dump("Error in fetching countries: %s" % url, log_name)
        return []
    raw = json.loads(body)["rawPos"]
    store_list = []
    for s in raw:
        entry = cm.init_store_entry(data["brand_id"], data["brandname_e"], data["brandname_c"])
        # Join the non-empty address1..address4 fields.
        addr_list = []
        for tmp2 in [cm.html2plain(s[tmp1]).strip() for tmp1 in ["address%d" % v for v in xrange(1, 5)]]:
            if tmp2 != "":
                addr_list.append(tmp2)
        entry[cm.addr_e] = ", ".join(addr_list)
        entry[cm.city_e] = cm.extract_city(s["city"]["name"])[0]
        entry[cm.country_e] = s["country"]["countryCode"]
        entry[cm.email] = s["email"]
        entry[cm.fax] = s["fax"]
        if s["latitude"] != "":
            entry[cm.lat] = string.atof(s["latitude"])
        if s["longitude"] != "":
            entry[cm.lng] = string.atof(s["longitude"])
        entry[cm.hours] = cm.reformat_addr(s["openingSchedule"])
        # Collect the non-empty phone numbers.
        phone_list = []
        for key in ["phone1", "phone2"]:
            if s[key].strip() != "":
                phone_list.append(s[key].strip())
        entry[cm.tel] = ", ".join(phone_list)
        entry[cm.zip_code] = s["postalCode"]
        entry[cm.name_e] = s["shopName"]
        gs.field_sense(entry)
        # Let the address parser fill province/city only where still empty.
        ret = gs.addr_sense(entry[cm.addr_e], entry[cm.country_e])
        if ret[1] is not None and entry[cm.province_e] == "":
            entry[cm.province_e] = ret[1]
        if ret[2] is not None and entry[cm.city_e] == "":
            entry[cm.city_e] = ret[2]
        gs.field_sense(entry)
        cm.dump(
            "(%s / %d) Found store: %s, %s (%s, %s)" % (
                data["brandname_e"], data["brand_id"], entry[cm.name_e], entry[cm.addr_e],
                entry[cm.country_e], entry[cm.continent_e],
            ),
            log_name,
        )
        db.insert_record(entry, "stores")
        store_list.append(entry)
    return store_list
def fetch_stores(data):
    """Extract the JS 'var retailers = [...]' array from the page and parse it
    as JSON (after quoting its bare object keys).

    NOTE(review): in this view of the function the parsed entries are built but
    never inserted or returned - it looks truncated; verify against the full
    source before relying on its return value.
    """
    url = data['url']
    try:
        body = cm.get_data(url)
    except Exception:
        cm.dump('Error in fetching stores: %s' % url, log_name)
        return []
    m = re.search(ur'var\s+retailers\s*=\s*', body)
    if m is None:
        cm.dump('Error in fetching stores: %s' % url, log_name)
        return []
    end = body.find(u']', m.end())
    if end == -1:
        cm.dump('Error in fetching stores: %s' % url, log_name)
        return []
    pat = re.compile(ur'[\{,]([a-zA-Z_\d]+):')  # NOTE(review): compiled but unused
    store_list = []
    # Quote the bare JS object keys so the array becomes valid JSON.
    for s in json.loads(
            re.sub(re.compile(ur'([\{,])([a-zA-Z_\d]+):'), ur'\1"\2":', body[m.end():end + 1])):
        entry = cm.init_store_entry(data['brand_id'], data['brandname_e'], data['brandname_c'])
        # Name may span two fields; join the non-empty parts.
        name_list = []
        for tmp in ['name', 'name_line_2']:
            if tmp in s and s[tmp] is not None and cm.html2plain(
                    s[tmp]).strip() != '':
                name_list.append(cm.html2plain(s[tmp]).strip())
        entry[cm.name_e] = ', '.join(name_list)
        # Address may span two fields; join the non-empty parts.
        addr_list = []
        for tmp in ['address', 'address_line_2']:
            if tmp in s and s[tmp] is not None and cm.html2plain(
                    s[tmp]).strip() != '':
                addr_list.append(cm.html2plain(s[tmp]).strip())
        entry[cm.addr_e] = ', '.join(addr_list)
        entry[cm.country_e] = s['country'].strip().upper()
        entry[cm.city_e] = cm.extract_city(s['city'])[0]
        region = cm.html2plain(s['region'])
        # Accept the region as a province only when it looks like a plain name
        # (no digits, no HTML entities).
        if re.search(
                ur'\d+', region) is None and '&' not in region and ';' not in region:
            entry[cm.province_e] = region.strip().upper()
def fetch_stores(data): # country=Greece&city=ATHENS&adutl=+01&kids=+02&undercolor=+06&togetmap=mapdata url = data['data_url'] param = {'country': data['country'], 'city': data['city'], 'adutl': ' 01', 'kids': ' 02', 'undercolor': ' 06', 'togetmap': 'mapdata'} try: body = cm.get_data(url, param) except Exception: cm.dump('Error in fetching stores: %s, %s' % (url, param), 'benetton_log.txt', False) dump_data = {'level': 0, 'time': cm.format_time(), 'data': {'url': url}, 'brand_id': data['brand_id']} cm.dump(dump_data) return [] store_list = [] for m in re.findall(ur'<marker (.+?)>', body): entry = cm.init_store_entry(data['brand_id'], data['brandname_e'], data['brandname_c']) m1 = re.search(ur'name=\\"(.+?)\\"', m) if m1 is not None: entry[cm.name_e] = cm.html2plain(m1.group(1).strip().replace(u'\\', '')) m1 = re.search(ur'address=\\"(.+?)\\"', m) if m1 is not None: addr = cm.reformat_addr(cm.html2plain(m1.group(1)).replace(u'\\', '')) tel = cm.extract_tel(addr) if tel != '': entry[cm.tel] = tel addr = addr.replace(tel, '') entry[cm.addr_e] = cm.reformat_addr(addr) m1 = re.search(ur'lat=\\"(.+?)\\"', m) if m1 is not None: entry[cm.lat] = string.atof(m1.group(1)) m1 = re.search(ur'lng=\\"(.+?)\\"', m) if m1 is not None: entry[cm.lng] = string.atof(m1.group(1)) entry[cm.country_e] = data['country'].strip().upper() entry[cm.city_e] = cm.extract_city(data['city'])[0] gs.field_sense(entry) cm.dump('(%s / %d) Found store: %s, %s (%s, %s)' % (data['brandname_e'], data['brand_id'], entry[cm.name_e], entry[cm.addr_e], entry[cm.country_e], entry[cm.continent_e]), 'benetton_log.txt', False) db.insert_record(entry, 'stores') store_list.append(entry)
def fetch_cities(data): url = data['url'] try: body = cm.get_data(url) except Exception: cm.dump('Error in fetching cities: %s' % url, log_name) return [] m = re.search(ur'Choose a city', body) if m is None: cm.dump('Error in fetching cities: %s' % url, log_name) return [] body = cm.extract_closure(body[m.start():], ur'<ul>', ur'</ul>')[0] results = [] for m in re.findall(ur'<a href="([^"]+)">([^<>]+)</a>', body): d = data.copy() d['url'] = data['host'] + cm.html2plain(m[0]) d['city'] = cm.html2plain(m[1]).strip().upper() results.append(d)
def proc_store(sub, data):
    """Parse one store block `sub` (HTML) into a store entry for `data`'s brand.

    NOTE(review): in this view of the function the entry is populated but
    neither returned nor inserted - verify against the full source.
    """
    entry = cm.init_store_entry(data['brand_id'], data['brandname_e'], data['brandname_c'])
    entry[cm.country_e] = data['country']
    m1 = re.search(ur'<strong class="name" itemprop="name">([^<>]+)</strong>', sub)
    if m1 is not None:
        # NOTE(review): the itemprop="name" text is stored into cm.store_class,
        # not cm.name_e - looks intentional for this site, but confirm.
        entry[cm.store_class] = m1.group(1).strip()
    m1 = re.search(ur'<span itemprop="address"', sub)
    if m1 is not None:
        addr_sub = cm.extract_closure(sub[m1.start():], ur'<span\b', ur'</span>')[0]
        m2 = re.search(ur'<span itemprop="postal-code">([^<>]+)</span>', addr_sub, re.S)
        if m2 is not None:
            entry[cm.zip_code] = m2.group(1).strip()
        m2 = re.search(ur'<span itemprop="locality">([^<>]+)</span>', addr_sub, re.S)
        if m2 is not None:
            entry[cm.city_e] = cm.html2plain(m2.group(1)).strip().upper()
        entry[cm.addr_e] = cm.reformat_addr(addr_sub)
    m2 = re.search(ur'<span itemprop="tel">([^<>]+)</span>', sub, re.S)
    if m2 is not None:
        entry[cm.tel] = m2.group(1).strip()
    m2 = re.search(ur'Fax\b(.+?)</p>', sub)
    if m2 is not None:
        entry[cm.fax] = cm.extract_tel(m2.group(1))
    # Follow the "Find on a map" link to extract coordinates from the map page.
    m2 = re.search(
        ur'<a href="([^"]+)"[^<>]+itemprop="url"\s*>\s*Find on a map\s*</a>', sub)
    if m2 is not None:
        geo_url = data['host'] + urllib.quote(m2.group(1).encode('utf-8'))
        # NOTE(review): 'brepairs' appears twice in this dict literal; the
        # second occurrence silently wins.
        param = {
            'brepairs': True,
            'restrictedtemplate': 2,
            'bretailers': True,
            'bshops': True,
            'brepairs': True
        }
        try:
            geo_body = cm.get_data(geo_url, param)
            # Coordinates appear in a maps.google.com directions link.
            m3 = re.search(
                ur'maps\.google\.com/maps\?daddr\s*=\s*(-?\d+\.\d+)\s*,\s*(-?\d+\.\d+)',
                geo_body)
            if m3 is not None:
                entry[cm.lat] = string.atof(m3.group(1))
                entry[cm.lng] = string.atof(m3.group(2))
        except Exception, e:
            cm.dump('Error in fetching geo info: %s, %s' % (geo_url, param), log_name)
def fetch_stores(data):
    """Fetch Lacoste stores for one city via the getStoresByCity AJAX action.

    Inserts each store into the DB and returns the list of entries.
    """
    url = data['url']
    param = {
        'action': 'getStoresByCity',
        'idCity': data['city_id'],
        'filter': 'clothing;lacoste%20l!ve'
    }
    try:
        body = cm.get_data(url, param)
    except Exception:
        cm.dump('Error in fetching stores: %s, %s' % (url, param), log_name)
        return []
    raw = json.loads(body)['root']['DATA']['stores']
    store_list = []
    for s in [tmp['store'] for tmp in raw]:
        entry = cm.init_store_entry(data['brand_id'], data['brandname_e'], data['brandname_c'])
        entry[cm.name_e] = s['name'].strip()
        entry[cm.country_e] = data['country_code']
        entry[cm.addr_e] = cm.html2plain(s['address']).strip()
        entry[cm.store_type] = s['category'].strip()
        entry[cm.city_e] = cm.extract_city(s['city'])[0]
        if s['email'] is not None:
            entry[cm.email] = s['email'].strip()
        if s['fax'] is not None:
            entry[cm.fax] = s['fax'].strip()
        if s['infoHours'] is not None:
            entry[cm.hours] = s['infoHours'].strip()
        if s['latitude'] is not None and s['latitude'].strip() != '':
            entry[cm.lat] = string.atof(s['latitude'])
        if s['longitude'] is not None and s['longitude'].strip() != '':
            # BUGFIX: the longitude was previously stored into cm.lat,
            # clobbering the latitude and leaving cm.lng unset.
            entry[cm.lng] = string.atof(s['longitude'])
        if s['phone'] is not None:
            entry[cm.tel] = s['phone'].strip()
        if s['postCode'] is not None:
            entry[cm.zip_code] = s['postCode'].strip()
        gs.field_sense(entry)
        ret = gs.addr_sense(entry[cm.addr_e], entry[cm.country_e])
        if ret[1] is not None and entry[cm.province_e] == '':
            entry[cm.province_e] = ret[1]
        if ret[2] is not None and entry[cm.city_e] == '':
            entry[cm.city_e] = ret[2]
        gs.field_sense(entry)
        cm.dump('(%s / %d) Found store: %s, %s (%s, %s)' % (data['brandname_e'], data['brand_id'],
                                                            entry[cm.name_e], entry[cm.addr_e],
                                                            entry[cm.country_e],
                                                            entry[cm.continent_e]), log_name)
        db.insert_record(entry, 'stores')
        store_list.append(entry)
    return store_list
def fetch_store_list(data, logger=None): if not logger: logger = logging.getLogger() url = data['url'] try: body = cm.get_data(url) except Exception: cm.dump('Error in fetching store lists: %s' % url, log_name) return [] body = pq(body)('div.store-country #shop-list') start = body.find(ur"<div class='store-country'>") if start == -1: cm.dump('Error in fetching store lists: %s' % url, log_name) return [] body = cm.extract_closure(body[start:], ur'<div\b', ur'</div>')[0] start_stores = body.find( ur'<h3><a href="/store-locator/index">Stores</a></h3>') start_outlets = body.find(ur"<h3 class='outlets'>") store_sub = body[start_stores:start_outlets] outlet_sub = body[start_outlets:] results = [] for m1 in re.finditer( ur'<a [^<>]*data-id="([^"]+)"[^<>]*data-type="country">([^<>]+)</a>', store_sub): country_id = string.atoi(m1.group(1)) country = m1.group(2).strip() sub1 = cm.extract_closure(store_sub[m1.end():], ur'<ul>', ur'</ul>')[0] for m2 in re.finditer( ur'<a [^<>]*data-id="([^"]+)"[^<>]*data-type="city">([^<>]+)</a>', sub1): city_id = string.atoi(m2.group(1)) city = m2.group(2).strip() sub2 = cm.extract_closure(sub1[m2.end():], ur'<ul>', ur'</ul>')[0] for m3 in re.finditer( ur'<a href="([^"]+)"[^<>]*data-id="([^"]+)"[^<>]*data-type="store">([^<>]+)</a>', sub2): d = data.copy() d['country_id'] = country_id d['country'] = country d['city_id'] = city_id d['city'] = city d['url'] = m3.group(1).strip() d['store_id'] = string.atoi(m3.group(2)) d['store'] = cm.html2plain(m3.group(3).strip()) # d['store_type'] = 'store' results.append(d)
def fetch_stores(data):
    """POST the city filter to the shops endpoint and parse each <ul> as one store.

    Each store's first <li> is its name; remaining <li> items form the address,
    with the last one holding the phone number when one can be extracted.
    Entries are printed, saved via db.insert_record and collected in store_list.

    NOTE(review): the visible code never returns store_list — the function
    appears truncated at this point in the file; confirm against the original.
    """
    url = data['post_shops']
    param = {'city': data['city_e'], 'paulandjoe_women': 0, 'paulandjoe_man': 0,
             'paulandjoe_sister': 0, 'paulandjoe_little': 0, 'paulandjoe_beauty': 0}
    try:
        html = cm.post_data(url, param)
    except Exception:
        print 'Error occured: %s' % url
        dump_data = {'level': 0, 'time': cm.format_time(), 'data': {'url': url}, 'brand_id': data['brand_id']}
        cm.dump(dump_data)
        return []
    store_list = []
    try:
        # One <ul> per store.
        for store in (pq(tmp) for tmp in pq(html)('ul')):
            try:
                entry = cm.init_store_entry(data['brand_id'], data['brandname_e'], data['brandname_c'])
                entry[cm.name_e] = cm.html2plain(store('li.first')[0].text).strip()
                entry[cm.country_e] = data[cm.country_e]
                entry[cm.city_e] = data[cm.city_e]
                addr_list = []
                # Every non-first <li> contributes one address line.
                for term in (cm.reformat_addr(unicode(pq(tmp))) for tmp in store('li[class!="first"]')):
                    if term != '':
                        addr_list.append(term)
                # The final address line usually carries the phone number;
                # if so, record it and drop it from the address.
                tel = cm.extract_tel(addr_list[-1])
                if tel != '':
                    entry[cm.tel] = tel
                    del addr_list[-1]
                entry[cm.addr_e] = ', '.join(addr_list)
                gs.field_sense(entry)
                # Backfill country/province/city from the address text when empty.
                ret = gs.addr_sense(entry[cm.addr_e])
                if ret[0] is not None and entry[cm.country_e] == '':
                    entry[cm.country_e] = ret[0]
                if ret[1] is not None and entry[cm.province_e] == '':
                    entry[cm.province_e] = ret[1]
                if ret[2] is not None and entry[cm.city_e] == '':
                    entry[cm.city_e] = ret[2]
                gs.field_sense(entry)
                print '(%s/%d) Found store: %s, %s (%s, %s)' % (data['brandname_e'], data['brand_id'],
                                                                entry[cm.name_e], entry[cm.addr_e],
                                                                entry[cm.country_e], entry[cm.continent_e])
                store_list.append(entry)
                db.insert_record(entry, 'stores')
            except (IndexError, TypeError) as e:
                # A malformed <ul> shouldn't abort the whole city.
                cm.dump(u'Error in parsing %s, %s' % (url, param), log_name)
                print traceback.format_exc()
                continue
    except Exception, e:
        print traceback.format_exc()
def fetch_store_list(url): """ 获得门店的列表 :rtype : 门店列表。格式:[{'name':**, 'lat':**, 'lng':**, 'type':**, 'url':**}] :param url: """ try: html = cm.get_data(url) except Exception: print 'Error occured: %s' % url dump_data = {'level': 1, 'time': cm.format_time(), 'data': {'data': url}, 'brand_id': brand_id} cm.dump(dump_data) return [] # 开始解析工作 # 查找数据部分,位于var items和var\s\w+之间 start = html.find('var items') if start == -1: return {} start += len('var items') end = html.find('var ', start) html = html[start:end] stores = [] pattern = ur'\[(.+?)\]' store_list = [] for m in re.findall(pattern, html, re.S): store_entry = {} m_list = re.findall(ur"'(.*)'", m) try: store_entry['name'] = cm.html2plain(m_list[0].strip()) store_entry['type'] = m_list[2].strip() store_entry['url'] = m_list[4].strip() except IndexError: print 'Index error: %s' % m # 去掉引号之间的内容,准备查找经纬度信息 m_list = re.findall(ur'(-?\d+\.\d+)', re.subn(ur"'(.*)'", '', m)[0]) try: lat = string.atof(m_list[0]) lng = string.atof(m_list[1]) store_entry['lat'] = lat store_entry['lng'] = lng except (IndexError, ValueError): print 'Index error in getting coordinates: %s' % m # test # if 'hong-kong' in store_entry['url'] or 'taichung' in store_entry['url']: if len(store_entry.keys()) > 0: store_list.append(store_entry) return store_list
def fetch_stores(data): entry = cm.init_store_entry(data['brand_id'], data['brandname_e'], data['brandname_c']) code = data['country_code'] if gs.look_up(code, 1) is None: entry[cm.country_e] = cm.html2plain(data['country']).strip().upper() else: entry[cm.country_e] = code entry[cm.name_e] = data['store_name'] entry[cm.city_e] = cm.extract_city(data['city'])[0] entry[cm.lat] = data['lat'] if data['lat'] is not None else '' entry[cm.lng] = data['lng'] if data['lng'] is not None else '' m = re.search(ur'data-boutique\s*=\s*"%s"' % data['store_id'], data['content']) sub = data['content'][m.end():] m1 = re.search(ur'<li class="isDistributeur[^<>]+>(.+?)</li>', sub) if m1 is not None: entry[cm.store_class] = cm.reformat_addr(m1.group(1)) m1 = re.search(ur'<li class="place-title[^<>]+>(.+?)</li>', sub, re.S) if m1 is not None: entry[cm.addr_e] = cm.reformat_addr(m1.group(1)) m1 = re.search(ur'<li class="contacts[^<>]+>(.+?)</li>', sub, re.S) if m1 is not None: m2 = re.search(ur'<a class="popupLaunch" href="([^"]+)"', m1.group(1)) if m2: entry = fetch_details(data, m2.group(1), entry) m2 = re.search(ur'<p>(.+?)</p>', m1.group(1), re.S) if m2: ct_list = tuple(tmp.strip() for tmp in cm.reformat_addr(m2.group(1)).split(',')) entry[cm.tel] = cm.extract_tel(ct_list[0]) if len(ct_list) > 1: entry[cm.email] = ct_list[1].strip() gs.field_sense(entry) ret = gs.addr_sense(entry[cm.addr_e], entry[cm.country_e]) if ret[1] is not None and entry[cm.province_e] == '': entry[cm.province_e] = ret[1] if ret[2] is not None and entry[cm.city_e] == '': entry[cm.city_e] = ret[2] gs.field_sense(entry) cm.dump('(%s / %d) Found store: %s, %s (%s, %s)' % (data['brandname_e'], data['brand_id'], entry[cm.name_e], entry[cm.addr_e], entry[cm.country_e], entry[cm.continent_e]), log_name) db.insert_record(entry, 'stores') return tuple(entry)
def fetch_stores(data):
    """Parse the inline `var retailers = [...]` JS array on the page into store entries.

    Bare JS object keys are quoted with a regex substitution so the array body
    can be fed to json.loads.

    NOTE(review): the visible code ends after the province assignment —
    store_list is created but nothing is ever appended or returned, so this
    function appears truncated in this chunk; confirm against the original.
    """
    url = data['url']
    try:
        body = cm.get_data(url)
    except Exception:
        cm.dump('Error in fetching stores: %s' % url, log_name)
        return []
    m = re.search(ur'var\s+retailers\s*=\s*', body)
    if m is None:
        cm.dump('Error in fetching stores: %s' % url, log_name)
        return []
    # The array is assumed to end at the first ']' after the assignment.
    end = body.find(u']', m.end())
    if end == -1:
        cm.dump('Error in fetching stores: %s' % url, log_name)
        return []
    # NOTE(review): `pat` is compiled but never used below — the loop compiles
    # a slightly different pattern inline.
    pat = re.compile(ur'[\{,]([a-zA-Z_\d]+):')
    store_list = []
    # Quote bare JS keys ({key: -> {"key":) so the literal parses as JSON.
    for s in json.loads(re.sub(re.compile(ur'([\{,])([a-zA-Z_\d]+):'), ur'\1"\2":', body[m.end():end + 1])):
        entry = cm.init_store_entry(data['brand_id'], data['brandname_e'], data['brandname_c'])
        # Name may be split over two fields; keep the non-empty parts.
        name_list = []
        for tmp in ['name', 'name_line_2']:
            if tmp in s and s[tmp] is not None and cm.html2plain(s[tmp]).strip() != '':
                name_list.append(cm.html2plain(s[tmp]).strip())
        entry[cm.name_e] = ', '.join(name_list)
        # Same two-field scheme for the address.
        addr_list = []
        for tmp in ['address', 'address_line_2']:
            if tmp in s and s[tmp] is not None and cm.html2plain(s[tmp]).strip() != '':
                addr_list.append(cm.html2plain(s[tmp]).strip())
        entry[cm.addr_e] = ', '.join(addr_list)
        entry[cm.country_e] = s['country'].strip().upper()
        entry[cm.city_e] = cm.extract_city(s['city'])[0]
        region = cm.html2plain(s['region'])
        # Only accept region strings that look like plain names (no digits or
        # leftover HTML entities).
        if re.search(ur'\d+', region) is None and '&' not in region and ';' not in region:
            entry[cm.province_e] = region.strip().upper()
def reformat(text): """ 格式化字符串,将多余的空格、换行、制表符等合并 """ if text is None: return None text = cm.html2plain(text.strip()) # <br/>换成换行符 text = re.sub(ur'<\s*br\s*/?>', u'\r\n', text) # 去掉多余的标签 text = re.sub(ur'<[^<>]*?>', u'', text) # # 换行转换 text = re.sub('[\r\n]+', '\r', text) # text = re.subn(ur'(?:[\r\n])+', ', ', text)[0] return text
def fetch_store_details(data): # http://maps.oasis-stores.com/index-v2.php?coutnryISO=GB&brand=oasis&lat=51.42014&lng=-0.20954 url = data['store_url'] code = data['country_code'] city = data['city_e'] try: html = cm.get_data(url, {'latitude': data['lat'], 'longitude': data['lng'], 'brand': 'oasis'}) except Exception: print 'Error occured: %s' % url dump_data = {'level': 0, 'time': cm.format_time(), 'data': {'url': url}, 'brand_id': data['brand_id']} cm.dump(dump_data) return [] raw = json.loads(html) entry = cm.init_store_entry(data['brand_id'], data['brandname_e'], data['brandname_c']) entry[cm.name_e] = raw['name'] addr_list = [] for i in xrange(1, 4): tmp = cm.html2plain(raw['address%d' % i]).strip() if tmp!='': addr_list.append(tmp) entry[cm.addr_e] = ', '.join(addr_list) state = raw['countryRegion'] if state is not None and state.strip() != '': entry[cm.province_e] = state.strip().upper() state = raw['state'] if state is not None and state.strip() != '': entry[cm.province_e] = state.strip().upper() state = raw['county'] if state is not None and state.strip() != '': entry[cm.province_e] = state.strip().upper() entry[cm.zip_code] = raw['postcode'] entry[cm.country_e] = data['country_e'] entry[cm.city_e] = cm.extract_city(data['city_e'])[0] entry[cm.lat] = string.atof(data['lat']) entry[cm.lng] = string.atof(data['lng']) entry[cm.tel] = raw['phone'] entry[cm.email] = raw['email'] tmp = ['mon', 'tue', 'wed', 'thu', 'fri', 'sat', 'sun'] entry[cm.hours] = ', '.join([raw[d + '_open_times'] for d in tmp]) gs.field_sense(entry) print '(%s / %d) Found store: %s, %s (%s, %s)' % (data['brandname_e'], data['brand_id'], entry[cm.name_e], entry[cm.addr_e], entry[cm.country_e], entry[cm.continent_e]) db.insert_record(entry, 'stores') return [entry]
def fetch_stores(data):
    """Retrieve every store of one country from the locator API and persist them.

    Returns the list of parsed store entries.
    """
    url = data['host'] + data['country_url'] % data['country_id']
    try:
        body = cm.get_data(url)
    except Exception:
        cm.dump('Error in fetching countries: %s' % url, log_name)
        return []
    store_list = []
    for raw_store in json.loads(body)['rawPos']:
        entry = cm.init_store_entry(data['brand_id'], data['brandname_e'], data['brandname_c'])
        # Address is split over address1..address4; keep non-empty parts.
        parts = []
        for idx in xrange(1, 5):
            piece = cm.html2plain(raw_store['address%d' % idx]).strip()
            if piece != '':
                parts.append(piece)
        entry[cm.addr_e] = ', '.join(parts)
        entry[cm.city_e] = cm.extract_city(raw_store['city']['name'])[0]
        entry[cm.country_e] = raw_store['country']['countryCode']
        entry[cm.email] = raw_store['email']
        entry[cm.fax] = raw_store['fax']
        if raw_store['latitude'] != '':
            entry[cm.lat] = string.atof(raw_store['latitude'])
        if raw_store['longitude'] != '':
            entry[cm.lng] = string.atof(raw_store['longitude'])
        entry[cm.hours] = cm.reformat_addr(raw_store['openingSchedule'])
        phones = [raw_store[k].strip() for k in ('phone1', 'phone2') if raw_store[k].strip() != '']
        entry[cm.tel] = ', '.join(phones)
        entry[cm.zip_code] = raw_store['postalCode']
        entry[cm.name_e] = raw_store['shopName']
        gs.field_sense(entry)
        # Backfill province/city from the address text when still empty.
        ret = gs.addr_sense(entry[cm.addr_e], entry[cm.country_e])
        if ret[1] is not None and entry[cm.province_e] == '':
            entry[cm.province_e] = ret[1]
        if ret[2] is not None and entry[cm.city_e] == '':
            entry[cm.city_e] = ret[2]
        gs.field_sense(entry)
        cm.dump('(%s / %d) Found store: %s, %s (%s, %s)' % (data['brandname_e'], data['brand_id'],
                entry[cm.name_e], entry[cm.addr_e], entry[cm.country_e], entry[cm.continent_e]), log_name)
        db.insert_record(entry, 'stores')
        store_list.append(entry)
    return store_list
def parse_product_list(self, response):
    """Scrapy callback: yield one product-details Request per item link on the list page."""
    metadata = response.meta['userdata']
    sel = Selector(response)
    for node in sel.xpath(
            '//div[@id="elementsContainer"]/div[contains(@id,"item")]/a[@class="itemImage" and @href]'
    ):
        # Strip all whitespace inside the href before resolving it against the page URL.
        url = self.process_href(
            re.sub(r'\s', '', cm.html2plain(node._root.attrib['href'])),
            response.url)
        # Deep-copy the shared metadata so each request carries its own state.
        m = copy.deepcopy(metadata)
        yield Request(url=url, meta={'userdata': m}, dont_filter=True,
                      callback=self.parse_product_details, errback=self.onerr)
def fetch_stores(data):
    """Fetch all stores of one city from the store-locator AJAX endpoint.

    Persists each parsed entry via db.insert_record and returns the list of entries.
    """
    url = data['url']
    param = {'action': 'getStoresByCity', 'idCity': data['city_id'], 'filter': 'clothing;lacoste%20l!ve'}
    try:
        body = cm.get_data(url, param)
    except Exception:
        cm.dump('Error in fetching stores: %s, %s' % (url, param), log_name)
        return []
    raw = json.loads(body)['root']['DATA']['stores']
    store_list = []
    for s in [tmp['store'] for tmp in raw]:
        entry = cm.init_store_entry(data['brand_id'], data['brandname_e'], data['brandname_c'])
        entry[cm.name_e] = s['name'].strip()
        entry[cm.country_e] = data['country_code']
        entry[cm.addr_e] = cm.html2plain(s['address']).strip()
        entry[cm.store_type] = s['category'].strip()
        entry[cm.city_e] = cm.extract_city(s['city'])[0]
        if s['email'] is not None:
            entry[cm.email] = s['email'].strip()
        if s['fax'] is not None:
            entry[cm.fax] = s['fax'].strip()
        if s['infoHours'] is not None:
            entry[cm.hours] = s['infoHours'].strip()
        if s['latitude'] is not None and s['latitude'].strip() != '':
            entry[cm.lat] = string.atof(s['latitude'])
        if s['longitude'] is not None and s['longitude'].strip() != '':
            # BUG FIX: longitude was previously assigned to cm.lat, silently
            # overwriting the latitude parsed just above.
            entry[cm.lng] = string.atof(s['longitude'])
        if s['phone'] is not None:
            entry[cm.tel] = s['phone'].strip()
        if s['postCode'] is not None:
            entry[cm.zip_code] = s['postCode'].strip()
        gs.field_sense(entry)
        # Backfill province/city from the address text when still empty.
        ret = gs.addr_sense(entry[cm.addr_e], entry[cm.country_e])
        if ret[1] is not None and entry[cm.province_e] == '':
            entry[cm.province_e] = ret[1]
        if ret[2] is not None and entry[cm.city_e] == '':
            entry[cm.city_e] = ret[2]
        gs.field_sense(entry)
        cm.dump('(%s / %d) Found store: %s, %s (%s, %s)' % (data['brandname_e'], data['brand_id'],
                entry[cm.name_e], entry[cm.addr_e], entry[cm.country_e], entry[cm.continent_e]), log_name)
        db.insert_record(entry, 'stores')
        store_list.append(entry)
    return store_list
def fetch_states(data):
    """List the states/provinces of one country; returns enriched copies of *data*."""
    print '(%s/%d) Found country: %s' % (data['brandname_e'], data['brand_id'], data['country_e'])
    url = data['url']
    try:
        html = cm.get_data(url)
    except Exception:
        print 'Error occured: %s' % url
        dump_data = {
            'level': 0,
            'time': cm.format_time(),
            'data': {
                'url': url
            },
            'brand_id': data['brand_id']
        }
        cm.dump(dump_data)
        return []
    # Handle server-side redirect ("Object moved") pages.
    # NOTE(review): the redirect path re-enters fetch_countries, not
    # fetch_states — looks intentional (restart from the moved URL) but
    # worth confirming against the site's flow.
    m = re.search('<h2>Object moved to <a href="(.+?)">', html)
    if m is not None:
        data['url'] = data['host'] + m.group(1)
        return fetch_countries(data)
    m = re.search('<span class="state">Choose a state/provence</span>', html)
    if m is None:
        return []
    # Extract the <ul> following the "Choose a state" marker.
    sub, start, end = cm.extract_closure(html[m.end():], r'<ul\b', r'</ul>')
    if end == 0:
        return []
    state_list = []
    for m in re.findall('<li><a .*?href="(.+?)">(.+?)</a></li>', sub):
        province_e = cm.html2plain(m[1]).strip().upper()
        if data['country_e'] == 'CHINA':
            # Chinese province names: remove internal spaces before lookup.
            province_e = province_e.replace(' ', '')
        ret = gs.look_up(province_e, 2)
        if ret is not None:
            province_e = ret['name_e']
        d = data.copy()
        d['province_e'] = province_e
        d['url'] = data['host'] + m[0]
        state_list.append(d)
    return state_list
def fetch_regions(data): url = data['location_url'] try: body = cm.get_data(url, {'lang': data['lang'], 'country': data['country_id']}) except Exception: cm.dump('Error in fetching regions: %s, %s' % (url, data['country']), 'tudor_log.txt') dump_data = {'level': 0, 'time': cm.format_time(), 'data': {'url': url}, 'brand_id': data['brand_id']} cm.dump(dump_data) return [] results = [] for item in pq(body.encode('utf-8'))('region[id!=""]'): d = data.copy() d['region_id'] = string.atoi(item.attrib['id']) tmp = cm.html2plain(item.attrib['name']).strip().upper() d['region_name'] = re.sub(ur'市$', '', re.sub(ur'省$', '', tmp).strip()).strip() results.append(d) return results
def fetch_countries(data):
    """Query the country endpoint and return one enriched copy of *data* per country."""
    url = data['country_url']
    param = {'myid': '400-all', 'idioma': 'in'}
    try:
        body = cm.get_data(url, param)
    except Exception:
        cm.dump('Error in fetching countries: %s, %s' % (url, param), log_name)
        return []
    results = []
    for c in json.loads(body):
        # Skip placeholder records with an empty title.
        if c['title'].strip() == '':
            continue
        d = data.copy()
        d['country'] = cm.html2plain(c['title']).strip().upper()
        d['key'] = c['key']
        results.append(d)
    return results
def fetch_store_details(data):
    """Parse one store-detail page for address, contacts and opening hours.

    NOTE(review): the visible code populates `entry` but never returns or
    persists it — the function appears truncated in this chunk; confirm
    against the original source.
    """
    url = data['url']
    try:
        body = cm.get_data(url)
    except Exception:
        cm.dump('Error in fetching store details: %s' % url, log_name)
        return []
    # The address block is the first microformat <div itemprop="address">.
    start = body.find(ur'<div class="col first" itemprop="address"')
    if start == -1:
        cm.dump('Error in fetching store details: %s' % url, log_name)
        return []
    entry = cm.init_store_entry(data['brand_id'], data['brandname_e'], data['brandname_c'])
    addr = cm.extract_closure(body[start:], ur'<p>', ur'</p>')[0]
    m = re.search(ur'<span itemprop="postalCode">([^<>]+)</span>', addr, re.S)
    if m is not None:
        entry[cm.zip_code] = m.group(1).strip()
    entry[cm.addr_e] = cm.reformat_addr(addr)
    # Optional contacts block: phone and fax.
    start = body.find(ur'<div class="col" itemprop="contactPoints"')
    if start != -1:
        sub = cm.extract_closure(body[start:], ur'<p>', ur'</p>')[0]
        m = re.search(ur'<span itemprop="telephone">([^<>]+)</span>', sub, re.S)
        if m is not None:
            entry[cm.tel] = m.group(1).strip()
        m = re.search(ur'<span itemprop="faxNumber">([^<>]+)</span>', sub, re.S)
        if m is not None:
            entry[cm.fax] = m.group(1).strip()
    # Optional opening-hours table: concatenate all cell texts.
    start = body.find(ur'<h2>opening hours</h2>')
    if start != -1:
        sub = cm.extract_closure(body[start:], ur'<table\b', ur'</table>')[0]
        tmp = []
        for m in re.findall(ur'<td>(.+?)</td>', sub):
            tmp.append(cm.html2plain(m).strip())
        entry[cm.hours] = ' '.join(tmp)
def fetch_cities(data): url = data['post_city'] try: html = cm.post_data(url, {'country': data['country_id']}) except Exception: print 'Error occured: %s' % url dump_data = {'level': 0, 'time': cm.format_time(), 'data': {'url': url}, 'brand_id': data['brand_id']} cm.dump(dump_data) return [] city_list = [] for item in pq(html)('#cities option[value!="0"]'): d = data.copy() city_e = cm.html2plain(item.text).strip().upper() ret = gs.look_up(city_e, 3) if ret is not None: city_e = ret['name_e'] d['city_e'] = city_e city_list.append(d) return city_list
def fetch_store_list(data): url = data['url'] try: body = cm.get_data(url) except Exception: cm.dump('Error in fetching store lists: %s' % url, log_name) return [] start = body.find(ur"<div class='store-country'>") if start == -1: cm.dump('Error in fetching store lists: %s' % url, log_name) return [] body = cm.extract_closure(body[start:], ur'<div\b', ur'</div>')[0] start_stores = body.find(ur'<h3><a href="/store-locator/index">Stores</a></h3>') start_outlets = body.find(ur"<h3 class='outlets'>") store_sub = body[start_stores:start_outlets] outlet_sub = body[start_outlets:] results = [] for m1 in re.finditer(ur'<a [^<>]*data-id="([^"]+)"[^<>]*data-type="country">([^<>]+)</a>', store_sub): country_id = string.atoi(m1.group(1)) country = m1.group(2).strip() sub1 = cm.extract_closure(store_sub[m1.end():], ur'<ul>', ur'</ul>')[0] for m2 in re.finditer(ur'<a [^<>]*data-id="([^"]+)"[^<>]*data-type="city">([^<>]+)</a>', sub1): city_id = string.atoi(m2.group(1)) city = m2.group(2).strip() sub2 = cm.extract_closure(sub1[m2.end():], ur'<ul>', ur'</ul>')[0] for m3 in re.finditer(ur'<a href="([^"]+)"[^<>]*data-id="([^"]+)"[^<>]*data-type="store">([^<>]+)</a>', sub2): d = data.copy() d['country_id'] = country_id d['country'] = country d['city_id'] = city_id d['city'] = city d['url'] = m3.group(1).strip() d['store_id'] = string.atoi(m3.group(2)) d['store'] = cm.html2plain(m3.group(3).strip()) # d['store_type'] = 'store' results.append(d)
def fetch_cities(data): url = data['data_url'] param = { 'countries': data['country_code'], 'form_build_id': data['form_build_id'], 'form_id': 'cartierfo_generic_store_locator_search_form', '_triggering_element_name': 'countries' } try: body, cookie = cm.post_data_cookie(url, param, cookie=data['cookie']) except Exception: cm.dump('Error in fetching cities: %s, %s' % (url, param), log_name) return [] if cookie is not None: data['cookie'] = cookie raw = json.loads(body) body = None for item in raw: if 'data' in item and item['data'] != '': body = item['data'] break if body is None: cm.dump('Error in fetching cities: %s, %s' % (url, param), log_name) return [] # body = body.decode('unicode_escape') start = body.find(ur'<select id="edit-cities"') if start == -1: cm.dump('Error in fetching cities: %s, %s' % (url, param), log_name) return [] body = cm.extract_closure(body[start:], ur'<select\b', ur'</select>')[0] results = [] for m in re.findall(ur'<option.+?value="([^"]+?)".*?>.+?</option>', body): d = data.copy() d['city'] = cm.html2plain(m) results.append(d) print 'Country: %s, City: %s' % (data['country'], d['city'])
def parse(self, response):
    """Scrapy entry callback: walk the top-level menu and spawn one category request per item."""
    metadata = response.meta['userdata']
    sel = Selector(response)
    for node in sel.xpath(
            '//div[contains(@class,"main-menu")]//li[contains(@class,"level0")]'
    ):
        node_class = node._root.attrib['class']
        # The last word of the class attribute is used as the category tag name.
        mt = re.search(r'\b(\w+)\s*$', node_class)
        if not mt:
            continue
        tag_type = 'category-0'
        tag_name = unicodify(mt.group(1)).lower()
        temp = node.xpath('./a[@href]')
        if not temp:
            continue
        href = temp[0]._root.attrib['href']
        # Join the visible text of all descendants to form the tag title.
        tag_text = u', '.join([
            cm.html2plain(unicodify(val.text))
            for val in temp[0]._root.iterdescendants()
            if val.text and val.text.strip()
        ])
        # Deep-copy the shared metadata so each request carries its own state.
        m = copy.deepcopy(metadata)
        m['tags_mapping'][tag_type] = [{
            'name': tag_name,
            'title': tag_text
        }]
        gender = cm.guess_gender(tag_name)
        if gender:
            m['gender'] = [gender]
        if not href or not href.strip():
            continue
        else:
            yield Request(url=href, meta={'userdata': m}, callback=self.parse_category_0)
def fetch_states(data): print '(%s/%d) Found country: %s' % (data['brandname_e'], data['brand_id'], data['country_e']) url = data['url'] try: html = cm.get_data(url) except Exception: print 'Error occured: %s' % url dump_data = {'level': 0, 'time': cm.format_time(), 'data': {'url': url}, 'brand_id': data['brand_id']} cm.dump(dump_data) return [] # 处理重定向 m = re.search('<h2>Object moved to <a href="(.+?)">', html) if m is not None: data['url'] = data['host'] + m.group(1) return fetch_countries(data) m = re.search('<span class="state">Choose a state/provence</span>', html) if m is None: return [] sub, start, end = cm.extract_closure(html[m.end():], r'<ul\b', r'</ul>') if end == 0: return [] state_list = [] for m in re.findall('<li><a .*?href="(.+?)">(.+?)</a></li>', sub): province_e = cm.html2plain(m[1]).strip().upper() if data['country_e'] == 'CHINA': # 去掉省中间的空格 province_e = province_e.replace(' ', '') ret = gs.look_up(province_e, 2) if ret is not None: province_e=ret['name_e'] d = data.copy() d['province_e'] = province_e d['url'] = data['host'] + m[0] state_list.append(d) return state_list
def fetch_countries(data): url = data['home_url'] try: html = cm.get_data(url) except Exception: print 'Error occured: %s' % url dump_data = {'level': 0, 'time': cm.format_time(), 'data': {'url': url}, 'brand_id': data['brand_id']} cm.dump(dump_data) return [] country_list = [] for item in pq(html)('#country option[value!="reset"]'): d = data.copy() d['country_id'] = string.atoi(item.attrib['value']) country_e = cm.html2plain(item.text).strip().upper() ret = gs.look_up(country_e, 1) if ret is not None: country_e = ret['name_e'] d['country_e'] = country_e country_list.append(d) return country_list
def proc_store(sub, data): entry = cm.init_store_entry(data['brand_id'], data['brandname_e'], data['brandname_c']) entry[cm.country_e] = data['country'] m1 = re.search(ur'<strong class="name" itemprop="name">([^<>]+)</strong>', sub) if m1 is not None: entry[cm.store_class] = m1.group(1).strip() m1 = re.search(ur'<span itemprop="address"', sub) if m1 is not None: addr_sub = cm.extract_closure(sub[m1.start():], ur'<span\b', ur'</span>')[0] m2 = re.search(ur'<span itemprop="postal-code">([^<>]+)</span>', addr_sub, re.S) if m2 is not None: entry[cm.zip_code] = m2.group(1).strip() m2 = re.search(ur'<span itemprop="locality">([^<>]+)</span>', addr_sub, re.S) if m2 is not None: entry[cm.city_e] = cm.html2plain(m2.group(1)).strip().upper() entry[cm.addr_e] = cm.reformat_addr(addr_sub) m2 = re.search(ur'<span itemprop="tel">([^<>]+)</span>', sub, re.S) if m2 is not None: entry[cm.tel] = m2.group(1).strip() m2 = re.search(ur'Fax\b(.+?)</p>', sub) if m2 is not None: entry[cm.fax] = cm.extract_tel(m2.group(1)) m2 = re.search(ur'<a href="([^"]+)"[^<>]+itemprop="url"\s*>\s*Find on a map\s*</a>', sub) if m2 is not None: geo_url = data['host'] + urllib.quote(m2.group(1).encode('utf-8')) param = {'brepairs': True, 'restrictedtemplate': 2, 'bretailers': True, 'bshops': True, 'brepairs': True} try: geo_body = cm.get_data(geo_url, param) m3 = re.search(ur'maps\.google\.com/maps\?daddr\s*=\s*(-?\d+\.\d+)\s*,\s*(-?\d+\.\d+)', geo_body) if m3 is not None: entry[cm.lat] = string.atof(m3.group(1)) entry[cm.lng] = string.atof(m3.group(2)) except Exception, e: cm.dump('Error in fetching geo info: %s, %s' % (geo_url, param), log_name)
def fetch_cities(data): url = data['data_url'] param = {'countries': data['country_code'], 'form_build_id': data['form_build_id'], 'form_id': 'cartierfo_generic_store_locator_search_form', '_triggering_element_name': 'countries'} try: body, cookie = cm.post_data_cookie(url, param, cookie=data['cookie']) except Exception: cm.dump('Error in fetching cities: %s, %s' % (url, param), log_name) return [] if cookie is not None: data['cookie'] = cookie raw = json.loads(body) body = None for item in raw: if 'data' in item and item['data'] != '': body = item['data'] break if body is None: cm.dump('Error in fetching cities: %s, %s' % (url, param), log_name) return [] # body = body.decode('unicode_escape') start = body.find(ur'<select id="edit-cities"') if start == -1: cm.dump('Error in fetching cities: %s, %s' % (url, param), log_name) return [] body = cm.extract_closure(body[start:], ur'<select\b', ur'</select>')[0] results = [] for m in re.findall(ur'<option.+?value="([^"]+?)".*?>.+?</option>', body): d = data.copy() d['city'] = cm.html2plain(m) results.append(d) print 'Country: %s, City: %s' % (data['country'], d['city'])
def fetch_countries(data):
    """Query the country list (English labels) and return per-country copies of *data*."""
    url = data['url']
    param = {'lang': 'en_GB'}
    try:
        body = cm.get_data(url, param)
    except Exception, e:
        cm.dump('Error in fetching countries: %s, %s' % (url, param), log_name)
        return []
    results = []
    for item in json.loads(body):
        # The payload may contain null placeholders.
        if item is None:
            continue
        d = data.copy()
        d['country_code'] = item['isocode']
        d['country'] = cm.html2plain(item['Translation']['en_GB']['label']).strip().upper()
        d['country_id'] = item['id']
        results.append(d)
    return results


# Module-level counter of processed cities, incremented by fetch_stores below.
tot_processed = 0


def fetch_stores(data):
    # NOTE(review): this definition is truncated at this point in the chunk —
    # its body continues beyond the visible source. Also, 'Processint' in the
    # log message below looks like a typo for 'Processing' (left unchanged
    # since it is a runtime string).
    store_list = []
    global tot_processed
    tot_processed += 1
    cm.dump('Processint city #%d' % tot_processed, log_name)
body = cm.get_data(url, param) except Exception, e: cm.dump("Error in fetching stores: %s, %s" % (url, param), log_name) return () tree = et.fromstring(body.encode("utf-8")) store_list = [] for store in tree.iter("poi"): entry = cm.init_store_entry(data["brand_id"], data["brandname_e"], data["brandname_c"]) val = store.getiterator("uid")[0].text if val in store_map: continue store_map[val] = entry val = store.getiterator("name")[0].text entry[cm.name_e] = cm.html2plain(val).strip() if val else "" addr_list = [] for idx in xrange(1, 3): val = store.getiterator("address%d" % idx)[0].text if val: val = cm.reformat_addr(val) if val != "": addr_list.append(val) entry[cm.addr_e] = ", ".join(addr_list) val = store.getiterator("city")[0].text entry[cm.city_e] = cm.extract_city(val)[0] if val else "" val = store.getiterator("province")[0].text entry[cm.province_e] = cm.html2plain(val).strip().upper() if val else "" if entry[cm.province_e] == "":
try: body = cm.get_data(url, {'display_country': 'CN'}) except Exception, e: cm.dump('Error in fetching countries: %s' % url, log_name) return () m = re.search(ur'<select class="country-selector[^<>]+>(.+?)</select>', body, re.S) if m is None: cm.dump('Error in fetching countries: %s' % url, log_name) return () sub = m.group(1) results = [] for m in re.findall(ur'<option value="([A-Z]{2})"[^<>]+>([^<>]+)</option>', sub): d = data.copy() d['country_code'] = m[0] d['country'] = cm.html2plain(m[1]).strip().upper() # if m[0] == 'US': results.append(d) return tuple(results) def fetch_store_detail(data): url = data['data_url'] param = {'format': 'JSON', 'location_id': data['store_id'], 'type': 'location'} try: s = json.loads(cm.get_data(url, param))['locations'][0] except Exception, e: cm.dump('Error in fetching stores: %s, %s' % (url, param), log_name) return () store_id = data['store_id']