def fetch_stores(db, data, logger):
    """
    Fetch the stores of one city and insert each of them into the database.

    :param db: database handle, passed through to cm.insert_record.
    :param data: dict read for 'post_url', 'city_id', 'brand_id',
        'brandname_e', 'brandname_c', 'country_e' and 'city_e'.
    :param logger: logger used for progress and error messages.
    :return: list of the store entries found; an empty list when the
        request fails or returns an empty body.
    """
    url = data['post_url']
    try:
        html = cm.post_data(url, {'pid': data['city_id'], 'lang': 'en',
                                  'action': 'popola_box_DX'})
        if html.strip() == u'':
            logger.error(
                unicode.format(u'Failed to fetch stores for city {0}',
                               data['city_id']))
            return []
        body = pq(html)
    except Exception:
        # Report through the logger that was passed in (not stdout) and
        # keep the dump record of the failed URL.
        logger.error('Error occurred in fetching stores: %s' % url)
        cm.dump({
            'level': 2,
            'time': cm.format_time(),
            'data': {'url': url},
            'brand_id': data['brand_id'],
        })
        return []

    store_list = []
    for item in (pq(temp) for temp in body('a[href]')):
        title = item('h3.titleShop')
        if len(title) == 0:
            # Anchor without a store title: not a store box, skip it
            # instead of aborting the whole city with an IndexError.
            continue

        entry = cm.init_store_entry(data['brand_id'], data['brandname_e'],
                                    data['brandname_c'])
        entry[cm.url] = item[0].attrib['href']
        # .text is None for an empty element; treat that as an empty name.
        entry[cm.name_e] = (title[0].text or u'').strip()

        terms = cm.reformat_addr(
            unicode(item('div.txtBoxSingleStore p.lineHeight14'))).split(',')
        # When the last comma-separated term is a phone number, move it
        # out of the address.
        tel = cm.extract_tel(terms[-1])
        if tel != '':
            terms = terms[:-1]
            entry[cm.tel] = tel
        entry[cm.addr_e] = u', '.join([v.strip() for v in terms])

        # NOTE(review): these two use literal keys while the log line below
        # reads cm.country_e -- presumably the same key; confirm against cm.
        entry['country_e'] = data['country_e']
        entry['city_e'] = data['city_e']
        gs.field_sense(entry)

        logger.info(
            '(%s / %d) Found store: %s, %s (%s, %s)' %
            (data['brandname_e'], data['brand_id'], entry[cm.name_e],
             entry[cm.addr_e], entry[cm.country_e], entry[cm.continent_e]))
        store_list.append(entry)
        cm.insert_record(db, entry, 'spider_stores.stores')
    return store_list
def func(s):
    """
    Turn one raw store record into a store entry and insert it.

    Closure: reads data, db and logger from the enclosing scope.

    :param s: mapping for one store; read for 'id', 'latitude',
        'longitude', 'email', 'fax', 'ischanel', 'postypename',
        'openinghours', 'phone', 'translations', 'website', 'zipcode'
        and, optionally, 'country_id'.
    :return: None in every case. The record is skipped when its id is
        already in data['store_list'] or when no country can be found.
    """
    entry = cm.init_store_entry(data['brand_id'], data['brandname_e'],
                                data['brandname_c'])
    entry[cm.native_id] = int(s['id'])
    if entry[cm.native_id] in data['store_list']:
        # Already handled.
        return

    entry[cm.lat] = float(s['latitude'])
    entry[cm.lng] = float(s['longitude'])
    entry[cm.email] = s['email']
    entry[cm.fax] = s['fax']

    chanel_flag = 'YES' if s['ischanel'] != 0 else 'NO'
    entry[cm.store_class] = ' | '.join(
        ('ISCHANEL:{0}'.format(chanel_flag), s['postypename']))

    try:
        entry[cm.hours] = ' | '.join(
            [':'.join((slot['day'],
                       slot['opening'] if 'opening' in slot else ''))
             for slot in s['openinghours']])
    except TypeError:
        # Opening hours that cannot be iterated/joined are left unset.
        pass

    entry[cm.tel] = s['phone']

    trans = s['translations'][0]
    address_parts = [trans[key] for key in ('address1', 'address2')]
    entry[cm.addr_e] = cm.html2plain(
        ', '.join([part for part in address_parts if part]))
    entry[cm.city_e] = cm.html2plain(trans['cityname'].strip().upper())
    entry[cm.name_e] = cm.html2plain(trans['name'])
    entry[cm.province_e] = cm.html2plain(trans['statename']).strip().upper()
    entry[cm.store_type] = ', '.join(
        product['name'] for product in trans['products'])
    entry[cm.url] = s['website']
    entry[cm.zip_code] = s['zipcode']

    if 'country_id' in s:
        country_id = s['country_id']
    else:
        country_id = None

    if country_id and country_id in data['country_map']:
        entry[cm.country_e] = data['country_map'][country_id]
    else:
        # Unknown country id: ask the geocoder, then the operator.
        ret = gs.geocode2(
            latlng='{0},{1}'.format(entry[cm.lat], entry[cm.lng]),
            logger=logger)
        country_e = None
        if len(ret) > 0:
            country_e = next(
                (component['long_name'].strip().upper()
                 for component in ret[0]['address_components']
                 if 'country' in component['types']),
                None)
        if not country_e:
            prompt = u'INPUT THE COUNTRY NAME FOR {0} AT {1}, {2}'.format(
                entry[cm.city_e], entry[cm.lat], entry[cm.lng])
            country_e = raw_input(prompt).decode('utf-8')
        if not country_e:
            # Cannot determine the country name; give up on this record.
            return
        entry[cm.country_e] = country_e
        if country_id:
            # Remember the answer for later records with the same id.
            data['country_map'][country_id] = country_e

    message = '(%s / %d) Found store: %s, %s (%s, %s)' % (
        data['brandname_e'], data['brand_id'], entry[cm.name_e],
        entry[cm.addr_e], entry[cm.country_e], entry[cm.continent_e])
    logger.info(message)
    cm.insert_record(db, entry, data['table'])
    data['store_list'].add(entry[cm.native_id])
def func(item):
    """
    Build a store entry from one store element and insert it.

    Closure: reads url, data, db and logger from the enclosing scope.

    The entry's native id is an MD5 fingerprint of the page url plus the
    map link (when the element has one) or the formatted address
    (otherwise).

    :param item: callable selection for one store; queried for 'h6', 'p'
        and 'a.track_map[href]'.
    :return: the inserted entry, or None when the fingerprint is already in
        data['store_list'].
    """

    def _as_bytes(value):
        # md5.update() implicitly ASCII-encodes unicode input and raises
        # UnicodeEncodeError on non-ASCII text. Encoding as UTF-8 gives the
        # same bytes for ASCII input, so existing fingerprints are stable.
        return value.encode('utf-8') if isinstance(value, unicode) else value

    entry = cm.init_store_entry(data['brand_id'], data['brandname_e'],
                                data['brandname_c'])
    entry[cm.name_e] = cm.html2plain(item('h6')[0].text).strip()

    addr_sub = unicode(pq(item('p')[0]))
    addr_list = [term.strip()
                 for term in cm.reformat_addr(addr_sub).split(',')]
    # When the last term is a phone number, move it out of the address.
    tel = cm.extract_tel(addr_list[-1])
    if tel != '':
        entry[cm.tel] = tel
        del addr_list[-1]
    entry[cm.addr_e] = ', '.join(addr_list)

    map_links = item('a.track_map[href]')
    m = hashlib.md5()
    m.update(_as_bytes(url))
    if len(map_links) > 0:
        map_ref = map_links[0].attrib['href']
        m.update(_as_bytes(map_ref))
        # Keep the map link's q= parameter as a geocoding hint.
        m_query = re.search(r'q=([^;]+?)&', cm.html2plain(map_ref))
        if m_query:
            entry['geo_query_param'] = m_query.group(1).replace('+', ' ')
    else:
        m.update(_as_bytes(entry[cm.addr_e]))

    entry[cm.native_id] = m.hexdigest()
    if entry[cm.native_id] in data['store_list']:
        return

    entry[cm.country_e] = data['country']
    gs.field_sense(entry)
    ret = gs.addr_sense(entry[cm.addr_e])
    # Fill country / province / city only where still empty.
    if ret[0] is not None and entry[cm.country_e] == '':
        entry[cm.country_e] = ret[0]
    if ret[1] is not None and entry[cm.province_e] == '':
        entry[cm.province_e] = ret[1]
    if ret[2] is not None and entry[cm.city_e] == '':
        entry[cm.city_e] = ret[2]
    gs.field_sense(entry)

    logger.info(
        '(%s / %d) Found store: %s, %s (%s, %s)' %
        (data['brandname_e'], data['brand_id'], entry[cm.name_e],
         entry[cm.addr_e], entry[cm.country_e], entry[cm.continent_e]))
    cm.insert_record(db, entry, data['table'])
    return entry
def fetch_stores(db, data, logger):
    """
    Scrape the Paul & Joe store list page and insert every store found.

    :param db: database handle, passed through to cm.insert_record.
    :param data: dict read for 'brand_id', 'brandname_e' and 'brandname_c'.
    :param logger: logger that receives one info line per store.
    :return: tuple of the store entries, in page order.
    """
    page = pq(url='http://www.paulandjoe.com/en/ozcms/stores/list/?country_id=&postcode=')
    found = []

    # Country headings and their store lists are parallel selections.
    country_links = page('#store_list>li>a')
    country_blocks = page('#store_list>li>ul')
    assert (len(country_links) == len(country_blocks))

    for country_link, country_block in zip(country_links, country_blocks):
        country = country_link.text.strip().upper()
        markers = pq(country_block)('a.marker-store')
        infos = pq(country_block)('span.store-infos')
        assert (len(markers) == len(infos))

        for marker, info in zip(markers, infos):
            entry = cm.init_store_entry(data['brand_id'],
                                        data['brandname_e'],
                                        data['brandname_c'])

            raw_lat = marker.attrib['data-latitude']
            lat = float(raw_lat) if raw_lat else None
            raw_lng = marker.attrib['data-longitude']
            lng = float(raw_lng) if raw_lng else None
            # Truthiness test: a missing or zero coordinate leaves both unset.
            if lat and lng:
                entry[cm.lat] = lat
                entry[cm.lng] = lng

            entry[cm.name_e] = marker.text
            entry[cm.addr_e] = cm.reformat_addr(str(pq(info)))
            entry[cm.country_e] = country

            gs.field_sense(entry)
            ret = gs.addr_sense(entry[cm.addr_e])
            # Fill country / province / city only where still empty.
            for idx, field in enumerate(
                    (cm.country_e, cm.province_e, cm.city_e)):
                if ret[idx] is not None and entry[field] == '':
                    entry[field] = ret[idx]
            gs.field_sense(entry)

            summary = (data['brandname_e'], data['brand_id'],
                       entry[cm.name_e], entry[cm.addr_e],
                       entry[cm.country_e], entry[cm.continent_e])
            logger.info('(%s/%d) Found store: %s, %s (%s, %s)' % summary)
            found.append(entry)
            cm.insert_record(db, entry, 'spider_stores.stores')

    return tuple(found)
def fetch_stores(db, data, logger):
    """
    Extract the stores from the ``var markers = [...]`` array embedded in
    the page at data['url'] and insert them into the database.

    :param db: database handle, passed through to cm.insert_record.
    :param data: dict read for 'brand_id', 'brandname_e', 'brandname_c'
        and 'url'.
    :param logger: logger used for progress and error messages.
    :return: tuple of the store entries; an empty tuple when the markers
        array cannot be located in the page.
    """
    # The English name goes in the second init_store_entry slot, as at the
    # other call sites (this used to read 'brandname_c' by mistake).
    brand_id, brand_name, url = (data[key] for key in
                                 ('brand_id', 'brandname_e', 'url'))
    body = cm.get_data(url)

    # Cut the page down to the JSON array literal: from the '[' that
    # follows "var markers =" up to the ']' that precedes the first ';'.
    m1 = re.search(r'var\s+markers\s*=\s*\[', body)
    if not m1:
        logger.error(
            unicode.format(u'Error in finding stores for {0}:{1}',
                           brand_id, brand_name))
        return ()
    body = body[m1.end() - 1:]
    m2 = re.search(r'\]\s*;', body)
    if not m2:
        logger.error(
            unicode.format(u'Error in finding stores for {0}:{1}',
                           brand_id, brand_name))
        return ()
    raw = json.loads(body[:m2.end() - 1])

    # .strip() on a JSON null raises AttributeError, so it is treated like
    # a missing key for the optional fields below.
    optional_errors = (KeyError, TypeError, AttributeError)

    store_list = []
    for s in raw:
        entry = cm.init_store_entry(brand_id, brand_name,
                                    data['brandname_c'])
        try:
            entry[cm.lat], entry[cm.lng] = (float(s['location'][idx])
                                            for idx in (0, 1))
        except (KeyError, IndexError, ValueError, TypeError):
            pass

        content = s['content']

        # 'analytics_label' is expected to look like "<country>-<city>";
        # skip a malformed record rather than abort the whole run.
        try:
            tmp_list = content['analytics_label'].split('-')
            country_label, city_label = tmp_list[0], tmp_list[1]
        except (KeyError, TypeError, AttributeError, IndexError):
            logger.error(
                unicode.format(
                    u'Skipping store with malformed analytics_label for {0}:{1}',
                    brand_id, brand_name))
            continue
        entry[cm.country_e] = country_label
        entry[cm.city_e] = cm.extract_city(city_label)[0]

        try:
            entry[cm.name_e] = cm.html2plain(content['title']).strip()
        except optional_errors:
            pass
        try:
            entry[cm.addr_e] = cm.reformat_addr(content['address']).strip()
        except optional_errors:
            pass
        for field, key in ((cm.fax, 'fax'), (cm.tel, 'phone'),
                           (cm.email, 'mail')):
            try:
                entry[field] = content[key].strip()
            except optional_errors:
                pass
        try:
            entry[cm.url] = (u'http://en.longchamp.com/store/map' +
                             content['url'].strip())
        except optional_errors:
            pass
        try:
            # 'zipcode_town' holds zip code and town; drop the town part.
            entry[cm.zip_code] = cm.html2plain(
                content['zipcode_town']).replace(city_label, '').strip()
        except optional_errors:
            pass

        gs.field_sense(entry)
        ret = gs.addr_sense(entry[cm.addr_e], entry[cm.country_e])
        # Fill province / city only where still empty.
        if ret[1] is not None and entry[cm.province_e] == '':
            entry[cm.province_e] = ret[1]
        if ret[2] is not None and entry[cm.city_e] == '':
            entry[cm.city_e] = ret[2]
        gs.field_sense(entry)

        logger.info(
            unicode.format(
                u'{0}:{1} FOUND STORE: {2}, {3}, ({4}, {5}, {6})',
                data['brand_id'], data['brandname_e'],
                *(entry[key] for key in (cm.name_e, cm.addr_e, cm.city_e,
                                         cm.country_e, cm.continent_e))))
        cm.insert_record(db, entry, 'spider_stores.stores')
        store_list.append(entry)
    return tuple(store_list)
entry[cm.fax] = re.sub(pat_fax, '', term) elif re.search(pat_email, term): entry[cm.email] = re.sub(pat_email, '', term) gs.field_sense(entry) ret = gs.addr_sense(entry[cm.addr_e], entry[cm.country_e]) if ret[1] is not None and entry[cm.province_e] == '': entry[cm.province_e] = ret[1] if ret[2] is not None and entry[cm.city_e] == '': entry[cm.city_e] = ret[2] gs.field_sense(entry) logger.info('(%s / %d) Found store: %s, %s (%s, %s)' % (data['brandname_e'], data['brand_id'], entry[cm.name_e], entry[cm.addr_e], entry[cm.country_e], entry[cm.continent_e])) cm.insert_record(db, entry, data['table']) return () def fetch_cities(db, data, logger): url = data['url'] param = {'IsFooterForm': 'true', 'CurrentCountryID': data['country_code']} if data['state_code']: param['CurrentRegionID'] = data['state_code'] try: body, data['cookie'] = cm.get_data_cookie(url, param, cookie=data['cookie']) q = pq(body) except Exception, e: # cm.dump('Error in fetching cities: %s, %s' % (url, param), log_name) return ()
entry[cm.hours] = ', '.join(hours_list) gs.field_sense(entry) if entry[cm.addr_e]: ret = gs.addr_sense(entry[cm.addr_e], entry[cm.country_e]) if ret[1] is not None and entry[cm.province_e] == '': entry[cm.province_e] = ret[1] if ret[2] is not None and entry[cm.city_e] == '': entry[cm.city_e] = ret[2] gs.field_sense(entry) logger.info('(%s / %d) Found store: %s, %s (%s, %s, %s)' % (data['brandname_e'], data['brand_id'], entry[cm.name_e], entry[cm.addr_e], entry[cm.city_e], entry[cm.country_e], entry[cm.continent_e])) cm.insert_record(db, entry, 'spider_stores.stores') store_list.append(entry) except (IndexError, TypeError) as e: logger.error(traceback.format_exc()) continue return tuple(store_list) def fetch(db, data=None, user='******', passwd=''): logging.config.fileConfig('ca.cfg') logger = logging.getLogger('firenzeLogger') logger.info(u'ca STARTED') def func(data, level):
def _split_phone_fax(text):
    """Return (tel, fax) from a 'phone: ... fax: ...' string; None where the label is absent."""
    flags = re.I | re.S
    # Each lazy capture is bounded by the other label or the end of the
    # string. A lazy group with nothing after it always matches ''.
    tel = re.search(r'phone:(.*?)(?=fax:|\Z)', text, flags)
    fax = re.search(r'fax:(.*?)(?=phone:|\Z)', text, flags)
    return (tel.group(1).strip() if tel is not None else None,
            fax.group(1).strip() if fax is not None else None)


def fetch_store_details(db, data, logger):
    """
    Fetch one store's detail page, build its entry and insert it.

    Most fields come from the JSON in the 'data-shop' attribute of the
    page's 'div.gmap_info_box' element (its 'table' member).

    :param db: database handle, passed through to cm.insert_record.
    :param data: dict read for 'store_url', 'brand_id', 'brandname_e',
        'brandname_c', 'city' and 'country'.
    :param logger: logger used for progress and error messages.
    :return: one-element list holding the entry, or an empty list when the
        page cannot be fetched or parsed.
    """
    url = data['store_url']
    try:
        body = pq(cm.get_data(url))
    except Exception:
        logger.error('Error in fetching store details: %s' % url)
        return []

    entry = cm.init_store_entry(data['brand_id'], data['brandname_e'],
                                data['brandname_c'])
    entry[cm.store_type] = ', '.join(
        temp.text.strip() for temp in body('li.availability li'))
    raw = json.loads(
        body('div.gmap_info_box')[0].attrib['data-shop'])['table']

    entry[cm.name_e] = cm.html2plain(raw['name'])
    entry[cm.city_e] = data['city'].strip().upper()
    entry[cm.country_e] = data['country'].strip().upper()
    # The address comes from the JSON record; the page's 'p.address' text
    # was always overwritten by it, so it is no longer read.
    entry[cm.addr_e] = cm.reformat_addr(raw['address'])

    # raw['phone'] carries both numbers in one string; a null is treated
    # as empty.
    tel, fax = _split_phone_fax(raw['phone'] or u'')
    if tel is not None:
        entry[cm.tel] = tel
    if fax is not None:
        entry[cm.fax] = fax

    entry[cm.hours] = raw['hours']
    if raw['lat'] is not None and raw['lat'] != '':
        entry[cm.lat] = float(raw['lat'])
    if raw['lng'] is not None and raw['lng'] != '':
        entry[cm.lng] = float(raw['lng'])

    gs.field_sense(entry)
    ret = gs.addr_sense(entry[cm.addr_e], entry[cm.country_e])
    if ret[1] is not None:
        entry[cm.province_e] = ret[1]
    gs.field_sense(entry)

    logger.info(
        '(%s / %d) Found store: %s, %s (%s, %s)' %
        (data['brandname_e'], data['brand_id'], entry[cm.name_e],
         entry[cm.addr_e], entry[cm.country_e], entry[cm.continent_e]))
    cm.insert_record(db, entry, 'spider_stores.stores')
    return [entry]