from spider.driver.base.field import Fieldlist,Field,FieldName from spider.driver.base.tabsetup import TabSetup from spider.driver.base.page import Page,NextPageCssSelectorSetup,PageFunc,NextPageLinkTextSetup from spider.driver.base.listcssselector import ListCssSelector from spider.driver.base.mongodb import Mongodb from spider.driver.travel.core.traveldriver import TravelDriver import time from pyquery import PyQuery import json import re fl_shop1 = Fieldlist( Field(fieldname=FieldName.SHOP_NAME, css_selector='div > div.ct-text > h3 > a', is_debug=True), Field(fieldname=FieldName.SHOP_URL,css_selector='div > div.ct-text > h3 > a',attr='href'), Field(fieldname=FieldName.SHOP_IMG, css_selector=' div > div.flt1 > a > img', attr='src'), Field(fieldname=FieldName.SHOP_ADDRESS, css_selector='div > div.ct-text > ul > li:nth-child(1) > a'), # Field(fieldname=FieldName.SHOP_GRADE,css_selector='div.search_ticket_assess > span.grades > em'), #正则表达式不一样 Field(fieldname=FieldName.SHOP_COMMENT_NUM,css_selector='div > div.ct-text > ul > li:nth-child(2) > a', regex=r'^[^\(]*\(([\d]+)[^\)\d]*\)$', repl=r'\1'), Field(fieldname=FieldName.SHOP_FEATURE, css_selector='div > ul > li:nth-child(1) > div > div.ct-text > p'), ) def get_shop_ticket(): print(111) def get_shop_info(): print(222) fl_shop2 = Fieldlist( Field(fieldname=FieldName.SHOP_PRICE, css_selector='body > div.container > div:nth-child(6) > div.mod.mod-detail > dl:nth-child(4) > dd > div:nth-child(1) > div', pause_time=3, is_focus=True, is_info=True), Field(fieldname=FieldName.SHOP_TIME, css_selector='body > div.container > div:nth-child(6) > div.mod.mod-detail > dl:nth-child(5) > dd > div:nth-child(1)', is_focus=True), #Field(fieldname=FieldName.SHOP_SERVICE,css_selector='3) > div.main-bd > div > div.brief-box.clearfix > div.brief-right > ul > li.promise',attr='innerHTML', filter_func=get_shop_service, is_focus=True), #门票信息尚有问题
# Shop fields: name, price, URL, image, address, grade and comment count come
# from CSS selectors; feature and rate are produced by filter functions.
fl_shop1 = Fieldlist(
    Field(
        fieldname=FieldName.SHOP_NAME,
        css_selector='div > div.mp-sight-info > a > div.mp-sight-detail > h3',
        is_info=True,
    ),
    # Sample selectors noted from the page:
    #\31 302 > div:nth-child(2) > div:nth-child(3) > div > div:nth-child(1) > span:nth-child(2)
    #\32 0808 > div:nth-child(2) > div:nth-child(3) > div > div:nth-child(1) > span:nth-child(2)
    Field(
        fieldname=FieldName.SHOP_PRICE,
        css_selector='div > div.mp-sight-info > a > div.mp-sight-detail > div.mp-sight-pricecon > div.mp-sight-price > em',
        is_info=True,
    ),
    # Slightly problematic.
    Field(
        fieldname=FieldName.SHOP_URL,
        css_selector='div > div.mp-sight-info > a',
        attr='href',
        is_debug=True,
        is_info=True,
    ),
    # The img field still has some issues.
    #\33 6822720 > div:nth-child(1) > div
    Field(
        fieldname=FieldName.SHOP_IMG,
        css_selector='div > div.mp-sight-info > a > div.mp-sight-imgcon > img',
        attr='src',
        is_info=True,
    ),
    Field(
        fieldname=FieldName.SHOP_ADDRESS,
        css_selector='div > div.mp-sight-info > a > div.mp-sight-detail > div.mp-sight-pricecon > div.mp-sight-location > span',
        is_info=True,
    ),
    # A conversion should be done here.
    #\34 187 > div:nth-child(2) > div:nth-child(3) > div > div:nth-child(2) > span:nth-child(1)
    Field(
        fieldname=FieldName.SHOP_GRADE,
        css_selector='div > div.mp-sight-info > a > div.mp-sight-detail > div.mp-sight-comments > span.mpf-starlevel > span.mpg-iconfont.mpf-starlevel-gain',
        attr='data-score',
        is_info=True,
    ),
    # The regular-expression usage is problematic.
    Field(
        fieldname=FieldName.SHOP_COMMENT_NUM,
        css_selector='div > div.mp-sight-info > a > div.mp-sight-detail > div.mp-sight-comments > span.mp-comments-totalnum',
        is_info=True,
    ),
    # No shop_feature on this page.
    Field(
        fieldname=FieldName.SHOP_FEATURE,
        css_selector='',
        filter_func=get_shop_feature,
        is_info=True,
    ),
    Field(
        fieldname=FieldName.SHOP_RATE,
        css_selector='',
        filter_func=get_shop_rate,
        is_info=True,
    ),
)
num = re.findall(r'[\d]{1,10}',_str) return str(num[0]) def get_shop_grade(self,_str): return "0.0" def get_shop_price(self,_str): return "0.0" def get_shop_rate(self,_str): return "" fl_shop1 = Fieldlist( Field(fieldname=FieldName.SHOP_NAME,css_selector='div > div.ct-text > h3 > a',is_debug=True), Field(fieldname=FieldName.SHOP_RATE,css_selector='',is_info=True,filter_func=get_shop_rate), Field(fieldname=FieldName.SHOP_URL,css_selector='div > div.ct-text > h3 > a',attr='href',is_info=True), Field(fieldname=FieldName.SHOP_IMG, css_selector=' div > div.flt1 > a > img', attr='src',is_info=True), Field(fieldname=FieldName.SHOP_ADDRESS, css_selector='div > div.ct-text > ul > li:nth-child(1) > a', is_info=True), Field(fieldname=FieldName.SHOP_GRADE,css_selector='',filter_func=get_shop_grade), #正则表达式不一样 Field(fieldname=FieldName.SHOP_COMMENT_NUM,css_selector='div > div.ct-text > ul > li:nth-child(2) > a',filter_func=get_comment_num, is_info=True), Field(fieldname=FieldName.SHOP_FEATURE, css_selector='div > div.ct-text > p',is_info=True), Field(fieldname=FieldName.SHOP_PRICE,css_selector= '',filter_func=get_shop_price, is_info=True) ) fl_shop2 = Fieldlist( ) page_shop_1 = Page(name='马蜂窝景点店铺列表页面', fieldlist=fl_shop1, listcssselector=ListCssSelector(list_css_selector='#_j_search_result_left > div:nth-child(1) > div > ul > li',), mongodb=Mongodb(db=TravelDriver.db, collection=TravelDriver.shop_collection),is_save=True) page_shop_2 = Page() page_shop_2 = Page(name='马蜂窝景点店铺详情页面', fieldlist=fl_shop2, tabsetup=TabSetup(click_css_selector='div > div.ct-text > h3 > a'), mongodb=Mongodb(db=TravelDriver.db,collection=TravelDriver.shop_collection))
# Shop fields: name, price, URL, image, address and grade come from CSS
# selectors; comment count, feature, rate and comment URL go through filters.
fl_shop1 = Fieldlist(
    Field(
        fieldname=FieldName.SHOP_NAME,
        css_selector='a > div.ml-pro-info > p',
    ),
    # Sample selectors noted from the page:
    #\31 302 > div:nth-child(2) > div:nth-child(3) > div > div:nth-child(1) > span:nth-child(2)
    #\32 0808 > div:nth-child(2) > div:nth-child(3) > div > div:nth-child(1) > span:nth-child(2)
    Field(
        fieldname=FieldName.SHOP_PRICE,
        css_selector=' a > div.ml-pro-info > div.ml-pro-price > span.price > i:nth-child(2)',
        is_info=True,
    ),
    # Slightly problematic.
    Field(
        fieldname=FieldName.SHOP_URL,
        css_selector='a',
        attr='href',
        is_debug=True,
        is_info=True,
    ),
    # The img field still has some issues.
    #\33 6822720 > div:nth-child(1) > div
    Field(
        fieldname=FieldName.SHOP_IMG,
        css_selector='a > div.ml-pro-img > img',
        attr='src',
        is_info=True,
    ),
    Field(
        fieldname=FieldName.SHOP_ADDRESS,
        css_selector='a > div.ml-pro-info > div.orderNum.adress > span:nth-child(1)',
        is_info=True,
    ),
    # A conversion should be done here.
    #\34 187 > div:nth-child(2) > div:nth-child(3) > div > div:nth-child(2) > span:nth-child(1)
    Field(
        fieldname=FieldName.SHOP_GRADE,
        css_selector='a > div.ml-pro-info > div:nth-child(3) > span',
        is_info=True,
    ),
    # The regular-expression usage is problematic.
    Field(
        fieldname=FieldName.SHOP_COMMENT_NUM,
        css_selector='',
        filter_func=get_comment_num,
        is_info=True,
    ),
    # No shop_feature on this page.
    Field(
        fieldname=FieldName.SHOP_FEATURE,
        css_selector='',
        filter_func=get_shop_feature,
        is_info=True,
    ),
    Field(
        fieldname=FieldName.SHOP_RATE,
        css_selector='',
        filter_func=get_shop_rate,
        is_info=True,
    ),
    Field(
        fieldname=FieldName.SHOP_COMMENT_URL,
        css_selector='a',
        attr='href',
        filter_func=get_comment_url,
        is_info=True,
    ),
)
# Shop fields: name, URL, image, address, price, comment count and grade come
# from CSS selectors; rate and feature are produced by filter functions.
fl_shop1 = Fieldlist(
    Field(
        fieldname=FieldName.SHOP_NAME,
        css_selector=' a.e_title.js_list_name',
        is_debug=True,
    ),
    Field(
        fieldname=FieldName.SHOP_URL,
        css_selector='a.e_title.js_list_name',
        attr='href',
        is_info=True,
    ),
    Field(
        fieldname=FieldName.SHOP_IMG,
        css_selector='a > img:nth-child(1)',
        attr='src',
        is_info=True,
    ),
    Field(
        fieldname=FieldName.SHOP_ADDRESS,
        css_selector='div > div > div.clrfix > div.item_hotel_info > div.item_hotel_bsinfo > table > tbody > tr > td.item_hotel_name > div > p > span > em',
        is_info=True,
    ),
    Field(
        fieldname=FieldName.SHOP_PRICE,
        css_selector=' div > div > div.clrfix > div.item_hotel_info > div.hotel_price > div > div > div > p > a > b',
        is_info=True,
    ),
    # The regular expression is different here; minor issue.
    Field(
        fieldname=FieldName.SHOP_COMMENT_NUM,
        css_selector=' div > div > div.clrfix > div.item_hotel_info > div.item_hotel_bsinfo > table > tbody > tr > td.item_hotel_name > div > div.level.levelmargin > a.level_comment.level_commentbd.js_list_usercomcount',
        is_info=True,
    ),
    Field(
        fieldname=FieldName.SHOP_GRADE,
        css_selector='div > div > div.clrfix > div.item_hotel_info > div.item_hotel_bsinfo > table > tbody > tr > td.item_hotel_name > div > div.level.levelmargin > a.level_score.js_list_score > strong',
        is_info=True,
    ),
    Field(
        fieldname=FieldName.SHOP_RATE,
        css_selector='',
        filter_func=get_shop_rate,
        is_info=True,
    ),
    Field(
        fieldname=FieldName.SHOP_FEATURE,
        filter_func=get_shop_feature,
        css_selector='',
        is_info=True,
    ),
)
from spider.driver.base.tabsetup import TabSetup
from spider.driver.base.listcssselector import ListCssSelector
from spider.driver.base.mongodb import Mongodb
import re
import time
import json
from pyquery import PyQuery

# Hotel list fields. Several values are cleaned with a regex (and an optional
# replacement) applied to the selected text or attribute.
fl_shop1 = Fieldlist(
    Field(
        fieldname=FieldName.SHOP_NAME,
        css_selector='li.hotel_item_name > h2 > a',
        regex=r'^[\d]*(.*)$',
        repl=r'\1',
    ),
    Field(
        fieldname=FieldName.SHOP_URL,
        css_selector='li.hotel_item_name > h2 > a',
        attr='href',
        regex=r'^([^\?]*)?.*$',
        repl=r'\1',
    ),
    Field(
        fieldname=FieldName.SHOP_ID,
        css_selector='li.hotel_item_name > h2 > a',
        attr='href',
        regex=r'^[^\?\d]*([\d]*).html?.*$',
        repl=r'\1',
    ),
    Field(
        fieldname=FieldName.SHOP_IMG,
        css_selector='li.pic_medal > div > a > img',
        attr='src',
    ),
    Field(
        fieldname=FieldName.SHOP_ADDRESS,
        css_selector='li.hotel_item_name > p.hotel_item_htladdress',
    ),
    Field(
        fieldname=FieldName.SHOP_GRADE,
        css_selector='li.hotel_item_judge.no_comment > div.hotelitem_judge_box > a > span.hotel_value',
    ),
    Field(
        fieldname=FieldName.SHOP_STATISFACTION_PERCENT,
        css_selector='li.hotel_item_judge.no_comment > div.hotelitem_judge_box > a > span.total_judgement_score > span',
    ),
    Field(
        fieldname=FieldName.SHOP_RATE,
        css_selector='li.hotel_item_name > span',
        attr='innerHTML',
        regex=r'[^\d]*',
    ),
    Field(
        fieldname=FieldName.SHOP_ACTIVE_STATUS,
        css_selector='li.hotel_item_name > p.hotel_item_last_book',
    ),
    Field(
        fieldname=FieldName.SHOP_PRICE,
        css_selector='span.J_price_lowList',
    ),
    Field(
        fieldname=FieldName.SHOP_CATEGORY_NAME,
        css_selector='li.hotel_item_name > p.medal_list > span',
    ),
    Field(
        fieldname=FieldName.SHOP_COMMENT_NUM,
        css_selector='li.hotel_item_judge.no_comment > div.hotelitem_judge_box > a > span.hotel_judgement > span',
    ),
    Field(
        fieldname=FieldName.SHOP_GRADE_TEXT,
        css_selector='li.hotel_item_judge.no_comment > div.hotelitem_judge_box > a > span.recommend',
    ),
)


def get_recommend_all_room_dict(self, _str):
    p = PyQuery(_str)
    item_list = []
    for each in p('tr').items():
        if each.attr('class'):
            item_list.append(each)
import math import datetime def get_zero(self,_str): return 0.0 def get_shop_area(self,_str): return '千岛湖东北湖区'; def get_baidu_spider_step(self,_str): return "2"; fl_shop1 = Fieldlist( #card-56 > div > ul > li:nth-child(3) > div.cf.mb_5 > div.ml_30.mr_85 > div:nth-child(1) > span > a #card-56 > div > ul > li.search-item.base-item > div.cf > div.ml_30.mr_90 > div:nth-child(1) > span:nth-child(1) > a Field(fieldname=FieldName.SHOP_NAME,css_selector='div.cf > div.ml_30 > div:nth-child(1) > span > a',is_info=True), #card-56 > div > ul > li.search-item.base-item > div.cf > div.ml_30.mr_90 > div.row.addr > span #card-56 > div > ul > li:nth-child(3) > div.cf.mb_5 > div.ml_30.mr_85 > div.row.addr > span Field(fieldname=FieldName.SHOP_ADDRESS,css_selector='div.cf > div.ml_30 > div.row.addr > span',is_info=True), Field(fieldname=FieldName.SHOP_IMG, css_selector='div.cf > div.col-r > div.img-wrap > a > img', attr='src',is_info=True), Field(fieldname=FieldName.SHOP_LNG, css_selector='',filter_func=get_zero, is_info=True), Field(fieldname=FieldName.SHOP_LAT, css_selector='',filter_func=get_zero, is_info=True), Field(fieldname=FieldName.SHOP_AREA,css_selector='',filter_func=get_shop_area,is_info=True), Field(fieldname=FieldName.BAIDU_SPIDER_STEP,css_selector='',filter_func=get_baidu_spider_step,is_info=True) ) def get_shop_name(self,_str): self.shop_name = _str; return _str; fl_shop2 = Fieldlist( #phoenix_dom_3_0 > div > div.head-wrapper.c-title.c-color.c-flexbox.c-line-bottom > div.left > span #phoenix_dom_3_1 > div > div.head-wrapper.c-title.c-color.c-flexbox.c-line-bottom > div.left > span
import re import time import json from pyquery import PyQuery import xmltodict fl_shop1 = Fieldlist( Field(fieldname=FieldName.SHOP_NAME, css_selector='div.search_ticket_title > h2 > a'), Field(fieldname=FieldName.SHOP_RATE, css_selector='div.search_ticket_title > h2 > span > span.rate'), Field(fieldname=FieldName.SHOP_URL, css_selector='div.search_ticket_title > h2 > a', attr='href'), Field(fieldname=FieldName.SHOP_IMG, css_selector='a > img', attr='src'), Field(fieldname=FieldName.SHOP_ADDRESS, css_selector='div.search_ticket_title > div.adress'), Field(fieldname=FieldName.SHOP_GRADE, css_selector='div.search_ticket_assess > span.grades > em'), Field(fieldname=FieldName.SHOP_COMMENT_NUM, css_selector='div.search_ticket_assess > span.grades', regex=r'^[^\(]*\(([\d]+)[^\)\d]*\)$', repl=r'\1'), Field(fieldname=FieldName.SHOP_FEATURE, css_selector='div.search_ticket_title > div.exercise'), ) def get_shop_service(self, _str): p = PyQuery(_str) service_list = [] for i in p('span').items():
def get_shop_rate(self, _str):
    """Return an empty string; ``_str`` is not used."""
    return ""


def get_shop_grade(self, _str):
    """Return the constant grade string "0.0"; ``_str`` is not used."""
    return "0.0"


# Shop fields: name, URL, image, address, price, comment count and feature
# come from CSS selectors; grade and rate are constants from the filters above.
fl_shop1 = Fieldlist(
    # Sample selector noted from the page:
    ##_j_search_result_left > div:nth-child(1) > div > div:nth-child(1) > div.ct-text > h3 > a
    Field(
        fieldname=FieldName.SHOP_NAME,
        css_selector='div.ct-text > h3 > a',
        is_debug=True,
    ),
    #_j_search_result_left > div:nth-child(1) > div > div:nth-child(2) > div.ct-text > h3 > a
    Field(
        fieldname=FieldName.SHOP_URL,
        css_selector='div > div.ct-text > h3 > a',
        attr='href',
        is_info=True,
    ),
    Field(
        fieldname=FieldName.SHOP_IMG,
        css_selector='div.flt1 > a > img',
        attr='src',
        is_info=True,
    ),
    #_j_search_result_left > div:nth-child(1) > div > div:nth-child(1) > div.ct-text > div > p:nth-child(1)
    Field(
        fieldname=FieldName.SHOP_ADDRESS,
        css_selector='div.ct-text > ul > li:nth-child(1) > a',
        is_info=True,
    ),
    Field(
        fieldname=FieldName.SHOP_PRICE,
        css_selector='div.ct-text > ul > li.frt._j_hotel_ota > a > span.seg-price',
    ),
    # The regular expression is different here.
    #_j_search_result_left > div:nth-child(1) > div > div:nth-child(2) > div.ct-text > ul > li:nth-child(2) > a
    Field(
        fieldname=FieldName.SHOP_COMMENT_NUM,
        css_selector='div.ct-text > ul > li:nth-child(2) > a',
        is_info=True,
    ),
    Field(
        fieldname=FieldName.SHOP_FEATURE,
        css_selector='div.ct-text > div > p:nth-child(1)',
        is_info=True,
    ),
    Field(
        fieldname=FieldName.SHOP_GRADE,
        css_selector='',
        filter_func=get_shop_grade,
        is_info=True,
    ),
    Field(
        fieldname=FieldName.SHOP_RATE,
        css_selector='',
        filter_func=get_shop_rate,
        is_info=True,
    ),
)
# Shop fields: name, rate, URL, image, address, grade, price and feature.
#
# Two selectors were malformed and have been repaired:
#   * SHOP_IMG used ``div.show loading``, a descendant selector looking for a
#     non-existent <loading> element. It now targets ``div.show`` so it
#     matches whether or not the extra ``loading`` class is present.
#   * SHOP_PRICE used ``tr-nthchild:(0)``, which is not valid CSS. It now uses
#     ``tr:nth-child(1)`` (nth-child is 1-based) to pick the first row.
# NOTE(review): both repairs follow the apparent intent of the original
# selectors -- confirm against the live page markup.
fl_shop1 = Fieldlist(
    Field(
        fieldname=FieldName.SHOP_NAME,
        css_selector='div.sight_item_detail.clrfix > div.sight_item_about > h3 > a',
    ),
    # Scenic-area level, e.g. "5A".
    Field(
        fieldname=FieldName.SHOP_RATE,
        css_selector='div.sight_item_detail.clrfix > div.sight_item_about > div.sight_item_info > div.clrfix > span.level',
    ),
    Field(
        fieldname=FieldName.SHOP_URL,
        css_selector='div.sight_item_detail.clrfix > div.sight_item_about > h3 > a',
        attr='href',
    ),
    Field(
        fieldname=FieldName.SHOP_IMG,
        css_selector='div.sight_item_detail.clrfix > div.sight_item_show > div.show > a > img',
        attr='src',
    ),
    Field(
        fieldname=FieldName.SHOP_ADDRESS,
        css_selector='div.sight_item_detail.clrfix > div.sight_item_about > div.sight_item_info > p.address.color999 > span',
    ),
    Field(
        fieldname=FieldName.SHOP_GRADE,
        css_selector='div.sight_item_detail.clrfix > div.sight_item_about > div.sight_item_info > div.clrfix > div.sight_item_hot > span.product_star_level > em > span',
    ),
    # Price.
    Field(
        fieldname=FieldName.SHOP_PRICE,
        css_selector='div.sight_item_detail.clrfix > div.sight_item_about > div.sight_item_pop > table > tbody > tr:nth-child(1) > td > span.sight_item_price > em',
    ),
    Field(
        fieldname=FieldName.SHOP_FEATURE,
        css_selector='div.sight_item_detail.clrfix > div.sight_item_about > div.sight_item_info > div.intro.color999',
    ),
)
import json def get_shop_area(self, _str): return '千岛湖乡村游景点' fl_shop1 = Fieldlist( Field(fieldname=FieldName.SHOP_NAME, css_selector='dl > dd > a > h2', is_info=True), Field(fieldname=FieldName.SHOP_URL, css_selector='dl > dd > a', attr='href', is_info=True), Field(fieldname=FieldName.SHOP_ADDRESS, css_selector='dl > dd > div.tourListLeftListMsg > span:nth-child(1)', is_info=True), Field(fieldname=FieldName.SHOP_PHONE, css_selector='dl > dd > div.tourListLeftListMsg > span:nth-child(2)', is_info=True), Field(fieldname=FieldName.SHOP_AREA, css_selector='dl > dd > div.tourListLeftListMsg > span:nth-child(2)', filter_func=get_shop_area, is_info=True)) page_shop_1 = Page( name='大众点评餐饮店铺列表页面', fieldlist=fl_shop1, listcssselector=ListCssSelector( list_css_selector= 'body > div.mainLayout.newsMainLayout > div.newsLeftLayout.sceneRightLayout > div'
# Shop fields: name, URL, image, address, price, comment count and feature
# come from CSS selectors; rate and grade pass through filter functions.
fl_shop1 = Fieldlist(
    Field(
        fieldname=FieldName.SHOP_NAME,
        css_selector='div.product-regular.clearfix > div.product-section > h3 > a',
        is_info=True,
    ),
    Field(
        fieldname=FieldName.SHOP_RATE,
        css_selector='',
        is_info=True,
        filter_func=get_shop_rate,
    ),
    Field(
        fieldname=FieldName.SHOP_URL,
        css_selector='div.product-regular.clearfix > div.product-section > h3 > a',
        attr='href',
        is_info=True,
    ),
    Field(
        fieldname=FieldName.SHOP_IMG,
        css_selector='div.product-regular.clearfix > div.product-left > a > img',
        attr='src',
        is_info=True,
    ),
    Field(
        fieldname=FieldName.SHOP_ADDRESS,
        css_selector=' div.product-regular.clearfix > div.product-section > dl:nth-child(3) > dd',
        is_info=True,
    ),
    Field(
        fieldname=FieldName.SHOP_PRICE,
        css_selector='div.product-regular.clearfix > div.product-info > div > em',
        is_info=True,
    ),
    Field(
        fieldname=FieldName.SHOP_COMMENT_NUM,
        css_selector=' div.product-regular.clearfix > div.product-info > ul > li:nth-child(2) > a ',
    ),
    Field(
        fieldname=FieldName.SHOP_FEATURE,
        css_selector=' div.product-regular.clearfix > div.product-section > dl:nth-child(6) > dd > div',
    ),
    Field(
        fieldname=FieldName.SHOP_GRADE,
        css_selector='div.product-regular.clearfix > div.product-info > ul > li:nth-child(1) > b',
        filter_func=get_shop_grade,
        is_info=True,
    ),
)
result.setdefault('评分', float(re.sub(r'[^\d.]*', '', i.text()))) elif '评价' in i.text(): result.setdefault('评论数', int(re.sub(r'[^\d]*', '', i.text()))) elif '游记' in i.text(): result.setdefault('游记数', int(re.sub(r'[^\d]*', '', i.text()))) return json.dumps(result, ensure_ascii=False) fl_shop1 = Fieldlist( Field(fieldname=FieldName.SHOP_NAME, css_selector='div.hotel-title > div > h3 > a'), Field(fieldname=FieldName.SHOP_RATE, css_selector='div.hotel-title > div > span.hotel-rate.rate5', attr='class', regex='[^\d]*', is_info=True), Field(fieldname=FieldName.SHOP_INTRO, css_selector='div.hotel-info > ul', attr="innerHTML", is_debug='True', filter_func=get_shop_grade, is_info=True), ) # fl_shop2 = Fieldlist( # Field(fieldname=FieldName.SHOP_ADDRESS, css_selector='div.container > div.hotel-intro > div.intro-hd > div.location > span', attr='title', offset=6, try_times=10, pause_time=1), # Field(fieldname=FieldName.SHOP_ROOM_RECOMMEND_ALL, css_selector='#_j_booking_info', attr='innerHTML', filter_func=get_shop_room_all, offset=6, try_times=10, pause_time=2), # Field(fieldname=FieldName.SHOP_TRAFFIC, css_selector='#_j_map_poi_list > div.bd', attr='innerHTML', filter_func=get_shop_traffic, offset=6, try_times=10, pause_time=1), # Field(fieldname=FieldName.SHOP_FACILITIES, css_selector='#_j_hotel_info', attr='innerHTML', filter_func=get_shop_facilities, offset=6, try_times=10, pause_time=1), # Field(fieldname=FieldName.SHOP_STATISTICS, css_selector='#_j_comment', attr='innerHTML', filter_func=get_shop_stattistics), # )
return "" def get_shop_feature(self,_str): return "" def get_shop_rate(self,_str): return "" fl_shop1 = Fieldlist( Field(fieldname=FieldName.SHOP_NAME, css_selector='a > div.search-scenic-content > h3'), #\31 302 > div:nth-child(2) > div:nth-child(3) > div > div:nth-child(1) > span:nth-child(2) #\32 0808 > div:nth-child(2) > div:nth-child(3) > div > div:nth-child(1) > span:nth-child(2) Field(fieldname=FieldName.SHOP_PRICE, css_selector='a > div.search-scenic-content > div.search-scenic-wrapper > div.search-scenic-price > span',is_info=True), #稍微有点问题 Field(fieldname=FieldName.SHOP_URL,css_selector='a',attr='href', is_debug=True,is_info=True), #img还有些许问题 #\33 6822720 > div:nth-child(1) > div Field(fieldname=FieldName.SHOP_IMG, css_selector='a > div.img-container.lazy-img-box.fl > img', attr='src', is_info=True), Field(fieldname=FieldName.SHOP_ADDRESS, css_selector= '',filter_func=get_shop_address, is_info=True), #这里应该做一个转换 #\34 187 > div:nth-child(2) > div:nth-child(3) > div > div:nth-child(2) > span:nth-child(1) Field(fieldname=FieldName.SHOP_GRADE,css_selector='',filter_func=get_shop_grade, is_info=True), #正则表达式的使用有问题 Field(fieldname=FieldName.SHOP_COMMENT_NUM,css_selector='a > div.search-scenic-content > div.search-scenic-wrapper > div.search-scenic-detail > p',is_info=True), #无shop_feature Field(fieldname=FieldName.SHOP_FEATURE, css_selector='',filter_func=get_shop_feature, is_info=True), Field(fieldname=FieldName.SHOP_RATE,css_selector='',filter_func=get_shop_rate, is_info=True) ) page_shop_1 = Page(name='途牛景点店铺列表页面', fieldlist=fl_shop1, listcssselector=ListCssSelector(list_css_selector='#search-container > section > div > ul > li'), mongodb=Mongodb(db=TravelDriver.db, collection=TravelDriver.shop_collection), is_save=True) def get_comment_grade(self,_str): doc = pq(_str) if (doc('.star-active').length) == 3:
# Shop fields: name, price, URL, image, address, grade and comment count come
# from CSS selectors; feature and rate are produced by filter functions.
fl_shop1 = Fieldlist(
    Field(
        fieldname=FieldName.SHOP_NAME,
        css_selector='div.theinfo.ticket.clearfix > a > dl > dt > p > span',
        is_info=True,
    ),
    Field(
        fieldname=FieldName.SHOP_PRICE,
        css_selector='div.theinfo.ticket.clearfix > a > div.priceinfo > span > em',
        is_info=False,
    ),
    # Slightly problematic.
    Field(
        fieldname=FieldName.SHOP_URL,
        css_selector='div.theinfo.ticket.clearfix > a',
        attr='href',
        is_debug=True,
        is_info=False,
    ),
    # The img field still has some issues.
    Field(
        fieldname=FieldName.SHOP_IMG,
        css_selector='div.theinfo.ticket.clearfix > a > div.imgbox > div > img',
        attr='data-src',
        is_info=False,
    ),
    Field(
        fieldname=FieldName.SHOP_ADDRESS,
        css_selector='div.theinfo.ticket.clearfix > a > dl > dd:nth-child(2)',
        is_info=False,
    ),
    # A conversion should be done here.
    Field(
        fieldname=FieldName.SHOP_GRADE,
        css_selector='div.theinfo.ticket.clearfix > a > div.priceinfo > div > p > i',
        filter_func=get_shop_grade,
        is_info=False,
    ),
    # The regular-expression usage is problematic.
    Field(
        fieldname=FieldName.SHOP_COMMENT_NUM,
        css_selector='div.theinfo.ticket.clearfix > a > div.priceinfo > div > p > span',
        is_info=False,
    ),
    # No shop_feature on this page.
    Field(
        fieldname=FieldName.SHOP_FEATURE,
        css_selector='',
        is_info=True,
        filter_func=get_shop_feature,
    ),
    Field(
        fieldname=FieldName.SHOP_RATE,
        css_selector='',
        is_info=True,
        filter_func=get_shop_rate,
    ),
)
# -*- coding:utf-8 -*- from spider.driver.base.driver import Driver from spider.driver.base.mysql import Mysql import time from pyquery import PyQuery from spider.driver.base.field import Field, FieldName, Fieldlist, FieldType from spider.driver.base.page import Page from spider.driver.base.listcssselector import ListCssSelector from spider.driver.base.mongodb import Mongodb from spider.driver.base.tabsetup import TabSetup fl_weixin1 = Fieldlist( Field(fieldname='public_name', css_selector='div > div.txt-box > p.tit > a', regex=r'[^\u4e00-\u9fa5]*'), ) fl_weixin2 = Fieldlist( Field(fieldname='article_name', css_selector='div > div > h4'), Field(fieldname='article_time', css_selector='div > div > p.weui_media_extra_info'), ) page_weixin_1 = Page(name='微信公众号列表页面', fieldlist=fl_weixin1, listcssselector=ListCssSelector( list_css_selector='#main > div.news-box > ul > li')) page_weixin_2 = Page( name='微信公众号文章列表页面', fieldlist=fl_weixin2, tabsetup=TabSetup(click_css_selector='div > div.txt-box > p.tit > a'),
# -*- coding:utf-8 -*- from spider.driver.travel.core.traveldriver import TravelDriver from spider.driver.base.page import Page, NextPageCssSelectorSetup, PageFunc from spider.driver.base.field import Fieldlist, Field, FieldName from spider.driver.base.tabsetup import TabSetup from spider.driver.base.listcssselector import ListCssSelector from spider.driver.base.mongodb import Mongodb import re import time import json from pyquery import PyQuery fl_shop1 = Fieldlist( Field(fieldname=FieldName.SHOP_NAME, css_selector='div:nth-child(2) > span'), ) fl_shop2 = Fieldlist( Field( fieldname=FieldName.SHOP_NAME, css_selector= 'body > div > div.rax-scrollview > div > div:nth-child(1) > div > div:nth-child(1) > span' ), ) fl_comment1 = Fieldlist( # Field(fieldname=FieldName.SHOP_NAME, css_selector='body > div > div.rax-scrollview > div > div:nth-child(1) > div > div:nth-child(1) > span'), Field(fieldname=FieldName.COMMENT_USER_NAME, css_selector='div.rate-info > div.avatar-info > div.user-nick'), ) page_shop_1 = Page(name='飞猪景点店铺列表页面', fieldlist=fl_shop1,
from spider.driver.base.tabsetup import TabSetup
from spider.driver.base.page import Page
from spider.driver.base.listcssselector import ListCssSelector
from spider.driver.base.mongodb import Mongodb
from spider.driver.travel.core.traveldriver import TravelDriver
import time
from pyquery import PyQuery
import json
import re

# Hotel list fields. Grade and comment count are both cut out of the same
# element's text with different regexes; rate and price are regex-filtered.
fl_shop1 = Fieldlist(
    Field(
        fieldname=FieldName.SHOP_IMG,
        css_selector='div > div.h_info_pic > a > img',
        attr='src',
    ),
    Field(
        fieldname=FieldName.SHOP_NAME,
        css_selector='div > div.h_info_text > div.h_info_base > p.h_info_b1 > a',
        attr='title',
    ),
    Field(
        fieldname=FieldName.SHOP_URL,
        css_selector='div > div.h_info_text > div.h_info_base > p.h_info_b1 > a',
        attr='href',
    ),
    Field(
        fieldname=FieldName.SHOP_RATE,
        css_selector='div > div.h_info_text > div.h_info_base > p.h_info_b1 > b',
        attr='class',
        regex=r'[^\d]*',
    ),
    Field(
        fieldname=FieldName.SHOP_GRADE,
        css_selector='div > div.h_info_text > div.h_info_comt',
        regex=r'^([\d.]*).*$',
        repl=r'\1',
    ),
    Field(
        fieldname=FieldName.SHOP_COMMENT_NUM,
        css_selector='div > div.h_info_text > div.h_info_comt',
        regex=r'^[\d.]*[^\d]*([\d]*)[^\d]*$',
        repl=r'\1',
    ),
    Field(
        fieldname=FieldName.SHOP_ADDRESS,
        css_selector='div > div.h_info_text > div.h_info_base > p.h_info_b2',
    ),
    Field(
        fieldname=FieldName.SHOP_ACTIVE_STATUS,
        css_selector='div > div.h_info_text > div.h_info_base > p.lastt_book',
    ),
    Field(
        fieldname=FieldName.SHOP_PRICE,
        css_selector='div > div.h_info_text > div.h_info_pri',
        regex=r'[^\d.]*',
    ),
)


def get_shop_room(self, _str):
    p = PyQuery(_str)
    room_list = []
    for i in p('div.hdetail_type > div.htype_list > div.htype_item').items():
        info_list = i('div.htype_info').text().split('\n')
        detail = info_list[3].split('|')
        type_list = []
        for j in i('div.htype_info_list').items('tbody > tr'):
            type = j.text()
shop_comment_url = "https://m.tuniu.com/h5/tour/comment/" + shop_id + "/4" return shop_comment_url fl_shop1 = Fieldlist( Field(fieldname=FieldName.SHOP_NAME, css_selector='a > div.search-scenic-content > h3'), #\31 302 > div:nth-child(2) > div:nth-child(3) > div > div:nth-child(1) > span:nth-child(2) #\32 0808 > div:nth-child(2) > div:nth-child(3) > div > div:nth-child(1) > span:nth-child(2) Field(fieldname=FieldName.SHOP_PRICE, css_selector='a > div.search-scenic-content > div.search-scenic-wrapper > div.search-scenic-price > span',is_info=True), #稍微有点问题 Field(fieldname=FieldName.SHOP_URL,css_selector='a',attr='href', is_debug=True,is_info=True), #img还有些许问题 #\33 6822720 > div:nth-child(1) > div Field(fieldname=FieldName.SHOP_IMG, css_selector='a > div.img-container.lazy-img-box.fl > img', attr='src', is_info=True), Field(fieldname=FieldName.SHOP_ADDRESS, css_selector= '',filter_func=get_shop_address, is_info=True), #这里应该做一个转换 #\34 187 > div:nth-child(2) > div:nth-child(3) > div > div:nth-child(2) > span:nth-child(1) Field(fieldname=FieldName.SHOP_GRADE,css_selector='',filter_func=get_shop_grade, is_info=True), #正则表达式的使用有问题 Field(fieldname=FieldName.SHOP_COMMENT_NUM,css_selector='a > div.search-scenic-content > div.search-scenic-wrapper > div.search-scenic-detail > p',is_info=True), #无shop_feature Field(fieldname=FieldName.SHOP_FEATURE, css_selector='',filter_func=get_shop_feature, is_info=True), Field(fieldname=FieldName.SHOP_RATE,css_selector='',filter_func=get_shop_rate, is_info=True), Field(fieldname=FieldName.SHOP_COMMENT_URL,css_selector='a',attr='href',filter_func=get_shop_comment_url, is_info=True) ) page_shop_1 = Page(name='途牛景点店铺列表页面', fieldlist=fl_shop1, listcssselector=ListCssSelector(list_css_selector='#search-container > section > div > ul > li'), mongodb=Mongodb(db=TravelDriver.db, collection=TravelDriver.shop_collection), is_save=True) fl_shop2 = Fieldlist( Field(fieldname=FieldName.SHOP_NAME, css_selector='#main-page > div.mp-main > 
div.mp-headfigure > div.mp-headfeagure-info > div'),
# Shop fields: name, image, address, comment count, feature and price come
# from CSS selectors; URL (from the onclick attribute), grade and rate pass
# through filter functions.
fl_shop1 = Fieldlist(
    Field(
        fieldname=FieldName.SHOP_NAME,
        css_selector=' dl > dt > a',
        is_debug=True,
    ),
    Field(
        fieldname=FieldName.SHOP_URL,
        css_selector='dl > dt > a',
        attr='onclick',
        filter_func=get_shop_url,
        is_info=True,
    ),
    Field(
        fieldname=FieldName.SHOP_IMG,
        css_selector=' a > img',
        attr='src',
        is_info=True,
    ),
    Field(
        fieldname=FieldName.SHOP_ADDRESS,
        css_selector='dl > dd.proInfo-address > i',
        is_info=True,
    ),
    Field(
        fieldname=FieldName.SHOP_COMMENT_NUM,
        css_selector=' div > div:nth-child(2) > ul > li:nth-child(2) > a',
        is_info=True,
    ),
    Field(
        fieldname=FieldName.SHOP_FEATURE,
        css_selector=' dl > dd:nth-child(4)',
        is_info=True,
    ),
    Field(
        fieldname=FieldName.SHOP_PRICE,
        css_selector='div > div.priceInfo-price > dfn > span',
        is_info=True,
    ),
    Field(
        fieldname=FieldName.SHOP_GRADE,
        css_selector='div > div:nth-child(2) > ul > li:nth-child(1) > a > b',
        filter_func=get_shop_grade,
        is_info=True,
    ),
    Field(
        fieldname=FieldName.SHOP_RATE,
        css_selector='',
        filter_func=get_shop_rate,
        is_info=True,
    ),
)
import xmltodict def get_shop_rate(self,_str): return "" def get_shop_feature(self,_str): return "" fl_shop1 = Fieldlist( Field(fieldname=FieldName.SHOP_NAME,css_selector=' div > div.h_info > div.h_info_text > div.h_info_base > p.h_info_b1 > a > span.info_cn',attr='innerHTML', is_info=True), Field(fieldname=FieldName.SHOP_URL,css_selector='div > div.h_info_text > div.h_info_base > p.h_info_b1 > a',attr='href',is_info=True), Field(fieldname=FieldName.SHOP_IMG, css_selector='div.h_info_pic > a > img', attr='big-src',is_info=True), #有些问题 Field(fieldname=FieldName.SHOP_ADDRESS, css_selector='div > div.h_info_text > div.h_info_base > p.h_info_b2',is_info=True), Field(fieldname=FieldName.SHOP_PRICE,css_selector='div > div.h_info_text > div.h_info_pri > p:nth-child(1) > a > span.h_pri_num',is_info=True), #稍许有些问题 Field(fieldname=FieldName.SHOP_COMMENT_NUM,css_selector='div > div.h_info_text > div.h_info_comt > a > span.c555.block.mt5'), Field(fieldname=FieldName.SHOP_GRADE, css_selector=' div > div.h_info_text > div.h_info_comt > a > span.h_info_comt_bg > i.c37e',is_info=True), Field(fieldname=FieldName.SHOP_RATE,css_selector='',filter_func=get_shop_rate, is_info=True), Field(fieldname=FieldName.SHOP_FEATURE,css_selector='',filter_func=get_shop_feature, is_info=True) ) fl_shop2 = Fieldlist() page_shop_1 = Page(name='艺龙酒店店铺列表页面', fieldlist=fl_shop1, listcssselector=ListCssSelector(list_css_selector='#hotelContainer > div > div'), mongodb=Mongodb(db=TravelDriver.db, collection=TravelDriver.shop_collection),is_save=True) # page_shop_2 = Page() # page_shop_2 = Page(name='艺龙酒店店铺详情页面', fieldlist=fl_shop2, tabsetup=TabSetup(click_css_selector='div > div.h_info_text > div.h_info_base > p.h_info_b1 > a'), mongodb=Mongodb(db=TravelDriver.db,collection=TravelDriver.shop_collection)) fl_comment1 = Fieldlist( Field(fieldname=FieldName.COMMENT_USER_NAME, css_selector=' div.cmt_userinfo > div > p.cmt_un',is_info=True),
p = PyQuery(_str) tag_list = [] for i in list(p('span').items())[1:]: tag_list.append(i.text()) return json.dumps(tag_list, ensure_ascii=False) def get_shop_rate(self, _str): return str(float((int(_str)/10))) fl_shop1 = Fieldlist( Field(fieldname=FieldName.SHOP_PRICE, css_selector='div.hotel-info-ctn > div.hotel-remark > div.price > p > strong'), Field(fieldname=FieldName.SHOP_NAME, css_selector='div.hotel-info-ctn > div.hotel-info-main > h2 > a.hotel-name-link'), Field(fieldname=FieldName.SHOP_RATE, css_selector='div.hotel-info-ctn > div.hotel-remark > div.remark > div > div > span', attr='class', regex=r'[^\d]*', filter_func=get_shop_rate), # Field(fieldname=FieldName.SHOP_TAG, css_selector='div.hotel-info-ctn > div.hotel-info-main > p.hotel-tags', attr='innerHTML', filter_func=get_shop_tag, pause_time=3), Field(fieldname=FieldName.SHOP_URL,css_selector='',attr='href',is_info=True), Field(fieldname=FieldName.SHOP_IMG,css_selector='',attr='src',is_info=True), Field(fieldname=FieldName.SHOP_ADDRESS,css_selector='',is_info=True), Field(fieldname=FieldName.SHOP_GRADE,css_selector='',is_info=True), Field(fieldname=FieldName.SHOP_COMMENT_NUM,css_selector='',is_info=True), Field(fieldname=FieldName.SHOP_FEATURE,css_selector='',is_info=True) ) def get_shop_room_all(self, _str): p = PyQuery(_str) sale_dict = {} room_list = [] for i in p('div.hotel-rooms > div.hotel-rooms-list > div.hotel-rooms-list-cont > ul > li').items(): room = {'room_name': i('div.title-info.clearfix.dph-col.dph-col1 > div.title > h3').text()} for j in i('div.h-item-more.h-hide').text().split('\n'): room.update((lambda x: {x[0].strip(): x[1].strip()} if len(x) == 2 else {})(j.split(':')))
# Restaurant list fields. Name, URL, address and image are read directly;
# every other field is routed through a filter function.
fl_shop1 = Fieldlist(
    Field(
        fieldname=FieldName.SHOP_NAME,
        css_selector='div.txt > div.tit > a > h4',
        is_info=True,
    ),
    Field(
        fieldname=FieldName.SHOP_URL,
        css_selector='div.txt > div.tit > a',
        attr='href',
        is_info=True,
    ),
    Field(
        fieldname=FieldName.SHOP_COMMENT_NUM,
        css_selector='div.txt > div.comment > a.review-num',
        attr='innerHTML',
        filter_func=get_zero,
        is_info=True,
    ),
    Field(
        fieldname=FieldName.SHOP_PRICE,
        css_selector='div.txt > div.comment > a.mean-price > b',
        attr='innerHTML',
        filter_func=get_zero,
        is_info=True,
    ),
    Field(
        fieldname=FieldName.SHOP_ADDRESS,
        css_selector='div.txt > div.tag-addr > span.addr',
        is_info=True,
    ),
    Field(
        fieldname=FieldName.SHOP_IMG,
        css_selector='div.pic > a > img',
        attr='src',
        is_info=True,
    ),
    Field(
        fieldname=FieldName.SHOP_SCORE,
        css_selector='div.txt > div.comment > span',
        filter_func=get_zero,
        attr='class',
        is_info=True,
    ),
    Field(
        fieldname=FieldName.SHOP_COOK_STYLE,
        css_selector='',
        filter_func=get_shop_cookie_style,
        is_info=True,
    ),
    Field(
        fieldname=FieldName.SHOP_SITE,
        css_selector='',
        filter_func=get_shop_site,
        is_info=True,
    ),
    Field(
        fieldname=FieldName.SHOP_COMMENT_URL,
        css_selector='div.txt > div.tit > a',
        attr='href',
        filter_func=get_comment_url,
        is_info=True,
    ),
    Field(
        fieldname=FieldName.SHOP_LNG,
        css_selector='',
        filter_func=get_zero,
        is_info=True,
    ),
    Field(
        fieldname=FieldName.SHOP_LAT,
        css_selector='',
        filter_func=get_zero,
        is_info=True,
    ),
    Field(
        fieldname=FieldName.SHOP_SERVICE,
        css_selector='',
        filter_func=get_zero,
        is_info=True,
    ),
    Field(
        fieldname=FieldName.SHOP_TASTE,
        css_selector='',
        filter_func=get_zero,
        is_info=True,
    ),
    Field(
        fieldname=FieldName.SHOP_ENV,
        css_selector='',
        filter_func=get_zero,
        is_info=True,
    ),
)
from spider.driver.base.mongodb import Mongodb from spider.driver.travel.core.traveldriver import TravelDriver import time from pyquery import PyQuery import json fl_shop1 = Fieldlist( Field(fieldname=FieldName.SHOP_IMG, css_selector='div.hotel-pic > a > img', attr='src'), Field(fieldname=FieldName.SHOP_NAME, css_selector='div.hotel-title > div > h3 > a'), Field(fieldname=FieldName.SHOP_URL, css_selector='div.hotel-title > div > h3 > a', attr='href'), Field(fieldname=FieldName.SHOP_RATE, css_selector='div.hotel-title > div > span.hotel-rate', attr='class', regex=r'[^\d]*'), Field(fieldname=FieldName.SHOP_GRADE, css_selector='div.hotel-info > ul > li.rating > em'), Field(fieldname=FieldName.SHOP_COMMENT_NUM, css_selector='div.hotel-info > ul > li:nth-child(2) > a > em', regex=r'[^\d]*'), ) def get_shop_room_all(self, _str): p = PyQuery(_str) room_list = [] for i in p('a.item._j_booking_item').items():
from selenium.webdriver.remote.webelement import WebElement
from spider.driver.base.tabsetup import TabSetup
from spider.driver.base.field import Field, Fieldlist
from spider.driver.base.page import Page, PageGroup
from spider.driver.base.listcssselector import ListCssSelector
from spider.driver.base.mongodb import Mongodb

# Smoke script: build two pages from the same placeholder components, group
# them and print the first item produced by the group.
fl = Fieldlist(
    Field(fieldname=12),
    Field(fieldname=13),
)
mongo = Mongodb(db='122', collection='12')
lcs = ListCssSelector(list_css_selector=12)
tab = TabSetup(url_name=12)

# Both pages share every component and differ only in name.
p = Page(
    name=122,
    fieldlist=fl,
    mongodb=mongo,
    listcssselector=lcs,
    tabsetup=tab,
)
p1 = Page(
    name=123,
    fieldlist=fl,
    mongodb=mongo,
    listcssselector=lcs,
    tabsetup=tab,
)

pg = PageGroup(p, p1)
print(next(pg))
from spider.driver.base.field import Fieldlist,Field,FieldName
from spider.driver.base.tabsetup import TabSetup
from spider.driver.base.page import Page
from spider.driver.base.listcssselector import ListCssSelector
from spider.driver.base.mongodb import Mongodb
from spider.driver.travel.core.traveldriver import TravelDriver
import time
from pyquery import PyQuery
import json

# List-page field rules for one entry.
fl_shop1 = Fieldlist(
    Field(
        fieldname=FieldName.SHOP_NAME,
        css_selector='div > div.row-center > div > h5 > a',
    ),
    Field(
        fieldname=FieldName.SHOP_CURR_URL,
        css_selector='div > div.row-center > div > h5 > a',
        attr='href',
    ),
    Field(
        fieldname=FieldName.SHOP_IMG,
        css_selector='div > div.row-left.fleft > a > img',
        attr='src',
    ),
    # The value is read from the title attribute and passed through the regex.
    Field(
        fieldname=FieldName.SHOP_RATE,
        css_selector='div > div.row-center > div > h5 > span.row-subtitle',
        attr='title',
        regex=r'[^\d]*',
    ),
    Field(
        fieldname=FieldName.SHOP_ACTIVE_STATUS,
        css_selector='div > div.row-center > div > p.row-someone-book > span',
    ),
    Field(
        fieldname=FieldName.SHOP_GRADE,
        css_selector='div > div.row-sub-right.fright > a > p.score > span.value',
    ),
    Field(
        fieldname=FieldName.SHOP_COMMENT_NUM,
        css_selector='div > div.row-sub-right.fright > a > p.comment > span',
    ),
    Field(
        fieldname=FieldName.SHOP_PRICE,
        css_selector='div > div.row-right.fright > div.box-price > p > span.pi-price.pi-price-lg',
        regex=r'[^\d]*',
    ),
)


def get_room_all(self, _str):
    """Serialize the text of every matched room element to a JSON string.

    :param self: not used in the body; presumably supplied by whoever
        invokes the function -- TODO confirm.
    :param _str: markup handed to ``PyQuery``.
    :return: JSON array with one list per matched element, holding that
        element's whitespace-separated text tokens minus the first one.
    """
    document = PyQuery(_str)
    matches = document('div.room-item-wrapper > div.room-item-inner > div:nth-child(1)')
    # The first token of each element's text is dropped.
    rooms = [node.text().split()[1:] for node in matches.items()]
    return json.dumps(rooms, ensure_ascii=False)


def get_shop_intro(self, _str):
    # NOTE(review): this definition continues past the end of the fragment;
    # only the visible statements are reproduced, unchanged.
    p = PyQuery(_str)
    info_list = p.text().split('\n')
# List-page field rules. The filter functions referenced here are defined
# outside this fragment.
fl_shop1 = Fieldlist(
    Field(
        fieldname=FieldName.SHOP_NAME,
        css_selector='div:nth-child(2) > span',
    ),
    # Sample selectors kept for reference:
    # \31 302 > div:nth-child(2) > div:nth-child(3) > div > div:nth-child(1) > span:nth-child(2)
    # \32 0808 > div:nth-child(2) > div:nth-child(3) > div > div:nth-child(1) > span:nth-child(2)
    Field(
        fieldname=FieldName.SHOP_PRICE,
        css_selector='div:nth-child(2) > div:nth-child(3) > div > div:nth-child(1) > span:nth-child(2)',
        is_info=True,
    ),
    # Slightly problematic.
    Field(
        fieldname=FieldName.SHOP_URL,
        css_selector='',
        attr='id',
        filter_func=get_shop_url,
        is_debug=True,
        is_info=True,
    ),
    # The image still has some issues.
    # \33 6822720 > div:nth-child(1) > div
    Field(
        fieldname=FieldName.SHOP_IMG,
        css_selector='',
        attr='',
        filter_func=get_shop_img,
        is_info=True,
    ),
    Field(
        fieldname=FieldName.SHOP_ADDRESS,
        css_selector='',
        filter_func=get_shop_address,
        is_info=True,
    ),
    # A conversion should be done here.
    # \34 187 > div:nth-child(2) > div:nth-child(3) > div > div:nth-child(2) > span:nth-child(1)
    Field(
        fieldname=FieldName.SHOP_GRADE,
        css_selector='div:nth-child(2) > div:nth-child(3) > div > div:nth-child(2) > span:nth-child(1)',
        is_info=True,
    ),
    # The regular expression usage is problematic.
    Field(
        fieldname=FieldName.SHOP_COMMENT_NUM,
        css_selector='',
        filter_func=_get_shop_comment_num,
        is_info=True,
    ),
    # No shop_feature.
    Field(
        fieldname=FieldName.SHOP_FEATURE,
        css_selector='div:nth-child(2) > div:nth-child(2) > span',
        is_info=True,
    ),
    Field(
        fieldname=FieldName.SHOP_RATE,
        css_selector='div:nth-child(2) > div:nth-child(3) > span',
        is_info=True,
    ),
)
def get_shop_rate(self, _str): return str(float((int(_str)/10))) def get_shop_subtype_name(self, _str): return _str.strip() def get_shop_feature(self,_str): return "" def get_comment_url(self,_str): return _str + "/review_all" fl_shop1 = Fieldlist( Field(fieldname=FieldName.SHOP_NAME, css_selector='div.txt > div.tit > a > h4'), Field(fieldname=FieldName.SHOP_URL, css_selector='div.txt > div.tit > a', attr='href'), Field(fieldname=FieldName.SHOP_COMMENT_NUM, css_selector='div.txt > div.comment > a.review-num'), Field(fieldname=FieldName.SHOP_PRICE, css_selector='div.txt > div.comment > a.mean-price'), Field(fieldname=FieldName.SHOP_RATE, css_selector='div.txt > div.comment > span', attr='class', regex=r'[^\d]*', filter_func=get_shop_rate), Field(fieldname=FieldName.SHOP_ADDRESS, css_selector='div.txt > div.tag-addr > span.addr'), Field(fieldname=FieldName.SHOP_IMG,css_selector='div.pic > a > img',is_info=True), Field(fieldname=FieldName.SHOP_FEATURE,css_selector='',filter_func=get_shop_feature, is_info=True), Field(fieldname=FieldName.SHOP_GRADE,css_selector='div.txt > span > span:nth-child(1) > b',is_info=True), Field(fieldname=FieldName.SHOP_COMMENT_URL, css_selector='div.txt > div.tit > a', attr='href',filter_func=get_comment_url, is_info=True) ) page_shop_1 = Page(name='大众点评景点店铺列表页面', fieldlist=fl_shop1, listcssselector=ListCssSelector(list_css_selector='#shop-all-list > ul > li'), mongodb=Mongodb(db=TravelDriver.db, collection=TravelDriver.shop_collection), is_save=True) def get_shop_time(self, _str): try: p = PyQuery(_str) shop_time = '' for i in p('p.info.info-indent').items(): if '营业时间' in i.text(): shop_time = i.text()
return _str.strip() fl_shop1 = Fieldlist( Field(fieldname=FieldName.SHOP_NAME, css_selector='div.txt > div.tit > a > h4'), Field(fieldname=FieldName.SHOP_URL, css_selector='div.txt > div.tit > a', attr='href'), Field(fieldname=FieldName.SHOP_COMMENT_NUM, css_selector='div.txt > div.comment > a.review-num'), Field(fieldname=FieldName.SHOP_PRICE, css_selector='div.txt > div.comment > a.mean-price'), Field(fieldname=FieldName.SHOP_RATE, css_selector='div.txt > div.comment > span', attr='class', regex=r'[^\d]*', filter_func=get_shop_rate), Field(fieldname=FieldName.SHOP_TAG, css_selector='div.txt > span.comment-list', attr='innerHTML', filter_func=get_shop_tag, pause_time=1), Field(fieldname=FieldName.SUBTYPE_NAME, css_selector='div.txt > div.tag-addr > a:nth-child(1)', filter_func=get_shop_subtype_name), Field(fieldname=FieldName.SHOP_ADDRESS, css_selector='div.txt > div.tag-addr > span.addr')) page_shop_1 = Page(name='大众点评爱车店铺列表页面', fieldlist=fl_shop1,
# List-page field rules for one hotel entry.
fl_shop1 = Fieldlist(
    Field(
        fieldname=FieldName.SHOP_IMG,
        css_selector='div.hotel-logo > img',
        attr='src',
    ),
    Field(
        fieldname=FieldName.SHOP_NAME,
        css_selector='div.hotel-info > div.nameAndIcon > a',
    ),
    Field(
        fieldname=FieldName.SHOP_URL,
        css_selector='div.hotel-info > div.nameAndIcon > a',
        attr='href',
    ),
    # The value is read from the class attribute and passed through the regex.
    Field(
        fieldname=FieldName.SHOP_RATE,
        css_selector='div.hotel-info.fl > div.nameAndIcon > div',
        attr='class',
        regex=r'[^\d]*',
    ),
    Field(
        fieldname=FieldName.SHOP_YEAR,
        css_selector='div.hotel-info.fl > div.nameAndIcon > span.decorate_year',
    ),
    Field(
        fieldname=FieldName.SHOP_ADDRESS,
        css_selector='div.hotel-info.fl > div.addressInfo',
    ),
    Field(
        fieldname=FieldName.SHOP_PRICE,
        css_selector='div.hotel-brief.fl > div.startPrice > span.digit',
    ),
    # Fix: this field was also declared as SHOP_RATE, duplicating the field
    # above. The satisfaction value is stored as SHOP_GRADE instead so the two
    # values no longer share one field name.
    # NOTE(review): confirm SHOP_GRADE is the intended target for this value.
    Field(
        fieldname=FieldName.SHOP_GRADE,
        css_selector='div.hotel-brief.fl > div.satisfaction > span.highlight',
    ),
    Field(
        fieldname=FieldName.SHOP_COMMENT_NUM,
        css_selector='div.hotel-brief.fl > div.comment > a > span',
    ),
    Field(
        fieldname=FieldName.SHOP_ACTIVE_STATUS,
        css_selector='div.hotel-brief.fl > div.lastOrderTime',
    ),
)