import time import json from pyquery import PyQuery import math import datetime def get_shop_address(self,_str): return "" def get_shop_grade(self,_str): return "" def get_shop_feature(self,_str): return "" def get_shop_rate(self,_str): return "" fl_shop1 = Fieldlist( Field(fieldname=FieldName.SHOP_NAME, css_selector='a > div.search-scenic-content > h3'), #\31 302 > div:nth-child(2) > div:nth-child(3) > div > div:nth-child(1) > span:nth-child(2) #\32 0808 > div:nth-child(2) > div:nth-child(3) > div > div:nth-child(1) > span:nth-child(2) Field(fieldname=FieldName.SHOP_PRICE, css_selector='a > div.search-scenic-content > div.search-scenic-wrapper > div.search-scenic-price > span',is_info=True), #稍微有点问题 Field(fieldname=FieldName.SHOP_URL,css_selector='a',attr='href', is_debug=True,is_info=True), #img还有些许问题 #\33 6822720 > div:nth-child(1) > div Field(fieldname=FieldName.SHOP_IMG, css_selector='a > div.img-container.lazy-img-box.fl > img', attr='src', is_info=True), Field(fieldname=FieldName.SHOP_ADDRESS, css_selector= '',filter_func=get_shop_address, is_info=True), #这里应该做一个转换 #\34 187 > div:nth-child(2) > div:nth-child(3) > div > div:nth-child(2) > span:nth-child(1) Field(fieldname=FieldName.SHOP_GRADE,css_selector='',filter_func=get_shop_grade, is_info=True), #正则表达式的使用有问题 Field(fieldname=FieldName.SHOP_COMMENT_NUM,css_selector='a > div.search-scenic-content > div.search-scenic-wrapper > div.search-scenic-detail > p',is_info=True), #无shop_feature
def get_shop_rate(self, _str): return "" def get_comment_url(self, _str): return _str + "/comment" def get_shop_name_search_key(self, _str): return self.shop_name_search_key(self.shop_name) fl_shop1 = Fieldlist( Field( fieldname=FieldName.SHOP_NAME, css_selector='div > div.mp-sight-info > a > div.mp-sight-detail > h3', is_info=True), #\31 302 > div:nth-child(2) > div:nth-child(3) > div > div:nth-child(1) > span:nth-child(2) #\32 0808 > div:nth-child(2) > div:nth-child(3) > div > div:nth-child(1) > span:nth-child(2) Field( fieldname=FieldName.SHOP_PRICE, css_selector= 'div > div.mp-sight-info > a > div.mp-sight-detail > div.mp-sight-pricecon > div.mp-sight-price > em', is_info=True), #稍微有点问题 Field(fieldname=FieldName.SHOP_URL, css_selector='div > div.mp-sight-info > a', attr='href', is_debug=True, is_info=True), #img还有些许问题
import json
import re
import random
import datetime
import math


def get_comment_num(self, _str):
    """Return the first run of 1-10 digits found in *_str*, as a string.

    When *_str* is empty, ``None`` or contains no digits, ``"0"`` is
    returned instead of raising ``IndexError``.
    """
    match = re.search(r'[\d]{1,10}', _str or '')
    if match is None:
        # No digits in the scraped text: report a zero count rather than crash.
        return "0"
    return match.group(0)


def get_shop_grade(self, _str):
    """Always return the string ``"0.0"``; the input is ignored."""
    return "0.0"


def get_shop_price(self, _str):
    """Always return the string ``"0.0"``; the input is ignored."""
    return "0.0"


def get_shop_rate(self, _str):
    """Always return an empty string; the input is ignored."""
    return ""


# Field definitions for one entry of the shop list.
# Fields with an empty css_selector take their value from filter_func only.
fl_shop1 = Fieldlist(
    Field(
        fieldname=FieldName.SHOP_NAME,
        css_selector='div > div.ct-text > h3 > a',
        is_debug=True,
    ),
    Field(
        fieldname=FieldName.SHOP_RATE,
        css_selector='',
        is_info=True,
        filter_func=get_shop_rate,
    ),
    Field(
        fieldname=FieldName.SHOP_URL,
        css_selector='div > div.ct-text > h3 > a',
        attr='href',
        is_info=True,
    ),
    Field(
        fieldname=FieldName.SHOP_IMG,
        css_selector=' div > div.flt1 > a > img',
        attr='src',
        is_info=True,
    ),
    Field(
        fieldname=FieldName.SHOP_ADDRESS,
        css_selector='div > div.ct-text > ul > li:nth-child(1) > a',
        is_info=True,
    ),
    Field(
        fieldname=FieldName.SHOP_GRADE,
        css_selector='',
        filter_func=get_shop_grade,
    ),
    # The regular expression is different here.
    Field(
        fieldname=FieldName.SHOP_COMMENT_NUM,
        css_selector='div > div.ct-text > ul > li:nth-child(2) > a',
        filter_func=get_comment_num,
        is_info=True,
    ),
    Field(
        fieldname=FieldName.SHOP_FEATURE,
        css_selector='div > div.ct-text > p',
        is_info=True,
    ),
    Field(
        fieldname=FieldName.SHOP_PRICE,
        css_selector='',
        filter_func=get_shop_price,
        is_info=True,
    ),
)
def get_shop_feature(self, _str): return "" def get_shop_rate(self, _str): return "" def get_comment_url(self, _str): return _str + "/comment" fl_shop1 = Fieldlist( Field(fieldname=FieldName.SHOP_NAME, css_selector='a > div.ml-pro-info > p'), #\31 302 > div:nth-child(2) > div:nth-child(3) > div > div:nth-child(1) > span:nth-child(2) #\32 0808 > div:nth-child(2) > div:nth-child(3) > div > div:nth-child(1) > span:nth-child(2) Field( fieldname=FieldName.SHOP_PRICE, css_selector= ' a > div.ml-pro-info > div.ml-pro-price > span.price > i:nth-child(2)', is_info=True), #稍微有点问题 Field(fieldname=FieldName.SHOP_URL, css_selector='a', attr='href', is_debug=True, is_info=True), #img还有些许问题 #\33 6822720 > div:nth-child(1) > div
import json import re import random def get_shop_rate(self, _str): return "" def get_shop_feature(self, _str): return "" fl_shop1 = Fieldlist( Field(fieldname=FieldName.SHOP_NAME, css_selector=' a.e_title.js_list_name', is_debug=True), Field(fieldname=FieldName.SHOP_URL, css_selector='a.e_title.js_list_name', attr='href', is_info=True), Field(fieldname=FieldName.SHOP_IMG, css_selector='a > img:nth-child(1)', attr='src', is_info=True), Field( fieldname=FieldName.SHOP_ADDRESS, css_selector= 'div > div > div.clrfix > div.item_hotel_info > div.item_hotel_bsinfo > table > tbody > tr > td.item_hotel_name > div > p > span > em', is_info=True), Field(
# -*- coding:utf-8 -*-
from spider.driver.travel.core.traveldriver import TravelDriver
from spider.driver.base.page import Page
from spider.driver.base.field import Fieldlist, Field, FieldName
from spider.driver.base.tabsetup import TabSetup
from spider.driver.base.listcssselector import ListCssSelector
from spider.driver.base.mongodb import Mongodb
import re
import time
import json
from pyquery import PyQuery

# Field definitions for one entry of the hotel list.
# NOTE(review): how Field applies `regex` / `repl` is defined in
# spider.driver.base.field, which is not visible here - confirm there.
fl_shop1 = Fieldlist(
    Field(
        fieldname=FieldName.SHOP_NAME,
        css_selector='li.hotel_item_name > h2 > a',
        regex=r'^[\d]*(.*)$',
        repl=r'\1',
    ),
    Field(
        fieldname=FieldName.SHOP_URL,
        css_selector='li.hotel_item_name > h2 > a',
        attr='href',
        regex=r'^([^\?]*)?.*$',
        repl=r'\1',
    ),
    Field(
        fieldname=FieldName.SHOP_ID,
        css_selector='li.hotel_item_name > h2 > a',
        attr='href',
        regex=r'^[^\?\d]*([\d]*).html?.*$',
        repl=r'\1',
    ),
    Field(
        fieldname=FieldName.SHOP_IMG,
        css_selector='li.pic_medal > div > a > img',
        attr='src',
    ),
    Field(
        fieldname=FieldName.SHOP_ADDRESS,
        css_selector='li.hotel_item_name > p.hotel_item_htladdress',
    ),
    Field(
        fieldname=FieldName.SHOP_GRADE,
        css_selector='li.hotel_item_judge.no_comment > div.hotelitem_judge_box > a > span.hotel_value',
    ),
    Field(
        fieldname=FieldName.SHOP_STATISFACTION_PERCENT,
        css_selector='li.hotel_item_judge.no_comment > div.hotelitem_judge_box > a > span.total_judgement_score > span',
    ),
    Field(
        fieldname=FieldName.SHOP_RATE,
        css_selector='li.hotel_item_name > span',
        attr='innerHTML',
        regex=r'[^\d]*',
    ),
    Field(
        fieldname=FieldName.SHOP_ACTIVE_STATUS,
        css_selector='li.hotel_item_name > p.hotel_item_last_book',
    ),
    Field(
        fieldname=FieldName.SHOP_PRICE,
        css_selector='span.J_price_lowList',
    ),
    Field(
        fieldname=FieldName.SHOP_CATEGORY_NAME,
        css_selector='li.hotel_item_name > p.medal_list > span',
    ),
    Field(
        fieldname=FieldName.SHOP_COMMENT_NUM,
        css_selector='li.hotel_item_judge.no_comment > div.hotelitem_judge_box > a > span.hotel_judgement > span',
    ),
    Field(
        fieldname=FieldName.SHOP_GRADE_TEXT,
        css_selector='li.hotel_item_judge.no_comment > div.hotelitem_judge_box > a > span.recommend',
    ),
)
def get_recommend_all_room_dict(self, _str):
import json from pyquery import PyQuery import xmltodict import math import datetime def get_zero(self,_str): return 0.0 def get_shop_area(self,_str): return '千岛湖东北湖区'; def get_baidu_spider_step(self,_str): return "2"; fl_shop1 = Fieldlist( #card-56 > div > ul > li:nth-child(3) > div.cf.mb_5 > div.ml_30.mr_85 > div:nth-child(1) > span > a #card-56 > div > ul > li.search-item.base-item > div.cf > div.ml_30.mr_90 > div:nth-child(1) > span:nth-child(1) > a Field(fieldname=FieldName.SHOP_NAME,css_selector='div.cf > div.ml_30 > div:nth-child(1) > span > a',is_info=True), #card-56 > div > ul > li.search-item.base-item > div.cf > div.ml_30.mr_90 > div.row.addr > span #card-56 > div > ul > li:nth-child(3) > div.cf.mb_5 > div.ml_30.mr_85 > div.row.addr > span Field(fieldname=FieldName.SHOP_ADDRESS,css_selector='div.cf > div.ml_30 > div.row.addr > span',is_info=True), Field(fieldname=FieldName.SHOP_IMG, css_selector='div.cf > div.col-r > div.img-wrap > a > img', attr='src',is_info=True), Field(fieldname=FieldName.SHOP_LNG, css_selector='',filter_func=get_zero, is_info=True), Field(fieldname=FieldName.SHOP_LAT, css_selector='',filter_func=get_zero, is_info=True), Field(fieldname=FieldName.SHOP_AREA,css_selector='',filter_func=get_shop_area,is_info=True), Field(fieldname=FieldName.BAIDU_SPIDER_STEP,css_selector='',filter_func=get_baidu_spider_step,is_info=True) ) def get_shop_name(self,_str): self.shop_name = _str; return _str; fl_shop2 = Fieldlist(
# -*- coding:utf-8 -*- from spider.driver.travel.core.traveldriver import TravelDriver from spider.driver.base.page import Page, NextPageCssSelectorSetup, PageFunc from spider.driver.base.field import Fieldlist, Field, FieldName from spider.driver.base.tabsetup import TabSetup from spider.driver.base.listcssselector import ListCssSelector from spider.driver.base.mongodb import Mongodb import re import time import json from pyquery import PyQuery import xmltodict fl_shop1 = Fieldlist( Field(fieldname=FieldName.SHOP_NAME, css_selector='div.search_ticket_title > h2 > a'), Field(fieldname=FieldName.SHOP_RATE, css_selector='div.search_ticket_title > h2 > span > span.rate'), Field(fieldname=FieldName.SHOP_URL, css_selector='div.search_ticket_title > h2 > a', attr='href'), Field(fieldname=FieldName.SHOP_IMG, css_selector='a > img', attr='src'), Field(fieldname=FieldName.SHOP_ADDRESS, css_selector='div.search_ticket_title > div.adress'), Field(fieldname=FieldName.SHOP_GRADE, css_selector='div.search_ticket_assess > span.grades > em'), Field(fieldname=FieldName.SHOP_COMMENT_NUM, css_selector='div.search_ticket_assess > span.grades', regex=r'^[^\(]*\(([\d]+)[^\)\d]*\)$', repl=r'\1'), Field(fieldname=FieldName.SHOP_FEATURE,
from spider.driver.base.page import Page,NextPageCssSelectorSetup,PageFunc,NextPageLinkTextSetup from spider.driver.base.listcssselector import ListCssSelector from spider.driver.base.mongodb import Mongodb from spider.driver.travel.core.traveldriver import TravelDriver import time from pyquery import PyQuery import json import re import random def get_shop_rate(self,_str): return "" def get_shop_grade(self,_str): return "0.0" fl_shop1 = Fieldlist( ##_j_search_result_left > div:nth-child(1) > div > div:nth-child(1) > div.ct-text > h3 > a Field(fieldname=FieldName.SHOP_NAME, css_selector='div.ct-text > h3 > a', is_debug=True), #_j_search_result_left > div:nth-child(1) > div > div:nth-child(2) > div.ct-text > h3 > a Field(fieldname=FieldName.SHOP_URL, css_selector='div > div.ct-text > h3 > a', attr='href', is_info=True), Field(fieldname=FieldName.SHOP_IMG, css_selector='div.flt1 > a > img', attr='src', is_info=True), #_j_search_result_left > div:nth-child(1) > div > div:nth-child(1) > div.ct-text > div > p:nth-child(1) Field(fieldname=FieldName.SHOP_ADDRESS, css_selector='div.ct-text > ul > li:nth-child(1) > a', is_info=True), Field(fieldname=FieldName.SHOP_PRICE,css_selector='div.ct-text > ul > li.frt._j_hotel_ota > a > span.seg-price'), # 正则表达式不一样 #_j_search_result_left > div:nth-child(1) > div > div:nth-child(2) > div.ct-text > ul > li:nth-child(2) > a Field(fieldname=FieldName.SHOP_COMMENT_NUM, css_selector='div.ct-text > ul > li:nth-child(2) > a', is_info=True), Field(fieldname=FieldName.SHOP_FEATURE, css_selector='div.ct-text > div > p:nth-child(1)', is_info=True),
from spider.driver.base.page import Page, NextPageCssSelectorSetup, PageFunc from spider.driver.base.field import Fieldlist, Field, FieldName from spider.driver.base.tabsetup import TabSetup from spider.driver.base.listcssselector import ListCssSelector from spider.driver.base.mongodb import Mongodb from selenium import webdriver from spider.driver.base.driver import * import re import time import json from pyquery import PyQuery import xmltodict # fl_shop1 = Fieldlist( Field(fieldname=FieldName.SHOP_NAME, css_selector= 'div.sight_item_detail.clrfix > div.sight_item_about > h3 > a'), # 5A景区 Field( fieldname=FieldName.SHOP_RATE, css_selector= 'div.sight_item_detail.clrfix > div.sight_item_about > div.sight_item_info > div.clrfix > span.level' ), Field(fieldname=FieldName.SHOP_URL, css_selector= 'div.sight_item_detail.clrfix > div.sight_item_about > h3 > a', attr='href'), Field( fieldname=FieldName.SHOP_IMG, css_selector= 'div.sight_item_detail.clrfix > div.sight_item_show > div.show loading > a > img',
import re
import random
import datetime
import math
from urllib import request
import demjson
import json


def get_shop_area(self, _str):
    """Return the fixed area label for this list; the input is ignored."""
    return '千岛湖乡村游景点'


# Field definitions for one entry of the shop list.
fl_shop1 = Fieldlist(
    Field(
        fieldname=FieldName.SHOP_NAME,
        css_selector='dl > dd > a > h2',
        is_info=True,
    ),
    Field(
        fieldname=FieldName.SHOP_URL,
        css_selector='dl > dd > a',
        attr='href',
        is_info=True,
    ),
    Field(
        fieldname=FieldName.SHOP_ADDRESS,
        css_selector='dl > dd > div.tourListLeftListMsg > span:nth-child(1)',
        is_info=True,
    ),
    Field(
        fieldname=FieldName.SHOP_PHONE,
        css_selector='dl > dd > div.tourListLeftListMsg > span:nth-child(2)',
        is_info=True,
    ),
    # Same selector as SHOP_PHONE; get_shop_area discards the scraped text
    # and returns a constant.
    Field(
        fieldname=FieldName.SHOP_AREA,
        css_selector='dl > dd > div.tourListLeftListMsg > span:nth-child(2)',
        filter_func=get_shop_area,
        is_info=True,
    ),
)
from spider.driver.travel.core.traveldriver import TravelDriver from spider.driver.base.page import Page, NextPageCssSelectorSetup, PageFunc from spider.driver.base.field import Fieldlist, Field, FieldName from spider.driver.base.tabsetup import TabSetup from spider.driver.base.listcssselector import ListCssSelector from spider.driver.base.mongodb import Mongodb import re import time import json from pyquery import PyQuery import xmltodict import math import datetime fl_shop1 = Fieldlist( Field(fieldname=FieldName.SHOP_NAME, css_selector='div.search_ticket_title > h2 > a'), Field(fieldname=FieldName.SHOP_RATE, css_selector='div.search_ticket_title > h2 > span > span.rate'), Field(fieldname=FieldName.SHOP_URL, css_selector='div.search_ticket_title > h2 > a', attr='href'), Field(fieldname=FieldName.SHOP_IMG, css_selector='a > img', attr='src'), Field(fieldname=FieldName.SHOP_ADDRESS, css_selector='div.search_ticket_title > div.adress'), Field(fieldname=FieldName.SHOP_GRADE, css_selector='div.search_ticket_assess > span.grades > em'), Field(fieldname=FieldName.SHOP_COMMENT_NUM, css_selector='div.search_ticket_assess > span.grades', regex=r'^[^\(]*\(([\d]+)[^\)\d]*\)$', repl=r'\1'), Field(fieldname=FieldName.SHOP_FEATURE,
import math import datetime def get_shop_grade(self, _str): saveTo = round(float(_str[0:-1]) / 100 * 5, 1) return str(saveTo) def get_shop_rate(self, _str): return "" fl_shop1 = Fieldlist( Field(fieldname=FieldName.SHOP_NAME, css_selector= 'div.product-regular.clearfix > div.product-section > h3 > a', is_info=True), Field(fieldname=FieldName.SHOP_RATE, css_selector='', is_info=True, filter_func=get_shop_rate), Field(fieldname=FieldName.SHOP_URL, css_selector= 'div.product-regular.clearfix > div.product-section > h3 > a', attr='href', is_info=True), Field(fieldname=FieldName.SHOP_IMG, css_selector= 'div.product-regular.clearfix > div.product-left > a > img', attr='src', is_info=True),
def get_shop_grade(self, _str):
    """Summarise the ``<li>`` items of an HTML snippet as a JSON string.

    Each ``<li>`` in *_str* is matched against three marker substrings and
    the number found in its text is stored under the corresponding key:

    * text containing '分'   -> key '评分'   (float)
    * text containing '评价' -> key '评论数' (int)
    * text containing '游记' -> key '游记数' (int)

    Only the first item seen for each key is kept. An item that carries a
    marker but no parsable number is skipped instead of raising
    ``ValueError``.

    Returns:
        The collected mapping serialised with ``json.dumps`` and
        ``ensure_ascii=False``.
    """
    # NOTE(review): `pq` is not defined in the visible source; presumably an
    # alias of pyquery.PyQuery imported elsewhere in the file - confirm.
    p = pq(_str)
    result = {}
    for item in p('li').items():
        text = item.text()
        try:
            if '分' in text:
                result.setdefault('评分', float(re.sub(r'[^\d.]*', '', text)))
            elif '评价' in text:
                result.setdefault('评论数', int(re.sub(r'[^\d]*', '', text)))
            elif '游记' in text:
                result.setdefault('游记数', int(re.sub(r'[^\d]*', '', text)))
        except ValueError:
            # Marker present but nothing numeric left after stripping.
            continue
    return json.dumps(result, ensure_ascii=False)


# Field definitions for one entry of the hotel list.
fl_shop1 = Fieldlist(
    Field(
        fieldname=FieldName.SHOP_NAME,
        css_selector='div.hotel-title > div > h3 > a',
    ),
    # NOTE(review): the selector is pinned to the `.rate5` class, so it only
    # matches elements carrying that class - confirm this is intended.
    Field(
        fieldname=FieldName.SHOP_RATE,
        css_selector='div.hotel-title > div > span.hotel-rate.rate5',
        attr='class',
        regex=r'[^\d]*',  # raw string: '\d' is an invalid escape in a plain literal
        is_info=True,
    ),
    Field(
        fieldname=FieldName.SHOP_INTRO,
        css_selector='div.hotel-info > ul',
        attr="innerHTML",
        is_debug=True,  # was the string 'True'
        filter_func=get_shop_grade,
        is_info=True,
    ),
)

# fl_shop2 = Fieldlist(
#     Field(fieldname=FieldName.SHOP_ADDRESS, css_selector='div.container > div.hotel-intro > div.intro-hd > div.location > span', attr='title', offset=6, try_times=10, pause_time=1),
saveTo = round(float(_str[0:-1]) / 100 * 5, 1) return str(saveTo) def get_shop_feature(self, _str): return "" def get_shop_rate(self, _str): return "" fl_shop1 = Fieldlist( Field(fieldname=FieldName.SHOP_NAME, css_selector='div.theinfo.ticket.clearfix > a > dl > dt > p > span', is_info=True), Field(fieldname=FieldName.SHOP_PRICE, css_selector= 'div.theinfo.ticket.clearfix > a > div.priceinfo > span > em', is_info=False), #稍微有点问题 Field(fieldname=FieldName.SHOP_URL, css_selector='div.theinfo.ticket.clearfix > a', attr='href', is_debug=True, is_info=False), #img还有些许问题 Field( fieldname=FieldName.SHOP_IMG, css_selector='div.theinfo.ticket.clearfix > a > div.imgbox > div > img',
# -*- coding:utf-8 -*- from spider.driver.base.driver import Driver from spider.driver.base.mysql import Mysql import time from pyquery import PyQuery from spider.driver.base.field import Field, FieldName, Fieldlist, FieldType from spider.driver.base.page import Page from spider.driver.base.listcssselector import ListCssSelector from spider.driver.base.mongodb import Mongodb from spider.driver.base.tabsetup import TabSetup fl_weixin1 = Fieldlist( Field(fieldname='public_name', css_selector='div > div.txt-box > p.tit > a', regex=r'[^\u4e00-\u9fa5]*'), ) fl_weixin2 = Fieldlist( Field(fieldname='article_name', css_selector='div > div > h4'), Field(fieldname='article_time', css_selector='div > div > p.weui_media_extra_info'), ) page_weixin_1 = Page(name='微信公众号列表页面', fieldlist=fl_weixin1, listcssselector=ListCssSelector( list_css_selector='#main > div.news-box > ul > li')) page_weixin_2 = Page( name='微信公众号文章列表页面', fieldlist=fl_weixin2, tabsetup=TabSetup(click_css_selector='div > div.txt-box > p.tit > a'),
# -*- coding:utf-8 -*- from spider.driver.travel.core.traveldriver import TravelDriver from spider.driver.base.page import Page, NextPageCssSelectorSetup, PageFunc from spider.driver.base.field import Fieldlist, Field, FieldName from spider.driver.base.tabsetup import TabSetup from spider.driver.base.listcssselector import ListCssSelector from spider.driver.base.mongodb import Mongodb import re import time import json from pyquery import PyQuery fl_shop1 = Fieldlist( Field(fieldname=FieldName.SHOP_NAME, css_selector='div:nth-child(2) > span'), ) fl_shop2 = Fieldlist( Field( fieldname=FieldName.SHOP_NAME, css_selector= 'body > div > div.rax-scrollview > div > div:nth-child(1) > div > div:nth-child(1) > span' ), ) fl_comment1 = Fieldlist( # Field(fieldname=FieldName.SHOP_NAME, css_selector='body > div > div.rax-scrollview > div > div:nth-child(1) > div > div:nth-child(1) > span'), Field(fieldname=FieldName.COMMENT_USER_NAME, css_selector='div.rate-info > div.avatar-info > div.user-nick'), ) page_shop_1 = Page(name='飞猪景点店铺列表页面', fieldlist=fl_shop1,
# -*- coding:utf-8 -*- from spider.driver.base.field import Fieldlist,Field,FieldName from spider.driver.base.tabsetup import TabSetup from spider.driver.base.page import Page from spider.driver.base.listcssselector import ListCssSelector from spider.driver.base.mongodb import Mongodb from spider.driver.travel.core.traveldriver import TravelDriver import time from pyquery import PyQuery import json import re fl_shop1 = Fieldlist( Field(fieldname=FieldName.SHOP_IMG, css_selector='div > div.h_info_pic > a > img', attr='src'), Field(fieldname=FieldName.SHOP_NAME, css_selector='div > div.h_info_text > div.h_info_base > p.h_info_b1 > a', attr='title'), Field(fieldname=FieldName.SHOP_URL, css_selector='div > div.h_info_text > div.h_info_base > p.h_info_b1 > a', attr='href'), Field(fieldname=FieldName.SHOP_RATE, css_selector='div > div.h_info_text > div.h_info_base > p.h_info_b1 > b', attr='class', regex=r'[^\d]*'), Field(fieldname=FieldName.SHOP_GRADE, css_selector='div > div.h_info_text > div.h_info_comt', regex=r'^([\d.]*).*$', repl=r'\1'), Field(fieldname=FieldName.SHOP_COMMENT_NUM, css_selector='div > div.h_info_text > div.h_info_comt', regex=r'^[\d.]*[^\d]*([\d]*)[^\d]*$', repl=r'\1'), Field(fieldname=FieldName.SHOP_ADDRESS, css_selector='div > div.h_info_text > div.h_info_base > p.h_info_b2'), Field(fieldname=FieldName.SHOP_ACTIVE_STATUS, css_selector='div > div.h_info_text > div.h_info_base > p.lastt_book'), Field(fieldname=FieldName.SHOP_PRICE, css_selector='div > div.h_info_text > div.h_info_pri', regex=r'[^\d.]*'), ) def get_shop_room(self, _str): p = PyQuery(_str) room_list = [] for i in p('div.hdetail_type > div.htype_list > div.htype_item').items(): info_list = i('div.htype_info').text().split('\n') detail = info_list[3].split('|')
return "" def get_shop_grade(self,_str): return "" def get_shop_feature(self,_str): return "" def get_shop_rate(self,_str): return "" def get_shop_comment_url(self,_str): shop_id = re.findall(r'([\d]{1,10})',_str)[0]; shop_comment_url = "https://m.tuniu.com/h5/tour/comment/" + shop_id + "/4" return shop_comment_url fl_shop1 = Fieldlist( Field(fieldname=FieldName.SHOP_NAME, css_selector='a > div.search-scenic-content > h3'), #\31 302 > div:nth-child(2) > div:nth-child(3) > div > div:nth-child(1) > span:nth-child(2) #\32 0808 > div:nth-child(2) > div:nth-child(3) > div > div:nth-child(1) > span:nth-child(2) Field(fieldname=FieldName.SHOP_PRICE, css_selector='a > div.search-scenic-content > div.search-scenic-wrapper > div.search-scenic-price > span',is_info=True), #稍微有点问题 Field(fieldname=FieldName.SHOP_URL,css_selector='a',attr='href', is_debug=True,is_info=True), #img还有些许问题 #\33 6822720 > div:nth-child(1) > div Field(fieldname=FieldName.SHOP_IMG, css_selector='a > div.img-container.lazy-img-box.fl > img', attr='src', is_info=True), Field(fieldname=FieldName.SHOP_ADDRESS, css_selector= '',filter_func=get_shop_address, is_info=True), #这里应该做一个转换 #\34 187 > div:nth-child(2) > div:nth-child(3) > div > div:nth-child(2) > span:nth-child(1) Field(fieldname=FieldName.SHOP_GRADE,css_selector='',filter_func=get_shop_grade, is_info=True), #正则表达式的使用有问题 Field(fieldname=FieldName.SHOP_COMMENT_NUM,css_selector='a > div.search-scenic-content > div.search-scenic-wrapper > div.search-scenic-detail > p',is_info=True), #无shop_feature
def get_shop_url(self, _str): matchObj = re.search(r'http.*html', _str, re.M | re.I) return str(matchObj.group()) def get_shop_rate(self, _str): return "" fl_shop1 = Fieldlist( Field(fieldname=FieldName.SHOP_NAME, css_selector=' dl > dt > a', is_debug=True), Field(fieldname=FieldName.SHOP_URL, css_selector='dl > dt > a', attr='onclick', filter_func=get_shop_url, is_info=True), Field(fieldname=FieldName.SHOP_IMG, css_selector=' a > img', attr='src', is_info=True), Field(fieldname=FieldName.SHOP_ADDRESS, css_selector='dl > dd.proInfo-address > i', is_info=True), Field(fieldname=FieldName.SHOP_COMMENT_NUM, css_selector=' div > div:nth-child(2) > ul > li:nth-child(2) > a',
import re
import time
import json
from pyquery import PyQuery
import xmltodict


def get_shop_rate(self, _str):
    """Always return an empty string; the input is ignored."""
    return ""


def get_shop_feature(self, _str):
    """Always return an empty string; the input is ignored."""
    return ""


# Field definitions for one entry of the hotel list.
fl_shop1 = Fieldlist(
    Field(
        fieldname=FieldName.SHOP_NAME,
        css_selector=' div > div.h_info > div.h_info_text > div.h_info_base > p.h_info_b1 > a > span.info_cn',
        attr='innerHTML',
        is_info=True,
    ),
    Field(
        fieldname=FieldName.SHOP_URL,
        css_selector='div > div.h_info_text > div.h_info_base > p.h_info_b1 > a',
        attr='href',
        is_info=True,
    ),
    # Has some problems.
    Field(
        fieldname=FieldName.SHOP_IMG,
        css_selector='div.h_info_pic > a > img',
        attr='big-src',
        is_info=True,
    ),
    Field(
        fieldname=FieldName.SHOP_ADDRESS,
        css_selector='div > div.h_info_text > div.h_info_base > p.h_info_b2',
        is_info=True,
    ),
    # Has slight problems.
    Field(
        fieldname=FieldName.SHOP_PRICE,
        css_selector='div > div.h_info_text > div.h_info_pri > p:nth-child(1) > a > span.h_pri_num',
        is_info=True,
    ),
    Field(
        fieldname=FieldName.SHOP_COMMENT_NUM,
        css_selector='div > div.h_info_text > div.h_info_comt > a > span.c555.block.mt5',
    ),
    Field(
        fieldname=FieldName.SHOP_GRADE,
        css_selector=' div > div.h_info_text > div.h_info_comt > a > span.h_info_comt_bg > i.c37e',
        is_info=True,
    ),
    # The two fields below have no selector; their value comes from
    # filter_func, which returns an empty string.
    Field(
        fieldname=FieldName.SHOP_RATE,
        css_selector='',
        filter_func=get_shop_rate,
        is_info=True,
    ),
    Field(
        fieldname=FieldName.SHOP_FEATURE,
        css_selector='',
        filter_func=get_shop_feature,
        is_info=True,
    ),
)

# Empty field list.
fl_shop2 = Fieldlist()

# List page: one fl_shop1 record per element matched by the list selector,
# stored in the shop collection.
page_shop_1 = Page(
    name='艺龙酒店店铺列表页面',
    fieldlist=fl_shop1,
    listcssselector=ListCssSelector(
        list_css_selector='#hotelContainer > div > div',
    ),
    mongodb=Mongodb(
        db=TravelDriver.db,
        collection=TravelDriver.shop_collection,
    ),
    is_save=True,
)
import json import re import random def get_shop_tag(self, _str): p = PyQuery(_str) tag_list = [] for i in list(p('span').items())[1:]: tag_list.append(i.text()) return json.dumps(tag_list, ensure_ascii=False) def get_shop_rate(self, _str): return str(float((int(_str)/10))) fl_shop1 = Fieldlist( Field(fieldname=FieldName.SHOP_PRICE, css_selector='div.hotel-info-ctn > div.hotel-remark > div.price > p > strong'), Field(fieldname=FieldName.SHOP_NAME, css_selector='div.hotel-info-ctn > div.hotel-info-main > h2 > a.hotel-name-link'), Field(fieldname=FieldName.SHOP_RATE, css_selector='div.hotel-info-ctn > div.hotel-remark > div.remark > div > div > span', attr='class', regex=r'[^\d]*', filter_func=get_shop_rate), # Field(fieldname=FieldName.SHOP_TAG, css_selector='div.hotel-info-ctn > div.hotel-info-main > p.hotel-tags', attr='innerHTML', filter_func=get_shop_tag, pause_time=3), Field(fieldname=FieldName.SHOP_URL,css_selector='',attr='href',is_info=True), Field(fieldname=FieldName.SHOP_IMG,css_selector='',attr='src',is_info=True), Field(fieldname=FieldName.SHOP_ADDRESS,css_selector='',is_info=True), Field(fieldname=FieldName.SHOP_GRADE,css_selector='',is_info=True), Field(fieldname=FieldName.SHOP_COMMENT_NUM,css_selector='',is_info=True), Field(fieldname=FieldName.SHOP_FEATURE,css_selector='',is_info=True) ) def get_shop_room_all(self, _str): p = PyQuery(_str) sale_dict = {}
def get_shop_score(self, _str): return (float(re.findall(r'([\d]{1,4})', _str)[0]) / 10) def get_zero(self, _str): return 0.0 def get_shop_site(self, _str): return self.shop_site fl_shop1 = Fieldlist( Field(fieldname=FieldName.SHOP_NAME, css_selector='div.txt > div.tit > a > h4', is_info=True), Field(fieldname=FieldName.SHOP_URL, css_selector='div.txt > div.tit > a', attr='href', is_info=True), Field(fieldname=FieldName.SHOP_COMMENT_NUM, css_selector='div.txt > div.comment > a.review-num', attr='innerHTML', filter_func=get_zero, is_info=True), Field(fieldname=FieldName.SHOP_PRICE, css_selector='div.txt > div.comment > a.mean-price > b', attr='innerHTML', filter_func=get_zero, is_info=True),
# -*- coding:utf-8 -*-
from spider.driver.base.field import Fieldlist, Field, FieldName
from spider.driver.base.tabsetup import TabSetup
from spider.driver.base.page import Page
from spider.driver.base.listcssselector import ListCssSelector
from spider.driver.base.mongodb import Mongodb
from spider.driver.travel.core.traveldriver import TravelDriver
import time
from pyquery import PyQuery
import json

# Field definitions for one entry of the hotel list.
# NOTE(review): how Field applies `regex` is defined in
# spider.driver.base.field, which is not visible here - confirm there.
fl_shop1 = Fieldlist(
    Field(
        fieldname=FieldName.SHOP_IMG,
        css_selector='div.hotel-pic > a > img',
        attr='src',
    ),
    Field(
        fieldname=FieldName.SHOP_NAME,
        css_selector='div.hotel-title > div > h3 > a',
    ),
    Field(
        fieldname=FieldName.SHOP_URL,
        css_selector='div.hotel-title > div > h3 > a',
        attr='href',
    ),
    Field(
        fieldname=FieldName.SHOP_RATE,
        css_selector='div.hotel-title > div > span.hotel-rate',
        attr='class',
        regex=r'[^\d]*',
    ),
    Field(
        fieldname=FieldName.SHOP_GRADE,
        css_selector='div.hotel-info > ul > li.rating > em',
    ),
    Field(
        fieldname=FieldName.SHOP_COMMENT_NUM,
        css_selector='div.hotel-info > ul > li:nth-child(2) > a > em',
        regex=r'[^\d]*',
    ),
)
from selenium.webdriver.remote.webelement import WebElement
from spider.driver.base.tabsetup import TabSetup
from spider.driver.base.field import Field, Fieldlist
from spider.driver.base.page import Page, PageGroup
from spider.driver.base.listcssselector import ListCssSelector
from spider.driver.base.mongodb import Mongodb

# Scratch script: build two pages from shared components, group them and
# print the first item the group yields.
# NOTE(review): several arguments are ints (fieldname, list_css_selector,
# url_name, name) - presumably deliberate probes of argument handling;
# confirm.
fl = Fieldlist(
    Field(fieldname=12),
    Field(fieldname=13),
)
mongo = Mongodb(db='122', collection='12')
lcs = ListCssSelector(list_css_selector=12)
tab = TabSetup(url_name=12)

p = Page(
    name=122,
    fieldlist=fl,
    mongodb=mongo,
    listcssselector=lcs,
    tabsetup=tab,
)
p1 = Page(
    name=123,
    fieldlist=fl,
    mongodb=mongo,
    listcssselector=lcs,
    tabsetup=tab,
)

pg = PageGroup(p, p1)
print(next(pg))
# -*- coding:utf-8 -*- from spider.driver.base.field import Fieldlist,Field,FieldName from spider.driver.base.tabsetup import TabSetup from spider.driver.base.page import Page from spider.driver.base.listcssselector import ListCssSelector from spider.driver.base.mongodb import Mongodb from spider.driver.travel.core.traveldriver import TravelDriver import time from pyquery import PyQuery import json fl_shop1 = Fieldlist( Field(fieldname=FieldName.SHOP_NAME, css_selector='div > div.row-center > div > h5 > a'), Field(fieldname=FieldName.SHOP_CURR_URL, css_selector='div > div.row-center > div > h5 > a', attr='href'), Field(fieldname=FieldName.SHOP_IMG, css_selector='div > div.row-left.fleft > a > img', attr='src'), Field(fieldname=FieldName.SHOP_RATE, css_selector='div > div.row-center > div > h5 > span.row-subtitle', attr='title', regex=r'[^\d]*'), Field(fieldname=FieldName.SHOP_ACTIVE_STATUS, css_selector='div > div.row-center > div > p.row-someone-book > span'), Field(fieldname=FieldName.SHOP_GRADE, css_selector='div > div.row-sub-right.fright > a > p.score > span.value'), Field(fieldname=FieldName.SHOP_COMMENT_NUM, css_selector='div > div.row-sub-right.fright > a > p.comment > span'), Field(fieldname=FieldName.SHOP_PRICE, css_selector='div > div.row-right.fright > div.box-price > p > span.pi-price.pi-price-lg', regex=r'[^\d]*'), ) def get_room_all(self, _str): p = PyQuery(_str) room_list = [] for i in p('div.room-item-wrapper > div.room-item-inner > div:nth-child(1)').items(): room_list.append(i.text().split()[1:]) return json.dumps(room_list, ensure_ascii=False) def get_shop_intro(self, _str):
def _get_shop_comment_num(self, _str): return "" def get_shop_url(self, _str): return 'https://market.m.taobao.com/apps/market/travelticket/detail.html?wh_weex=true&scenicId=' + str( _str) + '&gsCallback=' + str(_str) def get_shop_img(self, _str): return "" fl_shop1 = Fieldlist( Field(fieldname=FieldName.SHOP_NAME, css_selector='div:nth-child(2) > span'), #\31 302 > div:nth-child(2) > div:nth-child(3) > div > div:nth-child(1) > span:nth-child(2) #\32 0808 > div:nth-child(2) > div:nth-child(3) > div > div:nth-child(1) > span:nth-child(2) Field( fieldname=FieldName.SHOP_PRICE, css_selector= 'div:nth-child(2) > div:nth-child(3) > div > div:nth-child(1) > span:nth-child(2)', is_info=True), #稍微有点问题 Field(fieldname=FieldName.SHOP_URL, css_selector='', attr='id', filter_func=get_shop_url, is_debug=True, is_info=True), #img还有些许问题
p = PyQuery(_str) except Exception: return None return json.dumps([i.text() for i in p('span').items()][1:], ensure_ascii=False) def get_shop_rate(self, _str): return str(float((int(_str)/10))) def get_shop_subtype_name(self, _str): return _str.strip() def get_shop_feature(self,_str): return "" def get_comment_url(self,_str): return _str + "/review_all" fl_shop1 = Fieldlist( Field(fieldname=FieldName.SHOP_NAME, css_selector='div.txt > div.tit > a > h4'), Field(fieldname=FieldName.SHOP_URL, css_selector='div.txt > div.tit > a', attr='href'), Field(fieldname=FieldName.SHOP_COMMENT_NUM, css_selector='div.txt > div.comment > a.review-num'), Field(fieldname=FieldName.SHOP_PRICE, css_selector='div.txt > div.comment > a.mean-price'), Field(fieldname=FieldName.SHOP_RATE, css_selector='div.txt > div.comment > span', attr='class', regex=r'[^\d]*', filter_func=get_shop_rate), Field(fieldname=FieldName.SHOP_ADDRESS, css_selector='div.txt > div.tag-addr > span.addr'), Field(fieldname=FieldName.SHOP_IMG,css_selector='div.pic > a > img',is_info=True), Field(fieldname=FieldName.SHOP_FEATURE,css_selector='',filter_func=get_shop_feature, is_info=True), Field(fieldname=FieldName.SHOP_GRADE,css_selector='div.txt > span > span:nth-child(1) > b',is_info=True), Field(fieldname=FieldName.SHOP_COMMENT_URL, css_selector='div.txt > div.tit > a', attr='href',filter_func=get_comment_url, is_info=True) ) page_shop_1 = Page(name='大众点评景点店铺列表页面', fieldlist=fl_shop1, listcssselector=ListCssSelector(list_css_selector='#shop-all-list > ul > li'), mongodb=Mongodb(db=TravelDriver.db, collection=TravelDriver.shop_collection), is_save=True) def get_shop_time(self, _str): try:
except Exception: return None return json.dumps([i.text() for i in p('span').items()][1:], ensure_ascii=False) def get_shop_rate(self, _str): return str(float((int(_str) / 10))) def get_shop_subtype_name(self, _str): return _str.strip() fl_shop1 = Fieldlist( Field(fieldname=FieldName.SHOP_NAME, css_selector='div.txt > div.tit > a > h4'), Field(fieldname=FieldName.SHOP_URL, css_selector='div.txt > div.tit > a', attr='href'), Field(fieldname=FieldName.SHOP_COMMENT_NUM, css_selector='div.txt > div.comment > a.review-num'), Field(fieldname=FieldName.SHOP_PRICE, css_selector='div.txt > div.comment > a.mean-price'), Field(fieldname=FieldName.SHOP_RATE, css_selector='div.txt > div.comment > span', attr='class', regex=r'[^\d]*', filter_func=get_shop_rate), Field(fieldname=FieldName.SHOP_TAG, css_selector='div.txt > span.comment-list', attr='innerHTML',
# -*- coding:utf-8 -*- from spider.driver.base.field import Fieldlist,Field,FieldName from spider.driver.base.tabsetup import TabSetup from spider.driver.base.page import Page,NextPageCssSelectorSetup,PageFunc,NextPageLinkTextSetup from spider.driver.base.listcssselector import ListCssSelector from spider.driver.base.mongodb import Mongodb from spider.driver.travel.core.traveldriver import TravelDriver import time from pyquery import PyQuery import json import re fl_shop1 = Fieldlist( Field(fieldname=FieldName.SHOP_NAME, css_selector='div > div.ct-text > h3 > a', is_debug=True), Field(fieldname=FieldName.SHOP_URL,css_selector='div > div.ct-text > h3 > a',attr='href'), Field(fieldname=FieldName.SHOP_IMG, css_selector=' div > div.flt1 > a > img', attr='src'), Field(fieldname=FieldName.SHOP_ADDRESS, css_selector='div > div.ct-text > ul > li:nth-child(1) > a'), # Field(fieldname=FieldName.SHOP_GRADE,css_selector='div.search_ticket_assess > span.grades > em'), #正则表达式不一样 Field(fieldname=FieldName.SHOP_COMMENT_NUM,css_selector='div > div.ct-text > ul > li:nth-child(2) > a', regex=r'^[^\(]*\(([\d]+)[^\)\d]*\)$', repl=r'\1'), Field(fieldname=FieldName.SHOP_FEATURE, css_selector='div > ul > li:nth-child(1) > div > div.ct-text > p'), ) def get_shop_ticket(): print(111) def get_shop_info(): print(222) fl_shop2 = Fieldlist( Field(fieldname=FieldName.SHOP_PRICE, css_selector='body > div.container > div:nth-child(6) > div.mod.mod-detail > dl:nth-child(4) > dd > div:nth-child(1) > div', pause_time=3, is_focus=True, is_info=True), Field(fieldname=FieldName.SHOP_TIME, css_selector='body > div.container > div:nth-child(6) > div.mod.mod-detail > dl:nth-child(5) > dd > div:nth-child(1)', is_focus=True),