def get_price(self, response):
    """Yield one price item per product found in the response payload.

    The body is narrowed to the text between ``"html":`` and
    ``isFullHouse``, split into room-type records, and each room record is
    split again into product records. Every product record produces a deep
    copy of the item carried in ``response.meta["item"]`` with the scraped
    fields added.
    """
    base_item = response.meta.get("item")
    payload = extract_re(""""html":(.*?)isFullHouse""", response.body.decode())
    for room_html in re.findall("""room_unfold(.*?)class='clicked hidden""", payload):
        # Room type name. Written onto the shared base item so that every
        # product copied from it below inherits the value.
        base_item["ROOM_TYPE"] = extract_re(r"""RoomName\\":\\"(.*?)\\""", room_html)
        product_chunks = re.findall(
            r"""data-hotelInvoice(.*?class=\\"hotel_room_last\\">.*?<\\/div>)""",
            room_html)
        for product_html in product_chunks:
            offer = copy.deepcopy(base_item)

            # Product name
            offer["PRODUCT_TYPE"] = extract_re(
                r"""(room_type_name\\".*?background-image:url\(|room_type_name\\".*?)([^>"]*?)(<br\\/>[^']|\)\\"><|\\/span>|<\\/[es])""",
                product_html,
                group_num=2)

            # Payment method, mapped to a code by classify()
            payment_text = extract_re(r"""payment_txt\\".*?>(.*?)<""", product_html)
            payment_code = classify(
                {
                    "0": "(在线付)",
                    "2": "(担保)",
                    "1": "(到店付)"
                }, payment_text)
            offer["PAYMENT_TYPE"] = payment_code or "null"

            # Agent label; falls back to "true" when nothing is extracted
            agent_label = extract_re(
                r"""data-role=\\"title\\">(.*?)<\\/span>""", product_html)
            offer["IS_NOT_AGENT"] = agent_label or "true"

            # Booking status
            booking_text = extract_re(r"""btns_base22_main\\">(.*?)<""", product_html)
            offer["AVAILABLE_ROOM_SITUATION"] = classify(
                {
                    "可预订": "(预订)",
                    "满房": "(订完)"
                }, booking_text)

            # Breakfast
            breakfast = extract_re(r"""col4'>(.*?)<""", product_html)
            offer["BREAKFAST"] = breakfast or "null"

            # Original price
            offer["ORIGINAL_PRICE"] = extract_re(
                r"""data-price='(\d+)'""", product_html)

            # Package price
            package_price = extract_re(
                r"""rt_origin_price\\"><dfn>¥<\\/dfn>(.*?)<""", product_html)
            offer["DISCOUNT_PRICE"] = package_price or "0"

            # Cashback amount
            cashback = extract_re(r"""span>返现(.*?)<""", product_html)
            offer["DISCOUNT"] = cashback or "0"

            print(offer)
            yield offer
def parse(self, response):
    """Yield the hotel's review summary, then one request per review page.

    The summary item is taken from ``response.meta["meiTuanZongHeItem"]``
    (an empty dict when absent), filled with the rating and review count
    scraped from the page, and yielded first. Afterwards one comment-API
    request is yielded for each page from 1 to ``self.next_page_num``, ten
    comments per page, each handled by ``self.get_review``.
    """
    item = response.meta["item"]
    summary = response.meta.get("meiTuanZongHeItem", {})
    # ``response.text`` replaces the deprecated ``body_as_unicode()``.
    html = response.text

    # Raw strings keep ``\(`` / ``\)`` as regex escapes without relying on
    # Python passing unknown string escapes through unchanged.
    summary["RATING_VALUE"] = extract_re(
        r"""class="score-color">(.*?)</em>""", html)
    summary["REVIEW_COUNT"] = extract_re(r"""住客点评\((.*?)\)""", html)
    summary["GUEST_TYPE"] = "null"
    yield summary

    # Loop-invariant pieces are built once instead of once per page.
    headers = {
        'User-Agent':
        'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.86 Safari/537.36',
    }
    url_template = 'http://api.hotel.meituan.com/group/v1/poi/comment/{hotel_id}?sortType=default&noempty=1&withpic=0&filter=all&limit=10&offset={offset}'

    for page in range(1, int(self.next_page_num) + 1):
        url = url_template.format(
            hotel_id=item["meiTuan_hotel_id"],
            offset=str((page - 1) * 10))
        yield Request(url,
                      headers=headers,
                      callback=self.get_review,
                      meta={
                          # Each request gets its own copy of the item.
                          "item": copy.deepcopy(item),
                          "is_need_proxy": True
                      },
                      dont_filter=True)
def start_requests(self):
    """Seed price requests for every NORMAL hotel of this site.

    Only the host whose IP equals the ``MASTER_HOST`` setting produces
    requests; any other host prints a notice and yields nothing. For each
    hotel row and each check-in/check-out pair returned by
    ``getEveryDayFormatMapTuple(self.dateTime)``, a form request is
    yielded whose ``url_str`` field carries the price-list URL.
    """
    is_master = get_current_ip() == self.settings.get("MASTER_HOST", "")
    if not is_master:
        print("美团消费者!")
        return

    self.mysql_client = SQLServer.from_settings(
        self.settings,
        self.cf.get("MYSQL_SERVER", "type"),
        self.cf.get("MYSQL_SERVER", "db"))
    query = "SELECT BIG_DATA_HOTEL_ID,SITE_ID,URL_CRAWL_INFO FROM `MS_EST_WH_HOTEL_SITE_REL` WHERE `STATUS`='NORMAL' AND SITE_ID={site_id};".format(
        site_id=self.site_id)

    endpoint = "http://meituan-549257379.cn-north-1.elb.amazonaws.com.cn:80/get_meituan_price"
    price_url = "https://ihotel.meituan.com/productapi/v2/prepayList?type=1&utm_medium=PC&version_name=7.3.0&poiId={hotel_id}&start={check_in}&end={check_out}"

    for row in self.mysql_client.select(query):
        hotel = {
            "hotel_id": extract_re("REGEX_COLUMNS1\":\"(.*?)\"", row[2]),
            "BIG_DATA_HOTEL_ID": row[0],
            "SITE_ID": str(row[1]),
            "CHECK_POINT": self.CHECK_POINT
        }
        # Each pair is (check-in info, check-out info); index 0 of an info
        # goes into the URL and index 1 is stored on the item.
        for check_in, check_out in getEveryDayFormatMapTuple(self.dateTime):
            stay = copy.deepcopy(hotel)
            stay["CHECKIN_DATE"] = check_in[1]
            stay["CHECKOUT_DATE"] = check_out[1]
            payload = {
                "url_str": price_url.format(hotel_id=hotel["hotel_id"],
                                            check_in=check_in[0],
                                            check_out=check_out[0])
            }
            yield scrapy.FormRequest(url=endpoint,
                                     formdata=payload,
                                     callback=self.get_meituan_url,
                                     meta={"item": stay})
def MeiTuanReview(site_info, results, CHECK_POINT):
    """Take one row from ``results`` and push a review-page request to Redis.

    This is a generator: it yields the return value of ``rds.lpush`` once,
    or nothing at all when ``next(results)`` raises.
    """
    rds = RedisHelper()
    try:
        row = next(results)
    except Exception:
        # Nothing (more) to consume: finish without yielding.
        return

    summary = MeiTuanZongHeItem()
    summary["BIG_DATA_HOTEL_ID"] = row[0]
    summary["SITE_ID"] = str(row[1])
    summary["CHECK_POINT"] = CHECK_POINT

    item = {
        "meiTuan_hotel_id": extract_re("REGEX_COLUMNS1\":\"(.*?)\"", row[2]),
        "bigDataHotelId": row[0],
        "siteId": str(row[1]),
        "CHECK_POINT": CHECK_POINT
    }
    print("MeiTuanReview", time.time(), item)

    url = "http://www.meituan.com/jiudian/{}/#comment".format(
        item.get("meiTuan_hotel_id", ""))
    queue_key = "{}:requests".format(site_info["SpiderName"])
    request_meta = {
        "item": copy.deepcopy(item),
        "meiTuanZongHeItem": summary,
        "is_need_proxy": True
    }
    review_request = Request(url=url,
                             meta=request_meta,
                             priority=-10,
                             dont_filter=True)
    yield rds.lpush(key=queue_key, request=review_request)
def CtripReview(site_info, results, CHECK_POINT):
    """Take one row from ``results`` and push a Ctrip review request to Redis.

    This is a generator: it yields the return value of ``rds.lpush`` once,
    or nothing at all when ``next(results)`` raises.
    """
    rds = RedisHelper()
    try:
        row = next(results)
    except Exception:
        # Nothing (more) to consume: finish without yielding.
        return

    item = {
        "ctrip_hotel_id": extract_re("REGEX_COLUMNS1\":\"(.*?)\"", row[2]),
        "bigDataHotelId": row[0],
        "siteId": str(row[1]),
        "CHECK_POINT": CHECK_POINT
    }

    summary = CtripZongHeItem()
    summary["BIG_DATA_HOTEL_ID"] = row[0]
    summary["SITE_ID"] = str(row[1])
    summary["CHECK_POINT"] = CHECK_POINT

    print("CtripReview", time.time(), item)

    queue_key = "{}:requests".format(site_info["SpiderName"])
    target_url = getElevenURL().format(item["ctrip_hotel_id"])
    review_request = Request(url=target_url,
                             meta={
                                 "item": item,
                                 "ctripZongHeItem": summary
                             },
                             priority=-10,
                             dont_filter=True)
    yield rds.lpush(key=queue_key, request=review_request)
def start_requests(self):
    """Yield one low-priority request per NORMAL hotel row with SITE_ID 2.

    Each request targets the URL produced by ``getElevenURL()`` formatted
    with the hotel id, is handled by ``self.parse`` and carries a copy of
    the hotel item in ``meta["item"]``.
    """
    self.mysql_client = SQLServer.from_settings(
        self.settings,
        self.cf.get("MYSQL_SERVER", "type"),
        self.cf.get("MYSQL_SERVER", "db"))
    query = "SELECT BIG_DATA_HOTEL_ID,SITE_ID,URL_CRAWL_INFO FROM `MS_EST_WH_HOTEL_SITE_REL` WHERE `STATUS`='NORMAL' AND SITE_ID=2;"
    for row in self.mysql_client.select(query):
        hotel = {
            "hotel_id": extract_re("REGEX_COLUMNS1\":\"(.*?)\"", row[2]),
            "BIG_DATA_HOTEL_ID": row[0],
            "SITE_ID": str(row[1]),
            "CHECK_POINT": self.CHECK_POINT
        }
        target_url = getElevenURL().format(hotel["hotel_id"])
        yield scrapy.Request(url=target_url,
                             callback=self.parse,
                             meta={"item": copy.deepcopy(hotel)},
                             priority=-10)
def CtripPrice(site_info, results, CHECK_POINT):
    """Take one row from ``results`` and push a Ctrip price request to Redis.

    This is a generator: it yields the return value of ``rds.lpush`` once,
    or nothing at all when ``next(results)`` raises.
    """
    rds = RedisHelper()
    try:
        row = next(results)
    except Exception:
        # Nothing (more) to consume: finish without yielding.
        return

    item = {
        "ctrip_hotel_id": extract_re("REGEX_COLUMNS1\":\"(.*?)\"", row[2]),
        "BIG_DATA_HOTEL_ID": row[0],
        "SITE_ID": str(row[1]),
        "CHECK_POINT": CHECK_POINT
    }
    print("CtripPrice", time.time(), item)

    queue_key = "{}:requests".format(site_info["SpiderName"])
    target_url = getElevenURL().format(item["ctrip_hotel_id"])
    price_request = Request(url=target_url,
                            meta={"item": copy.deepcopy(item)},
                            priority=-10,
                            dont_filter=True)
    yield rds.lpush(key=queue_key, request=price_request)
</p><p class='comment_txt_more J_txt_fold float_right '><a class='show_unfold' href='javascript:;'>查看回复<i></i></a><a class='show_fold hidden' href='javascript:;'>收起<i></i></a></p></div></div></div><div class='comment_block J_asyncCmt' data-cid='303205961'><div class='user_info ' style='position:relative;'><p class='head'><span class='img' data-commentcount='3' data-img-count='0' data-arrivcitycount='5' data-comhotcount='3' data-userfulcount='1' data-isUserSelf = 'False'><img src='//images4.c-ctrip.com/target/t1/headphoto/177/080/529/285e9abeede347d9a7c1cc5e1b4c9287_R_100_100.jpg' onerror="this.onerror=''; this.src='//pic.c-ctrip.com/common/pic_default_avatar.jpg'" style='width:33px; height:33px;' /></span></p><p class='name'><span>M***0</span></p><p class='level_new'></p><p class='num'>点评总数 3<br>被点有用 1</p></div><div class='comment_main'><p class='comment_title'><span class='small_c' data-value='位置:3.0,设施:3.0,服务:2.0,卫生:3.0'><span class='b' style='width:42px;'></span></span><span class='score'><span class='n'>2.8</span>分</span><a class='room J_baseroom_link' data-baseRoomId='114321785' data-baseRoomName='精致商务房'>精致商务房</a><span class='date'>2019年04月入住</span><span class='type'><i class='k_biz'></i>商务出差</span></p><div class='comment_txt'><div class='J_commentDetail'>不行,没早餐、没停车场,我们入住的时候问服务员停车场有没有,他说没有,但可以停在对面第一人民医院不收费,第二天去开车收了42元的停车费</div><p class='comment_txt_more J_commentShowMore hidden'><a href='javascript:;' class='show_unfold'>查看更多<i></i></a></p><p class='comment_txt_more J_commentShowLess hidden'><a href='javascript:;' class='show_fold'>收起<i></i></a></p><div class='comment_bar'><p class='comment_bar_info'><i class="phone"></i><span class='time'>发表于2019-04-25</span></p><a class='useful useful_voted' data-voted='1' data-cid='303205961' href='javascript:void(0);'>有用<span class='n'>(1)</span></a></div></div></div></div><div class='comment_block J_asyncCmt' data-cid='306783408'><div class='user_info ' style='position:relative;'><p class='head'><span class='img' 
data-commentcount='4' data-img-count='0' data-arrivcitycount='4' data-comhotcount='4' data-userfulcount='0' data-isUserSelf = 'False'><img src='//pic.c-ctrip.com/common/pic_default_avatar.jpg' onerror="this.onerror=''; this.src='//pic.c-ctrip.com/common/pic_default_avatar.jpg'" style='width:33px; height:33px;' /></span></p><p class='name'><span>1***2</span></p><p class='level_new'></p><p class='num'>点评总数 4</p></div><div class='comment_main'><p class='comment_title'><span class='small_c' data-value='位置:4.0,设施:5.0,服务:4.0,卫生:4.0'><span class='b' style='width:68px;'></span></span><span class='score'><span class='n'>4.3</span>分</span><a class='room J_baseroom_link' data-baseRoomId='114322153' data-baseRoomName='经典双床房'>经典双床房</a><span class='date'>2019年04月入住</span><span class='type'><i class='k_family'></i>家庭亲子</span></p><div class='comment_txt'><div class='J_commentDetail'>这一家酒店真的不错!位置也好!离秦淮河!夫孑庙!还有不会写字了都很近!如果有朋友去南京我会介绍去这家!5分</div><p class='comment_txt_more J_commentShowMore hidden'><a href='javascript:;' class='show_unfold'>查看更多<i></i></a></p><p class='comment_txt_more J_commentShowLess hidden'><a href='javascript:;' class='show_fold'>收起<i></i></a></p><div class='comment_bar'><p class='comment_bar_info'><i class="phone"></i><span class='time'>发表于2019-04-29</span></p><a class='useful' data-voted='0' data-cid='306783408' href='javascript:void(0);'>有用<span class='n'>(0)</span></a></div></div><div class='htl_reply'><p class='title'><span class='b'>酒店回复:</span></p><p class='text ' >城南有旧事 ,城北有信使, 林深时见鹿 ,海蓝时见鲸 ,梦醒时见你~~~~~~~~~~</p><p class='comment_txt_more J_txt_fold float_right hidden'><a class='show_unfold' href='javascript:;'>查看回复<i></i></a><a class='show_fold hidden' href='javascript:;'>收起<i></i></a></p></div></div></div><div class='comment_block J_asyncCmt' data-cid='295189907'><div class='user_info ' style='position:relative;'><p class='head'><span class='img' data-commentcount='1' data-img-count='0' data-arrivcitycount='2' data-comhotcount='1' data-userfulcount='0' 
data-isUserSelf = 'False'><img src='//pic.c-ctrip.com/common/pic_default_avatar.jpg' onerror="this.onerror=''; this.src='//pic.c-ctrip.com/common/pic_default_avatar.jpg'" style='width:33px; height:33px;' /></span></p><p class='name'><span>f***2</span></p><p class='level_new'></p><p class='num'>点评总数 1</p></div><div class='comment_main'><p class='comment_title'><span class='small_c' data-value='位置:4.0,设施:4.0,服务:5.0,卫生:5.0'><span class='b' style='width:71px;'></span></span><span class='score'><span class='n'>4.5</span>分</span><a class='room J_baseroom_link' data-baseRoomId='114321738' data-baseRoomName='经典商务房'>经典商务房</a><span class='date'>2019年04月入住</span><span class='type'><i class='k_couple'></i>情侣出游</span></p><div class='comment_txt'><div class='J_commentDetail'>有点吵,其它的都不错</div><p class='comment_txt_more J_commentShowMore hidden'><a href='javascript:;' class='show_unfold'>查看更多<i></i></a></p><p class='comment_txt_more J_commentShowLess hidden'><a href='javascript:;' class='show_fold'>收起<i></i></a></p><div class='comment_bar'><p class='comment_bar_info'><i class="phone"></i><span class='time'>发表于2019-04-23</span></p><a class='useful' data-voted='0' data-cid='295189907' href='javascript:void(0);'>有用<span class='n'>(0)</span></a></div></div><div class='htl_reply'><p class='title'><span class='b'>酒店回复:</span></p><p class='text ' >非常抱歉没有给您带来最好的入住体验,但是我们会继续努力,给您最好的服务,期待您的再次光临,祝您生活愉快^_^</p><p class='comment_txt_more J_txt_fold float_right hidden'><a class='show_unfold' href='javascript:;'>查看回复<i></i></a><a class='show_fold hidden' href='javascript:;'>收起<i></i></a></p></div></div></div><div class='comment_block J_asyncCmt' data-cid='303337557'><div class='user_info ' style='position:relative;'><p class='head'><span class='img' data-commentcount='1' data-img-count='0' data-arrivcitycount='1' data-comhotcount='1' data-userfulcount='0' data-isUserSelf = 'False'><img src='//pic.c-ctrip.com/common/pic_default_avatar.jpg' onerror="this.onerror=''; 
this.src='//pic.c-ctrip.com/common/pic_default_avatar.jpg'" style='width:33px; height:33px;' /></span></p><p class='name'><span>M***5</span></p><p class='level_new'></p><p class='num'>点评总数 1</p></div><div class='comment_main'><p class='comment_title'><span class='small_c' data-value='位置:5.0,设施:4.0,服务:5.0,卫生:5.0'><span class='b' style='width:74px;'></span></span><span class='score'><span class='n'>4.8</span>分</span><a class='room J_baseroom_link' data-baseRoomId='114321738' data-baseRoomName='经典商务房'>经典商务房</a><span class='date'>2019年04月入住</span><span class='type'><i class='k_family'></i>家庭亲子</span></p><div class='comment_txt'><div class='J_commentDetail'>服务不错,大厅很有特点,早餐买一送一,可以带孩子,不过描述房间有30平,感觉实际没有哦。洗手台下面空着真心欣赏不了,整体好评。</div><p class='comment_txt_more J_commentShowMore hidden'><a href='javascript:;' class='show_unfold'>查看更多<i></i></a></p><p class='comment_txt_more J_commentShowLess hidden'><a href='javascript:;' class='show_fold'>收起<i></i></a></p><div class='comment_bar'><p class='comment_bar_info'><i class="phone"></i><span class='time'>发表于2019-04-26</span></p><a class='useful' data-voted='0' data-cid='303337557' href='javascript:void(0);'>有用<span class='n'>(0)</span></a></div></div><div class='htl_reply'><p class='title'><span class='b'>酒店回复:</span></p><p class='text text_other' >谢谢亲的点评,每个房间加上洗手间都在35平方左右哦,卫生间是干湿分离设计呢,不过小锦也在努力的完善中,期待亲和小锦一起成长~爱你哦~~~~~~~~~~~</p><p class='comment_txt_more J_txt_fold float_right '><a class='show_unfold' href='javascript:;'>查看回复<i></i></a><a class='show_fold hidden' href='javascript:;'>收起<i></i></a></p></div></div></div><div class='comment_block J_asyncCmt' data-cid='291782555'><div class='user_info ' style='position:relative;'><p class='head'><span class='img' data-commentcount='2' data-img-count='0' data-arrivcitycount='2' data-comhotcount='2' data-userfulcount='0' data-isUserSelf = 'False'><img src='//pic.c-ctrip.com/common/pic_default_avatar.jpg' onerror="this.onerror=''; this.src='//pic.c-ctrip.com/common/pic_default_avatar.jpg'" 
style='width:33px; height:33px;' /></span></p><p class='name'><span>M***4</span></p><p class='level_new'></p><p class='num'>点评总数 2</p></div><div class='comment_main'><p class='comment_title'><span class='small_c' data-value='位置:5.0,设施:5.0,服务:5.0,卫生:5.0'><span class='b' style='width:80px;'></span></span><span class='score'><span class='n'>5.0</span>分</span><a class='room J_baseroom_link' data-baseRoomId='114322312' data-baseRoomName='精致双床房'>精致双床房</a><span class='date'>2019年04月入住</span><span class='type'><i class='k_family'></i>家庭亲子</span></p><div class='comment_txt'><div class='J_commentDetail'>房间挺大挺舒适</div><p class='comment_txt_more J_commentShowMore hidden'><a href='javascript:;' class='show_unfold'>查看更多<i></i></a></p><p class='comment_txt_more J_commentShowLess hidden'><a href='javascript:;' class='show_fold'>收起<i></i></a></p><div class='comment_bar'><p class='comment_bar_info'><i class="phone"></i><span class='time'>发表于2019-04-22</span></p><a class='useful' data-voted='0' data-cid='291782555' href='javascript:void(0);'>有用<span class='n'>(0)</span></a></div></div><div class='htl_reply'><p class='title'><span class='b'>酒店回复:</span></p><p class='text ' >思念是青色藤蔓开出白色的花,怎样看上去也清晰的艳。像天暗下来独自点亮的一盏烛火,雨后天空出现的彩虹,忧伤而美~~ </p><p class='comment_txt_more J_txt_fold float_right hidden'><a class='show_unfold' href='javascript:;'>查看回复<i></i></a><a class='show_fold hidden' href='javascript:;'>收起<i></i></a></p></div></div></div><div class='comment_block J_asyncCmt' data-cid='291136859'><div class='user_info ' style='position:relative;'><p class='head'><span class='img' data-commentcount='1' data-img-count='0' data-arrivcitycount='1' data-comhotcount='1' data-userfulcount='0' data-isUserSelf = 'False'><img src='//pic.c-ctrip.com/common/pic_default_avatar.jpg' onerror="this.onerror=''; this.src='//pic.c-ctrip.com/common/pic_default_avatar.jpg'" style='width:33px; height:33px;' /></span></p><p class='name'><span>1***0</span></p><p class='level_new'></p><p class='num'>点评总数 1</p></div><div 
class='comment_main'><p class='comment_title'><span class='small_c' data-value='位置:5.0,设施:5.0,服务:5.0,卫生:5.0'><span class='b' style='width:80px;'></span></span><span class='score'><span class='n'>5.0</span>分</span><a class='room J_baseroom_link' data-baseRoomId='114321738' data-baseRoomName='经典商务房'>经典商务房</a><span class='date'>2019年04月入住</span><span class='type'><i class='k_else'></i>其它</span></p><div class='comment_txt'><div class='J_commentDetail'>非常好的酒店,新、干净、交通便利,早餐还有优惠活动,品种也挺多,还是性价比挺高的,下次还会去住</div><p class='comment_txt_more J_commentShowMore hidden'><a href='javascript:;' class='show_unfold'>查看更多<i></i></a></p><p class='comment_txt_more J_commentShowLess hidden'><a href='javascript:;' class='show_fold'>收起<i></i></a></p><div class='comment_bar'><p class='comment_bar_info'><i class="phone"></i><span class='time'>发表于2019-04-22</span></p><a class='useful' data-voted='0' data-cid='291136859' href='javascript:void(0);'>有用<span class='n'>(0)</span></a></div></div><div class='htl_reply'><p class='title'><span class='b'>酒店回复:</span></p><p class='text text_other' >你对小锦而言太珍贵了,珍贵到你在小锦身边的每一分钟我都当做最后一分钟去过,所以小锦才要马不停蹄的去拥抱你,把最好的都给你。 </p><p class='comment_txt_more J_txt_fold float_right '><a class='show_unfold' href='javascript:;'>查看回复<i></i></a><a class='show_fold hidden' href='javascript:;'>收起<i></i></a></p></div></div></div></div><div class='c_page_box'><div class='c_page'><a href='javascript:;' class='c_up_nocurrent'></a><div class='c_page_list layoutfix'><a href='javascript:;' class='current'><span>1</span></a><a value='2' href='/hotel/dianping/1519962_p2t0.html'><span>2</span></a><a value='3' href='/hotel/dianping/1519962_p3t0.html'><span>3</span></a><a value='4' href='/hotel/dianping/1519962_p4t0.html'><span>4</span></a><a value='5' href='/hotel/dianping/1519962_p5t0.html'><span>5</span></a><a value='6' href='/hotel/dianping/1519962_p6t0.html'><span>6</span></a><span class='c_page_ellipsis'>...</span><a value='28' 
href='/hotel/dianping/1519962_p28t0.html'><span>28</span></a></div><a value='2' class='c_down' href='/hotel/dianping/1519962_p2t0.html'><span>下一页</span></a><div class='c_pagevalue'>到<input type='text' class='c_page_num' name='cPageNum' id='cPageNum' value='1'>页<input type='button' class='c_page_submit' value='确定' name='cPageBtn' id='cPageBtn'></div><input type="hidden" id="cTotalPageNum" value="28" /></div></div></div></div></div><div id='commentTracker' style='display:none'>Version=1.0&PageID=102003&Rank=307613212,310543106,307122028,304092037,304091967,307279956,307443730,303841425,303436689,303205961,306783408,295189907,303337557,291782555,291136859&IdentityTextFilter=-1&OrderBy=1&RecommentType=All</div><div id='commentTracker20150415' style='display:none'>{"hotelid":"1519962","commentselect":"1","ordertype":"2","outcategory":"全部","roomtype":"","chooseorsearch":"2","keyword":"","result":"2","others":""}</div> ''' import re # re.compile() RECORDS = re.findall("class='comment_block J_asyncCmt'(.*?)</div></div></div>",html,re.DOTALL) print(RECORDS.__len__()) num = 0 for RECORD in RECORDS: num +=1 print(num) item = {} USER_NAME = extract_re("class='name'><span>(.*?)</span>", RECORD) REVIEW_RATING_VALUE = extract_re("<span class='n'>([0-9]\d*\.?\d*)</span>分", RECORD) ACCOMMODATION_TIME = extract_re("class='time'>发表于(.*?)</span>", RECORD) RAW_REVIEW_CONTENT = extract_re("class='J_commentDetail'>(.*?)</div>", RECORD) print(RAW_REVIEW_CONTENT) REVIEW_ID = extract_re("data-cid='(.*?)'", RECORD) USER_AVATAR = extract_re("class='head'.*?img src='(.*?)'", RECORD) IS_NOT_REPLY = extract_re("class='htl_reply'.*?class='text.*?>(.*?)</p>", RECORD) if "false" else "true" # 1 有回复内容,0没有 REPLY_REVIEW_CONTENT = extract_re("class='htl_reply'.*?class='text.*?>(.*?)</p>", RECORD) RAW_PICTURES = extract_re("class='comment_pic'(.*?)class='comment_bar'", RECORD) sourceSiteId = classify({"7": "(艺龙网用户)", "1": "(去哪儿网用户)", "2": "(.+)"}, USER_NAME) item["userName"] = USER_NAME 
item["reviewRatingValue"] = REVIEW_RATING_VALUE item["accommodationTime"] = ACCOMMODATION_TIME + " 00:00:00" item["replyReviewContent"] = REPLY_REVIEW_CONTENT
def process_request(self, request, spider):
    """Rewrite requests aimed at a ``172.25.`` address to the real target URL.

    When the request URL contains ``172.25.``, the numeric ``hotelId``
    query value is extracted and the request's URL is replaced in place by
    ``getElevenURL()`` formatted with that id. Other requests are left
    untouched. Always returns ``None``.
    """
    # Use the ``in`` operator rather than calling ``__contains__`` directly.
    if "172.25." not in request.url:
        return None
    # Raw string: ``\d`` is a regex escape, not a string escape.
    hotelId = extract_re(r"hotelId=(\d+)", request.url)
    request._set_url(getElevenURL().format(hotelId))
    return None
def get_home_review(self, response):
    """Parse one page of hotel reviews and, on page 1, fan out to the rest.

    On the first page (``meta["page"]`` absent or 1) the summary item from
    ``meta["ctripZongHeItem"]`` is filled with the overall rating and the
    review count and yielded, followed by one request for each page from 2
    to ``self.next_page_num``. On every page, one item is yielded per
    review block found in the HTML.
    """
    eleven = response.meta["eleven"]
    # ``response.text`` replaces the deprecated ``body_as_unicode()``.
    html = response.text

    if response.meta.get("page", 1) == 1:
        item = response.meta["item"]
        ctripZongHeItem = response.meta["ctripZongHeItem"]
        ctripZongHeItem["RATING_VALUE"] = extract_re(
            "<span class='score'><span class='n'>(.*?)</span>", html)
        ctripZongHeItem["REVIEW_COUNT"] = extract_re(
            r"<span id='All_Comment' >全部\((\d+)\)", html)
        ctripZongHeItem["GUEST_TYPE"] = "null"
        print(ctripZongHeItem)
        yield ctripZongHeItem

        # The original literal read "viewVersion=c¤tPage=": the text
        # "&curren" had been turned into the currency sign, so the page
        # number never reached the server. Restored to "&currentPage=".
        url_template = (
            "http://hotels.ctrip.com/Domestic/tool/AjaxHotelCommentList.aspx?MasterHotelID={hotel_id}&hotel={hotel_id}"
            "&NewOpenCount=0&AutoExpiredCount=0&RecordCount=1253&OpenDate=2016-01-01&card=-1&property=-1&userType=-1&"
            "productcode=&keyword=&roomName=&orderBy=1&viewVersion=c&currentPage={page}&contyped=0&"
            "callback=CASwaGVffbjQXgEzk&eleven={eleven}")
        # Only page 1 schedules the remaining pages; `item` is bound only
        # in this branch.
        for page in range(2, int(self.next_page_num) + 1):
            url = url_template.format(hotel_id=item["ctrip_hotel_id"],
                                      page=page,
                                      eleven=eleven)
            yield Request(url,
                          headers=response.request.headers,
                          cookies=response.request.cookies,
                          callback=self.get_home_review,
                          priority=20 + self.__class__.num,
                          meta={"item": copy.deepcopy(item),
                                "page": page,
                                "eleven": eleven,
                                "is_need_proxy": True})

    RECORDS = re.findall(
        "class='comment_block J_asyncCmt'(.*?)</div></div></div>",
        html, re.DOTALL)
    for RECORD in RECORDS:
        # Work on a private copy: mutating and yielding the one dict stored
        # in response.meta would make every yielded review the same object,
        # with later reviews overwriting earlier ones.
        item = copy.deepcopy(response.meta["item"])

        USER_NAME = extract_re("class='name'><span>(.*?)</span>", RECORD)
        REVIEW_RATING_VALUE = extract_re(
            r"<span class='n'>([0-9]\d*\.?\d*)</span>分", RECORD)
        ACCOMMODATION_TIME = extract_re(
            "class='time'>发表于(.*?)</span>", RECORD)
        RAW_REVIEW_CONTENT = extract_re(
            "class='J_commentDetail'>(.*?)</div>", RECORD)
        REVIEW_ID = extract_re("data-cid='(.*?)'", RECORD)
        USER_AVATAR = extract_re("class='head'.*?img src='(.*?)'", RECORD)
        # The hotel's reply is extracted once; whether a reply exists is
        # derived from it. The original ran the same regex twice behind an
        # always-true `if "false"` conditional.
        REPLY_REVIEW_CONTENT = extract_re(
            "class='htl_reply'.*?class='text.*?>(.*?)</p>", RECORD)
        RAW_PICTURES = extract_re(
            "class='comment_pic'(.*?)class='comment_bar'", RECORD)
        sourceSiteId = classify(
            {"7": "(艺龙网用户)", "1": "(去哪儿网用户)", "2": "(.+)"},
            USER_NAME)

        item["userName"] = USER_NAME
        item["reviewRatingValue"] = REVIEW_RATING_VALUE
        item["accommodationTime"] = ACCOMMODATION_TIME + " 00:00:00"
        item["replyReviewContent"] = REPLY_REVIEW_CONTENT
        item["reviewId"] = REVIEW_ID
        item["userAvatar"] = USER_AVATAR
        # "false" when a reply was found, "true" when there is none.
        item["isNotReply"] = "false" if REPLY_REVIEW_CONTENT else "true"
        item["rawReviewContent"] = RAW_REVIEW_CONTENT
        item["RAW_PICTURES"] = RAW_PICTURES
        # Rating bands handed to classify(): up to 3.4 -> "0",
        # 3.5 to 3.9 -> "2", 4.x and 5 -> "1".
        EMOTION_TYPE = classify(
            {"0": "(1\\.|2\\.|3\\.0|3\\.1|3\\.2|3\\.3|3\\.4)",
             "2": "(3\\.5|3\\.6|3\\.7|3\\.8|3\\.9)",
             "1": "(^4$|5|4\\.)"},
            REVIEW_RATING_VALUE)
        item["emotionType"] = EMOTION_TYPE
        item["sourceSiteId"] = sourceSiteId
        print(item)
        yield item