def get_price(self, response):
    """Yield one price item per product found in the room-list response.

    The item template is read from ``response.meta["item"]``.  The decoded
    response body is narrowed to the text between ``"html":`` and
    ``isFullHouse``, split into room sections, and each room section is split
    into product sections.  For every product a deep copy of the template is
    filled in, printed and yielded.

    ``extract_re`` and ``classify`` are helpers defined outside this block;
    presumably they return the captured group / the key whose pattern
    matches -- confirm against their definitions.
    """
    template = response.meta.get("item")
    page_html = extract_re(""""html":(.*?)isFullHouse""", response.body.decode())
    room_sections = re.findall("""room_unfold(.*?)class='clicked hidden""", page_html)
    for room_section in room_sections:
        # Room type name.  It is written onto the shared template, so every
        # product copied from it below carries the same room type.
        template["ROOM_TYPE"] = extract_re(r"""RoomName\\":\\"(.*?)\\""", room_section)
        product_sections = re.findall(
            r"""data-hotelInvoice(.*?class=\\"hotel_room_last\\">.*?<\\/div>)""",
            room_section)
        for product_section in product_sections:
            record = copy.deepcopy(template)

            # Product name (taken from the second capture group).
            record["PRODUCT_TYPE"] = extract_re(
                r"""(room_type_name\\".*?background-image:url\(|room_type_name\\".*?)([^>"]*?)(<br\\/>[^']|\)\\"><|\\/span>|<\\/[es])""",
                product_section, group_num=2)

            # Payment method, mapped to a code; "null" when nothing matched.
            payment_text = extract_re(r"""payment_txt\\".*?>(.*?)<""", product_section)
            payment_code = classify(
                {
                    "0": "(在线付)",
                    "2": "(担保)",
                    "1": "(到店付)"
                }, payment_text)
            record["PAYMENT_TYPE"] = payment_code or "null"

            # Agent label; "true" when no label is present.
            agent_label = extract_re(r"""data-role=\\"title\\">(.*?)<\\/span>""", product_section)
            record["IS_NOT_AGENT"] = agent_label or "true"

            # Booking status.
            booking_text = extract_re(r"""btns_base22_main\\">(.*?)<""", product_section)
            record["AVAILABLE_ROOM_SITUATION"] = classify(
                {
                    "可预订": "(预订)",
                    "满房": "(订完)"
                }, booking_text)

            # Breakfast.
            breakfast_text = extract_re(r"""col4'>(.*?)<""", product_section)
            record["BREAKFAST"] = breakfast_text or "null"

            # Original price.
            record["ORIGINAL_PRICE"] = extract_re(
                r"""data-price='(\d+)'""", product_section)

            # Package price; "0" when absent.
            package_price = extract_re(
                r"""rt_origin_price\\"><dfn>¥<\\/dfn>(.*?)<""", product_section)
            record["DISCOUNT_PRICE"] = package_price or "0"

            # Cash-back amount; "0" when absent.
            cash_back = extract_re(r"""span>返现(.*?)<""", product_section)
            record["DISCOUNT"] = cash_back or "0"

            print(record)
            yield record
def get_home_review(self, response):
    """Parse one page of hotel comments and yield review items.

    On the first page (``response.meta["page"]`` missing or equal to 1) the
    summary item from ``response.meta["ctripZongHeItem"]`` is filled with the
    overall rating and review count and yielded, and one follow-up
    ``Request`` per remaining page (2 .. ``self.next_page_num``) is scheduled
    with this method as its callback.

    On every page, each comment block found in the HTML is turned into a
    review item.  Each review is built on its own deep copy of
    ``response.meta["item"]`` so that yielded items do not alias one another.

    ``extract_re`` and ``classify`` are helpers defined outside this block;
    presumably they return the captured group / the key whose pattern
    matches -- confirm against their definitions.
    """
    eleven = response.meta["eleven"]
    html = response.body_as_unicode()

    if response.meta.get("page", 1) == 1:
        base_item = response.meta["item"]
        ctripZongHeItem = response.meta["ctripZongHeItem"]
        ctripZongHeItem["RATING_VALUE"] = extract_re(
            r"<span class='score'><span class='n'>(.*?)</span>", html)
        ctripZongHeItem["REVIEW_COUNT"] = extract_re(
            r"<span id='All_Comment' >全部\((\d+)\)", html)
        ctripZongHeItem["GUEST_TYPE"] = "null"
        print(ctripZongHeItem)
        yield ctripZongHeItem

        for page in range(2, int(self.next_page_num) + 1):
            # The page index travels in the ``currentPage`` query parameter;
            # it must be written out in full ("&currentPage=") so that it is
            # not mistaken for the "&curren" character entity.
            url = (
                "http://hotels.ctrip.com/Domestic/tool/AjaxHotelCommentList.aspx?MasterHotelID={hotel_id}&hotel={hotel_id}"
                "&NewOpenCount=0&AutoExpiredCount=0&RecordCount=1253&OpenDate=2016-01-01&card=-1&property=-1&userType=-1&"
                "productcode=&keyword=&roomName=&orderBy=1&viewVersion=c&currentPage={page}&contyped=0&"
                "callback=CASwaGVffbjQXgEzk&eleven={eleven}"
            ).format(hotel_id=base_item["ctrip_hotel_id"], page=page, eleven=eleven)
            yield Request(
                url,
                headers=response.request.headers,
                cookies=response.request.cookies,
                callback=self.get_home_review,
                priority=20 + self.__class__.num,
                meta={
                    "item": copy.deepcopy(base_item),
                    "page": page,
                    "eleven": eleven,
                    "is_need_proxy": True,
                },
            )

    comment_blocks = re.findall(
        "class='comment_block J_asyncCmt'(.*?)</div></div></div>", html, re.DOTALL)
    for comment in comment_blocks:
        # Private copy per review: the meta item is shared by every comment
        # on this page, so mutating it in place would make all yielded
        # reviews point at the same object.
        item = copy.deepcopy(response.meta["item"])

        user_name = extract_re(r"class='name'><span>(.*?)</span>", comment)
        rating = extract_re(r"<span class='n'>([0-9]\d*\.?\d*)</span>分", comment)
        stay_time = extract_re(r"class='time'>发表于(.*?)</span>", comment)
        review_text = extract_re(r"class='J_commentDetail'>(.*?)</div>", comment)
        review_id = extract_re(r"data-cid='(.*?)'", comment)
        avatar = extract_re(r"class='head'.*?img src='(.*?)'", comment)
        # The hotel's reply text; it also decides the isNotReply flag below.
        reply_text = extract_re(r"class='htl_reply'.*?class='text.*?>(.*?)</p>", comment)
        pictures = extract_re(r"class='comment_pic'(.*?)class='comment_bar'", comment)
        source_site_id = classify(
            {"7": "(艺龙网用户)", "1": "(去哪儿网用户)", "2": "(.+)"}, user_name)

        item["userName"] = user_name
        item["reviewRatingValue"] = rating
        item["accommodationTime"] = stay_time + " 00:00:00"
        item["replyReviewContent"] = reply_text
        item["reviewId"] = review_id
        item["userAvatar"] = avatar
        # "false" when a reply exists, "true" when there is none.
        item["isNotReply"] = "false" if reply_text else "true"
        item["rawReviewContent"] = review_text
        # NOTE(review): this key is upper-case while the other review fields
        # are camelCase -- confirm what downstream consumers expect.
        item["RAW_PICTURES"] = pictures
        item["emotionType"] = classify(
            {"0": "(1\\.|2\\.|3\\.0|3\\.1|3\\.2|3\\.3|3\\.4)",
             "2": "(3\\.5|3\\.6|3\\.7|3\\.8|3\\.9)",
             "1": "(^4$|5|4\\.)"},
            rating)
        item["sourceSiteId"] = source_site_id
        print(item)
        yield item
def get_review(self, response):
    """Yield one review item per entry of the JSON feedback list.

    The response body is parsed as JSON and ``data.feedback`` is iterated.
    Each entry is mapped onto a deep copy of ``response.meta["item"]``;
    missing fields default to empty strings.

    ``classify`` is a helper defined outside this block; presumably it
    returns the key whose pattern matches the score -- confirm against its
    definition.
    """
    payload = json.loads(response.body_as_unicode())
    for entry in payload['data']['feedback']:
        review_id = entry.get("id", "")
        reply_time = entry.get('replytime', '')
        score = entry.get('score', '')
        stay_time = entry.get('feedbacktime', '')
        avatar = entry.get('avatar', '')
        reply_text = entry.get('bizreply', '')
        review_text = entry.get('comment', '')
        user_name = entry.get('username', '')
        pictures = ''  # this source never supplies pictures
        user_id = entry.get('userid', '')

        review = copy.deepcopy(response.meta["item"])
        review['replyReviewTime'] = reply_time
        review["userName"] = user_name
        review["reviewRatingValue"] = score
        review["accommodationTime"] = stay_time + " 00:00:00"
        review["replyReviewContent"] = reply_text
        review["reviewId"] = review_id
        review["userAvatar"] = avatar or ""
        # 'false' when a reply exists, 'true' when there is none.
        review["isNotReply"] = 'true' if not reply_text else 'false'
        review["rawReviewContent"] = review_text
        review["rawPictures"] = pictures or ""
        review['userId'] = user_id
        review["userLocation"] = ""
        review["emotionType"] = classify(
            {
                "0": "(1|2)",
                "2": "(3)",
                "1": "(4|5)"
            }, score)
        review["sourceSiteId"] = review.get("siteId", "6")
        yield review
def get_meituran_price(self, response):
    """Yield one price item per aggregated product in the JSON price response.

    The body is parsed as JSON and ``mergeList.data`` is iterated room by
    room.  For each room the hotel id of the meta item is added to the
    module-level ``set_`` (whose size is printed), the room name is stored
    on the meta item, and a deep copy of that item is filled in and yielded
    for every entry of the room's ``aggregateGoods``.

    ``classify`` is a helper defined outside this block; presumably it
    returns the key whose pattern matches -- confirm against its definition.
    """
    payload = json.loads(response.body_as_unicode())
    for room in payload.get("mergeList", {}).get("data", {}):
        template = response.meta.get("item")
        set_.add(template["BIG_DATA_HOTEL_ID"])
        print(len(set_))
        template["ROOM_TYPE"] = room.get("roomCellName", "")

        for goods in room.get("aggregateGoods", []):
            record = copy.deepcopy(template)

            # Product name; "null" when empty.
            record["PRODUCT_TYPE"] = goods.get("aggregateGoodName", "") or "null"

            # Every remaining field comes from the "prepayGood" sub-object.
            prepay = goods["prepayGood"]

            # Booking status.
            record["AVAILABLE_ROOM_SITUATION"] = classify(
                {
                    "满房": "(0)",
                    "可预订": "(1)"
                }, str(prepay["goodsStatus"]))

            # Breakfast.
            record["BREAKFAST"] = prepay.get("breakfast", "null")

            # Original price; empty string when falsy.
            price = prepay["averagePrice"]
            record["ORIGINAL_PRICE"] = price or ""

            # Agent label; "true" when no label is present.
            record["IS_NOT_AGENT"] = prepay.get("tagName", "") or "true"

            # Payment method.
            record["PAYMENT_TYPE"] = classify(
                {"0": "(在线付)"}, prepay.get("reserveTips", "0"))

            # Average price (already discounted) -- same value as the
            # original price for this source.
            record["DISCOUNT_PRICE"] = price or ""

            # Rebate.
            record["DISCOUNT"] = "0"
            yield record
# Debug loop: build and print one review dict per comment block in RECORDS.
# ``RECORDS`` and ``num`` are expected to be defined before this point;
# ``extract_re`` and ``classify`` are helpers defined elsewhere (presumably
# returning the captured group / the key whose pattern matches -- confirm).
for RECORD in RECORDS:
    num += 1
    print(num)
    item = {}
    USER_NAME = extract_re(r"class='name'><span>(.*?)</span>", RECORD)
    REVIEW_RATING_VALUE = extract_re(r"<span class='n'>([0-9]\d*\.?\d*)</span>分", RECORD)
    ACCOMMODATION_TIME = extract_re(r"class='time'>发表于(.*?)</span>", RECORD)
    RAW_REVIEW_CONTENT = extract_re(r"class='J_commentDetail'>(.*?)</div>", RECORD)
    print(RAW_REVIEW_CONTENT)
    REVIEW_ID = extract_re(r"data-cid='(.*?)'", RECORD)
    USER_AVATAR = extract_re(r"class='head'.*?img src='(.*?)'", RECORD)
    REPLY_REVIEW_CONTENT = extract_re(r"class='htl_reply'.*?class='text.*?>(.*?)</p>", RECORD)
    # The reply text doubles as the "has a reply" indicator, so the pattern
    # is evaluated once and the result is reused.
    IS_NOT_REPLY = REPLY_REVIEW_CONTENT
    RAW_PICTURES = extract_re(r"class='comment_pic'(.*?)class='comment_bar'", RECORD)
    sourceSiteId = classify({"7": "(艺龙网用户)", "1": "(去哪儿网用户)", "2": "(.+)"}, USER_NAME)

    item["userName"] = USER_NAME
    item["reviewRatingValue"] = REVIEW_RATING_VALUE
    item["accommodationTime"] = ACCOMMODATION_TIME + " 00:00:00"
    item["replyReviewContent"] = REPLY_REVIEW_CONTENT
    item["reviewId"] = REVIEW_ID
    item["userAvatar"] = USER_AVATAR
    # "false" when a reply exists, "true" when there is none.
    item["isNotReply"] = "false" if IS_NOT_REPLY else "true"
    item["rawReviewContent"] = RAW_REVIEW_CONTENT
    item["RAW_PICTURES"] = RAW_PICTURES
    EMOTION_TYPE = classify(
        {"0": "(1\\.|2\\.|3\\.0|3\\.1|3\\.2|3\\.3|3\\.4)",
         "2": "(3\\.5|3\\.6|3\\.7|3\\.8|3\\.9)",
         "1": "(^4$|5|4\\.)"},
        REVIEW_RATING_VALUE)
    item["emotionType"] = EMOTION_TYPE
    item["sourceSiteId"] = sourceSiteId
    print(item)