예제 #1
0
 def get_price(self, response):
     """Yield one price item per product offer found in the response body.

     The body is decoded and narrowed to the text between ``"html":`` and
     ``isFullHouse``.  Each ``room_unfold`` chunk in that text is handled as
     one room type, and each ``data-hotelInvoice`` chunk inside a room as one
     product offer.  Every yielded item is a deep copy of the item stored in
     ``response.meta`` with the room and offer fields filled in.

     NOTE(review): ``extract_re`` and ``classify`` are defined outside this
     method; the fallbacks below assume they return a falsy value when
     nothing matches -- confirm against their definitions.
     """
     base_item = response.meta.get("item")
     html_block = extract_re(""""html":(.*?)isFullHouse""",
                             response.body.decode())
     for room_chunk in re.findall(
             """room_unfold(.*?)class='clicked hidden""", html_block):
         # Room type name.  It is written onto the shared base item so every
         # offer copied from it below carries the same room type.
         base_item["ROOM_TYPE"] = extract_re(
             r"""RoomName\\":\\"(.*?)\\""", room_chunk)
         offer_chunks = re.findall(
             r"""data-hotelInvoice(.*?class=\\"hotel_room_last\\">.*?<\\/div>)""",
             room_chunk)
         for offer_chunk in offer_chunks:
             offer = copy.deepcopy(base_item)

             # Product name: taken from the second group of the pattern.
             offer["PRODUCT_TYPE"] = extract_re(
                 r"""(room_type_name\\".*?background-image:url\(|room_type_name\\".*?)([^>"]*?)(<br\\/>[^']|\)\\"><|\\/span>|<\\/[es])""",
                 offer_chunk,
                 group_num=2)

             # Payment method, mapped to a code by classify(); "null" when
             # classify() gives back nothing truthy.
             payment_text = extract_re(r"""payment_txt\\".*?>(.*?)<""",
                                       offer_chunk)
             payment_code = classify(
                 {
                     "0": "(在线付)",
                     "2": "(担保)",
                     "1": "(到店付)"
                 }, payment_text)
             offer["PAYMENT_TYPE"] = payment_code or "null"

             # Agent marker: the extracted title text, or "true" when absent.
             agent_title = extract_re(
                 r"""data-role=\\"title\\">(.*?)<\\/span>""", offer_chunk)
             offer["IS_NOT_AGENT"] = agent_title or "true"

             # Booking status taken from the text of the booking button.
             button_text = extract_re(r"""btns_base22_main\\">(.*?)<""",
                                      offer_chunk)
             offer["AVAILABLE_ROOM_SITUATION"] = classify(
                 {
                     "可预订": "(预订)",
                     "满房": "(订完)"
                 }, button_text)

             # Breakfast description, "null" when missing.
             breakfast_text = extract_re(r"""col4'>(.*?)<""", offer_chunk)
             offer["BREAKFAST"] = breakfast_text or "null"

             # Original (list) price.
             offer["ORIGINAL_PRICE"] = extract_re(
                 r"""data-price='(\d+)'""", offer_chunk)

             # Package price, "0" when missing.
             package_price = extract_re(
                 r"""rt_origin_price\\"><dfn>&yen;<\\/dfn>(.*?)<""",
                 offer_chunk)
             offer["DISCOUNT_PRICE"] = package_price or "0"

             # Cashback amount, "0" when missing.
             cashback = extract_re(r"""span>返现(.*?)<""", offer_chunk)
             offer["DISCOUNT"] = cashback or "0"

             print(offer)
             yield offer
예제 #2
0
 def parse(self, response):
     """Yield the hotel's summary item, then one request per review page.

     The summary item (``response.meta["meiTuanZongHeItem"]``, or a new
     dict when that key is absent) receives the rating value and review
     count extracted from the page and is yielded first.  Afterwards one
     review-API request is yielded for each page from 1 up to
     ``self.next_page_num``; the API offset advances by 10 per page to
     match ``limit=10`` in the URL.  Each request carries its own deep copy
     of ``response.meta["item"]`` and is handled by ``self.get_review``.
     """
     item = response.meta["item"]
     meiTuanZongHeItem = response.meta.get("meiTuanZongHeItem", {})
     # ``response.text`` is the supported replacement for the deprecated
     # ``body_as_unicode()``.
     html = response.text
     # Raw strings keep ``\(`` / ``\)`` as regex escapes without relying on
     # Python's handling of unknown string escapes.
     meiTuanZongHeItem["RATING_VALUE"] = extract_re(
         r"""class="score-color">(.*?)</em>""", html)
     meiTuanZongHeItem["REVIEW_COUNT"] = extract_re(
         r"""住客点评\((.*?)\)""", html)
     meiTuanZongHeItem["GUEST_TYPE"] = "null"
     yield meiTuanZongHeItem

     # Loop-invariant pieces are built once instead of on every iteration.
     user_agent = 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.86 Safari/537.36'
     url_template = 'http://api.hotel.meituan.com/group/v1/poi/comment/{hotel_id}?sortType=default&noempty=1&withpic=0&filter=all&limit=10&offset={offset}'
     for page in range(1, int(self.next_page_num) + 1):
         url = url_template.format(hotel_id=item["meiTuan_hotel_id"],
                                   offset=str((page - 1) * 10))
         yield Request(url,
                       headers={'User-Agent': user_agent},
                       callback=self.get_review,
                       meta={
                           "item": copy.deepcopy(item),
                           "is_need_proxy": True
                       },
                       dont_filter=True)
예제 #3
0
 def start_requests(self):
     """Yield one price form request per hotel and check-in/check-out pair.

     Work is only produced when ``get_current_ip()`` equals the
     ``MASTER_HOST`` setting; on any other host a message is printed and
     nothing is yielded.  On the master host the hotel rows are read from
     ``MS_EST_WH_HOTEL_SITE_REL`` for ``self.site_id`` and, for every date
     pair returned by ``getEveryDayFormatMapTuple(self.dateTime)``, a
     ``scrapy.FormRequest`` is yielded with ``self.get_meituan_url`` as
     callback.
     """
     if get_current_ip() != self.settings.get("MASTER_HOST", ""):
         print("美团消费者!")
         return

     self.mysql_client = SQLServer.from_settings(
         self.settings,
         self.cf.get("MYSQL_SERVER", "type"),
         self.cf.get("MYSQL_SERVER", "db"))
     query = "SELECT BIG_DATA_HOTEL_ID,SITE_ID,URL_CRAWL_INFO FROM `MS_EST_WH_HOTEL_SITE_REL` WHERE `STATUS`='NORMAL' AND SITE_ID={site_id};".format(
         site_id=self.site_id)
     for row in self.mysql_client.select(query):
         # Columns follow the SELECT order: hotel id, site id, crawl info.
         base_item = {
             "hotel_id": extract_re("REGEX_COLUMNS1\":\"(.*?)\"", row[2]),
             "BIG_DATA_HOTEL_ID": row[0],
             "SITE_ID": str(row[1]),
             "CHECK_POINT": self.CHECK_POINT
         }
         date_pairs = getEveryDayFormatMapTuple(self.dateTime)
         for check_in_info, check_out_info in date_pairs:
             # Index 1 of each date tuple is stored on the item, index 0 is
             # what goes into the upstream URL.
             dated_item = copy.deepcopy(base_item)
             dated_item["CHECKIN_DATE"] = check_in_info[1]
             dated_item["CHECKOUT_DATE"] = check_out_info[1]
             endpoint = "http://meituan-549257379.cn-north-1.elb.amazonaws.com.cn:80/get_meituan_price"
             target_url = "https://ihotel.meituan.com/productapi/v2/prepayList?type=1&utm_medium=PC&version_name=7.3.0&poiId={hotel_id}&start={check_in}&end={check_out}".format(
                 hotel_id=base_item["hotel_id"],
                 check_in=check_in_info[0],
                 check_out=check_out_info[0])
             yield scrapy.FormRequest(url=endpoint,
                                      formdata={"url_str": target_url},
                                      callback=self.get_meituan_url,
                                      meta={"item": dated_item})
예제 #4
0
def MeiTuanReview(site_info, results, CHECK_POINT):
    """Push one MeiTuan review-page request for the next row of ``results``.

    Generator: takes a single row from ``results`` and, if one is available,
    yields whatever ``RedisHelper.lpush`` returns after pushing the request
    onto the ``"<SpiderName>:requests"`` key.  If ``next(results)`` raises
    any ``Exception`` the generator finishes without yielding.

    Args:
        site_info: mapping that provides the ``"SpiderName"`` entry.
        results: iterator of rows indexed as (hotel id, site id, crawl info).
        CHECK_POINT: value copied into both the summary item and the item.
    """
    rds = RedisHelper()
    try:
        row = next(results)
    except Exception:
        return

    # Summary item that travels with the request in its meta.
    summary = MeiTuanZongHeItem()
    summary["BIG_DATA_HOTEL_ID"] = row[0]
    summary["SITE_ID"] = str(row[1])
    summary["CHECK_POINT"] = CHECK_POINT

    hotel_id = extract_re("REGEX_COLUMNS1\":\"(.*?)\"", row[2])
    item = {
        "meiTuan_hotel_id": hotel_id,
        "bigDataHotelId": row[0],
        "siteId": str(row[1]),
        "CHECK_POINT": CHECK_POINT
    }
    print("MeiTuanReview", time.time(), item)

    page_url = "http://www.meituan.com/jiudian/{}/#comment".format(
        item.get("meiTuan_hotel_id", ""))
    queue_key = "{}:requests".format(site_info["SpiderName"])
    request_meta = {
        "item": copy.deepcopy(item),
        "meiTuanZongHeItem": summary,
        "is_need_proxy": True
    }
    queued_request = Request(url=page_url,
                             meta=request_meta,
                             priority=-10,
                             dont_filter=True)
    yield rds.lpush(key=queue_key, request=queued_request)
예제 #5
0
def CtripReview(site_info, results, CHECK_POINT):
    """Push one Ctrip review request for the next row of ``results``.

    Generator: takes a single row from ``results`` and yields the return
    value of ``RedisHelper.lpush`` after pushing the request onto the
    ``"<SpiderName>:requests"`` key.  If ``next(results)`` raises any
    ``Exception`` the generator finishes without yielding.

    Args:
        site_info: mapping that provides the ``"SpiderName"`` entry.
        results: iterator of rows indexed as (hotel id, site id, crawl info).
        CHECK_POINT: value copied into both the item and the summary item.
    """
    rds = RedisHelper()
    try:
        row = next(results)
    except Exception:
        return

    ctrip_hotel_id = extract_re("REGEX_COLUMNS1\":\"(.*?)\"", row[2])
    item = {
        "ctrip_hotel_id": ctrip_hotel_id,
        "bigDataHotelId": row[0],
        "siteId": str(row[1]),
        "CHECK_POINT": CHECK_POINT
    }

    # Summary item that travels with the request in its meta.
    summary = CtripZongHeItem()
    summary["BIG_DATA_HOTEL_ID"] = row[0]
    summary["SITE_ID"] = str(row[1])
    summary["CHECK_POINT"] = CHECK_POINT
    print("CtripReview", time.time(), item)

    queue_key = "{}:requests".format(site_info["SpiderName"])
    target_url = getElevenURL().format(item["ctrip_hotel_id"])
    # The item is passed as-is (not copied) in the request meta.
    queued_request = Request(url=target_url,
                             meta={
                                 "item": item,
                                 "ctripZongHeItem": summary
                             },
                             priority=-10,
                             dont_filter=True)
    yield rds.lpush(key=queue_key, request=queued_request)
예제 #6
0
 def start_requests(self):
     """Yield one request per hotel row registered for site id 2.

     Opens the MySQL client from the spider settings and config, selects
     the ``NORMAL`` rows of ``MS_EST_WH_HOTEL_SITE_REL`` with ``SITE_ID=2``
     and yields a ``scrapy.Request`` per row, handled by ``self.parse``.
     The URL comes from ``getElevenURL()`` formatted with the hotel id.
     """
     db_type = self.cf.get("MYSQL_SERVER", "type")
     db_name = self.cf.get("MYSQL_SERVER", "db")
     self.mysql_client = SQLServer.from_settings(self.settings, db_type,
                                                 db_name)
     query = "SELECT BIG_DATA_HOTEL_ID,SITE_ID,URL_CRAWL_INFO FROM `MS_EST_WH_HOTEL_SITE_REL` WHERE `STATUS`='NORMAL' AND SITE_ID=2;"
     for row in self.mysql_client.select(query):
         # Columns follow the SELECT order: hotel id, site id, crawl info.
         hotel_id = extract_re("REGEX_COLUMNS1\":\"(.*?)\"", row[2])
         item = {
             "hotel_id": hotel_id,
             "BIG_DATA_HOTEL_ID": row[0],
             "SITE_ID": str(row[1]),
             "CHECK_POINT": self.CHECK_POINT
         }
         target_url = getElevenURL().format(item["hotel_id"])
         yield scrapy.Request(url=target_url,
                              callback=self.parse,
                              meta={"item": copy.deepcopy(item)},
                              priority=-10)
예제 #7
0
def CtripPrice(site_info, results, CHECK_POINT):
    """Push one Ctrip price request for the next row of ``results``.

    Generator: takes a single row from ``results`` and yields the return
    value of ``RedisHelper.lpush`` after pushing the request onto the
    ``"<SpiderName>:requests"`` key.  If ``next(results)`` raises any
    ``Exception`` the generator finishes without yielding.

    Args:
        site_info: mapping that provides the ``"SpiderName"`` entry.
        results: iterator of rows indexed as (hotel id, site id, crawl info).
        CHECK_POINT: value stored on the item under ``"CHECK_POINT"``.
    """
    rds = RedisHelper()
    try:
        row = next(results)
    except Exception:
        return

    ctrip_hotel_id = extract_re("REGEX_COLUMNS1\":\"(.*?)\"", row[2])
    item = {
        "ctrip_hotel_id": ctrip_hotel_id,
        "BIG_DATA_HOTEL_ID": row[0],
        "SITE_ID": str(row[1]),
        "CHECK_POINT": CHECK_POINT
    }
    print("CtripPrice", time.time(), item)

    queue_key = "{}:requests".format(site_info["SpiderName"])
    target_url = getElevenURL().format(item["ctrip_hotel_id"])
    queued_request = Request(url=target_url,
                             meta={"item": copy.deepcopy(item)},
                             priority=-10,
                             dont_filter=True)
    yield rds.lpush(key=queue_key, request=queued_request)
예제 #8
0
파일: BB.py 프로젝트: wyc1314/CrawlerSystem
</p><p class='comment_txt_more J_txt_fold float_right '><a class='show_unfold' href='javascript:;'>查看回复<i></i></a><a class='show_fold hidden' href='javascript:;'>收起<i></i></a></p></div></div></div><div class='comment_block J_asyncCmt' data-cid='303205961'><div class='user_info ' style='position:relative;'><p class='head'><span class='img' data-commentcount='3' data-img-count='0' data-arrivcitycount='5' data-comhotcount='3' data-userfulcount='1' data-isUserSelf = 'False'><img src='//images4.c-ctrip.com/target/t1/headphoto/177/080/529/285e9abeede347d9a7c1cc5e1b4c9287_R_100_100.jpg' onerror="this.onerror=''; this.src='//pic.c-ctrip.com/common/pic_default_avatar.jpg'" style='width:33px; height:33px;' /></span></p><p class='name'><span>M***0</span></p><p class='level_new'></p><p class='num'>点评总数&nbsp;3<br>被点有用&nbsp;1</p></div><div class='comment_main'><p class='comment_title'><span class='small_c' data-value='位置:3.0,设施:3.0,服务:2.0,卫生:3.0'><span class='b' style='width:42px;'></span></span><span class='score'><span class='n'>2.8</span>分</span><a class='room J_baseroom_link' data-baseRoomId='114321785' data-baseRoomName='精致商务房'>精致商务房</a><span class='date'>2019年04月入住</span><span class='type'><i class='k_biz'></i>商务出差</span></p><div class='comment_txt'><div class='J_commentDetail'>不行,没早餐、没停车场,我们入住的时候问服务员停车场有没有,他说没有,但可以停在对面第一人民医院不收费,第二天去开车收了42元的停车费</div><p class='comment_txt_more J_commentShowMore hidden'><a href='javascript:;' class='show_unfold'>查看更多<i></i></a></p><p class='comment_txt_more J_commentShowLess hidden'><a href='javascript:;' class='show_fold'>收起<i></i></a></p><div class='comment_bar'><p class='comment_bar_info'><i class="phone"></i><span class='time'>发表于2019-04-25</span></p><a class='useful useful_voted' data-voted='1'  data-cid='303205961' href='javascript:void(0);'>有用<span class='n'>(1)</span></a></div></div></div></div><div class='comment_block J_asyncCmt' data-cid='306783408'><div class='user_info ' style='position:relative;'><p class='head'><span 
class='img' data-commentcount='4' data-img-count='0' data-arrivcitycount='4' data-comhotcount='4' data-userfulcount='0' data-isUserSelf = 'False'><img src='//pic.c-ctrip.com/common/pic_default_avatar.jpg' onerror="this.onerror=''; this.src='//pic.c-ctrip.com/common/pic_default_avatar.jpg'" style='width:33px; height:33px;' /></span></p><p class='name'><span>1***2</span></p><p class='level_new'></p><p class='num'>点评总数&nbsp;4</p></div><div class='comment_main'><p class='comment_title'><span class='small_c' data-value='位置:4.0,设施:5.0,服务:4.0,卫生:4.0'><span class='b' style='width:68px;'></span></span><span class='score'><span class='n'>4.3</span>分</span><a class='room J_baseroom_link' data-baseRoomId='114322153' data-baseRoomName='经典双床房'>经典双床房</a><span class='date'>2019年04月入住</span><span class='type'><i class='k_family'></i>家庭亲子</span></p><div class='comment_txt'><div class='J_commentDetail'>这一家酒店真的不错!位置也好!离秦淮河!夫孑庙!还有不会写字了都很近!如果有朋友去南京我会介绍去这家!5分</div><p class='comment_txt_more J_commentShowMore hidden'><a href='javascript:;' class='show_unfold'>查看更多<i></i></a></p><p class='comment_txt_more J_commentShowLess hidden'><a href='javascript:;' class='show_fold'>收起<i></i></a></p><div class='comment_bar'><p class='comment_bar_info'><i class="phone"></i><span class='time'>发表于2019-04-29</span></p><a class='useful' data-voted='0'  data-cid='306783408' href='javascript:void(0);'>有用<span class='n'>(0)</span></a></div></div><div class='htl_reply'><p class='title'><span class='b'>酒店回复:</span></p><p class='text ' >城南有旧事 ,城北有信使, 林深时见鹿 ,海蓝时见鲸 ,梦醒时见你~~~~~~~~~~</p><p class='comment_txt_more J_txt_fold float_right hidden'><a class='show_unfold' href='javascript:;'>查看回复<i></i></a><a class='show_fold hidden' href='javascript:;'>收起<i></i></a></p></div></div></div><div class='comment_block J_asyncCmt' data-cid='295189907'><div class='user_info ' style='position:relative;'><p class='head'><span class='img' data-commentcount='1' data-img-count='0' data-arrivcitycount='2' data-comhotcount='1' 
data-userfulcount='0' data-isUserSelf = 'False'><img src='//pic.c-ctrip.com/common/pic_default_avatar.jpg' onerror="this.onerror=''; this.src='//pic.c-ctrip.com/common/pic_default_avatar.jpg'" style='width:33px; height:33px;' /></span></p><p class='name'><span>f***2</span></p><p class='level_new'></p><p class='num'>点评总数&nbsp;1</p></div><div class='comment_main'><p class='comment_title'><span class='small_c' data-value='位置:4.0,设施:4.0,服务:5.0,卫生:5.0'><span class='b' style='width:71px;'></span></span><span class='score'><span class='n'>4.5</span>分</span><a class='room J_baseroom_link' data-baseRoomId='114321738' data-baseRoomName='经典商务房'>经典商务房</a><span class='date'>2019年04月入住</span><span class='type'><i class='k_couple'></i>情侣出游</span></p><div class='comment_txt'><div class='J_commentDetail'>有点吵,其它的都不错</div><p class='comment_txt_more J_commentShowMore hidden'><a href='javascript:;' class='show_unfold'>查看更多<i></i></a></p><p class='comment_txt_more J_commentShowLess hidden'><a href='javascript:;' class='show_fold'>收起<i></i></a></p><div class='comment_bar'><p class='comment_bar_info'><i class="phone"></i><span class='time'>发表于2019-04-23</span></p><a class='useful' data-voted='0'  data-cid='295189907' href='javascript:void(0);'>有用<span class='n'>(0)</span></a></div></div><div class='htl_reply'><p class='title'><span class='b'>酒店回复:</span></p><p class='text ' >非常抱歉没有给您带来最好的入住体验,但是我们会继续努力,给您最好的服务,期待您的再次光临,祝您生活愉快^_^</p><p class='comment_txt_more J_txt_fold float_right hidden'><a class='show_unfold' href='javascript:;'>查看回复<i></i></a><a class='show_fold hidden' href='javascript:;'>收起<i></i></a></p></div></div></div><div class='comment_block J_asyncCmt' data-cid='303337557'><div class='user_info ' style='position:relative;'><p class='head'><span class='img' data-commentcount='1' data-img-count='0' data-arrivcitycount='1' data-comhotcount='1' data-userfulcount='0' data-isUserSelf = 'False'><img src='//pic.c-ctrip.com/common/pic_default_avatar.jpg' onerror="this.onerror=''; 
this.src='//pic.c-ctrip.com/common/pic_default_avatar.jpg'" style='width:33px; height:33px;' /></span></p><p class='name'><span>M***5</span></p><p class='level_new'></p><p class='num'>点评总数&nbsp;1</p></div><div class='comment_main'><p class='comment_title'><span class='small_c' data-value='位置:5.0,设施:4.0,服务:5.0,卫生:5.0'><span class='b' style='width:74px;'></span></span><span class='score'><span class='n'>4.8</span>分</span><a class='room J_baseroom_link' data-baseRoomId='114321738' data-baseRoomName='经典商务房'>经典商务房</a><span class='date'>2019年04月入住</span><span class='type'><i class='k_family'></i>家庭亲子</span></p><div class='comment_txt'><div class='J_commentDetail'>服务不错,大厅很有特点,早餐买一送一,可以带孩子,不过描述房间有30平,感觉实际没有哦。洗手台下面空着真心欣赏不了,整体好评。</div><p class='comment_txt_more J_commentShowMore hidden'><a href='javascript:;' class='show_unfold'>查看更多<i></i></a></p><p class='comment_txt_more J_commentShowLess hidden'><a href='javascript:;' class='show_fold'>收起<i></i></a></p><div class='comment_bar'><p class='comment_bar_info'><i class="phone"></i><span class='time'>发表于2019-04-26</span></p><a class='useful' data-voted='0'  data-cid='303337557' href='javascript:void(0);'>有用<span class='n'>(0)</span></a></div></div><div class='htl_reply'><p class='title'><span class='b'>酒店回复:</span></p><p class='text text_other' >谢谢亲的点评,每个房间加上洗手间都在35平方左右哦,卫生间是干湿分离设计呢,不过小锦也在努力的完善中,期待亲和小锦一起成长~爱你哦~~~~~~~~~~~</p><p class='comment_txt_more J_txt_fold float_right '><a class='show_unfold' href='javascript:;'>查看回复<i></i></a><a class='show_fold hidden' href='javascript:;'>收起<i></i></a></p></div></div></div><div class='comment_block J_asyncCmt' data-cid='291782555'><div class='user_info ' style='position:relative;'><p class='head'><span class='img' data-commentcount='2' data-img-count='0' data-arrivcitycount='2' data-comhotcount='2' data-userfulcount='0' data-isUserSelf = 'False'><img src='//pic.c-ctrip.com/common/pic_default_avatar.jpg' onerror="this.onerror=''; this.src='//pic.c-ctrip.com/common/pic_default_avatar.jpg'" 
style='width:33px; height:33px;' /></span></p><p class='name'><span>M***4</span></p><p class='level_new'></p><p class='num'>点评总数&nbsp;2</p></div><div class='comment_main'><p class='comment_title'><span class='small_c' data-value='位置:5.0,设施:5.0,服务:5.0,卫生:5.0'><span class='b' style='width:80px;'></span></span><span class='score'><span class='n'>5.0</span>分</span><a class='room J_baseroom_link' data-baseRoomId='114322312' data-baseRoomName='精致双床房'>精致双床房</a><span class='date'>2019年04月入住</span><span class='type'><i class='k_family'></i>家庭亲子</span></p><div class='comment_txt'><div class='J_commentDetail'>房间挺大挺舒适</div><p class='comment_txt_more J_commentShowMore hidden'><a href='javascript:;' class='show_unfold'>查看更多<i></i></a></p><p class='comment_txt_more J_commentShowLess hidden'><a href='javascript:;' class='show_fold'>收起<i></i></a></p><div class='comment_bar'><p class='comment_bar_info'><i class="phone"></i><span class='time'>发表于2019-04-22</span></p><a class='useful' data-voted='0'  data-cid='291782555' href='javascript:void(0);'>有用<span class='n'>(0)</span></a></div></div><div class='htl_reply'><p class='title'><span class='b'>酒店回复:</span></p><p class='text ' >思念是青色藤蔓开出白色的花,怎样看上去也清晰的艳。像天暗下来独自点亮的一盏烛火,雨后天空出现的彩虹,忧伤而美~~
</p><p class='comment_txt_more J_txt_fold float_right hidden'><a class='show_unfold' href='javascript:;'>查看回复<i></i></a><a class='show_fold hidden' href='javascript:;'>收起<i></i></a></p></div></div></div><div class='comment_block J_asyncCmt' data-cid='291136859'><div class='user_info ' style='position:relative;'><p class='head'><span class='img' data-commentcount='1' data-img-count='0' data-arrivcitycount='1' data-comhotcount='1' data-userfulcount='0' data-isUserSelf = 'False'><img src='//pic.c-ctrip.com/common/pic_default_avatar.jpg' onerror="this.onerror=''; this.src='//pic.c-ctrip.com/common/pic_default_avatar.jpg'" style='width:33px; height:33px;' /></span></p><p class='name'><span>1***0</span></p><p class='level_new'></p><p class='num'>点评总数&nbsp;1</p></div><div class='comment_main'><p class='comment_title'><span class='small_c' data-value='位置:5.0,设施:5.0,服务:5.0,卫生:5.0'><span class='b' style='width:80px;'></span></span><span class='score'><span class='n'>5.0</span>分</span><a class='room J_baseroom_link' data-baseRoomId='114321738' data-baseRoomName='经典商务房'>经典商务房</a><span class='date'>2019年04月入住</span><span class='type'><i class='k_else'></i>其它</span></p><div class='comment_txt'><div class='J_commentDetail'>非常好的酒店,新、干净、交通便利,早餐还有优惠活动,品种也挺多,还是性价比挺高的,下次还会去住</div><p class='comment_txt_more J_commentShowMore hidden'><a href='javascript:;' class='show_unfold'>查看更多<i></i></a></p><p class='comment_txt_more J_commentShowLess hidden'><a href='javascript:;' class='show_fold'>收起<i></i></a></p><div class='comment_bar'><p class='comment_bar_info'><i class="phone"></i><span class='time'>发表于2019-04-22</span></p><a class='useful' data-voted='0'  data-cid='291136859' href='javascript:void(0);'>有用<span class='n'>(0)</span></a></div></div><div class='htl_reply'><p class='title'><span class='b'>酒店回复:</span></p><p class='text text_other' >你对小锦而言太珍贵了,珍贵到你在小锦身边的每一分钟我都当做最后一分钟去过,所以小锦才要马不停蹄的去拥抱你,把最好的都给你。
</p><p class='comment_txt_more J_txt_fold float_right '><a class='show_unfold' href='javascript:;'>查看回复<i></i></a><a class='show_fold hidden' href='javascript:;'>收起<i></i></a></p></div></div></div></div><div class='c_page_box'><div class='c_page'><a href='javascript:;' class='c_up_nocurrent'></a><div class='c_page_list layoutfix'><a href='javascript:;' class='current'><span>1</span></a><a value='2' href='/hotel/dianping/1519962_p2t0.html'><span>2</span></a><a value='3' href='/hotel/dianping/1519962_p3t0.html'><span>3</span></a><a value='4' href='/hotel/dianping/1519962_p4t0.html'><span>4</span></a><a value='5' href='/hotel/dianping/1519962_p5t0.html'><span>5</span></a><a value='6' href='/hotel/dianping/1519962_p6t0.html'><span>6</span></a><span class='c_page_ellipsis'>...</span><a value='28' href='/hotel/dianping/1519962_p28t0.html'><span>28</span></a></div><a value='2' class='c_down' href='/hotel/dianping/1519962_p2t0.html'><span>下一页</span></a><div class='c_pagevalue'>到<input type='text' class='c_page_num' name='cPageNum' id='cPageNum' value='1'>页<input type='button' class='c_page_submit' value='确定' name='cPageBtn' id='cPageBtn'></div><input type="hidden" id="cTotalPageNum" value="28" /></div></div></div></div></div><div id='commentTracker' style='display:none'>Version=1.0&PageID=102003&Rank=307613212,310543106,307122028,304092037,304091967,307279956,307443730,303841425,303436689,303205961,306783408,295189907,303337557,291782555,291136859&IdentityTextFilter=-1&OrderBy=1&RecommentType=All</div><div id='commentTracker20150415' style='display:none'>{"hotelid":"1519962","commentselect":"1","ordertype":"2","outcategory":"全部","roomtype":"","chooseorsearch":"2","keyword":"","result":"2","others":""}</div>
'''
import re

# re.compile()
# Scratch script: split the sample ``html`` into one chunk per review block
# and extract the review fields from each chunk.
RECORDS = re.findall("class='comment_block J_asyncCmt'(.*?)</div></div></div>",html,re.DOTALL)
print(len(RECORDS))

# ``num`` is the 1-based index of the review being processed; it stays 0 when
# no review block was found.
num = 0
for num, RECORD in enumerate(RECORDS, 1):
    print(num)
    item = {}
    USER_NAME = extract_re("class='name'><span>(.*?)</span>", RECORD)
    # Raw string so ``\d`` and ``\.`` reach the regex engine as written.
    REVIEW_RATING_VALUE = extract_re(r"<span class='n'>([0-9]\d*\.?\d*)</span>分", RECORD)
    # Captures the date that follows the "发表于" (posted on) label.
    ACCOMMODATION_TIME = extract_re("class='time'>发表于(.*?)</span>", RECORD)
    RAW_REVIEW_CONTENT = extract_re("class='J_commentDetail'>(.*?)</div>", RECORD)
    print(RAW_REVIEW_CONTENT)
    REVIEW_ID = extract_re("data-cid='(.*?)'", RECORD)
    USER_AVATAR = extract_re("class='head'.*?img src='(.*?)'", RECORD)
    REPLY_REVIEW_CONTENT = extract_re("class='htl_reply'.*?class='text.*?>(.*?)</p>", RECORD)
    # The original ``extract_re(...) if "false" else "true"`` always took the
    # first branch because the non-empty string "false" is truthy, so the
    # value is simply the extracted hotel reply (falsy when there is none).
    IS_NOT_REPLY = REPLY_REVIEW_CONTENT
    RAW_PICTURES = extract_re("class='comment_pic'(.*?)class='comment_bar'", RECORD)
    sourceSiteId = classify({"7": "(艺龙网用户)", "1": "(去哪儿网用户)", "2": "(.+)"}, USER_NAME)
    item["userName"] = USER_NAME
    item["reviewRatingValue"] = REVIEW_RATING_VALUE
    item["accommodationTime"] = ACCOMMODATION_TIME + " 00:00:00"
    item["replyReviewContent"] = REPLY_REVIEW_CONTENT
예제 #9
0
 def process_request(self, request, spider):
     """Redirect requests whose URL contains ``172.25.`` to the eleven URL.

     For such a request the ``hotelId`` query value is extracted from the
     URL and the request URL is replaced in place by ``getElevenURL()``
     formatted with that id.  All other requests are left untouched.

     Args:
         request: the request being processed; its ``url`` is read and, on
             a match, rewritten through ``request._set_url``.
         spider: unused here.

     Returns:
         None in every case.
     """
     if "172.25." in request.url:
         # Raw string so ``\d`` is passed to the regex engine as written.
         hotelId = extract_re(r"hotelId=(\d+)", request.url)
         request._set_url(getElevenURL().format(hotelId))
예제 #10
0
 def get_home_review(self,response):
     """Parse one page of Ctrip reviews and yield its review items.

     On the first page (``response.meta`` has no ``"page"`` or it is 1) the
     summary item ``response.meta["ctripZongHeItem"]`` is filled with the
     overall rating and review count and yielded, followed by one request
     for each page from 2 up to ``self.next_page_num``; those requests
     reuse this method as callback.  On every page, each
     ``comment_block J_asyncCmt`` chunk is turned into one review item.

     Each review item is an independent deep copy of
     ``response.meta["item"]``, so a yielded item is not overwritten when
     the next review is processed.
     """
     eleven = response.meta["eleven"]
     # ``response.text`` is the supported replacement for the deprecated
     # ``body_as_unicode()``.
     html = response.text
     if response.meta.get("page", 1) == 1:
         item = response.meta["item"]
         ctripZongHeItem = response.meta["ctripZongHeItem"]
         ctripZongHeItem["RATING_VALUE"] = extract_re("<span class='score'><span class='n'>(.*?)</span>", html)
         # Raw string so ``\(`` and ``\d`` reach the regex engine as written.
         ctripZongHeItem["REVIEW_COUNT"] = extract_re(r"<span id='All_Comment' >全部\((\d+)\)", html)
         ctripZongHeItem["GUEST_TYPE"] = "null"
         print(ctripZongHeItem)
         yield ctripZongHeItem
         # The URL template does not change between pages; build it once.
         url_template = (
             "http://hotels.ctrip.com/Domestic/tool/AjaxHotelCommentList.aspx?MasterHotelID={hotel_id}&hotel={hotel_id}"
             "&NewOpenCount=0&AutoExpiredCount=0&RecordCount=1253&OpenDate=2016-01-01&card=-1&property=-1&userType=-1&"
             "productcode=&keyword=&roomName=&orderBy=1&viewVersion=c&currentPage={page}&contyped=0&"
             "callback=CASwaGVffbjQXgEzk&eleven={eleven}")
         for page in range(2, int(self.next_page_num) + 1):
             url = url_template.format(hotel_id=item["ctrip_hotel_id"],
                                       page=page,
                                       eleven=eleven)
             yield Request(url,
                           headers=response.request.headers,
                           cookies=response.request.cookies,
                           callback=self.get_home_review,
                           priority=20 + self.__class__.num,
                           meta={"item": copy.deepcopy(item), "page": page,
                                 "eleven": eleven, "is_need_proxy": True})
     RECORDS = re.findall("class='comment_block J_asyncCmt'(.*?)</div></div></div>", html, re.DOTALL)
     for RECORD in RECORDS:
         # Fresh copy per review: the original reused the single meta dict,
         # so every yielded item was the same object mutated in place.
         item = copy.deepcopy(response.meta["item"])
         USER_NAME = extract_re("class='name'><span>(.*?)</span>", RECORD)
         REVIEW_RATING_VALUE = extract_re(r"<span class='n'>([0-9]\d*\.?\d*)</span>分", RECORD)
         # Captures the date that follows the "发表于" (posted on) label.
         ACCOMMODATION_TIME = extract_re("class='time'>发表于(.*?)</span>", RECORD)
         RAW_REVIEW_CONTENT = extract_re("class='J_commentDetail'>(.*?)</div>", RECORD)
         REVIEW_ID = extract_re("data-cid='(.*?)'", RECORD)
         USER_AVATAR = extract_re("class='head'.*?img src='(.*?)'", RECORD)
         # Hotel reply text, extracted once.  The original ran the same
         # extraction twice behind an always-true ``if "false"`` condition.
         REPLY_REVIEW_CONTENT = extract_re("class='htl_reply'.*?class='text.*?>(.*?)</p>", RECORD)
         RAW_PICTURES = extract_re("class='comment_pic'(.*?)class='comment_bar'", RECORD)
         sourceSiteId = classify({"7": "(艺龙网用户)", "1": "(去哪儿网用户)", "2": "(.+)"}, USER_NAME)
         item["userName"] = USER_NAME
         item["reviewRatingValue"] = REVIEW_RATING_VALUE
         item["accommodationTime"] = ACCOMMODATION_TIME + " 00:00:00"
         item["replyReviewContent"] = REPLY_REVIEW_CONTENT
         item["reviewId"] = REVIEW_ID
         item["userAvatar"] = USER_AVATAR
         # "false" when a hotel reply was found, "true" otherwise.
         item["isNotReply"] = "false" if REPLY_REVIEW_CONTENT else "true"
         item["rawReviewContent"] = RAW_REVIEW_CONTENT
         item["RAW_PICTURES"] = RAW_PICTURES
         # Emotion bucket chosen by classify() from the rating text.
         EMOTION_TYPE = classify(
             {"0": "(1\\.|2\\.|3\\.0|3\\.1|3\\.2|3\\.3|3\\.4)", "2": "(3\\.5|3\\.6|3\\.7|3\\.8|3\\.9)",
              "1": "(^4$|5|4\\.)"}, REVIEW_RATING_VALUE)
         item["emotionType"] = EMOTION_TYPE
         item["sourceSiteId"] = sourceSiteId
         print(item)
         yield item