Example #1
class TuniuCatcher(object):

    def __init__(self):
        self._city = None
        self.__ota_info = "途牛"
        self.tuniu_api_client = TuniuAPIClient()
        self.hotel_dao = HotelDAO(dao_setting["host"], dao_setting["db"], dao_setting["user"], dao_setting["password"])

    def setCity(self, city):
        self._city = city

    def getHotelList(self, city_code):
        if self._city is None:
            print "City not set; call setCity first"
            return
        hotel_list = []
        page_index = 1
        page_amount = 10000
        while page_index <= page_amount - 1:
            try:
                page_data = self.tuniu_api_client.get_hotel_list(page_index, city_code)
                # The hotel total returned by the API fluctuates between calls,
                # so keep the smallest page count seen so far
                if page_amount > page_data["data"]["total"] / 20:
                    page_amount = page_data["data"]["total"] / 20
                    print "page_amount=%d" % page_amount
                hotel_list.extend(page_data["data"]["list"])
                print "Page_%d Success"%page_index
                time.sleep(5)
                page_index += 1
            except Exception:
                print "Page_%d Fail" % page_index
                time.sleep(5)  # back off briefly before retrying the same page
                continue
        return hotel_list

    def saveHotelList(self, hotel_list):
        old_location_info = self.hotel_dao.get_locations(self._city)
        old_baseinfo = list(self.hotel_dao.get_baseinfo(self._city, self.__ota_info))
        # Mark every existing baseinfo row as stale (if_overtime = 1) until it is matched again
        for i in range(0, len(old_baseinfo)):
            old_baseinfo[i] = list(old_baseinfo[i])
            old_baseinfo[i][5] = 1
        new_locations = []
        new_baseinfo = []
        update_baseinfo = []
        # Iterate over the data to be saved
        for item in hotel_list:
            location_id = None
            # First check whether this hotel is already in the location table
            for location in old_location_info:
                if item["name"] == location[3]:
                    location_id = location[0]
                    break
            # If not, insert a new record into the location table
            if location_id is None:
                location_id = uuid.uuid1()
                trans_location = CoordTransor.gcj02towgs84(lng=float(item["pos"]["lng"]), lat=float(item["pos"]["lat"]))
                new_locations.append({
                    "guid": location_id,
                    "x": trans_location[1],
                    "y": trans_location[0],
                    "hotel_name": item["name"],
                    "city": self._city,
                    "address": item["address"]
                })
            # Look up the baseinfo table by location id:
            # if a row already exists, update it,
            # otherwise insert a new row
            if_exist = False
            for baseinfo in old_baseinfo:
                if location_id == baseinfo[2]:
                    if_exist = True
                    baseinfo[1] = item["url"]
                    baseinfo[4] = item["remarkCount"]
                    baseinfo[5] = 0
                    baseinfo[6] = int(item["remarkCount"]) - int(baseinfo[4]) if int(item["remarkCount"]) - int(baseinfo[4]) > 0 else 0
                    baseinfo[7] = item["snapshot"]
                    baseinfo[8] = item["id"]
                    break
            if not if_exist:
                new_baseinfo.append({
                    "guid": uuid.uuid1(),
                    "url": item["url"],
                    "location_id": location_id,
                    "OTA": self.__ota_info,
                    "comm_num": item["remarkCount"],
                    "if_overtime": 0,
                    "incre_num": item["remarkCount"],
                    "img": item["snapshot"],
                    "id_in_ota": item["id"]
                })
        for baseinfo in old_baseinfo:
            update_baseinfo.append({
                "guid": baseinfo[0],
                "url": baseinfo[1],
                "location_id": baseinfo[2],
                "OTA": baseinfo[3],
                "comm_num": baseinfo[4],
                "if_overtime": baseinfo[5],
                "incre_num": baseinfo[6],
                "img": baseinfo[7],
                "id_in_ota": baseinfo[8]
            })
        print len(new_locations), len(new_baseinfo), len(update_baseinfo)
        self.hotel_dao.save_locations(new_locations)
        self.hotel_dao.save_baseinfo(new_baseinfo)
        self.hotel_dao.update_baseinfo(update_baseinfo)
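
A minimal driver sketch for Example #1 (an assumption, not part of the original listing): it presumes TuniuCatcher, TuniuAPIClient, HotelDAO and dao_setting are importable exactly as used above; the city name and the Tuniu city code below are placeholders.

# Hypothetical usage of TuniuCatcher; the city name and city code are placeholders
catcher = TuniuCatcher()
catcher.setCity(u"南京")                  # label used to match rows in the location table
hotel_list = catcher.getHotelList(300)    # placeholder Tuniu city code
if hotel_list:
    catcher.saveHotelList(hotel_list)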
Example #2
class XiechengDriverService(HotelService):
    def __init__(self):
        HotelService.__init__(self)
        # Ctrip (Xiecheng) DAO
        self.xiechengDao = xiechengDAO(dao_setting["host"], dao_setting["db"],
                                       dao_setting["user"],
                                       dao_setting["password"])
        # Hotel DAO
        self.hotel_dao = HotelDAO(dao_setting["host"], dao_setting["db"],
                                  dao_setting["user"], dao_setting["password"])
        # Holds list-page data
        self.listPageInfo = []
        # Holds hotel detail data
        self.hotelItem = {}
        # Holds hotel review data
        self.commList = []
        # Holds bed/price information
        self.bed = {}
        # Current OTA name (Ctrip)
        self.__ota_info = "携程"

    def crawlListPage(self):
        self.openPage(
            "http://hotels.ctrip.com/hotel/nanjing12#ctm_ref=hod_hp_sb_lst")
        self.driver.implicitly_wait(10)
        # Retry count on the current page
        loopNum = 0
        # Whether the current page has been crawled: False = not processed yet
        ifHandle = False
        # Total number of list pages to crawl
        pageNum = 140
        while pageNum >= 1:
            # Increment the per-page retry count
            loopNum = loopNum + 1
            # Scroll to roughly 90% of the page
            # js="var q=document.documentElement.scrollTop=9600"
            # self.driver.execute_script(js)
            self.driver.find_element_by_tag_name("body").send_keys(Keys.END)
            self.driver.find_element_by_tag_name("body").send_keys(
                Keys.PAGE_UP)
            # Once the page has loaded (the "收藏" marker appears),
            # crawl it and then move on to the next page
            if u"收藏" in self.driver.page_source:
                # Parse the page if it has not been parsed yet
                if not ifHandle:
                    self.__crawllianjie(self.driver.page_source)
                    print u"Hotels collected so far: %d" % len(self.listPageInfo)
                    ifHandle = True
                # Go to the next page
                try:
                    if u"下一页" in self.driver.page_source:
                        self.driver.find_element_by_partial_link_text(
                            u"下一页").click()
                        #self.driver.find_element_by_xpath("//a[@class='c_down']").click()
                        pageNum = pageNum - 1
                        # Mark the new page as not yet processed
                        ifHandle = False
                        # Reset the per-page retry count
                        loopNum = 0
                        time.sleep(random.uniform(3, 6))
                        print u"Pages remaining: " + str(pageNum)
                except Exception:
                    print "Error while clicking the next-page link"
            # A nonzero retry count means we did not advance to the next page
            if loopNum != 0:
                # If the retry count gets large (15 here), the page probably
                # failed to load, so give up; otherwise wait and retry
                if loopNum < 15:
                    time.sleep(3)
                    continue
                else:
                    break
        return pageNum <= 1  # True only when every list page was crawled

    # Parse hotel links and metadata from a list page
    def __crawllianjie(self, page_source):
        response = HtmlResponse(url="my HTML string",
                                body=page_source,
                                encoding="utf-8")
        hotel_list = response.xpath("//div[@class='searchresult_list ']/ul")
        for hotel in hotel_list:
            url = hotel.xpath(
                "li[@class='searchresult_info_name']/h2/a/@href").extract()[0]
            address = hotel.xpath(
                "li[@class='searchresult_info_name']/p[@class='searchresult_htladdress']/text()"
            ).extract()[0]
            commnum = hotel.xpath(
                "li[@class='searchresult_info_judge ']/div/a/span[@class='hotel_judgement']/text()"
            ).extract()
            if len(commnum):
                # Keep only the digits from the review-count text
                commnum = re.sub(r'\D', '', commnum[0])
                commnum = commnum if len(commnum) > 0 else 0
            else:
                commnum = 0
            name = hotel.xpath(
                "li[@class='searchresult_info_name']/h2/a/text()").extract()[0]
            self.listPageInfo.append({
                "guid": uuid.uuid1(),
                "url": url,
                "hotel_name": name,
                "OTA": self.__ota_info,
                "comm_num": int(commnum),
                "address": address
            })

    '''
    Save the crawled hotel list-page data
    '''

    def saveListPageInfo(self):
        baidu_api_service = BaiduMapAPIService("MviPFAcx5I6f1FkRQlq6iTxc")
        old_location_info = self.hotel_dao.get_locations(self._city)
        old_baseinfo = list(
            self.hotel_dao.get_baseinfo(self._city, self.__ota_info))
        # Mark every existing baseinfo row as stale (if_overtime = 1) until it is matched again
        for i in range(0, len(old_baseinfo)):
            old_baseinfo[i] = list(old_baseinfo[i])
            old_baseinfo[i][5] = 1
        new_locations = []
        new_baseinfo = []
        update_baseinfo = []
        # Iterate over the data to be saved
        for item in self.listPageInfo:
            location_id = None
            # First check whether this hotel is already in the location table
            for location in old_location_info:
                if item["hotel_name"] == location[3]:
                    location_id = location[0]
                    break
            # If not, insert a new record into the location table
            if location_id is None:
                location_id = uuid.uuid1()
                geocoding_info = None
                while True:
                    try:
                        geocoding_info = baidu_api_service.doGeocoding(
                            item["address"], city=self._city)
                        break
                    except Exception:
                        time.sleep(0.5)
                        continue
                if "result" not in geocoding_info:
                    print item["hotel_name"] + "error"
                    continue
                trans_location = CoordTransor.bd09togcj02(
                    bd_lon=geocoding_info["result"]["location"]["lng"],
                    bd_lat=geocoding_info["result"]["location"]["lat"])
                print trans_location
                new_locations.append({
                    "guid": location_id,
                    "x": trans_location[1],
                    "y": trans_location[0],
                    "hotel_name": item["hotel_name"],
                    "city": self._city,
                    "address": item["address"]
                })

            # Look up the baseinfo table by location id:
            # if a row already exists, update it,
            # otherwise insert a new row
            if_exist = False
            for baseinfo in old_baseinfo:
                if location_id == baseinfo[2]:
                    if_exist = True
                    baseinfo[1] = item["url"]
                    baseinfo[4] = item["comm_num"]
                    baseinfo[5] = 0
                    baseinfo[6] = item["comm_num"] - baseinfo[
                        4] if item["comm_num"] - baseinfo[4] > 0 else 0
                    break
            if not if_exist:
                new_baseinfo.append({
                    "guid": item["guid"],
                    "url": item["url"],
                    "location_id": location_id,
                    "OTA": self.__ota_info,
                    "comm_num": item["comm_num"],
                    "if_overtime": 0,
                    "incre_num": item["comm_num"],
                })
        for baseinfo in old_baseinfo:
            update_baseinfo.append({
                "guid": baseinfo[0],
                "url": baseinfo[1],
                "location_id": baseinfo[2],
                "OTA": baseinfo[3],
                "comm_num": baseinfo[4],
                "if_overtime": baseinfo[5],
                "incre_num": baseinfo[6]
            })
        print len(new_locations)
        print len(new_baseinfo)
        print len(update_baseinfo)
        self.hotel_dao.save_locations(new_locations)
        self.hotel_dao.save_baseinfo(new_baseinfo)
        self.hotel_dao.update_baseinfo(update_baseinfo)
        #self.dao.saveListPageInfo(self.listPageInfo)

    def depose(self):
        self.driver.close()
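
A minimal driver sketch for Example #2 (an assumption, not part of the original listing): it presumes the HotelService base class creates self.driver as a Selenium WebDriver and provides the _city attribute read by saveListPageInfo; since no setter is shown, the sketch assigns _city directly, and the city name is a placeholder matching the hardcoded Nanjing list URL.

# Hypothetical usage of XiechengDriverService
service = XiechengDriverService()
service._city = u"南京"                   # placeholder; crawlListPage hardcodes the Nanjing URL
if service.crawlListPage():               # True only when every list page was crawled
    service.saveListPageInfo()
service.depose()                          # close the Selenium WebDriver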