Beispiel #1
0
 def get_room_info(self, checkin):
     """
     Build per-room occupancy info, compatible with both the newer
     'age_info' and the older 'room_info' task formats.

     :param checkin: check-in date string, passed through to
                     ``self.calculate_age`` for age computation.
     :return: list of dicts, one per room, each with ``adult_info``
              and ``child_info`` age lists.
     :raises parser_except.ParserException: code 12 when the rooms do
             not all share the same adult/child composition.
     """
     room_info = []
     if self.task.ticket_info.get('age_info'):
         # New format: a list of rooms, each room a list of occupants.
         # (The original enumerate index here was unused.)
         for room in self.task.ticket_info.get('age_info', []):
             adult_info = []
             child_info = []
             for people in room:
                 age = self.calculate_age(checkin, people)
                 # Under 18 counts as a child, 18+ as an adult.
                 if age < 18:
                     child_info.append(age)
                 else:
                     adult_info.append(age)
             room_info.append(
                 dict(adult_info=adult_info, child_info=child_info))
     else:
         # Old format: already shaped as a list of room dicts.
         room_info = self.task.ticket_info.get('room_info', [])
     # All rooms must have identical adult/child counts; comparing each
     # room to its predecessor enforces this transitively.
     for index, room in enumerate(room_info):
         if index == 0:
             continue
         previous = room_info[index - 1]
         if len(room['adult_info']) != len(previous['adult_info']):
             raise parser_except.ParserException(12, "房间类型不同")
         if len(room['child_info']) != len(previous['child_info']):
             raise parser_except.ParserException(12, "房间类型不同")
     return room_info
Beispiel #2
0
    def crawl_data(self, request_template, browser, source_name):
        """
        Perform one page-fetch attempt.

        :param request_template: request dict describing the HTTP call
        :param browser: browser/session object used for the fetch
        :param source_name: source name, used for proxy assignment
        :return: the crawl ``resp`` object on success
        :raises parser_except.ParserException: any failure is stored in
                ``self.req_exception`` and re-raised at the end.
        """
        try:
            logger.debug(current_log_tag() + 'crawl %s, retry_count: %s',
                         self.__request_func.__name__, self.req_count)
            # Attach a proxy to the browser before fetching.
            self.browser_set_proxy(browser, source_name)

            resp, self.content_length = self.__crawl_data_str(
                request_template, browser)

            # todo adjust the result returned by user_retry
            if self.user_retry:
                try:
                    user_check = self.spider.user_retry_err_or_resp(
                        resp, self.req_count, request_template, False)
                except Exception:
                    # Remember that the user callback itself raised so the
                    # outer handler can surface the user's exception.
                    self.user_exc = True
                    raise
                if user_check:
                    # User accepted the response.
                    return resp
                raise parser_except.ParserException(
                    parser_except.PROXY_INVALID, '代理异常')
            return resp
        except parser_except.ParserException as e:
            # NOTE(review): the original listed PROXY_FORBIDDEN twice in
            # this tuple; one entry may have been intended as a different
            # code — the deduplicated tuple is behaviorally identical.
            self.is_forbidden = e.code in (parser_except.PROXY_FORBIDDEN,
                                           parser_except.REQ_ERROR)
            self.req_exception = e
        except Exception as e:
            self.req_exception = parser_except.ParserException(
                parser_except.REQ_ERROR, 'req exception:{0}'.format(e))
            # If the user callback raised a ParserException, prefer it.
            if self.user_exc and isinstance(e, parser_except.ParserException):
                self.req_exception = e
        finally:
            # `code` is computed but never used; kept to preserve the
            # original attribute-access order (self.req_exception is
            # assumed to already exist on the instance — TODO confirm).
            if self.req_exception:
                code = self.req_exception.code
            else:
                code = 0

        if self.req_exception:
            raise self.req_exception
Beispiel #3
0
 def response_error(self, req, resp, error):
     """
     Map an HTTP-level failure to a ParserException.

     :param req: request template dict; ``req['resp']`` is expected to
                 hold a response whose status code is reported for
                 non-400 errors — TODO confirm against callers.
     :param resp: the response object; may be None when the request
                  never completed (callers pass resp=None on transport
                  failures).
     :param error: the underlying exception (currently unused).
     :raises parser_except.ParserException: code 29 for HTTP 400 with
             the server's <Message> text, code 89 otherwise.
     """
     import re
     # Guard added: the original dereferenced resp.status_code even when
     # resp was None, raising AttributeError instead of the intended
     # ParserException.
     if resp is not None and resp.status_code == 400:
         try:
             message = re.search(r'<Message>(.*)</Message>',
                                 resp.text).group(1)
         except Exception:
             message = ""
         raise parser_except.ParserException(29, message)
     raise parser_except.ParserException(
         89, "服务出错啦啊啊~~!http code: {}".format(req['resp'].status_code))
Beispiel #4
0
    def task_parser(self):
        """
        Receive the task, parse it, and handle malformed input.

        Task content layout: ``city_id, hotel_id, days, ..., checkin``
        where checkin is formatted ``%Y%m%d``.

        :return: dict of request parameters (hotel, dates, occupancy,
                 auth, env/redis bookkeeping).
        :raises parser_except.ParserException:
            code 12 for bad room/occupancy info, 121 for bad auth JSON,
            ErrNumber.E__TASK for any other malformed task content.
        """
        task = self.task
        try:
            contentlist = self.split_content(task.content)
            mj_city_id, hotel_id = contentlist[:2]
            days = int(contentlist[2])
            checkin = datetime.datetime.strptime(contentlist[-1], "%Y%m%d")
            checkout = checkin + datetime.timedelta(days=days)
            checkin_str = checkin.strftime("%Y-%m-%d")
            checkout_str = checkout.strftime("%Y-%m-%d")
            self.check_in = checkin_str
            ticket_info = task.ticket_info
            env_name = ticket_info.get("env_name")
            try:
                room_info = self.get_room_info(checkin_str)
                # Occupants aged 18+ recorded under child_info still
                # count as adults for the request.
                child_age = [
                    age for age in room_info[0]['child_info'] if age < 18
                ]
                adult = len(room_info[0]['adult_info']) + len(
                    [age for age in room_info[0]['child_info'] if age >= 18])
                child = len(child_age)
                self.user_datas['adult_num'] = adult
                self.user_datas['child_num'] = child
                room_count = len(room_info)
            except Exception as e:
                raise parser_except.ParserException(12, '任务错误') from e
        except parser_except.ParserException:
            # Bug fix: the original outer handler converted the code-12
            # error raised just above into a generic E__TASK error,
            # losing the specific error code. Re-raise unchanged.
            raise
        except Exception as e:
            raise parser_except.ParserException(ErrNumber.E__TASK, str(e))
        redis_key = 'Null'
        if hasattr(task, 'redis_key'):
            redis_key = task.redis_key

        try:
            auth = json.loads(task.ticket_info["auth"])
        except Exception:
            raise parser_except.ParserException(121, msg='API认证信息错误')
        request_info = dict(HotelID=hotel_id,
                            CheckInDate=checkin_str,
                            CheckOutDate=checkout_str,
                            city=mj_city_id,
                            RoomCount=room_count,
                            Adult=adult,
                            Children=child,
                            ChildrenAge=child_age,
                            Nationality='CN',
                            env_name=env_name,
                            redis_key=redis_key,
                            auth=auth)
        return request_info
Beispiel #5
0
def w_get_proxy(debug, source, task, verify_info):
    """Fetch a proxy through the registered ``slave_get_proxy`` hook.

    Returns None in debug mode when no hook is registered; raises a
    PROXY_NONE ParserException when the hook yields nothing.
    """
    hook_missing = not slave_get_proxy
    if debug and hook_missing:
        print('debug,and not define get_proxy,so can’t get proxy ')
        return None
    proxy = slave_get_proxy(source=source, task=task, verify_info=verify_info)
    if proxy:
        return proxy
    raise parser_except.ParserException(parser_except.PROXY_NONE,
                                        f'get {source} proxy None')
Beispiel #6
0
    def __crawl_by_chain(self, chains):
        """
        Walk the request chain and crawl each step, dispatching on the
        type returned by ``reqParse.request()`` (dict / list / generator).

        :param chains: iterable of reqParse objects, executed in order.
        :return: accumulated status code from list-result checks (0 = ok).
        :raises parser_except.ParserException: re-raised as-is, or
                UNKNOWN_ERROR for any unexpected failure.
        """
        code = 0
        try:
            for reqParse in chains:
                # gevent.sleep(0)
                browser = self.__create_browser(reqParse.new_session)
                reqParse.spider = self
                t_req = reqParse.request()

                if isinstance(t_req, dict):  # single request
                    new_result = self.__single_crawl(reqParse, browser, t_req,
                                                     0)

                elif isinstance(t_req, list):
                    # A spider may legitimately return an empty list!
                    # NOTE(review): when the list is empty (or t_req is an
                    # unexpected type), `new_result` keeps its value from a
                    # previous iteration — or is unbound on the first one —
                    # before being appended below. Confirm this is intended.
                    if t_req:
                        if reqParse.asynchronous:  # parallel crawl
                            list_result = self.__async_crawl_list(
                                reqParse, browser, t_req)
                        else:  # serial crawl
                            list_result = self.__crawl_list(
                                reqParse, browser, t_req)
                        new_result, code = self.check_list_result(
                            list_result, code)  # $$$ could be optimized

                elif isinstance(t_req,
                                types.GeneratorType):  # requests produced via yield
                    list_result = self.__crawl_list(reqParse, browser, t_req)
                    new_result, code = self.check_list_result(
                        list_result, code)

                self.__spider_append_result(new_result)

            # Selenium browsers must be closed explicitly on every exit path.
            if self.use_selenium and browser.br:
                browser.close()
        except parser_except.ParserException as e:
            if self.use_selenium and browser.br:
                browser.close()
            logger.error(e)
            raise e
        except Exception:
            if self.use_selenium and browser.br:
                browser.close()
            logger.exception(current_log_tag() +
                             '[新框架 持续请求 未知问题][ {0} ]'.format(
                                 traceback.format_exc().replace('\n', '\t')))
            raise parser_except.ParserException(
                parser_except.UNKNOWN_ERROR,
                'e:{0}'.format(traceback.format_exc()))

        return code
Beispiel #7
0
 def convert(self, request_template, data):
     """
     Convert raw response data according to the template's content_type.

     :param request_template: request dict; its ``data.content_type``
            entry selects the conversion ('html', 'json', a bound
            method, or pass-through for anything else).
     :param data: raw response body.
     :return: parsed HTML tree, decoded JSON, the user method's result,
              or ``data`` unchanged.
     :raises parser_except.ParserException: code -1 when a user-supplied
             conversion method fails.
     """
     data_con = request_template.get('data', {})
     c_type = data_con.get('content_type', 'string')
     logger.debug(current_log_tag() + 'Converter got content_type: %s',
                  c_type)
     # Bug fix: compare with == — the original used `is` against string
     # literals, which only worked because CPython interns short strings.
     if c_type == 'html':
         return HTML.fromstring(data)
     elif c_type == 'json':
         return json.loads(data)
     elif isinstance(c_type, types.MethodType):
         try:
             return c_type(request_template, data)
         except Exception:
             raise parser_except.ParserException(
                 -1, 'convert func muset error{0} ,func:{1}'.format(
                     traceback.format_exc(), c_type))
     else:
         return data
Beispiel #8
0
def accor_parser(content, url, other_info):
    """
    Parse an AccorHotels hotel-detail page into a HotelBase record.

    :param content: raw page bytes (UTF-8 encoded HTML).
    :param url: page URL; the hotel code is extracted from it.
    :param other_info: dict with at least 'source_id' and 'city_id'.
    :return: populated HotelBase instance.
    :raises parser_except.ParserException: code 29 when the site is
            showing its maintenance page.
    """
    hotel = HotelBase()
    data = content.decode('utf-8')
    if '<title>Book a hotel online with Accor Hotels</title>' in data:
        raise parser_except.ParserException(29, '网站暂时维护中')
    hotel_code = re.findall(
        'https://www.accorhotels.com/zh/hotel-(.*?)-.*?/index.shtml',
        url)[0].lower()
    hotel_url = url
    source = 'accorHotel'
    source_city_id = 'NULL'
    brand_name = "NULL"
    # Regex patterns with backslashes are raw strings now (the original
    # relied on '\d' etc. passing through, which warns on modern Python).
    _star = re.findall(
        r'<div class="main-rating stars stars--(\d+)"\s*data-halfstars=',
        data)
    star = _star[0] if _star != [] else -1
    postal_code = get_blank(
        re.findall('<meta content="(.*?)" property="og:postal-code">', data))
    hotel_name = re.findall('<meta name="twitter:title" content="(.*?)">',
                            data)[0]
    hotel_name_en = "NULL"
    # Split the "a;b" geo.position value and reverse it so map_info
    # becomes "b,a".
    map_info = ",".join(
        re.findall('<meta content="(.*?)" name="geo.position"/>',
                   data)[0].split(';')[::-1])
    street = re.findall('<span itemprop="streetAddress">(.*?)</span><br>',
                        data)[0]
    location = re.findall('<span itemprop="addressLocality">(.*?)</span><br>',
                          data)[0]
    _country = re.findall('<span itemprop="addressCountry">(.*?)</span>',
                          data)[0]
    address = _country + location + street
    country = re.findall('<meta content="(.*?)" property="og:country-name">',
                         data)[0]
    city = get_blank(
        re.findall('<meta content="(.*?)" property="og:city">', data))
    _grade = re.findall(
        r'<span class="rating"><span itemprop="ratingValue">\s*(.*?)</span>/<span itemprop="bestRating">5</span>\s*</span>',
        data)
    grade = _grade[0] if _grade != [] else -1.0
    review = get_blank(
        re.findall('<span class="rating-baseline">(.*?)</span>', data))
    review_num = "".join(re.findall(r'\d+', review)) or -1
    has_wifi = 'Yes' if re.findall(r'<i\s*class="icon icon_wifi"></i>',
                                   data) else 'No'
    if has_wifi == 'Yes':
        is_wifi_free = 'Yes' if re.findall(
            r'<li\s*class="service-item "\s*data-servicename="wifi">',
            data) else 'No'
    else:
        is_wifi_free = 'NULL'
    has_parking = 'Yes' if re.findall(r'<i\s*class="icon icon_parking"></i>',
                                      data) else 'No'
    # Bug fix: the original gated this block on has_wifi; parking
    # pricing must be checked only when parking exists.
    if has_parking == 'Yes':
        is_parking_free = 'No' if re.findall(
            r'<li\s*class="service-item\s*payable"\s*data-servicename="parking">',
            data) else 'Yes'
    else:
        is_parking_free = 'NULL'
    img_items = "|".join(
        re.findall(
            'www.ahstatic.com/photos/' + hotel_code +
            r'_\w+_\d+_p_2048x1536.jpg', data))
    source_id = other_info['source_id']
    city_id = other_info['city_id']
    first_img = None
    if img_items:
        first_img = img_items.split('|')[0]
    others_info = {
        "city": city,
        "country": country,
        "first_img": first_img,
        "source_city_id": source_city_id
    }
    service = pq(data)('div.expandable-content').find('li').text().replace(
        "\t", "").replace("\n", "").replace(" ", "|")
    description = HTML(data).xpath("//p[@itemprop='description']/text()")[0]
    accepted_cards = "NULL"
    check_in_time = get_blank(
        re.findall('<i class="icon icon_times"></i>(.*?)</div>', data))
    check_out_time = get_blank(
        re.findall('<div class="col col-checkout">(.*?)</div>', data))

    hotel.hotel_name = hotel_name
    hotel.hotel_name_en = hotel_name_en
    hotel.source = source
    hotel.source_id = source_id
    hotel.source_city_id = source_city_id
    hotel.brand_name = brand_name
    hotel.map_info = map_info
    hotel.address = address
    hotel.city = city
    hotel.country = country
    hotel.city_id = city_id
    hotel.postal_code = postal_code
    hotel.star = star
    hotel.grade = grade
    hotel.review_num = review_num
    hotel.has_wifi = has_wifi
    hotel.is_wifi_free = is_wifi_free
    hotel.has_parking = has_parking
    hotel.is_parking_free = is_parking_free
    hotel.service = service
    hotel.img_items = img_items
    hotel.description = description
    hotel.accepted_cards = accepted_cards
    hotel.check_in_time = check_in_time
    hotel.check_out_time = check_out_time
    hotel.hotel_url = hotel_url
    hotel.others_info = json.dumps(others_info)
    return hotel
Beispiel #9
0
def get_proxy(
        source=None,
        allow_ports=None,
        forbid_ports=None,
        allow_regions=None,
        forbid_regions=None,
        user='******',
        passwd='realtime',
        proxy_info=None,
        verify_info="verify",
        ip_num=1,
        ip_type="internal",
        task=None,
):
    """
    Request a proxy from the proxy service, retrying up to 3 times.

    Only source/verify_info/ip_num/ip_type/task are actually used when
    building the request; the remaining parameters are accepted for
    interface compatibility.

    :return: ``[proxy_ip, [raw_response, elapsed_seconds, request_path]]``
    :raises parser_except.ParserException: code 21 when no usable proxy
            could be obtained.
    """
    # Bug fix: the original used mutable default arguments ([] / {}) and
    # `task=Task()`, which is evaluated once at definition time and
    # shared across every call. Use None sentinels instead.
    if allow_ports is None:
        allow_ports = []
    if forbid_ports is None:
        forbid_ports = []
    if allow_regions is None:
        allow_regions = []
    if forbid_regions is None:
        forbid_regions = []
    if proxy_info is None:
        proxy_info = {}
    if task is None:
        task = Task()

    qid = str(task.ticket_info.get('qid', int(time.time() * 1000)))

    msg = {
        "req": [{
            "source": source,
            "type": verify_info,
            "num": ip_num,
            "ip_type": ip_type,
        }]
    }
    msg = json.dumps(msg)
    ptid = task.ticket_info.get('ptid', "")
    time_st = time.time()
    get_info = '/?type=px001&qid={0}&query={1}&ptid={2}&tid=tid&ccy=AUD'.format(
        qid, msg, ptid)
    logger.info("get proxy info :http://{1}{0}".format(get_info,
                                                       g_config.proxy_host))
    count = 1
    while 1:
        try:
            p = requests.get("http://{0}".format(g_config.proxy_host) +
                             get_info,
                             timeout=(6, 6),
                             stream=False)
            p_time = p.elapsed.total_seconds()
            p = p.content
            logger.info("代理返回内容为{0}".format(p))
            proxy_ip = json.loads(p)['resp'][0]['ips'][0]['inner_ip']
            break
        except Exception:
            # NOTE(review): `ip` is not defined anywhere in this
            # function — presumably a module-level global; confirm, or
            # the warn() calls below will raise NameError.
            exstr = traceback.format_exc()
            msg = '取代理请求时报错,错误信息为:' + exstr
            info = warn(qid, 'ex_GetProxyFail', ip, msg)
            logger.debug("\n" + info)
            if count == 3:
                raise parser_except.ParserException(21, "取代理时失败")
            time.sleep(3)
            logger.debug("取代理失败,进行第{}次重试,".format(count))
            count += 1
    time_end = time.time() - time_st
    # The proxy service sometimes returns a proxy that is just ":"!
    if len(proxy_ip) < 9:
        msg = "获取到的代理不可用!"
        info = warn(qid, 'ex_GetProxyFail', ip, msg)
        logger.debug("\n" + info)
        raise parser_except.ParserException(21, "获取到的代理有误:{}".format(p))
    if not proxy_ip:
        msg = '未获取到代理,请求信息为:' + get_info
        info = warn(qid, 'ex_GetProxyFail', ip, msg)
        logger.debug("\n" + info)
        raise parser_except.ParserException(21, "未获取到代理")
    if p_time > 1.5:
        msg = '获取代理成功耗时, 耗时:{0}, requests 记录超时时间:{1}'.format(time_end, p_time)
        info = warn(qid, 'ex_GetProxyFail', ip, msg)
        logger.debug("\n" + info)
    p = [proxy_ip, [p, time_end, get_info]]
    return p
Beispiel #10
0
    def __crawl_data_str(self, request_template, browser):
        """
        Execute the HTTP request described by request_template and return
        ``(resp, content_length)``, mapping transport/HTTP failures onto
        ParserException codes.

        :param request_template: dict whose 'req' entry holds the kwargs
               passed to ``browser.req``.
        :param browser: browser/session object that performs the request.
        :return: tuple ``(resp, content_length)``.
        :raises parser_except.ParserException: proxy / timeout / HTTP /
                content-length failures.
        """
        resp = None
        try:
            # Usage change: the user mutates request_template in place.
            self.spider.prepare_request(request_template)

            # Pull the actual request kwargs out of the template.
            req = request_template['req']

            # Used for QPS throttling.
            if hasattr(self.spider, 'queue_info'):
                browser.queue_info = self.spider.queue_info

            if hasattr(self.spider.task, 'req_qid'):
                browser.qid = self.spider.task.req_qid
            else:
                browser.qid = ""
            browser.task_id = self.spider.task.task_id
            browser.source = self.spider.task.source
            browser.tid = self.spider.task.tid
            browser.ori_type = self.spider.task.ori_type

            resp = browser.req(**req)
            # Network error: raise for non-2xx status codes.
            resp.raise_for_status()

            content_length = len(resp.content)
            if isinstance(self.need_content_length, int):
                logger.debug(current_log_tag() +
                             '[爬虫 content_length={1} 检测][页面长度需要大于 {0}]'.format(
                                 self.need_content_length, content_length))
                if content_length <= self.need_content_length:
                    raise parser_except.ParserException(
                        parser_except.PROXY_INVALID, msg='data is empty')
            elif self.need_content_length is None:
                logger.debug(current_log_tag() + '[爬虫无需 content_length 检测]')
            else:
                logger.debug(current_log_tag() +
                             '[未知 content_length 检测类型][type: {0}]'.format(
                                 str(type(self.need_content_length))))
            return resp, content_length
        # Handler order is load-bearing: more specific requests exceptions
        # (SSL, proxy, the individual timeouts) must precede their bases
        # (Timeout, ConnectionError, RequestException).
        except requests.exceptions.SSLError as e:
            self.spider.response_error(request_template, resp, e)
            raise parser_except.ParserException(parser_except.PROXY_SSL,
                                                msg=str(e),
                                                error=e)
        except requests.exceptions.ProxyError as e:  # proxy no longer works
            self.spider.response_error(request_template, resp, e)
            raise parser_except.ParserException(parser_except.PROXY_INVALID,
                                                msg='Proxy Error',
                                                error=e)

        except requests.exceptions.ConnectTimeout as e:
            self.spider.response_error(request_template, resp, e)
            raise parser_except.ParserException(parser_except.PROXY_FORBIDDEN,
                                                msg='Request connect Timeout',
                                                error=e)
        except requests.exceptions.ReadTimeout as e:
            self.spider.response_error(request_template, resp, e)
            raise parser_except.ParserException(parser_except.PROXY_FORBIDDEN,
                                                msg='Request read Timeout',
                                                error=e)
        except requests.exceptions.Timeout as e:
            self.spider.response_error(request_template, resp, e)
            raise parser_except.ParserException(parser_except.PROXY_FORBIDDEN,
                                                msg='Request Timeout',
                                                error=e)

        except requests.exceptions.ConnectionError as err:
            self.spider.response_error(request_template, resp, err)
            raise parser_except.ParserException(parser_except.PROXY_INVALID,
                                                msg=str(err))

        except requests.exceptions.HTTPError as err:  # catches 4xx / 5xx status codes
            self.spider.response_error(request_template, resp, err)
            raise parser_except.ParserException(parser_except.PROXY_INVALID,
                                                msg=str(err),
                                                error=err)

        except requests.exceptions.RequestException as err:  # base requests error
            self.spider.response_error(request_template, resp, err)
            raise parser_except.ParserException(parser_except.PROXY_INVALID,
                                                msg=str(err),
                                                error=err)
        except Exception as e:  # final catch-all
            self.spider.response_error(request_template, resp, e)
            raise parser_except.ParserException(parser_except.PROXY_INVALID,
                                                msg=traceback.format_exc())
    def parse_detail(self, req, resp):
        """
        Parse a Shangri-La hotel detail sub-page and populate ``self.item``.

        Routing is by URL: 'about'+'service' → facilities/services,
        'about'+'map' → coordinates/traffic, 'reviews' → review iframe URL.

        :param req: request dict; ``req['req']['url']`` selects the branch.
        :param resp: raw HTML text of the page.
        :raises parser_except.ParserException: code 22 when an expected
                page structure is missing (treated as a proxy failure).
        """
        tree = etree.HTML(resp)
        req_url = req['req']['url']
        self.item['source'] = 'shangrila'
        self.item['brand_name'] = '香格里拉'

        if 'about' in req_url:
            if 'service' in req_url:

                hotel2 = Hotel_New()

                try:
                    service_all = tree.xpath(
                        "//div[@class='control2_1column']/ul/li/text()")
                    # Facility key -> list of Chinese aliases that may
                    # appear in the page text.
                    facilities_dict = {
                        'Swimming_Pool': ['游泳池'],
                        'gym': ['健身房'],
                        'SPA': ['SPA'],
                        'Bar': ['酒吧'],
                        'Coffee_house': ['咖啡厅'],
                        'Tennis_court': ['网球场'],
                        'Golf_Course': ['高尔夫球场'],
                        'Sauna': ['桑拿'],
                        'Mandara_Spa': ['水疗中心'],
                        'Recreation': ['儿童娱乐场', '儿童游乐场'],
                        'Business_Centre': ['商务中心'],
                        'Lounge': ['行政酒廊'],
                        'Wedding_hall': ['婚礼礼堂'],
                        'Restaurant': ['餐厅'],
                        'Airport_bus': ['机场班车', '班车服务', '班车服务(收费)'],
                        'Valet_Parking': ['代客泊车'],
                        'Call_service': ['叫车服务'],
                        'Rental_service': ['租车服务'],
                        'Room_wifi': ['客房无线网络'],
                        'Room_wired': ['客房有线网络'],
                        'Public_wifi': ['公共区域无线上网'],
                        'Public_wired': ['公共区域有线网络']
                    }
                    # Service key -> single Chinese label.
                    service_dict = {
                        'Luggage_Deposit': '行李寄存',
                        'front_desk': '24小时前台',
                        'Lobby_Manager': '24小时大堂经理',
                        '24Check_in': '24小时办理入住',
                        'Security': '24小时安保',
                        'Protocol': '礼宾服务',
                        'wake': '叫醒服务',
                        'Chinese_front': '中文前台',
                        'Postal_Service': '邮政服务',
                        'Fax_copy': '传真/复印',
                        'Laundry': '洗衣',
                        'polish_shoes': '擦鞋服务',
                        'Frontdesk_safe': '保险',
                        'fast_checkin': '快捷入住及退房服务',
                        'ATM': '自动柜员机(ATM)/银行服务',
                        'child_care': '儿',
                        'Food_delivery': '送餐服务'
                    }
                    reverse_service_dict = {
                        v: k
                        for k, v in service_dict.items()
                    }
                    for service in service_all:
                        for keys, fac_aliases in facilities_dict.items():
                            # Bug fix: the original tested
                            # `fac_value in service` with fac_value a
                            # *list*, which raises TypeError and (via the
                            # except below) nulled out all facility data.
                            # Match any of the aliases instead.
                            if any(alias in service
                                   for alias in fac_aliases):
                                service = self.clean_data(service)
                                if keys in hotel2.facility:
                                    hotel2.facility[keys] = hotel2.facility[
                                        keys] + ',' + service
                                else:
                                    hotel2.facility[keys] = service
                        for sev_value in service_dict.values():
                            if sev_value in service:
                                service = self.clean_data(service)
                                hotel2.service[
                                    reverse_service_dict[sev_value]] = service
                    self.item['service'] = hotel2.service
                    self.item['facility'] = hotel2.facility
                except Exception:
                    # Best-effort: fall back to NULL on any parse failure.
                    self.item['service'] = "NULL"
                    self.item['facility'] = "NULL"

            elif 'map' in req_url:
                try:
                    latitude = re.compile(r'"Lat":"(.*?)"',
                                          re.S).findall(resp)[0]
                    longitude = re.compile(r'"Lng":"(.*?)"',
                                           re.S).findall(resp)[0]
                except Exception:
                    raise parser_except.ParserException(22, '代理失效')
                self.item['latitude'] = latitude
                self.item['longitude'] = longitude
                map_list = tree.xpath(
                    "//div[@class='control2_1column']/div[@class='map-list']/div/h4/text()"
                )
                self.item['traffic'] = "NULL"
                traffic_str_all = ""
                index = 1
                # Concatenate "<heading>:<text>" for every map-list section.
                for tra_str in map_list:
                    traffic_str_l = tree.xpath(
                        "//div[@class='control2_1column']/div[@class='map-list'][{}]/div/p/text()"
                        .format(index))
                    traffic_str = " ".join(traffic_str_l).strip().replace(
                        " ", "")
                    traffic_str_all += tra_str + ":" + traffic_str
                    self.item['traffic'] = traffic_str_all
                    index += 1
                return

        elif 'reviews' in req_url:
            self.flag = True

            try:
                link = tree.xpath(
                    '//iframe[contains(@id, "ChildFrame")]/@src')[0]
            except Exception:
                raise parser_except.ParserException(22, 'proxy error')

            self.review_url = link
            # Scheme-relative iframe links need an explicit scheme.
            if 'http' not in link:
                self.review_url = 'http:' + link

            self.review_url = self.review_url.strip()
    def parse_index(self, req, resp):
        req_url = req['req']['url']
        # print req_url
        # print req_url

        if 'NavigationMainMenuJson' in req_url:
            # print resp
            node_list = resp['MainMenu']
            self.info_list.extend([
                'http://www.shangri-la.com{}'.format(node['Url'])
                for node in node_list
                if 'about' in node['Url'] or 'reviews' in node['Url']
            ])

        elif 'NavigationJson' in req_url:
            # print resp
            try:
                node_list = resp['NaviMenu']
                self.info_list.extend([
                    'http://www.shangri-la.com{}'.format(node['Url'])
                    for node in node_list
                    if 'map' in node['Url'] or 'service' in node['Url']
                ])
            except KeyError:
                raise parser_except.ParserException(
                    22, '请求失效,失效url为:{}'.format(req_url))
        else:
            tree = etree.HTML(resp)
            description = re.compile(r'<p>(.*?)</p>').findall(resp)
            description_info = ''
            for des in description:
                if u'本酒店可接受以下信用卡付款' in des or u'退房时用信用卡结账需' in des:
                    pass
                else:
                    description_info += des
            # print description_info

            self.item['description'] = description_info
            try:
                hotel_phone = tree.xpath(
                    "//span[@id='ctl00_ContentPlaceHolder1_ltrPhone']/text()"
                )[0]
                self.item['hotel_phone'] = hotel_phone
            except Exception as e:
                self.item['hotel_phone'] = 'NULL'
            self.item['source_id'] = self.hotel_code

            self.img_url = 'http://www.shangri-la.com/HotelPhotoVideoJson.json?hotel_code={}&lang=cn'.format(
                self.hotel_code)

            # hotel_name_start = tree.xpath('//div[@class="logoOverLayer"]/img/@alt')[0]
            # print resp
            try:
                hotel_name_info = tree.xpath(
                    '//meta[@property="og:title"]/@content')[0]
            except:
                raise parser_except.ParserException(22, 'proxy error')
            # title = tree.xpath('//title/text()')
            # print title
            # print hotel_name_info
            if '五星级' in hotel_name_info:
                self.item['star'] = 5
            elif '四星级' in hotel_name_info:
                self.item['star'] = 4
            elif '三星级' in hotel_name_info:
                self.item['star'] = 3
            else:
                self.item['star'] = ''
            # hotel_name = hotel_name_info.split('|')[-1]
            # self.item['hotel_name'] = hotel_name
            self.item['hotel_name_en'] = self.hotel_name
            try:
                # print resp
                post_code = tree.xpath(
                    '//div[@class="widget-mid"]//span[@id="ctl00_ContentPlaceHolder1_ltrAddress"]/text()'
                )
                if not post_code:
                    post_code = tree.xpath(
                        '//span[@id="ctl00_ContentPlaceHolder1_ltrAddress"]/text()'
                    )

                if len(post_code) <= 1:
                    p_codes = post_code[0].split(',')
                    try:
                        p_code = p_codes[-2]
                        n = re.compile(r'(\d+)').findall(p_code)
                        if not n:
                            p_code = ''
                    except:
                        p_code = ''
                    address = post_code[0]
                else:
                    address = post_code[0]
                    p_code = re.compile(r'\d+').findall(post_code[1])[0]
            except:
                post_code = ''
                p_code = ''
                address = ''

            self.item['city'] = self.city

            self.item['postal_code'] = p_code
            self.item['address'] = address
            print req['req']['url']
            # print resp
            try:
                self.img_first = 'http://www.shangri-la.com{}'.format(
                    tree.xpath('//div[@id="background"]/img/@src')[0])
            except:
                self.img_first = ''
            try:
                check_time = tree.xpath(
                    '//span[@id="ctl00_ContentPlaceHolder1_ltrChkInOut"]/text()'
                )

                if not check_time:
                    check_time = tree.xpath(
                        '//span[@id="ctl00_ContentPlaceHolder1_ltrChkInOut"]/p/text()'
                    )

                    if check_time[1] == u'\xa0':
                        check_time1 = tree.xpath(
                            '//span[@id="ctl00_ContentPlaceHolder1_ltrChkInOut"]/p/text()'
                        )[0]
                        check_time2 = tree.xpath(
                            '//span[@id="ctl00_ContentPlaceHolder1_ltrChkInOut"]/p/span/text()'
                        )[0]
                        check_time = [check_time1, check_time2]

                try:
                    # print check_time[0]
                    self.item['check_in_time'] = check_time[0].split(':')[1]
                    self.item['check_out_time'] = check_time[1].split(':')[1]
                except:
                    self.item['check_in_time'] = ''
                    self.item['check_out_time'] = ''
            except:
                self.item['check_in_time'] = ''
                self.item['check_out_time'] = ''
            try:
                accepted_card_info = tree.xpath(
                    '//span[contains(@id, "ctl00_ContentPlaceHolder1_ltrPayment")]/p/text()'
                )
                if not accepted_card_info:
                    accepted_card_info = tree.xpath(
                        '//span[contains(@id, "ctl00_ContentPlaceHolder1_ltrPayment")]/text()'
                    )
                if len(accepted_card_info) <= 1:
                    accepted_card_infos = accepted_card_info[0].replace(
                        ':', ':')
                    accepted_cards = accepted_card_infos.split(':')[-1].replace('、', '|').replace(',', '|'). \
                        replace(u'及', "|").replace('。', '')
                else:
                    accepted_cards = accepted_card_info[-1].replace('、', '|').replace(',', '|'). \
                        replace(u'及', "|").replace('。', '')
            except:
                accepted_cards = ''
            self.item['accepted_cards'] = accepted_cards
Beispiel #13
0
    def parse_English_hotel(self, req, resp):
        """Parse an English-language Hyatt hotel detail page into ``self.hotel_test``.

        Fills source ids, coordinates, address, wifi flags, images,
        description and phone from the page HTML.

        :param req: request dict for this fetch (not used here)
        :param resp: raw HTML text of the hotel detail page
        :raises parser_except.ParserException: code 22 when the hotel name is
            missing from the page, treated as a dead proxy so the caller retries.
        """
        data = etree.HTML(resp)
        try:
            self.hotel_test['hotel_name_en'] = data.xpath(
                "//p[@class='homePropertyName']//text()")[0]
        except:
            # A page without the hotel name means the proxy served junk.
            raise parser_except.ParserException(22, '代理失效,重试')
        self.hotel_test['source'] = 'hyatt'
        self.hotel_test['brand_name'] = '凯悦'
        self.hotel_test['source_id'] = re.findall(r"var spiritCode='(.*?)'",
                                                  resp, re.S)[0].encode('utf8')

        latitude = re.findall(r'"latitude" : "(.*?)"', resp,
                              re.S)[0].encode('utf8')
        # BUG FIX: longitude used to be scraped with the "latitude" pattern,
        # so map_info contained the latitude twice. Use the "longitude" key
        # (same quoting/spacing as the latitude entry on the page).
        longitude = re.findall(r'"longitude" : "(.*?)"', resp,
                               re.S)[0].encode('utf8')
        # Stored as "lng,lat".
        self.hotel_test['map_info'] = longitude + ',' + latitude
        self.hotel_test['address'] = data.xpath(
            "//p[@class='address']//text()")[0]
        self.hotel_test['hotel_city'] = re.findall(r'hotel_city:"(.*?)"', resp,
                                                   re.S)[0].encode('utf-8')
        self.hotel_test['hotel_country'] = data.xpath(
            "//p[@class='address']/span[1]/text()")[0]
        try:
            self.hotel_test['hotel_postal_code'] = data.xpath(
                "//p[@class='address']/span[2]/text()")[0]
        except:
            self.hotel_test['hotel_postal_code'] = 'NULL'
        self.hotel_test['star'] = 5
        # grade / review_num / check-in / check-out are filled by an earlier
        # step; the self-assignments are no-ops kept so a missing key still
        # fails loudly with KeyError.
        self.hotel_test['grade'] = self.hotel_test['grade']
        self.hotel_test['review_num'] = self.hotel_test['review_num']

        # The page signals free wifi only via this specific badge image.
        wifi = data.xpath(
            "//img[@src='/content/dam/PropertyWebsites/andaz/nycaw/Media/All/xBHSG_RightRail_Four_101216.png.pagespeed.ic.1FAby01L8_.png']"
        )
        if len(wifi):
            self.hotel_test['has_wifi'] = 'Free WiFi'
            self.hotel_test['is_wifi_free'] = 'YES'
        else:
            self.hotel_test['has_wifi'] = ''
            self.hotel_test['is_wifi_free'] = 'NULL'

        self.hotel_test['has_parking'] = 'NULL'
        self.hotel_test['is_parking_free'] = 'NULL'
        self.hotel_test['services'] = ''

        imgs = data.xpath(
            "//div[@class='carousel fullWidth floatL']//a/img/@src")[0]
        url = self.url_en.split('/en')[0]
        self.hotel_test['img_items'] = url + imgs

        self.hotel_test['description'] = data.xpath(
            "//div[@class='readMoreContent']//text()")[0]
        # BUG FIX: the old code did ``while '\n' in description:
        # description.remove('\n')`` -- str has no remove(), so any newline
        # in the description raised AttributeError. Strip newlines instead.
        self.hotel_test['description'] = self.hotel_test['description'].replace(
            '\n', '')
        self.hotel_test['accepted_cards'] = 'NULL'
        self.hotel_test['check_in_time'] = self.hotel_test['check_in_time']
        self.hotel_test['check_out_time'] = self.hotel_test['check_out_time']
        self.hotel_test['hotel_url'] = self.url_en
        try:
            self.hotel_test['Img_first'] = self.url + data.xpath(
                "//div[@class='carousel fullWidth floatL']//@src")[0]
        except:
            self.hotel_test['Img_first'] = ''

        # Phone appears as e.g. "+1 234 5678"; keep the part after '+' and
        # drop the spaces.
        phone = data.xpath("//p[@class='phnNo']/text()")[0].split('+')[1]
        self.hotel_test['hotel_phone'] = phone.replace(' ', '')
Beispiel #14
0
    def parse_room_util(self, req, resp):
        """
        解析方法的工具方法,用于parse_Room和parse_verifyRoom

        Shared parsing helper for parse_Room and parse_verifyRoom: validates
        the API response, extracts every RatePlan into a Room object plus a
        flat tuple, and reports the outcome through use_record_api when the
        task runs against the online API environment.

        :param req: request dict; ``req['resp'].status_code`` is reported
        :param resp: raw XML text of the PriceSearchRequest response
        :return: list of room tuples (one per RatePlan)
        :raises parser_except.ParserException: 122 bad auth, 99 unknown
            location, 291 no rooms, E_UNKNOWN on any parsing failure
        """

        def _record(error_id, is_success):
            # Report this call's outcome -- only for the online environment.
            # (The five call sites below used to repeat this whole block.)
            if json.loads(self.task.ticket_info['auth']).get(
                    'apienv', 'test') == 'online':
                use_record_api(task=self.task,
                               api_name='PriceSearchRequest',
                               unionkey='daolvApi',
                               record_tuple=1,
                               error_id=error_id,
                               api_info={},
                               msg='',
                               httpcode=req['resp'].status_code,
                               resp='',
                               is_success=is_success)

        if "Invalid Auth" in resp:
            _record(122, 1)
            raise parser_except.ParserException(122, '认证信息失败')
        if "Location not found for given descriptor and type" in resp:
            _record(99, 1)
            raise parser_except.ParserException(
                99, 'Location not found for given descriptor and type')
        request_info = self.task_parser()
        redis_key = request_info['redis_key']
        result = []
        try:
            doc = etree.XML(resp)
            RatePlans = doc.xpath('.//RatePlan')
            hotel_name = self.get_first_str(
                doc.xpath('.//Hotel/HotelName/text()'))
            hotel_id = self.get_first_str(doc.xpath('.//Hotel/HotelID/text()'))
            for offer_count, each in enumerate(RatePlans):
                try:
                    room = Room()
                    room.hotel_name = hotel_name
                    room.city = request_info.get('city')
                    room.source = 'daolvApi'
                    room.source_hotelid = hotel_id
                    room.real_source = room.source
                    room.room_type = self.get_first_str(
                        each.xpath('./RatePlanName/text()'))
                    room.occupancy = int(
                        self.get_first_str(
                            each.xpath('./MaxOccupancy/text()')))
                    bed_type = self.get_first_str(
                        each.xpath('./BedType/text()'))
                    # The API returns a bed-type id; bed_type_dict maps it to
                    # readable text. It changes rarely, so it lives as a dict
                    # in bed_type_json.py in this directory -- update there.
                    bed_type = bed_type_dict.get(str(bed_type), '暂时没有对应信息')
                    if isinstance(bed_type, list):
                        room.bed_type = '{0}; {1}'.format(
                            bed_type[0], bed_type[1])
                    else:
                        room.bed_type = bed_type
                    room.check_in = request_info.get('CheckInDate')
                    room.check_out = request_info.get('CheckOutDate')
                    try:
                        room.rest = int(
                            self.get_first_str(
                                each.xpath('./InventoryCount/text()')))
                    except Exception as e:
                        # No inventory count in the response: fall back to the
                        # number of rooms requested by the task.
                        room.rest = len(self.task.ticket_info['room_info'])
                    room.price = float(
                        self.get_first_str(each.xpath('./TotalPrice/text()')))
                    room.currency = each.find('Currency').text
                    room.has_breakfast = self.has_breakfast(
                        self.get_first_str(
                            each.xpath('./BreakfastType/text()')))
                    room.is_breakfast_free = room.has_breakfast
                    return_rule = each.xpath(
                        './RatePlanCancellationPolicyList/CancellationPolicy')
                    room.return_rule = self.return_rule_str(return_rule)
                    if room.return_rule == 'NULL':
                        room.is_cancel_free = 'No'
                        room.return_rule = ''

                    room.others_info = json.dumps({
                        "rate_key":
                        self.get_first_str(each.xpath('./RatePlanID/text()')),
                        "room_num":
                        len(self.task.ticket_info["room_info"]),
                        "payment_info":
                        "",
                        "rating":
                        "",
                        "payKey": {
                            "redis_key": redis_key,
                            "uid": getuid(),
                            "id": offer_count,
                        },
                        'extra': {
                            'breakfast':
                            self.get_first_str(
                                each.xpath("./BreakfastType/text()")),
                            'payment':
                            '',
                            'size_info':
                            '',
                            'return_rule':
                            room.return_rule,
                            'occ_des':
                            str(room.occupancy),
                            'occ_num': {
                                'adult_num': room.occupancy,
                                'child_num': 0,
                            },
                            'size_info_extra':
                            0
                        }
                    })

                    room.pay_method = 'mioji'
                    self.room_obj_list.append(room)
                    room_tuple = (room.hotel_name, room.city, room.source,
                                  room.source_hotelid, room.source_roomid,
                                  room.real_source, room.room_type,
                                  room.occupancy, room.bed_type, room.size,
                                  room.floor, room.check_in, room.check_out,
                                  room.rest, room.price, room.tax,
                                  room.currency, room.pay_method,
                                  room.is_extrabed, room.is_extrabed_free,
                                  room.has_breakfast, room.is_breakfast_free,
                                  room.is_cancel_free, room.room_desc,
                                  room.return_rule, room.extrabed_rule,
                                  room.change_rule, room.others_info,
                                  room.guest_info)
                    result.append(room_tuple)
                except Exception as e:
                    logger.error('field not comple %s\n', str(e))
                    raise parser_except.ParserException(
                        ErrNumber.E_UNKNOWN, str(e))
        except Exception as e:
            _record(25, 0)
            raise parser_except.ParserException(ErrNumber.E_UNKNOWN, str(e))

        if not result:
            _record(291, 0)
            raise parser_except.ParserException(291, '无房')
        # NOTE(review): the success path reported is_success=0 in the original
        # code, which looks inconsistent with the is_success=1 used for the
        # auth/location failures above. Preserved as-is -- confirm intent.
        _record(0, 0)
        return result
Beispiel #15
0
    def __single_crawl(self, reqParse, browser, request_template, page_count):
        """Perform one crawl request, with retry, convert and parse stages.

        Loops up to ``reqParse.retry_count`` times: fetches the page through
        ``reqParse.crawl_data``, optionally uploads the raw page, converts the
        payload and parses it into per-target results. Proxy/request errors
        (codes 21/22/23, SSL, or explicitly retryable ones) trigger a retry;
        user-raised exceptions are passed straight through.

        :param reqParse: request/parse wrapper holding retry state and hooks
        :param browser: browser object used for the actual HTTP fetch
        :param request_template: request dict; its 'req' sub-dict describes
            the HTTP call, and 'resp' is filled in with the response here
        :param page_count: index of the page being crawled (bookkeeping)
        :return: mapping of target name -> parsed items (defaultdict(list)
            if parsing never succeeded)
        """
        # Headers from earlier requests in the chain may be carried over.
        headers = request_template['req'].get('headers', None)
        use_headers = request_template['req'].get('use_headers', False)
        if headers:
            browser.add_header(headers, use_headers)

        # Default value for res, in case no parse attempt ever succeeds.
        res = defaultdict(list)

        # Initialise per-request retry state on the reqParse object.

        local_req_count = 0
        reqParse.req_count = 0
        reqParse.is_forbidden = False
        reqParse.req_exception = None
        reqParse.proxy = None
        reqParse.content_length = 0

        # NOTE(review): this adds an absolute timestamp (ms) to __cpu_time,
        # while the later updates add elapsed-time deltas. Presumably a
        # matching subtraction happens elsewhere -- confirm before touching.
        self.__cpu_time += time.time() * 1000

        while local_req_count < reqParse.retry_count:
            # Count this attempt.
            local_req_count += 1
            logger.debug(
                current_log_tag() +
                '[开始抓取][ {0} ]'.format(request_template['req'].get('url', '')))
            # Request count lives on reqParse so proxy errors raised during
            # parse can drive a re-fetch.
            try:
                resp = reqParse.crawl_data(request_template, browser,
                                           self.task.source)
            except parser_except.ParserException as e:
                traceback.print_exc()
                if reqParse.user_exc:
                    # Re-raise errors the user's own code raised.
                    raise e
                # Error codes 21/22/23, or the developer marked it retryable.
                if e.code in (parser_except.PROXY_FORBIDDEN,
                              parser_except.PROXY_INVALID,
                              parser_except.REQ_ERROR,
                              parser_except.PROXY_SSL) or e.need_retry:
                    reqParse.is_forbidden = True

                    if local_req_count >= reqParse.retry_count or e.retry_from_first:
                        raise e
                    else:
                        logger.debug(current_log_tag() +
                                     traceback.format_exc())
                        logger.debug(current_log_tag() +
                                     '[准备重试][错误由框架抛出][错误码:{0}][count:{1}]'.
                                     format(e.code, reqParse.req_count))
                        continue
                else:
                    raise e
            except Exception as e:
                if reqParse.user_exc:
                    # Re-raise errors the user's own code raised.
                    raise e
                if local_req_count >= reqParse.retry_count:
                    raise e
                else:
                    continue

                    # Attach the response to the request template.
            request_template['resp'] = resp
            # Log / persist the crawl result.
            self.response_callback(request_template, resp)
            if reqParse.res_text == 'text':
                res = resp.text
            else:
                res = resp.content
            try:
                logger.debug(current_log_tag() +
                             '[抓取结果][ {2} ][ {0} ... ... {1} ]'.format(
                                 res[:100], res[-100:], request_template['req']
                                 ['url']).replace('\n', '').replace('\t', ''))
            except Exception:
                pass
            # When running locally or in debug, skip the upload step.
            if not self.debug and self.env != "local":
                md5_key = get_md5(res)
                verify_task_info = {
                    'func_name': reqParse.request_func.__name__,
                    'page_index': page_count,
                    'retry_count': local_req_count - 1,
                    'md5_key': md5_key
                }
                # Queue the raw page for upload to ucloud.
                self.task_post_process_queue.put((res, self.task, md5_key))
                self.verify_data['data'].append(verify_task_info)

            point_time = time.time() * 1000
            try:
                convert_data = reqParse.convert(request_template, res)
            except Exception:
                if local_req_count >= reqParse.retry_count:
                    logger.debug(current_log_tag() + traceback.format_exc())
                    raise parser_except.ParserException(
                        parser_except.DATA_FORMAT_ERROR,
                        '[traceback: {0}]'.format(traceback.format_exc()))
                else:
                    continue
            finally:
                self.__cpu_time += time.time() * 1000 - point_time

            # Parsing stage.
            point_time = time.time() * 1000
            try:
                res = reqParse.parse(request_template,
                                     self.__targets_parser_func_dict,
                                     convert_data, page_count,
                                     self._crawl_targets_required)

                break
            except parser_except.ParserException as e:
                if e.code in (parser_except.PROXY_FORBIDDEN,
                              parser_except.PROXY_INVALID):
                    reqParse.is_forbidden = True

                    if local_req_count >= reqParse.retry_count or e.retry_from_first:
                        raise e
                    else:
                        logger.debug(current_log_tag() +
                                     '[准备重试][错误由爬虫抛出][错误码:{0}]'.format(e.code))
                        convert_data = None
                        continue
                else:
                    raise e
            except Exception:
                raise parser_except.ParserException(
                    parser_except.PARSE_ERROR,
                    '[traceback:{0}]'.format(traceback.format_exc()))
            finally:
                self.__cpu_time += time.time() * 1000 - point_time
                self.response_callback(request_template, resp)
        have_ticket = False
        for k, v in res.items():
            if not v:
                continue
            self._asy_temp_result[k] += v
            have_ticket = True
        # Async callback only when we got tickets AND the spider was invoked
        # by a slave (never in debug/local runs).
        if have_ticket and self.process_callback and not self.debug and self.env != "local":
            self.process_callback(task=self.task,
                                  spider=self,
                                  result_type="RUNNING")

        return res