コード例 #1
0
def holiday_parser(content, url, other_info):
    """
    Parse a hotel detail page of the 'holiday' source into a hotel record.

    :param content: tuple of 2 or 3 raw response bodies. The first is JSON
        holding a ``hotelInfo`` object, the second is markup parsed with
        ``etree.HTML``, and the optional third is JSON that is only used to
        read the hotel's English name.
    :param url: URL of the hotel detail page; the hotel code embedded in it
        is the fallback for ``source_id``.
    :param other_info: dict that may carry ``city_id``, ``source_id`` and
        ``source_city_id``.
    :return: the result of ``HotelNewBase.to_dict()`` for the parsed hotel.
    """
    hotel = HotelNewBase()

    if len(content) == 3:
        content1, content2, content3 = content
    else:
        content1, content2 = content
        content3 = None

    # The English name is optional: a missing or malformed third payload
    # must not abort the whole parse.
    hotel_name_en = ''
    if content3 is not None:
        try:
            hotel_name_en = json.loads(
                content3)['hotelInfo']['profile']['name']
        except (ValueError, KeyError, TypeError):
            pass

    re_match = re.search(r'/hotels/cn/zh/(\w+)/hoteldetail', url)
    hotel_code = re_match.group(1) if re_match else ''

    resp = json.loads(content1)['hotelInfo']
    # Fall back to empty dicts so that a missing (or null) section yields
    # empty fields instead of an AttributeError on ``''.get``.
    profile = resp.get('profile') or {}
    address = resp.get('address') or {}
    policies = resp.get('policies') or {}
    brand_info = resp.get('brandInfo') or {}
    country = address.get('country') or {}

    hotel.hotel_url = url
    hotel.hotel_name = profile.get('name', '')
    hotel.hotel_name_en = hotel_name_en
    hotel.source = 'holiday'
    hotel.source_id = other_info.get('source_id', '') or hotel_code
    hotel.brand_name = brand_info.get('brandName', '')
    hotel.map_info = '%s,%s' % (profile.get('longitude', ''),
                                profile.get('latitude', ''))
    hotel.address = get_all_street(resp)
    hotel.city = address.get('city', '')
    hotel.country = country.get('name', '')
    hotel.city_id = other_info.get('city_id', '')
    hotel.postal_code = address.get('zip', '')
    hotel.star = '-1'
    hotel.grade = profile.get('averageReview', '')
    hotel.review_num = profile.get('totalReviews', '')
    hotel.check_in_time = policies.get('checkinTime', '')
    hotel.check_out_time = policies.get('checkoutTime', '')

    # Img_first is only assigned when the profile carries a primary image.
    first_img = ''
    primary_image = profile.get('primaryImageUrl') or {}
    if primary_image:
        first_img = primary_image.get('originalUrl', '')
        hotel.Img_first = first_img

    hotel.description = (profile.get('longDescription') or '') + '\n' + (
        profile.get('shortDescription') or '')

    facilities_dict = {
        'Swimming_Pool': '泳池',
        'gym': '健身',
        'SPA': 'SPA',
        'Bar': '酒吧',
        'Coffee_house': '咖啡厅',
        'Tennis_court': '网球场',
        'Golf_Course': '高尔夫球场',
        'Sauna': '桑拿',
        'Mandara_Spa': '水疗中心',
        'Recreation': '儿童娱乐场',
        'Business_Centre': '商务中心',
        'Lounge': '行政酒廊',
        'Wedding_hall': '婚礼礼堂',
        'Restaurant': '餐厅',
        'Parking': '停车',
        'Airport_bus': '机场班车',
        'Valet_Parking': '代客泊车',
        'Call_service': '叫车服务',
        'Rental_service': '租车服务',
        'Room_wifi': '无线互联网',
        'Room_wired': '有线互联网',
        'Public_wifi': '无线互联网',
        'Public_wired': '有线互联网'
    }
    # Label -> field key. Two labels appear under two keys each (room/public
    # wifi and room/public wired), so the reversed map keeps only one key
    # per label.
    # NOTE(review): confirm whether both keys were meant to be filled.
    reverse_facility_dict = {v: k for k, v in facilities_dict.items()}
    service_dict = {
        'Luggage_Deposit': '行李寄存',
        'front_desk': '24小时前台',
        'Lobby_Manager': '24小时大堂经理',
        '24Check_in': '24小时办理入住',
        'Security': '24小时安保',
        'Protocol': '礼宾服务',
        'wake': '叫醒服务',
        'Chinese_front': '中文前台',
        'Postal_Service': '邮政服务',
        'Fax_copy': '传真/复印',
        'Laundry': '洗衣服务',
        'polish_shoes': '擦鞋服务',
        'Frontdesk_safe': '保险',
        'fast_checkin': '快速办理入住',
        'ATM': '自动柜员机(ATM)/银行服务',
        'child_care': '儿童看护',
        'Food_delivery': '送餐服务'
    }
    reverse_service_dict = {v: k for k, v in service_dict.items()}

    for each in resp.get('facilities') or []:
        if each['id'] in ('NO_PETS_ALLOWED', 'PETS_ALLOWED'):
            hotel.pet_type = each['name']
        name = each['name']
        # A facility entry is recorded under every field whose label occurs
        # in the entry's display name.
        for label, field in reverse_facility_dict.items():
            if label in name:
                hotel.facility_content[field] = name
        for label, field in reverse_service_dict.items():
            if label in name:
                hotel.service_content[field] = name

    fea_str = get_api_server(resp)
    tree = etree.HTML(content2)
    ser_str = get_ota_server(tree, '上网', '互联网', '泳', '退房', '餐', '预定', '停车',
                             '健身', '运动', '泳池', '特色', '服务')
    hotel.others_info = json.dumps({
        # NOTE(review): city and country have always been emitted empty
        # here (they were looked up in a dict that never held them);
        # confirm whether hotel.city / hotel.country were intended.
        'city': '',
        'country': '',
        'first_img': first_img,
        'source_city_id': other_info.get('source_city_id', ''),
        'hotel_services_info': fea_str + ser_str
    })
    hotel.img_items = get_all_pics(tree)
    hotel.hotel_zip_code = hotel.postal_code
    return hotel.to_dict()
コード例 #2
0
def gha_parser(total_content, url, other_info):
    """
    Parse a hotel detail page of the 'gha' source into a hotel record.

    Hotel data is read from two JSON blobs embedded in the page (the
    ``pins.gha_hotel.push(...)`` call and the ``application/ld+json``
    script) and from the parsed markup. When the page references an
    external script, that script is downloaded with ``requests.get`` and
    the rating and review count are extracted from it.

    :param total_content: raw markup of the hotel detail page.
    :param url: URL of the hotel detail page, stored as ``hotel_url``.
    :param other_info: dict that may carry ``city_id``.
    :return: the result of ``HotelNewBase.to_dict()`` for the parsed hotel.
    """
    hotel = HotelNewBase()
    hotel.city_id = other_info.get("city_id", "NULL")
    hotel.hotel_url = url

    select = etree.HTML(total_content)
    pin_re = re.compile(r"pins\.gha_hotel\.push\((.*?)\)", re.S)
    ld_json_re = re.compile(
        r'<script type="application/ld\+json">(.*?)</script>', re.S)
    # Tabs are stripped from the ld+json payload before it is decoded.
    address = json.loads(
        ld_json_re.findall(total_content)[0].replace('\t', ''))
    info = json.loads(pin_re.findall(total_content)[0])

    hotel.hotel_name = info["title"]
    hotel.hotel_name_en = address["name"]
    hotel.source = "gha"
    hotel.source_id = info["id"]
    hotel.brand_name = info["brand_name"]
    hotel.map_info = str(info["lon"]) + "," + str(info["lat"])
    # NOTE(review): "adress" is not a standard HTML element name; confirm
    # against the live page whether "//address" was intended.
    hotel.address = ''.join(select.xpath("//adress/text()")).strip()
    hotel.country = address["address"]["addressCountry"]
    hotel.city = address["address"]["addressLocality"]
    hotel.postal_code = address["address"]["postalCode"]
    hotel.star = '5'
    # Store a single URL rather than the whole xpath result list, matching
    # the other parsers in this file.
    first_images = select.xpath(
        "//div[@class='FlexEmbed-item']/span/img/@src")
    if first_images:
        hotel.Img_first = first_images[0]
    hotel.hotel_phone = address.get("telephone", 'NULL')
    hotel.hotel_zip_code = address["address"]["postalCode"]

    service = select.xpath('//ul[@class="prop-Amenities"]/li/span/text()')
    servicestr = ''.join(service)
    description = select.xpath("//div[@id='content-about-hotel']/p/text()")
    hotel.description = ''.join(description)

    # (text to search, keyword, facility key, stored label)
    facility_rules = (
        (servicestr, u'无线', "Room_wifi", u'无线上网'),
        (servicestr, u'无线', "Public_wifi", u'无线上网'),
        (servicestr, u'泳', "Swimming_Pool", u'泳池'),
        (servicestr, u'健身', "gym", u"健身中心"),
        (servicestr, u'水疗', 'Mandara_Spa', u"水疗中心"),
        (hotel.description, u'酒吧', "Bar", u'酒吧'),
        (hotel.description, u'儿童俱乐部', "Recreation", u"儿童俱乐部"),
        (servicestr, u'餐', "Restaurant", u"餐饮"),
        (servicestr, u'商务中心', "Business_Centre", u'商务中心'),
    )
    for haystack, keyword, key, label in facility_rules:
        if keyword in haystack:
            hotel.facility[key] = label
    if u'亲子' in servicestr:
        hotel.feature["Parent_child"] = u'亲子'

    img_list = select.xpath('//div[@class="RotateBanner-itemImg"]/span/@style')
    imgurl = re.compile(r"url\('(.*?)'\)")
    imgurl_list = []
    for style in img_list:
        # Take the first url(...) of each style; skip styles without one
        # instead of raising IndexError.
        imgurl_list.extend(imgurl.findall(style)[:1])
    hotel.img_items = '|'.join(imgurl_list)

    hotel.check_in_time = '14:00'
    hotel.check_out_time = '12:00'

    urls = re.compile('<script src="//(.*?)"').findall(total_content)
    if not urls or not urls[0]:
        # No external script to read reviews from: report an empty rating.
        hotel.grade = '0.0'
        hotel.review_num = 0
        return hotel.to_dict()

    comment = requests.get("http://" + urls[0]).content
    # The patterns below match escaped markup: a literal backslash before
    # each quote and a literal backslash followed by "n".
    grade = re.compile(r'<div class=\\"rating-value\\">\\n(.*?)%', re.S)
    try:
        hotel.grade = str(float(grade.findall(comment)[0].strip()) / 10)
    except (IndexError, ValueError):
        hotel.grade = '0.0'
    review = re.compile(r'<div class=\\"review-count\\">\\n(.*?)reviews',
                        re.S)
    try:
        hotel.review_num = review.findall(comment)[0].strip()
    except IndexError:
        hotel.review_num = 0

    result = hotel.to_dict()
    print(result)
    return result
コード例 #3
0
    def parse_hotel(self, req, resp):
        """
        Build the 'hyatt' hotel record from ``self.hotel_test``.

        Each entry of ``self.hotel_test['services']`` is lower-cased and
        matched against an ordered keyword table; the first matching rule
        stores the entry in the corresponding service / facility / feature
        slot of the hotel.

        :param req: not used by this method.
        :param resp: not used by this method.
        :return: ``json.loads`` of ``Hotel_New.to_dict()``.
        """
        info = self.hotel_test
        hotel = Hotel_New()
        hotel.hotel_name = 'NULL'
        hotel.hotel_name_en = info['hotel_name_en']
        hotel.source = 'hyatt'
        hotel.source_id = info['source_id']
        hotel.brand_name = 'NULL'
        hotel.map_info = info['map_info']
        hotel.address = info['address']
        hotel.city = info['hotel_city']
        hotel.country = info['hotel_country']
        hotel.postal_code = info['hotel_postal_code']
        hotel.star = 5
        hotel.grade = 'NULL'
        hotel.review_num = 'NULL'
        hotel.Img_first = info['Img_first']
        hotel.hotel_phone = info['hotel_phone']
        hotel.hotel_zip_code = info['hotel_postal_code']
        hotel.traffic = ''
        hotel.chiled_bed_type = info['chiled_bed_type']
        hotel.pet_type = ''
        if info['has_wifi']:
            hotel.facility['Room_wifi'] = info['has_wifi']

        # Ordered rules: (hotel attribute, key, keywords). Only the first
        # matching rule applies to an entry, so more specific keywords must
        # come before generic ones ('valet parking' before 'parking',
        # 'sereno spa' before 'spa'). Keywords are lower-case because the
        # entry is lower-cased before matching.
        keyword_rules = (
            ('service', 'Fax_copy', ('faxing',)),
            ('service', 'Postal_Service', ('postal',)),
            ('service', 'Laundry', ('laundry',)),
            ('service', 'Food_delivery', ('room service',)),
            ('service', 'Protocol', ('concierge service',)),
            ('service', 'child_care', ('babysitting',)),
            ('service', 'polish_shoes', ('shoeshine',)),
            ('facility', 'Valet_Parking', ('valet parking',)),
            ('facility', 'Parking', ('parking',)),
            ('facility', 'Room_wifi', ('wifi', 'wi-fi')),
            ('facility', 'Swimming_Pool', ('pool',)),
            ('facility', 'gym', ('gym',)),
            ('facility', 'Bar', ('bar',)),
            # Key aligned with the 'Coffee_house' facility key used by the
            # other parsers in this file.
            ('facility', 'Coffee_house', ('coffee',)),
            ('facility', 'Mandara_Spa', ('sereno spa',)),
            ('facility', 'SPA', ('spa',)),
            ('facility', 'Golf_Course', ('golf',)),
            ('facility', 'Restaurant', ('restaurant',)),
            ('facility', 'Sauna', ('sauna',)),
            ('facility', 'Airport_bus', ('service to airport',
                                         'shuttle airport')),
            ('facility', 'Wedding_hall', ('wedding',)),
            ('facility', 'Business_Centre', ('business centre',)),
            ('facility', 'Tennis_court', ('tennis',)),
            ('feature', 'China_Friendly', ('china_friendly',)),
            ('feature', 'Romantic_lovers', ('romantic_lovers',)),
            ('feature', 'Parent_child', ('parent_child',)),
            ('feature', 'Beach_Scene', ('beach_scene',)),
            ('feature', 'Hot_spring', ('hot_spring',)),
            ('feature', 'Japanese_Hotel', ('japanese_hotel',)),
            ('feature', 'Vacation', ('vacation',)),
        )
        for entry in info['services']:
            one = entry.lower()
            for target, key, keywords in keyword_rules:
                if any(keyword in one for keyword in keywords):
                    getattr(hotel, target)[key] = one
                    break

        hotel.accepted_cards = 'NULL'
        hotel.check_in_time = info['check_in_time']
        hotel.check_out_time = info['check_out_time']
        hotel.hotel_url = self.url_en

        # to_dict() is decoded with json.loads before being returned.
        return json.loads(hotel.to_dict())
コード例 #4
0
def _booking_clean_texts(root, xpath):
    """Return the text nodes matched by *xpath*, stripped, newline-free and non-empty."""
    cleaned = (text.replace('\n', '').strip() for text in root.xpath(xpath))
    return [text for text in cleaned if text]


def _booking_policy_caption(root, element_id):
    """Return the data-caption of a check-in/out policy element without embedded <script> blocks."""
    caption = root.xpath(
        '//*[@id="%s"]/p/span/@data-caption' % element_id)[0]
    return re.sub(
        r'<script.+?script>', '', caption.encode('utf-8'),
        flags=re.S).strip()


def booking_parser(content, url, other_info):
    """
    Parse a hotel detail page of the 'booking' source into a hotel record.

    Values are taken from inline script variables (regular expressions over
    the raw page), from the ``application/ld+json`` block and from the
    parsed markup.

    :param content: raw markup of the hotel detail page.
    :param url: URL of the hotel detail page, stored UTF-8 encoded as
        ``hotel_url``.
    :param other_info: dict providing ``source_id`` and ``city_id``.
    :return: the result of ``HotelNewBase.to_dict()`` for the parsed hotel.
    :raises IndexError: when a mandatory value (name, coordinates, ld+json
        block, city, postal code, check-in/out caption) is not found.
    """
    hotel = HotelNewBase()
    try:
        root = HTML.fromstring(content)
    except Exception as e:
        # Without a parsed tree nothing below can work; report and re-raise
        # instead of continuing into an UnboundLocalError on ``root``.
        print(e)
        raise
    hotel.hotel_name = re.findall(r'b_hotel_name:.*?\'(.+?)\',',
                                  content)[0].strip()
    hotel.hotel_name_en = re.findall(r'hotelName:.*?\"(.+?)\",',
                                     content)[0].strip()
    hotel.source = 'booking'
    hotel.source_id = other_info['source_id']
    latitude = re.findall(r'b_map_center_latitude = (.*?);',
                          content)[0].strip()
    longitude = re.findall(r'b_map_center_longitude = (.*?);',
                           content)[0].strip()
    # Longitude first, matching the "lon,lat" order written by the other
    # parsers in this file.
    hotel.map_info = '{},{}'.format(longitude, latitude)
    location_dict = json.loads(
        re.findall(r'<script type="application/ld\+json">(.*?)</script>',
                   content, re.S)[0].replace('\n', '').strip())
    hotel.address = location_dict['address']['streetAddress']
    hotel.city = re.findall(r'city_name:.*?\'(.*?)\'', content)[0].strip()
    hotel.country = location_dict['address']['addressCountry']
    hotel.city_id = other_info['city_id']
    hotel.postal_code = re.findall(r'"postalCode".*?\"(.*?)\"', content,
                                   re.S)[0].strip()
    try:
        hotel.star = root.xpath(
            '//*[@id="wrap-hotelpage-top"]/div[@class="hp__hotel-title"]/span/span[@class="hp__hotel_ratings__stars nowrap"]/i/@title'
        )[0].encode('utf-8').replace('星级酒店', '')
    except IndexError as e:
        print('Parser ERROR, NO Star Infomation.The reason follows: %s' % e)
    hotel.grade = location_dict['aggregateRating']['ratingValue']
    hotel.review_num = location_dict['aggregateRating']['reviewCount']
    hotel.Img_first = location_dict['image']
    try:
        hotel.traffic = ','.join([
            root.xpath('//*[@id="public_transport_options"]/div/text()')
            [1].strip('\n').strip(),
            root.xpath(
                '//*[@id="public_transport_options"]/ul/li/div[1]/text()')
            [1].strip('\n').strip(),
            root.xpath(
                '//*[@id="public_transport_options"]/ul/li/div[2]/text()')
            [0].strip('\n').strip()
        ])
    except IndexError as e:
        print('Parser ERROR, NO Traffic Infomation.The reason follows: %s' %
              e)
    hotel.chiled_bed_type = ''.join(_booking_clean_texts(
        root,
        '//*[@id="children_policy"]/p[position()>1]//text()|//*[@id="general-child-policy"]/p[position()>1]//text()'
    ))
    hotel.pet_type = ''.join(_booking_clean_texts(
        root,
        '//*[@id="hotelPoliciesInc"]/div[@class="description"]/p[position()>1]//text()'
    ))
    # data-section-id values on the page:
    # -2: pets  1: general facilities  2: activities  3: services
    # 5: bathroom  6: media/technology  7: food and drink  11: internet
    # 13: outdoors  16: parking  17: bedroom  21: pool and wellness
    # 27: business facilities
    hot_facilities = _booking_clean_texts(
        root,
        '//*[@id="hp_facilities_box"]/div[@class="facilities-sliding-keep"]/div/div[@class="important_facility "]//text()'
    )
    wifi = ''.join(_booking_clean_texts(
        root,
        '//*[@id="hp_facilities_box"]//div[@data-section-id=11]/ul/li[@class="policy"]/p/span//text()'
    ))
    if u'免费无线网络连接' in hot_facilities or u'免费!住宿方于各处提供WiFi(免费)。' in wifi:
        hotel.facility_content['Public_wifi'] = wifi
    elif u'免费!住宿方于客房提供WiFi(免费)。' in wifi:
        hotel.facility_content['Room_wifi'] = wifi
    elif u'客房' in wifi and u'有线网络' in wifi:
        hotel.facility_content['Room_wired'] = wifi
    # Parenthesised: wired access must be mentioned in either case
    # ("and" binds tighter than "or").
    elif (u'公共' in wifi or u'各处' in wifi) and u'有线网络' in wifi:
        hotel.facility_content['Public_wired'] = wifi
    hotel.facility_content['Parking'] = ''.join(_booking_clean_texts(
        root,
        '//*[@id="hp_facilities_box"]//div[@data-section-id=16]//p//text()'
    ))

    # Add new facility fields to facilities_dict and they are matched
    # automatically.
    facilities_dict = {
        'Swimming_Pool': ['游泳池'],
        'gym': ['健身房'],
        'SPA': ['SPA'],
        'Bar': ['酒吧'],
        'Coffee_house': ['咖啡厅'],
        'Tennis_court': ['网球场'],
        'Golf_Course': ['高尔夫球场'],
        'Sauna': ['桑拿'],
        'Mandara_Spa': ['水疗中心'],
        'Recreation': ['儿童娱乐场', '儿童游乐场'],
        'Business_Centre': ['商务中心'],
        'Lounge': ['行政酒廊'],
        'Wedding_hall': ['婚礼礼堂'],
        'Restaurant': ['餐厅'],
        'Airport_bus': ['机场班车', '班车服务', '班车服务(收费)'],
        'Valet_Parking': ['代客泊车'],
        'Call_service': ['叫车服务'],
        'Rental_service': ['租车服务']
    }
    # A real list (not a lazy map) because it is traversed several times.
    part_facilities = [
        text.encode('utf-8').replace('\n', '').strip() for text in root.xpath(
            '//*[@id="hp_facilities_box"]/div[@class="facilitiesChecklist"]/div/ul/li/span[@data-name-en]/text()'
        )
    ]
    parser_list = []
    for every in part_facilities:
        # Normalise short page labels to the longer labels used in
        # facilities_dict before matching.
        value = every.replace('咖啡', '咖啡厅').replace('网球', '网球场').replace(
            '健身', '健身房').replace('儿童娱乐', '儿童游乐').upper()
        for keys, aliases in facilities_dict.items():
            # Record the item once per field even if several aliases of
            # that field match it.
            if not any(alias in value for alias in aliases):
                continue
            if keys in hotel.facility_content:
                hotel.facility_content[
                    keys] = hotel.facility_content[keys] + ',' + every
            else:
                hotel.facility_content[keys] = every
            parser_list.append(every)
    print('酒店设施:{}'.format(', '.join(part_facilities)))
    print('已解析出:%s' % ', '.join(parser_list))
    service_list = [
        text.encode('utf-8').replace('\n', '').strip() for text in root.xpath(
            '//*[@id="hp_facilities_box"]//div[@data-section-id=3]/ul/li/span[1]/text()'
        )
    ]

    # Add new service fields to service_dict and they are matched
    # automatically.
    service_dict = {
        'Luggage_Deposit': '行李寄存',
        'front_desk': '24小时前台',
        'Lobby_Manager': '24小时大堂经理',
        '24Check_in': '24小时办理入住',
        'Security': '24小时安保',
        'Protocol': '礼宾服务',
        'wake': '叫醒服务',
        'Chinese_front': '中文前台',
        'Postal_Service': '邮政服务',
        'Fax_copy': '传真/复印',
        'Laundry': '洗衣服务',
        'polish_shoes': '擦鞋服务',
        'Frontdesk_safe': '前台保险柜',
        'fast_checkin': '快速办理入住/退房',
        'ATM': '自动柜员机(ATM)/银行服务',
        'child_care': '儿童看护服务',
        'Food_delivery': '送餐服务'
    }
    # The '服务' suffix is dropped from each label before matching; computed
    # once instead of once per facility item.
    service_needles = [(field, label.replace('服务', ''))
                       for field, label in service_dict.items()]
    parser_sevice_list = []
    # NOTE(review): services are matched against part_facilities while
    # service_list is only printed; confirm which list was intended.
    for every in part_facilities:
        for field, needle in service_needles:
            if needle in every:
                hotel.service_content[field] = every
                parser_sevice_list.append(every)
    print('酒店服务:{}'.format(', '.join(service_list)
                           or '如果你看见了这句话请不要好奇,它表示酒店服务项目是空的'))
    print('已解析出:%s' % ', '.join(parser_sevice_list))
    hotel.img_items = '|'.join(
        root.xpath('//*[@id="photos_distinct"]/a[position()<last()-1]/@href'))
    if not hotel.img_items:
        # Fallback to the second gallery XPath when the first finds nothing.
        hotel.img_items = '|'.join(
            root.xpath('//div[@class="bh-photo-grid-thumb-cell"]/a/@href'))
    hotel.description = '\n'.join(
        text.strip() for text in root.xpath('//*[@id="summary"]/p/text()'))
    accepted_cards = root.xpath(
        '//*[@class="jq_tooltip payment_methods_overall"]/button/@aria-label|'
        '//div[contains(@class, "payment_promotion_labels")]/label/span/text()'
    )
    hotel.accepted_cards = '|'.join(accepted_cards)
    hotel.check_in_time = _booking_policy_caption(root, 'checkin_policy')
    hotel.check_out_time = _booking_policy_caption(root, 'checkout_policy')
    hotel.hotel_url = url.encode('utf-8')
    result = hotel.to_dict()
    print(json.dumps(result, ensure_ascii=False))
    return result
コード例 #5
0
def bestwestern_parser(content, url, other_info):
    """
    Parse a hotel detail page of the 'bestwestern' source into a hotel record.

    :param content: sequence whose first item is passed to ``get_map_info``
        to obtain the coordinates and whose second item is the page markup.
    :param url: URL of the hotel detail page; the part after the last ``-``
        is used as ``source_id``.
    :param other_info: dict providing ``city_id``.
    :return: the result of ``HotelNewBase.to_dict()`` for the parsed hotel.
    :raises IndexError: when a mandatory element is missing from the page.
    """
    lng_lat = content[0]
    html = etree.HTML(content[1])
    hotel = HotelNewBase()

    # XPath prefix shared by all address-related lookups.
    address_box = ('//div[contains(@class,"hotelImagebloc")]'
                   '//div[contains(@class,"addressContainer")]')

    # Hotel name
    hotel.hotel_name = html.xpath(
        '//div[contains(@class,"hotelImagebloc")]//h1[@id="hotel-name"]/a/text()'
    )[0]
    # English hotel name (same value as the hotel name)
    hotel.hotel_name_en = hotel.hotel_name
    # Source
    hotel.source = 'bestwestern'
    # Hotel id
    hotel.source_id = url.split('-')[-1]
    # Brand name
    hotel.brand_name = get_brand_name(html)
    # Coordinates
    hotel.map_info = get_map_info(lng_lat)
    # Address
    hotel.address = "".join(html.xpath(address_box + '/span/text()'))
    # City
    hotel.city = html.xpath(
        address_box + '/span[@id="address-1-city-state-zip"]/text()')[0]
    # Country: text of the last span in the address container
    hotel.country = html.xpath(address_box + '/span')[-1].text
    # City id
    hotel.city_id = other_info['city_id']
    # Postal code
    hotel.postal_code = html.xpath(
        address_box + '//span[@class="postalCode"]/text()')[0]
    # Star rating
    hotel.star = 5
    # Grade: leading part of the rating image's file name
    hotel.grade = html.xpath('//div[@class="tripAdvisorOwl"]/img/@src'
                             )[0].split("/")[-1].split('-')[0]
    # Review count: first run of digits in the rating-count text, 0 when
    # the element or the digits are missing
    try:
        hotel.review_num = re.search(
            r'\d+',
            html.xpath(
                '//div[@class="hotelDetailsContainer"]//div[@id="hotel-reviews"]//div[@class="reviewRatingCount"]/text()'
            )[0]).group()
    except (IndexError, AttributeError):
        hotel.review_num = 0
    # Slider images: the first is the head image, all of them the gallery
    slider_images = html.xpath(
        "//div[contains(@class, 'hotelImageSlider')]//li/img/@src")
    hotel.Img_first = slider_images[0]
    # Phone
    hotel.hotel_phone = html.xpath(
        '//div[@class="phoneNumbers"]//p[@class="phoneNumber"]/a/text()')[0]
    # Zip code: same value as the postal code, as in the other parsers in
    # this file (previously this read the second phone-number node)
    hotel.hotel_zip_code = hotel.postal_code
    # Traffic information
    hotel.traffic = 'NULL'
    # Children and extra-bed policy
    hotel.chiled_bed_type = 'NULL'
    # Pet policy
    hotel.pet_type = html.xpath(
        '//div[@class="policyContent uk-margin-small-left"]/text()')[0]
    # Features
    get_feature(hotel, html)
    # Facilities
    get_facility(hotel, html)
    # Services
    get_service(hotel, html)
    # Photos, '|'-separated like the other parsers in this file
    hotel.img_items = "|".join(slider_images)
    # Description
    hotel.description = html.xpath(
        '//div[@class="hotelOverviewDetailSection"]/div[@class="overviewText"]/text()'
    )[0].strip()
    # Accepted payment cards
    hotel.accepted_cards = 'NULL'
    # Check-in time
    hotel.check_in_time = html.xpath(
        '//div[@class="uk-width-3-10 checkInPositionContainer addressCheckInTableCell"]/p[2]/text()'
    )[0]
    # Check-out time
    # NOTE(review): this XPath lives under the phone-number container,
    # unlike the check-in one; confirm against the page.
    hotel.check_out_time = html.xpath(
        '//div[@class="phoneNumbers"]/div[contains(@class,"phonesRow")][1]/div[2]/p[2]/text()'
    )[0]
    # Hotel url
    hotel.hotel_url = url
    hotel_service_info = __get_hotel_service(html)
    hotel.others_info = json.dumps({"hotel_services_info": hotel_service_info})
    result = hotel.to_dict()
    print(result)
    return result