def elong_parser(content, url, other_info):
    """Parse an eLong hotel detail page into a ``HotelNewBase`` record.

    Most fields are extracted on a best-effort basis: each one is looked up
    in the page markup and/or in the JavaScript controller object embedded in
    the page, and falls back to a sentinel (``'NULL'`` or ``-1``) when
    nothing matches.

    :param content: raw page body; it is UTF-8 decoded before HTML parsing
        and also searched directly with regular expressions.
    :param url: URL of the detail page. Stored as ``hotel_url``; when
        ``other_info`` has a truthy ``hid`` the numeric source id is taken
        from it.
    :param other_info: dict that must contain ``city_id`` and, unless ``hid``
        is set, ``source_id``.
    :return: the populated ``HotelNewBase`` instance.
    :raises KeyError: if a required ``other_info`` key is missing.
    :raises AttributeError: if ``hid`` is set but ``url`` has no ``/<digits>/``
        segment.
    """
    hotel = HotelNewBase()

    # Both stay None when the page cannot be parsed. Every later use sits in
    # a try block, so a missing tree/controller simply triggers the fallbacks.
    root = None
    page_js = None
    try:
        root = HTML.fromstring(content.decode('utf-8'))
        phantom_js = execjs.get('PhantomJS')
    except Exception:
        phantom_js = None
    # Compile the first embedded controller script that can be found.
    for marker in ('window.newDetailController', 'HotelDetailController'):
        try:
            js_str = root.xpath(
                '//script[contains(text(),"%s")]/text()' % marker)[0]
            page_js = phantom_js.compile(js_str[js_str.index(marker):][:-1])
            break
        except Exception:
            continue

    # Hotel names. The title looks like "<local name>(<english name>)".
    try:
        temp_name = root.xpath('//div[@class="t24"]/@title')[0].strip().encode('utf-8')
        k = temp_name.find('(')
        j = temp_name.find(')')
        if k == -1:
            # No parenthesised part: keep the whole title for both names
            # (slicing with -1 used to drop the last character); the
            # de-duplication step below decides which one to keep.
            hotel.hotel_name = temp_name
            hotel.hotel_name_en = temp_name
        else:
            hotel.hotel_name = temp_name[:k]
            hotel.hotel_name_en = temp_name[k + 1:j] if j != -1 else temp_name[k + 1:]
    except Exception:
        try:
            hotel.hotel_name = root.find_class('hrela_name-cn')[0].xpath('./text()')[0].strip()
            hotel.hotel_name_en = root.find_class('hrela_name-en')[0].xpath('./text()')[0].strip()
        except Exception:
            pass

    # When both names are identical keep only one of them: the local name if
    # it contains CJK characters, otherwise the English name.
    if hotel.hotel_name == hotel.hotel_name_en:
        name_text = hotel.hotel_name
        if isinstance(name_text, bytes):
            # Byte strings must be decoded before comparing code points.
            name_text = name_text.decode('utf8')
        if any(u'\u4e00' <= ch <= u'\u9fa5' for ch in (name_text or u'')):
            hotel.hotel_name_en = 'NULL'
        else:
            hotel.hotel_name = 'NULL'

    # Address: first layout, then the alternative layout.
    try:
        temp = root.xpath('//span[@class="mr5 left"]/text()')
        hotel.address = temp[0].encode('utf-8').strip().split(':')[1]  # special chinese colon
    except Exception:
        hotel.address = 'NULL'

    if hotel.address == 'NULL':
        try:
            hotel.address = root.xpath('//span[@class="icon-address"]/text()')[0].replace('地址:', '').strip()
        except Exception:
            hotel.address = 'NULL'

    # Coordinates, stored as "lon,lat".
    try:
        lat = re.findall(r'"lat":"([-+\d\.]*)"', content)[0]
        lon = re.findall(r'"lon":"([-+\d\.]*)"', content)[0]
        hotel.map_info = '{},{}'.format(lon, lat)
    except Exception:
        try:
            map_infos = page_js.eval('HotelDetailController').get('AjaxHotelInfo', {}).get('HotelGeoInfo', {})
            lat = map_infos.get('Lat', None)
            lon = map_infos.get('Long', None)
            # Previously a bare ``raise <bool>`` here always failed, so this
            # fallback could never produce coordinates.
            if lat is None or lon is None:
                hotel.map_info = 'NULL'
            else:
                hotel.map_info = '{0},{1}'.format(lon, lat)
        except Exception:
            hotel.map_info = 'NULL'

    # Star rating: last character of the icon class, else the controller.
    try:
        star_temp = root.xpath('//b[contains(@class, "icon_stars")]/@class')[0].encode('utf-8')
        hotel.star = star_temp[-1]
        if hotel.star == ' ':
            hotel.star = -1
    except Exception:
        try:
            star_temp = page_js.eval('window.newDetailController').get('RecommendHotelRequest', {}).get('starLevel', '')
            star_levels = json.loads(star_temp)
            if star_levels:
                hotel.star = star_levels[0]
        except Exception:
            hotel.star = -1

    # Review score.
    try:
        hotel.grade = page_js.eval('window.newDetailController').get('scoreInfo', {}).get('comment_score', '')
    except Exception:
        try:
            grade = root.xpath('//div[@id="hover-hrela"]/p[1]')
            hotel.grade = float(re.search(r'[0-9\.]+', grade[0].text).group(0))
        except Exception:
            try:
                tp = root.xpath('//div[contains(@class, "pertxt_num")]/text()')[0].encode('utf-8')
                hotel.grade = float(tp)
            except Exception:
                hotel.grade = 'NULL'

    # Number of reviews.
    try:
        hotel.review_num = page_js.eval('window.newDetailController').get('scoreInfo', {}).get('comment_count', '')
    except Exception:
        try:
            review_num_str = root.find_class('fl sum-txt')[0].text_content().strip().encode('utf-8')
            hotel.review_num = int(grade_pat.findall(review_num_str)[0])
        except Exception:
            hotel.review_num = -1

    # Description: "title:text" pairs joined with '|', or the first paragraph.
    try:
        p_tags = root.find_class('dview_info')[0].xpath('dl[1]/dd/p')
        description = ''
        for p in p_tags:
            b_text = p.xpath('./b/text()')  # title
            p_text = p.xpath('./text()')  # description
            if len(b_text):
                description += b_text[0].strip().decode('utf-8') + ':' + p_text[1].strip().decode('utf-8') + '|'
        hotel.description = description[:-1].encode('utf-8')
        if hotel.description == '':
            hotel.description = p_tags[0].text_content().strip().encode('utf-8')
    except Exception:
        hotel.description = 'NULL'

    # Check-in / check-out times.
    try:
        temp_time = root.xpath('//div[@id="iscrollNewAmenities"]/div/dl/dd/text()')[0]. \
            encode('utf-8').strip()
        hotel.check_in_time = temp_time.split(',')[0]
        k = temp_time.find('退房时间:')
        if k != -1:
            # NOTE(review): 15 is presumably the byte length of the label in
            # the encoded text - confirm against a live page.
            hotel.check_out_time = temp_time[k + 15:]
    except Exception:
        hotel.check_out_time = 'NULL'

    # Services; entries mentioning cards are also collected as accepted cards.
    accept_card = None
    try:
        service = ''
        accept_card = []
        service_list = root.xpath('//*[@id="serverall"]/li/text()')
        for each in service_list:
            service += each.encode('utf-8').strip() + '|'
            if '卡' in each:
                accept_card.append(each.strip())
        hotel.service = service[:-1]
    except Exception:
        hotel.service = 'NULL'
    if accept_card:
        hotel.accepted_cards = '|'.join(accept_card).encode('utf-8')

    # First image, taken from the inline background style.
    first_img = None
    try:
        pattern_img = root.xpath('//div[@class="newdetaiL-img imgMore"]/@style')[0]
        first_img = re.search(r'url\(([^)]+)\)', pattern_img).group(1)
    except Exception:
        pass

    city_name = 'NULL'
    try:
        city_name = page_js.eval('window.newDetailController')['Region']['RegionName']
    except Exception:
        pass
    hotel.city = city_name
    # NOTE: this value is replaced at the end of the function.
    hotel.others_info = json.dumps({'city_name': city_name, 'first_img': first_img, 'hid': other_info.get('hid', 'NULL')})

    # Images: thumbnails from the markup, else the controller's image list.
    # Previously the controller branch ran unconditionally and an unassigned
    # variable turned every successful markup result into 'NULL'.
    try:
        img_srcs = [
            img_src.replace('306', '307')
            for img_src in root.xpath('//ul[@class="hrela_spic_list"]/li/img/@src')
        ]
        if img_srcs:
            hotel.img_items = '|'.join(img_srcs)
        else:
            controller = page_js.eval('window.newDetailController')
            base_url = urljoin(controller.get('BaseUrl'), 'ihotel_848_470_all/')
            tag_groups = controller.get('HotelImageTagList', {}).get("urlList", {})
            img_lists = []
            for group in tag_groups.values():
                img_lists.extend(group.get('tagUrlList', {}).values())
            hotel.img_items = '|'.join(base_url + img for img in img_lists).encode('utf-8')
    except Exception:
        hotel.img_items = 'NULL'

    # Pages on the "hotel" host expose most fields directly on the controller.
    if url.startswith('http://hotel'):
        try:
            hotel_obj = page_js.eval('HotelDetailController')
            lat = hotel_obj.get('googleLat', None)
            lon = hotel_obj.get('googleLng', None)
            hotel.map_info = '{0},{1}'.format(lon, lat)
            hotel.hotel_name = hotel_obj.get('hotelNameCn')
            hotel.hotel_name_en = hotel_obj.get('hotelNameEn')
            hotel.address = hotel_obj.get('hotelAddress')
            hotel.city = hotel_obj.get('cityNameCn') or hotel_obj.get('cityNameEn')
            # NOTE(review): the star level is stored as the grade here -
            # confirm whether ``hotel.star`` was intended.
            hotel.grade = hotel_obj.get('starLevel')
            hotel.has_wifi = hotel_obj.get('hasWifi')
            hotel.source_city_id = hotel_obj.get('cityId')
            # NOTE(review): the original then copied 'hasWifi' into first_img
            # and item-assigned it into the JSON string in others_info, which
            # always failed and was swallowed; that no-op has been dropped.
        except Exception:
            pass

    # The controller may have replaced the address with an empty value.
    if not hotel.address:
        hotel.address = 'NULL'
    hotel.country = hotel.address.split(' ')[-1]
    hotel.source = 'elong'
    hotel.hotel_url = url
    if other_info.get('hid'):
        hotel.source_id = re.search(r'/(\d+)/', url).groups()[0]
    else:
        hotel.source_id = other_info['source_id']
    hotel.city_id = other_info['city_id']
    hotel.others_info = json.dumps({
        'hotel_services_info': hotel.service,
    }, ensure_ascii=False)

    return hotel
# Example #2
# 0
def holiday_parser(content, url, other_info):
    """Parse Holiday Inn hotel detail responses into a plain dict.

    :param content: tuple of two or three response bodies: a JSON document
        whose ``hotelInfo`` entry holds the hotel data, an HTML page, and
        optionally a second JSON document used only for the English name.
    :param url: URL of the hotel detail page; the hotel code in it is the
        fallback source id.
    :param other_info: dict that may carry ``city_id``, ``source_id`` and
        ``source_city_id``.
    :return: ``to_dict()`` of the populated ``HotelNewBase`` instance.
    :raises KeyError: if the first JSON document has no ``hotelInfo`` entry.
    """
    hotel = HotelNewBase()
    detail = {}
    if len(content) == 3:
        content1, content2, content3 = content
        try:
            en_json = json.loads(content3)
            detail['hotel_name_en'] = en_json['hotelInfo']['profile']['name']
        except Exception:
            # The English name is optional; keep going without it.
            pass
    else:
        content1, content2 = content
    re_match = re.search(r'/hotels/cn/zh/(\w+)/hoteldetail', url)
    hotel_code = re_match.group(1) if re_match else ''

    resp = json.loads(content1)['hotelInfo']
    # Missing sections become empty dicts so the chained lookups below fall
    # back to their defaults instead of raising AttributeError.
    profile = resp.get('profile') or {}
    address = resp.get('address') or {}
    policies = resp.get('policies') or {}
    brand_info = resp.get('brandInfo') or {}

    hotel.hotel_url = url
    hotel.hotel_name = profile.get('name', '')
    hotel.hotel_name_en = detail.get('hotel_name_en', '')
    hotel.source = 'holiday'
    hotel.source_id = other_info.get('source_id', '') or hotel_code
    hotel.brand_name = brand_info.get('brandName', '')
    # Stored as "lon,lat".
    hotel.map_info = str(profile.get('longitude', '')) + ',' + str(
        profile.get('latitude', ''))
    hotel.address = get_all_street(resp)
    hotel.city = address.get('city', '')
    hotel.country = (address.get('country') or {}).get('name', '')
    hotel.city_id = other_info.get('city_id', '')
    hotel.postal_code = address.get('zip', '')
    hotel.star = '-1'
    hotel.grade = profile.get('averageReview', '')
    hotel.review_num = profile.get('totalReviews', '')
    hotel.check_in_time = policies.get('checkinTime', '')
    hotel.check_out_time = policies.get('checkoutTime', '')

    first_img = ''
    primary_image = profile.get('primaryImageUrl', '')
    if primary_image:
        first_img = primary_image.get('originalUrl', '')
        hotel.Img_first = first_img
    hotel.description = profile.get('longDescription', '') + '\n' + profile.get(
        'shortDescription', '')

    # Output key -> substring that identifies the facility in its name.
    facilities_dict = {
        'Swimming_Pool': '泳池',
        'gym': '健身',
        'SPA': 'SPA',
        'Bar': '酒吧',
        'Coffee_house': '咖啡厅',
        'Tennis_court': '网球场',
        'Golf_Course': '高尔夫球场',
        'Sauna': '桑拿',
        'Mandara_Spa': '水疗中心',
        'Recreation': '儿童娱乐场',
        'Business_Centre': '商务中心',
        'Lounge': '行政酒廊',
        'Wedding_hall': '婚礼礼堂',
        'Restaurant': '餐厅',
        'Parking': '停车',
        'Airport_bus': '机场班车',
        'Valet_Parking': '代客泊车',
        'Call_service': '叫车服务',
        'Rental_service': '租车服务',
        'Room_wifi': '无线互联网',
        'Room_wired': '有线互联网',
        'Public_wifi': '无线互联网',
        'Public_wired': '有线互联网'
    }
    # NOTE: the wifi/wired substrings appear twice above, so the reverse
    # mapping keeps only one output key for each of them.
    reverse_facility_dict = {v: k for k, v in facilities_dict.items()}
    # Output key -> substring that identifies the service in its name.
    service_dict = {
        'Luggage_Deposit': '行李寄存',
        'front_desk': '24小时前台',
        'Lobby_Manager': '24小时大堂经理',
        '24Check_in': '24小时办理入住',
        'Security': '24小时安保',
        'Protocol': '礼宾服务',
        'wake': '叫醒服务',
        'Chinese_front': '中文前台',
        'Postal_Service': '邮政服务',
        'Fax_copy': '传真/复印',
        'Laundry': '洗衣服务',
        'polish_shoes': '擦鞋服务',
        'Frontdesk_safe': '保险',
        'fast_checkin': '快速办理入住',
        'ATM': '自动柜员机(ATM)/银行服务',
        'child_care': '儿童看护',
        'Food_delivery': '送餐服务'
    }
    reverse_sevice_dict = {v: k for k, v in service_dict.items()}

    for each in resp.get("facilities") or []:
        if each['id'] == 'NO_PETS_ALLOWED' or each['id'] == 'PETS_ALLOWED':
            hotel.pet_type = each['name']
        for fac_value in facilities_dict.values():
            if fac_value in each['name']:
                hotel.facility_content[
                    reverse_facility_dict[fac_value]] = each['name']
        for ser_value in service_dict.values():
            if ser_value in each['name']:
                hotel.service_content[
                    reverse_sevice_dict[ser_value]] = each['name']

    fea_str = get_api_server(resp)
    tree = etree.HTML(content2)
    ser_str = get_ota_server(tree, '上网', '互联网', '泳', '退房', '餐', '预定', '停车',
                             '健身', '运动', '泳池', '特色', '服务')
    hotel_services_info = fea_str + ser_str
    # NOTE(review): ``detail`` only ever receives 'hotel_name_en', so 'city'
    # and 'country' are always empty here - confirm whether ``hotel.city`` /
    # ``hotel.country`` were intended.
    hotel.others_info = json.dumps({
        'city': detail.get('city', ''),
        'country': detail.get('country', ''),
        'first_img': first_img,
        'source_city_id': other_info.get('source_city_id', ''),
        'hotel_services_info': hotel_services_info
    })
    hotel.img_items = get_all_pics(tree)

    hotel.hotel_zip_code = hotel.postal_code
    return hotel.to_dict()
def gha_parser(total_content, url, other_info):
    """Parse a GHA hotel page into a plain dict.

    Hotel data comes from the ``pins.gha_hotel.push(...)`` JSON payload, the
    ``application/ld+json`` script and the page markup. The score and review
    count are fetched with an extra HTTP request to the first
    protocol-relative ``<script src="//...">`` URL found in the page.

    :param total_content: page body as text.
    :param url: hotel page URL, stored as ``hotel_url``.
    :param other_info: dict that may carry ``city_id``.
    :return: ``to_dict()`` of the populated ``HotelNewBase`` instance.
    :raises IndexError: if either JSON payload is absent from the page.
    :raises KeyError: if an expected key is missing from a payload.
    """
    hotel = HotelNewBase()
    hotel.city_id = other_info.get("city_id", "NULL")

    select = etree.HTML(total_content)
    info_pat = re.compile(r"pins\.gha_hotel\.push\((.*?)\)", re.S)
    ld_json_pat = re.compile(
        r'<script type="application/ld\+json">(.*?)</script>', re.S)
    # Tab characters are stripped before the ld+json payload is decoded.
    address = json.loads(ld_json_pat.findall(total_content)[0].replace('\t', ''))
    info = json.loads(info_pat.findall(total_content)[0])

    hotel.hotel_name = info["title"]
    hotel.hotel_name_en = address["name"]
    hotel.source = "gha"
    hotel.source_id = info["id"]
    hotel.brand_name = info["brand_name"]
    # Stored as "lon,lat".
    hotel.map_info = str(info["lon"]) + "," + str(info["lat"])
    # NOTE(review): "adress" is the tag name the original queries - confirm
    # against the live markup that it is not meant to be "address".
    hotel.address = ''.join(select.xpath("//adress/text()")).strip()
    hotel.country = address["address"]["addressCountry"]
    hotel.city = address["address"]["addressLocality"]
    hotel.postal_code = address["address"]["postalCode"]
    hotel.star = '5'
    # Keep a single URL rather than the whole xpath result list.
    first_imgs = select.xpath(
        "//div[@class='FlexEmbed-item']/span/img/@src")
    hotel.Img_first = first_imgs[0] if first_imgs else 'NULL'
    hotel.hotel_phone = address.get("telephone", 'NULL')
    hotel.hotel_zip_code = address["address"]["postalCode"]

    service = select.xpath('//ul[@class="prop-Amenities"]/li/span/text()')
    servicestr = ''.join(service)
    description = select.xpath("//div[@id='content-about-hotel']/p/text()")
    hotel.description = ''.join(description)

    if u'无线' in servicestr:
        hotel.facility["Room_wifi"] = u'无线上网'
        hotel.facility["Public_wifi"] = u'无线上网'
    if u'泳' in servicestr:
        hotel.facility["Swimming_Pool"] = u'泳池'
    if u'健身' in servicestr:
        hotel.facility["gym"] = u"健身中心"
    if u'水疗' in servicestr:
        hotel.facility['Mandara_Spa'] = u"水疗中心"
    if u'酒吧' in hotel.description:
        hotel.facility["Bar"] = u'酒吧'
    if u'儿童俱乐部' in hotel.description:
        hotel.facility["Recreation"] = u"儿童俱乐部"
    if u'餐' in servicestr:
        hotel.facility["Restaurant"] = u"餐饮"
    if u'商务中心' in servicestr:
        hotel.facility["Business_Centre"] = u'商务中心'
    if u'亲子' in servicestr:
        hotel.feature["Parent_child"] = u'亲子'

    # Banner images are given as inline ``background: url('...')`` styles;
    # styles without a URL are skipped instead of raising IndexError.
    img_styles = select.xpath('//div[@class="RotateBanner-itemImg"]/span/@style')
    imgurl = re.compile(r"url\('(.*?)'\)")
    imgurl_list = []
    for style in img_styles:
        found = imgurl.findall(style)
        if found:
            imgurl_list.append(found[0])
    hotel.img_items = '|'.join(imgurl_list)

    hotel.check_in_time = '14:00'
    hotel.check_out_time = '12:00'
    hotel.hotel_url = url

    urls = re.compile('<script src="//(.*?)"').findall(total_content)
    if not urls or not urls[0]:
        # No review script on the page (``urls[0]`` used to raise IndexError
        # here): report an empty rating.
        hotel.grade = '0.0'
        hotel.review_num = 0
        return hotel.to_dict()

    comment = requests.get("http://" + urls[0]).content
    # The review script carries HTML inside an escaped string, hence the
    # literal backslashes in both patterns.
    grade = re.compile('<div class=\\\\"rating-value\\\\">\\\\n(.*?)%', re.S)
    try:
        hotel.grade = str(float(grade.findall(comment)[0].strip()) / 10)
    except Exception:
        hotel.grade = '0.0'
    review = re.compile('<div class=\\\\"review-count\\\\">\\\\n(.*?)reviews',
                        re.S)
    try:
        hotel.review_num = review.findall(comment)[0].strip()
    except Exception:
        hotel.review_num = 0

    result = hotel.to_dict()
    print(result)
    return result
def booking_parser(content, url, other_info):
    """Parse a Booking.com hotel page into a plain dict.

    Names, coordinates, city and postal code are taken from inline script
    variables; address, country, rating and first image from the
    ``application/ld+json`` block; everything else from the page markup.

    :param content: page body as text.
    :param url: hotel page URL, stored UTF-8 encoded as ``hotel_url``.
    :param other_info: dict that must contain ``source_id`` and ``city_id``.
    :return: ``to_dict()`` of the populated ``HotelNewBase`` instance.
    :raises IndexError: if a required script variable, the ld+json block or
        the check-in/check-out policy is missing from the page.
    :raises KeyError: if a required key is missing from the ld+json block or
        from ``other_info``.
    """
    hotel = HotelNewBase()
    try:
        root = HTML.fromstring(content)
    except Exception as e:
        # Nothing below works without the tree; report and propagate instead
        # of failing later with a NameError on ``root``.
        print(e)
        raise

    hotel.hotel_name = re.findall(r'b_hotel_name:.*?\'(.+?)\',',
                                  content)[0].strip()
    hotel.hotel_name_en = re.findall(r'hotelName:.*?\"(.+?)\",',
                                     content)[0].strip()
    hotel.source = 'booking'
    hotel.source_id = other_info['source_id']
    latitude = re.findall(r'b_map_center_latitude = (.*?);',
                          content)[0].strip()
    longitude = re.findall(r'b_map_center_longitude = (.*?);',
                           content)[0].strip()
    # Stored as "lon,lat", the order every other parser in this file uses
    # (this one used to write "lat,lon").
    hotel.map_info = '{},{}'.format(longitude, latitude)
    location_dict = json.loads(
        re.findall(r'<script type="application/ld\+json">(.*?)</script>',
                   content, re.S)[0].replace('\n', '').strip())
    hotel.address = location_dict['address']['streetAddress']
    hotel.city = re.findall(r'city_name:.*?\'(.*?)\'', content)[0].strip()
    hotel.country = location_dict['address']['addressCountry']
    hotel.city_id = other_info['city_id']
    hotel.postal_code = re.findall(r'"postalCode".*?\"(.*?)\"', content,
                                   re.S)[0].strip()
    try:
        hotel.star = root.xpath(
            '//*[@id="wrap-hotelpage-top"]/div[@class="hp__hotel-title"]/span/span[@class="hp__hotel_ratings__stars nowrap"]/i/@title'
        )[0].encode('utf-8').replace('星级酒店', '')
    except IndexError as e:
        print('Parser ERROR, NO Star Infomation.The reason follows: %s' % e)
    hotel.grade = location_dict['aggregateRating']['ratingValue']
    hotel.review_num = location_dict['aggregateRating']['reviewCount']
    hotel.Img_first = location_dict['image']

    try:
        hotel.traffic = ','.join([
            root.xpath('//*[@id="public_transport_options"]/div/text()')
            [1].strip('\n').strip(),
            root.xpath(
                '//*[@id="public_transport_options"]/ul/li/div[1]/text()')
            [1].strip('\n').strip(),
            root.xpath(
                '//*[@id="public_transport_options"]/ul/li/div[2]/text()')
            [0].strip('\n').strip()
        ])
    except IndexError as e:
        print('Parser ERROR, NO Traffic Infomation.The reason follows: %s' % e)

    def clean_texts(xpath_expr):
        # Text nodes for ``xpath_expr`` with newlines and surrounding
        # whitespace removed; empty results are dropped.
        stripped = (node.replace('\n', '').strip()
                    for node in root.xpath(xpath_expr))
        return [text for text in stripped if text]

    hotel.chiled_bed_type = ''.join(clean_texts(
        '//*[@id="children_policy"]/p[position()>1]//text()|//*[@id="general-child-policy"]/p[position()>1]//text()'
    ))
    hotel.pet_type = ''.join(clean_texts(
        '//*[@id="hotelPoliciesInc"]/div[@class="description"]/p[position()>1]//text()'
    ))

    # Facility section ids on the page: -2 pets, 1 general, 2 activities,
    # 3 services, 5 bathroom, 6 media/technology, 7 food and drink,
    # 11 internet, 13 outdoors, 16 parking, 17 bedroom,
    # 21 pool and wellness, 27 business facilities.
    hot_facilities = clean_texts(
        '//*[@id="hp_facilities_box"]/div[@class="facilities-sliding-keep"]/div/div[@class="important_facility "]//text()'
    )
    wifi = ''.join(clean_texts(
        '//*[@id="hp_facilities_box"]//div[@data-section-id=11]/ul/li[@class="policy"]/p/span//text()'
    ))
    if u'免费无线网络连接' in hot_facilities or u'免费!住宿方于各处提供WiFi(免费)。' in wifi:
        hotel.facility_content['Public_wifi'] = wifi
    elif u'免费!住宿方于客房提供WiFi(免费)。' in wifi:
        hotel.facility_content['Room_wifi'] = wifi
    elif u'客房' in wifi and u'有线网络' in wifi:
        hotel.facility_content['Room_wired'] = wifi
    elif (u'公共' in wifi or u'各处' in wifi) and u'有线网络' in wifi:
        # Parenthesised: without it ``and`` bound tighter than ``or`` and any
        # text mentioning public areas counted as wired internet.
        hotel.facility_content['Public_wired'] = wifi
    hotel.facility_content['Parking'] = ''.join(clean_texts(
        '//*[@id="hp_facilities_box"]//div[@data-section-id=16]//p//text()'
    ))

    # Add new facility fields to facilities_dict to have them matched
    # automatically.
    facilities_dict = {
        'Swimming_Pool': ['游泳池'],
        'gym': ['健身房'],
        'SPA': ['SPA'],
        'Bar': ['酒吧'],
        'Coffee_house': ['咖啡厅'],
        'Tennis_court': ['网球场'],
        'Golf_Course': ['高尔夫球场'],
        'Sauna': ['桑拿'],
        'Mandara_Spa': ['水疗中心'],
        'Recreation': ['儿童娱乐场', '儿童游乐场'],
        'Business_Centre': ['商务中心'],
        'Lounge': ['行政酒廊'],
        'Wedding_hall': ['婚礼礼堂'],
        'Restaurant': ['餐厅'],
        'Airport_bus': ['机场班车', '班车服务', '班车服务(收费)'],
        'Valet_Parking': ['代客泊车'],
        'Call_service': ['叫车服务'],
        'Rental_service': ['租车服务']
    }
    # Real lists (not ``map`` objects): both are iterated more than once.
    part_facilities = [
        x.encode('utf-8').replace('\n', '').strip() for x in root.xpath(
            '//*[@id="hp_facilities_box"]/div[@class="facilitiesChecklist"]/div/ul/li/span[@data-name-en]/text()'
        )
    ]
    parser_list = []
    for every in part_facilities:
        # Normalise the page wording to the terms used in facilities_dict.
        value = every.replace('咖啡', '咖啡厅').replace('网球', '网球场').replace(
            '健身', '健身房').replace('儿童娱乐', '儿童游乐').upper()
        for keys, faci in facilities_dict.items():
            for fac in faci:
                if fac in value:
                    if keys in hotel.facility_content:
                        hotel.facility_content[
                            keys] = hotel.facility_content[keys] + ',' + every
                    else:
                        hotel.facility_content[keys] = every
                    parser_list.append(every)
    print('酒店设施:{}'.format(', '.join(part_facilities)))
    print('已解析出:%s' % ', '.join(parser_list))
    service_list = [
        x.encode('utf-8').replace('\n', '').strip() for x in root.xpath(
            '//*[@id="hp_facilities_box"]//div[@data-section-id=3]/ul/li/span[1]/text()'
        )
    ]

    # Add new service fields to service_dict to have them matched
    # automatically.
    service_dict = {
        'Luggage_Deposit': '行李寄存',
        'front_desk': '24小时前台',
        'Lobby_Manager': '24小时大堂经理',
        '24Check_in': '24小时办理入住',
        'Security': '24小时安保',
        'Protocol': '礼宾服务',
        'wake': '叫醒服务',
        'Chinese_front': '中文前台',
        'Postal_Service': '邮政服务',
        'Fax_copy': '传真/复印',
        'Laundry': '洗衣服务',
        'polish_shoes': '擦鞋服务',
        'Frontdesk_safe': '前台保险柜',
        'fast_checkin': '快速办理入住/退房',
        'ATM': '自动柜员机(ATM)/银行服务',
        'child_care': '儿童看护服务',
        'Food_delivery': '送餐服务'
    }
    reverse_sevice_dict = {v: k for k, v in service_dict.items()}
    parser_sevice_list = []
    # NOTE(review): services are matched against the facility checklist, and
    # ``service_list`` is only printed - confirm this is intended.
    for every in part_facilities:
        for serv in service_dict.values():
            value = serv.replace('服务', '')
            if value in every:
                hotel.service_content[reverse_sevice_dict[serv]] = every
                parser_sevice_list.append(every)
    print('酒店服务:{}'.format(', '.join(service_list)
                           or '如果你看见了这句话请不要好奇,它表示酒店服务项目是空的'))
    print('已解析出:%s' % ', '.join(parser_sevice_list))

    hotel.img_items = '|'.join(
        root.xpath('//*[@id="photos_distinct"]/a[position()<last()-1]/@href'))
    if not hotel.img_items:
        hotel.img_items = '|'.join(
            root.xpath('//div[@class="bh-photo-grid-thumb-cell"]/a/@href'))
    hotel.description = '\n'.join(
        text.strip() for text in root.xpath('//*[@id="summary"]/p/text()'))
    cards = root.xpath(
        '//*[@class="jq_tooltip payment_methods_overall"]/button/@aria-label|'
        '//div[contains(@class, "payment_promotion_labels")]/label/span/text()'
    )
    hotel.accepted_cards = '|'.join(cards)

    # The policy captions may embed <script> blocks; strip them out.
    hotel.check_in_time = re.sub(
        pattern=r'<script.+?script>',
        repl='',
        string=root.xpath('//*[@id="checkin_policy"]/p/span/@data-caption')
        [0].encode('utf-8'),
        flags=re.S).strip()
    hotel.check_out_time = re.sub(
        pattern=r'<script.+?script>',
        repl='',
        string=root.xpath('//*[@id="checkout_policy"]/p/span/@data-caption')
        [0].encode('utf-8'),
        flags=re.S).strip()
    hotel.hotel_url = url.encode('utf-8')
    print(json.dumps(hotel.to_dict(), ensure_ascii=False))
    return hotel.to_dict()
# Example #5
# 0
    def parse_hotel(self, req, resp):
        """Build the Hyatt hotel record from ``self.hotel_test``.

        ``req`` and ``resp`` are not read by this method; every value comes
        from ``self.hotel_test`` and ``self.url_en``.

        Each entry of ``self.hotel_test['services']`` is lower-cased and
        compared against an ordered keyword table.  The first rule with a
        keyword contained in the text wins, and the lower-cased text is stored
        under that rule's key in ``hotel.service``, ``hotel.facility`` or
        ``hotel.feature``.  Entries matching no rule are ignored.

        Returns:
            ``hotel.to_dict()`` decoded with ``json.loads``.

        Raises:
            KeyError: if ``self.hotel_test`` lacks one of the keys read here.
        """
        info = self.hotel_test
        hotel = Hotel_New()

        hotel.hotel_name = 'NULL'
        hotel.hotel_name_en = info['hotel_name_en']
        hotel.source = 'hyatt'
        hotel.source_id = info['source_id']
        hotel.brand_name = 'NULL'
        hotel.map_info = info['map_info']
        hotel.address = info['address']
        hotel.city = info['hotel_city']
        hotel.country = info['hotel_country']
        hotel.postal_code = info['hotel_postal_code']
        hotel.star = 5
        hotel.grade = 'NULL'
        hotel.review_num = 'NULL'
        hotel.Img_first = info['Img_first']
        hotel.hotel_phone = info['hotel_phone']
        hotel.hotel_zip_code = info['hotel_postal_code']
        hotel.traffic = ''
        hotel.chiled_bed_type = info['chiled_bed_type']
        hotel.pet_type = ''

        if info['has_wifi']:
            hotel.facility['Room_wifi'] = info['has_wifi']

        # (keywords, hotel attribute, key) -- order matters, first match wins.
        # Keywords must be lower-case because the text is lower-cased before
        # matching, and a specific keyword has to precede any shorter keyword
        # it contains ('valet parking' before 'parking', 'sereno spa' before
        # 'spa'); otherwise the specific rule can never fire.
        keyword_rules = (
            (('faxing',), 'service', 'Fax_copy'),
            (('postal',), 'service', 'Postal_Service'),
            (('laundry',), 'service', 'Laundry'),
            (('room service',), 'service', 'Food_delivery'),
            (('concierge service',), 'service', 'Protocol'),
            (('babysitting',), 'service', 'child_care'),
            (('shoeshine',), 'service', 'polish_shoes'),

            (('valet parking',), 'facility', 'Valet_Parking'),
            (('parking',), 'facility', 'Parking'),
            (('wifi', 'wi-fi'), 'facility', 'Room_wifi'),
            (('pool',), 'facility', 'Swimming_Pool'),
            (('gym',), 'facility', 'gym'),
            (('bar',), 'facility', 'Bar'),
            (('coffee',), 'facility', 'coffee'),
            (('sereno spa',), 'facility', 'Mandara_Spa'),
            (('spa',), 'facility', 'SPA'),
            (('golf',), 'facility', 'Golf_Course'),
            (('restaurant',), 'facility', 'Restaurant'),
            (('sauna',), 'facility', 'Sauna'),
            (('service to airport', 'shuttle airport'),
             'facility', 'Airport_bus'),
            (('wedding',), 'facility', 'Wedding_hall'),
            (('business centre',), 'facility', 'Business_Centre'),
            (('tennis',), 'facility', 'Tennis_court'),

            (('china_friendly',), 'feature', 'China_Friendly'),
            (('romantic_lovers',), 'feature', 'Romantic_lovers'),
            (('parent_child',), 'feature', 'Parent_child'),
            (('beach_scene',), 'feature', 'Beach_Scene'),
            (('hot_spring',), 'feature', 'Hot_spring'),
            (('japanese_hotel',), 'feature', 'Japanese_Hotel'),
            (('vacation',), 'feature', 'Vacation'),
        )

        for one in info['services']:
            one = one.lower()
            for keywords, bucket, key in keyword_rules:
                if any(word in one for word in keywords):
                    # Resolve the target dict lazily so an attribute is only
                    # touched when one of its rules actually matches.
                    getattr(hotel, bucket)[key] = one
                    break

        hotel.accepted_cards = 'NULL'
        hotel.check_in_time = info['check_in_time']
        hotel.check_out_time = info['check_out_time']
        hotel.hotel_url = self.url_en

        # to_dict() is decoded with json.loads, i.e. it is treated as JSON text.
        return json.loads(hotel.to_dict())
Example #6
0
def agoda_parser(content, url, other_info):
    """Parse an Agoda hotel detail page into a ``HotelNewBase`` record.

    The page is parsed with ``HTML.fromstring`` and its ``propertyPageParams``
    script is evaluated through the PhantomJS ``execjs`` runtime.  Individual
    fields are then extracted on a best-effort basis: each one tries its
    sources in order and falls back to a sentinel (``'NULL'`` or ``-1``) when
    all of them fail.  Services, description and check-in/out times come from
    an extra HTTP request to Agoda's ``AboutHotel`` API.

    Args:
        content: Page source; byte strings are decoded as UTF-8.
        url: Page URL, stored UTF-8 encoded in ``hotel.hotel_url``.
        other_info: Mapping read for ``hid``, ``source_id`` and ``city_id``.

    Returns:
        The populated ``HotelNewBase`` instance.

    Raises:
        KeyError: if ``other_info`` has no ``city_id``, or no ``source_id``
            while ``hid`` is missing/falsy.
        AttributeError: if ``hid`` is set but the page text contains no
            ``hotelId: <digits>,`` fragment.
        Exception: errors from decoding/parsing the page or from locating and
            evaluating the ``propertyPageParams`` script propagate unchanged.
    """
    hotel = HotelNewBase()

    # Without a parsed tree nothing below can work, so let the real
    # decode/parse error surface instead of swallowing it and failing later
    # with an unrelated NameError on ``root``.
    if isinstance(content, bytes):
        content = content.decode('utf-8')
    root = HTML.fromstring(content)

    ph_runtime = execjs.get('PhantomJS')
    page_js = ph_runtime.compile(
        root.xpath('//script[contains(text(),"propertyPageParams")]/text()')
        [0])
    page_params = page_js.eval('propertyPageParams')

    # Hotel name: page params first, then the #hotelname node, then <title>.
    hotel_name = None
    try:
        hotel_name = page_params['hotelInfo']['name']
    except Exception:
        try:
            hotel_name = root.xpath('//*[@id="hotelname"]/text()')[0].encode(
                'utf-8').strip()
        except Exception:
            try:
                hotel_name = root.xpath('//title/text()')[0].split('-')[0][:-1]
            except Exception:
                pass

    # The name is split at the first '(': the text before it becomes
    # hotel_name, the text after it (minus the last character) becomes
    # hotel_name_en.  Without a '(' both fields receive the whole name.
    k = -1
    try:
        k = hotel_name.find('(')
        hotel.hotel_name = hotel_name[:k if k != -1 else None]
    except Exception:
        hotel.hotel_name = 'NULL'

    try:
        hotel.hotel_name_en = hotel_name[
            k + 1 if k != -1 else None:-1 if k != -1 else None]
    except Exception:
        hotel.hotel_name_en = 'NULL'

    try:
        address = page_params['hotelInfo']['address']
        if address['address'] in address['full']:
            hotel.address = address['full']
        else:
            hotel.address = address['address'] + address['full']
    except Exception:
        hotel.address = "NULL"

    # The star value is the numeric suffix of the rating icon name.
    try:
        hotel.star = int(
            page_params['hotelInfo']['starRating']['icon'].split('-')[-1])
    except Exception:
        hotel.star = -1

    # Values above 5 are accepted only when they are multiples of 5, and are
    # then divided by ten (rounding down); anything else is invalid.
    if hotel.star > 5:
        if hotel.star % 5 == 0:
            hotel.star = hotel.star // 10
        else:
            hotel.star = -1

    # Coordinates are read from the raw page text as "longitude,latitude".
    try:
        lat_pat = re.compile(r'latitude\" content=(.*?) \/>', re.S)
        lon_pat = re.compile(r'longitude\" content=(.*?) \/>', re.S)

        lon_text = lon_pat.findall(content)[0][1:-1]
        lat_text = lat_pat.findall(content)[0][1:-1]
        hotel.map_info = lon_text + ',' + lat_text
    except Exception:
        hotel.map_info = 'NULL'

    try:
        hotel.grade = float(page_params['reviews']['score'])
    except Exception:
        try:
            hotel.grade = root.find_class('review-score-value')[0].text
        except Exception:
            try:
                hotel.grade = page_params['masterRoomInfo'][0]['demographics'][
                    'grades'][0]['score']
            except Exception:
                hotel.grade = -1

    try:
        hotel.review_num = page_params['reviews']['reviewsCount']
    except Exception:
        try:
            review_num = root.find_class('review-based-on-section')[0].xpath(
                './strong/text()')[0].encode('utf8').strip()
            hotel.review_num = review_num_pat.findall(review_num)[0]
        except Exception:
            try:
                hotel.review_num = page_params['masterRoomInfo'][0][
                    'demographics']['count']
            except Exception:
                hotel.review_num = -1

    try:
        first_img = page_params.get("mosaicInitData",
                                    {}).get('images',
                                            [])[0].get('Location', 'NULL')
        first_img = urljoin('http:', first_img)
    except Exception:
        first_img = 'NULL'

    # Image list: mosaic data, then per-room images, then the room grid, and
    # finally a regex over the raw page text.
    try:
        mosaic_urls = [
            'http:' + entry['Location'].split('?')[0]
            for entry in page_params['mosaicInitData']['images']
        ]
        hotel.img_items = '|'.join(
            [link for link in mosaic_urls if 'hotel' in link]).encode('utf-8')
    except Exception:
        try:
            img_lists = []
            for img in page_params['masterRoomInfo']:
                img_lists.extend(img['images'])
            hotel.img_items = '|'.join(
                [urljoin('http:', link) for link in img_lists]).encode('utf-8')
        except Exception:
            try:
                img_list = '|'.join([
                    image
                    for images in page_params['roomGridData']['masterRooms']
                    for image in images['images']
                ])
                hotel.img_items = img_list
            except Exception:
                try:
                    img_json = images_url_pat.findall(content)[0]
                    location_pat = re.compile(r'"Location":"(.*?)",', re.S)
                    img_list = location_pat.findall(img_json)
                    hotel.img_items = '|'.join(
                        ['http:' + link for link in img_list])
                except Exception:
                    hotel.img_items = 'NULL'

    # json_data stays None when the API call fails; the later lookups on it
    # then raise inside their own try blocks and keep their fallbacks.
    json_data = None
    try:
        service_url = "https://www.agoda.com/api/zh-cn/Hotel/AboutHotel?hotelId={0}".format(
            page_params['hotelId'])
        # A timeout keeps a stalled connection from hanging the parser; a
        # timeout error is handled like any other failure of this request.
        json_data = json.loads(requests.get(service_url, timeout=30).content)
        hotel.service = '|'.join([
            feature['name'] for features in json_data['featureGroups']
            for feature in features['feature'] if feature['available']
        ]).encode('utf-8')

    except Exception:
        try:
            hotel.service = '|'.join([
                service['text'].strip()
                for services in page_params['featuresYouLove']['features']
                for service in services
            ])
        except Exception:
            hotel.service = 'NULL'

    try:
        hotel.description = json_data['hotelDesc']['overview'].strip().replace(
            '<BR>', '').encode('utf-8')
    except Exception:
        hotel.description = 'NULL'

    # Check-in/out times: look for the matching titled items inside the
    # check-in/check-out info group of the API response.
    try:
        for checkInOut in json_data['usefulInfoGroups']:
            if '入住/退房' in checkInOut['name']:
                for item in checkInOut['items']:
                    if '入住办理起始' in item['title']:
                        hotel.check_in_time = item['description']
                        break
                for item in checkInOut['items']:
                    if '退房办理截止' in item['title']:
                        hotel.check_out_time = item['description']
                        break
                break
    except Exception:
        pass

    if hotel.check_in_time == 'NULL' and hotel.check_out_time == 'NULL':
        try:
            in_and_out = json_data.get("CheckInOutInfo", {})
            check_in = in_and_out.get("CheckInAndOutTime", {}).get(
                "CheckInTime", {}).get("From", {}).get("Description")
            # Only overwrite when a value was found, so a missing field does
            # not replace the 'NULL' sentinel with None.
            if check_in is not None:
                hotel.check_in_time = check_in
            check_out = in_and_out.get("CheckInAndOutTime", {}).get(
                "CheckOutTime", {}).get("Until", {}).get("Description")
            if check_out is not None:
                hotel.check_out_time = check_out
        except Exception:
            pass

    # City and country information taken from the hotel page.
    try:
        country_id = page_params['hotelSearchCriteria']['countryId']
        country_name = page_params['hotelInfo']['address']['countryName']
        city_name = page_params['hotelInfo']['address']['cityName']
        city_id = page_params['hotelInfo']['address']['cityId']
    except Exception:
        country_id = 'NULL'
        country_name = 'NULL'
        city_name = 'NULL'
        city_id = 'NULL'

    hotel.others_info = json.dumps(
        {
            'country_id': country_id,
            'country_name': country_name,
            'city_name': city_name,
            'city_id': city_id,
            'first_img': first_img,
            'hid': other_info.get('hid'),
            'hotel_services_info': hotel.service
        },
        ensure_ascii=False)

    # Tolerate a missing/empty hotelInfo or address here as well, so a page
    # that survived every best-effort lookup above does not fail at the end.
    hotel_info = page_params.get('hotelInfo') or {}
    address_info = hotel_info.get('address') or {}
    hotel.country = address_info.get('countryName', '')
    hotel.city = address_info.get('cityName', '')
    hotel.accepted_cards = 'NULL'

    hotel.source = 'agoda'
    hotel.hotel_url = url.encode('utf-8')
    if other_info.get('hid'):
        # With an hid the source id is read from the page text itself.
        hotel.source_id = re.search(r'hotelId: ?(\d+),', content).groups()[0]
    else:
        hotel.source_id = other_info['source_id']
    hotel.city_id = other_info['city_id']

    return hotel
Example #7
0
def bestwestern_parser(content, url, other_info):
    """Convert a fetched Best Western hotel page into a hotel dictionary.

    ``content`` is indexed as a pair: element 0 is passed to ``get_map_info``
    and element 1 is parsed with ``etree.HTML``.  ``url`` supplies the source
    id (the text after its last ``-``) and the hotel URL; ``other_info`` must
    contain ``city_id``.

    Apart from the review count, which falls back to ``0``, every lookup
    indexes its XPath result directly, so a missing element raises
    ``IndexError``.

    Returns:
        ``hotel.to_dict()`` (the same value is also printed).
    """
    lng_lat = content[0]
    html = etree.HTML(content[1])
    hotel = HotelNewBase()
    xpath = html.xpath

    # Chinese and English names are the same heading text.
    name_nodes = xpath(
        '//div[contains(@class,"hotelImagebloc")]//h1[@id="hotel-name"]/a/text()'
    )
    hotel.hotel_name = name_nodes[0]
    hotel.hotel_name_en = hotel.hotel_name

    # Source identification.
    hotel.source = 'bestwestern'
    hotel.source_id = url.split('-')[-1]
    hotel.brand_name = get_brand_name(html)
    hotel.map_info = get_map_info(lng_lat)

    # Address block: full address, city, country (last span), city id.
    address_parts = xpath(
        '//div[contains(@class,"hotelImagebloc")]//div[contains(@class,"addressContainer")]/span/text()'
    )
    hotel.address = "".join(address_parts)
    city_nodes = xpath(
        '//div[contains(@class,"hotelImagebloc")]//div[contains(@class,"addressContainer")]/span[@id="address-1-city-state-zip"]/text()'
    )
    hotel.city = city_nodes[0]
    address_spans = xpath(
        '//div[contains(@class,"hotelImagebloc")]//div[contains(@class,"addressContainer")]/span'
    )
    hotel.country = address_spans[-1].text
    hotel.city_id = other_info['city_id']
    postal_nodes = xpath(
        '//div[contains(@class,"hotelImagebloc")]//div[contains(@class,"addressContainer")]//span[@class="postalCode"]/text()'
    )
    hotel.postal_code = postal_nodes[0]

    # Star rating is fixed; the grade is the leading part of the rating
    # image's file name.
    hotel.star = 5
    owl_src = xpath('//div[@class="tripAdvisorOwl"]/img/@src')[0]
    hotel.grade = owl_src.rsplit("/", 1)[-1].partition('-')[0]

    # Review count: first run of digits in the counter text, else 0.
    try:
        count_text = xpath(
            '//div[@class="hotelDetailsContainer"]//div[@id="hotel-reviews"]//div[@class="reviewRatingCount"]/text()'
        )[0]
        hotel.review_num = re.search(r'\d+', count_text).group()
    except Exception:
        hotel.review_num = 0

    # Lead image.
    slider_images = "//div[contains(@class, 'hotelImageSlider')]//li/img/@src"
    hotel.Img_first = xpath(slider_images)[0]

    # Phone number and "zip code" are the first two phone-number links.
    # NOTE(review): the zip code is taken from the phone-number list rather
    # than from the postal code above -- confirm this is intended.
    phone_links = xpath(
        '//div[@class="phoneNumbers"]//p[@class="phoneNumber"]/a/text()')
    hotel.hotel_phone = phone_links[0]
    hotel.hotel_zip_code = phone_links[1]

    # Policies: traffic and child/extra-bed info are not extracted.
    hotel.traffic = 'NULL'
    hotel.chiled_bed_type = 'NULL'
    pet_policy = xpath(
        '//div[@class="policyContent uk-margin-small-left"]/text()')
    hotel.pet_type = pet_policy[0]

    # Features, facilities and services are filled in by the helpers.
    get_feature(hotel, html)
    get_facility(hotel, html)
    get_service(hotel, html)

    # All slider images, comma separated.
    hotel.img_items = ",".join(xpath(slider_images))

    # Description text.
    overview = xpath(
        '//div[@class="hotelOverviewDetailSection"]/div[@class="overviewText"]/text()'
    )
    hotel.description = overview[0].strip()

    hotel.accepted_cards = 'NULL'

    # Check-in and check-out times.
    check_in_nodes = xpath(
        '//div[@class="uk-width-3-10 checkInPositionContainer addressCheckInTableCell"]/p[2]/text()'
    )
    hotel.check_in_time = check_in_nodes[0]
    check_out_nodes = xpath(
        '//div[@class="phoneNumbers"]/div[contains(@class,"phonesRow")][1]/div[2]/p[2]/text()'
    )
    hotel.check_out_time = check_out_nodes[0]

    hotel.hotel_url = url
    hotel.others_info = json.dumps(
        {"hotel_services_info": __get_hotel_service(html)})

    print(hotel.to_dict())
    return hotel.to_dict()