def holiday_parser(content, url, other_info):
    """Parse a 'holiday' source hotel detail response into a hotel record.

    :param content: tuple of two or three response bodies. The first is a
        JSON document with a top-level ``hotelInfo`` object, the second is
        markup parsed with ``etree.HTML``, and the optional third is a JSON
        document used only to read the English hotel name.
    :param url: URL of the hotel detail page; the hotel code embedded in it
        is used as ``source_id`` when ``other_info`` does not provide one.
    :param other_info: dict read for ``city_id``, ``source_id`` and
        ``source_city_id``.
    :return: whatever ``HotelNewBase.to_dict()`` returns for the record.
    """
    hotel = HotelNewBase()
    detail = {}
    if len(content) == 3:
        content1, content2, content3 = content
        try:
            en_json = json.loads(content3)
            detail['hotel_name_en'] = en_json['hotelInfo']['profile']['name']
        except (ValueError, KeyError, TypeError):
            # The English name is best-effort: a malformed or incomplete
            # third payload must not abort the whole parse.
            pass
    else:
        content1, content2 = content

    re_match = re.search(r'/hotels/cn/zh/(\w+)/hoteldetail', url)
    hotel_code = re_match.group(1) if re_match else ''

    resp = json.loads(content1)['hotelInfo']
    # Fall back to empty dicts so a missing (or null) section yields empty
    # fields instead of an AttributeError from calling .get() on a string.
    profile = resp.get('profile') or {}
    address = resp.get('address') or {}
    policies = resp.get('policies') or {}
    brand_info = resp.get('brandInfo') or {}
    country = address.get('country') or {}

    hotel.hotel_url = url
    hotel.hotel_name = profile.get('name', '')
    hotel.hotel_name_en = detail.get('hotel_name_en', '')
    hotel.source = 'holiday'
    hotel.source_id = other_info.get('source_id', '') or hotel_code
    hotel.brand_name = brand_info.get('brandName', '')
    hotel.map_info = (str(profile.get('longitude', '')) + ',' +
                      str(profile.get('latitude', '')))
    hotel.address = get_all_street(resp)
    hotel.city = address.get('city', '')
    hotel.country = country.get('name', '')
    hotel.city_id = other_info.get('city_id', '')
    hotel.postal_code = address.get('zip', '')
    hotel.star = '-1'
    hotel.grade = profile.get('averageReview', '')
    hotel.review_num = profile.get('totalReviews', '')
    hotel.check_in_time = policies.get('checkinTime', '')
    hotel.check_out_time = policies.get('checkoutTime', '')

    first_img = (profile.get('primaryImageUrl') or {}).get('originalUrl', '')
    hotel.Img_first = first_img
    hotel.description = ((profile.get('longDescription') or '') + '\n' +
                         (profile.get('shortDescription') or ''))

    # Record key -> keyword looked for inside each facility name.
    facilities_dict = {
        'Swimming_Pool': '泳池',
        'gym': '健身',
        'SPA': 'SPA',
        'Bar': '酒吧',
        'Coffee_house': '咖啡厅',
        'Tennis_court': '网球场',
        'Golf_Course': '高尔夫球场',
        'Sauna': '桑拿',
        'Mandara_Spa': '水疗中心',
        'Recreation': '儿童娱乐场',
        'Business_Centre': '商务中心',
        'Lounge': '行政酒廊',
        'Wedding_hall': '婚礼礼堂',
        'Restaurant': '餐厅',
        'Parking': '停车',
        'Airport_bus': '机场班车',
        'Valet_Parking': '代客泊车',
        'Call_service': '叫车服务',
        'Rental_service': '租车服务',
        'Room_wifi': '无线互联网',
        'Room_wired': '有线互联网',
        'Public_wifi': '无线互联网',
        'Public_wired': '有线互联网'
    }
    service_dict = {
        'Luggage_Deposit': '行李寄存',
        'front_desk': '24小时前台',
        'Lobby_Manager': '24小时大堂经理',
        '24Check_in': '24小时办理入住',
        'Security': '24小时安保',
        'Protocol': '礼宾服务',
        'wake': '叫醒服务',
        'Chinese_front': '中文前台',
        'Postal_Service': '邮政服务',
        'Fax_copy': '传真/复印',
        'Laundry': '洗衣服务',
        'polish_shoes': '擦鞋服务',
        'Frontdesk_safe': '保险',
        'fast_checkin': '快速办理入住',
        'ATM': '自动柜员机(ATM)/银行服务',
        'child_care': '儿童看护',
        'Food_delivery': '送餐服务'
    }

    for each in resp.get('facilities') or []:
        name = each['name']
        if each['id'] in ('NO_PETS_ALLOWED', 'PETS_ALLOWED'):
            hotel.pet_type = name
        # Iterate key/keyword pairs directly. Going through a reversed
        # keyword -> key dict loses keys that share a keyword (the room and
        # public variants of wifi / wired internet).
        for key, keyword in facilities_dict.items():
            if keyword in name:
                hotel.facility_content[key] = name
        for key, keyword in service_dict.items():
            if keyword in name:
                hotel.service_content[key] = name

    fea_str = get_api_server(resp)
    tree = etree.HTML(content2)
    ser_str = get_ota_server(tree, '上网', '互联网', '泳', '退房', '餐', '预定',
                             '停车', '健身', '运动', '泳池', '特色', '服务')
    hotel_services_info = fea_str + ser_str

    hotel.others_info = json.dumps({
        # `detail` only ever receives 'hotel_name_en', so these two entries
        # are always empty strings; kept for output compatibility.
        'city': detail.get('city', ''),
        'country': detail.get('country', ''),
        'first_img': first_img,
        'source_city_id': other_info.get('source_city_id', ''),
        'hotel_services_info': hotel_services_info
    })
    hotel.img_items = get_all_pics(tree)
    hotel.hotel_zip_code = hotel.postal_code

    return hotel.to_dict()
def gha_parser(total_content, url, other_info):
    """Parse a 'gha' source hotel page into a hotel record.

    Hotel data comes from two JSON blobs embedded in the page (the argument
    of ``pins.gha_hotel.push(...)`` and the ``application/ld+json`` script)
    plus a few xpath queries. When the page references a protocol-relative
    ``<script src="//...">``, that URL is fetched with ``requests.get`` and
    searched for the rating and review count.

    :param total_content: HTML source of the hotel page.
    :param url: URL of the hotel page, stored as ``hotel_url``.
    :param other_info: dict read for ``city_id`` (defaults to ``"NULL"``).
    :return: whatever ``HotelNewBase.to_dict()`` returns for the record.
    :raises IndexError: when either embedded JSON blob is missing.
    """
    hotel = HotelNewBase()
    hotel.city_id = other_info.get("city_id", "NULL")
    select = etree.HTML(total_content)

    info_pattern = re.compile(r"pins\.gha_hotel\.push\((.*?)\)", re.S)
    ld_json_pattern = re.compile(
        r'<script type="application/ld\+json">(.*?)</script>', re.S)
    # NOTE(review): removing every space also strips spaces inside JSON
    # string values (e.g. the hotel name) -- confirm this is intended.
    address = json.loads(
        ld_json_pattern.findall(total_content)[0].replace(' ', ''))
    info = json.loads(info_pattern.findall(total_content)[0])
    postal = address["address"]

    hotel.hotel_name = info["title"]
    hotel.hotel_name_en = address["name"]
    hotel.source = "gha"
    hotel.source_id = info["id"]
    hotel.brand_name = info["brand_name"]
    hotel.map_info = str(info["lon"]) + "," + str(info["lat"])
    # NOTE(review): the tag is spelled "adress" here; verify against the
    # page markup before changing it.
    hotel.address = ''.join(select.xpath("//adress/text()")).strip()
    hotel.country = postal["addressCountry"]
    hotel.city = postal["addressLocality"]
    hotel.postal_code = postal["postalCode"]
    hotel.star = '5'
    # xpath returns a list; store the first image URL (or '') rather than
    # the list itself.
    first_images = select.xpath(
        "//div[@class='FlexEmbed-item']/span/img/@src")
    hotel.Img_first = first_images[0] if first_images else ''
    hotel.hotel_phone = address.get("telephone", 'NULL')
    hotel.hotel_zip_code = postal["postalCode"]

    servicestr = ''.join(
        select.xpath('//ul[@class="prop-Amenities"]/li/span/text()'))
    hotel.description = ''.join(
        select.xpath("//div[@id='content-about-hotel']/p/text()"))

    # (keyword, text searched, facility keys to set, stored label)
    facility_rules = (
        (u'无线', servicestr, ("Room_wifi", "Public_wifi"), u'无线上网'),
        (u'泳', servicestr, ("Swimming_Pool",), u'泳池'),
        (u'健身', servicestr, ("gym",), u"健身中心"),
        (u'水疗', servicestr, ('Mandara_Spa',), u"水疗中心"),
        (u'酒吧', hotel.description, ("Bar",), u'酒吧'),
        (u'儿童俱乐部', hotel.description, ("Recreation",), u"儿童俱乐部"),
        (u'餐', servicestr, ("Restaurant",), u"餐饮"),
        (u'商务中心', servicestr, ("Business_Centre",), u'商务中心'),
    )
    for keyword, text, keys, label in facility_rules:
        if keyword in text:
            for key in keys:
                hotel.facility[key] = label
    if u'亲子' in servicestr:
        hotel.feature["Parent_child"] = u'亲子'

    # Banner images are CSS background URLs inside style attributes; styles
    # without a url('...') are skipped instead of raising IndexError.
    imgurl = re.compile(r"url\('(.*?)'\)")
    img_styles = select.xpath(
        '//div[@class="RotateBanner-itemImg"]/span/@style')
    found = (imgurl.search(style) for style in img_styles)
    hotel.img_items = '|'.join(m.group(1) for m in found if m)

    hotel.check_in_time = '14:00'
    hotel.check_out_time = '12:00'
    hotel.hotel_url = url

    urls = re.compile('<script src="//(.*?)"').findall(total_content)
    if not urls or not urls[0]:
        # No review widget script on the page: use the defaults.
        hotel.grade = '0.0'
        hotel.review_num = 0
        return hotel.to_dict()

    comment = requests.get("http://" + urls[0]).content
    # The fetched body contains escaped markup, hence the literal
    # backslashes in both patterns.
    grade = re.compile('<div class=\\\\"rating-value\\\\">\\\\n(.*?)%', re.S)
    try:
        hotel.grade = str(float(grade.findall(comment)[0].strip()) / 10)
    except (IndexError, ValueError, TypeError):
        hotel.grade = '0.0'
    review = re.compile(
        '<div class=\\\\"review-count\\\\">\\\\n(.*?)reviews', re.S)
    try:
        hotel.review_num = review.findall(comment)[0].strip()
    except (IndexError, TypeError):
        hotel.review_num = 0

    print(hotel.to_dict())
    return hotel.to_dict()
def parse_hotel(self, req, resp):
    """Build a 'hyatt' hotel record from the fields in ``self.hotel_test``.

    Each entry of ``self.hotel_test['services']`` is lowercased and matched
    against an ordered keyword table; the first rule that matches decides
    which of ``hotel.service`` / ``hotel.facility`` / ``hotel.feature``
    receives the lowercased string.

    :param req: unused.
    :param resp: unused.
    :return: ``json.loads`` applied to ``Hotel_New.to_dict()``.
    """
    data = self.hotel_test
    hotel = Hotel_New()

    hotel.hotel_name = 'NULL'
    hotel.hotel_name_en = data['hotel_name_en']
    hotel.source = 'hyatt'
    hotel.source_id = data['source_id']
    hotel.brand_name = 'NULL'
    hotel.map_info = data['map_info']
    hotel.address = data['address']
    hotel.city = data['hotel_city']
    hotel.country = data['hotel_country']
    hotel.postal_code = data['hotel_postal_code']
    hotel.star = 5
    hotel.grade = 'NULL'
    hotel.review_num = 'NULL'
    hotel.Img_first = data['Img_first']
    hotel.hotel_phone = data['hotel_phone']
    hotel.hotel_zip_code = data['hotel_postal_code']
    hotel.traffic = ''
    hotel.chiled_bed_type = data['chiled_bed_type']
    hotel.pet_type = ''

    if data['has_wifi']:
        hotel.facility['Room_wifi'] = data['has_wifi']

    # Ordered rules: (keywords, attribute name on hotel, key). First match
    # wins, so more specific keywords precede the generic ones they contain
    # ('valet parking' before 'parking', 'sereno spa' before 'spa').
    # Keywords are lowercase because the service string is lowercased
    # before matching.
    rules = (
        (('faxing',), 'service', 'Fax_copy'),
        (('postal',), 'service', 'Postal_Service'),
        (('laundry',), 'service', 'Laundry'),
        (('room service',), 'service', 'Food_delivery'),
        (('concierge service',), 'service', 'Protocol'),
        (('babysitting',), 'service', 'child_care'),
        (('shoeshine',), 'service', 'polish_shoes'),
        (('valet parking',), 'facility', 'Valet_Parking'),
        (('parking',), 'facility', 'Parking'),
        (('wifi', 'wi-fi'), 'facility', 'Room_wifi'),
        (('pool',), 'facility', 'Swimming_Pool'),
        (('gym',), 'facility', 'gym'),
        (('bar',), 'facility', 'Bar'),
        # NOTE(review): other parsers in this file use the key
        # 'Coffee_house' -- confirm whether 'coffee' is intended here.
        (('coffee',), 'facility', 'coffee'),
        (('sereno spa',), 'facility', 'Mandara_Spa'),
        (('spa',), 'facility', 'SPA'),
        (('golf',), 'facility', 'Golf_Course'),
        (('restaurant',), 'facility', 'Restaurant'),
        (('sauna',), 'facility', 'Sauna'),
        (('service to airport', 'shuttle airport'), 'facility',
         'Airport_bus'),
        (('wedding',), 'facility', 'Wedding_hall'),
        (('business centre',), 'facility', 'Business_Centre'),
        (('tennis',), 'facility', 'Tennis_court'),
        (('china_friendly',), 'feature', 'China_Friendly'),
        (('romantic_lovers',), 'feature', 'Romantic_lovers'),
        (('parent_child',), 'feature', 'Parent_child'),
        (('beach_scene',), 'feature', 'Beach_Scene'),
        (('hot_spring',), 'feature', 'Hot_spring'),
        (('japanese_hotel',), 'feature', 'Japanese_Hotel'),
        (('vacation',), 'feature', 'Vacation'),
    )
    for one in data['services']:
        one = one.lower()
        for keywords, bucket, key in rules:
            if any(keyword in one for keyword in keywords):
                # Resolve the target dict lazily so an attribute is only
                # touched when one of its rules actually matches.
                getattr(hotel, bucket)[key] = one
                break

    hotel.accepted_cards = 'NULL'
    hotel.check_in_time = data['check_in_time']
    hotel.check_out_time = data['check_out_time']
    hotel.hotel_url = self.url_en

    res = hotel.to_dict()
    res = json.loads(res)
    return res
def booking_parser(content, url, other_info):
    """Parse a 'booking' source hotel page into a hotel record.

    Values are taken from inline JavaScript (via regular expressions), the
    ``application/ld+json`` script and xpath queries on the parsed document.

    :param content: HTML source of the hotel page.
    :param url: URL of the hotel page; stored UTF-8 encoded as ``hotel_url``.
    :param other_info: dict that must contain ``source_id`` and ``city_id``.
    :return: whatever ``HotelNewBase.to_dict()`` returns for the record.
    :raises IndexError: when a required regex or xpath finds nothing.
    """

    def clean_texts(nodes):
        """Remove newlines and surrounding whitespace; drop empty results."""
        stripped = (node.replace('\n', '').strip() for node in nodes)
        return [text for text in stripped if text]

    hotel = HotelNewBase()
    try:
        root = HTML.fromstring(content)
    except Exception as e:
        # Every later step needs `root`; re-raise the real parse error
        # instead of failing later with a NameError.
        print(e)
        raise

    hotel.hotel_name = re.findall(
        r'b_hotel_name:.*?\'(.+?)\',', content)[0].strip()
    hotel.hotel_name_en = re.findall(
        r'hotelName:.*?\"(.+?)\",', content)[0].strip()
    hotel.source = 'booking'
    hotel.source_id = other_info['source_id']
    latitude = re.findall(
        r'b_map_center_latitude = (.*?);', content)[0].strip()
    longitude = re.findall(
        r'b_map_center_longitude = (.*?);', content)[0].strip()
    # NOTE(review): this stores "latitude,longitude" while holiday_parser
    # and gha_parser store "longitude,latitude" -- confirm which order
    # consumers of map_info expect.
    hotel.map_info = '{},{}'.format(latitude, longitude)

    location_dict = json.loads(
        re.findall(r'<script type="application/ld\+json">(.*?)</script>',
                   content, re.S)[0].replace('\n', '').strip())
    hotel.address = location_dict['address']['streetAddress']
    hotel.city = re.findall(r'city_name:.*?\'(.*?)\'', content)[0].strip()
    hotel.country = location_dict['address']['addressCountry']
    hotel.city_id = other_info['city_id']
    hotel.postal_code = re.findall(
        r'"postalCode".*?\"(.*?)\"', content, re.S)[0].strip()
    try:
        hotel.star = root.xpath(
            '//*[@id="wrap-hotelpage-top"]/div[@class="hp__hotel-title"]/span/span[@class="hp__hotel_ratings__stars nowrap"]/i/@title'
        )[0].encode('utf-8').replace('星级酒店', '')
    except IndexError as e:
        print('Parser ERROR, NO Star Infomation.The reason follows: %s' % e)
    hotel.grade = location_dict['aggregateRating']['ratingValue']
    hotel.review_num = location_dict['aggregateRating']['reviewCount']
    hotel.Img_first = location_dict['image']

    try:
        hotel.traffic = ','.join([
            root.xpath('//*[@id="public_transport_options"]/div/text()')
            [1].strip('\n').strip(),
            root.xpath(
                '//*[@id="public_transport_options"]/ul/li/div[1]/text()')
            [1].strip('\n').strip(),
            root.xpath(
                '//*[@id="public_transport_options"]/ul/li/div[2]/text()')
            [0].strip('\n').strip()
        ])
    except IndexError as e:
        print('Parser ERROR, NO Traffic Infomation.The reason follows: %s' % e)

    hotel.chiled_bed_type = ''.join(clean_texts(root.xpath(
        '//*[@id="children_policy"]/p[position()>1]//text()|//*[@id="general-child-policy"]/p[position()>1]//text()'
    )))
    hotel.pet_type = ''.join(clean_texts(root.xpath(
        '//*[@id="hotelPoliciesInc"]/div[@class="description"]/p[position()>1]//text()'
    )))

    # data-section-id values used on the page:
    # -2 pets, 1 general facilities, 2 activities, 3 services, 5 bathroom,
    # 6 media/technology, 7 food & drink, 11 internet, 13 outdoors,
    # 16 parking, 17 bedroom, 21 pool & wellness, 27 business facilities
    hot_facilities = clean_texts(root.xpath(
        '//*[@id="hp_facilities_box"]/div[@class="facilities-sliding-keep"]/div/div[@class="important_facility "]//text()'
    ))
    wifi = ''.join(clean_texts(root.xpath(
        '//*[@id="hp_facilities_box"]//div[@data-section-id=11]/ul/li[@class="policy"]/p/span//text()'
    )))
    if u'免费无线网络连接' in hot_facilities or u'免费!住宿方于各处提供WiFi(免费)。' in wifi:
        hotel.facility_content['Public_wifi'] = wifi
    elif u'免费!住宿方于客房提供WiFi(免费)。' in wifi:
        hotel.facility_content['Room_wifi'] = wifi
    elif u'客房' in wifi and u'有线网络' in wifi:
        hotel.facility_content['Room_wired'] = wifi
    elif (u'公共' in wifi or u'各处' in wifi) and u'有线网络' in wifi:
        # Parenthesised: `and` binds tighter than `or`, so without the
        # parentheses any text containing u'公共' matched regardless of
        # whether wired internet was mentioned.
        hotel.facility_content['Public_wired'] = wifi

    hotel.facility_content['Parking'] = ''.join(clean_texts(root.xpath(
        '//*[@id="hp_facilities_box"]//div[@data-section-id=16]//p//text()'
    )))

    # Add new facility fields to facilities_dict to have them matched
    # automatically.
    facilities_dict = {
        'Swimming_Pool': ['游泳池'],
        'gym': ['健身房'],
        'SPA': ['SPA'],
        'Bar': ['酒吧'],
        'Coffee_house': ['咖啡厅'],
        'Tennis_court': ['网球场'],
        'Golf_Course': ['高尔夫球场'],
        'Sauna': ['桑拿'],
        'Mandara_Spa': ['水疗中心'],
        'Recreation': ['儿童娱乐场', '儿童游乐场'],
        'Business_Centre': ['商务中心'],
        'Lounge': ['行政酒廊'],
        'Wedding_hall': ['婚礼礼堂'],
        'Restaurant': ['餐厅'],
        'Airport_bus': ['机场班车', '班车服务', '班车服务(收费)'],
        'Valet_Parking': ['代客泊车'],
        'Call_service': ['叫车服务'],
        'Rental_service': ['租车服务']
    }
    # Real lists (not map objects) because they are iterated several times.
    part_facilities = [
        text.encode('utf-8').replace('\n', '').strip()
        for text in root.xpath(
            '//*[@id="hp_facilities_box"]/div[@class="facilitiesChecklist"]/div/ul/li/span[@data-name-en]/text()'
        )
    ]
    parser_list = []
    for every in part_facilities:
        # Normalise short names to the longer keywords used in the table.
        value = every.replace('咖啡', '咖啡厅').replace('网球', '网球场').replace(
            '健身', '健身房').replace('儿童娱乐', '儿童游乐').upper()
        for keys, faci in facilities_dict.items():
            # any(): record the facility once per key even when several
            # synonyms of that key match the same text.
            if any(fac in value for fac in faci):
                if keys in hotel.facility_content:
                    hotel.facility_content[keys] = (
                        hotel.facility_content[keys] + ',' + every)
                else:
                    hotel.facility_content[keys] = every
                parser_list.append(every)
    print('酒店设施:{}'.format(', '.join(part_facilities)))
    print('已解析出:%s' % ', '.join(parser_list))

    service_list = [
        text.encode('utf-8').replace('\n', '').strip()
        for text in root.xpath(
            '//*[@id="hp_facilities_box"]//div[@data-section-id=3]/ul/li/span[1]/text()'
        )
    ]
    # Add new service fields to service_dict to have them matched
    # automatically.
    service_dict = {
        'Luggage_Deposit': '行李寄存',
        'front_desk': '24小时前台',
        'Lobby_Manager': '24小时大堂经理',
        '24Check_in': '24小时办理入住',
        'Security': '24小时安保',
        'Protocol': '礼宾服务',
        'wake': '叫醒服务',
        'Chinese_front': '中文前台',
        'Postal_Service': '邮政服务',
        'Fax_copy': '传真/复印',
        'Laundry': '洗衣服务',
        'polish_shoes': '擦鞋服务',
        'Frontdesk_safe': '前台保险柜',
        'fast_checkin': '快速办理入住/退房',
        'ATM': '自动柜员机(ATM)/银行服务',
        'child_care': '儿童看护服务',
        'Food_delivery': '送餐服务'
    }
    parser_sevice_list = []
    # NOTE(review): services are matched against part_facilities, while
    # service_list is only printed -- confirm whether service_list was
    # meant to be scanned here.
    for every in part_facilities:
        for key, serv in service_dict.items():
            if serv.replace('服务', '') in every:
                hotel.service_content[key] = every
                parser_sevice_list.append(every)
    print('酒店服务:{}'.format(', '.join(service_list) or '如果你看见了这句话请不要好奇,它表示酒店服务项目是空的'))
    print('已解析出:%s' % ', '.join(parser_sevice_list))

    hotel.img_items = '|'.join(
        root.xpath('//*[@id="photos_distinct"]/a[position()<last()-1]/@href'))
    if not hotel.img_items:
        hotel.img_items = '|'.join(
            root.xpath('//div[@class="bh-photo-grid-thumb-cell"]/a/@href'))
    hotel.description = '\n'.join(
        [text.strip() for text in root.xpath('//*[@id="summary"]/p/text()')])
    hotel.accepted_cards = '|'.join(root.xpath(
        '//*[@class="jq_tooltip payment_methods_overall"]/button/@aria-label|'
        '//div[contains(@class, "payment_promotion_labels")]/label/span/text()'
    ))
    # The caption attributes may embed <script> blocks; strip them out.
    hotel.check_in_time = re.sub(
        pattern=r'<script.+?script>',
        repl='',
        string=root.xpath('//*[@id="checkin_policy"]/p/span/@data-caption')
        [0].encode('utf-8'),
        flags=re.S).strip()
    hotel.check_out_time = re.sub(
        pattern=r'<script.+?script>',
        repl='',
        string=root.xpath('//*[@id="checkout_policy"]/p/span/@data-caption')
        [0].encode('utf-8'),
        flags=re.S).strip()
    hotel.hotel_url = url.encode('utf-8')

    print(json.dumps(hotel.to_dict(), ensure_ascii=False))
    return hotel.to_dict()
def bestwestern_parser(content, url, other_info):
    """Parse a 'bestwestern' source hotel page into a hotel record.

    :param content: sequence whose first item is passed to ``get_map_info``
        and whose second item is the HTML parsed with ``etree.HTML``.
    :param url: URL of the hotel page; the text after its last ``-`` is
        used as ``source_id``.
    :param other_info: dict that must contain ``city_id``.
    :return: whatever ``HotelNewBase.to_dict()`` returns for the record.
    :raises IndexError: when a required xpath query finds nothing.
    """
    lng_lat = content[0]
    html = etree.HTML(content[1])
    hotel = HotelNewBase()

    # Shared xpath prefixes for the header block and its address container.
    header = '//div[contains(@class,"hotelImagebloc")]'
    address_box = header + '//div[contains(@class,"addressContainer")]'

    # Hotel name; the same value is stored as the English name.
    hotel.hotel_name = html.xpath(
        header + '//h1[@id="hotel-name"]/a/text()')[0]
    hotel.hotel_name_en = hotel.hotel_name
    hotel.source = 'bestwestern'
    hotel.source_id = url.split('-')[-1]
    hotel.brand_name = get_brand_name(html)
    hotel.map_info = get_map_info(lng_lat)

    # Address, city, country (text of the last span) and postal code.
    hotel.address = "".join(html.xpath(address_box + '/span/text()'))
    hotel.city = html.xpath(
        address_box + '/span[@id="address-1-city-state-zip"]/text()')[0]
    hotel.country = html.xpath(address_box + '/span')[-1].text
    hotel.city_id = other_info['city_id']
    hotel.postal_code = html.xpath(
        address_box + '//span[@class="postalCode"]/text()')[0]

    hotel.star = 5
    # The grade is the leading part of the rating image's file name.
    hotel.grade = html.xpath('//div[@class="tripAdvisorOwl"]/img/@src'
                             )[0].split("/")[-1].split('-')[0]
    # Review count: first run of digits in the rating-count text, else 0.
    try:
        hotel.review_num = re.search(
            r'\d+',
            html.xpath(
                '//div[@class="hotelDetailsContainer"]//div[@id="hotel-reviews"]//div[@class="reviewRatingCount"]/text()'
            )[0]).group()
    except (IndexError, AttributeError):
        hotel.review_num = 0

    images = html.xpath(
        "//div[contains(@class, 'hotelImageSlider')]//li/img/@src")
    hotel.Img_first = images[0]

    hotel.hotel_phone = html.xpath(
        '//div[@class="phoneNumbers"]//p[@class="phoneNumber"]/a/text()')[0]
    # The zip code mirrors the postal code, as in the other parsers; it was
    # previously read from the second phone-number anchor.
    hotel.hotel_zip_code = hotel.postal_code

    hotel.traffic = 'NULL'
    hotel.chiled_bed_type = 'NULL'
    hotel.pet_type = html.xpath(
        '//div[@class="policyContent uk-margin-small-left"]/text()')[0]

    # Feature, facility and service details are filled in by helpers.
    get_feature(hotel, html)
    get_facility(hotel, html)
    get_service(hotel, html)

    hotel.img_items = ",".join(images)
    hotel.description = html.xpath(
        '//div[@class="hotelOverviewDetailSection"]/div[@class="overviewText"]/text()'
    )[0].strip()
    hotel.accepted_cards = 'NULL'
    hotel.check_in_time = html.xpath(
        '//div[@class="uk-width-3-10 checkInPositionContainer addressCheckInTableCell"]/p[2]/text()'
    )[0]
    hotel.check_out_time = html.xpath(
        '//div[@class="phoneNumbers"]/div[contains(@class,"phonesRow")][1]/div[2]/p[2]/text()'
    )[0]
    hotel.hotel_url = url

    hotel_service_info = __get_hotel_service(html)
    hotel.others_info = json.dumps({"hotel_services_info": hotel_service_info})

    print(hotel.to_dict())
    return hotel.to_dict()