Python HotelNewBase Examples

Programming Language: Python

Namespace/Package Name: proj.my_lib.models.HotelModel

Class/Type: HotelNewBase

Examples at hotexamples.com: 14

Python HotelNewBase - 14 examples found. These are the top rated real world Python examples of proj.my_lib.models.HotelModel.HotelNewBase extracted from open source projects. You can rate examples to help us improve the quality of examples.

Frequently Used Methods

Show Hide

HotelNewBase(10)

address(8)

check_in_time(8)

check_out_time(8)

city(7)

city_id(7)

country(7)

description(7)

Img_first(5)

accepted_cards(5)

brand_name(4)

chiled_bed_type(3)

Example #1

Show file

File: hilton_parser.py Project: 20113261/platform_service

def hilton_parser(total_content, url, other_info):
    """Collect service text and check-in/check-out times from Hilton pages.

    NOTE(review): this excerpt appears to be truncated -- ``service``,
    ``check_in_time`` and ``check_out_time`` are computed but never stored on
    the record, and nothing is returned within the visible lines.

    :param total_content: sequence of exactly five page bodies, unpacked as
        ``content, detail_content, map_info_content, desc_content,
        enDetail_content``; only ``detail_content`` is read in the visible code.
    :param url: hotel detail URL; stored as ``hotel_url`` and used to derive
        the hotel id.
    :param other_info: not used in the visible code.
    """
    Hotel = HotelNewBase()
    content, detail_content, map_info_content, desc_content, enDetail_content = total_content

    html_detail = HTML.fromstring(detail_content)
    # ``service`` accumulates one labelled section per table body below; the
    # label is appended first, then ``process_text`` (defined elsewhere)
    # returns the extended string.
    service = ''
    try:
        # Section label for the rows of the "tbodytransportation" table.
        service += u'交通：'
        ALL = html_detail.xpath('//tbody[@id="tbodytransportation"]/tr')
        service = process_text(ALL, service)
    except Exception as e:
        # Best effort: report the error and continue with the next section.
        print e

    try:
        # Section label for the rows of the "tbodyfacilities" table.
        service += u'设施：'
        ALL = html_detail.xpath('//tbody[@id="tbodyfacilities"]/tr')
        service = process_text(ALL, service)
    except Exception as e:
        print e
    try:
        # Section label for the rows of the "tbodyservices" table.
        service += u'服务与设施：'
        ALL = html_detail.xpath('//tbody[@id="tbodyservices"]/tr')
        service = process_text(ALL, service)

    except Exception as e:
        print e
    # NOTE(review): unlike the three sections above, this one is not guarded
    # (its try/except is commented out), so an error here propagates.
    # try:
    service += u'家庭：'
    ALL = html_detail.xpath('//tbody[@id="tbodyfamily"]/tr')
    service = process_text(ALL, service)
    # except Exception as e:
    #     print e

    # The same detail page is parsed a second time with etree for the
    # check-in / check-out lookup.
    select_detail = etree.HTML(detail_content)
    check_in_time = ''
    check_out_time = ''
    # Hotel id: the text after the last "-" in the second-to-last "/" segment
    # of the URL.
    hotel_id = url.split("/")[-2].split("-")[-1]
    Hotel.hotel_url = url
    try:
        ALL = select_detail.xpath(
            "//td[@headers='compare_{} compare_registration']/text()".format(
                hotel_id))
        # Join the text nodes with a full-width colon, drop newlines, tabs and
        # spaces, then split on the same colon again.
        ALL = u'：'.join(ALL)
        check_time = ALL.replace('\n', '').replace('\t', '').replace(' ', '')
        check_time = check_time.split(u'：')
        # The second piece is taken as check-in time, the last as check-out.
        check_in_time = check_time[1]
        check_out_time = check_time[-1]
    except Exception, e:
        # On any failure both times keep their '' defaults.
        print str(e)

Example #2

Show file

def expedia_parser(content, url, other_info):
    """Parse an Expedia hotel page (excerpt).

    NOTE(review): this excerpt appears to be truncated -- only the HTML
    parsing step is visible and nothing is returned.

    :param content: encoded page body; decoded as UTF-8 before parsing.
    :param url: page URL, used as the base for making links absolute.
    :param other_info: not used in the visible code.
    """
    hotel = HotelNewBase()
    try:
        html = HTML.fromstring(content.decode('utf-8'))
        # Rewrite relative links in the document against the page URL.
        html = HTML.make_links_absolute(html, base_url=url)
    except Exception, e:
        # Best effort: report the parse failure and carry on.
        print str(e)

Example #3

Show file

def holiday_parser(content, url, other_info):
    """Parse hotel detail data from the 'holiday' source into a record dict.

    :param content: tuple of two or three page bodies. The first is JSON
        containing a ``hotelInfo`` object, the second is markup parsed with
        ``etree.HTML``, and the optional third is JSON used only to read the
        English hotel name.
    :param url: URL of the hotel detail page.
    :param other_info: dict; ``source_id``, ``city_id`` and ``source_city_id``
        are read from it when present.
    :return: the result of ``HotelNewBase.to_dict()`` for the populated record.
    :raises ValueError: if ``content`` does not hold two or three items, or if
        the first item is not valid JSON.
    :raises KeyError: if the first JSON document has no ``hotelInfo`` key.
    """
    hotel = HotelNewBase()
    detail = {}
    if len(content) == 3:
        content1, content2, content3 = content
        try:
            en_json = json.loads(content3)
            detail['hotel_name_en'] = en_json['hotelInfo']['profile']['name']
        except (ValueError, KeyError, TypeError):
            # The English name is optional; it falls back to '' below.
            pass
    else:
        content1, content2 = content
    re_match = re.search(r'/hotels/cn/zh/(\w+)/hoteldetail', url)
    hotel_code = re_match.group(1) if re_match else ''

    resp = json.loads(content1)['hotelInfo']
    # Missing or null sections fall back to empty dicts so that the chained
    # lookups below yield '' instead of raising AttributeError on a str/None.
    profile = resp.get('profile') or {}
    address_info = resp.get('address') or {}
    country_info = address_info.get('country') or {}
    policies = resp.get('policies') or {}
    brand_info = resp.get('brandInfo') or {}

    hotel.hotel_url = url
    hotel.hotel_name = profile.get('name', '')
    hotel.hotel_name_en = detail.get('hotel_name_en', '')
    hotel.source = 'holiday'
    # Prefer the caller-supplied id, otherwise the code taken from the URL.
    hotel.source_id = other_info.get('source_id', '') or hotel_code
    hotel.brand_name = brand_info.get('brandName', '')
    hotel.map_info = str(profile.get('longitude', '')) + ',' + str(
        profile.get('latitude', ''))
    hotel.address = get_all_street(resp)
    hotel.city = address_info.get('city', '')
    hotel.country = country_info.get('name', '')
    hotel.city_id = other_info.get('city_id', '')
    hotel.postal_code = address_info.get('zip', '')
    hotel.star = '-1'
    hotel.grade = profile.get('averageReview', '')
    hotel.review_num = profile.get('totalReviews', '')
    hotel.check_in_time = policies.get('checkinTime', '')
    hotel.check_out_time = policies.get('checkoutTime', '')

    first_img = profile.get('primaryImageUrl', '') if profile else ''
    if first_img:
        first_img = first_img.get('originalUrl', '')
        hotel.Img_first = first_img
    hotel.description = profile.get('longDescription', '') + '\n' + \
        profile.get('shortDescription', '')

    facilities_dict = {
        'Swimming_Pool': '泳池',
        'gym': '健身',
        'SPA': 'SPA',
        'Bar': '酒吧',
        'Coffee_house': '咖啡厅',
        'Tennis_court': '网球场',
        'Golf_Course': '高尔夫球场',
        'Sauna': '桑拿',
        'Mandara_Spa': '水疗中心',
        'Recreation': '儿童娱乐场',
        'Business_Centre': '商务中心',
        'Lounge': '行政酒廊',
        'Wedding_hall': '婚礼礼堂',
        'Restaurant': '餐厅',
        'Parking': '停车',
        'Airport_bus': '机场班车',
        'Valet_Parking': '代客泊车',
        'Call_service': '叫车服务',
        'Rental_service': '租车服务',
        'Room_wifi': '无线互联网',
        'Room_wired': '有线互联网',
        'Public_wifi': '无线互联网',
        'Public_wired': '有线互联网'
    }
    # Keyword -> field name. Several fields share a keyword (the wifi/wired
    # entries), so only one field per keyword survives the inversion.
    reverse_facility_dict = {v: k for k, v in facilities_dict.items()}
    service_dict = {
        'Luggage_Deposit': '行李寄存',
        'front_desk': '24小时前台',
        'Lobby_Manager': '24小时大堂经理',
        '24Check_in': '24小时办理入住',
        'Security': '24小时安保',
        'Protocol': '礼宾服务',
        'wake': '叫醒服务',
        'Chinese_front': '中文前台',
        'Postal_Service': '邮政服务',
        'Fax_copy': '传真/复印',
        'Laundry': '洗衣服务',
        'polish_shoes': '擦鞋服务',
        'Frontdesk_safe': '保险',
        'fast_checkin': '快速办理入住',
        'ATM': '自动柜员机(ATM)/银行服务',
        'child_care': '儿童看护',
        'Food_delivery': '送餐服务'
    }
    reverse_sevice_dict = {v: k for k, v in service_dict.items()}

    for each in resp.get("facilities", ""):
        if each['id'] in ('NO_PETS_ALLOWED', 'PETS_ALLOWED'):
            hotel.pet_type = each['name']
        # Record the facility name under every field whose keyword it contains.
        for fac_value, fac_key in reverse_facility_dict.items():
            if fac_value in each['name']:
                hotel.facility_content[fac_key] = each['name']
        for ser_value, ser_key in reverse_sevice_dict.items():
            if ser_value in each['name']:
                hotel.service_content[ser_key] = each['name']

    fea_str = get_api_server(resp)
    tree = etree.HTML(content2)
    ser_str = get_ota_server(tree, '上网', '互联网', '泳', '退房', '餐', '预定', '停车',
                             '健身', '运动', '泳池', '特色', '服务')
    hotel_services_info = fea_str + ser_str
    # NOTE(review): ``detail`` only ever holds 'hotel_name_en', so 'city' and
    # 'country' below are always ''. Possibly hotel.city / hotel.country were
    # intended -- confirm before changing the stored payload.
    hotel.others_info = json.dumps({
        'city': detail.get('city', ''),
        'country': detail.get('country', ''),
        'first_img': first_img,
        'source_city_id': other_info.get('source_city_id', ''),
        'hotel_services_info': hotel_services_info
    })
    hotel.img_items = get_all_pics(tree)
    hotel.hotel_zip_code = hotel.postal_code
    return hotel.to_dict()

Example #4

Show file

File: elong_parser.py Project: 20113261/platform_service

def elong_parser(content, url, other_info):
    """Parse an eLong hotel detail page into a ``HotelNewBase`` record.

    Every field is extracted on a best-effort basis: each step is wrapped in
    its own try block and falls back to a default (mostly ``'NULL'`` or ``-1``)
    when the page does not have the expected structure.

    :param content: UTF-8 encoded page body.
    :param url: URL of the hotel detail page. URLs starting with
        ``'http://hotel'`` additionally take several fields from the page's
        ``HotelDetailController`` script object.
    :param other_info: dict. ``city_id`` is required. ``source_id`` is
        required unless ``hid`` is truthy, in which case the id is taken from
        the first ``/<digits>/`` segment of ``url``.
    :return: the populated ``HotelNewBase`` instance (not a dict).
    :raises KeyError: if a required ``other_info`` key is missing.
    """
    hotel = HotelNewBase()

    # Pre-bind both names so a parse failure leaves them defined; every later
    # use sits inside a try block and simply falls back to its default.
    root = None
    page_js = None
    try:
        root = HTML.fromstring(content.decode('utf-8'))
        phantom_js = execjs.get('PhantomJS')
        js_str = root.xpath('//script[contains(text(),"window.newDetailController")]/text()')[0]
        page_js = phantom_js.compile(js_str[js_str.index('window.newDetailController'):][:-1])
    except Exception:
        # Other page layout: the data lives in HotelDetailController instead.
        try:
            js_str = root.xpath('//script[contains(text(),"HotelDetailController")]/text()')[0]
            page_js = phantom_js.compile(js_str[js_str.index('HotelDetailController'):][:-1])
        except Exception:
            pass

    # Hotel names. The title is expected to look like "<name>(<english name>)".
    try:
        temp_name = root.xpath('//div[@class="t24"]/@title')[0].strip().encode('utf-8')
        k = temp_name.find('(')
        if k == -1:
            # No parenthesised part: use the whole title for both names (the
            # de-duplication below keeps one). Slicing with find()'s -1 would
            # silently drop the last byte of the title.
            hotel.hotel_name = temp_name
            hotel.hotel_name_en = temp_name
        else:
            j = temp_name.find(')', k + 1)
            hotel.hotel_name = temp_name[:k]
            hotel.hotel_name_en = temp_name[k + 1:j] if j != -1 else temp_name[k + 1:]
    except Exception:
        try:
            hotel.hotel_name = root.find_class('hrela_name-cn')[0].xpath('./text()')[0].strip()
            hotel.hotel_name_en = root.find_class('hrela_name-en')[0].xpath('./text()')[0].strip()
        except Exception:
            pass

    # When both names are identical keep only one of them, chosen by whether
    # the text contains CJK characters.
    # NOTE(review): under Python 2 the isinstance branch looks inverted (a byte
    # ``str`` is the value that would need decoding) -- left unchanged, confirm.
    if hotel.hotel_name == hotel.hotel_name_en:
        if isinstance(hotel.hotel_name, str):
            hotel_name = hotel.hotel_name
        else:
            hotel_name = hotel.hotel_name.decode('utf8')
        if any(map(lambda x: u'\u4e00' <= x <= u'\u9fa5', hotel_name)):
            hotel.hotel_name_en = 'NULL'
        else:
            hotel.hotel_name = 'NULL'

    # Address: text after the full-width colon, with a second layout as
    # fallback.
    try:
        temp = root.xpath('//span[@class="mr5 left"]/text()')
        hotel.address = temp[0].encode('utf-8').strip().split('：')[1]
    except Exception:
        hotel.address = 'NULL'

    if hotel.address == 'NULL':
        try:
            hotel.address = root.xpath('//span[@class="icon-address"]/text()')[0].replace('地址：', '').strip()
        except Exception:
            hotel.address = 'NULL'

    # Coordinates, stored as "lon,lat".
    try:
        lat = re.findall(r'"lat":"([-+\d\.]*)"', content)[0]
        lon = re.findall(r'"lon":"([-+\d\.]*)"', content)[0]
        hotel.map_info = '{},{}'.format(lon, lat)
    except Exception:
        try:
            map_infos = page_js.eval('HotelDetailController').get('AjaxHotelInfo', {}).get('HotelGeoInfo', {})
            lat = map_infos.get('Lat', None)
            lon = map_infos.get('Long', None)
            # Previously ``raise <bool>`` was used here, which always raised
            # TypeError and discarded valid fallback coordinates.
            if lat is None or lon is None:
                hotel.map_info = 'NULL'
            else:
                hotel.map_info = '{0},{1}'.format(lon, lat)
        except Exception:
            hotel.map_info = 'NULL'

    # Star rating: last character of the icon's class attribute.
    try:
        star_temp = root.xpath('//b[contains(@class, "icon_stars")]/@class')[0].encode('utf-8')
        hotel.star = star_temp[-1]
        if hotel.star == ' ':
            hotel.star = -1
    except Exception:
        try:
            star_temp = page_js.eval('window.newDetailController').get('RecommendHotelRequest', {}).get('starLevel', '')
            star_levels = json.loads(star_temp)
            if star_levels:
                hotel.star = star_levels[0]
        except Exception:
            hotel.star = -1

    # Review score.
    try:
        hotel.grade = page_js.eval('window.newDetailController').get('scoreInfo', {}).get('comment_score', '')
    except Exception:
        try:
            grade = root.xpath('//div[@id="hover-hrela"]/p[1]')
            hotel.grade = float(re.search(r'[0-9\.]+', grade[0].text).group(0))
        except Exception:
            try:
                tp = root.xpath('//div[contains(@class, "pertxt_num")]/text()')[0].encode('utf-8')
                hotel.grade = float(tp)
            except Exception:
                hotel.grade = 'NULL'

    # Number of reviews.
    try:
        hotel.review_num = page_js.eval('window.newDetailController').get('scoreInfo', {}).get('comment_count', '')
    except Exception:
        try:
            review_num_str = root.find_class('fl sum-txt')[0].text_content().strip().encode('utf-8')
            # ``grade_pat`` is a pattern defined outside this function.
            hotel.review_num = int(grade_pat.findall(review_num_str)[0])
        except Exception:
            hotel.review_num = -1

    # Description: "title:text" pairs joined with '|', or the first paragraph
    # when no titled paragraph exists.
    try:
        p_tags = root.find_class('dview_info')[0].xpath('dl[1]/dd/p')
        description = ''
        for p in p_tags:
            b_text = p.xpath('./b/text()')  # title
            p_text = p.xpath('./text()')  # description
            if len(b_text):
                description += b_text[0].strip().decode('utf-8') + ':' + p_text[1].strip().decode('utf-8') + '|'
        hotel.description = description[:-1].encode('utf-8')
        if hotel.description == '':
            hotel.description = p_tags[0].text_content().strip().encode('utf-8')
    except Exception:
        hotel.description = 'NULL'

    # Check-in / check-out times.
    try:
        temp_time = root.xpath('//div[@id="iscrollNewAmenities"]/div/dl/dd/text()')[0]. \
            encode('utf-8').strip()
        hotel.check_in_time = temp_time.split('，')[0]
        k = temp_time.find('退房时间：')
        if k != -1:
            # 15 skips the marker itself: five characters of three bytes each
            # in the UTF-8 encoded byte string (assumes a UTF-8 source file).
            hotel.check_out_time = temp_time[k + 15:]
    except Exception:
        hotel.check_out_time = 'NULL'

    # Services, plus the subset that mentions cards.
    accept_card = None
    try:
        service = ''
        accept_card = []
        service_list = root.xpath('//*[@id="serverall"]/li/text()')
        for each in service_list:
            service += each.encode('utf-8').strip() + '|'
            if '卡' in each:
                accept_card.append(each.strip())
        hotel.service = service[:-1]
    except Exception:
        hotel.service = 'NULL'
    if accept_card:
        hotel.accepted_cards = '|'.join(accept_card).encode('utf-8')

    first_img = None
    try:
        pattern_img = root.xpath('//div[@class="newdetaiL-img imgMore"]/@style')[0]
        first_img = re.search(r'url\(([^)]+)\)', pattern_img).group(1)
    except Exception:
        pass

    city_name = 'NULL'
    try:
        city_name = page_js.eval('window.newDetailController')['Region']['RegionName']
    except Exception:
        pass
    hotel.city = city_name
    # NOTE(review): this value is replaced at the end of the function, so
    # city_name / first_img / hid never reach the returned record -- confirm
    # whether the final payload was meant to include them.
    hotel.others_info = json.dumps({'city_name': city_name, 'first_img': first_img, 'hid': other_info.get('hid', 'NULL')})

    # Image list, '|'-separated. Thumbnails in the markup are preferred; the
    # script object is only consulted when the markup has none. (Previously
    # found thumbnails were always lost to a NameError on an unbound list.)
    try:
        img_list = root.xpath('//ul[@class="hrela_spic_list"]/li/img/@src')
        img_urls = [img_src.replace('306', '307') for img_src in img_list]
        if img_urls:
            hotel.img_items = '|'.join(img_urls)
        else:
            controller = page_js.eval('window.newDetailController')
            base_url = urljoin(controller.get('BaseUrl'), 'ihotel_848_470_all/')
            url_map = controller.get('HotelImageTagList', {}).get("urlList", {})
            img_lists = []
            for key in url_map.keys():
                img_lists.extend(url_map.get(key, {}).get('tagUrlList', {}).values())
            img_lists = [base_url + img for img in img_lists]
            hotel.img_items = '|'.join(img_lists).encode('utf-8')
    except Exception:
        hotel.img_items = 'NULL'

    if url.startswith('http://hotel'):
        try:
            hotel_obj = page_js.eval('HotelDetailController')
            lat = hotel_obj.get('googleLat', None)
            lon = hotel_obj.get('googleLng', None)
            hotel.map_info = '{0},{1}'.format(lon, lat)
            hotel.hotel_name = hotel_obj.get('hotelNameCn')
            hotel.hotel_name_en = hotel_obj.get('hotelNameEn')
            hotel.address = hotel_obj.get('hotelAddress')
            hotel.city = hotel_obj.get('cityNameCn') or hotel_obj.get('cityNameEn')
            # NOTE(review): 'starLevel' is stored as the grade here; hotel.star
            # may have been intended -- confirm.
            hotel.grade = hotel_obj.get('starLevel')
            hotel.has_wifi = hotel_obj.get('hasWifi')
            hotel.source_city_id = hotel_obj.get('cityId')
            # A trailing item assignment into the JSON string others_info used
            # to sit here; it always raised TypeError (swallowed below) and the
            # value is replaced further down, so it has been removed.
        except Exception:
            pass

    # The last space-separated token of the address is used as the country.
    hotel.country = hotel.address.split(' ')[-1]
    hotel.source = 'elong'
    hotel.hotel_url = url
    if other_info.get('hid'):
        hotel.source_id = re.search(r'/(\d+)/', url).groups()[0]
    else:
        hotel.source_id = other_info['source_id']
    hotel.city_id = other_info['city_id']
    hotel.others_info = json.dumps({
        'hotel_services_info': hotel.service,
    }, ensure_ascii=False)

    return hotel

Example #5

Show file

File: new_gha_parser.py Project: 20113261/platform_service

def gha_parser(total_content, url, other_info):
    """Parse a GHA hotel page into a record dict.

    Most fields come from two JSON blobs embedded in the page: the argument of
    ``pins.gha_hotel.push(...)`` and the ``application/ld+json`` script. The
    grade and review count are fetched with an extra HTTP request to the first
    protocol-relative ``<script src="//...">`` URL found in the page; when the
    page has no such script they default to ``'0.0'`` and ``0``.

    :param total_content: page body as a string.
    :param url: URL of the hotel page, stored as ``hotel_url``.
    :param other_info: dict; ``city_id`` is read, defaulting to ``"NULL"``.
    :return: the result of ``HotelNewBase.to_dict()`` for the populated record.
    :raises IndexError: if either embedded JSON blob is missing from the page.
    :raises KeyError: if an expected key is missing from those blobs.
    """
    hotel = HotelNewBase()
    hotel.city_id = other_info.get("city_id", "NULL")

    select = etree.HTML(total_content)
    info_pat = re.compile(r"pins\.gha_hotel\.push\((.*?)\)", re.S)
    ld_json_pat = re.compile(
        r'<script type="application/ld\+json">(.*?)</script>', re.S)
    # Tabs are stripped before parsing the ld+json blob.
    address = json.loads(ld_json_pat.findall(total_content)[0].replace('\t', ''))
    info = json.loads(info_pat.findall(total_content)[0])
    hotel.hotel_name = info["title"]
    hotel.hotel_name_en = address["name"]
    hotel.source = "gha"
    hotel.source_id = info["id"]
    hotel.brand_name = info["brand_name"]
    hotel.map_info = str(info["lon"]) + "," + str(info["lat"])
    # NOTE(review): "adress" is kept exactly as written; verify against the
    # page markup whether an <address> element was meant.
    hotel.address = ''.join(select.xpath("//adress/text()")).strip()
    hotel.country = address["address"]["addressCountry"]
    hotel.city = address["address"]["addressLocality"]
    hotel.postal_code = address["address"]["postalCode"]
    hotel.star = '5'
    # NOTE(review): this stores the xpath result list rather than a single
    # URL -- confirm what HotelNewBase expects for Img_first.
    hotel.Img_first = select.xpath(
        "//div[@class='FlexEmbed-item']/span/img/@src")
    hotel.hotel_phone = address.get("telephone", 'NULL')
    hotel.hotel_zip_code = address["address"]["postalCode"]
    service = select.xpath('//ul[@class="prop-Amenities"]/li/span/text()')
    servicestr = ''.join(service)
    description = select.xpath("//div[@id='content-about-hotel']/p/text()")
    hotel.description = ''.join(description)

    # Keyword spotting in the amenity list / description text.
    if u'无线' in servicestr:
        hotel.facility["Room_wifi"] = u'无线上网'
        hotel.facility["Public_wifi"] = u'无线上网'
    if u'泳' in servicestr:
        hotel.facility["Swimming_Pool"] = u'泳池'
    if u'健身' in servicestr:
        hotel.facility["gym"] = u"健身中心"
    if u'水疗' in servicestr:
        hotel.facility['Mandara_Spa'] = u"水疗中心"
    if u'酒吧' in hotel.description:
        hotel.facility["Bar"] = u'酒吧'
    if u'儿童俱乐部' in hotel.description:
        hotel.facility["Recreation"] = u"儿童俱乐部"
    if u'餐' in servicestr:
        hotel.facility["Restaurant"] = u"餐饮"
    if u'商务中心' in servicestr:
        hotel.facility["Business_Centre"] = u'商务中心'
    if u'亲子' in servicestr:
        hotel.feature["Parent_child"] = u'亲子'

    # Banner images: the URL sits inside the inline style's url('...').
    img_list = select.xpath('//div[@class="RotateBanner-itemImg"]/span/@style')
    imgurl = re.compile(r"url\('(.*?)'\)")
    imgurl_list = []
    for img in img_list:
        found = imgurl.search(img)
        # Skip a style without a url('...') instead of failing the whole parse.
        if found:
            imgurl_list.append(found.group(1))
    hotel.img_items = '|'.join(imgurl_list)

    hotel.check_in_time = '14:00'
    hotel.check_out_time = '12:00'
    hotel.hotel_url = url

    script_src_pat = re.compile('<script src="//(.*?)"')
    urls = script_src_pat.findall(total_content)
    # Guard the empty list as well: indexing it used to raise IndexError.
    if not urls or not urls[0]:
        hotel.grade = '0.0'
        hotel.review_num = 0
        return hotel.to_dict()

    # NOTE(review): no timeout is passed, so a stalled server blocks this call.
    comment = requests.get("http://" + urls[0]).content
    # The widget payload is escaped script text, hence the literal \" and \n.
    grade = re.compile(r'<div class=\\"rating-value\\">\\n(.*?)%', re.S)
    try:
        # The widget reports a percentage; it is stored divided by ten.
        hotel.grade = str(float(grade.findall(comment)[0].strip()) / 10)
    except (IndexError, ValueError):
        hotel.grade = '0.0'
    review = re.compile(r'<div class=\\"review-count\\">\\n(.*?)reviews',
                        re.S)
    try:
        hotel.review_num = review.findall(comment)[0].strip()
    except IndexError:
        hotel.review_num = 0

    result = hotel.to_dict()
    print(result)
    return result

Example #6

Show file

def hotels_parser(content, url, other_info):
    hotel = HotelNewBase()
    content = content.decode('utf-8')
    root = HTML.fromstring(content)

    ""
    try:
        source_city_id = re.findall(r'\"cityId\":(\d+),', content)[0]
        hotel.source_city_id = source_city_id.encode('utf8')
    except:
        #print e
        pass

    #print 'source_city_id=>%s' % hotel.source_city_id

    try:
        name_temp = root.xpath(
            '//div[@class="property-description"]/div[@class="vcard"]/h1/text()'
        )[0]
    except:
        #print(str(e))
        pass

    try:
        args = re.split('[(（]', name_temp, 2)
        # hotel.hotel_name = name_temp.split('(')[0].strip().encode('utf-8')
        hotel.hotel_name = args[0].strip().encode('utf-8')
        #print('hotel_name=>%s' % hotel.hotel_name)
        try:
            hotel.hotel_name_en = args[-1].rsplit('-', 1)[0].replace(
                ')', '').replace('）', '').strip().encode('utf-8')
            # hotel.hotel_name_en = re.findall('\(([\s\S]+?)\)', name_temp)[0].strip().encode('utf-8')
        except Exception:
            pass
        #print('hotel_name_en=>%s' % hotel.hotel_name_en)
    except:
        #print(str(e))
        pass

    if hotel.hotel_name_en == 'NULL' and hotel.hotel_name == 'NULL':
        try:
            name_temp = root.xpath('//*[@class="vcard"]/h1/text()')[0].encode(
                'utf-8')
        except:
            #print(str(e))
            pass

        try:
            hotel.hotel_name = name_temp.split('(')[0].strip().encode('utf-8')
            #print ('hotel_name=>%s' % hotel.hotel_name)
            try:
                hotel.hotel_name_en = name_temp.split('(')[1].replace(
                    ')', '').strip().encode('utf-8')
            except Exception:
                hotel.hotel_name_en = 'NULL'
            # hotel.source_id = root.xpath('//*[@id="roomdesc_mainContainerSize1"]/input[1]/@value')[0]
            #print ('hotel_name_en=>%s' % hotel.hotel_name_en)
        except:
            #print (str(e))
            pass
    # -- fengyufei
    if len(re.findall('[\x80-\xff]+', str(hotel.hotel_name_en))) > 0:
        #print '------va---'
        name_temp = root.xpath(
            '//div[@class="widget-query-group widget-query-destination"]/input/@value'
        )[0]
        # re.findall('[a-zA-Z ]+',name_temp)
        hotel.hotel_name_en = re.findall('\((.*?)\)',
                                         name_temp)[0].encode('utf8')

        try:
            hotel.hotel_name = name_temp.split('({}'.format(
                hotel.hotel_name_en))[0].strip()
        except Exception:
            pass
        # # 城市清除
        # if '-' in hotel.hotel_name:
        #     hotel.hotel_name = hotel.hotel_name.split('-')[0].strip()

    # 城市清除
    if hotel.hotel_name_en in hotel.hotel_name:
        if '-' in hotel.hotel_name:
            hotel.hotel_name = hotel.hotel_name.split('-')[0].strip()

        if hotel.hotel_name == hotel.hotel_name_en:
            hotel.hotel_name = 'NULL'

    #print('hotel_name=>%s' % hotel.hotel_name)
    #print('hotel_name_en=>%s' % hotel.hotel_name_en)

    try:
        hotel.address = root.find_class('postal-addr')[0].text_content() \
            .encode('utf-8').strip().replace('\n', '').replace('  ', '')
    except:
        hotel.address = 'NULL'
    #print ('address=>%s' % hotel.address)
    # #print hotel.address
    try:
        temp = root.find_class('visible-on-small map-widget-wrapper')[0].xpath(
            'div/@style')[0].encode('utf-8').strip()
        map_info = map_pat.findall(temp)[0]
        hotel.map_info = map_info.split(',')[1] + ',' + map_info.split(',')[0]
    except:
        # #print str(e)
        hotel.map_info = 'NULL'
    #print ('map_info=>%s' % hotel.map_info)
    # #print hotel.map_info
    try:
        # hotel.postal_code = root.find_class('postal-code')[0].text.strip() \
        #     .encode('utf-8').replace(',', '')
        hotel.postal_code = root.xpath(
            '//span[@itemprop="postalCode"]/text()')[0].strip().encode('utf-8')
    except:
        hotel.postal_code = 'NULL'

    #print('postal_code=>%s' % hotel.postal_code)
    # #print hotel.postal_code
    star_nums = 0
    try:
        # temp_star = root.xpath('//div [@class="vcard"]/span/span')
        # #print 'dasdsadsafdfd'
        # #print temp_star
        temp_star = root.xpath(
            '//div[@class="vcard"]/span/text()')[0].strip().encode('utf-8')
        # for i in temp_star:
        #     if i.xpath('./@class')[0] == 'icon icon-star':
        #         star_nums += 1
        #     if i.xpath('./@class')[0] == 'icon icon-star icon-star-scale icon-star-half-parent':
        #         star_nums += 0.5
        star_nums = re.findall(r'\d+', temp_star)
        hotel.star = int(star_nums[0])
    except:
        hotel.star = -1.0
    #print ('star=>%s' % hotel.star)
    # #print hotel.star
    try:
        hotel.grade = root.find_class('rating')[0].xpath('strong/text()')[0]
        hotel.grade = float(hotel.grade)
    except:
        try:
            if not hotel.grade:
                grade = root.xpath('//div[@class="logo-wrap"]/span[1]/text()'
                                   )[0].encode('utf-8')
                grade = re.search(r'[0-9\.]+', grade).group(0)
                hotel.grade = float(grade)
        except:
            #print(e)
            hotel.grade = -1.0

    #print ('hotel.grade=>%s' % hotel.grade)
    # #print hotel.grade
    try:
        review_num_temp = root.find_class('total-reviews')[0].text
        review_num = num_pat.findall(review_num_temp)[0]
        hotel.review_num = int(review_num)
    except:
        hotel.review_num = -1

    #print ('review_num_temp=>%s' % hotel.review_num)
    # #print hotel.review_num

    first_img = None
    try:
        img_list = root.xpath(
            '//div[@id="carousel-container"]/div[1]/ul/li[@data-src]')
        hotel.img_items = ''
        for i, li in enumerate(img_list):
            src = li.xpath('./@data-src')
            if len(src):
                size = li.xpath('./@data-sizes')
                if 'z' in size[0]:
                    img_url = src[0].strip().encode('utf-8').replace(
                        '{size}', 'z')
                else:
                    if 'y' in size[0]:
                        img_url = src[0].strip().encode('utf-8').replace(
                            '{size}', 'y')
                    else:
                        img_url = src[0].strip().encode('utf-8').replace(
                            '{size}', 'n')
                if i == 0:
                    first_img = img_url
                hotel.img_items += img_url + '|'
        hotel.img_items = hotel.img_items[:-1]

        if not hotel.img_items:
            img_list = root.xpath(
                '//div[@id="carousel-container"]/div[1]/ul/li[@data-desktop]')
            hotel.img_items = ''
            for i, li in enumerate(img_list):
                img_url = li.xpath('./@data-desktop')[0]
                hotel.img_items += img_url + '|'

        hotel.img_items = hotel.img_items[:-1]
        # image_list = root.find_class('carousel-thumbnails')[0].xpath('ol/li')
        # hotel.img_items = ''
        # image_name = ''
        # hotel.img_items = ''
        # for each_image_ele in image_list:
        #     image_url = each_image_ele.xpath('a/@href')[0]
        #     hotel.img_items += image_url + '|'
        # hotel.img_items = hotel.img_items[:-1]
    except:
        hotel.img_items = 'NULL'

    #print ('hotel_img_items=>%s' % hotel.img_items)
    #print 'first_img=>%s' % first_img
    # #print hotel.img_items

    try:
        description_temp = root.get_element_by_id('overview').xpath('b/text()')[0] \
            .encode('utf-8').strip()
        hotel.description = description_temp
    except:
        #print (str(e))
        hotel.description = 'NULL'

    if hotel.description == 'NULL':
        try:
            hotel.description = root.xpath(
                '// div[@class="tagline"]')[0].text_content().strip()
        except:
            #print(str(e))
            hotel.description = 'NULL'

    #print ('description=>%s' % hotel.description)
    # #print hotel.description

    total_service = ''
    service_1 = ''
    try:
        service_list = root.xpath('//div[@id="overview-columns"]/div')
        for div in service_list:
            title = div.xpath('./h3/text()')[0].strip().encode('utf-8') + '|'
            li_list = div.xpath('./ul/li/text()')
            for li in li_list:
                title += li.strip().encode('utf-8') + '|'
            # delete last comma
            service_1 += title[:-1] + '|'
            # service_list = root.find_class('main-amenities two-columned')[0].xpath('ul/li')
            # for each in service_list:
            #     service += each.text_content().encode('utf-8').strip() + '|'
    except:
        #print (str(e))
        service_1 = ''
    service_in_hotel_room = ''
    try:
        in_hotel_room_dom_list = root.find_class(
            'fact-sheet expandable-content')
        for in_hotel_room in in_hotel_room_dom_list:
            li_dom_list = in_hotel_room.xpath('.//li')
            for li_dom in li_dom_list:
                service_in_hotel_room += li_dom.text_content().strip().replace(
                    ' ', '').encode('utf-8') + '|'
    except:
        pass

    service_glance = ''
    try:
        glance_dom = root.get_element_by_id('at-a-glance')
        glance_li_list = glance_dom.xpath('.//li')
        for li_dom in glance_li_list:
            service_glance += li_dom.text_content().strip().replace(
                ' ', '').encode('utf-8') + '|'
    except:
        pass

    total_service = service_1 + service_in_hotel_room + service_glance
    if total_service:
        hotel.service = total_service[:-1]
    else:
        hotel.service = total_service
    #print ('service=>%s' % hotel.service)
    # #print hotel.service

    # try:
    #     temp = root.find_class('col-6-24 travelling-container resp-module')[0]
    #     wifi_text = temp.text_content()  # HTML.tostring(temp).encode('utf-8').strip()
    #     if 'WiFi' in wifi_text:
    #         hotel.has_wifi = 'Yes'
    #         if '免费WiFi' in wifi_text:
    #             hotel.is_wifi_free = 'Yes'
    #         else:
    #             hotel.is_wifi_free = 'No'
    #     else:
    #         hotel.has_wifi = 'No'
    #         hotel.is_wifi_free = 'NO'
    # except:
    #     print(str(e))
    # hotel.has_wifi = 'NULL'

    #print ('has_wifi=>%s' % hotel.has_wifi)
    #print ('is_wifi_free=>%s' % hotel.is_wifi_free)
    # #print hotel.has_wifi

    # try:
    #     temp = root.find_class('col-6-24 transport-container last resp-module')[0]
    #     car_text = temp.text_content()  # HTML.tostring(temp).encode('utf-8').strip()
    #     if '无停车' not in car_text:
    #         hotel.has_parking = 'Yes'
    #         if '免费自助停车' in car_text:
    #             hotel.is_parking_free = 'Yes'
    #         else:
    #             hotel.is_parking_free = 'No'
    #     else:
    #         hotel.has_parking = 'No'
    #         hotel.is_parking_free = 'No'
    # if car_text.find('免费自助停车'):
    #     hotel.has_parking = 'Yes'
    #     hotel.is_parking_free = 'Yes'
    # if car_text.find('停车场'):
    #     hotel.has_parking = 'Yes'
    # except:
    #print(str(e))
    # hotel.has_parking = 'NULL'
    # hotel.is_parking_free = 'NULL'
    #print ('has_park=>%s' % hotel.has_parking)
    # #print hotel.has_parking

    #print ('is_parking_free=>%s' % hotel.is_parking_free)
    # #print hotel.is_parking_free

    try:
        # temp = root.xpath('//*[@id="at-a-glance"]/div/div[1]/div[2]/div/ul[2]')[0]
        # check_in_time = temp.xpath('./li[1]/text()')[0]
        # check_out_time = temp.xpath('./li[2]/text()')[0]
        temp_titles = root.xpath(
            '//div[@class=" col-8-24 key-facts-container resp-module"]/div')[0]
        for title_i, title in enumerate(temp_titles.xpath('./h4')):
            if title.text == '抵達/離開':
                break
        temp_check_times = temp_titles.xpath('ul')[title_i]
        check_in_time = temp_check_times.xpath('./li[1]/text()')[0]
        check_out_time = temp_check_times.xpath('./li[2]/text()')[0]
    except:
        #print(str(e))
        check_in_time = 'NULL'
        check_out_time = 'NULL'

    hotel.check_in_time = check_in_time.encode('utf-8')
    hotel.check_out_time = check_out_time.encode('utf-8')
    #print('hotelcheck_in_time=>%s' % hotel.check_in_time)
    # #print hotel.check_in_time
    #print('hotel_check_out_time=>%s' % hotel.check_out_time)
    # #print hotel.check_out_time
    hotel.source = 'hotels'

    hotel.hotel_url = url
    if other_info.get('hid'):
        hotel.source_id = re.search('/ho(\d+)/', url).groups()[0]
        # hotel.source_id = re.search('"cityId": ?(\d+)', content).groups()[0]
    else:
        hotel.source_id = other_info['source_id']
    hotel.city_id = other_info['city_id']

    tmp_others_info = {
        'hid': other_info.get('hid', 'NULL'),
        'hotel_services_info': hotel.service,
    }
    if first_img:
        tmp_others_info['first_img'] = first_img
    hotel.others_info = json.dumps(tmp_others_info, ensure_ascii=False)

    return hotel

Example #7

Show file

File: new_booking_parser.py Project: 20113261/platform_service

def _stripped_texts(root, path):
    """Return the non-empty text nodes matched by ``path`` under ``root``.

    Newlines are removed and surrounding whitespace is stripped from every
    matched node; nodes that end up empty are dropped.
    """
    cleaned = (node.replace('\n', '').strip() for node in root.xpath(path))
    return [text for text in cleaned if text]


def booking_parser(content, url, other_info):
    """Parse a 'booking' hotel detail page into a ``HotelNewBase`` record.

    Args:
        content: Raw HTML of the hotel page. It is searched with regular
            expressions and also parsed with ``HTML.fromstring``.
        url: URL of the page; stored (UTF-8 encoded) as ``hotel_url``.
        other_info: Mapping that must provide ``source_id`` and ``city_id``.

    Returns:
        The result of ``hotel.to_dict()``.

    Raises:
        IndexError: When a mandatory pattern (hotel name, coordinates,
            ld+json block, city, postal code, check-in/check-out caption)
            is not found in the page.
        KeyError: When the ld+json block or ``other_info`` lacks a key that
            is read here.
    """
    hotel = HotelNewBase()
    try:
        root = HTML.fromstring(content)
    except Exception as e:
        # NOTE(review): ``root`` stays unbound when parsing fails, so the
        # first ``root.xpath`` call below raises NameError.
        print(e.message)
    hotel.hotel_name = re.findall(r'b_hotel_name:.*?\'(.+?)\',',
                                  content)[0].strip()
    hotel.hotel_name_en = re.findall(r'hotelName:.*?\"(.+?)\",',
                                     content)[0].strip()
    hotel.source = 'booking'
    hotel.source_id = other_info['source_id']
    latitude = re.findall(r'b_map_center_latitude = (.*?);',
                          content)[0].strip()
    longitude = re.findall(r'b_map_center_longitude = (.*?);',
                           content)[0].strip()
    hotel.map_info = '{},{}'.format(latitude, longitude)
    # The structured ld+json block supplies address, rating and lead image.
    location_dict = json.loads(
        re.findall(r'<script type="application/ld\+json">(.*?)</script>',
                   content, re.S)[0].replace('\n', '').strip())
    hotel.address = location_dict['address']['streetAddress']
    hotel.city = re.findall(r'city_name:.*?\'(.*?)\'', content)[0].strip()
    hotel.country = location_dict['address']['addressCountry']
    hotel.city_id = other_info['city_id']
    hotel.postal_code = re.findall(r'"postalCode".*?\"(.*?)\"', content,
                                   re.S)[0].strip()
    try:
        hotel.star = root.xpath(
            '//*[@id="wrap-hotelpage-top"]/div[@class="hp__hotel-title"]/span/span[@class="hp__hotel_ratings__stars nowrap"]/i/@title'
        )[0].encode('utf-8').replace('星级酒店', '')
    except IndexError as e:
        # Star rating is optional: report and carry on.
        print('Parser ERROR, NO Star Infomation.The reason follows: %s' %
              e.message)
    hotel.grade = location_dict['aggregateRating']['ratingValue']
    hotel.review_num = location_dict['aggregateRating']['reviewCount']
    hotel.Img_first = location_dict['image']
    try:
        hotel.traffic = ','.join([
            root.xpath('//*[@id="public_transport_options"]/div/text()')
            [1].strip('\n').strip(),
            root.xpath(
                '//*[@id="public_transport_options"]/ul/li/div[1]/text()')
            [1].strip('\n').strip(),
            root.xpath(
                '//*[@id="public_transport_options"]/ul/li/div[2]/text()')
            [0].strip('\n').strip()
        ])
    except IndexError as e:
        # Traffic information is optional: report and carry on.
        print('Parser ERROR, NO Traffic Infomation.The reason follows: %s' %
              e.message)
    hotel.chiled_bed_type = ''.join(_stripped_texts(
        root,
        '//*[@id="children_policy"]/p[position()>1]//text()|//*[@id="general-child-policy"]/p[position()>1]//text()'
    ))
    hotel.pet_type = ''.join(_stripped_texts(
        root,
        '//*[@id="hotelPoliciesInc"]/div[@class="description"]/p[position()>1]//text()'
    ))
    # data-section-id values used by the page: -2 pets, 1 general
    # facilities, 2 activities, 3 services, 5 bathroom, 6 media/technology,
    # 7 food and drink, 11 internet, 13 outdoors, 16 parking, 17 bedroom,
    # 21 pool and wellness, 27 business facilities.
    hot_facilities = _stripped_texts(
        root,
        '//*[@id="hp_facilities_box"]/div[@class="facilities-sliding-keep"]/div/div[@class="important_facility "]//text()'
    )
    wifi = ''.join(_stripped_texts(
        root,
        '//*[@id="hp_facilities_box"]//div[@data-section-id=11]/ul/li[@class="policy"]/p/span//text()'
    ))
    if u'免费无线网络连接' in hot_facilities or u'免费！住宿方于各处提供WiFi（免费）。' in wifi:
        hotel.facility_content['Public_wifi'] = wifi
    elif u'免费！住宿方于客房提供WiFi（免费）。' in wifi:
        hotel.facility_content['Room_wifi'] = wifi
    elif u'客房' in wifi and u'有线网络' in wifi:
        hotel.facility_content['Room_wired'] = wifi
    # Parenthesised on purpose: ``and`` binds tighter than ``or``, so without
    # the parentheses any text mentioning public areas was stored as wired
    # internet even when wired internet was never mentioned.
    elif (u'公共' in wifi or u'各处' in wifi) and u'有线网络' in wifi:
        hotel.facility_content['Public_wired'] = wifi
    parking = ''.join(_stripped_texts(
        root,
        '//*[@id="hp_facilities_box"]//div[@data-section-id=16]//p//text()'
    ))
    hotel.facility_content['Parking'] = parking

    # New facility fields added to facilities_dict are matched automatically.
    facilities_dict = {
        'Swimming_Pool': ['游泳池'],
        'gym': ['健身房'],
        'SPA': ['SPA'],
        'Bar': ['酒吧'],
        'Coffee_house': ['咖啡厅'],
        'Tennis_court': ['网球场'],
        'Golf_Course': ['高尔夫球场'],
        'Sauna': ['桑拿'],
        'Mandara_Spa': ['水疗中心'],
        'Recreation': ['儿童娱乐场', '儿童游乐场'],
        'Business_Centre': ['商务中心'],
        'Lounge': ['行政酒廊'],
        'Wedding_hall': ['婚礼礼堂'],
        'Restaurant': ['餐厅'],
        'Airport_bus': ['机场班车', '班车服务', '班车服务(收费)'],
        'Valet_Parking': ['代客泊车'],
        'Call_service': ['叫车服务'],
        'Rental_service': ['租车服务']
    }
    part_facilities = [
        x.encode('utf-8').replace('\n', '').strip() for x in root.xpath(
            '//*[@id="hp_facilities_box"]/div[@class="facilitiesChecklist"]/div/ul/li/span[@data-name-en]/text()'
        )
    ]
    parser_list = []
    for every in part_facilities:
        # Normalise abbreviated labels so they contain the dictionary
        # keywords, and upper-case so 'SPA' matches regardless of case.
        value = every.replace('咖啡', '咖啡厅').replace('网球', '网球场').replace(
            '健身', '健身房').replace('儿童娱乐', '儿童游乐').upper()
        for keys, faci in facilities_dict.items():
            for fac in faci:
                if fac in value:
                    if keys in hotel.facility_content:
                        hotel.facility_content[
                            keys] = hotel.facility_content[keys] + ',' + every
                    else:
                        hotel.facility_content[keys] = every
                    parser_list.append(every)
    print('酒店设施：{}'.format(', '.join(part_facilities)))
    print('已解析出：%s' % ', '.join(parser_list))
    service_list = [
        x.encode('utf-8').replace('\n', '').strip() for x in root.xpath(
            '//*[@id="hp_facilities_box"]//div[@data-section-id=3]/ul/li/span[1]/text()'
        )
    ]

    # New service fields added to service_dict are matched automatically.
    service_dict = {
        'Luggage_Deposit': '行李寄存',
        'front_desk': '24小时前台',
        'Lobby_Manager': '24小时大堂经理',
        '24Check_in': '24小时办理入住',
        'Security': '24小时安保',
        'Protocol': '礼宾服务',
        'wake': '叫醒服务',
        'Chinese_front': '中文前台',
        'Postal_Service': '邮政服务',
        'Fax_copy': '传真/复印',
        'Laundry': '洗衣服务',
        'polish_shoes': '擦鞋服务',
        'Frontdesk_safe': '前台保险柜',
        'fast_checkin': '快速办理入住/退房',
        'ATM': '自动柜员机(ATM)/银行服务',
        'child_care': '儿童看护服务',
        'Food_delivery': '送餐服务'
    }
    reverse_sevice_dict = {v: k for k, v in service_dict.items()}
    parser_sevice_list = []
    # NOTE(review): services are matched against ``part_facilities`` (the
    # facility checklist), while ``service_list`` is only printed below.
    # Confirm whether that is intended before changing it.
    for every in part_facilities:
        for serv in service_dict.values():
            value = serv.replace('服务', '')
            if value in every:
                hotel.service_content[reverse_sevice_dict[serv]] = every
                parser_sevice_list.append(every)
    print('酒店服务：{}'.format(', '.join(service_list)
                           or '如果你看见了这句话请不要好奇，它表示酒店服务项目是空的'))
    print('已解析出：%s' % ', '.join(parser_sevice_list))
    hotel.img_items = '|'.join(
        root.xpath('//*[@id="photos_distinct"]/a[position()<last()-1]/@href'))
    if not hotel.img_items:
        # Fall back to the photo-grid thumbnail links.
        hotel.img_items = '|'.join(
            root.xpath('//div[@class="bh-photo-grid-thumb-cell"]/a/@href'))
    hotel.description = '\n'.join(
        x.strip() for x in root.xpath('//*[@id="summary"]/p/text()'))
    a = root.xpath(
        '//*[@class="jq_tooltip payment_methods_overall"]/button/@aria-label|'
        '//div[contains(@class, "payment_promotion_labels")]/label/span/text()'
    )

    hotel.accepted_cards = '|'.join(a)
    # The captions may embed <script> blocks; strip them before storing.
    hotel.check_in_time = re.sub(
        pattern=r'<script.+?script>',
        repl='',
        string=root.xpath('//*[@id="checkin_policy"]/p/span/@data-caption')
        [0].encode('utf-8'),
        flags=re.S).strip()
    hotel.check_out_time = re.sub(
        pattern=r'<script.+?script>',
        repl='',
        string=root.xpath('//*[@id="checkout_policy"]/p/span/@data-caption')
        [0].encode('utf-8'),
        flags=re.S).strip()
    hotel.hotel_url = url.encode('utf-8')
    print(json.dumps(hotel.to_dict(), ensure_ascii=False))
    return hotel.to_dict()

Example #8

Show file

File: shangrila_hotel_spider.py Project: 20113261/platform_service

    def parse_detail(self, req, resp):
        """Handle one hotel sub-page and merge what it yields into the spider.

        The request URL selects the branch:

        * ``about`` + ``service``: fills ``self.item['service']`` and
          ``self.item['facility']`` (both become ``"NULL"`` on any error).
        * ``about`` + ``map``: fills ``latitude``, ``longitude`` and
          ``traffic`` in ``self.item``.
        * ``reviews``: sets ``self.flag`` and stores the review iframe URL in
          ``self.review_url``.

        Args:
            req: Request description; ``req['req']['url']`` is read.
            resp: Response body, parsed with ``etree.HTML``.

        Raises:
            parser_except.ParserException: Code 22 when the map page has no
                coordinates or the reviews page has no review iframe.
        """
        tree = etree.HTML(resp)
        req_url = req['req']['url']
        self.item['source'] = 'shangrila'
        self.item['brand_name'] = '香格里拉'

        if 'about' in req_url:
            if 'service' in req_url:

                hotel2 = Hotel_New()

                try:
                    service_all = tree.xpath(
                        "//div[@class='control2_1column']/ul/li/text()")

                    facilities_dict = {
                        'Swimming_Pool': ['游泳池'],
                        'gym': ['健身房'],
                        'SPA': ['SPA'],
                        'Bar': ['酒吧'],
                        'Coffee_house': ['咖啡厅'],
                        'Tennis_court': ['网球场'],
                        'Golf_Course': ['高尔夫球场'],
                        'Sauna': ['桑拿'],
                        'Mandara_Spa': ['水疗中心'],
                        'Recreation': ['儿童娱乐场', '儿童游乐场'],
                        'Business_Centre': ['商务中心'],
                        'Lounge': ['行政酒廊'],
                        'Wedding_hall': ['婚礼礼堂'],
                        'Restaurant': ['餐厅'],
                        'Airport_bus': ['机场班车', '班车服务', '班车服务(收费)'],
                        'Valet_Parking': ['代客泊车'],
                        'Call_service': ['叫车服务'],
                        'Rental_service': ['租车服务'],
                        'Room_wifi': ['客房无线网络'],
                        'Room_wired': ['客房有线网络'],
                        'Public_wifi': ['公共区域无线上网'],
                        'Public_wired': ['公共区域有线网络']
                    }
                    service_dict = {
                        'Luggage_Deposit': '行李寄存',
                        'front_desk': '24小时前台',
                        'Lobby_Manager': '24小时大堂经理',
                        '24Check_in': '24小时办理入住',
                        'Security': '24小时安保',
                        'Protocol': '礼宾服务',
                        'wake': '叫醒服务',
                        'Chinese_front': '中文前台',
                        'Postal_Service': '邮政服务',
                        'Fax_copy': '传真/复印',
                        'Laundry': '洗衣',
                        'polish_shoes': '擦鞋服务',
                        'Frontdesk_safe': '保险',
                        'fast_checkin': '快捷入住及退房服务',
                        'ATM': '自动柜员机(ATM)/银行服务',
                        'child_care': '儿',
                        'Food_delivery': '送餐服务'
                    }
                    reverse_sevice_dict = {
                        v: k
                        for k, v in service_dict.items()
                    }
                    for service in service_all:
                        for keys, fac_values in facilities_dict.items():
                            # Each value is a list of keywords, so test the
                            # keywords one by one; testing the list itself
                            # against the text raises TypeError, which the
                            # handler below turned into "NULL" results.
                            if any(fac in service for fac in fac_values):
                                service = self.clean_data(service)
                                if keys in hotel2.facility:
                                    hotel2.facility[keys] = hotel2.facility[
                                        keys] + ',' + service
                                else:
                                    hotel2.facility[keys] = service
                        for sev_value in service_dict.values():
                            if sev_value in service:
                                service = self.clean_data(service)
                                hotel2.service[
                                    reverse_sevice_dict[sev_value]] = service
                    self.item['service'] = hotel2.service
                    self.item['facility'] = hotel2.facility
                except Exception:
                    # Best effort: any failure leaves both fields as "NULL".
                    self.item['service'] = "NULL"
                    self.item['facility'] = "NULL"

            elif 'map' in req_url:
                try:
                    latitude = re.compile(r'"Lat":"(.*?)"',
                                          re.S).findall(resp)[0]
                    longitude = re.compile(r'"Lng":"(.*?)"',
                                           re.S).findall(resp)[0]
                except Exception:
                    # Missing coordinates are reported as an invalid proxy.
                    raise parser_except.ParserException(22, '代理失效')
                self.item['latitude'] = latitude
                self.item['longitude'] = longitude
                map_list = tree.xpath(
                    "//div[@class='control2_1column']/div[@class='map-list']/div/h4/text()"
                )
                self.item['traffic'] = "NULL"
                traffic_str_all = ""
                # XPath positions are 1-based: the n-th heading is paired
                # with the paragraphs of the n-th 'map-list' block.
                for index, tra_str in enumerate(map_list, 1):
                    traffic_str_l = tree.xpath(
                        "//div[@class='control2_1column']/div[@class='map-list'][{}]/div/p/text()"
                        .format(index))
                    traffic_str = " ".join(traffic_str_l).strip().replace(
                        " ", "")
                    traffic_str_all += tra_str + ":" + traffic_str
                    self.item['traffic'] = traffic_str_all
                return

        elif 'reviews' in req_url:
            self.flag = True

            try:
                link = tree.xpath(
                    '//iframe[contains(@id, "ChildFrame")]/@src')[0]
            except Exception:
                raise parser_except.ParserException(22, 'proxy error')

            self.review_url = link
            if 'http' not in link:
                # Prefix a scheme when the iframe src does not contain one.
                self.review_url = 'http:' + link

            self.review_url = self.review_url.strip()

Example #9

Show file

File: shangrila_hotel_spider.py Project: 20113261/platform_service

    def parse_hotel(self, req, resp):
        """Handle a review-summary or photo-list response for the hotel.

        Review pages (URL containing ``meta_review``, ``seal`` or
        ``partnerId``) update ``self.hotel.grade`` and
        ``self.hotel.review_num``. Photo pages (``HotelPhotoVideoJson`` or
        ``getphotosvideos``) build a fresh ``Hotel_New`` from ``self.item``
        and the image list.

        Args:
            req: Request description; ``req['req']['url']`` is read.
            resp: Response body: HTML text for review pages, an iterable of
                mappings with an ``image`` key for photo pages.

        Returns:
            For photo pages, ``json.loads(self.hotel.to_dict())``; otherwise
            ``None``.
        """
        req_url = req['req']['url']

        if 'meta_review' in req_url or 'seal' in req_url or 'partnerId' in req_url:
            node = etree.HTML(resp)

            try:
                grade = node.xpath(
                    '//div[@class="summary"]/p/span[2]/text()|//div[@class="value"]/text()'
                )[0]
                grade = grade.replace('\n', '').strip()
            except Exception:
                grade = ''
            # Store the grade on both paths; it used to be assigned only in
            # the failure branch, so a parsed grade was thrown away.
            self.hotel.grade = grade

            count_pattern = re.compile(r'\d*,*\d+')
            try:
                try:
                    self.review_num = count_pattern.findall(
                        node.xpath(
                            '//h1/text()|//div[@class="counter"]/text()')[0])[0]
                except Exception:
                    # Alternative layout for the review counter.
                    self.review_num = count_pattern.findall(
                        node.xpath('//div[@class="numReviews"]/text()')[0])[0]
            except Exception:
                self.review_num = ''
            try:
                self.hotel.review_num = self.review_num
            except Exception:
                self.hotel.review_num = ''

        elif 'HotelPhotoVideoJson' in req_url or 'getphotosvideos' in req_url:
            self.hotel = Hotel_New()
            self.hotel.hotel_name = self.hotel_name
            self.hotel.hotel_name_en = self.item['hotel_name_en']
            self.hotel.source = self.item['source']
            self.hotel.source_id = self.item['source_id']
            self.hotel.brand_name = self.item['brand_name']
            try:
                self.hotel.map_info = '{},{}'.format(self.item['longitude'],
                                                     self.item['latitude'])
            except Exception:
                # Coordinates may be absent from self.item.
                self.hotel.map_info = ''
            self.hotel.address = self.item['address']
            self.hotel.city = self.item['city']
            self.hotel.country = self.country
            self.hotel.postal_code = self.item['postal_code']
            self.hotel.star = self.item['star']
            self.hotel.facility = self.item['facility']
            self.hotel.service = self.item['service']
            self.hotel.description = self.item['description']
            self.hotel.accepted_cards = self.item['accepted_cards']
            self.hotel.check_in_time = self.item['check_in_time']
            self.hotel.check_out_time = self.item['check_out_time']
            self.hotel.hotel_url = self.url_index
            self.hotel.hotel_phone = self.item['hotel_phone']
            self.hotel.traffic = self.item['traffic']

            # Image paths are site-relative; the host depends on the endpoint.
            if 'getphotosvideos' in req_url:
                image_host = "https://www.hoteljen.com"
            else:
                image_host = "http://www.shangri-la.com"
            self.img_list = '|'.join([
                "{}{}".format(image_host, img['image'])
                for img in resp if img['image']
            ])

            self.hotel.img_items = self.img_list
            print(self.img_first)
            # Prefer a previously recorded first image, else the first listed.
            self.hotel.Img_first = (self.img_first
                                    or self.img_list.split("|")[0])

            res = self.hotel.to_dict()
            return json.loads(res)

Example #10

Show file

File: shangrila_hotel_spider.py Project: 20113261/platform_service

class ShangRiLaDetailSpider(Spider):
    source_type = 'shangrilaDetailHotel'

    targets = {'hotel': {}}
    old_spider_tag = {'shangrilaDetailHotel': {'required': ['room', 'hotel']}}

    def __init__(self, task=None):
        """Create the per-task scratch state, then run the base initializer.

        Args:
            task: Passed through unchanged to the parent ``__init__``.
        """
        # flag: read by targets_request to decide whether a review request
        #       is issued; item: fields collected by the page handlers;
        # info_list: detail-page URLs collected by parse_index.
        self.flag, self.item, self.info_list = False, {}, []
        super(ShangRiLaDetailSpider, self).__init__(task)

    def targets_request(self):
        self.url_index, self.hotel_name, self.hotel_code, self.country = self.task.content.split(
            '&')[:-1]

        if not self.url_index.endswith('/'):
            self.url_index = '{}/'.format(self.url_index)
        if 'pre-opening' in self.url_index:
            self.url_index = self.url_index.replace('pre-opening/', '')

        # print self.url_index
        url_json = '{}NavigationMainMenuJson.json'.format(self.url_index)
        url_jsons = '{}about/NavigationJson.json'.format(self.url_index)
        self.city = self.url_index.split('/')[-3]
        self.index_url = '{}about'.format(self.url_index)

        @request(retry_count=3, proxy_type=PROXY_REQ, async=True)
        def crawl_index():
            p = []
            p.append({
                'req': {
                    'url': url_json,
                    'method': 'get',
                    'headers': {
                        'Host': 'www.shangri-la.com',
                        'Referer': self.url_index,
                        'Pragma': 'no-cache'
                    }
                },
                'user_handler': [self.parse_index],
                'data': {
                    'content_type': 'json'
                }
            })
            p.append({
                'req': {
                    'url': url_jsons,
                    'method': 'get',
                    'headers': {
                        'Host': 'www.shangri-la.com',
                        'Pragma': 'no-cache',
                    }
                },
                'user_handler': [self.parse_index],
                'data': {
                    'content_type': 'json'
                }
            })
            p.append({
                'req': {
                    'url': self.index_url,
                    'method': 'get',
                    'headers': {
                        'Host': 'www.shangri-la.com',
                        'Pragma': 'no-cache',
                        'Referer': 'http://www.shangri-la.com/cn/find-a-hotel/'
                    }
                },
                'user_handler': [self.parse_index]
            })
            return p

        yield crawl_index

        # print self.info_list,"*"*80

        @request(retry_count=3, proxy_type=PROXY_FLLOW, async=True)
        def crawl_details():
            pages = []
            for each_info_url in self.info_list:
                pages.append({
                    'req': {
                        'url': each_info_url,
                        'method': 'get',
                        'headers': {
                            'Host': 'www.shangri-la.com',
                            'Referer': self.url_index,
                            'Pragma': 'no-cache',
                            'Accept-Language': 'en;q=0.8',
                        }
                    },
                    'user_handler': [self.parse_detail]
                })
            return pages

        yield crawl_details
        # try:
        #     if self.img_url and self.review_url:
        if self.flag:

            @request(retry_count=3,
                     proxy_type=PROXY_FLLOW,
                     binding=self.parse_hotel,
                     async=False,
                     new_session=True)
            def crawl_more():
                page = []
                page.append({
                    'req': {
                        'url': self.review_url,
                        'method': 'get',
                        #  'headers': {
                        #     # 'Host': 'www.tripadvisor.cn', 'Pragma': 'no-cache', 'Referer': self.review_url,
                        #     # 'Cookie': 'SERVERID=srv-trustyou-web2_80'
                        #     # 'Cookie': self.cookie
                        # }
                    },
                })
                page.append({
                    'req': {
                        'url': self.img_url,
                        'method': 'get',
                        'headers': {
                            'referer':
                            'https://www.hoteljen.com/brisbane/romastreet/photos-videos/',
                        }
                    },
                    'data': {
                        'content_type': 'json'
                    }
                })

                return page

            yield crawl_more
        else:

            @request(retry_count=3,
                     proxy_type=PROXY_FLLOW,
                     binding=self.parse_hotel)
            def crawl_more():
                page = []
                page.append({
                    'req': {
                        'url': self.img_url,
                        'method': 'get',
                        'headers': {
                            'referer':
                            'https://www.hoteljen.com/brisbane/romastreet/photos-videos/',
                        }
                    },
                    'data': {
                        'content_type': 'json'
                    }
                })
                return page

            yield crawl_more
            # else:
            #     raise parser_except.ParserException(22, '代理失效')
        # except:
        #     raise parser_except.ParserException()

    def clean_data(self, str):
        """Return ``str`` with all spaces, carriage returns and newlines removed."""
        stripped = str
        for unwanted in (" ", "\r", "\n"):
            stripped = stripped.replace(unwanted, "")
        return stripped

    def parse_index(self, req, resp):
        """Consume one response of the hotel index crawl.

        The request URL decides how ``resp`` is interpreted:

        * ``NavigationMainMenuJson``: every ``resp['MainMenu']`` entry whose
          ``Url`` contains ``about`` or ``reviews`` is appended (as an absolute
          URL) to ``self.info_list``.
        * ``NavigationJson``: same for ``resp['NaviMenu']`` entries whose
          ``Url`` contains ``map`` or ``service``.
        * anything else is treated as the hotel HTML page; description, phone,
          star, address, postal code, check-in/out times and accepted cards
          are written to ``self.item``, and ``self.img_url`` /
          ``self.img_first`` are set.

        Raises:
            parser_except.ParserException: code 22 when the ``NavigationJson``
                payload lacks an expected key, or when the HTML page has no
                ``og:title`` meta tag.
        """
        req_url = req['req']['url']
        site = 'http://www.shangri-la.com{}'

        if 'NavigationMainMenuJson' in req_url:
            self.info_list.extend([
                site.format(node['Url'])
                for node in resp['MainMenu']
                if 'about' in node['Url'] or 'reviews' in node['Url']
            ])
            return

        if 'NavigationJson' in req_url:
            try:
                self.info_list.extend([
                    site.format(node['Url'])
                    for node in resp['NaviMenu']
                    if 'map' in node['Url'] or 'service' in node['Url']
                ])
            except KeyError:
                raise parser_except.ParserException(
                    22, '请求失效，失效url为：{}'.format(req_url))
            return

        tree = etree.HTML(resp)

        # Description: every <p> body except the payment / checkout notices.
        paragraphs = re.compile(r'<p>(.*?)</p>').findall(resp)
        self.item['description'] = ''.join(
            text for text in paragraphs
            if not (u'本酒店可接受以下信用卡付款' in text
                    or u'退房时用信用卡结账需' in text))

        try:
            self.item['hotel_phone'] = tree.xpath(
                "//span[@id='ctl00_ContentPlaceHolder1_ltrPhone']/text()")[0]
        except Exception:
            self.item['hotel_phone'] = 'NULL'
        self.item['source_id'] = self.hotel_code

        self.img_url = 'http://www.shangri-la.com/HotelPhotoVideoJson.json?hotel_code={}&lang=cn'.format(
            self.hotel_code)

        try:
            hotel_name_info = tree.xpath(
                '//meta[@property="og:title"]/@content')[0]
        except Exception:
            raise parser_except.ParserException(22, 'proxy error')

        # The star rating is only available as wording inside the page title.
        self.item['star'] = ''
        for label, stars in (('五星级', 5), ('四星级', 4), ('三星级', 3)):
            if label in hotel_name_info:
                self.item['star'] = stars
                break
        self.item['hotel_name_en'] = self.hotel_name

        # Address and postal code share one <span>; the markup either holds a
        # single comma-separated line or several text nodes.
        address_xpath = '//span[@id="ctl00_ContentPlaceHolder1_ltrAddress"]/text()'
        try:
            address_parts = (
                tree.xpath('//div[@class="widget-mid"]' + address_xpath)
                or tree.xpath(address_xpath))
            address = address_parts[0]
            if len(address_parts) <= 1:
                try:
                    # Second-to-last comma field, kept only if it has digits.
                    p_code = address.split(',')[-2]
                    if not re.search(r'\d+', p_code):
                        p_code = ''
                except Exception:
                    p_code = ''
            else:
                # A second line without digits used to raise IndexError and
                # discard the address as well; now only the code is blanked.
                digits = re.findall(r'\d+', address_parts[1])
                p_code = digits[0] if digits else ''
        except Exception:
            p_code = ''
            address = ''

        self.item['city'] = self.city
        self.item['postal_code'] = p_code
        self.item['address'] = address
        # Call form prints the same text under Python 2 and also parses on 3.
        print(req_url)

        try:
            self.img_first = site.format(
                tree.xpath('//div[@id="background"]/img/@src')[0])
        except Exception:
            self.img_first = ''

        check_xpath = '//span[@id="ctl00_ContentPlaceHolder1_ltrChkInOut"]'
        try:
            check_time = tree.xpath(check_xpath + '/text()')
            if not check_time:
                check_time = tree.xpath(check_xpath + '/p/text()')
                if check_time[1] == u'\xa0':
                    # The check-out text sits in a nested <span>; the first
                    # entry is reused instead of re-running the same query.
                    check_time = [
                        check_time[0],
                        tree.xpath(check_xpath + '/p/span/text()')[0],
                    ]
            self.item['check_in_time'] = check_time[0].split('：')[1]
            self.item['check_out_time'] = check_time[1].split('：')[1]
        except Exception:
            self.item['check_in_time'] = ''
            self.item['check_out_time'] = ''

        payment_xpath = '//span[contains(@id, "ctl00_ContentPlaceHolder1_ltrPayment")]'
        try:
            card_info = (tree.xpath(payment_xpath + '/p/text()')
                         or tree.xpath(payment_xpath + '/text()'))
            if len(card_info) <= 1:
                # Single line: the card list follows the last colon.
                raw_cards = card_info[0].replace(':', '：').split('：')[-1]
            else:
                raw_cards = card_info[-1]
            accepted_cards = raw_cards.replace('、', '|').replace('，', '|'). \
                replace(u'及', "|").replace('。', '')
        except Exception:
            accepted_cards = ''
        self.item['accepted_cards'] = accepted_cards

    def parse_detail(self, req, resp):
        """Parse one hotel sub-page and store what it yields in ``self.item``.

        ``self.item['source']`` and ``self.item['brand_name']`` are always set.
        The request URL then selects what else is extracted:

        * ``about`` + ``service``: the page's list items are sorted into the
          ``facility`` and ``service`` dicts of a fresh ``Hotel_New`` and
          copied to ``self.item``; any error stores ``"NULL"`` for both.
        * ``about`` + ``map``: latitude/longitude are pulled from the raw
          response with a regex and the traffic text is assembled from the
          map list.
        * ``reviews``: ``self.flag`` is set and the review iframe address is
          stored in ``self.review_url``.

        Raises:
            parser_except.ParserException: code 22 when the coordinates or the
                review iframe cannot be found.
        """
        tree = etree.HTML(resp)
        req_url = req['req']['url']
        self.item['source'] = 'shangrila'
        self.item['brand_name'] = '香格里拉'

        if 'about' in req_url:
            if 'service' in req_url:
                hotel2 = Hotel_New()

                try:
                    service_all = tree.xpath(
                        "//div[@class='control2_1column']/ul/li/text()")

                    # facility key -> keywords that identify it in the text
                    facilities_dict = {
                        'Swimming_Pool': ['游泳池'],
                        'gym': ['健身房'],
                        'SPA': ['SPA'],
                        'Bar': ['酒吧'],
                        'Coffee_house': ['咖啡厅'],
                        'Tennis_court': ['网球场'],
                        'Golf_Course': ['高尔夫球场'],
                        'Sauna': ['桑拿'],
                        'Mandara_Spa': ['水疗中心'],
                        'Recreation': ['儿童娱乐场', '儿童游乐场'],
                        'Business_Centre': ['商务中心'],
                        'Lounge': ['行政酒廊'],
                        'Wedding_hall': ['婚礼礼堂'],
                        'Restaurant': ['餐厅'],
                        'Airport_bus': ['机场班车', '班车服务', '班车服务(收费)'],
                        'Valet_Parking': ['代客泊车'],
                        'Call_service': ['叫车服务'],
                        'Rental_service': ['租车服务'],
                        'Room_wifi': ['客房无线网络'],
                        'Room_wired': ['客房有线网络'],
                        'Public_wifi': ['公共区域无线上网'],
                        'Public_wired': ['公共区域有线网络']
                    }
                    # service key -> single keyword.
                    # NOTE(review): '儿' is one character and matches any item
                    # containing it (e.g. the '儿童...' facility names above);
                    # confirm that this is intended.
                    service_dict = {
                        'Luggage_Deposit': '行李寄存',
                        'front_desk': '24小时前台',
                        'Lobby_Manager': '24小时大堂经理',
                        '24Check_in': '24小时办理入住',
                        'Security': '24小时安保',
                        'Protocol': '礼宾服务',
                        'wake': '叫醒服务',
                        'Chinese_front': '中文前台',
                        'Postal_Service': '邮政服务',
                        'Fax_copy': '传真/复印',
                        'Laundry': '洗衣',
                        'polish_shoes': '擦鞋服务',
                        'Frontdesk_safe': '保险',
                        'fast_checkin': '快捷入住及退房服务',
                        'ATM': '自动柜员机(ATM)/银行服务',
                        'child_care': '儿',
                        'Food_delivery': '送餐服务'
                    }
                    # NOTE(review): the keywords are byte-string literals; on
                    # Python 2 matching them against unicode text relies on
                    # the interpreter's default encoding - confirm.
                    for service in service_all:
                        # Clean once up front so that matching does not depend
                        # on which dictionary entry happened to match first.
                        service = self.clean_data(service)
                        for key, keywords in facilities_dict.items():
                            # The values are keyword *lists*. Testing the list
                            # itself against the text raised TypeError, which
                            # the handler below turned into "NULL" for every
                            # page that listed at least one item.
                            if any(word in service for word in keywords):
                                if key in hotel2.facility:
                                    hotel2.facility[key] = hotel2.facility[
                                        key] + ',' + service
                                else:
                                    hotel2.facility[key] = service
                        for key, keyword in service_dict.items():
                            if keyword in service:
                                hotel2.service[key] = service
                    self.item['service'] = hotel2.service
                    self.item['facility'] = hotel2.facility
                except Exception:
                    self.item['service'] = "NULL"
                    self.item['facility'] = "NULL"

            elif 'map' in req_url:
                try:
                    latitude = re.compile(r'"Lat":"(.*?)"',
                                          re.S).findall(resp)[0]
                    longitude = re.compile(r'"Lng":"(.*?)"',
                                           re.S).findall(resp)[0]
                except Exception:
                    raise parser_except.ParserException(22, '代理失效')
                self.item['latitude'] = latitude
                self.item['longitude'] = longitude

                map_list = tree.xpath(
                    "//div[@class='control2_1column']/div[@class='map-list']/div/h4/text()"
                )
                # Stays "NULL" when the page has no map-list headings.
                self.item['traffic'] = "NULL"
                traffic_str_all = ""
                # XPath positions are 1-based, hence the start value.
                for index, tra_str in enumerate(map_list, 1):
                    traffic_str_l = tree.xpath(
                        "//div[@class='control2_1column']/div[@class='map-list'][{}]/div/p/text()"
                        .format(index))
                    traffic_str = " ".join(traffic_str_l).strip().replace(
                        " ", "")
                    traffic_str_all += tra_str + ":" + traffic_str
                    self.item['traffic'] = traffic_str_all

        elif 'reviews' in req_url:
            self.flag = True

            try:
                link = tree.xpath(
                    '//iframe[contains(@id, "ChildFrame")]/@src')[0]
            except Exception:
                raise parser_except.ParserException(22, 'proxy error')

            # Addresses without any "http" get an explicit scheme prepended.
            review_url = link if 'http' in link else 'http:' + link
            self.review_url = review_url.strip()

    def parse_hotel(self, req, resp):
        """Handle the review-summary page or the photo JSON of a hotel.

        * Review responses (URL contains ``meta_review``, ``seal`` or
          ``partnerId``): the grade and the review count are scraped and
          stored on ``self.hotel`` (the count also on ``self.review_num``).
          Nothing is returned.
        * Photo responses (URL contains ``HotelPhotoVideoJson`` or
          ``getphotosvideos``): a new ``Hotel_New`` is filled from
          ``self.item`` plus the image list and returned as a dict.

        Returns:
            The result of ``json.loads(self.hotel.to_dict())`` for photo
            responses, ``None`` otherwise.
        """
        req_url = req['req']['url']

        if 'meta_review' in req_url or 'seal' in req_url or 'partnerId' in req_url:
            node = etree.HTML(resp)

            try:
                grade = node.xpath(
                    '//div[@class="summary"]/p/span[2]/text()|//div[@class="value"]/text()'
                )[0]
                grade = grade.replace('\n', '').strip()
            except Exception:
                grade = ''
            # This assignment used to sit inside the ``except`` above, so a
            # successfully parsed grade was never stored on the hotel.
            self.hotel.grade = grade

            count_pat = re.compile(r'\d*,*\d+')
            try:
                try:
                    self.review_num = count_pat.findall(
                        node.xpath(
                            '//h1/text()|//div[@class="counter"]/text()')[0])[0]
                except Exception:
                    # Alternative layout of the review widget.
                    self.review_num = count_pat.findall(
                        node.xpath('//div[@class="numReviews"]/text()')[0])[0]
            except Exception:
                self.review_num = ''
            self.hotel.review_num = self.review_num

        elif 'HotelPhotoVideoJson' in req_url or 'getphotosvideos' in req_url:
            # NOTE(review): this replaces whatever object the review branch
            # populated; confirm grade/review_num are not expected to survive.
            self.hotel = Hotel_New()
            self.hotel.hotel_name = self.hotel_name
            self.hotel.hotel_name_en = self.item['hotel_name_en']
            self.hotel.source = self.item['source']
            self.hotel.source_id = self.item['source_id']
            self.hotel.brand_name = self.item['brand_name']
            try:
                self.hotel.map_info = '{},{}'.format(self.item['longitude'],
                                                     self.item['latitude'])
            except Exception:
                self.hotel.map_info = ''
            self.hotel.address = self.item['address']
            self.hotel.city = self.item['city']
            self.hotel.country = self.country
            self.hotel.postal_code = self.item['postal_code']
            self.hotel.star = self.item['star']
            self.hotel.facility = self.item['facility']
            self.hotel.service = self.item['service']
            self.hotel.description = self.item['description']
            self.hotel.accepted_cards = self.item['accepted_cards']
            self.hotel.check_in_time = self.item['check_in_time']
            self.hotel.check_out_time = self.item['check_out_time']
            self.hotel.hotel_url = self.url_index
            self.hotel.hotel_phone = self.item['hotel_phone']
            self.hotel.traffic = self.item['traffic']

            # The two photo endpoints differ only in the host that serves the
            # relative image paths.
            if 'getphotosvideos' in req_url:
                image_host = "https://www.hoteljen.com{}"
            else:
                image_host = "http://www.shangri-la.com{}"
            self.img_list = '|'.join([
                image_host.format(img['image'])
                for img in resp if img['image']
            ])
            self.hotel.img_items = self.img_list

            # Call form prints the same text under Python 2 and parses on 3.
            print(self.img_first)
            # Fall back to the first gallery image when the index page did
            # not provide a head image.
            self.hotel.Img_first = (self.img_first
                                    or self.img_list.split("|")[0])

            return json.loads(self.hotel.to_dict())

Example #11

Show file

    def parse_hotel(self, req, resp):
        """Build the Hyatt hotel record from the data cached in ``self.hotel_test``.

        ``req`` and ``resp`` are accepted for the callback signature but are
        not read. Each entry of ``self.hotel_test['services']`` is lower-cased
        and filed under the first matching rule into ``hotel.service``,
        ``hotel.facility`` or ``hotel.feature``.

        Returns:
            The result of ``json.loads(hotel.to_dict())``.
        """
        hotel = Hotel_New()
        hotel.hotel_name = 'NULL'
        hotel.hotel_name_en = self.hotel_test['hotel_name_en']
        hotel.source = 'hyatt'
        hotel.source_id = self.hotel_test['source_id']
        hotel.brand_name = 'NULL'
        hotel.map_info = self.hotel_test['map_info']
        hotel.address = self.hotel_test['address']
        hotel.city = self.hotel_test['hotel_city']
        hotel.country = self.hotel_test['hotel_country']
        hotel.postal_code = self.hotel_test['hotel_postal_code']
        hotel.star = 5
        hotel.grade = 'NULL'
        hotel.review_num = 'NULL'
        hotel.Img_first = self.hotel_test['Img_first']
        hotel.hotel_phone = self.hotel_test['hotel_phone']
        hotel.hotel_zip_code = self.hotel_test['hotel_postal_code']
        hotel.traffic = ''
        hotel.chiled_bed_type = self.hotel_test['chiled_bed_type']
        hotel.pet_type = ''
        if self.hotel_test['has_wifi']:
            hotel.facility['Room_wifi'] = self.hotel_test['has_wifi']

        # Ordered rules: (keywords, target dict on ``hotel``, key). The first
        # rule with a keyword contained in the text wins, so more specific
        # phrases must precede the generic ones they contain
        # ('valet parking' before 'parking', 'sereno spa' before 'spa').
        # All keywords are lower case because the text is lower-cased first;
        # the earlier mixed-case needles could never match.
        rules = (
            (('faxing',), 'service', 'Fax_copy'),
            (('postal',), 'service', 'Postal_Service'),
            (('laundry',), 'service', 'Laundry'),
            (('room service',), 'service', 'Food_delivery'),
            (('concierge service',), 'service', 'Protocol'),
            (('babysitting',), 'service', 'child_care'),
            (('shoeshine',), 'service', 'polish_shoes'),

            (('valet parking',), 'facility', 'Valet_Parking'),
            (('parking',), 'facility', 'Parking'),
            (('wifi', 'wi-fi'), 'facility', 'Room_wifi'),
            (('pool',), 'facility', 'Swimming_Pool'),
            (('gym',), 'facility', 'gym'),
            (('bar',), 'facility', 'Bar'),
            # Was stored under 'coffee'; 'Coffee_house' is the key the other
            # parser in this file uses for the same facility.
            # NOTE(review): confirm against Hotel_New's facility keys.
            (('coffee',), 'facility', 'Coffee_house'),
            (('sereno spa',), 'facility', 'Mandara_Spa'),
            (('spa',), 'facility', 'SPA'),
            (('golf',), 'facility', 'Golf_Course'),
            (('restaurant',), 'facility', 'Restaurant'),
            (('sauna',), 'facility', 'Sauna'),
            (('service to airport', 'shuttle airport'), 'facility',
             'Airport_bus'),
            (('wedding',), 'facility', 'Wedding_hall'),
            (('business centre',), 'facility', 'Business_Centre'),
            (('tennis',), 'facility', 'Tennis_court'),

            (('china_friendly',), 'feature', 'China_Friendly'),
            (('romantic_lovers',), 'feature', 'Romantic_lovers'),
            (('parent_child',), 'feature', 'Parent_child'),
            (('beach_scene',), 'feature', 'Beach_Scene'),
            (('hot_spring',), 'feature', 'Hot_spring'),
            (('japanese_hotel',), 'feature', 'Japanese_Hotel'),
            (('vacation',), 'feature', 'Vacation'),
        )
        for one in self.hotel_test['services']:
            one = one.lower()
            for keywords, target, key in rules:
                if any(word in one for word in keywords):
                    getattr(hotel, target)[key] = one
                    break

        hotel.accepted_cards = 'NULL'
        hotel.check_in_time = self.hotel_test['check_in_time']
        hotel.check_out_time = self.hotel_test['check_out_time']
        hotel.hotel_url = self.url_en

        return json.loads(hotel.to_dict())

Example #12

Show file

def agoda_parser(content, url, other_info):
    """Parse an Agoda hotel page into a ``HotelNewBase`` record.

    Most fields come from the ``propertyPageParams`` JavaScript object
    embedded in the page, with HTML / regex fallbacks; services, description
    and check-in/out times come from the ``AboutHotel`` JSON endpoint. Fields
    that cannot be extracted are set to ``'NULL'`` (``-1`` for star, grade
    and review count) or left at the model's default.

    Args:
        content: The hotel page; byte strings are decoded as UTF-8.
        url: The page URL, stored UTF-8 encoded in ``hotel.hotel_url``.
        other_info: Mapping read for ``hid``, ``source_id`` and ``city_id``.

    Returns:
        The populated ``HotelNewBase`` instance.
    """
    hotel = HotelNewBase()

    # A decode/parse failure used to be swallowed here and resurfaced as a
    # NameError on ``root`` a few lines later. Text input is now parsed as
    # is and genuine errors propagate with their real cause.
    if isinstance(content, bytes):
        content = content.decode('utf-8')
    root = HTML.fromstring(content)

    # NOTE(review): this evaluates script text taken from the fetched page in
    # a PhantomJS runtime; make sure the page source is trusted.
    ph_runtime = execjs.get('PhantomJS')
    page_js = ph_runtime.compile(
        root.xpath('//script[contains(text(),"propertyPageParams")]/text()')
        [0])
    page_params = page_js.eval('propertyPageParams')

    hotel_name = None
    try:
        hotel_name = page_params['hotelInfo']['name']
    except Exception:
        try:
            hotel_name = root.xpath('//*[@id="hotelname"]/text()')[0].encode(
                'utf-8').strip()
        except Exception:
            try:
                hotel_name = root.xpath('//title/text()')[0].split('-')[0][:-1]
            except Exception:
                pass

    # "<name> (<english name>)": split at the first parenthesis; without one
    # both fields receive the whole string.
    try:
        k = hotel_name.find('(')
        if k == -1:
            hotel.hotel_name = hotel_name
            hotel.hotel_name_en = hotel_name
        else:
            hotel.hotel_name = hotel_name[:k]
            hotel.hotel_name_en = hotel_name[k + 1:-1]
    except Exception:
        hotel.hotel_name = 'NULL'
        hotel.hotel_name_en = 'NULL'

    try:
        address_info = page_params['hotelInfo']['address']
        if address_info['address'] in address_info['full']:
            hotel.address = address_info['full']
        else:
            hotel.address = address_info['address'] + address_info['full']
    except Exception:
        hotel.address = "NULL"

    try:
        hotel.star = int(
            page_params['hotelInfo']['starRating']['icon'].split('-')[-1])
    except Exception:
        hotel.star = -1

    # Values above 5 are accepted only as multiples of 5 and divided by ten
    # (presumably the icon suffix is the rating times ten - TODO confirm).
    if hotel.star > 5:
        if hotel.star % 5 == 0:
            hotel.star = hotel.star // 10
        else:
            hotel.star = -1

    try:
        lat_pat = re.compile(r'latitude\" content=(.*?) \/>', re.S)
        lon_pat = re.compile(r'longitude\" content=(.*?) \/>', re.S)

        # [1:-1] drops the quotes surrounding the attribute value.
        lon_text = lon_pat.findall(content)[0][1:-1]
        lat_text = lat_pat.findall(content)[0][1:-1]
        hotel.map_info = lon_text + ',' + lat_text
    except Exception:
        hotel.map_info = 'NULL'

    try:
        hotel.grade = float(page_params['reviews']['score'])
    except Exception:
        try:
            hotel.grade = root.find_class('review-score-value')[0].text
        except Exception:
            try:
                hotel.grade = page_params['masterRoomInfo'][0]['demographics'][
                    'grades'][0]['score']
            except Exception:
                hotel.grade = -1

    try:
        hotel.review_num = page_params['reviews']['reviewsCount']
    except Exception:
        try:
            review_num = root.find_class('review-based-on-section')[0].xpath(
                './strong/text()')[0].encode('utf8').strip()
            hotel.review_num = review_num_pat.findall(review_num)[0]
        except Exception:
            try:
                hotel.review_num = page_params['masterRoomInfo'][0][
                    'demographics']['count']
            except Exception:
                hotel.review_num = -1

    try:
        first_img = page_params.get("mosaicInitData",
                                    {}).get('images',
                                            [])[0].get('Location', 'NULL')
        first_img = urljoin('http:', first_img)
    except Exception:
        first_img = 'NULL'

    # Image list: four sources tried in order of preference.
    try:
        mosaic_urls = [
            'http:' + image['Location'].split('?')[0]
            for image in page_params['mosaicInitData']['images']
        ]
        hotel.img_items = '|'.join(
            [link for link in mosaic_urls if 'hotel' in link]).encode('utf-8')
    except Exception:
        try:
            img_lists = []
            for img in page_params['masterRoomInfo']:
                img_lists.extend(img['images'])
            hotel.img_items = '|'.join(
                [urljoin('http:', link) for link in img_lists]).encode('utf-8')
        except Exception:
            try:
                hotel.img_items = '|'.join([
                    image
                    for images in page_params['roomGridData']['masterRooms']
                    for image in images['images']
                ])
            except Exception:
                try:
                    img_json = images_url_pat.findall(content)[0]
                    location_pat = re.compile(r'"Location":"(.*?)",', re.S)
                    hotel.img_items = '|'.join(
                        ['http:' + link
                         for link in location_pat.findall(img_json)])
                except Exception:
                    hotel.img_items = 'NULL'

    # ``None`` (not ``{}``) so that the lookups below fail into their
    # handlers exactly as they did when the name was simply unbound.
    json_data = None
    try:
        service_url = "https://www.agoda.com/api/zh-cn/Hotel/AboutHotel?hotelId={0}".format(
            page_params['hotelId'])
        # Without a timeout a stalled connection would block the parser
        # forever; on timeout the page-embedded fallback below is used.
        json_data = json.loads(requests.get(service_url, timeout=30).content)
        hotel.service = '|'.join([
            feature['name'] for features in json_data['featureGroups']
            for feature in features['feature'] if feature['available']
        ]).encode('utf-8')
    except Exception:
        try:
            hotel.service = '|'.join([
                service['text'].strip()
                for services in page_params['featuresYouLove']['features']
                for service in services
            ])
        except Exception:
            hotel.service = 'NULL'

    try:
        hotel.description = json_data['hotelDesc']['overview'].strip().replace(
            '<BR>', '').encode('utf-8')
    except Exception:
        hotel.description = 'NULL'

    try:
        for checkInOut in json_data['usefulInfoGroups']:
            if '入住/退房' in checkInOut['name']:
                for item in checkInOut['items']:
                    if '入住办理起始' in item['title']:
                        hotel.check_in_time = item['description']
                        break
                for item in checkInOut['items']:
                    if '退房办理截止' in item['title']:
                        hotel.check_out_time = item['description']
                        break
                break
    except Exception:
        pass

    # Both times still at the 'NULL' value: try the alternative JSON layout.
    if hotel.check_in_time == 'NULL' and hotel.check_out_time == 'NULL':
        try:
            in_and_out = json_data.get("CheckInOutInfo", {})
            hotel.check_in_time = in_and_out.get("CheckInAndOutTime", {}).get(
                "CheckInTime", {}).get("From", {}).get("Description")
            hotel.check_out_time = in_and_out.get("CheckInAndOutTime", {}).get(
                "CheckOutTime", {}).get("Until", {}).get("Description")
        except Exception:
            pass

    # City information taken from the hotel page.
    try:
        country_id = page_params['hotelSearchCriteria']['countryId']
        country_name = page_params['hotelInfo']['address']['countryName']
        city_name = page_params['hotelInfo']['address']['cityName']
        city_id = page_params['hotelInfo']['address']['cityId']
    except Exception:
        country_id = 'NULL'
        country_name = 'NULL'
        city_name = 'NULL'
        city_id = 'NULL'

    hotel.others_info = json.dumps(
        {
            'country_id': country_id,
            'country_name': country_name,
            'city_name': city_name,
            'city_id': city_id,
            'first_img': first_img,
            'hid': other_info.get('hid'),
            'hotel_services_info': hotel.service
        },
        ensure_ascii=False)
    hotel.country = page_params['hotelInfo'].get('address',
                                                 {}).get('countryName', '')
    hotel.city = page_params['hotelInfo'].get('address',
                                              {}).get('cityName', '')
    hotel.accepted_cards = 'NULL'

    hotel.source = 'agoda'
    hotel.hotel_url = url.encode('utf-8')
    if other_info.get('hid'):
        # Raw string: same pattern, without the invalid "\d" string escape.
        hotel.source_id = re.search(r'hotelId: ?(\d+),', content).groups()[0]
    else:
        hotel.source_id = other_info['source_id']
    hotel.city_id = other_info['city_id']

    return hotel

Example #13

Show file

File: new_ctrip_parser.py Project: 20113261/platform_service

def ctrip_parser(page, url, other_info):
    """Begin parsing a Ctrip hotel page.

    Only the opening of this parser is present in this excerpt: it creates
    an empty ``HotelNewBase`` record and tries to build an HTML tree from
    the UTF-8 decoded page.

    Args:
        page: Page markup as a byte string; it is decoded as UTF-8.
        url: Not used by the visible part of the function.
        other_info: Not used by the visible part of the function.

    Returns:
        None. The visible code ends after the parse attempt.
    """
    hotel = HotelNewBase()
    try:
        root = HTML.fromstring(page.decode('utf-8'))
    except Exception as e:
        # Best effort: a decoding or parsing failure is reported on stdout
        # rather than propagated. The ``as`` form and the parenthesised
        # print behave the same on Python 2.6+ and are valid on Python 3.
        print(str(e))

Example #14

Show file

def bestwestern_parser(content, url, other_info):
    """Parse a Best Western hotel detail page into a hotel record.

    Args:
        content: Indexable pair. ``content[0]`` is passed to
            ``get_map_info`` and ``content[1]`` is the page markup parsed
            with ``etree.HTML``.
        url: Hotel page URL. The text after its last ``-`` is stored as the
            source id and the full URL as ``hotel_url``.
        other_info: Mapping that must provide ``'city_id'``.

    Returns:
        Whatever ``hotel.to_dict()`` returns for the populated
        ``HotelNewBase`` instance (the same value is also printed).

    Raises:
        IndexError: When a mandatory XPath lookup matches nothing, because
            the ``[0]`` / ``[1]`` / ``[-1]`` indexing then fails.
        KeyError: When ``other_info`` has no ``'city_id'`` entry.
    """
    lng_lat = content[0]
    html = etree.HTML(content[1])
    hotel = HotelNewBase()

    # XPath fragments shared by several lookups below. The concatenated
    # expressions are exactly the ones that used to be spelled out in full.
    header_xpath = '//div[contains(@class,"hotelImagebloc")]'
    address_xpath = (
        header_xpath + '//div[contains(@class,"addressContainer")]')
    phone_xpath = (
        '//div[@class="phoneNumbers"]//p[@class="phoneNumber"]/a/text()')
    image_xpath = "//div[contains(@class, 'hotelImageSlider')]//li/img/@src"

    # Hotel name; the same value is reused as the English name.
    hotel.hotel_name = html.xpath(
        header_xpath + '//h1[@id="hotel-name"]/a/text()')[0]
    hotel.hotel_name_en = hotel.hotel_name
    # Data source identifier.
    hotel.source = 'bestwestern'
    # Source-side hotel id: the text after the last '-' in the URL.
    hotel.source_id = url.split('-')[-1]
    # Brand name.
    hotel.brand_name = get_brand_name(html)
    # Longitude / latitude.
    hotel.map_info = get_map_info(lng_lat)
    # Address: all span texts of the address container joined together.
    hotel.address = "".join(html.xpath(address_xpath + '/span/text()'))
    # City.
    hotel.city = html.xpath(
        address_xpath + '/span[@id="address-1-city-state-zip"]/text()')[0]
    # Country: text of the last <span> in the address container.
    hotel.country = html.xpath(address_xpath + '/span')[-1].text
    # City id supplied by the caller.
    hotel.city_id = other_info['city_id']
    # Postal code.
    hotel.postal_code = html.xpath(
        address_xpath + '//span[@class="postalCode"]/text()')[0]
    # Star rating is hard-coded, not read from the page.
    hotel.star = 5
    # Grade: leading part (before the first '-') of the TripAdvisor image
    # file name.
    hotel.grade = html.xpath('//div[@class="tripAdvisorOwl"]/img/@src'
                             )[0].split("/")[-1].split('-')[0]
    # Review count: first run of digits in the rating-count text; any
    # failure (missing node, no digits) falls back to 0.
    try:
        hotel.review_num = re.search(
            r'\d+',
            html.xpath(
                '//div[@class="hotelDetailsContainer"]//div[@id="hotel-reviews"]//div[@class="reviewRatingCount"]/text()'
            )[0]).group()
    except Exception:
        hotel.review_num = 0
    # Photos: evaluated once; the first one is the lead image and the full
    # list is joined further down.
    image_urls = html.xpath(image_xpath)
    hotel.Img_first = image_urls[0]
    # Phone number: first phone-number link.
    phone_texts = html.xpath(phone_xpath)
    hotel.hotel_phone = phone_texts[0]
    # NOTE(review): hotel_zip_code is filled from the SECOND phone-number
    # link, not from a postal-code node -- confirm this is intended.
    hotel.hotel_zip_code = phone_texts[1]
    # Transport information is not extracted.
    hotel.traffic = 'NULL'
    # Child / extra-bed policy is not extracted.
    hotel.chiled_bed_type = 'NULL'
    # Pet policy.
    hotel.pet_type = html.xpath(
        '//div[@class="policyContent uk-margin-small-left"]/text()')[0]
    # Features, facilities and services are filled in by helpers that
    # receive the record and the parsed page.
    get_feature(hotel, html)
    get_facility(hotel, html)
    get_service(hotel, html)
    # All photo URLs, comma separated.
    hotel.img_items = ",".join(image_urls)
    # Description.
    hotel.description = html.xpath(
        '//div[@class="hotelOverviewDetailSection"]/div[@class="overviewText"]/text()'
    )[0].strip()
    # Accepted payment cards are not extracted.
    hotel.accepted_cards = 'NULL'
    # Check-in time.
    hotel.check_in_time = html.xpath(
        '//div[@class="uk-width-3-10 checkInPositionContainer addressCheckInTableCell"]/p[2]/text()'
    )[0]
    # Check-out time.
    hotel.check_out_time = html.xpath(
        '//div[@class="phoneNumbers"]/div[contains(@class,"phonesRow")][1]/div[2]/p[2]/text()'
    )[0]
    # Hotel URL.
    hotel.hotel_url = url
    hotel_service_info = __get_hotel_service(html)
    hotel.others_info = json.dumps({"hotel_services_info": hotel_service_info})

    record = hotel.to_dict()
    # Parenthesised form prints the same on Python 2 and is valid Python 3.
    print(record)
    return record