Beispiel #1
0
def hotel_routine_base_data(self, source, url, other_info, **kwargs):
    """Fetch one hotel page, parse it, store the result and archive the page.

    The routine runs as a sequence of stages.  Each stage that fails records
    a stage-specific value in ``self.error_code``, logs the exception and
    re-raises it unchanged:

    * 12  -- preparing the task (rewriting the URL for ``hotels``)
    * 22  -- downloading the page
    * 102 -- ``parse_hotel`` raised ``TypeCheckError``
    * 27  -- ``parse_hotel`` raised any other exception
    * 33  -- merging/committing the parsed result to the database
    * 104 -- saving the task and the raw page content

    ``self.error_code`` stays ``0`` when every stage succeeds.

    :param self: object receiving ``task_source``, ``task_type`` and
        ``error_code`` (presumably a bound task instance -- confirm against
        the decorator applied to this function).
    :param source: source name, e.g. ``'hotels'``; its title-cased form is
        stored in ``self.task_source``.
    :param url: URL of the hotel detail page.  For ``source == 'hotels'`` it
        is rebuilt from the ``hotel-id`` query parameter.
    :param other_info: mapping that must provide ``'source_id'`` and
        ``'city_id'``; it is also passed through to ``parse_hotel``.
    :param kwargs: must contain ``'mongo_task_id'``.
    :raises Exception: whatever the failing stage raised, after
        ``self.error_code`` has been set.
    """
    self.task_source = source.title()
    self.task_type = 'Hotel'

    self.error_code = 0

    # Initialise the task.
    try:
        # hotels: rebuild the URL around the numeric hotel id.
        if source == 'hotels':
            hotel_id = re.findall(r'hotel-id=(\d+)', url)[0]
            url = 'http://zh.hotels.com/hotel/details.html?hotel-id=' + hotel_id
    except Exception as e:
        self.error_code = 12
        logger.exception(e)
        # Bare ``raise`` keeps the original traceback (``raise e`` resets it
        # on Python 2).
        raise

    # Request-parameter adjustment step: nothing to do for this routine.
    # The original placeholder handler for this step used error code 101.

    # Download the page.
    try:
        http_session = MySession()
        page = http_session.get(url, timeout=240)
        page.encoding = 'utf8'
        content = page.text
    except Exception as e:
        self.error_code = 22
        logger.exception(e)
        raise

    # Parse the page into a result object.
    try:
        result = parse_hotel(content=content,
                             url=url,
                             other_info=other_info,
                             source=source,
                             part="NULL")
    except TypeCheckError as e:
        self.error_code = 102
        logger.exception(e)
        raise
    except Exception as e:
        self.error_code = 27
        logger.exception(e)
        raise

    # Persist the parsed result.
    try:
        db_session = DBSession()
        try:
            db_session.merge(result)
            db_session.commit()
        finally:
            # Always release the session, even when merge/commit fails.
            db_session.close()
    except Exception as e:
        self.error_code = 33
        logger.exception(e)
        raise

    # Save the page content once the crawl has succeeded.
    try:
        save_task_and_page_content(
            task_name='hotelinfo_routine_{0}'.format(source),
            content=content,
            task_id=kwargs['mongo_task_id'],
            source=source,
            source_id=other_info['source_id'],
            city_id=other_info['city_id'],
            url=url)
    except Exception as e:
        self.error_code = 104
        logger.exception(e)
        raise
Beispiel #2
0
    session = MySession()
    # url = 'http://www.hilton.com.cn/zh-CN/hotel/Beijing/hilton-beijing-wangfujing-BJSWFHI/'
    # url = 'http://www.hilton.com.cn/zh-cn/hotel/sharjah/hilton-sharjah-SHJHSHI/'
    url = 'http://www.hilton.com.cn/zh-cn/hotel/cairo/ramses-hilton-CAIRHTW/'
    # url2 = 'http://www.hilton.com.cn/zh-cn/hotel/cairo/ramses-hilton-CAIRHTW/popup/hotelDetails.html'
    # url3 = 'http://www3.hilton.com/zh_CN/hotels/china/ramses-hilton-CAIRHTW/popup/hotelDetails.html'
    detail_url = 'http://www3.hilton.com/zh_CN/hotels/china/{}/popup/hotelDetails.html'.format(
        url.split('/')[-2])
    map_info_url = url + 'maps-directions.html'
    desc_url = url + 'about.html'

    page = session.get(url)
    page.encoding = 'utf8'
    content = page.text
    detail_content = session.get(detail_url).text
    map_info_content = session.get(map_info_url).text
    desc_page = session.get(desc_url)
    desc_page.encoding = 'utf8'
    desc_content = desc_page.text

    total_content = [content, detail_content, map_info_content, desc_content]
    other_info = {'source_id': '1000', 'city_id': '50795'}
    result = hilton_parser(total_content, url, other_info)
    try:
        session = DBSession()
        session.merge(result)
        session.commit()
        session.close()
    except Exception as e:
        print str(e)