Beispiel #1
0
if __name__ == '__main__':
    from proj.my_lib.Common.Browser import MySession

    session = MySession()
    # url = 'http://www.hilton.com.cn/zh-CN/hotel/Beijing/hilton-beijing-wangfujing-BJSWFHI/'
    # url = 'http://www.hilton.com.cn/zh-cn/hotel/sharjah/hilton-sharjah-SHJHSHI/'
    url = 'http://www.hilton.com.cn/zh-cn/hotel/cairo/ramses-hilton-CAIRHTW/'
    # url2 = 'http://www.hilton.com.cn/zh-cn/hotel/cairo/ramses-hilton-CAIRHTW/popup/hotelDetails.html'
    # url3 = 'http://www3.hilton.com/zh_CN/hotels/china/ramses-hilton-CAIRHTW/popup/hotelDetails.html'
    detail_url = 'http://www3.hilton.com/zh_CN/hotels/china/{}/popup/hotelDetails.html'.format(
        url.split('/')[-2])
    map_info_url = url + 'maps-directions.html'
    desc_url = url + 'about.html'

    page = session.get(url)
    page.encoding = 'utf8'
    content = page.text
    detail_content = session.get(detail_url).text
    map_info_content = session.get(map_info_url).text
    desc_page = session.get(desc_url)
    desc_page.encoding = 'utf8'
    desc_content = desc_page.text

    total_content = [content, detail_content, map_info_content, desc_content]
    other_info = {'source_id': '1000', 'city_id': '50795'}
    result = hilton_parser(total_content, url, other_info)
    try:
        session = DBSession()
        session.merge(result)
        session.commit()
Beispiel #2
0
def hotel_routine_base_data(self, source, url, other_info, **kwargs):
    self.task_source = source.title()
    self.task_type = 'Hotel'

    self.error_code = 0

    # 初始化任务
    try:
        # hotels
        if source == 'hotels':
            hotel_id = re.findall('hotel-id=(\d+)', url)[0]
            url = 'http://zh.hotels.com/hotel/details.html?hotel-id=' + hotel_id
    except Exception as e:
        self.error_code = 12
        logger.exception(e)
        raise e

    # 修改请求参数
    try:
        pass
    except Exception as e:
        self.error_code = 101
        logger.exception(e)
        raise e

    try:
        session = MySession()
        page = session.get(url, timeout=240)
        page.encoding = 'utf8'
        content = page.text
    except Exception as e:
        self.error_code = 22
        logger.exception(e)
        raise e

    try:
        result = parse_hotel(content=content,
                             url=url,
                             other_info=other_info,
                             source=source,
                             part="NULL")
    except TypeCheckError as e:
        self.error_code = 102
        logger.exception(e)
        raise e
    except Exception as e:
        self.error_code = 27
        logger.exception(e)
        raise e

    try:
        session = DBSession()
        session.merge(result)
        session.commit()
        session.close()
    except Exception as e:
        self.error_code = 33
        logger.exception(e)
        raise e

    try:
        # 保存抓取成功后的页面信息
        save_task_and_page_content(
            task_name='hotelinfo_routine_{0}'.format(source),
            content=content,
            task_id=kwargs['mongo_task_id'],
            source=source,
            source_id=other_info['source_id'],
            city_id=other_info['city_id'],
            url=url)
    except Exception as e:
        self.error_code = 104
        logger.exception(e)
        raise e
Beispiel #3
0
        'recommended_time': recommended_time,
        'introduction': desc,
        'prize': prize,
        'traveler_choice': traveler_choice,
        'first_review_id': first_review_id,
        'imgurl': image_urls,
        'site': site,
        'id': source_id,
        'source_city_id': source_city_id,
        'url': url
    }
    return result


def insert_db(result, city_id):
    result['city_id'] = city_id
    db_localhost.insert('shop', **result)

    # sql = "insert into tp_attr_basic (`source`, `name`, `name_en`, `phone`, `map_info`, `address`, `opentime`, `star`, `ranking`, `tagid`, `commentcounts`, `recommended_time`, `introduction`, `prize`, `traveler_choice`, `first_review_id`, `imgurl`,`miaoji_id`, `id`, `source_city_id`, `url`, `site_raw`, `site_before_301`, `city_id`) VALUES (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)"
    # result = list(result)
    # result.append(city_id)
    # return db.ExecuteSQL(sql, tuple(result))


if __name__ == '__main__':
    # url = 'https://www.tripadvisor.cn/Attraction_Review-g143034-d108754-Reviews-Nahuku_Thurston_Lava_Tube-Hawaii_Volcanoes_National_Park_Island_of_Hawaii_Hawaii.html'
    url = 'https://www.tripadvisor.cn/Attraction_Review-g1024140-d10000541-Reviews-Castaway_Yoga-Ko_Lipe_Satun_Province.html'
    content = ss.get(url).content
    result = parse(content, url)
    insert_db(result, 10086)