def hotel_routine_base_data(self, source, url, other_info, **kwargs):
    """Fetch one hotel detail page, parse it, store the result and archive the page.

    Every stage sets a stage-specific value on ``self.error_code`` before
    logging and re-raising, so the failing stage can be identified afterwards:

    * 12  -- task initialisation (URL normalisation) failed
    * 22  -- downloading the page failed
    * 102 -- ``parse_hotel`` raised ``TypeCheckError``
    * 27  -- ``parse_hotel`` raised any other exception
    * 33  -- merging/committing the parsed result (or closing the DB session) failed
    * 104 -- saving the task record and page content failed

    ``self.error_code`` stays 0 when every stage succeeds.

    Args:
        source: Source site name. ``source.title()`` is stored as
            ``self.task_source``; the value ``'hotels'`` triggers URL
            normalisation.
        url: Hotel detail page URL. For ``'hotels'`` it must contain
            ``hotel-id=<digits>`` and is rewritten to the zh.hotels.com
            details URL for that id.
        other_info: Mapping forwarded to ``parse_hotel``; its ``'source_id'``
            and ``'city_id'`` entries are read when archiving the page.
        **kwargs: Must contain ``'mongo_task_id'``, used when archiving.

    Returns:
        None.

    Raises:
        Exception: Whatever the failing stage raised, re-raised unchanged
            after ``self.error_code`` has been set and the error logged.
    """
    self.task_source = source.title()
    self.task_type = 'Hotel'
    self.error_code = 0

    # Initialise the task: rebuild the URL for sources that need it.
    try:
        if source == 'hotels':
            # An IndexError here (no hotel-id in the URL) is reported as code 12.
            hotel_id = re.findall(r'hotel-id=(\d+)', url)[0]
            url = 'http://zh.hotels.com/hotel/details.html?hotel-id=' + hotel_id
    except Exception as e:
        self.error_code = 12
        logger.exception(e)
        # Bare ``raise`` keeps the original traceback (``raise e`` resets it
        # on Python 2).
        raise

    # Request-parameter adjustment: nothing to adjust here. The original
    # placeholder for this stage was an empty ``try: pass`` guarded by error
    # code 101, which could never fire.

    # Download the page.
    try:
        http_session = MySession()
        page = http_session.get(url, timeout=240)
        page.encoding = 'utf8'
        content = page.text
    except Exception as e:
        self.error_code = 22
        logger.exception(e)
        raise

    # Parse the page into a result object.
    try:
        result = parse_hotel(content=content, url=url, other_info=other_info,
                             source=source, part="NULL")
    except TypeCheckError as e:
        self.error_code = 102
        logger.exception(e)
        raise
    except Exception as e:
        self.error_code = 27
        logger.exception(e)
        raise

    # Persist the parsed result.
    try:
        db_session = DBSession()
        try:
            db_session.merge(result)
            db_session.commit()
        finally:
            # Close on failure as well as success so a failed merge/commit
            # does not leave the session open.
            db_session.close()
    except Exception as e:
        self.error_code = 33
        logger.exception(e)
        raise

    # Save the page content once the crawl has succeeded.
    try:
        save_task_and_page_content(
            task_name='hotelinfo_routine_{0}'.format(source),
            content=content,
            task_id=kwargs['mongo_task_id'],
            source=source,
            source_id=other_info['source_id'],
            city_id=other_info['city_id'],
            url=url)
    except Exception as e:
        self.error_code = 104
        logger.exception(e)
        raise
# Manual run: download the pages of one Hilton hotel, parse them with
# hilton_parser and store the result through DBSession.
session = MySession()

# Other hotel pages previously tried with this script:
# url = 'http://www.hilton.com.cn/zh-CN/hotel/Beijing/hilton-beijing-wangfujing-BJSWFHI/'
# url = 'http://www.hilton.com.cn/zh-cn/hotel/sharjah/hilton-sharjah-SHJHSHI/'
url = 'http://www.hilton.com.cn/zh-cn/hotel/cairo/ramses-hilton-CAIRHTW/'
# Candidate detail-popup URLs for the same hotel (kept for reference):
# url2 = 'http://www.hilton.com.cn/zh-cn/hotel/cairo/ramses-hilton-CAIRHTW/popup/hotelDetails.html'
# url3 = 'http://www3.hilton.com/zh_CN/hotels/china/ramses-hilton-CAIRHTW/popup/hotelDetails.html'

# ``url`` ends with '/', so split('/')[-1] is '' and [-2] is the last path
# segment (here 'ramses-hilton-CAIRHTW').
detail_url = 'http://www3.hilton.com/zh_CN/hotels/china/{}/popup/hotelDetails.html'.format(
    url.split('/')[-2])
map_info_url = url + 'maps-directions.html'
desc_url = url + 'about.html'

# The main and description pages are decoded as utf8 explicitly; the detail
# and map pages use whatever encoding the response object reports.
page = session.get(url)
page.encoding = 'utf8'
content = page.text
detail_content = session.get(detail_url).text
map_info_content = session.get(map_info_url).text
desc_page = session.get(desc_url)
desc_page.encoding = 'utf8'
desc_content = desc_page.text

# Order as passed to hilton_parser: main, detail, map, description.
total_content = [content, detail_content, map_info_content, desc_content]
other_info = {'source_id': '1000', 'city_id': '50795'}
result = hilton_parser(total_content, url, other_info)

try:
    # NOTE: ``session`` is rebound from the HTTP session to a DB session here.
    session = DBSession()
    try:
        session.merge(result)
        session.commit()
    finally:
        # Close even when merge/commit fails so the session is not left open.
        session.close()
except Exception as e:
    # Best-effort store: report the failure and carry on. The parenthesised
    # form prints the same text on Python 2 and also parses on Python 3.
    print(str(e))