if __name__ == '__main__': from proj.my_lib.Common.Browser import MySession session = MySession() # url = 'http://www.hilton.com.cn/zh-CN/hotel/Beijing/hilton-beijing-wangfujing-BJSWFHI/' # url = 'http://www.hilton.com.cn/zh-cn/hotel/sharjah/hilton-sharjah-SHJHSHI/' url = 'http://www.hilton.com.cn/zh-cn/hotel/cairo/ramses-hilton-CAIRHTW/' # url2 = 'http://www.hilton.com.cn/zh-cn/hotel/cairo/ramses-hilton-CAIRHTW/popup/hotelDetails.html' # url3 = 'http://www3.hilton.com/zh_CN/hotels/china/ramses-hilton-CAIRHTW/popup/hotelDetails.html' detail_url = 'http://www3.hilton.com/zh_CN/hotels/china/{}/popup/hotelDetails.html'.format( url.split('/')[-2]) map_info_url = url + 'maps-directions.html' desc_url = url + 'about.html' page = session.get(url) page.encoding = 'utf8' content = page.text detail_content = session.get(detail_url).text map_info_content = session.get(map_info_url).text desc_page = session.get(desc_url) desc_page.encoding = 'utf8' desc_content = desc_page.text total_content = [content, detail_content, map_info_content, desc_content] other_info = {'source_id': '1000', 'city_id': '50795'} result = hilton_parser(total_content, url, other_info) try: session = DBSession() session.merge(result) session.commit()
def hotel_routine_base_data(self, source, url, other_info, **kwargs):
    """Fetch one hotel page, parse it, persist the result and archive the page.

    The task runs as a sequence of stages.  When a stage fails,
    ``self.error_code`` is set to the code of that stage, the exception is
    logged, and the exception is re-raised to the caller.

    Error codes set on ``self.error_code``:
        0   -- initial value, no stage has failed
        12  -- task initialisation (URL normalisation for ``hotels``) failed
        22  -- downloading the page failed
        102 -- ``parse_hotel`` raised ``TypeCheckError``
        27  -- ``parse_hotel`` raised any other exception
        33  -- merging/committing the parsed result to the database failed
        104 -- saving the task and page content failed
    (Code 101, "modify request parameters", is reserved; that stage is
    currently a no-op and cannot fail.)

    Args:
        source: Name of the hotel source.  ``source.title()`` is stored in
            ``self.task_source``; the value ``'hotels'`` triggers URL
            normalisation.
        url: URL of the hotel detail page.
        other_info: Mapping forwarded to ``parse_hotel``; its ``'source_id'``
            and ``'city_id'`` entries are also read when archiving the page.
        **kwargs: Must contain ``'mongo_task_id'``, which is passed as the
            ``task_id`` when archiving the page.

    Raises:
        Exception: whatever exception the failing stage raised, re-raised
            unchanged after ``self.error_code`` has been set.
    """
    self.task_source = source.title()
    self.task_type = 'Hotel'
    self.error_code = 0

    # Initialise the task.
    try:
        # hotels: rebuild the canonical detail URL from the numeric hotel id.
        if source == 'hotels':
            # findall(...)[0] raises IndexError when the URL has no hotel id,
            # which is reported as an initialisation failure below.
            hotel_id = re.findall(r'hotel-id=(\d+)', url)[0]
            url = 'http://zh.hotels.com/hotel/details.html?hotel-id=' + hotel_id
    except Exception as e:
        self.error_code = 12
        logger.exception(e)
        raise

    # Modify request parameters: nothing to do for this task, so the stage
    # (error code 101) has no code that could fail.

    # Download the page.
    try:
        http_session = MySession()
        page = http_session.get(url, timeout=240)
        page.encoding = 'utf8'
        content = page.text
    except Exception as e:
        self.error_code = 22
        logger.exception(e)
        raise

    # Parse the page.
    try:
        result = parse_hotel(
            content=content,
            url=url,
            other_info=other_info,
            source=source,
            part="NULL",
        )
    except TypeCheckError as e:
        self.error_code = 102
        logger.exception(e)
        raise
    except Exception as e:
        self.error_code = 27
        logger.exception(e)
        raise

    # Persist the parsed result.
    try:
        db_session = DBSession()
        try:
            db_session.merge(result)
            db_session.commit()
        finally:
            # Always release the DB session, including when merge/commit
            # raised; previously it was only closed on success.
            db_session.close()
    except Exception as e:
        self.error_code = 33
        logger.exception(e)
        raise

    # Save the page content after a successful crawl.
    try:
        save_task_and_page_content(
            task_name='hotelinfo_routine_{0}'.format(source),
            content=content,
            task_id=kwargs['mongo_task_id'],
            source=source,
            source_id=other_info['source_id'],
            city_id=other_info['city_id'],
            url=url,
        )
    except Exception as e:
        self.error_code = 104
        logger.exception(e)
        raise
'recommended_time': recommended_time, 'introduction': desc, 'prize': prize, 'traveler_choice': traveler_choice, 'first_review_id': first_review_id, 'imgurl': image_urls, 'site': site, 'id': source_id, 'source_city_id': source_city_id, 'url': url } return result def insert_db(result, city_id): result['city_id'] = city_id db_localhost.insert('shop', **result) # sql = "insert into tp_attr_basic (`source`, `name`, `name_en`, `phone`, `map_info`, `address`, `opentime`, `star`, `ranking`, `tagid`, `commentcounts`, `recommended_time`, `introduction`, `prize`, `traveler_choice`, `first_review_id`, `imgurl`,`miaoji_id`, `id`, `source_city_id`, `url`, `site_raw`, `site_before_301`, `city_id`) VALUES (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)" # result = list(result) # result.append(city_id) # return db.ExecuteSQL(sql, tuple(result)) if __name__ == '__main__': # url = 'https://www.tripadvisor.cn/Attraction_Review-g143034-d108754-Reviews-Nahuku_Thurston_Lava_Tube-Hawaii_Volcanoes_National_Park_Island_of_Hawaii_Hawaii.html' url = 'https://www.tripadvisor.cn/Attraction_Review-g1024140-d10000541-Reviews-Castaway_Yoga-Ko_Lipe_Satun_Province.html' content = ss.get(url).content result = parse(content, url) insert_db(result, 10086)