def hotel_base_data(self, source, url, other_info, **kwargs):
    """Crawl one hotel detail page through a SOCKS5 platform proxy and parse it.

    Args:
        source: crawl source name (e.g. 'agoda'); forwarded to parse_hotel.
        url: hotel detail page URL.
        other_info: dict holding at least 'source_id'; mutated here to carry
            the agoda "about" fragment under 'about_content'.
        **kwargs: must contain 'task_id', marked done via update_task on success.

    Returns:
        The parse_hotel result on success; on failure the proxy is penalized
        (code '23') and the task is retried.
    """
    started_at = time.time()
    proxy = get_proxy(source="Platform")
    proxies = {'http': 'socks5://' + proxy, 'https': 'socks5://' + proxy}
    headers = {'User-agent': GetUserAgent()}
    try:
        page = requests.get(url, proxies=proxies, headers=headers, timeout=240)
        page.encoding = 'utf8'
        content = page.text

        # agoda 特殊情况 start — the "AboutHotel" text lives on a separate endpoint.
        url_about = 'https://www.agoda.com/NewSite/zh-cn/Hotel/AboutHotel?hotelId={0}&languageId=8&hasBcomChildPolicy=False'.format(
            other_info['source_id'])
        # BUG FIX: this request previously had no timeout and could hang the
        # worker forever; bounded like the main request.
        page_about = requests.get(url=url_about, headers=headers, timeout=240)
        page_about.encoding = 'utf8'
        other_info['about_content'] = page_about.text
        # agoda end

        result = parse_hotel(content=content, url=url,
                             other_info=other_info, source=source)
        if not result:
            # Empty parse result: punish the proxy and retry the task.
            update_proxy('Platform', proxy, started_at, '23')
            self.retry()
        else:
            update_task(kwargs['task_id'])
            print("Success with " + proxy + ' CODE 0')
            update_proxy('Platform', proxy, started_at, '0')
            return result
    except Exception:
        update_proxy('Platform', proxy, started_at, '23')
        # BUG FIX: traceback.format_exc() takes no exception argument (its
        # only parameter is a frame limit); format the current exception.
        self.retry(exc=traceback.format_exc())
def hotel_static_base_data(self, parent_task_id, task_name, source,
                           source_id, city_id, hotel_url, **kwargs):
    """Re-parse a previously crawled hotel page and persist the result.

    The raw page content was stored by an earlier crawl task; this task
    fetches it by (parent_task_id, task_name), parses it and merges the
    parsed row into the database.

    Args:
        parent_task_id: id of the crawl task that stored the page content.
        task_name: storage bucket / part name, forwarded to parse_hotel.
        source: crawl source name; also recorded on self.task_source.
        source_id, city_id: identifiers passed through to the parser.
        hotel_url: original page URL (parser context only; nothing is fetched).

    Returns:
        The parse_hotel result on success (error_code set to 0).

    Raises:
        Exception('db error') when parsing yields nothing; any DB failure is
        re-raised with error_code 33.
    """
    logger.info("parent task id: {0}, start task".format(parent_task_id))
    self.task_source = source.title()
    self.task_type = 'HotelStaticDataParse'

    # 获取保存的页面信息 — fetch the stored page content.
    other_info = {'source_id': source_id, 'city_id': city_id}
    logger.info(
        'http://10.10.180.145:8888/hotel_page_viewer?task_name=hotel_base_data_tripadvisor_total_new&id='
        + parent_task_id)
    content = get_page_content(task_id=parent_task_id, task_name=task_name)
    logger.info(
        "parent task id: {0}, end of get hotel content, start parse hotel".
        format(parent_task_id))
    result = parse_hotel(content=content, url=hotel_url,
                         other_info=other_info, source=source, part=task_name)
    logger.info(
        "parent task id: {0}, end of parse hotel, start insert db".format(
            parent_task_id))
    if not result:
        raise Exception('db error')

    session = DBSession()
    try:
        # logger.info(str(result))
        session.merge(result)
        session.commit()
    except Exception as e:
        self.error_code = 33
        logger.exception(e)
        raise e
    finally:
        # BUG FIX: the session used to leak whenever merge/commit raised;
        # close it on every path.
        session.close()
    logger.info("parent task id: {0}, end of insert db".format(parent_task_id))
    self.error_code = 0
    return result
def hotel_routine_base_data(self, source, url, other_info, **kwargs):
    """Routine hotel crawl: download, parse, store to DB, archive the page.

    self.error_code is set to a stage-specific code before any failure is
    re-raised, so callers can tell which stage broke:
        12 URL normalization, 22 download, 102 type check, 27 parse,
        33 DB write, 104 page archiving; 0 on success.

    Args:
        source: crawl source name, e.g. 'hotels'.
        url: hotel detail page URL (rewritten to canonical form for 'hotels').
        other_info: dict with 'source_id' and 'city_id'.
        **kwargs: must contain 'mongo_task_id' for page archiving.
    """
    self.task_source = source.title()
    self.task_type = 'Hotel'
    self.error_code = 0  # 初始化任务 — reset per-task state

    # Normalize the 'hotels' source URL to its canonical detail form.
    try:
        if source == 'hotels':
            hotel_id = re.findall(r'hotel-id=(\d+)', url)[0]
            url = 'http://zh.hotels.com/hotel/details.html?hotel-id=' + hotel_id
    except Exception as e:
        self.error_code = 12
        logger.exception(e)
        raise e

    # NOTE: a dead placeholder block ("修改请求参数", error_code 101,
    # `try: pass`) was removed here — it could never raise nor do anything.

    # Download the page.
    try:
        session = MySession()
        page = session.get(url, timeout=240)
        page.encoding = 'utf8'
        content = page.text
    except Exception as e:
        self.error_code = 22
        logger.exception(e)
        raise e

    # Parse; type-check failures get their own error code.
    try:
        result = parse_hotel(content=content, url=url, other_info=other_info,
                             source=source, part="NULL")
    except TypeCheckError as e:
        self.error_code = 102
        logger.exception(e)
        raise e
    except Exception as e:
        self.error_code = 27
        logger.exception(e)
        raise e

    # Persist the parsed row.
    db_session = DBSession()
    try:
        db_session.merge(result)
        db_session.commit()
    except Exception as e:
        self.error_code = 33
        logger.exception(e)
        raise e
    finally:
        # BUG FIX: the session used to leak when merge/commit raised.
        db_session.close()

    # 保存抓取成功后的页面信息 — archive the raw page for later re-parsing.
    try:
        save_task_and_page_content(
            task_name='hotelinfo_routine_{0}'.format(source),
            content=content,
            task_id=kwargs['mongo_task_id'],
            source=source,
            source_id=other_info['source_id'],
            city_id=other_info['city_id'],
            url=url)
    except Exception as e:
        self.error_code = 104
        logger.exception(e)
        raise e
def _execute(self, **kwargs):
    """Crawl one hotel's detail page(s), parse them, and persist the result.

    Flow:
      1. Fetch content. Most sources need a single GET through a shared
         MySession; several chains (ihg/holiday/accor/marriott/hilton) need
         multi-URL fetches, special headers, or a dedicated proxy, and the
         starwood group is served from a pre-crawled detail database.
      2. Parse into ``result`` (parse_hotel / parse_hotel_info).
      3. Write a geo-indexed copy to MongoDB, then insert into MySQL.

    Returns:
        0 on success (also stored in self.task.error_code).

    Raises:
        ServiceStandardError: on fetch, Mongo, or MySQL failure.
    """
    url = self.task.kwargs['url']
    source = self.task.kwargs['source']
    source_id = self.task.kwargs['source_id']
    city_id = self.task.kwargs['city_id']
    country_id = self.task.kwargs['country_id']
    hid = self.task.kwargs['hid']
    headers = {}
    other_info = {'source_id': source_id, 'city_id': city_id}

    if source in ['starwood', 'hyatt', 'gha', 'shangrila', 'fourseasons']:
        # These chains are read from an already-crawled detail database.
        error_code, res, page_store_key_list = hotel_detail_database(
            url, source)
        if error_code == 0:
            result = parse_hotel_info(res)
        else:
            raise ServiceStandardError(error_code=error_code)
    else:
        with MySession(need_cache=True) as session:
            # booking start — booking rejects requests without a referer.
            if source == 'booking':
                headers['Referer'] = 'http://www.booking.com'
            # booking end
            session.headers.update(headers)
            start = time.time()

            if source not in ('hilton', 'ihg', 'holiday', 'accor',
                              'marriott'):
                # Common case: a single page fetch.
                page = session.get(url, timeout=240)
                page.encoding = 'utf8'
                content = page.text
            elif source == 'ihg':
                # Two URLs joined by '#####'; content is their page pair.
                url1, url2 = url.split('#####')
                page1 = session.get(url1, timeout=240)
                page1.encoding = 'utf8'
                content1 = page1.text
                page2 = session.get(url2, timeout=240)
                page2.encoding = 'utf8'
                content2 = page2.text
                content = [content1, content2]
            elif source == 'holiday':
                # Note the swapped unpack order: second token is the API URL.
                url2, url1 = url.split('#####')
                page1 = requests.get(
                    url1,
                    headers={
                        'x-ihg-api-key': 'se9ym5iAzaW8pxfBjkmgbuGjJcr3Pj6Y',
                        'ihg-language': 'zh-CN'
                    },
                    timeout=240)
                page1.encoding = 'utf8'
                content1 = page1.text
                page2 = requests.get(
                    url2,
                    timeout=240,
                    headers={
                        'accept': 'application/json, text/plain, */*',
                        'Content-Type': 'application/json; charset=UTF-8',
                        'user-agent':
                        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.84 Safari/537.36',
                        'ihg-language': 'zh-CN',
                    })
                page2.encoding = 'utf8'
                content2 = page2.text
                # Same URL again without the language header: English copy.
                page3 = requests.get(
                    url1,
                    headers={
                        'x-ihg-api-key': 'se9ym5iAzaW8pxfBjkmgbuGjJcr3Pj6Y'
                    },
                    timeout=240)
                page3.encoding = 'utf8'
                content3 = page3.text
                content = (content1, content2, content3)
            elif source == 'accor':
                # accor is fetched through a dedicated SOCKS5 proxy service.
                proxy_url = "http://10.10.239.46:8087/proxy?source=pricelineFlight&user=crawler&passwd=spidermiaoji2014"
                # BUG FIX: both requests below previously had no timeout and
                # could hang the worker indefinitely.
                r = requests.get(proxy_url, timeout=240)
                proxies = {'https': "socks5://" + str(r.text)}
                headers = {
                    "User-Agent":
                    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.84 Safari/537.36"
                }
                page = requests.get(url, headers=headers, verify=False,
                                    proxies=proxies, timeout=240)
                page.encoding = 'utf8'
                content = page.text
            elif source == 'marriott':
                # url packs extra key=value tokens after '#####'; token 1,
                # when not key=value, is the English hotel name.
                url_list = url.split('#####')
                url = url_list[0]
                # BUG FIX: was `url_list.index(i)` — O(n) per token and wrong
                # for duplicate tokens; enumerate gives the real position.
                for idx, token in enumerate(url_list):
                    parts = token.split('=')
                    if len(parts) > 1:
                        key, value = parts[0], parts[1]
                        # 'longtitude' is misspelled on purpose: downstream
                        # consumers read this exact key.
                        if key == 'longtitude':
                            other_info['longtitude'] = value
                        if key == 'latitude':
                            other_info['latitude'] = value
                    elif idx == 1:
                        other_info['hotel_name_en'] = token
                url2 = url.replace("travel", "hotel-photos")
                url3 = url.replace("travel/", "maps/travel/")
                url4 = url.replace("hotels/", "hotels/fact-sheet/")
                headers = {
                    'User-Agent':
                    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.11; rv:47.0) Gecko/20100101 Firefox/47.0'
                }
                if "https://www.marriott.com" in url:
                    page1 = requests.get(url, headers=headers, timeout=240)
                    page2 = requests.get(url2, headers=headers, timeout=240)
                    page3 = requests.get(url3, headers=headers, timeout=240)
                    page4 = requests.get(url4, headers=headers, timeout=240)
                    page1.encoding = 'utf8'
                    page2.encoding = 'utf8'
                    page3.encoding = 'utf8'
                    page4.encoding = 'utf8'
                    content = (page1.text, page2.text, page3.text, page4.text)
                else:
                    url2 = url + "/hotel-overview"
                    page1 = requests.get(url, headers=headers, timeout=240)
                    page2 = requests.get(url2, headers=headers, timeout=240)
                    page1.encoding = 'utf8'
                    page2.encoding = 'utf8'
                    content = (page1.text, page2.text)
            else:
                # hilton: overview + detail popup + map + description pages.
                session.auto_update_host = False
                hilton_index = url.find('index.html')
                if hilton_index > -1:
                    url = url[:hilton_index]
                split_args = url.split('/')
                detail_url = (
                    'http://www3.hilton.com/zh_CN/hotels/{0}/{1}/popup/hotelDetails.html'
                    .format(split_args[-3], split_args[-2]))
                map_info_url = url + 'maps-directions.html'
                desc_url = url + 'about.html'
                page = session.get(url)
                map_info_page = session.get(map_info_url)
                desc_page = session.get(desc_url)
                detail_page = session.get(detail_url)
                page.encoding = 'utf8'
                detail_page.encoding = 'utf8'
                map_info_page.encoding = 'utf8'
                desc_page.encoding = 'utf8'
                main_content = page.text
                logger.info(detail_url)
                content = [
                    main_content, detail_page.text, map_info_page.text,
                    desc_page.text
                ]

        logger.debug("[crawl_data][Takes: {}]".format(time.time() - start))
        start = time.time()
        result = parse_hotel(content=content,
                             url=url,
                             other_info=other_info,
                             source=source,
                             part=self.task.task_name,
                             retry_count=self.task.used_times)
        # NOTE: was `parse_hotel.func_name` (Python-2-only); __name__ works
        # on both Python 2 and 3.
        logger.debug("[parse_hotel][func: {}][Takes: {}]".format(
            parse_hotel.__name__, time.time() - start))

    # Mirror the result into MongoDB with a 2dsphere location index;
    # duplicates are expected on retries and only logged.
    try:
        data_collections = mongo_data_client['ServicePlatform'][
            self.task.task_name]
        data_collections.create_index([('source', 1), ('source_id', 1)],
                                      unique=True,
                                      background=True)
        data_collections.create_index([('location', '2dsphere')],
                                      background=True)
        tmp_result = deepcopy(result.values(backdict=True))
        lon, lat = str(result.map_info).split(',')
        lon, lat = float(lon), float(lat)
        tmp_result.update(
            {'location': {
                'type': "Point",
                'coordinates': [lon, lat]
            }})
        # NOTE(review): Collection.save() is deprecated in pymongo >= 3.0;
        # kept for behavior parity — migrate to replace_one(upsert=True).
        data_collections.save(tmp_result)
    except pymongo.errors.DuplicateKeyError:
        logger.warning("[result already in db]")
    except Exception as exc:
        raise ServiceStandardError(
            error_code=ServiceStandardError.MONGO_ERROR,
            wrapped_exception=exc)

    # Insert into MySQL via the connection pool.
    start = time.time()
    service_platform_conn = None
    cursor = None
    try:
        service_platform_conn = service_platform_pool.connection()
        cursor = service_platform_conn.cursor()
        others_info = json.loads(result.others_info)
        others_info['hid'] = hid
        result.others_info = json.dumps(others_info)
        sql = result.generation_sql()
        sql = sql.format(table_name=self.task.task_name)
        values = result.values()
        self.logger.info(result.__dict__)
        cursor.execute(sql, values)
        service_platform_conn.commit()
    except Exception as e:
        logger.exception(e)
        raise ServiceStandardError(
            error_code=ServiceStandardError.MYSQL_ERROR, wrapped_exception=e)
    finally:
        # BUG FIX: cursor/connection previously leaked when execute/commit
        # raised; return them to the pool on every path.
        if cursor is not None:
            cursor.close()
        if service_platform_conn is not None:
            service_platform_conn.close()
    logger.debug("[Insert DB][Takes: {}]".format(time.time() - start))

    self.task.error_code = 0
    return self.task.error_code