def getScenery(city_name):
    """Crawl the scenery, strategy and food pages of one city concurrently.

    The city is looked up with ``DB.operateDB().searchCity``.  If the lookup
    is empty while ``countCity()`` is truthy, the city is reported as unknown.
    If the lookup is empty and ``countCity()`` is falsy, ``getCityURL`` is
    called on the qunar place page and the lookup is retried once.

    The three page URLs are built from the city number (``result[0][2]``) and
    the pinyin form of ``city_name``; one worker per page is then run on a
    three-thread pool and this function blocks until all of them finish.

    Args:
        city_name: Name of the city, used for the lookup, for the pinyin part
            of the URLs and as the argument of the strategy worker.

    Returns:
        None.  Unknown cities are reported with ``print``; a missing city
        number and failed workers are reported through ``logUtil``.
    """
    operate = DB.operateDB()
    result = operate.searchCity(city_name)
    if not result:
        if operate.countCity():
            print('没有这个城市!')
            return
        getCityURL('http://travel.qunar.com/place')
        result = operate.searchCity(city_name)
        if not result:
            # Still no row after the retry: indexing result[0] below would
            # raise IndexError, so report it like any other unknown city.
            print('没有这个城市!')
            return

    city_id = result[0][0]
    city_number = result[0][2]
    if not city_number:
        logUtil.getLogger(1).error(city_name + ':没有该旅游城市信息')
        return

    city_pinyin = Pinyin().get_pinyin(city_name).replace('-', '')

    # Build the scenery, strategy and food page URLs.  str.format is used for
    # all three so a non-string city number cannot break the concatenation.
    scenery_website = 'https://travel.qunar.com/p-cs{}-{}-jingdian'.format(
        city_number, city_pinyin)
    strategy_website = (
        'https://travel.qunar.com/travelbook/list/22-{}-{}/hot_heat/1.htm'
        .format(city_pinyin, city_number))
    cate_website = 'https://travel.qunar.com/p-cs{}-{}-meishi?page=1'.format(
        city_number, city_pinyin)

    # Each worker receives ONE positional argument: a (url, key) tuple,
    # exactly as in the original submit calls.
    tasks = (
        (qunarScenery.getScenery, (scenery_website, city_id)),
        (qunarStrategy.getStrategy, (strategy_website, city_name)),
        (qunarCate.getCate, (cate_website, city_id)),
    )

    # The context manager shuts the pool down so its threads are released.
    with ThreadPoolExecutor(3) as pool:
        futures = [pool.submit(worker, payload) for worker, payload in tasks]
        wait(futures, return_when=ALL_COMPLETED)

    # wait() never re-raises worker exceptions; log them instead of silently
    # dropping them.  The crawl stays best-effort: nothing is re-raised here.
    for future in futures:
        error = future.exception()
        if error is not None:
            logUtil.getLogger(1).error(
                city_name + ': crawl task failed: ' + repr(error))
def __init__(self):
    """Initialise the instance with fixed settings and a logger.

    Five private (name-mangled) attributes are set to constant values and
    ``self.logger`` is obtained from ``logUtil.getLogger(0)``.
    NOTE(review): the attribute names suggest database connection settings
    consumed by other methods of this class -- confirm against the rest of
    the class, which is not visible here.
    """
    # Where to connect.
    self.__host = 'localhost'
    self.__dbName = 'qunar'

    # Who connects.
    self.__user = '******'
    self.__password = '******'

    # Character set name.
    self.__char = 'utf8'

    self.logger = logUtil.getLogger(0)
import requests,random,time,re from bs4 import BeautifulSoup from python0_1.qunar_crawler.saveToDB import DB from python0_1.qunar_crawler.common import headMsg from python0_1.qunar_crawler.utils import logUtil User_Agent = headMsg.UA logger = logUtil.getLogger(1) # 将攻略内容爬取 # 参数:相应的攻略url,攻略编号 def getStrategyContent(url,strategy_number): operate = DB.operateDB() Hostreferer = { 'User-Agent': random.choice(User_Agent), 'Referer': 'https://www.qunar.com' } html = requests.get(url=url,headers=Hostreferer,timeout=5).text soup = BeautifulSoup(html, "html.parser") try: # print(url) # 爬取每个攻略页面的内容 box = soup.find('div', class_='container main-container') strategy_att = box.find('ul', class_='foreword_list') playTime = strategy_att.find('li', class_='f_item howlong') playTime = playTime.find('span', class_='data') if playTime != None else None playTime = playTime.string if playTime != None else 0 cost = strategy_att.find('li', class_='f_item howmuch') cost = cost.find('span', class_='data').string if cost != None else 0 theme = strategy_att.find('li', class_='f_item how') theme = theme.find('span', class_='data') if theme != None else None