def getPagesHasItems(self, index=0):
    """Fetch the next listing page under ``self.start_url`` and return its URL.

    Walks the paginated listing one page per call. State is kept on the
    instance: ``currentPageIndex`` is the page to request next and
    ``currentTotalPageCounts`` is the number of pages (0 means "not known
    yet").

    :param index: page to start from. Only consulted while the total page
        count is still unknown; 0 (the default) keeps the current page index.
        Negative values are clamped to 0.
    :return: the URL of the page that was just requested, or ``None`` when
        the walk has gone past the last page (the counters are reset so a new
        walk can start) or when the page could not be fetched/parsed.
    :raises SystemExit: if ``index`` cannot be converted to an integer.
    """
    # The previous call stepped past the last page: reset the walk and
    # report "no more pages" to the caller.
    if (self.currentPageIndex != 1
            and self.currentPageIndex > self.currentTotalPageCounts):
        self.currentTotalPageCounts = 0
        self.currentPageIndex = 1
        return None

    # First request of a walk: honour the caller-supplied start page.
    if self.currentTotalPageCounts == 0:
        try:
            index = max(0, int(index))
        except (TypeError, ValueError, OverflowError):
            self.logger.error('起始页为整数.')
            # Same exception the interactive-only ``exit()`` helper raises,
            # without depending on the ``site`` module having injected it.
            raise SystemExit(-1)
        # Compare by value; ``is not 0`` tested object identity.
        if index != 0:
            self.currentPageIndex = index

    fullUrl = self.start_url + 'page' + str(self.currentPageIndex) + '/'
    try:
        mainPage = requests.get(fullUrl, headers=Utils.headers)
        soup = BeautifulSoup(mainPage.text, "lxml")
        # Advance only after the page was fetched and parsed.
        self.currentPageIndex += 1
    except Exception:
        self.logger.error('节目页面获取失败!')
        return None

    if self.currentTotalPageCounts == 0:
        try:
            # Work out how many pages the listing has.
            self.currentTotalPageCounts = Utils.getPageCount(soup)
        except Exception:
            # Best effort: fall back to visiting only the first page.
            self.logger.error('获取最大页数失败,默认只访问第一页!')
            self.currentTotalPageCounts = 1
        self.logger.info('此次访问共有' + str(self.currentTotalPageCounts) + '页')
    return fullUrl
def getSomeArticlesPageSoup(self, index=0, skipfail=False):
    """Fetch and parse one page of the item (programme) currently being walked.

    Steps performed:

    1. If no item is in progress, pick the next unvisited one from
       ``self.currentSoupList``; when that list is exhausted, load the next
       listing from ``self.fromUrls`` and retry.
    2. Request page ``index`` of the item and parse it.
    3. On the first visit of an item, store its info in
       ``self.currentItemInfo`` and determine its total page count.

    :param index: page number to fetch; 0 (the default) means "the next
        page", i.e. ``self.currentPageIndex``. Negative values are clamped
        to 0.
    :param skipfail: when false, a listing URL that failed to load is retried
        on the next call; when true it is skipped.
    :return: ``(itemFullUrl, resSoup)`` - the page URL (including the page
        number) and its parsed soup - or ``None`` when there are no more
        items and no more listing URLs, or when ``index`` is larger than the
        freshly determined page count.
    :raises Exception: if a listing URL cannot be loaded, the item page
        cannot be fetched/parsed, or the item info cannot be extracted.
    """
    # No item is currently in progress: select the next one.
    if self.currentSoup is None:
        try:
            # Pop candidates until one that has not been visited turns up.
            # pop() on an exhausted list raises, which lands in the handler.
            # NOTE(review): the broad catch is kept from the original; the
            # type of currentSoupList is not visible here.
            while True:
                self.currentSoup = self.currentSoupList.pop()
                itemUrl = Utils.listenHost + self.currentSoup["href"]
                if itemUrl not in self.itemUrls:
                    self.itemUrls.add(itemUrl)
                    break
        except Exception:
            # Out of items: see whether another listing URL can supply more.
            self.currentSoup = None
            if self.fromUrlsIndex >= len(self.fromUrls):
                return None
            currentFromUrl = self.fromUrls[self.fromUrlsIndex]
            self.fromUrlsIndex += 1
            try:
                self.currentSoupList = self.getItemsFromUrl(currentFromUrl)
            except Exception:
                message = '节目包含页面访问失败,地址: ' + currentFromUrl
                self.logger.error(message)
                # Unless told to skip failures, step back so the next call
                # retries this same listing URL.
                if not skipfail:
                    self.fromUrlsIndex -= 1
                raise Exception(message)
            # Retry with the refreshed list. Deliberately outside the try
            # above, so a failure while fetching an item page is not reported
            # as a listing failure and does not roll back fromUrlsIndex.
            return self.getSomeArticlesPageSoup(index=index, skipfail=skipfail)
        # A new item was selected, so reset the per-item state.
        self.currentItemInit()
    else:
        itemUrl = Utils.listenHost + self.currentSoup["href"]

    # The item is known; resolve which page to fetch (0 = next page).
    index = max(0, int(index))
    if index == 0:
        index = self.currentPageIndex
    itemFullUrl = itemUrl + 'page' + str(index) + '/'
    try:
        articlesContent = requests.get(itemFullUrl, headers=Utils.headers)
        resSoup = BeautifulSoup(articlesContent.text, "lxml")
    except Exception:
        message = '获取某节目某页失败: ' + itemFullUrl
        self.logger.error(message)
        raise Exception(message)

    # Work deferred until the first visit of this item, whichever page it is.
    # 1. Persist the item info.
    if not self.hasBeenSaved:
        # The item info is obtainable even when index is out of range.
        try:
            self.currentItemInfo = self.getListenItemInfo(resSoup, itemFullUrl)
        except Exception:
            message = '节目信息存储失败: ' + itemFullUrl
            self.logger.error(message)
            raise Exception(message)
        self.hasBeenSaved = True

    # 2. Determine the total page count.
    if self.currentTotalPageCounts == 0:
        # If the requested index is too large the page count cannot be
        # obtained, in which case index is treated as invalid.
        self.currentTotalPageCounts = Utils.getPageCount(resSoup)
        # Invalid index: this fetch failed, restore the "unknown" page count.
        if index > self.currentTotalPageCounts:
            self.currentTotalPageCounts = 0
            return None

    # The next call continues from the page after the one just fetched.
    self.currentPageIndex = index + 1
    if self.currentPageIndex > self.currentTotalPageCounts:
        self.currentSoup = None

    # Item finished and the item quota reached: raise the over-limit flag.
    if self.currentSoup is None and self.getItemsSize() == self.limit:
        self.isOverLimited = True

    # Full item URL (including the page number) and that page's soup.
    return (itemFullUrl, resSoup)