Esempio n. 1
0
    def getPagesHasItems(self, index=0):
        """Fetch the next listing page and return the URL that was fetched.

        The method walks the pages under ``self.start_url`` one call at a
        time, tracking progress in ``self.currentPageIndex`` and the total
        number of pages in ``self.currentTotalPageCounts`` (0 means "not
        known yet", i.e. no page has been fetched in this traversal).

        :param index: page number to start from. It is only honoured while
            the total page count is still unknown; 0 (the default) keeps
            the current ``self.currentPageIndex``. Negative values are
            clamped to 0.
        :return: the full URL of the page that was fetched, or ``None``
            when the traversal ran past the last page (state is reset) or
            when the page could not be fetched/parsed. The two ``None``
            cases are not distinguishable by the caller.
        :raises SystemExit: with code -1 if ``index`` cannot be converted
            to an integer.
        """
        # Ran past the last known page: reset the paging state so that the
        # next call starts a fresh traversal, and signal "no more pages".
        if (self.currentPageIndex != 1
                and self.currentPageIndex > self.currentTotalPageCounts):
            self.currentTotalPageCounts = 0
            self.currentPageIndex = 1
            return None

        # Total still unknown => first fetch of this traversal, so this is
        # the only moment a caller-supplied start page is applied.
        if self.currentTotalPageCounts == 0:
            try:
                index = max(0, int(index))
            except (TypeError, ValueError, OverflowError):
                self.logger.error('起始页为整数.')
                # Raise SystemExit directly instead of calling exit(): the
                # exit() helper is injected by the site module and is not
                # guaranteed to exist in every interpreter setup.
                raise SystemExit(-1)
            # Compare by value; identity comparison against an int literal
            # is implementation-dependent.
            if index != 0:
                self.currentPageIndex = index

        fullUrl = self.start_url + 'page' + str(self.currentPageIndex) + '/'
        try:
            mainPage = requests.get(fullUrl, headers=Utils.headers)
            soup = BeautifulSoup(mainPage.text, "lxml")
        except Exception:
            self.logger.error('节目页面获取失败!')
            return None
        # Advance only after a successful fetch + parse, so a failed page
        # is attempted again on the next call.
        self.currentPageIndex += 1

        if self.currentTotalPageCounts == 0:
            try:
                # Read the number of pages from the page just fetched.
                self.currentTotalPageCounts = Utils.getPageCount(soup)
            except Exception:
                # Best effort: fall back to treating the listing as a
                # single page rather than failing the whole call.
                self.logger.error('获取最大页数失败,默认只访问第一页!')
                self.currentTotalPageCounts = 1
            self.logger.info('此次访问共有' + str(self.currentTotalPageCounts) + '页')

        return fullUrl
Esempio n. 2
0
    def getSomeArticlesPageSoup(self, index=0, skipfail=False):
        """Fetch and parse one page of the current item.

        What the method does, in order:

        1. If no item is currently being visited, take the next unseen
           item from ``self.currentSoupList``; when that list is exhausted,
           load the next listing page from ``self.fromUrls`` and retry.
        2. Fetch and parse page ``index`` of the item.
        3. On the first visit of an item, store its info via
           ``self.getListenItemInfo`` and read its total page count.

        :param index: page number to fetch. 0 (the default) means "continue
            from ``self.currentPageIndex``". Negative values are clamped
            to 0.
        :param skipfail: when false, a listing page that fails to load is
            attempted again on the next call; when true it is skipped.
        :return: ``(itemFullUrl, resSoup)`` — the item URL including the
            page number, and the parsed page. ``None`` when there is no
            listing page left to take items from, or when ``index`` is
            larger than the item's total page count.
        :raises Exception: if a listing page, the item page, or the item
            info cannot be obtained.
        """
        # No item is being visited right now: pick the next one.
        if self.currentSoup is None:
            try:
                # Pop until an item that has not been seen before turns up.
                # pop() on an exhausted list raises, which ends the search.
                while True:
                    self.currentSoup = self.currentSoupList.pop()
                    itemUrl = Utils.listenHost + self.currentSoup["href"]
                    if itemUrl not in self.itemUrls:
                        self.itemUrls.add(itemUrl)
                        break
            except Exception:
                # No items left; see whether another listing page can
                # supply more.
                self.currentSoup = None
                if self.fromUrlsIndex >= len(self.fromUrls):
                    return None
                currentFromUrl = self.fromUrls[self.fromUrlsIndex]
                self.fromUrlsIndex += 1
                try:
                    self.currentSoupList = self.getItemsFromUrl(currentFromUrl)
                except Exception as err:
                    msg = '节目包含页面访问失败,地址: ' + currentFromUrl
                    self.logger.error(msg)
                    # Unless failures are to be skipped, rewind so that the
                    # next call attempts this same listing page again.
                    if not skipfail:
                        self.fromUrlsIndex -= 1
                    raise Exception(msg) from err
                # The retry is deliberately outside the try above: a
                # failure while fetching the item page must not be logged
                # as a listing-page failure nor rewind fromUrlsIndex.
                return self.getSomeArticlesPageSoup(index=index,
                                                    skipfail=skipfail)

            # A new item was selected, so reset the per-item state.
            self.currentItemInit()
        else:
            itemUrl = Utils.listenHost + self.currentSoup["href"]

        # Resolve the page to fetch; 0 means "the next page in sequence".
        index = max(0, int(index))
        index = self.currentPageIndex if index == 0 else index

        itemFullUrl = itemUrl + 'page' + str(index) + '/'

        try:
            articlesContent = requests.get(itemFullUrl, headers=Utils.headers)
            resSoup = BeautifulSoup(articlesContent.text, "lxml")
        except Exception as err:
            msg = '获取某节目某页失败: ' + itemFullUrl
            self.logger.error(msg)
            raise Exception(msg) from err

        # Work deferred until the first page of this item has been fetched
        # (whichever page that happens to be).
        # 1. Store the item info. Per the original author's note this is
        #    available even when index is out of range.
        if not self.hasBeenSaved:
            try:
                self.currentItemInfo = self.getListenItemInfo(
                    resSoup, itemFullUrl)
            except Exception as err:
                msg = '节目信息存储失败: ' + itemFullUrl
                self.logger.error(msg)
                raise Exception(msg) from err

            self.hasBeenSaved = True

        # 2. Read the total page count (0 means "not known yet").
        if self.currentTotalPageCounts == 0:
            self.currentTotalPageCounts = Utils.getPageCount(resSoup)
            # index beyond the last page: treat the request as invalid and
            # restore the "unknown" marker so a later call re-reads it.
            if index > self.currentTotalPageCounts:
                self.currentTotalPageCounts = 0
                return None

        # The next call continues with the page after the one just fetched,
        # whether or not index was given explicitly.
        self.currentPageIndex = index + 1
        if self.currentPageIndex > self.currentTotalPageCounts:
            # Last page consumed: the next call moves on to a new item.
            self.currentSoup = None

        # Item finished and the number of items has reached the limit:
        # set the over-limit flag.
        if self.currentSoup is None and self.getItemsSize() == self.limit:
            self.isOverLimited = True

        return (itemFullUrl, resSoup)