Ejemplo n.º 1
0
class Image(GeneralTool):
    '''
    Image crawler: list pages -> details pages -> image files -> database.

    producer() puts details page responses on the task queue, consumer()
    takes them off, saves every image found on the page and records it in
    the "image" table.
    '''

    def __init__(self, queue):
        '''
        :param queue: task queue shared by producer() and consumer()
        '''
        self._database = DBUtil()
        self._taskQueue = queue
        # True while producer() may still put responses on the queue
        self._queueProducerStatus = True
        # number of details pages handled so far / number put on the queue
        self.completed = 0
        self.totalDownload = 0

    def generatorListPage(self):
        '''
        request every list page that has to be tracked
        :return: result of AsyncRequests.map for the list page requests
        '''
        # use 100 pages when the setting is not an int, cap it at 100 otherwise
        trackingPageCount = TRACKING_PAGE_COUNT
        if not isinstance(trackingPageCount, int) or trackingPageCount > 100:
            trackingPageCount = 100

        # cookies are only sent when they are configured
        requestKwargs = {"headers": self.headers}
        if COOKIES:
            requestKwargs["cookies"] = COOKIES

        requestsList = []
        for trackingNumber in range(1, trackingPageCount + 1):
            url = "https://www.t66y.com/thread0806.php?fid=16&search=&page={0}".format(str(trackingNumber))
            requestsList.append(AsyncRequests.get(url, **requestKwargs))
        return AsyncRequests.map(requestsList, size=DOWNLOAD_CONCURRENT_COUNT)

    def getDetailsPageUrl(self, responseList):
        '''
        collect the details page urls from the list page responses,
        request them and put every details page response on the task queue
        :param responseList: list page responses
        :return:
        '''
        if not responseList:
            logger.error("get response list is empty in details page function")
            # nothing to parse, and None must not be iterated below
            return
        detailsPageUrlList = []
        try:
            for response in responseList:
                if response is None or response.status_code != 200:
                    continue
                response.encoding = "gbk"
                soup = bs(response.text, "lxml")
                for tr in soup.find_all("tr"):
                    tdList = tr.find_all("td")
                    # only rows with exactly five cells are handled; the link is read from the second cell
                    if len(tdList) != 5:
                        continue
                    urlTagList = tdList[1].select('a[href]')
                    if not urlTagList:
                        continue
                    url = urlTagList[0]["href"]
                    if url not in self.excludeUrlList and not self.urlInExclued(url):
                        detailsPageUrlList.append(self.domains + url.strip())
            # url to response
            logger.info("get details page response ...")
            detailsPageResponseList = self.urlToResponse(detailsPageUrlList)
            self.totalDownload = len(detailsPageResponseList)
            logger.info("present craw count:{0}".format(str(len(detailsPageResponseList))))
            # put to queue
            for detailsPageResponse in detailsPageResponseList:
                self._taskQueue.put(detailsPageResponse)
        except Exception:
            # Exception instead of BaseException: Ctrl-C / SystemExit must not be swallowed
            logger.error(traceback.format_exc())

    def computeDetailsImageCount(self, detailsResponse):
        '''
        count the <input> tags of a details page that carry a data-src attribute
        :param detailsResponse: details page response
        :return: image count, 0 when the response is unusable or parsing fails
        '''
        try:
            if detailsResponse is None or detailsResponse.status_code != 200:
                logger.error("detalisResponse is None or status code not 200 in the computeDetailsImageCount function")
                return 0
            detailsResponse.encoding = "gbk"
            soup = bs(detailsResponse.text, "lxml")
            return sum(1 for inputTag in soup.find_all("input") if inputTag.has_attr("data-src"))
        except Exception:
            logger.error(traceback.format_exc())
            return 0

    def getImageDownloadUrlGenerator(self, detailsResponse):
        '''
        generator: yield one ImageResultClass per image of a details page.
        images already present in the database and urls rejected by
        isImageDownloadPageUrl are skipped; the generator ends (after logging)
        when an error occurs
        :param detailsResponse: details page response
        :return: generator of ImageResultClass
        '''
        try:
            if detailsResponse is None or detailsResponse.status_code != 200:
                return
            detailsResponse.encoding = "gbk"
            soup = bs(detailsResponse.text, "lxml")
            title = soup.head.title.text
            detailsPageUrl = str(detailsResponse.url)
            imageDownloadUrlList = [
                inputTag["data-src"]
                for inputTag in soup.find_all("input")
                if inputTag.has_attr("data-src")
            ]
            # same number computeDetailsImageCount returns for this page,
            # taken from the tags parsed above instead of re-parsing the page per image
            detailsPageImageCount = str(len(imageDownloadUrlList))
            for imageDownloadUrl in imageDownloadUrlList:
                if not imageDownloadUrl:
                    continue
                if self.existInDatabase("image", {"imageDownloadUrl": imageDownloadUrl}):
                    continue
                if not self.isImageDownloadPageUrl(imageDownloadUrl):
                    continue
                imageResultClass = ImageResultClass()
                imageResultClass.setDetailsPageUrl(detailsPageUrl)
                imageResultClass.setDetailsPageImageCount(detailsPageImageCount)
                imageResultClass.setImageDownloadUrl(imageDownloadUrl)
                imageResultClass.setResponse(self.urlToResponse([imageDownloadUrl])[0])
                imageResultClass.setTitle(self.clearTitle(title))
                imageResultClass.setCrawDate(self.formatDate())
                yield imageResultClass
        except Exception:
            # GeneratorExit is deliberately not caught, closing the generator is not an error
            logger.error(traceback.format_exc())
            return

    def saveImageGenerator(self, imageResultClassGenerator):
        '''
        generator: write every downloaded image to disk and yield its
        ImageResultClass with md5 and save path filled in.
        images without a usable (status 200) response are skipped
        :param imageResultClassGenerator: iterable of ImageResultClass
        :return: generator of ImageResultClass
        '''
        if imageResultClassGenerator is None:
            logger.error("imageResultClass generator is None")
            return
        for imageResultClass in imageResultClassGenerator:
            try:
                if isinstance(imageResultClass, str):
                    continue
                response = imageResultClass.getResponse()
                if response is None or response.status_code != 200:
                    continue
                imageByte = response.content
                imageResultClass.setMd5(self.computeMD5ByFile(imageByte))
                imageName = os.path.split(imageResultClass.getImageDownloadUrl())[1]
                # SAVE_PATH/image/<year>/<month>/<day>/<title>/<image name>
                imageSavePath = self.checkDirExist(os.path.join(SAVE_PATH,
                                                                "image",
                                                                self.year(),
                                                                self.month(),
                                                                self.day(),
                                                                imageResultClass.getTitle(),
                                                                imageName))
                imageResultClass.setSavePath(imageSavePath)
                with open(imageSavePath, "wb+") as fo:
                    fo.write(imageByte)
            except Exception:
                # one broken image must not abort the remaining images of the page
                logger.error(traceback.format_exc())
                continue
            yield imageResultClass

    def imageResultClassPreprocessing(self, imageResultClass):
        '''
        turn an ImageResultClass into the dict that is written to the database
        :param imageResultClass:
        :return: dict, or None when the input is None or a getter fails
        '''
        if imageResultClass is None:
            logger.error("imageResultClass is None in the imageResultClassPreprocessign function")
            return None
        try:
            return {
                "id": None,
                "title": imageResultClass.getTitle(),
                "detailsPageUrl": imageResultClass.getDetailsPageUrl(),
                "detailsPageImageCount": imageResultClass.getDetailsPageImageCount(),
                "imageDownloadUrl": imageResultClass.getImageDownloadUrl(),
                "savePath": imageResultClass.getSavePath(),
                "crawData": imageResultClass.getCrawDate(),
                "md5": imageResultClass.getMd5(),
            }
        except Exception:
            logger.error(traceback.format_exc())
            return None

    def writeToDatabase(self, imageResultDict):
        '''
        insert the image record unless a record with the same md5 already exists
        :param imageResultDict: dict built by imageResultClassPreprocessing
        :return:
        '''
        if imageResultDict is None:
            logger.error("imageResultDict is None")
            return None
        try:
            alreadyExist = self._database.queryIsExist("image", {"md5": imageResultDict["md5"]})
            if alreadyExist:
                return None
            progressBar = self.computeProgressBar(self.completed, self.totalDownload)
            logger.info("Image completed: {progressBar: <10}".format(progressBar=progressBar) +
                        "Title:{title}".format(title=imageResultDict['title']),
                        level="ALL")
            self._database.insert("image", imageResultDict)
        except Exception:
            logger.error(traceback.format_exc())
            logger.error("An error occurred in the function ---> wirteToDataBase")
        return None

    def producer(self):
        '''
        produce details page responses: request the list pages and queue
        every details page response
        :return:
        '''
        self._queueProducerStatus = True
        self.getDetailsPageUrl(self.generatorListPage())
        self._queueProducerStatus = False

    def consumer(self):
        '''
        consume details page responses: take a response from the queue,
        save its images to the local dir, then write every saved image to
        the database. runs until the producer has finished and the queue is empty
        :return:
        '''
        from queue import Empty

        while not self._taskQueue.empty() or self._queueProducerStatus:
            try:
                # a blocking get() would hang forever when the queue is empty at the
                # moment the producer finishes, so wait briefly and re-check the loop condition
                detailsResponse = self._taskQueue.get(timeout=1)
            except Empty:
                continue
            if detailsResponse is None:
                # failed responses are part of totalDownload, count them as handled
                self.completed += 1
                continue
            imageDownloadUrlGenerator = self.getImageDownloadUrlGenerator(detailsResponse)
            for imageResultClass in self.saveImageGenerator(imageDownloadUrlGenerator):
                if imageResultClass is None:
                    continue
                imageResultDict = self.imageResultClassPreprocessing(imageResultClass)
                if imageResultDict is None:
                    continue
                self.writeToDatabase(imageResultDict)
            self.completed += 1
            progressBar = self.computeProgressBar(self.completed, self.totalDownload)
            logger.info("Image completed:{0}".format(progressBar))
Ejemplo n.º 2
0
class Torrent(GeneralTool):
    '''
    Torrent crawler: list pages -> details pages -> download pages ->
    torrent files -> database.

    producer() puts details page responses on the task queue, consumer()
    takes them off, downloads the torrent file and records it in the
    "torrent" table.
    '''

    def __init__(self, queue):
        '''
        :param queue: task queue shared by producer() and consumer()
        '''
        self._database = DBUtil()
        self._taskQueue = queue
        # True while producer() may still put responses on the queue
        self._queueProducerStatus = True
        # number of details pages handled so far / number put on the queue
        self.completed = 0
        self.totalDownload = 0

    def generatorListPage(self):
        '''
        request every list page of every configured category
        :return: result of AsyncRequests.map for the list page requests
        '''
        # use 100 pages when the setting is not an int, cap it at 100 otherwise
        trackingPageCount = TRACKING_PAGE_COUNT
        if not isinstance(trackingPageCount, int) or trackingPageCount > 100:
            trackingPageCount = 100

        # cookies are only sent when they are configured
        requestKwargs = {"headers": self.headers}
        if COOKIES:
            requestKwargs["cookies"] = COOKIES

        requestsList = []
        for categoryName in BT_DOWNLOAD_CATEGORY:
            categoryNumber = Category.categoryNameToNumberDict[categoryName]
            for trackingNumber in range(1, trackingPageCount + 1):
                url = "https://www.t66y.com/thread0806.php?fid={0}&search=&page={1}".format(str(categoryNumber),
                                                                                            str(trackingNumber))
                requestsList.append(AsyncRequests.get(url, **requestKwargs))
        return AsyncRequests.map(requestsList, size=DOWNLOAD_CONCURRENT_COUNT)

    def getDetailsPageUrl(self, responseList):
        '''
        collect the details page urls from the list page responses,
        request them and put every details page response on the task queue
        :param responseList: list page responses
        :return:
        '''
        if not responseList:
            logger.error("get response list is empty in details page function")
            # nothing to parse, and None must not be iterated below
            return
        detailsPageUrlList = []
        try:
            for response in responseList:
                if response is None or response.status_code != 200:
                    continue
                response.encoding = "gbk"
                soup = bs(response.text, "lxml")
                for tr in soup.find_all("tr"):
                    tdList = tr.find_all("td")
                    # only rows with exactly five cells are handled; the link is read from the second cell
                    if len(tdList) != 5:
                        continue
                    urlTagList = tdList[1].select('a[href]')
                    if not urlTagList:
                        continue
                    url = urlTagList[0]["href"]
                    if url not in self.excludeUrlList:
                        detailsPageUrlList.append(self.domains + url.strip())
            # url to response
            logger.info("get details page response ...")
            detailsPageResponseList = self.urlToResponse(detailsPageUrlList)
            logger.info("present craw count:{0}".format(str(len(detailsPageResponseList))))
            self.totalDownload = len(detailsPageResponseList)
            # put to queue
            for detailsPageResponse in detailsPageResponseList:
                self._taskQueue.put(detailsPageResponse)
        except Exception:
            # Exception instead of BaseException: Ctrl-C / SystemExit must not be swallowed
            logger.error(traceback.format_exc())

    def getDownloadPageUrl(self, detailsResponse):
        '''
        find the download page link on a details page and request it.
        the link is taken from the text of the <a> tags; when several tags
        match, the values of the last match win
        :param detailsResponse: details page response
        :return: TorrentResultClass (left unfilled when the response is
                 unusable or no link matches), None on error
        '''
        try:
            torrentResultClass = TorrentResultClass()
            if detailsResponse is None or detailsResponse.status_code != 200:
                return torrentResultClass
            detailsResponse.encoding = "gbk"
            soup = bs(detailsResponse.text, "lxml")
            title = soup.head.title.text
            for a in soup.find_all("a"):
                downloadPageUrl = a.string
                if not downloadPageUrl or not self.isDownloadPageUrl(downloadPageUrl):
                    continue
                torrentResultClass.setCategory(self.getCategoryFromDatailsPageUrl(str(detailsResponse.url)))
                torrentResultClass.setCrawData(self.formatDate())
                torrentResultClass.setDetailsPageUrl(str(detailsResponse.url))
                torrentResultClass.setResponse(self.urlToResponse([downloadPageUrl.strip()])[0])
                torrentResultClass.setTitle(self.clearTitle(title))
            return torrentResultClass
        except Exception:
            logger.error(traceback.format_exc())
            return None

    def getTorrentDownloadUrl(self, torrentResultClass):
        '''
        extract the torrent download url from the download page response
        stored in torrentResultClass
        :param torrentResultClass:
        :return: the same TorrentResultClass with download page url and
                 torrent download url set, None when no usable response exists
        '''
        if torrentResultClass is None:
            logger.error("get download page url failed, because torrentResultClass is None")
            return None
        try:
            response = torrentResultClass.getResponse()
            # the stored response is sometimes a string (reason unknown), treat it as a failure
            if isinstance(response, str):
                return None
            if response is None or response.status_code != 200:
                return None
            response.encoding = "utf-8"
            downloadUrl = self.torrentDownloadUrl(response.text)
            torrentResultClass.setDownloadPageUrl(str(response.url))
            torrentResultClass.setTorrentDownloadUrl(downloadUrl)
            return torrentResultClass
        except Exception:
            logger.error(traceback.format_exc())
            return None

    def downloadTorrentFile(self, torrentResultClass):
        '''
        download the torrent file and save it below
        SAVE_PATH/torrent/<category>/<year>/<month>/<day>/
        :param torrentResultClass:
        :return: the same TorrentResultClass with save path, md5, magnet and
                 downloaded flag set, None when anything is missing or fails
        '''
        if torrentResultClass is None:
            logger.error("torrentResultClass is None in the downloadTorrentFile function")
            return None
        torrentName = self.filterTorrentName(self.clearTitle(torrentResultClass.getTitle()))
        torrentDownloadUrl = torrentResultClass.getTorrentDownloadUrl()
        detailsPageUrl = torrentResultClass.getDetailsPageUrl()
        if not torrentName:
            logger.error("get torrent name failed")
            return None
        if not torrentDownloadUrl:
            logger.error("get torrent download url failed")
            return None
        if not detailsPageUrl:
            logger.error("get details page url failed")
            return None
        try:
            categoryName = self.getCategoryFromDatailsPageUrl(detailsPageUrl)
            torrentResponse = requests.get(torrentDownloadUrl, headers=self.headers)
            if torrentResponse.status_code != 200:
                # do not save an error page as a .torrent file
                logger.error("download torrent file failed, status code: {0}".format(torrentResponse.status_code))
                return None
            torrentContent = torrentResponse.content
            torrentMd5 = self.computeMD5ByFile(torrentContent)
            torrentPath = self.checkDirExist(
                os.path.join(SAVE_PATH,
                             "torrent",
                             categoryName,
                             self.year(),
                             self.month(),
                             self.day(),
                             torrentName + ".torrent"))
            with open(torrentPath, "wb+") as torrentFile:
                torrentFile.write(torrentContent)
            torrentResultClass.setSavePath(torrentPath)
            torrentResultClass.setCrawData(self.formatDate())
            torrentResultClass.setMd5(torrentMd5)
            torrentResultClass.setMagnet(self.torrentToMagnet(torrentContent))
            torrentResultClass.setDownloaded(0)
            return torrentResultClass
        except Exception:
            logger.error(traceback.format_exc())
            return None

    def torrentResultClassPreprocessing(self, torrentResultClass):
        '''
        turn a TorrentResultClass into the dict that is written to the database
        :param torrentResultClass:
        :return: dict, or None when the input is None or a getter fails
        '''
        if torrentResultClass is None:
            logger.error("torrentResultClass is None in the torrentResultClassPreprocessign function")
            return None
        try:
            return {
                "id": None,
                "category": torrentResultClass.getCategory(),
                "title": torrentResultClass.getTitle(),
                "detailsPageUrl": torrentResultClass.getDetailsPageUrl(),
                "downloadPageUrl": torrentResultClass.getDownloadPageUrl(),
                "torrentDownloadUrl": torrentResultClass.getTorrentDownloadUrl(),
                "savePath": torrentResultClass.getSavePath(),
                "crawData": torrentResultClass.getCrawData(),
                "md5": torrentResultClass.getMd5(),
                "magnet": torrentResultClass.getMagnet(),
                "downloaded": torrentResultClass.getDownloaded(),
            }
        except Exception:
            logger.error(traceback.format_exc())
            return None

    def writeToDatabase(self, torrentResultDict):
        '''
        insert the torrent record unless a record with the same md5 already
        exists; the progress is logged in both cases
        :param torrentResultDict: dict built by torrentResultClassPreprocessing
        :return:
        '''
        if torrentResultDict is None:
            logger.error("torrentResultDict is None")
            return None
        try:
            alreadyExist = self._database.queryIsExist("torrent", {"md5": torrentResultDict["md5"]})
            progressBar = self.computeProgressBar(self.completed, self.totalDownload)
            if alreadyExist:
                logger.info("Torrent completed:{progressBar: <5} torrent already exist database.".format(
                    progressBar=progressBar))
                return None
            logger.info("Torrent completed: {progressBar: <10}".format(progressBar=progressBar) +
                        "category: {category: <20}".format(category=torrentResultDict['category']) +
                        "Title:{title}".format(title=torrentResultDict['title']),
                        level="ALL")
            self._database.insert("torrent", torrentResultDict)
        except Exception:
            logger.error(traceback.format_exc())
            logger.error("An error occurred in the function ---> wirteToDataBase")
        return None

    def producer(self):
        '''
        produce details page responses: request the list pages and queue
        every details page response
        :return:
        '''
        self._queueProducerStatus = True
        self.getDetailsPageUrl(self.generatorListPage())
        self._queueProducerStatus = False

    def _detailsResponseToResultDict(self, detailsResponse):
        '''
        run one details page response through all stages
        (download page -> torrent url -> torrent file -> dict)
        :param detailsResponse: details page response, may be None
        :return: torrentResultDict, None as soon as one stage fails
        '''
        if detailsResponse is None:
            return None
        torrentResultClass = self.getDownloadPageUrl(detailsResponse)
        if torrentResultClass is None:
            return None
        torrentResultClass = self.getTorrentDownloadUrl(torrentResultClass)
        if torrentResultClass is None:
            return None
        torrentResultClass = self.downloadTorrentFile(torrentResultClass)
        if torrentResultClass is None:
            return None
        return self.torrentResultClassPreprocessing(torrentResultClass)

    def consumer(self):
        '''
        consume details page responses: take a response from the queue,
        download its torrent file and write the result to the database.
        every response counts as completed, whether it succeeded or not.
        runs until the producer has finished and the queue is empty
        :return:
        '''
        from queue import Empty

        while not self._taskQueue.empty() or self._queueProducerStatus:
            try:
                # a blocking get() would hang forever when the queue is empty at the
                # moment the producer finishes, so wait briefly and re-check the loop condition
                detailsResponse = self._taskQueue.get(timeout=1)
            except Empty:
                continue
            torrentResultDict = self._detailsResponseToResultDict(detailsResponse)
            # counted before writing so the progress logged by writeToDatabase includes this item
            self.completed += 1
            if torrentResultDict is not None:
                self.writeToDatabase(torrentResultDict)