def consumer(self):
    '''
    Worker loop that consumes details page responses from the task queue.

    For each response taken from self._taskQueue the image download urls are
    resolved, the images are saved, every saved result is converted to a dict
    and written to the database. After a response has been processed the
    completed counter is increased and the progress is logged.

    The loop runs while the queue still holds items or
    self._queueProducerStatus is truthy (presumably cleared by the producer
    when it is done -- confirm against the producer).

    :return: None
    '''
    # Local import: the file's import block is not visible from here, and the
    # module holding Empty is named differently on Python 2.
    try:
        from queue import Empty
    except ImportError:
        from Queue import Empty

    while not self._taskQueue.empty() or self._queueProducerStatus:
        # A plain get() blocks forever when another consumer takes the last
        # item (or the producer finishes) between the loop check and the
        # call. Waiting with a timeout lets the loop condition be re-checked
        # so the thread can terminate.
        try:
            detailsResponse = self._taskQueue.get(timeout=1)
        except Empty:
            continue
        if detailsResponse is None:
            continue

        # resolve the image download urls for this details page
        imageDownloadUrlGenerator = self.getImageDownloadUrlGenerator(detailsResponse)
        if imageDownloadUrlGenerator is None:
            continue

        # save every image, then persist each usable result
        for imageResultClass in self.saveImageGenerator(imageDownloadUrlGenerator):
            if imageResultClass is None:
                continue
            imageResultDict = self.imageResultClassPreprocessing(imageResultClass)
            if imageResultDict is None:
                continue
            self.writeToDatabase(imageResultDict)

        # one details page finished
        self.completed += 1
        progressBar = self.computeProgressBar(self.completed, self.totalDownload)
        logger.info("Image completed:{0}".format(progressBar))
def getDetailsPageUrl(self, responseList):
    '''
    Collect details page urls from list page responses, fetch them and queue the results.

    Every response with status code 200 is decoded as gbk and parsed. For each
    table row with exactly five cells, the first link found in the second cell
    is taken. Links that are in self.excludeUrlList or rejected by
    self.urlInExclued are skipped; the remaining ones are prefixed with
    self.domains. The collected urls are passed to self.urlToResponse, the
    number of results is stored in self.totalDownload and every result is put
    on self._taskQueue.

    Ordinary exceptions are logged with their traceback and not re-raised.

    :param responseList: list page responses; None entries are skipped. An
        empty value is logged as an error and processing still continues.
    :return: None
    '''
    if not responseList:
        logger.error("get response list is empty in details page function")

    detailsPageUrlList = []
    # Exception instead of BaseException, so KeyboardInterrupt and SystemExit
    # are no longer swallowed here.
    try:
        # get details page url from list page response
        for response in responseList:
            if response is None or response.status_code != 200:
                continue
            response.encoding = "gbk"
            soup = bs(response.text, "lxml")
            for tr in soup.find_all("tr"):
                tdList = tr.find_all("td")
                if len(tdList) != 5:
                    continue
                urlTagList = tdList[1].select('a[href]')
                if not urlTagList:
                    continue
                # Strip before the exclusion checks so the value that is
                # checked is the same value that gets queued.
                url = urlTagList[0]["href"].strip()
                if url in self.excludeUrlList or self.urlInExclued(url):
                    continue
                detailsPageUrlList.append(self.domains + url)

        # url to response
        logger.info("get details page response ...")
        detailsPageResponseList = self.urlToResponse(detailsPageUrlList)
        self.totalDownload = len(detailsPageResponseList)
        logger.info("present craw count:{0}".format(str(len(detailsPageResponseList))))

        # put to queue
        for detailsPageResponse in detailsPageResponseList:
            self._taskQueue.put(detailsPageResponse)
    except Exception:
        logger.error(traceback.format_exc())
def writeToDatabase(self, torrentResultDict):
    '''
    Insert a torrent record into the "torrent" table unless its md5 is already stored.

    The current progress is computed from self.completed and
    self.totalDownload and logged in both cases. Ordinary exceptions are
    logged with their traceback and not re-raised.

    :param torrentResultDict: record to store; its "md5", "category" and
        "title" entries are read here. None is rejected with an error log.
    :return: always None
    '''
    if torrentResultDict is None:
        logger.error("torrentResultDict is None")
        return None

    # Exception instead of BaseException, so KeyboardInterrupt and SystemExit
    # are no longer swallowed here.
    try:
        alreadyStored = self._database.queryIsExist("torrent", {"md5": torrentResultDict["md5"]})
        progressBar = self.computeProgressBar(self.completed, self.totalDownload)
        if alreadyStored:
            logger.info("Torrent completed:{progressBar: <5} torrent already exist database.".format(
                progressBar=progressBar))
        else:
            logger.info(
                "Torrent completed: {progressBar: <10}"
                "category: {category: <20}"
                "Title:{title}".format(
                    progressBar=progressBar,
                    category=torrentResultDict['category'],
                    title=torrentResultDict['title']),
                level="ALL")
            self._database.insert("torrent", torrentResultDict)
    except Exception:
        logger.error(traceback.format_exc())
        logger.error("An error occurred in the function ---> wirteToDataBase")
    return None
def addToAria2(self, torentResultDict):
    '''
    Hand one torrent record over to aria2 and count it as completed.

    :param torentResultDict: record whose "magnet" and "title" entries are
        read; it is also passed to self.getSavePath to obtain the save path.
    :return: None
    '''
    magnetLink = torentResultDict["magnet"]
    targetPath = self.getSavePath(torentResultDict)
    title = torentResultDict["title"]

    logger.info("add to aria2, torrent file: {0}".format(title))
    self._pyaria2.addUrls(magnetLink, targetPath)
    self._completed = self._completed + 1
def downloadImage(cls):
    '''
    Start the image download: one producer thread plus
    DOWNLOAD_CONCURRENT_COUNT * 2 consumer threads that share one Image
    instance and a task queue bounded to 128 entries.

    :return: None
    '''
    logger.info("start download image")
    spider = Image(Queue(maxsize=128))

    producerThread = threading.Thread(name="producer", target=spider.producer)
    producerThread.start()

    consumerCount = DOWNLOAD_CONCURRENT_COUNT * 2
    for index in range(consumerCount):
        worker = threading.Thread(name="ThreadID:{0}".format(index), target=spider.consumer)
        worker.start()
def downloadTorrent(cls):
    '''
    Start the torrent download: one producer thread plus
    DOWNLOAD_CONCURRENT_COUNT consumer threads that share one Torrent
    instance and a task queue bounded to 128 entries.

    :return: None
    '''
    logger.info("start download torrent")
    spider = Torrent(Queue(maxsize=128))

    producerThread = threading.Thread(name="producer", target=spider.producer)
    producerThread.start()

    for index in range(DOWNLOAD_CONCURRENT_COUNT):
        worker = threading.Thread(name="ThreadID:{0}".format(index), target=spider.consumer)
        worker.start()
def torrentResultDictGenerator(self):
    '''
    Yield every record of the "torrent" table whose "downloaded" field is 0.

    On first iteration the records are queried, their count is stored in
    self._totalDownload and logged, then the records are yielded one by one.

    :return: generator over the queried records
    '''
    pendingRecords = self._database.query("torrent", {"downloaded": 0})
    pendingCount = len(pendingRecords)
    self._totalDownload = pendingCount
    logger.info("not downloaded torrent count {0}".format(pendingCount))
    for record in pendingRecords:
        yield record
def writeToDatabase(self, imageResultDict):
    '''
    Insert an image record into the "image" table unless its md5 is already stored.

    When the record is new, the current progress (computed from
    self.completed and self.totalDownload) and the title are logged before
    the insert. Nothing is logged or written for an existing record.
    Ordinary exceptions are logged with their traceback and not re-raised.

    :param imageResultDict: record to store; its "md5" and "title" entries
        are read here. None is rejected with an error log.
    :return: always None
    '''
    if imageResultDict is None:
        logger.error("imageResultDict is None")
        return None

    # Exception instead of BaseException, so KeyboardInterrupt and SystemExit
    # are no longer swallowed here.
    try:
        alreadyStored = self._database.queryIsExist("image", {"md5": imageResultDict["md5"]})
        progressBar = self.computeProgressBar(self.completed, self.totalDownload)
        if not alreadyStored:
            logger.info(
                "Image completed: {progressBar: <10}"
                "Title:{title}".format(
                    progressBar=progressBar,
                    title=imageResultDict['title']),
                level="ALL")
            self._database.insert("image", imageResultDict)
    except Exception:
        logger.error(traceback.format_exc())
        logger.error("An error occurred in the function ---> wirteToDataBase")
    return None
def downloadVideo(cls):
    '''
    Start the video download by running the downloadScheduler of a new Video
    instance in a thread named "downloadVideo".

    :return: None
    '''
    logger.info("start download video")
    videoSpider = Video()
    schedulerThread = threading.Thread(name="downloadVideo", target=videoSpider.downloadScheduler)
    schedulerThread.start()