Example #1
0
 def consumer(self):
     '''
     used to consume details response
     get a details page response by queue, then get image download url by details page response,
     then save image to local dir, then write to database

     the loop runs while the task queue still holds items or the producer
     status flag is truthy. each processed response increments self.completed
     and logs the progress bar.
     :return: None
     '''
     # the Empty exception lives in ``queue`` on Python 3 and ``Queue`` on Python 2
     try:
         from queue import Empty
     except ImportError:
         from Queue import Empty

     while not self._taskQueue.empty() or self._queueProducerStatus:
         # get detailsResponse
         # poll with a timeout instead of blocking forever: another consumer may
         # take the last item between the empty() check and get(), or the producer
         # may finish while we wait; on timeout the loop condition is re-checked
         try:
             detailsResponse = self._taskQueue.get(timeout=1)
         except Empty:
             continue
         if detailsResponse is None:
             continue
         # get image download url result is a imageResultClass generator
         imageDownloadUrlGenerator = self.getImageDownloadUrlGenerator(detailsResponse)
         if imageDownloadUrlGenerator is None:
             continue
         # save image to dir result is a imageResultClass
         imageResultClassGenerator = self.saveImageGenerator(imageDownloadUrlGenerator)
         for imageResultClass in imageResultClassGenerator:
             if imageResultClass is None:
                 continue
             # imageResultClass to imageResultDict
             imageResultDict = self.imageResultClassPreprocessing(imageResultClass)
             if imageResultDict is None:
                 continue
             # imageResultDict write to database
             self.writeToDatabase(imageResultDict)
         self.completed += 1
         progressBar = self.computeProgressBar(self.completed, self.totalDownload)
         logger.info("Image completed:{0}".format(progressBar))
Example #2
0
 def getDetailsPageUrl(self, responseList):
     '''
     get details page url by list page
     input url to queue

     every usable list page response (not None, status code 200) is decoded as
     "gbk" and parsed; table rows with exactly five <td> cells contribute the
     first link found in their second cell. links that are not excluded are
     prefixed with self.domains, fetched through self.urlToResponse, counted
     into self.totalDownload and put on the task queue.
     any error is logged and swallowed.
     :param responseList: list page responses; None or empty logs an error
     :return: None
     '''
     if not responseList:
         logger.error("get response list is empty in details page function")
         # treat a missing list like an empty one so the rest of the function
         # runs as a no-op instead of raising TypeError on iteration
         responseList = []
     detailsPageUrlList = []
     # get details page url from list page response
     try:
         for response in responseList:
             if response is None or response.status_code != 200:
                 continue
             response.encoding = "gbk"
             soup = bs(response.text, "lxml")
             for tr in soup.find_all("tr"):
                 tdList = tr.find_all("td")
                 if len(tdList) != 5:
                     continue
                 urlTagList = tdList[1].select('a[href]')
                 if not urlTagList:
                     continue
                 url = urlTagList[0]["href"]
                 if url not in self.excludeUrlList and not self.urlInExclued(url):
                     detailsPageUrlList.append(self.domains + url.strip())
         # url to response
         logger.info("get details page response ...")
         detailsPageResponseList = self.urlToResponse(detailsPageUrlList)
         self.totalDownload = len(detailsPageResponseList)
         logger.info("present craw count:{0}".format(str(len(detailsPageResponseList))))
         # put to queue
         for detailsPageResponse in detailsPageResponseList:
             self._taskQueue.put(detailsPageResponse)
     except Exception:
         # Exception rather than BaseException, so KeyboardInterrupt and
         # SystemExit are not swallowed by this best-effort handler
         logger.error(traceback.format_exc())
Example #3
0
 def writeToDatabase(self, torrentResultDict):
     '''
     information write to database

     checks "torrent" for an entry with the same md5; when none exists the
     progress line is logged and the dict is inserted, otherwise only an
     "already exist" line is logged. any error is logged and swallowed.
     :param torrentResultDict: dict for one torrent; the keys "md5",
         "category" and "title" are read here
     :return: None
     '''
     if torrentResultDict is None:
         logger.error("torrentResultDict is None")
         return None
     try:
         result = self._database.queryIsExist("torrent", {"md5": torrentResultDict["md5"]})
         progressBar = self.computeProgressBar(self.completed, self.totalDownload)
         if not result:
             logger.info("Torrent completed: {progressBar: <10}".format(progressBar=progressBar) +
                         "category: {category: <20}".format(category=torrentResultDict['category']) +
                         "Title:{title}".format(title=torrentResultDict['title']),
                         level="ALL")
             self._database.insert("torrent", torrentResultDict)
         else:
             logger.info("Torrent completed:{progressBar: <5} torrent already exist database.".format(
                 progressBar=progressBar))
     except Exception:
         # Exception rather than BaseException, so KeyboardInterrupt and
         # SystemExit are not swallowed by this best-effort handler
         logger.error(traceback.format_exc())
         logger.error("An error occurred in the function ---> wirteToDataBase")
         return None
Example #4
0
 def addToAria2(self, torentResultDict):
     '''
     hand one torrent over to aria2

     reads the "magnet" and "title" entries of the dict, resolves the save
     path through self.getSavePath, logs the title, passes magnet and save
     path to self._pyaria2.addUrls, then bumps self._completed by one.
     :param torentResultDict: dict for one torrent; "magnet" and "title" are read
     :return: None
     '''
     magnetLink = torentResultDict["magnet"]
     targetPath = self.getSavePath(torentResultDict)
     title = torentResultDict["title"]
     message = "add to aria2, torrent file: {0}".format(title)
     logger.info(message)
     self._pyaria2.addUrls(magnetLink, targetPath)
     self._completed += 1
Example #5
0
 def downloadImage(cls):
     '''
     start the image download threads

     builds an Image spider around a queue bounded to 128 items, starts one
     thread named "producer" running its producer, then starts
     DOWNLOAD_CONCURRENT_COUNT * 2 threads running its consumer.
     the threads are started but not joined here.
     :return: None
     '''
     logger.info("start download image")
     spider = Image(Queue(maxsize=128))
     producerThread = threading.Thread(target=spider.producer, name="producer")
     producerThread.start()
     consumerCount = DOWNLOAD_CONCURRENT_COUNT * 2
     for index in range(consumerCount):
         worker = threading.Thread(target=spider.consumer,
                                   name="ThreadID:{0}".format(str(index)))
         worker.start()
Example #6
0
 def downloadTorrent(cls):
     '''
     start the torrent download threads

     builds a Torrent spider around a queue bounded to 128 items, starts one
     thread named "producer" running its producer, then starts
     DOWNLOAD_CONCURRENT_COUNT threads running its consumer.
     the threads are started but not joined here.
     :return: None
     '''
     logger.info("start download torrent")
     spider = Torrent(Queue(maxsize=128))
     producerThread = threading.Thread(target=spider.producer, name="producer")
     producerThread.start()
     consumerCount = DOWNLOAD_CONCURRENT_COUNT
     for index in range(consumerCount):
         worker = threading.Thread(target=spider.consumer,
                                   name="ThreadID:{0}".format(str(index)))
         worker.start()
Example #7
0
 def torrentResultDictGenerator(self):
     '''
     yield every torrent entry that is not downloaded yet

     queries "torrent" with {"downloaded": 0}, stores the number of results
     in self._totalDownload, logs that number, then yields the results one
     by one. this is a generator, so nothing runs until it is first iterated.
     :return: generator over the queried torrent dicts
     '''
     pendingTorrents = self._database.query("torrent", {"downloaded": 0})
     pendingCount = len(pendingTorrents)
     self._totalDownload = pendingCount
     logger.info("not downloaded torrent count {0}".format(pendingCount))
     for pendingTorrent in pendingTorrents:
         yield pendingTorrent
Example #8
0
 def writeToDatabase(self, imageResultDict):
     '''
     information write to database

     checks "image" for an entry with the same md5; when none exists the
     progress line is logged and the dict is inserted, otherwise nothing is
     done. any error is logged and swallowed.
     :param imageResultDict: dict for one image; the keys "md5" and "title"
         are read here
     :return: None
     '''
     if imageResultDict is None:
         logger.error("imageResultDict is None")
         return None
     try:
         result = self._database.queryIsExist("image", {"md5": imageResultDict["md5"]})
         progressBar = self.computeProgressBar(self.completed, self.totalDownload)
         if not result:
             logger.info("Image completed: {progressBar: <10}".format(progressBar=progressBar) +
                         "Title:{title}".format(title=imageResultDict['title']),
                         level="ALL")
             self._database.insert("image", imageResultDict)
     except Exception:
         # Exception rather than BaseException, so KeyboardInterrupt and
         # SystemExit are not swallowed by this best-effort handler
         logger.error(traceback.format_exc())
         logger.error("An error occurred in the function ---> wirteToDataBase")
         return None
Example #9
0
 def downloadVideo(cls):
     '''
     start the video download thread

     creates a Video instance and starts one thread named "downloadVideo"
     that runs its downloadScheduler. the thread is not joined here.
     :return: None
     '''
     logger.info("start download video")
     videoDownloader = Video()
     schedulerThread = threading.Thread(target=videoDownloader.downloadScheduler,
                                        name="downloadVideo")
     schedulerThread.start()