class Downloader(QObject):
    """
    Coordinates the download of submission content.

    Creates one Worker per (list model object, validated PRAWuser / subreddit)
    pair and runs them on a thread pool limited to 4 threads. The `finished`
    signal is emitted at the end of `run`.
    """

    finished = pyqtSignal()

    def __init__(self, rddtDataExtractor, validUsersOrSubs, queue, listModelType):
        """
        The object that handles coordinating the download of the submission content. Spawns threads to download
        from users / subreddits simultaneously and to download images simultaneously.

        :param validUsersOrSubs: pairs of (list model object, validated PRAW user or subreddit), as unpacked in run()
        :type rddtDataExtractor: RedditDataExtractor.redditDataExtractor.RedditDataExtractor
        :type validUsersOrSubs: list
        :type queue: Queue.queue
        :type listModelType: RedditDataExtractor.redditDataExtractor.ListType
        """
        super().__init__()
        self._rddtDataExtractor = rddtDataExtractor
        self._validUsersOrSubs = validUsersOrSubs
        self._queue = queue
        self._listModelType = listModelType
        self._dataPool = QThreadPool()
        self._dataPool.setMaxThreadCount(4)
        self._continueOperation = True
        # Set to True at the very end of run(), after `finished` has been emitted.
        self.finishSignalForTest = False

    def stop(self):
        """
        Flag the download as stopped. Workers are handed `isStopped` and poll it, so this does not
        interrupt anything by itself.
        """
        self._continueOperation = False

    def isStopped(self):
        """
        :return: True once stop() has been called
        :rtype: bool
        """
        return not self._continueOperation

    @pyqtSlot()
    def run(self):
        """
        Start a Worker for every entry of validUsersOrSubs, block until the pool has finished
        and then emit `finished`.
        """
        self.finishSignalForTest = False
        self._rddtDataExtractor.currentlyDownloading = True
        if self._validUsersOrSubs:
            for lstModelObj, validatedPRAWUserOrSub in self._validUsersOrSubs:
                worker = Worker(self._rddtDataExtractor, lstModelObj, validatedPRAWUserOrSub, self._queue,
                                self._listModelType, self.isStopped)
                self._dataPool.start(worker)
            # Blocks the calling thread until every started worker has returned.
            self._dataPool.waitForDone()
        self.finished.emit()
        self.finishSignalForTest = True
class Worker(QRunnable):
    """
    Runnable that downloads the content for one user or subreddit list entry.

    For every submission that passes the filter it starts image, video and submission-json workers
    on three separate thread pools (3, 2 and 3 threads respectively) and waits for all of them
    before reporting completion on the queue.
    """

    def __init__(self, rddtDataExtractor, lstModelObj, validatedPRAWUserOrSub, queue, lstModelType, isStopped):
        """
        Thread to download for a user or subreddit. Spawns more threads for downloading images, videos or
        submission json data

        :param lstModelObj: The User or Subreddit "ListModel" Object
        :param isStopped: callable taking no arguments that returns True once the download has been stopped
        :type rddtDataExtractor: RedditDataExtractor.redditDataExtractor.RedditDataExtractor
        :type lstModelObj: RedditDataExtractor.GUI.genericListModelObjects.GenericListModelObj
        :type validatedPRAWUserOrSub: praw.objects.Subreddit or praw.objects.User
        :type queue: Queue.queue
        :type lstModelType: RedditDataExtractor.redditDataExtractor.ListType
        :type isStopped: function
        """
        super().__init__()
        self._rddtDataExtractor = rddtDataExtractor
        self._lstModelObj = lstModelObj
        self._validatedPRAWUserOrSub = validatedPRAWUserOrSub
        self._queue = queue
        self._lstModelType = lstModelType
        self._imagePool = QThreadPool()
        self._imagePool.setMaxThreadCount(3)
        self._submissionPool = QThreadPool()
        self._submissionPool.setMaxThreadCount(3)
        self._videoPool = QThreadPool()
        self._videoPool.setMaxThreadCount(2)
        self._mostRecentDownloadTimestamp = None
        self._downloaderIsStopped = isStopped

    def _startExternalDownloads(self, submission, contentType, getImages, getVideos):
        """
        Start the image downloads, and unless videos are avoided the video downloads, for one kind of
        external content of a submission.

        :param contentType: the DownloadedContentType recorded on the DownloadedContent
        :param getImages: called as getImages(submission, lstModelObj, queue)
        :param getVideos: called as getVideos(submission, lstModelObj); only called when videos are not avoided
        :type submission: praw.objects.Submission
        :type contentType: DownloadedContentType
        :type getImages: function
        :type getVideos: function
        """
        downloadedContent = DownloadedContent(submission.permalink, contentType)
        images = getImages(submission, self._lstModelObj, self._queue)
        self._startDownloadImages(images, downloadedContent, submission)
        if not self._rddtDataExtractor.avoidVideos:
            videos = getVideos(submission, self._lstModelObj)
            self._startDownloadVideos(videos, downloadedContent, submission)

    def _startDownloadsForSubmission(self, submission):
        """
        Start every kind of download that is enabled on the data extractor and that the list model
        object reports as new content for this submission.

        :type submission: praw.objects.Submission
        """
        extractor = self._rddtDataExtractor
        lstModelObj = self._lstModelObj

        # Content linked by the submission itself: skipped for self posts and reddit-hosted domains.
        if (extractor.getExternalContent
                and lstModelObj.isNewContent(submission, DownloadedContentType.EXTERNAL_SUBMISSION_DATA)
                and not submission.is_self
                and "reddit" not in submission.domain):
            self._startExternalDownloads(submission, DownloadedContentType.EXTERNAL_SUBMISSION_DATA,
                                         extractor.getImages, extractor.getVideos)

        if (extractor.getCommentExternalContent
                and lstModelObj.isNewContent(submission, DownloadedContentType.EXTERNAL_COMMENT_DATA)):
            self._startExternalDownloads(submission, DownloadedContentType.EXTERNAL_COMMENT_DATA,
                                         extractor.getCommentImages, extractor.getCommentVideos)

        if (extractor.getSelftextExternalContent
                and lstModelObj.isNewContent(submission, DownloadedContentType.EXTERNAL_SELFTEXT_DATA)):
            self._startExternalDownloads(submission, DownloadedContentType.EXTERNAL_SELFTEXT_DATA,
                                         extractor.getSelftextImages, extractor.getSelftextVideos)

        if (extractor.getSubmissionContent
                and lstModelObj.isNewContent(submission, DownloadedContentType.JSON_DATA)
                and not self._downloaderIsStopped()):
            downloadedContent = DownloadedContent(submission.permalink, DownloadedContentType.JSON_DATA)
            submissionWorker = SubmissionWorker(extractor, lstModelObj, submission, self._queue,
                                                downloadedContent, self._lstModelType,
                                                self.setMostRecentDownloadTimestamp, self._downloaderIsStopped)
            self._submissionPool.start(submissionWorker)

    def _startDownloadImages(self, images, downloadedContent, submission):
        """
        Start an ImageWorker for every non-None image, stopping early once the download is stopped.

        :type images: generator
        :type downloadedContent: DownloadedContent
        :type submission: praw.objects.Submission
        """
        if images is None:
            return
        for image in images:
            if self._downloaderIsStopped():
                break
            if image is not None:
                imageWorker = ImageWorker(image, self._lstModelObj, submission, self._queue, downloadedContent,
                                          self._rddtDataExtractor.avoidDuplicates,
                                          self.setMostRecentDownloadTimestamp, self._downloaderIsStopped)
                self._imagePool.start(imageWorker)

    def _startDownloadVideos(self, videos, downloadedContent, submission):
        """
        Start a VideoWorker for every non-None video, stopping early once the download is stopped.

        :type videos: generator
        :type downloadedContent: DownloadedContent
        :type submission: praw.objects.Submission
        """
        # Same guard as _startDownloadImages: a None result means there is nothing to iterate.
        if videos is None:
            return
        for video in videos:
            if self._downloaderIsStopped():
                break
            if video is not None:
                videoWorker = VideoWorker(video, self._lstModelObj, submission, self._queue, downloadedContent,
                                          self._rddtDataExtractor.avoidDuplicates,
                                          self.setMostRecentDownloadTimestamp, self._downloaderIsStopped)
                self._videoPool.start(videoWorker)

    def run(self):
        """
        Download everything for this user / subreddit: announce the start on the queue, make the
        directory, start the downloads for each submission that passes the filter, wait for all three
        pools and then announce the end. Does nothing if the download was already stopped.
        """
        if self._downloaderIsStopped():
            return
        name = self._lstModelObj.name
        self._queue.put("Starting download for " + name + "\n")
        self._rddtDataExtractor.makeDirectory(name)
        if self._lstModelType is ListType.SUBREDDIT:
            submitted = self._rddtDataExtractor.getSubredditSubmissions(self._validatedPRAWUserOrSub)
        else:
            submitted = self._validatedPRAWUserOrSub.get_submitted(limit=None)
        submissions = self._rddtDataExtractor.getValidSubmissions(submitted, self._lstModelObj)
        for submission, passesFilter in submissions:
            if passesFilter:
                self._startDownloadsForSubmission(submission)
        self._imagePool.waitForDone()
        self._submissionPool.waitForDone()
        self._videoPool.waitForDone()
        # NOTE(review): this is still None when no worker reported a timestamp; confirm that the
        # list model object copes with being assigned None here.
        self._lstModelObj.mostRecentDownloadTimestamp = self._mostRecentDownloadTimestamp
        self._queue.put("Finished download for " + name + "\n")

    def setMostRecentDownloadTimestamp(self, utc):
        """
        As the various threads download submissions, this keeps track of the most recent (by creation date) one.
        Then, when ALL downloads are finished, it sets the lstModelObj's mostRecentDownloadTimestamp. This allows
        submissions to be downloaded out of order in a download session, and still be able to prevent downloads
        from older time periods unless the user specifies they don't want that behavior.

        :type utc: float
        """
        # NOTE(review): this is handed to the image / video / submission workers as a callback; the
        # compare-then-assign below is not protected by a lock - confirm whether that matters.
        if self._mostRecentDownloadTimestamp is None or utc > self._mostRecentDownloadTimestamp:
            self._mostRecentDownloadTimestamp = utc