Example #1
0
    def downloadCurrentFileTask(self, task):
        """ Task callback that pumps the in-flight download of self.url
        (stored under the name self.filename).

        Returns task.cont for as long as self.ch.run() reports that
        there is more to do.  Once the transfer has ended, the news
        cache is updated to reflect the outcome and the result of
        self.downloadNextFile(task) is returned. """

        channel = self.ch
        if channel.run():
            # Not finished yet; ask to be called again.
            return task.cont

        if channel.getStatusCode() == 304:
            # The copy cached from an earlier run is still good, so
            # there is nothing to store.  Go straight to the next file.
            self.notify.info("already cached: %s" % self.filename)
            return self.downloadNextFile(task)

        # Filename built from self.newsDir and self.filename; it is
        # only used below when a failed download has to be removed.
        target = Filename(self.newsDir, self.filename)

        if channel.isValid():
            # Successfully downloaded.
            self.notify.info("downloaded %s" % self.filename)

            # The HTTP "Entity Tag" is not usable with our CDN, since
            # different CDN servers report different etag values for
            # the same file.  The cache entry is therefore keyed on
            # the file size and the document date instead.
            fileSize = channel.getFileSize()
            spec = channel.getDocumentSpec()
            stamp = spec.getDate().getString() if spec.hasDate() else ''

            self.newsCache[self.filename] = (fileSize, stamp)
            self.saveNewsCache()
        else:
            # The download failed: remove the local file and forget
            # any cache entry that referred to it.
            self.notify.warning("Unable to download %s" % self.url)
            target.unlink()

            if self.filename in self.newsCache:
                del self.newsCache[self.filename]
                self.saveNewsCache()

        # Either way, carry on with the remaining files.
        return self.downloadNextFile(task)
Example #2
0
    def downloadIndexTask(self, task):
        """ Task callback that fetches the index file from the HTTP
        server and turns it into the list of news files to download.

        Returns task.cont while self.ch.run() reports more work, and
        task.done (after clearing self.redownloadingNews) if the
        download failed.  Otherwise the index is parsed, stray files
        are removed from self.newsDir, and the result of
        self.downloadNextFile(task) is returned. """
        if self.ch.run():
            return task.cont

        if not self.ch.isValid():
            self.notify.warning("Unable to download %s" % self.url)
            self.redownloadingNews = False
            return task.done

        # The index has arrived in self.rf.  Read it one line at a
        # time, keeping every line that is non-empty after stripping.
        listed = self.newsFiles = []
        while True:
            rawLine = self.rf.readline()
            if not rawLine:
                # An empty read marks the end of the index.
                break
            entry = rawLine.strip()
            if entry:
                listed.append(entry)
        del self.rf

        listed.sort()
        # Keep a separate copy of the sorted list in newsIndexEntries.
        self.newsIndexEntries = list(listed)
        self.notify.info("Server lists %s news files" % len(listed))
        self.notify.debug("self.newsIndexEntries=%s" % self.newsIndexEntries)

        # Load the record of the files we downloaded on earlier runs.
        self.readNewsCache()

        # Anything in the news directory that is neither the cache
        # index itself nor listed in the cache is unexpected--possibly
        # an old news file or a partial download--so remove it.
        for leftover in os.listdir(self.newsDir.toOsSpecific()):
            if leftover == self.CacheIndexFilename or leftover in self.newsCache:
                continue
            stale = Filename(self.newsDir, leftover)
            self.notify.info("Removing %s" % stale)
            stale.unlink()

        # Begin fetching the files, starting from the first one.
        self.nextNewsFile = 0
        return self.downloadNextFile(task)