def downloadCurrentFileTask(self, task):
    """Advance the download of self.url and record the outcome.

    Returns task.cont while self.ch.run() still reports work pending.
    Once the transfer has ended, one of three things happens before
    control is handed to self.downloadNextFile(task):

    * status 304: the previously cached copy is kept untouched;
    * channel not valid: the local file is unlinked and any entry for
      it is dropped from self.newsCache (and the cache is saved);
    * otherwise: (size, date) is stored in self.newsCache under
      self.filename and the cache is saved.

    The return value in all three cases is whatever
    self.downloadNextFile(task) returns.
    """
    channel = self.ch
    if channel.run():
        # Transfer still in progress; presumably task.cont asks the
        # task manager to call us again.
        return task.cont

    name = self.filename

    if channel.getStatusCode() == 304:
        # Not modified since we last fetched it, so the copy we
        # already have is good.  Nothing to store; go on to the next.
        self.notify.info("already cached: %s" % (name))
        return self.downloadNextFile(task)

    localFilename = Filename(self.newsDir, name)

    if channel.isValid():
        # The download completed successfully.
        self.notify.info("downloaded %s" % (name))

        # The HTTP "Entity Tag" appears to be useless with our CDN:
        # different CDN servers will serve up different etag values
        # for the same file.  We rely on file size and date instead.
        size = channel.getFileSize()
        spec = channel.getDocumentSpec()
        date = spec.getDate().getString() if spec.hasDate() else ''

        self.newsCache[name] = (size, date)
        self.saveNewsCache()
    else:
        # The download failed: discard whatever was written locally
        # and forget any cache record for this file.
        self.notify.warning("Unable to download %s" % (self.url))
        localFilename.unlink()

        if name in self.newsCache:
            del self.newsCache[name]
            self.saveNewsCache()

    # Success or failure, carry on with the next file in the list.
    return self.downloadNextFile(task)
def downloadIndexTask(self, task):
    """Fetch the index file from the HTTP server and start the downloads.

    Returns task.cont while self.ch.run() still reports work pending.
    If the channel ends up not valid, a warning is logged,
    self.redownloadingNews is cleared and task.done is returned.

    Otherwise the non-blank lines read from self.rf become the sorted
    list self.newsFiles (copied into self.newsIndexEntries), the news
    cache is loaded, files in self.newsDir that are neither the cache
    index nor listed in self.newsCache are removed, and the result of
    self.downloadNextFile(task) is returned with self.nextNewsFile
    reset to 0.
    """
    if self.ch.run():
        # Transfer still in progress; presumably task.cont asks the
        # task manager to call us again.
        return task.cont

    if not self.ch.isValid():
        self.notify.warning("Unable to download %s" % (self.url))
        self.redownloadingNews = False
        return task.done

    # The index has arrived: one hosted filename per line.  Collect
    # every non-blank line, stripped of surrounding whitespace.
    names = self.newsFiles = []
    while True:
        line = self.rf.readline()
        if not line:
            # An empty read ends the loop.
            break
        line = line.strip()
        if line:
            names.append(line)

    # The index buffer has been fully consumed; let go of it.
    del self.rf

    names.sort()
    self.newsIndexEntries = list(names)
    self.notify.info("Server lists %s news files" % (len(self.newsFiles)))
    self.notify.debug("self.newsIndexEntries=%s" % self.newsIndexEntries)

    # Load the record of files we downloaded on an earlier run.
    self.readNewsCache()

    # Sweep the news directory: anything that is neither the cache
    # index itself nor a file the cache knows about might be an old
    # news file or a partial failed download, so remove it.
    for basename in os.listdir(self.newsDir.toOsSpecific()):
        if basename == self.CacheIndexFilename or basename in self.newsCache:
            continue
        stale = Filename(self.newsDir, basename)
        self.notify.info("Removing %s" % (stale))
        stale.unlink()

    # Begin fetching the listed files, starting from the first.
    self.nextNewsFile = 0
    return self.downloadNextFile(task)