def serve(self, host, path):
    (hostName, urlHash) = self.processMirrorURL(host, path)
    if hostName:
        self.database.setLocation(hostName)
        (url, contentType, contentLength, contentHash) = self.database.loadURL(urlHash)
        if url:
            contentLength = int(contentLength)
            if contentLength == 0:
                # Empty pages are stored under ZERO_HASH and served with no data stream.
                assert contentHash == self.ZERO_HASH
                return (url, contentType, 0, None)
            else:
                gzipped = contentHash.endswith(self.GZIP_SUFFIX)
                assert len(contentHash) == 2 * dbHash().digest_size + len(self.GZIP_SUFFIX) * gzipped  # pylint: disable=E1101
                (contentSize, contentStream) = self.database.loadData(contentHash)
                if gzipped:
                    # Stored compressed; wrap the stream so the caller reads plain data.
                    assert contentSize < contentLength
                    contentStream = GzipFile(fileobj = contentStream)
                else:
                    assert contentSize == contentLength
                if contentLength < DATA_CHUNK and contentType.lower().split(';')[0] in self.TYPES_TO_PROCESS:
                    # Small pages of processable types get their content rewritten in memory.
                    contentStream = BytesIO(self.processContent(hostName, contentStream.read()))
                return (url, contentType, contentLength, contentStream)
    return (None, None, None, None)
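
# A minimal, self-contained sketch (not part of this class) of the
# GzipFile-over-stream pattern serve() relies on: compressed bytes held in a
# file-like object are wrapped so the caller transparently reads decompressed
# data. All names here are illustrative only.
#
# from gzip import GzipFile
# from io import BytesIO
#
# raw = b'<html>hello mirror</html>' * 100
# buffer = BytesIO()
# with GzipFile('demo', 'wb', fileobj = buffer) as gzip:
#     gzip.write(raw)
# buffer.seek(0)
# # Wrapping the stored stream, as serve() does, yields the original bytes.
# assert GzipFile(fileobj = buffer).read() == raw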
def downloadURL(self, url):
    try:
        print(url, end = ' ', flush = True)
        request = requests.get(url, stream = True)
        contentType = request.headers['content-type']
        contentLength = request.headers.get('content-length', '')
        print(':: %s :: %s ::' % (contentType, ('%s bytes' % contentLength) if contentLength else 'no content-length'), end = ' ', flush = True)
        tempHash = dbHash()
        with SpooledTemporaryFile(DATA_CHUNK) as tempFile:
            # Spool the body to a temporary file, hashing it chunk by chunk.
            for chunk in request.iter_content(DATA_CHUNK):
                tempFile.write(chunk)
                tempHash.update(chunk)
            size = tempFile.tell()
            if contentLength:
                if size != int(contentLength):
                    print("ACTUALLY %d bytes ::" % size, end = ' ', flush = True)
            else:
                print("%d bytes ::" % size, end = ' ', flush = True)
            contentLength = size
            if contentLength:
                contentHash = self.dataHash(tempHash)
                (dataSize, _dataStream) = self.database.loadData(contentHash)
                if contentLength == dataSize:
                    print("exists, match", end = ' ', flush = True)
                else:
                    print("DAMAGED, OVERWRITING" if dataSize else "new, saving", end = ' ', flush = True)
                    gzipped = False
                    if contentLength >= self.MIN_SIZE_FOR_GZIP:
                        tempFile.seek(0)
                        with SpooledTemporaryFile(DATA_CHUNK) as gzipFile:
                            with GzipFile(contentHash, 'wb', fileobj = gzipFile) as gzip:
                                while True:
                                    data = tempFile.read(DATA_CHUNK)
                                    if not data:
                                        break
                                    gzip.write(data)
                            zipLength = gzipFile.tell()
                            # Keep the gzipped form only if compression is effective enough.
                            if zipLength * 100 < contentLength * self.MIN_GZIP_EFFECTIVENESS:
                                contentHash += self.GZIP_SUFFIX
                                gzipFile.seek(0)
                                written = self.database.saveData(contentHash, gzipFile)
                                assert written == zipLength
                                gzipped = True
                    if not gzipped:
                        tempFile.seek(0)
                        written = self.database.saveData(contentHash, tempFile)
                        assert written == contentLength
            else:
                contentHash = self.ZERO_HASH
        print("OK")
        urlHash = self.processOriginalURL(url)
        (oldURL, oldContentType, oldContentLength, oldContentHash) = self.database.loadURL(urlHash)
        if oldURL:
            print("Previous URL %s :: %s :: %s bytes :: content %s" % (oldURL, oldContentType, oldContentLength, 'matches' if contentHash == oldContentHash else 'DIFFERENT'))
            # An existing non-empty record wins, unless the new content is also empty.
            if oldContentHash != self.ZERO_HASH or contentHash == self.ZERO_HASH:
                return
            print("Previous URL contained empty page, overwriting")
        self.database.saveURL(urlHash, url, contentType, str(contentLength), contentHash)
    except Exception as e:
        print("\nERROR: %s" % e)
        print(format_exc())
        raise
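
# A self-contained sketch (not part of this class) of two patterns combined in
# downloadURL() above: hashing a stream chunk by chunk while spooling it to a
# temporary file, and keeping the gzipped form only when the compressed size is
# below a given percentage of the original. The threshold of 90 percent and the
# chunk size are illustrative assumptions, not the class's MIN_GZIP_EFFECTIVENESS
# or DATA_CHUNK constants.
#
# from gzip import compress
# from hashlib import sha256
# from tempfile import SpooledTemporaryFile
#
# CHUNK = 0x10000  # hypothetical stand-in for DATA_CHUNK
#
# def _spoolAndHash(chunks):
#     digest = sha256()
#     spool = SpooledTemporaryFile(CHUNK)
#     for chunk in chunks:
#         spool.write(chunk)
#         digest.update(chunk)  # incremental update avoids holding the body in memory
#     spool.seek(0)
#     return (digest.hexdigest(), spool)
#
# def _worthGzipping(data, minEffectiveness = 90):
#     return len(compress(data)) * 100 < len(data) * minEffectiveness
#
# (hexDigest, spool) = _spoolAndHash([b'abc' * 1000, b'def' * 1000])
# assert hexDigest == sha256(spool.read()).hexdigest()
# assert _worthGzipping(b'a' * 10000)  # repetitive data compresses well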
@staticmethod
def dataHash(data):
    """Returns a hexlified hash digest for the specified block of data or an already existing hash object."""
    ret = (data if hasattr(data, 'digest') else dbHash(data.encode(UTF8))).hexdigest()
    assert len(ret) == 2 * dbHash().digest_size  # pylint: disable=E1101
    return ret
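
# Usage sketch for dataHash() above, assuming dbHash is hashlib.sha256 (an
# assumption; the actual digest behind dbHash is bound elsewhere in this
# module). A hash object built incrementally and the equivalent raw bytes
# produce the same hexlified digest, which is why dataHash() accepts either form.
#
# from hashlib import sha256
#
# incremental = sha256()
# for chunk in (b'page ', b'body'):
#     incremental.update(chunk)  # per-chunk update, as downloadURL() does
# assert incremental.hexdigest() == sha256(b'page body').hexdigest()
# assert len(incremental.hexdigest()) == 2 * sha256().digest_size  # hexlified length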