def put_file_from_fobj(self, src, desc=""): """Store a file in the storage. If it's already (for some reason...) in the cache send that copy to the backend. Otherwise store it in the file-system cache first. The file is obtained from a file-object. Other interfaces are available as `put_file_content', `put_file_from_path'. src (fileobj): a readable binary file-like object from which to read the contents of the file. desc (unicode): the (optional) description to associate to the file. return (unicode): the digest of the stored file. """ logger.debug("Reading input file to store on the database.") # Unfortunately, we have to read the whole file-obj to compute # the digest but we take that chance to save it to a temporary # path so that we then just need to move it. Hoping that both # locations will be on the same filesystem, that should be way # faster than reading the whole file-obj again (as it could be # compressed or require network communication). # XXX We're *almost* reimplementing copyfileobj. with tempfile.NamedTemporaryFile('wb', delete=False, dir=config.temp_dir) as dst: hasher = hashlib.sha1() buf = src.read(self.CHUNK_SIZE) while len(buf) > 0: hasher.update(buf) while len(buf) > 0: written = dst.write(buf) # Cooperative yield. gevent.sleep(0) if written is None: break buf = buf[written:] buf = src.read(self.CHUNK_SIZE) digest = hasher.hexdigest().decode("ascii") dst.flush() logger.debug("File has digest %s." % digest) cache_file_path = os.path.join(self.file_dir, digest) if not os.path.exists(cache_file_path): move(dst.name, cache_file_path) else: os.unlink(dst.name) # Store the file in the backend. We do that even if the file # was already in the cache (that is, we ignore the check above) # because there's a (small) chance that the file got removed # from the backend but somehow remained in the cache. self.save(digest, desc) return digest
def put_file_from_fobj(self, src, desc=""): """Store a file in the storage. If it's already (for some reason...) in the cache send that copy to the backend. Otherwise store it in the file-system cache first. The file is obtained from a file-object. Other interfaces are available as `put_file_content', `put_file_from_path'. src (fileobj): a readable binary file-like object from which to read the contents of the file. desc (unicode): the (optional) description to associate to the file. return (unicode): the digest of the stored file. """ logger.debug("Reading input file to store on the database.") # Unfortunately, we have to read the whole file-obj to compute # the digest but we take that chance to save it to a temporary # path so that we then just need to move it. Hoping that both # locations will be on the same filesystem, that should be way # faster than reading the whole file-obj again (as it could be # compressed or require network communication). # XXX We're *almost* reimplementing copyfileobj. with tempfile.NamedTemporaryFile('wb', delete=False, dir=self.temp_dir) as dst: hasher = hashlib.sha1() buf = src.read(self.CHUNK_SIZE) while len(buf) > 0: hasher.update(buf) while len(buf) > 0: written = dst.write(buf) # Cooperative yield. gevent.sleep(0) if written is None: break buf = buf[written:] buf = src.read(self.CHUNK_SIZE) digest = bin_to_hex(hasher.digest()) dst.flush() logger.debug("File has digest %s.", digest) cache_file_path = os.path.join(self.file_dir, digest) if not os.path.exists(cache_file_path): move(dst.name, cache_file_path) else: os.unlink(dst.name) # Store the file in the backend. We do that even if the file # was already in the cache (that is, we ignore the check above) # because there's a (small) chance that the file got removed # from the backend but somehow remained in the cache. self.save(digest, desc) return digest
def put_file(self, description="", binary_data=None, file_obj=None, path=None): """Put a file in the storage, and keep a copy locally. The caller has to provide exactly one among binary_data, file_obj and path. description (string): a human-readable description of the content. binary_data (string): the content of the file to send. file_obj (file): the file-like object to send. path (string): the file to send. """ temp_fd, temp_path = tempfile.mkstemp(dir=self.tmp_dir) os.close(temp_fd) # Input checking if [binary_data, file_obj, path].count(None) != 2: error_string = "No content (or too many) specified in put_file." logger.error(error_string) raise ValueError(error_string) logger.debug("Reading input file to store on the database.") # Copy the file content, whatever forms it arrives, into the # temporary file if path is not None: copy(path, temp_path) elif binary_data is not None: with open(temp_path, 'wb') as temp_file: temp_file.write(binary_data) else: # file_obj is not None. with open(temp_path, 'wb') as temp_file: copyfileobj(file_obj, temp_file) hasher = hashlib.sha1() # Calculate the file SHA1 digest with open(temp_path, 'rb') as temp_file: buf = temp_file.read(self.CHUNK_SIZE) while buf != '': hasher.update(buf) buf = temp_file.read(self.CHUNK_SIZE) digest = hasher.hexdigest() logger.debug("File has digest %s." % digest) self.backend.put_file(digest, temp_path, description=description) # Move the temporary file in the cache move(temp_path, os.path.join(self.obj_dir, digest)) return digest
def get_file(self, digest, path=None, file_obj=None,
             string=False, temp_path=False, temp_file_obj=False):
    """Get a file from the storage, possibly using the cache if
    the file is available there.

    digest (string): the sha1 sum of the file.
    path (string): a path where to save the file.
    file_obj (file): a handler where to save the file (that is
        not closed at return).
    string (bool): True to return the content as a string.
    temp_path (bool): True to return the path of a temporary file
        with that content. The file is reserved to the caller,
        who has the duty to unlink it.
    temp_file_obj (bool): True to return a file object opened to
        a temporary file with that content. The file is reserved
        to the caller. Use this method only for debugging
        purposes, as it leaves a file lying in the temporary
        directory of FileCacher.

    return (string|file|None): the content, the temporary path or
        the temporary file object, depending on the flag that was
        set; None if none of them was requested.

    """
    if [string, temp_path, temp_file_obj].count(True) > 1:
        raise ValueError("Ask for at most one amongst content, "
                         "temp path and temp file obj.")

    cache_path = os.path.join(self.obj_dir, digest)
    cache_exists = os.path.exists(cache_path)

    logger.debug("Getting file %s." % digest)

    if not cache_exists:
        logger.debug("File %s not in cache, downloading "
                     "from database." % digest)

        # Receives the file from the database.
        temp_file, temp_filename = tempfile.mkstemp(dir=self.tmp_dir)
        # Close the raw descriptor: the backend writes to the path itself.
        os.close(temp_file)
        self.backend.get_file(digest, temp_filename)

        # And move it in the cache. Warning: this is not atomic if
        # the temp and the cache dir are on different filesystems.
        move(temp_filename, cache_path)

        logger.debug("File %s downloaded." % digest)

    # Saving to path
    if path is not None:
        copy(cache_path, path)

    # Saving to file object
    if file_obj is not None:
        with open(cache_path, "rb") as file_:
            copyfileobj(file_, file_obj)

    # Returning string?
    if string:
        with open(cache_path, "rb") as cache_file:
            return cache_file.read()

    # Returning temporary file?
    elif temp_path:
        temp_file, temp_filename = tempfile.mkstemp(dir=self.tmp_dir)
        os.close(temp_file)
        copy(cache_path, temp_filename)
        return temp_filename

    # Returning temporary file object?
    elif temp_file_obj:
        temp_file, temp_filename = tempfile.mkstemp(dir=self.tmp_dir)
        os.close(temp_file)
        copy(cache_path, temp_filename)
        temp_file = open(temp_filename, "rb")
        return temp_file
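A sketch of the different retrieval modes of `get_file`, assuming a `file_cacher` instance and a `digest` obtained from one of the put_* methods above; at most one of `string`, `temp_path` and `temp_file_obj` may be set:

    # Save the cached file to a given path...
    file_cacher.get_file(digest, path="/tmp/copy_of_input.txt")
    # ...or read its whole content into memory...
    content = file_cacher.get_file(digest, string=True)
    # ...or obtain a temporary path that the caller must unlink.
    tmp_path = file_cacher.get_file(digest, temp_path=True)
    os.unlink(tmp_path)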