def test_store_should_add_file_to_storage(self):
    """Storing a file should make its contents readable via the link path."""
    fs = FileStorage(self.temp_dir)
    fs.store('hello.txt', BytesIO(b'hello'), version=1)
    link = os.path.join(self.temp_dir, 'links', 'hello.txt')
    with gzip.open(link, 'rb') as stored:
        self.assertEqual(stored.read(), b'hello')
def test_store_should_respect_given_data_size(self):
    """An explicit size= should truncate the stored data to that many bytes."""
    fs = FileStorage(self.temp_dir)
    fs.store('hello.txt', BytesIO(b'hello'), version=1, size=2)
    link = os.path.join(self.temp_dir, 'links', 'hello.txt')
    with gzip.open(link, 'rb') as stored:
        self.assertEqual(stored.read(), b'he')
def __init__(self, dir=None):
    """Initialize server storage rooted at *dir*.

    Falls back to the FILETRACKER_DIR environment variable when *dir*
    is not given; raises AssertionError if neither is available.
    """
    if dir is None:
        try:
            dir = os.environ['FILETRACKER_DIR']
        except KeyError:
            raise AssertionError("LocalFileServer must have its working "
                "directory specified either as a constructor argument "
                "or passed via FILETRACKER_DIR environment variable.")
    self.storage = FileStorage(dir)
    self.dir = self.storage.links_dir
def test_store_should_set_modified_time_to_version(self):
    """The link's mtime should record the stored version number."""
    fs = FileStorage(self.temp_dir)
    fs.store('hello.txt', BytesIO(b'hello'), version=1)
    link = os.path.join(self.temp_dir, 'links', 'hello.txt')
    self.assertEqual(1, os.lstat(link).st_mtime)
def test_store_should_not_overwrite_newer_files(self):
    """Storing an older version must not replace a newer one."""
    fs = FileStorage(self.temp_dir)
    fs.store('hello.txt', BytesIO(b'world'), version=2)
    fs.store('hello.txt', BytesIO(b'hello'), version=1)
    link = os.path.join(self.temp_dir, 'links', 'hello.txt')
    with gzip.open(link, 'rb') as stored:
        self.assertEqual(stored.read(), b'world')
def test_store_should_reuse_blobs(self):
    """Identical contents stored under two names should share one blob."""
    fs = FileStorage(self.temp_dir)
    payload = BytesIO(b'hello')
    fs.store('hello.txt', payload, version=1)
    payload.seek(0)
    fs.store('world.txt', payload, version=1)
    links_dir = os.path.join(self.temp_dir, 'links')
    first = os.readlink(os.path.join(links_dir, 'hello.txt'))
    second = os.readlink(os.path.join(links_dir, 'world.txt'))
    self.assertEqual(first, second)
def test_store_should_add_compressed_file_to_storage_as_is(self):
    """A pre-gzipped upload (compressed=True) should round-trip unchanged."""
    fs = FileStorage(self.temp_dir)
    packed = BytesIO()
    with gzip.GzipFile(fileobj=packed, mode='wb') as out:
        shutil.copyfileobj(BytesIO(b'hello'), out)
    packed.seek(0)
    fs.store('hello.txt', packed, version=1, compressed=True)
    link = os.path.join(self.temp_dir, 'links', 'hello.txt')
    with gzip.open(link, 'rb') as stored:
        self.assertEqual(stored.read(), b'hello')
def test_store_should_accept_digest_hints(self):
    """Passing a digest= hint should still deduplicate against existing blobs."""
    fs = FileStorage(self.temp_dir)
    payload = BytesIO(b'hello')
    checksum = hashlib.sha256(b'hello').hexdigest()
    fs.store('hello.txt', payload, version=1)
    payload.seek(0)
    fs.store('world.txt', payload, version=1, digest=checksum)
    links_dir = os.path.join(self.temp_dir, 'links')
    first = os.readlink(os.path.join(links_dir, 'hello.txt'))
    second = os.readlink(os.path.join(links_dir, 'world.txt'))
    self.assertEqual(first, second)
def test_changing_version_should_not_affect_other_links(self):
    """Each link keeps its own version even when sharing a blob."""
    fs = FileStorage(self.temp_dir)
    payload = BytesIO(b'hello')
    fs.store('hello.txt', payload, version=1)
    payload.seek(0)
    fs.store('world.txt', payload, version=2)
    self.assertEqual(fs.stored_version('hello.txt'), 1)
    self.assertEqual(fs.stored_version('world.txt'), 2)
def test_should_recreate_db(self):
    """Recovery should rebuild link counts and logical sizes in the db."""
    blob = os.path.join(self.temp_dir, 'blobs', '00', '0000')
    _touch_hello_gz(blob)
    os.symlink(blob, os.path.join(self.temp_dir, 'links', '0.txt'))
    recover.main([self.temp_dir, '-s', '-f'])
    fs = FileStorage(self.temp_dir)
    self.assertEqual(fs.db.get(b'0000'), b'1')
    self.assertEqual(fs.db.get(b'0000:logical_size'), b'5')
def test_removing_one_reference_should_not_break_others(self):
    """Deleting one link to a shared blob must leave the other link intact."""
    fs = FileStorage(self.temp_dir)
    payload = BytesIO(b'hello')
    fs.store('hello.txt', payload, version=1)
    payload.seek(0)
    fs.store('world.txt', payload, version=1)
    fs.delete('hello.txt', version=1)
    survivor = os.path.join(self.temp_dir, 'links', 'world.txt')
    with gzip.open(survivor, 'rb') as stored:
        self.assertEqual(stored.read(), b'hello')
def test_deleting_older_version_should_have_no_effect(self):
    """A delete with an older version than the stored one is a no-op."""
    fs = FileStorage(self.temp_dir)
    fs.store('hello.txt', BytesIO(b'world'), version=2)
    fs.delete('hello.txt', version=1)
    link = os.path.join(self.temp_dir, 'links', 'hello.txt')
    with gzip.open(link, 'rb') as stored:
        self.assertEqual(stored.read(), b'world')
def test_blob_should_be_deleted_with_the_last_reference(self):
    """Removing the only link to a blob should remove the blob itself."""
    fs = FileStorage(self.temp_dir)
    fs.store('hello.txt', BytesIO(b'hello'), version=1)
    fs.delete('hello.txt', version=1)
    # The blobs tree may keep empty directories, but no files may remain.
    leftovers = [name
                 for _, _, files in os.walk(fs.blobs_dir)
                 for name in files]
    self.assertEqual(len(leftovers), 0)
def main(argv=None):
    """Rebuild the filetracker db from the on-disk links/ and blobs/ trees.

    Walks ``links/`` to count references per blob digest (removing links
    that are not symlinks, dangling, or pointing outside ``blobs/``),
    writes the counts to the db, then walks ``blobs/`` to delete blobs
    with no remaining references and to (re)compute each blob's
    decompressed "logical size" entry.

    Args:
        argv: argument list for argparse (defaults to sys.argv when None).
            Flags: ``root`` (storage root), ``-s/--silent`` (no progress
            bar), ``-f/--full`` (recompute all logical sizes, not just
            the missing ones).
    """
    parser = argparse.ArgumentParser(description=_DESCRIPTION)
    parser.add_argument('root', help='root directory of filetracker storage')
    parser.add_argument('-s', '--silent', action='store_true',
                        help='if set, progress bar is not printed')
    parser.add_argument('-f', '--full', action='store_true',
                        help='if set, logical size of all blobs is recalculated '
                        '(this may take a lot of time)')
    args = parser.parse_args(argv)
    root = args.root
    silent = args.silent
    full = args.full

    ensure_storage_format(root)
    db_init(os.path.join(root, 'db'))

    # Create a FileStorage object to use the same db settings as usual
    file_storage = FileStorage(root)
    db = file_storage.db

    links_widgets = [
        ' [', progress_bar.Timer(format='Time: %(elapsed)s'), '] ',
        ' Checking links '.ljust(_ACTION_LENGTH),
        ' ', progress_bar.Counter(), ' ',
        progress_bar.BouncingBar()
    ]

    processed_links = 0
    broken_links = 0
    # Maps blob digest (basename of the blob path) -> number of links
    # that reference it.
    blob_links = {}

    with progress_bar.conditional(show=not silent,
                                  widgets=links_widgets) as bar:
        for cur_dir, _, files in os.walk(file_storage.links_dir):
            for file_name in files:
                link_path = os.path.join(cur_dir, file_name)
                # In an unlikely case when links/ contains files
                # that are not links, they are removed.
                if not os.path.islink(link_path):
                    os.unlink(link_path)
                    broken_links += 1
                else:
                    # Resolve the (relative) symlink target against the
                    # link's own directory.
                    blob_path = os.path.join(
                        os.path.dirname(link_path), os.readlink(link_path))
                    # Drop links that point at another link, at nothing,
                    # or outside the blobs/ tree.
                    if (os.path.islink(blob_path)
                            or not os.path.exists(blob_path)
                            or 'blobs/' not in blob_path):
                        os.unlink(link_path)
                        broken_links += 1
                    else:
                        digest = os.path.basename(blob_path)
                        blob_links[digest] = blob_links.get(digest, 0) + 1
                processed_links += 1
                bar.update(processed_links)

    # Persist the reference counts gathered above.
    for digest, link_count in six.iteritems(blob_links):
        db.put(digest.encode(), str(link_count).encode())

    blobs_widgets = [
        ' [', progress_bar.Timer(format='Time: %(elapsed)s'), '] ',
        ' Checking blobs '.ljust(_ACTION_LENGTH),
        ' ', progress_bar.Counter(), ' ',
        progress_bar.BouncingBar()
    ]

    processed_blobs = 0
    broken_blobs = 0

    with progress_bar.conditional(show=not silent,
                                  widgets=blobs_widgets) as bar:
        for cur_dir, _, files in os.walk(file_storage.blobs_dir):
            for blob_name in files:
                # A blob nobody links to is stray: delete it.
                if blob_name not in blob_links:
                    os.unlink(os.path.join(cur_dir, blob_name))
                    broken_blobs += 1
                    continue
                size_key = '{}:logical_size'.format(blob_name).encode()
                # Recompute the decompressed size when missing, or always
                # with --full.
                # NOTE(review): assumes db exposes a kvstore-style
                # has_key() method (plain dicts don't in Python 3) —
                # confirm against the db backend.
                if not db.has_key(size_key) or full:
                    blob_path = os.path.join(cur_dir, blob_name)
                    with gzip.open(blob_path, 'rb') as zf:
                        logical_size = _read_stream_for_size(zf)
                    db.put(size_key, str(logical_size).encode())
                processed_blobs += 1
                bar.update(processed_blobs)

    if not silent:
        print('Completed, {} broken links and {} stray blobs found.'.format(
            broken_links, broken_blobs))
class FiletrackerServer(base.Server):
    """A WSGI application providing a filetracker server.

    Note that this wouldn't work as standalone server: a "manager" process
    should handle DB initialization and recovery, refer to
    ``filetracker.servers.run`` for more details.
    """

    def __init__(self, dir=None):
        """Create a server rooted at *dir*.

        Falls back to the FILETRACKER_DIR environment variable when *dir*
        is None; raises AssertionError if neither is available.
        """
        if dir is None:
            if 'FILETRACKER_DIR' not in os.environ:
                raise AssertionError(
                    "LocalFileServer must have its working "
                    "directory specified either as a constructor argument "
                    "or passed via FILETRACKER_DIR environment variable.")
            dir = os.environ['FILETRACKER_DIR']
        self.storage = FileStorage(dir)
        self.dir = self.storage.links_dir

    def parse_query_params(self, environ):
        """Parse the request's query string into a dict of value lists."""
        return parse_qs(environ.get('QUERY_STRING', ''))

    def _require_last_modified(self, environ):
        """Return the required 'last_modified' query param as a Unix timestamp.

        Raises:
            base.HttpError: 400 when the parameter is missing.
        """
        query_params = self.parse_query_params(environ)
        last_modified = query_params.get('last_modified', (None,))[0]
        if not last_modified:
            # Bug fix: the message used to say "?last-modified=", but the
            # query parameter actually read above is "last_modified".
            raise base.HttpError('400 Bad Request',
                                 '"?last_modified=" is required')
        # The value is an RFC 2822 date; convert it to a Unix timestamp.
        last_modified = email.utils.parsedate_tz(last_modified)
        return email.utils.mktime_tz(last_modified)

    def handle_PUT(self, environ, start_response):
        """Store the request body under /files/<path> and report its version."""
        endpoint, path = base.get_endpoint_and_path(environ)
        if endpoint != 'files':
            raise base.HttpError('400 Bad Request',
                                 'PUT can be only performed on "/files/..."')

        content_length = int(environ.get('CONTENT_LENGTH'))
        last_modified = self._require_last_modified(environ)

        compressed = environ.get('HTTP_CONTENT_ENCODING', None) == 'gzip'
        digest = environ.get('HTTP_SHA256_CHECKSUM', None)
        logical_size = environ.get('HTTP_LOGICAL_SIZE', None)

        if compressed and digest and logical_size:
            logger.debug('Handling PUT %s.', path)
        else:
            logger.info(
                'Handling PUT %s with unusual headers: '
                'compressed=%s, digest=%s, logical_size=%s',
                path, compressed, digest, logical_size)

        version = self.storage.store(name=path,
                                     data=environ['wsgi.input'],
                                     version=last_modified,
                                     size=content_length,
                                     compressed=compressed,
                                     digest=digest,
                                     logical_size=logical_size)
        start_response('200 OK', [
            ('Content-Type', 'text/plain'),
            ('Last-Modified', email.utils.formatdate(version)),
        ])
        return []

    def _file_headers(self, name):
        """Build response headers describing the stored file *name*."""
        link_st = os.lstat(os.path.join(self.dir, name))
        blob_st = os.stat(os.path.join(self.dir, name))
        logical_size = self.storage.logical_size(name)
        return [
            ('Content-Type', 'application/octet-stream'),
            ('Content-Length', str(blob_st.st_size)),
            ('Content-Encoding', 'gzip'),
            ('Last-Modified', email.utils.formatdate(link_st.st_mtime)),
            ('Logical-Size', str(logical_size)),
        ]

    def handle_GET(self, environ, start_response):
        """Dispatch GET requests to the list/version/files endpoints."""
        endpoint, path = base.get_endpoint_and_path(environ)
        if endpoint == 'list':
            return self.handle_list(environ, start_response)
        elif endpoint == 'version':
            return self.handle_version(environ, start_response)
        elif endpoint == 'files':
            full_path = os.path.join(self.dir, path)
            if not os.path.isfile(full_path):
                raise base.HttpError('404 Not Found',
                                     'File "{}" not found'.format(full_path))
            start_response('200 OK', self._file_headers(path))
            return _FileIterator(open(full_path, 'rb'))
        else:
            raise base.HttpError(
                '400 Bad Request',
                'Unknown endpoint "{}", expected "files" or "list"'.format(
                    endpoint))

    def handle_DELETE(self, environ, start_response):
        """Delete /files/<path> at the given last_modified version."""
        endpoint, path = base.get_endpoint_and_path(environ)
        if endpoint != 'files':
            # Bug fix: this used to raise a bare ``HttpError`` (a NameError
            # at runtime) instead of ``base.HttpError``.
            raise base.HttpError('400 Bad Request',
                                 'DELETE can be only performed on "/files/..."')

        last_modified = self._require_last_modified(environ)

        logger.debug('Handling DELETE %s@%d', path, last_modified)
        try:
            self.storage.delete(name=path, version=last_modified)
        except FiletrackerFileNotFoundError:
            raise base.HttpError('404 Not Found', '')
        start_response('200 OK', [])
        return []

    def handle_list(self, environ, start_response):
        """Stream a listing of files under /list/<path>."""
        _, path = base.get_endpoint_and_path(environ)
        query_params = self.parse_query_params(environ)
        last_modified = query_params.get('last_modified', (None,))[0]
        # Unlike PUT/DELETE, a missing last_modified defaults to "now".
        if not last_modified:
            last_modified = int(time.time())

        logger.debug('Handling GET /list/%s (@%d)', path, last_modified)

        root_dir = os.path.join(self.dir, path)
        if not os.path.isdir(root_dir):
            raise base.HttpError('400 Bad Request',
                                 'Path doesn\'t exist or is not a directory')
        start_response('200 OK', [])
        return _list_files_iterator(root_dir, last_modified)

    def handle_version(self, environ, start_response):
        """Report supported protocol versions as JSON."""
        start_response('200 OK', [('Content-Type', 'application/json')])
        response = {
            'protocol_versions': [2],
        }
        return [json.dumps(response).encode('utf8')]
class FiletrackerServer(base.Server):
    """A WSGI application providing a filetracker server.

    Note that this wouldn't work as standalone server: a "manager" process
    should handle DB initialization and recovery, refer to
    ``filetracker.servers.run`` for more details.
    """

    def __init__(self, dir=None):
        """Create a server rooted at *dir*.

        Falls back to the FILETRACKER_DIR environment variable when *dir*
        is None; raises AssertionError if neither is available.
        """
        if dir is None:
            if 'FILETRACKER_DIR' not in os.environ:
                raise AssertionError(
                    "LocalFileServer must have its working "
                    "directory specified either as a constructor argument "
                    "or passed via FILETRACKER_DIR environment variable.")
            dir = os.environ['FILETRACKER_DIR']
        self.storage = FileStorage(dir)
        self.dir = self.storage.links_dir

    def parse_query_params(self, environ):
        """Parse the request's query string into a dict of value lists."""
        return parse_qs(environ.get('QUERY_STRING', ''))

    def _require_last_modified(self, environ):
        """Return the required 'last_modified' query param as a Unix timestamp.

        Raises:
            base.HttpError: 400 when the parameter is missing.
        """
        query_params = self.parse_query_params(environ)
        last_modified = query_params.get('last_modified', (None,))[0]
        if not last_modified:
            # Bug fix: the message used to say "?last-modified=", but the
            # query parameter actually read above is "last_modified".
            raise base.HttpError('400 Bad Request',
                                 '"?last_modified=" is required')
        # The value is an RFC 2822 date; convert it to a Unix timestamp.
        last_modified = email.utils.parsedate_tz(last_modified)
        return email.utils.mktime_tz(last_modified)

    def handle_PUT(self, environ, start_response):
        """Store the request body under /files/<path> and report its version."""
        endpoint, path = base.get_endpoint_and_path(environ)
        if endpoint != 'files':
            raise base.HttpError('400 Bad Request',
                                 'PUT can be only performed on "/files/..."')

        content_length = int(environ.get('CONTENT_LENGTH'))
        last_modified = self._require_last_modified(environ)

        compressed = environ.get('HTTP_CONTENT_ENCODING', None) == 'gzip'
        digest = environ.get('HTTP_SHA256_CHECKSUM', None)
        logical_size = environ.get('HTTP_LOGICAL_SIZE', None)

        if compressed and digest and logical_size:
            logger.debug('Handling PUT %s.', path)
        else:
            logger.info('Handling PUT %s with unusual headers: '
                        'compressed=%s, digest=%s, logical_size=%s',
                        path, compressed, digest, logical_size)

        version = self.storage.store(name=path,
                                     data=environ['wsgi.input'],
                                     version=last_modified,
                                     size=content_length,
                                     compressed=compressed,
                                     digest=digest,
                                     logical_size=logical_size)
        start_response('200 OK', [
            ('Content-Type', 'text/plain'),
            ('Last-Modified', email.utils.formatdate(version)),
        ])
        return []

    def _file_headers(self, name):
        """Build response headers describing the stored file *name*."""
        link_st = os.lstat(os.path.join(self.dir, name))
        blob_st = os.stat(os.path.join(self.dir, name))
        logical_size = self.storage.logical_size(name)
        return [
            ('Content-Type', 'application/octet-stream'),
            ('Content-Length', str(blob_st.st_size)),
            ('Content-Encoding', 'gzip'),
            ('Last-Modified', email.utils.formatdate(link_st.st_mtime)),
            ('Logical-Size', str(logical_size)),
        ]

    def handle_GET(self, environ, start_response):
        """Dispatch GET requests to the list/version/files endpoints."""
        endpoint, path = base.get_endpoint_and_path(environ)
        if endpoint == 'list':
            return self.handle_list(environ, start_response)
        elif endpoint == 'version':
            return self.handle_version(environ, start_response)
        elif endpoint == 'files':
            full_path = os.path.join(self.dir, path)
            if not os.path.isfile(full_path):
                raise base.HttpError('404 Not Found',
                                     'File "{}" not found'.format(full_path))
            start_response('200 OK', self._file_headers(path))
            return _FileIterator(open(full_path, 'rb'))
        else:
            raise base.HttpError(
                '400 Bad Request',
                'Unknown endpoint "{}", expected "files" or "list"'
                .format(endpoint))

    def handle_DELETE(self, environ, start_response):
        """Delete /files/<path> at the given last_modified version."""
        endpoint, path = base.get_endpoint_and_path(environ)
        if endpoint != 'files':
            # Bug fix: this used to raise a bare ``HttpError`` (a NameError
            # at runtime) instead of ``base.HttpError``.
            raise base.HttpError('400 Bad Request',
                                 'DELETE can be only performed on "/files/..."')

        last_modified = self._require_last_modified(environ)

        logger.debug('Handling DELETE %s@%d', path, last_modified)
        try:
            self.storage.delete(name=path, version=last_modified)
        except FiletrackerFileNotFoundError:
            raise base.HttpError('404 Not Found', '')
        start_response('200 OK', [])
        return []

    def handle_list(self, environ, start_response):
        """Stream a listing of files under /list/<path>."""
        _, path = base.get_endpoint_and_path(environ)
        query_params = self.parse_query_params(environ)
        last_modified = query_params.get('last_modified', (None,))[0]
        # Unlike PUT/DELETE, a missing last_modified defaults to "now".
        if not last_modified:
            last_modified = int(time.time())

        logger.debug('Handling GET /list/%s (@%d)', path, last_modified)

        root_dir = os.path.join(self.dir, path)
        if not os.path.isdir(root_dir):
            raise base.HttpError('400 Bad Request',
                                 'Path doesn\'t exist or is not a directory')
        start_response('200 OK', [])
        return _list_files_iterator(root_dir, last_modified)

    def handle_version(self, environ, start_response):
        """Report supported protocol versions as JSON."""
        start_response('200 OK', [('Content-Type', 'application/json')])
        response = {
            'protocol_versions': [2],
        }
        return [json.dumps(response).encode('utf8')]