Ejemplo n.º 1
0
    def test_store_should_add_file_to_storage(self):
        """Storing a stream should make its contents readable via the link."""
        file_storage = FileStorage(self.temp_dir)
        payload = BytesIO(b'hello')

        file_storage.store('hello.txt', payload, version=1)

        link_path = os.path.join(self.temp_dir, 'links', 'hello.txt')
        with gzip.open(link_path, 'rb') as stored:
            self.assertEqual(stored.read(), b'hello')
Ejemplo n.º 2
0
    def test_store_should_respect_given_data_size(self):
        """Only the first ``size`` bytes of the stream should be stored."""
        file_storage = FileStorage(self.temp_dir)
        payload = BytesIO(b'hello')

        file_storage.store('hello.txt', payload, version=1, size=2)

        link_path = os.path.join(self.temp_dir, 'links', 'hello.txt')
        with gzip.open(link_path, 'rb') as stored:
            self.assertEqual(stored.read(), b'he')
Ejemplo n.º 3
0
 def __init__(self, dir=None):
     """Initialize storage, resolving the working directory.

     Uses ``dir`` when given, otherwise the FILETRACKER_DIR environment
     variable; raises AssertionError when neither is available.
     """
     if dir is None:
         if 'FILETRACKER_DIR' in os.environ:
             dir = os.environ['FILETRACKER_DIR']
         else:
             raise AssertionError("LocalFileServer must have its working "
                     "directory specified either as a constructor argument "
                     "or passed via FILETRACKER_DIR environment variable.")
     self.storage = FileStorage(dir)
     self.dir = self.storage.links_dir
Ejemplo n.º 4
0
    def test_store_should_set_modified_time_to_version(self):
        """The stored link's mtime should equal the requested version."""
        file_storage = FileStorage(self.temp_dir)
        file_storage.store('hello.txt', BytesIO(b'hello'), version=1)

        link_path = os.path.join(self.temp_dir, 'links', 'hello.txt')
        self.assertEqual(1, os.lstat(link_path).st_mtime)
Ejemplo n.º 5
0
    def test_store_should_not_overwrite_newer_files(self):
        """A store() with an older version must not clobber newer data."""
        file_storage = FileStorage(self.temp_dir)
        stale = BytesIO(b'hello')
        fresh = BytesIO(b'world')

        file_storage.store('hello.txt', fresh, version=2)
        file_storage.store('hello.txt', stale, version=1)

        link_path = os.path.join(self.temp_dir, 'links', 'hello.txt')
        with gzip.open(link_path, 'rb') as stored:
            self.assertEqual(stored.read(), b'world')
Ejemplo n.º 6
0
    def test_store_should_reuse_blobs(self):
        """Identical content stored under two names should share one blob."""
        file_storage = FileStorage(self.temp_dir)
        payload = BytesIO(b'hello')

        file_storage.store('hello.txt', payload, version=1)
        payload.seek(0)
        file_storage.store('world.txt', payload, version=1)

        links_dir = os.path.join(self.temp_dir, 'links')
        first_target = os.readlink(os.path.join(links_dir, 'hello.txt'))
        second_target = os.readlink(os.path.join(links_dir, 'world.txt'))
        self.assertEqual(first_target, second_target)
Ejemplo n.º 7
0
    def test_store_should_add_compressed_file_to_storage_as_is(self):
        """Pre-gzipped data with compressed=True should round-trip intact."""
        file_storage = FileStorage(self.temp_dir)
        compressed_data = BytesIO()
        with gzip.GzipFile(fileobj=compressed_data, mode='wb') as gz:
            gz.write(b'hello')
        compressed_data.seek(0)

        file_storage.store('hello.txt', compressed_data, version=1,
                           compressed=True)

        link_path = os.path.join(self.temp_dir, 'links', 'hello.txt')
        with gzip.open(link_path, 'rb') as stored:
            self.assertEqual(stored.read(), b'hello')
Ejemplo n.º 8
0
    def test_store_should_accept_digest_hints(self):
        """store() with a digest hint should dedupe against an existing blob."""
        file_storage = FileStorage(self.temp_dir)
        payload = BytesIO(b'hello')
        sha = hashlib.sha256(b'hello').hexdigest()

        file_storage.store('hello.txt', payload, version=1)
        payload.seek(0)
        file_storage.store('world.txt', payload, version=1, digest=sha)

        links_dir = os.path.join(self.temp_dir, 'links')
        self.assertEqual(
            os.readlink(os.path.join(links_dir, 'hello.txt')),
            os.readlink(os.path.join(links_dir, 'world.txt')))
Ejemplo n.º 9
0
 def __init__(self, dir=None):
     """Set up storage rooted at ``dir`` or at $FILETRACKER_DIR."""
     if dir is None and 'FILETRACKER_DIR' not in os.environ:
         raise AssertionError("LocalFileServer must have its working "
                 "directory specified either as a constructor argument "
                 "or passed via FILETRACKER_DIR environment variable.")
     if dir is None:
         dir = os.environ['FILETRACKER_DIR']
     self.storage = FileStorage(dir)
     self.dir = self.storage.links_dir
Ejemplo n.º 10
0
 def test_changing_version_should_not_affect_other_links(self):
     """Storing under one name must not change other links' versions."""
     file_storage = FileStorage(self.temp_dir)
     payload = BytesIO(b'hello')
     file_storage.store('hello.txt', payload, version=1)
     payload.seek(0)
     file_storage.store('world.txt', payload, version=2)
     self.assertEqual(file_storage.stored_version('hello.txt'), 1)
     self.assertEqual(file_storage.stored_version('world.txt'), 2)
Ejemplo n.º 11
0
    def test_should_recreate_db(self):
        """recover should rebuild link counts and logical sizes in the DB."""
        blob_path = os.path.join(self.temp_dir, 'blobs', '00', '0000')
        _touch_hello_gz(blob_path)
        os.symlink(blob_path, os.path.join(self.temp_dir, 'links', '0.txt'))

        recover.main([self.temp_dir, '-s', '-f'])

        storage = FileStorage(self.temp_dir)
        self.assertEqual(storage.db.get(b'0000'), b'1')
        self.assertEqual(storage.db.get(b'0000:logical_size'), b'5')
Ejemplo n.º 12
0
    def test_removing_one_reference_should_not_break_others(self):
        """Deleting one link to a shared blob must keep other links readable."""
        file_storage = FileStorage(self.temp_dir)
        payload = BytesIO(b'hello')

        file_storage.store('hello.txt', payload, version=1)
        payload.seek(0)
        file_storage.store('world.txt', payload, version=1)
        file_storage.delete('hello.txt', version=1)

        survivor = os.path.join(self.temp_dir, 'links', 'world.txt')
        with gzip.open(survivor, 'rb') as stored:
            self.assertEqual(stored.read(), b'hello')
Ejemplo n.º 13
0
    def test_deleting_older_version_should_have_no_effect(self):
        """delete() with a stale version must leave the newer file intact."""
        file_storage = FileStorage(self.temp_dir)

        file_storage.store('hello.txt', BytesIO(b'world'), version=2)
        file_storage.delete('hello.txt', version=1)

        link_path = os.path.join(self.temp_dir, 'links', 'hello.txt')
        with gzip.open(link_path, 'rb') as stored:
            self.assertEqual(stored.read(), b'world')
Ejemplo n.º 14
0
    def test_blob_should_be_deleted_with_the_last_reference(self):
        """Removing the last link should garbage-collect the blob file."""
        file_storage = FileStorage(self.temp_dir)

        file_storage.store('hello.txt', BytesIO(b'hello'), version=1)
        file_storage.delete('hello.txt', version=1)

        # The blobs tree may keep empty directories, but no files.
        for _root, _dirs, filenames in os.walk(file_storage.blobs_dir):
            self.assertEqual(filenames, [])
Ejemplo n.º 15
0
    def test_store_should_reuse_blobs(self):
        """Storing the same bytes twice should produce links to one blob."""
        file_storage = FileStorage(self.temp_dir)
        payload = BytesIO(b'hello')

        file_storage.store('hello.txt', payload, version=1)
        payload.seek(0)
        file_storage.store('world.txt', payload, version=1)

        targets = [
            os.readlink(os.path.join(self.temp_dir, 'links', name))
            for name in ('hello.txt', 'world.txt')
        ]
        self.assertEqual(targets[0], targets[1])
Ejemplo n.º 16
0
    def test_store_should_accept_digest_hints(self):
        """A digest hint should still dedupe identical content into one blob."""
        file_storage = FileStorage(self.temp_dir)
        payload = BytesIO(b'hello')
        content_digest = hashlib.sha256(b'hello').hexdigest()

        file_storage.store('hello.txt', payload, version=1)
        payload.seek(0)
        file_storage.store('world.txt', payload, version=1,
                           digest=content_digest)

        link_a = os.path.join(self.temp_dir, 'links', 'hello.txt')
        link_b = os.path.join(self.temp_dir, 'links', 'world.txt')
        self.assertEqual(os.readlink(link_a), os.readlink(link_b))
Ejemplo n.º 17
0
def main(argv=None):
    """Rebuild the filetracker DB from the links/ and blobs/ trees.

    First pass walks links/: removes entries that are not symlinks, are
    dangling, or point outside blobs/, and counts surviving references per
    blob digest; the counts are written to the DB.  Second pass walks
    blobs/: deletes blobs with no remaining references and (re)computes a
    blob's uncompressed ("logical") size when it is missing from the DB or
    when --full is given.

    Args:
        argv: optional argument list for argparse (defaults to sys.argv);
            expects the storage root plus optional -s/--silent, -f/--full.
    """
    parser = argparse.ArgumentParser(description=_DESCRIPTION)
    parser.add_argument('root', help='root directory of filetracker storage')
    parser.add_argument('-s', '--silent', action='store_true',
            help='if set, progress bar is not printed')
    parser.add_argument('-f', '--full', action='store_true',
            help='if set, logical size of all blobs is recalculated '
                 '(this may take a lot of time)')

    args = parser.parse_args(argv)
    root = args.root
    silent = args.silent
    full = args.full

    ensure_storage_format(root)
    db_init(os.path.join(root, 'db'))

    # Create a FileStorage object to use the same db settings as usual
    file_storage = FileStorage(root)
    db = file_storage.db

    links_widgets = [
            ' [', progress_bar.Timer(format='Time: %(elapsed)s'), '] ',
            ' Checking links '.ljust(_ACTION_LENGTH),
            ' ', progress_bar.Counter(), ' ',
            progress_bar.BouncingBar()
    ]

    processed_links = 0
    broken_links = 0
    # Maps blob digest -> number of valid links pointing at it.
    blob_links = {}

    with progress_bar.conditional(show=not silent,
                                  widgets=links_widgets) as bar:
        for cur_dir, _, files in os.walk(file_storage.links_dir):
            for file_name in files:
                link_path = os.path.join(cur_dir, file_name)

                # In an unlikely case when links/ contains files
                # that are not links, they are removed.
                if not os.path.islink(link_path):
                    os.unlink(link_path)
                    broken_links += 1
                else:
                    # Resolve the (relative) symlink target manually.
                    blob_path = os.path.join(
                            os.path.dirname(link_path), os.readlink(link_path))
                    # Drop links that are chained (link-to-link), dangling,
                    # or whose target escapes the blobs/ subtree.
                    if (os.path.islink(blob_path)
                            or not os.path.exists(blob_path)
                            or 'blobs/' not in blob_path):
                        os.unlink(link_path)
                        broken_links += 1
                    else:
                        # Blob file names are content digests.
                        digest = os.path.basename(blob_path)
                        blob_links[digest] = blob_links.get(digest, 0) + 1

                processed_links += 1
                bar.update(processed_links)

    # Persist the recomputed reference counts (py2/py3-safe iteration).
    for digest, link_count in six.iteritems(blob_links):
        db.put(digest.encode(), str(link_count).encode())

    blobs_widgets = [
            ' [', progress_bar.Timer(format='Time: %(elapsed)s'), '] ',
            ' Checking blobs '.ljust(_ACTION_LENGTH),
            ' ', progress_bar.Counter(), ' ',
            progress_bar.BouncingBar()
    ]

    processed_blobs = 0
    broken_blobs = 0

    with progress_bar.conditional(show=not silent,
                                  widgets=blobs_widgets) as bar:
        for cur_dir, _, files in os.walk(file_storage.blobs_dir):
            for blob_name in files:
                # A blob with no surviving links is garbage-collected.
                if blob_name not in blob_links:
                    os.unlink(os.path.join(cur_dir, blob_name))
                    broken_blobs += 1
                    continue

                size_key = '{}:logical_size'.format(blob_name).encode()
                # NOTE(review): db exposes has_key/put/get — presumably a
                # kv-store handle, not a dict; confirm against FileStorage.
                if not db.has_key(size_key) or full:
                    blob_path = os.path.join(cur_dir, blob_name)
                    with gzip.open(blob_path, 'rb') as zf:
                        logical_size = _read_stream_for_size(zf)

                    db.put(size_key, str(logical_size).encode())

                processed_blobs += 1
                bar.update(processed_blobs)

    if not silent:
        print('Completed, {} broken links and {} stray blobs found.'.format(
            broken_links, broken_blobs))
Ejemplo n.º 18
0
class FiletrackerServer(base.Server):
    """A WSGI application providing a filetracker server.

    Note that this wouldn't work as standalone server: a "manager"
    process should handle DB initialization and recovery, refer
    to ``filetracker.servers.run`` for more details.
    """
    def __init__(self, dir=None):
        """Bind the server to a storage directory.

        Args:
            dir: storage root; when None, FILETRACKER_DIR must be set.

        Raises:
            AssertionError: if no directory is specified either way.
        """
        if dir is None:
            if 'FILETRACKER_DIR' not in os.environ:
                raise AssertionError(
                    "LocalFileServer must have its working "
                    "directory specified either as a constructor argument "
                    "or passed via FILETRACKER_DIR environment variable.")
            dir = os.environ['FILETRACKER_DIR']
        self.storage = FileStorage(dir)
        self.dir = self.storage.links_dir

    def parse_query_params(self, environ):
        """Return the request's query string parsed into a dict of lists."""
        return parse_qs(environ.get('QUERY_STRING', ''))

    def handle_PUT(self, environ, start_response):
        """Store the request body as ``/files/<path>`` at a given version.

        The version comes from the required ``last_modified`` query
        parameter (an HTTP date).  ``Content-Encoding: gzip``,
        ``SHA256-Checksum`` and ``Logical-Size`` headers are optional
        hints forwarded to the storage layer.
        """
        endpoint, path = base.get_endpoint_and_path(environ)
        if endpoint != 'files':
            raise base.HttpError('400 Bad Request',
                                 'PUT can be only performed on "/files/..."')

        # NOTE(review): int(None) raises TypeError when CONTENT_LENGTH is
        # absent — assumed always present for PUT requests; confirm.
        content_length = int(environ.get('CONTENT_LENGTH'))

        query_params = self.parse_query_params(environ)
        last_modified = query_params.get('last_modified', (None, ))[0]
        if last_modified:
            # Convert the HTTP date to a Unix timestamp.
            last_modified = email.utils.parsedate_tz(last_modified)
            last_modified = email.utils.mktime_tz(last_modified)
        else:
            raise base.HttpError('400 Bad Request',
                                 '"?last-modified=" is required')

        compressed = environ.get('HTTP_CONTENT_ENCODING', None) == 'gzip'

        digest = environ.get('HTTP_SHA256_CHECKSUM', None)
        logical_size = environ.get('HTTP_LOGICAL_SIZE', None)

        if compressed and digest and logical_size:
            logger.debug('Handling PUT %s.', path)
        else:
            # Missing hints force the storage to do extra work, so log
            # these requests at a higher level.
            logger.info(
                'Handling PUT %s with unusual headers: '
                'compressed=%s, digest=%s, logical_size=%s', path, compressed,
                digest, logical_size)

        version = self.storage.store(name=path,
                                     data=environ['wsgi.input'],
                                     version=last_modified,
                                     size=content_length,
                                     compressed=compressed,
                                     digest=digest,
                                     logical_size=logical_size)
        start_response('200 OK', [
            ('Content-Type', 'text/plain'),
            ('Last-Modified', email.utils.formatdate(version)),
        ])
        return []

    def _file_headers(self, name):
        """Build response headers for the stored (gzip-compressed) file."""
        # lstat reads the link's own mtime (the version); stat follows the
        # link to the blob for its physical (compressed) size.
        link_st = os.lstat(os.path.join(self.dir, name))
        blob_st = os.stat(os.path.join(self.dir, name))
        logical_size = self.storage.logical_size(name)
        return [
            ('Content-Type', 'application/octet-stream'),
            ('Content-Length', str(blob_st.st_size)),
            ('Content-Encoding', 'gzip'),
            ('Last-Modified', email.utils.formatdate(link_st.st_mtime)),
            ('Logical-Size', str(logical_size)),
        ]

    def handle_GET(self, environ, start_response):
        """Dispatch GET requests to /list/, /version/ or /files/ handlers."""
        endpoint, path = base.get_endpoint_and_path(environ)
        if endpoint == 'list':
            return self.handle_list(environ, start_response)
        elif endpoint == 'version':
            return self.handle_version(environ, start_response)
        elif endpoint == 'files':
            full_path = os.path.join(self.dir, path)

            if not os.path.isfile(full_path):
                raise base.HttpError('404 Not Found',
                                     'File "{}" not found'.format(full_path))

            start_response('200 OK', self._file_headers(path))
            return _FileIterator(open(full_path, 'rb'))
        else:
            raise base.HttpError(
                '400 Bad Request',
                'Unknown endpoint "{}", expected "files" or "list"'.format(
                    endpoint))

    def handle_DELETE(self, environ, start_response):
        """Delete ``/files/<path>`` at the version given by last_modified."""
        endpoint, path = base.get_endpoint_and_path(environ)
        if endpoint != 'files':
            # Fixed: was a bare `HttpError` (NameError at runtime) — the
            # class is referenced as `base.HttpError` everywhere else.
            raise base.HttpError('400 Bad Request',
                                 'DELETE can be only performed on "/files/..."')

        query_params = self.parse_query_params(environ)
        last_modified = query_params.get('last_modified', (None, ))[0]
        if last_modified:
            last_modified = email.utils.parsedate_tz(last_modified)
            last_modified = email.utils.mktime_tz(last_modified)
        else:
            raise base.HttpError('400 Bad Request',
                                 '"?last-modified=" is required')

        logger.debug('Handling DELETE %s@%d', path, last_modified)

        try:
            self.storage.delete(name=path, version=last_modified)
        except FiletrackerFileNotFoundError:
            raise base.HttpError('404 Not Found', '')

        start_response('200 OK', [])
        return []

    def handle_list(self, environ, start_response):
        """Stream a listing of files under ``/list/<path>``.

        ``last_modified`` filters the listing; it defaults to "now".
        """
        _, path = base.get_endpoint_and_path(environ)
        query_params = self.parse_query_params(environ)

        last_modified = query_params.get('last_modified', (None, ))[0]
        if not last_modified:
            last_modified = int(time.time())

        logger.debug('Handling GET /list/%s (@%d)', path, last_modified)

        root_dir = os.path.join(self.dir, path)
        if not os.path.isdir(root_dir):
            raise base.HttpError('400 Bad Request',
                                 'Path doesn\'t exist or is not a directory')

        start_response('200 OK', [])
        return _list_files_iterator(root_dir, last_modified)

    def handle_version(self, environ, start_response):
        """Report the protocol versions this server speaks, as JSON."""
        start_response('200 OK', [('Content-Type', 'application/json')])
        response = {
            'protocol_versions': [2],
        }
        return [json.dumps(response).encode('utf8')]
Ejemplo n.º 19
0
class FiletrackerServer(base.Server):
    """A WSGI application providing a filetracker server.

    Note that this wouldn't work as standalone server: a "manager"
    process should handle DB initialization and recovery, refer
    to ``filetracker.servers.run`` for more details.
    """

    def __init__(self, dir=None):
        """Bind the server to a storage directory.

        Args:
            dir: storage root; when None, FILETRACKER_DIR must be set.

        Raises:
            AssertionError: if no directory is specified either way.
        """
        if dir is None:
            if 'FILETRACKER_DIR' not in os.environ:
                raise AssertionError("LocalFileServer must have its working "
                        "directory specified either as a constructor argument "
                        "or passed via FILETRACKER_DIR environment variable.")
            dir = os.environ['FILETRACKER_DIR']
        self.storage = FileStorage(dir)
        self.dir = self.storage.links_dir

    def parse_query_params(self, environ):
        """Return the request's query string parsed into a dict of lists."""
        return parse_qs(environ.get('QUERY_STRING', ''))

    def handle_PUT(self, environ, start_response):
        """Store the request body as ``/files/<path>`` at a given version.

        The version comes from the required ``last_modified`` query
        parameter (an HTTP date).  ``Content-Encoding: gzip``,
        ``SHA256-Checksum`` and ``Logical-Size`` headers are optional
        hints forwarded to the storage layer.
        """
        endpoint, path = base.get_endpoint_and_path(environ)
        if endpoint != 'files':
            raise base.HttpError('400 Bad Request',
                                 'PUT can be only performed on "/files/..."')

        # NOTE(review): int(None) raises TypeError when CONTENT_LENGTH is
        # absent — assumed always present for PUT requests; confirm.
        content_length = int(environ.get('CONTENT_LENGTH'))

        query_params = self.parse_query_params(environ)
        last_modified = query_params.get('last_modified', (None,))[0]
        if last_modified:
            # Convert the HTTP date to a Unix timestamp.
            last_modified = email.utils.parsedate_tz(last_modified)
            last_modified = email.utils.mktime_tz(last_modified)
        else:
            raise base.HttpError('400 Bad Request',
                                 '"?last-modified=" is required')

        compressed = environ.get('HTTP_CONTENT_ENCODING', None) == 'gzip'

        digest = environ.get('HTTP_SHA256_CHECKSUM', None)
        logical_size = environ.get('HTTP_LOGICAL_SIZE', None)

        if compressed and digest and logical_size:
            logger.debug('Handling PUT %s.', path)
        else:
            # Missing hints force the storage to do extra work, so log
            # these requests at a higher level.
            logger.info('Handling PUT %s with unusual headers: '
                    'compressed=%s, digest=%s, logical_size=%s',
                    path, compressed, digest, logical_size)

        version = self.storage.store(name=path,
                                     data=environ['wsgi.input'],
                                     version=last_modified,
                                     size=content_length,
                                     compressed=compressed,
                                     digest=digest,
                                     logical_size=logical_size)
        start_response('200 OK', [
            ('Content-Type', 'text/plain'),
            ('Last-Modified', email.utils.formatdate(version)),
        ])
        return []

    def _file_headers(self, name):
        """Build response headers for the stored (gzip-compressed) file."""
        # lstat reads the link's own mtime (the version); stat follows the
        # link to the blob for its physical (compressed) size.
        link_st = os.lstat(os.path.join(self.dir, name))
        blob_st = os.stat(os.path.join(self.dir, name))
        logical_size = self.storage.logical_size(name)
        return [
                ('Content-Type', 'application/octet-stream'),
                ('Content-Length', str(blob_st.st_size)),
                ('Content-Encoding', 'gzip'),
                ('Last-Modified', email.utils.formatdate(link_st.st_mtime)),
                ('Logical-Size', str(logical_size)),
            ]

    def handle_GET(self, environ, start_response):
        """Dispatch GET requests to /list/, /version/ or /files/ handlers."""
        endpoint, path = base.get_endpoint_and_path(environ)
        if endpoint == 'list':
            return self.handle_list(environ, start_response)
        elif endpoint == 'version':
            return self.handle_version(environ, start_response)
        elif endpoint == 'files':
            full_path = os.path.join(self.dir, path)

            if not os.path.isfile(full_path):
                raise base.HttpError('404 Not Found',
                                     'File "{}" not found'.format(full_path))

            start_response('200 OK', self._file_headers(path))
            return _FileIterator(open(full_path, 'rb'))
        else:
            raise base.HttpError(
                    '400 Bad Request',
                    'Unknown endpoint "{}", expected "files" or "list"'
                    .format(endpoint))

    def handle_DELETE(self, environ, start_response):
        """Delete ``/files/<path>`` at the version given by last_modified."""
        endpoint, path = base.get_endpoint_and_path(environ)
        if endpoint != 'files':
            # Fixed: was a bare `HttpError` (NameError at runtime) — the
            # class is referenced as `base.HttpError` everywhere else.
            raise base.HttpError('400 Bad Request',
                                 'DELETE can be only performed on "/files/..."')

        query_params = self.parse_query_params(environ)
        last_modified = query_params.get('last_modified', (None,))[0]
        if last_modified:
            last_modified = email.utils.parsedate_tz(last_modified)
            last_modified = email.utils.mktime_tz(last_modified)
        else:
            raise base.HttpError('400 Bad Request',
                                 '"?last-modified=" is required')

        logger.debug('Handling DELETE %s@%d', path, last_modified)

        try:
            self.storage.delete(name=path,
                                version=last_modified)
        except FiletrackerFileNotFoundError:
            raise base.HttpError('404 Not Found', '')

        start_response('200 OK', [])
        return []

    def handle_list(self, environ, start_response):
        """Stream a listing of files under ``/list/<path>``.

        ``last_modified`` filters the listing; it defaults to "now".
        """
        _, path = base.get_endpoint_and_path(environ)
        query_params = self.parse_query_params(environ)

        last_modified = query_params.get('last_modified', (None,))[0]
        if not last_modified:
            last_modified = int(time.time())

        logger.debug('Handling GET /list/%s (@%d)', path, last_modified)

        root_dir = os.path.join(self.dir, path)
        if not os.path.isdir(root_dir):
            raise base.HttpError('400 Bad Request',
                            'Path doesn\'t exist or is not a directory')

        start_response('200 OK', [])
        return _list_files_iterator(root_dir, last_modified)

    def handle_version(self, environ, start_response):
        """Report the protocol versions this server speaks, as JSON."""
        start_response('200 OK', [('Content-Type', 'application/json')])
        response = {
                'protocol_versions': [2],
        }
        return [json.dumps(response).encode('utf8')]