def test_files_has_been_entangled_enough(monkeypatch):
    with mock.patch("socket.gethostbyname", return_value="127.0.0.1"):
        files = Files()
        monkeypatch.setattr(FakeRedisPipeline, "execute", mock_pipeline_get_block)
        monkeypatch.setattr(files.redis, "pipeline", get_me_a_fake_redis_pipeline)
        assert files.has_been_entangled_enough("path", 0) is True
        assert files.has_been_entangled_enough("path", 1) is True
        assert files.has_been_entangled_enough("path", 2) is False
def test_files_get_provider_returns_empty_list_if_no_blocks(monkeypatch):
    with mock.patch("socket.gethostbyname", return_value="127.0.0.1"):
        files = Files()

        def return_empty_list():
            return []

        monkeypatch.setattr(files, "list_blocks", return_empty_list)
        result = files.get_blocks_from_provider("NonExistingProvider")
        assert isinstance(result, list)
        assert not result
def test_files_list_blocks(monkeypatch):
    with mock.patch("socket.gethostbyname", return_value="127.0.0.1"):
        files = Files()

        def get_block_range(my_range, start, end):
            return ["block1", "block2", "block3"]

        monkeypatch.setattr(files.redis, "zrange", get_block_range)
        blocks = files.list_blocks()
        assert isinstance(blocks, list)
        assert len(blocks) == 3
        assert blocks == ["block1", "block2", "block3"]
def audit():
    """
    Downloads and checks the integrity of the blocks stored on the storage nodes.
    Failing blocks that cannot be recovered by copying from a replica are grouped
    by document and added to a queue returned by the function.
    Returns:
        dict(str, set(int)): Documents and the indices of the blocks that need to be reconstructed
    """
    reconstruction_needed = []
    dispatcher = Dispatcher(get_dispatcher_configuration())
    files = Files(host="metadata")
    # List blocks
    blocks = files.get_blocks(files.list_blocks())
    # For each block
    for block in blocks:
        LOGGER.debug("Looking at block {:s}".format(block.key))
        # For each replica of the block
        providers = list(set(block.providers))
        for provider_name in providers:
            LOGGER.debug("Looking at replica of {:s} on {:s}".format(block.key, provider_name))
            # Download the replica
            replica = dispatcher.providers[provider_name].get(block.key)
            computed_checksum = None
            if replica:
                computed_checksum = hashlib.sha256(replica).digest()
            # If the replica is missing or does not match its checksum
            if not replica or computed_checksum != block.checksum:
                repaired = False
                if not replica:
                    LOGGER.warn("Could not load replica of {:s} on {:s}".format(block.key, provider_name))
                if computed_checksum:
                    LOGGER.warn("Replica of {:s} on {:s} does not match expected checksum (actual = {:s}, expected = {:s})".format(block.key, provider_name, convert_binary_to_hex_digest(computed_checksum), convert_binary_to_hex_digest(block.checksum)))
                # Look for sane replicas on the other providers
                other_providers = list(set(providers).difference(set([provider_name])))
                if other_providers:
                    for other_provider in other_providers:
                        candidate_replica = dispatcher.providers[other_provider].get(block.key)
                        if not candidate_replica:
                            continue
                        candidate_checksum = hashlib.sha256(candidate_replica).digest()
                        if candidate_checksum == block.checksum:
                            # Copy the valid replica over the damaged one
                            dispatcher.providers[provider_name].put(candidate_replica, block.key)
                            repaired = True
                            break
                # Otherwise, queue the block for reconstruction
                if not repaired:
                    LOGGER.warn("Replica of {:s} on {:s} must be reconstructed".format(block.key, provider_name))
                    reconstruction_needed.append(block.key)
            else:
                LOGGER.debug("Replica of {:s} on {:s} is OK".format(block.key, provider_name))
    return group_blocks_by_path(reconstruction_needed)
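
# Illustrative sketch (not part of the original module): one way the audit
# queue could be consumed, assuming audit() and repair() are available in the
# same scope. The wrapper name audit_and_repair is hypothetical.
def audit_and_repair():
    # audit() returns a dict mapping each document path to the set of block
    # indices that could not be fixed by copying a sane replica
    damaged_documents = audit()
    for document_path, block_indices in damaged_documents.items():
        # repair() expects a list of integer block indices
        repair(document_path, sorted(block_indices))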
def test_files_get_provider(monkeypatch):
    with mock.patch("socket.gethostbyname", return_value="127.0.0.1"):
        files = Files()

        def return_block_names():
            return ["doc-00"]

        monkeypatch.setattr(files, "list_blocks", return_block_names)

        def return_fake_blocks(block_list):
            return [MetaBlock(key, providers=["FakeProvider"]) for key in block_list]

        monkeypatch.setattr(files, "get_blocks", return_fake_blocks)
        result = files.get_blocks_from_provider("FakeProvider")
        assert isinstance(result, list)
        assert len(result) == 1
def test_files_exists(monkeypatch):
    with mock.patch("socket.gethostbyname", return_value="127.0.0.1"):
        files = Files()

        def always_true(_):
            return True

        def always_false(_):
            return False

        monkeypatch.setattr(files.redis, "exists", always_false)
        assert files.exists("path") == False
        monkeypatch.delattr(files.redis, "exists")
        monkeypatch.setattr(files.redis, "exists", always_true)
        assert files.exists("path") == True
def get_block_metadata(path, index):
    """
    Returns the block metadata
    Args:
        path(str): Path to the file
        index(int): Index of the block in the file
    Returns:
        MetaBlock: Block metadata
    """
    files = Files()
    key = compute_block_key(path, index)
    return files.get_block(key)
def test_files_exists_bad_path(monkeypatch):
    with mock.patch("socket.gethostbyname", return_value="127.0.0.1"):
        files = Files()
        error_message = "path argument must be a non empty string"
        with pytest.raises(ValueError, match=error_message):
            files.exists(None)
        with pytest.raises(ValueError, match=error_message):
            files.exists("")
        with pytest.raises(ValueError, match=error_message):
            files.exists({"a": 0})
        with pytest.raises(ValueError, match=error_message):
            files.exists([])
def find_pointing_documents(path, index):
    """
    Returns the Metadata of the documents that use a given block as a pointer.
    Args:
        path(str): Path to the file
        index(int): Index of the block in the file
    Returns:
        list(Metadata): List of documents that used the block as a pointer
    """
    block_metadata = get_block_metadata(path, index)
    files = Files()
    pointing_documents = files.get_files(block_metadata.entangled_with)
    return pointing_documents
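
# Minimal usage sketch (illustrative only): list the documents that entangled a
# given block as a pointer. The helper name show_pointing_documents is
# hypothetical and not part of the original module.
def show_pointing_documents(path, index):
    for document in find_pointing_documents(path, index):
        print(document.path)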
def reconstruct_as_pointer(path, index):
    """
    Reconstructs a block by reassembling a codeword it was part of and plucking it from the result.
    Args:
        path(str): Path to the file the block belongs to
        index(int): Index of the block
    Returns:
        bytes: The reconstructed block
    Raises:
        ValueError: If path is not a string or is empty, or if the index is negative
        RuntimeError: If the block has never been pointed at
    """
    if not isinstance(path, str) or not path:
        raise ValueError("path argument must be a non empty string")
    if not isinstance(index, int) or index < 0:
        raise ValueError("index argument must be an integer greater or equal to 0")
    metablock = get_block_metadata(path, index)
    documents = find_pointing_documents(path, index)
    if not documents:
        raise RuntimeError("Could not find any pointing document")
    with open(os.path.join(os.path.dirname(__file__), "..", "dispatcher.json"), "r") as handle:
        entanglement_configuration = json.load(handle)["entanglement"]["configuration"]
    source_blocks = entanglement_configuration["s"]
    pointer_blocks = entanglement_configuration["t"]
    offset = source_blocks + pointer_blocks
    files = Files()
    for document in documents:
        index_to_reconstruct = get_position_in_codeword(document, metablock)
        coder_client = CoderClient()
        reconstructed_block = coder_client.reconstruct(document.path, [index_to_reconstruct])[index_to_reconstruct].data
        elements = split_strip_header(reconstructed_block)
        fragment_header = elements[2]
        metadata = files.get(path)
        fragment_header.metadata.index = offset + index
        fragment_size = int(math.ceil(metadata.original_size / float(source_blocks)))
        if (fragment_size % 2) == 1:
            fragment_size += 1
        fragment_header.metadata.size = fragment_size
        fragment_header.metadata.orig_data_size = fragment_size * offset
        brand_new = json.dumps(metadata.entangling_blocks) + HEADER_DELIMITER + \
                    str(metadata.original_size) + HEADER_DELIMITER + \
                    fragment_header.pack() + \
                    get_fragment(reconstructed_block)
        brand_new = brand_new[:metablock.size]
        computed_checksum = hashlib.sha256(brand_new).digest()
        if computed_checksum == metablock.checksum:
            return brand_new
    raise RuntimeError("Could not reconstruct {:s} from pointing documents".format(metablock.key))
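
# Illustrative helper (hypothetical, not part of the original module): mirrors
# the fragment geometry arithmetic used in reconstruct_as_pointer() above so it
# can be checked in isolation. It relies on the math module already imported by
# this file. For example, with source_blocks=1, pointer_blocks=10 and an
# original size of 1000 bytes, it returns an offset of 11 and an even fragment
# size of 1000.
def example_fragment_geometry(original_size, source_blocks, pointer_blocks):
    # The reconstructed fragment is placed after the source and pointer blocks
    offset = source_blocks + pointer_blocks
    # Fragment size is the per-source-block share of the document, padded to an even number
    fragment_size = int(math.ceil(original_size / float(source_blocks)))
    if (fragment_size % 2) == 1:
        fragment_size += 1
    return offset, fragment_size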
def seed_system():
    """
    Checks if the system has enough pointers at its disposal and creates extra
    anchor files if needed.
    """
    seed_logger = logging.getLogger("seed")
    seed_logger.info("Checking if the system needs to be seeded")
    with open(os.path.join(os.path.dirname(__file__), "./dispatcher.json")) as handle:
        configuration = json.load(handle)
    if "entanglement" not in configuration or \
       "type" not in configuration["entanglement"] or \
       configuration["entanglement"]["type"] != "step":
        seed_logger.info("No need to seed the system")
        return
    files = Files(host="metadata")
    pointers_available = files.get_number_of_blocks_available()
    pointers_needed = configuration["entanglement"]["configuration"]["t"]
    difference = pointers_needed - pointers_available
    if difference <= 0:
        seed_logger.info("Seeding done")
        return
    storage = offline.Storage("/tmp/")
    parities = configuration["entanglement"]["configuration"]["p"]
    driver = pyproxy.coder.entangled_driver.StepEntangler(storage, 1, 0, parities)
    raw_block = pyproxy.coder.entangled_driver.pad("", 1024 * 1024)
    coded_strips = driver.encode(raw_block)
    seed_logger.info("Creating {:d} anchoring blocks".format(difference))
    documents_needed = int(math.ceil(float(pointers_needed) / parities))
    for index in xrange(documents_needed):
        path = "anchor-{:d}".format(index)
        strips = []
        for block_index, coded_strip in enumerate(coded_strips):
            strip = Strip()
            strip.id = "{:s}-{:02d}".format(path, block_index)
            strip.data = coded_strip
            strip.checksum = hashlib.sha256(coded_strip).digest()
            strip.type = Strip.DATA
            strips.append(strip)
        encoded_file = playcloud_pb2.File(path=path, strips=strips, original_size=len(raw_block))
        metadata = DISPATCHER.put(path, encoded_file)
        metadata.entangling_blocks = extract_entanglement_data(encoded_file.strips[0].data)
        FILES.put(path, metadata)
    seed_logger.info("Seeding done")
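
# Illustrative helper (hypothetical, not part of the original module): mirrors
# the sizing arithmetic of seed_system() above, relying on the math module
# imported by this file. For example, with 10 pointers needed, 3 available and
# 5 parities per anchor document, 2 anchor documents would be created.
def anchor_documents_needed(pointers_needed, pointers_available, parities):
    if pointers_needed - pointers_available <= 0:
        # The pool of pointer blocks is already large enough
        return 0
    # seed_system() sizes the batch on the total number of pointers needed
    return int(math.ceil(float(pointers_needed) / parities))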
def test_files_get(monkeypatch):
    with mock.patch("socket.gethostbyname", return_value="127.0.0.1"):
        files = Files()

        def mock_pipeline_get(path):
            return [{
                "path": "path",
                "original_size": "0",
                "creation_date": "2017-01-01 00:00:00.42",
                "blocks": "",
                "providers": "",
                "block_type": BlockType.DATA.name,
                "entangling_blocks": "[]"
            }]

        monkeypatch.setattr(FakeRedisPipeline, "execute", mock_pipeline_get)
        monkeypatch.setattr(files.redis, "pipeline", get_me_a_fake_redis_pipeline)
        metadata = files.get("path")
        assert metadata.path == "path"
def repair(path, indices):
    """
    Repairs one or multiple blocks of a document
    Args:
        path(str): Path to the document
        indices(list(int)): Indices of the blocks to repair
    """
    if not isinstance(path, str) or not path:
        raise ValueError("Argument path must be a non-empty string")
    if not isinstance(indices, list):
        raise ValueError("Argument indices must be a list")
    if not indices:
        return
    for index, index_to_reconstruct in enumerate(indices):
        if not isinstance(index_to_reconstruct, int):
            error_message = "indices[{:d}] is not an integer".format(index)
            raise ValueError(error_message)
    erasures_threshold = get_erasure_threshold()
    dispatcher = Dispatcher(get_dispatcher_configuration())
    files = Files()
    if len(indices) <= erasures_threshold:
        reconstructed_blocks = reconstruct_with_RS(path, indices)
        for index in reconstructed_blocks:
            reconstructed_block = reconstructed_blocks[index]
            metablock = files.get_block(compute_block_key(path, index))
            for provider_name in set(metablock.providers):
                dispatcher.providers[provider_name].put(reconstructed_block, metablock.key)
    else:
        # FIXME: choose the order of the blocks so that the number of erasures
        #        drops to the threshold, then do the RS reconstruction
        indices.sort()
        while len(indices) > erasures_threshold:
            index = indices.pop(0)
            reconstructed_block = reconstruct_as_pointer(path, index)
            metablock = files.get_block(compute_block_key(path, index))
            for provider_name in set(metablock.providers):
                dispatcher.providers[provider_name].put(reconstructed_block, metablock.key)
        reconstructed_blocks = reconstruct_with_RS(path, indices)
        for index in reconstructed_blocks:
            reconstructed_block = reconstructed_blocks[index]
            metablock = files.get_block(compute_block_key(path, index))
            for provider_name in set(metablock.providers):
                dispatcher.providers[provider_name].put(reconstructed_block, metablock.key)
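
# Illustrative planning helper (hypothetical, not part of the original module):
# mirrors how repair() splits the work when there are more erasures than the RS
# code can handle. For example, plan_repair([1, 4, 7, 9], 2) returns
# ([1, 4], [7, 9]): blocks 1 and 4 are rebuilt one by one as pointers, then the
# remaining [7, 9] fit under the threshold and go through a single RS pass.
def plan_repair(indices, erasures_threshold):
    pending = sorted(indices)
    as_pointer = []
    # Peel off the lowest indices until the rest fits under the erasure threshold
    while len(pending) > erasures_threshold:
        as_pointer.append(pending.pop(0))
    return as_pointer, pending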
def test_files_has_been_entangled_enough_raises_ValueError_if_pointers_is_lower_than_0():
    with mock.patch("socket.gethostbyname", return_value="127.0.0.1"):
        files = Files()
        with pytest.raises(ValueError, match="pointers argument must be a valid integer greater or equal to 0"):
            files.has_been_entangled_enough("path", -1)
def test_files_has_been_entangled_enough_raises_ValueError_if_block_key_is_None():
    with mock.patch("socket.gethostbyname", return_value="127.0.0.1"):
        files = Files()
        with pytest.raises(ValueError, match="path argument must be a valid non-empty string"):
            files.has_been_entangled_enough(None, 1)
def test_files_put_raises_ValueError_if_path_is_empty():
    with mock.patch("socket.gethostbyname", return_value="127.0.0.1"):
        files = Files()
        with pytest.raises(ValueError, match="path argument must be a valid non-empty string"):
            files.put("", MetaDocument("path"))
def test_files_put_raises_ValueError_if_metadata_is_None():
    with mock.patch("socket.gethostbyname", return_value="127.0.0.1"):
        files = Files()
        with pytest.raises(ValueError, match="metadata argument must be a valid Metadata object"):
            files.put("path", None)
def test_files_get_raises_keyerror_when_path_not_in_files(monkeypatch):
    with mock.patch("socket.gethostbyname", return_value="127.0.0.1"):
        files = Files()
        monkeypatch.setattr(files.redis, "pipeline", get_me_a_fake_redis_pipeline)
        with pytest.raises(KeyError, match="path NonExistingKey not found"):
            files.get("NonExistingKey")
def test_files_get_raises_valueerror_when_path_is_empty():
    with mock.patch("socket.gethostbyname", return_value="127.0.0.1"):
        files = Files()
        with pytest.raises(ValueError, match="path argument must be a valid non-empty string"):
            files.get("")
def test_files_has_been_entangled_enough_raises_KeyError_if_block_key_does_not_match_any_existing_key(monkeypatch):
    with mock.patch("socket.gethostbyname", return_value="127.0.0.1"):
        files = Files()
        monkeypatch.setattr(files.redis, "pipeline", get_me_a_fake_redis_pipeline)
        with pytest.raises(KeyError, match="key {:s} not found".format("NonExistingKey")):
            files.has_been_entangled_enough("NonExistingKey", 2)
def test_files_get_provider_with_empty_string_as_provider():
    with mock.patch("socket.gethostbyname", return_value="127.0.0.1"):
        files = Files()
        with pytest.raises(ValueError, match="provider argument must be a non empty string"):
            files.get_blocks_from_provider("")
LOG_CONFIG = os.path.join(os.path.dirname(__file__), "logging.conf")
logging.config.fileConfig(LOG_CONFIG)
LOGGER = logging.getLogger("proxy")

# GRPC setup
DEFAULT_GRPC_TIMEOUT_IN_SECONDS = 60
GRPC_MESSAGE_SIZE = (2 * 1024 * 1024 * 1024) - 1  # 2^31 - 1
GRPC_OPTIONS = [("grpc.max_receive_message_length", GRPC_MESSAGE_SIZE),
                ("grpc.max_send_message_length", GRPC_MESSAGE_SIZE)]

CODER_CLIENT = pyproxy.coder_client.LocalCoderClient()

# Loading dispatcher
DISPATCHER = pyproxy.pyproxy_globals.get_dispatcher_instance()

# Loading the Files metadata structure
FILES = Files()

# Bottle webapp configuration
bottle.BaseRequest.MEMFILE_MAX = 1024 * 1024 * 1024
APP = bottle.app()

# Setup kazoo
KAZOO = None
HOSTNAME = os.uname()[1]

# FIXME: persist metadata for empty files
EMPTY_FILES = set()


@APP.route("/<key:path>/__meta", method="GET")
def get_file_metadata(key):