def list_replicas_out_of_window_to_delete(window, host="metadata", port=6379):
    """
    Lists the blocks whose replicas can be deleted because they fall outside
    the window of most recent documents.
    Args:
        window(int): The number of most recent documents whose blocks must
                     keep their replicas
        host(str): Host of the metadata server
        port(int): Port number the metadata server is listening on
    Returns:
        list(MetaBlock): The list of blocks that belong to the old generation
                         and whose replicas can be deleted
    Raises:
        ValueError: if window is not an integer greater than or equal to 0,
                    if host is not a non-empty string or if port is not an
                    integer between 0 and 65535
    """
    if not isinstance(window, int) or window < 0:
        raise ValueError("window must be an integer greater than or equal to 0")
    if not host or not isinstance(host, (str, unicode)):
        raise ValueError("host argument must be a non-empty string")
    if not isinstance(port, int) or port < 0 or port > 65535:
        raise ValueError("port argument must be an integer between 0 and 65535")
    metadata = mtdt.Files(host=host, port=port)
    # list_blocks is assumed to return block keys in chronological order: the
    # last `window` entries form the young generation that must keep its
    # replicas, everything before them is the old generation.
    block_keys = metadata.list_blocks()
    old_generation = block_keys[:-window] if window else block_keys
    blocks = metadata.get_blocks(old_generation)
    return [block for block in blocks if has_replicas(block)]
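
# Usage sketch (hypothetical, not part of the original module): log the blocks
# that fell out of a window of the 100 most recent documents and still carry
# replicas. Assumes a metadata server at the default host and port.
def _example_list_old_generation():
    for block in list_replicas_out_of_window_to_delete(window=100):
        LOGGER.info("replicas of block %s can be deleted", block.key)
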
def list_replicas_to_delete(pointers, host="metadata", port=6379):
    """
    Lists the blocks that match the given pointer threshold.
    Args:
        pointers(int): The minimum number of documents pointing to a block
                       for the block to be considered for scrubbing
        host(str, optional): Host of the metadata server
        port(int, optional): Port number the metadata server is listening on
    Returns:
        list(MetaBlock): The list of blocks that have passed the threshold
    Raises:
        ValueError: if the pointers argument is not an integer or is lower
                    than 0
    """
    if not isinstance(pointers, int) or pointers < 0:
        raise ValueError("pointers must be an integer greater than or equal to 0")
    LOGGER.debug(
        "list_replicas_to_delete: pointers={:d}, host={:s}, port={:d}".format(
            pointers, host, port))
    files = mtdt.Files(host=host, port=port)
    blocks = files.get_blocks(files.list_blocks())
    LOGGER.debug(
        "list_replicas_to_delete: loaded {:d} blocks to inspect".format(
            len(blocks)))
    consider_for_scrubbing = []
    for block in blocks:
        # A block qualifies once enough documents entangle it and it still
        # holds copies on more than one provider.
        if files.has_been_entangled_enough(block.key, pointers) and \
                len(set(block.providers)) > 1:
            consider_for_scrubbing.append(block)
    return consider_for_scrubbing
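
# Usage sketch (hypothetical): fetch the blocks pointed at by at least three
# documents; their replicas are candidates for scrubbing because the
# entanglement already protects them.
def _example_scrubbing_candidates():
    candidates = list_replicas_to_delete(3)
    LOGGER.info("%d blocks passed the threshold", len(candidates))
    return candidates
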
def write_metadata(host, port, documents):
    """
    Writes rebuilt metadata documents to the metadata server.
    Args:
        host(str): Host of the metadata server to write to
        port(int): Port number the metadata server is listening on
        documents(dict(str, metadata.MetaDocument)): MetaDocuments to insert
    Raises:
        ValueError: if host is not a non-empty string, if port is not an
                    integer between 0 and 65535 or if documents is not a
                    dictionary
    """
    logger = logging.getLogger("REBUILD_METADATA")
    if not host or not isinstance(host, str):
        raise ValueError("argument host must be a non-empty string")
    if not isinstance(port, int) or port < 0 or port > 65535:
        raise ValueError(
            "argument port must be an integer in the range 0 to 65535")
    if not isinstance(documents, dict):
        raise ValueError("argument documents must be a dictionary")
    metadata_server = metadata.Files(host=host, port=port)
    documents_grouped_by_pointers = group_documents_by_pointers(
        documents.values())
    logger.info(
        "Formed {:d} groups of documents based on the number of pointers"
        .format(len(documents_grouped_by_pointers)))
    for number_of_pointers in sorted(documents_grouped_by_pointers):
        documents_group = documents_grouped_by_pointers[number_of_pointers]
        logger.info("Grouped {:d} documents with {:d} pointers".format(
            len(documents_group), number_of_pointers))
        # Insert the group in sweeps: a document is only written once none of
        # the documents it entangles with is still pending in the current set.
        # This terminates as long as entanglement cannot form a cycle, i.e.
        # documents only point to previously stored documents.
        while documents_group:
            last_inserts = []
            current_documents_set = set(
                [unicode(doc.path) for doc in documents_group])
            logger.info(
                "{:d} documents left in the current document set".format(
                    len(current_documents_set)))
            for document in documents_group:
                documents_pointed = set(
                    [eb[0] for eb in document.entangling_blocks])
                if pointers_and_documents_overlap(documents_pointed,
                                                  current_documents_set):
                    continue
                metadata_server.put(document.path, document)
                last_inserts.append(document)
                logger.info("inserted metadata for document {:s}".format(
                    document.path))
            for inserted in last_inserts:
                documents_group.remove(inserted)
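
# Usage sketch (hypothetical): push a dictionary of rebuilt MetaDocuments,
# keyed by path, to a metadata server. `rebuilt_documents` stands in for the
# output of a rebuild pass; the host and port values are illustrative.
def _example_write_rebuilt_metadata(rebuilt_documents):
    write_metadata("metadata", 6379, rebuilt_documents)
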
def delete_block(block, host="metadata", port=6379):
    """
    Deletes the extra replicas of a block from the storage providers and
    removes their locations from the metadata.
    Args:
        block(MetaBlock): The block whose replicas need to be destroyed
        host(str, optional): Host of the metadata server
        port(int, optional): Port number the metadata server is listening on
    Returns:
        bool: Whether the replicas of the block were deleted
    Raises:
        ValueError: if the block argument is not a MetaBlock instance
    """
    if not block or not isinstance(block, mtdt.MetaBlock):
        raise ValueError("block argument must be a valid MetaBlock")
    LOGGER.debug("delete_block: block={:s}, host={:s}, port={:d}".format(
        block.key, host, port))
    files = mtdt.Files(host=host, port=port)
    filename = dsp.extract_path_from_key(block.key)
    with open("./dispatcher.json", "r") as handle:
        dispatcher_configuration = json.load(handle)
    dispatcher = dsp.Dispatcher(configuration=dispatcher_configuration)
    hostname = os.uname()[1]
    kazoo_resource = os.path.join("/", filename)
    kazoo_identifier = "repair-{:s}".format(hostname)
    # Take a distributed write lock on the document so that concurrent repair
    # or scrubbing jobs cannot update the same metadata entry at the same time.
    with KAZOO_CLIENT.WriteLock(kazoo_resource, kazoo_identifier):
        metadata = files.get(filename)
        for metablock in metadata.blocks:
            if metablock.key == block.key:
                # Delete the extra copies from their providers before trimming
                # the provider list down to the primary copy; trimming first
                # would leave nothing to iterate over.
                for provider_name in metablock.providers[1:]:
                    dispatcher.providers[provider_name].delete(metablock.key)
                    LOGGER.debug(
                        "delete_block: Removed replica of {:s} from {:s}"
                        .format(metablock.key, provider_name))
                metablock.providers = metablock.providers[:1]
                break
        files.put(metadata.path, metadata)
    return len(files.get_block(block.key).providers) == 1
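
# End-to-end scrubbing sketch (hypothetical): delete the replicas of every
# block that has been entangled by at least `pointers` documents. Assumes the
# same metadata server and ./dispatcher.json used by the functions above.
def _example_scrub(pointers=5):
    for block in list_replicas_to_delete(pointers):
        if delete_block(block):
            LOGGER.info("scrubbed replicas of block %s", block.key)
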
def rebuild(configuration_path):
    """
    Rebuilds the metadata from the data stored on the storage nodes.
    Args:
        configuration_path(str): Path to the dispatcher configuration file
    Returns:
        dict(str, list(float)): Completion and availability ratios for the
                                documents and blocks, extended after each
                                storage node has been processed
    Raises:
        ValueError: if configuration_path is not a non-empty string
    """
    if not configuration_path or not isinstance(configuration_path, str):
        raise ValueError(
            "configuration_path argument must be a non-empty string")
    with open(configuration_path, "r") as handle:
        dispatcher_configuration = json.load(handle)
    metadata_server = metadata.Files()
    original_documents = metadata_server.get_files(metadata_server.keys())
    total_documents = len(original_documents)
    total_blocks = count_blocks(original_documents)
    dispatcher = d.Dispatcher(configuration=dispatcher_configuration)
    completion = {
        "documents": [0],
        "documents_availability": [0],
        "blocks": [0],
        "blocks_availability": [0]
    }
    documents = {}
    # Visit the storage nodes in random order so that the completion curves do
    # not depend on the order in which providers appear in the configuration.
    provider_names = dispatcher.providers.keys()
    random.shuffle(provider_names)
    sys.stderr.write("total_blocks: {:d}\n".format(total_blocks))
    available_blocks = set()
    for provider_name in provider_names:
        blocks_score = 0.0
        provider = dispatcher.providers[provider_name]
        documents = rebuild_node(provider, provider_name, documents=documents)
        completion["documents"].append(
            float(len(documents)) / total_documents)
        current_block_count = count_blocks(documents.values())
        sys.stderr.write("{:d} blocks read\n".format(current_block_count))
        completion["blocks"].append(
            float(current_block_count) / total_blocks)
        for document in documents.values():
            original_blocks = {
                b.key: b
                for b in metadata_server.get(document.path).blocks
            }
            for block in document.blocks:
                score_for_block = compute_block_completion(
                    block, original_blocks[block.key])
                blocks_score += score_for_block
                if score_for_block == 1.0:
                    available_blocks.add(block.key)
        documents_availability = 0
        sys.stderr.write("There are {:d} blocks available\n".format(
            len(available_blocks)))
        for doc_key in documents:
            document = metadata_server.get(doc_key)
            if is_document_available(document, available_blocks):
                documents_availability += 1
                sys.stderr.write(
                    "Document {:s} is available\n".format(doc_key))
        completion["documents_availability"].append(
            float(documents_availability) / total_documents)
        completion["blocks_availability"].append(
            float(blocks_score) / total_blocks)
    return completion
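
# Usage sketch (hypothetical): run a full rebuild and print the final ratio of
# each completion metric to standard error. The configuration path is only an
# example value.
def _example_report_rebuild():
    completion = rebuild("./dispatcher.json")
    for metric in sorted(completion):
        sys.stderr.write("{:s}: {:f}\n".format(metric, completion[metric][-1]))
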