Example no. 1
def list_replicas_out_of_window_to_delete(window, host="metadata", port=6379):
    """
    List the documents whose replicas can be deleted.
    Args:
        window(int): The number of most recent documents whose blocks need to be replicated
        host(str): Host of the metadata server
        port(int): Port number the metadata server is listening on
    Returns:
        list(MetaBlock): The list of blocks that are considered in the old generation and can be deleted
    Raises:
        ValueError: if window is not a number greater than or equal to 0,
                    if host is not a valid string or
                    if port is not a number between 0 and 65535
    """
    if not isinstance(window, int) or window < 0:
        msg = "window must be an integer greater than 0"
        raise ValueError(msg)
    if not host or not isinstance(host, (str, unicode)):
        msg = "host argument must be a non-empty string"
        raise ValueError(msg)
    if not isinstance(port, int) or port < 0 or port > 65535:
        msg = "port argument must be an integer between 0 and 65535"
        raise ValueError(msg)
    metadata = mtdt.Files(host=host, port=port)
    blocks = metadata.get_blocks(metadata.list_blocks()[-window:])
    blocks_to_cleanup = [b for b in blocks if has_replicas(b)]
    return blocks_to_cleanup
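
The has_replicas helper is not shown in this example. A minimal sketch of what it could look like, assuming a block counts as replicated when it is hosted on more than one distinct provider (the same condition used in Example no. 2):

def has_replicas(block):
    """Return True if the block is stored on more than one distinct provider."""
    return len(set(block.providers)) > 1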
Example no. 2
def list_replicas_to_delete(pointers, host="metadata", port=6379):
    """
    List the blocks that match the given threshold
    Args:
        pointers(int): The minimum number of documents pointing to a block for
                       the block to be considered for scrubbing
        host(str, optional): Host of the metadata server
        port(int, optional): Port number the metadata server is listening on
    Returns:
        list(MetaBlock): The list of blocks that have passed the threshold
    Raises:
        ValueError: if the pointers argument is not an integer or is lower than 0
    """
    if not isinstance(pointers, int) or pointers < 0:
        raise ValueError("pointers must be an integer greater or equal to 0")
    LOGGER.debug(
        "list_replicas_to_delete: pointers={:d}, host={:s}, port={:d}".format(
            pointers, host, port))
    files = mtdt.Files(host=host, port=port)
    blocks = files.get_blocks(files.list_blocks())
    LOGGER.debug(
        "list_replicas_to_delete: loaded {:d} blocks to inspect".format(
            len(blocks)))
    consider_for_scrubbing = []
    for block in blocks:
        if files.has_been_entangled_enough(block.key, pointers) and \
           len(set(block.providers)) > 1:
            consider_for_scrubbing.append(block)
    return consider_for_scrubbing
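
A possible way to drive this function together with delete_block from Example no. 4, assuming a metadata server reachable at the default host and port and a purely illustrative threshold of 3 pointers:

if __name__ == "__main__":
    # Blocks entangled by at least 3 documents may lose their extra replicas
    candidates = list_replicas_to_delete(3)
    for block in candidates:
        if delete_block(block):
            LOGGER.info("removed extra replicas of block {:s}".format(block.key))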
Example no. 3
def write_metadata(host, port, documents):
    """
    Args:
        host(str): Host of the metadata server to write to
        port(int): Port number the metadata server is listening on
        documents(dict(str, metadata.MetaDocument)): MetaDocuments to insert
    """
    logger = logging.getLogger("REBUILD_METADATA")
    if not host or not isinstance(host, str):
        raise ValueError("argument host must be a non-empty string")
    if not isinstance(port, int) or port < 0 or port > 65535:
        raise ValueError(
            "argument port must be an integer in a range of 0 to 65535")
    if not isinstance(documents, dict):
        raise ValueError("argument documents must be a dictionary")
    metadata_server = metadata.Files(host=host, port=port)
    documents_grouped_by_pointers = group_documents_by_pointers(
        documents.values())

    logger.info(
        "Formed {:d} groups of documents based on the number of pointers".
        format(len(documents_grouped_by_pointers)))
    for number_of_pointers in sorted(documents_grouped_by_pointers):
        documents_group = documents_grouped_by_pointers[number_of_pointers]
        logger.info("Grouped {:d} documents with {:d} pointers".format(
            len(documents_group), number_of_pointers))
        while documents_group:
            last_inserts = []
            current_documents_set = set(
                [unicode(doc.path) for doc in documents_group])
            logger.info(
                "{:d} documents left in the current document set".format(
                    len(current_documents_set)))
            for document in documents_group:
                documents_pointed = set(
                    [eb[0] for eb in document.entangling_blocks])
                if pointers_and_documents_overlap(documents_pointed,
                                                  current_documents_set):
                    continue
                metadata_server.put(document.path, document)
                last_inserts.append(document)
                logger.info("inserted metadata for document {:s}".format(
                    document.path))
            for inserted in last_inserts:
                documents_group.remove(inserted)
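
Neither group_documents_by_pointers nor pointers_and_documents_overlap appears in this example. A minimal sketch of both, assuming the first groups documents by the length of their entangling_blocks list and the second is a plain set-intersection test:

def group_documents_by_pointers(documents):
    """Group documents by the number of blocks they entangle with."""
    groups = {}
    for document in documents:
        groups.setdefault(len(document.entangling_blocks), []).append(document)
    return groups


def pointers_and_documents_overlap(documents_pointed, current_documents_set):
    """Return True if the document points to a document that is not inserted yet."""
    return bool(documents_pointed & current_documents_set)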
Example no. 4
def delete_block(block, host="metadata", port=6379):
    """
    Removes location of replicas from metadata.
    Args:
        block(MetaBlock): The block whose replicas need to be destroyed
        host(str, optional): Host of the metadata server
        port(int, optional): The port the metadata server is listening on
    Returns:
        bool: Whether the block was deleted
    Raises:
        ValueError: if the block is not a MetaBlock instance
    """
    if not block or not isinstance(block, mtdt.MetaBlock):
        raise ValueError("block argument must be a valid MetaBlock")
    LOGGER.debug("delete_block: block={:s}, host={:s}, port={:d}".format(
        block.key, host, port))
    files = mtdt.Files(host=host, port=port)
    filename = dsp.extract_path_from_key(block.key)
    with open("./dispatcher.json", "r") as handle:
        dispatcher_configuration = json.load(handle)
    dispatcher = dsp.Dispatcher(configuration=dispatcher_configuration)
    hostname = os.uname()[1]
    kazoo_resource = os.path.join("/", filename)
    kazoo_identifier = "repair-{:s}".format(hostname)
    with KAZOO_CLIENT.WriteLock(kazoo_resource, kazoo_identifier):
        metadata = files.get(filename)
        for metablock in metadata.blocks:
            if metablock.key == block.key:
                # Delete every replica except the one kept on the first
                # provider, then trim the provider list in the metadata
                for provider_name in metablock.providers[1:]:
                    dispatcher.providers[provider_name].delete(metablock.key)
                    LOGGER.debug(
                        "delete_block: Removed replica of {:s} from {:s}".
                        format(metablock.key, provider_name))
                metablock.providers = metablock.providers[:1]
                break
        files.put(metadata.path, metadata)
        return len(files.get_block(block.key).providers) == 1
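
KAZOO_CLIENT is a module-level handle that the example assumes is already connected. A minimal setup sketch using the kazoo library, with the ZooKeeper address zookeeper:2181 chosen purely for illustration:

from kazoo.client import KazooClient

# ZooKeeper client used to serialize concurrent repairs of the same file
KAZOO_CLIENT = KazooClient(hosts="zookeeper:2181")
KAZOO_CLIENT.start()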
Example no. 5
def rebuild(configuration_path):
    """
    Rebuilds metadata by data from the storage nodes
    Args:
        configuration_path(str): Path to the dispatcher configuration file
    Returns:
        list(metadata.MetaDocument): The rebuilt metadata information
    """
    if not configuration_path or not isinstance(configuration_path, str):
        raise ValueError(
            "configuration_path argument must be a non-empty string")
    with open(configuration_path, "r") as handle:
        dispatcher_configuration = json.load(handle)

    metadata_server = metadata.Files()
    original_documents = metadata_server.get_files(metadata_server.keys())
    total_documents = len(original_documents)
    total_blocks = count_blocks(original_documents)
    dispatcher = d.Dispatcher(configuration=dispatcher_configuration)
    completion = {
        "documents": [0],
        "documents_availability": [0],
        "blocks": [0],
        "blocks_availability": [0]
    }
    documents = {}
    provider_names = list(dispatcher.providers.keys())
    random.shuffle(provider_names)
    sys.stderr.write("total_blocks: {:d}\n".format(total_blocks))
    available_blocks = set()
    for provider_name in provider_names:
        blocks_score = 0.0
        provider = dispatcher.providers[provider_name]
        documents = rebuild_node(provider, provider_name, documents=documents)
        completion["documents"].append(float(len(documents)) / total_documents)
        current_block_count = count_blocks(documents.values())
        sys.stderr.write("{:d} blocks read\n".format(current_block_count))
        completion["blocks"].append(float(current_block_count / total_blocks))
        for document in documents.values():
            original_blocks = {
                b.key: b
                for b in metadata_server.get(document.path).blocks
            }
            for block in document.blocks:
                score_for_block = compute_block_completion(
                    block, original_blocks[block.key])
                blocks_score += score_for_block
                if score_for_block == 1.0:
                    available_blocks.add(block.key)
        documents_availability = 0
        sys.stderr.write("There are {:d} blocks available\n".format(
            len(available_blocks)))
        for doc_key in documents:
            document = metadata_server.get(doc_key)
            if is_document_available(document, available_blocks):
                documents_availability += 1
                sys.stderr.write(
                    "Document {:s} is available\n".format(doc_key))
        completion["documents_availability"].append(
            float(documents_availability) / total_documents)
        completion["blocks_availability"].append(
            float(blocks_score) / total_blocks)
    return completion
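
The count_blocks helper is not defined in this example. A minimal sketch, assuming it simply counts the blocks referenced by a collection of documents:

def count_blocks(documents):
    """Count the blocks referenced by the given documents."""
    return sum(len(document.blocks) for document in documents)

With that in place, rebuild("./dispatcher.json") returns the completion dictionary, whose lists can be dumped as JSON or plotted to follow how document and block availability grows as providers are scanned.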