def _handle_anti_entropy_audit_request(state, message, _data):
    """handle a requst to audit a specific collection, not some random one"""
    log = logging.getLogger("_handle_anti_entropy_audit_request")

    timestamp = create_timestamp()
    state_key = (message["collection-id"], timestamp, )

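    # record the start of this audit in the central audit-result database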
    database = AuditResultDatabase(state["central-database-connection"])
    row_id = database.start_audit(message["collection-id"], timestamp)
    database.close()

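    # remember this request so consistency-check replies can be matched to it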
    state["active-requests"][state_key] = _request_state_tuple(
        client_tag=message["client-tag"],
        timestamp=timestamp,
        timeout=time.time()+_request_timeout,
        retry_count=max_retry_count,
        replies=dict(), 
        row_id=row_id,
    )

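    # broadcast a consistency-check request to every anti-entropy client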
    request = {
        "message-type"  : "consistency-check",
        "collection-id"     : message["collection-id"],
        "timestamp-repr": repr(timestamp),
    }
    for anti_entropy_client in state["anti-entropy-clients"]:
        anti_entropy_client.queue_message_for_send(request)

    def run(self, halt_event):
        """pick a collection and start a new consistency check"""
        if halt_event.is_set():
            self._log.info("halt-event is set: exiting")
            return
                
        cutoff_timestamp = \
            datetime.datetime.now() - \
            datetime.timedelta(days=_audit_cutoff_days)
        database = AuditResultDatabase(
            self._state["central-database-connection"]
        )
        ineligible_collection_ids = set(
            database.ineligible_collection_ids(cutoff_timestamp)
        )
        eligible_collection_ids = self._state["collection-ids"] \
                            - ineligible_collection_ids

        self._log.info(
            "found %s collections eligible for consistency check" % (
                len(eligible_collection_ids),
            )
        )

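        # start checks on randomly chosen collections up to _max_active_checks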
        eligible_collection_id_list = list(eligible_collection_ids)
        while len(eligible_collection_id_list) > 0 \
        and len(self._state["active-requests"]) < _max_active_checks:
            collection_id = random.choice(eligible_collection_id_list)
            eligible_collection_id_list.remove(collection_id)
            self._start_consistency_check(self._state, collection_id)

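        # return this task and the time of its next run so it gets rescheduled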
        return [(self.run, self.next_run(), )]

def _start_consistency_check(state, collection_id, row_id=None, retry_count=0):
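    """start a consistency check of a single collection

    if row_id is None a new audit row is created in the central database;
    otherwise the existing audit identified by row_id is restarted
    """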
    log = logging.getLogger("_start_consistency_check")

    timestamp = create_timestamp()
    state_key = (collection_id, timestamp, )

    database = AuditResultDatabase(state["central-database-connection"])
    if row_id is None:
        row_id = database.start_audit(collection_id, timestamp)
    else:
        database.restart_audit(row_id, timestamp)
    database.close()

    state["active-requests"][state_key] = _request_state_tuple(
        client_tag=None,
        timestamp=timestamp,
        timeout=time.time()+_request_timeout,
        retry_count=retry_count,
        replies=dict(), 
        row_id=row_id,
    )

    request = {
        "message-type"  : "consistency-check",
        "collection-id" : collection_id,
        "timestamp-repr": repr(timestamp),
    }
    for anti_entropy_client in state["anti-entropy-clients"]:
        anti_entropy_client.queue_message_for_send(request)

def _handle_consistency_check_reply(state, message, _data):
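    """handle a consistency-check reply from one anti-entropy client

    when every client has replied, group the replies by md5 digest and
    record success, retry, or error in the audit-result database
    """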
    log = logging.getLogger("_handle_consistency_check_reply")
    
    timestamp = parse_timestamp_repr(message["timestamp-repr"])
    state_key = (message["collection-id"], timestamp, )

    try:
        request_state = state["active-requests"][state_key]
    except KeyError:
        log.warn("Unknown state_key %s from %s" % (
            state_key, message["node-name"]
        ))
        return

    if message["node-name"] in request_state.replies:
        error_message = "duplicate reply from %s %s" % (
            message["node-name"],
            state_key, 
        )
        log.error(error_message)
        return

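    # a failed reply is recorded as the _error_reply sentinel; a successful
    # reply as (count, md5-digest)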
    if message["result"] != "success":
        log.error("%s (%s) %s from %s" % (
            state_key, 
            message["result"],
            message["error-message"],
            message["node-name"],
        ))
        reply_value = _error_reply
    else:
        reply_value = (message["count"], message["encoded-md5-digest"], )

    request_state.replies[message["node-name"]] = reply_value

    # not done yet, wait for more replies
    if len(request_state.replies) < len(state["anti-entropy-clients"]):
        return

    # at this point we should have a reply from every node, so
    # we don't want to preserve state anymore
    del state["active-requests"][state_key]
    database = AuditResultDatabase(state["central-database-connection"])
    timestamp = create_timestamp()
    
    # push the results into a dict to see how many unique entries there are
    md5_digest_dict = dict()
    md5_digest_dict[_error_reply] = list()

    for node_name, node_reply in request_state.replies.items():
        if node_reply == _error_reply:
            md5_digest_dict[_error_reply].append(node_name)
            continue

        _count, encoded_md5_digest = node_reply
        if encoded_md5_digest not in md5_digest_dict:
            md5_digest_dict[encoded_md5_digest] = list()
        md5_digest_dict[encoded_md5_digest].append(node_name)

    # if this audit was started by an anti-entropy-audit-request message,
    # we want to send a reply
    if request_state.client_tag is not None:
        reply = {
            "message-type"  : "anti-entropy-audit-reply",
            "client-tag"    : request_state.client_tag,
            "collection-id" : message["collection-id"],
            "result"        : None,
            "error-message" : None,
        }
    else:
        reply = None

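    # pull the error replies out so md5_digest_dict holds only real digests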
    error_reply_list = md5_digest_dict.pop(_error_reply)
    if reply is not None:
        reply["error-reply-nodes"] = error_reply_list


    if len(md5_digest_dict) > 1:
        log.error("found %s different hashes for (%s)" % (
            len(md5_digest_dict), 
            message["collection-id"],
        ))
        for index, value in enumerate(md5_digest_dict.values()):
            log.info(str(value))
            if reply is not None:
                reply["mistmatch-nodes-%s" % (index+1, )] = value
        
    # ok: no errors and every node reported the same hash for this collection
    if len(error_reply_list) == 0 and len(md5_digest_dict) == 1:
        description = "collection %s compares ok" % (
            message["collection-id"], 
        )
        log.info(description)
        state["event-push-client"].info(
            "audit-ok", description, collection_id=message["collection-id"]
        )  
        database.successful_audit(request_state.row_id, timestamp)
        database.close()
        if reply is not None:
            reply["result"] = "success"
            state["resilient-server"].send_reply(reply)
        return

    # we have error(s), but the non-errors compare ok
    if len(error_reply_list) > 0 and len(md5_digest_dict) == 1:

        # if this audit came from an anti-entropy-audit-request, don't retry
        if reply is not None:
            database.audit_error(request_state.row_id, timestamp)
            database.close()
            description = "There were error replies from %s nodes" % (
                len(error_reply_list) , 
            )
            log.error(description)
            state["event-push-client"].error(
                "consistency-check-errors-replies", 
                description, 
                collection_id=message["collection-id"],
                error_reply_nodes=error_reply_list
            )  
            reply["result"] = "error"
            reply["error-message"] = description
            state["resilient-server"].send_reply(reply)
            return
        
        if request_state.retry_count >= max_retry_count:
            description = "collection %s %s errors, too many retries" % (
                message["collection-id"], 
                len(error_reply_list) 
            )
            log.error(description)
            state["event-push-client"].error(
                "audit-errors", 
                description, 
                collection_id=message["collection-id"]
            )  
            database.audit_error(request_state.row_id, timestamp)
            # TODO: need to do something here
        else:
            description = "%s Error replies from %s nodes, will retry" % (
                message["collection-id"], 
                len(error_reply_list) 
            )
            log.warn(description)
            state["event-push-client"].warn(
                "audit-retry", 
                description, 
                collection_id=message["collection-id"]
            )  
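            # queue this collection for retry and mark the audit as waiting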
            state["retry-list"].append(
                retry_entry_tuple(
                    retry_time=retry_time(), 
                    collection_id=message["collection-id"],
                    row_id=request_state.row_id,
                    retry_count=request_state.retry_count, 
                )
            )
            database.wait_for_retry(request_state.row_id)
        database.close()
        return

    # if we make it here, we have some form of mismatch, possibly mixed with
    # errors
    description = "%s error replies from %s nodes; hash mismatch(es) = %r" % (
        message["collection-id"], 
        len(error_reply_list),
        md5_digest_dict.values()
    )
    log.error(description)
    state["event-push-client"].warn(
        "audit-retry", 
        description, 
        collection_id=message["collection-id"]
    )  

    # if this audit came from an anti-entropy-audit-request, don't retry
    if reply is not None:
        database.audit_error(request_state.row_id, timestamp)
        database.close()
        reply["result"] = "audit-error"
        reply["error-message"] = description
        state["resilient-server"].send_reply(reply)
        return

    if request_state.retry_count >= max_retry_count:
        log.error("%s too many retries" % (message["collection-id"], ))
        database.audit_error(request_state.row_id, timestamp)
        # TODO: need to do something here
    else:
        state["retry-list"].append(
            retry_entry_tuple(
                retry_time=retry_time(), 
                collection_id=message["collection-id"],
                row_id=request_state.row_id,
                retry_count=request_state.retry_count, 
            )
        )
        database.wait_for_retry(request_state.row_id)

    database.close()
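
# ---------------------------------------------------------------------------
# For reference: a minimal sketch of the module-level helpers the functions
# above assume. The tuple field names are taken from how the tuples are
# constructed here; the constant values and the retry_time() body are
# illustrative assumptions, not the module's real settings.
# ---------------------------------------------------------------------------
from collections import namedtuple

_request_state_tuple = namedtuple("RequestState",
    ["client_tag", "timestamp", "timeout", "retry_count", "replies", "row_id"]
)
retry_entry_tuple = namedtuple("RetryEntry",
    ["retry_time", "collection_id", "row_id", "retry_count"]
)

_error_reply = (None, None, )     # assumed sentinel stored for a failed reply
_request_timeout = 5.0 * 60.0     # assumed: seconds to wait for replies
max_retry_count = 2               # assumed: retries before giving up
_max_active_checks = 100          # assumed: concurrent consistency checks
_audit_cutoff_days = 14           # assumed: skip recently audited collections

def retry_time():
    """assumed helper: the time at which a queued retry becomes eligible"""
    return time.time() + _request_timeout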