def _start_consistency_check(state, collection_id, row_id=None, retry_count=0):
    """Begin (or re-begin) a consistency check for one collection.

    Records the audit in the central database, registers an in-flight
    request entry keyed by (collection_id, timestamp), and broadcasts a
    'consistency-check' message to every anti-entropy client.
    """
    log = logging.getLogger("_start_consistency_check")
    audit_timestamp = create_timestamp()
    request_key = (collection_id, audit_timestamp, )

    # row_id of None means a fresh audit; a non-None row_id means we are
    # restarting an audit row that already exists in the database
    audit_db = AuditResultDatabase(state["central-database-connection"])
    if row_id is not None:
        audit_db.restart_audit(row_id, audit_timestamp)
    else:
        row_id = audit_db.start_audit(collection_id, audit_timestamp)
    audit_db.close()

    # client_tag=None marks this as an internally started audit
    # (no external requester to reply to)
    state["active-requests"][request_key] = _request_state_tuple(
        client_tag=None,
        timestamp=audit_timestamp,
        timeout=time.time() + _request_timeout,
        retry_count=retry_count,
        replies=dict(),
        row_id=row_id,
    )

    outgoing_message = {
        "message-type"  : "consistency-check",
        "collection-id" : collection_id,
        "timestamp-repr": repr(audit_timestamp),
    }
    for client in state["anti-entropy-clients"]:
        client.queue_message_for_send(outgoing_message)
def _handle_anti_entropy_audit_request(state, message, _data):
    """Handle a request to audit a specific collection, not some random one.

    Starts an audit row in the central database, registers the in-flight
    request (remembering the requester's client-tag so a reply can be
    routed back), and broadcasts a 'consistency-check' message to every
    anti-entropy client.
    """
    log = logging.getLogger("_handle_anti_entropy_audit_request")
    audit_timestamp = create_timestamp()
    collection_id = message["collection-id"]
    request_key = (collection_id, audit_timestamp, )

    audit_db = AuditResultDatabase(state["central-database-connection"])
    audit_row_id = audit_db.start_audit(collection_id, audit_timestamp)
    audit_db.close()

    # retry_count starts at the maximum so an externally requested audit
    # is never retried automatically
    state["active-requests"][request_key] = _request_state_tuple(
        client_tag=message["client-tag"],
        timestamp=audit_timestamp,
        timeout=time.time() + _request_timeout,
        retry_count=max_retry_count,
        replies=dict(),
        row_id=audit_row_id,
    )

    outgoing_message = {
        "message-type"  : "consistency-check",
        "collection-id" : collection_id,
        "timestamp-repr": repr(audit_timestamp),
    }
    for client in state["anti-entropy-clients"]:
        client.queue_message_for_send(outgoing_message)
def _handle_consistency_check_reply(state, message, _data):
    """Accumulate one node's consistency-check reply; when every
    anti-entropy client has replied, compare the md5 digests, record the
    outcome in the audit database, schedule a retry when appropriate,
    and (when the audit was externally requested) send a reply back to
    the original requester.
    """
    log = logging.getLogger("_handle_consistency_check_reply")
    timestamp = parse_timestamp_repr(message["timestamp-repr"])
    # replies are matched to a pending audit by (collection-id, timestamp)
    state_key = (message["collection-id"], timestamp, )

    # a reply for an unknown (or already-completed) audit is logged
    # and dropped
    try:
        request_state = state["active-requests"][state_key]
    except KeyError:
        # NOTE(review): log.warn is a deprecated alias of log.warning
        log.warn("Unknown state_key %s from %s" % (
            state_key, message["node-name"]
        ))
        return

    # ignore a second reply from the same node for the same audit
    if message["node-name"] in request_state.replies:
        error_message = "duplicate reply from %s %s" % (
            message["node-name"], state_key,
        )
        log.error(error_message)
        return

    # a failed node contributes the _error_reply sentinel; a successful
    # node contributes its (count, md5 digest) pair
    if message["result"] != "success":
        log.error("%s (%s) %s from %s" % (
            state_key,
            message["result"],
            message["error-message"],
            message["node-name"],
        ))
        reply_value = _error_reply
    else:
        reply_value = (message["count"], message["encoded-md5-digest"], )
    request_state.replies[message["node-name"]] = reply_value

    # not done yet, wait for more replies
    if len(request_state.replies) < len(state["anti-entropy-clients"]):
        return

    # at this point we should have a reply from every node, so
    # we don't want to preserve state anymore
    del state["active-requests"][state_key]

    database = AuditResultDatabase(state["central-database-connection"])
    # timestamp is reused here: from this point on it is the completion
    # time recorded in the database, not the audit's start time
    timestamp = create_timestamp()

    # push the results into a dict to see how many unique entries there are;
    # keys are md5 digests (plus the _error_reply sentinel), values are the
    # lists of node names that produced each digest
    md5_digest_dict = dict()
    md5_digest_dict[_error_reply] = list()

    for node_name in request_state.replies.keys():
        node_reply = request_state.replies[node_name]
        if node_reply == _error_reply:
            md5_digest_dict[_error_reply].append(node_name)
            continue
        _count, encoded_md5_digest = node_reply
        if not encoded_md5_digest in md5_digest_dict:
            md5_digest_dict[encoded_md5_digest] = list()
        md5_digest_dict[encoded_md5_digest].append(node_name)

    # if this audit was started by an anti-entropy-audit-request message,
    # we want to send a reply
    if request_state.client_tag is not None:
        reply = {
            "message-type"  : "anti-entropy-audit-reply",
            "client-tag"    : request_state.client_tag,
            "collection-id" : message["collection-id"],
            "result"        : None,
            "error-message" : None,
        }
    else:
        reply = None

    # pull the error sentinel out so md5_digest_dict holds only real digests
    error_reply_list = md5_digest_dict.pop(_error_reply)
    if reply is not None:
        reply["error-reply-nodes"] = error_reply_list

    # more than one distinct digest means the nodes disagree
    if len(md5_digest_dict) > 1:
        log.error("found %s different hashes for (%s)" % (
            len(md5_digest_dict), message["collection-id"],
        ))
        for index, value in enumerate(md5_digest_dict.values()):
            log.info(str(value))
            if reply is not None:
                # NOTE(review): "mistmatch" (sic) — presumably part of the
                # wire protocol the requester parses; confirm before fixing
                reply["mistmatch-nodes-%s" % (index+1, )] = value

    # ok = no errors and all nodes have the same hash for every collection
    if len(error_reply_list) == 0 and len(md5_digest_dict) == 1:
        description = "collection %s compares ok" % (
            message["collection-id"],
        )
        log.info(description)
        state["event-push-client"].info(
            "audit-ok",
            description,
            collection_id=message["collection-id"]
        )
        database.successful_audit(request_state.row_id, timestamp)
        # NOTE(review): this path returns without database.close(),
        # unlike every other exit path below — looks like a leak; confirm
        if reply is not None:
            reply["result"] = "success"
            state["resilient-server"].send_reply(reply)
        return

    # we have error(s), but the non-errors compare ok
    if len(error_reply_list) > 0 and len(md5_digest_dict) == 1:
        # if we come from anti-entropy-audit-request, don't retry
        if reply is not None:
            database.audit_error(request_state.row_id, timestamp)
            database.close()
            description = "There were error replies from %s nodes" % (
                len(error_reply_list) ,
            )
            log.error(description)
            state["event-push-client"].error(
                "consistency-check-errors-replies",
                description,
                collection_id=message["collection-id"],
                error_reply_nodes=error_reply_list
            )
            reply["result"] = "error"
            reply["error-message"] = description
            state["resilient-server"].send_reply(reply)
            return
        # internally started audit: retry until the budget is exhausted
        if request_state.retry_count >= max_retry_count:
            description = "collection %s %s errors, too many retries" % (
                message["collection-id"],
                len(error_reply_list)
            )
            log.error(description)
            state["event-push-client"].error(
                "audit-errors",
                description,
                collection_id=message["collection-id"]
            )
            database.audit_error(request_state.row_id, timestamp)
            # TODO: needto do something here
        else:
            description = "%s Error replies from %s nodes, will retry" % (
                message["collection-id"],
                len(error_reply_list)
            )
            log.warn(description)
            state["event-push-client"].warn(
                "audit-retry",
                description,
                collection_id=message["collection-id"]
            )
            state["retry-list"].append(
                retry_entry_tuple(
                    retry_time=retry_time(),
                    collection_id=message["collection-id"],
                    row_id=request_state.row_id,
                    retry_count=request_state.retry_count,
                )
            )
            database.wait_for_retry(request_state.row_id)
        database.close()
        return

    # if we make it here, we have some form of mismatch, possibly mixed with
    # errors
    description = "%s error replies from %s nodes; hash mismatch(es) = %r" % (
        message["collection-id"],
        len(error_reply_list),
        md5_digest_dict.values()
    )
    log.error(description)
    state["event-push-client"].warn(
        "audit-retry",
        description,
        collection_id=message["collection-id"]
    )

    # if we come from anti-entropy-audit-request, don't retry
    if reply is not None:
        database.audit_error(request_state.row_id, timestamp)
        database.close()
        reply["result"] = "audit-error"
        reply["error-message"] = description
        state["resilient-server"].send_reply(reply)
        return

    # internally started audit with a hash mismatch: retry if budget remains
    if request_state.retry_count >= max_retry_count:
        log.error("%s too many retries" % (message["collection-id"], ))
        database.audit_error(request_state.row_id, timestamp)
        # TODO: need to do something here
    else:
        state["retry-list"].append(
            retry_entry_tuple(
                retry_time=retry_time(),
                collection_id=message["collection-id"],
                row_id=request_state.row_id,
                retry_count=request_state.retry_count,
            )
        )
        database.wait_for_retry(request_state.row_id)
    database.close()