async def bucketCheck(app):
    """Verify that contents of bucket are self-consistent.

    Lists all keys, recomputes the per-object "used" flags, then prints a
    summary report (domains, roots, top-level domains, totals) to stdout.
    """
    now = int(time.time())
    log.info("bucket check {}".format(unixTimeToUTC(now)))

    # initial key listing, then reset and recompute the used flags
    await listKeys(app)
    clearUsedFlags(app)
    await markObjs(app)

    # count non-chunk uuid objects that were never marked as used
    n_unlinked = 0
    for objid in app["s3objs"]:
        if not isValidUuid(objid) or isValidChunkId(objid):
            continue
        try:
            s3obj = await getS3Obj(app, objid)
        except HTTPInternalServerError as hpe:
            log.warn("got error retreiving {}: {}".format(objid, hpe.code))
            continue
        if s3obj.used is False:
            n_unlinked += 1

    domains = app["domains"]
    for domain in domains:
        print("domain:", domain)
    for root in app["roots"]:
        print("root:", root)

    # a top-level domain is "/name" with no further '/' separators
    top_level = []
    for domain in domains:
        if domain[0] != '/':
            log.error("unexpected domain: {}".format(domain))
        elif domain[1:].find('/') == -1:
            top_level.append(domain)

    print("top-level-domains:")
    for domain in top_level:
        print(domain)

    print("=" * 80)
    print("total storage: {}".format(app["bytes_in_bucket"]))
    print("Num objects: {}".format(len(app["s3objs"])))
    print("Num domains: {}".format(len(app["domains"])))
    print("Num root groups: {}".format(len(app["roots"])))
    print("Unlinked objects: {}".format(n_unlinked))
async def info(request):
    """HTTP Method to return node state to caller.

    Returns a JSON body with this head node's id, uptime, cluster state,
    bucket name, and target/active counts for sn and dn nodes.
    """
    log.request(request)
    app = request.app

    # copy relevant entries from state dictionary to response
    answer = {}
    answer['id'] = app['id']
    answer['start_time'] = unixTimeToUTC(app['start_time'])
    answer['last_health_check'] = unixTimeToUTC(app['last_health_check'])
    answer['up_time'] = elapsedTime(app['start_time'])
    answer['cluster_state'] = app['cluster_state']
    answer['bucket_name'] = app['bucket_name']
    answer['target_sn_count'] = getTargetNodeCount(app, "sn")
    answer['active_sn_count'] = getActiveNodeCount(app, "sn")
    answer['target_dn_count'] = getTargetNodeCount(app, "dn")
    answer['active_dn_count'] = getActiveNodeCount(app, "dn")

    # json_response builds the full response (including the Content-Type
    # header); the StreamResponse previously created here was dead code
    resp = json_response(answer)
    log.response(request, resp=resp)
    return resp
async def healthCheck(app):
    """Periodically ping each active node and verify it is still healthy.

    If a node doesn't respond, free up the node slot (the node can
    re-register if it comes back).  Runs forever; intended to be
    scheduled as a background task on the event loop.
    """
    app["last_health_check"] = int(time.time())
    nodes = app["nodes"]
    while True:
        # sleep for a bit
        sleep_secs = config.get("head_sleep_time")
        await asyncio.sleep(sleep_secs)
        now = int(time.time())
        log.info("health check {}".format(unixTimeToUTC(now)))
        fail_count = 0
        HEALTH_CHECK_RETRY_COUNT = 1  # times to try before calling a node dead
        for node in nodes:
            if node["host"] is None:
                # unassigned slot: counts as a failure so the cluster
                # stays out of READY until every slot is filled
                fail_count += 1
                continue
            url = getUrl(node["host"], node["port"]) + "/info"
            try:
                rsp_json = await http_get(app, url)
                if "node" not in rsp_json:
                    log.error("Unexpected response from node")
                    fail_count += 1
                    continue
                node_state = rsp_json["node"]
                node_id = node_state["id"]
                if node_id != node['id']:
                    # node identity changed - clear the slot so the new
                    # process can re-register
                    log.warn("unexpected node_id: {} (expecting: {})".format(node_id, node['id']))
                    node['host'] = None
                    node['id'] = None
                    fail_count += 1
                    continue
                if 'number' in node_state and node_state['number'] != node['node_number']:
                    # node thinks it has a different slot number - clear it
                    msg = "unexpected node_number got {} (expecting: {})"
                    log.warn(msg.format(node_state["number"], node['node_number']))
                    node['host'] = None
                    node['id'] = None
                    fail_count += 1
                    continue
                # save off other useful info from the node
                app_node_stats = app["node_stats"]
                node_stats = {}
                for k in NODE_STAT_KEYS:
                    node_stats[k] = rsp_json[k]
                app_node_stats[node_id] = node_stats
                # mark the last time we got a response from this node
                node["healthcheck"] = unixTimeToUTC(int(time.time()))
                node["failcount"] = 0  # reset the failure counter
            except OSError as ose:
                log.warn("OSError for req: {}: {}".format(url, str(ose)))
                # node has gone away?
                node["failcount"] += 1
                if node["failcount"] >= HEALTH_CHECK_RETRY_COUNT:
                    log.warn("node {}:{} not responding".format(node["host"], node["port"]))
                    fail_count += 1
            except HTTPInternalServerError as hpe:
                log.warn("HTTPInternalServerError for req: {}: {}".format(url, str(hpe)))
                # node has gone away?
                node["failcount"] += 1
                if node["failcount"] >= HEALTH_CHECK_RETRY_COUNT:
                    log.warn("removing {}:{} from active list".format(node["host"], node["port"]))
                    fail_count += 1
            except TimeoutError as toe:
                # NOTE(review): catches the builtin TimeoutError; confirm the
                # http_get helper raises this rather than asyncio.TimeoutError
                # on the Python version in use
                log.warn("Timeout error for req: {}: {}".format(url, str(toe)))
                # node has gone away?
                node["failcount"] += 1
                if node["failcount"] >= HEALTH_CHECK_RETRY_COUNT:
                    log.warn("removing {}:{} from active list".format(node["host"], node["port"]))
                    fail_count += 1
        log.info("node health check fail_count: {}".format(fail_count))
        if fail_count > 0:
            if app["cluster_state"] == "READY":
                # go back to INITIALIZING state until another node is registered
                log.warn("Fail_count > 0, Setting cluster_state from READY to INITIALIZING")
                app["cluster_state"] = "INITIALIZING"
        elif fail_count == 0 and app["cluster_state"] != "READY":
            log.info("All nodes healthy, changing cluster state to READY")
            app["cluster_state"] = "READY"
async def register(request):
    """HTTP method for nodes to register with head node.

    Expects a JSON body with 'id', 'port', and 'node_type' keys.  Assigns
    the caller to a free node slot of the matching type (or returns the
    slot it already holds), and returns a JSON body with the assigned
    node_number (-1 if no slot is free) and the target dn node count.

    Raises:
        HTTPBadRequest: if a required key is missing/invalid or the
            caller IP can not be determined.
    """
    log.request(request)
    app = request.app
    text = await request.text()
    body = json.loads(text)
    log.info("body: {}".format(body))
    # validate required keys - log and raise 400 on any missing/invalid one
    if 'id' not in body:
        msg = "Missing 'id'"
        log.response(request, code=400, message=msg)
        raise HTTPBadRequest(reason=msg)
    if 'port' not in body:
        msg = "missing key 'port'"
        log.response(request, code=400, message=msg)
        raise HTTPBadRequest(reason=msg)
    if 'node_type' not in body:
        # now logs the 400 like the other validation branches (previously
        # this branch raised without logging)
        msg = "missing key 'node_type'"
        log.response(request, code=400, message=msg)
        raise HTTPBadRequest(reason=msg)
    if body['node_type'] not in ('sn', 'dn', 'an'):
        msg = "invalid node_type"
        log.response(request, code=400, message=msg)
        raise HTTPBadRequest(reason=msg)
    peername = request.transport.get_extra_info('peername')
    if peername is None:
        msg = "Can not determine caller IP"
        log.response(request, code=400, message=msg)
        raise HTTPBadRequest(reason=msg)
    # NOTE(review): assumes a 2-tuple (host, port) peername; IPv6 transports
    # can return a 4-tuple - confirm deployments are IPv4-only
    host, req_port = peername
    log.info("register host: {}, port: {}".format(host, req_port))

    ret_node = None
    node_ids = app['node_ids']
    if body['id'] in node_ids:
        # already registered - hand back the slot it holds
        ret_node = node_ids[body['id']]
    else:
        for node in app['nodes']:
            if node['host'] is None and node['node_type'] == body['node_type']:
                # found a free node
                log.info("got free node: {}".format(node))
                node['host'] = host
                node['port'] = body["port"]
                node['id'] = body["id"]
                node["connected"] = unixTimeToUTC(int(time.time()))
                node['failcount'] = 0
                ret_node = node
                node_ids[body["id"]] = ret_node
                break
    if ret_node is None:
        log.info("no free node to assign")

    inactive_node_count = getInactiveNodeCount(app)
    log.info("inactive_node_count: {}".format(inactive_node_count))
    if inactive_node_count == 0:
        # all the nodes have checked in
        log.info("setting cluster state to ready")
        app['cluster_state'] = "READY"

    answer = {}
    if ret_node is not None:
        answer["node_number"] = ret_node["node_number"]
    else:
        # all nodes allocated, let caller know it's in the reserve pool
        answer["node_number"] = -1
    answer["node_count"] = app["target_dn_count"]
    # json_response builds the full response (including the Content-Type
    # header); the StreamResponse previously created here was dead code
    resp = json_response(answer)
    log.response(request, resp=resp)
    return resp
async def healthCheck(app):
    """Periodically ping each active node and verify it is still healthy.

    If a node doesn't respond, free up the node slot (the node can
    re-register if it comes back).  Also maintains a head-node state
    object in S3, used to detect a competing head node running against
    the same bucket.  Runs forever; intended to be scheduled as a
    background task on the event loop.
    """
    app["last_health_check"] = int(time.time())
    # update/initialize root object before starting node updates
    headnode_key = getHeadNodeS3Key()
    log.info("headnode S3 key: {}".format(headnode_key))
    headnode_obj_found = False
    head_url = getUrl(app["head_host"], app["head_port"])
    nodes = app["nodes"]
    while True:
        # sleep for a bit
        sleep_secs = config.get("head_sleep_time")
        await asyncio.sleep(sleep_secs)
        now = int(time.time())
        log.info("health check {}".format(unixTimeToUTC(now)))
        if not headnode_obj_found:
            log.info("checking for headnode_key: {}".format(headnode_key))
            if await isS3Obj(app, headnode_key):
                headnode_obj_found = True
                headnode_stats = await getS3ObjStats(app, headnode_key)
                log.info("headnode_stats: {}".format(headnode_stats))
            else:
                # first time hsds has run with this bucket name?
                log.warn("need to create headnode obj")
                head_state = {}
                head_state["created"] = int(time.time())
                head_state["id"] = app["id"]
                head_state["last_health_check"] = app["last_health_check"]
                head_state["head_url"] = head_url
                log.info("write head_state to S3: {}".format(head_state))
                try:
                    await putS3JSONObj(app, headnode_key, head_state)
                    # note: headnode_obj_found stays False here; the next
                    # loop iteration will confirm the object via isS3Obj
                except HTTPInternalServerError as hpe:
                    # Might be bad AWS config, transient S3 error, or minio not initialized yet...
                    log.warn(
                        "HTTPInternalServerError writing head_state: {}: {}".
                        format(headnode_key, str(hpe)))
                    continue  # start health check on next iteration
        # read back the stored head state to detect a competing head node
        head_state = await getS3JSONObj(app, headnode_key)
        log.info("head_state: {}".format(head_state))
        log.info("elapsed time since last health check: {}".format(
            elapsedTime(head_state["last_health_check"])))
        if head_state['id'] != app['id']:
            log.warn("mis-match bucket head id: {}".format(head_state["id"]))
            if now - head_state["last_health_check"] < sleep_secs * 4:
                # the other head node updated the state object recently
                # enough that it may still be alive - defer to it
                log.warn("other headnode may be active")
                continue  # skip node checks and loop around again
            else:
                log.warn(
                    "other headnode is not active, making this headnode leader"
                )
                head_state['id'] = app['id']
        else:
            log.info("head_state id matches S3 Object")
        # claim/refresh leadership by re-writing the head state object
        head_state["last_health_check"] = now
        app["last_health_check"] = now
        head_state["head_url"] = head_url
        log.info("write head_state to S3: {}".format(head_state))
        await putS3JSONObj(app, headnode_key, head_state)
        log.info("putS3JSONObj complete")
        fail_count = 0
        HEALTH_CHECK_RETRY_COUNT = 1  # times to try before calling a node dead
        for node in nodes:
            if node["host"] is None:
                # unassigned slot: counts as a failure so the cluster
                # stays out of READY until every slot is filled
                fail_count += 1
                continue
            url = getUrl(node["host"], node["port"]) + "/info"
            try:
                rsp_json = await http_get(app, url)
                if "node" not in rsp_json:
                    log.error("Unexpected response from node")
                    fail_count += 1
                    continue
                node_state = rsp_json["node"]
                node_id = node_state["id"]
                if node_id != node['id']:
                    # node identity changed - clear the slot so the new
                    # process can re-register
                    log.warn("unexpected node_id: {} (expecting: {})".format(
                        node_id, node['id']))
                    node['host'] = None
                    node['id'] = None
                    fail_count += 1
                    continue
                if 'number' in node_state and node_state['number'] != node[
                        'node_number']:
                    # node thinks it has a different slot number - clear it
                    msg = "unexpected node_number got {} (expecting: {})"
                    log.warn(
                        msg.format(node_state["number"], node['node_number']))
                    node['host'] = None
                    node['id'] = None
                    fail_count += 1
                    continue
                # save off other useful info from the node
                app_node_stats = app["node_stats"]
                node_stats = {}
                for k in NODE_STAT_KEYS:
                    node_stats[k] = rsp_json[k]
                app_node_stats[node_id] = node_stats
                # mark the last time we got a response from this node
                node["healthcheck"] = unixTimeToUTC(int(time.time()))
                node["failcount"] = 0  # reset the failure counter
            except OSError as ose:
                log.warn("OSError for req: {}: {}".format(url, str(ose)))
                # node has gone away?
                node["failcount"] += 1
                if node["failcount"] >= HEALTH_CHECK_RETRY_COUNT:
                    log.warn("node {}:{} not responding".format(
                        node["host"], node["port"]))
                    fail_count += 1
            except HTTPInternalServerError as hpe:
                log.warn("HTTPInternalServerError for req: {}: {}".format(
                    url, str(hpe)))
                # node has gone away?
                node["failcount"] += 1
                if node["failcount"] >= HEALTH_CHECK_RETRY_COUNT:
                    log.warn("removing {}:{} from active list".format(
                        node["host"], node["port"]))
                    fail_count += 1
            except TimeoutError as toe:
                # NOTE(review): catches the builtin TimeoutError; confirm the
                # http_get helper raises this rather than asyncio.TimeoutError
                # on the Python version in use
                log.warn("Timeout error for req: {}: {}".format(url, str(toe)))
                # node has gone away?
                node["failcount"] += 1
                if node["failcount"] >= HEALTH_CHECK_RETRY_COUNT:
                    log.warn("removing {}:{} from active list".format(
                        node["host"], node["port"]))
                    fail_count += 1
        log.info("node health check fail_count: {}".format(fail_count))
        if fail_count > 0:
            if app["cluster_state"] == "READY":
                # go back to INITIALIZING state until another node is registered
                log.warn(
                    "Fail_count > 0, Setting cluster_state from READY to INITIALIZING"
                )
                app["cluster_state"] = "INITIALIZING"
        elif fail_count == 0 and app["cluster_state"] != "READY":
            log.info("All nodes healthy, changing cluster state to READY")
            app["cluster_state"] = "READY"