Ejemplo n.º 1
0
async def bucketCheck(app):
    """ Verify that contents of bucket are self-consistent
    """

    now = int(time.time())
    log.info("bucket check {}".format(unixTimeToUTC(now)))

    # do initial listKeys
    await listKeys(app)

    # clear used flags
    clearUsedFlags(app)

    # mark objs
    await markObjs(app)

    unlinked_count = 0
    s3objs = app["s3objs"]
    for objid in s3objs:
        if isValidUuid(objid) and not isValidChunkId(objid):
            try:
                s3obj = await getS3Obj(app, objid)
                if s3obj.used is False:
                    unlinked_count += 1
            except HTTPInternalServerError as hpe:
                log.warn("got error retreiving {}: {}".format(objid, hpe.code))

    domains = app["domains"]
    for domain in domains:
        print("domain:", domain)
    roots = app["roots"]
    for root in roots:
        print("root:", root)

    top_level_domains = []
    for domain in domains:
        if domain[0] != '/':
            log.error("unexpected domain: {}".format(domain))
            continue
        if domain[1:].find('/') == -1:
            top_level_domains.append(domain)

    print("top-level-domains:")
    for domain in top_level_domains:
        print(domain)
    print("=" * 80)

    print("total storage: {}".format(app["bytes_in_bucket"]))
    print("Num objects: {}".format(len(app["s3objs"])))
    print("Num domains: {}".format(len(app["domains"])))
    print("Num root groups: {}".format(len(app["roots"])))
    print("Unlinked objects: {}".format(unlinked_count))
Ejemplo n.º 2
0
async def info(request):
    """HTTP Method to return node state to caller"""
    log.request(request) 
    app = request.app
    resp = StreamResponse()
    resp.headers['Content-Type'] = 'application/json'
    answer = {}
    # copy relevant entries from state dictionary to response
    answer['id'] = request.app['id']
    answer['start_time'] = unixTimeToUTC(app['start_time'])
    answer['last_health_check'] = unixTimeToUTC(app['last_health_check'])
    answer['up_time'] = elapsedTime(app['start_time'])
    answer['cluster_state'] = app['cluster_state']  
    answer['bucket_name'] = app['bucket_name']   
    answer['target_sn_count'] = getTargetNodeCount(app, "sn") 
    answer['active_sn_count'] = getActiveNodeCount(app, "sn")
    answer['target_dn_count'] = getTargetNodeCount(app, "dn") 
    answer['active_dn_count'] = getActiveNodeCount(app, "dn")

    resp = json_response(answer)
    log.response(request, resp=resp)
    return resp
Ejemplo n.º 3
0
async def healthCheck(app):
    """ Periodic method that pings each active node and verifies it is still healthy.  
    If node doesn't respond, free up the node slot (the node can re-register if it comes back)'"""

    app["last_health_check"] = int(time.time())
    
    nodes = app["nodes"]
    while True:
        # sleep for a bit
        sleep_secs = config.get("head_sleep_time")
        await  asyncio.sleep(sleep_secs)

        now = int(time.time())
        log.info("health check {}".format(unixTimeToUTC(now)))
        
        fail_count = 0
        HEALTH_CHECK_RETRY_COUNT = 1 # times to try before calling a node dead
        for node in nodes:         
            if node["host"] is None:
                fail_count += 1
                continue
            url = getUrl(node["host"], node["port"]) + "/info"  
            try:
                rsp_json = await http_get(app, url)
                if "node" not in rsp_json:
                    log.error("Unexpected response from node")
                    fail_count += 1
                    continue
                node_state = rsp_json["node"]
                node_id = node_state["id"]
                
                if node_id != node['id']:
                    log.warn("unexpected node_id: {} (expecting: {})".format(node_id, node['id']))
                    node['host'] = None
                    node['id'] = None
                    fail_count += 1
                    continue

                if 'number' in node_state and node_state['number'] != node['node_number']:
                    msg = "unexpected node_number got {} (expecting: {})"
                    log.warn(msg.format(node_state["number"], node['node_number']))
                    node['host'] = None
                    node['id'] = None
                    fail_count += 1
                    continue
                    
                # save off other useful info from the node
                app_node_stats = app["node_stats"]
                node_stats = {}
                for k in NODE_STAT_KEYS:
                    node_stats[k] = rsp_json[k]
                app_node_stats[node_id] = node_stats
                # mark the last time we got a response from this node
                node["healthcheck"] = unixTimeToUTC(int(time.time()))
                node["failcount"] = 0 # rest
            except OSError as ose:
                log.warn("OSError for req: {}: {}".format(url, str(ose)))
                # node has gone away?
                node["failcount"] += 1
                if node["failcount"] >= HEALTH_CHECK_RETRY_COUNT:
                    log.warn("node {}:{} not responding".format(node["host"], node["port"]))
                    fail_count += 1
                
            except HTTPInternalServerError as hpe:
                log.warn("HTTPInternalServerError for req: {}: {}".format(url, str(hpe)))
                # node has gone away?
                node["failcount"] += 1
                if node["failcount"] >= HEALTH_CHECK_RETRY_COUNT:
                    log.warn("removing {}:{} from active list".format(node["host"], node["port"]))
                    fail_count += 1
            except TimeoutError as toe:
                log.warn("Timeout error for req: {}: {}".format(url, str(toe)))
                # node has gone away?
                node["failcount"] += 1
                if node["failcount"] >= HEALTH_CHECK_RETRY_COUNT:
                    log.warn("removing {}:{} from active list".format(node["host"], node["port"]))
                    fail_count += 1
        log.info("node health check fail_count: {}".format(fail_count))
        if fail_count > 0:
            if app["cluster_state"] == "READY":
                # go back to INITIALIZING state until another node is registered
                log.warn("Fail_count > 0, Setting cluster_state from READY to INITIALIZING")
                app["cluster_state"] = "INITIALIZING"
        elif fail_count == 0 and app["cluster_state"] != "READY":
            log.info("All nodes healthy, changing cluster state to READY")
            app["cluster_state"] = "READY"
Ejemplo n.º 4
0
async def register(request):
    """ HTTP method for nodes to register with head node"""
    log.request(request)   
    app = request.app
    text = await request.text()
    # body = await request.json()
    body = json.loads(text)
    log.info("body: {}".format(body))
    if 'id' not in body:
        msg = "Missing 'id'"
        log.response(request, code=400, message=msg)
        raise HTTPBadRequest(reason=msg)
    if 'port' not in body:
        msg = "missing key 'port'"
        log.response(request, code=400, message=msg)
        raise HTTPBadRequest(reason=msg)
    if 'node_type' not in body:
        raise HTTPBadRequest(reason="missing key 'node_type'")
    if body['node_type'] not in ('sn', 'dn', 'an'):
        msg="invalid node_type"
        log.response(request, code=400, message=msg)
        raise HTTPBadRequest(reason=msg)
    
    peername = request.transport.get_extra_info('peername')
    if peername is None:
        raise HTTPBadRequest(reason="Can not determine caller IP")
    host, req_port = peername
    log.info("register host: {}, port: {}".format(host, req_port))

    nodes = None
    ret_node = None  

    node_ids = app['node_ids']
    if body['id'] in node_ids:
        # already registered?  
        ret_node = node_ids[body['id']] 
    else:
        nodes = app['nodes']
        for node in nodes:
            if node['host'] is None and node['node_type'] == body['node_type']:
                # found a free node
                log.info("got free node: {}".format(node))
                node['host'] = host
                node['port'] = body["port"]
                node['id'] =   body["id"]
                node["connected"] = unixTimeToUTC(int(time.time()))
                node['failcount'] = 0
                ret_node = node
                node_ids[body["id"]] = ret_node
                break

    if ret_node is None:
        log.info("no free node to assign")
 
    inactive_node_count = getInactiveNodeCount(app)
    log.info("inactive_node_count: {}".format(inactive_node_count))
    if inactive_node_count == 0:
        # all the nodes have checked in
        log.info("setting cluster state to ready")
        app['cluster_state'] = "READY"
         
    resp = StreamResponse()
    resp.headers['Content-Type'] = 'application/json'
    answer = {}
    if ret_node is not None:
        answer["node_number"] = ret_node["node_number"]
    else:
        # all nodes allocated, let caller know it's in the reserve pool
        answer["node_number"] = -1
    
    answer["node_count"] = app["target_dn_count"]
        
    resp = json_response(answer)
    log.response(request, resp=resp)
    return resp
Ejemplo n.º 5
0
async def healthCheck(app):
    """ Periodic method that pings each active node and verifies it is still healthy.  
    If node doesn't respond, free up the node slot (the node can re-register if it comes back)'"""

    app["last_health_check"] = int(time.time())

    # update/initialize root object before starting node updates
    headnode_key = getHeadNodeS3Key()
    log.info("headnode S3 key: {}".format(headnode_key))
    headnode_obj_found = False
    head_url = getUrl(app["head_host"], app["head_port"])

    nodes = app["nodes"]
    while True:
        # sleep for a bit
        sleep_secs = config.get("head_sleep_time")
        await asyncio.sleep(sleep_secs)

        now = int(time.time())
        log.info("health check {}".format(unixTimeToUTC(now)))

        if not headnode_obj_found:
            log.info("checking for headnode_key: {}".format(headnode_key))
            if await isS3Obj(app, headnode_key):
                headnode_obj_found = True
                headnode_stats = await getS3ObjStats(app, headnode_key)
                log.info("headnode_stats: {}".format(headnode_stats))
            else:
                # first time hsds has run with this bucket name?
                log.warn("need to create headnode obj")
                head_state = {}
                head_state["created"] = int(time.time())
                head_state["id"] = app["id"]
                head_state["last_health_check"] = app["last_health_check"]
                head_state["head_url"] = head_url
                log.info("write head_state to S3: {}".format(head_state))
                try:
                    await putS3JSONObj(app, headnode_key, head_state)
                except HTTPInternalServerError as hpe:
                    # Might be bad AWS config, transient S3 error, or minio not initialized yet...
                    log.warn(
                        "HTTPInternalServerError writing head_state: {}: {}".
                        format(headnode_key, str(hpe)))
            continue  # start health check on next iteration

        head_state = await getS3JSONObj(app, headnode_key)
        log.info("head_state: {}".format(head_state))
        log.info("elapsed time since last health check: {}".format(
            elapsedTime(head_state["last_health_check"])))
        if head_state['id'] != app['id']:
            log.warn("mis-match bucket head id: {}".format(head_state["id"]))
            if now - head_state["last_health_check"] < sleep_secs * 4:
                log.warn("other headnode may be active")
                continue  # skip node checks and loop around again
            else:
                log.warn(
                    "other headnode is not active, making this headnode leader"
                )
                head_state['id'] = app['id']
        else:
            log.info("head_state id matches S3 Object")

        head_state["last_health_check"] = now
        app["last_health_check"] = now
        head_state["head_url"] = head_url
        log.info("write head_state to S3: {}".format(head_state))
        await putS3JSONObj(app, headnode_key, head_state)

        log.info("putS3JSONObj complete")
        fail_count = 0
        HEALTH_CHECK_RETRY_COUNT = 1  # times to try before calling a node dead
        for node in nodes:
            if node["host"] is None:
                fail_count += 1
                continue
            url = getUrl(node["host"], node["port"]) + "/info"
            try:
                rsp_json = await http_get(app, url)
                if "node" not in rsp_json:
                    log.error("Unexpected response from node")
                    fail_count += 1
                    continue
                node_state = rsp_json["node"]
                node_id = node_state["id"]

                if node_id != node['id']:
                    log.warn("unexpected node_id: {} (expecting: {})".format(
                        node_id, node['id']))
                    node['host'] = None
                    node['id'] = None
                    fail_count += 1
                    continue

                if 'number' in node_state and node_state['number'] != node[
                        'node_number']:
                    msg = "unexpected node_number got {} (expecting: {})"
                    log.warn(
                        msg.format(node_state["number"], node['node_number']))
                    node['host'] = None
                    node['id'] = None
                    fail_count += 1
                    continue

                # save off other useful info from the node
                app_node_stats = app["node_stats"]
                node_stats = {}
                for k in NODE_STAT_KEYS:
                    node_stats[k] = rsp_json[k]
                app_node_stats[node_id] = node_stats
                # mark the last time we got a response from this node
                node["healthcheck"] = unixTimeToUTC(int(time.time()))
                node["failcount"] = 0  # rest
            except OSError as ose:
                log.warn("OSError for req: {}: {}".format(url, str(ose)))
                # node has gone away?
                node["failcount"] += 1
                if node["failcount"] >= HEALTH_CHECK_RETRY_COUNT:
                    log.warn("node {}:{} not responding".format(
                        node["host"], node["port"]))
                    fail_count += 1

            except HTTPInternalServerError as hpe:
                log.warn("HTTPInternalServerError for req: {}: {}".format(
                    url, str(hpe)))
                # node has gone away?
                node["failcount"] += 1
                if node["failcount"] >= HEALTH_CHECK_RETRY_COUNT:
                    log.warn("removing {}:{} from active list".format(
                        node["host"], node["port"]))
                    fail_count += 1
            except TimeoutError as toe:
                log.warn("Timeout error for req: {}: {}".format(url, str(toe)))
                # node has gone away?
                node["failcount"] += 1
                if node["failcount"] >= HEALTH_CHECK_RETRY_COUNT:
                    log.warn("removing {}:{} from active list".format(
                        node["host"], node["port"]))
                    fail_count += 1
        log.info("node health check fail_count: {}".format(fail_count))
        if fail_count > 0:
            if app["cluster_state"] == "READY":
                # go back to INITIALIZING state until another node is registered
                log.warn(
                    "Fail_count > 0, Setting cluster_state from READY to INITIALIZING"
                )
                app["cluster_state"] = "INITIALIZING"
        elif fail_count == 0 and app["cluster_state"] != "READY":
            log.info("All nodes healthy, changing cluster state to READY")
            app["cluster_state"] = "READY"