Example #1
def run(ceph_cluster, **kw):
    """
    1. Create an LRC profile and then create an EC pool with it:
            # ceph osd erasure-code-profile set $profile \
                plugin=lrc \
                k=4 m=2 l=3 \
                ruleset-failure-domain=osd
            # ceph osd pool create $poolname 1 1 erasure $profile

    2. Start writing a large object so that there is enough time to fail
       an OSD while reads and writes are still in progress on the object:
            # rados put -p lrcpool obj1 /src/path
            # rados get -p lrcpool obj1 /tmp/obj1

       While the above commands are in progress, kill the primary OSD
       serving the PG. The primary can be found with:
            # ceph pg dump

    3. Bring the primary OSD back up.

    4. Repeat step 2, but this time kill some of the secondary OSDs.

    Args:
        ceph_cluster (ceph.ceph.Ceph):
    """

    log.info("Running test CEPH-9281")
    ceph_nodes = kw.get("ceph_nodes")
    config = kw.get("config")
    build = config.get("build", config.get("rhbuild"))

    mons = []
    role = "client"

    for mnode in ceph_nodes:
        if mnode.role == role:
            mons.append(mnode)

    ctrlr = mons[0]
    log.info("chosing mon {cmon} as ctrlrmon".format(cmon=ctrlr.hostname))

    helper = RadosHelper(ctrlr, config, log)
    """ create LRC profile """
    sufix = random.randint(0, 10000)
    prof_name = "LRCprofile{suf}".format(suf=sufix)
    if build.startswith("4"):
        profile = "osd erasure-code-profile set {LRCprofile} plugin=lrc k=4 m=2 l=3 \
            crush-failure-domain=osd".format(LRCprofile=prof_name)
    else:
        profile = "osd erasure-code-profile set {LRCprofile} plugin=lrc k=4 m=2 l=3 \
            ruleset-failure-domain=osd crush-failure-domain=osd".format(
            LRCprofile=prof_name)
    try:
        (outbuf, err) = helper.raw_cluster_cmd(profile)
        log.info(outbuf)
        log.info("created profile {LRCprofile}".format(LRCprofile=prof_name))
    except Exception:
        log.error("LRC profile creation failed")
        log.error(traceback.format_exc())
        return 1
    """create LRC ec pool"""
    pool_name = "lrcpool{suf}".format(suf=sufix)
    try:
        helper.create_pool(pool_name, 1, prof_name)
        log.info("Pool {pname} created".format(pname=pool_name))
    except Exception:
        log.error("lrcpool create failed")
        log.error(traceback.format_exc())
        return 1
    """rados put and get in a parallel task"""
    with parallel() as p:
        p.spawn(do_rados_put, ctrlr, pool_name, 20)
        p.spawn(do_rados_get, ctrlr, pool_name, 10)

        for res in p:
            log.info(res)

    try:
        pri_osd_id = helper.get_pg_primary(pool_name, 0)
        log.info("PRIMARY={pri}".format(pri=pri_osd_id))
    except Exception:
        log.error("getting primary failed")
        log.error(traceback.format_exc())
        return 1

    log.info("SIGTERM osd")
    target_osd_hostname = ceph_cluster.get_osd_metadata(pri_osd_id).get(
        "hostname")
    pri_osd_node = ceph_cluster.get_node_by_hostname(target_osd_hostname)
    pri_osd_service = ceph_cluster.get_osd_service_name(pri_osd_id)
    try:
        helper.kill_osd(pri_osd_node, pri_osd_service)
        log.info("osd killed")
    except Exception:
        log.error("killing osd failed")
        log.error(traceback.format_exc())
    if not helper.wait_until_osd_state(osd_id=pri_osd_id, down=True):
        log.error("unexpected! osd is still up")
        return 1
    time.sleep(5)
    log.info("Reviving osd {osd}".format(osd=pri_osd_id))

    try:
        if helper.revive_osd(pri_osd_node, pri_osd_service):
            log.error("revive failed")
            return 1
    except Exception:
        log.error("revive failed")
        log.error(traceback.format_exc())
        return 1
    if not helper.wait_until_osd_state(pri_osd_id):
        log.error("osd is DOWN")
        return 1
    log.info(
        f"Revival of Primary OSD : {pri_osd_id} is complete\n Killing random OSD"
    )

    time.sleep(10)
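    # Repeat the kill/revive cycle, this time for a randomly chosen OSD
    # from the PG's acting set.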
    try:
        rand_osd_id = helper.get_pg_random(pool_name, 0)
        log.info("RANDOM OSD={rosd}".format(rosd=rand_osd_id))
    except Exception:
        log.error("getting  random osd failed")
        log.error(traceback.format_exc())
        return 1
    log.info("SIGTERM osd")
    target_osd_hostname = ceph_cluster.get_osd_metadata(rand_osd_id).get(
        "hostname")
    rand_osd_node = ceph_cluster.get_node_by_hostname(target_osd_hostname)
    rand_osd_service = ceph_cluster.get_osd_service_name(rand_osd_id)
    try:
        helper.kill_osd(rand_osd_node, rand_osd_service)
        log.info("osd killed")
    except Exception:
        log.error("killing osd failed")
        log.error(traceback.format_exc())
    if not helper.wait_until_osd_state(osd_id=rand_osd_id, down=True):
        log.error("unexpected! osd is still up")
        return 1
    time.sleep(5)
    log.info("Reviving osd {osd}".format(osd=rand_osd_id))
    try:
        if helper.revive_osd(rand_osd_node, rand_osd_service):
            log.error("revive failed")
            return 1
    except Exception:
        log.error("revive failed")
        log.error(traceback.format_exc())
        return 1
    if not helper.wait_until_osd_state(rand_osd_id):
        log.error("osd is DOWN")
        return 1
    log.info(f"Revival of Random OSD : {rand_osd_id} is complete")
    return 0
Example #2
def run(ceph_cluster, **kw):
    """
    1. Create an LRC profile and then create an EC pool with it:
            # ceph osd erasure-code-profile set $profile \
                plugin=lrc \
                k=4 m=2 l=3 \
                ruleset-failure-domain=osd
            # ceph osd pool create $poolname 1 1 erasure $profile

    2. Start writing a large object so that there is enough time to fail
       an OSD while reads and writes are still in progress on the object:
            # rados put -p lrcpool obj1 /src/path
            # rados get -p lrcpool obj1 /tmp/obj1

       While the above commands are in progress, kill the primary OSD
       serving the PG. The primary can be found with:
            # ceph pg dump

    3. Bring the primary OSD back up.

    4. Repeat step 2, but this time kill some of the secondary OSDs.

    Args:
        ceph_cluster (ceph.ceph.Ceph):
    """

    log.info("Running test CEPH-9281")
    ceph_nodes = kw.get('ceph_nodes')
    config = kw.get('config')

    mons = []
    role = 'client'

    for mnode in ceph_nodes:
        if mnode.role == role:
            mons.append(mnode)

    ctrlr = mons[0]
    log.info("chosing mon {cmon} as ctrlrmon".format(cmon=ctrlr.hostname))

    helper = RadosHelper(ctrlr, config, log)
    # create LRC profile
    sufix = random.randint(0, 10000)
    prof_name = "LRCprofile{suf}".format(suf=sufix)
    profile = "osd erasure-code-profile set {LRCprofile} \
        plugin=lrc\
        k=4 m=2 l=3 \
        ruleset-failure-domain=osd \
        crush-failure-domain=osd".format(LRCprofile=prof_name)
    try:
        (out, err) = helper.raw_cluster_cmd(profile)
        outbuf = out.read().decode()
        log.info(outbuf)
        log.info("created profile {LRCprofile}".format(LRCprofile=prof_name))
    except Exception:
        log.error("LRC profile creation failed")
        log.error(traceback.format_exc())
        return 1
    # create LRC ec pool
    pool_name = "lrcpool{suf}".format(suf=sufix)
    try:
        helper.create_pool(pool_name, 1, prof_name)
        log.info("Pool {pname} created".format(pname=pool_name))
    except Exception:
        log.error("lrcpool create failed")
        log.error(traceback.format_exc())
        return 1
    # rados put and get in a parallel task
    with parallel() as p:
        p.spawn(do_rados_put, ctrlr, pool_name, 20)
        p.spawn(do_rados_get, ctrlr, pool_name, 10)

        for res in p:
            log.info(res)

    try:
        pri_osd_id = helper.get_pg_primary(pool_name, 0)
        log.info("PRIMARY={pri}".format(pri=pri_osd_id))
    except Exception:
        log.error("getting primary failed")
        log.error(traceback.format_exc())
        return 1

    log.info("SIGTERM osd")
    pri_osd = ceph_cluster.get_osd_by_id(pri_osd_id)
    pri_osd_node = pri_osd.node
    pri_osd_service = ceph_cluster.get_osd_service_name(pri_osd_id)
    try:
        helper.kill_osd(pri_osd_node, pri_osd_service)
        log.info("osd killed")
    except Exception:
        log.error("killing osd failed")
        log.error(traceback.format_exc())
    time.sleep(10)
    if helper.is_up(pri_osd_id):
        log.error("unexpected! osd is still up")
        return 1
    time.sleep(5)
    log.info("Reviving osd {osd}".format(osd=pri_osd_id))

    try:
        if helper.revive_osd(pri_osd_node, pri_osd_service):
            log.error("revive failed")
            return 1
    except Exception:
        log.error("revive failed")
        log.error(traceback.format_exc())
        return 1
    time.sleep(10)
    if helper.is_up(pri_osd_id):
        log.info("osd is UP")
    else:
        log.error("osd is DOWN")
        return 1

    time.sleep(10)
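    # Repeat the kill/revive cycle, this time for a randomly chosen OSD
    # from the PG's acting set.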
    try:
        rand_osd_id = helper.get_pg_random(pool_name, 0)
        log.info("RANDOM OSD={rosd}".format(rosd=rand_osd_id))
    except Exception:
        log.error("getting  random osd failed")
        log.error(traceback.format_exc())
        return 1
    log.info("SIGTERM osd")
    rand_osd = ceph_cluster.get_osd_by_id(rand_osd_id)
    rand_osd_node = rand_osd.node
    rand_osd_service = ceph_cluster.get_osd_service_name(rand_osd_id)
    try:
        helper.kill_osd(rand_osd_node, rand_osd_service)
        log.info("osd killed")
    except Exception:
        log.error("killing osd failed")
        log.error(traceback.format_exc())
    time.sleep(10)
    if helper.is_up(rand_osd_id):
        log.error("unexpected! osd is still up")
        return 1
    time.sleep(5)
    log.info("Reviving osd {osd}".format(osd=rand_osd_id))
    try:
        if helper.revive_osd(rand_osd_node, rand_osd_service):
            log.error("revive failed")
            return 1
    except Exception:
        log.error("revive failed")
        log.error(traceback.format_exc())
        return 1
    time.sleep(30)
    if helper.is_up(rand_osd_id):
        log.info("osd is UP")
    else:
        log.error("osd is DOWN")
        return 1

    return 0