Code example #1
File: test_9281.py  Project: udaysk23/cephci
def run(ceph_cluster, **kw):
    """
     1. Create an LRC profile and then create an ec pool
            #ceph osd erasure-code-profile set $profile \
            plugin=lrc \
            k=4 m=2 l=3 \
            ruleset-failure-domain=osd
             # ceph osd pool create $poolname 1 1  erasure $profile

    2. Start writing a large object so that we get
            some time to fail the osd while the reads and writes are
            in progress on an object

    # rados put -p lrcpool obj1 /src/path
    #rados get -p lrcpool obj1 /tmp/obj1

    While the above command is in progress, kill the primary
    osd responsible for the PG.
    The primary can be found from
    # ceph pg dump

    3. Bring back the primary

    4. Repeat step 2, but this time kill some secondary osds

    Args:
        ceph_cluster (ceph.ceph.Ceph):
    """

    log.info("Running test CEPH-9281")
    ceph_nodes = kw.get("ceph_nodes")
    config = kw.get("config")
    build = config.get("build", config.get("rhbuild"))

    mons = []
    role = "client"

    for mnode in ceph_nodes:
        if mnode.role == role:
            mons.append(mnode)

    ctrlr = mons[0]
    log.info("chosing mon {cmon} as ctrlrmon".format(cmon=ctrlr.hostname))

    helper = RadosHelper(ctrlr, config, log)
    """ create LRC profile """
    sufix = random.randint(0, 10000)
    prof_name = "LRCprofile{suf}".format(suf=sufix)
    if build.startswith("4"):
        profile = "osd erasure-code-profile set {LRCprofile} plugin=lrc k=4 m=2 l=3 \
            crush-failure-domain=osd".format(LRCprofile=prof_name)
    else:
        profile = "osd erasure-code-profile set {LRCprofile} plugin=lrc k=4 m=2 l=3 \
            ruleset-failure-domain=osd crush-failure-domain=osd".format(
            LRCprofile=prof_name)
    try:
        (outbuf, err) = helper.raw_cluster_cmd(profile)
        log.info(outbuf)
        log.info("created profile {LRCprofile}".format(LRCprofile=prof_name))
    except Exception:
        log.error("LRC profile creation failed")
        log.error(traceback.format_exc())
        return 1
    """create LRC ec pool"""
    pool_name = "lrcpool{suf}".format(suf=sufix)
    try:
        helper.create_pool(pool_name, 1, prof_name)
        log.info("Pool {pname} created".format(pname=pool_name))
    except Exception:
        log.error("lrcpool create failed")
        log.error(traceback.format_exc())
        return 1
    """rados put and get in a parallel task"""
    with parallel() as p:
        p.spawn(do_rados_put, ctrlr, pool_name, 20)
        p.spawn(do_rados_get, ctrlr, pool_name, 10)

        for res in p:
            log.info(res)

    try:
        pri_osd_id = helper.get_pg_primary(pool_name, 0)
        log.info("PRIMARY={pri}".format(pri=pri_osd_id))
    except Exception:
        log.error("getting primary failed")
        log.error(traceback.format_exc())
        return 1

    log.info("SIGTERM osd")
    target_osd_hostname = ceph_cluster.get_osd_metadata(pri_osd_id).get(
        "hostname")
    pri_osd_node = ceph_cluster.get_node_by_hostname(target_osd_hostname)
    pri_osd_service = ceph_cluster.get_osd_service_name(pri_osd_id)
    try:
        helper.kill_osd(pri_osd_node, pri_osd_service)
        log.info("osd killed")
    except Exception:
        log.error("killing osd failed")
        log.error(traceback.format_exc())
    if not helper.wait_until_osd_state(osd_id=pri_osd_id, down=True):
        log.error("unexpected! osd is still up")
        return 1
    time.sleep(5)
    log.info("Reviving osd {osd}".format(osd=pri_osd_id))

    try:
        if helper.revive_osd(pri_osd_node, pri_osd_service):
            log.error("revive failed")
            return 1
    except Exception:
        log.error("revive failed")
        log.error(traceback.format_exc())
        return 1
    if not helper.wait_until_osd_state(pri_osd_id):
        log.error("osd is DOWN")
        return 1
    log.info(
        f"Revival of Primary OSD : {pri_osd_id} is complete\n Killing random OSD"
    )

    time.sleep(10)
    try:
        rand_osd_id = helper.get_pg_random(pool_name, 0)
        log.info("RANDOM OSD={rosd}".format(rosd=rand_osd_id))
    except Exception:
        log.error("getting  random osd failed")
        log.error(traceback.format_exc())
        return 1
    log.info("SIGTERM osd")
    target_osd_hostname = ceph_cluster.get_osd_metadata(rand_osd_id).get(
        "hostname")
    rand_osd_node = ceph_cluster.get_node_by_hostname(target_osd_hostname)
    rand_osd_service = ceph_cluster.get_osd_service_name(rand_osd_id)
    try:
        helper.kill_osd(rand_osd_node, rand_osd_service)
        log.info("osd killed")
    except Exception:
        log.error("killing osd failed")
        log.error(traceback.format_exc())
    if not helper.wait_until_osd_state(osd_id=rand_osd_id, down=True):
        log.error("unexpected! osd is still up")
        return 1
    time.sleep(5)
    log.info("Reviving osd {osd}".format(osd=rand_osd_id))
    try:
        if helper.revive_osd(rand_osd_node, rand_osd_service):
            log.error("revive failed")
            return 1
    except Exception:
        log.error("revive failed")
        log.error(traceback.format_exc())
        return 1
    if not helper.wait_until_osd_state(rand_osd_id):
        log.error("osd is DOWN")
        return 1
    log.info(f"Revival of Random OSD : {rand_osd_id} is complete")
    return 0
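
Note on the setup described in the docstring above: the test relies on an LRC profile and an erasure-coded pool created through the ceph CLI before OSD failures are exercised. A minimal standalone sketch of that setup, outside the cephci framework, might look like the following (it assumes a `ceph` CLI with admin credentials on the node; the profile and pool names are illustrative only):

import subprocess

def create_lrc_pool(profile="LRCprofile-demo", pool="lrcpool-demo"):
    """Create an LRC erasure-code profile and an EC pool that uses it."""
    # plugin=lrc with k=4 m=2 l=3 mirrors the profile used in the test above
    subprocess.run(
        ["ceph", "osd", "erasure-code-profile", "set", profile,
         "plugin=lrc", "k=4", "m=2", "l=3", "crush-failure-domain=osd"],
        check=True,
    )
    # pg_num=1, pgp_num=1, erasure pool backed by the profile
    subprocess.run(
        ["ceph", "osd", "pool", "create", pool, "1", "1", "erasure", profile],
        check=True,
    )

if __name__ == "__main__":
    create_lrc_pool()
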
Code example #2
def run(ceph_cluster, **kw):
    """
     CEPH-9311 - RADOS: Pyramid erasure codes (Local Repairable erasure codes):
     Bring down 2 osds (in case of k=4) from 2 localities so that recovery happens from local repair code

     1. Create an LRC profile and then create an ec pool
     #ceph osd erasure-code-profile set $profile \
        plugin=lrc \
        k=4 m=2 l=3 \
        ruleset-failure-domain=osd
     # ceph osd pool create $poolname 1 1  erasure $profile

    2. start writing objects to the pool

    # rados -p poolname bench 1000 write --no-cleanup

    3. Bring down 2 osds from 2 different localities which contain a data
    chunk (for this we need to figure out the mapping). For example, with
    k=4, m=2, l=3 the mapping looks like:
    chunk nr    01234567
    step 1      _cDD_cDD    (here DD are data chunks)
    step 2      cDDD____
    step 3      ____cDDD

    From "step 1" in the above mapping we can see that the
    data chunks are divided into 2 localities, which is
    analogous to 2 data centers. So in our case, for example,
    we have to bring down (3,7) OR (2,7) OR (2,6) OR (3,6).

    Args:
        ceph_cluster (ceph.ceph.Ceph): ceph cluster
    """

    log.info("Running test ceph-9311")
    ceph_nodes = kw.get('ceph_nodes')
    config = kw.get('config')

    mons = []
    role = 'client'

    for mnode in ceph_nodes:
        if mnode.role == role:
            mons.append(mnode)

    ctrlr = mons[0]
    log.info("chosing mon {cmon} as ctrlrmon".format(cmon=ctrlr.hostname))

    helper = RadosHelper(ctrlr, config, log)
    '''Create an LRC profile'''
    sufix = random.randint(0, 10000)
    prof_name = "LRCprofile{suf}".format(suf=sufix)
    profile = "osd erasure-code-profile set {LRCprofile} \
            plugin=lrc\
            k=4 m=2 l=3 \
            ruleset-failure-domain=osd \
            crush-failure-domain=osd".format(LRCprofile=prof_name)
    try:
        (out, err) = helper.raw_cluster_cmd(profile)
        outbuf = out.read().decode()
        log.info(outbuf)
        log.info("created profile {LRCprofile}".format(LRCprofile=prof_name))
    except Exception:
        log.error("LRC profile creation failed")
        log.error(traceback.format_exc())
        return 1
    '''create LRC ec pool'''
    pool_name = "lrcpool{suf}".format(suf=sufix)
    try:
        helper.create_pool(pool_name, 1, prof_name)
        log.info("Pool {pname} created".format(pname=pool_name))
    except Exception:
        log.error("lrcpool create failed")
        log.error(traceback.format_exc())
        return 1
    ''' Bring down 2 osds which contain a 'D' from both localities;
        we will be choosing the osds at positions 2 and 7 from the given active set list
    '''
    oname = "UNIQUEOBJECT{i}".format(i=random.randint(0, 10000))
    cmd = "osd map {pname} {obj} --format json".format(pname=pool_name,
                                                       obj=oname)
    (out, err) = helper.raw_cluster_cmd(cmd)
    outbuf = out.read().decode()
    log.info(outbuf)
    cmdout = json.loads(outbuf)
    # targt_pg = cmdout['pgid']
    target_osds_ids = []
    for i in [2, 7]:
        target_osds_ids.append(cmdout['up'][i])

    # putobj = "sudo rados -p {pool} put {obj} {path}".format(
    #     pool=pool_name, obj=oname, path="/etc/hosts"
    # )
    for i in range(10):
        putobj = "sudo rados -p {pool} put {obj} {path}".format(
            pool=pool_name,
            obj="{oname}{i}".format(oname=oname, i=i),
            path="/etc/hosts")
        (out, err) = ctrlr.exec_command(cmd=putobj)
    '''Bring down the target osds'''
    osd_service_map_list = []
    for osd_id in target_osds_ids:
        target_osd_node = ceph_cluster.get_osd_by_id(osd_id).node
        osd_service = ceph_cluster.get_osd_service_name(osd_id)
        osd_service_map_list.append({
            'osd_node': target_osd_node,
            'osd_service': osd_service
        })
        helper.kill_osd(target_osd_node, osd_service)
        time.sleep(5)

        outbuf = "degrade"
        timeout = 10
        found = 0
        status = '-s --format json'
        while timeout:
            if 'active' not in outbuf:
                (out, err) = helper.raw_cluster_cmd(status)
                outbuf = out.read().decode()
                time.sleep(1)
                timeout = timeout - 1
            else:
                found = 1
                break
        if timeout == 0 and found == 0:
            log.error("cluster didn't become active+clean..timeout")
            return 1
    '''check whether read/write can be done on the pool'''
    for i in range(10):
        putobj = "sudo rados -p {pool} put {obj} {path}".format(
            pool=pool_name,
            obj="{oname}{i}".format(oname=oname, i=i),
            path="/etc/hosts")
        (out, err) = ctrlr.exec_command(cmd=putobj)
        log.info(out.read().decode())
    for i in range(10):
        putobj = "sudo rados -p {pool} get {obj} {path}".format(
            pool=pool_name,
            obj="{oname}{i}".format(oname=oname, i=i),
            path="/tmp/{obj}{i}".format(obj=oname, i=i))
        (out, err) = ctrlr.exec_command(cmd=getobj)
        log.info(out.read().decode())
    '''done with the test, revive osds'''
    for osd_service_map in osd_service_map_list:
        helper.revive_osd(osd_service_map.get('osd_node'),
                          osd_service_map.get('osd_service'))

    return 0
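
The docstring above reasons from the k=4, m=2, l=3 LRC mapping ("_cDD_cDD" in step 1) to the pairs of chunk positions that sit in different localities, which is why the test picks indices 2 and 7 from the acting set. A small self-contained sketch that derives those pairs from the mapping string (pure computation, no cluster access; the mapping row and the group size are taken from the docstring):

from itertools import product

MAPPING = "_cDD_cDD"   # step-1 row of the k=4 m=2 l=3 layout quoted above
GROUP = 4              # steps 2 and 3 split the 8 slots into two groups of 4

# data-chunk positions ('D') in each locality
loc1 = [i for i, c in enumerate(MAPPING[:GROUP]) if c == "D"]
loc2 = [i + GROUP for i, c in enumerate(MAPPING[GROUP:]) if c == "D"]

# every pair that spans both localities
print(list(product(loc1, loc2)))   # [(2, 6), (2, 7), (3, 6), (3, 7)]
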
Code example #3
File: test_9928.py  Project: rpratap-bot/cephci
def run(ceph_cluster, **kw):
    """
    CEPH-9928 RADOS:
    Corrupt snap info of an object and run
    list-inconsistent-snapset
    Steps:
        1. create a replica 3 pool
        2. take a few pool snaps with writes on objects between every snap
        3. choose the primary osd and bring it down
        4. go to the backend and, using ceph-objectstore-tool, corrupt the
           snapset of the object
        5. run deep-scrub on the pg
        6. check rados list-inconsistent-pg <pool>
        7. rados list-inconsistent-snapset <pg>

    Args:
        ceph_cluster (ceph.ceph.Ceph):
    """
    log.info("Running CEPH-9928")
    log.info(run.__doc__)
    ceph_nodes = kw.get('ceph_nodes')
    config = kw.get('config')
    mons = []

    role = 'client'
    for mnode in ceph_nodes:
        if mnode.role == role:
            mons.append(mnode)

    ctrlr = mons[0]
    log.info("chosing mon {cmon} as ctrlrmon".format(cmon=ctrlr.hostname))
    helper = RadosHelper(ctrlr, config, log)
    """create a replica pool"""
    pname = "snapcorrupt_{rand}".format(rand=random.randint(0, 10000))
    try:
        helper.create_pool(pname, 1)
        log.info("Pool {pname} created".format(pname=pname))
    except Exception:
        log.error("Failed to create pool")
        log.error(traceback.format_exc())
        return 1
    time.sleep(5)
    """Get the target PG,osd for corruption operation"""
    oname = "UNIQUEOBJECT{i}".format(i=random.randint(0, 10000))
    cmd = "osd map {pname} {obj} --format json".format(pname=pname, obj=oname)
    (out, err) = helper.raw_cluster_cmd(cmd)
    outbuf = out.read().decode()
    log.info(outbuf)
    cmdout = json.loads(outbuf)
    targt_pg = cmdout['pgid']
    targt_osd_id = cmdout['up'][0]
    '''write data and take snaps'''
    putobj = "sudo rados -p {pool} put {obj} {path}".format(pool=pname,
                                                            obj=oname,
                                                            path="/etc/hosts")
    for i in range(10):
        (out, err) = ctrlr.exec_command(cmd=putobj)
        snapcmd = "sudo rados mksnap -p {pool} {sname}".format(pool=pname,
                                                               sname="snap" +
                                                               str(i))
        (out, err) = ctrlr.exec_command(cmd=snapcmd)
        log.info("put {obj}, snap {snap}".format(obj=oname,
                                                 snap="snap" + str(i)))
    '''
    Goto destination osd, stop the osd
    use ceph-objectstore-tool to corrupt
    snap info
    '''
    #    target_osd = ceph_cluster.get_osd_by_id(targt_osd_id)
    #    target_osd_node = target_osd.node
    target_osd_hostname = ceph_cluster.get_osd_metadata(targt_osd_id).get(
        'hostname')
    log.info(target_osd_hostname)
    target_osd_node = ceph_cluster.get_node_by_hostname(target_osd_hostname)
    cot_environment = target_osd_node
    osd_service = ceph_cluster.get_osd_service_name(targt_osd_id)
    partition_path = ceph_cluster.get_osd_metadata(targt_osd_id).get(
        'osd_data')
    helper.kill_osd(target_osd_node, osd_service)
    time.sleep(10)
    osd_metadata = ceph_cluster.get_osd_metadata(targt_osd_id)
    osd_data = osd_metadata.get('osd_data')
    osd_journal = osd_metadata.get('osd_journal')
    if ceph_cluster.containerized:
        docker_image_string = '{docker_registry}/{docker_image}:{docker_tag}'.format(
            docker_registry=ceph_cluster.ansible_config.get(
                'ceph_docker_registry'),
            docker_image=ceph_cluster.ansible_config.get('ceph_docker_image'),
            docker_tag=ceph_cluster.ansible_config.get(
                'ceph_docker_image_tag'))
        cot_environment = helper.get_mgr_proxy_container(
            target_osd_node, docker_image_string)
        out, err = cot_environment.exec_command(
            cmd='mount | grep "{partition_path} "'.format(
                partition_path=partition_path),
            check_ec=False)
        device_mount_data = out.read().decode()  # type: str
        if not device_mount_data:
            cot_environment.exec_command(
                cmd='sudo mount {partition_path} {directory}'.format(
                    partition_path=partition_path, directory=osd_data))

    slist_cmd = "sudo ceph-objectstore-tool --data-path \
            {osd_data} --journal-path \
            {osd_journal} \
            --head --op list {obj}".format(osd_data=osd_data,
                                           osd_journal=osd_journal,
                                           obj=oname)
    (out, err) = cot_environment.exec_command(cmd=slist_cmd)
    outbuf = out.read().decode()
    log.info(outbuf)

    corrupt_cmd = "sudo ceph-objectstore-tool --data-path \
            {osd_data} --journal-path \
            {osd_journal} \
            {outbuf} clear-snapset \
            corrupt".format(osd_data=osd_data,
                            osd_journal=osd_journal,
                            outbuf="'" + (outbuf) + "'")
    (out, err) = cot_environment.exec_command(cmd=corrupt_cmd)
    outbuf = out.read().decode()
    log.info(outbuf)

    helper.revive_osd(target_osd_node, osd_service)
    time.sleep(10)
    run_scrub = "pg deep-scrub {pgid}".format(pgid=targt_pg)
    (out, err) = helper.raw_cluster_cmd(run_scrub)
    outbuf = out.read().decode()
    log.info(outbuf)

    while 'HEALTH_ERR' not in outbuf or 'active+clean+inconsistent' not in outbuf:
        status = "-s --format json"
        (out, err) = helper.raw_cluster_cmd(status)
        outbuf = out.read().decode()
    log.info("HEALTH_ERR found as expected")
    log.info("inconsistent foud as expected")

    timeout = 300
    found = 0
    while timeout:
        incon_pg = "sudo rados list-inconsistent-pg \
                    {pname}".format(pname=pname)
        (out, err) = ctrlr.exec_command(cmd=incon_pg)
        outbuf = out.read().decode()
        log.info(outbuf)
        if targt_pg not in outbuf:
            time.sleep(1)
            timeout = timeout - 1
        else:
            found = 1
            break
    if timeout == 0 and found == 0:
        log.error("pg not listed as inconsistent")
        return 1

    timeout = 300
    found = 0
    while timeout:
        incon_snap = "sudo rados list-inconsistent-snapset \
                      {pg}".format(pg=targt_pg)
        (out, err) = ctrlr.exec_command(cmd=incon_snap)
        outbuf = out.read().decode()
        log.info(outbuf)
        if oname not in outbuf:
            time.sleep(1)
            timeout = timeout - 1
        else:
            found = 1
            break
    if timeout == 0 and found == 0:
        log.error("object is not listed in inconsistent snap")
        return 1

    return 0
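
The last two blocks of this test poll `rados list-inconsistent-pg` and `rados list-inconsistent-snapset` until an expected string shows up or a counter runs out, and the same pattern repeats in the tests below. A hedged sketch of a reusable helper for that wait loop (hypothetical, not part of cephci; it assumes the node object exposes the same exec_command interface used above, returning a stdout stream with read()):

import time

def wait_for_output(node, cmd, expected, timeout=300, interval=1):
    """Re-run `cmd` on `node` until `expected` appears in stdout or the timeout expires."""
    while timeout > 0:
        out, _err = node.exec_command(cmd=cmd)
        if expected in out.read().decode():
            return True
        time.sleep(interval)
        timeout -= interval
    return False

# usage mirroring the checks above:
#   if not wait_for_output(ctrlr, "sudo rados list-inconsistent-pg " + pname, targt_pg):
#       return 1
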
Code example #4
File: test_83571453.py  Project: udaysk23/cephci
def run(ceph_cluster, **kw):
    """
    CEPH-83571453-RADOS:
    Corrupt an object in ec pool followed by
    list-inconsistent-* commands
    1. create a jerasure ec pool with k=4,m=2
    2. create an object in the pool
    3. choose the primary osd from the acting set and go to the backend
    4. corrupt object attrib from the backend
    5. run deep-scrub on the pool
    6. rados list-inconsistent-pg <pool>
    7. rados list-inconsistent-obj <pg>

    Args:
        ceph_cluster (ceph.ceph.Ceph): ceph cluster
    """
    log.info("Running CEPH-83571453")
    log.info(run.__doc__)

    ceph_nodes = kw.get("ceph_nodes")
    config = kw.get("config")
    build = config.get("build", config.get("rhbuild"))
    mons = []
    role = "client"
    for mnode in ceph_nodes:
        if mnode.role == role:
            mons.append(mnode)

    ctrlr = mons[0]
    log.info("chosing mon {cmon} as ctrlrmon".format(cmon=ctrlr.hostname))
    helper = RadosHelper(ctrlr, config, log)
    """create ec pool with k=4, m=2"""
    k = 4
    m = 2
    pname = "eccorrupt_{rand}_{k}_{m}".format(rand=random.randint(0, 10000),
                                              k=k,
                                              m=m)
    profile = pname
    if build.startswith("4"):
        prof_cmd = "osd erasure-code-profile set {profile} k={k} m={m} \
            crush-failure-domain=osd".format(profile=profile, k=k, m=m)
    else:
        prof_cmd = "osd erasure-code-profile set {profile} k={k} m={m} \
            ruleset-failure-domain=osd crush-failure-domain=osd".format(
            profile=profile, k=k, m=m)
    try:
        (outbuf, err) = helper.raw_cluster_cmd(prof_cmd)
        log.info(outbuf)
        log.info("created profile {ec}".format(ec=profile))
    except Exception:
        log.error("ec profile creation failed")
        log.error(traceback.format_exc())
        return 1
    """create ec pool"""
    try:
        helper.create_pool(pname, 1, profile)
        log.info("Pool {pname} is create".format(pname=pname))
    except Exception:
        log.error("failed to create pool")
        log.error(traceback.format_exc())
        return 1
    """check whether pool exists"""
    try:
        helper.get_pool_num(pname)
    except Exception:
        log.error("Unable to find pool")
        log.error(traceback.format_exc())
        return 1
    time.sleep(10)

    oname = "OBJ_{pname}".format(pname=pname)
    cmd = "osd map {pname} {obj} --format json".format(pname=pname, obj=oname)
    (outbuf, err) = helper.raw_cluster_cmd(cmd)
    log.info(outbuf)
    cmdout = json.loads(outbuf)
    targt_pg = cmdout["pgid"]
    """considering primary only as of now because of bug
    1544680
    """
    targt_osd_id = cmdout["up"][0]
    """write data and take snaps"""
    putobj = "sudo rados -p {pool} put {obj} {path}".format(pool=pname,
                                                            obj=oname,
                                                            path="/etc/hosts")
    for i in range(10):
        (out, err) = ctrlr.exec_command(cmd=putobj)
        snapcmd = "sudo rados mksnap -p {pool} {sname}".format(pool=pname,
                                                               sname="snap" +
                                                               str(i))
        (out, err) = ctrlr.exec_command(cmd=snapcmd)
        log.info("put {obj}, snap {snap}".format(obj=oname,
                                                 snap="snap" + str(i)))
    """
    Goto destination osd, stop the osd
    use ceph-objectstore-tool to corrupt
    snap info
    """
    #    target_osd = ceph_cluster.get_osd_by_id(targt_osd_id)
    #    target_osd_node = target_osd.node
    target_osd_hostname = ceph_cluster.get_osd_metadata(targt_osd_id).get(
        "hostname")
    log.info(target_osd_hostname)
    target_osd_node = ceph_cluster.get_node_by_hostname(target_osd_hostname)
    cot_environment = target_osd_node
    osd_service = ceph_cluster.get_osd_service_name(targt_osd_id)
    partition_path = ceph_cluster.get_osd_metadata(targt_osd_id).get(
        "osd_data")
    helper.kill_osd(target_osd_node, osd_service)
    time.sleep(10)
    osd_metadata = ceph_cluster.get_osd_metadata(targt_osd_id)
    osd_data = osd_metadata.get("osd_data")
    osd_journal = osd_metadata.get("osd_journal")
    if ceph_cluster.containerized:
        docker_image_string = "{docker_registry}/{docker_image}:{docker_tag}".format(
            docker_registry=ceph_cluster.ansible_config.get(
                "ceph_docker_registry"),
            docker_image=ceph_cluster.ansible_config.get("ceph_docker_image"),
            docker_tag=ceph_cluster.ansible_config.get(
                "ceph_docker_image_tag"),
        )
        cot_environment = helper.get_mgr_proxy_container(
            target_osd_node, docker_image_string)
        device_mount_data, err = cot_environment.exec_command(
            cmd='mount | grep "{partition_path} "'.format(
                partition_path=partition_path),
            check_ec=False,
        )
        if not device_mount_data:
            cot_environment.exec_command(
                cmd="sudo mount {partition_path} {directory}".format(
                    partition_path=partition_path, directory=osd_data))

    slist_cmd = "sudo ceph-objectstore-tool --data-path \
            {osd_data} --journal-path \
            {osd_journal} \
            --head --op list {obj}".format(osd_data=osd_data,
                                           osd_journal=osd_journal,
                                           obj=oname)
    (outbuf, err) = cot_environment.exec_command(cmd=slist_cmd)
    log.info(outbuf)
    corrupt_cmd = "sudo ceph-objectstore-tool --data-path \
            {osd_data} --journal-path \
            {osd_journal} \
            {outbuf} clear-snapset \
            corrupt".format(osd_data=osd_data,
                            osd_journal=osd_journal,
                            outbuf="'" + (outbuf) + "'")
    (outbuf, err) = cot_environment.exec_command(cmd=corrupt_cmd)
    log.info(outbuf)

    helper.revive_osd(target_osd_node, osd_service)
    time.sleep(10)
    run_scrub = "pg deep-scrub {pgid}".format(pgid=targt_pg)
    (outbuf, err) = helper.raw_cluster_cmd(run_scrub)
    log.info(outbuf)

    while "HEALTH_ERR" and "active+clean+inconsistent" not in outbuf:
        status = "-s --format json"
        (outbuf, err) = helper.raw_cluster_cmd(status)
    log.info("HEALTH_ERR found as expected")
    log.info("inconsistent foud as expected")

    timeout = 300
    found = 0
    while timeout:
        incon_pg = "sudo rados list-inconsistent-pg {pname}".format(
            pname=pname)
        (outbuf, err) = ctrlr.exec_command(cmd=incon_pg)
        log.info(outbuf)
        if targt_pg not in outbuf:
            time.sleep(1)
            timeout = timeout - 1
        else:
            found = 1
            break
    if timeout == 0 and found == 0:
        log.error("pg not listed as inconsistent")
        return 1

    timeout = 300
    found = 0
    while timeout:
        incon_obj = "sudo rados list-inconsistent-snapset \
                     {pg}".format(pg=targt_pg)
        (outbuf, err) = ctrlr.exec_command(cmd=incon_obj)
        log.info(outbuf)
        if oname not in outbuf:
            time.sleep(1)
            timeout = timeout - 1
        else:
            found = 1
            break
    if timeout == 0 and found == 0:
        log.error("object is not listed in inconsistent obj")
        return 1

    return 0
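
In this test (and the ones above) the object listing returned by ceph-objectstore-tool is spliced back into the next command with hand-placed single quotes ("'" + outbuf + "'"). A sketch of the same command built with shlex.quote, which keeps the invocation intact even if the listing contains quote characters (illustrative only; variable names mirror the test above):

import shlex

def build_clear_snapset_cmd(osd_data, osd_journal, obj_listing):
    """Quote the object listing safely before passing it to ceph-objectstore-tool."""
    return (
        "sudo ceph-objectstore-tool"
        " --data-path {} --journal-path {}"
        " {} clear-snapset corrupt".format(
            shlex.quote(osd_data),
            shlex.quote(osd_journal),
            shlex.quote(obj_listing.strip()),
        )
    )
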
Code example #5
def run(**kw):
    log.info("Running radoslib test")
    ceph_nodes = kw.get("ceph_nodes")
    config = kw.get("config")

    mons = []
    osds = []
    role = "client"
    for mnode in ceph_nodes:
        if mnode.role == role:
            mons.append(mnode)
    for osd in ceph_nodes:
        if osd.role == "osd":
            osds.append(osd)

    idx = 0
    mon = mons[idx]
    print(mon.hostname)

    helper = RadosHelper(mon, config, log)

    """	try:
        Helper.create_pool("blabla1",4)
        log.info("poll created successfully")
    except:
        log.error("pool creation failed")
        return 1

    try:
        pri_osd=Helper.get_pg_primary("new", 0)
        print(pri_osd)
    except:
        return 1

    try:
        osdhost=Helper.get_osd_host(0)
        print(osdhost)
    except:
        log.error("getting osd host failed")
        return 1

    ret=1
    try:
        log.info("TRYING KILL")
        ret=Helper.kill_osd(1, osds)
        log.info("ret={ret}".format(ret=ret))
    finally:
        return ret

    try:
        ret=Helper.is_up(1)
        if ret:
            log.info("UP")
        else:
            log.info("DOWN")
        return ret
    except:
        log.error("staus check failed")
        return 1
    """

    try:
        ret = helper.revive_osd(1, osds)
        return ret
    except Exception:
        log.error("revive failed")
        log.error(traceback.format_exc())
        return 1
Code example #6
File: test_9924.py  Project: red-hat-storage/cephci
def run(ceph_cluster, **kw):
    """
    CEPH-9925 - [RADOS]:
    Rewrite a known omap item of a replica and list-inconsistent-obj
    Steps:
        1. create an object in a replica pool
        2. add some omap keys and corresponding values to the object
        3. choose one of the replicas and, using ceph-objectstore-tool, corrupt the omap key or
           value
        4. Run deep-scrub - scrub should report inconsistency
        5. run rados list-inconsistent-pg <pool> - should list the pg in
           which object is inconsistent
        6. Run rados list-inconsistent-obj <pg> should report omap digest mismatch error
    Args:
        ceph_cluster (ceph.ceph.Ceph): ceph cluster
    """
    log.info("Running CEPH-9924")
    log.info(run.__doc__)
    ceph_nodes = kw.get("ceph_nodes")
    config = kw.get("config")
    mons = []
    role = "client"
    for mnode in ceph_nodes:
        if mnode.role == role:
            mons.append(mnode)

    ctrlr = mons[0]
    log.info("chosing mon {cmon} as ctrlrmon".format(cmon=ctrlr.hostname))
    helper = RadosHelper(ctrlr, config, log)
    """create an replica pool"""
    pname0 = "replica_pool_{rand}".format(rand=random.randint(0, 10000))
    pname = pname0
    try:
        helper.create_pool(pname, 128)
        log.info("Pool {pname} is create".format(pname=pname))
    except Exception:
        log.error("failed to create pool")
        log.error(traceback.format_exc())
        return 1
    """check whether pool exists"""
    try:
        helper.get_pool_num(pname)
    except Exception:
        log.error("Unable to find pool")
        log.error(traceback.format_exc())
        return 1

    time.sleep(10)

    oname = "OBJ_{pname}".format(pname=pname)

    putobj = "sudo rados -p {pool} put {obj} {path}".format(pool=pname,
                                                            obj=oname,
                                                            path="/etc/hosts")
    (out, err) = ctrlr.exec_command(cmd=putobj)
    """ creating omap key/value pairs for an object"""

    for i in range(4):
        omapcmd = "sudo rados -p {pool} setomapval {obj} {keey} {valu}".format(
            pool=pname, obj=oname, keey="key" + str(i), valu="value" + str(i))
        (out, err) = ctrlr.exec_command(cmd=omapcmd)
        log.info("put {obj}, omap key {keey} value {valu}".format(
            obj=oname, keey="key" + str(i), valu="value" + str(i)))
    """
    Goto destination osd, stop the osd service to
    use ceph-objectstore-tool to corrupt
    omap keys
    """

    cmd = "osd map {pname} {obj} --format json".format(pname=pname, obj=oname)
    (out, err) = helper.raw_cluster_cmd(cmd)
    outbuf = out.read().decode()
    log.info(outbuf)
    cmdout = json.loads(outbuf)
    targt_pg = cmdout["pgid"]
    """Considering non primary osd"""
    targt_osd_id = cmdout["up"][1]
    #    target_osd = ceph_cluster.get_osd_by_id(targt_osd_id)
    #    target_osd_node = target_osd.node
    target_osd_hostname = ceph_cluster.get_osd_metadata(targt_osd_id).get(
        "hostname")
    log.info(target_osd_hostname)
    target_osd_node = ceph_cluster.get_node_by_hostname(target_osd_hostname)
    cot_environment = target_osd_node
    osd_service = ceph_cluster.get_osd_service_name(targt_osd_id)
    partition_path = ceph_cluster.get_osd_metadata(targt_osd_id).get(
        "osd_data")
    helper.kill_osd(target_osd_node, osd_service)
    time.sleep(10)
    osd_metadata = ceph_cluster.get_osd_metadata(targt_osd_id)
    osd_data = osd_metadata.get("osd_data")
    osd_journal = osd_metadata.get("osd_journal")

    if ceph_cluster.containerized:
        # target_osd_node.exec_command(cmd='sudo yum install -y ceph-osd', check_ec=False)
        docker_image_string = "{docker_registry}/{docker_image}:{docker_tag}".format(
            docker_registry=ceph_cluster.ansible_config.get(
                "ceph_docker_registry"),
            docker_image=ceph_cluster.ansible_config.get("ceph_docker_image"),
            docker_tag=ceph_cluster.ansible_config.get(
                "ceph_docker_image_tag"),
        )
        cot_environment = helper.get_mgr_proxy_container(
            target_osd_node, docker_image_string)
        out, err = cot_environment.exec_command(
            cmd='mount | grep "{partition_path} "'.format(
                partition_path=partition_path),
            check_ec=False,
        )
        device_mount_data = out.read().decode()  # type: str
        if not device_mount_data:
            cot_environment.exec_command(
                cmd="sudo mount {partition_path} {directory}".format(
                    partition_path=partition_path, directory=osd_data))

    # docker_image_string = '{docker_registry}/{docker_image}:{docker_tag}'.format(
    #     docker_registry=ceph_cluster.ansible_config.get('ceph_docker_registry'),
    #     docker_image=ceph_cluster.ansible_config.get('ceph_docker_image'),
    #     docker_tag=ceph_cluster.ansible_config.get('ceph_docker_image_tag'))
    # mgr_proxy = helper.get_mgr_container_proxy(target_osd_node, docker_image_string)

    slist_cmd = "sudo ceph-objectstore-tool --data-path \
            {osd_data} --journal-path \
            {osd_journal} \
            --pgid {pgid} {obj} list-omap".format(osd_data=osd_data,
                                                  osd_journal=osd_journal,
                                                  obj=oname,
                                                  pgid=targt_pg)
    (out, err) = cot_environment.exec_command(cmd=slist_cmd)
    outbuf = out.read().decode()
    keylist = outbuf.split()
    log.info(outbuf)
    """corrupting an omap key by rewriting the omap key with different value"""
    corrupt_cmd = "sudo ceph-objectstore-tool --data-path \
            {osd_data} --journal-path \
            {osd_journal} \
                   --pgid {pgid} {obj} set-omap \
                   {outbuf} {path}".format(
        osd_data=osd_data,
        osd_journal=osd_journal,
        obj=oname,
        pgid=targt_pg,
        outbuf=keylist[0],
        path="/etc/hosts",
    )
    (out, err) = cot_environment.exec_command(cmd=corrupt_cmd)
    outbuf = out.read().decode()
    log.info(outbuf)

    helper.revive_osd(target_osd_node, osd_service)
    time.sleep(10)
    run_scrub = "pg deep-scrub {pgid}".format(pgid=targt_pg)
    (out, err) = helper.raw_cluster_cmd(run_scrub)
    outbuf = out.read().decode()
    log.info(outbuf)

    while "HEALTH_ERR" and "active+clean+inconsistent" not in outbuf:
        status = "-s --format json"
        (out, err) = helper.raw_cluster_cmd(status)
        outbuf = out.read().decode()
    log.info("HEALTH_ERR found as expected")
    log.info("inconsistent found as expected")

    timeout = 100
    found = 0
    while timeout:
        incon_pg = "sudo rados list-inconsistent-pg {pname}".format(
            pname=pname)
        (out, err) = ctrlr.exec_command(cmd=incon_pg)
        outbuf = out.read().decode()
        log.info(outbuf)
        if targt_pg not in outbuf:
            time.sleep(1)
            timeout = timeout - 1
        else:
            found = 1
            break
    if timeout == 0 and found == 0:
        log.error("pg not listed as inconsistent")
        return 1

    timeout = 100
    found = 0
    while timeout:
        incon_obj = "sudo rados list-inconsistent-obj {pg}".format(pg=targt_pg)
        (out, err) = ctrlr.exec_command(cmd=incon_obj)
        outbuf = out.read().decode()
        log.info(outbuf)
        if oname not in outbuf:
            time.sleep(1)
            timeout = timeout - 1
        else:
            found = 1
            break
    if timeout == 0 and found == 0:
        log.error("object is not listed in inconsistent obj")
        return 1

    return 0
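
Step 6 of the docstring expects `rados list-inconsistent-obj` to report an omap digest mismatch, but the loop above only substring-matches the object name in the raw output. Since the command also emits JSON, a hedged sketch of checking the error list directly (field names follow the JSON that list-inconsistent-obj produces with --format json and should be treated as an assumption; this is an illustration, not an exhaustive parser):

import json

def omap_errors_for(node, pg, oname):
    """Return the omap-related error strings reported for `oname` in `pg`, if any."""
    out, _err = node.exec_command(
        cmd="sudo rados list-inconsistent-obj {pg} --format json".format(pg=pg))
    report = json.loads(out.read().decode())
    for entry in report.get("inconsistents", []):
        if entry.get("object", {}).get("name") == oname:
            # e.g. ["omap_digest_mismatch"] after a replica's omap value was rewritten
            return [e for e in entry.get("errors", []) if "omap" in e]
    return []
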
Code example #7
File: test_9939.py  Project: sidhant-agrawal/cephci
def run(ceph_cluster, **kw):
    """
    CEPH-9939:
    Delete snapset objects in ec pool followed by
    list-inconsistent-* commands
    1. create a jerasure ec pool with k=4,m=2
    2. create an object in the pool
    3. choose any of the osds from the acting set and go to the backend
    4. delete snap object from the backend
    5. run deep-scrub on the pool
    6. rados list-inconsistent-pg <pool>
    7. rados list-inconsistent-obj <pg>

    Args:
        ceph_cluster (ceph.ceph.Ceph): ceph cluster
    """
    log.info("Running CEPH-9939")
    log.info(run.__doc__)

    ceph_nodes = kw.get('ceph_nodes')
    config = kw.get('config')
    mons = []
    osds = []
    role = 'client'
    for mnode in ceph_nodes:
        if mnode.role == role:
            mons.append(mnode)
    role = 'osd'
    for osd in ceph_nodes:
        if osd.role == role:
            osds.append(osd)

    ctrlr = mons[0]
    log.info("chosing mon {cmon} as ctrlrmon".format(cmon=ctrlr.hostname))
    helper = RadosHelper(ctrlr, config, log)
    """create ec pool with k=4,m=2"""
    k = 4
    m = 2
    pname = "ecsnapdelete_{rand}_{k}_{m}".format(rand=random.randint(0, 1000),
                                                 k=k,
                                                 m=m)
    profile = pname
    prof_cmd = "osd erasure-code-profile set {profile}\
                k={k}\
                m={m}\
                ruleset-failure-domain=osd\
                crush-failure-domain=osd".format(profile=profile, k=k, m=m)
    try:
        (out, err) = helper.raw_cluster_cmd(prof_cmd)
        outbuf = out.read().decode()
        log.info(outbuf)
        log.info("created profile {ec}".format(ec=profile))
    except Exception:
        log.error("ec profile creation failed")
        log.error(traceback.format_exc())
        return 1
    '''create ec pool'''
    try:
        helper.create_pool(pname, 1, profile)
        log.info("Pool {pname} is create".format(pname=pname))
    except Exception:
        log.error("failed to create pool")
        log.error(traceback.format_exc())
        return 1
    '''check whether pool exists'''
    try:
        helper.get_pool_num(pname)
    except Exception:
        log.error("Unable to find pool")
        log.error(traceback.format_exc())
        return 1
    time.sleep(10)

    oname = "OBJ_{pname}".format(pname=pname)
    cmd = "osd map {pname} {obj} --format json".format(pname=pname, obj=oname)
    (out, err) = helper.raw_cluster_cmd(cmd)
    outbuf = out.read().decode()
    log.info(outbuf)
    cmdout = json.loads(outbuf)
    targt_pg = cmdout['pgid']
    '''considering primary only as of now because of bug
    1544680
    '''
    targt_osd_id = cmdout['up'][0]
    '''write data and take snaps'''
    putobj = "sudo rados -p {pool} put {obj} {path}".format(pool=pname,
                                                            obj=oname,
                                                            path="/etc/hosts")
    for i in range(10):
        (out, err) = ctrlr.exec_command(cmd=putobj)
        snapcmd = "sudo rados mksnap -p {pool} {sname}".format(pool=pname,
                                                               sname="snap" +
                                                               str(i))
        (out, err) = ctrlr.exec_command(cmd=snapcmd)
        log.info("put {obj}, snap {snap}".format(obj=oname,
                                                 snap="snap" + str(i)))
    """Goto destination osd , stop osd use ceph-objectstore-tool
       to delete snap """
    target_osd = ceph_cluster.get_osd_by_id(targt_osd_id)
    target_osd_node = target_osd.node
    cot_environment = target_osd_node
    osd_service = ceph_cluster.get_osd_service_name(targt_osd_id)
    partition_path = ceph_cluster.get_osd_data_partition_path(targt_osd_id)
    helper.kill_osd(target_osd_node, osd_service)
    time.sleep(10)
    osd_metadata = ceph_cluster.get_osd_metadata(targt_osd_id)
    osd_data = osd_metadata.get('osd_data')
    osd_journal = osd_metadata.get('osd_journal')
    if ceph_cluster.containerized:
        docker_image_string = '{docker_registry}/{docker_image}:{docker_tag}'.format(
            docker_registry=ceph_cluster.ansible_config.get(
                'ceph_docker_registry'),
            docker_image=ceph_cluster.ansible_config.get('ceph_docker_image'),
            docker_tag=ceph_cluster.ansible_config.get(
                'ceph_docker_image_tag'))
        cot_environment = helper.get_mgr_proxy_container(
            target_osd_node, docker_image_string)
        out, err = cot_environment.exec_command(
            cmd='mount | grep "{partition_path} "'.format(
                partition_path=partition_path),
            check_ec=False)
        device_mount_data = out.read().decode()  # type: str
        if not device_mount_data:
            cot_environment.exec_command(
                cmd='sudo mount {partition_path} {directory}'.format(
                    partition_path=partition_path, directory=osd_data))
    slist_cmd = "sudo ceph-objectstore-tool --data-path \
            {osd_data} --journal-path \
            {osd_journal} \
            --op list \
            {obj}|grep \\\"snapid\\\":1".format(osd_data=osd_data,
                                                osd_journal=osd_journal,
                                                obj=oname)
    (out, err) = cot_environment.exec_command(cmd=slist_cmd)
    outbuf = out.read().decode()
    log.info(outbuf)
    corrupt_cmd = "sudo ceph-objectstore-tool --data-path \
            {osd_data} --journal-path \
            {osd_journal} \
            {outbuf} remove".format(osd_data=osd_data,
                                    osd_journal=osd_journal,
                                    outbuf="'" + (outbuf) + "'")
    (out, err) = cot_environment.exec_command(cmd=corrupt_cmd)
    outbuf = out.read().decode()
    log.info(outbuf)
    helper.revive_osd(target_osd_node, osd_service)
    time.sleep(10)
    run_scrub = "pg deep-scrub {pgid}".format(pgid=targt_pg)
    (out, err) = helper.raw_cluster_cmd(run_scrub)
    outbuf = out.read().decode()
    log.info(outbuf)

    while 'HEALTH_ERR' not in outbuf or 'active+clean+inconsistent' not in outbuf:
        status = "-s --format json"
        (out, err) = helper.raw_cluster_cmd(status)
        outbuf = out.read().decode()
    log.info("HEALTH_ERR found as expected")
    log.info("inconsistent foud as expected")

    timeout = 300
    found = 0
    while timeout:
        incon_pg = "sudo rados list-inconsistent-pg \
                    {pname}".format(pname=pname)
        (out, err) = ctrlr.exec_command(cmd=incon_pg)
        outbuf = out.read().decode()
        log.info(outbuf)
        if targt_pg not in outbuf:
            time.sleep(1)
            timeout = timeout - 1
        else:
            found = 1
            break
    if timeout == 0 and found == 0:
        log.error("pg not listed as inconsistent")
        return 1

    timeout = 300
    found = 0
    while timeout:
        incon_obj = "sudo rados list-inconsistent-obj {pg}".format(pg=targt_pg)
        (out, err) = ctrlr.exec_command(cmd=incon_obj)
        outbuf = out.read().decode()
        log.info(outbuf)
        if oname not in outbuf:
            time.sleep(1)
            timeout = timeout - 1
        else:
            found = 1
            break
    if timeout == 0 and found == 0:
        log.error("object is not listed in inconsistent obj")
        return 1

    return 0
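
The slist_cmd in this test filters the ceph-objectstore-tool listing with grep for "snapid":1 to pick the snapshot clone rather than the head object. Each line of `--op list` output is a JSON array of [pgid, object-descriptor], so the same selection can be done by parsing the listing; a small sketch (the listing format and field names are an assumption to be verified against the tool's output):

import json

def find_snap_clone(listing_text, snapid=1):
    """Return the raw listing line whose object descriptor has the given snapid."""
    for line in listing_text.splitlines():
        if not line.strip():
            continue
        _pgid, obj = json.loads(line)
        if obj.get("snapid") == snapid:
            return line   # the raw line is what the subsequent remove op expects
    return None
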
Code example #8
def run(ceph_cluster, **kw):
    """
     1. Create an LRC profile and then create an ec pool
            #ceph osd erasure-code-profile set $profile \
            plugin=lrc \
            k=4 m=2 l=3 \
            ruleset-failure-domain=osd
             # ceph osd pool create $poolname 1 1  erasure $profile

    2. Start writing a large object so that we get
            some time to fail the osd while the reads and writes are
            in progress on an object

    # rados put -p lrcpool obj1 /src/path
    #rados get -p lrcpool obj1 /tmp/obj1

    While the above command is in progress, kill the primary
    osd responsible for the PG.
    The primary can be found from
    # ceph pg dump

    3. Bring back the primary

    4. Repeat step 2, but this time kill some secondary osds

    Args:
        ceph_cluster (ceph.ceph.Ceph):
    """

    log.info("Running test CEPH-9281")
    ceph_nodes = kw.get('ceph_nodes')
    config = kw.get('config')

    mons = []
    role = 'client'

    for mnode in ceph_nodes:
        if mnode.role == role:
            mons.append(mnode)

    ctrlr = mons[0]
    log.info("chosing mon {cmon} as ctrlrmon".format(cmon=ctrlr.hostname))

    helper = RadosHelper(ctrlr, config, log)
    ''' create LRC profile '''
    sufix = random.randint(0, 10000)
    prof_name = "LRCprofile{suf}".format(suf=sufix)
    profile = "osd erasure-code-profile set {LRCprofile} \
        plugin=lrc\
        k=4 m=2 l=3 \
        ruleset-failure-domain=osd \
        crush-failure-domain=osd".format(LRCprofile=prof_name)
    try:
        (out, err) = helper.raw_cluster_cmd(profile)
        outbuf = out.read().decode()
        log.info(outbuf)
        log.info("created profile {LRCprofile}".format(LRCprofile=prof_name))
    except Exception:
        log.error("LRC profile creation failed")
        log.error(traceback.format_exc())
        return 1
    '''create LRC ec pool'''
    pool_name = "lrcpool{suf}".format(suf=sufix)
    try:
        helper.create_pool(pool_name, 1, prof_name)
        log.info("Pool {pname} created".format(pname=pool_name))
    except Exception:
        log.error("lrcpool create failed")
        log.error(traceback.format_exc())
        return 1
    '''rados put and get in a parallel task'''
    with parallel() as p:
        p.spawn(do_rados_put, ctrlr, pool_name, 20)
        p.spawn(do_rados_get, ctrlr, pool_name, 10)

        for res in p:
            log.info(res)

    try:
        pri_osd_id = helper.get_pg_primary(pool_name, 0)
        log.info("PRIMARY={pri}".format(pri=pri_osd_id))
    except Exception:
        log.error("getting primary failed")
        log.error(traceback.format_exc())
        return 1

    log.info("SIGTERM osd")
    pri_osd = ceph_cluster.get_osd_by_id(pri_osd_id)
    pri_osd_node = pri_osd.node
    pri_osd_service = ceph_cluster.get_osd_service_name(pri_osd_id)
    try:
        helper.kill_osd(pri_osd_node, pri_osd_service)
        log.info("osd killed")
    except Exception:
        log.error("killing osd failed")
        log.error(traceback.format_exc())
    time.sleep(10)
    if helper.is_up(pri_osd_id):
        log.error("unexpected! osd is still up")
        return 1
    time.sleep(5)
    log.info("Reviving osd {osd}".format(osd=pri_osd_id))

    try:
        if helper.revive_osd(pri_osd_node, pri_osd_service):
            log.error("revive failed")
            return 1
    except Exception:
        log.error("revive failed")
        log.error(traceback.format_exc())
        return 1
    time.sleep(10)
    if helper.is_up(pri_osd_id):
        log.info("osd is UP")
    else:
        log.error("osd is DOWN")
        return 1

    time.sleep(10)
    try:
        rand_osd_id = helper.get_pg_random(pool_name, 0)
        log.info("RANDOM OSD={rosd}".format(rosd=rand_osd_id))
    except Exception:
        log.error("getting  random osd failed")
        log.error(traceback.format_exc())
        return 1
    log.info("SIGTERM osd")
    rand_osd = ceph_cluster.get_osd_by_id(rand_osd_id)
    rand_osd_node = rand_osd.node
    rand_osd_service = ceph_cluster.get_osd_service_name(rand_osd_id)
    try:
        helper.kill_osd(rand_osd_node, rand_osd_service)
        log.info("osd killed")
    except Exception:
        log.error("killing osd failed")
        log.error(traceback.format_exc())
    time.sleep(10)
    if helper.is_up(rand_osd_id):
        log.error("unexpected! osd is still up")
        return 1
    time.sleep(5)
    log.info("Reviving osd {osd}".format(osd=rand_osd_id))
    try:
        if helper.revive_osd(rand_osd_node, rand_osd_service):
            log.error("revive failed")
            return 1
    except Exception:
        log.error("revive failed")
        log.error(traceback.format_exc())
        return 1
    time.sleep(30)
    if helper.is_up(rand_osd_id):
        log.info("osd is UP")
    else:
        log.error("osd is DOWN")
        return 1

    return 0