import logging
import random
import time
import traceback

from ceph.parallel import parallel
from ceph.rados_utils import RadosHelper

# NOTE: the import path for the shared put/get helpers below is an assumption;
# adjust it to wherever do_rados_put/do_rados_get live in this test suite.
from tests.rados.rados_test_util import do_rados_get, do_rados_put

log = logging.getLogger(__name__)


def run(ceph_cluster, **kw):
    """
    1. Create an LRC profile and then create an EC pool:
         # ceph osd erasure-code-profile set $profile \
               plugin=lrc k=4 m=2 l=3 \
               ruleset-failure-domain=osd   (crush-failure-domain=osd on newer releases)
         # ceph osd pool create $poolname 1 1 erasure $profile
    2. Start writing a large object so that we have some time to fail an OSD
       while reads and writes are still in progress on the object:
         # rados put -p lrcpool obj1 /src/path
         # rados get -p lrcpool obj1 /tmp/obj1
       While the above commands are in progress, kill the primary OSD
       responsible for the PG. The primary can be found from:
         # ceph pg dump
    3. Bring back the primary.
    4. Repeat step 2, but this time kill one of the secondary OSDs.

    Args:
        ceph_cluster (ceph.ceph.Ceph): cluster object under test
    """
    log.info("Running test CEPH-9281")
    ceph_nodes = kw.get("ceph_nodes")
    config = kw.get("config")
    build = config.get("build", config.get("rhbuild"))

    # Use the first client node as the controller for all cluster commands
    mons = [node for node in ceph_nodes if node.role == "client"]
    ctrlr = mons[0]
    log.info("choosing mon {cmon} as ctrlrmon".format(cmon=ctrlr.hostname))
    helper = RadosHelper(ctrlr, config, log)

    # Create the LRC erasure-code profile. Ceph 4.x only accepts
    # crush-failure-domain; older releases also take the deprecated
    # ruleset-failure-domain alongside it.
    suffix = random.randint(0, 10000)
    prof_name = "LRCprofile{suf}".format(suf=suffix)
    if build.startswith("4"):
        profile = (
            "osd erasure-code-profile set {LRCprofile} "
            "plugin=lrc k=4 m=2 l=3 crush-failure-domain=osd"
        ).format(LRCprofile=prof_name)
    else:
        profile = (
            "osd erasure-code-profile set {LRCprofile} "
            "plugin=lrc k=4 m=2 l=3 "
            "ruleset-failure-domain=osd crush-failure-domain=osd"
        ).format(LRCprofile=prof_name)
    try:
        (outbuf, err) = helper.raw_cluster_cmd(profile)
        log.info(outbuf)
        log.info("created profile {LRCprofile}".format(LRCprofile=prof_name))
    except Exception:
        log.error("LRC profile creation failed")
        log.error(traceback.format_exc())
        return 1

    # Create the LRC-backed erasure-coded pool with a single PG
    pool_name = "lrcpool{suf}".format(suf=suffix)
    try:
        helper.create_pool(pool_name, 1, prof_name)
        log.info("Pool {pname} created".format(pname=pool_name))
    except Exception:
        log.error("lrcpool create failed")
        log.error(traceback.format_exc())
        return 1

    # Run rados put and get as parallel tasks so that I/O is in flight
    # while OSDs are killed below
    with parallel() as p:
        p.spawn(do_rados_put, ctrlr, pool_name, 20)
        p.spawn(do_rados_get, ctrlr, pool_name, 10)
        for res in p:
            log.info(res)

    # Find and kill the primary OSD of the pool's only PG
    try:
        pri_osd_id = helper.get_pg_primary(pool_name, 0)
        log.info("PRIMARY={pri}".format(pri=pri_osd_id))
    except Exception:
        log.error("getting primary failed")
        log.error(traceback.format_exc())
        return 1

    log.info("SIGTERM osd")
    target_osd_hostname = ceph_cluster.get_osd_metadata(pri_osd_id).get("hostname")
    pri_osd_node = ceph_cluster.get_node_by_hostname(target_osd_hostname)
    pri_osd_service = ceph_cluster.get_osd_service_name(pri_osd_id)
    try:
        helper.kill_osd(pri_osd_node, pri_osd_service)
        log.info("osd killed")
    except Exception:
        log.error("killing osd failed")
        log.error(traceback.format_exc())
    if not helper.wait_until_osd_state(osd_id=pri_osd_id, down=True):
        log.error("unexpected! osd is still up")
        return 1
    time.sleep(5)

    # Revive the primary and wait for it to come back up
    log.info("Reviving osd {osd}".format(osd=pri_osd_id))
    try:
        if helper.revive_osd(pri_osd_node, pri_osd_service):
            log.error("revive failed")
            return 1
    except Exception:
        log.error("revive failed")
        log.error(traceback.format_exc())
        return 1
    if not helper.wait_until_osd_state(pri_osd_id):
        log.error("osd is DOWN")
        return 1
    log.info(
        f"Revival of Primary OSD : {pri_osd_id} is complete\nKilling random OSD"
    )
    time.sleep(10)

    # Repeat the kill/revive cycle with a random (secondary) OSD from the PG
    try:
        rand_osd_id = helper.get_pg_random(pool_name, 0)
        log.info("RANDOM OSD={rosd}".format(rosd=rand_osd_id))
    except Exception:
        log.error("getting random osd failed")
        log.error(traceback.format_exc())
        return 1

    log.info("SIGTERM osd")
    target_osd_hostname = ceph_cluster.get_osd_metadata(rand_osd_id).get("hostname")
    rand_osd_node = ceph_cluster.get_node_by_hostname(target_osd_hostname)
    rand_osd_service = ceph_cluster.get_osd_service_name(rand_osd_id)
    try:
        helper.kill_osd(rand_osd_node, rand_osd_service)
        log.info("osd killed")
    except Exception:
        log.error("killing osd failed")
        log.error(traceback.format_exc())
    if not helper.wait_until_osd_state(osd_id=rand_osd_id, down=True):
        log.error("unexpected! osd is still up")
        return 1
    time.sleep(5)

    log.info("Reviving osd {osd}".format(osd=rand_osd_id))
    try:
        if helper.revive_osd(rand_osd_node, rand_osd_service):
            log.error("revive failed")
            return 1
    except Exception:
        log.error("revive failed")
        log.error(traceback.format_exc())
        return 1
    if not helper.wait_until_osd_state(rand_osd_id):
        log.error("osd is DOWN")
        return 1
    log.info(f"Revival of Random OSD : {rand_osd_id} is complete")
    return 0