def initialize(self):
    super(RawFio, self).initialize()
    clnts = settings.getnodes('clients')

    # Create a fresh run directory on the clients
    common.pdsh(clnts, 'sudo rm -rf %s' % self.run_dir, continue_if_error=False).communicate()
    common.make_remote_dir(self.run_dir)

    logger.info('Attempting to initialize fio files...')
    initializer_list = []
    for i in range(self.concurrent_procs):
        b = self.block_devices[i % len(self.block_devices)]
        fiopath = b
        pre_cmd = 'sudo %s --rw=write --ioengine=%s --bs=%s ' % (self.fio_cmd, self.ioengine, self.op_size)
        pre_cmd = '%s --size %dM --name=%s --output-format=%s > /dev/null' % (pre_cmd, self.vol_size, fiopath, self.fio_out_format)
        initializer_list.append(common.pdsh(clnts, pre_cmd, continue_if_error=False))
    for p in initializer_list:
        p.communicate()
def initialize(self):
    common.setup_cluster()

    for i in xrange(self.concurrent_procs):
        letter = string.ascii_lowercase[i+1]
        common.pdsh(settings.getnodes('clients'), 'sudo mkfs.xfs /dev/vd%s' % letter).communicate()
        common.pdsh(settings.getnodes('clients'), 'sudo mkdir /srv/rbdfio-`hostname -s`-%d' % i).communicate()
        common.pdsh(settings.getnodes('clients'), 'sudo mount -t xfs -o noatime,inode64 /dev/vd%s /srv/rbdfio-`hostname -s`-%d' % (letter, i)).communicate()

    # Create the run directory
    common.make_remote_dir(self.run_dir)
def run(self):
    super(KvmRbdFio, self).run()
    # We'll always drop caches for rados bench
    self.dropcaches()

    monitoring.start(self.run_dir)
    time.sleep(5)

    names = ""
    for i in xrange(self.concurrent_procs):
        names += "--name=/srv/rbdfio-`hostname -s`-%d/cbt-kvmrbdfio " % i
    out_file = '%s/output' % self.run_dir
    pre_cmd = 'sudo fio --rw=read --ioengine=sync --numjobs=1 --bs=4M --runtime=1 --size %dM %s > /dev/null' % (self.vol_size * 9/10, names)
    fio_cmd = 'sudo fio --rw=%s --ioengine=%s --runtime=%s --numjobs=1 --direct=1 --bs=%dB --iodepth=%d --size %dM %s > %s' % (self.mode, self.ioengine, self.time, self.op_size, self.iodepth, self.vol_size * 9/10, names, out_file)

    print 'Attempting to populate fio files...'
    common.pdsh(settings.getnodes('clients'), pre_cmd).communicate()

    print 'Running rbd fio %s test.' % self.mode
    common.pdsh(settings.getnodes('clients'), fio_cmd).communicate()

    monitoring.stop(self.run_dir)
    common.sync_files('%s/*' % self.run_dir, self.out_dir)
def initialize(self):
    common.cleanup_tests()
    if not self.use_existing:
        common.setup_cluster()
        common.setup_ceph()

        # Create the run directory
        common.make_remote_dir(self.run_dir)

        # Setup the pools
        monitoring.start("%s/pool_monitoring" % self.run_dir)
        for i in xrange(self.concurrent_procs):
            for node in settings.getnodes('clients').split(','):
                node = node.rpartition("@")[2]
                common.pdsh(settings.getnodes('head'), 'sudo ceph osd pool create rados-bench-%s-%s %d %d' % (node, i, self.pgs_per_pool, self.pgs_per_pool)).communicate()
                common.pdsh(settings.getnodes('head'), 'sudo ceph osd pool set rados-bench-%s-%s size 1' % (node, i)).communicate()
                # check the health for each pool.
                print 'Checking health after pool creation.'
                common.check_health()
        monitoring.stop()

    print 'Running scrub monitoring.'
    monitoring.start("%s/scrub_monitoring" % self.run_dir)
    common.check_scrub()
    monitoring.stop()

    print 'Pausing for 60s for idle monitoring.'
    monitoring.start("%s/idle_monitoring" % self.run_dir)
    time.sleep(60)
    monitoring.stop()

    common.sync_files('%s/*' % self.run_dir, self.out_dir)

    return True
def rmpool(self, name, profile_name): pool_profiles = self.config.get("pool_profiles", {"default": {}}) profile = pool_profiles.get(profile_name, {}) cache_profile = profile.get("cache_profile", None) if cache_profile: cache_name = "%s-cache" % name # flush and remove the overlay and such common.pdsh( settings.getnodes("head"), "sudo ceph -c %s osd tier cache-mode %s forward" % (self.tmp_conf, cache_name), ).communicate() common.pdsh( settings.getnodes("head"), "sudo rados -c %s -p %s cache-flush-evict-all" % (self.tmp_conf, cache_name) ).communicate() common.pdsh( settings.getnodes("head"), "sudo ceph -c %s osd tier remove-overlay %s" % (self.tmp_conf, name) ).communicate() common.pdsh( settings.getnodes("head"), "sudo ceph -c %s osd tier remove %s %s" % (self.tmp_conf, name, cache_name) ).communicate() # delete the cache pool self.rmpool(cache_name, cache_profile) common.pdsh( settings.getnodes("head"), "sudo ceph -c %s osd pool delete %s %s --yes-i-really-really-mean-it" % (self.tmp_conf, name, name), ).communicate()
def pre(self):
    pre_time = self.config.get("pre_time", 60)
    common.pdsh(settings.getnodes('head'), self.logcmd('Starting Recovery Test Thread, waiting %s seconds.' % pre_time)).communicate()
    time.sleep(pre_time)
    lcmd = self.logcmd("Setting the ceph osd noup flag")
    common.pdsh(settings.getnodes('head'), '%s -c %s osd set noup;%s' % (self.ceph_cmd, self.cluster.tmp_conf, lcmd)).communicate()
    self.state = 'markdown'
def mkimages(self):
    monitoring.start("%s/pool_monitoring" % self.run_dir)
    self.cluster.rmpool(self.poolname, self.pool_profile)
    self.cluster.mkpool(self.poolname, self.pool_profile)
    for node in settings.getnodes('clients').split(','):
        node = node.rpartition("@")[2]
        common.pdsh(settings.getnodes('head'), '/usr/bin/rbd create cbt-librbdfio-%s --size %s --pool %s --order %s' % (node, self.vol_size, self.poolname, self.vol_order)).communicate()
    monitoring.stop()
def markdown(self):
    for osdnum in self.config.get('osds'):
        lcmd = self.logcmd("Marking OSD %s down." % osdnum)
        common.pdsh(settings.getnodes('head'), '%s -c %s osd down %s;%s' % (self.ceph_cmd, self.cluster.tmp_conf, osdnum, lcmd)).communicate()
        lcmd = self.logcmd("Marking OSD %s out." % osdnum)
        common.pdsh(settings.getnodes('head'), '%s -c %s osd out %s;%s' % (self.ceph_cmd, self.cluster.tmp_conf, osdnum, lcmd)).communicate()
    common.pdsh(settings.getnodes('head'), self.logcmd('Waiting for the cluster to break and heal')).communicate()
    self.state = 'osdout'
def stop(directory=None):
    nodes = settings.getnodes('clients', 'osds', 'mons', 'rgws')
    common.pdsh(nodes, 'pkill -SIGINT -f collectl').communicate()
    common.pdsh(nodes, 'sudo pkill -SIGINT -f perf_3.6').communicate()
    common.pdsh(settings.getnodes('osds'), 'sudo pkill -SIGINT -f blktrace').communicate()
    if directory:
        sc = settings.cluster
        # Wait for the chown to finish before post-processing the data.
        common.pdsh(nodes, 'cd %s/perf;sudo chown %s.%s perf.data' % (directory, sc.get('user'), sc.get('user'))).communicate()
        make_movies(directory)
def mkimages(self): monitoring.start("%s/pool_monitoring" % self.run_dir) self.cluster.rmpool(self.poolname, self.pool_profile) self.cluster.mkpool(self.poolname, self.pool_profile) common.pdsh(settings.getnodes('clients'), '/usr/bin/rbd create cbt-kernelrbdfio-`hostname -s` --size %s --pool %s' % (self.vol_size, self.poolname)).communicate() common.pdsh(settings.getnodes('clients'), 'sudo rbd map cbt-kernelrbdfio-`hostname -s` --pool %s --id admin' % self.poolname).communicate() common.pdsh(settings.getnodes('clients'), 'sudo mkfs.xfs /dev/rbd/cbt-kernelrbdfio/cbt-kernelrbdfio-`hostname -s`').communicate() common.pdsh(settings.getnodes('clients'), 'sudo mkdir -p -m0755 -- %s/cbt-kernelrbdfio-`hostname -s`' % self.cluster.mnt_dir).communicate() common.pdsh(settings.getnodes('clients'), 'sudo mount -t xfs -o noatime,inode64 /dev/rbd/cbt-kernelrbdfio/cbt-kernelrbdfio-`hostname -s` %s/cbt-kernelrbdfio-`hostname -s`' % self.cluster.mnt_dir).communicate() monitoring.stop()
def _run(self, mode, run_dir, out_dir): # We'll always drop caches for rados bench self.dropcaches() if self.concurrent_ops: concurrent_ops_str = '--concurrent-ios %s' % self.concurrent_ops #determine rados version rados_version_str, err = common.pdsh(settings.getnodes('head'), '/usr/bin/rados -v').communicate() m = re.findall("version (\d+)", rados_version_str) rados_version = int(m[0]) if mode in ['write'] or rados_version < 9: op_size_str = '-b %s' % self.op_size else: op_size_str = '' common.make_remote_dir(run_dir) # dump the cluster config self.cluster.dump_config(run_dir) # Run the backfill testing thread if requested if 'recovery_test' in self.cluster.config: recovery_callback = self.recovery_callback self.cluster.create_recovery_test(run_dir, recovery_callback) # Run rados bench monitoring.start(run_dir) logger.info('Running radosbench %s test.' % mode) ps = [] for i in xrange(self.concurrent_procs): out_file = '%s/output.%s' % (run_dir, i) objecter_log = '%s/objecter.%s.log' % (run_dir, i) # default behavior is to use a single storage pool pool_name = self.pool run_name = '--run-name %s`hostname -s`-%s'%(self.object_set_id, i) if self.pool_per_proc: # support previous behavior of 1 storage pool per rados process pool_name = 'rados-bench-`hostname -s`-%s'%i run_name = '' rados_bench_cmd = '%s -c %s -p %s bench %s %s %s %s %s --no-cleanup 2> %s > %s' % \ (self.cmd_path_full, self.tmp_conf, pool_name, op_size_str, self.time, mode, concurrent_ops_str, run_name, objecter_log, out_file) p = common.pdsh(settings.getnodes('clients'), rados_bench_cmd) ps.append(p) for p in ps: p.wait() monitoring.stop(run_dir) # If we were doing recovery, wait until it's done. if 'recovery_test' in self.cluster.config: self.cluster.wait_recovery_done() # Finally, get the historic ops self.cluster.dump_historic_ops(run_dir) common.sync_files('%s/*' % run_dir, out_dir)
def osdin(self): # Wait until the cluster is healthy. ret = self.cluster.check_health(self.health_checklist, "%s/recovery.log" % self.config.get('run_dir')) if self.inhealthtries < self.maxhealthtries and ret == 0: self.inhealthtries = self.inhealthtries + 1 return # Cluster hasn't become unhealthy yet. if ret == 0: common.pdsh(settings.getnodes('head'), self.logcmd('Cluster never went unhealthy.')).communicate() else: common.pdsh(settings.getnodes('head'), self.logcmd('Cluster appears to have healed.')).communicate() self.state = "post"
def mkimages(self): monitoring.start("%s/pool_monitoring" % self.run_dir) self.cluster.rmpool(self.poolname, self.pool_profile) self.cluster.mkpool(self.poolname, self.pool_profile) for node in settings.getnodes("clients").split(","): for volnum in xrange(0, self.volumes_per_client): node = node.rpartition("@")[2] common.pdsh( settings.getnodes("head"), "/usr/bin/rbd create cbt-librbdfio-%s-%d --size %s --pool %s --order %s" % (node, volnum, self.vol_size, self.poolname, self.vol_order), ).communicate() monitoring.stop()
def pre(self):
    pre_time = self.config.get("pre_time", 60)
    common.pdsh(settings.getnodes('head'), self.logcmd('Starting Recovery Test Thread, waiting %s seconds.' % pre_time)).communicate()
    time.sleep(pre_time)
    lcmd = self.logcmd("Setting the ceph osd noup flag")
    common.pdsh(settings.getnodes('head'), 'ceph -c %s osd set noup;%s' % (self.cluster.tmp_conf, lcmd)).communicate()
    for osdnum in self.config.get('osds'):
        lcmd = self.logcmd("Marking OSD %s down." % osdnum)
        common.pdsh(settings.getnodes('head'), 'ceph -c %s osd down %s;%s' % (self.cluster.tmp_conf, osdnum, lcmd)).communicate()
        lcmd = self.logcmd("Marking OSD %s out." % osdnum)
        common.pdsh(settings.getnodes('head'), 'ceph -c %s osd out %s;%s' % (self.cluster.tmp_conf, osdnum, lcmd)).communicate()
    common.pdsh(settings.getnodes('head'), self.logcmd('Waiting for the cluster to break and heal')).communicate()
    self.state = 'osdout'
def cleanup_tests():
    clients = settings.getnodes('clients')
    rgws = settings.getnodes('rgws')
    nodes = settings.getnodes('clients', 'servers', 'mons', 'rgws')

    pdsh(clients, 'sudo killall -9 rados;sudo killall -9 rest-bench').communicate()
    if rgws:
        pdsh(rgws, 'sudo killall -9 radosgw-admin').communicate()
    pdsh(nodes, 'sudo killall -9 pdcp').communicate()

    # cleanup the tmp_dir
    tmp_dir = settings.cluster.get("tmp_dir")
    print 'Deleting %s' % tmp_dir
    pdsh(nodes, 'rm -rf %s' % tmp_dir).communicate()
def initialize(self):
    super(RbdFio, self).initialize()
    self.cleanup()
    if not self.use_existing:
        self.cluster.initialize()
    self.cluster.dump_config(self.run_dir)

    # Setup the pools
    monitoring.start("%s/pool_monitoring" % self.run_dir)
    common.pdsh(settings.getnodes('head'), 'sudo ceph -c %s osd pool create rbdfio %d %d' % (self.tmp_conf, self.pgs, self.pgs)).communicate()
    common.pdsh(settings.getnodes('head'), 'sudo ceph -c %s osd pool set rbdfio size 1' % self.tmp_conf).communicate()
    print 'Checking health after pool creation.'
    self.cluster.check_health()
    monitoring.stop()

    # Mount the filesystem
    common.pdsh(settings.getnodes('clients'), 'sudo modprobe rbd').communicate()
    for i in xrange(self.concurrent_procs):
        common.pdsh(settings.getnodes('clients'), 'sudo rbd -c %s create rbdfio/rbdfio-`hostname -s`-%d --size %d' % (self.tmp_conf, i, self.vol_size)).communicate()
        common.pdsh(settings.getnodes('clients'), 'sudo echo "%s %s rbdfio rbdfio-`hostname -s`-%d" | sudo tee /sys/bus/rbd/add && sudo /sbin/udevadm settle' % (self.rbdadd_mons, self.rbdadd_options, i)).communicate()
        common.pdsh(settings.getnodes('clients'), 'sudo mkfs.xfs /dev/rbd/rbdfio/rbdfio-`hostname -s`-%d' % i).communicate()
        common.pdsh(settings.getnodes('clients'), 'sudo mkdir -p -m0755 -- %s/mnt/rbdfio-`hostname -s`-%d' % (self.tmp_dir, i)).communicate()
        common.pdsh(settings.getnodes('clients'), 'sudo mount -t xfs -o noatime,inode64 /dev/rbd/rbdfio/rbdfio-`hostname -s`-%d %s/mnt/rbdfio-`hostname -s`-%d' % (i, self.tmp_dir, i)).communicate()

    print 'Running scrub monitoring'
    monitoring.start("%s/scrub_monitoring" % self.run_dir)
    self.cluster.check_scrub()
    monitoring.stop()

    # Create the run directory
    common.make_remote_dir(self.run_dir)
def initialize(self):
    self.cleanup()
    super(RbdFio, self).initialize()
    common.setup_cluster()
    common.setup_ceph()
    common.dump_config(self.run_dir)

    # Setup the pools
    common.pdsh(settings.getnodes('head'), 'sudo ceph osd pool create rbdfio %d %d' % (self.pgs, self.pgs)).communicate()
    common.pdsh(settings.getnodes('head'), 'sudo ceph osd pool set rbdfio size 1').communicate()
    print 'Checking health after pool creation.'
    common.check_health()

    common.pdsh(settings.getnodes('clients'), 'sudo modprobe rbd').communicate()
    for i in xrange(self.concurrent_procs):
        common.pdsh(settings.getnodes('clients'), 'sudo rbd create rbdfio/rbdfio-`hostname -s`-%d --size %d' % (i, self.vol_size)).communicate()
        common.pdsh(settings.getnodes('clients'), 'sudo echo "%s %s rbdfio rbdfio-`hostname -s`-%d" | sudo tee /sys/bus/rbd/add && sudo /sbin/udevadm settle' % (self.rbdadd_mons, self.rbdadd_options, i)).communicate()
        common.pdsh(settings.getnodes('clients'), 'sudo mkfs.xfs /dev/rbd/rbdfio/rbdfio-`hostname -s`-%d' % i).communicate()
        common.pdsh(settings.getnodes('clients'), 'sudo mkdir /srv/rbdfio-`hostname -s`-%d' % i).communicate()
        common.pdsh(settings.getnodes('clients'), 'sudo mount -t xfs -o noatime,inode64 /dev/rbd/rbdfio/rbdfio-`hostname -s`-%d /srv/rbdfio-`hostname -s`-%d' % (i, i)).communicate()
    common.check_scrub()

    # Create the run directory
    common.make_remote_dir(self.run_dir)
def initialize(self):
    super(KvmRbdFio, self).initialize()
    for i in xrange(1):
        letter = string.ascii_lowercase[i+1]
        common.pdsh(settings.getnodes('clients'), 'sudo mkfs.ext4 /dev/vd%s' % letter).communicate()
        common.pdsh(settings.getnodes('clients'), 'sudo mkdir /srv/rbdfio-`hostname -s`-%d' % i).communicate()
        common.pdsh(settings.getnodes('clients'), 'sudo mount -t ext4 -o noatime /dev/vd%s /srv/rbdfio-`hostname -s`-%d' % (letter, i)).communicate()

    # Create the run directory
    common.make_remote_dir(self.run_dir)

    # populate the fio files
    logger.info('Attempting to populate fio files...')
    pre_cmd = 'sudo fio --rw=write --ioengine=sync --numjobs=%s --bs=4M --size %dM %s > /dev/null' % (self.numjobs, self.vol_size, self.names)
    common.pdsh(settings.getnodes('clients'), pre_cmd).communicate()
def setup_fs(self):
    sc = settings.cluster
    fs = sc.get('fs')
    mkfs_opts = sc.get('mkfs_opts', '')
    mount_opts = sc.get('mount_opts', '')

    if fs == '':
        settings.shutdown("No OSD filesystem specified. Exiting.")

    for device in xrange(0, sc.get('osds_per_node')):
        osds = settings.getnodes('osds')
        common.pdsh(osds, 'sudo umount /dev/disk/by-partlabel/osd-device-%s-data' % device).communicate()
        common.pdsh(osds, 'sudo rm -rf %s/osd-device-%s-data' % (self.mnt_dir, device)).communicate()
        common.pdsh(osds, 'sudo mkdir -p -m0755 -- %s/osd-device-%s-data' % (self.mnt_dir, device)).communicate()

        if fs == 'tmpfs':
            print 'using tmpfs osds, not creating a file system.'
        elif fs == 'zfs':
            print 'ruhoh, zfs detected. No mkfs for you!'
            common.pdsh(osds, 'sudo zpool destroy osd-device-%s-data' % device).communicate()
            common.pdsh(osds, 'sudo zpool create -f -O xattr=sa -m legacy osd-device-%s-data /dev/disk/by-partlabel/osd-device-%s-data' % (device, device)).communicate()
            common.pdsh(osds, 'sudo zpool add osd-device-%s-data log /dev/disk/by-partlabel/osd-device-%s-zil' % (device, device)).communicate()
            common.pdsh(osds, 'sudo mount %s -t zfs osd-device-%s-data %s/osd-device-%s-data' % (mount_opts, device, self.mnt_dir, device)).communicate()
        else:
            common.pdsh(osds, 'sudo mkfs.%s %s /dev/disk/by-partlabel/osd-device-%s-data' % (fs, mkfs_opts, device)).communicate()
            common.pdsh(osds, 'sudo mount %s -t %s /dev/disk/by-partlabel/osd-device-%s-data %s/osd-device-%s-data' % (mount_opts, fs, device, self.mnt_dir, device)).communicate()
def osdin(self): # Wait until the cluster is healthy. ret = self.cluster.check_health("%s/recovery.log" % self.config.get('run_dir')) if self.inhealthtries < self.maxhealthtries and ret == 0: self.inhealthtries = self.inhealthtries + 1 return # Cluster hasn't become unhealthy yet. if ret == 0: common.pdsh(settings.getnodes('head'), self.logcmd('Cluster never went unhealthy.')).communicate() else: common.pdsh(settings.getnodes('head'), self.logcmd('Cluster appears to have healed.')).communicate() post_time = self.config.get("post_time", 60) common.pdsh(settings.getnodes('head'), self.logcmd('Cluster is healthy, completion in %s seconds.' % post_time)).communicate() time.sleep(post_time) self.state = "done"
def initialize(self):
    super(LibrbdFio, self).initialize()

    print 'Running scrub monitoring.'
    monitoring.start("%s/scrub_monitoring" % self.run_dir)
    self.cluster.check_scrub()
    monitoring.stop()

    print 'Pausing for 60s for idle monitoring.'
    monitoring.start("%s/idle_monitoring" % self.run_dir)
    time.sleep(60)
    monitoring.stop()

    common.sync_files('%s/*' % self.run_dir, self.out_dir)

    self.mkimages()

    # Create the run directory
    common.make_remote_dir(self.run_dir)

    # populate the fio files
    print 'Attempting to populate fio files...'
    pre_cmd = 'sudo %s --ioengine=rbd --clientname=admin --pool=%s --rbdname=cbt-librbdfio-`hostname -s` --invalidate=0 --rw=write --numjobs=%s --bs=4M --size %dM %s > /dev/null' % (self.cmd_path, self.poolname, self.numjobs, self.vol_size, self.names)
    common.pdsh(settings.getnodes('clients'), pre_cmd).communicate()

    return True
def _run(self, mode, run_dir, out_dir):
    # We'll always drop caches for rados bench
    self.dropcaches()

    # Default to no --concurrent-ios flag if concurrent_ops is unset.
    concurrent_ops_str = ''
    if self.concurrent_ops:
        concurrent_ops_str = '--concurrent-ios %s' % self.concurrent_ops
    op_size_str = '-b %s' % self.op_size

    common.make_remote_dir(run_dir)

    # dump the cluster config
    common.dump_config(run_dir)

    monitoring.start(run_dir)

    # Run rados bench
    print 'Running radosbench %s test.' % mode
    ps = []
    for i in xrange(self.concurrent_procs):
        out_file = '%s/output.%s' % (run_dir, i)
        objecter_log = '%s/objecter.%s.log' % (run_dir, i)
        p = common.pdsh(settings.getnodes('clients'), '/usr/bin/rados -p rados-bench-`hostname -s`-%s %s bench %s %s %s --no-cleanup 2> %s > %s' % (i, op_size_str, self.time, mode, concurrent_ops_str, objecter_log, out_file))
        ps.append(p)
    for p in ps:
        p.wait()
    monitoring.stop(run_dir)

    # Get the historic ops
    common.dump_historic_ops(run_dir)
    common.sync_files('%s/*' % run_dir, out_dir)
def make_movies(directory): sc = settings.cluster seekwatcher = '/home/%s/bin/seekwatcher' % sc.get('user') blktrace_dir = '%s/blktrace' % directory for device in xrange (0,sc.get('osds_per_node')): common.pdsh(settings.getnodes('osds'), 'cd %s;%s -t device%s -o device%s.mpg --movie' % (blktrace_dir,seekwatcher,device,device))
def initialize(self):
    super(RbdFio, self).initialize()

    logger.info('Running scrub monitoring.')
    monitoring.start("%s/scrub_monitoring" % self.run_dir)
    self.cluster.check_scrub()
    monitoring.stop()

    logger.info('Pausing for 60s for idle monitoring.')
    monitoring.start("%s/idle_monitoring" % self.run_dir)
    time.sleep(60)
    monitoring.stop()

    common.sync_files('%s/*' % self.run_dir, self.out_dir)

    self.mkimages()

    # Create the run directory
    common.make_remote_dir(self.run_dir)

    # populate the fio files
    logger.info('Attempting to populate fio files...')
    pre_cmd = 'sudo %s --ioengine=%s --rw=write --numjobs=%s --bs=4M --size %dM %s > /dev/null' % (self.cmd_path, self.ioengine, self.numjobs, self.vol_size * 0.9, self.names)
    common.pdsh(settings.getnodes('clients'), pre_cmd).communicate()

    return True
def rmpool(self, name, profile_name): pool_profiles = self.config.get('pool_profiles', {'default': {}}) profile = pool_profiles.get(profile_name, {}) cache_profile = profile.get('cache_profile', None) if cache_profile: cache_name = '%s-cache' % name # flush and remove the overlay and such common.pdsh(settings.getnodes('head'), 'sudo %s -c %s osd tier cache-mode %s forward' % (self.ceph_cmd, self.tmp_conf, cache_name)).communicate() common.pdsh(settings.getnodes('head'), 'sudo rados -c %s -p %s cache-flush-evict-all' % (self.tmp_conf, cache_name)).communicate() common.pdsh(settings.getnodes('head'), 'sudo %s -c %s osd tier remove-overlay %s' % (self.ceph_cmd, self.tmp_conf, name)).communicate() common.pdsh(settings.getnodes('head'), 'sudo %s -c %s osd tier remove %s %s' % (self.ceph_cmd, self.tmp_conf, name, cache_name)).communicate() # delete the cache pool self.rmpool(cache_name, cache_profile) common.pdsh(settings.getnodes('head'), 'sudo %s -c %s osd pool delete %s %s --yes-i-really-really-mean-it' % (self.ceph_cmd, self.tmp_conf, name, name)).communicate()
def distribute_conf(self): nodes = settings.getnodes('head', 'clients', 'osds', 'mons', 'rgws') conf_file = self.config.get("conf_file") logger.info("Distributing %s.", conf_file) common.pdcp(nodes, '', conf_file, self.tmp_conf).communicate() common.pdsh(nodes, 'sudo mv /etc/ceph/ceph.conf /etc/ceph/ceph.conf.cbt.bak').communicate() common.pdsh(nodes, 'sudo ln -s %s /etc/ceph/ceph.conf' % self.tmp_conf).communicate()
def __init__(self, cluster, config): super(KvmRbdFio, self).__init__(cluster, config) self.concurrent_procs = config.get('concurrent_procs', 1) self.total_procs = self.concurrent_procs * len(settings.getnodes('clients').split(',')) self.time = str(config.get('time', '300')) self.ramp = str(config.get('ramp', '0')) self.iodepth = config.get('iodepth', 16) self.numjobs = config.get('numjobs', 1) self.mode = config.get('mode', 'write') self.rwmixread = config.get('rwmixread', 50) self.rwmixwrite = 100 - self.rwmixread self.ioengine = config.get('ioengine', 'libaio') self.op_size = config.get('op_size', 4194304) self.pgs = config.get('pgs', 2048) self.vol_size = config.get('vol_size', 65536) * 0.9 self.rep_size = config.get('rep_size', 1) self.rbdadd_mons = config.get('rbdadd_mons') self.rbdadd_options = config.get('rbdadd_options') self.client_ra = config.get('client_ra', '128') self.fio_cmd = config.get('fio_cmd', '/usr/bin/fio') # FIXME there are too many permutations, need to put results in SQLITE3 self.run_dir = '%s/osd_ra-%08d/client_ra-%08d/op_size-%08d/concurrent_procs-%03d/iodepth-%03d/%s' % (self.run_dir, int(self.osd_ra), int(self.client_ra), int(self.op_size), int(self.total_procs), int(self.iodepth), self.mode) self.out_dir = '%s/osd_ra-%08d/client_ra-%08d/op_size-%08d/concurrent_procs-%03d/iodepth-%03d/%s' % (self.archive_dir, int(self.osd_ra), int(self.client_ra), int(self.op_size), int(self.total_procs), int(self.iodepth), self.mode) # Make the file names string self.names = '' for i in xrange(self.concurrent_procs): self.names += '--name=/srv/rbdfio-`hostname -s`-0/cbt-kvmrbdfio-%d ' % i
def __init__(self, cluster, config): super(RawFio, self).__init__(cluster, config) # comma-separated list of block devices to use inside the client host/VM/container self.block_device_list = config.get('block_devices', '/dev/vdb' ) self.block_devices = [ d.strip() for d in self.block_device_list.split(',') ] self.concurrent_procs = config.get('concurrent_procs', len(self.block_devices)) self.total_procs = self.concurrent_procs * len(settings.getnodes('clients').split(',')) self.fio_out_format = "json" self.time = str(config.get('time', '300')) self.ramp = str(config.get('ramp', '0')) self.startdelay = config.get('startdelay', None) self.rate_iops = config.get('rate_iops', None) self.iodepth = config.get('iodepth', 16) self.direct = config.get('direct', 1) self.numjobs = config.get('numjobs', 1) self.mode = config.get('mode', 'write') self.rwmixread = config.get('rwmixread', 50) self.rwmixwrite = 100 - self.rwmixread self.ioengine = config.get('ioengine', 'libaio') self.op_size = config.get('op_size', 4194304) self.vol_size = config.get('vol_size', 65536) * 0.9 self.fio_cmd = config.get('fio_cmd', 'sudo /usr/bin/fio') # FIXME there are too many permutations, need to put results in SQLITE3 self.run_dir = '%s/raw_ra-%08d/op_size-%08d/concurrent_procs-%03d/iodepth-%03d/%s' % (self.run_dir, int(self.osd_ra), int(self.op_size), int(self.total_procs), int(self.iodepth), self.mode) self.out_dir = '%s/raw_ra-%08d/op_size-%08d/concurrent_procs-%03d/iodepth-%03d/%s' % (self.archive_dir, int(self.osd_ra), int(self.op_size), int(self.total_procs), int(self.iodepth), self.mode)
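# Illustrative only: a minimal sketch of the config dict RawFio.__init__ above
# consumes.  The keys mirror its config.get() calls; the values are hypothetical.
raw_fio_config = {
    'block_devices': '/dev/vdb,/dev/vdc',  # comma-separated, split on ','
    'concurrent_procs': 2,                 # defaults to len(block_devices)
    'time': 300,
    'ramp': 30,
    'iodepth': 32,
    'numjobs': 1,
    'mode': 'randrw',
    'rwmixread': 70,                       # rwmixwrite is derived as 100 - rwmixread
    'ioengine': 'libaio',
    'op_size': 4096,
    'vol_size': 16384,                     # scaled by 0.9 before use
    'fio_cmd': 'sudo /usr/bin/fio',
}
# benchmark = RawFio(cluster, raw_fio_config)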
def __init__(self, cluster, config): super(RbdFio, self).__init__(cluster, config) # FIXME there are too many permutations, need to put results in SQLITE3 self.cmd_path = config.get('cmd_path', '/usr/bin/fio') self.pool_profile = config.get('pool_profile', 'default') self.concurrent_procs = config.get('concurrent_procs', 1) self.total_procs = self.concurrent_procs * len(settings.getnodes('clients').split(',')) self.time = str(config.get('time', None)) self.ramp = str(config.get('ramp', None)) self.iodepth = config.get('iodepth', 16) self.numjobs = config.get('numjobs', 1) self.end_fsync = str(config.get('end_fsync', 0)) self.mode = config.get('mode', 'write') self.rwmixread = config.get('rwmixread', 50) self.rwmixwrite = 100 - self.rwmixread self.log_avg_msec = config.get('log_avg_msec', None) self.ioengine = config.get('ioengine', 'libaio') self.op_size = config.get('op_size', 4194304) self.vol_size = config.get('vol_size', 65536) self.vol_order = config.get('vol_order', 22) self.random_distribution = config.get('random_distribution', None) self.rbdadd_mons = config.get('rbdadd_mons') self.rbdadd_options = config.get('rbdadd_options', 'share') self.client_ra = config.get('client_ra', 128) self.poolname = "cbt-kernelrbdfio" self.run_dir = '%s/rbdfio/osd_ra-%08d/client_ra-%08d/op_size-%08d/concurrent_procs-%03d/iodepth-%03d/%s' % (self.run_dir, int(self.osd_ra), int(self.client_ra), int(self.op_size), int(self.concurrent_procs), int(self.iodepth), self.mode) self.out_dir = '%s/rbdfio/osd_ra-%08d/client_ra-%08d/op_size-%08d/concurrent_procs-%03d/iodepth-%03d/%s' % (self.archive_dir, int(self.osd_ra), int(self.client_ra), int(self.op_size), int(self.concurrent_procs), int(self.iodepth), self.mode) # Make the file names string self.names = '' for i in xrange(self.concurrent_procs): self.names += '--name=%s/cbt-kernelrbdfio-`hostname -s`/cbt-kernelrbdfio-%d ' % (self.cluster.mnt_dir, i)
def run(self): # First create a credential file for each gateway self.mkcredfiles() # We'll always drop caches for rados bench self.dropcaches() # dump the cluster config self.cluster.dump_config(self.run_dir) # Run the backfill testing thread if requested if 'recovery_test' in self.cluster.config: recovery_callback = self.recovery_callback self.cluster.create_recovery_test(self.run_dir, recovery_callback) # Run getput monitoring.start(self.run_dir) logger.info('Running getput %s test.' % self.test) ps = [] for i in xrange(0, len(self.auth_urls)): cmd = self.mkgetputcmd("%s/gw%02d.cred" % (self.run_dir, i), i) p = common.pdsh(settings.getnodes('clients'), cmd) ps.append(p) for p in ps: p.wait() monitoring.stop(self.run_dir) # If we were doing recovery, wait until it's done. if 'recovery_test' in self.cluster.config: self.cluster.wait_recovery_done() # Finally, get the historic ops self.cluster.dump_historic_ops(self.run_dir) common.sync_files('%s/*' % self.run_dir, self.out_dir)
def markdown(self): for osdnum in self.config.get('osds'): lcmd = self.logcmd("Marking OSD %s down." % osdnum) common.pdsh( settings.getnodes('head'), '%s -c %s osd down %s;%s' % (self.ceph_cmd, self.cluster.tmp_conf, osdnum, lcmd)).communicate() lcmd = self.logcmd("Marking OSD %s out." % osdnum) common.pdsh( settings.getnodes('head'), '%s -c %s osd out %s;%s' % (self.ceph_cmd, self.cluster.tmp_conf, osdnum, lcmd)).communicate() common.pdsh( settings.getnodes('head'), self.logcmd( 'Waiting for the cluster to break and heal')).communicate() self.state = 'osdout'
def osdin(self): # Wait until the cluster is healthy. ret = self.cluster.check_health( self.health_checklist, "%s/recovery.log" % self.config.get('run_dir')) if self.inhealthtries < self.maxhealthtries and ret == 0: self.inhealthtries = self.inhealthtries + 1 return # Cluster hasn't become unhealthy yet. if ret == 0: common.pdsh( settings.getnodes('head'), self.logcmd('Cluster never went unhealthy.')).communicate() else: common.pdsh( settings.getnodes('head'), self.logcmd('Cluster appears to have healed.')).communicate() self.state = "post"
def run(self): self.haltrequest.clear() self.stoprequest.clear() while not self.haltrequest.isSet(): self.states[self.state]() common.pdsh( settings.getnodes('head'), self.logcmd('Exiting recovery test thread. Last state was: %s' % self.state)).communicate()
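# A sketch (not the verbatim constructor) of how the state table consumed by the
# run() loop above could be wired, based on the transitions the handlers in this
# section set: pre -> markdown -> osdout -> osdin -> post -> done.  A 'done'
# handler that sets haltrequest is assumed so the loop can exit.
def __init__(self, config, cluster):
    ...
    self.state = 'pre'
    self.states = {'pre': self.pre, 'markdown': self.markdown,
                   'osdout': self.osdout, 'osdin': self.osdin,
                   'post': self.post, 'done': self.done}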
def check_scrub(self): logger.info('Waiting until Scrubbing completes...') while True: stdout, stderr = common.pdsh(settings.getnodes('head'), '%s -c %s pg dump | cut -f 16 | grep "0.000000" | wc -l' % (self.ceph_cmd, self.tmp_conf)).communicate() if " 0\n" in stdout: break else: logger.info(stdout) time.sleep(1)
def clean_remote_dir(remote_dir):
    print("cleaning remote dir {0}".format(remote_dir))
    if remote_dir == "/" or not os.path.isabs(remote_dir):
        raise SystemExit("Cleaning the remote dir doesn't seem safe, bailing.")

    nodes = settings.getnodes('clients', 'osds', 'mons', 'rgws', 'mds')
    pdsh(nodes, 'if [ -d "%s" ]; then rm -rf %s; fi' % (remote_dir, remote_dir),
         continue_if_error=False).communicate()
def mkimage(self, name, size, pool, data_pool, order):
    dp_option = ''
    if data_pool:
        dp_option = "--data-pool %s" % data_pool
    common.pdsh(settings.getnodes('head'),
                '%s -c %s create %s --size %s --pool %s %s --order %s' % (self.rbd_cmd, self.tmp_conf, name, size, pool, dp_option, order)).communicate()
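# A hypothetical caller, shown only to illustrate how the mkimage() helper above
# can replace the direct 'rbd create' pdsh calls used elsewhere in this section.
# self.data_pool is an assumption here; passing None simply omits --data-pool.
def mkimages(self):
    monitoring.start("%s/pool_monitoring" % self.run_dir)
    self.cluster.rmpool(self.poolname, self.pool_profile)
    self.cluster.mkpool(self.poolname, self.pool_profile)
    for node in settings.getnodes('clients').split(','):
        node = node.rpartition("@")[2]
        self.cluster.mkimage('cbt-librbdfio-%s' % node, self.vol_size,
                             self.poolname, self.data_pool, self.vol_order)
    monitoring.stop()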
def check_scrub(self): print 'Waiting until Scrubbing completes...' while True: stdout, stderr = common.pdsh(settings.getnodes('head'), 'ceph -c %s pg dump | cut -f 16 | grep "0.000000" | wc -l' % self.tmp_conf).communicate() if " 0\n" in stdout: break else: print stdout time.sleep(1)
def run(self): super(KvmRbdFio, self).run() # Set client readahead self.set_client_param('read_ahead_kb', self.client_ra) clnts = settings.getnodes('clients') # We'll always drop caches for rados bench self.dropcaches() monitoring.start(self.run_dir) time.sleep(5) # Run the backfill testing thread if requested if 'recovery_test' in self.cluster.config: recovery_callback = self.recovery_callback self.cluster.create_recovery_test(self.run_dir, recovery_callback) logger.info('Starting rbd fio %s test.', self.mode) fio_process_list = [] for i in range(self.concurrent_procs): b = self.block_devices[i % len(self.block_devices)] bnm = os.path.basename(b) mtpt = '/srv/rbdfio-`hostname -s`-%s' % bnm fiopath = os.path.join(mtpt, 'fio%d.img' % i) out_file = '%s/output.%d' % (self.run_dir, i) fio_cmd = 'sudo %s' % self.fio_cmd fio_cmd += ' --rw=%s' % self.mode if (self.mode == 'readwrite' or self.mode == 'randrw'): fio_cmd += ' --rwmixread=%s --rwmixwrite=%s' % ( self.rwmixread, self.rwmixwrite) fio_cmd += ' --ioengine=%s' % self.ioengine fio_cmd += ' --runtime=%s' % self.time fio_cmd += ' --ramp_time=%s' % self.ramp if self.startdelay: fio_cmd += ' --startdelay=%s' % self.startdelay if self.rate_iops: fio_cmd += ' --rate_iops=%s' % self.rate_iops fio_cmd += ' --numjobs=%s' % self.numjobs fio_cmd += ' --direct=1' fio_cmd += ' --bs=%dB' % self.op_size fio_cmd += ' --iodepth=%d' % self.iodepth fio_cmd += ' --size=%dM' % self.vol_size fio_cmd += ' --write_iops_log=%s' % out_file fio_cmd += ' --write_bw_log=%s' % out_file fio_cmd += ' --write_lat_log=%s' % out_file if 'recovery_test' in self.cluster.config: fio_cmd += ' --time_based' fio_cmd += ' --name=%s > %s' % (fiopath, out_file) fio_process_list.append( common.pdsh(clnts, fio_cmd, continue_if_error=False)) for p in fio_process_list: p.communicate() monitoring.stop(self.run_dir) logger.info('Finished rbd fio test') common.sync_files('%s/*' % self.run_dir, self.out_dir)
def prefill_data(self):
    # populate the fio files
    ps = []
    logger.info('Attempting to populate fio files...')
    for ep_num in range(self.endpoints_per_client):
        p = common.pdsh(settings.getnodes('clients'), self.prefill_command(ep_num))
        ps.append(p)
    for p in ps:
        p.wait()
def distribute_conf(self):
    nodes = settings.getnodes('head', 'clients', 'osds', 'mons', 'rgws')
    conf_file = self.config.get("conf_file")
    print "Distributing %s." % conf_file
    common.pdcp(nodes, '', conf_file, self.tmp_conf).communicate()
    common.pdsh(nodes, 'sudo mv /etc/ceph/ceph.conf /etc/ceph/ceph.conf.cbt.bak', True).communicate()
    common.pdsh(nodes, 'sudo ln -s %s /etc/ceph/ceph.conf' % self.tmp_conf).communicate()
def start(directory): nodes = settings.getnodes('clients', 'osds', 'mons', 'rgws') collectl_dir = '%s/collectl' % directory perf_dir = '%s/perf' % directory blktrace_dir = '%s/blktrace' % directory # collectl common.pdsh(nodes, 'mkdir -p -m0755 -- %s' % collectl_dir) common.pdsh(nodes, 'collectl -s+mYZ -D -i 1:10 -F0 -f %s' % collectl_dir)
def run(self):
    super(RbdFio, self).run()

    # Set client readahead
    self.set_client_param('read_ahead_kb', self.client_ra)

    # We'll always drop caches for rados bench
    self.dropcaches()

    monitoring.start(self.run_dir)

    # Run the backfill testing thread if requested
    if 'recovery_test' in self.cluster.config:
        recovery_callback = self.recovery_callback
        self.cluster.create_recovery_test(self.run_dir, recovery_callback)

    time.sleep(5)
    out_file = '%s/output' % self.run_dir
    fio_cmd = 'sudo %s' % (self.cmd_path_full)
    fio_cmd += ' --rw=%s' % self.mode
    if (self.mode == 'readwrite' or self.mode == 'randrw'):
        fio_cmd += ' --rwmixread=%s --rwmixwrite=%s' % (self.rwmixread, self.rwmixwrite)
    fio_cmd += ' --ioengine=%s' % self.ioengine
    if self.time is not None:
        fio_cmd += ' --runtime=%s' % self.time
    if self.ramp is not None:
        fio_cmd += ' --ramp_time=%s' % self.ramp
    fio_cmd += ' --numjobs=%s' % self.numjobs
    fio_cmd += ' --direct=%s' % self.direct
    fio_cmd += ' --bs=%dB' % self.op_size
    fio_cmd += ' --iodepth=%d' % self.iodepth
    if self.vol_size:
        fio_cmd += ' --size=%dM' % (int(self.vol_size) * 0.9)
    fio_cmd += ' --write_iops_log=%s' % out_file
    fio_cmd += ' --write_bw_log=%s' % out_file
    fio_cmd += ' --write_lat_log=%s' % out_file
    if 'recovery_test' in self.cluster.config:
        fio_cmd += ' --time_based'
    if self.random_distribution is not None:
        fio_cmd += ' --random_distribution=%s' % self.random_distribution
    if self.log_avg_msec is not None:
        fio_cmd += ' --log_avg_msec=%s' % self.log_avg_msec
    # The job names and the output redirection must come last.
    fio_cmd += ' %s > %s' % (self.names, out_file)
    logger.info('Running rbd fio %s test.', self.mode)
    common.pdsh(settings.getnodes('clients'), fio_cmd).communicate()

    # If we were doing recovery, wait until it's done.
    if 'recovery_test' in self.cluster.config:
        self.cluster.wait_recovery_done()

    monitoring.stop(self.run_dir)

    # Finally, get the historic ops
    self.cluster.dump_historic_ops(self.run_dir)
    common.sync_files('%s/*' % self.run_dir, self.out_dir)
def make_movies(directory): sc = settings.cluster seekwatcher = '/home/%s/bin/seekwatcher' % sc.get('user') blktrace_dir = '%s/blktrace' % directory for device in xrange(0, sc.get('osds_per_node')): common.pdsh( settings.getnodes('osds'), 'cd %s;%s -t device%s -o device%s.mpg --movie' % (blktrace_dir, seekwatcher, device, device))
def run(self):
    print 'Setting OSD Read Ahead to: %s' % self.osd_ra
    self.cluster.set_osd_param('read_ahead_kb', self.osd_ra)

    print 'Cleaning existing temporary run directory: %s' % self.run_dir
    common.pdsh(settings.getnodes('clients', 'osds', 'mons', 'rgws'), 'sudo rm -rf %s' % self.run_dir).communicate()
    if self.valgrind is not None:
        print 'Adding valgrind to the command path.'
        self.cmd_path_full = common.setup_valgrind(self.valgrind, self.getclass(), self.run_dir)
    # Set the full command path
    self.cmd_path_full += self.cmd_path
def post(self): if self.stoprequest.isSet(): common.pdsh(settings.getnodes('head'), self.logcmd('Cluster is healthy, but stoprequest is set, finishing now.')).communicate() self.haltrequest.set() return if self.config.get("repeat", False): # reset counters self.outhealthtries = 0 self.inhealthtries = 0 common.pdsh(settings.getnodes('head'), self.logcmd('Cluster is healthy, but repeat is set. Moving to "markdown" state.')).communicate() self.state = "markdown" return post_time = self.config.get("post_time", 60) common.pdsh(settings.getnodes('head'), self.logcmd('Cluster is healthy, completion in %s seconds.' % post_time)).communicate() time.sleep(post_time) self.state = "done"
def cleanup(self): super(KvmRbdFio, self).cleanup() clnts = settings.getnodes('clients') common.pdsh(clnts, 'killall fio').communicate() time.sleep(3) common.pdsh(clnts, 'killall -9 fio').communicate() time.sleep(3) common.pdsh(clnts, 'rm -rf /srv/*/*', continue_if_error=False).communicate() common.pdsh(clnts, 'sudo umount /srv/* || echo -n').communicate()
def distribute_conf(self): nodes = settings.getnodes('head', 'clients', 'osds', 'mons', 'rgws') conf_file = self.config.get("conf_file") logger.info("Distributing %s.", conf_file) common.pdcp(nodes, '', conf_file, self.tmp_conf).communicate() common.pdsh(nodes, 'sudo mv /etc/ceph/ceph.conf /etc/ceph/ceph.conf.cbt.bak' ).communicate() common.pdsh(nodes, 'sudo ln -s %s /etc/ceph/ceph.conf' % self.tmp_conf).communicate()
def mkpools(self): monitoring.start("%s/pool_monitoring" % self.run_dir) for i in xrange(self.concurrent_procs): for node in settings.getnodes('clients').split(','): node = node.rpartition("@")[2] self.cluster.rmpool('rados-bench-%s-%s' % (node, i), self.pool_profile) self.cluster.mkpool('rados-bench-%s-%s' % (node, i), self.pool_profile) monitoring.stop()
def __init__(self, config): super(Ceph, self).__init__(config) self.health_wait = config.get('health_wait', 5) self.ceph_osd_cmd = config.get('ceph-osd_cmd', '/usr/bin/ceph-osd') self.ceph_mon_cmd = config.get('ceph-mon_cmd', '/usr/bin/ceph-mon') self.ceph_run_cmd = config.get('ceph-run_cmd', '/usr/bin/ceph-run') self.ceph_rgw_cmd = config.get('ceph-rgw_cmd', '/usr/bin/radosgw') self.ceph_mgr_cmd = config.get('ceph-mgr_cmd', '/usr/bin/ceph-mgr') self.ceph_mds_cmd = config.get('ceph-mds_cmd', '/usr/bin/ceph-mds') self.ceph_authtool_cmd = config.get('ceph-authtool_cmd', '/usr/bin/ceph-authtool') self.radosgw_admin_cmd = config.get('radosgw-admin_cmd', '/usr/bin/radosgw-admin') self.ceph_cmd = config.get('ceph_cmd', '/usr/bin/ceph') self.ceph_fuse_cmd = config.get('ceph-fuse_cmd', '/usr/bin/ceph-fuse') self.rados_cmd = config.get('rados_cmd', '/usr/bin/rados') self.rbd_cmd = config.get('rbd_cmd', '/usr/bin/rbd') self.rbd_nbd_cmd = config.get('rbd-nbd_cmd', '/usr/bin/rbd-nbd') self.rbd_fuse_cmd = config.get('rbd-fuse_cmd', '/usr/bin/rbd-fuse') self.mount_cmd = config.get('mount_cmd', '/usr/sbin/ceph.mount') self.log_dir = config.get('log_dir', "%s/log" % self.tmp_dir) self.pid_dir = config.get('pid_dir', "%s/pid" % self.tmp_dir) self.core_dir = config.get('core_dir', "%s/core" % self.tmp_dir) self.monitoring_dir = "%s/monitoring" % self.tmp_dir self.osdmap_fn = "%s/osdmap" % self.tmp_dir self.monmap_fn = "%s/monmap" % self.tmp_dir self.use_existing = config.get('use_existing', True) self.newstore_block = config.get('newstore_block', False) self.version_compat = config.get('version_compat', '') # these parameters control parallel OSD build self.ceph_osd_online_rate = config.get('osd_online_rate', 10) self.ceph_osd_online_tmo = config.get('osd_online_timeout', 120) self.ceph_osd_parallel_creates = config.get('osd_parallel_creates') self.client_keyring = '/etc/ceph/ceph.keyring' self.client_secret = '/etc/ceph/ceph.secret' # If making the cluster, use the ceph.conf file distributed by initialize to the tmp_dir self.tmp_conf = '%s/ceph.conf' % self.tmp_dir # If using an existing cluster, defualt to /etc/ceph/ceph.conf if self.use_existing: self.tmp_conf = self.config.get('conf_file') self.osd_valgrind = config.get('osd_valgrind', None) self.mon_valgrind = config.get('mon_valgrind', None) self.rgw_valgrind = config.get('rgw_valgrind', None) self.mgr_valgrind = config.get('mgr_valgrind', None) self.tiering = config.get('tiering', False) self.ruleset_map = {} self.cur_ruleset = 1 self.idle_duration = config.get('idle_duration', 0) self.use_existing = config.get('use_existing', True) self.stoprequest = threading.Event() self.haltrequest = threading.Event() self.urls = [] self.auth_urls = [] self.osd_count = config.get('osds_per_node') * len(settings.getnodes('osds'))
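# Illustrative only: a pared-down cluster config dict exercising a few of the
# keys Ceph.__init__ above reads via config.get().  Values are hypothetical.
ceph_cluster_config = {
    'use_existing': False,
    'conf_file': '/etc/ceph/ceph.conf',   # only consulted when use_existing is True
    'ceph_cmd': '/usr/bin/ceph',
    'rbd_cmd': '/usr/bin/rbd',
    'osds_per_node': 4,                    # multiplied by len(getnodes('osds'))
    'health_wait': 5,
    'idle_duration': 60,
    'osd_valgrind': None,
    'tiering': False,
}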
def _run(self, mode, run_dir, out_dir): # We'll always drop caches for rados bench self.dropcaches() if self.concurrent_ops: concurrent_ops_str = '--concurrent-ios %s' % self.concurrent_ops #determine rados version rados_version_str = subprocess.check_output(["rados", "-v"]) m = re.findall("version (\d+)", rados_version_str) rados_version = int(m[0]) if mode in ['write'] or rados_version < 9: op_size_str = '-b %s' % self.op_size else: op_size_str = '' common.make_remote_dir(run_dir) # dump the cluster config self.cluster.dump_config(run_dir) # Run the backfill testing thread if requested if 'recovery_test' in self.cluster.config: recovery_callback = self.recovery_callback self.cluster.create_recovery_test(run_dir, recovery_callback) # Run rados bench monitoring.start(run_dir) logger.info('Running radosbench %s test.' % mode) ps = [] for i in xrange(self.concurrent_procs): out_file = '%s/output.%s' % (run_dir, i) objecter_log = '%s/objecter.%s.log' % (run_dir, i) # default behavior is to use a single storage pool pool_name = self.pool run_name = '--run-name %s`hostname -s`-%s' % (self.object_set_id, i) if self.pool_per_proc: # support previous behavior of 1 storage pool per rados process pool_name = 'rados-bench-`hostname -s`-%s' % i run_name = '' rados_bench_cmd = '%s -c %s -p %s bench %s %s %s %s %s --no-cleanup 2> %s > %s' % \ (self.cmd_path_full, self.tmp_conf, pool_name, op_size_str, self.time, mode, concurrent_ops_str, run_name, objecter_log, out_file) p = common.pdsh(settings.getnodes('clients'), rados_bench_cmd) ps.append(p) for p in ps: p.wait() monitoring.stop(run_dir) # If we were doing recovery, wait until it's done. if 'recovery_test' in self.cluster.config: self.cluster.wait_recovery_done() # Finally, get the historic ops self.cluster.dump_historic_ops(run_dir) common.sync_files('%s/*' % run_dir, out_dir)
def run(self): super(RawFio, self).run() # Set client readahead clnts = settings.getnodes('clients') # We'll always drop caches for rados bench self.dropcaches() monitoring.start(self.run_dir) time.sleep(5) logger.info('Starting raw fio %s test.', self.mode) fio_process_list = [] for i in range(self.concurrent_procs): b = self.block_devices[i % len(self.block_devices)] fiopath = b out_file = '%s/output.%d' % (self.run_dir, i) fio_cmd = 'sudo %s' % self.fio_cmd fio_cmd += ' --rw=%s' % self.mode if (self.mode == 'readwrite' or self.mode == 'randrw'): fio_cmd += ' --rwmixread=%s --rwmixwrite=%s' % ( self.rwmixread, self.rwmixwrite) fio_cmd += ' --ioengine=%s' % self.ioengine fio_cmd += ' --runtime=%s' % self.time fio_cmd += ' --ramp_time=%s' % self.ramp if self.startdelay: fio_cmd += ' --startdelay=%s' % self.startdelay if self.rate_iops: fio_cmd += ' --rate_iops=%s' % self.rate_iops fio_cmd += ' --numjobs=%s' % self.numjobs fio_cmd += ' --direct=%s' % self.direct fio_cmd += ' --bs=%dB' % self.op_size fio_cmd += ' --iodepth=%d' % self.iodepth fio_cmd += ' --size=%dM' % self.vol_size if self.log_iops: fio_cmd += ' --write_iops_log=%s' % out_file if self.log_bw: fio_cmd += ' --write_bw_log=%s' % out_file if self.log_lat: fio_cmd += ' --write_lat_log=%s' % out_file fio_cmd += ' --output-format=%s' % self.fio_out_format if 'recovery_test' in self.cluster.config: fio_cmd += ' --time_based' fio_cmd += ' --name=%s > %s' % (fiopath, out_file) logger.debug("FIO CMD: %s" % fio_cmd) fio_process_list.append( common.pdsh(clnts, fio_cmd, continue_if_error=False)) for p in fio_process_list: p.communicate() monitoring.stop(self.run_dir) logger.info('Finished raw fio test') common.sync_files('%s/*' % self.run_dir, self.out_dir)
def start_rgw(self): user = settings.cluster.get('user') rgwhosts = settings.cluster.get('rgws') if not rgwhosts: return # If we are starting rGW, make the RGW pools self.make_rgw_pools() for rgwhost, gateways in rgwhosts.iteritems(): for rgwname, rgwsettings in gateways.iteritems(): host = rgwsettings.get('host', rgwhost) port = rgwsettings.get('port', None) ssl_certificate = rgwsettings.get('ssl_certificate', None) # Build the auth_url auth_url = "http://" if ssl_certificate is None else "https://" auth_url += host auth_url += ":7480" if port is None else ":%s" % port auth_url += "/auth/v1.0" self.auth_urls.append(auth_url) # set the rgw_frontends rgw_frontends = None if ssl_certificate is not None: rgw_frontends = "civetweb ssl_certificate=%s" % ssl_certificate if port is not None: if rgw_frontends is None: rgw_frontends = "civetweb" rgw_frontends += " port=%s" % port cmd = '%s -c %s -n %s --log-file=%s/rgw.log' % (self.ceph_rgw_cmd, self.tmp_conf, rgwname, self.log_dir) if rgw_frontends is not None: cmd += " --rgw-frontends='%s'" % rgw_frontends if self.rgw_valgrind: cmd = "%s %s" % (common.setup_valgrind(self.rgw_valgrind, 'rgw.%s' % host, self.tmp_dir), cmd) else: cmd = '%s %s' % (self.ceph_run_cmd, cmd) if user: pdshhost = '%s@%s' % (user, rgwhost) common.pdsh(pdshhost, 'sudo sh -c "ulimit -n 16384 && ulimit -c unlimited && exec %s"' % cmd).communicate() # set min_size of pools to 1, when there is only one osd num_osds = len(settings.cluster.get('osds')) rgw_default_pools = ['.rgw.root', 'default.rgw.control', 'default.rgw.meta', 'default.rgw.log'] pool_min_repl_size = 1 if num_osds == 1: time.sleep(5) for pool in rgw_default_pools: common.pdsh(settings.getnodes('head'), 'sudo %s -c %s osd pool set %s min_size %d' % (self.ceph_cmd, self.tmp_conf, pool, pool_min_repl_size), continue_if_error=False).communicate() time.sleep(5)
def sync_files(remote_dir, local_dir):
    nodes = settings.getnodes('clients', 'osds', 'mons', 'rgws', 'mds')

    if not os.path.exists(local_dir):
        os.makedirs(local_dir)
    # Make sure the chown completes before pulling the files back.
    pdsh(nodes, 'sudo chown -R %s.%s %s' % (settings.cluster.get('user'), settings.cluster.get('user'), remote_dir)).communicate()
    rpdcp(nodes, '-r', remote_dir, local_dir).communicate()
def mkpools(self): with monitoring.monitor("%s/pool_monitoring" % self.run_dir): if self.pool_per_proc: # allow use of a separate storage pool per process for i in range(self.concurrent_procs): for node in settings.getnodes('clients').split(','): node = node.rpartition("@")[2] self.cluster.rmpool('rados-bench-%s-%s' % (node, i), self.pool_profile) self.cluster.mkpool('rados-bench-%s-%s' % (node, i), self.pool_profile, 'radosbench') else: # the default behavior is to use a single Ceph storage pool for all rados bench processes self.cluster.rmpool('rados-bench-cbt', self.pool_profile) self.cluster.mkpool('rados-bench-cbt', self.pool_profile, 'radosbench')
def sync_files(remote_dir, local_dir): nodes = settings.getnodes('clients', 'osds', 'mons', 'rgws', 'mds') if not os.path.exists(local_dir): os.makedirs(local_dir) if 'user' in settings.cluster: pdsh(nodes, 'sudo chown -R {0}.{0} {1}'.format(settings.cluster['user'], remote_dir), continue_if_error=False).communicate() rpdcp(nodes, '-r', remote_dir, local_dir).communicate()
def setup_fs(self): sc = settings.cluster fs = sc.get('fs') mkfs_opts = sc.get('mkfs_opts', '') mount_opts = sc.get('mount_opts', '') if fs == '': settings.shutdown("No OSD filesystem specified. Exiting.") mkfs_threads = [] for device in xrange(0, sc.get('osds_per_node')): osds = settings.getnodes('osds') common.pdsh( osds, 'sudo umount /dev/disk/by-partlabel/osd-device-%s-data' % device).communicate() common.pdsh( osds, 'sudo rm -rf %s/osd-device-%s-data' % (self.mnt_dir, device)).communicate() common.pdsh( osds, 'sudo mkdir -p -m0755 -- %s/osd-device-%s-data' % (self.mnt_dir, device)).communicate() if fs == 'tmpfs': logger.info('using tmpfs osds, not creating a file system.') elif fs == 'zfs': logger.info('ruhoh, zfs detected. No mkfs for you!') common.pdsh(osds, 'sudo zpool destroy osd-device-%s-data' % device).communicate() common.pdsh( osds, 'sudo zpool create -f -O xattr=sa -m legacy osd-device-%s-data /dev/disk/by-partlabel/osd-device-%s-data' % (device, device)).communicate() common.pdsh( osds, 'sudo zpool add osd-device-%s-data log /dev/disk/by-partlabel/osd-device-%s-zil' % (device, device)).communicate() common.pdsh( osds, 'sudo mount %s -t zfs osd-device-%s-data %s/osd-device-%s-data' % (mount_opts, device, self.mnt_dir, device)).communicate() else: # do mkfs and mount in 1 long command # alternative is to wait until make_osds to mount it mkfs_cmd = 'sudo sh -c "mkfs.%s %s /dev/disk/by-partlabel/osd-device-%s-data ; ' % ( fs, mkfs_opts, device) mkfs_cmd += 'mount %s -t %s /dev/disk/by-partlabel/osd-device-%s-data %s/osd-device-%s-data"' % ( mount_opts, fs, device, self.mnt_dir, device) mkfs_threads.append((device, common.pdsh(osds, mkfs_cmd))) for device, t in mkfs_threads: # for tmpfs and zfs cases, thread list is empty logger.info('for device %d on all hosts awaiting mkfs and mount' % device) t.communicate()
def run(self): if self.osd_ra and self.osd_ra_changed: logger.info('Setting OSD Read Ahead to: %s', self.osd_ra) self.cluster.set_osd_param('read_ahead_kb', self.osd_ra) logger.debug('Cleaning existing temporary run directory: %s', self.run_dir) common.pdsh(settings.getnodes('clients', 'osds', 'mons', 'rgws'), 'sudo rm -rf %s' % self.run_dir).communicate() if self.valgrind is not None: logger.debug('Adding valgrind to the command path.') self.cmd_path_full = common.setup_valgrind(self.valgrind, self.getclass(), self.run_dir) # Set the full command path self.cmd_path_full += self.cmd_path
def start(directory): nodes = settings.getnodes('clients', 'osds', 'mons', 'rgws') collectl_dir = '%s/collectl' % directory # perf_dir = '%s/perf' % directory # blktrace_dir = '%s/blktrace' % directory # collectl rawdskfilt = 'cciss/c\d+d\d+ |hd[ab] | sd[a-z]+ |dm-\d+ |xvd[a-z] |fio[a-z]+ | vd[a-z]+ |emcpower[a-z]+ |psv\d+ |nvme[0-9]n[0-9]+p[0-9]+ ' common.pdsh(nodes, 'mkdir -p -m0755 -- %s' % collectl_dir) common.pdsh( nodes, 'collectl -s+mYZ -i 1:10 --rawdskfilt "%s" -F0 -f %s' % (rawdskfilt, collectl_dir))
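# A minimal sketch of how the benchmarks in this section pair these helpers:
# start collectl into the run directory, do the timed work, then stop and
# archive.  The run_benchmark callable and run_dir/out_dir names are hypothetical;
# the existing callers do not use try/finally.
monitoring.start(run_dir)
try:
    run_benchmark()          # e.g. the pdsh'd fio or rados bench invocation
finally:
    monitoring.stop(run_dir)
common.sync_files('%s/*' % run_dir, out_dir)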
def initialize(self):
    super(StdFioBench, self).initialize()
    for i in xrange(1):
        letter = string.ascii_lowercase[i + 1]
        if not self.use_existing:
            common.pdsh(settings.getnodes('clients'), 'sudo umount -f %s' % (self.block_dev_name)).communicate()
            common.pdsh(settings.getnodes('clients'), 'sudo mkfs.%s -f %s' % (self.filesystem, self.block_dev_name)).communicate()
            common.pdsh(settings.getnodes('clients'), 'sudo mkdir -p %s ' % (self.mount_point_name)).communicate()
            common.pdsh(settings.getnodes('clients'), 'sudo mount -t %s -o noatime %s %s' % (self.filesystem, self.block_dev_name, self.mount_point_name)).communicate()
        common.pdsh(settings.getnodes('clients'), 'sudo mkdir -p %s/`hostname -s`-%d' % (self.mount_point_name, i)).communicate()

    # Create the run directory
    common.make_remote_dir(self.run_dir)

    # populate the fio files
    logger.info('Attempting to populate fio files...')
    pre_cmd = 'sudo %s --rw=write --ioengine=sync --numjobs=%s --bs=8M --size %dM %s > /dev/null ' % (self.fio_cmd, self.numjobs, self.vol_size, self.names)
    common.pdsh(settings.getnodes('clients'), pre_cmd).communicate()
def osdout(self):
    ret = self.cluster.check_health("%s/recovery.log" % self.config.get('run_dir'))
    common.pdsh(settings.getnodes('head'), self.logcmd("ret: %s" % ret)).communicate()

    if self.outhealthtries < self.maxhealthtries and ret == 0:
        self.outhealthtries = self.outhealthtries + 1
        return  # Cluster hasn't become unhealthy yet.

    if ret == 0:
        common.pdsh(settings.getnodes('head'), self.logcmd('Cluster never went unhealthy.')).communicate()
    else:
        common.pdsh(settings.getnodes('head'), self.logcmd('Cluster appears to have healed.')).communicate()

    lcmd = self.logcmd("Unsetting the ceph osd noup flag")
    common.pdsh(settings.getnodes('head'), 'ceph -c %s osd unset noup;%s' % (self.cluster.tmp_conf, lcmd)).communicate()
    for osdnum in self.config.get('osds'):
        lcmd = self.logcmd("Marking OSD %s up." % osdnum)
        common.pdsh(settings.getnodes('head'), 'ceph -c %s osd up %s;%s' % (self.cluster.tmp_conf, osdnum, lcmd)).communicate()
        lcmd = self.logcmd("Marking OSD %s in." % osdnum)
        common.pdsh(settings.getnodes('head'), 'ceph -c %s osd in %s;%s' % (self.cluster.tmp_conf, osdnum, lcmd)).communicate()
    self.state = "osdin"