Example #1
    def initialize(self): 
        super(RawFio, self).initialize()
        common.pdsh(settings.getnodes('clients'),
                    'sudo rm -rf %s' % self.run_dir,
                    continue_if_error=False).communicate()
        common.make_remote_dir(self.run_dir)
        clnts = settings.getnodes('clients')
        logger.info('creating mountpoints...')

        logger.info('Attempting to initialize fio files...')
        initializer_list = []
        for i in range(self.concurrent_procs):
            b = self.block_devices[i % len(self.block_devices)]
            fiopath = b
            pre_cmd = 'sudo %s --rw=write -ioengine=%s --bs=%s ' % (self.fio_cmd, self.ioengine, self.op_size)
            pre_cmd = '%s --size %dM --name=%s --output-format=%s> /dev/null' % (
                       pre_cmd, self.vol_size, fiopath, self.fio_out_format)
            initializer_list.append(common.pdsh(clnts, pre_cmd,
                                    continue_if_error=False))
        for p in initializer_list:
            p.communicate()

        # Create the run directory
        common.pdsh(clnts, 'rm -rf %s' % self.run_dir,
                    continue_if_error=False).communicate()
        common.make_remote_dir(self.run_dir)
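
For reference, every example on this page issues its remote commands through common.pdsh and then blocks on the returned object's communicate(). A rough, hypothetical sketch of what such a helper could look like, assuming it simply wraps the pdsh CLI over ssh (this is not the project's actual implementation, and the flags shown are assumptions):

import subprocess

def pdsh(nodes, command, continue_if_error=True):
    # Hypothetical wrapper: run `command` on the comma-separated `nodes` list via
    # the pdsh CLI and hand the Popen back so callers can decide when to block
    # with .communicate(). The real helper presumably also acts on
    # continue_if_error; it is accepted here only for signature parity.
    args = ['pdsh', '-R', 'ssh', '-w', nodes, command]
    return subprocess.Popen(args, stdout=subprocess.PIPE, stderr=subprocess.PIPE)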
Example #2
    def initialize(self): 
        pass
#        self.cleanup()
#        super(KvmRbdFio, self).initialize()
        common.setup_cluster()
#        common.setup_ceph()

        # Setup the pools
#        common.pdsh(settings.cluster.get('head'), 'sudo ceph osd pool create rbdfio %d %d' % (self.pgs, self.pgs)).communicate()
#        common.pdsh(settings.cluster.get('head'), 'sudo ceph osd pool set rbdfio size 1').communicate()
#        print 'Checking Healh after pool creation.'
#        common.check_health()

#        common.pdsh(settings.cluster.get('clients'), 'sudo modprobe rbd').communicate()
#        for i in xrange(self.concurrent_procs):
        names = ""
        for i in xrange(self.concurrent_procs):
            letter = string.ascii_lowercase[i+1]
#            common.pdsh(settings.cluster.get('clients'), 'sudo rbd create rbdfio/rbdfio-`hostname -s`-%d --size %d' % (i, self.vol_size)).communicate()
#            common.pdsh(settings.cluster.get('clients'), 'sudo rbd map rbdfio-`hostname -s`-%d  --pool rbdfio --id admin' % i).communicate()
#            common.pdsh(settings.cluster.get('clients'), 'sudo echo "%s %s rbdfio rbdfio-`hostname -s`-%d" | sudo tee /sys/bus/rbd/add && sudo /sbin/udevadm settle' % (self.rbdadd_mons, self.rbdadd_options, i)).communicate()
            common.pdsh(settings.getnodes('clients'), 'sudo mkfs.xfs /dev/vd%s' % letter).communicate()
            common.pdsh(settings.getnodes('clients'), 'sudo mkdir /srv/rbdfio-`hostname -s`-%d' % i).communicate()
            common.pdsh(settings.getnodes('clients'), 'sudo mount -t xfs -o noatime,inode64 /dev/vd%s /srv/rbdfio-`hostname -s`-%d' %(letter, i)).communicate()

        # Create the run directory
        common.make_remote_dir(self.run_dir)
Example #3
    def run(self):
        super(KvmRbdFio, self).run()
        # We'll always drop caches before the test
        self.dropcaches()

        monitoring.start(self.run_dir)

        time.sleep(5)
        names = ""
        for i in xrange(self.concurrent_procs):
            names += "--name=/srv/rbdfio-`hostname -s`-%d/cbt-kvmrbdfio " % i
        out_file = '%s/output' % self.run_dir
        pre_cmd = 'sudo fio --rw=read -ioengine=sync --numjobs=1 --bs=4M --runtime=1 --size %dM %s > /dev/null' % (self.vol_size * 9/10, names)
        fio_cmd = 'sudo fio --rw=%s -ioengine=%s --runtime=%s --numjobs=1 --direct=1 --bs=%dB --iodepth=%d --size %dM %s > %s' %  (self.mode, self.ioengine, self.time, self.op_size, self.iodepth, self.vol_size * 9/10, names, out_file)
        print 'Attempting to populate fio files...'
        common.pdsh(settings.getnodes('clients'), pre_cmd).communicate()
        print 'Running rbd fio %s test.' % self.mode
        common.pdsh(settings.getnodes('clients'), fio_cmd).communicate()
#        ps = []
#        for i in xrange(self.concurrent_procs):
#            out_file = '%s/output.%s' % (self.run_dir, i)
#            p = common.pdsh(settings.cluster.get('clients'), 'sudo fio --rw=%s -ioengine=%s --runtime=%s --name=/srv/rbdfio-`hostname -s`-%d/cbt-rbdfio --numjobs=1 --direct=1 --bs=%dB --iodepth=%d --size %dM > %s' % (self.mode, self.ioengine, self.time, i, self.op_size, self.iodepth, self.vol_size * 9/10, out_file))
#            ps.append(p)
#        for p in ps:
#            p.wait()
        monitoring.stop(self.run_dir)
        common.sync_files('%s/*' % self.run_dir, self.out_dir)
Example #4
    def initialize(self): 
        common.cleanup_tests()
        if not self.use_existing:
            common.setup_cluster()
            common.setup_ceph()

            # Create the run directory
            common.make_remote_dir(self.run_dir)

            # Setup the pools

            monitoring.start("%s/pool_monitoring" % self.run_dir)
            for i in xrange(self.concurrent_procs):
                for node in settings.getnodes('clients').split(','):
                    node = node.rpartition("@")[2]
                    common.pdsh(settings.getnodes('head'), 'sudo ceph osd pool create rados-bench-%s-%s %d %d' % (node, i, self.pgs_per_pool, self.pgs_per_pool)).communicate()
                    common.pdsh(settings.getnodes('head'), 'sudo ceph osd pool set rados-bench-%s-%s size 1' % (node, i)).communicate()
                    # check the health for each pool.
                    print 'Checking health after pool creation.'
                    common.check_health()
            monitoring.stop()

        print 'Running scrub monitoring.'
        monitoring.start("%s/scrub_monitoring" % self.run_dir)
        common.check_scrub()
        monitoring.stop()

        print 'Pausing for 60s for idle monitoring.'
        monitoring.start("%s/idle_monitoring" % self.run_dir)
        time.sleep(60)
        monitoring.stop()

        common.sync_files('%s/*' % self.run_dir, self.out_dir)

        return True
Example #5
    def rmpool(self, name, profile_name):
        pool_profiles = self.config.get("pool_profiles", {"default": {}})
        profile = pool_profiles.get(profile_name, {})
        cache_profile = profile.get("cache_profile", None)
        if cache_profile:
            cache_name = "%s-cache" % name

            # flush and remove the overlay and such
            common.pdsh(
                settings.getnodes("head"),
                "sudo ceph -c %s osd tier cache-mode %s forward" % (self.tmp_conf, cache_name),
            ).communicate()
            common.pdsh(
                settings.getnodes("head"), "sudo rados -c %s -p %s cache-flush-evict-all" % (self.tmp_conf, cache_name)
            ).communicate()
            common.pdsh(
                settings.getnodes("head"), "sudo ceph -c %s osd tier remove-overlay %s" % (self.tmp_conf, name)
            ).communicate()
            common.pdsh(
                settings.getnodes("head"), "sudo ceph -c %s osd tier remove %s %s" % (self.tmp_conf, name, cache_name)
            ).communicate()

            # delete the cache pool
            self.rmpool(cache_name, cache_profile)
        common.pdsh(
            settings.getnodes("head"),
            "sudo ceph -c %s osd pool delete %s %s --yes-i-really-really-mean-it" % (self.tmp_conf, name, name),
        ).communicate()
Example #6
 def pre(self):
     pre_time = self.config.get("pre_time", 60)
     common.pdsh(settings.getnodes('head'), self.logcmd('Starting Recovery Test Thread, waiting %s seconds.' % pre_time)).communicate()
     time.sleep(pre_time)
     lcmd = self.logcmd("Setting the ceph osd noup flag")
     common.pdsh(settings.getnodes('head'), '%s -c %s osd set noup;%s' % (self.ceph_cmd, self.cluster.tmp_conf, lcmd)).communicate()
     self.state = 'markdown'
Example #7
 def mkimages(self):
     monitoring.start("%s/pool_monitoring" % self.run_dir)
     self.cluster.rmpool(self.poolname, self.pool_profile)
     self.cluster.mkpool(self.poolname, self.pool_profile)
     for node in settings.getnodes('clients').split(','):
         node = node.rpartition("@")[2]
         common.pdsh(settings.getnodes('head'), '/usr/bin/rbd create cbt-librbdfio-%s --size %s --pool %s --order %s' % (node, self.vol_size, self.poolname, self.vol_order)).communicate()
     monitoring.stop()
Example #8
    def markdown(self):
        for osdnum in self.config.get('osds'):
            lcmd = self.logcmd("Marking OSD %s down." % osdnum)
            common.pdsh(settings.getnodes('head'), '%s -c %s osd down %s;%s' % (self.ceph_cmd, self.cluster.tmp_conf, osdnum, lcmd)).communicate()
            lcmd = self.logcmd("Marking OSD %s out." % osdnum)
            common.pdsh(settings.getnodes('head'), '%s -c %s osd out %s;%s' % (self.ceph_cmd, self.cluster.tmp_conf, osdnum, lcmd)).communicate()
        common.pdsh(settings.getnodes('head'), self.logcmd('Waiting for the cluster to break and heal')).communicate()

        self.state = 'osdout'
Example #9
def stop(directory=None):
    nodes = settings.getnodes('clients', 'osds', 'mons', 'rgws')

    common.pdsh(nodes, 'pkill -SIGINT -f collectl').communicate()
    common.pdsh(nodes, 'sudo pkill -SIGINT -f perf_3.6').communicate()
    common.pdsh(settings.getnodes('osds'), 'sudo pkill -SIGINT -f blktrace').communicate()
    if directory:
        sc = settings.cluster
        common.pdsh(nodes, 'cd %s/perf;sudo chown %s.%s perf.data' % (directory, sc.get('user'), sc.get('user')))
        make_movies(directory)
Example #10
 def mkimages(self):
     monitoring.start("%s/pool_monitoring" % self.run_dir)
     self.cluster.rmpool(self.poolname, self.pool_profile)
     self.cluster.mkpool(self.poolname, self.pool_profile)
     common.pdsh(settings.getnodes('clients'), '/usr/bin/rbd create cbt-kernelrbdfio-`hostname -s` --size %s --pool %s' % (self.vol_size, self.poolname)).communicate()
     common.pdsh(settings.getnodes('clients'), 'sudo rbd map cbt-kernelrbdfio-`hostname -s` --pool %s --id admin' % self.poolname).communicate()
     common.pdsh(settings.getnodes('clients'), 'sudo mkfs.xfs /dev/rbd/cbt-kernelrbdfio/cbt-kernelrbdfio-`hostname -s`').communicate()
     common.pdsh(settings.getnodes('clients'), 'sudo mkdir -p -m0755 -- %s/cbt-kernelrbdfio-`hostname -s`' % self.cluster.mnt_dir).communicate()
     common.pdsh(settings.getnodes('clients'), 'sudo mount -t xfs -o noatime,inode64 /dev/rbd/cbt-kernelrbdfio/cbt-kernelrbdfio-`hostname -s` %s/cbt-kernelrbdfio-`hostname -s`' % self.cluster.mnt_dir).communicate()
     monitoring.stop()
Example #11
    def _run(self, mode, run_dir, out_dir):
        # We'll always drop caches for rados bench
        self.dropcaches()

        concurrent_ops_str = ''
        if self.concurrent_ops:
            concurrent_ops_str = '--concurrent-ios %s' % self.concurrent_ops
        # determine rados version
        rados_version_str, err = common.pdsh(settings.getnodes('head'), '/usr/bin/rados -v').communicate()
        m = re.findall("version (\d+)", rados_version_str)
        rados_version = int(m[0])
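        # Pre-9.x rados expects an explicit block size for every mode; newer
        # releases take -b only for write tests.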

        if mode in ['write'] or rados_version < 9:
            op_size_str = '-b %s' % self.op_size
        else:
            op_size_str = ''


        common.make_remote_dir(run_dir)

        # dump the cluster config
        self.cluster.dump_config(run_dir)

        # Run the backfill testing thread if requested
        if 'recovery_test' in self.cluster.config:
            recovery_callback = self.recovery_callback
            self.cluster.create_recovery_test(run_dir, recovery_callback)

        # Run rados bench
        monitoring.start(run_dir)
        logger.info('Running radosbench %s test.' % mode)
        ps = []
        for i in xrange(self.concurrent_procs):
            out_file = '%s/output.%s' % (run_dir, i)
            objecter_log = '%s/objecter.%s.log' % (run_dir, i)
            # default behavior is to use a single storage pool 
            pool_name = self.pool
            run_name = '--run-name %s`hostname -s`-%s'%(self.object_set_id, i)
            if self.pool_per_proc: # support previous behavior of 1 storage pool per rados process
                pool_name = 'rados-bench-`hostname -s`-%s'%i
                run_name = ''
            rados_bench_cmd = '%s -c %s -p %s bench %s %s %s %s %s --no-cleanup 2> %s > %s' % \
                 (self.cmd_path_full, self.tmp_conf, pool_name, op_size_str, self.time, mode, concurrent_ops_str, run_name, objecter_log, out_file)
            p = common.pdsh(settings.getnodes('clients'), rados_bench_cmd)
            ps.append(p)
        for p in ps:
            p.wait()
        monitoring.stop(run_dir)

        # If we were doing recovery, wait until it's done.
        if 'recovery_test' in self.cluster.config:
            self.cluster.wait_recovery_done()

        # Finally, get the historic ops
        self.cluster.dump_historic_ops(run_dir)
        common.sync_files('%s/*' % run_dir, out_dir)
Example #12
    def osdin(self):
        # Wait until the cluster is healthy.
        ret = self.cluster.check_health(self.health_checklist, "%s/recovery.log" % self.config.get('run_dir'))
        if self.inhealthtries < self.maxhealthtries and ret == 0:
            self.inhealthtries = self.inhealthtries + 1
            return # Cluster hasn't become unhealthy yet.

        if ret == 0:
            common.pdsh(settings.getnodes('head'), self.logcmd('Cluster never went unhealthy.')).communicate()
        else:
            common.pdsh(settings.getnodes('head'), self.logcmd('Cluster appears to have healed.')).communicate()
        self.state = "post"
Example #13
 def mkimages(self):
     monitoring.start("%s/pool_monitoring" % self.run_dir)
     self.cluster.rmpool(self.poolname, self.pool_profile)
     self.cluster.mkpool(self.poolname, self.pool_profile)
     for node in settings.getnodes("clients").split(","):
         for volnum in xrange(0, self.volumes_per_client):
             node = node.rpartition("@")[2]
             common.pdsh(
                 settings.getnodes("head"),
                 "/usr/bin/rbd create cbt-librbdfio-%s-%d --size %s --pool %s --order %s"
                 % (node, volnum, self.vol_size, self.poolname, self.vol_order),
             ).communicate()
     monitoring.stop()
Example #14
    def pre(self):
        pre_time = self.config.get("pre_time", 60)
        common.pdsh(settings.getnodes('head'), self.logcmd('Starting Recovery Test Thread, waiting %s seconds.' % pre_time)).communicate()
        time.sleep(pre_time)
        lcmd = self.logcmd("Setting the ceph osd noup flag")
        common.pdsh(settings.getnodes('head'), 'ceph -c %s osd set noup;%s' % (self.cluster.tmp_conf, lcmd)).communicate()
        for osdnum in self.config.get('osds'):
            lcmd = self.logcmd("Marking OSD %s down." % osdnum)
            common.pdsh(settings.getnodes('head'), 'ceph -c %s osd down %s;%s' % (self.cluster.tmp_conf, osdnum, lcmd)).communicate()
            lcmd = self.logcmd("Marking OSD %s out." % osdnum)
            common.pdsh(settings.getnodes('head'), 'ceph -c %s osd out %s;%s' % (self.cluster.tmp_conf, osdnum, lcmd)).communicate()
        common.pdsh(settings.getnodes('head'), self.logcmd('Waiting for the cluster to break and heal')).communicate()

        self.state = 'osdout'
Example #15
def cleanup_tests():
    clients = settings.getnodes('clients')
    rgws = settings.getnodes('rgws')
    nodes = settings.getnodes('clients', 'servers', 'mons', 'rgws')

    pdsh(clients, 'sudo killall -9 rados;sudo killall -9 rest-bench').communicate()
    if rgws:
        pdsh(rgws, 'sudo killall -9 radosgw-admin').communicate()
    pdsh(nodes, 'sudo killall -9 pdcp').communicate()

    # cleanup the tmp_dir
    tmp_dir = settings.cluster.get("tmp_dir")
    print 'Deleting %s' % tmp_dir
    pdsh(nodes, 'rm -rf %s' % tmp_dir).communicate()
Example #16
    def initialize(self): 
        super(RbdFio, self).initialize()
        self.cleanup()

        if not self.use_existing:
            self.cluster.initialize()
            self.cluster.dump_config(self.run_dir)

            # Setup the pools
            monitoring.start("%s/pool_monitoring" % self.run_dir)
            common.pdsh(settings.getnodes('head'), 'sudo ceph -c %s osd pool create rbdfio %d %d' % (self.tmp_conf, self.pgs, self.pgs)).communicate()
            common.pdsh(settings.getnodes('head'), 'sudo ceph -c %s osd pool set rbdfio size 1' % self.tmp_conf).communicate()
            print 'Checking health after pool creation.'
            self.cluster.check_health()
            monitoring.stop()

            # Mount the filesystem
            common.pdsh(settings.getnodes('clients'), 'sudo modprobe rbd').communicate()
            for i in xrange(self.concurrent_procs):
                common.pdsh(settings.getnodes('clients'), 'sudo rbd -c %s create rbdfio/rbdfio-`hostname -s`-%d --size %d' % (self.tmp_conf, i, self.vol_size)).communicate()
                common.pdsh(settings.getnodes('clients'), 'sudo echo "%s %s rbdfio rbdfio-`hostname -s`-%d" | sudo tee /sys/bus/rbd/add && sudo /sbin/udevadm settle' % (self.rbdadd_mons, self.rbdadd_options, i)).communicate()
                common.pdsh(settings.getnodes('clients'), 'sudo mkfs.xfs /dev/rbd/rbdfio/rbdfio-`hostname -s`-%d' % i).communicate()
                common.pdsh(settings.getnodes('clients'), 'sudo mkdir -p -m0755 -- %s/mnt/rbdfio-`hostname -s`-%d' % (self.tmp_dir, i)).communicate()
                common.pdsh(settings.getnodes('clients'), 'sudo mount -t xfs -o noatime,inode64 /dev/rbd/rbdfio/rbdfio-`hostname -s`-%d %s/mnt/rbdfio-`hostname -s`-%d' % (i, self.tmp_dir, i)).communicate()

        print 'Running scrub monitoring'
        monitoring.start("%s/scrub_monitoring" % self.run_dir)
        self.cluster.check_scrub()
        monitoring.stop()

        # Create the run directory
        common.make_remote_dir(self.run_dir)
Example #17
    def initialize(self): 
        self.cleanup()
        super(RbdFio, self).initialize()
        common.setup_cluster()
        common.setup_ceph()
        common.dump_config(self.run_dir)
        # Setup the pools
        common.pdsh(settings.getnodes('head'), 'sudo ceph osd pool create rbdfio %d %d' % (self.pgs, self.pgs)).communicate()
        common.pdsh(settings.getnodes('head'), 'sudo ceph osd pool set rbdfio size 1').communicate()
        print 'Checking health after pool creation.'
        common.check_health()

        common.pdsh(settings.getnodes('clients'), 'sudo modprobe rbd').communicate()
        for i in xrange(self.concurrent_procs):
            common.pdsh(settings.getnodes('clients'), 'sudo rbd create rbdfio/rbdfio-`hostname -s`-%d --size %d' % (i, self.vol_size)).communicate()
#            common.pdsh(settings.cluster.get('clients'), 'sudo rbd map rbdfio-`hostname -s`-%d  --pool rbdfio --id admin' % i).communicate()
            common.pdsh(settings.getnodes('clients'), 'sudo echo "%s %s rbdfio rbdfio-`hostname -s`-%d" | sudo tee /sys/bus/rbd/add && sudo /sbin/udevadm settle' % (self.rbdadd_mons, self.rbdadd_options, i)).communicate()
            common.pdsh(settings.getnodes('clients'), 'sudo mkfs.xfs /dev/rbd/rbdfio/rbdfio-`hostname -s`-%d' % i).communicate()
            common.pdsh(settings.getnodes('clients'), 'sudo mkdir /srv/rbdfio-`hostname -s`-%d' % i).communicate()
            common.pdsh(settings.getnodes('clients'), 'sudo mount -t xfs -o noatime,inode64 /dev/rbd/rbdfio/rbdfio-`hostname -s`-%d /srv/rbdfio-`hostname -s`-%d' %(i, i)).communicate()

        common.check_scrub()

        # Create the run directory
        common.make_remote_dir(self.run_dir)
Example #18
    def initialize(self): 
        super(KvmRbdFio, self).initialize()
        for i in xrange(1):
            letter = string.ascii_lowercase[i+1]
            common.pdsh(settings.getnodes('clients'), 'sudo mkfs.ext4 /dev/vd%s' % letter).communicate()
            common.pdsh(settings.getnodes('clients'), 'sudo mkdir /srv/rbdfio-`hostname -s`-%d' % i).communicate()
            common.pdsh(settings.getnodes('clients'), 'sudo mount -t ext4 -o noatime /dev/vd%s /srv/rbdfio-`hostname -s`-%d' % (letter, i)).communicate()

        # Create the run directory
        common.make_remote_dir(self.run_dir)

        # populate the fio files
        logger.info('Attempting to populate fio files...')
        pre_cmd = 'sudo fio --rw=write -ioengine=sync --numjobs=%s --bs=4M --size %dM %s > /dev/null' % (self.numjobs, self.vol_size, self.names)
        common.pdsh(settings.getnodes('clients'), pre_cmd).communicate()
Example #19
    def setup_fs(self):
        sc = settings.cluster
        fs = sc.get('fs')
        mkfs_opts = sc.get('mkfs_opts', '')
        mount_opts = sc.get('mount_opts', '')

        if fs == '':
             shutdown("No OSD filesystem specified.  Exiting.")

        for device in xrange(0, sc.get('osds_per_node')):
            osds = settings.getnodes('osds')
            common.pdsh(osds, 'sudo umount /dev/disk/by-partlabel/osd-device-%s-data' % device).communicate()
            common.pdsh(osds, 'sudo rm -rf %s/osd-device-%s-data' % (self.mnt_dir, device)).communicate()
            common.pdsh(osds, 'sudo mkdir -p -m0755 -- %s/osd-device-%s-data' % (self.mnt_dir, device)).communicate()

            if fs == 'tmpfs':
                print 'using tmpfs osds, not creating a file system.'
            elif fs == 'zfs':
                print 'ruhoh, zfs detected.  No mkfs for you!'
                common.pdsh(osds, 'sudo zpool destroy osd-device-%s-data' % device).communicate()
                common.pdsh(osds, 'sudo zpool create -f -O xattr=sa -m legacy osd-device-%s-data /dev/disk/by-partlabel/osd-device-%s-data' % (device, device)).communicate()
                common.pdsh(osds, 'sudo zpool add osd-device-%s-data log /dev/disk/by-partlabel/osd-device-%s-zil' % (device, device)).communicate()
                common.pdsh(osds, 'sudo mount %s -t zfs osd-device-%s-data %s/osd-device-%s-data' % (mount_opts, device, self.mnt_dir, device)).communicate()
            else: 
                common.pdsh(osds, 'sudo mkfs.%s %s /dev/disk/by-partlabel/osd-device-%s-data' % (fs, mkfs_opts, device)).communicate()
                common.pdsh(osds, 'sudo mount %s -t %s /dev/disk/by-partlabel/osd-device-%s-data %s/osd-device-%s-data' % (mount_opts, fs, device, self.mnt_dir, device)).communicate()
Example #20
    def osdin(self):
        # Wait until the cluster is healthy.
        ret = self.cluster.check_health("%s/recovery.log" % self.config.get('run_dir'))
        if self.inhealthtries < self.maxhealthtries and ret == 0:
            self.inhealthtries = self.inhealthtries + 1
            return # Cluster hasn't become unhealthy yet.

        if ret == 0:
            common.pdsh(settings.getnodes('head'), self.logcmd('Cluster never went unhealthy.')).communicate()
        else:
            common.pdsh(settings.getnodes('head'), self.logcmd('Cluster appears to have healed.')).communicate()

        post_time = self.config.get("post_time", 60)
        common.pdsh(settings.getnodes('head'), self.logcmd('Cluster is healthy, completion in %s seconds.' % post_time)).communicate()
        time.sleep(post_time)
        self.state = "done"
Example #21
    def initialize(self): 
        super(LibrbdFio, self).initialize()

        print 'Running scrub monitoring.'
        monitoring.start("%s/scrub_monitoring" % self.run_dir)
        self.cluster.check_scrub()
        monitoring.stop()

        print 'Pausing for 60s for idle monitoring.'
        monitoring.start("%s/idle_monitoring" % self.run_dir)
        time.sleep(60)
        monitoring.stop()

        common.sync_files('%s/*' % self.run_dir, self.out_dir)

        self.mkimages()

        # Create the run directory
        common.make_remote_dir(self.run_dir)

        # populate the fio files
        print 'Attempting to populate fio files...'
        pre_cmd = 'sudo %s --ioengine=rbd --clientname=admin --pool=%s --rbdname=cbt-librbdfio-`hostname -s` --invalidate=0  --rw=write --numjobs=%s --bs=4M --size %dM %s > /dev/null' % (self.cmd_path, self.poolname, self.numjobs, self.vol_size, self.names)
        common.pdsh(settings.getnodes('clients'), pre_cmd).communicate()

        return True
Example #22
    def _run(self, mode, run_dir, out_dir):
        # We'll always drop caches for rados bench
        self.dropcaches()

        concurrent_ops_str = ''
        if self.concurrent_ops:
            concurrent_ops_str = '--concurrent-ios %s' % self.concurrent_ops
        op_size_str = '-b %s' % self.op_size

        common.make_remote_dir(run_dir)

        # dump the cluster config
        common.dump_config(run_dir)

        monitoring.start(run_dir)
        # Run rados bench
        print 'Running radosbench %s test.' % mode
        ps = []
        for i in xrange(self.concurrent_procs):
            out_file = '%s/output.%s' % (run_dir, i)
            objecter_log = '%s/objecter.%s.log' % (run_dir, i)
            p = common.pdsh(settings.getnodes('clients'), '/usr/bin/rados -p rados-bench-`hostname -s`-%s %s bench %s %s %s --no-cleanup 2> %s > %s' % (i, op_size_str, self.time, mode, concurrent_ops_str, objecter_log, out_file))
            ps.append(p)
        for p in ps:
            p.wait()
        monitoring.stop(run_dir)

        # Get the historic ops
        common.dump_historic_ops(run_dir)
        common.sync_files('%s/*' % run_dir, out_dir)
Example #23
def make_movies(directory):
    sc = settings.cluster
    seekwatcher = '/home/%s/bin/seekwatcher' % sc.get('user')
    blktrace_dir = '%s/blktrace' % directory

    for device in xrange(0, sc.get('osds_per_node')):
        common.pdsh(settings.getnodes('osds'), 'cd %s;%s -t device%s -o device%s.mpg --movie' % (blktrace_dir,seekwatcher,device,device))
Example #24
    def initialize(self): 
        super(RbdFio, self).initialize()

        logger.info('Running scrub monitoring.')
        monitoring.start("%s/scrub_monitoring" % self.run_dir)
        self.cluster.check_scrub()
        monitoring.stop()

        logger.info('Pausing for 60s for idle monitoring.')
        monitoring.start("%s/idle_monitoring" % self.run_dir)
        time.sleep(60)
        monitoring.stop()

        common.sync_files('%s/*' % self.run_dir, self.out_dir)

        self.mkimages()
 
        # Create the run directory
        common.make_remote_dir(self.run_dir)

        # populate the fio files
        logger.info('Attempting to populate fio files...')
        pre_cmd = 'sudo %s --ioengine=%s --rw=write --numjobs=%s --bs=4M --size %dM %s > /dev/null' % (self.cmd_path, self.ioengine, self.numjobs, self.vol_size*0.9, self.names)
        common.pdsh(settings.getnodes('clients'), pre_cmd).communicate()

        return True
Example #25
    def rmpool(self, name, profile_name):
        pool_profiles = self.config.get('pool_profiles', {'default': {}})
        profile = pool_profiles.get(profile_name, {})
        cache_profile = profile.get('cache_profile', None)
        if cache_profile:
            cache_name = '%s-cache' % name

            # flush and remove the overlay and such
            common.pdsh(settings.getnodes('head'), 'sudo %s -c %s osd tier cache-mode %s forward' % (self.ceph_cmd, self.tmp_conf, cache_name)).communicate()
            common.pdsh(settings.getnodes('head'), 'sudo rados -c %s -p %s cache-flush-evict-all' % (self.tmp_conf, cache_name)).communicate()
            common.pdsh(settings.getnodes('head'), 'sudo %s -c %s osd tier remove-overlay %s' % (self.ceph_cmd, self.tmp_conf, name)).communicate()
            common.pdsh(settings.getnodes('head'), 'sudo %s -c %s osd tier remove %s %s' % (self.ceph_cmd, self.tmp_conf, name, cache_name)).communicate()

            # delete the cache pool
            self.rmpool(cache_name, cache_profile)
        common.pdsh(settings.getnodes('head'), 'sudo %s -c %s osd pool delete %s %s --yes-i-really-really-mean-it' % (self.ceph_cmd, self.tmp_conf, name, name)).communicate()
Example #26
 def distribute_conf(self):
     nodes = settings.getnodes('head', 'clients', 'osds', 'mons', 'rgws')
     conf_file = self.config.get("conf_file")
     logger.info("Distributing %s.", conf_file)
     common.pdcp(nodes, '', conf_file, self.tmp_conf).communicate()
     common.pdsh(nodes, 'sudo mv /etc/ceph/ceph.conf /etc/ceph/ceph.conf.cbt.bak').communicate()
     common.pdsh(nodes, 'sudo ln -s %s /etc/ceph/ceph.conf' % self.tmp_conf).communicate()
Example #27
    def __init__(self, cluster, config):
        super(KvmRbdFio, self).__init__(cluster, config)
        self.concurrent_procs = config.get('concurrent_procs', 1)
        self.total_procs = self.concurrent_procs * len(settings.getnodes('clients').split(','))

        self.time =  str(config.get('time', '300'))
        self.ramp = str(config.get('ramp', '0'))
        self.iodepth = config.get('iodepth', 16)
        self.numjobs = config.get('numjobs', 1)
        self.mode = config.get('mode', 'write')
        self.rwmixread = config.get('rwmixread', 50)
        self.rwmixwrite = 100 - self.rwmixread
        self.ioengine = config.get('ioengine', 'libaio')
        self.op_size = config.get('op_size', 4194304)
        self.pgs = config.get('pgs', 2048)
        self.vol_size = config.get('vol_size', 65536) * 0.9
        self.rep_size = config.get('rep_size', 1)
        self.rbdadd_mons = config.get('rbdadd_mons')
        self.rbdadd_options = config.get('rbdadd_options')
        self.client_ra = config.get('client_ra', '128')
        self.fio_cmd = config.get('fio_cmd', '/usr/bin/fio')

        # FIXME there are too many permutations, need to put results in SQLITE3 
        self.run_dir = '%s/osd_ra-%08d/client_ra-%08d/op_size-%08d/concurrent_procs-%03d/iodepth-%03d/%s' % (self.run_dir, int(self.osd_ra), int(self.client_ra), int(self.op_size), int(self.total_procs), int(self.iodepth), self.mode)
        self.out_dir = '%s/osd_ra-%08d/client_ra-%08d/op_size-%08d/concurrent_procs-%03d/iodepth-%03d/%s' % (self.archive_dir, int(self.osd_ra), int(self.client_ra), int(self.op_size), int(self.total_procs), int(self.iodepth), self.mode)

        # Make the file names string
        self.names = ''
        for i in xrange(self.concurrent_procs):
            self.names += '--name=/srv/rbdfio-`hostname -s`-0/cbt-kvmrbdfio-%d ' % i
Example #28
 def __init__(self, cluster, config):
     super(RawFio, self).__init__(cluster, config)
     # comma-separated list of block devices to use inside the client host/VM/container
     self.block_device_list = config.get('block_devices', '/dev/vdb' )
     self.block_devices = [ d.strip() for d in self.block_device_list.split(',') ]
     self.concurrent_procs = config.get('concurrent_procs', len(self.block_devices))
     self.total_procs = self.concurrent_procs * len(settings.getnodes('clients').split(','))
     self.fio_out_format = "json"
     self.time =  str(config.get('time', '300'))
     self.ramp = str(config.get('ramp', '0'))
     self.startdelay = config.get('startdelay', None)
     self.rate_iops = config.get('rate_iops', None)
     self.iodepth = config.get('iodepth', 16)
     self.direct = config.get('direct', 1)
     self.numjobs = config.get('numjobs', 1)
     self.mode = config.get('mode', 'write')
     self.rwmixread = config.get('rwmixread', 50)
     self.rwmixwrite = 100 - self.rwmixread
     self.ioengine = config.get('ioengine', 'libaio')
     self.op_size = config.get('op_size', 4194304)
     self.vol_size = config.get('vol_size', 65536) * 0.9
     self.fio_cmd = config.get('fio_cmd', 'sudo /usr/bin/fio')
     # FIXME there are too many permutations, need to put results in SQLITE3 
     self.run_dir = '%s/raw_ra-%08d/op_size-%08d/concurrent_procs-%03d/iodepth-%03d/%s' % (self.run_dir, int(self.osd_ra), int(self.op_size), int(self.total_procs), int(self.iodepth), self.mode)
     self.out_dir = '%s/raw_ra-%08d/op_size-%08d/concurrent_procs-%03d/iodepth-%03d/%s' % (self.archive_dir, int(self.osd_ra), int(self.op_size), int(self.total_procs), int(self.iodepth), self.mode)
Example #29
    def __init__(self, cluster, config):
        super(RbdFio, self).__init__(cluster, config)

        # FIXME there are too many permutations, need to put results in SQLITE3
        self.cmd_path = config.get('cmd_path', '/usr/bin/fio')
        self.pool_profile = config.get('pool_profile', 'default')

        self.concurrent_procs = config.get('concurrent_procs', 1)
        self.total_procs = self.concurrent_procs * len(settings.getnodes('clients').split(','))
        self.time =  str(config.get('time', None))
        self.ramp = str(config.get('ramp', None))
        self.iodepth = config.get('iodepth', 16)
        self.numjobs = config.get('numjobs', 1)
        self.end_fsync = str(config.get('end_fsync', 0))
        self.mode = config.get('mode', 'write')
        self.rwmixread = config.get('rwmixread', 50)
        self.rwmixwrite = 100 - self.rwmixread
        self.log_avg_msec = config.get('log_avg_msec', None)
        self.ioengine = config.get('ioengine', 'libaio')
        self.op_size = config.get('op_size', 4194304)
        self.vol_size = config.get('vol_size', 65536)
        self.vol_order = config.get('vol_order', 22)
        self.random_distribution = config.get('random_distribution', None)
        self.rbdadd_mons = config.get('rbdadd_mons')
        self.rbdadd_options = config.get('rbdadd_options', 'share')
        self.client_ra = config.get('client_ra', 128)
        self.poolname = "cbt-kernelrbdfio"

        self.run_dir = '%s/rbdfio/osd_ra-%08d/client_ra-%08d/op_size-%08d/concurrent_procs-%03d/iodepth-%03d/%s' % (self.run_dir, int(self.osd_ra), int(self.client_ra), int(self.op_size), int(self.concurrent_procs), int(self.iodepth), self.mode)
        self.out_dir = '%s/rbdfio/osd_ra-%08d/client_ra-%08d/op_size-%08d/concurrent_procs-%03d/iodepth-%03d/%s' % (self.archive_dir, int(self.osd_ra), int(self.client_ra), int(self.op_size), int(self.concurrent_procs), int(self.iodepth), self.mode)

        # Make the file names string
        self.names = ''
        for i in xrange(self.concurrent_procs):
            self.names += '--name=%s/cbt-kernelrbdfio-`hostname -s`/cbt-kernelrbdfio-%d ' % (self.cluster.mnt_dir, i)
Example #30
    def run(self):
        # First create a credential file for each gateway
        self.mkcredfiles()

        # We'll always drop caches before the test
        self.dropcaches()
        
        # dump the cluster config
        self.cluster.dump_config(self.run_dir)

        # Run the backfill testing thread if requested
        if 'recovery_test' in self.cluster.config:
            recovery_callback = self.recovery_callback
            self.cluster.create_recovery_test(self.run_dir, recovery_callback)

        # Run getput 
        monitoring.start(self.run_dir)
        logger.info('Running getput %s test.' % self.test)

        ps = []
        for i in xrange(0, len(self.auth_urls)):
            cmd = self.mkgetputcmd("%s/gw%02d.cred" % (self.run_dir, i), i)
            p = common.pdsh(settings.getnodes('clients'), cmd)
            ps.append(p)
        for p in ps:
            p.wait()
        monitoring.stop(self.run_dir)

        # If we were doing recovery, wait until it's done.
        if 'recovery_test' in self.cluster.config:
            self.cluster.wait_recovery_done()

        # Finally, get the historic ops
        self.cluster.dump_historic_ops(self.run_dir)
        common.sync_files('%s/*' % self.run_dir, self.out_dir)
Example #31
    def markdown(self):
        for osdnum in self.config.get('osds'):
            lcmd = self.logcmd("Marking OSD %s down." % osdnum)
            common.pdsh(
                settings.getnodes('head'), '%s -c %s osd down %s;%s' %
                (self.ceph_cmd, self.cluster.tmp_conf, osdnum,
                 lcmd)).communicate()
            lcmd = self.logcmd("Marking OSD %s out." % osdnum)
            common.pdsh(
                settings.getnodes('head'), '%s -c %s osd out %s;%s' %
                (self.ceph_cmd, self.cluster.tmp_conf, osdnum,
                 lcmd)).communicate()
        common.pdsh(
            settings.getnodes('head'),
            self.logcmd(
                'Waiting for the cluster to break and heal')).communicate()

        self.state = 'osdout'
Example #32
    def osdin(self):
        # Wait until the cluster is healthy.
        ret = self.cluster.check_health(
            self.health_checklist,
            "%s/recovery.log" % self.config.get('run_dir'))
        if self.inhealthtries < self.maxhealthtries and ret == 0:
            self.inhealthtries = self.inhealthtries + 1
            return  # Cluster hasn't become unhealthy yet.

        if ret == 0:
            common.pdsh(
                settings.getnodes('head'),
                self.logcmd('Cluster never went unhealthy.')).communicate()
        else:
            common.pdsh(
                settings.getnodes('head'),
                self.logcmd('Cluster appears to have healed.')).communicate()
        self.state = "post"
Example #33
 def run(self):
     self.haltrequest.clear()
     self.stoprequest.clear()
     while not self.haltrequest.isSet():
         self.states[self.state]()
     common.pdsh(
         settings.getnodes('head'),
         self.logcmd('Exiting recovery test thread.  Last state was: %s' %
                     self.state)).communicate()
Example #34
 def check_scrub(self):
     logger.info('Waiting until Scrubbing completes...')
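     # Poll pg dump once per second; the pipeline counts rows whose 16th field
     # is "0.000000", and the loop exits once that count drops to 0.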
     while True:
         stdout, stderr = common.pdsh(settings.getnodes('head'), '%s -c %s pg dump | cut -f 16 | grep "0.000000" | wc -l' % (self.ceph_cmd, self.tmp_conf)).communicate()
         if " 0\n" in stdout:
             break
         else:
             logger.info(stdout)
         time.sleep(1)
Example #35
def clean_remote_dir(remote_dir):
    print("cleaning remote dir %s" % remote_dir)
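    # Refuse to touch the filesystem root or any relative path.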
    if remote_dir == "/" or not os.path.isabs(remote_dir):
        raise SystemExit("Cleaning the remote dir doesn't seem safe, bailing.")

    nodes = settings.getnodes('clients', 'osds', 'mons', 'rgws', 'mds')
    pdsh(nodes,
         'if [ -d "%s" ]; then rm -rf %s; fi' % (remote_dir, remote_dir),
         continue_if_error=False).communicate()
Example #36
 def mkimage(self, name, size, pool, data_pool, order):
     dp_option = ''
     if data_pool:
         dp_option = "--data-pool %s" % data_pool
     common.pdsh(
         settings.getnodes('head'),
         '%s -c %s create %s --size %s --pool %s %s --order %s' %
         (self.rbd_cmd, self.tmp_conf, name, size, pool, dp_option,
          order)).communicate()
Example #37
 def check_scrub(self):
     print 'Waiting until Scrubbing completes...'
     while True:
         stdout, stderr = common.pdsh(settings.getnodes('head'), 'ceph -c %s pg dump | cut -f 16 | grep "0.000000" | wc -l' % self.tmp_conf).communicate()
         if " 0\n" in stdout:
             break
         else:
             print stdout
         time.sleep(1)
Example #38
    def run(self):
        super(KvmRbdFio, self).run()
        # Set client readahead
        self.set_client_param('read_ahead_kb', self.client_ra)
        clnts = settings.getnodes('clients')

        # We'll always drop caches before the test
        self.dropcaches()

        monitoring.start(self.run_dir)

        time.sleep(5)
        # Run the backfill testing thread if requested
        if 'recovery_test' in self.cluster.config:
            recovery_callback = self.recovery_callback
            self.cluster.create_recovery_test(self.run_dir, recovery_callback)

        logger.info('Starting rbd fio %s test.', self.mode)

        fio_process_list = []
        for i in range(self.concurrent_procs):
            b = self.block_devices[i % len(self.block_devices)]
            bnm = os.path.basename(b)
            mtpt = '/srv/rbdfio-`hostname -s`-%s' % bnm
            fiopath = os.path.join(mtpt, 'fio%d.img' % i)
            out_file = '%s/output.%d' % (self.run_dir, i)
            fio_cmd = 'sudo %s' % self.fio_cmd
            fio_cmd += ' --rw=%s' % self.mode
            if (self.mode == 'readwrite' or self.mode == 'randrw'):
                fio_cmd += ' --rwmixread=%s --rwmixwrite=%s' % (
                    self.rwmixread, self.rwmixwrite)
            fio_cmd += ' --ioengine=%s' % self.ioengine
            fio_cmd += ' --runtime=%s' % self.time
            fio_cmd += ' --ramp_time=%s' % self.ramp
            if self.startdelay:
                fio_cmd += ' --startdelay=%s' % self.startdelay
            if self.rate_iops:
                fio_cmd += ' --rate_iops=%s' % self.rate_iops
            fio_cmd += ' --numjobs=%s' % self.numjobs
            fio_cmd += ' --direct=1'
            fio_cmd += ' --bs=%dB' % self.op_size
            fio_cmd += ' --iodepth=%d' % self.iodepth
            fio_cmd += ' --size=%dM' % self.vol_size
            fio_cmd += ' --write_iops_log=%s' % out_file
            fio_cmd += ' --write_bw_log=%s' % out_file
            fio_cmd += ' --write_lat_log=%s' % out_file
            if 'recovery_test' in self.cluster.config:
                fio_cmd += ' --time_based'
            fio_cmd += ' --name=%s > %s' % (fiopath, out_file)
            fio_process_list.append(
                common.pdsh(clnts, fio_cmd, continue_if_error=False))
        for p in fio_process_list:
            p.communicate()
        monitoring.stop(self.run_dir)
        logger.info('Finished rbd fio test')

        common.sync_files('%s/*' % self.run_dir, self.out_dir)
Example #39
 def prefill_data(self):
     # populate the fio files
     ps = []
     logger.info('Attempting to populate fio files...')
     for ep_num in range(self.endpoints_per_client):
         p = common.pdsh(settings.getnodes('clients'), self.prefill_command(ep_num))
         ps.append(p)
     for p in ps:
         p.wait()
Example #40
 def distribute_conf(self):
     nodes = settings.getnodes('head', 'clients', 'osds', 'mons', 'rgws')
     conf_file = self.config.get("conf_file")
     print "Distributing %s." % conf_file
     common.pdcp(nodes, '', conf_file, self.tmp_conf)
     common.pdsh(nodes,
                 'sudo mv /etc/ceph/ceph.conf /etc/ceph/ceph.conf.cbt.bak',
                 True)
     common.pdsh(nodes, 'sudo ln -s %s /etc/ceph/ceph.conf' % self.tmp_conf)
Example #41
def start(directory):
    nodes = settings.getnodes('clients', 'osds', 'mons', 'rgws')
    collectl_dir = '%s/collectl' % directory
    perf_dir = '%s/perf' % directory
    blktrace_dir = '%s/blktrace' % directory

    # collectl
    common.pdsh(nodes, 'mkdir -p -m0755 -- %s' % collectl_dir)
    common.pdsh(nodes, 'collectl -s+mYZ -D -i 1:10 -F0 -f %s' % collectl_dir)
Example #42
    def run(self):
        super(RbdFio, self).run()

        # Set client readahead
        self.set_client_param('read_ahead_kb', self.client_ra)

        # We'll always drop caches before the test
        self.dropcaches()

        monitoring.start(self.run_dir)

        # Run the backfill testing thread if requested
        if 'recovery_test' in self.cluster.config:
            recovery_callback = self.recovery_callback
            self.cluster.create_recovery_test(self.run_dir, recovery_callback)

        time.sleep(5)
        out_file = '%s/output' % self.run_dir
        fio_cmd = 'sudo %s' % (self.cmd_path_full)
        fio_cmd += ' --rw=%s' % self.mode
        if (self.mode == 'readwrite' or self.mode == 'randrw'):
            fio_cmd += ' --rwmixread=%s --rwmixwrite=%s' % (self.rwmixread,
                                                            self.rwmixwrite)
        fio_cmd += ' --ioengine=%s' % self.ioengine
        if self.time is not None:
            fio_cmd += ' --runtime=%s' % self.time
        if self.ramp is not None:
            fio_cmd += ' --ramp_time=%s' % self.ramp
        fio_cmd += ' --numjobs=%s' % self.numjobs
        fio_cmd += ' --direct=%s' % self.direct
        fio_cmd += ' --bs=%dB' % self.op_size
        fio_cmd += ' --iodepth=%d' % self.iodepth
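        # Size each fio job at 90% of the volume to leave some headroom.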
        if self.vol_size:
            fio_cmd += ' --size=%dM' % (int(self.vol_size) * 0.9)
        fio_cmd += ' --write_iops_log=%s' % out_file
        fio_cmd += ' --write_bw_log=%s' % out_file
        fio_cmd += ' --write_lat_log=%s' % out_file
        if 'recovery_test' in self.cluster.config:
            fio_cmd += ' --time_based'
        if self.random_distribution is not None:
            fio_cmd += ' --random_distribution=%s' % self.random_distribution
        if self.log_avg_msec is not None:
            fio_cmd += ' --log_avg_msec=%s' % self.log_avg_msec
        fio_cmd += ' %s > %s' % (self.names, out_file)
        logger.info('Running rbd fio %s test.', self.mode)
        common.pdsh(settings.getnodes('clients'), fio_cmd).communicate()

        # If we were doing recovery, wait until it's done.
        if 'recovery_test' in self.cluster.config:
            self.cluster.wait_recovery_done()

        monitoring.stop(self.run_dir)

        # Finally, get the historic ops
        self.cluster.dump_historic_ops(self.run_dir)
        common.sync_files('%s/*' % self.run_dir, self.out_dir)
Example #43
def make_movies(directory):
    sc = settings.cluster
    seekwatcher = '/home/%s/bin/seekwatcher' % sc.get('user')
    blktrace_dir = '%s/blktrace' % directory

    for device in xrange(0, sc.get('osds_per_node')):
        common.pdsh(
            settings.getnodes('osds'),
            'cd %s;%s -t device%s -o device%s.mpg --movie' %
            (blktrace_dir, seekwatcher, device, device))
Example #44
 def run(self):
     print 'Setting OSD Read Ahead to: %s' % self.osd_ra
     self.cluster.set_osd_param('read_ahead_kb', self.osd_ra)
     print 'Cleaning existing temporary run directory: %s' % self.run_dir
     common.pdsh(settings.getnodes('clients', 'osds', 'mons', 'rgws'), 'sudo rm -rf %s' % self.run_dir)
     if self.valgrind is not None:
         print 'Adding valgrind to the command path.'
         self.cmd_path_full = common.setup_valgrind(self.valgrind, self.getclass(), self.run_dir)
     # Set the full command path
     self.cmd_path_full += self.cmd_path
Example #45
    def post(self):
        if self.stoprequest.isSet():
            common.pdsh(settings.getnodes('head'), self.logcmd('Cluster is healthy, but stoprequest is set, finishing now.')).communicate()
            self.haltrequest.set()
            return

        if self.config.get("repeat", False):
            # reset counters
            self.outhealthtries = 0
            self.inhealthtries = 0

            common.pdsh(settings.getnodes('head'), self.logcmd('Cluster is healthy, but repeat is set.  Moving to "markdown" state.')).communicate()
            self.state = "markdown"
            return

        post_time = self.config.get("post_time", 60)
        common.pdsh(settings.getnodes('head'), self.logcmd('Cluster is healthy, completion in %s seconds.' % post_time)).communicate()
        time.sleep(post_time)
        self.state = "done"
Example #46
 def cleanup(self):
     super(KvmRbdFio, self).cleanup()
     clnts = settings.getnodes('clients')
     common.pdsh(clnts, 'killall fio').communicate()
     time.sleep(3)
     common.pdsh(clnts, 'killall -9 fio').communicate()
     time.sleep(3)
     common.pdsh(clnts, 'rm -rf /srv/*/*',
                 continue_if_error=False).communicate()
     common.pdsh(clnts, 'sudo umount /srv/* || echo -n').communicate()
Example #47
File: ceph.py Project: mosoriob/cbt
 def distribute_conf(self):
     nodes = settings.getnodes('head', 'clients', 'osds', 'mons', 'rgws')
     conf_file = self.config.get("conf_file")
     logger.info("Distributing %s.", conf_file)
     common.pdcp(nodes, '', conf_file, self.tmp_conf).communicate()
     common.pdsh(nodes,
                 'sudo mv /etc/ceph/ceph.conf /etc/ceph/ceph.conf.cbt.bak'
                 ).communicate()
     common.pdsh(nodes, 'sudo ln -s %s /etc/ceph/ceph.conf' %
                 self.tmp_conf).communicate()
Example #48
 def mkpools(self):
     monitoring.start("%s/pool_monitoring" % self.run_dir)
     for i in xrange(self.concurrent_procs):
         for node in settings.getnodes('clients').split(','):
             node = node.rpartition("@")[2]
             self.cluster.rmpool('rados-bench-%s-%s' % (node, i),
                                 self.pool_profile)
             self.cluster.mkpool('rados-bench-%s-%s' % (node, i),
                                 self.pool_profile)
     monitoring.stop()
Example #49
    def __init__(self, config):
        super(Ceph, self).__init__(config)
        self.health_wait = config.get('health_wait', 5)
        self.ceph_osd_cmd = config.get('ceph-osd_cmd', '/usr/bin/ceph-osd')
        self.ceph_mon_cmd = config.get('ceph-mon_cmd', '/usr/bin/ceph-mon')
        self.ceph_run_cmd = config.get('ceph-run_cmd', '/usr/bin/ceph-run')
        self.ceph_rgw_cmd = config.get('ceph-rgw_cmd', '/usr/bin/radosgw')
        self.ceph_mgr_cmd = config.get('ceph-mgr_cmd', '/usr/bin/ceph-mgr')
        self.ceph_mds_cmd = config.get('ceph-mds_cmd', '/usr/bin/ceph-mds')
        self.ceph_authtool_cmd = config.get('ceph-authtool_cmd', '/usr/bin/ceph-authtool')
        self.radosgw_admin_cmd = config.get('radosgw-admin_cmd', '/usr/bin/radosgw-admin')
        self.ceph_cmd = config.get('ceph_cmd', '/usr/bin/ceph')
        self.ceph_fuse_cmd = config.get('ceph-fuse_cmd', '/usr/bin/ceph-fuse')
        self.rados_cmd = config.get('rados_cmd', '/usr/bin/rados')
        self.rbd_cmd = config.get('rbd_cmd', '/usr/bin/rbd')
        self.rbd_nbd_cmd = config.get('rbd-nbd_cmd', '/usr/bin/rbd-nbd')
        self.rbd_fuse_cmd = config.get('rbd-fuse_cmd', '/usr/bin/rbd-fuse')
        self.mount_cmd = config.get('mount_cmd', '/usr/sbin/ceph.mount')
        self.log_dir = config.get('log_dir', "%s/log" % self.tmp_dir)
        self.pid_dir = config.get('pid_dir', "%s/pid" % self.tmp_dir)
        self.core_dir = config.get('core_dir', "%s/core" % self.tmp_dir)
        self.monitoring_dir = "%s/monitoring" % self.tmp_dir
        self.osdmap_fn = "%s/osdmap" % self.tmp_dir
        self.monmap_fn = "%s/monmap" % self.tmp_dir
        self.use_existing = config.get('use_existing', True)
        self.newstore_block = config.get('newstore_block', False)
        self.version_compat = config.get('version_compat', '')
        # these parameters control parallel OSD build 
        self.ceph_osd_online_rate = config.get('osd_online_rate', 10)
        self.ceph_osd_online_tmo = config.get('osd_online_timeout', 120)
        self.ceph_osd_parallel_creates = config.get('osd_parallel_creates')

        self.client_keyring = '/etc/ceph/ceph.keyring'
        self.client_secret = '/etc/ceph/ceph.secret'
        # If making the cluster, use the ceph.conf file distributed by initialize to the tmp_dir
        self.tmp_conf = '%s/ceph.conf' % self.tmp_dir
        # If using an existing cluster, default to /etc/ceph/ceph.conf
        if self.use_existing:
            self.tmp_conf = self.config.get('conf_file')

        self.osd_valgrind = config.get('osd_valgrind', None)
        self.mon_valgrind = config.get('mon_valgrind', None)
        self.rgw_valgrind = config.get('rgw_valgrind', None)
        self.mgr_valgrind = config.get('mgr_valgrind', None)
        self.tiering = config.get('tiering', False)
        self.ruleset_map = {}
        self.cur_ruleset = 1
        self.idle_duration = config.get('idle_duration', 0)
        self.use_existing = config.get('use_existing', True)
        self.stoprequest = threading.Event()
        self.haltrequest = threading.Event()

        self.urls = []
        self.auth_urls = []
        self.osd_count = config.get('osds_per_node') * len(settings.getnodes('osds'))
Example #50
    def _run(self, mode, run_dir, out_dir):
        # We'll always drop caches for rados bench
        self.dropcaches()

        concurrent_ops_str = ''
        if self.concurrent_ops:
            concurrent_ops_str = '--concurrent-ios %s' % self.concurrent_ops
        # determine rados version
        rados_version_str = subprocess.check_output(["rados", "-v"])
        m = re.findall("version (\d+)", rados_version_str)
        rados_version = int(m[0])

        if mode in ['write'] or rados_version < 9:
            op_size_str = '-b %s' % self.op_size
        else:
            op_size_str = ''

        common.make_remote_dir(run_dir)

        # dump the cluster config
        self.cluster.dump_config(run_dir)

        # Run the backfill testing thread if requested
        if 'recovery_test' in self.cluster.config:
            recovery_callback = self.recovery_callback
            self.cluster.create_recovery_test(run_dir, recovery_callback)

        # Run rados bench
        monitoring.start(run_dir)
        logger.info('Running radosbench %s test.' % mode)
        ps = []
        for i in xrange(self.concurrent_procs):
            out_file = '%s/output.%s' % (run_dir, i)
            objecter_log = '%s/objecter.%s.log' % (run_dir, i)
            # default behavior is to use a single storage pool
            pool_name = self.pool
            run_name = '--run-name %s`hostname -s`-%s' % (self.object_set_id,
                                                          i)
            if self.pool_per_proc:  # support previous behavior of 1 storage pool per rados process
                pool_name = 'rados-bench-`hostname -s`-%s' % i
                run_name = ''
            rados_bench_cmd = '%s -c %s -p %s bench %s %s %s %s %s --no-cleanup 2> %s > %s' % \
                 (self.cmd_path_full, self.tmp_conf, pool_name, op_size_str, self.time, mode, concurrent_ops_str, run_name, objecter_log, out_file)
            p = common.pdsh(settings.getnodes('clients'), rados_bench_cmd)
            ps.append(p)
        for p in ps:
            p.wait()
        monitoring.stop(run_dir)

        # If we were doing recovery, wait until it's done.
        if 'recovery_test' in self.cluster.config:
            self.cluster.wait_recovery_done()

        # Finally, get the historic ops
        self.cluster.dump_historic_ops(run_dir)
        common.sync_files('%s/*' % run_dir, out_dir)
Example #51
    def run(self):
        super(RawFio, self).run()
        # Get the client nodes to run fio on
        clnts = settings.getnodes('clients')

        # We'll always drop caches before the test
        self.dropcaches()

        monitoring.start(self.run_dir)

        time.sleep(5)

        logger.info('Starting raw fio %s test.', self.mode)

        fio_process_list = []
        for i in range(self.concurrent_procs):
            b = self.block_devices[i % len(self.block_devices)]
            fiopath = b
            out_file = '%s/output.%d' % (self.run_dir, i)
            fio_cmd = 'sudo %s' % self.fio_cmd
            fio_cmd += ' --rw=%s' % self.mode
            if (self.mode == 'readwrite' or self.mode == 'randrw'):
                fio_cmd += ' --rwmixread=%s --rwmixwrite=%s' % (
                    self.rwmixread, self.rwmixwrite)
            fio_cmd += ' --ioengine=%s' % self.ioengine
            fio_cmd += ' --runtime=%s' % self.time
            fio_cmd += ' --ramp_time=%s' % self.ramp
            if self.startdelay:
                fio_cmd += ' --startdelay=%s' % self.startdelay
            if self.rate_iops:
                fio_cmd += ' --rate_iops=%s' % self.rate_iops
            fio_cmd += ' --numjobs=%s' % self.numjobs
            fio_cmd += ' --direct=%s' % self.direct
            fio_cmd += ' --bs=%dB' % self.op_size
            fio_cmd += ' --iodepth=%d' % self.iodepth
            fio_cmd += ' --size=%dM' % self.vol_size
            if self.log_iops:
                fio_cmd += ' --write_iops_log=%s' % out_file
            if self.log_bw:
                fio_cmd += ' --write_bw_log=%s' % out_file
            if self.log_lat:
                fio_cmd += ' --write_lat_log=%s' % out_file
            fio_cmd += ' --output-format=%s' % self.fio_out_format
            if 'recovery_test' in self.cluster.config:
                fio_cmd += ' --time_based'
            fio_cmd += ' --name=%s > %s' % (fiopath, out_file)
            logger.debug("FIO CMD: %s" % fio_cmd)
            fio_process_list.append(
                common.pdsh(clnts, fio_cmd, continue_if_error=False))
        for p in fio_process_list:
            p.communicate()
        monitoring.stop(self.run_dir)
        logger.info('Finished raw fio test')

        common.sync_files('%s/*' % self.run_dir, self.out_dir)
Example #52
    def start_rgw(self):
        user = settings.cluster.get('user')
        rgwhosts = settings.cluster.get('rgws')

        if not rgwhosts:
            return

        # If we are starting rGW, make the RGW pools
        self.make_rgw_pools()

        for rgwhost, gateways in rgwhosts.iteritems():
            for rgwname, rgwsettings in gateways.iteritems():
                host = rgwsettings.get('host', rgwhost)
                port = rgwsettings.get('port', None)
                ssl_certificate = rgwsettings.get('ssl_certificate', None)

                # Build the auth_url
                auth_url = "http://" if ssl_certificate is None else "https://"
                auth_url += host
                auth_url += ":7480" if port is None else ":%s" % port
                auth_url += "/auth/v1.0"
                self.auth_urls.append(auth_url)

                # set the rgw_frontends
                rgw_frontends = None
                if ssl_certificate is not None:
                    rgw_frontends = "civetweb ssl_certificate=%s" % ssl_certificate
                if port is not None:
                    if rgw_frontends is None:
                        rgw_frontends = "civetweb"
                    rgw_frontends += " port=%s" % port

                cmd = '%s -c %s -n %s --log-file=%s/rgw.log' % (self.ceph_rgw_cmd, self.tmp_conf, rgwname, self.log_dir)
                if rgw_frontends is not None:
                    cmd += " --rgw-frontends='%s'" % rgw_frontends
                if self.rgw_valgrind:
                    cmd = "%s %s" % (common.setup_valgrind(self.rgw_valgrind, 'rgw.%s' % host, self.tmp_dir), cmd)
                else:
                    cmd = '%s %s' % (self.ceph_run_cmd, cmd)

                # Default to the bare host name; prefix the user only if one is set.
                pdshhost = rgwhost
                if user:
                    pdshhost = '%s@%s' % (user, rgwhost)
                common.pdsh(pdshhost, 'sudo sh -c "ulimit -n 16384 && ulimit -c unlimited && exec %s"' % cmd).communicate()

                # set min_size of pools to 1, when there is only one osd
                num_osds = len(settings.cluster.get('osds'))
                rgw_default_pools = ['.rgw.root', 'default.rgw.control', 'default.rgw.meta', 'default.rgw.log']
                pool_min_repl_size = 1

                if num_osds == 1:
                    time.sleep(5)
                    for pool in rgw_default_pools:
                        common.pdsh(settings.getnodes('head'),
                                    'sudo %s -c %s osd pool set %s min_size %d' %
                                    (self.ceph_cmd, self.tmp_conf, pool, pool_min_repl_size),
                                    continue_if_error=False).communicate()
                        time.sleep(5)
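
To make the URL and frontend assembly above concrete, a hypothetical rgws entry (host name, port and certificate path are invented for illustration) would yield:

    # Given host = 'rgw01' and
    # rgwsettings = {'port': 8080, 'ssl_certificate': '/etc/ceph/rgw.pem'},
    # the code above produces:
    #   auth_url      = 'https://rgw01:8080/auth/v1.0'
    #   rgw_frontends = 'civetweb ssl_certificate=/etc/ceph/rgw.pem port=8080'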
Example no. 53
def sync_files(remote_dir, local_dir):
    nodes = settings.getnodes('clients', 'osds', 'mons', 'rgws', 'mds')

    if not os.path.exists(local_dir):
        os.makedirs(local_dir)

    # Make sure the chown completes before the files are copied back.
    pdsh(
        nodes,
        'sudo chown -R %s.%s %s' % (settings.cluster.get('user'),
                                    settings.cluster.get('user'), remote_dir)).communicate()
    rpdcp(nodes, '-r', remote_dir, local_dir).communicate()
Example no. 54
    def mkpools(self):
        with monitoring.monitor("%s/pool_monitoring" % self.run_dir):
            if self.pool_per_proc:  # allow use of a separate storage pool per process
                for i in range(self.concurrent_procs):
                    for node in settings.getnodes('clients').split(','):
                        node = node.rpartition("@")[2]
                        self.cluster.rmpool('rados-bench-%s-%s' % (node, i), self.pool_profile)
                        self.cluster.mkpool('rados-bench-%s-%s' % (node, i), self.pool_profile, 'radosbench')
            else:  # the default behavior is to use a single Ceph storage pool for all rados bench processes
                self.cluster.rmpool('rados-bench-cbt', self.pool_profile)
                self.cluster.mkpool('rados-bench-cbt', self.pool_profile, 'radosbench')
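
For reference, the two branches above yield pool names along these lines (the client host name is a placeholder):

    # pool_per_proc: rados-bench-client01-0, rados-bench-client01-1, ...
    #                (one pool per client host and per rados bench process)
    # default:       rados-bench-cbt, shared by every rados bench process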
Example no. 55
def sync_files(remote_dir, local_dir):
    nodes = settings.getnodes('clients', 'osds', 'mons', 'rgws', 'mds')

    if not os.path.exists(local_dir):
        os.makedirs(local_dir)

    if 'user' in settings.cluster:
        pdsh(nodes, 
             'sudo chown -R {0}.{0} {1}'.format(settings.cluster['user'], remote_dir),
             continue_if_error=False).communicate()
    rpdcp(nodes, '-r', remote_dir, local_dir).communicate()
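
A minimal usage sketch, assuming a typical layout where results are pulled back from the remote run directory after a test (both paths are placeholders):

    # sync_files('/tmp/cbt/00000000/*', '/home/user/cbt/results/00000000')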
Example no. 56
File: ceph.py Project: mosoriob/cbt
    def setup_fs(self):
        sc = settings.cluster
        fs = sc.get('fs')
        mkfs_opts = sc.get('mkfs_opts', '')
        mount_opts = sc.get('mount_opts', '')

        if not fs:
            settings.shutdown("No OSD filesystem specified. Exiting.")

        mkfs_threads = []
        for device in xrange(0, sc.get('osds_per_node')):
            osds = settings.getnodes('osds')
            common.pdsh(
                osds, 'sudo umount /dev/disk/by-partlabel/osd-device-%s-data' %
                device).communicate()
            common.pdsh(
                osds, 'sudo rm -rf %s/osd-device-%s-data' %
                (self.mnt_dir, device)).communicate()
            common.pdsh(
                osds, 'sudo mkdir -p -m0755 -- %s/osd-device-%s-data' %
                (self.mnt_dir, device)).communicate()

            if fs == 'tmpfs':
                logger.info('using tmpfs osds, not creating a file system.')
            elif fs == 'zfs':
                logger.info('ruhoh, zfs detected.  No mkfs for you!')
                common.pdsh(osds, 'sudo zpool destroy osd-device-%s-data' %
                            device).communicate()
                common.pdsh(
                    osds,
                    'sudo zpool create -f -O xattr=sa -m legacy osd-device-%s-data /dev/disk/by-partlabel/osd-device-%s-data'
                    % (device, device)).communicate()
                common.pdsh(
                    osds,
                    'sudo zpool add osd-device-%s-data log /dev/disk/by-partlabel/osd-device-%s-zil'
                    % (device, device)).communicate()
                common.pdsh(
                    osds,
                    'sudo mount %s -t zfs osd-device-%s-data %s/osd-device-%s-data'
                    %
                    (mount_opts, device, self.mnt_dir, device)).communicate()
            else:
                # do mkfs and mount in 1 long command
                # alternative is to wait until make_osds to mount it
                mkfs_cmd = 'sudo sh -c "mkfs.%s %s /dev/disk/by-partlabel/osd-device-%s-data ; ' % (
                    fs, mkfs_opts, device)
                mkfs_cmd += 'mount %s -t %s /dev/disk/by-partlabel/osd-device-%s-data %s/osd-device-%s-data"' % (
                    mount_opts, fs, device, self.mnt_dir, device)
                mkfs_threads.append((device, common.pdsh(osds, mkfs_cmd)))
        for device, t in mkfs_threads:  # for tmpfs and zfs cases, thread list is empty
            logger.info('Waiting for mkfs and mount of device %d on all hosts',
                        device)
            t.communicate()
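
As an illustration of the combined mkfs-and-mount command built in the final branch above, assuming hypothetical values fs='xfs', mkfs_opts='-f', an empty mount_opts, device 0 and mnt_dir='/tmp/cbt/mnt':

    # sudo sh -c "mkfs.xfs -f /dev/disk/by-partlabel/osd-device-0-data ;
    #             mount  -t xfs /dev/disk/by-partlabel/osd-device-0-data /tmp/cbt/mnt/osd-device-0-data"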
Example no. 57
    def run(self):
        if self.osd_ra and self.osd_ra_changed:
            logger.info('Setting OSD Read Ahead to: %s', self.osd_ra)
            self.cluster.set_osd_param('read_ahead_kb', self.osd_ra)

        logger.debug('Cleaning existing temporary run directory: %s', self.run_dir)
        common.pdsh(settings.getnodes('clients', 'osds', 'mons', 'rgws'), 'sudo rm -rf %s' % self.run_dir).communicate()
        if self.valgrind is not None:
            logger.debug('Adding valgrind to the command path.')
            self.cmd_path_full = common.setup_valgrind(self.valgrind, self.getclass(), self.run_dir)
        # Set the full command path
        self.cmd_path_full += self.cmd_path
Example no. 58
def start(directory):
    nodes = settings.getnodes('clients', 'osds', 'mons', 'rgws')
    collectl_dir = '%s/collectl' % directory
    # perf_dir = '%s/perf' % directory
    # blktrace_dir = '%s/blktrace' % directory

    # collectl
    rawdskfilt = 'cciss/c\d+d\d+ |hd[ab] | sd[a-z]+ |dm-\d+ |xvd[a-z] |fio[a-z]+ | vd[a-z]+ |emcpower[a-z]+ |psv\d+ |nvme[0-9]n[0-9]+p[0-9]+ '
    # Make sure the output directory exists on every node before collectl starts.
    common.pdsh(nodes, 'mkdir -p -m0755 -- %s' % collectl_dir).communicate()
    # Start collectl in the background: add memory, slab (Y) and process (Z) detail
    # to the default subsystems, sample every 1s (process data every 10s), flush
    # output buffers immediately (-F0), and only record disks matching rawdskfilt.
    common.pdsh(
        nodes, 'collectl -s+mYZ -i 1:10 --rawdskfilt "%s" -F0 -f %s' %
        (rawdskfilt, collectl_dir))
Example no. 59
    def initialize(self):
        super(StdFioBench, self).initialize()
        for i in xrange(1):
            letter = string.ascii_lowercase[i + 1]
            if not self.use_existing:
                common.pdsh(settings.getnodes('clients'), 'sudo umount -f %s' %
                            (self.block_dev_name)).communicate()
                common.pdsh(
                    settings.getnodes('clients'), 'sudo mkfs.%s -f  %s' %
                    (self.filesystem, self.block_dev_name)).communicate()
            common.pdsh(settings.getnodes('clients'), 'sudo mkdir -p %s ' %
                        (self.mount_point_name)).communicate()
            common.pdsh(
                settings.getnodes('clients'),
                'sudo mount -t %s -o noatime %s %s' %
                (self.filesystem, self.block_dev_name,
                 self.mount_point_name)).communicate()
            common.pdsh(
                settings.getnodes('clients'),
                'sudo mkdir -p %s/`hostname -s`-%d' %
                (self.mount_point_name, i)).communicate()

        # Create the run directory
        common.make_remote_dir(self.run_dir)

        # populate the fio files
        logger.info('Attempting to populate fio files...')
        pre_cmd = 'sudo %s --rw=write --ioengine=sync --numjobs=%s --bs=8M --size %dM %s > /dev/null ' % (
            self.fio_cmd, self.numjobs, self.vol_size, self.names)
        common.pdsh(settings.getnodes('clients'), pre_cmd).communicate()
Example no. 60
    def osdout(self):
        ret = self.cluster.check_health("%s/recovery.log" %
                                        self.config.get('run_dir'))
        common.pdsh(settings.getnodes('head'), self.logcmd("ret: %s" % ret))

        if self.outhealthtries < self.maxhealthtries and ret == 0:
            self.outhealthtries = self.outhealthtries + 1
            return  # Cluster hasn't become unhealthy yet.

        if ret == 0:
            common.pdsh(settings.getnodes('head'),
                        self.logcmd('Cluster never went unhealthy.'))
        else:
            common.pdsh(settings.getnodes('head'),
                        self.logcmd('Cluster appears to have healed.'))

        lcmd = self.logcmd("Unsetting the ceph osd noup flag")
        common.pdsh(
            settings.getnodes('head'), 'ceph -c %s osd unset noup;%s' %
            (self.cluster.tmp_conf, lcmd))
        for osdnum in self.config.get('osds'):
            lcmd = self.logcmd("Marking OSD %s up." % osdnum)
            common.pdsh(
                settings.getnodes('head'), 'ceph -c %s osd up %s;%s' %
                (self.cluster.tmp_conf, osdnum, lcmd))
            lcmd = self.logcmd("Marking OSD %s in." % osdnum)
            common.pdsh(
                settings.getnodes('head'), 'ceph -c %s osd in %s;%s' %
                (self.cluster.tmp_conf, osdnum, lcmd))

        self.state = "osdin"