Example #1
 def distribute_conf(self):
     nodes = settings.getnodes('head', 'clients', 'osds', 'mons', 'rgws')
     conf_file = self.config.get("conf_file")
     print "Distributing %s." % conf_file
     common.pdcp(nodes, '', conf_file, self.tmp_conf)
     common.pdsh(nodes, 'sudo mv /etc/ceph/ceph.conf /etc/ceph/ceph.conf.cbt.bak', True)
     common.pdsh(nodes, 'sudo ln -s %s /etc/ceph/ceph.conf' % self.tmp_conf)
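Every example on this page funnels remote work through CBT's common.pdsh/common.pdcp helpers and then blocks on the returned handle with .communicate() or .wait(). As a rough mental model, here is a minimal sketch of such wrappers built on subprocess and the pdsh/pdcp/rpdcp tools. This is an assumption for illustration, not CBT's actual implementation; in particular, the real helpers honor continue_if_error, which this sketch only accepts for signature parity.

import subprocess

def pdsh(nodes, command, continue_if_error=True):
    # Run `command` on every host in the comma-separated `nodes` string.
    # Returning the Popen object lets callers chain .communicate() or
    # .wait(), exactly as the examples on this page do.
    # (continue_if_error is accepted for parity; this sketch ignores it.)
    return subprocess.Popen(['pdsh', '-R', 'ssh', '-w', nodes, command],
                            stdout=subprocess.PIPE, stderr=subprocess.PIPE,
                            universal_newlines=True)

def pdcp(nodes, flags, localfile, remotefile):
    # Push a local file out to every node.
    args = ['pdcp', '-R', 'ssh', '-w', nodes]
    if flags:
        args.append(flags)
    return subprocess.Popen(args + [localfile, remotefile],
                            stdout=subprocess.PIPE, stderr=subprocess.PIPE)

def rpdcp(nodes, flags, remotefile, localdir):
    # Reverse copy: gather `remotefile` from every node into `localdir`
    # (the real rpdcp tool appends the source hostname to each collected file).
    args = ['rpdcp', '-R', 'ssh', '-w', nodes]
    if flags:
        args.append(flags)
    return subprocess.Popen(args + [remotefile, localdir],
                            stdout=subprocess.PIPE, stderr=subprocess.PIPE)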
Example #2
 def pre(self):
     pre_time = self.config.get("pre_time", 60)
     common.pdsh(settings.getnodes('head'), self.logcmd('Starting Recovery Test Thread, waiting %s seconds.' % pre_time)).communicate()
     time.sleep(pre_time)
     lcmd = self.logcmd("Setting the ceph osd noup flag")
     common.pdsh(settings.getnodes('head'), '%s -c %s osd set noup;%s' % (self.ceph_cmd, self.cluster.tmp_conf, lcmd)).communicate()
     self.state = 'markdown'
Example #3
    def initialize(self): 
        super(RawFio, self).initialize()
        common.pdsh(settings.getnodes('clients'),
                    'sudo rm -rf %s' % self.run_dir,
                    continue_if_error=False).communicate()
        common.make_remote_dir(self.run_dir)
        clnts = settings.getnodes('clients')
        logger.info('creating mountpoints...')

        logger.info('Attempting to initialize fio files...')
        initializer_list = []
        for i in range(self.concurrent_procs):
            b = self.block_devices[i % len(self.block_devices)]
            fiopath = b
            pre_cmd = 'sudo %s --rw=write -ioengine=%s --bs=%s ' % (self.fio_cmd, self.ioengine, self.op_size)
            pre_cmd = '%s --size %dM --name=%s --output-format=%s > /dev/null' % (
                       pre_cmd, self.vol_size, fiopath, self.fio_out_format)
            initializer_list.append(common.pdsh(clnts, pre_cmd,
                                    continue_if_error=False))
        for p in initializer_list:
            p.communicate()

        # Create the run directory
        common.pdsh(clnts, 'rm -rf %s' % self.run_dir,
                    continue_if_error=False).communicate()
        common.make_remote_dir(self.run_dir)
Example #4
def make_movies(directory):
    sc = settings.cluster
    seekwatcher = '/home/%s/bin/seekwatcher' % sc.get('user')
    blktrace_dir = '%s/blktrace' % directory

    for device in xrange(0, sc.get('osds_per_node')):
        common.pdsh(settings.getnodes('osds'), 'cd %s;%s -t device%s -o device%s.mpg --movie' % (blktrace_dir,seekwatcher,device,device))
Example #5
    def initialize(self): 
        super(RbdFio, self).initialize()

        logger.info('Running scrub monitoring.')
        monitoring.start("%s/scrub_monitoring" % self.run_dir)
        self.cluster.check_scrub()
        monitoring.stop()

        logger.info('Pausing for 60s for idle monitoring.')
        monitoring.start("%s/idle_monitoring" % self.run_dir)
        time.sleep(60)
        monitoring.stop()

        common.sync_files('%s/*' % self.run_dir, self.out_dir)

        self.mkimages()
 
        # Create the run directory
        common.make_remote_dir(self.run_dir)

        # populate the fio files
        logger.info('Attempting to populate fio files...')
        pre_cmd = 'sudo %s --ioengine=%s --rw=write --numjobs=%s --bs=4M --size %dM %s > /dev/null' % (self.cmd_path, self.ioengine, self.numjobs, self.vol_size*0.9, self.names)
        common.pdsh(settings.getnodes('clients'), pre_cmd).communicate()

        return True
Example #6
    def test_pdcp_rpdcp(self):
        nodes = os.environ[var_name]
        with warnings.catch_warnings():
            warnings.simplefilter("ignore")
            fname = os.tempnam()

        val = str(uuid.uuid1())
        with open(fname, "w") as fd:
            fd.write(val)

        try:
            common.pdcp(nodes, None, fname, fname).communicate()
            out, err = common.pdsh(nodes, "cat " + fname).communicate()
            for node in iter_nodes(nodes):
                self.assertIn("{0}: {1}\n".format(node, val), out)
        finally:
            os.unlink(fname)

        common.rpdcp(nodes, None, fname, os.path.dirname(fname)).communicate()
        try:
            with open(fname) as fd:
                self.assertEqual(fd.read(), val)
        finally:
            os.unlink(fname)

        common.pdsh(nodes, "rm " + fname).communicate()
Example #7
    def initialize(self): 
        common.cleanup_tests()
        if not self.use_existing:
            common.setup_cluster()
            common.setup_ceph()

            # Create the run directory
            common.make_remote_dir(self.run_dir)

            # Setup the pools

            monitoring.start("%s/pool_monitoring" % self.run_dir)
            for i in xrange(self.concurrent_procs):
                for node in settings.getnodes('clients').split(','):
                    node = node.rpartition("@")[2]
                    common.pdsh(settings.getnodes('head'), 'sudo ceph osd pool create rados-bench-%s-%s %d %d' % (node, i, self.pgs_per_pool, self.pgs_per_pool)).communicate()
                    common.pdsh(settings.getnodes('head'), 'sudo ceph osd pool set rados-bench-%s-%s size 1' % (node, i)).communicate()
                    # check the health for each pool.
                    print 'Checking Health after pool creation.'
                    common.check_health()
            monitoring.stop()

        print 'Running scrub monitoring.'
        monitoring.start("%s/scrub_monitoring" % self.run_dir)
        common.check_scrub()
        monitoring.stop()

        print 'Pausing for 60s for idle monitoring.'
        monitoring.start("%s/idle_monitoring" % self.run_dir)
        time.sleep(60)
        monitoring.stop()

        common.sync_files('%s/*' % self.run_dir, self.out_dir)

        return True
Example #8
def make_movies(directory):
    sc = settings.cluster
    seekwatcher = '/usr/bin/seekwatcher'
    blktrace_dir = '%s/blktrace' % directory

    for device in 'bcdefghijklm':
        common.pdsh(settings.getnodes('osds'), 'cd %s;%s -t sd%s1 -o sd%s1.mpg --movie' % (blktrace_dir,seekwatcher,device,device)).communicate()
Example #9
    def run(self):
        super(KvmRbdFio, self).run()
        # We'll always drop caches before the test
        self.dropcaches()

        monitoring.start(self.run_dir)

        time.sleep(5)
        names = ""
        for i in xrange(self.concurrent_procs):
            names += "--name=/srv/rbdfio-`hostname -s`-%d/cbt-kvmrbdfio " % i
        out_file = '%s/output' % self.run_dir
        pre_cmd = 'sudo fio --rw=read -ioengine=sync --numjobs=1 --bs=4M --runtime=1 --size %dM %s > /dev/null' % (self.vol_size * 9/10, names)
        fio_cmd = 'sudo fio --rw=%s -ioengine=%s --runtime=%s --numjobs=1 --direct=1 --bs=%dB --iodepth=%d --size %dM %s > %s' %  (self.mode, self.ioengine, self.time, self.op_size, self.iodepth, self.vol_size * 9/10, names, out_file)
        print 'Attempting to populate fio files...'
        common.pdsh(settings.cluster.get('clients'), pre_cmd).communicate()
        print 'Running rbd fio %s test.' % self.mode
        common.pdsh(settings.cluster.get('clients'), fio_cmd).communicate()
#        ps = []
#        for i in xrange(self.concurrent_procs):
#            out_file = '%s/output.%s' % (self.run_dir, i)
#            p = common.pdsh(settings.cluster.get('clients'), 'sudo fio --rw=%s -ioengine=%s --runtime=%s --name=/srv/rbdfio-`hostname -s`-%d/cbt-rbdfio --numjobs=1 --direct=1 --bs=%dB --iodepth=%d --size %dM > %s' % (self.mode, self.ioengine, self.time, i, self.op_size, self.iodepth, self.vol_size * 9/10, out_file))
#            ps.append(p)
#        for p in ps:
#            p.wait()
        monitoring.stop(self.run_dir)
        common.sync_files('%s/*' % self.run_dir, self.out_dir)
Example #10
 def run(self):
     self.stoprequest.clear()
     while not self.stoprequest.isSet():
         self.states[self.state]()
     common.pdsh(
         settings.getnodes("head"), self.logcmd("Exiting recovery test thread.  Last state was: %s" % self.state)
     ).communicate()
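The run() loop above is a small state machine: self.states maps each state name to a handler, and every handler ends by assigning the next state. Below is a hedged, self-contained sketch of how that table is plausibly wired, with the state names and transitions taken from Examples #2, #35, #36, #43, and #47 elsewhere on this page; the class name, constructor shape, and the 'done' handler are assumptions.

import threading

class RecoveryTestThreadSketch(threading.Thread):
    def __init__(self):
        super(RecoveryTestThreadSketch, self).__init__()
        self.state = 'pre'
        self.stoprequest = threading.Event()
        self.haltrequest = threading.Event()
        self.states = {
            'pre': self.pre,            # wait, then set the osd noup flag
            'markdown': self.markdown,  # mark OSDs down and out
            'osdout': self.osdout,      # wait for recovery, bring OSDs back up/in
            'osdin': self.osdin,        # wait for the cluster to heal
            'post': self.post,          # settle, then finish or repeat
            'done': self.done,          # assumed: signal the loop in run() to halt
        }

    # Stub handlers: each real handler does its work and then assigns
    # self.state to the next phase, as the examples on this page show.
    def pre(self): self.state = 'markdown'
    def markdown(self): self.state = 'osdout'
    def osdout(self): self.state = 'osdin'
    def osdin(self): self.state = 'post'
    def post(self): self.state = 'done'
    def done(self): self.haltrequest.set(); self.stoprequest.set()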
Example #11
    def run(self):
        super(RbdFio, self).run()
        # Set client readahead
        self.set_client_param('read_ahead_kb', self.client_ra)

        # We'll always drop caches before the test
        self.dropcaches()

        common.make_remote_dir(self.run_dir)
        monitoring.start(self.run_dir)
        # Run rados bench
        print 'Running rbd fio %s test.' % self.mode
        names = ""
        for i in xrange(self.concurrent_procs):
            names += "--name=%s/mnt/rbdfio-`hostname -s`-%d/cbt-rbdfio " % (self.tmp_dir, i)
        out_file = '%s/output' % self.run_dir
        fio_cmd = 'sudo fio --rw=%s -ioengine=%s --runtime=%s --numjobs=1 --direct=1 --bs=%dB --iodepth=%d --size %dM %s > %s' %  (self.mode, self.ioengine, self.time, self.op_size, self.iodepth, self.vol_size * 9/10, names, out_file)
        common.pdsh(settings.getnodes('clients'), fio_cmd).communicate()
#        ps = []
#        for i in xrange(self.concurrent_procs):
#            out_file = '%s/output.%s' % (self.run_dir, i)
#            p = common.pdsh(settings.cluster.get('clients'), 'sudo fio --rw=%s -ioengine=%s --runtime=%s --name=/srv/rbdfio-`hostname -s`-%d/cbt-rbdfio --numjobs=1 --direct=1 --bs=%dB --iodepth=%d --size %dM > %s' % (self.mode, self.ioengine, self.time, i, self.op_size, self.iodepth, self.vol_size * 9/10, out_file))
#            ps.append(p)
#        for p in ps:
#            p.wait()
        monitoring.stop(self.run_dir)
        common.sync_files('%s/*' % self.run_dir, self.out_dir)
Example #12
    def initialize(self): 
        super(LibrbdFio, self).initialize()

        print 'Running scrub monitoring.'
        monitoring.start("%s/scrub_monitoring" % self.run_dir)
        self.cluster.check_scrub()
        monitoring.stop()

        print 'Pausing for 60s for idle monitoring.'
        monitoring.start("%s/idle_monitoring" % self.run_dir)
        time.sleep(60)
        monitoring.stop()

        common.sync_files('%s/*' % self.run_dir, self.out_dir)

        self.mkimages()

        # Create the run directory
        common.make_remote_dir(self.run_dir)

        # populate the fio files
        print 'Attempting to populate fio files...'
        pre_cmd = 'sudo %s --ioengine=rbd --clientname=admin --pool=%s --rbdname=cbt-librbdfio-`hostname -s` --invalidate=0  --rw=write --numjobs=%s --bs=4M --size %dM %s > /dev/null' % (self.cmd_path, self.poolname, self.numjobs, self.vol_size, self.names)
        common.pdsh(settings.getnodes('clients'), pre_cmd).communicate()

        return True
Example #13
    def rmpool(self, name, profile_name):
        pool_profiles = self.config.get("pool_profiles", {"default": {}})
        profile = pool_profiles.get(profile_name, {})
        cache_profile = profile.get("cache_profile", None)
        if cache_profile:
            cache_name = "%s-cache" % name

            # flush and remove the overlay and such
            common.pdsh(
                settings.getnodes("head"),
                "sudo ceph -c %s osd tier cache-mode %s forward" % (self.tmp_conf, cache_name),
            ).communicate()
            common.pdsh(
                settings.getnodes("head"), "sudo rados -c %s -p %s cache-flush-evict-all" % (self.tmp_conf, cache_name)
            ).communicate()
            common.pdsh(
                settings.getnodes("head"), "sudo ceph -c %s osd tier remove-overlay %s" % (self.tmp_conf, name)
            ).communicate()
            common.pdsh(
                settings.getnodes("head"), "sudo ceph -c %s osd tier remove %s %s" % (self.tmp_conf, name, cache_name)
            ).communicate()

            # delete the cache pool
            self.rmpool(cache_name, cache_profile)
        common.pdsh(
            settings.getnodes("head"),
            "sudo ceph -c %s osd pool delete %s %s --yes-i-really-really-mean-it" % (self.tmp_conf, name, name),
        ).communicate()
Example #14
 def distribute_conf(self):
     nodes = settings.getnodes("head", "clients", "osds", "mons", "rgws")
     conf_file = self.config.get("conf_file")
     print "Distributing %s." % conf_file
     common.pdcp(nodes, "", conf_file, self.tmp_conf).communicate()
     common.pdsh(nodes, "sudo mv /etc/ceph/ceph.conf /etc/ceph/ceph.conf.cbt.bak").communicate()
     common.pdsh(nodes, "sudo ln -s %s /etc/ceph/ceph.conf" % self.tmp_conf).communicate()
Example #15
 def distribute_conf(self):
     nodes = settings.getnodes('head', 'clients', 'osds', 'mons', 'rgws')
     conf_file = self.config.get("conf_file")
     logger.info("Distributing %s.", conf_file)
     common.pdcp(nodes, '', conf_file, self.tmp_conf).communicate()
     common.pdsh(nodes, 'sudo mv /etc/ceph/ceph.conf /etc/ceph/ceph.conf.cbt.bak').communicate()
     common.pdsh(nodes, 'sudo ln -s %s /etc/ceph/ceph.conf' % self.tmp_conf).communicate()
Example #16
    def make_osds(self):
        osdnum = 0
        osdhosts = settings.cluster.get('osds')

        for host in osdhosts:
            user = settings.cluster.get('user')
            if user:
                pdshhost = '%s@%s' % (user, host)

            for i in xrange(0, settings.cluster.get('osds_per_node')):            
                # Build the OSD
                osduuid = str(uuid.uuid4())
                key_fn = '%s/osd-device-%s-data/keyring' % (self.mnt_dir, i)
                common.pdsh(pdshhost, 'sudo %s -c %s osd create %s' % (self.ceph_cmd, self.tmp_conf, osduuid)).communicate()
                common.pdsh(pdshhost, 'sudo %s -c %s osd crush add osd.%d 1.0 host=%s rack=localrack root=default' % (self.ceph_cmd, self.tmp_conf, osdnum, host)).communicate()
                common.pdsh(pdshhost, 'sudo sh -c "ulimit -n 16384 && ulimit -c unlimited && exec %s -c %s -i %d --mkfs --mkkey --osd-uuid %s"' % (self.ceph_osd_cmd, self.tmp_conf, osdnum, osduuid)).communicate()
                common.pdsh(pdshhost, 'sudo %s -c %s -i %s auth add osd.%d osd "allow *" mon "allow profile osd"' % (self.ceph_cmd, self.tmp_conf, key_fn, osdnum)).communicate()

                # Start the OSD
                pidfile="%s/ceph-osd.%d.pid" % (self.pid_dir, osdnum)
                cmd = '%s -c %s -i %d --pid-file=%s' % (self.ceph_osd_cmd, self.tmp_conf, osdnum, pidfile)
                if self.osd_valgrind:
                    cmd = "%s %s" % (common.setup_valgrind(self.osd_valgrind, 'osd.%d' % osdnum, self.tmp_dir), cmd)
                else:
                    cmd = '%s %s' % (self.ceph_run_cmd, cmd)
                stderr_file = "%s/osd.%d.stderr" % (self.tmp_dir, osdnum)

                common.pdsh(pdshhost, 'sudo sh -c "ulimit -n 16384 && ulimit -c unlimited && exec %s 2> %s"' % (cmd, stderr_file)).communicate()
                osdnum = osdnum+1
Example #17
 def mkimages(self):
     monitoring.start("%s/pool_monitoring" % self.run_dir)
     self.cluster.rmpool(self.poolname, self.pool_profile)
     self.cluster.mkpool(self.poolname, self.pool_profile)
     for node in settings.getnodes('clients').split(','):
         node = node.rpartition("@")[2]
         common.pdsh(settings.getnodes('head'), '/usr/bin/rbd create cbt-librbdfio-%s --size %s --pool %s --order %s' % (node, self.vol_size, self.poolname, self.vol_order)).communicate()
     monitoring.stop()
Example #18
 def run(self):
     print "Setting OSD Read Ahead to: %s" % self.osd_ra
     self.cluster.set_osd_param("read_ahead_kb", self.osd_ra)
     print "Cleaning existing temporary run directory: %s" % self.run_dir
     common.pdsh(settings.getnodes("clients", "osds", "mons", "rgws"), "sudo rm -rf %s" % self.run_dir).communicate()
     if self.valgrind is not None:
         print "Adding valgrind to the command path."
         self.cmd_path_full = common.setup_valgrind(self.valgrind, self.getclass(), self.run_dir)
Example #19
    def cleanup(self):
         super(RawFio, self).cleanup()
         clnts = settings.getnodes('clients')

         logger.debug("Kill fio: %s" % clnts)
         common.pdsh(clnts, 'killall fio').communicate()
         time.sleep(3)
         common.pdsh(clnts, 'killall -9 fio').communicate()
Example #20
def start(directory):
    nodes = settings.getnodes('clients', 'servers', 'mons', 'rgws')
    collectl_dir = '%s/collectl' % directory
    perf_dir = '%s/perf' % directory
    blktrace_dir = '%s/blktrace' % directory

    # collectl
    common.pdsh(nodes, 'mkdir -p -m0755 -- %s;collectl -s+mYZ -i 1:10 -F0 -f %s' % (collectl_dir,collectl_dir))
Example #21
    def run(self):
        super(LibrbdFio, self).run()

        # We'll always drop caches before the test
        self.dropcaches()

        # dump the cluster config
        self.cluster.dump_config(self.run_dir)

        monitoring.start(self.run_dir)

        time.sleep(5)
        out_file = '%s/output' % self.run_dir
        fio_cmd = 'sudo %s --ioengine=rbd --clientname=admin --pool=%s --rbdname=cbt-librbdfio-`hostname -s` --invalidate=0' % (self.cmd_path_full, self.poolname)
        fio_cmd += ' --rw=%s' % self.mode
        if (self.mode == 'readwrite' or self.mode == 'randrw'):
            fio_cmd += ' --rwmixread=%s --rwmixwrite=%s' % (self.rwmixread, self.rwmixwrite)
#        fio_cmd += ' --ioengine=%s' % self.ioengine
        if self.time is not None:
            fio_cmd += ' --runtime=%s' % self.time
        if self.ramp is not None:
            fio_cmd += ' --ramp_time=%s' % self.ramp
        fio_cmd += ' --numjobs=%s' % self.numjobs
        fio_cmd += ' --direct=1'
        fio_cmd += ' --bs=%dB' % self.op_size
        fio_cmd += ' --iodepth=%d' % self.iodepth
        fio_cmd += ' --end_fsync=%s' % self.end_fsync
#        if self.vol_size:
#            fio_cmd += ' -- size=%dM' % self.vol_size
        fio_cmd += ' --write_iops_log=%s' % out_file
        fio_cmd += ' --write_bw_log=%s' % out_file
        fio_cmd += ' --write_lat_log=%s' % out_file
        if 'recovery_test' in self.cluster.config:
            fio_cmd += ' --time_based'
        if self.random_distribution is not None:
            fio_cmd += ' --random_distribution=%s' % self.random_distribution
        if self.log_avg_msec is not None:
            fio_cmd += ' --log_avg_msec=%s' % self.log_avg_msec
        fio_cmd += ' %s > %s' % (self.names, out_file)

        # Run the backfill testing thread if requested
        if 'recovery_test' in self.cluster.config:
            recovery_callback = self.recovery_callback
            self.cluster.create_recovery_test(self.run_dir, recovery_callback)

        print 'Running rbd fio %s test.' % self.mode
        common.pdsh(settings.getnodes('clients'), fio_cmd).communicate()


        # If we were doing recovery, wait until it's done.
        if 'recovery_test' in self.cluster.config:
            self.cluster.wait_recovery_done()

        monitoring.stop(self.run_dir)

        # Finally, get the historic ops
        self.cluster.dump_historic_ops(self.run_dir)
        common.sync_files('%s/*' % self.run_dir, self.out_dir)
Example #22
def start(directory):
    sc = settings.cluster
    nodes = common.get_nodes([sc.get("clients"), sc.get("servers"), sc.get("mons"), sc.get("rgws")])
    collectl_dir = "%s/collectl" % directory
    perf_dir = "%s/perf" % directory
    blktrace_dir = "%s/blktrace" % directory

    # collectl
    common.pdsh(nodes, "mkdir -p -m0755 -- %s;collectl -s+mYZ -i 1:10 -F0 -f %s" % (collectl_dir, collectl_dir))
Example #23
def make_movies(directory):
    sc = settings.cluster
    seekwatcher = "/home/%s/bin/seekwatcher" % sc.get("user")
    blktrace_dir = "%s/blktrace" % directory

    for device in xrange(0, sc.get("osds_per_node")):
        common.pdsh(
            sc.get("servers"),
            "cd %s;%s -t device%s -o device%s.mpg --movie" % (blktrace_dir, seekwatcher, device, device),
        ).communicate()
Example #24
 def run(self):
     print 'Setting OSD Read Ahead to: %s' % self.osd_ra
     self.cluster.set_osd_param('read_ahead_kb', self.osd_ra)
     print 'Cleaning existing temporary run directory: %s' % self.run_dir
     common.pdsh(settings.getnodes('clients', 'osds', 'mons', 'rgws'), 'sudo rm -rf %s' % self.run_dir)
     if self.valgrind is not None:
         print 'Adding valgrind to the command path.'
         self.cmd_path_full = common.setup_valgrind(self.valgrind, self.getclass(), self.run_dir)
     # Set the full command path
     self.cmd_path_full += self.cmd_path
Example #25
 def mkimages(self):
     monitoring.start("%s/pool_monitoring" % self.run_dir)
     self.cluster.rmpool(self.poolname, self.pool_profile)
     self.cluster.mkpool(self.poolname, self.pool_profile)
     common.pdsh(settings.getnodes('clients'), '/usr/bin/rbd create cbt-kernelrbdfio-`hostname -s` --size %s --pool %s' % (self.vol_size, self.poolname)).communicate()
     common.pdsh(settings.getnodes('clients'), 'sudo rbd map cbt-kernelrbdfio-`hostname -s` --pool %s --id admin' % self.poolname).communicate()
     common.pdsh(settings.getnodes('clients'), 'sudo mkfs.xfs /dev/rbd/cbt-kernelrbdfio/cbt-kernelrbdfio-`hostname -s`').communicate()
     common.pdsh(settings.getnodes('clients'), 'sudo mkdir -p -m0755 -- %s/cbt-kernelrbdfio-`hostname -s`' % self.cluster.mnt_dir).communicate()
     common.pdsh(settings.getnodes('clients'), 'sudo mount -t xfs -o noatime,inode64 /dev/rbd/cbt-kernelrbdfio/cbt-kernelrbdfio-`hostname -s` %s/cbt-kernelrbdfio-`hostname -s`' % self.cluster.mnt_dir).communicate()
     monitoring.stop()
Example #26
def start(directory):
    nodes = settings.getnodes('clients', 'osds', 'mons', 'rgws')
    collectl_dir = '%s/collectl' % directory
    # perf_dir = '%s/perf' % directory
    # blktrace_dir = '%s/blktrace' % directory

    # collectl
    rawdskfilt = 'cciss/c\d+d\d+ |hd[ab] | sd[a-z]+ |dm-\d+ |xvd[a-z] |fio[a-z]+ | vd[a-z]+ |emcpower[a-z]+ |psv\d+ |nvme[0-9]n[0-9]+p[0-9]+ '
    common.pdsh(nodes, 'mkdir -p -m0755 -- %s' % collectl_dir)
    common.pdsh(nodes, 'collectl -s+mYZ -i 1:10 --rawdskfilt "%s" -F0 -f %s' % (rawdskfilt, collectl_dir))
Example #27
    def start_rgw(self):
        user = settings.cluster.get('user')
        rgwhosts = settings.cluster.get('rgws')

        if not rgwhosts:
            return

        # If we are starting RGW, make the RGW pools
        self.make_rgw_pools()

        for rgwhost, gateways in rgwhosts.iteritems():
            for rgwname, rgwsettings in gateways.iteritems():
                host = rgwsettings.get('host', rgwhost)
                port = rgwsettings.get('port', None)
                ssl_certificate = rgwsettings.get('ssl_certificate', None)

                # Build the auth_url
                auth_url = "http://" if ssl_certificate is None else "https://"
                auth_url += host
                auth_url += ":7480" if port is None else ":%s" % port
                auth_url += "/auth/v1.0"
                self.auth_urls.append(auth_url)

                # set the rgw_frontends
                rgw_frontends = None
                if ssl_certificate is not None:
                    rgw_frontends = "civetweb ssl_certificate=%s" % ssl_certificate
                if port is not None:
                    if rgw_frontends is None:
                        rgw_frontends = "civetweb"
                    rgw_frontends += " port=%s" % port

                cmd = '%s -c %s -n %s --log-file=%s/rgw.log' % (self.ceph_rgw_cmd, self.tmp_conf, rgwname, self.log_dir)
                if rgw_frontends is not None:
                    cmd += " --rgw-frontends='%s'" % rgw_frontends
                if self.rgw_valgrind:
                    cmd = "%s %s" % (common.setup_valgrind(self.rgw_valgrind, 'rgw.%s' % host, self.tmp_dir), cmd)
                else:
                    cmd = '%s %s' % (self.ceph_run_cmd, cmd)

                if user:
                    pdshhost = '%s@%s' % (user, rgwhost)
                common.pdsh(pdshhost, 'sudo sh -c "ulimit -n 16384 && ulimit -c unlimited && exec %s"' % cmd).communicate()

                # set min_size of pools to 1, when there is only one osd
                num_osds = len(settings.cluster.get('osds'))
                rgw_default_pools = ['.rgw.root', 'default.rgw.control', 'default.rgw.meta', 'default.rgw.log']
                pool_min_repl_size = 1

                if num_osds == 1:
                    time.sleep(5)
                    for pool in rgw_default_pools:
                        common.pdsh(settings.getnodes('head'), 'sudo %s -c %s osd pool set %s min_size %d' % (self.ceph_cmd, self.tmp_conf, pool, pool_min_repl_size),
                        continue_if_error=False).communicate()
                        time.sleep(5)
Example #28
    def _run(self, mode, run_dir, out_dir):
        # We'll always drop caches for rados bench
        self.dropcaches()

        if self.concurrent_ops:
            concurrent_ops_str = '--concurrent-ios %s' % self.concurrent_ops
        #determine rados version
        rados_version_str, err = common.pdsh(settings.getnodes('head'), '/usr/bin/rados -v').communicate()
        m = re.findall("version (\d+)", rados_version_str)
        rados_version = int(m[0])

        if mode in ['write'] or rados_version < 9:
            op_size_str = '-b %s' % self.op_size
        else:
            op_size_str = ''


        common.make_remote_dir(run_dir)

        # dump the cluster config
        self.cluster.dump_config(run_dir)

        # Run the backfill testing thread if requested
        if 'recovery_test' in self.cluster.config:
            recovery_callback = self.recovery_callback
            self.cluster.create_recovery_test(run_dir, recovery_callback)

        # Run rados bench
        monitoring.start(run_dir)
        logger.info('Running radosbench %s test.' % mode)
        ps = []
        for i in xrange(self.concurrent_procs):
            out_file = '%s/output.%s' % (run_dir, i)
            objecter_log = '%s/objecter.%s.log' % (run_dir, i)
            # default behavior is to use a single storage pool 
            pool_name = self.pool
            run_name = '--run-name %s`hostname -s`-%s'%(self.object_set_id, i)
            if self.pool_per_proc: # support previous behavior of 1 storage pool per rados process
                pool_name = 'rados-bench-`hostname -s`-%s'%i
                run_name = ''
            rados_bench_cmd = '%s -c %s -p %s bench %s %s %s %s %s --no-cleanup 2> %s > %s' % \
                 (self.cmd_path_full, self.tmp_conf, pool_name, op_size_str, self.time, mode, concurrent_ops_str, run_name, objecter_log, out_file)
            p = common.pdsh(settings.getnodes('clients'), rados_bench_cmd)
            ps.append(p)
        for p in ps:
            p.wait()
        monitoring.stop(run_dir)

        # If we were doing recovery, wait until it's done.
        if 'recovery_test' in self.cluster.config:
            self.cluster.wait_recovery_done()

        # Finally, get the historic ops
        self.cluster.dump_historic_ops(run_dir)
        common.sync_files('%s/*' % run_dir, out_dir)
Example #29
    def run(self):
        super(RbdFio, self).run()

        # Set client readahead
        self.set_client_param('read_ahead_kb', self.client_ra)

        # We'll always drop caches before the test
        self.dropcaches()

        monitoring.start(self.run_dir)

        # Run the backfill testing thread if requested
        if 'recovery_test' in self.cluster.config:
            recovery_callback = self.recovery_callback
            self.cluster.create_recovery_test(self.run_dir, recovery_callback)

        time.sleep(5)
        out_file = '%s/output' % self.run_dir
        fio_cmd = 'sudo %s' % (self.cmd_path_full)
        fio_cmd += ' --rw=%s' % self.mode
        if (self.mode == 'readwrite' or self.mode == 'randrw'):
            fio_cmd += ' --rwmixread=%s --rwmixwrite=%s' % (self.rwmixread, self.rwmixwrite)
        fio_cmd += ' --ioengine=%s' % self.ioengine
        if self.time is not None:
            fio_cmd += ' --runtime=%s' % self.time
        if self.ramp is not None:
            fio_cmd += ' --ramp_time=%s' % self.ramp
        fio_cmd += ' --numjobs=%s' % self.numjobs
        fio_cmd += ' --direct=1'
        fio_cmd += ' --bs=%dB' % self.op_size
        fio_cmd += ' --iodepth=%d' % self.iodepth
        if self.vol_size:
            fio_cmd += ' --size=%dM' % (int(self.vol_size) * 0.9)
        fio_cmd += ' --write_iops_log=%s' % out_file
        fio_cmd += ' --write_bw_log=%s' % out_file
        fio_cmd += ' --write_lat_log=%s' % out_file
        if 'recovery_test' in self.cluster.config:
            fio_cmd += ' --time_based'
        if self.random_distribution is not None:
            fio_cmd += ' --random_distribution=%s' % self.random_distribution
        fio_cmd += ' %s > %s' % (self.names, out_file)
        if self.log_avg_msec is not None:
            fio_cmd += ' --log_avg_msec=%s' % self.log_avg_msec
        logger.info('Running rbd fio %s test.', self.mode)
        common.pdsh(settings.getnodes('clients'), fio_cmd).communicate()

        # If we were doing recovery, wait until it's done.
        if 'recovery_test' in self.cluster.config:
            self.cluster.wait_recovery_done()

        monitoring.stop(self.run_dir)

        # Finally, get the historic ops
        self.cluster.dump_historic_ops(self.run_dir)
        common.sync_files('%s/*' % self.run_dir, self.out_dir)
Example #30
def make_movies(directory):
    use_existing = settings.cluster.get('use_existing', True)
    if use_existing:
        return None
    sc = settings.cluster
    seekwatcher = '/home/%s/bin/seekwatcher' % sc.get('user')
    blktrace_dir = '%s/blktrace' % directory

    for device in range(sc.get('osds_per_node')):
        common.pdsh(settings.getnodes('osds'), 'cd %s;%s -t device%s -o device%s.mpg --movie' %
                    (blktrace_dir, seekwatcher, device, device)).communicate()
Example #31
    def check_backfill(self, check_list=None, logfile=None):
        # Wait for a defined amount of time in case ceph health is delayed
        time.sleep(self.health_wait)
        logline = ""
        if logfile:
            logline = "| tee -a %s" % logfile
        ret = 0

        # Match any of these things to continue checking backfill 
        check_list = ["backfill", "misplaced"]
        while True:
            stdout, stderr = common.pdsh(settings.getnodes('head'), '%s -c %s -s %s' % (self.ceph_cmd, self.tmp_conf, logline)).communicate()
            if check_list and not any(x in stdout for x in check_list):
                break
            else:
                ret = ret + 1
            for line in stdout.splitlines():
                if 'misplaced' in line:
                    logger.info("%s", line)
            time.sleep(1)
        return ret
Example #32
    def check_health(self, check_list=None, logfile=None):
        # Wait for a defined amount of time in case ceph health is delayed
        time.sleep(self.health_wait)
        logline = ""
        if logfile:
            logline = "| tee -a %s" % logfile
        ret = 0

        # Match any of these things to continue checking health
        check_list = ["degraded", "peering", "recovery_wait", "stuck", "inactive", "unclean", "recovery", "stale"]
        while True:
            stdout, stderr = common.pdsh(settings.getnodes('head'), '%s -c %s health %s' % (self.ceph_cmd, self.tmp_conf, logline)).communicate()
            if check_list and not any(x in stdout for x in check_list):
                break
            if "HEALTH_OK" in stdout:
                break
            else:
                ret = ret + 1
            logger.info("%s", stdout)
            time.sleep(1)
        return ret
Example #33
    def run(self):
        super(Cosbench, self).run()
        self.dropcaches()
        self.cluster.dump_config(self.run_dir)
        monitoring.start(self.run_dir)

        # Run cosbench test
        try:
            self._run()
        except KeyboardInterrupt:
            logger.warning("accept keyboard interrupt, cancel this run")
            conf = self.config
            stdout, stderr = common.pdsh("%s@%s" % (self.user, conf["controller"]),'sh %s/cli.sh cancel %s' % (conf["cosbench_dir"], self.runid)).communicate()
            logger.info("%s", stdout)

        self.check_workload_status()
        self.check_cosbench_res_dir()

        monitoring.stop(self.run_dir)
        self.cluster.dump_historic_ops(self.run_dir)
        common.sync_files('%s/*' % self.run_dir, self.out_dir)
Example #34
    def run(self):
        try:
            key_fn = '%s/keyring'%self.osddir
            ceph_conf = self.cl_obj.tmp_conf
            phost = sshtarget(settings.cluster.get('user'), self.host)
            common.pdsh(phost, 'sudo %s -c %s osd crush add osd.%d 1.0 host=%s rack=localrack root=default' % (self.cl_obj.ceph_cmd, ceph_conf, self.osdnum, self.host)).communicate()
            cmd='ulimit -n 16384 && ulimit -c unlimited && exec %s -c %s -i %d --mkfs --mkkey --osd-uuid %s' % (self.cl_obj.ceph_osd_cmd, ceph_conf, self.osdnum, self.osduuid)
            common.pdsh(phost, 'sudo sh -c "%s"' % cmd).communicate()
            common.pdsh(phost, 'sudo %s -c %s -i %s auth add osd.%d osd "allow *" mon "allow profile osd" mgr "allow"' % (self.cl_obj.ceph_cmd, ceph_conf, key_fn, self.osdnum)).communicate()

            # Start the OSD
            pidfile="%s/ceph-osd.%d.pid" % (self.cl_obj.pid_dir, self.osdnum)
            cmd = '%s -c %s -i %d --pid-file=%s' % (self.cl_obj.ceph_osd_cmd, ceph_conf, self.osdnum, pidfile)
            if self.cl_obj.osd_valgrind:
                cmd = common.setup_valgrind(self.cl_obj.osd_valgrind, 'osd.%d' % self.osdnum, self.cl_obj.tmp_dir) + ' ' + cmd
            else:
                cmd = '%s %s' % (self.cl_obj.ceph_run_cmd, cmd)
            stderr_file = "%s/osd.%d.stderr" % (self.cl_obj.tmp_dir, self.osdnum)
            common.pdsh(phost, 'sudo sh -c "ulimit -n 16384 && ulimit -c unlimited && exec %s 2> %s"' % (cmd, stderr_file)).communicate()
        except Exception as e:
            self.exc = e
        finally:
            self.response_time = time.time() - self.start_time
Example #35
    def osdin(self):
        # Wait until the cluster is healthy.
        ret = self.cluster.check_health("%s/recovery.log" % self.config.get('run_dir'))
        if self.inhealthtries < self.maxhealthtries and ret == 0:
            self.inhealthtries = self.inhealthtries + 1
            return # Cluster hasn't become unhealthy yet.

        if ret == 0:
            common.pdsh(settings.getnodes('head'), self.logcmd('Cluster never went unhealthy.')).communicate()
        else:
            common.pdsh(settings.getnodes('head'), self.logcmd('Cluster appears to have healed.')).communicate()

        post_time = self.config.get("post_time", 60)
        common.pdsh(settings.getnodes('head'), self.logcmd('Cluster is healthy, completion in %s seconds.' % post_time)).communicate()
        time.sleep(post_time)
        self.state = "done"
Example #36
    def markdown(self):
        for osdnum in self.config.get('osds'):
            lcmd = self.logcmd("Marking OSD %s down." % osdnum)
            common.pdsh(
                settings.getnodes('head'), 'ceph -c %s osd down %s;%s' %
                (self.cluster.tmp_conf, osdnum, lcmd)).communicate()
            lcmd = self.logcmd("Marking OSD %s out." % osdnum)
            common.pdsh(
                settings.getnodes('head'), 'ceph -c %s osd out %s;%s' %
                (self.cluster.tmp_conf, osdnum, lcmd)).communicate()
        common.pdsh(
            settings.getnodes('head'),
            self.logcmd(
                'Waiting for the cluster to break and heal')).communicate()

        self.state = 'osdout'
Example #37
 def stop(self, directory):
     if self.perf_runners:
         for runner in self.perf_runners:
             runner.kill()
     else:
         common.pdsh(self.nodes,
                     'sudo pkill -SIGINT -f perf\ ').communicate()
     if directory:
         sc = settings.cluster
         common.pdsh(
             self.nodes,
             'sudo chown {user}.{user} {dir}/perf/perf.data'.format(
                 user=self.user, dir=directory))
         common.pdsh(
             self.nodes,
             'sudo chown {user}.{user} {dir}/perf/perf_stat.*'.format(
                 user=self.user, dir=directory))
Example #38
    def _run(self, mode, run_dir, out_dir):
        # We'll always drop caches for rados bench
        self.dropcaches()

        if self.concurrent_ops:
            concurrent_ops_str = '--concurrent-ios %s' % self.concurrent_ops
        op_size_str = '-b %s' % self.op_size

        common.make_remote_dir(run_dir)

        # dump the cluster config
        self.cluster.dump_config(run_dir)

        # Run the backfill testing thread if requested
        if 'recovery_test' in self.cluster.config:
            recovery_callback = self.recovery_callback
            self.cluster.create_recovery_test(run_dir, recovery_callback)

        # Run rados bench
        monitoring.start(run_dir)
        print 'Running radosbench %s test.' % mode
        ps = []
        for i in xrange(self.concurrent_procs):
            out_file = '%s/output.%s' % (run_dir, i)
            objecter_log = '%s/objecter.%s.log' % (run_dir, i)
            p = common.pdsh(settings.getnodes('clients'), '%s -c %s -p rados-bench-`hostname -s`-%s %s bench %s %s %s --no-cleanup 2> %s > %s' % (self.cmd_path_full, self.tmp_conf, i, op_size_str, self.time, mode, concurrent_ops_str, objecter_log, out_file))
            ps.append(p)
        for p in ps:
            p.wait()
        monitoring.stop(run_dir)

        # If we were doing recovery, wait until it's done.
        if 'recovery_test' in self.cluster.config:
            self.cluster.wait_recovery_done()

        # Finally, get the historic ops
        self.cluster.dump_historic_ops(run_dir)
        common.sync_files('%s/*' % run_dir, out_dir)
Example #39
    def initialize(self):
        super(LibrbdFio, self).initialize()

        # Clean and Create the run directory
        common.clean_remote_dir(self.run_dir)
        common.make_remote_dir(self.run_dir)

        logger.info('Running scrub monitoring.')
        monitoring.start("%s/scrub_monitoring" % self.run_dir)
        self.cluster.check_scrub()
        monitoring.stop()

        logger.info('Pausing for 60s for idle monitoring.')
        monitoring.start("%s/idle_monitoring" % self.run_dir)
        time.sleep(60)
        monitoring.stop()

        common.sync_files('%s/*' % self.run_dir, self.out_dir)

        self.mkimages()

        # populate the fio files
        ps = []
        logger.info('Attempting to populate fio files...')
        if not self.use_existing_volumes:
            for volnum in xrange(self.volumes_per_client):
                rbd_name = 'cbt-librbdfio-`%s`-%d' % (common.get_fqdn_cmd(),
                                                      volnum)
                pre_cmd = 'sudo %s --ioengine=rbd --clientname=admin --pool=%s --rbdname=%s --invalidate=0  --rw=write --numjobs=%s --bs=4M --size %dM %s > /dev/null' % (
                    self.cmd_path, self.pool_name, rbd_name, self.numjobs,
                    self.vol_size, self.names)
                p = common.pdsh(settings.getnodes('clients'), pre_cmd)
                ps.append(p)
            for p in ps:
                p.wait()
        return True
Example #40
 def prerun_check(self):
     #1. check cosbench
     if not self.check_workload_status():
         sys.exit()
     #2. check rgw
     cosconf = {}
     for param in self.config["auth"]["config"].split(';'):
         try:
             key, value = param.split('=')
             cosconf[key] = value
         except:
             pass
     logger.debug("%s", cosconf)
     if "username" in cosconf and "password" in cosconf and "url" in cosconf:
         stdout, stderr = common.pdsh("%s@%s" % (self.user, self.config["controller"]),"curl -D - -H 'X-Auth-User: %s' -H 'X-Auth-Key: %s' %s" % (cosconf["username"], cosconf["password"], cosconf["url"])).communicate()
     else:
         logger.error("Auth Configuration in Yaml file is not in correct format")
         sys.exit()
     if re.search('(refused|error)', stderr):
         logger.error("Cosbench connect to Radosgw Connection Failed\n%s", stderr)
         sys.exit()
     if re.search("AccessDenied", stdout):
         logger.error("Cosbench connect to Radosgw Auth Failed\n%s", stdout)
         sys.exit()
Example #41
    def check_health(self, check_list=None, logfile=None):
        logline = ""
        if logfile:
            logline = "| tee -a %s" % logfile
        ret = 0

        # Match any of these things to continue checking health
        check_list = [
            "degraded", "peering", "recovery_wait", "stuck", "inactive",
            "unclean", "recovery", "stale"
        ]
        while True:
            stdout, stderr = common.pdsh(
                settings.getnodes('head'), 'ceph -c %s health %s' %
                (self.tmp_conf, logline)).communicate()
            if check_list and not set(check_list).intersection(stdout.split()):
                break
            if "HEALTH_OK" in stdout:
                break
            else:
                ret = ret + 1
            logger.info("%s", stdout)
            time.sleep(1)
        return ret
Example #42
    def run(self):
        super(LibrbdFio, self).run()

        # We'll always drop caches before the test
        self.dropcaches()

        # dump the cluster config
        self.cluster.dump_config(self.run_dir)

        monitoring.start(self.run_dir)

        time.sleep(5)

        # Run the backfill testing thread if requested
        if 'recovery_test' in self.cluster.config:
            recovery_callback = self.recovery_callback
            self.cluster.create_recovery_test(self.run_dir, recovery_callback)

        logger.info('Running rbd fio %s test.', self.mode)
        ps = []
        for i in range(self.volumes_per_client):
            fio_cmd = self.mkfiocmd(i)
            p = common.pdsh(settings.getnodes('clients'), fio_cmd)
            ps.append(p)
        for p in ps:
            p.wait()
        # If we were doing recovery, wait until it's done.
        if 'recovery_test' in self.cluster.config:
            self.cluster.wait_recovery_done()

        monitoring.stop(self.run_dir)

        # Finally, get the historic ops
        self.cluster.dump_historic_ops(self.run_dir)
        common.sync_files('%s/*' % self.run_dir, self.out_dir)
        self.analyze(self.out_dir)
Example #43
    def post(self):
        if self.stoprequest.isSet():
            common.pdsh(settings.getnodes('head'), self.logcmd('Cluster is healthy, but stoprequest is set, finishing now.')).communicate()
            self.haltrequest.set()
            return

        if self.config.get("repeat", False):
            # reset counters
            self.outhealthtries = 0
            self.inhealthtries = 0

            common.pdsh(settings.getnodes('head'), self.logcmd('Cluster is healthy, but repeat is set.  Moving to "markdown" state.')).communicate()
            self.state = "markdown"
            return

        post_time = self.config.get("post_time", 60)
        common.pdsh(settings.getnodes('head'), self.logcmd('Cluster is healthy, completion in %s seconds.' % post_time)).communicate()
        time.sleep(post_time)
        self.state = "done"
Example #44
 def map_rbd(self, node, rbd_name):
     common.pdsh(
         node,
         f'sudo targetcli /backstores/user:rbd create cfgstring={self.pool}/{rbd_name} name={rbd_name} size={self.endpoint_size}M',
         continue_if_error=False).communicate()
     stdout, stderr = common.pdsh(node,
                                  f'sudo targetcli /loopback create',
                                  continue_if_error=False).communicate()
     wwn = stdout.rstrip().rpartition(": ")[2].rpartition(" ")[2][:-1]
     common.pdsh(
         node,
         f'sudo targetcli /loopback/{wwn}/luns create /backstores/user:rbd/{rbd_name}',
         continue_if_error=False).communicate()
     stdout, stderr = common.pdsh(
         node,
         f'cat /sys/kernel/config/target/loopback/{wwn}/tpgt_1/address',
         continue_if_error=False).communicate()
     address = stdout.rstrip().rpartition(": ")[2]
     stdout, stderr = common.pdsh(
         node,
         f'ls /sys/class/scsi_disk/{address}:0/device/block',
         continue_if_error=False).communicate()
     return '/dev/%s' % stdout.rstrip().rpartition(": ")[2]
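The device path map_rbd returns is what mount_rbd (Example #46 below) formats and mounts. A minimal usage sketch, with hypothetical node and image names:

# Hypothetical values for illustration; see mount_rbd in Example #46 for the real flow.
rbd_device = self.map_rbd('client01', 'cbt-rbd-0')
common.pdsh('client01', 'sudo mkfs.xfs %s' % rbd_device,
            continue_if_error=False).communicate()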
Example #45
    def start_mds(self):
        user = settings.cluster.get('user')
        mdshosts = settings.cluster.get('mdss')

        if not mdshosts:
            return

        for mdshost, mds in mdshosts.items():
            for mdsname, mdssettings in mds.items():
                cmd = '%s -i %s' % (self.ceph_mds_cmd, mdsname)
                if self.mds_valgrind:
                    cmd = "%s %s" % (common.setup_valgrind(self.mds_valgrind, mdsname, self.tmp_dir), cmd)
                else:
                    cmd = "%s %s" % (self.ceph_run_cmd, cmd)
                if user:
                    pdshhost = '%s@%s' % (user, mdshost)
                data_dir = "%s/mds.%s" % (self.tmp_dir, mdsname)
                common.pdsh(pdshhost, 'sudo mkdir -p %s' % data_dir).communicate()
                common.pdsh(pdshhost, 'sudo %s auth get-or-create mds.%s mon \'allow profile mds\' osd \'allow rw tag cephfs *=*\' mds \'allow\' mgr \'allow profile mds\' -o %s/keyring' % (self.ceph_cmd, mdsname, data_dir)).communicate()
                common.pdsh(pdshhost, 'sudo sh -c "ulimit -n 16384 && ulimit -c unlimited && exec %s"' % cmd).communicate()
Example #46
    def mount_rbd(self):
        for ep_num in xrange(0, self.endpoints_per_client):
            dir_name = self.get_dir_name(ep_num) 
            for node in common.get_fqdn_list('clients'):
                rbd_name = self.get_rbd_name(node, ep_num)
                rbd_device = self.map_rbd(node, rbd_name)

                logger.info(rbd_device)

                # mkfs
                common.pdsh(node, 'sudo mkfs.xfs %s' % rbd_device, continue_if_error=False).communicate()

                # mkdir
                common.pdsh(node, 'sudo mkdir -p -m0755 -- %s' % dir_name, continue_if_error=False).communicate()

                # mount
                common.pdsh(node, 'sudo mount -t xfs %s %s' % (rbd_device, dir_name),
                            continue_if_error=False).communicate()
            self.endpoints.append(dir_name)
        self.endpoint_type = "directory"
        return self.get_endpoints()
Example #47
    def osdout(self):
        ret = self.cluster.check_health(
            self.health_checklist,
            "%s/recovery.log" % self.config.get('run_dir'))
        common.pdsh(settings.getnodes('head'),
                    self.logcmd("ret: %s" % ret)).communicate()

        if self.outhealthtries < self.maxhealthtries and ret == 0:
            self.outhealthtries = self.outhealthtries + 1
            return  # Cluster hasn't become unhealthy yet.

        if ret == 0:
            common.pdsh(
                settings.getnodes('head'),
                self.logcmd('Cluster never went unhealthy.')).communicate()
        else:
            common.pdsh(
                settings.getnodes('head'),
                self.logcmd('Cluster appears to have healed.')).communicate()

        lcmd = self.logcmd("Unsetting the ceph osd noup flag")
        common.pdsh(
            settings.getnodes('head'), '%s -c %s osd unset noup;%s' %
            (self.ceph_cmd, self.cluster.tmp_conf, lcmd)).communicate()
        for osdnum in self.config.get('osds'):
            lcmd = self.logcmd("Marking OSD %s up." % osdnum)
            common.pdsh(
                settings.getnodes('head'), '%s -c %s osd up %s;%s' %
                (self.ceph_cmd, self.cluster.tmp_conf, osdnum,
                 lcmd)).communicate()
            lcmd = self.logcmd("Marking OSD %s in." % osdnum)
            common.pdsh(
                settings.getnodes('head'), '%s -c %s osd in %s;%s' %
                (self.ceph_cmd, self.cluster.tmp_conf, osdnum,
                 lcmd)).communicate()

        self.state = "osdin"
Example #48
    def initialize(self):
        # safety check to make sure we don't blow away an existing cluster!
        if self.use_existing:
            raise RuntimeError(
                'initialize was called on an existing cluster! Refusing to touch anything.'
            )

        super(Ceph, self).initialize()

        # unmount any kernel rbd volumes
        self.rbd_unmount()

        # shutdown any old processes
        self.shutdown()

        # Cleanup old junk and create new junk
        self.cleanup()
        common.mkdir_p(self.tmp_dir)
        common.pdsh(
            settings.getnodes('head', 'clients', 'mons', 'osds', 'rgws',
                              'mds'),
            'mkdir -p -m0755 -- %s' % self.tmp_dir).communicate()
        common.pdsh(
            settings.getnodes('clients', 'mons', 'osds', 'rgws', 'mds'),
            'mkdir -p -m0755 -- %s' % self.pid_dir).communicate()
        common.pdsh(
            settings.getnodes('clients', 'mons', 'osds', 'rgws', 'mds'),
            'mkdir -p -m0755 -- %s' % self.log_dir).communicate()
        common.pdsh(
            settings.getnodes('clients', 'mons', 'osds', 'rgws', 'mds'),
            'mkdir -p -m0755 -- %s' % self.monitoring_dir).communicate()
        common.pdsh(
            settings.getnodes('clients', 'mons', 'osds', 'rgws', 'mds'),
            'mkdir -p -m0755 -- %s' % self.core_dir).communicate()
        self.distribute_conf()

        # Set the core directory
        common.pdsh(
            settings.getnodes('clients', 'mons', 'osds', 'rgws', 'mds'),
            'echo "%s/core.%%e.%%p.%%h.%%t" | sudo tee /proc/sys/kernel/core_pattern'
            % self.tmp_dir).communicate()

        # Create the filesystems
        self.setup_fs()

        # Build the cluster
        monitoring.start('%s/creation' % self.monitoring_dir)
        self.make_mons()
        self.make_osds()
        self.start_rgw()
        monitoring.stop()

        # Check Health
        monitoring.start('%s/initial_health_check' % self.monitoring_dir)
        self.check_health()
        monitoring.stop()

        # Wait for initial scrubbing to complete (This should only matter on pre-dumpling clusters)
        self.check_scrub()

        # Make the crush and erasure profiles
        self.make_profiles()

        # Perform Idle Monitoring
        if self.idle_duration > 0:
            monitoring.start("%s/idle_monitoring" % self.monitoring_dir)
            time.sleep(self.idle_duration)
            monitoring.stop()

        return True
Example #49
 def mkimage(self, name, size, pool, order):
     common.pdsh(
         settings.getnodes('head'),
         '%s create %s --size %s --pool %s --order %s' %
         (self.rbd_cmd, name, size, pool, order)).communicate()
Example #50
    def make_profiles(self):
        crush_profiles = self.config.get('crush_profiles', {})
        for name, profile in crush_profiles.items():
            common.pdsh(
                settings.getnodes('head'),
                '%s -c %s osd crush add-bucket %s-root root' %
                (self.ceph_cmd, self.tmp_conf, name)).communicate()
            common.pdsh(
                settings.getnodes('head'),
                '%s -c %s osd crush add-bucket %s-rack rack' %
                (self.ceph_cmd, self.tmp_conf, name)).communicate()
            common.pdsh(
                settings.getnodes('head'),
                '%s -c %s osd crush move %s-rack root=%s-root' %
                (self.ceph_cmd, self.tmp_conf, name, name)).communicate()
            # FIXME: We need to build a dict mapping OSDs to hosts and create a proper hierarchy!
            common.pdsh(
                settings.getnodes('head'),
                '%s -c %s osd crush add-bucket %s-host host' %
                (self.ceph_cmd, self.tmp_conf, name)).communicate()
            common.pdsh(
                settings.getnodes('head'),
                '%s -c %s osd crush move %s-host rack=%s-rack' %
                (self.ceph_cmd, self.tmp_conf, name, name)).communicate()

            osds = profile.get('osds', None)
            if not osds:
                raise Exception("No OSDs defined for crush profile, bailing!")
            for i in osds:
                common.pdsh(
                    settings.getnodes('head'),
                    '%s -c %s osd crush set %s 1.0 host=%s-host' %
                    (self.ceph_cmd, self.tmp_conf, i, name)).communicate()
            common.pdsh(
                settings.getnodes('head'),
                '%s -c %s osd crush rule create-simple %s %s-root osd' %
                (self.ceph_cmd, self.tmp_conf, name, name)).communicate()
            self.set_ruleset(name)

        erasure_profiles = self.config.get('erasure_profiles', {})
        for name, profile in erasure_profiles.items():
            k = profile.get('erasure_k', 6)
            m = profile.get('erasure_m', 2)
            common.pdsh(
                settings.getnodes('head'),
                '%s -c %s osd erasure-code-profile set %s ruleset-failure-domain=osd k=%s m=%s'
                % (self.ceph_cmd, self.tmp_conf, name, k, m)).communicate()
            self.set_ruleset(name)
Example #51
    def mkpool(self, name, profile_name, base_name=None):
        pool_profiles = self.config.get('pool_profiles', {'default': {}})
        profile = pool_profiles.get(profile_name, {})

        pg_size = profile.get('pg_size', 1024)
        pgp_size = profile.get('pgp_size', 1024)
        erasure_profile = profile.get('erasure_profile', '')
        replication = str(profile.get('replication', None))
        cache_profile = profile.get('cache_profile', None)
        crush_profile = profile.get('crush_profile', None)
        cache_mode = profile.get('cache_mode', None)
        hit_set_type = profile.get('hit_set_type', None)
        hit_set_count = profile.get('hit_set_count', None)
        hit_set_period = profile.get('hit_set_period', None)
        target_max_objects = profile.get('target_max_objects', None)
        target_max_bytes = profile.get('target_max_bytes', None)
        min_read_recency_for_promote = profile.get(
            'min_read_recency_for_promote', None)

        #        common.pdsh(settings.getnodes('head'), 'sudo ceph -c %s osd pool delete %s %s --yes-i-really-really-mean-it' % (self.tmp_conf, name, name)).communicate()
        common.pdsh(
            settings.getnodes('head'),
            'sudo %s -c %s osd pool create %s %d %d %s' %
            (self.ceph_cmd, self.tmp_conf, name, pg_size, pgp_size,
             erasure_profile)).communicate()

        if replication and replication == 'erasure':
            common.pdsh(
                settings.getnodes('head'),
                'sudo %s -c %s osd pool create %s %d %d erasure %s' %
                (self.ceph_cmd, self.tmp_conf, name, pg_size, pgp_size,
                 erasure_profile)).communicate()
        else:
            common.pdsh(
                settings.getnodes('head'),
                'sudo %s -c %s osd pool create %s %d %d' %
                (self.ceph_cmd, self.tmp_conf, name, pg_size,
                 pgp_size)).communicate()

        logger.info('Checking Health after pool creation.')
        self.check_health()

        if replication and replication.isdigit():
            common.pdsh(
                settings.getnodes('head'),
                'sudo %s -c %s osd pool set %s size %s' %
                (self.ceph_cmd, self.tmp_conf, name,
                 replication)).communicate()
            logger.info(
                'Checking Health after setting pool replication level.')
            self.check_health()

        if base_name and cache_mode:
            logger.info("Adding %s as cache tier for %s.", name, base_name)
            common.pdsh(
                settings.getnodes('head'), 'sudo %s -c %s osd tier add %s %s' %
                (self.ceph_cmd, self.tmp_conf, base_name, name)).communicate()
            common.pdsh(
                settings.getnodes('head'),
                'sudo %s -c %s osd tier cache-mode %s %s' %
                (self.ceph_cmd, self.tmp_conf, name,
                 cache_mode)).communicate()
            common.pdsh(
                settings.getnodes('head'),
                'sudo %s -c %s osd tier set-overlay %s %s' %
                (self.ceph_cmd, self.tmp_conf, base_name, name)).communicate()

        if crush_profile:
            ruleset = self.get_ruleset(crush_profile)
            common.pdsh(
                settings.getnodes('head'),
                'sudo %s -c %s osd pool set %s crush_ruleset %s' %
                (self.ceph_cmd, self.tmp_conf, name, ruleset)).communicate()
        if hit_set_type:
            common.pdsh(
                settings.getnodes('head'),
                'sudo %s -c %s osd pool set %s hit_set_type %s' %
                (self.ceph_cmd, self.tmp_conf, name,
                 hit_set_type)).communicate()
        if hit_set_count:
            common.pdsh(
                settings.getnodes('head'),
                'sudo %s -c %s osd pool set %s hit_set_count %s' %
                (self.ceph_cmd, self.tmp_conf, name,
                 hit_set_count)).communicate()
        if hit_set_period:
            common.pdsh(
                settings.getnodes('head'),
                'sudo %s -c %s osd pool set %s hit_set_period %s' %
                (self.ceph_cmd, self.tmp_conf, name,
                 hit_set_period)).communicate()
        if target_max_objects:
            common.pdsh(
                settings.getnodes('head'),
                'sudo %s -c %s osd pool set %s target_max_objects %s' %
                (self.ceph_cmd, self.tmp_conf, name,
                 target_max_objects)).communicate()
        if target_max_bytes:
            common.pdsh(
                settings.getnodes('head'),
                'sudo %s -c %s osd pool set %s target_max_bytes %s' %
                (self.ceph_cmd, self.tmp_conf, name,
                 target_max_bytes)).communicate()
        if min_read_recency_for_promote:
            common.pdsh(
                settings.getnodes('head'),
                'sudo %s -c %s osd pool set %s min_read_recency_for_promote %s'
                % (self.ceph_cmd, self.tmp_conf, name,
                   min_read_recency_for_promote)).communicate()
        logger.info('Final Pool Health Check.')
        self.check_health()

        # If there is a cache profile assigned, make a cache pool
        if cache_profile:
            cache_name = '%s-cache' % name
            self.mkpool(cache_name, cache_profile, name)
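For reference, a minimal sketch of the pool_profiles structure this method consumes; every key and value below is illustrative rather than a shipped default. mkpool('mypool', 'replicated') would create the base pool, set its replica count, and then recurse to build 'mypool-cache' from the 'cachetier' profile and attach it as a writeback tier:

    # Hypothetical pool_profiles entry for mkpool (illustrative values).
    pool_profiles = {
        'replicated': {
            'pg_size': 1024,
            'pgp_size': 1024,
            'replication': 2,
            'cache_profile': 'cachetier',
        },
        'cachetier': {
            'cache_mode': 'writeback',
            'hit_set_type': 'bloom',
            'hit_set_count': 8,
            'hit_set_period': 60,
        },
    }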
Example #52
0
 def dump_historic_ops(self, run_dir):
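     # Each OSD on a host exposes an admin socket under /var/run/ceph; the
     # find -exec loop queries them all, and the single redirect gathers
     # every daemon's slow-op dump into one file per host.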
     common.pdsh(
         settings.getnodes('osds'),
         'find /var/run/ceph/*.asok -maxdepth 1 -exec sudo %s --admin-daemon {} dump_historic_ops \; > %s/historic_ops.out'
         % (self.ceph_cmd, run_dir)).communicate()
Example #53
0
 def set_osd_param(self, param, value):
     common.pdsh(
         settings.getnodes('osds'),
         'find /dev/disk/by-partlabel/osd-device-*data -exec readlink {} \; | cut -d"/" -f 3 | sed "s/[0-9]$//" | xargs -I{} sudo sh -c "echo %s > /sys/block/\'{}\'/queue/%s"'
         % (value, param))
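The pipeline here resolves each osd-device-*data partition symlink to its kernel device name (e.g. sdb1), strips the trailing partition digit to get the parent disk, and writes the value into that disk's queue settings under /sys/block. Note that, unlike most helpers in this module, it never calls .communicate(), so it returns without waiting for the remote writes to complete. A hypothetical invocation (parameter and value chosen purely for illustration):

    # Illustrative: raise the block-layer read-ahead on every OSD data disk.
    self.set_osd_param('read_ahead_kb', 128)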
Example #54
0
    def make_mons(self):
        # Build and distribute the keyring
        common.pdsh(
            settings.getnodes('head'),
            'ceph-authtool --create-keyring --gen-key --name=mon. %s --cap mon \'allow *\''
            % self.keyring_fn).communicate()
        common.pdsh(
            settings.getnodes('head'),
            'ceph-authtool --gen-key --name=client.admin --set-uid=0 --cap mon \'allow *\' --cap osd \'allow *\' --cap mds allow %s'
            % self.keyring_fn).communicate()
        common.rscp(settings.getnodes('head'), self.keyring_fn,
                    '%s.tmp' % self.keyring_fn).communicate()
        common.pdcp(settings.getnodes('mons', 'osds', 'rgws', 'mds'), '',
                    '%s.tmp' % self.keyring_fn, self.keyring_fn).communicate()

        # Build the monmap, retrieve it, and distribute it
        cmd = 'monmaptool --create --clobber'
        monhosts = settings.cluster.get('mons')
        logger.info(monhosts)
        for monhost, mons in monhosts.iteritems():
            for mon, addr in mons.iteritems():
                cmd = cmd + ' --add %s %s' % (mon, addr)
        cmd = cmd + ' --print %s' % self.monmap_fn
        common.pdsh(settings.getnodes('head'), cmd).communicate()
        common.rscp(settings.getnodes('head'), self.monmap_fn,
                    '%s.tmp' % self.monmap_fn).communicate()
        common.pdcp(settings.getnodes('mons'), '', '%s.tmp' % self.monmap_fn,
                    self.monmap_fn).communicate()

        # Build the ceph-mons
        user = settings.cluster.get('user')
        for monhost, mons in monhosts.iteritems():
            if user:
                monhost = '%s@%s' % (user, monhost)
            for mon, addr in mons.iteritems():
                common.pdsh(monhost, 'sudo rm -rf %s/mon.%s' %
                            (self.tmp_dir, mon)).communicate()
                common.pdsh(monhost, 'mkdir -p %s/mon.%s' %
                            (self.tmp_dir, mon)).communicate()
                common.pdsh(
                    monhost,
                    'sudo sh -c "ulimit -c unlimited && exec %s --mkfs -c %s -i %s --monmap=%s --keyring=%s"'
                    % (self.ceph_mon_cmd, self.tmp_conf, mon, self.monmap_fn,
                       self.keyring_fn)).communicate()
                common.pdsh(
                    monhost, 'cp %s %s/mon.%s/keyring' %
                    (self.keyring_fn, self.tmp_dir, mon)).communicate()

        # Start the mons
        for monhost, mons in monhosts.iteritems():
            if user:
                monhost = '%s@%s' % (user, monhost)
            for mon, addr in mons.iteritems():
                pidfile = "%s/%s.pid" % (self.pid_dir, monhost)
                cmd = 'sudo sh -c "ulimit -n 16384 && ulimit -c unlimited && exec %s -c %s -i %s --keyring=%s --pid-file=%s"' % (
                    self.ceph_mon_cmd, self.tmp_conf, mon, self.keyring_fn,
                    pidfile)
                if self.mon_valgrind:
                    cmd = "%s %s" % (common.setup_valgrind(
                        self.mon_valgrind, 'mon.%s' % monhost,
                        self.tmp_dir), cmd)
                else:
                    cmd = '%s %s' % (self.ceph_run_cmd, cmd)
                common.pdsh(monhost, 'sudo %s' % cmd).communicate()
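The loops above assume that settings.cluster.get('mons') maps each monitor host to a dict of monitor IDs and addresses. A hypothetical two-host layout (names and addresses invented for illustration):

    # Hypothetical 'mons' cluster setting consumed by make_mons:
    monhosts = {
        'mon-a': {'a': '10.0.0.1:6789'},
        'mon-b': {'b': '10.0.0.2:6789'},
    }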
Example #55
0
 def dump_config(self, run_dir):
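     # Query one representative daemon (osd.0) over its admin socket for
     # the configuration actually in effect and archive it with the run.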
     common.pdsh(
         settings.getnodes('osds'),
         'sudo %s -c %s --admin-daemon /var/run/ceph/ceph-osd.0.asok config show > %s/ceph_settings.out'
         % (self.ceph_cmd, self.tmp_conf, run_dir)).communicate()
Example #56
0
    def setup_fs(self):
        sc = settings.cluster
        fs = sc.get('fs')
        mkfs_opts = sc.get('mkfs_opts', '')
        mount_opts = sc.get('mount_opts', '')

        if fs == '':
            settings.shutdown("No OSD filesystem specified.  Exiting.")

        mkfs_threads = []
        for device in xrange(0, sc.get('osds_per_node')):
            osds = settings.getnodes('osds')
            common.pdsh(
                osds, 'sudo umount /dev/disk/by-partlabel/osd-device-%s-data' %
                device).communicate()
            common.pdsh(
                osds, 'sudo rm -rf %s/osd-device-%s-data' %
                (self.mnt_dir, device)).communicate()
            common.pdsh(
                osds, 'sudo mkdir -p -m0755 -- %s/osd-device-%s-data' %
                (self.mnt_dir, device)).communicate()

            if fs == 'tmpfs':
                logger.info('using tmpfs osds, not creating a file system.')
            elif fs == 'zfs':
                logger.info('ruhoh, zfs detected.  No mkfs for you!')
                common.pdsh(osds, 'sudo zpool destroy osd-device-%s-data' %
                            device).communicate()
                common.pdsh(
                    osds,
                    'sudo zpool create -f -O xattr=sa -m legacy osd-device-%s-data /dev/disk/by-partlabel/osd-device-%s-data'
                    % (device, device)).communicate()
                common.pdsh(
                    osds,
                    'sudo zpool add osd-device-%s-data log /dev/disk/by-partlabel/osd-device-%s-zil'
                    % (device, device)).communicate()
                common.pdsh(
                    osds,
                    'sudo mount %s -t zfs osd-device-%s-data %s/osd-device-%s-data'
                    %
                    (mount_opts, device, self.mnt_dir, device)).communicate()
            else:
                # do mkfs and mount in 1 long command
                # alternative is to wait until make_osds to mount it
                mkfs_cmd = 'sudo sh -c "mkfs.%s %s /dev/disk/by-partlabel/osd-device-%s-data ; ' % (
                    fs, mkfs_opts, device)
                mkfs_cmd += 'mount %s -t %s /dev/disk/by-partlabel/osd-device-%s-data %s/osd-device-%s-data"' % (
                    mount_opts, fs, device, self.mnt_dir, device)
                mkfs_threads.append((device, common.pdsh(osds, mkfs_cmd)))
        for device, t in mkfs_threads:  # for tmpfs and zfs cases, thread list is empty
            logger.info('Awaiting mkfs and mount for device %d on all hosts' %
                        device)
            t.communicate()
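For orientation, a sketch of the cluster settings this method reads; the values are illustrative. tmpfs and zfs take the dedicated branches above, while any other filesystem goes through the generic mkfs-and-mount path:

    # Illustrative cluster settings consumed by setup_fs:
    cluster = {
        'fs': 'xfs',
        'mkfs_opts': '-f -i size=2048',
        'mount_opts': '-o noatime',
        'osds_per_node': 4,
    }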
Example #57
0
 def cleanup(self):
     nodes = settings.getnodes('clients', 'osds', 'mons', 'rgws', 'mds')
     logger.info('Deleting %s', self.tmp_dir)
     common.pdsh(nodes, 'sudo rm -rf %s' % self.tmp_dir).communicate()
Example #58
0
    def shutdown(self):
        nodes = settings.getnodes('clients', 'osds', 'mons', 'rgws', 'mds')

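        # killall matches against the kernel's 15-character process name,
        # hence the truncated valgrind tool names below (massif-amd64-linux
        # and memcheck-amd64-linux).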
        common.pdsh(nodes, 'sudo killall -9 massif-amd64-li').communicate()
        common.pdsh(nodes, 'sudo killall -9 memcheck-amd64-').communicate()
        common.pdsh(nodes, 'sudo killall -9 ceph-osd').communicate()
        common.pdsh(nodes, 'sudo killall -9 ceph-mon').communicate()
        common.pdsh(nodes, 'sudo killall -9 ceph-mds').communicate()
        common.pdsh(nodes, 'sudo killall -9 rados').communicate()
        common.pdsh(nodes, 'sudo killall -9 rest-bench').communicate()
        common.pdsh(nodes, 'sudo killall -9 radosgw').communicate()
        common.pdsh(nodes, 'sudo killall -9 radosgw-admin').communicate()
        common.pdsh(nodes, 'sudo /etc/init.d/apache2 stop').communicate()
        common.pdsh(nodes, 'sudo killall -9 pdsh').communicate()
        monitoring.stop()
Example #59
0
 def recovery_callback(self):
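     # Send SIGINT (-2) rather than SIGKILL so fio can stop cleanly and
     # still write out its results.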
     common.pdsh(settings.getnodes('clients'),
                 'sudo killall -2 fio').communicate()
Example #60
0
    def dropcaches(self):
        nodes = settings.getnodes('clients', 'servers')

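        # Flush dirty pages to disk first, then drop the page cache,
        # dentries, and inodes (mode 3) so the next run starts cold.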
        common.pdsh(nodes, 'sync').communicate()
        common.pdsh(
            nodes, 'echo 3 | sudo tee /proc/sys/vm/drop_caches').communicate()