Beispiel #1
0
    def validate(self):
        """Sanity-check the parameters before a workload is started.

        Raises:
            FsDriftException: if the top directory path is shorter than
                6 characters or is not an existing directory.

        Side effect: if no workload table path was supplied, an example
        table is written to ``example_workload_table.csv`` inside the top
        directory and ``workload_table_csv_path`` is pointed at it.
        """

        # a very short path is treated as a possible system directory
        if len(self.top_directory) < 6:
            raise FsDriftException(
                'top directory %s too short, may be system directory' %
                self.top_directory)

        if not os.path.isdir(self.top_directory):
            raise FsDriftException(
                'top directory %s does not exist, so please create it' %
                self.top_directory)

        # identity test: None is a singleton, '==' could be overridden
        if self.workload_table_csv_path is None:
            self.workload_table_csv_path = os.path.join(
                self.top_directory, 'example_workload_table.csv')
            # each record is "<operation name>, <relative weight>"
            workload_table = [
                'read, 2',
                'random_read, 1',
                'random_write, 1',
                'append, 4',
                'delete, 0.1',
                'hardlink, 0.01',
                'softlink, 0.02',
                'truncate, 0.05',
                'rename, 1',
                'readdir, 0.1',
                'create, 4']
            with open(self.workload_table_csv_path, 'w') as w_f:
                w_f.write('\n'.join(workload_table))
    def do_another_file(self):
        """Decide whether the calling thread should process another file.

        Returns True to continue, False to stop.  Raises FsDriftException
        if the abort flag is set.  May sleep for ``pause_sec`` seconds
        before returning True.
        """
        at_checkpoint = (self.params.stop_when_thrds_done and
                         self.filenum % self.files_between_checks == 0)
        if at_checkpoint and not self.test_ended():
            try:
                flag_size = os.stat(self.params.checkerflag_path).st_size
            except OSError as stat_err:
                # a missing checker-flag file counts as an empty one
                if stat_err.errno != errno.ENOENT:
                    raise stat_err
                flag_size = 0
            per_thread_len = len(self.thread_done_record())
            # NOTE(review): the quotient is computed and then discarded,
            # and this branch always stops the thread - confirm intent
            threads_done = flag_size / per_thread_len
            return False

        # if user doesn't want to finish all requests and test has ended, stop

        if not self.finish_all_rq and self.test_ended():
            return False

        out_of_time = self.elapsed_time >= self.duration
        if out_of_time:
            if not self.test_ended():
                self.end_test()
            return False

        if self.abort:
            raise FsDriftException(
                'thread ' + str(self.tid) + ' saw abort flag')

        delay = self.pause_sec
        if delay > 0.0:
            time.sleep(delay)
        return True
Beispiel #3
0
 def op_remount(self):
     """Unmount the filesystem under test (if mounted) and mount it again.

     The mountpoint is taken to be the last whitespace-separated word of
     ``params.mount_command``.

     Returns:
         OK on success, otherwise the non-zero status returned by
         os.system() for the failed umount or mount command.

     Raises:
         FsDriftException: if no mount command was configured, or if the
             top directory does not start with the mountpoint path.
     """
     c = self.ctrs
     if self.params.mount_command is None:
         raise FsDriftException(
             'you did not specify mount command for remount option')
     if self.verbosity & 0x40000:
         self.log.debug('remount: %s' % self.params.mount_command)
     mountpoint = self.params.mount_command.split()[-1]
     if not self.params.top_directory.startswith(mountpoint):
         # report the configured top directory; the previous code named
         # an undefined variable here and died with NameError instead
         raise FsDriftException(
             'mountpoint %s does not contain topdir %s' %
             (mountpoint, self.params.top_directory))
     with open('/proc/mounts', 'r') as mount_f:
         mounts = [line.split() for line in mount_f]
     # second field of each /proc/mounts record is compared to mountpoint
     is_mounted = any(len(m) > 1 and m[1] == mountpoint for m in mounts)
     if not is_mounted:
         c.e_not_mounted += 1
     else:
         # move out of the filesystem before attempting the unmount
         os.chdir('/tmp')
         rc = os.system('umount %s' % mountpoint)
         if rc != OK:
             c.e_could_not_unmount += 1
             return rc
     rc = os.system(self.params.mount_command)
     if rc != OK:
         c.e_could_not_mount += 1
         return rc
     c.have_remounted += 1
     return OK
Beispiel #4
0
 def get_file_size(self, fd):
     """Return the size in bytes of the file open on descriptor ``fd``.

     Raises FsDriftException if fstat reports a negative size.
     """
     size = os.fstat(fd).st_size
     if size >= 0:
         return size
     raise FsDriftException('negative file size %d seen on fd %d' %
                            (size, fd))
Beispiel #5
0
 def test_multiproc(self):
     """Run four workload instances as subprocesses and check their results.

     Writes the workload table to /tmp/weights.csv, starts the workers,
     waits up to 4 seconds for every thread-ready file, creates the
     starting-gun file, then verifies each worker's elapsed time, status
     and read counter.  Raises FsDriftException on any failed check.
     """
     self.log.info('starting test')
     self.resetDir()
     with open('/tmp/weights.csv', 'w') as weights_file:
         weights_file.write('\n'.join(Test.workload_table))
     thread_ready_timeout = 4
     thread_count = 4

     # one workload object per worker, each tagged with its thread id
     workers = []
     for idx in range(thread_count):
         worker = worker_thread.FsDriftWorkload(self.params)
         worker.tid = str(idx)
         workers.append(worker)
     procs = [subprocess(w) for w in workers]
     for proc in procs:
         proc.start()
     threads_ready = True
     self.log.info('threads started')

     # poll once per second until every worker has posted its ready file
     for _ in range(thread_ready_timeout):
         ready_flags = [
             os.path.exists(w.gen_thread_ready_fname(w.tid))
             for w in workers]
         threads_ready = all(ready_flags)
         if threads_ready:
             break
         time.sleep(1)
     if not threads_ready:
         raise FsDriftException('threads did not show up within %d seconds'
                         % thread_ready_timeout)
     time.sleep(1)
     self.log.info('threads awaiting starting gun')
     common.touch(self.params.starting_gun_path)

     for proc in procs:
         proc.retrieve()
         proc.join()
         self.log.info('thread ' + proc.invoke.tid + ' finished')

     for proc in procs:
         result = proc.invoke
         if result.elapsed_time == -1.0:
             raise FsDriftException('subprocess never got elapsed time')
         if result.status != OK:
             raise FsDriftException('subprocess status %d for thread %s' % (result.status, result.tid))
         print('thread %s counters:\n%s' % (result.tid, str(result.ctrs)))
         if result.ctrs.read_bytes == 0:
             raise FsDriftException('subprocess never read any data')
 def wait_for_gate(self):
     """Announce readiness, then block until the starting-gun file exists.

     Only waits on the starting gun when ``params.starting_gun_path`` is
     set.  Raises FsDriftException if the abort file appears while
     waiting.  Always sleeps 2 more seconds before returning.
     """
     if self.params.starting_gun_path:
         # tell the test driver that this thread reached the gate
         touch(self.gen_thread_ready_fname(self.tid))
         while True:
             if os.path.exists(self.params.starting_gun_path):
                 break
             if os.path.exists(self.params.abort_path):
                 raise FsDriftException(
                     'thread ' + str(self.tid) + ' saw abort flag')
             time.sleep(0.3)
     # linger so that other clients have time to see the start file too;
     # it takes at least 1 second for NFS to invalidate cached metadata
     # with actimeo=1
     time.sleep(2)
Beispiel #7
0
def run_workload():
    """Parse the command line and run the workload on one or many hosts.

    Exits the process with status NOTOK if option parsing or validation
    raises FsDriftException.

    Returns:
        the result of run_multi_host_workload() when a host set was given
        and this process is not a slave, otherwise the result of
        multi_thread_workload.run_multi_thread_workload().

    Raises:
        FsDriftException: if creating the top-level directories fails
            with any OSError other than EEXIST.
    """

    log = fsd_log.start_log('fs-drift')

    # if a --host-set parameter was passed,
    # it's a multi-host workload
    # each remote instance will wait
    # until all instances have reached starting gate

    try:
        params = opts.parseopts()
        params.validate()
    except FsDriftException as e:
        log.error(str(e))
        log.info('use --help option to get CLI syntax')
        sys.exit(NOTOK)

    print(params)

    if os.getenv('DEBUG'):
        # Logger objects expose setLevel(); there is no logLevel() method
        log.setLevel(logging.DEBUG)

    try:
        sync_files.create_top_dirs(params)
    except OSError as e:
        # directories that already exist are fine
        if e.errno != errno.EEXIST:
            raise FsDriftException(
                'you must create the top-level directory %s' %
                params.top_directory)

    # put parameters where all threads can see them

    write_pickle(params.param_pickle_path, params)

    if params.host_set != [] and not params.is_slave:
        return run_multi_host_workload(params, log)
    return multi_thread_workload.run_multi_thread_workload(params)
Beispiel #8
0
def parse_weights(opts):
    """Read the workload table CSV and return a dict of opcode -> weight.

    Each useful record has the form ``opname, relative_weight``.  Lines
    with fewer than two comma-separated fields, with an empty operation
    name, or whose operation name starts with '#' are skipped.

    Args:
        opts: object whose ``workload_table_csv_path`` names the CSV file.

    Returns:
        dict mapping FSOPCtx opcodes to non-negative float weights.

    Raises:
        FsDriftException: if no table path is set, the file cannot be
            read, an operation name is unknown, a weight is not a number
            or is negative, or the table contains no usable records.
    """
    table_path = opts.workload_table_csv_path
    if table_path is None:
        raise FsDriftException('user must provide workload table')

    try:
        # 'with' guarantees the file is closed even if reading fails
        with open(table_path, 'r') as f:
            lines = f.readlines()
    except IOError as e:
        # no record has been parsed yet when the read itself fails
        raise FsDriftException(
            'could not parse %s at line %d : %s' %
            (table_path, 0, str(e)))

    weights = {}
    for line in lines:
        record = line.strip().split(',')
        if len(record) < 2:
            continue  # skip blank or partial lines
        opname = record[0].strip()
        relweight = record[1].strip()
        if opname.startswith('#') or opname == '':
            continue
        try:
            opcode = FSOPCtx.opname_to_opcode[opname]
        except KeyError:
            raise FsDriftException('%s: unrecognized opname' % opname)
        try:
            weight = float(relweight)
        except ValueError:
            # include the offending text; the placeholder used to be
            # emitted literally because no argument was supplied
            raise FsDriftException(
                '%s: relative frequency must be a floating-point number'
                % relweight)
        if weight < 0.0:
            raise FsDriftException(
                '%s: negative weights not allowed' % table_path)
        weights[opcode] = weight

    if len(weights) == 0:
        raise FsDriftException('workload table must not be empty')
    return weights
Beispiel #9
0
def run_multi_host_workload(prm, log):
    """Drive a test across every host in prm.host_set and gather results.

    Steps, in order:
      1. build one remote command per host and wrap it in a launcher
         thread (prm.launch_as_daemon) or an ssh thread
      2. start the threads and poll for each host's ready file
      3. create the starting-gun file so all hosts begin together
      4. join the threads, read each host's result pickle and pass the
         combined list to output_results.output_results()

    Args:
        prm: test parameters (host_set, abort_path, starting_gun_path,
            network_shared_path, ... are read here).
        log: logger used for progress and error messages.

    Returns:
        OK on success.  NOTOK if the hosts did not all reach the starting
        gate and no exception was seen, or if result aggregation hit an
        IOError, KeyboardInterrupt or FsDriftException.

    Raises:
        Exception: if the running python version is not recognized.
        Any exception caught while waiting for hosts is re-raised after
        the test has been aborted; an IOError while writing the
        starting-gun file is re-raised as well.
    """

    # construct list of ssh threads to invoke in parallel

    # NOTE(review): python_prog chosen here is only logged; the remote
    # command below is built from prm.python_prog - confirm which is meant
    if os.getenv('PYPY'):
        python_prog = os.getenv('PYPY')
    elif sys.version.startswith('2'):
        python_prog = 'python'
    elif sys.version.startswith('3'):
        python_prog = 'python3'
    else:
        raise Exception('unrecognized python version %s' % sys.version)

    log.debug('python_prog = %s' % python_prog)

    remote_thread_list = []
    host_ct = len(prm.host_set)
    for j in range(0, len(prm.host_set)):
        remote_host = prm.host_set[j]
        fsd_remote_pgm = os.path.join(prm.fsd_remote_dir, 'fs-drift-remote.py')
        this_remote_cmd = '%s %s --network-sync-dir %s ' \
            % (prm.python_prog, fsd_remote_pgm, prm.network_shared_path)

        this_remote_cmd += ' --as-host %s' % remote_host
        log.debug(this_remote_cmd)
        if prm.launch_as_daemon:
            remote_thread_list.append(
                launcher_thread.launcher_thread(prm, log, remote_host,
                                                this_remote_cmd))
        else:
            remote_thread_list.append(
                ssh_thread.ssh_thread(log, remote_host, this_remote_cmd))

    # start them, pacing starts so that we don't get ssh errors

    for t in remote_thread_list:
        if prm.launch_as_daemon:
            time.sleep(0.1)
        t.start()

    # wait for hosts to arrive at starting gate
    # if only one host, then no wait will occur
    # as starting gate file is already present
    # every sec_delta seconds we resume scan from last host file not found

    exception_seen = None
    abortfn = prm.abort_path
    sec_delta = 0.5
    # timeout if no host replies in next host_timeout seconds
    per_host_timeout = 10.0
    all_host_timeout = 5.0 + len(prm.host_set) / 3
    # keep the per-host limit below the overall limit
    if all_host_timeout < per_host_timeout:
        per_host_timeout = all_host_timeout / 2

    hosts_ready = False  # set scope outside while loop
    last_host_seen = -1
    sec = 0.0
    start_loop_start = time.time()
    try:
        while sec < per_host_timeout:
            # HACK to force directory entry coherency for Gluster
            #ndirlist = os.listdir(prm.network_shared_path)
            #log.debug('shared dir list: ' + str(ndirlist))
            hosts_ready = True
            if os.path.exists(abortfn):
                raise FsDriftException('worker host signaled abort')
            # hosts before last_host_seen were already found, skip them
            for j in range(last_host_seen + 1, len(prm.host_set)):
                h = prm.host_set[j]
                fn = multi_thread_workload.gen_host_ready_fname(prm, h.strip())
                log.debug('checking for host filename ' + fn)
                if not os.path.exists(fn):
                    log.info('did not see host filename %s after %f sec' %
                             (fn, sec))
                    hosts_ready = False
                    break
                log.debug('saw host filename ' + fn)
                last_host_seen = j  # saw this host's ready file
                # we exit while loop only if no hosts in per_host_timeout seconds
                sec = 0.0
            if hosts_ready:
                break

            # if one of ssh threads has died, no reason to continue

            kill_remaining_threads = False
            for t in remote_thread_list:
                if not t.is_alive():
                    log.error('thread %s has died' % t)
                    kill_remaining_threads = True
                    break
            if kill_remaining_threads:
                break

            # be patient for large tests
            # give user some feedback about
            # how many hosts have arrived at the starting gate

            time.sleep(sec_delta)
            sec += sec_delta
            time_since_loop_start = time.time() - start_loop_start
            log.debug('last_host_seen=%d sec=%d' % (last_host_seen, sec))
            # overall limit, independent of the per-host counter above
            if time_since_loop_start > all_host_timeout:
                kill_remaining_threads = True
                break
    except KeyboardInterrupt as e:
        log.error('saw SIGINT signal, aborting test')
        exception_seen = e
    except Exception as e:
        exception_seen = e
        log.exception(e)
        hosts_ready = False
    if not hosts_ready:
        # stop whatever was started before reporting the failure
        multi_thread_workload.abort_test(prm.abort_path, remote_thread_list)
        if not exception_seen:
            log.info(
                'no additional hosts reached starting gate within %5.1f seconds'
                % per_host_timeout)
            return NOTOK
        else:
            raise exception_seen
    else:

        # ask all hosts to start the test
        # this is like firing the gun at the track meet

        try:
            sync_files.write_sync_file(prm.starting_gun_path, 'hi')
            log.debug('starting all threads by creating starting gun file %s' %
                      prm.starting_gun_path)
        except IOError as e:
            log.error('error writing starting gun file: %s' %
                      os.strerror(e.errno))
            multi_thread_workload.abort_test(prm.abort_path,
                                             remote_thread_list)
            raise e

    # wait for them to finish

    for t in remote_thread_list:
        t.join()
        # a failed host is logged but does not stop result collection
        if t.status != OK:
            log.error('ssh thread for host %s completed with status %d' %
                      (t.remote_host, t.status))

    # attempt to aggregate results by reading pickle files
    # containing the per-thread workload instances
    # with counters and times that we need

    try:
        invoke_list = []
        one_shot_delay = True
        for h in prm.host_set:  # for each host in test

            # read results for each thread run in that host
            # from python pickle of the list of workload objects

            pickle_fn = multi_thread_workload.host_result_filename(prm, h)
            log.debug('reading pickle file: %s' % pickle_fn)
            host_invoke_list = []
            try:
                if one_shot_delay and not os.path.exists(pickle_fn):

                    # all threads have joined already, they are done
                    # we allow > 1 sec
                    # for this (NFS) client to see other clients' files

                    time.sleep(1.2)
                    one_shot_delay = False
                host_invoke_list = read_pickle(pickle_fn)
                log.debug(' read %d invoke objects' % len(host_invoke_list))
                invoke_list.extend(host_invoke_list)
                ensure_deleted(pickle_fn)
            except IOError as e:
                # a missing result file is logged and skipped
                if e.errno != errno.ENOENT:
                    raise e
                log.error('  pickle file %s not found' % pickle_fn)

        output_results.output_results(prm, invoke_list)
    except IOError as e:
        log.exception(e)
        log.error('host %s filename %s: %s' % (h, pickle_fn, str(e)))
        return NOTOK
    except KeyboardInterrupt as e:
        log.error('control-C signal seen (SIGINT)')
        return NOTOK
    except FsDriftException as e:
        log.exception(e)
        return NOTOK
    return (OK)
Beispiel #10
0
    def gen_random_fn(self, is_create=False):
        """Choose a file index at random and return that file's pathname.

        The index is drawn according to ``params.random_distribution``
        (uniform or gaussian).  In the gaussian case the distribution
        center moves on every call, and simulated time, center and
        velocity are periodically saved to ``simtime_pathname``.

        Args:
            is_create: when True, the gaussian center is moved further
                ahead by create_stddevs_ahead standard deviations.

        Raises:
            FsDriftException: for an unknown distribution type or an
                unrecognized version in the saved simulated-time file.
        """
        distribution = self.params.random_distribution
        if distribution == FileAccessDistr.uniform:
            # lower limit 0 means at least 1 file/dir
            index = random.randint(0, self.params.max_files)
        elif distribution == FileAccessDistr.gaussian:

            # if simulated time is not defined,
            # attempt to read it in from a file, set to zero if no file

            if self.simulated_time == FSOPCtx.SIMULATED_TIME_UNDEFINED:
                try:
                    with open(self.simtime_pathname, 'r') as saved_state:
                        version = int(read_num_from_file(saved_state))
                        if version != 1:
                            raise FsDriftException(
                                'unrecognized version %d in simtime file' %
                                version)
                        self.simulated_time = int(
                            read_num_from_file(saved_state))
                        self.center = float(read_num_from_file(saved_state))
                        self.velocity = float(
                            read_num_from_file(saved_state))
                except IOError as read_err:
                    if read_err.errno != errno.ENOENT:
                        raise read_err
                    self.simulated_time = 0
                self.center += self.simulated_time * self.velocity
                self.log.info('resuming with simulated time %d' %
                              self.simulated_time)

            # for creates, use greater time, so that reads, etc. will "follow" creates most of the time
            # mean and std deviation define gaussian distribution

            self.center += self.velocity
            if is_create:
                lead = (self.params.create_stddevs_ahead *
                        self.params.gaussian_stddev)
                self.center += lead
            if self.verbosity & 0x20:
                self.log.debug('%f = center' % self.center)
            index_float = numpy.random.normal(
                loc=self.center, scale=self.params.gaussian_stddev)
            self.log.debug('index_float = %f' % index_float)
            file_opstr = 'create' if is_create else 'read'
            if self.verbosity & 0x20:
                self.log.debug('%s gaussian value is %f' %
                               (file_opstr, index_float))
            index = int(index_float) % self.params.max_files

            # since this is a time-varying distribution, record the time every so often
            # so we can pick up where we left off

            if self.params.drift_time == -1:
                self.simulated_time += 1
            if self.simulated_time % self.time_save_rate == 0:
                state_dir = os.path.dirname(self.simtime_pathname)
                if not os.path.exists(state_dir):
                    os.makedirs(state_dir)
                with open(self.simtime_pathname, 'w') as state_out:
                    state_out.write('1\n')  # version
                    state_out.write('%10d\n' % self.simulated_time)
                    state_out.write('%f\n' % self.center)
                    state_out.write('%f\n' % self.velocity)
        else:
            raise FsDriftException('invalid distribution type %d' %
                                   distribution)

        if self.verbosity & 0x20:
            self.log.debug('next file index %u out of %u' %
                           (index, self.max_files_per_dir))
        pathname = os.path.join(self.gen_random_dirname(index),
                                'f%09d' % index)
        if self.verbosity & 0x20:
            self.log.debug('next pathname %s' % pathname)
        return pathname
Beispiel #11
0
            else:
                return self.scallerr('readdir', dirpath, e)
        return OK


# unit test

if __name__ == "__main__":
    # Smoke test: recreate the top directory (which must be under /tmp/)
    # and check that a handful of operations report success there.
    import logging
    import opts
    import fsd_log
    options = opts.parseopts()
    log = fsd_log.start_log('fsop-unittest')
    log.info('hi there')
    top_dir = options.top_directory
    # the directory is removed recursively below, so insist on /tmp/
    if '/tmp/' not in top_dir:
        raise FsDriftException('bad top directory')
    os.system('rm -rf %s' % top_dir)
    os.makedirs(top_dir)
    os.chdir(top_dir)
    log.info('chdir to %s' % top_dir)
    ctrs = FSOPCounters()
    ctx = FSOPCtx(options, log, ctrs, 'test-host', 'test-tid')
    ctx.verbosity = -1
    for operation in (ctx.op_create, ctx.op_read,
                      ctx.op_random_read, ctx.op_append):
        rc = operation()
        assert (rc == OK)
 def chk_status(self):
     """Raise FsDriftException naming the log file unless status is OK."""
     if self.status == OK:
         return
     raise FsDriftException(
         'test failed, check log file %s' % self.log_fn())
Beispiel #13
0
 def test_a_mkThrd(self):
     """Run 'sleep 1' on localhost via an ssh thread and check its status."""
     worker = ssh_thread(log, 'localhost', 'sleep 1')
     worker.start()
     worker.join()
     final_status = worker.status
     if final_status != OK:
         raise FsDriftException('return status %d' % final_status)
Beispiel #14
0
def run_multi_thread_workload(prm):
    """Run all workload threads on this host and report their results.

    Steps, in order:
      1. create and start the worker threads
      2. wait until every thread has posted its thread-ready file
      3. for a multi-host test, post this host's ready file and wait for
         the starting-gun file; otherwise create the starting-gun file
      4. join the threads, then either print results (single host) or
         save them as a pickle for the test driver (multi-host)

    Args:
        prm: test parameters (as_host, host_set, abort_path,
            starting_gun_path, ... are read here).

    Returns:
        OK on success, NOTOK if printing results raised FsDriftException.

    Raises:
        FsDriftException: if not all threads reached the starting gate.
        Exception: if the starting-gun file did not appear in time.
    """

    host = prm.as_host
    if host is None:
        host = 'localhost'
    prm_slave = (prm.host_set != [])
    # FIXME: get coherent logging level interface
    # the comparison belongs outside the call: getenv() needs the
    # variable name, not the result of comparing that name to None
    verbose = os.getenv('LOGLEVEL_DEBUG') is not None
    # floor division keeps this an int, it is used as a range() bound
    host_startup_timeout = 5 + len(prm.host_set) // 3

    # for each thread set up a workload instance,
    # create a thread instance, and delete the thread-ready file

    thread_list = create_worker_list(prm)
    my_host_invoke = thread_list[0].invoke
    my_log = fsd_log.start_log('%s.master' % host)
    my_log.debug(prm)

    # start threads, wait for them to reach starting gate
    # to do this, look for thread-ready files

    for t in thread_list:
        ensure_deleted(t.invoke.gen_thread_ready_fname(t.invoke.tid))
    for t in thread_list:
        t.start()
    my_log.debug('started %d worker threads on host %s' %
                 (len(thread_list), host))

    # wait for all threads to reach the starting gate
    # this makes it more likely that they will start simultaneously

    abort_fname = prm.abort_path
    thread_count = len(thread_list)
    thread_to_wait_for = 0
    startup_timeout = 3
    sec = 0.0
    while sec < startup_timeout:
        # resume the scan at the first thread not yet seen
        for k in range(thread_to_wait_for, thread_count):
            t = thread_list[k]
            fn = t.invoke.gen_thread_ready_fname(t.invoke.tid)
            if not os.path.exists(fn):
                my_log.debug('thread %d thread-ready file %s not found yet with %f sec left' %
                             (k, fn, (startup_timeout - sec)))
                break
            thread_to_wait_for = k + 1
            # we only timeout if no more threads have reached starting gate
            # in startup_timeout sec
            sec = 0.0
        if thread_to_wait_for == thread_count:
            break
        if os.path.exists(abort_fname):
            break
        sec += 0.5
        time.sleep(0.5)

    # if all threads didn't make it to the starting gate

    if thread_to_wait_for < thread_count:
        abort_test(abort_fname, thread_list)
        raise FsDriftException('only %d threads reached starting gate'
                               % thread_to_wait_for)

    # declare that this host is at the starting gate

    if prm_slave:
        host_ready_fn = gen_host_ready_fname(prm, prm.as_host)
        my_log.debug('host %s creating ready file %s' %
                     (my_host_invoke.onhost, host_ready_fn))
        common.touch(host_ready_fn)

    sg = prm.starting_gun_path
    if not prm_slave:
        my_log.debug('wrote starting gate file ')
        sync_files.write_sync_file(sg, 'hi there')

    # wait for starting_gate file to be created by test driver

    if prm_slave:
        my_log.debug('awaiting ' + sg)
        for sec in range(0, host_startup_timeout + 3):
            # hack to ensure that directory is up to date
            #   ndlist = os.listdir(my_host_invoke.network_dir)
            # if verbose: print(str(ndlist))
            if os.path.exists(sg):
                break
            if os.path.exists(prm.abort_path):
                # use this function's logger; no 'log' name exists here
                my_log.info('saw abort file %s, aborting test' %
                            prm.abort_path)
                break
            time.sleep(1)
        if not os.path.exists(sg):
            abort_test(prm.abort_path, thread_list)
            raise Exception('starting signal not seen within %d seconds'
                            % host_startup_timeout)
    if verbose:
        print('starting test on host ' + host + ' in 2 seconds')
    time.sleep(2 + random.random())  # let other hosts see starting gate file

    # FIXME: don't timeout the test,
    # instead check thread progress and abort if you see any of them stalled
    # but if servers are heavily loaded you can't rely on filesystem

    # wait for all threads on this host to finish

    for t in thread_list:
        my_log.debug('waiting for thread %s' % t.invoke.tid)
        t.retrieve()
        t.join()

    # if not a slave of some other host, print results (for this host)

    if not prm_slave:
        try:
            worker_list = [t.invoke for t in thread_list]
            output_results.output_results(prm, worker_list)
        except FsDriftException as e:
            print('ERROR: ' + str(e))
            return NOTOK
    else:

        # if we are participating in a multi-host test
        # then write out this host's result in pickle format
        # so test driver can pick up result

        result_filename = host_result_filename(prm, prm.as_host)
        my_log.debug('saving result to filename %s' % result_filename)
        worker_list = [t.invoke for t in thread_list]
        sync_files.write_pickle(result_filename, worker_list)
        time.sleep(1.2)  # for benefit of NFS with actimeo=1

    return OK