Example #1
0
def create_worker_list(prm):
    """Build the list of per-thread worker launchers for this host.

    For each of the ``prm.threads`` workers this creates a fresh
    ``worker_thread.FsDriftWorkload`` from ``prm``, tags it with a
    two-digit thread id ('00', '01', ...), wraps it in an
    ``invoke_process.subprocess`` and removes any leftover thread-ready
    file for that thread id.

    Returns the list of subprocess wrappers, in thread-id order.
    """
    workers = []
    for thread_index in range(prm.threads):
        workload = worker_thread.FsDriftWorkload(prm)
        workload.tid = '%02d' % thread_index
        workers.append(invoke_process.subprocess(workload))
        # a stale ready file from an earlier run must not be mistaken
        # for this thread having reached the starting gate
        ready_fname = workload.gen_thread_ready_fname(workload.tid)
        ensure_deleted(ready_fname)
    return workers
Example #2
0
 def run(self):
     """Write the launch file, then wait for the result pickle file.

     Removes any existing ``self.launch_fn`` and ``self.pickle_fn``,
     writes ``self.remote_cmd`` into the launch file, and then polls
     every 2 seconds until ``self.pickle_fn`` exists.
     (Presumably a daemon on the remote host watches for the launch
     file and runs the command - confirm against the daemon code.)

     On return ``self.status`` is OK if the pickle file appeared, and
     NOTOK if the abort file ``self.params.abort_path`` was seen first
     or if this method exited early because of an exception.
     """
     # premature exit means failure: record NOTOK before any I/O that
     # could raise, so the status is never left stale
     self.status = NOTOK
     ensure_deleted(self.launch_fn)
     ensure_deleted(self.pickle_fn)
     # the with-statement closes the file, no explicit close() needed
     with open(self.launch_fn, 'w') as launch_file:
         launch_file.write(self.remote_cmd)
     self.log.debug('waiting for pickle file %s' % self.pickle_fn)
     while not os.path.exists(self.pickle_fn):
         if os.path.exists(self.params.abort_path):
             self.log.info('test abort seen by host ' + self.remote_host)
             return
         time.sleep(2)
     self.status = OK  # success!
Example #3
0
def run_multi_host_workload(prm, log):
    """Run the workload on every host in ``prm.host_set`` and gather results.

    Steps:
      1. build one launcher thread per host (``launcher_thread`` when
         ``prm.launch_as_daemon`` is set, otherwise ``ssh_thread``) and
         start them all;
      2. wait for every host's ready file to appear (the "starting gate"),
         giving up if no new host shows up within the per-host timeout,
         if the overall timeout expires, if a launcher thread dies, or if
         the abort file appears;
      3. create the starting-gun file so that all hosts begin together;
      4. join the launcher threads, read each host's result pickle file
         and hand the combined list to ``output_results.output_results``.

    Args:
        prm: test parameters; this function reads host_set,
            fsd_remote_dir, python_prog, network_shared_path,
            launch_as_daemon, abort_path and starting_gun_path.
        log: logger used for progress and error messages.

    Returns:
        OK on success.  NOTOK if the hosts did not all reach the starting
        gate (and no exception was raised), or if an IOError,
        KeyboardInterrupt or FsDriftException occurred while aggregating
        results.

    Raises:
        Exception: if the running Python version is not recognized.
        FsDriftException: if the abort file is seen while waiting at the
            starting gate (re-raised after the test has been aborted).
        KeyboardInterrupt: if interrupted while waiting at the starting
            gate (re-raised after the test has been aborted).
        IOError: if the starting-gun file cannot be written.
    """

    # choose an interpreter name matching the one running this driver

    if os.getenv('PYPY'):
        python_prog = os.getenv('PYPY')
    elif sys.version.startswith('2'):
        python_prog = 'python'
    elif sys.version.startswith('3'):
        python_prog = 'python3'
    else:
        raise Exception('unrecognized python version %s' % sys.version)

    log.debug('python_prog = %s' % python_prog)

    # construct list of launcher threads to invoke in parallel
    # NOTE(review): the command uses prm.python_prog, not the local
    # python_prog computed above - confirm which one is intended

    remote_thread_list = []
    for remote_host in prm.host_set:
        fsd_remote_pgm = os.path.join(prm.fsd_remote_dir, 'fs-drift-remote.py')
        this_remote_cmd = '%s %s --network-sync-dir %s ' \
            % (prm.python_prog, fsd_remote_pgm, prm.network_shared_path)

        this_remote_cmd += ' --as-host %s' % remote_host
        log.debug(this_remote_cmd)
        if prm.launch_as_daemon:
            remote_thread_list.append(
                launcher_thread.launcher_thread(prm, log, remote_host,
                                                this_remote_cmd))
        else:
            remote_thread_list.append(
                ssh_thread.ssh_thread(log, remote_host, this_remote_cmd))

    # start them, pacing starts so that we don't get ssh errors
    # NOTE(review): the pause only happens in daemon mode, although the
    # comment above talks about ssh - confirm the condition is intended

    for t in remote_thread_list:
        if prm.launch_as_daemon:
            time.sleep(0.1)
        t.start()

    # wait for hosts to arrive at starting gate
    # if only one host, then no wait will occur
    # as starting gate file is already present
    # on every pass we resume the scan from the last host file not found

    exception_seen = None
    abortfn = prm.abort_path
    sec_delta = 0.5
    # timeout if no host replies in next per_host_timeout seconds
    per_host_timeout = 10.0
    all_host_timeout = 5.0 + len(prm.host_set) / 3
    if all_host_timeout < per_host_timeout:
        per_host_timeout = all_host_timeout / 2

    hosts_ready = False  # set scope outside while loop
    last_host_seen = -1
    sec = 0.0
    start_loop_start = time.time()
    try:
        while sec < per_host_timeout:
            # HACK to force directory entry coherency for Gluster
            #ndirlist = os.listdir(prm.network_shared_path)
            #log.debug('shared dir list: ' + str(ndirlist))

            # optimistic: cleared again below if any ready file is missing
            hosts_ready = True
            if os.path.exists(abortfn):
                raise FsDriftException('worker host signaled abort')
            for j in range(last_host_seen + 1, len(prm.host_set)):
                h = prm.host_set[j]
                fn = multi_thread_workload.gen_host_ready_fname(prm, h.strip())
                log.debug('checking for host filename ' + fn)
                if not os.path.exists(fn):
                    log.info('did not see host filename %s after %f sec' %
                             (fn, sec))
                    hosts_ready = False
                    break
                log.debug('saw host filename ' + fn)
                last_host_seen = j  # saw this host's ready file
                # we exit while loop only if no hosts in per_host_timeout seconds
                sec = 0.0
            if hosts_ready:
                break

            # if one of the launcher threads has died, no reason to continue

            kill_remaining_threads = False
            for t in remote_thread_list:
                if not t.is_alive():
                    log.error('thread %s has died' % t)
                    kill_remaining_threads = True
                    break
            if kill_remaining_threads:
                break

            # be patient for large tests
            # give user some feedback about
            # how many hosts have arrived at the starting gate

            time.sleep(sec_delta)
            sec += sec_delta
            time_since_loop_start = time.time() - start_loop_start
            log.debug('last_host_seen=%d sec=%d' % (last_host_seen, sec))
            if time_since_loop_start > all_host_timeout:
                break
    except KeyboardInterrupt as e:
        log.error('saw SIGINT signal, aborting test')
        exception_seen = e
        # hosts_ready is set True optimistically at the top of each pass;
        # without this reset an interrupt arriving mid-scan would fall
        # through to the "fire the starting gun" branch below
        hosts_ready = False
    except Exception as e:
        exception_seen = e
        log.exception(e)
        hosts_ready = False
    if not hosts_ready:
        multi_thread_workload.abort_test(prm.abort_path, remote_thread_list)
        if not exception_seen:
            log.info(
                'no additional hosts reached starting gate within %5.1f seconds'
                % per_host_timeout)
            return NOTOK
        else:
            raise exception_seen
    else:

        # ask all hosts to start the test
        # this is like firing the gun at the track meet

        try:
            sync_files.write_sync_file(prm.starting_gun_path, 'hi')
            log.debug('starting all threads by creating starting gun file %s' %
                      prm.starting_gun_path)
        except IOError as e:
            log.error('error writing starting gun file: %s' %
                      os.strerror(e.errno))
            multi_thread_workload.abort_test(prm.abort_path,
                                             remote_thread_list)
            raise

    # wait for them to finish

    for t in remote_thread_list:
        t.join()
        if t.status != OK:
            log.error('ssh thread for host %s completed with status %d' %
                      (t.remote_host, t.status))

    # attempt to aggregate results by reading pickle files
    # containing the per-thread workload objects
    # with counters and times that we need

    # pre-set so the IOError handler below can always format its message,
    # even when host_set is empty
    h = None
    pickle_fn = None
    try:
        invoke_list = []
        one_shot_delay = True
        for h in prm.host_set:  # for each host in test

            # read results for each thread run in that host
            # from python pickle of the list of workload objects

            pickle_fn = multi_thread_workload.host_result_filename(prm, h)
            log.debug('reading pickle file: %s' % pickle_fn)
            host_invoke_list = []
            try:
                if one_shot_delay and not os.path.exists(pickle_fn):

                    # all threads have joined already, they are done
                    # we allow > 1 sec
                    # for this (NFS) client to see other clients' files

                    time.sleep(1.2)
                    one_shot_delay = False
                host_invoke_list = read_pickle(pickle_fn)
                log.debug(' read %d invoke objects' % len(host_invoke_list))
                invoke_list.extend(host_invoke_list)
                ensure_deleted(pickle_fn)
            except IOError as e:
                # a missing result file is logged and skipped;
                # any other I/O error is handled by the outer handler
                if e.errno != errno.ENOENT:
                    raise
                log.error('  pickle file %s not found' % pickle_fn)

        output_results.output_results(prm, invoke_list)
    except IOError as e:
        log.exception(e)
        log.error('host %s filename %s: %s' % (h, pickle_fn, str(e)))
        return NOTOK
    except KeyboardInterrupt as e:
        log.error('control-C signal seen (SIGINT)')
        return NOTOK
    except FsDriftException as e:
        log.exception(e)
        return NOTOK
    return OK
Example #4
0
def run_multi_thread_workload(prm):
    """Run all worker threads for one host and report or save their results.

    The workers from ``create_worker_list(prm)`` are started, and this
    function waits for each one's thread-ready file.  When
    ``prm.host_set`` is non-empty this host is taking part in a
    multi-host test: it then publishes its host-ready file and waits for
    the test driver to create the starting-gun file.  Otherwise it writes
    the starting-gun file itself.  After the workers finish, results are
    printed via ``output_results.output_results`` (stand-alone case) or
    written as a pickle file for the test driver (multi-host case).

    Args:
        prm: test parameters; this function reads as_host, host_set,
            abort_path and starting_gun_path.

    Returns:
        OK on success, NOTOK if ``output_results`` raises
        FsDriftException in the stand-alone case.

    Raises:
        FsDriftException: if not every worker reached the starting gate.
        Exception: if, in a multi-host test, the starting-gun file was
            not seen in time (the test is aborted first).
    """

    host = prm.as_host
    if host is None:
        host = 'localhost'
    prm_slave = (prm.host_set != [])
    # FIXME: get coherent logging level interface
    verbose = os.getenv('LOGLEVEL_DEBUG') is not None
    # integer division: this value is used as a range() bound below
    host_startup_timeout = 5 + len(prm.host_set) // 3

    # for each thread set up a workload instance,
    # create a thread instance, and delete the thread-ready file

    thread_list = create_worker_list(prm)
    my_host_invoke = thread_list[0].invoke
    my_log = fsd_log.start_log('%s.master' % host)
    my_log.debug(prm)

    # start threads, wait for them to reach starting gate
    # to do this, look for thread-ready files

    for t in thread_list:
        ensure_deleted(t.invoke.gen_thread_ready_fname(t.invoke.tid))
    for t in thread_list:
        t.start()
    my_log.debug('started %d worker threads on host %s' %
                                (len(thread_list), host))

    # wait for all threads to reach the starting gate
    # this makes it more likely that they will start simultaneously

    abort_fname = prm.abort_path
    thread_count = len(thread_list)
    thread_to_wait_for = 0
    startup_timeout = 3
    sec = 0.0
    while sec < startup_timeout:
        for k in range(thread_to_wait_for, thread_count):
            t = thread_list[k]
            fn = t.invoke.gen_thread_ready_fname(t.invoke.tid)
            if not os.path.exists(fn):
                my_log.debug('thread %d thread-ready file %s not found yet with %f sec left' %
                            (k, fn, (startup_timeout - sec)))
                break
            thread_to_wait_for = k + 1
            # we only timeout if no more threads have reached starting gate
            # in startup_timeout sec
            sec = 0.0
        if thread_to_wait_for == thread_count:
            break
        if os.path.exists(abort_fname):
            break
        sec += 0.5
        time.sleep(0.5)

    # if all threads didn't make it to the starting gate

    if thread_to_wait_for < thread_count:
        abort_test(abort_fname, thread_list)
        raise FsDriftException('only %d threads reached starting gate'
                                % thread_to_wait_for)

    # declare that this host is at the starting gate

    if prm_slave:
        host_ready_fn = gen_host_ready_fname(prm, prm.as_host)
        my_log.debug('host %s creating ready file %s' %
                     (my_host_invoke.onhost, host_ready_fn))
        common.touch(host_ready_fn)

    sg = prm.starting_gun_path
    if not prm_slave:
        my_log.debug('wrote starting gate file ')
        sync_files.write_sync_file(sg, 'hi there')

    # wait for starting_gate file to be created by test driver,
    # polling once per second

    if prm_slave:
        my_log.debug('awaiting ' + sg)
        for sec in range(0, host_startup_timeout+3):
            # hack to ensure that directory is up to date
            #   ndlist = os.listdir(my_host_invoke.network_dir)
            # if verbose: print(str(ndlist))
            if os.path.exists(sg):
                break
            if os.path.exists(prm.abort_path):
                my_log.info('saw abort file %s, aborting test' % prm.abort_path)
                break
            time.sleep(1)
        if not os.path.exists(sg):
            abort_test(prm.abort_path, thread_list)
            raise Exception('starting signal not seen within %d seconds'
                            % host_startup_timeout)
    if verbose:
        print('starting test on host ' + host + ' in 2 seconds')
    time.sleep(2 + random.random())  # let other hosts see starting gate file

    # FIXME: don't timeout the test,
    # instead check thread progress and abort if you see any of them stalled
    # but if servers are heavily loaded you can't rely on filesystem

    # wait for all threads on this host to finish

    for t in thread_list:
        my_log.debug('waiting for thread %s' % t.invoke.tid)
        t.retrieve()
        t.join()

    # if not a slave of some other host, print results (for this host)

    if not prm_slave:
        try:
            worker_list = [ t.invoke for t in thread_list ]
            output_results.output_results(prm, worker_list)
        except FsDriftException as e:
            print('ERROR: ' + str(e))
            return NOTOK
    else:

        # if we are participating in a multi-host test
        # then write out this host's result in pickle format
        # so test driver can pick up result

        result_filename = host_result_filename(prm, prm.as_host)
        my_log.debug('saving result to filename %s' % result_filename)
        worker_list = [ t.invoke for t in thread_list ]
        sync_files.write_pickle(result_filename, worker_list)
        time.sleep(1.2)  # for benefit of NFS with actimeo=1

    return OK