Example 1
0
 def test_multiproc_stonewall(self):
     """Start several smf_invocation worker subprocesses behind a
     starting-gate file, release them all at once, and verify that
     every worker reports OK status back over its pipe.

     Raises Exception if workers fail to signal readiness within
     thread_ready_timeout seconds or any worker finishes with a
     non-ok status.
     """
     self.invok.start_log()
     self.invok.log.info('starting stonewall test')
     thread_ready_timeout = 4
     thread_count = 4
     # start from a clean directory tree
     self.deltree()
     os.mkdir(self.topdir)
     os.mkdir(self.invok.src_dir)
     os.mkdir(self.invok.dest_dir)
     sgate_file = self.invok.tmp_dir + os.sep + "start"
     smallfile.ensure_deleted(sgate_file)
     invokeList = []
     for j in range(0, thread_count):
         s = smallfile.smf_invocation()
         s.verbose = True
         s.tid = str(j)
         s.src_dir = self.invok.src_dir
         s.dst_dir = self.invok.dest_dir
         s.prefix = "thr_"
         s.suffix = "foo"
         s.iterations = 10
         s.stonewall = True
         s.starting_gate = sgate_file
         s.do_sync_at_end = True
         invokeList.append(s)
     threadList = [subprocess(s) for s in invokeList]
     for t in threadList:
         t.start()
     # poll for each worker's thread-ready file; all of them must
     # appear within thread_ready_timeout seconds
     threads_ready = True
     for i in range(0, thread_ready_timeout):
         threads_ready = True
         for s in invokeList:
             thread_ready_file = s.gen_thread_ready_fname(s.tid)
             if not os.access(thread_ready_file, os.R_OK):
                 threads_ready = False
         if threads_ready:
             break
         time.sleep(1)
     if not threads_ready:
         raise Exception("threads did not show up within %d seconds"
                         % thread_ready_timeout)
     time.sleep(1)
     # creating the starting-gate file releases all workers at once
     with open(sgate_file, "w"):
         pass
     for t in threadList:
         rtnd_invok = t.receiver.recv()  # results arrive over the pipe
         t.join()
         self.invok.log.info(str(rtnd_invok))
         if rtnd_invok.status != ok:
             raise Exception("subprocess failure: " + str(t))
Example 2
0
def create_worker_list(prm):
    # Build one smf_invocation clone per requested thread, wrap each in
    # a subprocess object, and clear any stale thread-ready file so the
    # startup handshake starts from a known state.

    workers = []
    master = prm.master_invoke
    for idx in range(prm.thread_count):
        inv = smallfile.smf_invocation.clone(master)
        inv.tid = "%02d" % idx
        if not master.is_shared_dir:
            # give each thread its own subdirectory under every top dir
            tail = os.sep + master.onhost + os.sep + "thrd_" + inv.tid
            inv.src_dirs = [d + tail for d in inv.src_dirs]
            inv.dest_dirs = [d + tail for d in inv.dest_dirs]
        ensure_deleted(inv.gen_thread_ready_fname(inv.tid))
        workers.append(invoke_process.subprocess(inv))
    return workers
def create_worker_list(prm):
  # One cloned smf_invocation per worker thread.  Unless the test runs
  # in a shared directory, every clone gets its own per-host/per-thread
  # directory suffix; each worker's stale thread-ready file is removed.

  def _make_invocation(n):
    inv = smallfile.smf_invocation.clone(prm.master_invoke)
    inv.tid = "%02d" % n
    if not prm.master_invoke.is_shared_dir:
      tail = os.sep + prm.master_invoke.onhost + os.sep + "thrd_" + inv.tid
      inv.src_dirs = [d + tail for d in inv.src_dirs]
      inv.dest_dirs = [d + tail for d in inv.dest_dirs]
    return inv

  thread_list = []
  for n in range(prm.thread_count):
    inv = _make_invocation(n)
    thread_list.append(invoke_process.subprocess(inv))
    ensure_deleted(inv.gen_thread_ready_fname(inv.tid))
  return thread_list
def create_worker_list(prm):
    """Return one invoke_process.subprocess wrapper per requested thread.

    Each worker receives a shallow copy of the master SmallfileWorkload
    with a zero-padded thread id; for non-shared directories the copy
    gets its own per-host/per-thread src and dest subdirectories.  Any
    leftover thread-ready file from a previous run is deleted.
    """
    workers = []
    for tnum in range(prm.thread_count):
        worker_invoke = copy.copy(prm.master_invoke)
        worker_invoke.tid = '%02d' % tnum
        if not prm.master_invoke.is_shared_dir:
            suffix = (os.sep + prm.master_invoke.onhost
                      + os.sep + 'thrd_' + worker_invoke.tid)
            worker_invoke.src_dirs = [d + suffix
                                      for d in worker_invoke.src_dirs]
            worker_invoke.dest_dirs = [d + suffix
                                       for d in worker_invoke.dest_dirs]
        workers.append(invoke_process.subprocess(worker_invoke))
        ensure_deleted(worker_invoke.gen_thread_ready_fname(worker_invoke.tid))
    return workers
Example 5
0
 def run(self):
   """Write the remote command into this host's launch file, then wait
   for the host's result pickle file to appear.

   Sets self.status to OK only when the result file is seen; a test
   abort file or premature exit leaves it at NOTOK.
   """
   master_invoke = self.prm.master_invoke
   launch_fn = os.path.join(master_invoke.network_dir, self.remote_host) + '.smf_launch'
   pickle_fn = master_invoke.host_result_filename(self.remote_host)
   abortfn = master_invoke.abort_fn()
   ensure_deleted(launch_fn)
   ensure_deleted(pickle_fn)
   # the launch daemon on the remote host polls for this file
   with open(launch_fn, 'w') as launch_file:
     launch_file.write(self.remote_cmd)
   self.status = master_invoke.NOTOK  # premature exit means failure
   while not os.path.exists(pickle_fn):
     if os.path.exists(abortfn):
       if master_invoke.verbose:
         print('test abort seen by host ' + self.remote_host)
       return
     time.sleep(3)
   self.status = master_invoke.OK
def create_worker_list(prm):
    # Construct the per-thread workload objects and their subprocess
    # wrappers; stale thread-ready files are cleared as we go so the
    # caller's startup handshake begins from a clean slate.

    master = prm.master_invoke
    shared = master.is_shared_dir
    thread_list = []
    k = 0
    while k < prm.thread_count:
        inv = copy.copy(master)
        inv.tid = '%02d' % k
        if not shared:
            # per-host, per-thread directory suffix
            tail = os.sep + master.onhost + os.sep + 'thrd_' + inv.tid
            inv.src_dirs = [d + tail for d in inv.src_dirs]
            inv.dest_dirs = [d + tail for d in inv.dest_dirs]
        thread_list.append(invoke_process.subprocess(inv))
        ensure_deleted(inv.gen_thread_ready_fname(inv.tid))
        k += 1
    return thread_list
Example 7
0
 def run(self):
     """Launch the workload on the remote host by writing self.remote_cmd
     to the host's launch file, then poll until the host's result pickle
     appears (success) or the test abort file is seen (failure).

     Sets self.status to OK only when the result pickle is observed.
     """
     master_invoke = self.prm.master_invoke
     launch_fn = os.path.join(master_invoke.network_dir,
                              self.remote_host) + '.smf_launch'
     pickle_fn = master_invoke.host_result_filename(self.remote_host)
     abortfn = master_invoke.abort_fn()
     ensure_deleted(launch_fn)
     ensure_deleted(pickle_fn)
     if self.prm.master_invoke.verbose:
         print('wrote command %s to launch file %s' % (self.remote_cmd, launch_fn))
     # the remote launch daemon polls for this file
     write_sync_file(launch_fn, self.remote_cmd)
     self.status = master_invoke.NOTOK  # premature exit means failure
     while not os.path.exists(pickle_fn):
         if os.path.exists(abortfn):
             if master_invoke.verbose:
                 print('test abort seen by host ' + self.remote_host)
             return
         time.sleep(1.0)
     self.status = master_invoke.OK  # success!
Example 8
0
 def run(self):
     """Write the remote command to this host's launch file, then wait
     for the host's result pickle file to appear.

     Sets self.status to OK only when the result file is seen; a test
     abort file or premature exit leaves it at NOTOK.
     """
     master_invoke = self.prm.master_invoke
     launch_fn = os.path.join(master_invoke.network_dir,
                              self.remote_host) + '.smf_launch'
     pickle_fn = master_invoke.host_result_filename(self.remote_host)
     abortfn = master_invoke.abort_fn()
     ensure_deleted(launch_fn)
     ensure_deleted(pickle_fn)
     # the launch daemon on the remote host polls for this file
     with open(launch_fn, 'w') as launch_file:
         launch_file.write(self.remote_cmd)
     self.status = master_invoke.NOTOK  # premature exit means failure
     while not os.path.exists(pickle_fn):
         if os.path.exists(abortfn):
             if master_invoke.verbose:
                 print('test abort seen by host ' + self.remote_host)
             return
         time.sleep(3)
     self.status = master_invoke.OK
Example 9
0
def run_multi_thread_workload(prm):
    """Run the workload on this host with one worker subprocess per thread.

    Workers are synchronized through files in a (possibly network-shared)
    directory: each worker touches a thread-ready file, and all of them
    wait for a starting-gate file before doing any I/O.  When run as a
    slave of a multi-host test, results are pickled for the test driver;
    otherwise they are printed here.  Exits the process with OK/NOTOK.
    """
    master_invoke = prm.master_invoke
    prm_slave = prm.is_slave
    verbose = master_invoke.verbose
    host = master_invoke.onhost

    if not prm_slave:
        sync_files.create_top_dirs(master_invoke, False)

    if prm_slave:
        # brief delay so NFS-like filesystems can show the driver's dirs,
        # then make sure they exist locally
        time.sleep(1.1)
        for d in master_invoke.top_dirs:
            ensure_dir_exists(d)
        for dlist in [master_invoke.src_dirs, master_invoke.dest_dirs]:
            for d in dlist:
                ensure_dir_exists(d)
                if verbose:
                    print(host + ' saw ' + str(d))

    # for each thread set up SmallfileWorkload instance,
    # create a thread instance, and delete the thread-ready file

    thread_list = create_worker_list(prm)
    my_host_invoke = thread_list[0].invoke

    # start threads, wait for them to reach starting gate
    # to do this, look for thread-ready files

    for t in thread_list:
        ensure_deleted(t.invoke.gen_thread_ready_fname(t.invoke.tid))
    for t in thread_list:
        t.start()
    if verbose:
        print('started %d worker threads on host %s' %
              (len(thread_list), host))

    # wait for all threads to reach the starting gate
    # this makes it more likely that they will start simultaneously

    startup_timeout = prm.startup_timeout
    if smallfile.is_windows_os:
        print('adding time for Windows synchronization')
        startup_timeout += 30
    abort_fname = my_host_invoke.abort_fn()
    thread_count = len(thread_list)
    thread_to_wait_for = 0
    # polls every 0.5 sec, hence startup_timeout * 2 iterations
    for sec in range(0, startup_timeout * 2):
        for k in range(thread_to_wait_for, thread_count):
            t = thread_list[k]
            fn = t.invoke.gen_thread_ready_fname(t.invoke.tid)
            if not os.path.exists(fn):
                if verbose:
                    print('thread %d thread-ready file %s not found...' %
                          (k, fn))
                break
            thread_to_wait_for = k + 1
        if thread_to_wait_for == thread_count:
            break
        if os.path.exists(abort_fname):
            break
        time.sleep(0.5)

    # if all threads didn't make it to the starting gate

    if thread_to_wait_for < thread_count:
        abort_test(abort_fname, thread_list)
        raise Exception('only %d threads reached starting gate within %d sec' %
                        (thread_to_wait_for, startup_timeout))

    # declare that this host is at the starting gate

    if prm_slave:
        host_ready_fn = my_host_invoke.gen_host_ready_fname()
        if my_host_invoke.verbose:
            print('host %s creating ready file %s' %
                  (my_host_invoke.onhost, host_ready_fn))
        smallfile.touch(host_ready_fn)

    sg = my_host_invoke.starting_gate
    if not prm_slave:  # special case of no --host-set parameter
        try:
            sync_files.write_sync_file(sg, 'hi there')
            if verbose:
                print('wrote starting gate file')
        except IOError as e:
            print('error writing starting gate for threads: %s' % str(e))

    # wait for starting_gate file to be created by test driver
    # every second we resume scan from last host file not found

    if verbose:
        print('awaiting ' + sg)
    if prm_slave:
        for sec in range(0, prm.host_startup_timeout + 10):
            # hack to ensure that directory is up to date
            #   ndlist = os.listdir(my_host_invoke.network_dir)
            # if verbose: print(str(ndlist))
            if os.path.exists(sg):
                break
            time.sleep(0.5)
        if not os.path.exists(sg):
            abort_test(my_host_invoke.abort_fn(), thread_list)
            raise Exception('starting signal not seen within %d seconds' %
                            prm.host_startup_timeout)
    if verbose:
        print('starting test on host ' + host + ' in 2 seconds')
    time.sleep(2 + random.random())  # let other hosts see starting gate file

    # FIXME: don't timeout the test,
    # instead check thread progress and abort if you see any of them stalled
    # but if servers are heavily loaded you can't rely on filesystem

    # wait for all threads on this host to finish

    for t in thread_list:
        if verbose:
            print('waiting for thread %s' % t.invoke.tid)
        t.invoke = t.receiver.recv()  # to get results from sub-process
        t.join()

    # if not a slave of some other host, print results (for this host)

    exit_status = OK
    if not prm_slave:
        try:
            invoke_list = [t.invoke for t in thread_list]
            output_results.output_results(invoke_list, prm)
        except SMFResultException as e:
            print('ERROR: ' + str(e))
            exit_status = NOTOK
    else:

        # if we are participating in a multi-host test
        # then write out this host's result in pickle format
        # so test driver can pick up result

        result_filename = \
            master_invoke.host_result_filename(prm.as_host)
        if verbose:
            print('writing invokes to: ' + result_filename)
        invok_list = [t.invoke for t in thread_list]
        if verbose:
            print('saving result to filename %s' % result_filename)
        # drop large I/O buffers so the pickle stays small
        for ivk in invok_list:
            ivk.buf = None
            ivk.biggest_buf = None
        sync_files.write_pickle(result_filename, invok_list)
        time.sleep(1.2)  # for benefit of NFS with actimeo=1

    sys.exit(exit_status)
Example 10
0
def run_multi_host_workload(prm):
    """Drive the test across a set of remote hosts.

    Writes the test parameters as a pickle in the shared network dir,
    launches one thread (ssh- or daemon-based) per remote host, waits
    for every host's ready file, fires the starting-gate file, joins
    the per-host threads, then aggregates each host's pickled results
    and prints them.  Exits the process with OK/NOTOK.
    """
    prm_host_set = prm.host_set
    prm_permute_host_dirs = prm.permute_host_dirs
    master_invoke = prm.master_invoke

    starting_gate = master_invoke.starting_gate
    verbose = master_invoke.verbose

    # construct list of ssh threads to invoke in parallel

    sync_files.create_top_dirs(master_invoke, True)
    pickle_fn = os.path.join(prm.master_invoke.network_dir, 'param.pickle')

    # if verbose: print('writing ' + pickle_fn))

    sync_files.write_pickle(pickle_fn, prm)
    # pick the interpreter the remote command will run under
    if os.getenv('PYPY'):
        python_prog = os.getenv('PYPY')
    elif sys.version.startswith('2'):
        python_prog = 'python'
    elif sys.version.startswith('3'):
        python_prog = 'python3'
    else:
        raise Exception('unrecognized python version %s' % sys.version)

    # print('python_prog = %s'%python_prog)

    remote_thread_list = []
    host_ct = len(prm_host_set)
    for j in range(0, len(prm_host_set)):
        remote_host = prm_host_set[j]
        smf_remote_pgm = os.path.join(prm.remote_pgm_dir,
                                      'smallfile_remote.py')
        this_remote_cmd = '%s %s --network-sync-dir %s ' \
            % (python_prog, smf_remote_pgm, prm.master_invoke.network_dir)

        # this_remote_cmd = remote_cmd

        if prm_permute_host_dirs:
            # each host works on the next host's directories
            this_remote_cmd += \
                ' --as-host %s' % prm_host_set[(j + 1) % host_ct]
        else:
            this_remote_cmd += ' --as-host %s' % remote_host
        if verbose:
            print(this_remote_cmd)
        if smallfile.is_windows_os or prm.launch_by_daemon:
            remote_thread_list.append(
                launcher_thread.launcher_thread(prm, remote_host,
                                                this_remote_cmd))
        else:
            remote_thread_list.append(
                ssh_thread.ssh_thread(remote_host, this_remote_cmd))

    # start them

    for t in remote_thread_list:
        if not prm.launch_by_daemon:
            # pace starts so that we don't get ssh errors
            time.sleep(0.1)
        t.start()

    # wait for hosts to arrive at starting gate
    # if only one host, then no wait will occur
    # as starting gate file is already present
    # every second we resume scan from last host file not found
    # FIXME: for very large host sets,
    # timeout only if no host responds within X seconds

    exception_seen = None
    hosts_ready = False  # set scope outside while loop
    abortfn = master_invoke.abort_fn()
    last_host_seen = -1
    sec = 0.0
    sec_delta = 0.5
    host_timeout = prm.host_startup_timeout
    if smallfile.is_windows_os:
        host_timeout += 20

    try:
        while sec < host_timeout:
            # HACK to force directory entry coherency for Gluster
            ndirlist = os.listdir(master_invoke.network_dir)
            if master_invoke.verbose:
                print('shared dir list: ' + str(ndirlist))
            hosts_ready = True
            if os.path.exists(abortfn):
                raise Exception('worker host signaled abort')
            for j in range(last_host_seen + 1, len(prm_host_set)):
                h = prm_host_set[j]
                fn = master_invoke.gen_host_ready_fname(h.strip())
                if verbose:
                    print('checking for host filename ' + fn)
                if not os.path.exists(fn):
                    hosts_ready = False
                    break
                last_host_seen = j  # saw this host's ready file
                # we exit while loop only if no hosts in host_timeout seconds
                sec = 0.0
            if hosts_ready:
                break

            # if one of ssh threads has died, no reason to continue

            kill_remaining_threads = False
            for t in remote_thread_list:
                if not t.isAlive():
                    print('thread %s on host %s has died' % (t, h))
                    kill_remaining_threads = True
                    break
            if kill_remaining_threads:
                break

            # be patient for large tests
            # give user some feedback about
            # how many hosts have arrived at the starting gate

            time.sleep(sec_delta)
            sec += sec_delta
            sec_delta += 1  # back off the poll interval over time
            if verbose:
                print('last_host_seen=%d sec=%d' % (last_host_seen, sec))
    except KeyboardInterrupt as e:
        print('saw SIGINT signal, aborting test')
        exception_seen = e
    except Exception as e:
        exception_seen = e
        hosts_ready = False
    if not hosts_ready:
        smallfile.abort_test(abortfn, [])
        print('ERROR: host %s did not reach starting gate' % h)
        if not exception_seen:
            raise Exception('hosts did not reach starting gate ' +
                            'within %d seconds' % host_timeout)
        else:
            print('saw exception %s, aborting test' % str(e))
    else:

        # ask all hosts to start the test
        # this is like firing the gun at the track meet

        try:
            sync_files.write_sync_file(starting_gate, 'hi')
            print('starting all threads by creating starting gate file %s' %
                  starting_gate)
        except IOError as e:
            print('error writing starting gate: %s' % os.strerror(e.errno))

    # wait for them to finish

    for t in remote_thread_list:
        t.join()
        if t.status != OK:
            print('ERROR: ssh thread for host %s completed with status %d' %
                  (t.remote_host, t.status))

    # attempt to aggregate results by reading pickle files
    # containing SmallfileWorkload instances
    # with counters and times that we need

    try:
        all_ok = NOTOK
        invoke_list = []
        one_shot_delay = True
        for h in prm_host_set:  # for each host in test

            # read results for each thread run in that host
            # from python pickle of the list of SmallfileWorkload objects

            pickle_fn = master_invoke.host_result_filename(h)
            if verbose:
                print('reading pickle file: %s' % pickle_fn)
            host_invoke_list = []
            try:
                if one_shot_delay and not os.path.exists(pickle_fn):

                    # all threads have joined already, they are done
                    # we allow > 1 sec
                    # for this (NFS) client to see other clients' files

                    time.sleep(1.2)
                    one_shot_delay = False
                with open(pickle_fn, 'rb') as pickle_file:
                    host_invoke_list = pickle.load(pickle_file)
                if verbose:
                    print(' read %d invoke objects' % len(host_invoke_list))
                invoke_list.extend(host_invoke_list)
                ensure_deleted(pickle_fn)
            except IOError as e:
                if e.errno != errno.ENOENT:
                    raise e
                print('  pickle file %s not found' % pickle_fn)

        output_results.output_results(invoke_list, prm)
        all_ok = OK
    except IOError as e:

        print('host %s filename %s: %s' % (h, pickle_fn, str(e)))
    except KeyboardInterrupt as e:
        print('control-C signal seen (SIGINT)')
    except SMFResultException as e:
        print(str(e))

    sys.exit(all_ok)
Example 11
0
def run_multi_thread_workload(prm):
    """Run the workload on this host with one worker subprocess per thread.

    Workers are synchronized through files in a (possibly network-shared)
    directory: each worker touches a thread-ready file, and all of them
    wait for a starting-gate file before doing any I/O.  When run as a
    slave of a multi-host test, results are pickled for the test driver;
    otherwise they are printed here.  Exits the process with OK/NOTOK.
    """
    master_invoke = prm.master_invoke
    prm_slave = prm.is_slave
    verbose = master_invoke.verbose
    host = master_invoke.onhost

    if not prm_slave:
        sync_files.create_top_dirs(master_invoke, False)

    if prm_slave:
        # brief delay so NFS-like filesystems can show the driver's dirs,
        # then make sure they exist locally
        time.sleep(1.1)
        for d in master_invoke.top_dirs:
            ensure_dir_exists(d)
        for dlist in [master_invoke.src_dirs, master_invoke.dest_dirs]:
            for d in dlist:
                ensure_dir_exists(d)
                if verbose:
                    print(host + ' saw ' + str(d))

    # for each thread set up SmallfileWorkload instance,
    # create a thread instance, and delete the thread-ready file

    thread_list = create_worker_list(prm)
    my_host_invoke = thread_list[0].invoke

    # start threads, wait for them to reach starting gate
    # to do this, look for thread-ready files

    for t in thread_list:
        ensure_deleted(t.invoke.gen_thread_ready_fname(t.invoke.tid))
    for t in thread_list:
        t.start()
    if verbose:
        print('started %d worker threads on host %s' %
              (len(thread_list), host))

    # wait for all threads to reach the starting gate
    # this makes it more likely that they will start simultaneously

    startup_timeout = prm.startup_timeout
    if smallfile.is_windows_os:
        print('adding time for Windows synchronization')
        startup_timeout += 30
    abort_fname = my_host_invoke.abort_fn()
    thread_count = len(thread_list)
    thread_to_wait_for = 0
    # polls every 0.5 sec, hence startup_timeout * 2 iterations
    for sec in range(0, startup_timeout * 2):
        for k in range(thread_to_wait_for, thread_count):
            t = thread_list[k]
            fn = t.invoke.gen_thread_ready_fname(t.invoke.tid)
            if not os.path.exists(fn):
                if verbose:
                    print('thread %d thread-ready file %s not found...' % (k, fn))
                break
            thread_to_wait_for = k + 1
        if thread_to_wait_for == thread_count:
            break
        if os.path.exists(abort_fname):
            break
        time.sleep(0.5)

    # if all threads didn't make it to the starting gate

    if thread_to_wait_for < thread_count:
        abort_test(abort_fname, thread_list)
        raise Exception('only %d threads reached starting gate within %d sec'
                        % (thread_to_wait_for, startup_timeout))

    # declare that this host is at the starting gate

    if prm_slave:
        host_ready_fn = my_host_invoke.gen_host_ready_fname()
        if my_host_invoke.verbose:
            print('host %s creating ready file %s' %
                  (my_host_invoke.onhost, host_ready_fn))
        smallfile.touch(host_ready_fn)

    sg = my_host_invoke.starting_gate
    if not prm_slave:  # special case of no --host-set parameter
        try:
            sync_files.write_sync_file(sg, 'hi there')
            if verbose:
                print('wrote starting gate file')
        except IOError as e:
            print('error writing starting gate for threads: %s' % str(e))

    # wait for starting_gate file to be created by test driver
    # every second we resume scan from last host file not found

    if verbose:
        print('awaiting ' + sg)
    if prm_slave:
        for sec in range(0, prm.host_startup_timeout + 10):
            # hack to ensure that directory is up to date
            #   ndlist = os.listdir(my_host_invoke.network_dir)
            # if verbose: print(str(ndlist))
            if os.path.exists(sg):
                break
            time.sleep(0.5)
        if not os.path.exists(sg):
            abort_test(my_host_invoke.abort_fn(), thread_list)
            raise Exception('starting signal not seen within %d seconds'
                            % prm.host_startup_timeout)
    if verbose:
        print('starting test on host ' + host + ' in 2 seconds')
    time.sleep(2 + random.random())  # let other hosts see starting gate file

    # FIXME: don't timeout the test,
    # instead check thread progress and abort if you see any of them stalled
    # but if servers are heavily loaded you can't rely on filesystem

    # wait for all threads on this host to finish

    for t in thread_list:
        if verbose:
            print('waiting for thread %s' % t.invoke.tid)
        t.invoke = t.receiver.recv()  # to get results from sub-process
        t.join()

    # if not a slave of some other host, print results (for this host)

    exit_status = OK
    if not prm_slave:
        try:
            invoke_list = [t.invoke for t in thread_list]
            output_results.output_results(invoke_list, prm)
        except SMFResultException as e:
            print('ERROR: ' + str(e))
            exit_status = NOTOK
    else:

        # if we are participating in a multi-host test
        # then write out this host's result in pickle format
        # so test driver can pick up result

        result_filename = \
            master_invoke.host_result_filename(prm.as_host)
        if verbose:
            print('writing invokes to: ' + result_filename)
        invok_list = [t.invoke for t in thread_list]
        if verbose:
            print('saving result to filename %s' % result_filename)
        # drop large I/O buffers so the pickle stays small
        for ivk in invok_list:
            ivk.buf = None
            ivk.biggest_buf = None
        sync_files.write_pickle(result_filename, invok_list)
        time.sleep(1.2)  # for benefit of NFS with actimeo=1

    sys.exit(exit_status)
Example 12
0
def run_multi_host_workload(prm):
    """Drive the test across a set of remote hosts.

    Writes the test parameters as a pickle in the shared network dir,
    launches one thread (ssh- or daemon-based) per remote host, waits
    for every host's ready file, fires the starting-gate file, joins
    the per-host threads, then aggregates each host's pickled results
    and prints them.  Exits the process with OK/NOTOK.
    """
    prm_host_set = prm.host_set
    prm_permute_host_dirs = prm.permute_host_dirs
    master_invoke = prm.master_invoke

    starting_gate = master_invoke.starting_gate
    verbose = master_invoke.verbose

    # construct list of ssh threads to invoke in parallel

    sync_files.create_top_dirs(master_invoke, True)
    pickle_fn = os.path.join(prm.master_invoke.network_dir, 'param.pickle')

    # if verbose: print('writing ' + pickle_fn))

    sync_files.write_pickle(pickle_fn, prm)
    # pick the interpreter the remote command will run under
    if os.getenv('PYPY'):
        python_prog = os.getenv('PYPY')
    elif sys.version.startswith('2'):
        python_prog = 'python'
    elif sys.version.startswith('3'):
        python_prog = 'python3'
    else:
        raise Exception('unrecognized python version %s' % sys.version)

    # print('python_prog = %s'%python_prog)

    remote_thread_list = []
    host_ct = len(prm_host_set)
    for j in range(0, len(prm_host_set)):
        remote_host = prm_host_set[j]
        smf_remote_pgm = os.path.join(prm.remote_pgm_dir,
                                      'smallfile_remote.py')
        this_remote_cmd = '%s %s --network-sync-dir %s ' \
            % (python_prog, smf_remote_pgm, prm.master_invoke.network_dir)

        # this_remote_cmd = remote_cmd

        if prm_permute_host_dirs:
            # each host works on the next host's directories
            this_remote_cmd += \
                ' --as-host %s' % prm_host_set[(j + 1) % host_ct]
        else:
            this_remote_cmd += ' --as-host %s' % remote_host
        if verbose:
            print(this_remote_cmd)
        if smallfile.is_windows_os or prm.launch_by_daemon:
            remote_thread_list.append(
                launcher_thread.launcher_thread(prm,
                                                remote_host,
                                                this_remote_cmd))
        else:
            remote_thread_list.append(ssh_thread.ssh_thread(remote_host,
                                                            this_remote_cmd))

    # start them

    for t in remote_thread_list:
        if not prm.launch_by_daemon:
            # pace starts so that we don't get ssh errors
            time.sleep(0.1)
        t.start()

    # wait for hosts to arrive at starting gate
    # if only one host, then no wait will occur
    # as starting gate file is already present
    # every second we resume scan from last host file not found
    # FIXME: for very large host sets,
    # timeout only if no host responds within X seconds

    exception_seen = None
    hosts_ready = False  # set scope outside while loop
    abortfn = master_invoke.abort_fn()
    last_host_seen = -1
    sec = 0.0
    sec_delta = 0.5
    host_timeout = prm.host_startup_timeout
    if smallfile.is_windows_os:
        host_timeout += 20

    try:
        while sec < host_timeout:
            # HACK to force directory entry coherency for Gluster
            ndirlist = os.listdir(master_invoke.network_dir)
            if master_invoke.verbose:
                print('shared dir list: ' + str(ndirlist))
            hosts_ready = True
            if os.path.exists(abortfn):
                raise Exception('worker host signaled abort')
            for j in range(last_host_seen + 1, len(prm_host_set)):
                h = prm_host_set[j]
                fn = master_invoke.gen_host_ready_fname(h.strip())
                if verbose:
                    print('checking for host filename ' + fn)
                if not os.path.exists(fn):
                    hosts_ready = False
                    break
                last_host_seen = j  # saw this host's ready file
                # we exit while loop only if no hosts in host_timeout seconds
                sec = 0.0
            if hosts_ready:
                break

            # if one of ssh threads has died, no reason to continue

            kill_remaining_threads = False
            for t in remote_thread_list:
                if not t.isAlive():
                    print('thread %s on host %s has died' % (t, h))
                    kill_remaining_threads = True
                    break
            if kill_remaining_threads:
                break

            # be patient for large tests
            # give user some feedback about
            # how many hosts have arrived at the starting gate

            time.sleep(sec_delta)
            sec += sec_delta
            sec_delta += 1  # back off the poll interval over time
            if verbose:
                print('last_host_seen=%d sec=%d' % (last_host_seen, sec))
    except KeyboardInterrupt as e:
        print('saw SIGINT signal, aborting test')
        exception_seen = e
    except Exception as e:
        exception_seen = e
        hosts_ready = False
    if not hosts_ready:
        smallfile.abort_test(abortfn, [])
        print('ERROR: host %s did not reach starting gate' % h)
        if not exception_seen:
            raise Exception('hosts did not reach starting gate ' +
                            'within %d seconds' % host_timeout)
        else:
            print('saw exception %s, aborting test' % str(e))
    else:

        # ask all hosts to start the test
        # this is like firing the gun at the track meet

        try:
            sync_files.write_sync_file(starting_gate, 'hi')
            print('starting all threads by creating starting gate file %s' %
                  starting_gate)
        except IOError as e:
            print('error writing starting gate: %s' % os.strerror(e.errno))

    # wait for them to finish

    for t in remote_thread_list:
        t.join()
        if t.status != OK:
            print('ERROR: ssh thread for host %s completed with status %d' %
                  (t.remote_host, t.status))

    # attempt to aggregate results by reading pickle files
    # containing SmallfileWorkload instances
    # with counters and times that we need

    try:
        all_ok = NOTOK
        invoke_list = []
        one_shot_delay = True
        for h in prm_host_set:  # for each host in test

            # read results for each thread run in that host
            # from python pickle of the list of SmallfileWorkload objects

            pickle_fn = master_invoke.host_result_filename(h)
            if verbose:
                print('reading pickle file: %s' % pickle_fn)
            host_invoke_list = []
            try:
                if one_shot_delay and not os.path.exists(pickle_fn):

                    # all threads have joined already, they are done
                    # we allow > 1 sec
                    # for this (NFS) client to see other clients' files

                    time.sleep(1.2)
                    one_shot_delay = False
                with open(pickle_fn, 'rb') as pickle_file:
                    host_invoke_list = pickle.load(pickle_file)
                if verbose:
                    print(' read %d invoke objects' % len(host_invoke_list))
                invoke_list.extend(host_invoke_list)
                ensure_deleted(pickle_fn)
            except IOError as e:
                if e.errno != errno.ENOENT:
                    raise e
                print('  pickle file %s not found' % pickle_fn)

        output_results.output_results(invoke_list, prm)
        all_ok = OK
    except IOError as e:

        print('host %s filename %s: %s' % (h, pickle_fn, str(e)))
    except KeyboardInterrupt as e:
        print('control-C signal seen (SIGINT)')
    except SMFResultException as e:
        print(str(e))

    sys.exit(all_ok)
Example 13
0
def run_multi_host_workload(prm):
    """Drive a distributed smallfile test across all hosts in prm.host_set.

    Writes the test parameters to a shared pickle file, launches one ssh
    thread (or Windows launcher thread) per host to run smallfile_remote.py,
    waits for every host to check in at the starting gate, fires the starting
    gate, joins the per-host threads, then aggregates the pickled per-host
    result lists and reports them.  Terminates the process via sys.exit with
    OK on success or NOTOK on failure; does not return.

    :param prm: test parameter object carrying host_set, master_invoke,
                remote_pgm_dir, thread_count, timeouts, etc.
    """

    prm_host_set = prm.host_set
    prm_slave = prm.is_slave
    prm_permute_host_dirs = prm.permute_host_dirs
    master_invoke = prm.master_invoke

    starting_gate = master_invoke.starting_gate
    verbose = master_invoke.verbose
    host = master_invoke.onhost

    # construct list of ssh threads to invoke in parallel;
    # first publish the parameters where every remote host can read them

    sync_files.create_top_dirs(master_invoke, True)
    pickle_fn = os.path.join(prm.master_invoke.network_dir, 'param.pickle')
    sync_files.write_pickle(pickle_fn, prm)

    # choose the interpreter remote hosts will run:
    # PYPY env. var overrides, otherwise match this interpreter's major version
    if os.getenv('PYPY'):
        python_prog = os.getenv('PYPY')
    elif sys.version.startswith('2'):
        python_prog = 'python'
    elif sys.version.startswith('3'):
        python_prog = 'python3'
    else:
        raise Exception('unrecognized python version %s' % sys.version)

    remote_thread_list = []
    host_ct = len(prm_host_set)
    for j in range(0, host_ct):
        remote_host = prm_host_set[j]
        smf_remote_pgm = os.path.join(prm.remote_pgm_dir, 'smallfile_remote.py')
        this_remote_cmd = '%s %s --network-sync-dir %s ' % \
            (python_prog, smf_remote_pgm, prm.master_invoke.network_dir)

        if prm_permute_host_dirs:
            # each host operates on the directories of the next host in the set
            this_remote_cmd += ' --as-host %s' % prm_host_set[(j + 1) % host_ct]
        else:
            this_remote_cmd += ' --as-host %s' % remote_host
        if verbose:
            print(this_remote_cmd)
        if smallfile.is_windows_os:
            remote_thread_list.append(
                launcher_thread.launcher_thread(prm, remote_host,
                                                this_remote_cmd))
        else:
            remote_thread_list.append(
                ssh_thread.ssh_thread(remote_host, this_remote_cmd))

    # start them, pacing starts so that we don't get ssh errors

    for t in remote_thread_list:
        t.start()

    # wait for hosts to arrive at starting gate
    # if only one host, then no wait will occur
    # as starting gate file is already present
    # every second we resume scan from last host file not found
    # FIXME: for very large host sets, timeout only if no host responds
    # within X seconds

    exception_seen = None
    hosts_ready = False  # set scope outside while loop
    abortfn = master_invoke.abort_fn()
    last_host_seen = -1
    sec = 0
    sec_delta = 0.5
    host_timeout = prm.host_startup_timeout
    if smallfile.is_windows_os:
        host_timeout += 20  # Windows hosts take longer to start up

    try:
        # FIXME: make timeout criteria be that new new hosts seen in X seconds
        while sec < host_timeout:
            ndirlist = os.listdir(master_invoke.network_dir)
            if master_invoke.verbose:
                print('shared dir list: ' + str(ndirlist))
            hosts_ready = True
            if os.path.exists(abortfn):
                raise Exception('worker host signaled abort')
            # resume the scan at the first host not yet seen
            for j in range(last_host_seen + 1, len(prm_host_set)):
                h = prm_host_set[j]
                fn = master_invoke.gen_host_ready_fname(h.strip())
                if verbose:
                    print('checking for host filename ' + fn)
                if not os.path.exists(fn):
                    hosts_ready = False
                    break
                last_host_seen = j
            if hosts_ready:
                break

            # be patient for large tests
            # give user some feedback about how many hosts
            # have arrived at the starting gate

            time.sleep(sec_delta)
            sec += sec_delta
            sec_delta += 1  # back off the poll interval as the wait lengthens
            if verbose:
                print('last_host_seen=%d sec=%d' % (last_host_seen, sec))
    except KeyboardInterrupt as e:
        print('saw SIGINT signal, aborting test')
        exception_seen = e
    except Exception as e:
        exception_seen = e
        hosts_ready = False
    if not hosts_ready:
        smallfile.abort_test(abortfn, [])
        if exception_seen is None:
            raise Exception(
                'hosts did not reach starting gate within %d seconds'
                % host_timeout)
        else:
            # BUG FIX: 'e' is unbound here in python3 -- the except-clause
            # target is deleted when its handler exits (PEP 3110) -- so the
            # original str(e) raised NameError; use the saved exception
            print('saw exception %s, aborting test' % str(exception_seen))
    else:
        # ask all hosts to start the test
        # this is like firing the gun at the track meet
        try:
            sync_files.write_sync_file(starting_gate, 'hi')
            if verbose:
                print('starting gate file %s created' % starting_gate)
        except IOError as e:
            print('error writing starting gate: %s' % os.strerror(e.errno))

    # wait for them to finish

    all_ok = True
    for t in remote_thread_list:
        t.join()
        if t.status != OK:
            all_ok = False
            print('ERROR: ssh thread for host %s completed with status %d'
                  % (t.remote_host, t.status))

    # attempt to aggregate results by reading pickle files
    # containing smf_invocation instances with counters and times that we need

    try:
        invoke_list = []
        for h in prm_host_set:  # for each host in test

            # read results for each thread run in that host
            # from python pickle of the list of smf_invocation objects

            pickle_fn = master_invoke.host_result_filename(h)
            if verbose:
                print('reading pickle file: %s' % pickle_fn)
            host_invoke_list = []
            try:
                # give this (NFS) client a moment to see the other
                # clients' result files before declaring them missing
                if not os.path.exists(pickle_fn):
                    time.sleep(1.2)
                with open(pickle_fn, 'rb') as pickle_file:
                    host_invoke_list = pickle.load(pickle_file)
                if verbose:
                    print(' read %d invoke objects' % len(host_invoke_list))
                invoke_list.extend(host_invoke_list)
                ensure_deleted(pickle_fn)
            except IOError as e:
                if e.errno != errno.ENOENT:
                    raise e
                print('  pickle file %s not found' % pickle_fn)

        output_results.output_results(invoke_list, prm_host_set,
                                      prm.thread_count, pct_files_min)

    except IOError as e:
        print('host %s filename %s: %s' % (h, pickle_fn, str(e)))
        all_ok = False
    except KeyboardInterrupt as e:
        print('control-C signal seen (SIGINT)')
        all_ok = False
    if not all_ok:
        sys.exit(NOTOK)
    sys.exit(OK)
def run_multi_thread_workload(prm):
    """Run the workload with multiple worker threads on this one host.

    Creates one worker per prm.thread_count, synchronizes their start via
    thread-ready files and a starting-gate file, waits for them to finish,
    then either prints results (standalone mode) or pickles them for the
    multi-host test driver (slave mode).  Terminates the process via
    sys.exit; does not return.

    :param prm: test parameter object carrying master_invoke, is_slave,
                thread_count, startup timeouts, etc.
    """

    master_invoke = prm.master_invoke
    prm_slave = prm.is_slave
    verbose = master_invoke.verbose
    host = master_invoke.onhost

    # standalone mode: this host is responsible for creating the test dirs
    if not prm_slave:
        sync_files.create_top_dirs(master_invoke, False)

    if prm_slave:
        # slave mode: the driver host created the dirs; listing them here
        # forces this (NFS) client to refresh its view of them
        time.sleep(1.1)
        os.listdir(master_invoke.network_dir)
        for dlist in [master_invoke.src_dirs, master_invoke.dest_dirs]:
            for d in dlist:
                os.listdir(d)  # hack to ensure that
                if verbose:
                    print(host + " saw " + d)

    # for each thread set up smf_invocation instance,
    # create a thread instance, and delete the thread-ready file

    thread_list = create_worker_list(prm)
    starting_gate = thread_list[0].invoke.starting_gate
    my_host_invoke = thread_list[0].invoke

    # start threads, wait for them to reach starting gate
    # to do this, look for thread-ready files

    for t in thread_list:
        ensure_deleted(t.invoke.gen_thread_ready_fname(t.invoke.tid))
    for t in thread_list:
        t.start()
    if verbose:
        print("started %d worker threads on host %s" % (len(thread_list), host))

    # wait for all threads to reach the starting gate
    # this makes it more likely that they will start simultaneously

    abort_fname = my_host_invoke.abort_fn()
    threads_ready = False  # really just to set scope of variable
    k = 0
    # poll at 0.5-sec intervals, so 2x the timeout in iterations
    for sec in range(0, prm.startup_timeout * 2):
        threads_ready = True
        for t in thread_list:
            fn = t.invoke.gen_thread_ready_fname(t.invoke.tid)
            if not os.path.exists(fn):
                threads_ready = False
                break
        if threads_ready:
            break
        if os.path.exists(abort_fname):
            break
        if verbose:
            print("threads not ready...")
        time.sleep(0.5)

    # if all threads didn't make it to the starting gate

    if not threads_ready:
        abort_test(abort_fname, thread_list)
        raise Exception("threads did not reach starting gate within %d sec" % prm.startup_timeout)

    # declare that this host is at the starting gate

    if prm_slave:
        host_ready_fn = my_host_invoke.gen_host_ready_fname()
        if my_host_invoke.verbose:
            print("host %s creating ready file %s" % (my_host_invoke.onhost, host_ready_fn))
        smallfile.touch(host_ready_fn)

    sg = my_host_invoke.starting_gate
    if not prm_slave:  # special case of no --host-set parameter
        try:
            sync_files.write_sync_file(sg, "hi there")
            if verbose:
                print("wrote starting gate file")
        except IOError as e:
            print("error writing starting gate for threads: %s" % str(e))

    # wait for starting_gate file to be created by test driver
    # every second we resume scan from last host file not found

    if verbose:
        print("awaiting " + sg)
    if prm_slave:
        for sec in range(0, prm.host_startup_timeout * 2):
            if os.path.exists(sg):
                break
            time.sleep(0.5)
        if not os.path.exists(sg):
            abort_test(my_host_invoke.abort_fn(), thread_list)
            raise Exception("starting signal not seen within %d seconds" % prm.host_startup_timeout)
    if verbose:
        print("starting test on host " + host + " in 2 seconds")
    # random jitter presumably desynchronizes hosts slightly -- TODO confirm
    time.sleep(2 + random.random())

    # FIXME: don't timeout the test,
    # instead check thread progress and abort if you see any of them stalled
    # for long enough
    # problem is: if servers are heavily loaded you can't use filesystem to communicate this

    # wait for all threads on this host to finish

    for t in thread_list:
        if verbose:
            print("waiting for thread %s" % t.invoke.tid)
        t.invoke = t.receiver.recv()  # must do this to get results from sub-process
        t.join()

    # if not a slave of some other host, print results (for this host)

    exit_status = OK
    if not prm_slave:
        try:
            # FIXME: code to aggregate results from list of invoke objects can be shared by multi-host and single-host cases
            invoke_list = [t.invoke for t in thread_list]
            output_results.output_results(invoke_list, ["localhost"], prm.thread_count, smallfile.pct_files_min)
        except SMFResultException as e:
            print("ERROR: " + str(e))
            exit_status = NOTOK

    else:
        # if we are participating in a multi-host test
        # then write out this host's result in pickle format so test driver can pick up result

        result_filename = master_invoke.host_result_filename(prm.as_host)
        if verbose:
            print("writing invokes to: " + result_filename)
        invok_list = [t.invoke for t in thread_list]
        if verbose:
            print("saving result to filename %s" % result_filename)
        # drop the large I/O buffers so the pickle stays small
        for ivk in invok_list:
            ivk.buf = None
            ivk.biggest_buf = None
        sync_files.write_pickle(result_filename, invok_list)
        time.sleep(1.2)  # for benefit of NFS with actimeo=1

    sys.exit(exit_status)
Esempio n. 15
0
def run_multi_host_workload(prm):
    """Drive a distributed smallfile test across all hosts in prm.host_set.

    Writes the test parameters to a shared pickle file, launches one ssh
    thread (or Windows launcher thread) per host to run smallfile_remote.py,
    waits for every host to check in at the starting gate, fires the starting
    gate, joins the per-host threads, then aggregates the pickled per-host
    result lists and reports them.  Terminates the process via sys.exit with
    OK on success or NOTOK on failure; does not return.

    :param prm: test parameter object carrying host_set, master_invoke,
                remote_pgm_dir, thread_count, timeouts, etc.
    """

    prm_host_set = prm.host_set
    prm_slave = prm.is_slave
    prm_permute_host_dirs = prm.permute_host_dirs
    master_invoke = prm.master_invoke

    starting_gate = master_invoke.starting_gate
    verbose = master_invoke.verbose
    host = master_invoke.onhost

    # construct list of ssh threads to invoke in parallel;
    # first publish the parameters where every remote host can read them

    sync_files.create_top_dirs(master_invoke, True)
    pickle_fn = os.path.join(prm.master_invoke.network_dir, 'param.pickle')
    sync_files.write_pickle(pickle_fn, prm)

    # choose the interpreter remote hosts will run:
    # PYPY env. var overrides, otherwise match this interpreter's major version
    if os.getenv('PYPY'):
        python_prog = os.getenv('PYPY')
    elif sys.version.startswith('2'):
        python_prog = 'python'
    elif sys.version.startswith('3'):
        python_prog = 'python3'
    else:
        raise Exception('unrecognized python version %s' % sys.version)

    remote_thread_list = []
    host_ct = len(prm_host_set)
    for j in range(0, len(prm_host_set)):
        remote_host = prm_host_set[j]
        smf_remote_pgm = os.path.join(prm.remote_pgm_dir,
                                      'smallfile_remote.py')
        this_remote_cmd = '%s %s --network-sync-dir %s ' % \
            (python_prog, smf_remote_pgm, prm.master_invoke.network_dir)

        if prm_permute_host_dirs:
            # each host operates on the directories of the next host in the set
            this_remote_cmd += ' --as-host %s' % prm_host_set[(j + 1) %
                                                              host_ct]
        else:
            this_remote_cmd += ' --as-host %s' % remote_host
        if verbose:
            print(this_remote_cmd)
        if smallfile.is_windows_os:
            remote_thread_list.append(
                launcher_thread.launcher_thread(prm, remote_host,
                                                this_remote_cmd))
        else:
            remote_thread_list.append(
                ssh_thread.ssh_thread(remote_host, this_remote_cmd))

    # start them, pacing starts so that we don't get ssh errors

    for t in remote_thread_list:
        t.start()

    # wait for hosts to arrive at starting gate
    # if only one host, then no wait will occur
    # as starting gate file is already present
    # every second we resume scan from last host file not found
    # FIXME: for very large host sets, timeout only if no host responds
    # within X seconds

    exception_seen = None
    hosts_ready = False  # set scope outside while loop
    abortfn = master_invoke.abort_fn()
    last_host_seen = -1
    sec = 0
    sec_delta = 0.5
    host_timeout = prm.host_startup_timeout
    if smallfile.is_windows_os:
        host_timeout += 20  # Windows hosts take longer to start up

    try:
        # FIXME: make timeout criteria be that new new hosts seen in X seconds
        while sec < host_timeout:
            ndirlist = os.listdir(master_invoke.network_dir)
            if master_invoke.verbose:
                print('shared dir list: ' + str(ndirlist))
            hosts_ready = True
            if os.path.exists(abortfn):
                raise Exception('worker host signaled abort')
            # resume the scan at the first host not yet seen
            for j in range(last_host_seen + 1, len(prm_host_set)):
                h = prm_host_set[j]
                fn = master_invoke.gen_host_ready_fname(h.strip())
                if verbose:
                    print('checking for host filename ' + fn)
                if not os.path.exists(fn):
                    hosts_ready = False
                    break
                last_host_seen = j
            if hosts_ready:
                break

            # be patient for large tests
            # give user some feedback about how many hosts
            # have arrived at the starting gate

            time.sleep(sec_delta)
            sec += sec_delta
            sec_delta += 1  # back off the poll interval as the wait lengthens
            if verbose:
                print('last_host_seen=%d sec=%d' % (last_host_seen, sec))
    except KeyboardInterrupt as e:
        print('saw SIGINT signal, aborting test')
        exception_seen = e
    except Exception as e:
        exception_seen = e
        hosts_ready = False
    if not hosts_ready:
        smallfile.abort_test(abortfn, [])
        if exception_seen is None:
            raise Exception(
                'hosts did not reach starting gate within %d seconds' %
                host_timeout)
        else:
            # BUG FIX: 'e' is unbound here in python3 -- the except-clause
            # target is deleted when its handler exits (PEP 3110) -- so the
            # original str(e) raised NameError; use the saved exception
            print('saw exception %s, aborting test' % str(exception_seen))
    else:
        # ask all hosts to start the test
        # this is like firing the gun at the track meet
        try:
            sync_files.write_sync_file(starting_gate, 'hi')
            if verbose:
                print('starting gate file %s created' % starting_gate)
        except IOError as e:
            print('error writing starting gate: %s' % os.strerror(e.errno))

    # wait for them to finish

    all_ok = True
    for t in remote_thread_list:
        t.join()
        if t.status != OK:
            all_ok = False
            print('ERROR: ssh thread for host %s completed with status %d' %
                  (t.remote_host, t.status))

    # attempt to aggregate results by reading pickle files
    # containing smf_invocation instances with counters and times that we need

    try:
        invoke_list = []
        for h in prm_host_set:  # for each host in test

            # read results for each thread run in that host
            # from python pickle of the list of smf_invocation objects

            pickle_fn = master_invoke.host_result_filename(h)
            if verbose:
                print('reading pickle file: %s' % pickle_fn)
            host_invoke_list = []
            try:
                # give this (NFS) client a moment to see the other
                # clients' result files before declaring them missing
                if not os.path.exists(pickle_fn):
                    time.sleep(1.2)
                with open(pickle_fn, 'rb') as pickle_file:
                    host_invoke_list = pickle.load(pickle_file)
                if verbose:
                    print(' read %d invoke objects' % len(host_invoke_list))
                invoke_list.extend(host_invoke_list)
                ensure_deleted(pickle_fn)
            except IOError as e:
                if e.errno != errno.ENOENT:
                    raise e
                print('  pickle file %s not found' % pickle_fn)

        output_results.output_results(invoke_list, prm_host_set,
                                      prm.thread_count, pct_files_min)

    except IOError as e:
        print('host %s filename %s: %s' % (h, pickle_fn, str(e)))
        all_ok = False
    except KeyboardInterrupt as e:
        print('control-C signal seen (SIGINT)')
        all_ok = False
    if not all_ok:
        sys.exit(NOTOK)
    sys.exit(OK)
Esempio n. 16
0
startup_timeout = 20
dirs = master_invoke.iterations / master_invoke.files_per_dir
if dirs > 20:
  dir_creation_timeout = dirs / 10
  print 'extending initialization timeout by %d seconds for directory creation'%dir_creation_timeout
  startup_timeout += dir_creation_timeout
host_startup_timeout = startup_timeout + 5

# for multi-host test

if prm_host_set and not prm_slave:

  # construct list of ssh threads to invoke in parallel

  ssh_thread_list = []
  smallfile.ensure_deleted(starting_gate)
  host_ct = len(prm_host_set)
  for j in range(0, len(prm_host_set)):
        n = prm_host_set[j]
        if prm_permute_host_dirs:
          remote_cmd += ' --as-host %s'%prm_host_set[(j+1)%host_ct]
        remote_cmd += ' --slave Y '
        if verbose: print remote_cmd
        pickle_fn = gen_host_result_filename(top_dir, short_hostname(n))
        smallfile.ensure_deleted(pickle_fn)
        ssh_thread_list.append(ssh_thread.ssh_thread(n, remote_cmd))
  time.sleep(2) # give other clients time to see changes

  # start them, pacing starts so that we don't get ssh errors

  for t in ssh_thread_list: