def run(self):
    """Post the remote command in a launch file and wait for the host's result.

    Writes ``self.remote_cmd`` to ``<network_dir>/<remote_host>.smf_launch``
    and then polls once per second until the result file for
    ``self.remote_host`` exists.

    ``self.status`` ends up as ``master_invoke.OK`` once the result file is
    seen.  It stays ``master_invoke.NOTOK`` if the abort file shows up first
    or if anything in this method raises.
    """
    master_invoke = self.prm.master_invoke

    # premature exit means failure; set this before doing any file work so
    # that an exception while cleaning up or writing the launch file still
    # leaves a NOTOK status behind for whoever joins this thread
    self.status = master_invoke.NOTOK

    launch_fn = os.path.join(master_invoke.network_dir,
                             self.remote_host) + '.smf_launch'
    pickle_fn = master_invoke.host_result_filename(self.remote_host)
    abortfn = master_invoke.abort_fn()

    # remove leftovers so that files seen below belong to this run
    ensure_deleted(launch_fn)
    ensure_deleted(pickle_fn)

    write_sync_file(launch_fn, self.remote_cmd)
    if master_invoke.verbose:
        print('wrote command %s to launch file %s'
              % (self.remote_cmd, launch_fn))

    # wait for the result file; give up as soon as an abort is signaled
    while not os.path.exists(pickle_fn):
        if os.path.exists(abortfn):
            if master_invoke.verbose:
                print('test abort seen by host ' + self.remote_host)
            return
        time.sleep(1.0)
    self.status = master_invoke.OK  # success!
def run_multi_thread_workload(prm):
    """Run all worker threads for this host and report or save their results.

    Steps, in order:
      1. create (or, when ``prm.is_slave``, make sure we can see) the
         working directories
      2. start every worker returned by ``create_worker_list(prm)`` and wait
         for each one's thread-ready file
      3. when ``prm.is_slave``, touch the host-ready file and wait for the
         starting gate file; otherwise write the starting gate file ourselves
      4. collect each worker's result object and join it
      5. print results (not a slave) or pickle them to the host result file
         (slave)

    Raises:
        Exception: if not all threads reach the starting gate in time, or
            if the starting gate file is not seen in time (slave only).
            The abort file is written via ``abort_test`` first.

    Does not return normally: ends with ``sys.exit(exit_status)``.
    """
    master_invoke = prm.master_invoke
    prm_slave = prm.is_slave
    verbose = master_invoke.verbose
    host = master_invoke.onhost

    if not prm_slave:
        sync_files.create_top_dirs(master_invoke, False)

    if prm_slave:
        time.sleep(1.1)
        for d in master_invoke.top_dirs:
            ensure_dir_exists(d)
        for dlist in [master_invoke.src_dirs, master_invoke.dest_dirs]:
            for d in dlist:
                ensure_dir_exists(d)
                if verbose:
                    print(host + ' saw ' + str(d))

    # for each thread set up SmallfileWorkload instance,
    # create a thread instance, and delete the thread-ready file

    thread_list = create_worker_list(prm)
    my_host_invoke = thread_list[0].invoke

    # start threads, wait for them to reach starting gate
    # to do this, look for thread-ready files

    for t in thread_list:
        ensure_deleted(t.invoke.gen_thread_ready_fname(t.invoke.tid))
    for t in thread_list:
        t.start()
    if verbose:
        print('started %d worker threads on host %s'
              % (len(thread_list), host))

    # wait for all threads to reach the starting gate
    # this makes it more likely that they will start simultaneously

    startup_timeout = prm.startup_timeout
    if smallfile.is_windows_os:
        print('adding time for Windows synchronization')
        startup_timeout += 30
    abort_fname = my_host_invoke.abort_fn()
    thread_count = len(thread_list)
    thread_to_wait_for = 0
    # two polls per second, so startup_timeout * 2 polls == startup_timeout sec
    for sec in range(0, startup_timeout * 2):
        # resume the scan at the first thread not yet seen as ready
        for k in range(thread_to_wait_for, thread_count):
            t = thread_list[k]
            fn = t.invoke.gen_thread_ready_fname(t.invoke.tid)
            if not os.path.exists(fn):
                if verbose:
                    print('thread %d thread-ready file %s not found...'
                          % (k, fn))
                break
            thread_to_wait_for = k + 1
        if thread_to_wait_for == thread_count:
            break
        if os.path.exists(abort_fname):
            break
        time.sleep(0.5)

    # if all threads didn't make it to the starting gate

    if thread_to_wait_for < thread_count:
        abort_test(abort_fname, thread_list)
        raise Exception('only %d threads reached starting gate within %d sec'
                        % (thread_to_wait_for, startup_timeout))

    # declare that this host is at the starting gate

    if prm_slave:
        host_ready_fn = my_host_invoke.gen_host_ready_fname()
        if my_host_invoke.verbose:
            print('host %s creating ready file %s'
                  % (my_host_invoke.onhost, host_ready_fn))
        smallfile.touch(host_ready_fn)

    sg = my_host_invoke.starting_gate
    if not prm_slave:  # special case of no --host-set parameter
        try:
            sync_files.write_sync_file(sg, 'hi there')
            if verbose:
                print('wrote starting gate file')
        except IOError as e:
            print('error writing starting gate for threads: %s' % str(e))

    # wait for starting_gate file to be created by test driver

    if verbose:
        print('awaiting ' + sg)
    if prm_slave:
        # Poll twice a second for host_startup_timeout + 10 seconds.
        # The previous loop did (host_startup_timeout + 10) polls of 0.5 sec,
        # i.e. only half that long, so for timeouts above 10 sec this host
        # could give up before the advertised host_startup_timeout elapsed.
        poll_interval = 0.5
        max_polls = int((prm.host_startup_timeout + 10) / poll_interval)
        for sec in range(0, max_polls):
            if os.path.exists(sg):
                break
            time.sleep(poll_interval)
        if not os.path.exists(sg):
            abort_test(my_host_invoke.abort_fn(), thread_list)
            raise Exception('starting signal not seen within %d seconds'
                            % prm.host_startup_timeout)
    if verbose:
        print('starting test on host ' + host + ' in 2 seconds')
    time.sleep(2 + random.random())  # let other hosts see starting gate file

    # FIXME: don't timeout the test,
    # instead check thread progress and abort if you see any of them stalled
    # but if servers are heavily loaded you can't rely on filesystem

    # wait for all threads on this host to finish

    for t in thread_list:
        if verbose:
            print('waiting for thread %s' % t.invoke.tid)
        t.invoke = t.receiver.recv()  # to get results from sub-process
        t.join()

    # if not a slave of some other host, print results (for this host)

    exit_status = OK
    if not prm_slave:
        try:
            invoke_list = [t.invoke for t in thread_list]
            output_results.output_results(invoke_list, prm)
        except SMFResultException as e:
            print('ERROR: ' + str(e))
            exit_status = NOTOK
    else:
        # if we are participating in a multi-host test
        # then write out this host's result in pickle format
        # so test driver can pick up result

        result_filename = master_invoke.host_result_filename(prm.as_host)
        if verbose:
            print('writing invokes to: ' + result_filename)
        invok_list = [t.invoke for t in thread_list]
        if verbose:
            print('saving result to filename %s' % result_filename)
        # drop the buffers before pickling the result objects
        for ivk in invok_list:
            ivk.buf = None
            ivk.biggest_buf = None
        sync_files.write_pickle(result_filename, invok_list)
        time.sleep(1.2)  # for benefit of NFS with actimeo=1

    sys.exit(exit_status)
def run_multi_host_workload(prm):
    """Drive a test across every host in ``prm.host_set`` and aggregate results.

    Steps, in order:
      1. create the top directories and pickle ``prm`` to ``param.pickle``
         in the network sync directory
      2. build one launcher/ssh thread per host and start them
      3. wait for every host's ready file, then write the starting gate file
      4. join all threads, read each host's result pickle and pass the
         combined list to ``output_results.output_results``

    Raises:
        Exception: if the Python version is not recognized, or if the hosts
            do not reach the starting gate and no other exception was seen
            while waiting.

    Does not return normally: ends with ``sys.exit(all_ok)``.
    """
    prm_host_set = prm.host_set
    prm_permute_host_dirs = prm.permute_host_dirs
    master_invoke = prm.master_invoke

    starting_gate = master_invoke.starting_gate
    verbose = master_invoke.verbose

    # construct list of ssh threads to invoke in parallel

    sync_files.create_top_dirs(master_invoke, True)
    pickle_fn = os.path.join(prm.master_invoke.network_dir, 'param.pickle')
    sync_files.write_pickle(pickle_fn, prm)

    if os.getenv('PYPY'):
        python_prog = os.getenv('PYPY')
    elif sys.version.startswith('2'):
        python_prog = 'python'
    elif sys.version.startswith('3'):
        python_prog = 'python3'
    else:
        raise Exception('unrecognized python version %s' % sys.version)

    remote_thread_list = []
    host_ct = len(prm_host_set)
    for j in range(0, len(prm_host_set)):
        remote_host = prm_host_set[j]
        smf_remote_pgm = os.path.join(prm.remote_pgm_dir,
                                      'smallfile_remote.py')
        this_remote_cmd = '%s %s --network-sync-dir %s ' \
            % (python_prog, smf_remote_pgm, prm.master_invoke.network_dir)

        if prm_permute_host_dirs:
            # each host runs as the next host in the set (wrapping around)
            this_remote_cmd += \
                ' --as-host %s' % prm_host_set[(j + 1) % host_ct]
        else:
            this_remote_cmd += ' --as-host %s' % remote_host
        if verbose:
            print(this_remote_cmd)
        if smallfile.is_windows_os or prm.launch_by_daemon:
            remote_thread_list.append(
                launcher_thread.launcher_thread(prm,
                                                remote_host,
                                                this_remote_cmd))
        else:
            remote_thread_list.append(
                ssh_thread.ssh_thread(remote_host, this_remote_cmd))

    # start them

    for t in remote_thread_list:
        if not prm.launch_by_daemon:
            # pace starts so that we don't get ssh errors
            time.sleep(0.1)
        t.start()

    # wait for hosts to arrive at starting gate
    # if only one host, then no wait will occur
    # as starting gate file is already present
    # every pass we resume scan from last host file not found
    # FIXME: for very large host sets,
    # timeout only if no host responds within X seconds

    exception_seen = None
    hosts_ready = False  # set scope outside while loop
    abortfn = master_invoke.abort_fn()
    last_host_seen = -1
    sec = 0.0
    sec_delta = 0.5
    host_timeout = prm.host_startup_timeout
    if smallfile.is_windows_os:
        host_timeout += 20

    try:
        while sec < host_timeout:
            # HACK to force directory entry coherency for Gluster
            ndirlist = os.listdir(master_invoke.network_dir)
            if master_invoke.verbose:
                print('shared dir list: ' + str(ndirlist))
            hosts_ready = True
            if os.path.exists(abortfn):
                raise Exception('worker host signaled abort')
            for j in range(last_host_seen + 1, len(prm_host_set)):
                h = prm_host_set[j]
                fn = master_invoke.gen_host_ready_fname(h.strip())
                if verbose:
                    print('checking for host filename ' + fn)
                if not os.path.exists(fn):
                    hosts_ready = False
                    break
                last_host_seen = j  # saw this host's ready file
                # we exit while loop only if no hosts in host_timeout seconds
                sec = 0.0
            if hosts_ready:
                break

            # if one of ssh threads has died, no reason to continue
            # (is_alive() replaces isAlive(), which Python 3.9 removed)

            kill_remaining_threads = False
            for t in remote_thread_list:
                if not t.is_alive():
                    print('thread %s on host %s has died'
                          % (t, t.remote_host))
                    kill_remaining_threads = True
                    break
            if kill_remaining_threads:
                break

            # be patient for large tests
            # give user some feedback about
            # how many hosts have arrived at the starting gate

            time.sleep(sec_delta)
            sec += sec_delta
            sec_delta += 1  # back off: each wait is 1 sec longer
            if verbose:
                print('last_host_seen=%d sec=%d' % (last_host_seen, sec))
    except KeyboardInterrupt as e:
        print('saw SIGINT signal, aborting test')
        exception_seen = e
        hosts_ready = False  # never fire the starting gate after SIGINT
    except Exception as e:
        exception_seen = e
        hosts_ready = False

    if not hosts_ready:
        smallfile.abort_test(abortfn, [])
        # report the first host whose ready file has not been seen; computed
        # from last_host_seen because the scan loop may never have run
        next_host_idx = last_host_seen + 1
        if next_host_idx < len(prm_host_set):
            print('ERROR: host %s did not reach starting gate'
                  % prm_host_set[next_host_idx])
        if exception_seen is None:
            raise Exception('hosts did not reach starting gate ' +
                            'within %d seconds' % host_timeout)
        else:
            # use exception_seen: the "as e" name is unbound once the
            # except clause has finished (Python 3)
            print('saw exception %s, aborting test' % str(exception_seen))
    else:
        # ask all hosts to start the test
        # this is like firing the gun at the track meet
        try:
            sync_files.write_sync_file(starting_gate, 'hi')
            print('starting all threads by creating starting gate file %s'
                  % starting_gate)
        except IOError as e:
            print('error writing starting gate: %s' % os.strerror(e.errno))

    # wait for them to finish

    for t in remote_thread_list:
        t.join()
        if t.status != OK:
            print('ERROR: ssh thread for host %s completed with status %d'
                  % (t.remote_host, t.status))

    # attempt to aggregate results by reading pickle files
    # containing SmallfileWorkload instances
    # with counters and times that we need

    all_ok = NOTOK
    h = None  # keeps the IOError message below safe for an empty host set
    try:
        invoke_list = []
        one_shot_delay = True
        for h in prm_host_set:  # for each host in test

            # read results for each thread run in that host
            # from python pickle of the list of SmallfileWorkload objects

            pickle_fn = master_invoke.host_result_filename(h)
            if verbose:
                print('reading pickle file: %s' % pickle_fn)
            host_invoke_list = []
            try:
                if one_shot_delay and not os.path.exists(pickle_fn):

                    # all threads have joined already, they are done
                    # we allow > 1 sec
                    # for this (NFS) client to see other clients' files

                    time.sleep(1.2)
                    one_shot_delay = False
                # NOTE: pickle.load runs whatever the file tells it to; these
                # files come from the shared sync directory, so every host
                # that can write there must be trusted
                with open(pickle_fn, 'rb') as pickle_file:
                    host_invoke_list = pickle.load(pickle_file)
                if verbose:
                    print(' read %d invoke objects' % len(host_invoke_list))
                invoke_list.extend(host_invoke_list)
                ensure_deleted(pickle_fn)
            except IOError as e:
                if e.errno != errno.ENOENT:
                    raise e
                print(' pickle file %s not found' % pickle_fn)

        output_results.output_results(invoke_list, prm)
        all_ok = OK
    except IOError as e:
        print('host %s filename %s: %s' % (h, pickle_fn, str(e)))
    except KeyboardInterrupt as e:
        print('control-C signal seen (SIGINT)')
    except SMFResultException as e:
        print(str(e))

    sys.exit(all_ok)
def run_multi_host_workload(prm, log):
    """Drive an fs-drift test across ``prm.host_set`` and aggregate results.

    Builds one launcher/ssh thread per host, starts them, waits for every
    host's ready file, writes the starting gun file, joins the threads, and
    then reads each host's result pickle and hands the combined list to
    ``output_results.output_results``.

    Args:
        prm: test parameters (``host_set``, ``abort_path``,
            ``starting_gun_path``, ``launch_as_daemon``, ... are read here).
        log: logger used for all messages.

    Returns:
        ``OK`` on success.  ``NOTOK`` if the hosts did not all reach the
        starting gate in time, or if aggregating results failed with
        IOError, KeyboardInterrupt or FsDriftException.

    Raises:
        Exception: if the Python version is not recognized.
        The exception seen while waiting for hosts (including
        KeyboardInterrupt), after the abort file is written.
        IOError: if the starting gun file cannot be written.
    """
    # construct list of ssh threads to invoke in parallel

    if os.getenv('PYPY'):
        python_prog = os.getenv('PYPY')
    elif sys.version.startswith('2'):
        python_prog = 'python'
    elif sys.version.startswith('3'):
        python_prog = 'python3'
    else:
        raise Exception('unrecognized python version %s' % sys.version)

    # NOTE(review): python_prog is only logged; the remote command below is
    # built from prm.python_prog.  Confirm which one is intended.
    log.debug('python_prog = %s' % python_prog)

    remote_thread_list = []
    for j in range(0, len(prm.host_set)):
        remote_host = prm.host_set[j]
        fsd_remote_pgm = os.path.join(prm.fsd_remote_dir,
                                      'fs-drift-remote.py')
        this_remote_cmd = '%s %s --network-sync-dir %s ' \
            % (prm.python_prog, fsd_remote_pgm, prm.network_shared_path)

        this_remote_cmd += ' --as-host %s' % remote_host
        log.debug(this_remote_cmd)
        if prm.launch_as_daemon:
            remote_thread_list.append(
                launcher_thread.launcher_thread(prm, log,
                                                remote_host,
                                                this_remote_cmd))
        else:
            remote_thread_list.append(
                ssh_thread.ssh_thread(log, remote_host, this_remote_cmd))

    # start them, pacing starts so that we don't get ssh errors;
    # ssh is only used when we are NOT launching through the daemon,
    # so that is the case that needs the delay

    for t in remote_thread_list:
        if not prm.launch_as_daemon:
            time.sleep(0.1)
        t.start()

    # wait for hosts to arrive at starting gate
    # if only one host, then no wait will occur
    # as starting gate file is already present
    # every pass we resume scan from last host file not found

    exception_seen = None
    abortfn = prm.abort_path
    sec_delta = 0.5
    # timeout if no host replies in next per_host_timeout seconds,
    # or if the whole wait exceeds all_host_timeout seconds
    per_host_timeout = 10.0
    all_host_timeout = 5.0 + len(prm.host_set) / 3
    if all_host_timeout < per_host_timeout:
        per_host_timeout = all_host_timeout / 2

    hosts_ready = False  # set scope outside while loop
    last_host_seen = -1
    sec = 0.0
    start_loop_start = time.time()
    try:
        while sec < per_host_timeout:
            hosts_ready = True
            if os.path.exists(abortfn):
                raise FsDriftException('worker host signaled abort')
            for j in range(last_host_seen + 1, len(prm.host_set)):
                h = prm.host_set[j]
                fn = multi_thread_workload.gen_host_ready_fname(prm,
                                                                h.strip())
                log.debug('checking for host filename ' + fn)
                if not os.path.exists(fn):
                    log.info('did not see host filename %s after %f sec'
                             % (fn, sec))
                    hosts_ready = False
                    break
                log.debug('saw host filename ' + fn)
                last_host_seen = j  # saw this host's ready file
                # we exit while loop only if no hosts in per_host_timeout
                # seconds
                sec = 0.0
            if hosts_ready:
                break

            # if one of ssh threads has died, no reason to continue

            kill_remaining_threads = False
            for t in remote_thread_list:
                if not t.is_alive():
                    log.error('thread %s has died' % t)
                    kill_remaining_threads = True
                    break
            if kill_remaining_threads:
                break

            # be patient for large tests
            # give user some feedback about
            # how many hosts have arrived at the starting gate

            time.sleep(sec_delta)
            sec += sec_delta
            time_since_loop_start = time.time() - start_loop_start
            log.debug('last_host_seen=%d sec=%d' % (last_host_seen, sec))
            if time_since_loop_start > all_host_timeout:
                kill_remaining_threads = True
                break
    except KeyboardInterrupt as e:
        log.error('saw SIGINT signal, aborting test')
        exception_seen = e
        hosts_ready = False  # never fire the starting gun after SIGINT
    except Exception as e:
        exception_seen = e
        log.exception(e)
        hosts_ready = False

    if not hosts_ready:
        multi_thread_workload.abort_test(prm.abort_path, remote_thread_list)
        if exception_seen is None:
            log.info(
                'no additional hosts reached starting gate within %5.1f seconds'
                % per_host_timeout)
            return NOTOK
        else:
            raise exception_seen
    else:
        # ask all hosts to start the test
        # this is like firing the gun at the track meet
        try:
            sync_files.write_sync_file(prm.starting_gun_path, 'hi')
            log.debug('starting all threads by creating starting gun file %s'
                      % prm.starting_gun_path)
        except IOError as e:
            log.error('error writing starting gun file: %s'
                      % os.strerror(e.errno))
            multi_thread_workload.abort_test(prm.abort_path,
                                             remote_thread_list)
            raise e

    # wait for them to finish

    for t in remote_thread_list:
        t.join()
        if t.status != OK:
            log.error('ssh thread for host %s completed with status %d'
                      % (t.remote_host, t.status))

    # attempt to aggregate results by reading pickle files
    # containing SmallfileWorkload instances
    # with counters and times that we need

    try:
        invoke_list = []
        one_shot_delay = True
        for h in prm.host_set:  # for each host in test

            # read results for each thread run in that host
            # from python pickle of the list of SmallfileWorkload objects

            pickle_fn = multi_thread_workload.host_result_filename(prm, h)
            log.debug('reading pickle file: %s' % pickle_fn)
            host_invoke_list = []
            try:
                if one_shot_delay and not os.path.exists(pickle_fn):

                    # all threads have joined already, they are done
                    # we allow > 1 sec
                    # for this (NFS) client to see other clients' files

                    time.sleep(1.2)
                    one_shot_delay = False
                host_invoke_list = read_pickle(pickle_fn)
                log.debug(' read %d invoke objects' % len(host_invoke_list))
                invoke_list.extend(host_invoke_list)
                ensure_deleted(pickle_fn)
            except IOError as e:
                if e.errno != errno.ENOENT:
                    raise e
                log.error(' pickle file %s not found' % pickle_fn)

        output_results.output_results(prm, invoke_list)
    except IOError as e:
        log.exception(e)
        log.error('host %s filename %s: %s' % (h, pickle_fn, str(e)))
        return NOTOK
    except KeyboardInterrupt as e:
        log.error('control-C signal seen (SIGINT)')
        return NOTOK
    except FsDriftException as e:
        log.exception(e)
        return NOTOK
    return (OK)
def run_multi_host_workload(prm):
    """Drive a test across every host in ``prm.host_set`` and aggregate results.

    Steps, in order:
      1. create the top directories and pickle ``prm`` to ``param.pickle``
         in the network sync directory
      2. build one launcher/ssh thread per host and start them
      3. wait for every host's ready file, then write the starting gate file
      4. join all threads, read each host's result pickle and pass the
         combined list to ``output_results.output_results``

    Raises:
        Exception: if the Python version is not recognized, or if the hosts
            do not reach the starting gate and no other exception was seen
            while waiting.

    Does not return normally: ends with ``sys.exit(all_ok)``.
    """
    prm_host_set = prm.host_set
    prm_permute_host_dirs = prm.permute_host_dirs
    master_invoke = prm.master_invoke

    starting_gate = master_invoke.starting_gate
    verbose = master_invoke.verbose

    # construct list of ssh threads to invoke in parallel

    sync_files.create_top_dirs(master_invoke, True)
    pickle_fn = os.path.join(prm.master_invoke.network_dir, 'param.pickle')
    sync_files.write_pickle(pickle_fn, prm)

    if os.getenv('PYPY'):
        python_prog = os.getenv('PYPY')
    elif sys.version.startswith('2'):
        python_prog = 'python'
    elif sys.version.startswith('3'):
        python_prog = 'python3'
    else:
        raise Exception('unrecognized python version %s' % sys.version)

    remote_thread_list = []
    host_ct = len(prm_host_set)
    for j in range(0, len(prm_host_set)):
        remote_host = prm_host_set[j]
        smf_remote_pgm = os.path.join(prm.remote_pgm_dir,
                                      'smallfile_remote.py')
        this_remote_cmd = '%s %s --network-sync-dir %s ' \
            % (python_prog, smf_remote_pgm, prm.master_invoke.network_dir)

        if prm_permute_host_dirs:
            # each host runs as the next host in the set (wrapping around)
            this_remote_cmd += \
                ' --as-host %s' % prm_host_set[(j + 1) % host_ct]
        else:
            this_remote_cmd += ' --as-host %s' % remote_host
        if verbose:
            print(this_remote_cmd)
        if smallfile.is_windows_os or prm.launch_by_daemon:
            remote_thread_list.append(
                launcher_thread.launcher_thread(prm,
                                                remote_host,
                                                this_remote_cmd))
        else:
            remote_thread_list.append(
                ssh_thread.ssh_thread(remote_host, this_remote_cmd))

    # start them

    for t in remote_thread_list:
        if not prm.launch_by_daemon:
            # pace starts so that we don't get ssh errors
            time.sleep(0.1)
        t.start()

    # wait for hosts to arrive at starting gate
    # if only one host, then no wait will occur
    # as starting gate file is already present
    # every pass we resume scan from last host file not found
    # FIXME: for very large host sets,
    # timeout only if no host responds within X seconds

    exception_seen = None
    hosts_ready = False  # set scope outside while loop
    abortfn = master_invoke.abort_fn()
    last_host_seen = -1
    sec = 0.0
    sec_delta = 0.5
    host_timeout = prm.host_startup_timeout
    if smallfile.is_windows_os:
        host_timeout += 20

    try:
        while sec < host_timeout:
            # HACK to force directory entry coherency for Gluster
            ndirlist = os.listdir(master_invoke.network_dir)
            if master_invoke.verbose:
                print('shared dir list: ' + str(ndirlist))
            hosts_ready = True
            if os.path.exists(abortfn):
                raise Exception('worker host signaled abort')
            for j in range(last_host_seen + 1, len(prm_host_set)):
                h = prm_host_set[j]
                fn = master_invoke.gen_host_ready_fname(h.strip())
                if verbose:
                    print('checking for host filename ' + fn)
                if not os.path.exists(fn):
                    hosts_ready = False
                    break
                last_host_seen = j  # saw this host's ready file
                # we exit while loop only if no hosts in host_timeout seconds
                sec = 0.0
            if hosts_ready:
                break

            # if one of ssh threads has died, no reason to continue
            # (is_alive() replaces isAlive(), which Python 3.9 removed)

            kill_remaining_threads = False
            for t in remote_thread_list:
                if not t.is_alive():
                    print('thread %s on host %s has died'
                          % (t, t.remote_host))
                    kill_remaining_threads = True
                    break
            if kill_remaining_threads:
                break

            # be patient for large tests
            # give user some feedback about
            # how many hosts have arrived at the starting gate

            time.sleep(sec_delta)
            sec += sec_delta
            sec_delta += 1  # back off: each wait is 1 sec longer
            if verbose:
                print('last_host_seen=%d sec=%d' % (last_host_seen, sec))
    except KeyboardInterrupt as e:
        print('saw SIGINT signal, aborting test')
        exception_seen = e
        hosts_ready = False  # never fire the starting gate after SIGINT
    except Exception as e:
        exception_seen = e
        hosts_ready = False

    if not hosts_ready:
        smallfile.abort_test(abortfn, [])
        # report the first host whose ready file has not been seen; computed
        # from last_host_seen because the scan loop may never have run
        next_host_idx = last_host_seen + 1
        if next_host_idx < len(prm_host_set):
            print('ERROR: host %s did not reach starting gate'
                  % prm_host_set[next_host_idx])
        if exception_seen is None:
            raise Exception('hosts did not reach starting gate ' +
                            'within %d seconds' % host_timeout)
        else:
            # use exception_seen: the "as e" name is unbound once the
            # except clause has finished (Python 3)
            print('saw exception %s, aborting test' % str(exception_seen))
    else:
        # ask all hosts to start the test
        # this is like firing the gun at the track meet
        try:
            sync_files.write_sync_file(starting_gate, 'hi')
            print('starting all threads by creating starting gate file %s'
                  % starting_gate)
        except IOError as e:
            print('error writing starting gate: %s' % os.strerror(e.errno))

    # wait for them to finish

    for t in remote_thread_list:
        t.join()
        if t.status != OK:
            print('ERROR: ssh thread for host %s completed with status %d'
                  % (t.remote_host, t.status))

    # attempt to aggregate results by reading pickle files
    # containing SmallfileWorkload instances
    # with counters and times that we need

    all_ok = NOTOK
    h = None  # keeps the IOError message below safe for an empty host set
    try:
        invoke_list = []
        one_shot_delay = True
        for h in prm_host_set:  # for each host in test

            # read results for each thread run in that host
            # from python pickle of the list of SmallfileWorkload objects

            pickle_fn = master_invoke.host_result_filename(h)
            if verbose:
                print('reading pickle file: %s' % pickle_fn)
            host_invoke_list = []
            try:
                if one_shot_delay and not os.path.exists(pickle_fn):

                    # all threads have joined already, they are done
                    # we allow > 1 sec
                    # for this (NFS) client to see other clients' files

                    time.sleep(1.2)
                    one_shot_delay = False
                # NOTE: pickle.load runs whatever the file tells it to; these
                # files come from the shared sync directory, so every host
                # that can write there must be trusted
                with open(pickle_fn, 'rb') as pickle_file:
                    host_invoke_list = pickle.load(pickle_file)
                if verbose:
                    print(' read %d invoke objects' % len(host_invoke_list))
                invoke_list.extend(host_invoke_list)
                ensure_deleted(pickle_fn)
            except IOError as e:
                if e.errno != errno.ENOENT:
                    raise e
                print(' pickle file %s not found' % pickle_fn)

        output_results.output_results(invoke_list, prm)
        all_ok = OK
    except IOError as e:
        print('host %s filename %s: %s' % (h, pickle_fn, str(e)))
    except KeyboardInterrupt as e:
        print('control-C signal seen (SIGINT)')
    except SMFResultException as e:
        print(str(e))

    sys.exit(all_ok)
def terminate(self):
    """Write the abort file at ``self.params.abort_path`` and mark status NOTOK."""
    abort_path = self.params.abort_path
    sync_files.write_sync_file(abort_path, 'shut it down')
    # only reached when the abort file was written without raising
    self.status = NOTOK
def run_multi_host_workload(prm):
    """Drive a test across every host in ``prm.host_set`` and aggregate results.

    Pickles ``prm`` into the network sync directory, starts one
    launcher/ssh thread per host, waits for every host's ready file, writes
    the starting gate file, joins the threads, then reads each host's
    result pickle and passes the combined list to
    ``output_results.output_results``.

    Raises:
        Exception: if the Python version is not recognized, or if the hosts
            do not reach the starting gate and no other exception was seen
            while waiting.

    Does not return normally: exits with ``NOTOK`` if any thread reported a
    non-OK status or aggregation hit IOError/KeyboardInterrupt, else ``OK``.
    """
    prm_host_set = prm.host_set
    prm_permute_host_dirs = prm.permute_host_dirs
    master_invoke = prm.master_invoke
    starting_gate = master_invoke.starting_gate
    verbose = master_invoke.verbose

    # construct list of ssh threads to invoke in parallel

    sync_files.create_top_dirs(master_invoke, True)
    pickle_fn = os.path.join(prm.master_invoke.network_dir, 'param.pickle')
    sync_files.write_pickle(pickle_fn, prm)

    if os.getenv('PYPY'):
        python_prog = os.getenv('PYPY')
    elif sys.version.startswith('2'):
        python_prog = 'python'
    elif sys.version.startswith('3'):
        python_prog = 'python3'
    else:
        raise Exception('unrecognized python version %s' % sys.version)

    remote_thread_list = []
    host_ct = len(prm_host_set)
    for j in range(0, len(prm_host_set)):
        remote_host = prm_host_set[j]
        smf_remote_pgm = os.path.join(prm.remote_pgm_dir,
                                      'smallfile_remote.py')
        this_remote_cmd = '%s %s --network-sync-dir %s ' % \
            (python_prog, smf_remote_pgm, prm.master_invoke.network_dir)
        if prm_permute_host_dirs:
            # each host runs as the next host in the set (wrapping around)
            this_remote_cmd += \
                ' --as-host %s' % prm_host_set[(j + 1) % host_ct]
        else:
            this_remote_cmd += ' --as-host %s' % remote_host
        if verbose:
            print(this_remote_cmd)
        if smallfile.is_windows_os:
            remote_thread_list.append(
                launcher_thread.launcher_thread(prm,
                                                remote_host,
                                                this_remote_cmd))
        else:
            remote_thread_list.append(
                ssh_thread.ssh_thread(remote_host, this_remote_cmd))

    # start them (no pacing between starts in this version)

    for t in remote_thread_list:
        t.start()

    # wait for hosts to arrive at starting gate
    # if only one host, then no wait will occur
    # as starting gate file is already present
    # every pass we resume scan from last host file not found
    # FIXME: for very large host sets,
    # timeout only if no host responds within X seconds

    exception_seen = None
    hosts_ready = False  # set scope outside while loop
    abortfn = master_invoke.abort_fn()
    last_host_seen = -1
    sec = 0
    sec_delta = 0.5
    host_timeout = prm.host_startup_timeout
    if smallfile.is_windows_os:
        host_timeout += 20

    try:
        # FIXME: make timeout criteria be that no new hosts seen in X seconds
        while sec < host_timeout:
            ndirlist = os.listdir(master_invoke.network_dir)
            if master_invoke.verbose:
                print('shared dir list: ' + str(ndirlist))
            hosts_ready = True
            if os.path.exists(abortfn):
                raise Exception('worker host signaled abort')
            for j in range(last_host_seen + 1, len(prm_host_set)):
                h = prm_host_set[j]
                fn = master_invoke.gen_host_ready_fname(h.strip())
                if verbose:
                    print('checking for host filename ' + fn)
                if not os.path.exists(fn):
                    hosts_ready = False
                    break
                last_host_seen = j
            if hosts_ready:
                break

            # be patient for large tests
            # give user some feedback about
            # how many hosts have arrived at the starting gate

            time.sleep(sec_delta)
            sec += sec_delta
            sec_delta += 1  # back off: each wait is 1 sec longer
            if verbose:
                print('last_host_seen=%d sec=%d' % (last_host_seen, sec))
    except KeyboardInterrupt as e:
        print('saw SIGINT signal, aborting test')
        exception_seen = e
        hosts_ready = False  # never fire the starting gate after SIGINT
    except Exception as e:
        exception_seen = e
        hosts_ready = False

    if not hosts_ready:
        smallfile.abort_test(abortfn, [])
        if exception_seen is None:
            raise Exception(
                'hosts did not reach starting gate within %d seconds'
                % host_timeout)
        else:
            # use exception_seen: the "as e" name is unbound once the
            # except clause has finished (Python 3)
            print('saw exception %s, aborting test' % str(exception_seen))
    else:
        # ask all hosts to start the test
        # this is like firing the gun at the track meet
        try:
            sync_files.write_sync_file(starting_gate, 'hi')
            if verbose:
                print('starting gate file %s created' % starting_gate)
        except IOError as e:
            print('error writing starting gate: %s' % os.strerror(e.errno))

    # wait for them to finish

    all_ok = True
    for t in remote_thread_list:
        t.join()
        if t.status != OK:
            all_ok = False
            print('ERROR: ssh thread for host %s completed with status %d'
                  % (t.remote_host, t.status))

    # attempt to aggregate results by reading pickle files
    # containing smf_invocation instances with counters and times that we need

    try:
        invoke_list = []
        for h in prm_host_set:  # for each host in test

            # read results for each thread run in that host
            # from python pickle of the list of smf_invocation objects

            pickle_fn = master_invoke.host_result_filename(h)
            if verbose:
                print('reading pickle file: %s' % pickle_fn)
            host_invoke_list = []
            try:
                if not os.path.exists(pickle_fn):
                    # give the file a moment to become visible before trying
                    time.sleep(1.2)
                # NOTE: pickle.load runs whatever the file tells it to; these
                # files come from the shared sync directory, so every host
                # that can write there must be trusted
                with open(pickle_fn, 'rb') as pickle_file:
                    host_invoke_list = pickle.load(pickle_file)
                if verbose:
                    print(' read %d invoke objects' % len(host_invoke_list))
                invoke_list.extend(host_invoke_list)
                ensure_deleted(pickle_fn)
            except IOError as e:
                if e.errno != errno.ENOENT:
                    raise e
                print(' pickle file %s not found' % pickle_fn)

        # NOTE(review): pct_files_min is a bare name here; it must be
        # defined at module level for this call to work - confirm
        output_results.output_results(invoke_list, prm_host_set,
                                      prm.thread_count, pct_files_min)
    except IOError as e:
        print('host %s filename %s: %s' % (h, pickle_fn, str(e)))
        all_ok = False
    except KeyboardInterrupt as e:
        print('control-C signal seen (SIGINT)')
        all_ok = False
    if not all_ok:
        sys.exit(NOTOK)
    sys.exit(OK)
def run_multi_thread_workload(prm):
    """Run all worker threads for this host and report or save their results.

    Starts every worker from ``create_worker_list(prm)``, waits for all
    thread-ready files, synchronizes on the starting gate file, collects
    each worker's result object, and then either prints results (not a
    slave) or pickles them to the host result file (slave).

    Raises:
        Exception: if the threads do not all reach the starting gate in
            time, or (slave only) if the starting gate file is not seen in
            time.  ``abort_test`` is called first in both cases.

    Does not return normally: ends with ``sys.exit(exit_status)``.
    """
    master_invoke = prm.master_invoke
    prm_slave = prm.is_slave
    verbose = master_invoke.verbose
    host = master_invoke.onhost

    if not prm_slave:
        sync_files.create_top_dirs(master_invoke, False)

    if prm_slave:
        time.sleep(1.1)
        # list the shared directories; presumably this refreshes this
        # client's view of them - TODO confirm
        os.listdir(master_invoke.network_dir)
        for dlist in [master_invoke.src_dirs, master_invoke.dest_dirs]:
            for d in dlist:
                os.listdir(d)  # hack, see note above
                if verbose:
                    print(host + " saw " + d)

    # for each thread set up smf_invocation instance,
    # create a thread instance, and delete the thread-ready file

    thread_list = create_worker_list(prm)
    starting_gate = thread_list[0].invoke.starting_gate
    my_host_invoke = thread_list[0].invoke

    # start threads, wait for them to reach starting gate
    # to do this, look for thread-ready files

    for t in thread_list:
        ensure_deleted(t.invoke.gen_thread_ready_fname(t.invoke.tid))
    for t in thread_list:
        t.start()
    if verbose:
        print("started %d worker threads on host %s" % (len(thread_list), host))

    # wait for all threads to reach the starting gate
    # this makes it more likely that they will start simultaneously

    abort_fname = my_host_invoke.abort_fn()
    threads_ready = False  # really just to set scope of variable
    k = 0
    # two polls per second, so startup_timeout * 2 polls == startup_timeout sec
    for sec in range(0, prm.startup_timeout * 2):
        threads_ready = True
        # every pass re-checks all thread-ready files from the first thread
        for t in thread_list:
            fn = t.invoke.gen_thread_ready_fname(t.invoke.tid)
            if not os.path.exists(fn):
                threads_ready = False
                break
        if threads_ready:
            break
        if os.path.exists(abort_fname):
            break
        if verbose:
            print("threads not ready...")
        time.sleep(0.5)

    # if all threads didn't make it to the starting gate

    if not threads_ready:
        abort_test(abort_fname, thread_list)
        raise Exception("threads did not reach starting gate within %d sec" % prm.startup_timeout)

    # declare that this host is at the starting gate

    if prm_slave:
        host_ready_fn = my_host_invoke.gen_host_ready_fname()
        if my_host_invoke.verbose:
            print("host %s creating ready file %s" % (my_host_invoke.onhost, host_ready_fn))
        smallfile.touch(host_ready_fn)

    sg = my_host_invoke.starting_gate
    if not prm_slave:  # special case of no --host-set parameter
        try:
            sync_files.write_sync_file(sg, "hi there")
            if verbose:
                print("wrote starting gate file")
        except IOError as e:
            # a failed write is only reported; execution continues
            print("error writing starting gate for threads: %s" % str(e))

    # wait for starting_gate file to be created by test driver

    if verbose:
        print("awaiting " + sg)
    if prm_slave:
        # two polls per second for host_startup_timeout seconds
        for sec in range(0, prm.host_startup_timeout * 2):
            if os.path.exists(sg):
                break
            time.sleep(0.5)
        if not os.path.exists(sg):
            abort_test(my_host_invoke.abort_fn(), thread_list)
            raise Exception("starting signal not seen within %d seconds" % prm.host_startup_timeout)
    if verbose:
        print("starting test on host " + host + " in 2 seconds")
    # sleep 2 seconds plus a random fraction of a second
    time.sleep(2 + random.random())

    # FIXME: don't timeout the test,
    # instead check thread progress and abort if you see any of them stalled
    # for long enough
    # problem is: if servers are heavily loaded you can't use filesystem to communicate this

    # wait for all threads on this host to finish

    for t in thread_list:
        if verbose:
            print("waiting for thread %s" % t.invoke.tid)
        t.invoke = t.receiver.recv()  # must do this to get results from sub-process
        t.join()

    # if not a slave of some other host, print results (for this host)

    exit_status = OK
    if not prm_slave:
        try:
            # FIXME: code to aggregate results from list of invoke objects can be shared by multi-host and single-host cases
            invoke_list = [t.invoke for t in thread_list]
            output_results.output_results(invoke_list, ["localhost"], prm.thread_count, smallfile.pct_files_min)
        except SMFResultException as e:
            print("ERROR: " + str(e))
            exit_status = NOTOK
    else:
        # if we are participating in a multi-host test
        # then write out this host's result in pickle format so test driver can pick up result

        result_filename = master_invoke.host_result_filename(prm.as_host)
        if verbose:
            print("writing invokes to: " + result_filename)
        invok_list = [t.invoke for t in thread_list]
        if verbose:
            print("saving result to filename %s" % result_filename)
        # drop the buffers before pickling the result objects
        for ivk in invok_list:
            ivk.buf = None
            ivk.biggest_buf = None
        sync_files.write_pickle(result_filename, invok_list)
        time.sleep(1.2)  # for benefit of NFS with actimeo=1

    sys.exit(exit_status)
def run_multi_thread_workload(prm):
    """Run the worker threads for this host and report or save their results.

    Sequence of events:
      1. create and start the worker threads,
      2. wait for every thread's thread-ready file to appear,
      3. if this host is part of a multi-host test, create the host-ready
         file and wait for the starting gate file; otherwise write the
         starting gate file ourselves,
      4. wait for all worker threads to finish,
      5. print the results (single-host case) or write them out as a pickle
         for the test driver (multi-host case).

    prm -- test parameters; this function reads as_host, host_set,
           abort_path and starting_gun_path from it.

    Returns OK on success, NOTOK if output_results raised FsDriftException.

    Raises FsDriftException if not all threads reached the starting gate,
    and Exception if the starting gate file was not seen in time.
    """
    host = prm.as_host
    if host is None:
        host = 'localhost'
    prm_slave = (prm.host_set != [])
    # FIXME: get coherent logging level interface
    # verbose is on whenever LOGLEVEL_DEBUG is present in the environment
    verbose = os.getenv('LOGLEVEL_DEBUG') is not None
    # integer division: this value is used as a range() bound further down
    host_startup_timeout = 5 + len(prm.host_set) // 3

    # for each thread set up a workload instance,
    # create a thread instance, and delete the thread-ready file

    thread_list = create_worker_list(prm)
    my_host_invoke = thread_list[0].invoke
    my_log = fsd_log.start_log('%s.master' % host)
    my_log.debug(prm)

    # start threads, wait for them to reach starting gate
    # to do this, look for thread-ready files

    for t in thread_list:
        ensure_deleted(t.invoke.gen_thread_ready_fname(t.invoke.tid))
    for t in thread_list:
        t.start()
    my_log.debug('started %d worker threads on host %s'
                 % (len(thread_list), host))

    # wait for all threads to reach the starting gate
    # this makes it more likely that they will start simultaneously

    abort_fname = prm.abort_path
    thread_count = len(thread_list)
    thread_to_wait_for = 0  # index of first thread not yet seen as ready
    startup_timeout = 3
    sec = 0.0
    while sec < startup_timeout:
        # resume the scan at the first thread whose ready file was missing
        for k in range(thread_to_wait_for, thread_count):
            t = thread_list[k]
            fn = t.invoke.gen_thread_ready_fname(t.invoke.tid)
            if not os.path.exists(fn):
                my_log.debug(
                    'thread %d thread-ready file %s not found yet with %f sec left'
                    % (k, fn, (startup_timeout - sec)))
                break
            thread_to_wait_for = k + 1
            # we only timeout if no more threads have reached starting gate
            # in startup_timeout sec
            sec = 0.0
        if thread_to_wait_for == thread_count:
            break
        if os.path.exists(abort_fname):
            break
        sec += 0.5
        time.sleep(0.5)

    # if all threads didn't make it to the starting gate

    if thread_to_wait_for < thread_count:
        abort_test(abort_fname, thread_list)
        raise FsDriftException('only %d threads reached starting gate'
                               % thread_to_wait_for)

    # declare that this host is at the starting gate

    if prm_slave:
        host_ready_fn = gen_host_ready_fname(prm, prm.as_host)
        my_log.debug('host %s creating ready file %s'
                     % (my_host_invoke.onhost, host_ready_fn))
        common.touch(host_ready_fn)

    sg = prm.starting_gun_path
    if not prm_slave:
        # no other hosts involved, so open the starting gate ourselves
        sync_files.write_sync_file(sg, 'hi there')
        my_log.debug('wrote starting gate file ')

    # wait for starting_gate file to be created by test driver,
    # polling once per second

    if prm_slave:
        my_log.debug('awaiting ' + sg)
        for sec in range(0, host_startup_timeout + 3):
            if os.path.exists(sg):
                break
            if os.path.exists(prm.abort_path):
                # NOTE(review): assumes my_log offers info() alongside
                # debug() -- confirm against fsd_log.start_log
                my_log.info('saw abort file %s, aborting test'
                            % prm.abort_path)
                break
            time.sleep(1)
        if not os.path.exists(sg):
            abort_test(prm.abort_path, thread_list)
            raise Exception('starting signal not seen within %d seconds'
                            % host_startup_timeout)
    if verbose:
        print('starting test on host ' + host + ' in 2 seconds')
    time.sleep(2 + random.random())  # let other hosts see starting gate file

    # FIXME: don't timeout the test,
    # instead check thread progress and abort if you see any of them stalled
    # but if servers are heavily loaded you can't rely on filesystem

    # wait for all threads on this host to finish

    for t in thread_list:
        my_log.debug('waiting for thread %s' % t.invoke.tid)
        # presumably fetches the worker's result before the join -- TODO confirm
        t.retrieve()
        t.join()

    # if not a slave of some other host, print results (for this host)

    if not prm_slave:
        try:
            worker_list = [t.invoke for t in thread_list]
            output_results.output_results(prm, worker_list)
        except FsDriftException as e:
            print('ERROR: ' + str(e))
            return NOTOK
    else:
        # if we are participating in a multi-host test
        # then write out this host's result in pickle format
        # so test driver can pick up result

        result_filename = host_result_filename(prm, prm.as_host)
        my_log.debug('saving result to filename %s' % result_filename)
        worker_list = [t.invoke for t in thread_list]
        sync_files.write_pickle(result_filename, worker_list)
        time.sleep(1.2)  # for benefit of NFS with actimeo=1

    return OK
def run_multi_host_workload(prm):
    """Drive a test across every host in prm.host_set and aggregate results.

    Sequence of events:
      1. create the top-level directories and write the parameters to
         'param.pickle' in the network sync directory,
      2. build one remote command per host and start one launcher thread
         (Windows) or ssh thread (otherwise) for each,
      3. poll for each host's host-ready file until all are present, the
         abort file appears, or the timeout expires,
      4. write the starting gate file so the hosts begin the test,
      5. join the remote threads, load each host's result pickle and pass
         the combined list to output_results.

    prm -- test parameters; this function reads host_set, permute_host_dirs,
           master_invoke, remote_pgm_dir, host_startup_timeout and
           thread_count from it.

    Does not return: ends with sys.exit(OK) or sys.exit(NOTOK).

    Raises Exception if the python version is not recognized or if the hosts
    did not reach the starting gate within the timeout.
    """
    prm_host_set = prm.host_set
    prm_permute_host_dirs = prm.permute_host_dirs
    master_invoke = prm.master_invoke
    starting_gate = master_invoke.starting_gate
    verbose = master_invoke.verbose

    # construct list of ssh threads to invoke in parallel

    sync_files.create_top_dirs(master_invoke, True)
    pickle_fn = os.path.join(master_invoke.network_dir, 'param.pickle')
    sync_files.write_pickle(pickle_fn, prm)

    # choose the interpreter named in the remote command
    if os.getenv('PYPY'):
        python_prog = os.getenv('PYPY')
    elif sys.version.startswith('2'):
        python_prog = 'python'
    elif sys.version.startswith('3'):
        python_prog = 'python3'
    else:
        raise Exception('unrecognized python version %s' % sys.version)

    remote_thread_list = []
    host_ct = len(prm_host_set)
    smf_remote_pgm = os.path.join(prm.remote_pgm_dir, 'smallfile_remote.py')
    for j, remote_host in enumerate(prm_host_set):
        this_remote_cmd = '%s %s --network-sync-dir %s ' % (
            python_prog, smf_remote_pgm, master_invoke.network_dir)
        if prm_permute_host_dirs:
            # each host is told to act as the next host in the list
            this_remote_cmd += \
                ' --as-host %s' % prm_host_set[(j + 1) % host_ct]
        else:
            this_remote_cmd += ' --as-host %s' % remote_host
        if verbose:
            print(this_remote_cmd)
        if smallfile.is_windows_os:
            remote_thread_list.append(
                launcher_thread.launcher_thread(
                    prm, remote_host, this_remote_cmd))
        else:
            remote_thread_list.append(
                ssh_thread.ssh_thread(remote_host, this_remote_cmd))

    # start them

    for t in remote_thread_list:
        t.start()

    # wait for hosts to arrive at starting gate
    # each pass resumes the scan from the last host file not found
    # FIXME: for very large host sets,
    # timeout only if no host responds within X seconds

    exception_seen = None
    hosts_ready = False  # set scope outside while loop
    abortfn = master_invoke.abort_fn()
    last_host_seen = -1
    sec = 0
    sec_delta = 0.5
    host_timeout = prm.host_startup_timeout
    if smallfile.is_windows_os:
        host_timeout += 20

    try:
        # FIXME: make timeout criteria be that no new hosts seen in X seconds
        while sec < host_timeout:
            ndirlist = os.listdir(master_invoke.network_dir)
            if verbose:
                print('shared dir list: ' + str(ndirlist))
            hosts_ready = True
            if os.path.exists(abortfn):
                raise Exception('worker host signaled abort')
            for j in range(last_host_seen + 1, len(prm_host_set)):
                h = prm_host_set[j]
                fn = master_invoke.gen_host_ready_fname(h.strip())
                if verbose:
                    print('checking for host filename ' + fn)
                if not os.path.exists(fn):
                    hosts_ready = False
                    break
                last_host_seen = j  # saw this host's ready file
            if hosts_ready:
                break

            # be patient for large tests: the polling interval grows by one
            # second on every pass
            time.sleep(sec_delta)
            sec += sec_delta
            sec_delta += 1
            if verbose:
                # gives feedback on how many hosts have arrived so far
                print('last_host_seen=%d sec=%d' % (last_host_seen, sec))
    except KeyboardInterrupt as e:
        print('saw SIGINT signal, aborting test')
        exception_seen = e
        # an interrupt must never be followed by firing the starting gate
        hosts_ready = False
    except Exception as e:
        exception_seen = e
        hosts_ready = False

    if not hosts_ready:
        smallfile.abort_test(abortfn, [])
        if exception_seen is None:
            raise Exception(
                'hosts did not reach starting gate within %d seconds'
                % host_timeout)
        # the name bound by "except ... as e" is gone once the handler ends,
        # so report the copy saved in exception_seen
        print('saw exception %s, aborting test' % str(exception_seen))
    else:
        # ask all hosts to start the test
        # this is like firing the gun at the track meet
        try:
            sync_files.write_sync_file(starting_gate, 'hi')
            if verbose:
                print('starting gate file %s created' % starting_gate)
        except IOError as e:
            print('error writing starting gate: %s' % os.strerror(e.errno))

    # wait for them to finish

    all_ok = True
    for t in remote_thread_list:
        t.join()
        if t.status != OK:
            all_ok = False
            print('ERROR: ssh thread for host %s completed with status %d'
                  % (t.remote_host, t.status))

    # attempt to aggregate results by reading pickle files
    # containing smf_invocation instances with counters and times that we need

    try:
        invoke_list = []
        for h in prm_host_set:  # for each host in test

            # read results for each thread run in that host
            # from python pickle of the list of smf_invocation objects

            pickle_fn = master_invoke.host_result_filename(h)
            if verbose:
                print('reading pickle file: %s' % pickle_fn)
            host_invoke_list = []
            try:
                if not os.path.exists(pickle_fn):
                    # give the file one more chance to become visible
                    time.sleep(1.2)
                # NOTE(review): pickle.load executes whatever the file
                # encodes; this is only safe while the shared directory is
                # writable by trusted hosts alone.
                with open(pickle_fn, 'rb') as pickle_file:
                    host_invoke_list = pickle.load(pickle_file)
                if verbose:
                    print(' read %d invoke objects' % len(host_invoke_list))
                invoke_list.extend(host_invoke_list)
                ensure_deleted(pickle_fn)
            except IOError as e:
                if e.errno != errno.ENOENT:
                    raise  # bare raise keeps the original traceback
                print('  pickle file %s not found' % pickle_fn)

        output_results.output_results(invoke_list, prm_host_set,
                                      prm.thread_count, pct_files_min)
    except IOError as e:
        print('host %s filename %s: %s' % (h, pickle_fn, str(e)))
        all_ok = False
    except KeyboardInterrupt as e:
        print('control-C signal seen (SIGINT)')
        all_ok = False
    if not all_ok:
        sys.exit(NOTOK)
    sys.exit(OK)