Example #1
0
def run_multi_host_workload(prm):
    """Drive a multi-host test from the test-driver host.

    Writes ``prm`` as a pickle into the network sync directory, builds one
    remote ``smallfile_remote.py`` command per entry of ``prm.host_set``
    (run by a launcher thread on Windows or when ``prm.launch_by_daemon``
    is set, otherwise by an ssh thread), waits for every host's ready file,
    creates the starting gate file, joins the remote threads, then loads
    each host's result pickle and passes the combined list to
    ``output_results.output_results``.

    Args:
        prm: test parameter object; ``host_set``, ``permute_host_dirs``,
            ``master_invoke``, ``remote_pgm_dir``, ``launch_by_daemon`` and
            ``host_startup_timeout`` are read from it.

    Raises:
        Exception: if the interpreter version is neither 2 nor 3 and the
            PYPY environment variable is unset, or if the hosts are not all
            ready and no exception was caught while waiting for them.

    This function does not return normally: it ends in ``sys.exit(OK)``
    when results were read and output, ``sys.exit(NOTOK)`` otherwise.
    """

    prm_host_set = prm.host_set
    prm_permute_host_dirs = prm.permute_host_dirs
    master_invoke = prm.master_invoke

    starting_gate = master_invoke.starting_gate
    verbose = master_invoke.verbose

    # write the test parameters as a pickle in the network sync directory,
    # the same directory that is handed to each remote command below

    sync_files.create_top_dirs(master_invoke, True)
    pickle_fn = os.path.join(master_invoke.network_dir, 'param.pickle')
    sync_files.write_pickle(pickle_fn, prm)

    # choose the interpreter named in each remote command line

    if os.getenv('PYPY'):
        python_prog = os.getenv('PYPY')
    elif sys.version.startswith('2'):
        python_prog = 'python'
    elif sys.version.startswith('3'):
        python_prog = 'python3'
    else:
        raise Exception('unrecognized python version %s' % sys.version)

    # construct list of ssh threads to invoke in parallel

    remote_thread_list = []
    host_ct = len(prm_host_set)
    smf_remote_pgm = os.path.join(prm.remote_pgm_dir, 'smallfile_remote.py')
    base_remote_cmd = '%s %s --network-sync-dir %s ' \
        % (python_prog, smf_remote_pgm, master_invoke.network_dir)
    for j, remote_host in enumerate(prm_host_set):
        # with permute_host_dirs each host is told to act as the next host
        # in the set, wrapping around at the end
        if prm_permute_host_dirs:
            as_host = prm_host_set[(j + 1) % host_ct]
        else:
            as_host = remote_host
        this_remote_cmd = base_remote_cmd + ' --as-host %s' % as_host
        if verbose:
            print(this_remote_cmd)
        if smallfile.is_windows_os or prm.launch_by_daemon:
            remote_thread = launcher_thread.launcher_thread(
                prm, remote_host, this_remote_cmd)
        else:
            remote_thread = ssh_thread.ssh_thread(remote_host,
                                                  this_remote_cmd)
        remote_thread_list.append(remote_thread)

    # start them

    for t in remote_thread_list:
        if not prm.launch_by_daemon:
            # pace starts so that we don't get ssh errors
            time.sleep(0.1)
        t.start()

    # wait for hosts to arrive at starting gate
    # the scan resumes from the first host whose ready file was not found
    # FIXME: for very large host sets,
    # timeout only if no host responds within X seconds

    exception_seen = None
    hosts_ready = False  # set scope outside while loop
    abortfn = master_invoke.abort_fn()
    last_host_seen = -1
    sec = 0.0
    sec_delta = 0.5
    host_timeout = prm.host_startup_timeout
    if smallfile.is_windows_os:
        host_timeout += 20

    try:
        while sec < host_timeout:
            # HACK to force directory entry coherency for Gluster
            ndirlist = os.listdir(master_invoke.network_dir)
            if verbose:
                print('shared dir list: ' + str(ndirlist))
            hosts_ready = True
            if os.path.exists(abortfn):
                raise Exception('worker host signaled abort')
            for j in range(last_host_seen + 1, host_ct):
                fn = master_invoke.gen_host_ready_fname(
                    prm_host_set[j].strip())
                if verbose:
                    print('checking for host filename ' + fn)
                if not os.path.exists(fn):
                    hosts_ready = False
                    break
                last_host_seen = j  # saw this host's ready file
                # we exit while loop only if no hosts in host_timeout seconds
                sec = 0.0
            if hosts_ready:
                break

            # if one of ssh threads has died, no reason to continue
            # (is_alive() replaces isAlive(), which Python 3.9 removed)

            dead_thread = next(
                (t for t in remote_thread_list if not t.is_alive()), None)
            if dead_thread is not None:
                # report the dead thread's own host, not the host whose
                # ready file happens to be the one being waited for
                print('thread %s on host %s has died'
                      % (dead_thread, dead_thread.remote_host))
                break

            # be patient for large tests: each poll sleeps one second
            # longer than the previous one

            time.sleep(sec_delta)
            sec += sec_delta
            sec_delta += 1
            if verbose:
                print('last_host_seen=%d sec=%d' % (last_host_seen, sec))
    except KeyboardInterrupt as e:
        print('saw SIGINT signal, aborting test')
        exception_seen = e
        # the interrupt may land after hosts_ready was optimistically set
        hosts_ready = False
    except Exception as e:
        exception_seen = e
        hosts_ready = False

    if not hosts_ready:
        smallfile.abort_test(abortfn, [])
        # the first host not yet seen is the one being waited for; derive
        # it from last_host_seen because the scan may never have run
        if last_host_seen + 1 < host_ct:
            print('ERROR: host %s did not reach starting gate'
                  % prm_host_set[last_host_seen + 1])
        if exception_seen is None:
            raise Exception('hosts did not reach starting gate ' +
                            'within %d seconds' % host_timeout)
        # the name bound by "except ... as e" is gone once the except
        # clause ends in Python 3, so report the saved exception instead
        print('saw exception %s, aborting test' % str(exception_seen))
    else:

        # ask all hosts to start the test
        # this is like firing the gun at the track meet

        try:
            sync_files.write_sync_file(starting_gate, 'hi')
            print('starting all threads by creating starting gate file %s' %
                  starting_gate)
        except IOError as e:
            print('error writing starting gate: %s' % os.strerror(e.errno))

    # wait for them to finish

    for t in remote_thread_list:
        t.join()
        if t.status != OK:
            print('ERROR: ssh thread for host %s completed with status %d' %
                  (t.remote_host, t.status))

    # attempt to aggregate results by reading pickle files
    # containing SmallfileWorkload instances
    # with counters and times that we need

    all_ok = NOTOK
    h = None  # keeps the IOError report below safe for an empty host set
    try:
        invoke_list = []
        one_shot_delay = True
        for h in prm_host_set:  # for each host in test

            # read results for each thread run in that host
            # from python pickle of the list of SmallfileWorkload objects

            pickle_fn = master_invoke.host_result_filename(h)
            if verbose:
                print('reading pickle file: %s' % pickle_fn)
            host_invoke_list = []
            try:
                if one_shot_delay and not os.path.exists(pickle_fn):

                    # all threads have joined already, they are done
                    # we allow > 1 sec
                    # for this (NFS) client to see other clients' files

                    time.sleep(1.2)
                    one_shot_delay = False
                # NOTE(review): pickle.load executes whatever the file
                # encodes; the shared directory must be trusted
                with open(pickle_fn, 'rb') as pickle_file:
                    host_invoke_list = pickle.load(pickle_file)
                if verbose:
                    print(' read %d invoke objects' % len(host_invoke_list))
                invoke_list.extend(host_invoke_list)
                ensure_deleted(pickle_fn)
            except IOError as e:
                # a missing result file is reported and skipped; any other
                # I/O error goes to the outer handler
                if e.errno != errno.ENOENT:
                    raise
                print('  pickle file %s not found' % pickle_fn)

        output_results.output_results(invoke_list, prm)
        all_ok = OK
    except IOError as e:
        print('host %s filename %s: %s' % (h, pickle_fn, str(e)))
    except KeyboardInterrupt as e:
        print('control-C signal seen (SIGINT)')
    except SMFResultException as e:
        print(str(e))

    sys.exit(all_ok)
Example #2
0
def run_multi_thread_workload(prm):
    """Run this host's worker threads through one test and hand off results.

    Creates the workers with ``create_worker_list(prm)``, starts them and
    polls for each worker's thread-ready file.  When ``prm.is_slave`` is
    set, the host-ready file is then touched and the starting gate file is
    awaited; otherwise this function writes the starting gate file itself.
    Each worker's result is received from ``receiver.recv()`` and the
    worker is joined.  Results are passed to
    ``output_results.output_results`` when not a slave, or pickled to the
    host result file when a slave.

    Args:
        prm: test parameter object; ``master_invoke``, ``is_slave``,
            ``startup_timeout``, ``host_startup_timeout`` and ``as_host``
            are read from it.

    Raises:
        Exception: if not every worker's ready file appears before the
            polling loop ends, or (slave only) if the starting gate file
            is still absent after the wait loop.

    This function does not return normally: it ends in ``sys.exit`` with
    ``OK``, or ``NOTOK`` when outputting results raised
    ``SMFResultException``.
    """

    master_invoke = prm.master_invoke
    is_slave = prm.is_slave
    verbose = master_invoke.verbose
    host = master_invoke.onhost

    if is_slave:
        time.sleep(1.1)
        for top_dir in master_invoke.top_dirs:
            ensure_dir_exists(top_dir)
        for dir_group in (master_invoke.src_dirs, master_invoke.dest_dirs):
            for work_dir in dir_group:
                ensure_dir_exists(work_dir)
                if verbose:
                    print(host + ' saw ' + str(work_dir))
    else:
        sync_files.create_top_dirs(master_invoke, False)

    # build one worker per thread; the first worker's invoke object
    # supplies the host-level file names used below

    workers = create_worker_list(prm)
    my_host_invoke = workers[0].invoke

    # remove stale thread-ready files before any worker can create one

    for worker in workers:
        ensure_deleted(
            worker.invoke.gen_thread_ready_fname(worker.invoke.tid))
    for worker in workers:
        worker.start()
    worker_count = len(workers)
    if verbose:
        print('started %d worker threads on host %s' %
              (worker_count, host))

    # poll twice a second until every worker's ready file has been seen;
    # workers already seen are not checked again

    startup_timeout = prm.startup_timeout
    if smallfile.is_windows_os:
        print('adding time for Windows synchronization')
        startup_timeout += 30
    abort_fname = my_host_invoke.abort_fn()
    ready_count = 0
    for _ in range(startup_timeout * 2):
        while ready_count < worker_count:
            pending_invoke = workers[ready_count].invoke
            ready_fn = pending_invoke.gen_thread_ready_fname(
                pending_invoke.tid)
            if not os.path.exists(ready_fn):
                if verbose:
                    print('thread %d thread-ready file %s not found...' %
                          (ready_count, ready_fn))
                break
            ready_count += 1
        if ready_count == worker_count or os.path.exists(abort_fname):
            break
        time.sleep(0.5)

    # give up if some worker never reached the starting gate

    if ready_count < worker_count:
        abort_test(abort_fname, workers)
        raise Exception('only %d threads reached starting gate within %d sec' %
                        (ready_count, startup_timeout))

    # declare that this host is at the starting gate

    if is_slave:
        host_ready_fn = my_host_invoke.gen_host_ready_fname()
        if my_host_invoke.verbose:
            print('host %s creating ready file %s' %
                  (my_host_invoke.onhost, host_ready_fn))
        smallfile.touch(host_ready_fn)

    gate_fn = my_host_invoke.starting_gate
    if not is_slave:  # special case of no --host-set parameter
        try:
            sync_files.write_sync_file(gate_fn, 'hi there')
            if verbose:
                print('wrote starting gate file')
        except IOError as e:
            print('error writing starting gate for threads: %s' % str(e))

    # a slave waits for the starting gate file to appear
    # NOTE(review): this polls every 0.5 s for host_startup_timeout + 10
    # iterations, i.e. about half that many seconds, while the error
    # message reports host_startup_timeout seconds -- confirm intent

    if verbose:
        print('awaiting ' + gate_fn)
    if is_slave:
        for _ in range(prm.host_startup_timeout + 10):
            if os.path.exists(gate_fn):
                break
            time.sleep(0.5)
        if not os.path.exists(gate_fn):
            abort_test(my_host_invoke.abort_fn(), workers)
            raise Exception('starting signal not seen within %d seconds' %
                            prm.host_startup_timeout)
    if verbose:
        print('starting test on host ' + host + ' in 2 seconds')
    time.sleep(2 + random.random())  # let other hosts see starting gate file

    # FIXME: don't timeout the test,
    # instead check thread progress and abort if you see any of them stalled
    # but if servers are heavily loaded you can't rely on filesystem

    # collect each worker's result, then wait for the worker to finish

    for worker in workers:
        if verbose:
            print('waiting for thread %s' % worker.invoke.tid)
        worker.invoke = worker.receiver.recv()  # results from sub-process
        worker.join()

    results = [worker.invoke for worker in workers]
    exit_status = OK
    if is_slave:

        # participating in a multi-host test: write this host's results
        # as a pickle for the test driver to pick up

        result_filename = \
            master_invoke.host_result_filename(prm.as_host)
        if verbose:
            print('writing invokes to: ' + result_filename)
            print('saving result to filename %s' % result_filename)
        for result in results:
            # drop the buffers before pickling
            result.buf = None
            result.biggest_buf = None
        sync_files.write_pickle(result_filename, results)
        time.sleep(1.2)  # for benefit of NFS with actimeo=1
    else:

        # not a slave of some other host: print results for this host

        try:
            output_results.output_results(results, prm)
        except SMFResultException as e:
            print('ERROR: ' + str(e))
            exit_status = NOTOK

    sys.exit(exit_status)
def run_multi_thread_workload(prm):
    """Run this host's worker threads through one test and hand off results.

    Creates the workers with ``create_worker_list(prm)``, starts them and
    polls for each worker's thread-ready file.  When ``prm.is_slave`` is
    set, the host-ready file is then touched and the starting gate file is
    awaited; otherwise this function writes the starting gate file itself.
    Each worker's result is received from ``receiver.recv()`` and the
    worker is joined.  Results are passed to
    ``output_results.output_results`` when not a slave, or pickled to the
    host result file when a slave.

    Args:
        prm: test parameter object; ``master_invoke``, ``is_slave``,
            ``startup_timeout``, ``host_startup_timeout`` and ``as_host``
            are read from it.

    Raises:
        Exception: if not every worker's ready file appears before the
            polling loop ends, or (slave only) if the starting gate file
            is still absent after the wait loop.

    This function does not return normally: it ends in ``sys.exit`` with
    ``OK``, or ``NOTOK`` when outputting results raised
    ``SMFResultException``.
    """

    master_invoke = prm.master_invoke
    is_slave = prm.is_slave
    verbose = master_invoke.verbose
    host = master_invoke.onhost

    if is_slave:
        time.sleep(1.1)
        for top_dir in master_invoke.top_dirs:
            ensure_dir_exists(top_dir)
        for dir_group in (master_invoke.src_dirs, master_invoke.dest_dirs):
            for work_dir in dir_group:
                ensure_dir_exists(work_dir)
                if verbose:
                    print(host + ' saw ' + str(work_dir))
    else:
        sync_files.create_top_dirs(master_invoke, False)

    # build one worker per thread; the first worker's invoke object
    # supplies the host-level file names used below

    workers = create_worker_list(prm)
    my_host_invoke = workers[0].invoke

    # remove stale thread-ready files before any worker can create one

    for worker in workers:
        ensure_deleted(
            worker.invoke.gen_thread_ready_fname(worker.invoke.tid))
    for worker in workers:
        worker.start()
    worker_count = len(workers)
    if verbose:
        print('started %d worker threads on host %s' %
              (worker_count, host))

    # poll twice a second until every worker's ready file has been seen;
    # workers already seen are not checked again

    startup_timeout = prm.startup_timeout
    if smallfile.is_windows_os:
        print('adding time for Windows synchronization')
        startup_timeout += 30
    abort_fname = my_host_invoke.abort_fn()
    ready_count = 0
    for _ in range(startup_timeout * 2):
        while ready_count < worker_count:
            pending_invoke = workers[ready_count].invoke
            ready_fn = pending_invoke.gen_thread_ready_fname(
                pending_invoke.tid)
            if not os.path.exists(ready_fn):
                if verbose:
                    print('thread %d thread-ready file %s not found...' %
                          (ready_count, ready_fn))
                break
            ready_count += 1
        if ready_count == worker_count or os.path.exists(abort_fname):
            break
        time.sleep(0.5)

    # give up if some worker never reached the starting gate

    if ready_count < worker_count:
        abort_test(abort_fname, workers)
        raise Exception('only %d threads reached starting gate within %d sec' %
                        (ready_count, startup_timeout))

    # declare that this host is at the starting gate

    if is_slave:
        host_ready_fn = my_host_invoke.gen_host_ready_fname()
        if my_host_invoke.verbose:
            print('host %s creating ready file %s' %
                  (my_host_invoke.onhost, host_ready_fn))
        smallfile.touch(host_ready_fn)

    gate_fn = my_host_invoke.starting_gate
    if not is_slave:  # special case of no --host-set parameter
        try:
            sync_files.write_sync_file(gate_fn, 'hi there')
            if verbose:
                print('wrote starting gate file')
        except IOError as e:
            print('error writing starting gate for threads: %s' % str(e))

    # a slave waits for the starting gate file to appear
    # NOTE(review): this polls every 0.5 s for host_startup_timeout + 10
    # iterations, i.e. about half that many seconds, while the error
    # message reports host_startup_timeout seconds -- confirm intent

    if verbose:
        print('awaiting ' + gate_fn)
    if is_slave:
        for _ in range(prm.host_startup_timeout + 10):
            if os.path.exists(gate_fn):
                break
            time.sleep(0.5)
        if not os.path.exists(gate_fn):
            abort_test(my_host_invoke.abort_fn(), workers)
            raise Exception('starting signal not seen within %d seconds' %
                            prm.host_startup_timeout)
    if verbose:
        print('starting test on host ' + host + ' in 2 seconds')
    time.sleep(2 + random.random())  # let other hosts see starting gate file

    # FIXME: don't timeout the test,
    # instead check thread progress and abort if you see any of them stalled
    # but if servers are heavily loaded you can't rely on filesystem

    # collect each worker's result, then wait for the worker to finish

    for worker in workers:
        if verbose:
            print('waiting for thread %s' % worker.invoke.tid)
        worker.invoke = worker.receiver.recv()  # results from sub-process
        worker.join()

    results = [worker.invoke for worker in workers]
    exit_status = OK
    if is_slave:

        # participating in a multi-host test: write this host's results
        # as a pickle for the test driver to pick up

        result_filename = \
            master_invoke.host_result_filename(prm.as_host)
        if verbose:
            print('writing invokes to: ' + result_filename)
            print('saving result to filename %s' % result_filename)
        for result in results:
            # drop the buffers before pickling
            result.buf = None
            result.biggest_buf = None
        sync_files.write_pickle(result_filename, results)
        time.sleep(1.2)  # for benefit of NFS with actimeo=1
    else:

        # not a slave of some other host: print results for this host

        try:
            output_results.output_results(results, prm)
        except SMFResultException as e:
            print('ERROR: ' + str(e))
            exit_status = NOTOK

    sys.exit(exit_status)
Example #4
0
def run_multi_host_workload(prm):
    """Drive a multi-host test from the test-driver host.

    Writes ``prm`` as a pickle into the network sync directory, builds one
    remote ``smallfile_remote.py`` command per entry of ``prm.host_set``
    (run by a launcher thread on Windows or when ``prm.launch_by_daemon``
    is set, otherwise by an ssh thread), waits for every host's ready file,
    creates the starting gate file, joins the remote threads, then loads
    each host's result pickle and passes the combined list to
    ``output_results.output_results``.

    Args:
        prm: test parameter object; ``host_set``, ``permute_host_dirs``,
            ``master_invoke``, ``remote_pgm_dir``, ``launch_by_daemon`` and
            ``host_startup_timeout`` are read from it.

    Raises:
        Exception: if the interpreter version is neither 2 nor 3 and the
            PYPY environment variable is unset, or if the hosts are not all
            ready and no exception was caught while waiting for them.

    This function does not return normally: it ends in ``sys.exit(OK)``
    when results were read and output, ``sys.exit(NOTOK)`` otherwise.
    """

    prm_host_set = prm.host_set
    prm_permute_host_dirs = prm.permute_host_dirs
    master_invoke = prm.master_invoke

    starting_gate = master_invoke.starting_gate
    verbose = master_invoke.verbose

    # write the test parameters as a pickle in the network sync directory,
    # the same directory that is handed to each remote command below

    sync_files.create_top_dirs(master_invoke, True)
    pickle_fn = os.path.join(master_invoke.network_dir, 'param.pickle')
    sync_files.write_pickle(pickle_fn, prm)

    # choose the interpreter named in each remote command line

    if os.getenv('PYPY'):
        python_prog = os.getenv('PYPY')
    elif sys.version.startswith('2'):
        python_prog = 'python'
    elif sys.version.startswith('3'):
        python_prog = 'python3'
    else:
        raise Exception('unrecognized python version %s' % sys.version)

    # construct list of ssh threads to invoke in parallel

    remote_thread_list = []
    host_ct = len(prm_host_set)
    smf_remote_pgm = os.path.join(prm.remote_pgm_dir, 'smallfile_remote.py')
    base_remote_cmd = '%s %s --network-sync-dir %s ' \
        % (python_prog, smf_remote_pgm, master_invoke.network_dir)
    for j, remote_host in enumerate(prm_host_set):
        # with permute_host_dirs each host is told to act as the next host
        # in the set, wrapping around at the end
        if prm_permute_host_dirs:
            as_host = prm_host_set[(j + 1) % host_ct]
        else:
            as_host = remote_host
        this_remote_cmd = base_remote_cmd + ' --as-host %s' % as_host
        if verbose:
            print(this_remote_cmd)
        if smallfile.is_windows_os or prm.launch_by_daemon:
            remote_thread = launcher_thread.launcher_thread(
                prm, remote_host, this_remote_cmd)
        else:
            remote_thread = ssh_thread.ssh_thread(remote_host,
                                                  this_remote_cmd)
        remote_thread_list.append(remote_thread)

    # start them

    for t in remote_thread_list:
        if not prm.launch_by_daemon:
            # pace starts so that we don't get ssh errors
            time.sleep(0.1)
        t.start()

    # wait for hosts to arrive at starting gate
    # the scan resumes from the first host whose ready file was not found
    # FIXME: for very large host sets,
    # timeout only if no host responds within X seconds

    exception_seen = None
    hosts_ready = False  # set scope outside while loop
    abortfn = master_invoke.abort_fn()
    last_host_seen = -1
    sec = 0.0
    sec_delta = 0.5
    host_timeout = prm.host_startup_timeout
    if smallfile.is_windows_os:
        host_timeout += 20

    try:
        while sec < host_timeout:
            # HACK to force directory entry coherency for Gluster
            ndirlist = os.listdir(master_invoke.network_dir)
            if verbose:
                print('shared dir list: ' + str(ndirlist))
            hosts_ready = True
            if os.path.exists(abortfn):
                raise Exception('worker host signaled abort')
            for j in range(last_host_seen + 1, host_ct):
                fn = master_invoke.gen_host_ready_fname(
                    prm_host_set[j].strip())
                if verbose:
                    print('checking for host filename ' + fn)
                if not os.path.exists(fn):
                    hosts_ready = False
                    break
                last_host_seen = j  # saw this host's ready file
                # we exit while loop only if no hosts in host_timeout seconds
                sec = 0.0
            if hosts_ready:
                break

            # if one of ssh threads has died, no reason to continue
            # (is_alive() replaces isAlive(), which Python 3.9 removed)

            dead_thread = next(
                (t for t in remote_thread_list if not t.is_alive()), None)
            if dead_thread is not None:
                # report the dead thread's own host, not the host whose
                # ready file happens to be the one being waited for
                print('thread %s on host %s has died'
                      % (dead_thread, dead_thread.remote_host))
                break

            # be patient for large tests: each poll sleeps one second
            # longer than the previous one

            time.sleep(sec_delta)
            sec += sec_delta
            sec_delta += 1
            if verbose:
                print('last_host_seen=%d sec=%d' % (last_host_seen, sec))
    except KeyboardInterrupt as e:
        print('saw SIGINT signal, aborting test')
        exception_seen = e
        # the interrupt may land after hosts_ready was optimistically set
        hosts_ready = False
    except Exception as e:
        exception_seen = e
        hosts_ready = False

    if not hosts_ready:
        smallfile.abort_test(abortfn, [])
        # the first host not yet seen is the one being waited for; derive
        # it from last_host_seen because the scan may never have run
        if last_host_seen + 1 < host_ct:
            print('ERROR: host %s did not reach starting gate'
                  % prm_host_set[last_host_seen + 1])
        if exception_seen is None:
            raise Exception('hosts did not reach starting gate ' +
                            'within %d seconds' % host_timeout)
        # the name bound by "except ... as e" is gone once the except
        # clause ends in Python 3, so report the saved exception instead
        print('saw exception %s, aborting test' % str(exception_seen))
    else:

        # ask all hosts to start the test
        # this is like firing the gun at the track meet

        try:
            sync_files.write_sync_file(starting_gate, 'hi')
            print('starting all threads by creating starting gate file %s' %
                  starting_gate)
        except IOError as e:
            print('error writing starting gate: %s' % os.strerror(e.errno))

    # wait for them to finish

    for t in remote_thread_list:
        t.join()
        if t.status != OK:
            print('ERROR: ssh thread for host %s completed with status %d' %
                  (t.remote_host, t.status))

    # attempt to aggregate results by reading pickle files
    # containing SmallfileWorkload instances
    # with counters and times that we need

    all_ok = NOTOK
    h = None  # keeps the IOError report below safe for an empty host set
    try:
        invoke_list = []
        one_shot_delay = True
        for h in prm_host_set:  # for each host in test

            # read results for each thread run in that host
            # from python pickle of the list of SmallfileWorkload objects

            pickle_fn = master_invoke.host_result_filename(h)
            if verbose:
                print('reading pickle file: %s' % pickle_fn)
            host_invoke_list = []
            try:
                if one_shot_delay and not os.path.exists(pickle_fn):

                    # all threads have joined already, they are done
                    # we allow > 1 sec
                    # for this (NFS) client to see other clients' files

                    time.sleep(1.2)
                    one_shot_delay = False
                # NOTE(review): pickle.load executes whatever the file
                # encodes; the shared directory must be trusted
                with open(pickle_fn, 'rb') as pickle_file:
                    host_invoke_list = pickle.load(pickle_file)
                if verbose:
                    print(' read %d invoke objects' % len(host_invoke_list))
                invoke_list.extend(host_invoke_list)
                ensure_deleted(pickle_fn)
            except IOError as e:
                # a missing result file is reported and skipped; any other
                # I/O error goes to the outer handler
                if e.errno != errno.ENOENT:
                    raise
                print('  pickle file %s not found' % pickle_fn)

        output_results.output_results(invoke_list, prm)
        all_ok = OK
    except IOError as e:
        print('host %s filename %s: %s' % (h, pickle_fn, str(e)))
    except KeyboardInterrupt as e:
        print('control-C signal seen (SIGINT)')
    except SMFResultException as e:
        print(str(e))

    sys.exit(all_ok)
Example #5
0
def run_multi_host_workload(prm):
    """Drive a multi-host test from the test-driver host.

    Writes ``prm`` as a pickle into the network sync directory, builds one
    remote ``smallfile_remote.py`` command per entry of ``prm.host_set``
    (run by a launcher thread on Windows, otherwise by an ssh thread),
    waits for every host's ready file, creates the starting gate file,
    joins the remote threads, then loads each host's result pickle and
    passes the combined list to ``output_results.output_results``.

    Args:
        prm: test parameter object; ``host_set``, ``permute_host_dirs``,
            ``master_invoke``, ``remote_pgm_dir``, ``host_startup_timeout``
            and ``thread_count`` are read from it.

    Raises:
        Exception: if the interpreter version is neither 2 nor 3 and the
            PYPY environment variable is unset, or if the hosts do not all
            become ready within the timeout and no exception was caught
            while waiting for them.

    This function does not return normally: it ends in ``sys.exit(OK)``
    when every remote thread reported ``OK`` and results were read without
    an I/O error or interrupt, ``sys.exit(NOTOK)`` otherwise.
    """

    prm_host_set = prm.host_set
    prm_permute_host_dirs = prm.permute_host_dirs
    master_invoke = prm.master_invoke

    starting_gate = master_invoke.starting_gate
    verbose = master_invoke.verbose

    # write the test parameters as a pickle in the network sync directory,
    # the same directory that is handed to each remote command below

    sync_files.create_top_dirs(master_invoke, True)
    pickle_fn = os.path.join(master_invoke.network_dir, 'param.pickle')
    sync_files.write_pickle(pickle_fn, prm)

    # choose the interpreter named in each remote command line

    if os.getenv('PYPY'):
        python_prog = os.getenv('PYPY')
    elif sys.version.startswith('2'):
        python_prog = 'python'
    elif sys.version.startswith('3'):
        python_prog = 'python3'
    else:
        raise Exception('unrecognized python version %s' % sys.version)

    # construct list of ssh threads to invoke in parallel

    remote_thread_list = []
    host_ct = len(prm_host_set)
    smf_remote_pgm = os.path.join(prm.remote_pgm_dir, 'smallfile_remote.py')
    base_remote_cmd = '%s %s --network-sync-dir %s ' % (
        python_prog, smf_remote_pgm, master_invoke.network_dir)
    for j, remote_host in enumerate(prm_host_set):
        # with permute_host_dirs each host is told to act as the next host
        # in the set, wrapping around at the end
        if prm_permute_host_dirs:
            as_host = prm_host_set[(j + 1) % host_ct]
        else:
            as_host = remote_host
        this_remote_cmd = base_remote_cmd + ' --as-host %s' % as_host
        if verbose:
            print(this_remote_cmd)
        if smallfile.is_windows_os:
            remote_thread = launcher_thread.launcher_thread(
                prm, remote_host, this_remote_cmd)
        else:
            remote_thread = ssh_thread.ssh_thread(remote_host,
                                                  this_remote_cmd)
        remote_thread_list.append(remote_thread)

    # start them (no pacing is applied between starts here)

    for t in remote_thread_list:
        t.start()

    # wait for hosts to arrive at starting gate
    # the scan resumes from the first host whose ready file was not found
    # FIXME: for very large host sets,
    # timeout only if no new hosts are seen within X seconds

    exception_seen = None
    hosts_ready = False  # set scope outside while loop
    abortfn = master_invoke.abort_fn()
    last_host_seen = -1
    sec = 0
    sec_delta = 0.5
    host_timeout = prm.host_startup_timeout
    if smallfile.is_windows_os:
        host_timeout += 20

    try:
        while sec < host_timeout:
            ndirlist = os.listdir(master_invoke.network_dir)
            if verbose:
                print('shared dir list: ' + str(ndirlist))
            hosts_ready = True
            if os.path.exists(abortfn):
                raise Exception('worker host signaled abort')
            for j in range(last_host_seen + 1, host_ct):
                fn = master_invoke.gen_host_ready_fname(
                    prm_host_set[j].strip())
                if verbose:
                    print('checking for host filename ' + fn)
                if not os.path.exists(fn):
                    hosts_ready = False
                    break
                last_host_seen = j
            if hosts_ready:
                break

            # be patient for large tests: each poll sleeps one second
            # longer than the previous one

            time.sleep(sec_delta)
            sec += sec_delta
            sec_delta += 1
            if verbose:
                print('last_host_seen=%d sec=%d' % (last_host_seen, sec))
    except KeyboardInterrupt as e:
        print('saw SIGINT signal, aborting test')
        exception_seen = e
        # the interrupt may land after hosts_ready was optimistically set
        hosts_ready = False
    except Exception as e:
        exception_seen = e
        hosts_ready = False

    if not hosts_ready:
        smallfile.abort_test(abortfn, [])
        if exception_seen is None:
            raise Exception(
                'hosts did not reach starting gate within %d seconds'
                % host_timeout)
        # the name bound by "except ... as e" is gone once the except
        # clause ends in Python 3, so report the saved exception instead
        print('saw exception %s, aborting test' % str(exception_seen))
    else:
        # ask all hosts to start the test
        # this is like firing the gun at the track meet
        try:
            sync_files.write_sync_file(starting_gate, 'hi')
            if verbose:
                print('starting gate file %s created' % starting_gate)
        except IOError as e:
            print('error writing starting gate: %s' % os.strerror(e.errno))

    # wait for them to finish

    all_ok = True
    for t in remote_thread_list:
        t.join()
        if t.status != OK:
            all_ok = False
            print('ERROR: ssh thread for host %s completed with status %d'
                  % (t.remote_host, t.status))

    # attempt to aggregate results by reading pickle files
    # containing smf_invocation instances with counters and times we need

    h = None  # keeps the IOError report below safe for an empty host set
    try:
        invoke_list = []
        for h in prm_host_set:  # for each host in test

            # read results for each thread run in that host
            # from python pickle of the list of smf_invocation objects

            pickle_fn = master_invoke.host_result_filename(h)
            if verbose:
                print('reading pickle file: %s' % pickle_fn)
            host_invoke_list = []
            try:
                if not os.path.exists(pickle_fn):
                    time.sleep(1.2)
                # NOTE(review): pickle.load executes whatever the file
                # encodes; the shared directory must be trusted
                with open(pickle_fn, 'rb') as pickle_file:
                    host_invoke_list = pickle.load(pickle_file)
                if verbose:
                    print(' read %d invoke objects' % len(host_invoke_list))
                invoke_list.extend(host_invoke_list)
                ensure_deleted(pickle_fn)
            except IOError as e:
                # a missing result file is reported and skipped; any other
                # I/O error goes to the outer handler
                if e.errno != errno.ENOENT:
                    raise
                print('  pickle file %s not found' % pickle_fn)

        # NOTE(review): pct_files_min is not defined in this function;
        # presumably a module-level name -- confirm
        output_results.output_results(invoke_list, prm_host_set,
                                      prm.thread_count, pct_files_min)

    except IOError as e:
        print('host %s filename %s: %s' % (h, pickle_fn, str(e)))
        all_ok = False
    except KeyboardInterrupt as e:
        print('control-C signal seen (SIGINT)')
        all_ok = False

    sys.exit(OK if all_ok else NOTOK)
def run_multi_thread_workload(prm):
    """Run this host's worker threads and report or save their results.

    Steps, in order:
      1. create the worker list, clear stale thread-ready files, start workers
      2. poll until every worker has created its thread-ready file
      3. when ``prm.is_slave`` is set, create the host-ready file and wait for
         the starting gate file; otherwise write the starting gate file here
      4. collect each worker's result object and join the worker
      5. when not a slave, print results via ``output_results``; when a slave,
         write the list of result objects to this host's result pickle file

    Parameters:
      prm -- test parameter object; this function reads ``master_invoke``,
             ``is_slave``, ``startup_timeout``, ``host_startup_timeout``,
             ``thread_count`` and ``as_host`` from it.

    Raises:
      Exception -- if the workers do not all reach the starting gate (either
                   the abort file appeared or ``startup_timeout`` expired), or
                   if a slave never sees the starting gate file within
                   ``host_startup_timeout`` seconds.
      SystemExit -- always, on normal completion, with status OK or NOTOK.
    """

    master_invoke = prm.master_invoke
    prm_slave = prm.is_slave
    verbose = master_invoke.verbose
    host = master_invoke.onhost

    if prm_slave:
        # NOTE(review): the sleep and the directory listings look like a way
        # to make this host's view of the shared directories current before
        # the test starts (the original comment here was truncated) -- confirm
        time.sleep(1.1)
        os.listdir(master_invoke.network_dir)
        for dlist in [master_invoke.src_dirs, master_invoke.dest_dirs]:
            for d in dlist:
                os.listdir(d)
                if verbose:
                    print(host + " saw " + d)
    else:
        sync_files.create_top_dirs(master_invoke, False)

    # for each thread set up smf_invocation instance,
    # create a thread instance, and delete the thread-ready file

    thread_list = create_worker_list(prm)
    my_host_invoke = thread_list[0].invoke

    # remove stale thread-ready files before any worker starts, so a file
    # seen below can only have been created during this run

    for t in thread_list:
        ensure_deleted(t.invoke.gen_thread_ready_fname(t.invoke.tid))
    for t in thread_list:
        t.start()
    if verbose:
        print("started %d worker threads on host %s" % (len(thread_list), host))

    # wait for all threads to reach the starting gate
    # this makes it more likely that they will start simultaneously

    abort_fname = my_host_invoke.abort_fn()
    threads_ready = False
    abort_seen = False
    # poll twice a second, so startup_timeout * 2 polls is startup_timeout sec
    for _ in range(prm.startup_timeout * 2):
        threads_ready = all(
            os.path.exists(t.invoke.gen_thread_ready_fname(t.invoke.tid))
            for t in thread_list)
        if threads_ready:
            break
        if os.path.exists(abort_fname):
            abort_seen = True
            break
        if verbose:
            print("threads not ready...")
        time.sleep(0.5)

    # if all threads didn't make it to the starting gate

    if not threads_ready:
        abort_test(abort_fname, thread_list)
        if abort_seen:
            # report the real cause instead of claiming a timeout
            raise Exception(
                "abort file %s seen before threads reached starting gate"
                % abort_fname)
        raise Exception("threads did not reach starting gate within %d sec" % prm.startup_timeout)

    # declare that this host is at the starting gate

    if prm_slave:
        host_ready_fn = my_host_invoke.gen_host_ready_fname()
        if my_host_invoke.verbose:
            print("host %s creating ready file %s" % (my_host_invoke.onhost, host_ready_fn))
        smallfile.touch(host_ready_fn)

    sg = my_host_invoke.starting_gate
    if not prm_slave:  # special case of no --host-set parameter
        try:
            sync_files.write_sync_file(sg, "hi there")
            if verbose:
                print("wrote starting gate file")
        except IOError as e:
            print("error writing starting gate for threads: %s" % str(e))

    # wait for starting_gate file to be created by test driver

    if verbose:
        print("awaiting " + sg)
    if prm_slave:
        # poll twice a second for up to host_startup_timeout seconds
        for _ in range(prm.host_startup_timeout * 2):
            if os.path.exists(sg):
                break
            time.sleep(0.5)
        if not os.path.exists(sg):
            abort_test(my_host_invoke.abort_fn(), thread_list)
            raise Exception("starting signal not seen within %d seconds" % prm.host_startup_timeout)
    if verbose:
        print("starting test on host " + host + " in 2 seconds")
    time.sleep(2 + random.random())

    # FIXME: don't timeout the test,
    # instead check thread progress and abort if you see any of them stalled
    # for long enough
    # problem is: if servers are heavily loaded you can't use filesystem to communicate this

    # wait for all threads on this host to finish

    for t in thread_list:
        if verbose:
            print("waiting for thread %s" % t.invoke.tid)
        t.invoke = t.receiver.recv()  # must do this to get results from sub-process
        t.join()

    exit_status = OK
    if not prm_slave:
        # not a slave of some other host, so print results (for this host)
        try:
            # FIXME: code to aggregate results from list of invoke objects can be shared by multi-host and single-host cases
            invoke_list = [t.invoke for t in thread_list]
            output_results.output_results(invoke_list, ["localhost"], prm.thread_count, smallfile.pct_files_min)
        except SMFResultException as e:
            print("ERROR: " + str(e))
            exit_status = NOTOK
    else:
        # if we are participating in a multi-host test
        # then write out this host's result in pickle format so test driver can pick up result

        result_filename = master_invoke.host_result_filename(prm.as_host)
        if verbose:
            print("writing invokes to: " + result_filename)
        invok_list = [t.invoke for t in thread_list]
        if verbose:
            print("saving result to filename %s" % result_filename)
        # clear the buffer attributes before the objects are pickled
        for ivk in invok_list:
            ivk.buf = None
            ivk.biggest_buf = None
        sync_files.write_pickle(result_filename, invok_list)
        time.sleep(1.2)  # for benefit of NFS with actimeo=1

    sys.exit(exit_status)
Example #7
0
def run_multi_host_workload(prm):
    """Drive a test across every host in ``prm.host_set`` and report results.

    Steps, in order:
      1. create the top-level directories and write ``prm`` to
         ``param.pickle`` in the network sync directory
      2. start one launcher thread per host (``launcher_thread`` on Windows,
         ``ssh_thread`` otherwise) running ``smallfile_remote.py``
      3. poll for every host's ready file, then write the starting gate file;
         on abort, timeout or interrupt call ``smallfile.abort_test`` instead
      4. join the launcher threads, read each host's result pickle file and
         pass the combined list to ``output_results``

    Parameters:
      prm -- test parameter object; this function reads ``host_set``,
             ``permute_host_dirs``, ``master_invoke``, ``remote_pgm_dir``,
             ``host_startup_timeout`` and ``thread_count`` from it.

    Raises:
      Exception -- if the running Python version is not recognized, or if the
                   hosts do not reach the starting gate within the timeout
                   and no other exception was seen while waiting.
      SystemExit -- always, on normal completion, with status OK or NOTOK.
    """

    prm_host_set = prm.host_set
    prm_permute_host_dirs = prm.permute_host_dirs
    master_invoke = prm.master_invoke

    starting_gate = master_invoke.starting_gate
    verbose = master_invoke.verbose

    # construct list of ssh threads to invoke in parallel

    sync_files.create_top_dirs(master_invoke, True)
    pickle_fn = os.path.join(prm.master_invoke.network_dir, 'param.pickle')
    sync_files.write_pickle(pickle_fn, prm)

    # choose the interpreter named in the remote command line

    if os.getenv('PYPY'):
        python_prog = os.getenv('PYPY')
    elif sys.version.startswith('2'):
        python_prog = 'python'
    elif sys.version.startswith('3'):
        python_prog = 'python3'
    else:
        raise Exception('unrecognized python version %s' % sys.version)

    remote_thread_list = []
    host_ct = len(prm_host_set)
    for j, remote_host in enumerate(prm_host_set):
        smf_remote_pgm = os.path.join(prm.remote_pgm_dir,
                                      'smallfile_remote.py')
        this_remote_cmd = '%s %s --network-sync-dir %s ' % \
            (python_prog, smf_remote_pgm, prm.master_invoke.network_dir)

        if prm_permute_host_dirs:
            # each host is told to act as the next host in the set
            # (wrapping around at the end)
            as_host = prm_host_set[(j + 1) % host_ct]
        else:
            as_host = remote_host
        this_remote_cmd += ' --as-host %s' % as_host
        if verbose:
            print(this_remote_cmd)
        if smallfile.is_windows_os:
            remote_thread_list.append(
                launcher_thread.launcher_thread(prm, remote_host,
                                                this_remote_cmd))
        else:
            remote_thread_list.append(
                ssh_thread.ssh_thread(remote_host, this_remote_cmd))

    # start them

    for t in remote_thread_list:
        t.start()

    # wait for hosts to arrive at starting gate
    # each pass resumes the scan from the last host file not found
    # FIXME: for very large host sets, timeout only if no host responds within X seconds

    exception_seen = None
    hosts_ready = False  # set scope outside while loop
    abortfn = master_invoke.abort_fn()
    last_host_seen = -1
    sec = 0
    sec_delta = 0.5
    host_timeout = prm.host_startup_timeout
    if smallfile.is_windows_os:
        host_timeout += 20

    try:
        # FIXME: make timeout criteria be that no new hosts seen in X seconds
        while sec < host_timeout:
            ndirlist = os.listdir(master_invoke.network_dir)
            if master_invoke.verbose:
                print('shared dir list: ' + str(ndirlist))
            hosts_ready = True
            if os.path.exists(abortfn):
                raise Exception('worker host signaled abort')
            for j in range(last_host_seen + 1, len(prm_host_set)):
                h = prm_host_set[j]
                fn = master_invoke.gen_host_ready_fname(h.strip())
                if verbose:
                    print('checking for host filename ' + fn)
                if not os.path.exists(fn):
                    hosts_ready = False
                    break
                last_host_seen = j
            if hosts_ready:
                break

            # be patient for large tests: the polling interval grows by one
            # second after every pass
            # give user some feedback about how many hosts have arrived at the starting gate

            time.sleep(sec_delta)
            sec += sec_delta
            sec_delta += 1
            if verbose:
                print('last_host_seen=%d sec=%d' % (last_host_seen, sec))
    except KeyboardInterrupt as e:
        print('saw SIGINT signal, aborting test')
        exception_seen = e
        # the interrupt can land right after hosts_ready was set to True at
        # the top of a pass; never fire the starting gate in that case
        hosts_ready = False
    except Exception as e:
        exception_seen = e
        hosts_ready = False

    if not hosts_ready:
        smallfile.abort_test(abortfn, [])
        if exception_seen is None:
            raise Exception(
                'hosts did not reach starting gate within %d seconds' %
                host_timeout)
        # the name bound by "except ... as e" no longer exists here on
        # Python 3, so report the saved exception object instead
        print('saw exception %s, aborting test' % str(exception_seen))
    else:
        # ask all hosts to start the test
        # this is like firing the gun at the track meet
        try:
            sync_files.write_sync_file(starting_gate, 'hi')
            if verbose:
                print('starting gate file %s created' % starting_gate)
        except IOError as e:
            print('error writing starting gate: %s' % os.strerror(e.errno))

    # wait for them to finish

    all_ok = True
    for t in remote_thread_list:
        t.join()
        if t.status != OK:
            all_ok = False
            print('ERROR: ssh thread for host %s completed with status %d' %
                  (t.remote_host, t.status))

    # attempt to aggregate results by reading pickle files
    # containing smf_invocation instances with counters and times that we need

    try:
        invoke_list = []
        for h in prm_host_set:  # for each host in test

            # read results for each thread run in that host
            # from python pickle of the list of smf_invocation objects

            pickle_fn = master_invoke.host_result_filename(h)
            if verbose:
                print('reading pickle file: %s' % pickle_fn)
            host_invoke_list = []
            try:
                # give the file one more chance to become visible
                if not os.path.exists(pickle_fn):
                    time.sleep(1.2)
                # NOTE(review): pickle.load executes whatever the file
                # describes; this is only safe if every writer to the shared
                # directory is trusted -- confirm
                with open(pickle_fn, 'rb') as pickle_file:
                    host_invoke_list = pickle.load(pickle_file)
                if verbose:
                    print(' read %d invoke objects' % len(host_invoke_list))
                invoke_list.extend(host_invoke_list)
                ensure_deleted(pickle_fn)
            except IOError as e:
                # a missing result file is reported and skipped;
                # any other I/O error goes to the outer handler
                if e.errno != errno.ENOENT:
                    raise
                print('  pickle file %s not found' % pickle_fn)

        output_results.output_results(invoke_list, prm_host_set,
                                      prm.thread_count, pct_files_min)

    except IOError as e:
        print('host %s filename %s: %s' % (h, pickle_fn, str(e)))
        all_ok = False
    except KeyboardInterrupt as e:
        print('control-C signal seen (SIGINT)')
        all_ok = False

    sys.exit(OK if all_ok else NOTOK)