def get_launch_command(self, basepath, processes, hostfile, runpath=None):
    """Get the process launch command used to run daos_perf.

    Args:
        basepath (str): DAOS base path
        processes (int): number of host processes
        hostfile (str): file defining host names and slots
        runpath (str, optional): optional run path (currently unused).
            Defaults to None.

    Raises:
        DaosPerfFailed: if the orterun executable cannot be found

    Returns:
        str: the daos_perf launch command
    """
    attach_info_path = os.path.join(basepath, "install/tmp")
    load_mpi('openmpi')
    orterun_bin = find_executable('orterun')
    if orterun_bin is None:
        raise DaosPerfFailed("orterun not found")
    orterun_cmd = [
        orterun_bin,
        "-np {}".format(processes),
        "--hostfile {}".format(hostfile),
        "--map-by node",
        "-x DAOS_SINGLETON_CLI=1",
        "-x CRT_ATTACH_INFO_PATH={}".format(attach_info_path),
    ]
    command = " ".join(orterun_cmd + [self.__str__()])
    return command
def configure_mpi(prereqs, env, libs, required=None):
    """Check if mpi exists and configure environment"""
    if env.subst("$MPI_PKG") != "":
        return _configure_mpi_pkg(env, libs)

    mpis = ['openmpi', 'mpich']
    if required is not None:
        if isinstance(required, str):
            mpis = [required]
        else:
            mpis = required

    for mpi in mpis:
        load_mpi(mpi)
        comp = mpi
        if mpi == "openmpi":
            comp = "ompi"
        if prereqs.check_component(comp):
            prereqs.require(env, comp)
            print("%s is installed" % mpi)
            libs.append('mpi')
            return comp
        print("No %s installed and/or loaded" % mpi)
    print("No OMPI installed")
    return None
def start_servers(self, server_groups=None):
    """Start the servers.

    Args:
        server_groups (dict, optional): dictionary of server host lists keyed
            by server group name. Defaults to None, which uses
            {self.server_group: self.hostlist_servers}.
    """
    if server_groups is None:
        server_groups = {self.server_group: self.hostlist_servers}

    if isinstance(server_groups, dict):
        # Optionally start servers on a different subset of hosts with a
        # different server group
        for group, hosts in server_groups.items():
            self.log.info(
                "Starting servers: group=%s, hosts=%s", group, hosts)
            self.server_managers.append(
                ServerManager(
                    self.bin, os.path.join(self.ompi_prefix, "bin")))
            self.server_managers[-1].get_params(self)
            self.server_managers[-1].runner.job.yaml_params.name = group
            self.server_managers[-1].hosts = (
                hosts, self.workdir, self.hostfile_servers_slots)
            if self.prefix != "/usr":
                if self.server_managers[-1].runner.export.value is None:
                    self.server_managers[-1].runner.export.value = []
                self.server_managers[-1].runner.export.value.extend(["PATH"])

            load_mpi("orterun")
            yamlfile = os.path.join(self.tmp, "daos_avocado_test.yaml")
            try:
                self.server_managers[-1].start(yamlfile)
            except ServerFailed as error:
                self.multi_log(" {}".format(error))
                self.fail("Error starting server: {}".format(error))
def __init__(self, job, subprocess=False):
    """Create an Orterun object.

    Args:
        job (ExecutableCommand): command object to manage.
        subprocess (bool, optional): whether the command is run as a
            subprocess. Defaults to False.
    """
    load_mpi("openmpi")
    path = os.path.dirname(find_executable("orterun"))
    super(Orterun, self).__init__(
        "/run/orterun/*", "orterun", job, path, subprocess)

    # Default mca values to avoid queue pair errors
    mca_default = {
        "btl_openib_warn_default_gid_prefix": "0",
        "btl": "tcp,self",
        "oob": "tcp",
        "pml": "ob1",
    }

    self.hostfile = FormattedParameter("--hostfile {}", None)
    self.processes = FormattedParameter("--np {}", 1)
    self.display_map = FormattedParameter("--display-map", False)
    self.map_by = FormattedParameter("--map-by {}", "node")
    self.export = FormattedParameter("-x {}", None)
    self.enable_recovery = FormattedParameter("--enable-recovery", True)
    self.report_uri = FormattedParameter("--report-uri {}", None)
    self.allow_run_as_root = FormattedParameter("--allow-run-as-root", None)
    self.mca = FormattedParameter("--mca {}", mca_default)
    self.pprnode = FormattedParameter("--map-by ppr:{}:node", None)
    self.tag_output = FormattedParameter("--tag-output", True)
    self.ompi_server = FormattedParameter("--ompi-server {}", None)
    self.working_dir = FormattedParameter("-wdir {}", None)
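# Illustrative usage sketch (not from the source): a minimal example of
# wrapping a command object with Orterun and launching it. The job object,
# hostfile path, and process count below are hypothetical assumptions; the
# FormattedParameter values are set via their update() method.
job = ExecutableCommand("/run/test_command/*", "daos_perf")  # hypothetical job
orterun = Orterun(job)
orterun.hostfile.update("/path/to/hostfile")  # assumed hostfile location
orterun.processes.update(4)                   # launch 4 processes
orterun.run()                                 # fails if openmpi cannot be loaded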
def __init__(self, job, subprocess=False, mpitype="openmpi"):
    """Create a Mpirun object.

    Args:
        job (ExecutableCommand): command object to manage.
        subprocess (bool, optional): whether the command is run as a
            subprocess. Defaults to False.
        mpitype (str, optional): MPI implementation to load, e.g. "openmpi"
            or "mpich". Defaults to "openmpi".
    """
    load_mpi(mpitype)
    path = os.path.dirname(find_executable("mpirun"))
    super(Mpirun, self).__init__(
        "/run/mpirun", "mpirun", job, path, subprocess)

    mca_default = None
    if mpitype == "openmpi":
        # Default mca values to avoid queue pair errors w/ OpenMPI
        mca_default = {
            "btl_openib_warn_default_gid_prefix": "0",
            "btl": "tcp,self",
            "oob": "tcp",
            "pml": "ob1",
        }

    self.hostfile = FormattedParameter("-hostfile {}", None)
    self.processes = FormattedParameter("-np {}", 1)
    self.ppn = FormattedParameter("-ppn {}", None)
    self.envlist = FormattedParameter("-envlist {}", None)
    self.mca = FormattedParameter("--mca {}", mca_default)
    self.working_dir = FormattedParameter("-wdir {}", None)
    self.mpitype = mpitype
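# Illustrative usage sketch (not from the source): the same pattern with
# Mpirun, selecting mpich instead of the default openmpi. The job object,
# hostfile path, and process count below are hypothetical assumptions.
job = ExecutableCommand("/run/test_command/*", "ior")  # hypothetical job
mpirun = Mpirun(job, mpitype="mpich")
mpirun.hostfile.update("/path/to/hostfile")  # assumed hostfile location
mpirun.processes.update(8)                   # launch 8 processes
mpirun.run()                                 # fails if mpich cannot be loaded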
def mpich_installed(self, hostlist):
    """Check if mpich is installed.

    Args:
        hostlist (list): list of hosts

    Returns:
        bool: whether mpich is installed on the first host in the list
    """
    load_mpi('mpich')

    # checking mpich install
    cmd = "set -e; " \
          "export MODULEPATH=/usr/share/modules:/etc/modulefiles; " \
          "for mod in mpi/mpich-x86_64 gnu-mpich; do " \
          "if module is-avail $mod >/dev/null 2>&1; then " \
          "module load $mod >/dev/null 2>&1; " \
          "break; " \
          "fi; " \
          "done; " \
          "command -v mpichversion"
    cmd = '/usr/bin/ssh {} {}'.format(hostlist[0], cmd)
    try:
        result = run_command(cmd)
        self.mpichinstall = \
            result.stdout.rstrip()[:-len('bin/mpichversion')]
        return True
    except DaosTestError as excep:
        print("Mpich not installed \n {}".format(excep))
        return False
def write_file(self, orterun, processes, hostfile, size, timeout=60):
    """Write a file to the pool.

    Args:
        orterun (str): full path to the orterun command
        processes (int): number of processes to launch
        hostfile (str): hostfile defining the clients from which to write
            the file
        size (int): size of the file to create in bytes
        timeout (int, optional): number of seconds before timing out the
            command. Defaults to 60 seconds.

    Returns:
        process.CmdResult: command execution result
    """
    self.log.info("Writing %s bytes to pool %s", size, self.uuid)
    env = {
        "DAOS_POOL": self.uuid,
        "DAOS_SVCL": "1",
        "PYTHONPATH": os.getenv("PYTHONPATH", "")
    }
    load_mpi("openmpi")
    current_path = os.path.dirname(os.path.abspath(__file__))
    command = "{} --np {} --hostfile {} {} {} testfile".format(
        orterun, processes, hostfile,
        os.path.join(current_path, "write_some_data.py"), size)
    return run_command(command, timeout, True, env=env)
def run(self):
    """Run the mpirun command.

    Raises:
        CommandFailure: if there is an error running the command
    """
    load_mpi(self.mpitype)
    return super(Mpirun, self).run()
def run(self):
    """Run the orterun command.

    Raises:
        CommandFailure: if there is an error running the command
    """
    load_mpi("openmpi")
    return super(Orterun, self).run()
def setUp(self):
    """Set up run before each test."""
    super(TestWithoutServers, self).setUp()

    load_mpi('openmpi')
    self.orterun = find_executable('orterun')
    if self.orterun is None:
        self.fail("Could not find orterun")

    # hardware tests segfault in MPI_Init without this option
    self.client_mca = "--mca btl_openib_warn_default_gid_prefix 0"
    self.client_mca += " --mca pml ob1"
    self.client_mca += " --mca btl tcp,self"
    self.client_mca += " --mca oob tcp"
    self.ompi_prefix = os.path.dirname(os.path.dirname(self.orterun))

    # get paths from the build_vars generated by build
    with open('../../.build_vars.json') as build_vars:
        build_paths = json.load(build_vars)
    self.basepath = os.path.normpath(
        os.path.join(build_paths['PREFIX'], '..') + os.path.sep)
    self.prefix = build_paths['PREFIX']
    try:
        self.ofi_prefix = build_paths['OFI_PREFIX']
    except KeyError:
        self.ofi_prefix = "/usr"
    self.bin = os.path.join(self.prefix, 'bin')
    self.daos_test = os.path.join(self.prefix, 'bin', 'daos_test')

    # set default shared dir for daos tests in case DAOS_TEST_SHARED_DIR
    # is not set, for RPM env and non-RPM env.
    if self.prefix != "/usr":
        self.tmp = os.path.join(self.prefix, 'tmp')
    else:
        self.tmp = os.getenv(
            'DAOS_TEST_SHARED_DIR', os.path.expanduser('~/daos_test'))
    if not os.path.exists(self.tmp):
        os.makedirs(self.tmp)

    # setup fault injection, this MUST be before API setup
    fault_list = self.params.get("fault_list", '/run/faults/*')
    if fault_list:
        # not using workdir because the huge path was messing up
        # orterun or something, could re-evaluate this later
        self.fault_file = fault_config_utils.write_fault_file(
            self.tmp, fault_list, None)
        os.environ["D_FI_CONFIG"] = self.fault_file

    self.context = DaosContext(self.prefix + '/lib64/')
    self.d_log = DaosLog(self.context)
    self.test_log.daos_log = self.d_log
def run_test(self, test_repo, test_name):
    """Execute the named test; used by the test functions below.

    Args:
        test_repo (str): absolute or relative (to self.mpichinstall)
            location of the test repository
        test_name (str): name of the test to be run
    """
    # Required to run daos command
    load_mpi("openmpi")

    # Create pool
    self.add_pool(connect=False)

    # create container
    self.add_container(self.pool)

    # initialize MpioUtils
    self.mpio = MpioUtils()
    if not self.mpio.mpich_installed(self.hostlist_clients):
        self.fail("Exiting Test: Mpich not installed")

    # fix up a relative test_repo specification
    if test_repo[0] != '/':
        test_repo = os.path.join(self.mpio.mpichinstall, test_repo)

    # initialize test specific variables
    client_processes = self.params.get("np", '/run/client_processes/')

    try:
        # running tests
        result = self.mpio.run_mpiio_tests(
            self.hostfile_clients, self.pool.uuid, test_repo, test_name,
            client_processes, self.container.uuid)
    except MpioFailed as excep:
        self.fail("<{0} Test Failed> \n{1}".format(test_name, excep))

    # Check output for errors
    for output in (result.stdout_text, result.stderr_text):
        match = re.findall(
            r"(non-zero exit code|MPI_Abort|MPI_ABORT|ERROR)", output)
        if match:
            self.log.info(
                "The following error messages have been detected in the %s "
                "output:", test_name)
            for item in match:
                self.log.info(" %s", item)
            self.fail(
                "Error messages detected in {} output".format(test_name))
def mpich_installed(self, hostlist):
    """Check if mpich is installed"""
    load_mpi('mpich')
    try:
        # checking mpich install
        self.mpichinstall = subprocess.check_output(
            ["ssh", hostlist[0], "command -v mpichversion"]
        ).rstrip()[:-len('bin/mpichversion')]
        return True
    except subprocess.CalledProcessError as excep:
        print("Mpich not installed \n {}".format(excep))
        return False
def get_environment(self, manager, log_file=None):
    """Get the environment variables to export for the daos_racer command.

    Args:
        manager (DaosServerManager): the job manager used to start
            daos_server from which the server config values can be obtained
            to set the required environment variables.
        log_file (str, optional): log file name passed to the base class
            environment setup. Defaults to None.

    Raises:
        MPILoadError: if the openmpi environment cannot be loaded

    Returns:
        EnvironmentVariables: a dictionary of environment variable names and
            values to export prior to running daos_racer
    """
    env = super().get_environment(manager, log_file)
    env["OMPI_MCA_btl_openib_warn_default_gid_prefix"] = "0"
    env["OMPI_MCA_btl"] = "tcp,self"
    env["OMPI_MCA_oob"] = "tcp"
    env["OMPI_MCA_pml"] = "ob1"
    env["D_LOG_MASK"] = "ERR"
    if not load_mpi("openmpi"):
        raise MPILoadError("openmpi")
    env["LD_LIBRARY_PATH"] = os.environ["LD_LIBRARY_PATH"]
    return env
def run_subtest(self):
    """Run daos_test with a subtest argument."""
    subtest = self.params.get("daos_test", self.TEST_PATH)
    num_clients = self.params.get(
        "num_clients", '/run/daos_tests/num_clients/*')
    num_replicas = self.params.get(
        "num_replicas", '/run/daos_tests/num_replicas/*')
    scm_size = self.params.get("scm_size", '/run/pool/*')
    nvme_size = self.params.get("nvme_size", '/run/pool/*')
    args = self.params.get("args", self.TEST_PATH, "")

    dmg = self.get_dmg_command()
    dmg_config_file = dmg.yaml.filename

    cmd = " ".join(
        [
            self.orterun,
            self.client_mca,
            "-n", str(num_clients),
            "-x", "=".join(["D_LOG_FILE", get_log_file(self.client_log)]),
            "-x", "D_LOG_MASK=DEBUG",
            "-x", "DD_MASK=mgmt,io,md,epc,rebuild",
            self.daos_test,
            "-s", str(num_replicas),
            "-n", dmg_config_file,
            "".join(["-", subtest]),
            str(args)
        ]
    )

    env = {}
    env['CMOCKA_XML_FILE'] = os.path.join(self.outputdir, "%g_results.xml")
    env['CMOCKA_MESSAGE_OUTPUT'] = "xml"
    env['POOL_SCM_SIZE'] = "{}".format(scm_size)
    if not nvme_size:
        nvme_size = 0
    env['POOL_NVME_SIZE'] = "{}".format(nvme_size)

    load_mpi("openmpi")
    try:
        process.run(cmd, env=env)
    except process.CmdError as result:
        if result.result.exit_status != 0:
            # fake a JUnit failure output
            self.create_results_xml(self.subtest_name, result)
            self.fail(
                "{0} failed with return code={1}.\n".format(
                    cmd, result.result.exit_status))
def run_subtest(self):
    """Run daos_test with a subtest argument."""
    subtest = self.params.get("daos_test", '/run/daos_tests/Tests/*')
    num_clients = self.params.get(
        "num_clients", '/run/daos_tests/num_clients/*')
    num_replicas = self.params.get(
        "num_replicas", '/run/daos_tests/num_replicas/*')
    scm_size = self.params.get("scm_size", '/run/pool/*')
    args = self.params.get("args", '/run/daos_tests/Tests/*', "")

    cmd = "{} {} -n {} -x D_LOG_FILE={} \
        -x D_LOG_MASK=DEBUG -x DD_MASK=mgmt,io,md,epc,rebuild \
        {} -s {} -n {} {}".format(
            self.orterun, self.client_mca, num_clients,
            get_log_file(self.client_log), self.daos_test, num_replicas,
            subtest, args)

    env = {}
    env['CMOCKA_XML_FILE'] = "%g_results.xml"
    env['CMOCKA_MESSAGE_OUTPUT'] = "xml"
    env['POOL_SCM_SIZE'] = "{}".format(scm_size)

    load_mpi("openmpi")
    try:
        process.run(cmd, env=env)
    except process.CmdError as result:
        if result.result.exit_status != 0:
            # fake a JUnit failure output
            with open(self.subtest_name + "_results.xml",
                      "w") as results_xml:
                results_xml.write('''<?xml version="1.0" encoding="UTF-8"?>
<testsuite name="{0}" errors="1" failures="0" skipped="0" tests="1" time="0.0">
  <testcase name="ALL" time="0.0" >
    <error message="Test failed to start up"/>
    <system-out>
<![CDATA[{1}]]>
    </system-out>
    <system-err>
<![CDATA[{2}]]>
    </system-err>
  </testcase>
</testsuite>'''.format(self.subtest_name, result.result.stdout,
                       result.result.stderr))
            self.fail("{0} failed with return code={1}.\n".format(
                cmd, result.result.exit_status))
def __init__(self, job, subprocess=False, mpitype="openmpi"):
    """Create a Mpirun object.

    Args:
        job (ExecutableCommand): command object to manage.
        subprocess (bool, optional): whether the command is run as a
            subprocess. Defaults to False.
        mpitype (str, optional): MPI implementation to load. Defaults to
            "openmpi".
    """
    load_mpi(mpitype)
    path = os.path.dirname(find_executable("mpirun"))
    super(Mpirun, self).__init__(
        "/run/mpirun", "mpirun", job, path, subprocess)

    self.hostfile = FormattedParameter("-hostfile {}", None)
    self.processes = FormattedParameter("-np {}", 1)
    self.ppn = FormattedParameter("-ppn {}", None)
    self.envlist = FormattedParameter("-envlist {}", None)
    self.mpitype = mpitype
def run_test(self, test_repo, test_name):
    """Execute the named test; used by the test functions below.

    Args:
        test_repo (str): location of the test repository
        test_name (str): name of the test to be run
    """
    # Required to run daos command
    load_mpi("openmpi")

    # create container
    self._create_cont()

    # initialize MpioUtils
    self.mpio = MpioUtils()
    if not self.mpio.mpich_installed(self.hostlist_clients):
        self.fail("Exiting Test: Mpich not installed")

    # initialize test specific variables
    client_processes = self.params.get("np", '/run/client_processes/')

    try:
        # running tests
        self.mpio.run_mpiio_tests(
            self.hostfile_clients, self.pool.uuid, self.pool.svc_ranks,
            test_repo, test_name, client_processes, self.cont_uuid)
    except MpioFailed as excep:
        self.fail("<{0} Test Failed> \n{1}".format(test_name, excep))

    # Parsing output to look for failures
    # stderr directed to stdout
    stdout = os.path.join(self.logdir, "stdout")
    error_message = [
        "non-zero exit code", "MPI_Abort", "MPI_ABORT", "ERROR"]
    with open(stdout, "r") as searchfile:
        for line in searchfile:
            for error in error_message:
                if error in line:
                    self.fail(
                        "Test Failed with error_message: {}".format(error))
def run(self):
    """Run the mpirun command.

    Raises:
        CommandFailure: if there is an error running the command
    """
    if not load_mpi(self.mpitype):
        raise CommandFailure("Failed to load {}".format(self.mpitype))
    return super(Mpirun, self).run()
def run(self):
    """Run the orterun command.

    Raises:
        CommandFailure: if there is an error running the command
    """
    if not load_mpi("openmpi"):
        raise CommandFailure("Failed to load openmpi")
    return super().run()
def run(self):
    """Run the mpirun command.

    Raises:
        CommandFailure: if there is an error running the command
    """
    if not load_mpi(self.mpi_type):
        raise MPILoadError(self.mpi_type)
    return super().run()
def mpich_installed(self, hostlist):
    """Check if mpich is installed.

    Args:
        hostlist (list): list of hosts

    Returns:
        bool: whether mpich is installed on the first host in the list
    """
    load_mpi('mpich')
    try:
        # checking mpich install
        cmd = "/usr/bin/ssh {} command -v mpichversion".format(hostlist[0])
        result = run_command(cmd)
        self.mpichinstall = \
            result.stdout.rstrip()[:-len('bin/mpichversion')]
        return True
    except DaosTestError as excep:
        print("Mpich not installed \n {}".format(excep))
        return False
def run_simul(self, include=None, exclude=None, raise_exception=True):
    """Run simul.

    If the include value is set, the exclude value is ignored and vice versa.

    Args:
        include (str, optional): comma-separated list of tests to include.
        exclude (str, optional): comma-separated list of tests to exclude.
        raise_exception (bool, optional): whether to raise an exception if
            the command fails. Defaults to True.
    """
    mpi_type = self.params.get("mpi_type", "/run/*", "")
    simul_path = self.params.get("simul_path", "/run/*", "")

    # Create a pool
    self.log.info("Create a pool")
    self.add_pool()

    # Create a container
    self.log.info("Create container")
    self.add_container(self.pool)

    # Setup dfuse
    dfuse_hosts = self.agent_managers[0].hosts
    dfuse_mount_dir = self.params.get("mount_dir", '/run/dfuse/*')
    self.start_dfuse(dfuse_hosts, self.pool, self.container)
    self.dfuse.check_running()

    # The use of MPI here is to run in parallel all simul tests on a
    # single host.
    if not load_mpi(mpi_type):
        raise MPILoadError(mpi_type)

    # Run simul
    simul_cmd = os.path.join(simul_path, "simul")
    if include and not exclude:
        cmd = "{0} -vv -d {1} -i {2}".format(
            simul_cmd, dfuse_mount_dir, include)
    elif exclude and not include:
        cmd = "{0} -vv -d {1} -e {2}".format(
            simul_cmd, dfuse_mount_dir, exclude)
    else:
        self.fail(
            "##Both include and exclude tests are selected, or both are "
            "empty.")

    self.log.info("Running simul on %s", mpi_type)
    try:
        result = run_command(
            cmd, output_check="combined", raise_exception=raise_exception)
    finally:
        self.stop_dfuse()

    return result
def configure_mpi(env):
    """Check if mpi exists and configure environment"""
    if GetOption('help'):
        return True

    env['CXX'] = None

    if env.subst("$MPI_PKG") != "":
        return _configure_mpi_pkg(env)

    for mpi in ['openmpi', 'mpich']:
        if not load_mpi(mpi):
            continue
        if _find_mpicc(env):
            print("%s is installed" % mpi)
            return True
        print("No %s installed and/or loaded" % mpi)
    print("No MPI installed")
    return False
def configure_mpi(env, libs, required=None):
    """Check if mpi exists and configure environment"""
    if env.subst("$MPI_PKG") != "":
        return _configure_mpi_pkg(env, libs)

    mpis = ['openmpi', 'mpich']
    if required is not None:
        if isinstance(required, str):
            mpis = [required]
        else:
            mpis = required

    for mpi in mpis:
        if not load_mpi(mpi):
            continue
        comp = mpi
        if mpi == "openmpi":
            comp = "ompi"
        if _find_mpicc(env):
            print("%s is installed" % mpi)
            return comp
        print("No %s installed and/or loaded" % mpi)
    print("No MPI installed")
    return None
elif "No such process" in e: print("The daos_server process is no longer available" " and could not be killed.") else: print("Unable to shut down DAOS server: {}".format(e)) if __name__ == "__main__": """ Start a DAOS server and then run the four stages of the client. """ print("Running rdb tests") rc = 0 binfo = BuildInfo(os.path.join(build_root, ".build_vars.json")); debug_cmds = "-x D_LOG_MASK=DEBUG,RPC=ERR,MEM=ERR " + \ "-x DD_SUBSYS=all -x DD_MASK=all" load_mpi('openmpi') orterun = find_executable('orterun') if orterun is None: raise ServerFailedToStart("No orterun installed") try: # Server operations p = start_server(binfo, orterun) counter = 0 daos_server = daos_server_pid() while daos_server is None: if counter >= 120: raise ServerTimedOut("No DAOS server process detected before "\ "timeout") counter += 1
print("The daos_server process is no longer available" " and could not be killed.") else: print("Unable to shut down DAOS server: {}".format(e)) if __name__ == "__main__": """ Start a DAOS server and then run the four stages of the client. """ print("Running rdb tests") rc = 0 binfo = BuildInfo(os.path.join(build_root, ".build_vars.json")) debug_cmds = "-x D_LOG_MASK=DEBUG,RPC=ERR,MEM=ERR " + \ "-x DD_SUBSYS=all -x DD_MASK=all" if not load_mpi('openmpi'): raise ServerFailedToStart("No orterun installed") orterun = find_executable('orterun') if orterun is None: raise ServerFailedToStart("No orterun installed") try: # Server operations p = start_server(binfo, orterun) counter = 0 daos_server = daos_server_pid() while daos_server is None: if counter >= 120: raise ServerTimedOut("No DAOS server process detected before "\ "timeout")
def run_subtest(self):
    """Run daos_test with a subtest argument."""
    subtest = self.get_test_param("daos_test")
    num_clients = self.get_test_param("num_clients")
    if num_clients is None:
        num_clients = self.params.get("num_clients", '/run/daos_tests/*')

    scm_size = self.params.get("scm_size", '/run/pool/*')
    nvme_size = self.params.get("nvme_size", '/run/pool/*')
    args = self.get_test_param("args", "")
    stopped_ranks = self.get_test_param("stopped_ranks", [])
    dmg = self.get_dmg_command()
    dmg_config_file = dmg.yaml.filename
    if self.hostlist_clients:
        dmg.copy_certificates(
            get_log_file("daosCA/certs"), self.hostlist_clients)
        dmg.copy_configuration(self.hostlist_clients)

    self.client_mca += " --mca btl_tcp_if_include eth0"

    cmd = " ".join(
        [
            self.orterun,
            self.client_mca,
            "-n", str(num_clients),
            "--hostfile", self.hostfile_clients,
            "-x", "=".join(["D_LOG_FILE", get_log_file(self.client_log)]),
            "--map-by node",
            "-x", "D_LOG_MASK=DEBUG",
            "-x", "DD_MASK=mgmt,io,md,epc,rebuild",
            self.daos_test,
            "-n", dmg_config_file,
            "".join(["-", subtest]),
            str(args)
        ]
    )

    env = {}
    env['CMOCKA_XML_FILE'] = os.path.join(
        self.outputdir, "%g_cmocka_results.xml")
    env['CMOCKA_MESSAGE_OUTPUT'] = "xml"
    env['POOL_SCM_SIZE'] = "{}".format(scm_size)
    if not nvme_size:
        nvme_size = 0
    env['POOL_NVME_SIZE'] = "{}".format(nvme_size)

    if not load_mpi("openmpi"):
        self.fail("Failed to load openmpi")

    # Update the expected status for each rank that will be stopped by this
    # test to avoid a false failure during tearDown().
    if "random" in stopped_ranks:
        # Set each expected rank state to be either stopped or running
        for manager in self.server_managers:
            manager.update_expected_states(
                None, ["Joined", "Stopped", "Evicted"])
    else:
        # Set the specific expected rank state to stopped
        for rank in stopped_ranks:
            for manager in self.server_managers:
                manager.update_expected_states(
                    rank, ["Stopped", "Evicted"])

    try:
        process.run(cmd, env=env)
    except process.CmdError as result:
        if result.result.exit_status != 0:
            # fake a JUnit failure output
            self.create_results_xml(self.subtest_name, result)
            self.fail("{0} failed with return code={1}.\n".format(
                cmd, result.result.exit_status))
def run_server(test, hostfile, setname, uri_path=None, env_dict=None,
               clean=True):
    # pylint: disable=unused-argument
    """Launch DAOS servers in accordance with the supplied hostfile.

    Args:
        test (Test): avocado Test object
        hostfile (str): hostfile defining on which hosts to start servers
        setname (str): session name
        uri_path (str, optional): path to uri file. Defaults to None.
        env_dict (dict, optional): dictionary of env variable names and
            values. Defaults to None.
        clean (bool, optional): clean the mount point. Defaults to True.

    Raises:
        ServerFailed: if there is an error starting the servers
    """
    global SESSIONS    # pylint: disable=global-variable-not-assigned
    try:
        servers = (
            [line.split(' ')[0] for line in genio.read_all_lines(hostfile)])
        server_count = len(servers)

        # Pile of build time variables
        with open("../../.build_vars.json") as json_vars:
            build_vars = json.load(json_vars)

        # Create the DAOS server configuration yaml file to pass
        # with daos_server -o <FILE_NAME>
        print("Creating the server yaml file in {}".format(test.tmp))
        server_yaml = os.path.join(test.tmp, AVOCADO_FILE)
        server_config = DaosServerConfig()
        server_config.get_params(test)
        access_points = ":".join((servers[0], str(server_config.port)))
        server_config.access_points.value = access_points.split()
        server_config.update_log_files(
            getattr(test, "control_log"), getattr(test, "helper_log"),
            getattr(test, "server_log"))
        server_config.create_yaml(server_yaml)

        # first make sure there are no existing servers running
        print("Removing any existing server processes")
        kill_server(servers)

        # clean the tmpfs on the servers
        if clean:
            print("Cleaning the server tmpfs directories")
            result = pcmd(
                servers,
                "find /mnt/daos -mindepth 1 -maxdepth 1 -print0 | "
                "xargs -0r rm -rf",
                verbose=False)
            if len(result) > 1 or 0 not in result:
                raise ServerFailed(
                    "Error cleaning tmpfs on servers: {}".format(
                        ", ".join(
                            [str(result[key]) for key in result
                             if key != 0])))

        load_mpi('openmpi')
        orterun_bin = find_executable('orterun')
        if orterun_bin is None:
            raise ServerFailed("Can't find orterun")

        server_cmd = [orterun_bin, "--np", str(server_count)]
        server_cmd.extend(
            ["--mca", "btl_openib_warn_default_gid_prefix", "0"])
        server_cmd.extend(["--mca", "btl", "tcp,self"])
        server_cmd.extend(["--mca", "oob", "tcp"])
        server_cmd.extend(["--mca", "pml", "ob1"])
        server_cmd.extend(["--hostfile", hostfile])
        server_cmd.extend(["--enable-recovery", "--tag-output"])

        # Add any user supplied environment
        if env_dict is not None:
            for key, value in env_dict.items():
                os.environ[key] = value
                server_cmd.extend(["-x", "{}={}".format(key, value)])

        # the remote orte needs to know where to find daos, in the
        # case that it's not in the system prefix
        # but it should already be in our PATH, so just pass our
        # PATH along to the remote
        if build_vars["PREFIX"] != "/usr":
            server_cmd.extend(["-x", "PATH"])

        # Run server in insecure mode until Certificate tests are in place
        server_cmd.extend(
            [os.path.join(build_vars["PREFIX"], "bin", "daos_server"),
             "--debug",
             "--config", server_yaml,
             "start", "-i", "--recreate-superblocks"])

        print("Start CMD>>>>{0}".format(' '.join(server_cmd)))

        resource.setrlimit(
            resource.RLIMIT_CORE,
            (resource.RLIM_INFINITY, resource.RLIM_INFINITY))

        SESSIONS[setname] = subprocess.Popen(server_cmd,
                                             stdout=subprocess.PIPE,
                                             stderr=subprocess.PIPE)
        fdesc = SESSIONS[setname].stdout.fileno()
        fstat = fcntl.fcntl(fdesc, fcntl.F_GETFL)
        fcntl.fcntl(fdesc, fcntl.F_SETFL, fstat | os.O_NONBLOCK)
        timeout = 600
        start_time = time.time()
        matches = 0
        pattern = "DAOS I/O server.*started"
expected_data = "Starting Servers\n" while True: output = "" try: output = SESSIONS[setname].stdout.read() except IOError as excpn: if excpn.errno != errno.EAGAIN: raise ServerFailed("Server didn't start: {}".format(excpn)) continue match = re.findall(pattern, output) expected_data += output matches += len(match) if not output or matches == server_count or \ time.time() - start_time > timeout: print("<SERVER>: {}".format(expected_data)) if matches != server_count: raise ServerFailed("Server didn't start!") break print("<SERVER> server started and took {} seconds to start".format( time.time() - start_time)) except Exception as error: print("<SERVER> Exception occurred: {0}".format(str(error))) traceback.print_exception(error.__class__, error, sys.exc_info()[2]) # We need to end the session now -- exit the shell try: SESSIONS[setname].send_signal(signal.SIGINT) time.sleep(5) # get the stderr error = SESSIONS[setname].stderr.read() if SESSIONS[setname].poll() is None: SESSIONS[setname].kill() retcode = SESSIONS[setname].wait() print("<SERVER> server start return code: {}\nstderr:\n{}".format( retcode, error)) except KeyError: pass raise ServerFailed("Server didn't start!")