Ejemplo n.º 1
0
    def get_launch_command(self, basepath, processes, hostfile, runpath=None):
        """Build the orterun command line used to launch daos_perf.

        Args:
            basepath (str): DAOS base path
            processes (int): number of host processes
            hostfile (str): file defining host names and slots
            runpath (str, optional): not used by this method. Defaults to
                None.

        Returns:
            str: the orterun invocation followed by the string form of this
                object (the daos_perf command)

        Raises:
            DaosPerfFailed: if the orterun executable cannot be found

        """
        # Value exported to the processes as CRT_ATTACH_INFO_PATH
        attach_info_path = os.path.join(basepath, "install/tmp")

        load_mpi('openmpi')
        launcher = find_executable('orterun')
        if launcher is None:
            raise DaosPerfFailed("orterun not found")

        launcher_args = " ".join((
            launcher,
            "-np {}".format(processes),
            "--hostfile {}".format(hostfile),
            "--map-by node",
            "-x DAOS_SINGLETON_CLI=1",
            "-x CRT_ATTACH_INFO_PATH={}".format(attach_info_path),
        ))
        return " ".join((launcher_args, self.__str__()))
Ejemplo n.º 2
0
def configure_mpi(prereqs, env, libs, required=None):
    """Find an installed MPI and configure the build environment for it.

    When $MPI_PKG is set in the environment the work is delegated to
    _configure_mpi_pkg(). Otherwise each candidate MPI is loaded in turn and
    the first one whose component check succeeds is required.

    Args:
        prereqs: prerequisite handler providing check_component() and
            require()
        env: build environment
        libs (list): library list; 'mpi' is appended when an MPI is found
        required (str or list, optional): MPI name, or names, to try instead
            of the default 'openmpi' then 'mpich'. Defaults to None.

    Returns:
        the component name of the MPI found ('ompi' for openmpi), None when
        no candidate is available, or the _configure_mpi_pkg() result when
        $MPI_PKG is set

    """
    if env.subst("$MPI_PKG") != "":
        return _configure_mpi_pkg(env, libs)

    if required is None:
        candidates = ['openmpi', 'mpich']
    elif isinstance(required, str):
        candidates = [required]
    else:
        candidates = required

    for name in candidates:
        load_mpi(name)
        # The openmpi component is registered under the name "ompi"
        component = "ompi" if name == "openmpi" else name
        if not prereqs.check_component(component):
            print("No %s installed and/or loaded" % name)
            continue
        prereqs.require(env, component)
        print("%s is installed" % name)
        libs.append('mpi')
        return component

    print("No OMPI installed")
    return None
Ejemplo n.º 3
0
    def start_servers(self, server_groups=None):
        """Start the servers for each requested server group.

        A ServerManager is created, configured and started for every entry
        in server_groups; each manager is appended to self.server_managers.

        Args:
            server_groups (dict, optional): server group names mapped to the
                hosts on which to start that group's servers. Defaults to
                None, which starts self.server_group on
                self.hostlist_servers.
        """
        if server_groups is None:
            server_groups = {self.server_group: self.hostlist_servers}

        if not isinstance(server_groups, dict):
            # Only a dictionary of groups is handled; anything else is a no-op
            return

        for group, hosts in server_groups.items():
            self.log.info(
                "Starting servers: group=%s, hosts=%s", group, hosts)

            manager = ServerManager(
                self.bin, os.path.join(self.ompi_prefix, "bin"))
            self.server_managers.append(manager)
            manager.get_params(self)
            manager.runner.job.yaml_params.name = group
            manager.hosts = (hosts, self.workdir, self.hostfile_servers_slots)

            if self.prefix != "/usr":
                # Add PATH to the runner's exported variables when the
                # install prefix is not /usr
                export = manager.runner.export
                if export.value is None:
                    export.value = []
                export.value.extend(["PATH"])

            load_mpi("orterun")
            yamlfile = os.path.join(self.tmp, "daos_avocado_test.yaml")

            try:
                manager.start(yamlfile)
            except ServerFailed as error:
                self.multi_log("  {}".format(error))
                self.fail("Error starting server: {}".format(error))
Ejemplo n.º 4
0
    def __init__(self, job, subprocess=False):
        """Initialize an Orterun object and its command line parameters.

        Loads openmpi, locates the directory containing orterun and then
        defines one FormattedParameter per supported orterun option.

        Args:
            job (ExecutableCommand): command object to manage.
            subprocess (bool, optional): whether the command is run as a
                subprocess. Defaults to False.
        """
        load_mpi("openmpi")
        orterun_dir = os.path.dirname(find_executable("orterun"))
        super(Orterun, self).__init__(
            "/run/orterun/*", "orterun", job, orterun_dir, subprocess)

        # Default mca values to avoid queue pair errors
        mca_default = {
            "btl_openib_warn_default_gid_prefix": "0",
            "btl": "tcp,self",
            "oob": "tcp",
            "pml": "ob1",
        }

        # The parameters are defined in a fixed order; keep it unchanged
        self.hostfile = FormattedParameter("--hostfile {}", None)
        self.processes = FormattedParameter("--np {}", 1)
        self.display_map = FormattedParameter("--display-map", False)
        self.map_by = FormattedParameter("--map-by {}", "node")
        self.export = FormattedParameter("-x {}", None)
        self.enable_recovery = FormattedParameter("--enable-recovery", True)
        self.report_uri = FormattedParameter("--report-uri {}", None)
        self.allow_run_as_root = FormattedParameter(
            "--allow-run-as-root", None)
        self.mca = FormattedParameter("--mca {}", mca_default)
        self.pprnode = FormattedParameter("--map-by ppr:{}:node", None)
        self.tag_output = FormattedParameter("--tag-output", True)
        self.ompi_server = FormattedParameter("--ompi-server {}", None)
        self.working_dir = FormattedParameter("-wdir {}", None)
Ejemplo n.º 5
0
    def __init__(self, job, subprocess=False, mpitype="openmpi"):
        """Initialize a Mpirun object and its command line parameters.

        Args:
            job (ExecutableCommand): command object to manage.
            subprocess (bool, optional): whether the command is run as a
                subprocess. Defaults to False.
            mpitype (str, optional): name of the MPI to load before looking
                up mpirun. Defaults to "openmpi".
        """
        load_mpi(mpitype)
        mpirun_dir = os.path.dirname(find_executable("mpirun"))
        super(Mpirun, self).__init__(
            "/run/mpirun", "mpirun", job, mpirun_dir, subprocess)

        if mpitype == "openmpi":
            # Default mca values to avoid queue pair errors w/ OpenMPI
            mca_default = {
                "btl_openib_warn_default_gid_prefix": "0",
                "btl": "tcp,self",
                "oob": "tcp",
                "pml": "ob1",
            }
        else:
            # No default mca values for any other MPI type
            mca_default = None

        self.hostfile = FormattedParameter("-hostfile {}", None)
        self.processes = FormattedParameter("-np {}", 1)
        self.ppn = FormattedParameter("-ppn {}", None)
        self.envlist = FormattedParameter("-envlist {}", None)
        self.mca = FormattedParameter("--mca {}", mca_default)
        self.working_dir = FormattedParameter("-wdir {}", None)

        # Remember which MPI was requested
        self.mpitype = mpitype
Ejemplo n.º 6
0
    def mpich_installed(self, hostlist):
        """Check if mpich is installed.

        Runs a shell snippet over ssh on the first host which loads the first
        available mpich module and then looks up mpichversion. On success
        self.mpichinstall is set to the reported path with its trailing
        len('bin/mpichversion') characters removed.

        Args:
            hostlist (list): list of hosts

        Returns:
            bool: whether mpich is installed on the first host in the list

        """
        load_mpi('mpich')

        # checking mpich install
        remote_cmd = (
            "set -e; "
            "export MODULEPATH=/usr/share/modules:/etc/modulefiles; "
            "for mod in mpi/mpich-x86_64 gnu-mpich; do "
            "if module is-avail $mod >/dev/null 2>&1; then "
            "module load $mod >/dev/null 2>&1; "
            "break; "
            "fi; "
            "done; "
            "command -v mpichversion"
        )
        cmd = '/usr/bin/ssh {} {}'.format(hostlist[0], remote_cmd)
        try:
            result = run_command(cmd)
        except DaosTestError as excep:
            print("Mpich not installed \n {}".format(excep))
            return False

        # Strip the executable's relative path to leave the install prefix
        self.mpichinstall = result.stdout.rstrip()[:-len('bin/mpichversion')]
        return True
Ejemplo n.º 7
0
    def write_file(self, orterun, processes, hostfile, size, timeout=60):
        """Write a file to the pool by running write_some_data.py via orterun.

        Args:
            orterun (str): full path to the orterun command
            processes (int): number of processes to launch
            hostfile (str): value passed to the orterun --hostfile option
            size (int): size of the file to create in bytes
            timeout (int, optional): number of seconds before timing out the
                command. Defaults to 60 seconds.

        Returns:
            process.CmdResult: command execution result

        """
        self.log.info("Writing %s bytes to pool %s", size, self.uuid)

        # Environment handed to the launched command
        env = {
            "DAOS_POOL": self.uuid,
            "DAOS_SVCL": "1",
            "PYTHONPATH": os.getenv("PYTHONPATH", ""),
        }
        load_mpi("openmpi")

        # The helper script is located next to this file
        script = os.path.join(
            os.path.dirname(os.path.abspath(__file__)), "write_some_data.py")
        command = "{} --np {} --hostfile {} {} {} testfile".format(
            orterun, processes, hostfile, script, size)
        return run_command(command, timeout, True, env=env)
Ejemplo n.º 8
0
    def run(self):
        """Load the MPI selected for this object and run the mpirun command.

        Returns:
            the value returned by the parent class run() method

        Raises:
            CommandFailure: if there is an error running the command

        """
        load_mpi(self.mpitype)
        result = super(Mpirun, self).run()
        return result
Ejemplo n.º 9
0
    def run(self):
        """Load openmpi and run the orterun command.

        Returns:
            the value returned by the parent class run() method

        Raises:
            CommandFailure: if there is an error running the command

        """
        load_mpi("openmpi")
        result = super(Orterun, self).run()
        return result
Ejemplo n.º 10
0
    def setUp(self):
        """Set up run before each test.

        Locates orterun, reads the build paths from .build_vars.json, selects
        the shared temporary directory, optionally writes the fault injection
        file and finally creates the DAOS context and log objects.
        """
        super(TestWithoutServers, self).setUp()

        load_mpi('openmpi')

        self.orterun = find_executable('orterun')
        if self.orterun is None:
            self.fail("Could not find orterun")

        # hardware tests segfault in MPI_Init without this option
        self.client_mca = " ".join([
            "--mca btl_openib_warn_default_gid_prefix 0",
            "--mca pml ob1",
            "--mca btl tcp,self",
            "--mca oob tcp",
        ])
        # Two directory levels above the orterun executable
        self.ompi_prefix = os.path.dirname(os.path.dirname(self.orterun))

        # get paths from the build_vars generated by build
        with open('../../.build_vars.json') as build_vars:
            build_paths = json.load(build_vars)
        install_prefix = build_paths['PREFIX']
        self.basepath = os.path.normpath(
            os.path.join(install_prefix, '..') + os.path.sep)
        self.prefix = install_prefix
        # Fall back to /usr when the build did not record an OFI prefix
        self.ofi_prefix = build_paths.get('OFI_PREFIX', "/usr")
        self.bin = os.path.join(self.prefix, 'bin')
        self.daos_test = os.path.join(self.bin, 'daos_test')

        # set default shared dir for daos tests in case DAOS_TEST_SHARED_DIR
        # is not set, for RPM env and non-RPM env.
        if self.prefix == "/usr":
            self.tmp = os.getenv(
                'DAOS_TEST_SHARED_DIR', os.path.expanduser('~/daos_test'))
        else:
            self.tmp = os.path.join(self.prefix, 'tmp')
        if not os.path.exists(self.tmp):
            os.makedirs(self.tmp)

        # setup fault injection, this MUST be before API setup
        fault_list = self.params.get("fault_list", '/run/faults/*')
        if fault_list:
            # not using workdir because the huge path was messing up
            # orterun or something, could re-evaluate this later
            self.fault_file = fault_config_utils.write_fault_file(
                self.tmp, fault_list, None)
            os.environ["D_FI_CONFIG"] = self.fault_file

        self.context = DaosContext(self.prefix + '/lib64/')
        self.d_log = DaosLog(self.context)
        self.test_log.daos_log = self.d_log
Ejemplo n.º 11
0
    def run_test(self, test_repo, test_name):
        """Run one MPI-IO test and fail if its output reports an error.

        Args:
            test_repo (str): location of the test repository; a value that
                does not start with '/' is taken relative to
                self.mpio.mpichinstall
            test_name (str): name of the test to be run
        """
        # Required to run daos command
        load_mpi("openmpi")

        # Create the pool and the container used by the test
        self.add_pool(connect=False)
        self.add_container(self.pool)

        # initialize MpioUtils
        self.mpio = MpioUtils()
        if not self.mpio.mpich_installed(self.hostlist_clients):
            self.fail("Exiting Test: Mpich not installed")

        # fix up a relative test_repo specification
        if test_repo[0] != '/':
            test_repo = os.path.join(self.mpio.mpichinstall, test_repo)

        # initialize test specific variables
        client_processes = self.params.get("np", '/run/client_processes/')

        try:
            # running tests
            result = self.mpio.run_mpiio_tests(
                self.hostfile_clients, self.pool.uuid, test_repo, test_name,
                client_processes, self.container.uuid)
        except MpioFailed as excep:
            self.fail("<{0} Test Failed> \n{1}".format(test_name, excep))

        # Check output for errors
        error_pattern = r"(non-zero exit code|MPI_Abort|MPI_ABORT|ERROR)"
        for output in (result.stdout_text, result.stderr_text):
            found = re.findall(error_pattern, output)
            if not found:
                continue
            self.log.info(
                "The following error messages have been detected in the %s "
                "output:", test_name)
            for item in found:
                self.log.info("  %s", item)
            self.fail(
                "Error messages detected in {} output".format(test_name))
Ejemplo n.º 12
0
    def mpich_installed(self, hostlist):
        """Check if mpich is installed on the first host in hostlist.

        On success self.mpichinstall is set to the output of
        "command -v mpichversion" with its trailing len('bin/mpichversion')
        characters removed.

        Args:
            hostlist (list): list of hosts; only the first entry is queried

        Returns:
            bool: True if the remote lookup succeeded, False otherwise
        """
        load_mpi('mpich')

        # checking mpich install
        lookup = ["ssh", hostlist[0], "command -v mpichversion"]
        try:
            location = subprocess.check_output(lookup)
        except subprocess.CalledProcessError as excep:
            print("Mpich not installed \n {}".format(excep))
            return False

        self.mpichinstall = location.rstrip()[:-len('bin/mpichversion')]
        return True
Ejemplo n.º 13
0
    def get_environment(self, manager, log_file=None):
        """Get the environment variables to export for the daos_racer command.

        Args:
            manager (DaosServerManager): the job manager used to start
                daos_server from which the server config values can be obtained
                to set the required environment variables.
            log_file (str, optional): passed through to the parent class
                get_environment() method. Defaults to None.

        Raises:
            MPILoadError: if openmpi cannot be loaded

        Returns:
            EnvironmentVariables: a dictionary of environment variable names and
                values to export prior to running daos_racer

        """
        env = super().get_environment(manager, log_file)

        # Fixed settings added on top of the parent class environment
        fixed_settings = (
            ("OMPI_MCA_btl_openib_warn_default_gid_prefix", "0"),
            ("OMPI_MCA_btl", "tcp,self"),
            ("OMPI_MCA_oob", "tcp"),
            ("OMPI_MCA_pml", "ob1"),
            ("D_LOG_MASK", "ERR"),
        )
        for name, value in fixed_settings:
            env[name] = value

        if not load_mpi("openmpi"):
            raise MPILoadError("openmpi")

        # Read LD_LIBRARY_PATH only after the openmpi load attempt
        env["LD_LIBRARY_PATH"] = os.environ["LD_LIBRARY_PATH"]

        return env
Ejemplo n.º 14
0
    def run_subtest(self):
        """Run daos_test with a subtest argument.

        The daos_test command is launched through orterun with the test
        parameters read from the yaml file. A non-zero exit status produces a
        results xml file and fails the test.
        """
        subtest = self.params.get("daos_test", self.TEST_PATH)
        num_clients = self.params.get(
            "num_clients", '/run/daos_tests/num_clients/*')
        num_replicas = self.params.get(
            "num_replicas", '/run/daos_tests/num_replicas/*')
        scm_size = self.params.get("scm_size", '/run/pool/*')
        nvme_size = self.params.get("nvme_size", '/run/pool/*')
        args = self.params.get("args", self.TEST_PATH, "")
        dmg = self.get_dmg_command()
        dmg_config_file = dmg.yaml.filename

        log_file_export = "=".join(
            ["D_LOG_FILE", get_log_file(self.client_log)])
        cmd_parts = [
            # orterun and its options
            self.orterun,
            self.client_mca,
            "-n", str(num_clients),
            "-x", log_file_export,
            "-x", "D_LOG_MASK=DEBUG",
            "-x", "DD_MASK=mgmt,io,md,epc,rebuild",
            # daos_test and its options
            self.daos_test,
            "-s", str(num_replicas),
            "-n", dmg_config_file,
            "".join(["-", subtest]),
            str(args),
        ]
        cmd = " ".join(cmd_parts)

        env = {
            'CMOCKA_XML_FILE': os.path.join(
                self.outputdir, "%g_results.xml"),
            'CMOCKA_MESSAGE_OUTPUT': "xml",
            'POOL_SCM_SIZE': "{}".format(scm_size),
            # A missing or empty nvme size is reported as 0
            'POOL_NVME_SIZE': "{}".format(nvme_size if nvme_size else 0),
        }

        load_mpi("openmpi")
        try:
            process.run(cmd, env=env)
        except process.CmdError as result:
            if result.result.exit_status != 0:
                # fake a JUnit failure output
                self.create_results_xml(self.subtest_name, result)
                self.fail(
                    "{0} failed with return code={1}.\n".format(
                        cmd, result.result.exit_status))
Ejemplo n.º 15
0
    def run_subtest(self):
        """Run daos_test with a subtest argument."""
        subtest = self.params.get("daos_test", '/run/daos_tests/Tests/*')
        num_clients = self.params.get("num_clients",
                                      '/run/daos_tests/num_clients/*')
        num_replicas = self.params.get("num_replicas",
                                       '/run/daos_tests/num_replicas/*')
        scm_size = self.params.get("scm_size", '/run/pool/*')
        args = self.params.get("args", '/run/daos_tests/Tests/*', "")

        cmd = "{} {} -n {} -x D_LOG_FILE={} \
            -x D_LOG_MASK=DEBUG -x DD_MASK=mgmt,io,md,epc,rebuild \
            {} -s {} -n {} {}".format(self.orterun,
                                      self.client_mca, num_clients,
                                      get_log_file(self.client_log),
                                      self.daos_test, num_replicas, subtest,
                                      args)

        env = {}
        env['CMOCKA_XML_FILE'] = "%g_results.xml"
        env['CMOCKA_MESSAGE_OUTPUT'] = "xml"
        env['POOL_SCM_SIZE'] = "{}".format(scm_size)

        load_mpi("openmpi")
        try:
            process.run(cmd, env=env)
        except process.CmdError as result:
            if result.result.exit_status is not 0:
                # fake a JUnit failure output
                with open(self.subtest_name + "_results.xml",
                          "w") as results_xml:
                    results_xml.write('''<?xml version="1.0" encoding="UTF-8"?>
<testsuite name="{0}" errors="1" failures="0" skipped="0" tests="1" time="0.0">
  <testcase name="ALL" time="0.0" >
    <error message="Test failed to start up"/>
    <system-out>
<![CDATA[{1}]]>
    </system-out>
    <system-err>
<![CDATA[{2}]]>
    </system-err>
  </testcase>
</testsuite>'''.format(self.subtest_name, result.result.stdout,
                       result.result.stderr))
                self.fail("{0} failed with return code={1}.\n".format(
                    cmd, result.result.exit_status))
Ejemplo n.º 16
0
    def __init__(self, job, subprocess=False, mpitype="openmpi"):
        """Initialize a Mpirun object and its command line parameters.

        Args:
            job (ExecutableCommand): command object to manage.
            subprocess (bool, optional): whether the command is run as a
                subprocess. Defaults to False.
            mpitype (str, optional): name of the MPI to load before looking
                up mpirun. Defaults to "openmpi".
        """
        load_mpi(mpitype)
        mpirun_dir = os.path.dirname(find_executable("mpirun"))
        super(Mpirun, self).__init__("/run/mpirun", "mpirun", job,
                                     mpirun_dir, subprocess)

        # mpirun command line options
        self.hostfile = FormattedParameter("-hostfile {}", None)
        self.processes = FormattedParameter("-np {}", 1)
        self.ppn = FormattedParameter("-ppn {}", None)
        self.envlist = FormattedParameter("-envlist {}", None)

        # Remember which MPI was requested
        self.mpitype = mpitype
Ejemplo n.º 17
0
    def run_test(self, test_repo, test_name):
        """Run one MPI-IO test and fail if the stdout log reports an error.

        After the test has run, the "stdout" file in self.logdir is scanned
        line by line and the test is failed on the first line containing one
        of the known error strings.

        Args:
            test_repo (str): location of test repository
            test_name (str): name of the test to be run
        """
        # Required to run daos command
        load_mpi("openmpi")

        # create container
        self._create_cont()

        # initialize MpioUtils
        self.mpio = MpioUtils()
        if not self.mpio.mpich_installed(self.hostlist_clients):
            self.fail("Exiting Test: Mpich not installed")

        # initialize test specific variables
        client_processes = self.params.get("np", '/run/client_processes/')

        try:
            # running tests
            self.mpio.run_mpiio_tests(self.hostfile_clients, self.pool.uuid,
                                      self.pool.svc_ranks, test_repo,
                                      test_name, client_processes,
                                      self.cont_uuid)
        except MpioFailed as excep:
            self.fail("<{0} Test Failed> \n{1}".format(test_name, excep))

        # Parsing output to look for failures
        # stderr directed to stdout
        stdout = os.path.join(self.logdir, "stdout")
        error_message = [
            "non-zero exit code", "MPI_Abort", "MPI_ABORT", "ERROR"
        ]

        # The context manager closes the log even when self.fail() raises
        with open(stdout, "r") as searchfile:
            for line in searchfile:
                for error in error_message:
                    if error in line:
                        self.fail(
                            "Test Failed with error_message: {}".format(
                                error))
Ejemplo n.º 18
0
    def run(self):
        """Load the MPI selected for this object and run the mpirun command.

        Returns:
            the value returned by the parent class run() method

        Raises:
            CommandFailure: if the MPI cannot be loaded or there is an error
                running the command

        """
        loaded = load_mpi(self.mpitype)
        if not loaded:
            raise CommandFailure("Failed to load {}".format(self.mpitype))

        result = super(Mpirun, self).run()
        return result
Ejemplo n.º 19
0
    def run(self):
        """Load openmpi and run the orterun command.

        Returns:
            the value returned by the parent class run() method

        Raises:
            CommandFailure: if openmpi cannot be loaded or there is an error
                running the command

        """
        loaded = load_mpi("openmpi")
        if not loaded:
            raise CommandFailure("Failed to load openmpi")

        result = super().run()
        return result
Ejemplo n.º 20
0
    def run(self):
        """Load the MPI selected for this object and run the mpirun command.

        Returns:
            the value returned by the parent class run() method

        Raises:
            MPILoadError: if the MPI named by self.mpi_type cannot be loaded
            CommandFailure: if there is an error running the command

        """
        loaded = load_mpi(self.mpi_type)
        if not loaded:
            raise MPILoadError(self.mpi_type)

        result = super().run()
        return result
Ejemplo n.º 21
0
    def mpich_installed(self, hostlist):
        """Check if mpich is installed.

        On success self.mpichinstall is set to the output of
        "command -v mpichversion" with its trailing len('bin/mpichversion')
        characters removed.

        Args:
            hostlist (list): list of hosts

        Returns:
            bool: whether mpich is installed on the first host in the list

        """
        load_mpi('mpich')

        # checking mpich install
        lookup = "/usr/bin/ssh {} command -v mpichversion".format(hostlist[0])
        try:
            result = run_command(lookup)
        except DaosTestError as excep:
            print("Mpich not installed \n {}".format(excep))
            return False

        self.mpichinstall = result.stdout.rstrip()[:-len('bin/mpichversion')]
        return True
Ejemplo n.º 22
0
    def run_simul(self, include=None, exclude=None, raise_exception=True):
        """Run simul against a dfuse mount backed by a new pool and container.

        Exactly one of include and exclude must be set; the test is failed
        when both or neither are given.

        Args:
            include (str, optional): comma-separated list of tests to include.
                Defaults to None.
            exclude (str, optional): comma-separated list of tests to exclude.
                Defaults to None.
            raise_exception (bool, optional): passed to run_command().
                Defaults to True.

        Raises:
            MPILoadError: if the MPI named by the mpi_type test parameter
                cannot be loaded

        Returns:
            the value returned by run_command() for the simul command

        """
        mpi_type = self.params.get("mpi_type", "/run/*", "")
        simul_path = self.params.get("simul_path", "/run/*", "")

        # Create a pool
        self.log.info("Create a pool")
        self.add_pool()

        # Create a container
        self.log.info("Create container")
        self.add_container(self.pool)

        # Setup dfuse
        dfuse_hosts = self.agent_managers[0].hosts
        dfuse_mount_dir = self.params.get("mount_dir", '/run/dfuse/*')
        self.start_dfuse(dfuse_hosts, self.pool, self.container)
        self.dfuse.check_running()

        # The use of MPI here is to run in parallel all simul tests on a single host.
        if not load_mpi(mpi_type):
            raise MPILoadError(mpi_type)

        # Run simul
        simul_bin = os.path.join(simul_path, "simul")
        if include and not exclude:
            selection = ("-i", include)
        elif exclude and not include:
            selection = ("-e", exclude)
        else:
            self.fail(
                "##Both include and exclude tests are selected both or empty.")
        cmd = "{0} -vv -d {1} {2} {3}".format(
            simul_bin, dfuse_mount_dir, selection[0], selection[1])

        self.log.info("Running simul on %s", mpi_type)
        try:
            result = run_command(
                cmd, output_check="combined", raise_exception=raise_exception)
        finally:
            # Always unmount dfuse, even when the command raised
            self.stop_dfuse()

        return result
Ejemplo n.º 23
0
def configure_mpi(env):
    """Find an installed MPI and configure the build environment for it.

    Nothing is checked when the 'help' option is set. When $MPI_PKG is set
    the work is delegated to _configure_mpi_pkg(). Otherwise openmpi and then
    mpich are loaded in turn until _find_mpicc() succeeds.

    Args:
        env: build environment; its 'CXX' entry is reset to None

    Returns:
        True when the 'help' option is set or an MPI compiler was found,
        False when no MPI is available, or the _configure_mpi_pkg() result
        when $MPI_PKG is set

    """
    if GetOption('help'):
        return True

    env['CXX'] = None

    if env.subst("$MPI_PKG") != "":
        return _configure_mpi_pkg(env)

    for name in ('openmpi', 'mpich'):
        if load_mpi(name):
            if _find_mpicc(env):
                print("%s is installed" % name)
                return True
            print("No %s installed and/or loaded" % name)

    print("No MPI installed")
    return False
Ejemplo n.º 24
0
def configure_mpi(env, libs, required=None):
    """Find an installed MPI and configure the build environment for it.

    When $MPI_PKG is set the work is delegated to _configure_mpi_pkg().
    Otherwise each candidate MPI is loaded in turn until _find_mpicc()
    succeeds.

    Args:
        env: build environment
        libs: library list, only passed on to _configure_mpi_pkg()
        required (str or list, optional): MPI name, or names, to try instead
            of the default 'openmpi' then 'mpich'. Defaults to None.

    Returns:
        the component name of the MPI found ('ompi' for openmpi), None when
        no candidate is available, or the _configure_mpi_pkg() result when
        $MPI_PKG is set

    """
    if env.subst("$MPI_PKG") != "":
        return _configure_mpi_pkg(env, libs)

    if required is None:
        candidates = ['openmpi', 'mpich']
    elif isinstance(required, str):
        candidates = [required]
    else:
        candidates = required

    for name in candidates:
        if load_mpi(name):
            if _find_mpicc(env):
                print("%s is installed" % name)
                # The openmpi component is reported under the name "ompi"
                return "ompi" if name == "openmpi" else name
            print("No %s installed and/or loaded" % name)

    print("No MPI installed")
    return None
Ejemplo n.º 25
0
        elif "No such process" in e:
            print("The daos_server process is no longer available"
                  " and could not be killed.")
        else:
            print("Unable to shut down DAOS server: {}".format(e))

if __name__ == "__main__":
    """
    Start a DAOS server and then run the four stages of the client.
    """
    print("Running rdb tests")
    rc = 0
    binfo = BuildInfo(os.path.join(build_root, ".build_vars.json"));
    debug_cmds = "-x D_LOG_MASK=DEBUG,RPC=ERR,MEM=ERR " + \
                 "-x DD_SUBSYS=all -x DD_MASK=all"
    load_mpi('openmpi')
    orterun = find_executable('orterun')
    if orterun is None:
        raise ServerFailedToStart("No orterun installed")

    try:
        # Server operations
        p = start_server(binfo, orterun)

        counter = 0
        daos_server = daos_server_pid()
        while daos_server is None:
            if counter >= 120:
                raise ServerTimedOut("No DAOS server process detected before "\
                                     "timeout")
            counter += 1
Ejemplo n.º 26
0
            print("The daos_server process is no longer available"
                  " and could not be killed.")
        else:
            print("Unable to shut down DAOS server: {}".format(e))


if __name__ == "__main__":
    """
    Start a DAOS server and then run the four stages of the client.
    """
    print("Running rdb tests")
    rc = 0
    binfo = BuildInfo(os.path.join(build_root, ".build_vars.json"))
    debug_cmds = "-x D_LOG_MASK=DEBUG,RPC=ERR,MEM=ERR " + \
                 "-x DD_SUBSYS=all -x DD_MASK=all"
    if not load_mpi('openmpi'):
        raise ServerFailedToStart("No orterun installed")
    orterun = find_executable('orterun')
    if orterun is None:
        raise ServerFailedToStart("No orterun installed")

    try:
        # Server operations
        p = start_server(binfo, orterun)

        counter = 0
        daos_server = daos_server_pid()
        while daos_server is None:
            if counter >= 120:
                raise ServerTimedOut("No DAOS server process detected before "\
                                     "timeout")
Ejemplo n.º 27
0
    def run_subtest(self):
        """Run daos_test with a subtest argument.

        The daos_test command is launched on the client hosts through
        orterun. Before it runs, the expected states of the ranks listed in
        the stopped_ranks test parameter are updated. A non-zero exit status
        produces a results xml file and fails the test.
        """
        subtest = self.get_test_param("daos_test")
        num_clients = self.get_test_param("num_clients")
        if num_clients is None:
            num_clients = self.params.get("num_clients", '/run/daos_tests/*')
        scm_size = self.params.get("scm_size", '/run/pool/*')
        nvme_size = self.params.get("nvme_size", '/run/pool/*')
        args = self.get_test_param("args", "")
        stopped_ranks = self.get_test_param("stopped_ranks", [])
        dmg = self.get_dmg_command()
        dmg_config_file = dmg.yaml.filename
        if self.hostlist_clients:
            # Copy the dmg certificates and configuration to the clients
            dmg.copy_certificates(
                get_log_file("daosCA/certs"), self.hostlist_clients)
            dmg.copy_configuration(self.hostlist_clients)
        self.client_mca += " --mca btl_tcp_if_include eth0"

        log_file_export = "=".join(
            ["D_LOG_FILE", get_log_file(self.client_log)])
        cmd_parts = [
            # orterun and its options
            self.orterun,
            self.client_mca,
            "-n", str(num_clients),
            "--hostfile", self.hostfile_clients,
            "-x", log_file_export,
            "--map-by node",
            "-x", "D_LOG_MASK=DEBUG",
            "-x", "DD_MASK=mgmt,io,md,epc,rebuild",
            # daos_test and its options
            self.daos_test,
            "-n", dmg_config_file,
            "".join(["-", subtest]),
            str(args),
        ]
        cmd = " ".join(cmd_parts)

        env = {
            'CMOCKA_XML_FILE': os.path.join(
                self.outputdir, "%g_cmocka_results.xml"),
            'CMOCKA_MESSAGE_OUTPUT': "xml",
            'POOL_SCM_SIZE': "{}".format(scm_size),
            # A missing or empty nvme size is reported as 0
            'POOL_NVME_SIZE': "{}".format(nvme_size if nvme_size else 0),
        }

        if not load_mpi("openmpi"):
            self.fail("Failed to load openmpi")

        # Update the expected status for each ranks that will be stopped by this
        # test to avoid a false failure during tearDown().
        if "random" in stopped_ranks:
            # Set each expected rank state to be either stopped or running
            updates = [(None, ("Joined", "Stopped", "Evicted"))]
        else:
            # Set the specific expected rank state to stopped
            updates = [
                (rank, ("Stopped", "Evicted")) for rank in stopped_ranks]
        for rank, states in updates:
            for manager in self.server_managers:
                manager.update_expected_states(rank, list(states))

        try:
            process.run(cmd, env=env)
        except process.CmdError as result:
            if result.result.exit_status != 0:
                # fake a JUnit failure output
                self.create_results_xml(self.subtest_name, result)
                self.fail("{0} failed with return code={1}.\n".format(
                    cmd, result.result.exit_status))
Ejemplo n.º 28
0
def run_server(test,
               hostfile,
               setname,
               uri_path=None,
               env_dict=None,
               clean=True):
    # pylint: disable=unused-argument
    """Launch DAOS servers in accordance with the supplied hostfile.

    The servers are started through orterun; the resulting process object is
    stored in SESSIONS[setname].  The function then polls the process stdout
    until one "DAOS I/O server.*started" message has been seen per host
    listed in the hostfile, the output stream ends, or 600 seconds elapse.

    Args:
        test (Test): avocado Test object
        hostfile (str): hostfile defining on which hosts to start servers
        setname (str): session name
        uri_path (str, optional): path to uri file. Defaults to None.
        env_dict (dict, optional): dictionary on env variable names and values.
            Each entry is set in os.environ of the calling process and also
            passed to orterun with "-x". Defaults to None.
        clean (bool, optional): clean the mount point. Defaults to True.

    Raises:
        ServerFailed: if there is an error starting the servers

    """
    global SESSIONS  # pylint: disable=global-variable-not-assigned
    try:
        # The host name is the first space-separated field of each line
        servers = ([
            line.split(' ')[0] for line in genio.read_all_lines(hostfile)
        ])
        server_count = len(servers)

        # Pile of build time variables
        with open("../../.build_vars.json") as json_vars:
            build_vars = json.load(json_vars)

        # Create the DAOS server configuration yaml file to pass
        # with daos_server -o <FILE_NAME>
        print("Creating the server yaml file in {}".format(test.tmp))
        server_yaml = os.path.join(test.tmp, AVOCADO_FILE)
        server_config = DaosServerConfig()
        server_config.get_params(test)
        # The first host in the hostfile is used as the access point
        access_points = ":".join((servers[0], str(server_config.port)))
        server_config.access_points.value = access_points.split()
        server_config.update_log_files(getattr(test, "control_log"),
                                       getattr(test, "helper_log"),
                                       getattr(test, "server_log"))
        server_config.create_yaml(server_yaml)

        # first make sure there are no existing servers running
        print("Removing any existing server processes")
        kill_server(servers)

        # clean the tmpfs on the servers
        if clean:
            print("Cleaning the server tmpfs directories")
            result = pcmd(servers,
                          "find /mnt/daos -mindepth 1 -maxdepth 1 -print0 | "
                          "xargs -0r rm -rf",
                          verbose=False)
            # Success is a result holding only the 0 key; anything else is
            # reported as a failure
            if len(result) > 1 or 0 not in result:
                raise ServerFailed(
                    "Error cleaning tmpfs on servers: {}".format(", ".join(
                        [str(result[key]) for key in result if key != 0])))
        load_mpi('openmpi')
        orterun_bin = find_executable('orterun')
        if orterun_bin is None:
            raise ServerFailed("Can't find orterun")

        server_cmd = [orterun_bin, "--np", str(server_count)]
        server_cmd.extend(["--mca", "btl_openib_warn_default_gid_prefix", "0"])
        server_cmd.extend(["--mca", "btl", "tcp,self"])
        server_cmd.extend(["--mca", "oob", "tcp"])
        server_cmd.extend(["--mca", "pml", "ob1"])
        server_cmd.extend(["--hostfile", hostfile])
        server_cmd.extend(["--enable-recovery", "--tag-output"])

        # Add any user supplied environment
        if env_dict is not None:
            for key, value in env_dict.items():
                os.environ[key] = value
                server_cmd.extend(["-x", "{}={}".format(key, value)])

        # the remote orte needs to know where to find daos, in the
        # case that it's not in the system prefix
        # but it should already be in our PATH, so just pass our
        # PATH along to the remote
        if build_vars["PREFIX"] != "/usr":
            server_cmd.extend(["-x", "PATH"])

        # Run server in insecure mode until Certificate tests are in place
        server_cmd.extend([
            os.path.join(build_vars["PREFIX"], "bin",
                         "daos_server"), "--debug", "--config", server_yaml,
            "start", "-i", "--recreate-superblocks"
        ])

        print("Start CMD>>>>{0}".format(' '.join(server_cmd)))

        resource.setrlimit(resource.RLIMIT_CORE,
                           (resource.RLIM_INFINITY, resource.RLIM_INFINITY))

        SESSIONS[setname] = subprocess.Popen(server_cmd,
                                             stdout=subprocess.PIPE,
                                             stderr=subprocess.PIPE)

        # Make stdout non-blocking so the start-up messages can be polled
        fdesc = SESSIONS[setname].stdout.fileno()
        fstat = fcntl.fcntl(fdesc, fcntl.F_GETFL)
        fcntl.fcntl(fdesc, fcntl.F_SETFL, fstat | os.O_NONBLOCK)
        timeout = 600
        poll_interval = 0.1
        start_time = time.time()
        matches = 0
        pattern = "DAOS I/O server.*started"
        expected_data = "Starting Servers\n"
        while True:
            output = ""
            try:
                output = SESSIONS[setname].stdout.read()
            except IOError as excpn:
                if excpn.errno != errno.EAGAIN:
                    raise ServerFailed("Server didn't start: {}".format(excpn))
                # No data is available yet.  Keep polling only while the
                # timeout has not expired; a short sleep avoids spinning on
                # the pipe.  Once the timeout has expired fall through with
                # the empty output so the failure is reported below instead
                # of looping forever on a silent server.
                if time.time() - start_time <= timeout:
                    time.sleep(poll_interval)
                    continue
            match = re.findall(pattern, output)
            expected_data += output
            matches += len(match)
            if not output or matches == server_count or \
               time.time() - start_time > timeout:
                print("<SERVER>: {}".format(expected_data))
                if matches != server_count:
                    raise ServerFailed("Server didn't start!")
                break
        print("<SERVER> server started and took {} seconds to start".format(
            time.time() - start_time))

    except Exception as error:
        print("<SERVER> Exception occurred: {0}".format(str(error)))
        traceback.print_exception(error.__class__, error, sys.exc_info()[2])
        # We need to end the session now -- exit the shell
        try:
            SESSIONS[setname].send_signal(signal.SIGINT)
            time.sleep(5)
            # get the stderr
            stderr_output = SESSIONS[setname].stderr.read()
            if SESSIONS[setname].poll() is None:
                SESSIONS[setname].kill()
            retcode = SESSIONS[setname].wait()
            print("<SERVER> server start return code: {}\nstderr:\n{}".format(
                retcode, stderr_output))
        except KeyError:
            # No session was recorded under this name, so there is nothing
            # to terminate
            pass
        raise ServerFailed("Server didn't start!")