Ejemplo n.º 1
0
    def run(self, args):
        """Start every simulated server found in ``args.config``.

        A ClusterSimulator is built from ``args.config`` and ``args.url``,
        the number of servers it holds is logged, all of them are started
        and the simulator is handed back to the caller.
        """
        cluster = ClusterSimulator(args.config, args.url)
        summary = "Running %s servers in %s/" % (len(cluster.servers),
                                                 args.config)
        log.info(summary)
        cluster.start_all()
        return cluster
Ejemplo n.º 2
0
    def register(self, args):
        """Register all simulated servers (and workers) with the manager.

        Credentials come either from ``args.secret`` (used for both servers
        and workers) or from ``args.username``/``args.password``, in which
        case one secret per profile is obtained through
        ``self._acquire_token``. If neither is supplied, an error is written
        to stderr and the process exits with status -1.

        When ``args.create_pdu_entries`` is set and at least one registration
        was reported by the simulator, PDU entries are created as well.

        Returns the ClusterSimulator instance.
        """
        cluster = ClusterSimulator(args.config, args.url)
        n_servers = len(cluster.servers)

        if args.secret:
            # One pre-shared secret covers both kinds of host.
            server_secret = worker_secret = args.secret
        elif args.username and args.password:

            def token_for(profile):
                # Same request for every profile; only the profile differs.
                return self._acquire_token(args.url,
                                           args.username,
                                           args.password,
                                           n_servers,
                                           preferred_profile=profile)

            server_secret = token_for('base_managed_rh7')
            worker_secret = token_for('posix_copytool_worker')
        else:
            sys.stderr.write(
                "Must pass either --secret or --username and --password\n")
            sys.exit(-1)

        log.info("Registering %s servers in %s/" % (n_servers, args.config))
        registered = cluster.register_all(server_secret, worker_secret)

        wants_pdus = args.create_pdu_entries
        if wants_pdus and registered > 0:
            self.create_pdu_entries(cluster, args)

        return cluster
Ejemplo n.º 3
0
    def setup(self, args):
        """Create a simulator configuration from the command-line arguments.

        The counts in ``args`` are converted to integers; when no volume
        count is given it defaults to two volumes per server. The agent
        configuration is reset and its copytool FIFO directory pointed at
        ``/tmp`` before the ClusterSimulator is set up.
        """
        log.info("Setting up simulator configuration for %s servers in %s/" %
                 (args.server_count, args.config))

        n_workers = int(args.worker_count)
        n_servers = int(args.server_count)
        # No explicit volume count: fall back to two volumes per server.
        n_volumes = (int(args.volume_count)
                     if args.volume_count else n_servers * 2)

        reset_agent_config()
        set_agent_config('copytool_fifo_directory', '/tmp')

        cluster = ClusterSimulator(args.config, args.url)
        cluster.setup(n_servers, n_workers, n_volumes, int(args.nid_count),
                      int(args.cluster_size), int(args.psu_count),
                      int(args.su_size))
Ejemplo n.º 4
0
def main():
    """Command-line entry point for the simulated benchmarks.

    Parses the command line, then either starts an in-process
    ClusterSimulator or, when ``--remote_simulator`` is given, connects to
    one over XML-RPC at ``http://localhost:SIMULATOR_PORT``. The benchmark
    selected by the sub-command is run and the process is then terminated
    with ``os._exit`` so that lingering simulator threads cannot keep it
    alive.

    The exit status is 0 when the benchmark returns normally and -1 when it
    raises (the traceback is logged first).
    """
    parser = argparse.ArgumentParser(description="Simulated benchmarks")
    # NOTE(review): --remote_simulator and --debug are declared without
    # action='store_true', so argparse expects each to be followed by a value
    # and any non-empty value (even "False") switches the option on. Left
    # unchanged so that existing invocations keep working.
    parser.add_argument('--remote_simulator',
                        required=False,
                        help="Disable built-in simulator (run it separately)",
                        default=False)
    parser.add_argument('--debug',
                        required=False,
                        help="Enable DEBUG-level logs",
                        default=False)
    parser.add_argument('--url',
                        required=False,
                        help="Manager URL",
                        default="https://localhost:8000")
    parser.add_argument('--username',
                        required=False,
                        help="REST API username",
                        default='admin')
    parser.add_argument('--password',
                        required=False,
                        help="REST API password",
                        default='lustre')
    parser.add_argument('--servers', help="server count", default=8, type=int)
    subparsers = parser.add_subparsers()

    # One sub-command per benchmark; each stores a callable taking
    # (args, simulator) under ``func``.
    reset_parser = subparsers.add_parser("reset")
    reset_parser.set_defaults(
        func=lambda args, simulator: Benchmark(args, simulator).reset())

    log_ingest_parser = subparsers.add_parser("log_ingest_rate")
    log_ingest_parser.set_defaults(func=lambda args, simulator: LogIngestRate(
        args, simulator).run_benchmark())

    server_count_limit_parser = subparsers.add_parser("server_count_limit")
    server_count_limit_parser.set_defaults(
        func=lambda args, simulator: ServerCountLimit(args, simulator
                                                      ).run_benchmark())

    registration_limit_parser = subparsers.add_parser(
        "concurrent_registration_limit")
    registration_limit_parser.set_defaults(
        func=lambda args, simulator: ConcurrentRegistrationLimit(
            args, simulator).run_benchmark())

    filesystem_size_parser = subparsers.add_parser("filesystem_size_limit")
    filesystem_size_parser.set_defaults(
        func=lambda args, simulator: FilesystemSizeLimit(args, simulator
                                                         ).run_benchmark())

    args = parser.parse_args()

    if args.debug:
        log.setLevel(logging.DEBUG)

    if not args.remote_simulator:
        log.info("Starting simulator...")

        # Enable logging by agent code run within simulator
        from chroma_agent.log import daemon_log
        daemon_log.setLevel(logging.DEBUG)
        handler = logging.FileHandler("chroma-agent.log")
        handler.setFormatter(
            logging.Formatter('[%(asctime)s] %(message)s',
                              '%d/%b/%Y:%H:%M:%S'))
        daemon_log.addHandler(handler)
        daemon_log.info("Enabled agent logging within simulator")

        from cluster_sim.simulator import ClusterSimulator
        simulator = ClusterSimulator(folder=None, url=args.url + "/")
        simulator.power.setup(1)
        simulator.start_all()
        simulator.setup(0,
                        0,
                        0,
                        nid_count=1,
                        cluster_size=4,
                        pdu_count=1,
                        su_size=0)
    else:
        simulator = xmlrpclib.ServerProxy("http://localhost:%s" %
                                          SIMULATOR_PORT,
                                          allow_none=True)

    # Assume failure until the benchmark has returned normally.
    exit_status = -1
    try:
        log.info("Starting benchmark...")
        args.func(args, simulator)
        # Logged here rather than after the try block: the hard exit in
        # ``finally`` means nothing placed after it would ever run.
        log.info("Complete.")
        exit_status = 0
    except:
        # Because we do a hard exit at the end here, explicitly log terminating
        # exceptions or they would get lost. The bare except is deliberate:
        # whatever ends the benchmark should reach the log.
        log.error(traceback.format_exc())
        raise
    finally:
        # Do a hard exit to avoid dealing with lingering threads (not the cleanest, but
        # this isn't production code).
        os._exit(exit_status)
Ejemplo n.º 5
0
    def setUp(self):
        """Prepare the test environment before each test.

        When the ``simulator`` config option is set, a fresh agent
        ConfigStore is patched in, the agent log is attached to a file and a
        ClusterSimulator is created and set up from the configured servers.
        Otherwise real remote operations are used.

        Unless ``quick_setup`` is enabled, every server in TEST_SERVERS is
        checked (and restarted if needed), the cluster is reset according to
        the ``reset``/``soft_reset`` options, and managed-mode leftovers are
        cleaned up. Finally the supervisor-controlled processes are awaited
        and their start times recorded for comparison in tearDown.

        Raises ImportError when the agent or simulator packages are missing
        and RuntimeError when the simulator state folder already exists.
        """
        super(ApiTestCaseWithTestReset, self).setUp()

        if config.get('simulator', False):
            # When we're running with the simulator, parts of the simulated
            # Copytools use agent code, and the agent code expects to find
            # a populated agent-side configuration. The safe way to handle
            # this requirement is to use mock to patch in a fresh
            # ConfigStore instance for each test run.
            try:
                from chroma_agent.config_store import ConfigStore
            except ImportError:
                raise ImportError(
                    "Cannot import agent, do you need to do a 'setup.py develop' of it?"
                )

            import mock  # Mock is only available when running the simulator, hence local inclusion
            self.mock_config = ConfigStore(tempfile.mkdtemp())
            mock.patch('chroma_agent.config', self.mock_config).start()
            from chroma_agent.action_plugins.settings_management import reset_agent_config, set_agent_config
            reset_agent_config()
            # Allow the worker to create a fifo in /tmp rather than /var/spool
            set_agent_config('copytool_fifo_directory',
                             self.COPYTOOL_TESTING_FIFO_ROOT)

            try:
                from cluster_sim.simulator import ClusterSimulator
            except ImportError:
                raise ImportError(
                    "Cannot import simulator, do you need to do a 'setup.py develop' of it?"
                )

            # The simulator's state directory will be left behind when a test fails,
            # so make sure it has a unique-per-run name
            timestamp = datetime.datetime.now().strftime("%Y-%m-%d_%H%M%S")
            state_path = 'simulator_state_%s.%s_%s' % (
                self.__class__.__name__, self._testMethodName, timestamp)
            if os.path.exists(state_path):
                raise RuntimeError(
                    "Simulator state folder already exists at '%s'!" %
                    state_path)

            # Hook up the agent log to a file
            from chroma_agent.agent_daemon import daemon_log
            handler = logging.FileHandler(
                os.path.join(config.get('log_dir', '/var/log/'),
                             'chroma_test_agent.log'))
            handler.setFormatter(
                logging.Formatter('[%(asctime)s] %(message)s',
                                  '%d/%b/%Y:%H:%M:%S'))
            daemon_log.addHandler(handler)
            # Detach and close the handler when the test is over. Without
            # this every test stacks another handler on the shared logger,
            # so each message is written once per test run so far and a file
            # descriptor leaks per test. Cleanups run last-in first-out, so
            # the handler is removed from the logger before it is closed.
            self.addCleanup(handler.close)
            self.addCleanup(daemon_log.removeHandler, handler)
            daemon_log.setLevel(logging.DEBUG)

            self.simulator = ClusterSimulator(
                state_path, config['chroma_managers'][0]['server_http_url'])
            # Size the simulation by the server with the most device paths.
            volume_count = max(
                [len(s['device_paths']) for s in self.config_servers])
            self.simulator.setup(len(self.config_servers),
                                 len(self.config_workers),
                                 volume_count,
                                 self.SIMULATOR_NID_COUNT,
                                 self.SIMULATOR_CLUSTER_SIZE,
                                 len(config['power_distribution_units']),
                                 su_size=0)
            self.remote_operations = SimulatorRemoteOperations(
                self, self.simulator)
            if self.TESTS_NEED_POWER_CONTROL:
                self.simulator.power.start()
        else:
            self.remote_operations = RealRemoteOperations(self)

        if self.quick_setup is False:
            # Ensure that all servers are up and available
            for server in self.TEST_SERVERS:
                logger.info(
                    "Checking that %s is running and restarting if necessary..."
                    % server['fqdn'])
                self.remote_operations.await_server_boot(server['fqdn'],
                                                         restart=True)
                logger.info("%s is running" % server['fqdn'])
                self.remote_operations.inject_log_message(
                    server['fqdn'], "==== "
                    "starting test %s "
                    "=====" % self)

            if config.get('reset', True):
                self.reset_cluster()
            elif config.get('soft_reset', True):
                # Reset the manager via the API
                self.wait_until_true(self.api_contactable)
                self.remote_operations.unmount_clients()
                self.api_force_clear()
                self.remote_operations.clear_ha(self.TEST_SERVERS)
                self.remote_operations.clear_lnet_config(self.TEST_SERVERS)

            if config.get('managed'):
                # Ensure that config from previous runs doesn't linger into
                # this one.
                self.remote_operations.remove_config(self.TEST_SERVERS)

                # If there are no configuration options for a given server
                # (e.g. corosync_config), then this is a noop and no config file
                # is written.
                self.remote_operations.write_config(self.TEST_SERVERS)

                # cleanup linux devices
                self.cleanup_linux_devices(self.TEST_SERVERS)

                # cleanup zfs pools
                self.cleanup_zfs_pools(
                    self.config_servers, self.CZP_EXPORTPOOLS |
                    (self.CZP_RECREATEZPOOLS if config.get(
                        'new_zpools_each_test', False) else
                     self.CZP_REMOVEDATASETS), None, True)

            # Enable agent debugging
            self.remote_operations.enable_agent_debug(self.TEST_SERVERS)

        self.wait_until_true(self.supervisor_controlled_processes_running)
        # Remembered so that a later comparison can detect restarted processes.
        self.initial_supervisor_controlled_process_start_times = self.get_supervisor_controlled_process_start_times(
        )
Ejemplo n.º 6
0
class ApiTestCaseWithTestReset(UtilityTestCase):
    """
    Adds convenience for interacting with the chroma api.
    """
    # These are sufficient for tests existing at time of writing.
    # Tests may ask for different values by defining these at class scope.
    SIMULATOR_NID_COUNT = 5
    SIMULATOR_CLUSTER_SIZE = 2

    # Used by tests so that we don't need to be root
    COPYTOOL_TESTING_FIFO_ROOT = "/tmp"

    # Most tests do not need simulated PDUs, so don't bother starting them.
    # Flip this setting on a per-class basis for groups of tests which do
    # actually need PDUs.
    TESTS_NEED_POWER_CONTROL = False

    # By default, work with all configured servers. Tests which will
    # only ever be using a subset of servers can override this to
    # gain a slight decrease in running time.
    TEST_SERVERS = config['lustre_servers']

    # Storage for details of the rest api provided by the manager. Presumes that the api does not change during
    # execution of the whole test suite and so stores the result once in a class variable. A class variable
    # because instances of this class come and go but the api today is constant.
    _chroma_manager_api = None

    _chroma_manager = None
    _unauthorized_chroma_manager = None

    def __init__(self, methodName='runTest'):
        """Create the test case with no simulator or remote operations yet."""
        super(ApiTestCaseWithTestReset, self).__init__(methodName)
        # Both stay None until setUp decides between simulated and real hosts.
        self.simulator = self.remote_operations = None
        # tearDown raises on unreachable servers unless this is changed.
        self.down_node_expected = False

    def setUp(self):
        """Prepare the test environment before each test.

        When the ``simulator`` config option is set, a fresh agent
        ConfigStore is patched in, the agent log is attached to a file and a
        ClusterSimulator is created and set up from the configured servers.
        Otherwise real remote operations are used.

        Unless ``quick_setup`` is enabled, every server in TEST_SERVERS is
        checked (and restarted if needed), the cluster is reset according to
        the ``reset``/``soft_reset`` options, and managed-mode leftovers are
        cleaned up. Finally the supervisor-controlled processes are awaited
        and their start times recorded for comparison in tearDown.

        Raises ImportError when the agent or simulator packages are missing
        and RuntimeError when the simulator state folder already exists.
        """
        super(ApiTestCaseWithTestReset, self).setUp()

        if config.get('simulator', False):
            # When we're running with the simulator, parts of the simulated
            # Copytools use agent code, and the agent code expects to find
            # a populated agent-side configuration. The safe way to handle
            # this requirement is to use mock to patch in a fresh
            # ConfigStore instance for each test run.
            try:
                from chroma_agent.config_store import ConfigStore
            except ImportError:
                raise ImportError(
                    "Cannot import agent, do you need to do a 'setup.py develop' of it?"
                )

            import mock  # Mock is only available when running the simulator, hence local inclusion
            self.mock_config = ConfigStore(tempfile.mkdtemp())
            mock.patch('chroma_agent.config', self.mock_config).start()
            from chroma_agent.action_plugins.settings_management import reset_agent_config, set_agent_config
            reset_agent_config()
            # Allow the worker to create a fifo in /tmp rather than /var/spool
            set_agent_config('copytool_fifo_directory',
                             self.COPYTOOL_TESTING_FIFO_ROOT)

            try:
                from cluster_sim.simulator import ClusterSimulator
            except ImportError:
                raise ImportError(
                    "Cannot import simulator, do you need to do a 'setup.py develop' of it?"
                )

            # The simulator's state directory will be left behind when a test fails,
            # so make sure it has a unique-per-run name
            timestamp = datetime.datetime.now().strftime("%Y-%m-%d_%H%M%S")
            state_path = 'simulator_state_%s.%s_%s' % (
                self.__class__.__name__, self._testMethodName, timestamp)
            if os.path.exists(state_path):
                raise RuntimeError(
                    "Simulator state folder already exists at '%s'!" %
                    state_path)

            # Hook up the agent log to a file
            from chroma_agent.agent_daemon import daemon_log
            handler = logging.FileHandler(
                os.path.join(config.get('log_dir', '/var/log/'),
                             'chroma_test_agent.log'))
            handler.setFormatter(
                logging.Formatter('[%(asctime)s] %(message)s',
                                  '%d/%b/%Y:%H:%M:%S'))
            daemon_log.addHandler(handler)
            # Detach and close the handler when the test is over. Without
            # this every test stacks another handler on the shared logger,
            # so each message is written once per test run so far and a file
            # descriptor leaks per test. Cleanups run last-in first-out, so
            # the handler is removed from the logger before it is closed.
            self.addCleanup(handler.close)
            self.addCleanup(daemon_log.removeHandler, handler)
            daemon_log.setLevel(logging.DEBUG)

            self.simulator = ClusterSimulator(
                state_path, config['chroma_managers'][0]['server_http_url'])
            # Size the simulation by the server with the most device paths.
            volume_count = max(
                [len(s['device_paths']) for s in self.config_servers])
            self.simulator.setup(len(self.config_servers),
                                 len(self.config_workers),
                                 volume_count,
                                 self.SIMULATOR_NID_COUNT,
                                 self.SIMULATOR_CLUSTER_SIZE,
                                 len(config['power_distribution_units']),
                                 su_size=0)
            self.remote_operations = SimulatorRemoteOperations(
                self, self.simulator)
            if self.TESTS_NEED_POWER_CONTROL:
                self.simulator.power.start()
        else:
            self.remote_operations = RealRemoteOperations(self)

        if self.quick_setup is False:
            # Ensure that all servers are up and available
            for server in self.TEST_SERVERS:
                logger.info(
                    "Checking that %s is running and restarting if necessary..."
                    % server['fqdn'])
                self.remote_operations.await_server_boot(server['fqdn'],
                                                         restart=True)
                logger.info("%s is running" % server['fqdn'])
                self.remote_operations.inject_log_message(
                    server['fqdn'], "==== "
                    "starting test %s "
                    "=====" % self)

            if config.get('reset', True):
                self.reset_cluster()
            elif config.get('soft_reset', True):
                # Reset the manager via the API
                self.wait_until_true(self.api_contactable)
                self.remote_operations.unmount_clients()
                self.api_force_clear()
                self.remote_operations.clear_ha(self.TEST_SERVERS)
                self.remote_operations.clear_lnet_config(self.TEST_SERVERS)

            if config.get('managed'):
                # Ensure that config from previous runs doesn't linger into
                # this one.
                self.remote_operations.remove_config(self.TEST_SERVERS)

                # If there are no configuration options for a given server
                # (e.g. corosync_config), then this is a noop and no config file
                # is written.
                self.remote_operations.write_config(self.TEST_SERVERS)

                # cleanup linux devices
                self.cleanup_linux_devices(self.TEST_SERVERS)

                # cleanup zfs pools
                self.cleanup_zfs_pools(
                    self.config_servers, self.CZP_EXPORTPOOLS |
                    (self.CZP_RECREATEZPOOLS if config.get(
                        'new_zpools_each_test', False) else
                     self.CZP_REMOVEDATASETS), None, True)

            # Enable agent debugging
            self.remote_operations.enable_agent_debug(self.TEST_SERVERS)

        self.wait_until_true(self.supervisor_controlled_processes_running)
        # Remembered so that a later comparison can detect restarted processes.
        self.initial_supervisor_controlled_process_start_times = self.get_supervisor_controlled_process_start_times(
        )

    def tearDown(self):
        """Tear the test environment down and check the manager is healthy.

        With a simulator: stop it, undo the mock patches, delete the
        temporary agent config and, if no exception is in flight, delete the
        simulator state folder too. With real remote operations: verify that
        every server in TEST_SERVERS is still contactable, raising
        RuntimeError if any is missing while ``down_node_expected`` is False.

        In all cases the supervisor-controlled processes must be running and
        must have the same start times as those recorded earlier.
        """
        if self.simulator:
            self.simulator.stop()
            self.simulator.join()

            # Clean up the temp agent config
            import mock  # Mock is only available when running the simulator, hence local inclusion
            mock.patch.stopall()
            shutil.rmtree(self.mock_config.path)

            # Only discard the simulator state when nothing went wrong.
            if sys.exc_info() == (None, None, None):
                shutil.rmtree(self.simulator.folder)
        elif self.remote_operations:
            # Check that all servers are up and available after the test
            unreachable = []
            for server in self.TEST_SERVERS:
                address = server['address']
                if self.remote_operations.host_contactable(address):
                    self.remote_operations.inject_log_message(
                        server['fqdn'], "==== stopping test %s =====" % self)
                else:
                    unreachable.append(address)

            if unreachable and self.down_node_expected is False:
                unreachable_text = ", ".join(unreachable)
                logger.warning(
                    "After test, some servers were no longer running: %s" %
                    unreachable_text)
                raise RuntimeError("AWOL servers after test: %s" %
                                   unreachable_text)

        self.assertTrue(self.supervisor_controlled_processes_running())
        current_start_times = self.get_supervisor_controlled_process_start_times()
        self.assertEqual(
            self.initial_supervisor_controlled_process_start_times,
            current_start_times)

    @property
    def config_servers(self):
        """Configured lustre servers whose profile does not contain 'worker'."""
        selected = []
        for entry in config['lustre_servers']:
            # A missing profile counts as a non-worker.
            if 'worker' in entry.get('profile', ""):
                continue
            selected.append(entry)
        return selected

    @property
    def config_workers(self):
        """Configured lustre servers whose profile contains 'worker'."""
        selected = []
        for entry in config['lustre_servers']:
            # A missing profile never matches.
            if 'worker' in entry.get('profile', ""):
                selected.append(entry)
        return selected

    @property
    def chroma_manager(self):
        """Authorized HTTP client for the first configured manager.

        Created on first access from that manager's first user and its
        ``server_http_url``, then reused.
        """
        if self._chroma_manager is not None:
            return self._chroma_manager

        first_user = config['chroma_managers'][0]['users'][0]
        username = first_user['username']
        password = first_user['password']
        manager_url = config['chroma_managers'][0]['server_http_url']
        self._chroma_manager = AuthorizedHttpRequests(
            username, password, server_http_url=manager_url)
        return self._chroma_manager

    @property
    def unauthorized_chroma_manager(self):
        """HTTP client without credentials for the first configured manager.

        Created on first access and reused afterwards.
        """
        if self._unauthorized_chroma_manager is not None:
            return self._unauthorized_chroma_manager

        manager_url = config['chroma_managers'][0]['server_http_url']
        self._unauthorized_chroma_manager = HttpRequests(
            server_http_url=manager_url)
        return self._unauthorized_chroma_manager

    def restart_chroma_manager(self, fqdn):
        """Restart the manager on ``fqdn`` and re-record process start times."""
        self.remote_operations.restart_chroma_manager(fqdn)
        # The restart invalidates the recorded start times, so take a fresh
        # snapshot for later comparisons.
        refreshed = self.get_supervisor_controlled_process_start_times()
        self.initial_supervisor_controlled_process_start_times = refreshed

    def api_contactable(self):
        """Return True if the system status endpoint can be reached.

        A ``requests.ConnectionError`` yields False; any other exception
        propagates.
        """
        try:
            self.chroma_manager.get('/api/system_status/')
        except requests.ConnectionError:
            return False
        else:
            return True

    def supervisor_controlled_processes_running(self):
        """Return True when every supervisor process reports RUNNING.

        The status comes from ``/api/system_status/``. A connection error or
        any process in another state is logged as a warning and gives False.
        An unsuccessful response fails the test via assertEqual.
        """
        try:
            response = self.chroma_manager.get('/api/system_status/')
        except requests.ConnectionError:
            logger.warning("Connection error trying to connect to the manager")
            return False

        self.assertEqual(response.successful, True, response.text)
        stopped = [
            entry for entry in response.json['supervisor']
            if entry['statename'] != 'RUNNING'
        ]

        if not stopped:
            return True

        logger.warning("Supervisor processes found not to be running: '%s'" %
                       stopped)
        return False

    def get_supervisor_controlled_process_start_times(self):
        """Return a dict mapping supervisor process name to its start value.

        The data is read from ``/api/system_status/``; an unsuccessful
        response fails the test via assertEqual.
        """
        response = self.chroma_manager.get('/api/system_status/')
        self.assertEqual(response.successful, True, response.text)
        return dict((entry['name'], entry['start'])
                    for entry in response.json['supervisor'])

    def _print_command(self, command, disposition, msg):
        print "COMMAND %s: %s" % (command['id'], disposition)
        print "-----------------------------------------------------------"
        print command
        if msg:
            print "Run to %s" % msg
        print ''

        for job_uri in command['jobs']:
            job = self.get_json_by_uri(job_uri)
            job_steps = [self.get_json_by_uri(s) for s in job['steps']]
            if disposition == "FAILED":
                if job['errored']:
                    print "Job %s Errored (%s):" % (job['id'],
                                                    job['description'])
                    print job
                    print ''
                    for step in job_steps:
                        if step['state'] == 'failed':
                            print "Step %s failed:" % step['id']
                            for k, v in step.iteritems():
                                print "%s: %s" % (k, v)
                            print ''
            elif disposition == "TIMED OUT":
                if job['state'] != "complete":
                    print "Job %s incomplete:" % job['id']
                    print job
                    print ''
                    for step in job_steps:
                        if step['state'] == "incomplete":
                            print "Step %s incomplete:" % step['id']
                            print step
                            print ''
            else:
                print job
                for step in job_steps:
                    print step

    def wait_for_command(self,
                         chroma_manager,
                         command_id,
                         timeout=TEST_TIMEOUT,
                         verify_successful=True,
                         test_for_eventual_completion=True,
                         msg=None):
        """Poll a command until it completes or the timeout expires.

        :param chroma_manager: Not used by this method (requests go through
            ``self.get_json_by_uri``); kept for interface compatibility.
        :param command_id: Id of the command to poll at /api/command/<id>/.
        :param timeout: Number of one-second polling sleeps to allow.
        :param verify_successful: When True, fail the test if the command
            ended up errored or cancelled.
        :param test_for_eventual_completion: When True and the command timed
            out, wait once more (for diagnostics only; the test still fails).
        :param msg: Optional context included in diagnostics.
        :return: The last command JSON fetched.
        :raises AssertionError: if the command does not complete in time, or
            it failed and ``verify_successful`` is True.
        """
        command_uri = '/api/command/%s/' % command_id

        # Fetch once before polling. This feeds the debug log and guarantees
        # that ``command`` is bound even when ``timeout`` is zero or
        # negative, in which case the polling loop below never runs.
        command = self.get_json_by_uri(command_uri)
        logger.debug("wait_for_command: %s" % command)
        command_complete = command['complete']

        # TODO: More elegant timeout?
        running_time = 0
        while running_time < timeout and not command_complete:
            command = self.get_json_by_uri(command_uri)
            command_complete = command['complete']
            if not command_complete:
                time.sleep(1)
                running_time += 1

        # The loop ends either because the command completed or because the
        # time ran out, so "not complete" is exactly the timed-out case.
        if not command_complete:
            self._print_command(command, "TIMED OUT", msg)

            # Now wait again to see if the command eventually succeeds. The test will still fail but we get some idea
            # if we just aren't waiting long enough or if there is an issue.
            if test_for_eventual_completion:
                retry_time = time.time()
                # If it fails the second time this will throw an exception and never return.
                self._print_command(
                    self.wait_for_command(chroma_manager,
                                          command_id,
                                          timeout=timeout,
                                          verify_successful=verify_successful,
                                          test_for_eventual_completion=False),
                    "COMPLETED %s SECONDS AFTER TIMEOUT" %
                    int(time.time() - retry_time), None)
        else:
            logger.debug("command for %s complete: %s" % (msg, command))

        self.assertTrue(command_complete, "%s: --> %s" % (msg, command))
        if verify_successful and (command['errored'] or command['cancelled']):
            self._print_command(command, "FAILED", msg)

            self.assertFalse(command['errored'] or command['cancelled'],
                             command)

        return command

    def wait_for_commands(self,
                          chroma_manager,
                          command_ids,
                          timeout=TEST_TIMEOUT,
                          verify_successful=True):
        """Wait for each of the given commands in turn.

        :param chroma_manager: Must be an AuthorizedHttpRequests instance.
        :param command_ids: List of command ids to wait for.
        :param timeout: Per-command timeout (int) passed to wait_for_command.
        :param verify_successful: Passed through to wait_for_command (bool).
        :raises AssertionError: if an argument has the wrong type, or from
            wait_for_command.
        """
        assert type(
            chroma_manager
        ) is AuthorizedHttpRequests, "chroma_manager is not an AuthorizedHttpRequests: %s" % chroma_manager
        # The message used to say "not an int" although a list is required.
        assert type(
            command_ids
        ) is list, "command_ids is not a list: %s" % type(command_ids)
        assert type(
            timeout) is int, "timeout is not an int: %s" % type(timeout)
        assert type(verify_successful
                    ) is bool, "verify_successful is not a bool: %s" % type(
                        verify_successful)

        for command_id in command_ids:
            self.wait_for_command(chroma_manager, command_id, timeout,
                                  verify_successful)

    def wait_last_command_complete(self,
                                   timeout=TEST_TIMEOUT,
                                   verify_successful=True):
        """Wait until no command is incomplete, then check the newest one.

        All commands are awaited, not just the last. When
        ``verify_successful`` is True the command with the highest id is then
        passed to wait_for_command with success verification enabled.

        :param timeout: Time to wait for commands to complete (int).
        :param verify_successful: Whether to verify the last command (bool).
        :return: No return value.
        """
        assert type(
            timeout) is int, "timeout is not an int: %s" % type(timeout)
        assert type(verify_successful
                    ) is bool, "verify_successful is not a bool: %s" % type(
                        verify_successful)

        def no_incomplete_commands():
            pending = self.get_json_by_uri('/api/command/',
                                           args={'complete': False})
            self.assertEqual(0, len(pending['objects']))

        self.wait_for_assert(no_incomplete_commands, timeout)

        if not verify_successful:
            return

        every_command = self.get_json_by_uri('/api/command/',
                                             args={'limit': 0})['objects']
        # Ids are compared numerically to find the newest command.
        newest = max(every_command, key=lambda command: int(command['id']))
        self.wait_for_command(self.chroma_manager, newest['id'], timeout,
                              True)

    def wait_alerts(self, expected_alerts, **filters):
        """Wait until the alert types matching ``filters`` equal the expected ones.

        :param expected_alerts: Iterable of expected alert type names; order
            is irrelevant and the argument is not modified.
        :param filters: Query arguments passed to /api/alert/.
        :return: The sorted list of alert types once it matches.
        :raises AssertionError: if no match is seen while ``util.wait``
            keeps iterating.
        """
        # Sort a copy so the caller's list is left untouched.
        expected = sorted(expected_alerts)
        # Bound up front so the failure path works even if the wait loop
        # never runs.
        alerts = []

        for _ in util.wait(TEST_TIMEOUT):
            alerts = sorted(alert['alert_type']
                            for alert in self.get_list("/api/alert/", filters))
            if alerts == expected:
                return alerts

        raise AssertionError("Expected alerts %s but last saw %s" %
                             (expected, alerts))

    def get_list(self, url, args=None):
        """GET ``url`` and return the ``objects`` list of the JSON response.

        ``args`` (a dict, or any falsy value for none) is sent as query
        parameters. A status code other than 200 fails the test.
        """
        params = args or {}
        assert type(params) is dict, "args is not a dictionary: %s" % type(
            params)

        response = self.chroma_manager.get(url, params=params)
        self.assertEqual(response.status_code, 200, response.content)
        payload = response.json
        return payload['objects']

    def get_by_uri(self, uri, args=None, verify_successful=True):
        """GET ``uri`` through the authorized manager client.

        ``args`` (a dict, or any falsy value for none) is sent as query
        parameters. When ``verify_successful`` is True a status code other
        than 200 fails the test. Returns the response object.
        """
        params = args or {}
        assert type(params) is dict, "args is not a dictionary: %s" % type(
            params)
        assert type(verify_successful
                    ) is bool, "verify_successful is not a bool: %s" % type(
                        verify_successful)

        response = self.chroma_manager.get(uri, params=params)
        if not verify_successful:
            return response

        self.assertEqual(response.status_code, 200, response.content)
        return response

    def get_json_by_uri(self, uri, args=None, verify_successful=True):
        """GET ``uri`` via get_by_uri and return the response's ``json``."""
        params = args or {}
        assert type(params) is dict, "args is not a dictionary: %s" % type(
            params)
        assert type(verify_successful
                    ) is bool, "verify_successful is not a bool: %s" % type(
                        verify_successful)

        response = self.get_by_uri(uri, params, verify_successful)
        return response.json

    def wait_for_action(self, victim, timeout=TEST_TIMEOUT, **filters):
        """
        Poll victim's available_actions until one matches every filter.

        Each keyword in ``filters`` (e.g. class_name, state) must equal the
        same key of the action. The matching action is returned; if none
        shows up while ``util.wait(timeout)`` keeps iterating, an
        AssertionError listing the last seen actions is raised.
        """
        for _ in util.wait(timeout):
            resource = self.get_json_by_uri(victim['resource_uri'])
            actions = resource['available_actions']
            for candidate in actions:
                if all(
                        candidate.get(name) == wanted
                        for name, wanted in filters.items()):
                    return candidate

        # Report only the filtered keys of the last actions seen.
        seen = [
            dict((name, candidate.get(name)) for name in filters)
            for candidate in actions
        ]
        raise AssertionError('{0} not found in {1}'.format(filters, seen))

    def run_command(self, jobs, message=None, verify_successful=True):
        """
        POST a command made of ``jobs`` to /api/command/.

        A falsy message is replaced by "Test command". When
        verify_successful is True the response must be successful and the
        command is waited for. The response is returned either way.
        """
        message = message or "Test command"
        assert type(message) in (
            str, unicode), "message is not a str/unicode: %s" % type(message)
        assert type(
            verify_successful
        ) is bool, "verify_successful is not a bool: %s" % type(
            verify_successful)

        logger.debug("Running %s (%s)" % (jobs, message))
        payload = {'jobs': jobs, 'message': message}
        response = self.chroma_manager.post('/api/command/', body=payload)

        if not verify_successful:
            return response

        self.assertTrue(response.successful, response.text)
        self.wait_for_command(self.chroma_manager, response.json['id'])
        return response

    def post_by_uri(self, uri, object, verify_successful=True):
        """
        POST ``object`` to ``uri`` and wait for the resulting command.

        :param uri: API uri to post to.
        :param object: request body. The name shadows the builtin but is
            kept so keyword callers keep working.
        :param verify_successful: forwarded to wait_for_command().
        :return: the result of wait_for_command(), or None when the manager
            answered 204 (nothing to do).
        :raises AssertionError: if the status is neither 204 nor 202.
        """
        logger.debug("post_by_uri(%s, ...)" % uri)
        response = self.chroma_manager.post(uri, body=object)

        if response.status_code == 204:
            logger.warning("post_by_uri(%s, ...) - no-op" % uri)
            return None

        # assertEqual rather than the deprecated assertEquals alias.
        self.assertEqual(response.status_code, 202, response.content)
        return self.wait_for_command(self.chroma_manager,
                                     response.json['command']['id'],
                                     verify_successful=verify_successful)

    # Verification modes for the verify_successful argument of set_value().
    # NO: neither the command outcome nor the resulting value is verified.
    VERIFY_SUCCESS_NO = 0
    # INSTANT: the command is verified and the value is asserted immediately.
    VERIFY_SUCCESS_INSTANT = 1
    # WAIT: the command is verified and the value is asserted via wait_for_assert().
    VERIFY_SUCCESS_WAIT = 2

    def set_value(self,
                  uri,
                  value_name,
                  value,
                  verify_successful=VERIFY_SUCCESS_INSTANT,
                  msg=None):
        """
        PUT the object at ``uri`` back with ``value_name`` set to ``value``.

        :param uri: API uri of the object to modify.
        :param value_name: key to set on the fetched object.
        :param value: value to store under value_name.
        :param verify_successful: one of the VERIFY_SUCCESS_* constants.
            Anything other than VERIFY_SUCCESS_NO makes wait_for_command()
            verify the command. INSTANT asserts the value immediately and
            WAIT asserts it through wait_for_assert().
        :param msg: optional message, logged and passed to wait_for_command().
        :return: the result of wait_for_command(), or None when there was no
            command to wait for.
        :raises AssertionError: if the PUT status is neither 204 nor 202, or
            the value verification fails.
        """
        logger.debug("set_%s %s %s%s" %
                     (value_name, uri, value, ": %s" % msg if msg else ""))
        resource = self.get_json_by_uri(uri)
        # We do this because some objects will presume a put with 'state' is a state change.
        # Do it before the object value setting so that if state is being set we don't break it.
        resource.pop('state', None)
        resource[value_name] = value

        command = None
        response = self.chroma_manager.put(uri, body=resource)
        if response.status_code == 204:
            logger.warning("set_%s %s %s - no-op" % (value_name, uri, value))
        else:
            # assertEqual rather than the deprecated assertEquals alias.
            self.assertEqual(response.status_code, 202, response.content)

            payload = response.json

            if 'cancelled' in payload and 'complete' in payload and 'errored' in payload:
                # The body carries these command fields itself, so its own
                # id is used as the command id.
                command_id = payload['id']
            # If the state is not a change command will be none and so we have nothing to wait for.
            elif payload['command'] is None:
                command_id = None
            else:
                command_id = payload['command']['id']

            if command_id:
                command = self.wait_for_command(
                    self.chroma_manager,
                    command_id,
                    verify_successful=(verify_successful !=
                                       self.VERIFY_SUCCESS_NO),
                    msg=msg,
                    timeout=LONG_TEST_TIMEOUT)

        if verify_successful == self.VERIFY_SUCCESS_INSTANT:
            self.assertValue(uri, value_name, value)
        elif verify_successful == self.VERIFY_SUCCESS_WAIT:
            self.wait_for_assert(
                lambda: self.assertValue(uri, value_name, value))

        return command

    def set_state(self, uri, state, verify_successful=True, msg=None):
        """
        Set the 'state' of the object at ``uri`` through set_value().

        A truthy verify_successful maps to VERIFY_SUCCESS_INSTANT, a falsy
        one to VERIFY_SUCCESS_NO. Returns whatever set_value() returns.
        """
        if verify_successful:
            verify_mode = self.VERIFY_SUCCESS_INSTANT
        else:
            verify_mode = self.VERIFY_SUCCESS_NO

        return self.set_value(uri, 'state', state, verify_mode, msg)

    def set_state_dry_run(self, uri, state):
        """
        PUT a state change for ``uri`` with 'dry_run' set.

        :param uri: API uri of the stateful object.
        :param state: state to request.
        :return: the json of the PUT response.
        :raises AssertionError: if the PUT status is not 200.
        """
        stateful_object = self.get_json_by_uri(uri)

        stateful_object['state'] = state
        stateful_object['dry_run'] = True

        response = self.chroma_manager.put(uri, body=stateful_object)

        # assertEqual rather than the deprecated assertEquals alias.
        self.assertEqual(response.status_code, 200, response.content)

        return response.json

    def delete_by_uri(self, uri, verify_successful=True):
        """
        DELETE ``uri`` and return the response. Unless verify_successful is
        falsy, the status code is asserted to be 204.
        """
        reply = self.chroma_manager.delete(uri)
        if not verify_successful:
            return reply

        self.assertEqual(reply.status_code, 204)
        return reply

    def assertNoAlerts(self, uri, of_severity=None, of_type=None):
        """Fail if ``uri`` is the alert_item of any active alert, optionally
        restricted to one severity and/or one alert type."""

        # Historical note: this was meant to check for "no alerts", but it
        # only ever checked active alerts that had not been dismissed.
        # Callers such as test_alerting.py expected no alerts of the type at
        # all. Alerts can no longer be in that state: users dismiss alerts,
        # and the code that used to create dismissed alerts now creates
        # alerts in the warning state.
        # TODO: fix each test that calls this method to be sure the proper
        # state is being tested.

        query = {'active': True}
        # Severity should be 'ERROR', 'WARNING', 'INFO'
        optional = (('alert_type', of_type), ('severity', of_severity))
        query.update(
            (key, wanted) for key, wanted in optional if wanted is not None)

        active_alerts = self.get_list("/api/alert/", query)
        alert_items = [entry['alert_item'] for entry in active_alerts]
        self.assertNotIn(uri, alert_items)

    def assertHasAlert(self, uri, of_severity=None, of_type=None):
        """Fail unless ``uri`` is the alert_item of some active alert,
        optionally restricted to one severity and/or one alert type. The
        alert items that were found are used as the failure message."""
        query = {'active': True}
        # Severity should be 'ERROR', 'WARNING', 'INFO'
        optional = (('severity', of_severity), ('alert_type', of_type))
        query.update(
            (key, wanted) for key, wanted in optional if wanted is not None)

        alert_items = [
            entry['alert_item']
            for entry in self.get_list("/api/alert/", query)
        ]
        self.assertIn(uri, alert_items, alert_items)

    def get_alert(self, uri, regex=None, alert_type=None, active=True):
        """Return the single alert whose alert_item is `uri`.

           Alerts are fetched with the given `active` flag, then optionally
           narrowed to those whose message matches `regex` (re.match) and
           to those of `alert_type`.  Raise an AssertionError if no alert,
           or more than one alert, is left."""

        every_alert = self.get_list("/api/alert/", {
            'active': active,
            'limit': 0
        })

        candidates = [
            alert for alert in every_alert if alert['alert_item'] == uri
        ]
        if len(candidates) == 0:
            raise AssertionError("No alerts for object %s (alerts are %s)" %
                                 (uri, every_alert))

        if regex is not None:
            candidates = [
                alert for alert in candidates
                if re.match(regex, alert['message'])
            ]
            if len(candidates) == 0:
                raise AssertionError(
                    "No alerts for object %s matching %s (alerts are %s)" %
                    (uri, regex, every_alert))

        if alert_type is not None:
            candidates = [
                alert for alert in candidates
                if alert['alert_type'] == alert_type
            ]
            if len(candidates) == 0:
                raise AssertionError(
                    "No alerts of type %s found (alerts are %s)" %
                    (alert_type, every_alert))

        if len(candidates) > 1:
            raise AssertionError("Multiple alerts match: %s" % candidates)

        return candidates[0]

    def assertValue(self, uri, value_name, value):
        """Assert that the object at ``uri`` holds ``value`` under
        ``value_name``. Both sides are compared as strings."""
        logger.debug("assertValue %s %s %s" % (value_name, uri, value))
        fetched = self.get_json_by_uri(uri)
        # Convert to strings so we don't get type issues.
        actual = str(fetched[value_name])
        expected = str(value)
        self.assertEqual(actual, expected)

    def assertState(self, uri, state):
        """Assert that the object at ``uri`` is in ``state``, using
        assertValue() on its 'state' key."""
        state_key = 'state'
        self.assertValue(uri, state_key, state)

    def get_filesystem(self, filesystem_id):
        """Return the json of /api/filesystem/<filesystem_id>/."""
        filesystem_uri = "/api/filesystem/%s/" % filesystem_id
        return self.get_json_by_uri(filesystem_uri)

    def get_filesystem_by_name(self, name):
        """Return the first filesystem listed by /api/filesystem/ whose
        'name' equals ``name``. Raise KeyError if there is none."""
        named = [
            filesystem for filesystem in self.get_list("/api/filesystem/")
            if filesystem['name'] == name
        ]
        if not named:
            raise KeyError("No filesystem named %s" % name)
        return named[0]

    @property
    def chroma_manager_api(self):
        """
        The description of the REST api as returned by the manager at "/api/".

        The api is presumed not to change while the test suite runs, so the
        result is fetched once and kept in a class variable on
        ApiTestCaseWithTestReset. A class variable is used because instances
        of this class come and go.

        get_json_by_uri produces a lot of debug output. Fetching only once
        also means that output is only seen once.

        :return: hash of the api
        """
        holder = ApiTestCaseWithTestReset

        if holder._chroma_manager_api is None:
            holder._chroma_manager_api = self.get_json_by_uri("/api/")

        return holder._chroma_manager_api

    def reset_cluster(self):
        """
        Fully wipe a test cluster, in this order:
          - on managed configurations, unmount the clients
          - reset the manager database (reset_chroma_manager_db)
          - stop the agents on every configured lustre server
          - on managed configurations, clear the HA and lnet configuration
            of TEST_SERVERS
        """
        if config.get('managed'):
            self.remote_operations.unmount_clients()

        self.reset_chroma_manager_db()

        agent_addresses = (server['address']
                           for server in config['lustre_servers'])
        self.remote_operations.stop_agents(agent_addresses)

        if not config.get('managed'):
            return

        self.remote_operations.clear_ha(self.TEST_SERVERS)
        self.remote_operations.clear_lnet_config(self.TEST_SERVERS)

    def reset_chroma_manager_db(self):
        """
        Stop each configured manager, wipe its database and set it up again.

        For every entry of config['chroma_managers'] this runs, through
        remote_command():
          - 'chroma-config stop', then polls 'service ... status' until the
            services report exit status 3
          - stops postgresql and removes /var/lib/pgsql/data/*
          - 'chroma-config setup' as the first user flagged 'super'
          - 'chroma-config bundle register' for the bundles in the repo
          - 'chroma-config profile register' for each profile found in
            /tmp/iml-*/

        :raises AssertionError: if the services have not stopped after
            TEST_TIMEOUT polls, the installer directory cannot be listed, or
            one of the chroma-config steps exits non-zero. In the last case
            the step's log file is included in the message.
        """
        for chroma_manager in config['chroma_managers']:
            address = chroma_manager['address']
            superuser = [u for u in chroma_manager['users'] if u['super']][0]

            # Stop all manager services
            result = self.remote_command(address,
                                         'chroma-config stop',
                                         expected_return_code=None)
            if result.exit_status != 0:
                logger.warning(
                    "chroma-config stop failed: rc:%s out:'%s' err:'%s'" %
                    (result.exit_status, result.stdout, result.stderr))

            # Wait for all of the manager services to stop
            running_time = 0
            services = ['chroma-supervisor']
            while services and running_time < TEST_TIMEOUT:
                # Rebuild the list of still-running services rather than
                # removing entries from the list being iterated, which skips
                # the element following each removal.
                services = [
                    service for service in services
                    if self.remote_command(
                        address,
                        'service %s status' % service,
                        expected_return_code=None).exit_status != 3
                ]
                # Only pause when something is still running.
                if services:
                    time.sleep(1)
                    running_time += 1
            self.assertEqual(
                services, [],
                "Not all services were stopped by chroma-config before timeout: %s"
                % services)

            # Completely nuke the database to start from a clean state.
            self.remote_command(
                address,
                'service postgresql stop && rm -fr /var/lib/pgsql/data/*')

            # Run chroma-config setup to recreate the database and start the manager.
            result = self.remote_command(
                address,
                "chroma-config setup %s %s %s %s &> config_setup.log" %
                (superuser['username'], superuser['password'],
                 chroma_manager.get('ntp_server',
                                    "localhost"), "--no-dbspace-check"),
                expected_return_code=None)
            chroma_config_exit_status = result.exit_status
            if chroma_config_exit_status != 0:
                # Fetch the log so the failure message shows what went wrong.
                result = self.remote_command(address, "cat config_setup.log")
                self.assertEqual(
                    0, chroma_config_exit_status,
                    "chroma-config setup failed: '%s'" % result.stdout)

            # Register the default bundles and profile again
            result = self.remote_command(
                address,
                "for bundle_meta in /var/lib/chroma/repo/*/%s/meta; do chroma-config bundle register $(dirname $bundle_meta); done &> config_bundle.log"
                % platform.dist()[1][0:1],
                expected_return_code=None)
            chroma_config_exit_status = result.exit_status
            if chroma_config_exit_status != 0:
                result = self.remote_command(address, "cat config_bundle.log")
                self.assertEqual(
                    0, chroma_config_exit_status,
                    "chroma-config bundle register failed: '%s'" %
                    result.stdout)

            result = self.remote_command(address,
                                         "ls /tmp/iml-*/",
                                         expected_return_code=None)
            installer_contents = result.stdout
            self.assertEqual(
                0, result.exit_status,
                "Could not find installer! Expected the installer to be in /tmp/. \n'%s' '%s'"
                % (installer_contents, result.stderr))

            logger.debug("Installer contents: %s" % installer_contents)

            # get a list of profiles in the installer and re-register them
            profiles = " ".join([
                line for line in installer_contents.split("\n")
                if 'profile' in line
            ])
            logger.debug("Found these profiles: %s" % profiles)
            result = self.remote_command(
                address,
                "for profile_pat in %s; do chroma-config profile register /tmp/iml-*/$profile_pat; done &> config_profile.log"
                % profiles,
                expected_return_code=None)
            chroma_config_exit_status = result.exit_status
            if chroma_config_exit_status != 0:
                result = self.remote_command(address, "cat config_profile.log")
                self.assertEqual(
                    0, chroma_config_exit_status,
                    "chroma-config profile register failed: '%s'" %
                    result.stdout)

    def graceful_teardown(self, chroma_manager):
        """
        Removes all filesystems, MGSs, and hosts from chroma via the api.  This is
        not guaranteed to work, and should be done at the end of tests in order to
        verify that the manager instance was in a nice state, rather than
        in setUp/tearDown hooks to ensure a clean system.

        :param chroma_manager: the manager client to tear down. Every request
            and the final database check go through it.
        :raises AssertionError: if a request fails, a delete does not return
            a command id, or objects remain afterwards.
        """
        response = chroma_manager.get('/api/filesystem/', params={'limit': 0})
        self.assertEqual(response.status_code, 200)
        filesystems = response.json['objects']

        self.remote_operations.unmount_clients()

        if filesystems:
            # Remove filesystems
            remove_filesystem_command_ids = []
            for filesystem in filesystems:
                response = chroma_manager.delete(filesystem['resource_uri'])
                self.assertTrue(response.successful, response.text)
                command_id = response.json['command']['id']
                self.assertTrue(command_id)
                remove_filesystem_command_ids.append(command_id)

            self.wait_for_commands(chroma_manager,
                                   remove_filesystem_command_ids,
                                   timeout=LONG_TEST_TIMEOUT)

        # Remove MGT
        response = chroma_manager.get('/api/target/',
                                      params={
                                          'kind': 'MGT',
                                          'limit': 0
                                      })
        self.assertTrue(response.successful, response.text)
        mgts = response.json['objects']

        if mgts:
            remove_mgt_command_ids = []
            for mgt in mgts:
                response = chroma_manager.delete(mgt['resource_uri'])
                # Same check as the filesystem and host deletes, so a failed
                # delete reports the response text instead of a KeyError.
                self.assertTrue(response.successful, response.text)
                command_id = response.json['command']['id']
                self.assertTrue(command_id)
                remove_mgt_command_ids.append(command_id)

            self.wait_for_commands(chroma_manager, remove_mgt_command_ids)

        # Remove hosts
        response = chroma_manager.get('/api/host/', params={'limit': 0})
        self.assertTrue(response.successful, response.text)
        hosts = response.json['objects']

        if hosts:
            remove_host_command_ids = []
            for host in hosts:
                response = chroma_manager.delete(host['resource_uri'])
                self.assertTrue(response.successful, response.text)
                command_id = response.json['command']['id']
                self.assertTrue(command_id)
                remove_host_command_ids.append(command_id)

            self.wait_for_commands(chroma_manager,
                                   remove_host_command_ids,
                                   timeout=LONG_TEST_TIMEOUT)

        # Check the manager that was just torn down, not self.chroma_manager.
        self.assertDatabaseClear(chroma_manager)

    def api_clear_resource(self, resource):
        """List every object of the API resource named ``resource`` and
        DELETE each one, asserting that every request is successful."""
        listing = self.chroma_manager.get('/api/%s' % resource,
                                          params={'limit': 0})
        self.assertTrue(listing.successful, listing.text)

        for entry in listing.json['objects']:
            outcome = self.chroma_manager.delete(entry['resource_uri'])
            self.assertTrue(outcome.successful, outcome.text)

    def api_force_clear(self):
        """
        Clear the Chroma instance via the API by issuing one
        ForceRemoveHostJob command per host, waiting for them, and then
        clearing the 'power_control_type' resource.

        Note that this will *not* unconfigure storage servers or remove
        corosync resources: do that separately.
        """

        host_listing = self.chroma_manager.get('/api/host/',
                                               params={'limit': 0})
        self.assertTrue(host_listing.successful, host_listing.text)

        command_ids = []
        for host in host_listing.json['objects']:
            job = {
                'class_name': 'ForceRemoveHostJob',
                'args': {
                    'host_id': host['id']
                }
            }
            reply = self.chroma_manager.post("/api/command/",
                                             body={
                                                 'jobs': [job],
                                                 'message':
                                                 "Test force remove hosts"
                                             })
            command_ids.append(reply.json['id'])

        # Nothing was posted when there are no hosts, so nothing to wait for.
        if command_ids:
            self.wait_for_commands(self.chroma_manager, command_ids)

        self.api_clear_resource('power_control_type')

    def assertDatabaseClear(self, chroma_manager=None):
        """
        Check that the manager API lists no filesystems, MGTs, hosts or
        volumes. Uses self.chroma_manager when no manager is given.
        """

        if chroma_manager is None:
            chroma_manager = self.chroma_manager

        # One (url, keyword arguments) pair per collection that must be
        # empty, checked in this order.
        queries = (
            ('/api/filesystem/', {'params': {'limit': 0}}),
            ('/api/target/', {'params': {'kind': 'MGT'}}),
            ('/api/host/', {}),
            ('/api/volume/', {'params': {'limit': 0}}),
        )

        for url, request_kwargs in queries:
            listing = chroma_manager.get(url, **request_kwargs)
            self.assertTrue(listing.successful, listing.text)
            remaining = listing.json['objects']
            self.assertEqual(0, len(remaining))

    def reset_accounts(self, chroma_manager):
        """Remove any user accounts which are not in the config (such as
        those left hanging by tests).

        :param chroma_manager: manager client used to list and delete users.
        :raises AssertionError: if listing the users does not return 200.

        The configured usernames are always read from the first entry of
        config['chroma_managers'].
        """

        configured_usernames = set(
            u['username'] for u in config['chroma_managers'][0]['users'])

        # 'limit' is a query parameter, so it goes in params= like every
        # other list request in this class. The previous data= keyword does
        # not send it as a query parameter.
        # NOTE(review): assumes chroma_manager.get() forwards keyword
        # arguments to a Requests-style client -- confirm.
        response = chroma_manager.get('/api/user/', params={'limit': 0})
        self.assertEqual(response.status_code, 200)
        for user in response.json['objects']:
            if user['username'] not in configured_usernames:
                chroma_manager.delete(user['resource_uri'])

    @classmethod
    def linux_devices_exist(cls):
        """Return True if any configured lustre device has the 'linux'
        backend filesystem, False otherwise."""
        for lustre_device in config['lustre_devices']:
            if lustre_device['backend_filesystem'] == 'linux':
                return True
        return False

    @classmethod
    def zfs_devices_exist(cls):
        """Return True if any configured lustre device has the 'zfs'
        backend filesystem, False otherwise."""
        for lustre_device in config['lustre_devices']:
            if lustre_device['backend_filesystem'] == 'zfs':
                return True
        return False

    # Bit flags for the 'action' argument of cleanup_zfs_pools(); combine them with |.
    # Not all combinations are possible: removing zpools without removing datasets makes no sense, for example.
    # Destroy the datasets (listed names that contain a '/').
    CZP_REMOVEDATASETS = 0x1
    # Also destroy the zpools themselves (listed names without a '/'); only acted on together with
    # CZP_REMOVEDATASETS. It also allows the disk to be cleared when a pool cannot be imported.
    CZP_REMOVEZPOOLS = 0x2
    CZP_REMOVEDATASETSANDZPOOLS = (CZP_REMOVEDATASETS | CZP_REMOVEZPOOLS)
    # Run the prepare-device commands for every configured zfs device.
    CZP_CREATEZPOOLS = 0x4
    CZP_RECREATEZPOOLS = (CZP_REMOVEDATASETS | CZP_REMOVEZPOOLS
                          | CZP_CREATEZPOOLS)
    # Run the release commands for the pools that were imported during the call.
    CZP_EXPORTPOOLS = 0x8

    def cleanup_zfs_pools(self, test_servers, action, zpool_datasets,
                          devices_must_exist):
        """
        Make sure the zpools are imported onto test_servers[0], then apply the requested action: remove
        datasets and/or zpools, create zpools, and finally export the imported zpools again so that they
        are not left imported on the first server.

        Does nothing when running against the simulator or when no zfs devices are configured.

        Very ZFS specific code.
        :param test_servers: Servers that have access to the zpools.
        :param action: Bitwise combination of the CZP_* constants.
        :param zpool_datasets: List of datasets to remove, None means remove all datasets.
        :param devices_must_exist: If True, a failed import raises an error (unless CZP_REMOVEZPOOLS is set,
                                   in which case the disk is cleared instead).
        """
        if (self.simulator is not None) or (self.zfs_devices_exist() is False):
            return

        # If ZFS is not installed on the test servers, then presume there is no ZFS to clear from any.
        # Might need to improve on this moving forwards.
        try:
            self.execute_simultaneous_commands(
                TestBlockDeviceZfs.zfs_install_commands(),
                [server['fqdn'] for server in test_servers],
                'checking for zfs presence',
                expected_return_code=None)
        except AssertionError:
            return

        first_test_server = test_servers[0]
        # Devices successfully imported below; these are the ones exported again for CZP_EXPORTPOOLS.
        imported_zpools = []

        def dataset_match(datasets, device_path):
            # True if any of the dataset names starts with device_path.
            return next(
                (dataset
                 for dataset in datasets if dataset.startswith(device_path)),
                None) is not None

        # Import phase: only zfs devices are considered and, when zpool_datasets is given, only those
        # whose device path prefixes one of the requested datasets.
        for lustre_device in config['lustre_devices']:
            if ((lustre_device['backend_filesystem'] == 'zfs')
                    and ((zpool_datasets is None) or dataset_match(
                        zpool_datasets, first_test_server['device_paths'][
                            lustre_device['path_index']]))):

                zfs_device = TestBlockDevice(
                    'zfs', first_test_server['device_paths'][
                        lustre_device['path_index']])

                # Release the device on every server first (failures are ignored) ...
                self.execute_simultaneous_commands(
                    zfs_device.release_commands,
                    [server['fqdn'] for server in test_servers],
                    'export zfs device %s' % zfs_device,
                    expected_return_code=None)

                # ... then capture it on the first server only.
                try:
                    self.execute_commands(
                        zfs_device.capture_commands,
                        first_test_server['fqdn'],
                        'import zfs device %s' % zfs_device,
                        expected_return_code=0 if devices_must_exist else None)
                    imported_zpools.append(zfs_device)
                except AssertionError:
                    # We could not import so if we are going to CZP_REMOVEZPOOLS then we might as well now try and
                    # dd the disk to get rid of the thing, otherwise raise the error.
                    if action & self.CZP_REMOVEZPOOLS:
                        # The same path is treated as a plain 'linux' block device so that its destroy
                        # commands can be used.
                        ldiskfs_device = TestBlockDevice(
                            'linux', first_test_server['device_paths'][
                                lustre_device['path_index']])

                        self.execute_simultaneous_commands(
                            ldiskfs_device.destroy_commands,
                            [first_test_server['fqdn']],
                            'clearing disk because zfs import failed %s' %
                            ldiskfs_device,
                            expected_return_code=0)
                    else:
                        raise

        # No explicit list was given: use whatever the first server reports.
        if zpool_datasets is None:
            zpool_datasets = self.execute_commands(
                TestBlockDeviceZfs.list_devices_commands(),
                first_test_server['fqdn'], 'listing zfs devices')[0].split()

        # Drop empty names.
        zpools_datasets = [
            zpool_dataset for zpool_dataset in zpool_datasets
            if zpool_dataset != ''
        ]

        if action & self.CZP_REMOVEDATASETS:
            # Sorting longest first means
            # zpool/dataset1/dataset2
            #  is deleted before
            # zpool/dataset1
            zpools_datasets.sort(key=lambda value: -len(value))

            # Have to remove datasets before zpools: the first pass (True) destroys the names containing '/',
            # the second pass (False, only with CZP_REMOVEZPOOLS) destroys the names without one.
            for remove_dataset in [True, False
                                   ] if (action
                                         & self.CZP_REMOVEZPOOLS) else [True]:
                for zpool_dataset in zpools_datasets:
                    if remove_dataset == ('/' in zpool_dataset):
                        zfs_device = TestBlockDevice('zfs', zpool_dataset)

                        self.execute_commands(
                            zfs_device.destroy_commands,
                            first_test_server['fqdn'],
                            'destroy zfs device %s' % zfs_device)

        if action & self.CZP_CREATEZPOOLS:
            for lustre_device in config['lustre_devices']:
                if lustre_device['backend_filesystem'] == 'zfs':
                    # Note: creation uses 'zpool_device_paths', not the 'device_paths' used for import above.
                    zfs_device = TestBlockDevice(
                        'zfs', first_test_server['zpool_device_paths'][
                            lustre_device['path_index']])

                    self.execute_commands(zfs_device.prepare_device_commands,
                                          first_test_server['fqdn'],
                                          'create zfs device %s' % zfs_device)

        if action & self.CZP_EXPORTPOOLS:
            # Only the devices imported earlier in this call are released.
            for zfs_device in imported_zpools:
                self.execute_commands(zfs_device.release_commands,
                                      first_test_server['fqdn'],
                                      'export zfs device %s' % zfs_device)

    def cleanup_linux_devices(self, test_servers):
        """
        Run the destroy commands for every configured 'linux' block device
        on all of test_servers. The device path is taken from the first
        test server.

        Does nothing when running against the simulator or when no linux
        devices are configured.

        Very linux specific code.
        :param test_servers: servers on which the commands are executed.
        :return: None
        """
        if self.simulator is not None:
            return
        if self.linux_devices_exist() is False:
            return

        first_test_server = test_servers[0]

        for lustre_device in config['lustre_devices']:
            if lustre_device['backend_filesystem'] != 'linux':
                continue

            device_path = first_test_server['device_paths'][
                lustre_device['path_index']]
            linux_device = TestBlockDevice('linux', device_path)

            destroy_commands = linux_device.destroy_commands
            server_fqdns = [server['fqdn'] for server in test_servers]
            self.execute_simultaneous_commands(
                destroy_commands, server_fqdns,
                'clear block device %s' % linux_device)

    @property
    def quick_setup(self):
        """
        True when IML_QUICK_TEST_SETUP is defined in the environment,
        whatever its value.

        Defining it shortens test setup so that hands-on development and
        debugging are quicker. It is not intended to be safe for real tests
        and is not a step towards test optimization. It exists to reduce the
        development cycle time where a 10 second test can take 20 minutes
        for each run.

        Use with caution, only use when running 1 test - not a series of tests etc.

        :return: bool
        """
        flag = os.environ.get('IML_QUICK_TEST_SETUP')
        return flag is not None