Beispiel #1
0
def reregister_server(url, address):
    """ Update manager url and register agent address with manager """
    if _service_is_running() is True:
        console_log.warning(
            "chroma-agent service was running before registration, stopping.")
        agent_service.stop()

    config.set('settings', 'server', {'url': url})
    crypto = Crypto(config.path)
    agent_client = AgentClient(url + 'reregister/', ActionPluginManager(),
                               DevicePluginManager(), ServerProperties(),
                               crypto)
    data = {'address': address, 'fqdn': agent_client._fqdn}

    try:
        result = agent_client.post(data)
    except HttpError:
        console_log.error("Reregistration failed to %s with request %s" %
                          (agent_client.url, data))
        raise

    console_log.info("Starting chroma-agent service")
    agent_service.start()

    return result
Beispiel #2
0
def register_server(url, ca, secret, address=None):
    if _service_is_running() is True:
        console_log.warning(
            "chroma-agent service was running before registration, stopping.")
        agent_service.stop()

    crypto = Crypto(config.path)
    # Call delete in case we are over-writing a previous configuration that wasn't removed properly
    crypto.delete()
    crypto.install_authority(ca)

    agent_client = AgentClient(url + "register/%s/" % secret,
                               ActionPluginManager(), DevicePluginManager(),
                               ServerProperties(), crypto)

    registration_result = agent_client.register(address)
    crypto.install_certificate(registration_result['certificate'])

    config.set('settings', 'server', {'url': url})

    console_log.info("Enabling chroma-agent service")
    agent_service.enable()

    console_log.info("Starting chroma-agent service")
    agent_service.start()

    return registration_result
Beispiel #3
0
def reregister_server(url, address):
    """ Update manager url and register agent address with manager """
    if _service_is_running() is True:
        console_log.warning(
            "chroma-agent service was running before registration, stopping.")
        agent_service.stop()

    conf.set_server_url(url)
    crypto = Crypto(conf.ENV_PATH)
    agent_client = AgentClient(
        url + "reregister/",
        ActionPluginManager(),
        DevicePluginManager(),
        ServerProperties(),
        crypto,
    )
    data = {"address": address, "fqdn": agent_client._fqdn}

    try:
        result = agent_client.post(data)
    except HttpError:
        console_log.error("Reregistration failed to %s with request %s" %
                          (agent_client.url, data))
        raise

    console_log.info("Starting chroma-agent service")
    agent_service.start()

    return result
 def start_agent(self):
     with self._lock:
         self._agent_client = AgentClient(
             url=self._simulator.url + "message/",
             action_plugins=FakeActionPlugins(self, self._simulator),
             device_plugins=FakeDevicePlugins(self),
             server_properties=self,
             crypto=self.crypto)
         self._agent_client.start()
Beispiel #5
0
def main():
    """handle unexpected exceptions"""
    parser = argparse.ArgumentParser(
        description="Integrated Manager for Lustre software Agent")

    parser.add_argument("--publish-zconf", action="store_true")
    parser.parse_args()

    signal.signal(signal.SIGHUP, signal.SIG_IGN)

    daemon_log_setup()
    console_log_setup()
    daemon_log.info("Starting")

    try:
        daemon_log.info("Entering main loop")
        try:
            url = urljoin(os.environ["IML_MANAGER_URL"], "agent/message/")
        except KeyError as e:
            daemon_log.error(
                "No configuration found (must be registered before running the agent service), "
                "details: %s" % e)
            return

        if config.profile_managed is False:
            # This is kind of terrible. The design of DevicePluginManager is
            # such that it can be called with either class methods or
            # instantiated and then called with instance methods. As such,
            # we can't pass in a list of excluded plugins to the instance
            # constructor. Well, we could, but it would only work some
            # of the time and that would be even more awful.
            import chroma_agent.plugin_manager
            chroma_agent.plugin_manager.EXCLUDED_PLUGINS += ['corosync']

        agent_client = AgentClient(url, ActionPluginManager(),
                                   DevicePluginManager(), ServerProperties(),
                                   Crypto(ENV_PATH))

        def teardown_callback(*args, **kwargs):
            agent_client.stop()
            agent_client.join()
            [function() for function in agent_daemon_teardown_functions]

        signal.signal(signal.SIGINT, teardown_callback)
        signal.signal(signal.SIGTERM, teardown_callback)
        signal.signal(signal.SIGUSR1, decrease_loglevel)
        signal.signal(signal.SIGUSR2, increase_loglevel)

        # Call any agent daemon startup methods that were registered.
        [function() for function in agent_daemon_startup_functions]

        agent_client.start()
        # Waking-wait to pick up signals
        while not agent_client.stopped.is_set():
            agent_client.stopped.wait(timeout=10)

        agent_client.join()
    except Exception, e:
        backtrace = '\n'.join(traceback.format_exception(*(sys.exc_info())))
        daemon_log.error("Unhandled exception: %s" % backtrace)
    def register(self, fqdn, secret):
        try:
            log.debug("register %s" % fqdn)
            server = self.servers[fqdn]

            if server.agent_is_running:
                # e.g. if the server was added then force-removed then re-added
                server.shutdown_agent()

            if not server.is_worker and not self.power.server_has_power(fqdn):
                raise RuntimeError(
                    "Not registering %s, none of its PSUs are powered" % fqdn)

            client = AgentClient(url=self.url + "register/%s/" % secret,
                                 action_plugins=FakeActionPlugins(
                                     self, server),
                                 device_plugins=FakeDevicePlugins(server),
                                 server_properties=server,
                                 crypto=server.crypto)

            try:
                registration_result = client.register()
            except ConnectionError as e:
                log.error("Registration connection failed for %s: %s" %
                          (fqdn, e))
                return
            except HttpError as e:
                log.error("Registration request failed for %s: %s" % (fqdn, e))
                return
            server.crypto.install_certificate(
                registration_result['certificate'])

            # Immediately start the agent after registration, to pick up the
            # setup actions that will be waiting for us on the manager.
            self.start_server(fqdn)
            return registration_result
        except Exception:
            log.error(traceback.format_exc())
                "No configuration found (must be registered before running the agent service), "
                "details: %s" % e)
            return

        if config.profile_managed is False:
            # This is kind of terrible. The design of DevicePluginManager is
            # such that it can be called with either class methods or
            # instantiated and then called with instance methods. As such,
            # we can't pass in a list of excluded plugins to the instance
            # constructor. Well, we could, but it would only work some
            # of the time and that would be even more awful.
            import chroma_agent.plugin_manager
            chroma_agent.plugin_manager.EXCLUDED_PLUGINS += ['corosync']

        agent_client = AgentClient(conf['url'] + "message/",
                                   ActionPluginManager(),
                                   DevicePluginManager(), ServerProperties(),
                                   Crypto(config.path))

        def teardown_callback(*args, **kwargs):
            agent_client.stop()
            agent_client.join()
            [function() for function in agent_daemon_teardown_functions]

        if not args.foreground:
            handlers = {
                signal.SIGTERM: teardown_callback,
                signal.SIGUSR1: decrease_loglevel,
                signal.SIGUSR2: increase_loglevel
            }
            set_signal_handlers(handlers)
        else:
class FakeServer(utils.Persisted):
    """Represent a single storage server.  Initialized with an arbitrary
    hostname and NIDs, subsequently manages/modifies its own /proc/ etc.
    """
    default_state = {
        'worker': False,
        'shutting_down': False,
        'starting_up': False,
        'lnet_loaded': False,
        'lnet_up': False,
        'stats': None,
        'packages': {},
        'client_mounts': {},
        'corosync': CorosyncState('stopped', 3000),
        'pacemaker': PacemakerState('stopped')
    }

    def __init__(self,
                 simulator,
                 fake_cluster,
                 server_id,
                 fqdn,
                 nodename,
                 nid_tuples_or_network_interfaces,
                 worker=False,
                 client_mounts=None):
        self.fqdn = fqdn

        super(FakeServer, self).__init__(simulator.folder)

        self.nodename = nodename

        # When reloaded from a file the network interfaces are presented as a dict but when created from code they
        # are represented as nid_tuples.
        if type(nid_tuples_or_network_interfaces) is dict:
            self.network_interfaces = nid_tuples_or_network_interfaces
        else:
            self.network_interfaces = {}
            interface_no = 0
            for nid in nid_tuples_or_network_interfaces:
                self.network_interfaces[nid[0]] = {
                    'interface_no': interface_no,
                    'type': nid[1],
                    'lnd_type': nid[1],
                    'lnd_network': nid[2]
                }
                interface_no += 1

        self.boot_time = IMLDateTime.utcnow()
        self.id = server_id

        self.log_rate = 0
        self._log_messages = []
        self._simulator = simulator
        self._devices = simulator.devices
        self._cluster = fake_cluster

        self._lock = threading.Lock()
        self._agent_client = None

        self._cluster.join(nodename, fqdn=fqdn)

        if simulator.folder:
            # If simulator is persistent, save Crypto in same place
            crypto_folder = os.path.join(simulator.folder,
                                         "%s_crypto" % self.fqdn)
        else:
            # If simulator is ephemeral, create temp dir for Crypto
            crypto_folder = tempfile.mkdtemp()

        if not os.path.exists(crypto_folder):
            os.makedirs(crypto_folder)
        self.crypto = Crypto(crypto_folder)

        if not client_mounts:
            client_mounts = {}

        self.state['id'] = server_id
        self.state['network_interfaces'] = self.network_interfaces
        self.state['nodename'] = nodename
        self.state['fqdn'] = fqdn
        self.state['worker'] = worker
        self.state['client_mounts'] = client_mounts
        self.state['packages'][
            'chroma-agent'] = self._simulator.available_packages(
                self.node_type)[
                    'chroma-agent']  # The agent is installed at this point.
        self.save()

    @property
    def is_worker(self):
        with self._lock:
            return self.state['worker']

    @property
    def node_type(self):
        return 'worker' if self.is_worker else 'server'

    @property
    def lustre_client_mounts(self):
        mounts = []
        with self._lock:
            for mountspec, mountpoint in self.state['client_mounts'].items():
                mounts.append({
                    'mountspec': mountspec,
                    'mountpoint': mountpoint
                })
            return mounts

    def add_client_mount(self, mountspec, mountpoint):
        fsname = mountspec.split(':/')[1]
        mountpoint = "%s/%s" % (mountpoint, fsname)
        self.start_lnet()
        with self._lock:
            self.state['client_mounts'][mountspec] = mountpoint
            self.save()
        log.debug("Mounted %s on %s" % (mountspec, mountpoint))

    def del_client_mount(self, mountspec):
        with self._lock:
            try:
                del self.state['client_mounts'][mountspec]
                self.save()
            except KeyError:
                pass
        log.debug("Unmounted %s" % mountspec)

    @property
    def nids(self):
        nids = []

        for inet4_address in self.network_interfaces.keys():
            interface = self.network_interfaces[inet4_address]

            if interface['lnd_network'] is not None:
                nids.append("%s@%s%s" % (inet4_address, interface['type'],
                                         interface['lnd_network']))

        return nids

    def update_profile(self, profile):
        with self._lock:
            self.state['profile_name'] = profile['name']
            self.save()

        return agent_result_ok

    def restart_agent(self):
        def _restart():
            # FIXME: don't actually want to simulate_shutdown but otherwise it tries to join from the current thread
            self.shutdown(simulate_shutdown=True, reboot=True)

        raise CallbackAfterResponse(None, _restart)

    def scan_packages(self):
        packages = {}
        for package, available_version in self._simulator.available_packages(
                self.node_type).items():
            try:
                installed = [self.get_package_version(package)]
            except KeyError:
                installed = []

            packages[package] = {
                'available': [available_version],
                'installed': installed
            }

        return {'external': packages}

    def install_packages(self, repos, packages):
        for package in packages:
            try:
                self.state['packages'][
                    package] = self._simulator.available_packages(
                        self.node_type)[package]
            except KeyError:
                raise RuntimeError(
                    "Package '%s' not found (available: %s)!" %
                    (package, self._simulator.available_packages(
                        self.node_type)))

        self.save()

        return agent_result(self.scan_packages())

    def get_package_version(self, package):
        return self.state['packages'][package]

    def inject_log_message(self, message):
        log.debug("Injecting log message %s/%s" % (self.fqdn, message))
        self._log_messages.append({
            'source': 'cluster_sim',
            'severity': 1,
            'facility': 1,
            'message': message,
            'datetime': IMLDateTime.utcnow().isoformat()
        })

    def pop_log_messages(self):
        messages = self._log_messages

        d = datetime.datetime.now()
        messages.extend([{
            'source': 'cluster_sim',
            'severity': 1,
            'facility': 1,
            'message': "%s %s %s" % (self.fqdn, d, a),
            'datetime': IMLDateTime.utcnow().isoformat()
        } for a in range(0, self.log_rate)])

        self._log_messages = []
        return messages

    def _proc_path_to_target(self, path):
        """Look up a target dict from a /proc/ path"""

        parts = path.split("/")[1:]
        if parts[3] == 'lov':
            # /proc/fs/lustre/lov/testfs-MDT0000-mdtlov/stripesize
            mdt_name = parts[4].rsplit("-", 1)[0]
            for target in self._targets_started_here:
                if target['label'] == mdt_name:
                    return target

            raise RuntimeError(
                "%s: Cannot get proc information for %s, it's not started here (targets here are: %s)"
                % (self.fqdn, mdt_name,
                   [t['label'] for t in self._targets_started_here]))

        elif parts[3] == 'obdfilter':
            # /proc/fs/lustre/obdfilter/testfs-OST0000/writethrough_cache_enable
            ost_name = parts[4]
            for target in self._targets_started_here:
                if target['label'] == ost_name:
                    return target

            raise RuntimeError(
                "%s: Cannot get proc information for %s, it's not started here (targets here are: %s)"
                % (self.fqdn, ost_name,
                   [t['label'] for t in self._targets_started_here]))

        elif parts[3] == 'mdt':
            # /proc/fs/lustre/mdt/testfs-MDT0000/hsm_control
            mdt_name = parts[4]
            for target in self._targets_started_here:
                if target['label'] == mdt_name:
                    return target

        raise NotImplementedError("Simulator cannot resolve '%s' to target" %
                                  path)

    def _proc_path_to_conf_param(self, configs, path):
        """Resolve a /proc/foo/bar to a foo-MDT0000.bar.baz and look it up
        in the `configs` dict, or substitute a default if it's not set"""

        LOV_DEFAULTS = {'stripesize': "1"}

        OBDFILTER_DEFAULTS = {
            'writethrough_cache_enable': "16",
            'sync_journal': "0"
        }

        MDT_DEFAULTS = {'hsm_control': 'disabled'}

        parts = path.split("/")[1:]
        if parts[3] == 'lov':
            # /proc/fs/lustre/lov/testfs-MDT0000-mdtlov/stripesize
            mdt_name = parts[4].rsplit("-", 1)[0]
            conf_param = "%s.lov.%s" % (mdt_name, parts[5])
            try:
                return configs[conf_param]
            except KeyError:
                return LOV_DEFAULTS[parts[5]]
        elif parts[3] == 'obdfilter':
            # /proc/fs/lustre/obdfilter/testfs-OST0000/writethrough_cache_enable
            conf_param = "%s.ost.%s" % (parts[4], parts[5])
            try:
                return configs[conf_param]
            except KeyError:
                return OBDFILTER_DEFAULTS[parts[5]]
        elif parts[3] == 'mdt':
            # /proc/fs/lustre/mdt/testfs-MDT0000/hsm_control
            mdt_name = parts[4]
            conf_param = "%s.mdt.%s" % (mdt_name, parts[5])
            try:
                print configs
                return configs[conf_param]
            except KeyError:
                return MDT_DEFAULTS[parts[5]]

        raise NotImplementedError(
            "Simulator cannot resolve '%s' to conf param name" % path)

    def detect_scan(self, target_devices):
        local_targets = []
        mgs_target = None
        for serial, target in self._devices.state['targets'].items():
            log.info("targets: %s: %s %s" %
                     (serial, target['label'], target['uuid']))
        for ha_label, resource in self._cluster.state['resources'].items():
            log.info("cluster: %s %s %s" %
                     (ha_label, resource['uuid'], resource['device_path']))

        for target_device in target_devices:
            path = target_device['path']

            try:
                target = self._devices.get_target_by_path(self.fqdn, path)
            except KeyError:
                # Key error means this is not a target
                continue

            try:
                ha_resource = self._cluster.get_by_uuid(target['uuid'])
            except KeyError:
                log.warning("No HA resource for target %s/%s" %
                            (target['label'], target['uuid']))
                continue
            location = self._cluster.resource_locations()[
                ha_resource['ha_label']]
            mounted = location == self.nodename
            local_targets.append({
                "name": target['label'],
                "uuid": target['uuid'],
                "params": {},
                "device_paths": [path],
                "mounted": mounted
            })

            if target['label'] == 'MGS':
                mgs_target = target

        mgs_targets = defaultdict(lambda: [])
        if mgs_target is not None:
            for target_label in self._devices.mgt_get_target_labels(
                    mgs_target['mgsnode']):
                target = self._devices.get_target_by_label(target_label)

                mgs_targets[target['fsname']].append({
                    'uuid':
                    target['uuid'],
                    'name':
                    target['label'],
                    'nid':
                    target['primary_nid']
                })

        result = {
            "local_targets": local_targets,
            "mgs_targets": mgs_targets,
            "mgs_conf_params": {}
        }
        log.debug("detect_scan: %s" % json.dumps(result, indent=2))

        return result

    def read_proc(self, path):
        target = self._proc_path_to_target(path)
        configs = self._devices.mgt_get_conf_params(target['mgsnode'])
        return self._proc_path_to_conf_param(configs, path)

    def set_conf_param(self, key, value):
        mgt = None
        for target in self._targets_started_here:
            if target['label'] == 'MGS':
                mgt = target
                break
        if mgt is None:
            raise RuntimeError(
                "Tried to run set_conf_param but not target named MGS is started on %s"
                % self.fqdn)

        self._devices.mgt_set_conf_param(mgt['mgsnode'], key, value)

        if 'hsm_control' in key:
            # e.g. testfs-MDT0000.mdt.hsm_control
            target = key.split('.')[0]
            fsname = target[:target.rfind('-')]
            self._simulator.control_hsm_coordinator(fsname, value)

    @property
    def filename(self):
        return "fake_server_%s.json" % self.fqdn

    @property
    def has_power(self):
        return self._simulator.power.server_has_power(self.fqdn)

    @property
    def registered(self):
        try:
            return self.crypto.certificate_file is not None
        except AttributeError:
            return False

    @property
    def agent_is_running(self):
        with self._lock:
            if self._agent_client:
                return not self._agent_client.stopped.is_set()
            else:
                return False

    @property
    def starting_up(self):
        with self._lock:
            return self.state['starting_up']

    @property
    def shutting_down(self):
        with self._lock:
            return self.state['shutting_down']

    @property
    def running(self):
        return self.starting_up or self.shutting_down or self.agent_is_running

    def join(self):
        with self._lock:
            if self._agent_client:
                self._agent_client.join()
                self._agent_client.reader.join()

    def stop(self):
        if self._agent_client:
            self._agent_client.stop()

    def shutdown_agent(self):
        if not self._agent_client:
            # Already stopped?
            return

        with self._lock:
            self._agent_client.stop()
            self._agent_client.join()
            # Have to join the reader explicitly, normal AgentClient code
            # skips it because it's slow to join (waits for long-polling GET
            # to complete)
            self._agent_client.reader.join()

            self._agent_client = None

    def start_agent(self):
        with self._lock:
            self._agent_client = AgentClient(
                url=self._simulator.url + "message/",
                action_plugins=FakeActionPlugins(self, self._simulator),
                device_plugins=FakeDevicePlugins(self),
                server_properties=self,
                crypto=self.crypto)
            self._agent_client.start()

    def _enter_shutdown(self):
        with self._lock:
            self.state['shutting_down'] = True
            self.save()

    def _exit_shutdown(self):
        with self._lock:
            self.state['shutting_down'] = False
            self.save()

    def shutdown(self, simulate_shutdown=False, reboot=False):
        """
        Shutdown the simulated server.
        :param simulate_shutdown: Simulate a server shutdown, delays and all.
        :param reboot: Restart the server after shutdown completes.
        """
        def blocking_shutdown():
            log.info("%s beginning shutdown" % self.fqdn)
            self._enter_shutdown()

            shutdown_start_time = IMLDateTime.utcnow()
            self.shutdown_agent()
            self._cluster.leave(self.nodename)
            shutdown_end_time = IMLDateTime.utcnow()

            shutdown_time = (shutdown_end_time - shutdown_start_time).seconds
            while shutdown_time < MIN_SHUTDOWN_DURATION:
                if not simulate_shutdown:
                    break

                remaining_delay = MIN_SHUTDOWN_DURATION - shutdown_time
                log.info("%s, sleeping for %d seconds on shutdown" %
                         (self.fqdn, remaining_delay))

                shutdown_time += remaining_delay
                time.sleep(remaining_delay)

            self._exit_shutdown()
            log.info("%s shutdown complete" % self.fqdn)

            if reboot:
                log.info("%s rebooting" % self.fqdn)
                self.startup(simulate_shutdown)

        class NonBlockingShutdown(threading.Thread):
            def run(self):
                try:
                    blocking_shutdown()
                except IOError:
                    # Sometimes the cluster state dir gets nuked from under
                    # the thread that's shutting down, which is OK. If there
                    # are truly IOErrors to worry about, they'll show up
                    # elsewhere.
                    pass

        if simulate_shutdown:
            NonBlockingShutdown().start()
        else:
            blocking_shutdown()

    def _enter_startup(self):
        with self._lock:
            self.state['starting_up'] = True
            self.save()

    def _exit_startup(self):
        with self._lock:
            self.state['starting_up'] = False
            self.save()

    def startup(self, simulate_bootup=False):
        """
        Start the simulated server.
        :param simulate_bootup: Simulate a server bootup, delays and all.
        """
        if not 'worker' in self.fqdn and not self.has_power:
            log.warning("%s can't start; no PSUs have power" % self.fqdn)
            return
        if not self.registered:
            log.warning("%s can't start; not registered" % self.fqdn)
            return
        if self.running and not self.shutting_down:
            log.warning("%s already running; not starting again" % self.fqdn)
            return

        def blocking_startup():
            log.info("%s beginning startup" % self.fqdn)

            startup_time = 0
            # Mixing shutdown/startup state here is a little weird, but
            # this only happens when it's explicitly requested by a caller
            # that wants full server simulation. The end result is that
            # simulating a reboot is easier if we deal with waiting for
            # the shutdown to finish here before moving on to startup.
            while simulate_bootup and self.shutting_down:
                log.info("%s, waiting for shutdown (%d)" %
                         (self.fqdn, startup_time))
                startup_time += 1
                time.sleep(1)

            # This may happen to the losing thread(s) during startup after
            # a multi-PDU power cycle.
            if self.starting_up:
                log.info(
                    "%s exiting startup because another thread is already doing it?"
                    % self.fqdn)
                return

            self._enter_startup()
            self.boot_time = IMLDateTime.utcnow()

            while startup_time < MIN_STARTUP_DURATION:
                if not simulate_bootup:
                    break

                remaining_delay = MIN_STARTUP_DURATION - startup_time
                log.info("%s, sleeping for %d seconds on boot" %
                         (self.fqdn, remaining_delay))

                startup_time += remaining_delay
                time.sleep(remaining_delay)

            self.start_agent()
            self._cluster.join(self.nodename)

            self._exit_startup()
            log.info("%s startup complete" % self.fqdn)

        class NonBlockingStartup(threading.Thread):
            def run(self):
                blocking_startup()

        if simulate_bootup:
            NonBlockingStartup().start()
        else:
            blocking_startup()

    def reboot(self, simulate_reboot=False):
        if self.running:
            self.shutdown(simulate_reboot)

        self.startup(simulate_reboot)

    def start_corosync(self):
        with self._lock:
            self.state['corosync'].state = 'started'
            self.save()
            return agent_result_ok

    def start_corosync2(self):
        return self.start_corosync()

    def stop_corosync(self):
        with self._lock:
            self.state['corosync'].state = 'stopped'
            self.save()
            return agent_result_ok

    def stop_corosync2(self):
        return self.stop_corosync()

    def configure_network(self,
                          ring0_name,
                          ring1_name=None,
                          ring0_ipaddr=None,
                          ring0_prefix=None,
                          ring1_ipaddr=None,
                          ring1_prefix=None):
        return agent_result_ok

    def configure_corosync(self,
                           ring0_name,
                           old_mcast_port,
                           new_mcast_port,
                           ring1_name=None,
                           ring0_ipaddr=None,
                           ring0_prefix=None,
                           ring1_ipaddr=None,
                           ring1_prefix=None):
        with self._lock:
            self.state['corosync'].mcast_port = new_mcast_port
            self.save()
            return agent_result_ok

    def configure_corosync2_stage_1(self, mcast_port, pcs_password):
        with self._lock:
            self.state['corosync'].mcast_port = mcast_port
            self.save()
            return agent_result_ok

    def configure_corosync2_stage_2(self, ring0_name, ring1_name,
                                    new_node_fqdn, create_cluster, mcast_port,
                                    pcs_password):
        return agent_result_ok

    def change_mcast_port(self, old_mcast_port, new_mcast_port):
        with self._lock:
            self.state['corosync'].mcast_port = new_mcast_port
            self.save()
            return agent_result_ok

    def get_corosync_autoconfig(self):
        with self._lock:
            port_names = {'tcp': 'eth', 'o2ib': 'ib'}
            inet4_addresses = []
            names = []

            for inet4_address in self.network_interfaces.keys():
                interface = self.network_interfaces[inet4_address]
                inet4_addresses.append(inet4_address)
                names.append(
                    '%s%s' %
                    (port_names[interface['type']], interface['interface_no']))

            return agent_result({
                'interfaces': {
                    names[0]: {
                        'dedicated': False,
                        'ipaddr': inet4_addresses[0],
                        'prefix': 24
                    },
                    names[1]: {
                        'dedicated': True,
                        'ipaddr': inet4_addresses[1],
                        'prefix': 24
                    }
                },
                'mcast_port': self.state['corosync'].mcast_port
            })

    def start_pacemaker(self):
        with self._lock:
            self.state['pacemaker'].state = 'started'
            self.save()
            return agent_result_ok

    def stop_pacemaker(self):
        with self._lock:
            self.state['pacemaker'].state = 'stopped'
            self.save()
            return agent_result_ok

    def start_lnet(self):
        with self._lock:
            self.state['lnet_loaded'] = True
            self.state['lnet_up'] = True
            self.save()
            return agent_result_ok

    def stop_lnet(self):
        with self._lock:
            self.state['lnet_loaded'] = True
            self.state['lnet_up'] = False
            self.save()
            return agent_result_ok

    def unload_lnet(self):
        with self._lock:
            self.state['lnet_loaded'] = False
            self.state['lnet_up'] = False
            self.save()
            return agent_result_ok

    def load_lnet(self):
        with self._lock:
            self.state['lnet_loaded'] = True
            self.state['lnet_up'] = False
            self.save()
            return agent_result_ok

    def configure_lnet(self, **kwargs):
        with self._lock:
            lnet_configuration = kwargs['lnet_configuration']

            # First off destroy all the current nids
            for inet4_address in self.network_interfaces:
                self.network_interfaces[inet4_address]['lnd_network'] = None

            for nid in lnet_configuration['network_interfaces']:
                inet4_address = nid[0]
                self.network_interfaces[inet4_address]['lnd_type'] = nid[1]
                self.network_interfaces[inet4_address]['lnd_network'] = nid[2]

            self.state['network_interfaces'] = self.network_interfaces
            self.save()
            return agent_result_ok

    def unconfigure_lnet(self):
        # No with lock because configure_lnet does the with lock.
        self.configure_lnet(lnet_configuration={
            'network_interfaces': [],
            'state': 'lnet_unloaded'
        })
        return agent_result_ok

    def writeconf_target(self,
                         writeconf=False,
                         erase_params=False,
                         device=None,
                         mgsnode=None,
                         failnode=None):
        if mgsnode is None:
            # When writeconfing an MGS, erase its simulated conf params
            target = self._devices.get_target_by_path(self.fqdn, device)
            self._devices.mgt_writeconf(target['mgsnode'])

    def check_block_device(self, device_type, path):
        serial = self._devices.get_by_path(self.fqdn, path)['serial_80']
        return self._devices.state['local_filesystems'].get(serial, None)

    def format_target(self,
                      device_type,
                      target_name,
                      device,
                      backfstype,
                      target_types=(),
                      mgsnode=(),
                      fsname=None,
                      failnode=(),
                      servicenode=(),
                      param={},
                      index=None,
                      comment=None,
                      mountfsoptions=None,
                      network=(),
                      device_size=None,
                      mkfsoptions=None,
                      reformat=False,
                      stripe_count_hint=None,
                      iam_dir=False,
                      dryrun=False,
                      verbose=False,
                      quiet=False):

        # Remove warnings, but allow parameters to match real parameters.
        target_name, servicenode, param, comment, mountfsoptions, network, device_size, reformat, stripe_count_hint, iam_dir, dryrun, verbose, quiet, device_type

        def mgs_key(mgsnode):
            # Create a consistent mgs_key by sorting the nids
            return ":".join(
                sorted([",".join(sorted(mgsnids)) for mgsnids in mgsnode]))

        if 'mgs' in target_types:
            label = "MGS"
            mgsnode = tuple(self.nids)
            if failnode:
                mgsnode = tuple([mgsnode, tuple(failnode)])
            else:
                mgsnode = tuple([mgsnode])
            self._devices.mgt_create(mgs_key(mgsnode))
        elif 'mdt' in target_types:
            label = "%s-MDT%.4x" % (fsname, index)
        else:
            label = "%s-OST%.4x" % (fsname, index)

        tgt_uuid = uuid.uuid4().__str__()

        target = {
            'label': label,
            'uuid': tgt_uuid,
            'mgsnode': mgs_key(mgsnode),
            'fsname': fsname,
            'backfstype': backfstype,
            'index': index,
            'primary_nid': None
        }

        return self._devices.format(self.fqdn, device, mkfsoptions, target)

    def register_target(self, device_path, mount_point, backfstype):

        # FIXME: arbitrary choice of which NID to use, correctly this should be
        # whichever NID LNet uses to route to the MGS, or what is set
        # with servicenode.
        nid = self.nids[0]

        label = self._devices.register(self.fqdn, device_path, nid)

        return {'label': label}

    def open_firewall(self, port, address, proto, description, persist):
        return agent_result_ok

    def close_firewall(self, port, address, proto, description, persist):
        return agent_result_ok

    def configure_target_ha(self, primary, device, ha_label, uuid,
                            mount_point):
        return agent_result(
            self._cluster.configure(self.nodename, device, ha_label, uuid,
                                    primary, mount_point))

    def unconfigure_target_ha(self, primary, ha_label, uuid):
        return agent_result(
            self._cluster.unconfigure(self.nodename, ha_label, primary))

    def purge_configuration(self, mgs_device_path, mgs_device_type,
                            filesystem_name):
        serial = self._devices.get_by_path(self.fqdn,
                                           mgs_device_path)['serial_80']
        mgsnode = self._devices.state['targets'][serial]['mgsnode']
        self._devices.mgt_purge_fs(mgsnode, filesystem_name)

    def start_target(self, ha_label):
        resource = self._cluster.start(ha_label)
        return agent_result(resource['started_on'])

    def stop_target(self, ha_label):
        self._cluster.stop(ha_label)

        return agent_result_ok

    def import_target(self,
                      device_type,
                      path,
                      pacemaker_ha_operation,
                      validate_importable=False):
        return agent_result_ok

    def export_target(self, device_type, path):
        return agent_result_ok

    @property
    def _targets_started_here(self):
        uuid_started_on = {}

        for ha_label, resource in self._cluster.state['resources'].items():
            uuid_started_on[resource['uuid']] = resource['started_on']

        for (target, path) in self._devices.get_targets_by_server(self.fqdn):
            try:
                started_on = uuid_started_on[target['uuid']]
            except KeyError:
                # lustre target not known to cluster, not yet ha-configured
                # log.debug("Lustre target %s %s not known to cluster %s" % (target['label'], target['uuid'],
                #                                                            json.dumps(self._cluster.state['resources'], indent=2)))
                pass
            else:
                if started_on == self.nodename:
                    yield target

    def get_lustre_stats(self):
        result = {
            'target': {},
            'lnet': {},
            'jobid_var': 'procname_uid',
        }

        for target in self._targets_started_here:
            result['target'][target['label']] = self._devices.get_target_stats(
                target)
            # One coordinator per filesystem; just stick it on the first MDT
            if 'MDT0000' in target['label'] and target[
                    'fsname'] in self._simulator.coordinators:
                hsm_stats = self._simulator.coordinators[
                    target['fsname']].hsm_stats
                result['target'][target['label']].update(hsm_stats)

        return result

    def get_node_stats(self):
        if not self.state['stats']:
            self.state['stats'] = {
                "hostname": self.fqdn,
                "meminfo": {
                    "SwapTotal": 1048568,
                    "SwapFree": 1048568,
                    "MemFree": 66196,
                    "HighTotal": 0,
                    "Committed_AS": 209916,
                    "NFS_Unstable": 0,
                    "VmallocChunk": 34359737507,
                    "Writeback": 0,
                    "MemTotal": 509856,
                    "VmallocUsed": 832,
                    "AnonPages": 32052,
                    "HugePages_Free": 0,
                    "Inactive": 195760,
                    "Active": 170680,
                    "CommitLimit": 1303496,
                    "Hugepagesize": 2048,
                    "Cached": 238156,
                    "SwapCached": 0,
                    "LowTotal": 509856,
                    "Dirty": 476,
                    "Mapped": 19916,
                    "HighFree": 0,
                    "VmallocTotal": 34359738367,
                    "Bounce": 0,
                    "HugePages_Rsvd": 0,
                    "PageTables": 5096,
                    "HugePages_Total": 0,
                    "Slab": 60816,
                    "Buffers": 96236,
                    "LowFree": 66196
                },
                "cpustats": {
                    "iowait": 58994,
                    "idle": 17632363,
                    "total": 17820617,
                    "user": 6160,
                    "system": 100260
                }
            }

        self.state['stats']['meminfo']['MemFree'] = utils.perturb(
            self.state['stats']['meminfo']['MemFree'], 5000, 0,
            self.state['stats']['meminfo']['MemTotal'])

        user_inc = random.randint(50, 100)
        idle_inc = random.randint(50, 100)
        self.state['stats']['cpustats']['user'] += user_inc
        self.state['stats']['cpustats']['idle'] += idle_inc
        self.state['stats']['cpustats']['total'] += (idle_inc + user_inc)
        # This is necessary to stop and start the simulator and avoid a big judder on the charts,
        # but in other circumstances it's a gratuitous amount of IO
        #self.save()

        return self.state['stats']