# Imports reconstructed for this excerpt: the stdlib ones are certain; the
# simulator fakes (Persisted, FakeDevices, FakePowerControl, FakeServer,
# FakeCluster, FakeController, FakeClient, FakeHsmCopytool,
# FakeHsmCoordinator, FakeActionPlugins, FakeDevicePlugins, CopytoolMonitor,
# log) and the agent client classes (AgentClient, CryptoClient, Session,
# ConnectionError, HttpError) are assumed to come from the surrounding
# cluster_sim / chroma agent packages.
import argparse
import glob
import json
import os
import threading
import time
import traceback


class ClusterSimulator(Persisted):
    """
    Create the global fakes and the per-server fakes, and publish
    start/stop/register operations for each simulated agent.
    """
    filename = 'simulator.json'
    default_state = {'packages': {'server': {}, 'worker': {}}}

    def __init__(self, folder, url):
        self.folder = folder
        super(ClusterSimulator, self).__init__(folder)

        self.url = url + "agent/"

        if folder and not os.path.exists(folder):
            os.makedirs(folder)

        self.lustre_clients = {}
        self.devices = FakeDevices(folder)
        self.power = FakePowerControl(folder, self.poweron_server,
                                      self.poweroff_server)
        self.servers = {}
        self.clusters = {}
        self.controllers = {}
        self.coordinators = {}
        self.copytools = {}
        self.copytool_monitors = {}

        self._load_controllers()
        self._load_servers()

    def update_packages(self, packages, node_type='server'):
        log.info("Updating packages: %s" % packages)
        for k, v in packages.items():
            self.state['packages'][node_type][k] = v
        self.save()

        # The agent only reports new versions at the start of a session.
        # In real life this works because installing updates on the manager
        # restarts the manager servers, causing new sessions.  In simulation
        # we don't control the manager, so instead we restart the
        # AgentClient instances.
        for fqdn in self.servers.keys():
            self.stop_server(fqdn)
            self.start_server(fqdn)

    def available_packages(self, node_type='server'):
        return self.state['packages'][node_type]

    def _get_cluster_for_server(self, server_id):
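        # Integer division maps consecutive server ids onto clusters: with
        # cluster_size=4, ids 0-3 land in cluster 0, ids 4-7 in cluster 1, etc.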
        cluster_id = server_id / self.state['cluster_size']
        try:
            return self.clusters[cluster_id]
        except KeyError:
            cluster = self.clusters[cluster_id] = FakeCluster(
                self.folder, cluster_id)
            return cluster

    def get_cluster(self, fqdn):
        return self._get_cluster_for_server(self.servers[fqdn].id)

    def _load_servers(self):
        for server_conf in glob.glob("%s/fake_server_*.json" % self.folder):
            with open(server_conf) as f:
                conf = json.load(f)
            self.servers[conf['fqdn']] = FakeServer(
                self, self._get_cluster_for_server(conf['id']), conf['id'],
                conf['fqdn'], conf['nodename'], conf['network_interfaces'],
                conf['worker'], conf['client_mounts'])

    def _load_controllers(self):
        for controller_conf in glob.glob("%s/fake_controller_*.json" %
                                         self.folder):
            with open(controller_conf) as f:
                conf = json.load(f)
            self.controllers[conf['controller_id']] = FakeController(
                self.folder, conf['controller_id'])

    def _create_server(self, i, nid_count):
        interface_names = ['tcp', 'o2ib']
        nid_tuples = []
        nodename = "test%.3d" % i
        fqdn = "%s.localdomain" % nodename
        x, y = (i / 256, i % 256)
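        # The server index is split across the last two address octets, and the
        # NID's network index selects the second octet, so each simulated
        # network gets its own 10.<network>.x.y range.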
        for network in range(0, nid_count):
            name = interface_names[network % len(interface_names)]
            address = '10.%d.%d.%d' % (network, x, y)
            nid_tuples.append((address, name, network))

        log.info("_create_server: %s" % fqdn)

        server = FakeServer(self, self._get_cluster_for_server(i), i, fqdn,
                            nodename, nid_tuples)
        self.servers[fqdn] = server

        self.power.add_server(fqdn)

        return server

    def _create_worker(self, i, nid_count):
        nid_tuples = []
        nodename = "worker%.3d" % i
        fqdn = "%s.localdomain" % nodename
        x, y = (i / 256, i % 256)
        for network in range(0, nid_count):
            nid_tuples.append(
                ('10.%d.%d.%d' % (network, x, y), 'tcp', network))

        log.info("_create_worker: %s" % fqdn)

        # Use -1 as the special cluster for workers
        worker = FakeServer(self,
                            self._get_cluster_for_server(-1),
                            i,
                            fqdn,
                            nodename,
                            nid_tuples,
                            worker=True)
        self.servers[fqdn] = worker

        return worker

    def configure_hsm_copytool(self, server, **kwargs):
        copytool = FakeHsmCopytool(self.folder, server.fqdn, **kwargs)
        self.copytools[copytool.id] = copytool

    def unconfigure_hsm_copytool(self, id):
        self.copytools[id].stop()
        self.copytools[id].delete()
        del self.copytools[id]

    def start_monitored_copytool(self, server, id):
        copytool = self.copytools[id]
        coordinator = self.coordinators[copytool.filesystem]

        client = CryptoClient(self.url + "copytool_event/", server.crypto,
                              server.fqdn)
        monitor = CopytoolMonitor(client, copytool)
        self.copytool_monitors[id] = monitor
        self.copytool_monitors[id].start()
        copytool.start(coordinator)

    def stop_monitored_copytool(self, id):
        try:
            self.copytools[id].stop()
        except KeyError:
            log.error("Attempt to stop unknown copytool: %s" % id)

        try:
            self.copytool_monitors[id].stop()
        except KeyError:
            log.error("Attempt to stop unknown copytool monitor: %s" % id)

    def start_hsm_copytools(self):
        for conf in glob.glob("%s/fake_hsm_copytool-*.json" % self.folder):
            with open(conf) as f:
                data = json.load(f)
                server = self.servers[data['server']]
                self.configure_hsm_copytool(server, **data['copytool'])
                self.start_monitored_copytool(server, data['copytool']['id'])

    def start_hsm_coordinators(self):
        for conf in glob.glob("%s/fake_hsm_coordinator-*.json" % self.folder):
            with open(conf) as f:
                fsname = json.load(f)['filesystem']
            coordinator = FakeHsmCoordinator(self, fsname)
            self.coordinators[fsname] = coordinator
            coordinator.start()

    def control_hsm_coordinator(self, fsname, control_value):
        if fsname not in self.coordinators:
            coordinator = FakeHsmCoordinator(self, fsname)
            self.coordinators[fsname] = coordinator

        self.coordinators[fsname].control(control_value)

    def setup(self, server_count, worker_count, volume_count, nid_count,
              cluster_size, pdu_count, su_size):
        """

        :param server_count: How many servers in total should exist after call
        :param worker_count: How many worker nodes in total should exist after call
        :param volume_count: How many volumes in total should exist after call
        :param nid_count: How many NIDs each server should have
        :param cluster_size: How many servers per corosync cluster
        :param pdu_count: How many PDUs in total
        :param su_size: How many servers per SU, or zero for no controllers + SAN-style volumes
        :return:
        """
        self.state['cluster_size'] = cluster_size
        self.save()

        # Packages which the FakeServers will report as available
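        # (each tuple appears to be an RPM-style (epoch, version, release, arch))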
        self.state['packages'] = {
            'server': {
                'lustre': (0, '2.1.4', '1', 'x86_64'),
                'lustre-modules': (0, '2.1.4', '1', 'x86_64'),
                'lustre-osd-ldiskfs': (0, '2.1.4', '1', 'x86_64'),
                'lustre-osd-zfs': (0, '2.1.4', '1', 'x86_64'),
                'kernel-devel-3.10.0-514.10.2.el7_lustre':
                (0, '2.6.32', '1', 'x86_64'),
                'zfs': (0, '0.6.5.3', '1', 'x86_64')
            },
            'worker': {
                'lustre-client': (0, '2.5.0', '1', 'x86_64'),
                'lustre-client-modules': (0, '2.5.0', '1', 'x86_64')
            }
        }
        for packages in self.state['packages'].values():
            packages['chroma-agent'] = (0, '3.0.1', '1', 'x86_64')
            packages['chroma-agent-management'] = (0, '3.0.1', '1', 'x86_64')
        self.save()

        self.power.setup(pdu_count)

        if su_size:
            # Series of SUs, blocks of one controller with several servers
            if server_count % su_size != 0:
                raise RuntimeError("server_count not a multiple of su_size")
            su_count = server_count / su_size
            if volume_count % su_count != 0:
                raise RuntimeError("volume_count not a multiple of su_count")

            for i in range(0, su_count):
                self.add_su(server_count / su_count, volume_count / su_count,
                            nid_count)
        else:
            # SAN-style LUNs visible to all servers
            for i in range(0, server_count):
                self._create_server(i, nid_count)

            self.devices.add_presented_luns(volume_count, self.servers.keys())

        for i in range(0, worker_count):
            self._create_worker(i, nid_count)

    def clear_clusters(self):
        for cluster in self.clusters.values():
            cluster.clear_resources()

        for server in self.servers.values():
            # FIXME: we should completely clear state but apparently we're reliant on some of the keys between tests!
            # server.reset_state()
            server.stop_corosync()
            server.stop_pacemaker()
            server.save()

    def remove_server(self, fqdn):
        log.info("remove_server %s" % fqdn)

        self.stop_server(fqdn, shutdown=True)
        server = self.servers[fqdn]
        assert not server.agent_is_running

        self.devices.remove_presentations(fqdn)

        self.power.remove_server(fqdn)

        server.crypto.delete()
        server.delete()
        del self.servers[fqdn]

    def remove_all_servers(self):
        # Ask them all to stop
        for fqdn, server in self.servers.items():
            if server.agent_is_running:
                server.shutdown_agent()

        # Wait for them to stop and complete removal
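        # (Python 2 dict.keys() returns a list copy, so deleting servers
        # inside the loop is safe here.)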
        for fqdn in self.servers.keys():
            self.remove_server(fqdn)

    def add_server(self, nid_count):
        i = len(self.servers)
        server = self._create_server(i, nid_count)
        return server.fqdn

    def add_su(self, server_count, volume_count, nid_count):
        """
        In this context SU stands for 'scalable unit', a notional unit of storage hardware
        consisting of some servers and a shared storage controller.

        :param server_count: How many servers in the SU
        :param volume_count: How many volumes in the SU (visible to all servers in the SU)
        :param nid_count: How many LNET NIDs should each server have
        """
        try:
            fqdns = [
                self.add_server(nid_count) for _ in range(0, server_count)
            ]
            serials = self.devices.add_presented_luns(volume_count, fqdns)

            if self.controllers:
                controller_id = max(self.controllers.keys()) + 1
            else:
                controller_id = 1
            controller = FakeController(self.folder, controller_id)
            for serial in serials:
                device = self.devices.get_device(serial)
                controller.add_lun(device['serial_80'], device['size'])
            self.controllers[controller_id] = controller

            return {
                'fqdns': fqdns,
                'serials': serials,
                'controller_id': controller_id
            }
        except Exception:
            print traceback.format_exc()
            raise

    def register_all(self, server_secret, worker_secret):
        register_count = 0
        for fqdn, server in self.servers.items():
            if server.crypto.certificate_file is None:
                if server.is_worker:
                    self.register(fqdn, worker_secret)
                else:
                    self.register(fqdn, server_secret)
                register_count += 1
            else:
                self.start_server(fqdn)

        self.post_server_start()

        # Useful for some callers to know if servers were registered or
        # just started.
        return register_count

    def register(self, fqdn, secret):
        try:
            log.debug("register %s" % fqdn)
            server = self.servers[fqdn]

            if server.agent_is_running:
                # e.g. if the server was added then force-removed then re-added
                server.shutdown_agent()

            if not server.is_worker and not self.power.server_has_power(fqdn):
                raise RuntimeError(
                    "Not registering %s, none of its PSUs are powered" % fqdn)

            client = AgentClient(url=self.url + "register/%s/" % secret,
                                 action_plugins=FakeActionPlugins(
                                     self, server),
                                 device_plugins=FakeDevicePlugins(server),
                                 server_properties=server,
                                 crypto=server.crypto)

            try:
                registration_result = client.register()
            except ConnectionError as e:
                log.error("Registration connection failed for %s: %s" %
                          (fqdn, e))
                return
            except HttpError as e:
                log.error("Registration request failed for %s: %s" % (fqdn, e))
                return
            server.crypto.install_certificate(
                registration_result['certificate'])

            # Immediately start the agent after registration, to pick up the
            # setup actions that will be waiting for us on the manager.
            self.start_server(fqdn)
            return registration_result
        except Exception:
            log.error(traceback.format_exc())

    def register_many(self, fqdns, secret):
        simulator = self

        class RegistrationThread(threading.Thread):
            def __init__(self, fqdn, secret):
                super(RegistrationThread, self).__init__()
                self.fqdn = fqdn
                self.secret = secret

            def run(self):
                self.result = simulator.register(self.fqdn, self.secret)

        threads = []
        log.debug("register_many: spawning threads")
        for fqdn in fqdns:
            thread = RegistrationThread(fqdn, secret)
            thread.start()
            threads.append(thread)

        for i, thread in enumerate(threads):
            thread.join()
            log.debug("register_many: joined %s/%s" % (i + 1, len(threads)))

        return [t.result for t in threads]

    def poweroff_server(self, fqdn):
        self.stop_server(fqdn, shutdown=True, simulate_shutdown=True)

    def poweron_server(self, fqdn):
        self.start_server(fqdn, simulate_bootup=True)

    def reboot_server(self, fqdn):
        log.debug("reboot %s" % fqdn)
        server = self.servers[fqdn]
        if not server.running:
            server.startup()
        else:
            server.shutdown(reboot=True)

    def stop_server(self, fqdn, shutdown=False, simulate_shutdown=False):
        """
        :param shutdown: Whether to treat this like a server shutdown (leave the
         HA cluster) rather than just an agent shutdown.
        :param simulate_shutdown: Whether to simulate a shutdown, delays and all
        """
        log.debug("stop %s" % fqdn)
        server = self.servers[fqdn]
        if not server.running:
            log.debug("not running")
            return

        if shutdown:
            server.shutdown(simulate_shutdown)
        else:
            server.shutdown_agent()

    def start_server(self, fqdn, simulate_bootup=False):
        """
        :param simulate_bootup: Whether to simulate a bootup, delays and all
        """
        log.debug("start %s" % fqdn)
        server = self.servers[fqdn]
        if server.running and not simulate_bootup:
            raise RuntimeError("Can't start %s, it is already running" % fqdn)
        server.startup(simulate_bootup)

    def get_lustre_client(self, client_address):
        try:
            client = self.lustre_clients[client_address]
        except KeyError:
            client = self.lustre_clients[client_address] = FakeClient(
                self.folder, client_address, self.devices, self.clusters)
        return client

    def unmount_lustre_clients(self):
        for client in self.lustre_clients.values():
            client.unmount_all()

    def stop(self):
        log.info("Stopping")
        self.power.stop()
        for group in [
                'coordinators', 'copytools', 'copytool_monitors', 'servers'
        ]:
            for thread in getattr(self, group).values():
                thread.stop()

    def join(self):
        log.info("Joining...")
        self.power.join()
        for group in [
                'coordinators', 'copytools', 'copytool_monitors', 'servers'
        ]:
            for thread in getattr(self, group).values():
                thread.join()
        log.info("Joined")

    def pre_server_start(self):
        self.power.start()

    def post_server_start(self):
        if not self.servers:
            log.info("No servers started; skipping post_server_start()")
            return

        self.start_hsm_coordinators()
        self.start_hsm_copytools()

    def start_all(self):
        self.pre_server_start()

        # Spread out the starts to avoid every server sending its update
        # at the same moment.

        if len(self.servers):
            delay = Session.POLL_PERIOD / float(len(self.servers))
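            # e.g. with POLL_PERIOD == 10 and 5 servers, one start every 2s.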

            log.debug("Start all (%.2f dispersion)" % delay)

            for i, fqdn in enumerate(self.servers.keys()):
                self.start_server(fqdn)
                if i != len(self.servers) - 1:
                    time.sleep(delay)
        else:
            log.info("start_all: No servers yet")

        self.post_server_start()

    def set_log_rate(self, fqdn, rate):
        log.info("Set log rate for %s to %s" % (fqdn, rate))
        self.servers[fqdn].log_rate = rate

    def poll_fake_controller(self, controller_id):
        """
        For use by the simulator_controller storage plugin: query a particular fake controller
        """
        try:
            controller = self.controllers[int(controller_id)]
        except KeyError:
            log.error("Controller '%s' not found in %s" %
                      (controller_id, self.controllers.keys()))
            raise
        else:
            return controller.poll()

    def format_block_device(self, fqdn, path, filesystem_type):
        self.devices.format_local(fqdn, path, filesystem_type)
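

# --- Usage sketch (not part of the original example) ---
# A minimal, hypothetical driver for the class above; the manager URL,
# state directory and registration secret are placeholders that a real
# deployment would supply.
def _example_simulator_run():
    sim = ClusterSimulator("cluster_sim", "http://manager.example/")
    # 4 servers, no workers, 8 SAN-style volumes (su_size=0), one NID
    # each, 2 servers per corosync cluster, 2 PDUs.
    sim.setup(4, 0, 8, 1, 2, 2, 0)
    sim.pre_server_start()
    # register_all() starts each agent and then runs post_server_start().
    sim.register_all("SECRET", "SECRET")
    sim.stop()
    sim.join()
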
class PowerControlCli(object):
    def _setup(self):
        # This should all be reasonably thread-safe, since it's just reading
        # the JSON from disk, but talks to the server to make any changes.
        self.power = FakePowerControl(self.args.config, None, None)

    def control_server(self, args):
        self.args = args
        self._setup()

        pdu_clients = []
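        # Each PDU simulator class is expected to have a matching
        # "<Class>Client" in cluster_sim.fake_power_control; build one client
        # per PDU, pointed at that PDU's address and port.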
        for pdu in self.power.pdu_sims.values():
            klassname = "%sClient" % pdu.__class__.__name__
            pdu_clients.append(
                getattr(
                    __import__("cluster_sim.fake_power_control",
                               fromlist=[klassname]), klassname)(pdu.address,
                                                                 pdu.port))

        if args.fqdn.lower() == "all":
            for outlet in self.power.server_outlet_list:
                for client in pdu_clients:
                    client.perform_outlet_action(outlet, args.action)
            return

        outlet = self.power.server_outlet_number(args.fqdn)

        for client in pdu_clients:
            client.perform_outlet_action(outlet, args.action)

    def control_pdu(self, args):
        self.args = args
        self._setup()

        pdu = self.power.pdu_sims[args.name]
        klassname = "%sClient" % pdu.__class__.__name__
        client = getattr(
            __import__("cluster_sim.fake_power_control", fromlist=[klassname]),
            klassname)(pdu.address, pdu.port)
        client.perform_outlet_action(args.outlet, args.action)

    def print_server_status(self, args):
        self.args = args
        self._setup()

        for outlet in self.power.server_outlet_list:
            print self.power.outlet_server_name(outlet)
            for pdu in self.power.pdu_sims.values():
                print "  %s (%s): %s" % (pdu.name, outlet,
                                         pdu.outlet_state(outlet))

    def print_pdu_status(self, args):
        self.args = args
        self._setup()

        for pdu in sorted(self.power.pdu_sims.values(), key=lambda x: x.name):
            print "%s:" % pdu.name
            for outlet, state in sorted(pdu.all_outlet_states.items(),
                                        key=lambda x: x[0]):
                print "  %s (%s): %s" % \
                        (self.power.outlet_server_name(outlet), outlet, state)

    def main(self):
        parser = argparse.ArgumentParser(description="Cluster Power Control")
        parser.add_argument('--config',
                            required=False,
                            help="Simulator configuration/state directory",
                            default="cluster_sim")
        subparsers = parser.add_subparsers()

        pdu_parser = subparsers.add_parser('pdu')
        pdu_parser.add_argument('name', help="PDU name")
        pdu_parser.add_argument('outlet', help="PDU outlet number")
        pdu_parser.add_argument('action',
                                help="Action to be performed (off|on|reboot)")
        pdu_parser.set_defaults(func=self.control_pdu)

        server_parser = subparsers.add_parser('server')
        server_parser.add_argument('fqdn', help="Server FQDN")
        server_parser.add_argument(
            'action', help="Action to be performed (off|on|reboot)")
        server_parser.set_defaults(func=self.control_server)

        status_parser = subparsers.add_parser('status')
        sub_subparsers = status_parser.add_subparsers()
        server_status = sub_subparsers.add_parser('servers')
        server_status.set_defaults(func=self.print_server_status)
        pdu_status = sub_subparsers.add_parser('pdus')
        pdu_status.set_defaults(func=self.print_pdu_status)

        args = parser.parse_args()
        args.func(args)
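
# --- Usage sketch (not part of the original example) ---
# Presumably this CLI is exposed via a console-script entry point; run
# directly, the subcommand grammar defined in main() looks like:
#
#   power_control --config cluster_sim pdu <name> <outlet> off
#   power_control server test000.localdomain reboot
#   power_control status servers
#
if __name__ == '__main__':
    PowerControlCli().main()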