Exemple #1
0
    def run(self):
        """Serve the simulator over XML-RPC on localhost.

        Creates the XML-RPC server bound to SIMULATOR_PORT, registers the
        simulator instance with it, and then blocks in serve_forever().
        """
        endpoint = ('localhost', SIMULATOR_PORT)
        rpc_server = SimpleXMLRPCServer(endpoint, allow_none=True)
        self.server = rpc_server
        rpc_server.register_instance(self.simulator)

        listening_msg = "Listening on %s" % SIMULATOR_PORT
        log.info(listening_msg)
        rpc_server.serve_forever()
Exemple #2
0
    def run(self, args):
        """Build the simulator from the parsed arguments and start it.

        :param args: parsed command line namespace; ``config`` and ``url``
                     are read from it.
        :return: the ClusterSimulator after start_all() has been called.
        """
        cluster = ClusterSimulator(args.config, args.url)
        running_msg = "Running %s servers in %s/" % (len(cluster.servers),
                                                     args.config)
        log.info(running_msg)
        cluster.start_all()

        return cluster
Exemple #3
0
    def register(self, args):
        """Register all simulated servers using a secret or acquired tokens.

        When ``args.secret`` is given it is used for both servers and
        workers.  Otherwise ``args.username`` and ``args.password`` are used
        to acquire one token per profile.  If neither is supplied an error
        is written to stderr and the process exits with status -1.

        :param args: parsed command line namespace.
        :return: the ClusterSimulator that was registered.
        """
        simulator = ClusterSimulator(args.config, args.url)
        server_count = len(simulator.servers)

        def token_for(profile):
            # Same credentials and count for every request; only the
            # preferred profile differs between the two tokens.
            return self._acquire_token(args.url,
                                       args.username,
                                       args.password,
                                       server_count,
                                       preferred_profile=profile)

        if args.secret:
            server_secret = worker_secret = args.secret
        elif args.username and args.password:
            server_secret = token_for('base_managed_rh7')
            worker_secret = token_for('posix_copytool_worker')
        else:
            sys.stderr.write(
                "Must pass either --secret or --username and --password\n")
            sys.exit(-1)

        log.info("Registering %s servers in %s/" % (server_count, args.config))
        register_count = simulator.register_all(server_secret, worker_secret)

        wants_pdus = args.create_pdu_entries
        if wants_pdus and register_count > 0:
            self.create_pdu_entries(simulator, args)

        return simulator
    def post_server_start(self):
        """Start the HSM coordinators and copytools once servers exist.

        Does nothing (beyond logging) when there are no servers.
        """
        if not len(self.servers) >= 1:
            log.info("No servers started; skipping post_server_start()")
        else:
            self.start_hsm_coordinators()
            self.start_hsm_copytools()
Exemple #5
0
 def handle_client(self, sock, address):
     """Reply to a new connection with "NOT IMPLEMENTED" and close it.

     :param sock: the connected client socket.
     :param address: peer address; its first two items are logged.
     """
     handler_name = self.__class__.__name__
     log.info("%s: received connection from %s:%s" %
              (handler_name, address[0], address[1]))
     stream = sock.makefile()
     stream.write("NOT IMPLEMENTED\r\n")
     stream.flush()
     sock.close()
    def format(self, fqdn, path, mkfs_options, target_data):
        """Format a Lustre target.

        Stores ``target_data`` against the serial of the device found at
        ``path`` on ``fqdn``, saves the state, and returns a dict describing
        the resulting filesystem.

        :param fqdn: host on which ``path`` is looked up.
        :param path: device path, resolved through get_by_path().
        :param mkfs_options: option string (may be empty/None).  ``-I <n>``
                             overrides ``inode_size`` and ``-i <n>``
                             overrides ``bytes_per_inode``.
        :param target_data: mapping recorded for the target; its ``'uuid'``
                            entry is copied into the result.
        :return: dict with ``uuid``, ``filesystem_type``, ``inode_size``,
                 ``bytes_per_inode`` and ``inode_count``.
        """
        # defaults to what an OST would look like
        e2fs_dump = {
            'uuid': target_data['uuid'],
            'filesystem_type': "ext4",
            'inode_size': 256,
            'bytes_per_inode': 16384
        }
        # overrides for MDT
        if mkfs_options:
            for pattern in (r'-I\s*(?P<inode_size>\d+)',
                            r'-i\s*(?P<bytes_per_inode>\d+)'):
                match = re.search(pattern, mkfs_options)
                if match:
                    for k, v in match.groupdict().items():
                        e2fs_dump[k] = int(v)

        with self._lock:
            serial = self.get_by_path(fqdn, path)['serial_80']
            device = self.get_device(serial)
            # Floor division keeps the inode count integral regardless of
            # whether "/" means integer or true division in the interpreter.
            e2fs_dump['inode_count'] = (device['size'] //
                                        e2fs_dump['bytes_per_inode'])

            log.info("format: %s" % serial)
            self.state['targets'][serial] = target_data
            self.save()

        return e2fs_dump
 def stop(self):
     """Stop power control, then stop every thread in each thread group.

     Groups are visited in the order coordinators, copytools, copytool
     monitors, servers.
     """
     log.info("Stopping")
     self.power.stop()
     group_names = ('coordinators', 'copytools', 'copytool_monitors',
                    'servers')
     for group_name in group_names:
         members = getattr(self, group_name)
         for worker in members.values():
             worker.stop()
Exemple #8
0
    def __init__(self, pdu):
        """Create the TCP server for one simulated PDU.

        :param pdu: PDU simulator; its ``name``, ``address`` and ``port``
                    are read here, and it is handed to the TCP server as
                    ``pdu_simulator``.
        """
        creating_msg = "Creating PDU server for %s on %s:%s" % (
            pdu.name, pdu.address, pdu.port)
        log.info(creating_msg)

        bind_address = (pdu.address, pdu.port)
        self.server = PDUSimulatorTcpServer(bind_address,
                                            PDUSimulatorTcpHandler,
                                            pdu_simulator=pdu)

        # The server attribute is set before the base class initialiser runs.
        super(PDUSimulatorServer, self).__init__()
 def join(self):
     """Join power control, then join every thread in each thread group.

     Groups are visited in the order coordinators, copytools, copytool
     monitors, servers.
     """
     log.info("Joining...")
     self.power.join()
     group_names = ('coordinators', 'copytools', 'copytool_monitors',
                    'servers')
     for group_name in group_names:
         members = getattr(self, group_name)
         for worker in members.values():
             worker.join()
     log.info("Joined")
Exemple #10
0
    def handle_client(self, sock, address):
        """Record the new connection on the handler and run the login step.

        A socket.error raised by login() is treated as the client
        disconnecting and is only logged.

        :param sock: the connected client socket.
        :param address: peer address; its first two items are logged.
        """
        handler_name = self.__class__.__name__
        log.info("%s: received connection from %s:%s" %
                 (handler_name, address[0], address[1]))

        self.client_address = address
        self.socket = sock
        self.fd = sock.makefile()

        try:
            self.login()
        except socket.error:
            disconnected_msg = "%s: client %s:%s disconnected" % (
                handler_name, address[0], address[1])
            log.info(disconnected_msg)
    def stop(self):
        """Deregister from the coordinator and stop the copytool thread.

        Does nothing unless the copytool is currently running.  Both steps
        are best-effort: a KeyError from deregistration and an
        AttributeError from stopping the thread are ignored.
        """
        if not self.running:
            return

        log.info("Stopping HSM Copytool: %s" % self.id)

        try:
            self.coordinator.deregister_agent(self)
        except KeyError:
            # NOTE(review): presumably the coordinator no longer knows this
            # agent -- confirm against deregister_agent().
            pass

        try:
            self._thread.stop()
        except AttributeError:
            # NOTE(review): presumably there is no thread to stop -- confirm.
            pass
    def update_packages(self, packages, node_type='server'):
        """Record new package versions and restart every simulated server.

        :param packages: mapping of package name to the value stored for it.
        :param node_type: key under ``state['packages']`` to update.
        """
        log.info("Updating packages: %s" % packages)
        for package_name, package_info in packages.items():
            self.state['packages'][node_type][package_name] = package_info
        self.save()

        # The agent only reports new versions at the start of sessions.
        # In real life this is fine because installing updates on the manager
        # restarts the manager servers, which causes new sessions.  In
        # simulation we don't control the manager, so the AgentClient
        # instances are restarted instead.
        for server_fqdn in self.servers.keys():
            self.stop_server(server_fqdn)
            self.start_server(server_fqdn)
    def run(self):
        """Main loop of the copytool thread.

        Opens the FIFO, registers, flags the thread as started and then,
        until the stopping event is set, starts any requests obtained from
        the coordinator and progresses the active ones, waiting
        COPYTOOL_LOOP_INTERVAL between passes.
        """
        self._open_fifo()
        self.register()

        self.started = True

        started_msg = "Copytool %s started" % self.wrapper.copytool.id
        log.info(started_msg)
        while not self._stopping.is_set():
            incoming = self.coordinator.get_agent_requests(self.uuid)
            for new_request in incoming:
                self.start_request(new_request)

            self.progress_active_requests()

            # Doubles as the loop delay and the wake-up on stop.
            self._stopping.wait(COPYTOOL_LOOP_INTERVAL)
    def remove_server(self, fqdn):
        """Shut down a simulated server and remove every trace of it.

        :param fqdn: key of the server in ``self.servers``.
        """
        log.info("remove_server %s" % fqdn)

        # The server must be fully stopped before anything is torn down.
        self.stop_server(fqdn, shutdown=True)
        doomed = self.servers[fqdn]
        assert not doomed.agent_is_running

        self.devices.remove_presentations(fqdn)
        self.power.remove_server(fqdn)

        doomed.crypto.delete()
        doomed.delete()
        del self.servers[fqdn]
        def blocking_shutdown():
            """Shut the server down, optionally padding the duration.

            Stops the agent and leaves the cluster.  When
            ``simulate_shutdown`` is set and that took less than
            MIN_SHUTDOWN_DURATION, sleeps for the remainder.  Starts the
            server up again afterwards if ``reboot`` is set.
            """
            log.info("%s beginning shutdown" % self.fqdn)
            self._enter_shutdown()

            began_at = IMLDateTime.utcnow()
            self.shutdown_agent()
            self._cluster.leave(self.nodename)
            finished_at = IMLDateTime.utcnow()

            elapsed = (finished_at - began_at).seconds
            # Only pad the shutdown when a simulated shutdown was requested.
            while simulate_shutdown and elapsed < MIN_SHUTDOWN_DURATION:
                pause = MIN_SHUTDOWN_DURATION - elapsed
                log.info("%s, sleeping for %d seconds on shutdown" %
                         (self.fqdn, pause))

                elapsed += pause
                time.sleep(pause)

            self._exit_shutdown()
            log.info("%s shutdown complete" % self.fqdn)

            if not reboot:
                return

            log.info("%s rebooting" % self.fqdn)
            self.startup(simulate_shutdown)
    def start(self, coordinator):
        """Register with the coordinator and start the copytool thread.

        Blocks until the thread reports itself started, polling once a
        second.

        :param coordinator: object on which register_agent(self) is called.
        :raises RuntimeError: if the thread has not started after 30 polls.
        """
        log.info("Starting HSM Copytool: %s" % self.id)
        coordinator.register_agent(self)

        self._new_thread()
        self._thread.start()

        # Give the thread time to start up in order to avoid weird races
        # in CI. This will prevent any transitions from the 'started' state
        # until the copytool is truly started.
        max_polls = 30  # 5-10 seconds seems typical; pad it out a bit
        for _attempt in range(max_polls):
            if self._thread.started:
                return
            time.sleep(1)

        raise RuntimeError("Timed out waiting for copytool thread to start")
    def start_all(self):
        """Start every server, spacing the starts across one poll period.

        pre_server_start() and post_server_start() are called around the
        starts, whether or not there are any servers.
        """
        self.pre_server_start()

        if not len(self.servers):
            log.info("start_all: No servers yet")
        else:
            # Spread out the starts to avoid every server sending its
            # update at the same moment.
            delay = Session.POLL_PERIOD / float(len(self.servers))

            log.debug("Start all (%.2f dispersion)" % delay)

            for position, fqdn in enumerate(self.servers.keys()):
                self.start_server(fqdn)
                # No need to wait after the final server.
                is_last = position == len(self.servers) - 1
                if not is_last:
                    time.sleep(delay)

        self.post_server_start()
Exemple #18
0
    def setup(self, args):
        """Create the simulator configuration described by ``args``.

        Resets the agent configuration, points the copytool FIFO directory
        at /tmp and calls ClusterSimulator.setup() with the integer values
        parsed from the command line.

        :param args: parsed command line namespace for the "setup" command.
        """
        log.info("Setting up simulator configuration for %s servers in %s/" %
                 (args.server_count, args.config))

        worker_count = int(args.worker_count)
        server_count = int(args.server_count)

        # Without an explicit volume count, use two volumes per server.
        volume_count = (int(args.volume_count)
                        if args.volume_count else server_count * 2)

        reset_agent_config()
        set_agent_config('copytool_fifo_directory', '/tmp')

        cluster = ClusterSimulator(args.config, args.url)
        cluster.setup(server_count, worker_count, volume_count,
                      int(args.nid_count), int(args.cluster_size),
                      int(args.psu_count), int(args.su_size))
        def blocking_startup():
            """Start the server up, optionally padding the boot duration.

            When ``simulate_bootup`` is set, first waits for any shutdown in
            progress and later sleeps so that startup takes at least
            MIN_STARTUP_DURATION.  Returns early if a startup is already in
            progress.
            """
            log.info("%s beginning startup" % self.fqdn)

            waited = 0
            # Mixing shutdown/startup state here is a little weird, but
            # this only happens when it's explicitly requested by a caller
            # that wants full server simulation. The end result is that
            # simulating a reboot is easier if we deal with waiting for
            # the shutdown to finish here before moving on to startup.
            while simulate_bootup and self.shutting_down:
                log.info("%s, waiting for shutdown (%d)" %
                         (self.fqdn, waited))
                waited += 1
                time.sleep(1)

            # This may happen to the losing thread(s) during startup after
            # a multi-PDU power cycle.
            if self.starting_up:
                log.info(
                    "%s exiting startup because another thread is already doing it?"
                    % self.fqdn)
                return

            self._enter_startup()
            self.boot_time = IMLDateTime.utcnow()

            # Time spent waiting for the shutdown counts towards the minimum.
            while simulate_bootup and waited < MIN_STARTUP_DURATION:
                pause = MIN_STARTUP_DURATION - waited
                log.info("%s, sleeping for %d seconds on boot" %
                         (self.fqdn, pause))

                waited += pause
                time.sleep(pause)

            self.start_agent()
            self._cluster.join(self.nodename)

            self._exit_startup()
            log.info("%s startup complete" % self.fqdn)
    def _create_server(self, i, nid_count):
        """Create simulated server number ``i`` and register it for power.

        The server is named ``test<i>`` (zero padded to three digits) and
        gets ``nid_count`` NIDs of the form ``10.<network>.<i / 256>.<i % 256>``,
        alternating between the 'tcp' and 'o2ib' interface names.

        :param i: index of the server.
        :param nid_count: number of NID tuples to generate.
        :return: the new FakeServer, also stored in ``self.servers``.
        """
        interface_names = ['tcp', 'o2ib']
        nodename = "test%.3d" % i
        fqdn = "%s.localdomain" % nodename
        x, y = divmod(i, 256)
        nid_tuples = [('10.%d.%d.%d' % (network, x, y),
                       interface_names[network % len(interface_names)],
                       network) for network in range(nid_count)]

        log.info("_create_server: %s" % fqdn)

        cluster = self._get_cluster_for_server(i)
        server = FakeServer(self, cluster, i, fqdn, nodename, nid_tuples)
        self.servers[fqdn] = server

        self.power.add_server(fqdn)

        return server
    def _create_worker(self, i, nid_count):
        """Create simulated worker number ``i``.

        The worker is named ``worker<i>`` (zero padded to three digits) and
        gets ``nid_count`` 'tcp' NIDs of the form
        ``10.<network>.<i / 256>.<i % 256>``.

        :param i: index of the worker.
        :param nid_count: number of NID tuples to generate.
        :return: the new FakeServer, also stored in ``self.servers``.
        """
        nodename = "worker%.3d" % i
        fqdn = "%s.localdomain" % nodename
        x, y = divmod(i, 256)
        nid_tuples = [('10.%d.%d.%d' % (network, x, y), 'tcp', network)
                      for network in range(nid_count)]

        log.info("_create_worker: %s" % fqdn)

        # Use -1 as the special cluster for workers
        worker_cluster = self._get_cluster_for_server(-1)
        worker = FakeServer(self,
                            worker_cluster,
                            i,
                            fqdn,
                            nodename,
                            nid_tuples,
                            worker=True)
        self.servers[fqdn] = worker

        return worker
Exemple #22
0
 def join(self):
     """Join every PDU simulator server in ``self.sim_servers``."""
     log.info("Power control: joining...")
     for pdu_server in self.sim_servers.values():
         pdu_server.join()
 def set_log_rate(self, fqdn, rate):
     """Set the ``log_rate`` attribute of one simulated server.

     :param fqdn: key of the server in ``self.servers``.
     :param rate: value assigned to the server's ``log_rate``.
     """
     log.info("Set log rate for %s to %s" % (fqdn, rate))
     target = self.servers[fqdn]
     target.log_rate = rate
Exemple #24
0
 def logout(self):
     """Send the farewell line to the client and close the connection."""
     stream = self.fd
     stream.write("Connection Closed - Bye\r\n")
     stream.flush()
     log.info("Client %s:%s logged out" % self.client_address)
     stream.close()
     self.socket.close()
Exemple #25
0
 def handler(*args, **kwargs):
     """Log and stop the CLI; any arguments passed in are ignored."""
     log.info("Stopping...")
     cli.stop()
Exemple #26
0
    def create_pdu_entries(self, simulator, args):
        """Create PDU records on the manager and associate outlets with hosts.

        Posts one custom power control type with as many outlets as there
        are simulated servers, then one power control device per simulated
        PDU.  Finally, for every host whose profile is not the copytool
        worker profile (sorted by fqdn), the N-th host is attached to outlet
        N of each created PDU.

        Exits with status -1 if username or password is missing, and
        returns early when the simulator has no servers.

        :param simulator: the ClusterSimulator whose servers and PDU
                          simulators are used.
        :param args: parsed command line namespace (``url``, ``username``,
                     ``password``).
        """
        if not args.username or not args.password:
            sys.stderr.write(
                "Username and password required to create PDU entries\n")
            sys.exit(-1)

        session = self._get_authenticated_session(args.url, args.username,
                                                  args.password)

        log.info(
            "Creating PDU entries and associating PDU outlets with servers...")
        outlet_count = len(simulator.servers)

        if outlet_count < 1:
            log.error("Skipping PDU creation (no servers)")
            return

        # Create a custom type to ensure that it has enough outlets.
        # NB: If more servers are added later this won't work correctly,
        # but it should handle most use cases for simulated clusters.
        type_payload = json.dumps({
            'agent': "fence_apc",
            'make': "Fake",
            'model': "PDU",
            'default_username': "******",
            'default_password': "******",
            'max_outlets': outlet_count
        })
        response = session.post("%s/api/power_control_type/" % args.url,
                                data=type_payload)

        assert 200 <= response.status_code < 300, response.text
        fence_apc = json.loads(response.text)

        log.debug("Created power_control_type: %s" % fence_apc['name'])

        pdu_entries = []
        for pdu_sim in simulator.power.pdu_sims.values():
            device_payload = json.dumps({
                'device_type': fence_apc['resource_uri'],
                'name': pdu_sim.name,
                'address': pdu_sim.address,
                'port': pdu_sim.port
            })
            response = session.post("%s/api/power_control_device/" % args.url,
                                    data=device_payload)

            assert 200 <= response.status_code < 300, response.text
            created_device = json.loads(response.text)
            pdu_entries.append(created_device)
            log.debug("Created power_control_device: %s" %
                      created_device['name'])

        response = session.get("%s/api/host/" % args.url,
                               data=json.dumps({'limit': 0}))
        assert 200 <= response.status_code < 300, response.text
        all_hosts = json.loads(response.text)['objects']
        # Copytool workers are left out of the outlet associations.
        managed_hosts = sorted(
            (h for h in all_hosts
             if 'posix_copytool_worker' not in h['server_profile']),
            key=lambda host: host['fqdn'])

        for outlet_number, server in enumerate(managed_hosts, 1):
            wanted_identifier = str(outlet_number)
            for pdu in pdu_entries:
                matching = [
                    o for o in pdu['outlets']
                    if o['identifier'] == wanted_identifier
                ]
                outlet = matching[0]
                outlet_url = "%s/%s" % (args.url, outlet['resource_uri'])
                response = session.patch(
                    outlet_url,
                    data=json.dumps({'host': server['resource_uri']}))
                assert 200 <= response.status_code < 300, response.text
                log.debug("Created association %s <=> %s:%s" %
                          (server['fqdn'], pdu['name'], outlet['identifier']))
Exemple #27
0
    def main(self):
        """Command line entry point of the cluster simulator.

        Sets up logging, parses the command line and dispatches to the
        selected subcommand (``setup``, ``register`` or ``run``).  If the
        subcommand returns a simulator, an RpcThread is started for it and
        this method blocks until the stopping event is set, then joins the
        simulator and stops and joins the RPC thread.
        """
        log.addHandler(logging.StreamHandler())

        daemon_log.addHandler(logging.StreamHandler())
        daemon_log.setLevel(logging.DEBUG)
        handler = logging.FileHandler("chroma-agent.log")
        handler.setFormatter(
            logging.Formatter('[%(asctime)s] %(message)s',
                              '%d/%b/%Y:%H:%M:%S'))
        daemon_log.addHandler(handler)

        # Usually on our Intel laptops https_proxy is set, and needs to be unset for tests,
        # but let's not completely rule out the possibility that someone might want to run
        # the tests on a remote system using a proxy.
        if 'https_proxy' in os.environ:
            sys.stderr.write(
                "Warning: Using proxy %s from https_proxy" %
                os.environ['https_proxy'] +
                " environment variable, you probably don't want that\n")

        parser = argparse.ArgumentParser(description="Cluster simulator")
        parser.add_argument('--config',
                            required=False,
                            help="Simulator configuration/state directory",
                            default="cluster_sim")
        parser.add_argument('--url',
                            required=False,
                            help="Manager URL",
                            default="https://localhost:8000/")
        subparsers = parser.add_subparsers()
        setup_parser = subparsers.add_parser("setup")
        setup_parser.add_argument('--su_size',
                                  required=False,
                                  help="Servers per SU",
                                  default='0')
        # NOTE(review): this help text is identical to --server_count's;
        # presumably it should describe the size of each cluster -- confirm
        # before rewording.
        setup_parser.add_argument('--cluster_size',
                                  required=False,
                                  help="Number of simulated storage servers",
                                  default='4')
        setup_parser.add_argument('--server_count',
                                  required=False,
                                  help="Number of simulated storage servers",
                                  default='8')
        setup_parser.add_argument('--worker_count',
                                  required=False,
                                  help="Number of simulated HSM workers",
                                  default='1')
        setup_parser.add_argument(
            '--nid_count',
            required=False,
            help=
            "Number of LNet NIDs per storage server, defaults to 1 per server",
            default='1')
        setup_parser.add_argument(
            '--volume_count',
            required=False,
            help=
            "Number of simulated storage devices, defaults to twice the number of servers"
        )
        setup_parser.add_argument(
            '--psu_count',
            required=False,
            help=
            "Number of simulated server Power Supply Units, defaults to one per server",
            default='1')
        setup_parser.set_defaults(func=self.setup)

        register_parser = subparsers.add_parser(
            "register",
            help=
            "Provide a secret for registration, or provide API credentials for the simulator to acquire a token itself"
        )
        register_parser.add_argument('--secret',
                                     required=False,
                                     help="Registration token secret")
        register_parser.add_argument('--username',
                                     required=False,
                                     help="API username")
        register_parser.add_argument('--password',
                                     required=False,
                                     help="API password")
        register_parser.add_argument('--create_pdu_entries',
                                     action='store_true',
                                     help="Create PDU entries on the manager")
        register_parser.set_defaults(func=self.register)

        run_parser = subparsers.add_parser("run")
        run_parser.set_defaults(func=self.run)

        args = parser.parse_args()
        simulator = args.func(args)
        if simulator:
            self.simulator = simulator

            rpc_thread = RpcThread(self.simulator)
            rpc_thread.start()

            # Announce before blocking: logging this after the wait loop
            # would only emit it once a stop had already been requested.
            log.info("Running indefinitely.")

            # Wake up periodically to handle signals, instead of going straight into join
            while not self._stopping.is_set():
                self._stopping.wait(timeout=1)

            self.simulator.join()

            rpc_thread.stop()
            rpc_thread.join()
Exemple #28
0
 def start(self):
     """Start a simulator server for every entry in ``self.pdu_sims``."""
     log.info("Power control: starting...")
     for name in self.pdu_sims:
         self.start_sim_server(name)
Exemple #29
0
 def toggle_outlet(self, outlet, state):
     """Set the stored state of one outlet, log it and save.

     The update, log message and save all happen while holding the lock.

     :param outlet: key of the outlet in ``state['outlets']``.
     :param state: new value stored for that outlet.
     """
     with self._lock:
         outlets = self.state['outlets']
         outlets[outlet] = state
         toggled_msg = "POWER: Toggled %s:%s to %s" % (
             self.name, outlet, self.outlet_state(outlet))
         log.info(toggled_msg)
         self.save()
Exemple #30
0
 def stop(self):
     """Stop every PDU simulator server in ``self.sim_servers``."""
     log.info("Power control: stopping...")
     for pdu_server in self.sim_servers.values():
         pdu_server.stop()