def testServers(self):
    """test ComponentGroup.servers()"""
    fs = FileSystem('comp')
    grp = ComponentGroup()
    grp.add(Component(fs, Server('foo1', ['foo1@tcp'])))
    grp.add(Component(fs, Server('foo2', ['foo2@tcp'])))
    self.assertEqual(str(grp.servers()), "foo[1-2]")
def testUpdate(self):
    """test ComponentGroup.update()"""
    fs = FileSystem('comp')
    grp1 = ComponentGroup()
    comp1 = Component(fs, Server('foo1', ['foo1@tcp']))
    grp1.add(comp1)
    grp2 = ComponentGroup()
    comp2 = Component(fs, Server('foo2', ['foo2@tcp']))
    grp2.add(comp2)
    grp1.update(grp2)
    self.assertEqual(len(grp1), 2)
    self.assertTrue(comp1 in grp1)
    self.assertTrue(comp2 in grp1)
def convert_comparison(fsconf, fs, actions):
    """
    Transform the configuration elements present in the action list into
    references to FileSystem components.
    """
    return ComponentGroup(_create_comp(fsconf, fs, elem) for elem in actions)
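# Illustrative use (hypothetical variable names): 'fsconf' is a parsed
# filesystem configuration, 'fs' its FileSystem instance, and 'changes' a
# list of changed configuration elements from a config comparison. The
# result is a regular ComponentGroup, so the usual group operations apply:
#
#     changed = convert_comparison(fsconf, fs, changes)
#     print(changed.labels())
#     print(changed.servers())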
def testManaged(self):
    """test ComponentGroup.managed()"""
    fs = FileSystem('comp')
    grp = ComponentGroup()
    comp1 = Component(fs, Server('foo1', ['foo1@tcp']), mode="external")
    grp.add(comp1)
    comp2 = Component(fs, Server('foo2', ['foo2@tcp']))
    grp.add(comp2)
    comp3 = Component(fs, Server('foo3', ['foo3@tcp']), enabled=False)
    grp.add(comp3)
    comp4 = Component(fs, Server('foo4', ['foo4@tcp']))
    grp.add(comp4)
    offgrp = grp.managed()
    self.assertEqual(len(offgrp), 2)
    self.assertTrue(comp2 in offgrp)
    self.assertTrue(comp4 in offgrp)
def testAllServers(self):
    """test ComponentGroup.allservers()"""
    fs = FileSystem('comp')
    grp = ComponentGroup()
    grp.add(Target(fs, Server('foo1', ['foo1@tcp']), 0, '/dev/sda'))
    comp = Target(fs, Server('foo2', ['foo2@tcp']), 1, '/dev/sda')
    grp.add(comp)
    comp.add_server(Server('foo3', ['foo3@tcp0']))
    self.assertEqual(str(grp.allservers()), "foo[1-3]")
def testGroupBy(self):
    """test ComponentGroup.groupby()"""
    fs = FileSystem('comp')
    grp = ComponentGroup()
    comp1 = Component(fs, Server('foo1', ['foo1@tcp']), mode="external")
    grp.add(comp1)
    comp2 = Component(fs, Server('foo2', ['foo2@tcp']))
    grp.add(comp2)
    comp3 = Component(fs, Server('foo3', ['foo3@tcp']), mode="external")
    grp.add(comp3)
    comp4 = Component(fs, Server('foo4', ['foo4@tcp']))
    grp.add(comp4)
    results = [[mode, list(comps)]
               for mode, comps in grp.groupby(attr='_mode')]
    self.assertEqual(len(results), 2)
    self.assertEqual(results[0][0], "external")
    self.assertTrue(comp1 in results[0][1])
    self.assertTrue(comp3 in results[0][1])
    self.assertEqual(results[1][0], "managed")
    self.assertTrue(comp2 in results[1][1])
    self.assertTrue(comp4 in results[1][1])
def testOr(self):
    """test ComponentGroup.__or__()"""
    fs = FileSystem('comp')
    grp1 = ComponentGroup()
    comp1 = Component(fs, Server('foo1', ['foo1@tcp']))
    grp1.add(comp1)
    grp2 = ComponentGroup()
    comp2 = Component(fs, Server('foo2', ['foo2@tcp']))
    grp2.add(comp2)
    merge = grp1 | grp2
    self.assertEqual(len(merge), 2)
    self.assertTrue(comp1 in merge)
    self.assertTrue(comp2 in merge)
def testFilterSupports(self):
    """test ComponentGroup.filter(supports and key)"""
    fs = FileSystem('comp')
    grp = ComponentGroup()
    comp1 = Component(fs, Server('foo1', ['foo1@tcp']))
    comp1.state = MOUNTED
    grp.add(comp1)
    comp2 = Component(fs, Server('foo2', ['foo2@tcp']))
    comp2.state = OFFLINE
    grp.add(comp2)
    comp3 = Component(fs, Server('foo3', ['foo3@tcp']))
    comp3.state = MOUNTED
    grp.add(comp3)
    comp4 = Component(fs, Server('foo4', ['foo4@tcp']))
    comp4.state = OFFLINE
    grp.add(comp4)
    offgrp = grp.filter(supports='is_external',
                        key=lambda comp: comp.state == OFFLINE)
    self.assertEqual(len(offgrp), 2)
    self.assertTrue(comp2 in offgrp)
    self.assertTrue(comp4 in offgrp)
def testGenericComponent(self):
    """test ComponentGroup simple methods"""
    fs = FileSystem('comp')
    grp = ComponentGroup()
    self.assertEqual(len(grp), 0)
    comp = Component(fs, Server('foo', ['foo@tcp']))
    comp.TYPE = 'A'
    # add()
    grp.add(comp)
    # __len__
    self.assertEqual(len(grp), 1)
    # __str__
    self.assertEqual(str(grp), 'comp-A')
    # __getitem__
    self.assertEqual(grp[comp.uniqueid()], comp)
    # __contains__
    self.assertTrue(comp in grp)
    # __iter__
    self.assertEqual(list(iter(grp)), [comp])
    # add() must refuse to add the same component twice
    try:
        grp.add(comp)
        self.fail("add() should raise KeyError on a duplicate component")
    except KeyError as error:
        txt = "'A component with id comp-A-foo@tcp already exists.'"
        self.assertEqual(str(error), txt)
def testLabels(self):
    """test ComponentGroup.labels()"""
    fs = FileSystem('comp')
    grp = ComponentGroup()
    comp = Component(fs, Server('foo1', ['foo1@tcp']))
    comp.TYPE = 'A'
    grp.add(comp)
    comp = Component(fs, Server('foo2', ['foo2@tcp']))
    comp.TYPE = 'B'
    grp.add(comp)
    self.assertEqual(str(grp.labels()), 'comp-A,comp-B')
def testGroupByServer(self):
    """test ComponentGroup.groupbyserver()"""
    fs = FileSystem('comp')
    grp = ComponentGroup()
    srv1 = Server('foo1', ['foo1@tcp'])
    srv2 = Server('foo2', ['foo2@tcp'])
    comp1 = Component(fs, srv1)
    comp1.TYPE = 'A'
    grp.add(comp1)
    comp2 = Component(fs, srv2)
    comp2.TYPE = 'B'
    grp.add(comp2)
    comp3 = Component(fs, srv1)
    comp3.TYPE = 'C'
    grp.add(comp3)
    comp4 = Component(fs, srv2)
    comp4.TYPE = 'D'
    grp.add(comp4)
    key = lambda c: c.TYPE
    results = [[srv, sorted(comps, key=key)]
               for srv, comps in grp.groupbyserver()]
    self.assertEqual(len(results), 2)
    self.assertTrue([srv1, [comp1, comp3]] in results)
    self.assertTrue([srv2, [comp2, comp4]] in results)
def test_group_by_all_servers(self):
    """test ComponentGroup.groupbyallservers()"""
    fs = FileSystem('comp')
    grp = ComponentGroup()
    srv1 = Server('foo1', ['foo1@tcp'])
    srv2 = Server('foo2', ['foo2@tcp'])
    comp1 = Target(fs, srv1, 0, '/dev/sda')
    comp1.add_server(srv2)
    grp.add(comp1)
    comp2 = Target(fs, srv2, 1, '/dev/sdb')
    comp2.add_server(srv1)
    grp.add(comp2)
    comp3 = Target(fs, srv1, 2, '/dev/sdc')
    comp3.add_server(srv2)
    grp.add(comp3)
    comp4 = Target(fs, srv2, 3, '/dev/sdd')
    comp4.add_server(srv1)
    grp.add(comp4)
    key = lambda c: c.TYPE
    results = [[srv, sorted(comps, key=key)]
               for srv, comps in grp.groupbyallservers()]
    self.assertEqual(len(results), 2)
    self.assertTrue([srv1, [comp1, comp2, comp3, comp4]] in results)
    self.assertTrue([srv2, [comp1, comp2, comp3, comp4]] in results)
def test_managed_active(self):
    """test ComponentGroup.managed() with active option"""
    fs = FileSystem('active')
    grp = ComponentGroup()
    srv = Server('foo1', ['foo1@tcp'])
    comp1 = Component(fs, srv)
    comp1.TYPE = 'A'
    grp.add(comp1)
    comp2 = Component(fs, srv, active='no')
    comp2.TYPE = 'B'
    grp.add(comp2)
    comp3 = Component(fs, srv, active='nocreate')
    comp3.TYPE = 'C'
    grp.add(comp3)
    comp4 = Component(fs, srv, active='no', mode='external')
    comp4.TYPE = 'D'
    grp.add(comp4)
    self.assertEqual(str(grp.managed()), 'active-A,active-C')
    self.assertEqual(str(grp.managed(inactive=True)),
                     'active-A,active-B,active-C,active-D')
class FileSystem:
    """
    The Lustre FileSystem abstract class.
    """

    def __init__(self, fs_name, event_handler=None):
        self.fs_name = fs_name
        self.hdlr = event_handler or EventHandler()
        self.proxy_errors = MsgTree()

        # All FS components (MGT, MDT, OST, Clients, ...)
        self.components = ComponentGroup()

        # file system MGT
        self.mgt = None

        # Local server reference
        self.local_server = None

        self.debug = False
        self.logger = self._setup_logging()

    def set_debug(self, debug):
        self.debug = debug

    def _setup_logging(self):
        """Setup logging configuration for the whole filesystem."""
        # XXX: This is only here for fsck, currently.
        logger = logging.getLogger('Shine.Lustre')

        # Level
        if self.debug:
            logger.setLevel(logging.DEBUG)
        else:
            logger.setLevel(logging.INFO)

        if logger.handlers:
            # If some handlers already exist, the logger singleton is already
            # configured, so just return it to avoid duplicate handlers.
            return logger

        # Formatter
        formatter = logging.Formatter(
            datefmt="%Y-%m-%d %X",
            fmt='%(name)s %(levelname)s %(message)s')

        try:
            # Handler
            handler = logging.handlers.SysLogHandler(address='/dev/log')
            logger.addHandler(handler)
            handler.setFormatter(formatter)
        except socket.error:
            logging.raiseExceptions = False
            msg = "Error connecting to syslog, disabling logging."
            print("WARNING: %s" % msg, file=sys.stderr)

        return logger

    def get_mgs_nids(self):
        return self.mgt.get_nids()

    #
    # file system event handling
    #

    def local_event(self, evtype, **params):
        # Currently, all event callbacks need a node.
        # When locally called, add the current node.
        self.hdlr.local_event(evtype, **params)

    def distant_event(self, evtype, node, **params):

        # Update the local component instance with the provided instance
        # if one is available in params.
        if evtype == 'comp':
            other = params['info'].elem
            other.fs = self
            try:
                # Special hack for Journal objects, as they are not put in
                # the components list.
                if other.TYPE == Journal.TYPE:
                    other.target.fs = self
                    target = self.components[other.target.uniqueid()]
                    target.journal.update(other)
                    comp = target.journal
                else:
                    comp = self.components[other.uniqueid()]

                    # comp.update() updates the component state and disk
                    # information if the component is a target. This
                    # information does not need to be updated unless we are
                    # on a completion event.
                    if params['status'] not in ('start', 'progress'):
                        # ensure other.server is the actual distant server
                        other.server = comp.allservers().select(
                            NodeSet(node))[0]

                        # update target from remote one
                        comp.update(other)

                # substitute target parameter by local one
                params['comp'] = comp
            except KeyError as error:
                print("ERROR: Component update failed (%s)" % str(error),
                      file=sys.stderr)

        self.hdlr.event_callback(evtype, node=node, **params)

    def _handle_shine_proxy_error(self, nodes, message):
        """
        Store error messages for later processing.

        Hostnames are replaced by 'THIS_SHINE_HOST' to allow grouping of
        messages which only differ by the host name.
        """
        message = message.replace(str(nodes), 'THIS_SHINE_HOST')
        self.proxy_errors.add(NodeSet(nodes), message)

    #
    # file system construction
    #

    def _attach_component(self, comp):
        if comp.TYPE == MGT.TYPE:
            if self.mgt and len(self.mgt.get_nids()) > 0:
                raise FSError("A Lustre filesystem has only one MGT.")
            self.mgt = comp
        self.components.add(comp)

    def new_target(self, server, type, index, dev, jdev=None, group=None,
                   tag=None, enabled=True, mode='managed', network=None,
                   active='yes'):
        """
        Create a new attached target.
""" TYPE_CLASSES = {MGT.TYPE: MGT, MDT.TYPE: MDT, OST.TYPE: OST} if type not in TYPE_CLASSES: raise FSError("Unrecognized target type \"%s\"" % type) target = TYPE_CLASSES[type](fs=self, server=server, index=index, dev=dev, jdev=jdev, group=group, tag=tag, enabled=enabled, mode=mode, network=network, active=active) self._attach_component(target) return target def new_client(self, server, mount_path, mount_options=None, subdir=None, enabled=True): """ Create a new attached client. """ client = Client(self, server, mount_path, mount_options, subdir, enabled) self._attach_component(client) return client def new_router(self, server, enabled=True): """ Create a new attached router. """ router = Router(self, server, enabled) self._attach_component(router) return router # # Task management. # def _proxy_action(self, action, servers, comps=None, **kwargs): """Create a proxy action to remotely run a shine action.""" assert isinstance(servers, NodeSet) assert comps is None or isinstance(comps, ComponentGroup) return FSProxyAction(self, action, servers, self.debug, comps, **kwargs) def _run_actions(self): """ Start actions run-loop. It clears all previous proxy errors and starts task run-loop. This launches all FSProxyAction prepared before by example. """ self.proxy_errors = MsgTree() task_self().set_default("stderr_msgtree", False) task_self().set_info('connect_timeout', Globals().get_ssh_connect_timeout()) task_self().resume() def _check_errors(self, expected_states, components=None, actions=None): """ This verifies that executed tasks were successfull. It verifies all provided components (Target, Clients, ...) have expected state. If not, it returns the most incoherent state. If there is no error, it returns the expected state. """ assert isinstance(expected_states, list) result = set() if actions and actions.status() == ACT_ERROR: result.add(TARGET_ERROR) # If a component list is provided, check that all components from it # have expected state. for comp in components or []: # This should never happen but it is convenient for debugging if # there is some uncatched bug somewhere. # (ie: cannot unpickle due to ClusterShell version mismatch) if comp.state is None: msg = "WARNING: no state report from node %s" % comp.server print(msg, file=sys.stderr) comp.state = RUNTIME_ERROR if comp.state not in expected_states: result.add(comp.state) # Compute component's server. # Although not the best place semantically speaking to perform this # task, update_server() is meaningful only when all the component # states have been filled, and here, we are sure it is the case. # So, waiting for a better solution, _check_errors() is the # best place to compute the component server. 
            if comp.update_server() is False:
                msg = "WARNING: %s is mounted multiple times" % comp.label
                self._handle_shine_proxy_error(str(comp.server.hostname), msg)

        # if the set is empty, add expected_states[0]
        if not result:
            result.add(expected_states[0])

        return result

    def _distant_action_by_server(self, action_class, servers, **kwargs):

        # filter local server
        distant_servers = Server.distant_servers(servers)

        # perform action on distant servers
        if len(distant_servers) > 0:
            action = action_class(nodes=distant_servers, fs=self, **kwargs)
            action.launch()
            self._run_actions()

            if action.status() == ACT_ERROR:
                err_code = None
                if task_self().num_timeout():
                    err_code = -1
                elif task_self().max_retcode():
                    err_code = task_self().max_retcode()

                # FSRemoteError is limited and cannot handle more than one
                # error
                msg, nodes = list(self.proxy_errors.walk())[0]
                nodes = NodeSet.fromlist(nodes)
                msg = str(msg).replace('THIS_SHINE_HOST', str(nodes))
                raise FSRemoteError(nodes, err_code, msg)

    def install(self, fs_config_file, servers=None, **kwargs):
        """
        Install the filesystem configuration file on its servers.

        The server list is built from enabled targets and enabled clients
        only.
        """
        # Get all possible servers
        servers = (servers or self.components.managed().allservers())
        self._distant_action_by_server(Install, servers,
                                       config_file=fs_config_file, **kwargs)

    def remove(self, servers=None, **kwargs):
        """
        Remove FS config files.
        """
        result = 0

        if servers is None:
            # Get all possible servers
            servers = self.components.managed().allservers()

        # filter local server
        distant_servers = Server.distant_servers(servers)

        # If the sizes differ, we have a local server in the list
        if len(distant_servers) < len(servers):
            # remove the local fs configuration file
            fs_file = os.path.join(Globals().get_conf_dir(),
                                   "%s.xmf" % self.fs_name)
            if os.path.exists(fs_file):
                self.hdlr.log('detail', msg='[DEL] %s' % fs_file)
                if kwargs.get('dryrun', False):
                    result = 0
                else:
                    result = os.remove(fs_file)

        if len(distant_servers) > 0:
            # Perform the remove operations on all targets for these nodes.
            self._proxy_action('remove', distant_servers, **kwargs).launch()

        # Run local actions and FSProxyAction
        self._run_actions()

        if len(self.proxy_errors) > 0:
            return RUNTIME_ERROR

        return result

    def _prepare(self, action, comps=None, groupby=None, reverse=False,
                 need_unload=False, tunings=None, allservers=False,
                 **kwargs):
        """
        Instantiate all actions for the component list and put them in a
        graph of ActionGroup().

        Actions could be local or proxy actions.
        The component list is filtered based on the action name.
        """
        graph = ActionGroup()

        comps = comps or self.components

        first_comps = None
        last_comps = None
        localsrv = None
        modules = set()
        localcomps = None

        if groupby:
            iterable = comps.groupby(attr=groupby, reverse=reverse)
        else:
            iterable = [(None, comps)]

        # Iterate over targets, grouping them by start order and server.
        for _order, comps in iterable:

            graph.add(ActionGroup())
            compgrp = ActionGroup()
            proxygrp = ActionGroup()

            for srv, comps in comps.groupbyserver(allservers=allservers):
                if srv.action_enabled is True:
                    if srv.is_local():
                        localsrv = srv
                        localcomps = comps
                        for comp in comps:
                            compgrp.add(getattr(comp, action)(**kwargs))
                    else:
                        act = self._proxy_action(action, srv.hostname,
                                                 comps, **kwargs)
                        if tunings and tunings.filename:
                            copy = Install(srv.hostname, self,
                                           tunings.filename, comps=comps,
                                           **kwargs)
                            act.depends_on(copy)
                            proxygrp.add(copy)
                        proxygrp.add(act)

            if len(compgrp) > 0:
                graph[-1].add(compgrp)
                # Keep track of the first comp group
                if first_comps is None:
                    first_comps = compgrp
                    first_comps.parent = graph[-1]
                # Keep track of the last comp group
                last_comps = compgrp
                last_comps.parent = graph[-1]

                # Build the module loading list, if needed
                for comp_action in compgrp:
                    modules.update(comp_action.needed_modules())

            if len(proxygrp) > 0:
                graph[-1].add(proxygrp)

        # Add module loading, if needed.
        if first_comps is not None and len(modules) > 0:
            modgrp = ActionGroup()
            for module in modules:
                modgrp.add(localsrv.load_modules(modname=module, **kwargs))

            # Serialize module loading actions
            modgrp.sequential()
            first_comps.parent.add(modgrp)
            first_comps.depends_on(modgrp)

        # Apply tunings to the last component group, if needed
        if tunings is not None and last_comps is not None:
            tune = localsrv.tune(tunings, localcomps, self.fs_name, **kwargs)
            last_comps.parent.add(tune)
            tune.depends_on(last_comps)

        # Add module unloading to the last component group, if needed.
        if need_unload and last_comps is not None:
            unload = localsrv.unload_modules(**kwargs)
            last_comps.parent.add(unload)
            unload.depends_on(last_comps)

        # Join the different parts together
        graph.sequential()

        return graph

    def format(self, comps=None, **kwargs):
        """Format filesystem targets."""
        comps = (comps or self.components).managed(supports='format')
        actions = self._prepare('format', comps, **kwargs)
        actions.launch()
        self._run_actions()

        # Check for errors and return OFFLINE or an error code
        return self._check_errors([OFFLINE], comps, actions)

    def tunefs(self, comps=None, **kwargs):
        """Modify component options set at format time."""
        comps = (comps or self.components).managed(supports='tunefs')
        actions = self._prepare('tunefs', comps, **kwargs)
        actions.launch()
        self._run_actions()

        # Check for errors and return OFFLINE or an error code
        return self._check_errors([OFFLINE], comps, actions)

    def fsck(self, comps=None, **kwargs):
        """Check component filesystem coherency."""
        comps = (comps or self.components).managed(supports='fsck')
        actions = self._prepare('fsck', comps, **kwargs)
        actions.launch()
        self._run_actions()

        # Check for errors and return OFFLINE or an error code
        return self._check_errors([OFFLINE], comps, actions)

    def status(self, comps=None, **kwargs):
        """Get the status of the filesystem."""
        comps = (comps or self.components).managed(supports='status')
        actions = self._prepare('status', comps, allservers=True, **kwargs)
        actions.launch()
        self._run_actions()

        # Here we check MOUNTED but, in fact, any status is OK.
        return self._check_errors([MOUNTED], comps)

    def start(self, comps=None, **kwargs):
        """Start Lustre file system servers."""
        comps = (comps or self.components).managed(supports='start')

        # Determine which starting order to use.
        key = lambda t: t.TYPE == MDT.TYPE
        mdt_comps = comps.filter(key=key)
        if mdt_comps:
            # Found enabled MDT(s): perform writeconf check.
            self.status(comps=mdt_comps)
            for target in mdt_comps:
                if target.has_first_time_flag() or \
                   target.has_writeconf_flag():
                    MDT.START_ORDER, OST.START_ORDER = \
                        OST.START_ORDER, MDT.START_ORDER
                    break

        actions = self._prepare('start', comps, groupby='START_ORDER',
                                **kwargs)
        actions.launch()
        self._run_actions()

        return self._check_errors([MOUNTED, RECOVERING], comps, actions)

    def stop(self, comps=None, **kwargs):
        """Stop the file system."""
        comps = (comps or self.components).managed(supports='stop')
        actions = self._prepare('stop', comps, groupby='START_ORDER',
                                reverse=True, need_unload=True, **kwargs)
        actions.launch()
        self._run_actions()

        return self._check_errors([OFFLINE], comps)

    def mount(self, comps=None, **kwargs):
        """Mount FS clients."""
        comps = (comps or self.components).managed(supports='mount')
        actions = self._prepare('mount', comps, **kwargs)
        actions.launch()
        self._run_actions()

        # Ok, workers have completed, perform late status check...
        return self._check_errors([MOUNTED], comps, actions)

    def umount(self, comps=None, **kwargs):
        """Unmount FS clients."""
        comps = (comps or self.components).managed(supports='umount')
        actions = self._prepare('umount', comps, need_unload=True, **kwargs)
        actions.launch()
        self._run_actions()

        # Ok, workers have completed, perform late status check...
        return self._check_errors([OFFLINE], comps)

    def execute(self, comps=None, **kwargs):
        """Execute a custom command."""
        comps = (comps or self.components).managed(supports='execute')
        actions = self._prepare('execute', comps, **kwargs)
        actions.launch()
        self._run_actions()

        # Here we check MOUNTED but, in fact, any status is OK.
        # XXX: Is it ok to check MOUNTED here?
        return self._check_errors([MOUNTED], comps, actions)

    def tune(self, tuning_model, comps=None, **kwargs):
        """Tune servers."""
        comps = (comps or self.components).managed()
        actions = ActionGroup()

        for server, srvcomps in comps.groupbyserver():
            if server.is_local():
                actions.add(server.tune(tuning_model, srvcomps,
                                        self.fs_name, **kwargs))
            else:
                act = self._proxy_action('tune', server.hostname, srvcomps,
                                         **kwargs)
                if tuning_model.filename:
                    copy = Install(server.hostname, self,
                                   tuning_model.filename, comps=srvcomps,
                                   **kwargs)
                    act.depends_on(copy)
                    actions.add(copy)
                actions.add(act)

        # Run local actions and FSProxyAction
        actions.launch()
        self._run_actions()

        # Check action status and return MOUNTED if there is no error
        return self._check_errors([MOUNTED], None, actions)
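# A minimal end-to-end sketch (illustrative only) of how the FileSystem API
# above fits together. The server name, NID, devices and mount path are
# made-up values, and it assumes the usual target type keys ('mgt', 'mdt',
# 'ost') are the MGT.TYPE, MDT.TYPE and OST.TYPE constants; running it for
# real requires a working Shine/Lustre setup.
if __name__ == '__main__':
    demo_fs = FileSystem('testfs')
    demo_srv = Server('node1', ['node1@tcp'])
    demo_fs.new_target(demo_srv, 'mgt', 0, '/dev/sda')
    demo_fs.new_target(demo_srv, 'mdt', 0, '/dev/sdb')
    demo_fs.new_target(demo_srv, 'ost', 0, '/dev/sdc')
    demo_fs.new_client(demo_srv, '/mnt/testfs')
    # start() returns the set of resulting states: MOUNTED (or RECOVERING)
    # on success, error states otherwise.
    states = demo_fs.start()
    print("start states: %s" % states)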