def tearDown(self):
    from chroma_api.authentication import CsrfAuthentication
    CsrfAuthentication.is_authenticated = self.old_is_authenticated  # Restore

    from chroma_core.services.job_scheduler import job_scheduler_client
    job_scheduler_client.JobSchedulerClient.available_transitions = self.old_available_transitions
    job_scheduler_client.JobSchedulerClient.available_jobs = self.old_available_jobs

    ObjectCache.clear()
def create_host_ssh(address, server_profile, root_pw, pkey, pkey_pw):
    host_data = AgentRpc.mock_servers[address]
    host = synthetic_host(address,
                          nids=host_data['nids'],
                          fqdn=host_data['fqdn'],
                          nodename=host_data['nodename'])
    ObjectCache.add(ManagedHost, host)
    command = Command.objects.create(complete=True, message="Mock create_host_ssh")
    return host, command
def learn_target_mounts(self):
    for host, host_data in self.all_hosts_data.items():
        # We will compare any found target mounts to all known MGSs
        for local_info in host_data["local_targets"]:
            debug_id = (host, local_info["device_paths"][0], local_info["name"])

            targets = ManagedTarget.objects.filter(uuid=local_info["uuid"])
            if not targets.count():
                log.warning("Ignoring %s:%s (%s), target unknown" % debug_id)
                continue

            for target in targets:
                if isinstance(target, FilesystemMember):
                    try:
                        mgs = self._target_find_mgs(host, local_info)
                    except ManagedMgs.DoesNotExist:
                        log.warning("Can't find MGS for target %s:%s (%s)" % debug_id)
                        continue
                else:
                    mgs = None

                if not self.target_available_here(host, mgs, local_info):
                    log.warning("Ignoring %s on %s, as it is not mountable on this host" % (local_info["name"], host))
                    continue

                try:
                    log.info("Target %s seen on %s" % (target, host))
                    volumenode = self._get_volume_node(host, local_info["device_paths"])
                    (tm, created) = ManagedTargetMount.objects.get_or_create(target=target,
                                                                             host=host,
                                                                             volume_node=volumenode)
                    if created:
                        tm.immutable_state = True
                        tm.save()
                        log.info("Learned association %d between %s and host %s" % (tm.id, local_info["name"], host))
                        self._learn_event(host, tm)
                        ObjectCache.add(ManagedTargetMount, tm)

                    if local_info["mounted"]:
                        target.state = "mounted"
                        target.active_mount = tm
                        target.save()
                except NoNidsPresent:
                    log.warning("Cannot set up target %s on %s until LNet is running" % (local_info["name"], host))
def synthetic_host(address=None, nids=list([]), storage_resource=False, fqdn=None, nodename=None, server_profile='test_profile'):
    """
    Create a ManagedHost + paraphernalia, with states set as if configuration happened successfully

    :param storage_resource: If true, create a PluginAgentResources (additional overhead, only sometimes required)
    """
    server_profile = ServerProfile.objects.get(name=server_profile)

    if address is None:
        address = random_str(postfix=".tld")
    if fqdn is None:
        fqdn = address
    if nodename is None:
        nodename = address

    host = ManagedHost.objects.create(
        address=address,
        fqdn=fqdn,
        nodename=nodename,
        state='managed',
        server_profile=server_profile,
        immutable_state=not server_profile.managed if server_profile else False)

    ObjectCache.add(ManagedHost, host)

    lnet_configuration = synthetic_lnet_configuration(host, nids)

    if server_profile.managed:
        synthetic_rsyslog_configuration(host)
        synthetic_ntp_configuration(host)
        synthetic_corosync_configuration(host)
        synthetic_pacemaker_configuration(host)

    log.debug("synthetic_host: %s %s" % (address, lnet_configuration.get_nids()))

    if storage_resource:
        from chroma_core.lib.storage_plugin.manager import storage_plugin_manager
        resource_class, resource_class_id = storage_plugin_manager.get_plugin_resource_class('linux', 'PluginAgentResources')
        StorageResourceRecord.get_or_create_root(resource_class, resource_class_id, {'plugin_name': 'linux', 'host_id': host.id})

    return host
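# A minimal usage sketch for synthetic_host (an assumption, not taken from the
# original source: it presumes the 'test_profile' ServerProfile named by the
# default argument exists in the test database):
#
#     host = synthetic_host(address='example-server')    # fqdn/nodename default to the address
#     assert host.state == 'managed'
#     assert ObjectCache.get_one(ManagedHost, lambda mh: mh.id == host.id).fqdn == 'example-server'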
def setUp(self):
    super(TestTargetTransitions, self).setUp()

    self.mgt, mgt_tms = ManagedMgs.create_for_volume(self._test_lun(self.host).id, name='MGS')
    ObjectCache.add(ManagedTarget, self.mgt.managedtarget_ptr)
    for tm in mgt_tms:
        ObjectCache.add(ManagedTargetMount, tm)
    self.assertEqual(ManagedMgs.objects.get(pk=self.mgt.pk).state, 'unformatted')
def get_steps(self):
    host = ObjectCache.get_one(ManagedHost, lambda mh: mh.id == self.lustre_client_mount.host_id)

    from chroma_core.models.filesystem import ManagedFilesystem
    filesystem = ObjectCache.get_one(ManagedFilesystem, lambda mf: mf.id == self.lustre_client_mount.filesystem_id)

    args = dict(host=host,
                filesystems=[(filesystem.mount_path(), self.lustre_client_mount.mountpoint)])
    return [(UnmountLustreFilesystemsStep, args)]
def get_steps(self):
    host = ObjectCache.get_one(ManagedHost, lambda mh: mh.id == self.lustre_client_mount.host_id)
    filesystem = ObjectCache.get_one(ManagedFilesystem, lambda mf: mf.name == self.lustre_client_mount.filesystem)

    args = dict(host=host,
                filesystems=[(filesystem.mount_path(), self.lustre_client_mount.mountpoints)])
    return [(UnmountLustreFilesystemsStep, args)]
def filter_by_target(cls, target):
    if issubclass(target.downcast_class, ManagedMgs):
        result = ObjectCache.get(ManagedFilesystem, lambda mfs: mfs.mgs_id == target.id)
        return result
    elif issubclass(target.downcast_class, FilesystemMember):
        return ObjectCache.get(ManagedFilesystem, lambda mfs: mfs.id == target.downcast().filesystem_id)
    else:
        raise NotImplementedError(target.__class__)
def get_deps(self):
    search = lambda ct: ct.host == self.copytool.host
    copytools = ObjectCache.get(Copytool, search)

    # Only force an unmount if this is the only copytool associated
    # with the host.
    if len(copytools) == 1:
        search = lambda cm: cm.id == self.copytool.client_mount_id
        client_mount = ObjectCache.get_one(LustreClientMount, search)
        return DependOn(client_mount, "unmounted")
    else:
        return DependAll()
def _measure_scaling(self, create_n, measured_resource, scaled_resource=None):
    """
    :param create_n: Function to create N of scaled_resource
    :param measured_resource: The resource we will measure the query load for
    :param scaled_resource: The object which is actually being scaled with N
    :return: Instance of Order1, OrderN, OrderBad
    """
    if scaled_resource is None:
        scaled_resource = measured_resource

    query_counts = {}
    samples = [5, 6, 7, 8]

    for n in samples:
        ObjectCache.clear()
        create_n(n)

        # Queries get reset at the start of a request
        self.assertEqual(scaled_resource._meta.queryset.count(), n)

        with CaptureQueriesContext(connection) as queries:
            response = self.api_client.get("/api/%s/" % measured_resource._meta.resource_name, data={"limit": 0})
            self.assertEqual(response.status_code, 200, "%s:%s" % (response.content, measured_resource._meta.resource_name))
            query_count = len(queries)

        self.assertEqual(len(self.deserialize(response)["objects"]), measured_resource._meta.queryset.count())
        query_counts[n] = query_count

    # Ignore samples[0], it was just to clear out any setup overhead from first call to API

    # gradient between samples[1] and samples[2]
    grad1 = (query_counts[samples[2]] - query_counts[samples[1]]) / (samples[2] - samples[1])
    # gradient between samples[2] and samples[3]
    grad2 = (query_counts[samples[3]] - query_counts[samples[2]]) / (samples[3] - samples[2])

    if grad1 == 0 and grad2 == 0:
        # Hoorah, O(1)
        return Order1(query_counts[samples[3]])
    elif grad1 > 0 and grad1 == grad2:
        # O(N)
        return OrderN(grad1)
    else:
        # Worse than O(N)
        return OrderBad()
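# Hypothetical invocation of _measure_scaling (the resource name below is
# illustrative, not from the test suite): create_n must build exactly N rows of
# the scaled resource, or the count assertion inside the sampling loop fails.
#
#     result = self._measure_scaling(lambda n: [synthetic_host() for _ in range(n)],
#                                    HostResource)     # HostResource: an assumed API resource class
#     self.assertIsInstance(result, Order1)            # e.g. asserting O(1) query count for listings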
class LustreClientMount(DeletableStatefulObject):
    host = models.ForeignKey('ManagedHost', help_text="Mount host", related_name="client_mounts")
    filesystem = models.ForeignKey('ManagedFilesystem', help_text="Mounted filesystem")
    mountpoint = models.CharField(max_length=CHARFIELD_MAX_LENGTH,
                                  help_text="Filesystem mountpoint on host",
                                  null=True,
                                  blank=True)

    states = ['unmounted', 'mounted', 'removed']
    initial_state = 'unmounted'

    def __str__(self):
        return self.get_label()

    @property
    def active(self):
        return self.state == 'mounted'

    def get_label(self):
        return "%s:%s (%s)" % (self.host, self.mountpoint, self.state)

    def get_deps(self, state=None):
        if not state:
            state = self.state

        deps = []
        if state == 'mounted':
            # Depend on this mount's host having LNet up. If LNet is stopped
            # on the host, this filesystem will be unmounted first.
            deps.append(DependOn(self.host.lnet_configuration, 'lnet_up', fix_state='unmounted'))

        if state != 'removed':
            # Depend on the fs being available.
            deps.append(DependOn(self.filesystem, 'available', fix_state='unmounted'))

            # But if either the host or the filesystem are removed, the
            # mount should follow.
            deps.append(DependOn(self.host,
                                 'lnet_up',
                                 acceptable_states=list(set(self.host.states) - set(['removed', 'forgotten'])),
                                 fix_state='removed'))
            deps.append(DependOn(self.filesystem,
                                 'available',
                                 acceptable_states=list(set(self.filesystem.states) - set(['removed', 'forgotten'])),
                                 fix_state='removed'))

        return DependAll(deps)

    reverse_deps = {
        'ManagedHost': lambda mh: ObjectCache.host_client_mounts(mh.id),
        'LNetConfiguration': lambda lc: ObjectCache.host_client_mounts(lc.host.id),
        'ManagedFilesystem': lambda mf: ObjectCache.filesystem_client_mounts(mf.id)
    }

    class Meta:
        app_label = 'chroma_core'
        unique_together = ('host', 'filesystem')
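# Sketch of how the reverse_deps table above is consulted (assumption: the
# ObjectCache helpers named in those lambdas return the cached LustreClientMount
# rows for the given host/filesystem id, as their names imply):
#
#     for mount in ObjectCache.host_client_mounts(host.id):
#         print(mount.get_label())    # e.g. "myhost:/mnt/testfs (mounted)"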
def create_fake_filesystem_client(self, active=False):
    from chroma_core.models import ManagedMgs, ManagedMdt, ManagedOst, ManagedFilesystem, LustreClientMount
    from tests.unit.chroma_core.helpers import synthetic_volume_full

    mgt, _ = ManagedMgs.create_for_volume(synthetic_volume_full(self.server).id, name="MGS")
    fs = ManagedFilesystem.objects.create(mgs=mgt, name='testfs')
    ObjectCache.add(ManagedFilesystem, fs)
    ManagedMdt.create_for_volume(synthetic_volume_full(self.server).id, filesystem=fs)
    ManagedOst.create_for_volume(synthetic_volume_full(self.server).id, filesystem=fs)

    state = 'mounted' if active else 'unmounted'
    self.mount = LustreClientMount.objects.create(host=self.worker, filesystem=fs, state=state)

    ObjectCache.add(LustreClientMount, self.mount)
def test_managed_host_undeployed(self):
    """Test that an undeployed host can only be force removed"""
    self.host.state = "undeployed"
    self.host.save()
    ObjectCache.update(self.host)

    expected_job_classes = ["ForceRemoveHostJob"]
    received_job_classes = [job["class_name"] for job in self._get_jobs(self.host)]
    self.assertEqual(set(received_job_classes), set(expected_job_classes))
def setUp(self):
    super(TestSharedTarget, self).setUp()

    self.mgt, tms = ManagedMgs.create_for_volume(
        self._test_lun(ManagedHost.objects.get(address='pair1'),
                       secondary_hosts=[ManagedHost.objects.get(address='pair2')]).id,
        name="MGS")
    ObjectCache.add(ManagedTarget, self.mgt.managedtarget_ptr)
    for tm in tms:
        ObjectCache.add(ManagedTargetMount, tm)
    self.assertEqual(ManagedMgs.objects.get(pk=self.mgt.pk).state, 'unformatted')
def learn_primary_target(self, managed_target):
    primary_target = None

    managed_target.managedtargetmount_set.update(primary=False)

    for tm in managed_target.managedtargetmount_set.all():
        # We may well have scanned only a subset of the hosts and so not have data for all
        # the target mounts; if we are rescanning we can know about targetmounts we didn't scan.
        if tm.host not in self.all_hosts_data:
            continue

        try:
            target_info = next(dev for dev in self.all_hosts_data[tm.host]["local_targets"] if dev["uuid"] == managed_target.uuid)
        except StopIteration:
            # LV not in all_hosts_data
            continue

        local_nids = set(tm.host.lnet_configuration.get_nids())

        if not local_nids:
            raise NoNidsPresent("Host %s has no NIDS!" % tm.host)

        if "failover.node" in target_info["params"]:
            failover_nids = set(normalize_nid(n)
                                for nids in target_info["params"]["failover.node"]
                                for n in nids.split(","))

            if not (local_nids & failover_nids):
                # The local NIDs do not appear among the failover NIDs, so this target was
                # created with 'mkfs.lustre --failnode' and this mount is the primary;
                # there is no other way for a different primary to be defined.
                primary_target = tm
                break
            elif target_info["mounted"]:
                # The target was created with 'mkfs.lustre --servicenode'. If it is mounted
                # here, use this mount as primary until a better candidate is found.
                primary_target = tm
        else:
            # If there are no failover nids then this must be the primary.
            primary_target = tm
            break

    if primary_target is not None:
        log.info("Target %s has been set to primary" % primary_target)
        primary_target.primary = True
        primary_target.save()
        ObjectCache.update(primary_target)

    return primary_target
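# The failover-NID test above in miniature (NID values are made up): when the mount
# host's local NIDs are disjoint from the target's failover.node NIDs, the target
# was formatted with 'mkfs.lustre --failnode' and this mount must be the primary.
#
#     local_nids = {"10.0.0.1@tcp"}
#     failover_nids = {"10.0.0.2@tcp", "10.0.0.3@tcp"}
#     assert not (local_nids & failover_nids)    # disjoint => primary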
def test_failing_job(self):
    mgt, tms = ManagedMgs.create_for_volume(self._test_lun(self.host).id, name="MGS")
    ObjectCache.add(ManagedTarget, mgt.managedtarget_ptr)
    for tm in tms:
        ObjectCache.add(ManagedTargetMount, tm)

    try:
        MockAgentRpc.succeed = False
        # This is to check that the scheduler doesn't run past the failed job (like in HYD-1572)
        self.set_and_assert_state(mgt.managedtarget_ptr, 'mounted', check=False)
        mgt = self.assertState(mgt, 'unformatted')
    finally:
        MockAgentRpc.succeed = True

    mgt.managedtarget_ptr = self.set_and_assert_state(mgt.managedtarget_ptr, 'mounted')
def synthetic_lnet_configuration(host, nids):
    lnet_configuration, _ = LNetConfiguration.objects.get_or_create(host=host)
    ObjectCache.add(LNetConfiguration, lnet_configuration)

    # Now delete any existing nids as we will recreate them if some have been requested.
    Nid.objects.filter(lnet_configuration=lnet_configuration).delete()

    if nids:
        assert type(nids[0]) == Nid.Nid

        lnet_configuration.state = "lnet_up"

        interface_no = 0
        for nid in nids:
            try:
                network_interface = NetworkInterface.objects.get(host=host,
                                                                 name="eth%s" % interface_no,
                                                                 type=nid.lnd_type)
                network_interface.inet4_address = nid.nid_address
                network_interface.inet4_prefix = 24
                network_interface.state_up = True
            except NetworkInterface.DoesNotExist:
                network_interface = NetworkInterface.objects.create(
                    host=host,
                    name="eth%s" % interface_no,
                    type=nid.lnd_type,
                    inet4_address=nid.nid_address,
                    inet4_prefix=24,
                    state_up=True,
                )

            network_interface.save()

            nid_record = Nid.objects.create(
                lnet_configuration=lnet_configuration,
                network_interface=network_interface,
                lnd_network=nid.lnd_network,
                lnd_type=nid.lnd_type,
            )
            nid_record.save()

            interface_no += 1
    else:
        lnet_configuration.state = "lnet_unloaded"

    lnet_configuration.save()

    return lnet_configuration
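# Usage sketch for synthetic_lnet_configuration (assumption: Nid.Nid is a simple
# value type exposing nid_address, lnd_type and lnd_network, which is all the
# code above relies on; the constructor arguments shown are illustrative):
#
#     nids = [Nid.Nid("10.0.0.1", "tcp", 0)]
#     lnet_configuration = synthetic_lnet_configuration(host, nids)
#     assert lnet_configuration.state == "lnet_up"    # "lnet_unloaded" when nids is empty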
def get_steps(self):
    host = ObjectCache.get_one(ManagedHost, lambda mh: mh.id == self.lustre_client_mount.host_id)

    mountpoint = (self.lustre_client_mount.mountpoints[0]
                  if self.lustre_client_mount.mountpoints
                  else "/mnt/{}".format(self.lustre_client_mount.filesystem))

    filesystem = ObjectCache.get_one(ManagedFilesystem, lambda mf: mf.name == self.lustre_client_mount.filesystem)

    args = {"host": host, "filesystems": [(filesystem.mount_path(), mountpoint)]}
    return [(MountLustreFilesystemsStep, args)]
def get_steps(self):
    search = lambda cm: (cm.host == self.host and cm.state == "mounted")
    mounted = ObjectCache.get(LustreClientMount, search)
    args = {
        "host": self.host,
        "filesystems": [
            (ObjectCache.get_one(ManagedFilesystem, lambda mf, mtd=m: mf.name == mtd.filesystem).mount_path(),
             m.mountpoints)
            for m in mounted
        ],
    }
    return [(UnmountLustreFilesystemsStep, args)]
def get_steps(self):
    search = lambda cm: (cm.host == self.host and cm.state == 'mounted')
    mounted = ObjectCache.get(LustreClientMount, search)
    args = dict(host=self.host,
                filesystems=[(m.filesystem.mount_path(), m.mountpoint) for m in mounted])
    return [(UnmountLustreFilesystemsStep, args)]
def get_deps(self, state=None):
    if not state:
        state = self.state

    client_mount = ObjectCache.get_one(LustreClientMount, lambda cm: cm.id == self.client_mount_id)

    deps = []
    if state == "started":
        # Depend on the client mount being mounted in order to
        # start or stay running.
        deps.append(DependOn(client_mount, "mounted", fix_state="stopped"))

    if state != "removed":
        # If the client mount is going to be removed, then the
        # copytool should also be removed.
        deps.append(
            DependOn(
                client_mount,
                "mounted",
                acceptable_states=list(set(self.client_mount.states) - set(["removed"])),
                fix_state="removed",
            )
        )

    return DependAll(deps)
def get_steps(self):
    steps = []

    mgs_target = ObjectCache.get_one(ManagedTarget, lambda t: t.id == self.filesystem.mgs_id)

    # Only try to purge filesystem from MGT if the MGT has made it past
    # being formatted (case where a filesystem was created but is being
    # removed before it or its MGT got off the ground)
    if mgs_target.state in ['unformatted', 'formatted']:
        return steps

    # Don't purge immutable filesystems. (Although how this gets called in that case is beyond me)
    if self.filesystem.immutable_state:
        return steps

    # MGS needs to be started
    if not mgs_target.active_mount:
        raise RuntimeError("MGT needs to be running in order to remove the filesystem.")

    steps.append((PurgeFilesystemStep, {
        'filesystem': self.filesystem,
        'mgs_device_path': mgs_target.active_mount.volume_node.path,
        'mgs_device_type': mgs_target.active_mount.volume_node.volume.storage_resource.to_resource_class().device_type(),
        'host': mgs_target.active_mount.host
    }))

    return steps
def get_deps(self):
    return DependOn(
        ObjectCache.get_one(ManagedHost, lambda mh: mh.id == self.lustre_client_mount.host_id).lnet_configuration,
        "lnet_up",
    )
def dehydrate_client_mounts(self, bundle):
    from chroma_core.lib.cache import ObjectCache
    from chroma_core.models import LustreClientMount

    search = lambda cm: cm.host == bundle.obj
    mounts = ObjectCache.get(LustreClientMount, search)
    return [{'filesystem_name': mount.filesystem.name,
             'mountpoint': mount.mountpoint,
             'state': mount.state} for mount in mounts]
def get_steps(self):
    from chroma_core.models import ServerProfile

    profiles = [(p.name, list(p.serverprofilevalidation_set.values())) for p in ObjectCache.get(ServerProfile)]

    return [
        (
            TestHostConnectionStep,
            {"address": self.address, "credentials_key": self.credentials_key, "profiles": profiles},
        )
    ]
def get_deps(self):
    deps = []

    mgs_target = ObjectCache.get_one(ManagedTarget, lambda t: t.id == self.filesystem.mgs_id)

    # Can't start a MGT that hasn't made it past formatting.
    if mgs_target.state not in ["unformatted", "formatted"]:
        deps.append(DependOn(mgs_target, "mounted", fix_state="unavailable"))

    return DependAll(deps)
def get_deps(self):
    if issubclass(self.target.downcast_class, ManagedMgs):
        ticket = self.target.downcast().get_ticket()
        if ticket:
            return DependAll(DependOn(ticket, "granted", fix_state="unmounted"))

    if self.target.downcast_class in [ManagedMdt, ManagedOst]:
        from chroma_core.models import FilesystemTicket

        target = self.target.downcast()

        ticket = FilesystemTicket.objects.filter(filesystem=target.filesystem_id).first()

        if ticket:
            return DependAll(DependOn(ticket.ticket, "granted", fix_state="unmounted"))

    deps = []

    # Depend on at least one targetmount having lnet up
    for host in self.target.hosts:
        from chroma_core.models import LNetConfiguration

        lnet_configuration = ObjectCache.get_one(LNetConfiguration, lambda l: l.host_id == host.id)
        deps.append(DependOn(lnet_configuration, "lnet_up", fix_state="unmounted"))

        try:
            pacemaker_configuration = ObjectCache.get_one(PacemakerConfiguration, lambda pm: pm.host_id == host.id)
            deps.append(DependOn(pacemaker_configuration, "started", fix_state="unmounted"))
        except PacemakerConfiguration.DoesNotExist:
            pass

    return DependAny(deps)
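# Combinator semantics as used above (inferred from usage in this section, not a
# documented contract): DependAll is satisfied only when every child dependency
# holds, while DependAny needs just one — so here a target can mount as long as
# at least one of its hosts has lnet_up (and, where configured, pacemaker started).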
def test_two_concurrent_removes(self):
    """
    Test that we can concurrently remove two filesystems which depend on the same mgt
    """
    fs2 = ManagedFilesystem.objects.create(mgs=self.mgt, name="testfs2")
    ObjectCache.add(ManagedFilesystem, fs2)
    mdt2, mdt_tms = ManagedMdt.create_for_volume(self._test_lun(self.host).id, filesystem=fs2)
    ost2, ost_tms = ManagedOst.create_for_volume(self._test_lun(self.host).id, filesystem=fs2)
    for target in [mdt2, ost2]:
        ObjectCache.add(ManagedTarget, target.managedtarget_ptr)
    for tm in chain(mdt_tms, ost_tms):
        ObjectCache.add(ManagedTargetMount, tm)

    self.fs = self.set_and_assert_state(self.fs, "available")
    fs2 = self.set_and_assert_state(fs2, "available")

    self.set_state_delayed([(self.fs, "removed")])
    self.set_state_delayed([(fs2, "removed")])

    self.set_state_complete()

    with self.assertRaises(ManagedFilesystem.DoesNotExist):
        ManagedFilesystem.objects.get(pk=self.fs.pk)

    with self.assertRaises(ManagedFilesystem.DoesNotExist):
        ManagedFilesystem.objects.get(pk=fs2.pk)
def can_run(cls, host):
    if not host.is_worker:
        return False

    search = lambda cm: (cm.host == host and cm.state == 'mounted')
    mounted = ObjectCache.get(LustreClientMount, search)
    return (host.state not in ['removed', 'undeployed', 'unconfigured']
            and len(mounted) > 0
            and not AlertState.filter_by_item(host).filter(
                active=True,
                alert_type__in=[HostOfflineAlert.__name__, HostContactAlert.__name__]).exists())
def get_deps(self):
    deps = []

    targets = ObjectCache.get_targets_by_filesystem(self.filesystem_id)
    targets = [t for t in targets if not issubclass(t.downcast_class, ManagedMgs)]
    for t in targets:
        deps.append(DependOn(t,
                             "unmounted",
                             acceptable_states=t.not_state("mounted"),
                             fix_state="unavailable"))

    return DependAll(deps)