class Copytool(StatefulObject, MeasuredEntity):
    """An HSM copytool instance on a worker host, tracked as a stateful object.

    A copytool is identified by the combination of host, binary path,
    filesystem, archive number and instance index (see ``Meta.unique_together``).
    Its lifecycle moves through ``states``; ``register``/``unregister`` move it
    between "started" and "stopped" as running instances come and go.
    """

    __metaclass__ = DeletableDowncastableMetaclass

    # Fixed, minimum size (RH6.5) for HYD-3244, so that no matter what
    # ulimit -s size, and hence os.sysconf('SC_ARG_MAX') is always viable.
    HSM_ARGUMENT_MAX_SIZE_FOR_COPYTOOL = 131072  # characters

    host = models.ForeignKey("ManagedHost", related_name="copytools")
    index = models.IntegerField(
        default=0, help_text="Instance index, used to uniquely identify per-host path-filesystem-archive instances"
    )
    bin_path = models.CharField(max_length=CHARFIELD_MAX_LENGTH, help_text="Path to copytool binary on HSM worker node")
    archive = models.IntegerField(default=1, help_text="HSM archive number")
    filesystem = models.ForeignKey("ManagedFilesystem")
    mountpoint = models.CharField(
        max_length=CHARFIELD_MAX_LENGTH, help_text="Lustre mountpoint on HSM worker node", default="/mnt/lustre"
    )
    hsm_arguments = models.CharField(
        max_length=HSM_ARGUMENT_MAX_SIZE_FOR_COPYTOOL,
        help_text="Copytool arguments that are specific to the HSM implementation",
    )
    # max_length is the length of a stringified UUID4. The right-hand side is
    # evaluated before the class attribute ``uuid`` is bound, so ``uuid`` here
    # still refers to the module.
    uuid = models.CharField(
        max_length=len("%s" % uuid.uuid4()), null=True, blank=True, help_text="UUID as assigned by cdt"
    )
    pid = models.IntegerField(null=True, blank=True, help_text="Current PID, if known")
    client_mount = models.ForeignKey("LustreClientMount", null=True, blank=True, related_name="copytools")

    states = ["unconfigured", "stopped", "started", "removed"]
    initial_state = "unconfigured"

    def __str__(self):
        return self.get_label()

    def get_label(self):
        """Return "<binary basename>-<filesystem name>-<archive>-<index>"."""
        return "%s-%s-%s-%s" % (os.path.basename(self.bin_path), self.filesystem.name, self.archive, self.index)

    def register(self, uuid):
        """Record a (re-)registration under ``uuid`` and mark the copytool started.

        :param uuid: UUID of the registering copytool instance.
        """
        if self.uuid and uuid != self.uuid:
            # If this is a re-registration with a new uuid (i.e. a new
            # running copytool instance), then we need to cancel all
            # outstanding actions in the UI for the old instance. The actions
            # have already been canceled on the coordinator, so this is just
            # to keep the UI in sync with reality.
            log.warn("Canceling stale operations for %s" % self)
            self.cancel_current_operations()

        self.uuid = uuid
        self.set_state("started")
        self.save()

    def unregister(self):
        """Clear the stored uuid, mark the copytool stopped and save."""
        self.uuid = None
        self.set_state("stopped")
        self.save()

    def create_operation(self, start_time, type, path, fid):
        """Create a STARTED operation for this copytool.

        :param start_time: stored as the operation's ``started_at``.
        :param type: operation type, translated through ``resolve_value``.
        :param path: stored as the operation's ``path``.
        :param fid: stored as the operation's ``fid``.
        :return: The created operation, or None if ``type`` could not be
                 resolved (an error is logged in that case).
        """
        # Only the type lookup is guarded: a KeyError raised while creating
        # the row must not be misreported as an unknown operation type.
        try:
            operation_type = resolve_value("type", type)
        except KeyError:
            log.error("Unknown operation type: %s" % type)
            return None

        return self.operations.create(
            started_at=start_time, state=OP_STATES.STARTED, type=operation_type, path=path, fid=fid
        )

    @property
    def current_operations(self):
        """Operations whose state is neither FINISHED nor ERRORED."""
        return self.operations.exclude(state__in=[OP_STATES.FINISHED, OP_STATES.ERRORED])

    def cancel_current_operations(self):
        """Call ``finish()`` on every current operation, logging each one."""
        for operation in self.current_operations:
            log.warn("Canceling operation: %s" % operation)
            operation.finish()

    def get_deps(self, state=None):
        """Build the dependencies of this copytool on its client mount.

        :param state: state to compute dependencies for; defaults to the
                      current ``self.state`` when falsy.
        :return: A ``DependAll`` wrapping the collected ``DependOn`` entries.
        """
        if not state:
            state = self.state

        client_mount = ObjectCache.get_one(LustreClientMount, lambda cm: cm.id == self.client_mount_id)

        deps = []
        if state == "started":
            # Depend on the client mount being mounted in order to
            # start or stay running.
            deps.append(DependOn(client_mount, "mounted", fix_state="stopped"))

        if state != "removed":
            # If the client mount is going to be removed, then the
            # copytool should also be removed.
            # ``states`` is read from the cached object already in hand rather
            # than through ``self.client_mount``, which would go through the
            # foreign key just to reach a class-level list.
            deps.append(
                DependOn(
                    client_mount,
                    "mounted",
                    acceptable_states=list(set(client_mount.states) - {"removed"}),
                    fix_state="removed",
                )
            )

        return DependAll(deps)

    reverse_deps = {"LustreClientMount": lambda cm: ObjectCache.client_mount_copytools(cm.id)}

    class Meta:
        app_label = "chroma_core"
        unique_together = ("host", "bin_path", "filesystem", "archive", "index")
        ordering = ["id"]
class ManagedTarget(StatefulObject):
    """Base model for a Lustre target, tracked as a stateful object.

    Host information (``host_ids``, ``active_host_id``) is not stored on the
    model; it is looked up by target name through ``get_target_by_name``.
    Concrete target kinds are subclasses that implement ``target_type``.
    """

    __metaclass__ = DeletableDowncastableMetaclass

    # NOTE(review): some help_text strings below were corrected (continuation
    # whitespace embedded by backslash line-continuations, and a missing space
    # in the inode_count text). Django records help_text in migrations, so a
    # schema no-op migration may be needed to keep makemigrations clean - confirm.
    name = models.CharField(
        max_length=64,
        null=True,
        blank=True,
        help_text="Lustre target name, e.g. 'testfs-OST0001'. May be null "
        "if the target has not yet been registered.",
    )

    uuid = models.CharField(
        max_length=64,
        null=True,
        blank=True,
        help_text="UUID of the target's internal file system. May be null "
        "if the target has not yet been formatted",
    )

    ha_label = models.CharField(
        max_length=64, null=True, blank=True, help_text="Label used for HA layer; human readable but unique"
    )

    inode_size = models.IntegerField(null=True, blank=True, help_text="Size in bytes per inode")
    bytes_per_inode = models.IntegerField(
        null=True,
        blank=True,
        help_text="Constant used during formatting to "
        "determine inode count by dividing the volume size by ``bytes_per_inode``",
    )
    inode_count = models.BigIntegerField(
        null=True, blank=True, help_text="The number of inodes in this target's backing store"
    )

    reformat = models.BooleanField(
        default=False,
        help_text="Only used during formatting, indicates that when formatting this target "
        "any existing filesystem on the Volume should be overwritten",
    )

    def get_param(self, key):
        """Return the values of all target params stored under ``key``."""
        return [param.value for param in self.targetparam_set.filter(key=key)]

    def get_params(self):
        """Return every target param as a ``(key, value)`` tuple."""
        return [(param.key, param.value) for param in self.targetparam_set.all()]

    @property
    def first_known_host(self):
        """The ManagedHost for the first entry of this target's ``host_ids``."""
        target = get_target_by_name(self.name)
        return ManagedHost.objects.get(id=target.get("host_ids")[0])

    @property
    def inactive_hosts(self):
        """Non-deleted hosts of this target other than the active one."""
        target = get_target_by_name(self.name)
        active_host_id = target.get("active_host_id")
        inactive_ids = [host_id for host_id in target.get("host_ids") if host_id != active_host_id]

        return ManagedHost.objects.filter(id__in=inactive_ids, not_deleted=True)

    @property
    def hosts(self):
        """All non-deleted hosts listed in this target's ``host_ids``."""
        target = get_target_by_name(self.name)

        return ManagedHost.objects.filter(id__in=target.get("host_ids"), not_deleted=True)

    @property
    def default_mount_point(self):
        return "/mnt/%s" % self.name

    @property
    def active_host(self):
        """The ManagedHost named by ``active_host_id``, or None if unset."""
        target = get_target_by_name(self.name)
        host_id = target.get("active_host_id")

        if host_id is None:
            return None

        return ManagedHost.objects.get(id=host_id)

    def get_label(self):
        return self.name

    def __str__(self):
        # name is nullable; never return None from __str__.
        return self.name or ""

    def best_available_host(self):
        """
        :return: A host which is available for actions, preferably the one running this target.
        :raises ManagedHost.DoesNotExist: if every candidate host has at least
                one record returned by ``HostContactAlert.filter_by_item_id``.
        """
        target = get_target_by_name(self.name)
        active_host_id = target.get("active_host_id")

        # Preference order: the active host (if any) first, then every known host.
        candidates = ([active_host_id] if active_host_id else []) + target["host_ids"]

        # Walk the candidates in order and stop at the first usable one. This
        # avoids len()/indexing on the result of filter(), which is an iterator
        # on Python 3, and skips alert lookups for hosts that are never needed.
        for host_id in candidates:
            if HostContactAlert.filter_by_item_id(ManagedHost, host_id).count() == 0:
                return ManagedHost.objects.get(id=host_id)

        raise ManagedHost.DoesNotExist("No hosts online for {}".format(target["name"]))

    # unformatted: I exist in theory in the database
    # formatted: I've been mkfs'd
    # registered: I've registered with the MGS, I'm not setup in HA yet
    # unmounted: I'm set up in HA, ready to mount
    # mounted: I'm mounted
    # removed: this target no longer exists in real life
    # forgotten: Equivalent of 'removed' for immutable_state targets
    # Additional states needed for 'deactivated'?
    states = ["unformatted", "formatted", "registered", "unmounted", "mounted", "removed", "forgotten"]
    initial_state = "unformatted"

    def set_state(self, state, intentional=False):
        """Set the state, then update the target-offline alert.

        The alert is active when the resulting state is "unmounted"; an
        intentional change is reported through ``notify_warning`` instead of
        ``notify``.
        """
        job_log.debug("mt.set_state %s %s" % (state, intentional))
        super(ManagedTarget, self).set_state(state, intentional)

        is_offline = self.state == "unmounted"
        if intentional:
            TargetOfflineAlert.notify_warning(self, is_offline)
        else:
            TargetOfflineAlert.notify(self, is_offline)

    class Meta:
        app_label = "chroma_core"
        ordering = ["id"]

    def get_deps(self, state=None):
        """Build this target's dependencies on its hosts' LNet and Pacemaker configuration.

        :param state: state to compute dependencies for; defaults to the
                      current ``self.state`` when falsy.
        :return: A ``DependAll`` wrapping the collected ``DependOn`` entries.
        """
        # Function-scope import, as in the original code (presumably to avoid
        # a circular import at module load time); done once for both branches.
        from chroma_core.models import LNetConfiguration

        if not state:
            state = self.state

        target = get_target_by_name(self.name)
        active_host_id = target["active_host_id"]

        deps = []
        if state == "mounted" and active_host_id and not self.immutable_state:
            # Depend on the active mount's host having LNet up, so that if
            # LNet is stopped on that host this target will be stopped first.
            host = ObjectCache.get_one(ManagedHost, lambda mh: mh.id == active_host_id, fill_on_miss=True)

            lnet_configuration = ObjectCache.get_by_id(LNetConfiguration, host.lnet_configuration.id)
            deps.append(DependOn(lnet_configuration, "lnet_up", fix_state="unmounted"))

            if host.pacemaker_configuration:
                pacemaker_configuration = ObjectCache.get_by_id(
                    PacemakerConfiguration, host.pacemaker_configuration.id
                )
                deps.append(DependOn(pacemaker_configuration, "started", fix_state="unmounted"))

            # TODO: also express that this situation may be resolved by migrating
            # the target instead of stopping it.

        if state not in ["removed", "forgotten"]:
            # The same for every host, so computed once outside the loop.
            fix_state = "forgotten" if self.immutable_state else "removed"

            for host in self.hosts:
                lnet_configuration = ObjectCache.get_by_id(LNetConfiguration, host.lnet_configuration.id)
                deps.append(
                    DependOn(lnet_configuration, "lnet_up", unacceptable_states=["unconfigured"], fix_state=fix_state)
                )

                if host.pacemaker_configuration:
                    pacemaker_configuration = ObjectCache.get_by_id(
                        PacemakerConfiguration, host.pacemaker_configuration.id
                    )
                    deps.append(
                        DependOn(
                            pacemaker_configuration,
                            "started",
                            unacceptable_states=["unconfigured"],
                            fix_state=fix_state,
                        )
                    )

        return DependAll(deps)

    reverse_deps = {
        "ManagedHost": lambda mh: get_host_targets(mh.id),
        "LNetConfiguration": lambda lc: get_host_targets(lc.host.id),
        "PacemakerConfiguration": lambda pc: get_host_targets(pc.host.id),
        "ManagedFilesystem": lambda mfs: ObjectCache.fs_targets(mfs.id),
        # NOTE(review): this passes a Copytool id to client_mount_copytools,
        # whose name suggests it expects a client mount id - confirm intent.
        "Copytool": lambda ct: ObjectCache.client_mount_copytools(ct.id),
    }

    def target_type(self):
        """Return the target type string; must be implemented by subclasses."""
        raise NotImplementedError("Unimplemented method 'target_type'")

    @classmethod
    def managed_target_of_type(cls, target_type):
        """
        :param target_type:  is a string describing the target required, generally ost, mdt or mgt
        :return: Returns a klass of the type required by looking through the subclasses
        :raises NotImplementedError: if no subclass reports the requested type
        """
        # Normalise case first so the alias below also applies to e.g. "MGT".
        target_type = target_type.lower()

        # Hack I need to work out with Joe.
        if target_type == "mgt":
            target_type = "mgs"

        for klass in util.all_subclasses(ManagedTarget):
            if target_type == klass().target_type():
                return klass

        raise NotImplementedError("ManagedTarget %s unknown" % target_type)

    @property
    def filesystem_member(self):
        """
        :return: True if the TargetType is a file system member, generally OST or MDT.
        """
        return isinstance(self, FilesystemMember)

    def mkfs_override_options(self, filesystemtype, mkfs_options):
        """Allows a ManagedTarget to modify the mkfs_options as required.

        The base implementation returns ``mkfs_options`` unchanged and does
        not use ``filesystemtype``.

        :return: A list of additional options for mkfs as in those things that appear after --mkfsoptions
        """
        return mkfs_options