Example #1
    def tearDown(self):
        from chroma_api.authentication import CsrfAuthentication
        CsrfAuthentication.is_authenticated = self.old_is_authenticated

        #  Restore
        from chroma_core.services.job_scheduler import job_scheduler_client
        job_scheduler_client.JobSchedulerClient.available_transitions = self.old_available_transitions
        job_scheduler_client.JobSchedulerClient.available_jobs = self.old_available_jobs

        ObjectCache.clear()
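
A note on the snippet above: the tearDown restores attributes that the matching setUp monkeypatched before the test ran. A minimal sketch of what such a setUp might look like is shown below; the mock.Mock replacements are assumptions for illustration, not the project's actual stubs.

    def setUp(self):
        import mock

        from chroma_api.authentication import CsrfAuthentication
        self.old_is_authenticated = CsrfAuthentication.is_authenticated
        # Hypothetical stand-in; the real tests patch this with a project-specific fake.
        CsrfAuthentication.is_authenticated = mock.Mock(return_value=True)

        from chroma_core.services.job_scheduler import job_scheduler_client
        self.old_available_transitions = job_scheduler_client.JobSchedulerClient.available_transitions
        self.old_available_jobs = job_scheduler_client.JobSchedulerClient.available_jobs
        # Hypothetical stand-ins, restored again in tearDown above.
        job_scheduler_client.JobSchedulerClient.available_transitions = mock.Mock(return_value={})
        job_scheduler_client.JobSchedulerClient.available_jobs = mock.Mock(return_value={})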
Example #2
def create_host_ssh(address, server_profile, root_pw, pkey, pkey_pw):
    host_data = AgentRpc.mock_servers[address]
    host = synthetic_host(address,
                          nids=host_data['nids'],
                          fqdn=host_data['fqdn'],
                          nodename=host_data['nodename'])
    ObjectCache.add(ManagedHost, host)
    command = Command.objects.create(complete=True,
                                     message="Mock create_host_ssh")
    return host, command
Example #3
    def learn_target_mounts(self):
        for host, host_data in self.all_hosts_data.items():
            # We will compare any found target mounts to all known MGSs
            for local_info in host_data["local_targets"]:
                debug_id = (host, local_info["device_paths"][0],
                            local_info["name"])
                targets = ManagedTarget.objects.filter(uuid=local_info["uuid"])
                if not targets.count():
                    log.warning("Ignoring %s:%s (%s), target unknown" %
                                debug_id)
                    continue

                for target in targets:
                    if isinstance(target, FilesystemMember):
                        try:
                            mgs = self._target_find_mgs(host, local_info)
                        except ManagedMgs.DoesNotExist:
                            log.warning(
                                "Can't find MGS for target %s:%s (%s)" %
                                debug_id)
                            continue
                    else:
                        mgs = None

                    if not self.target_available_here(host, mgs, local_info):
                        log.warning(
                            "Ignoring %s on %s, as it is not mountable on this host"
                            % (local_info["name"], host))
                        continue

                    try:
                        log.info("Target %s seen on %s" % (target, host))
                        volumenode = self._get_volume_node(
                            host, local_info["device_paths"])
                        (tm,
                         created) = ManagedTargetMount.objects.get_or_create(
                             target=target, host=host, volume_node=volumenode)
                        if created:
                            tm.immutable_state = True
                            tm.save()
                            log.info(
                                "Learned association %d between %s and host %s"
                                % (tm.id, local_info["name"], host))
                            self._learn_event(host, tm)
                            ObjectCache.add(ManagedTargetMount, tm)

                        if local_info["mounted"]:
                            target.state = "mounted"
                            target.active_mount = tm
                            target.save()

                    except NoNidsPresent:
                        log.warning(
                            "Cannot set up target %s on %s until LNet is running"
                            % (local_info["name"], host))
Example #4
def synthetic_host(address=None,
                   nids=list([]),
                   storage_resource=False,
                   fqdn=None,
                   nodename=None,
                   server_profile='test_profile'):
    """
    Create a ManagedHost + paraphernalia, with states set as if configuration happened successfully

    :param storage_resource: If true, create a PluginAgentResources (additional overhead, only sometimes required)
    """

    server_profile = ServerProfile.objects.get(name=server_profile)

    if address is None:
        address = random_str(postfix=".tld")

    if fqdn is None:
        fqdn = address
    if nodename is None:
        nodename = address

    host = ManagedHost.objects.create(
        address=address,
        fqdn=fqdn,
        nodename=nodename,
        state='managed',
        server_profile=server_profile,
        immutable_state=not server_profile.managed
        if server_profile else False)

    ObjectCache.add(ManagedHost, host)

    lnet_configuration = synthetic_lnet_configuration(host, nids)

    if server_profile.managed:
        synthetic_rsyslog_configuration(host)
        synthetic_ntp_configuration(host)
        synthetic_corosync_configuration(host)
        synthetic_pacemaker_configuration(host)

    log.debug("synthetic_host: %s %s" %
              (address, lnet_configuration.get_nids()))

    if storage_resource:
        from chroma_core.lib.storage_plugin.manager import storage_plugin_manager
        resource_class, resource_class_id = storage_plugin_manager.get_plugin_resource_class(
            'linux', 'PluginAgentResources')
        StorageResourceRecord.get_or_create_root(resource_class,
                                                 resource_class_id, {
                                                     'plugin_name': 'linux',
                                                     'host_id': host.id
                                                 })

    return host
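
A brief usage sketch for the helper above; it assumes the chroma_core test environment (Django settings, database, and the 'test_profile' ServerProfile fixture) is already set up, and the address string is arbitrary.

# Hedged usage sketch for synthetic_host().
host = synthetic_host(address="myhost.example.tld")
assert host.state == 'managed'
# The helper registers the host in the cache, so lookups by id succeed immediately.
cached = ObjectCache.get_one(ManagedHost, lambda mh: mh.id == host.id)
assert cached.id == host.id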
Example #5
    def setUp(self):
        super(TestTargetTransitions, self).setUp()

        self.mgt, mgt_tms = ManagedMgs.create_for_volume(self._test_lun(
            self.host).id,
                                                         name='MGS')
        ObjectCache.add(ManagedTarget, self.mgt.managedtarget_ptr)
        for tm in mgt_tms:
            ObjectCache.add(ManagedTargetMount, tm)
        self.assertEqual(
            ManagedMgs.objects.get(pk=self.mgt.pk).state, 'unformatted')
Example #6
    def get_steps(self):
        host = ObjectCache.get_one(
            ManagedHost, lambda mh: mh.id == self.lustre_client_mount.host_id)
        from chroma_core.models.filesystem import ManagedFilesystem
        filesystem = ObjectCache.get_one(
            ManagedFilesystem,
            lambda mf: mf.id == self.lustre_client_mount.filesystem_id)
        args = dict(host=host,
                    filesystems=[(filesystem.mount_path(),
                                  self.lustre_client_mount.mountpoint)])
        return [(UnmountLustreFilesystemsStep, args)]
Example #7
    def get_steps(self):
        host = ObjectCache.get_one(
            ManagedHost, lambda mh: mh.id == self.lustre_client_mount.host_id)

        filesystem = ObjectCache.get_one(
            ManagedFilesystem,
            lambda mf: mf.name == self.lustre_client_mount.filesystem)
        args = dict(host=host,
                    filesystems=[(filesystem.mount_path(),
                                  self.lustre_client_mount.mountpoints)])
        return [(UnmountLustreFilesystemsStep, args)]
Example #8
    def filter_by_target(cls, target):
        if issubclass(target.downcast_class, ManagedMgs):
            result = ObjectCache.get(ManagedFilesystem,
                                     lambda mfs: mfs.mgs_id == target.id)
            return result
        elif issubclass(target.downcast_class, FilesystemMember):
            return ObjectCache.get(
                ManagedFilesystem,
                lambda mfs: mfs.id == target.downcast().filesystem_id)
        else:
            raise NotImplementedError(target.__class__)
Example #9
    def get_deps(self):
        search = lambda ct: ct.host == self.copytool.host
        copytools = ObjectCache.get(Copytool, search)

        # Only force an unmount if this is the only copytool associated
        # with the host.
        if len(copytools) == 1:
            search = lambda cm: cm.id == self.copytool.client_mount_id
            client_mount = ObjectCache.get_one(LustreClientMount, search)
            return DependOn(client_mount, "unmounted")
        else:
            return DependAll()
Example #10
    def _measure_scaling(self,
                         create_n,
                         measured_resource,
                         scaled_resource=None):
        """

        :param create_n: Function to create N of scaled_resource
        :param measured_resource: The resource we will measure the query load for
        :param scaled_resource: The object which is actually being scaled with N
        :return: Instance of Order1, OrderN, OrderBad
        """
        if scaled_resource is None:
            scaled_resource = measured_resource

        query_counts = {}
        samples = [5, 6, 7, 8]

        for n in samples:
            ObjectCache.clear()
            create_n(n)
            # Queries get reset at the start of a request
            self.assertEqual(scaled_resource._meta.queryset.count(), n)
            with CaptureQueriesContext(connection) as queries:
                response = self.api_client.get(
                    "/api/%s/" % measured_resource._meta.resource_name,
                    data={"limit": 0})
                self.assertEqual(
                    response.status_code, 200, "%s:%s" %
                    (response.content, measured_resource._meta.resource_name))
                query_count = len(queries)

            self.assertEqual(len(self.deserialize(response)["objects"]),
                             measured_resource._meta.queryset.count())
            query_counts[n] = query_count

        # Ignore samples[0]; it was only there to absorb any setup overhead from the first call to the API

        # gradient between samples[1] and samples[2]
        grad1 = (query_counts[samples[2]] -
                 query_counts[samples[1]]) / (samples[2] - samples[1])
        # gradient between samples[2] and samples[3]
        grad2 = (query_counts[samples[3]] -
                 query_counts[samples[2]]) / (samples[3] - samples[2])

        if grad1 == 0 and grad2 == 0:
            # Hoorah, O(1)
            return Order1(query_counts[samples[3]])
        elif grad1 > 0 and grad1 == grad2:
            # O(N)
            return OrderN(grad1)
        else:
            # Worse than O(N)
            return OrderBad()
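
To make the gradient arithmetic at the end concrete, here is a small self-contained sketch with hypothetical query counts; Order1/OrderN/OrderBad are only stand-ins for the classes referenced above.

# Self-contained sketch of the classification arithmetic, with made-up numbers.
samples = [5, 6, 7, 8]
query_counts = {5: 12, 6: 18, 7: 24, 8: 30}   # 6 extra queries per extra object => O(N)

grad1 = (query_counts[samples[2]] - query_counts[samples[1]]) / (samples[2] - samples[1])
grad2 = (query_counts[samples[3]] - query_counts[samples[2]]) / (samples[3] - samples[2])

if grad1 == 0 and grad2 == 0:
    print("O(1): %s queries regardless of N" % query_counts[samples[3]])
elif grad1 > 0 and grad1 == grad2:
    print("O(N): %s extra queries per object" % grad1)
else:
    print("worse than O(N)")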
Example #11
class LustreClientMount(DeletableStatefulObject):
    host = models.ForeignKey('ManagedHost', help_text = "Mount host", related_name="client_mounts")
    filesystem = models.ForeignKey('ManagedFilesystem', help_text = "Mounted filesystem")
    mountpoint = models.CharField(max_length = CHARFIELD_MAX_LENGTH,
                                  help_text = "Filesystem mountpoint on host",
                                  null = True,
                                  blank = True)

    states = ['unmounted', 'mounted', 'removed']
    initial_state = 'unmounted'

    def __str__(self):
        return self.get_label()

    @property
    def active(self):
        return self.state == 'mounted'

    def get_label(self):
        return "%s:%s (%s)" % (self.host, self.mountpoint, self.state)

    def get_deps(self, state = None):
        if not state:
            state = self.state

        deps = []
        if state == 'mounted':
            # Depend on this mount's host having LNet up. If LNet is stopped
            # on the host, this filesystem will be unmounted first.
            deps.append(DependOn(self.host.lnet_configuration, 'lnet_up', fix_state='unmounted'))

        if state != 'removed':
            # Depend on the fs being available.
            deps.append(DependOn(self.filesystem, 'available',
                                 fix_state='unmounted'))

            # But if either the host or the filesystem is removed, the
            # mount should follow.
            deps.append(DependOn(self.host, 'lnet_up', acceptable_states = list(set(self.host.states) - set(['removed', 'forgotten'])), fix_state = 'removed'))
            deps.append(DependOn(self.filesystem, 'available', acceptable_states = list(set(self.filesystem.states) - set(['removed', 'forgotten'])), fix_state = 'removed'))

        return DependAll(deps)

    reverse_deps = {
        'ManagedHost': lambda mh: ObjectCache.host_client_mounts(mh.id),
        'LNetConfiguration': lambda lc: ObjectCache.host_client_mounts(lc.host.id),
        'ManagedFilesystem': lambda mf: ObjectCache.filesystem_client_mounts(mf.id)
    }

    class Meta:
        app_label = 'chroma_core'
        unique_together = ('host', 'filesystem')
Example #12
    def create_fake_filesystem_client(self, active=False):
        from chroma_core.models import ManagedMgs, ManagedMdt, ManagedOst, ManagedFilesystem, LustreClientMount
        from tests.unit.chroma_core.helpers import synthetic_volume_full

        mgt, _ = ManagedMgs.create_for_volume(synthetic_volume_full(self.server).id, name = "MGS")
        fs = ManagedFilesystem.objects.create(mgs = mgt, name = 'testfs')
        ObjectCache.add(ManagedFilesystem, fs)
        ManagedMdt.create_for_volume(synthetic_volume_full(self.server).id, filesystem = fs)
        ManagedOst.create_for_volume(synthetic_volume_full(self.server).id, filesystem = fs)
        state = 'mounted' if active else 'unmounted'
        self.mount = LustreClientMount.objects.create(host = self.worker, filesystem = fs, state = state)

        ObjectCache.add(LustreClientMount, self.mount)
Example #13
    def test_managed_host_undeployed(self):
        """Test that an undeployed host can only be force removed"""

        self.host.state = "undeployed"
        self.host.save()

        ObjectCache.update(self.host)

        expected_job_classes = ["ForceRemoveHostJob"]
        received_job_classes = [
            job["class_name"] for job in self._get_jobs(self.host)
        ]
        self.assertEqual(set(received_job_classes), set(expected_job_classes))
Example #14
    def setUp(self):
        super(TestSharedTarget, self).setUp()

        self.mgt, tms = ManagedMgs.create_for_volume(self._test_lun(
            ManagedHost.objects.get(address='pair1'),
            secondary_hosts=[ManagedHost.objects.get(address='pair2')]).id,
                                                     name="MGS")

        ObjectCache.add(ManagedTarget, self.mgt.managedtarget_ptr)
        for tm in tms:
            ObjectCache.add(ManagedTargetMount, tm)
        self.assertEqual(
            ManagedMgs.objects.get(pk=self.mgt.pk).state, 'unformatted')
Example #15
    def learn_primary_target(self, managed_target):

        primary_target = None
        managed_target.managedtargetmount_set.update(primary=False)
        for tm in managed_target.managedtargetmount_set.all():
            # We may well have scanned only a subset of the hosts, and so not have data for all of
            # the target mounts; when rescanning we can still know about target mounts that we did not scan.
            if tm.host not in self.all_hosts_data:
                continue

            try:
                target_info = next(
                    dev
                    for dev in self.all_hosts_data[tm.host]["local_targets"]
                    if dev["uuid"] == managed_target.uuid)
            except StopIteration:
                # LV not in all_hosts_data
                continue
            local_nids = set(tm.host.lnet_configuration.get_nids())

            if not local_nids:
                raise NoNidsPresent("Host %s has no NIDS!" % tm.host)

            if "failover.node" in target_info["params"]:
                failover_nids = set(
                    normalize_nid(n)
                    for nids in target_info["params"]["failover.node"]
                    for n in nids.split(","))

                if not bool(local_nids & failover_nids):
                    # If the current host's NIDs do not appear in the failover NIDs, this
                    # target was created with 'mkfs.lustre --failnode' and is considered primary;
                    # there is no other way for another primary to have been defined.
                    primary_target = tm
                    break
                elif target_info["mounted"]:
                    # If the target was created with 'mkfs.lustre --servicenodes' and it is
                    # mounted, use the current target mount as primary until a better candidate is found.
                    primary_target = tm
            else:
                # If there are no failover nids then this must be the primary.
                primary_target = tm
                break

        if primary_target is not None:
            log.info("Target %s has been set to primary" % (primary_target))
            primary_target.primary = True
            primary_target.save()
            ObjectCache.update(primary_target)

        return primary_target
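
The primary-target decision above hinges on whether any of the host's local NIDs appear in the target's failover.node parameter. Below is a standalone sketch of that set arithmetic; the NID strings are made up and normalize_nid is replaced by a trivial stand-in.

# Standalone sketch of the failover-NID check used above.
def normalize_nid(nid):
    return nid.strip()   # trivial stand-in for the real normalisation

local_nids = set(["10.0.0.1@tcp"])
params = {"failover.node": ["10.0.0.2@tcp,10.0.0.3@tcp"]}

failover_nids = set(
    normalize_nid(n)
    for nids in params["failover.node"]
    for n in nids.split(","))

if not (local_nids & failover_nids):
    # None of this host's NIDs are failover NIDs, so this mount is the
    # primary (the '--failnode' case in the code above).
    print("primary")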
Example #16
    def test_failing_job(self):
        mgt, tms = ManagedMgs.create_for_volume(self._test_lun(self.host).id, name = "MGS")
        ObjectCache.add(ManagedTarget, mgt.managedtarget_ptr)
        for tm in tms:
            ObjectCache.add(ManagedTargetMount, tm)

        try:
            MockAgentRpc.succeed = False
            # This is to check that the scheduler doesn't run past the failed job (like in HYD-1572)
            self.set_and_assert_state(mgt.managedtarget_ptr, 'mounted', check = False)
            mgt = self.assertState(mgt, 'unformatted')
        finally:
            MockAgentRpc.succeed = True
            mgt.managedtarget_ptr = self.set_and_assert_state(mgt.managedtarget_ptr, 'mounted')
Example #17
def synthetic_lnet_configuration(host, nids):
    lnet_configuration, _ = LNetConfiguration.objects.get_or_create(host=host)

    ObjectCache.add(LNetConfiguration, lnet_configuration)

    # Now delete any existing nids as we will recreate them if some have been requested.
    Nid.objects.filter(lnet_configuration=lnet_configuration).delete()

    if nids:
        assert type(nids[0]) == Nid.Nid

        lnet_configuration.state = "lnet_up"

        interface_no = 0
        for nid in nids:
            try:
                network_interface = NetworkInterface.objects.get(
                    host=host, name="eth%s" % interface_no, type=nid.lnd_type
                )
                network_interface.inet4_address = nid.nid_address
                network_interface.inet4_prefix = 24
                network_interface.state_up = True
            except NetworkInterface.DoesNotExist:
                network_interface = NetworkInterface.objects.create(
                    host=host,
                    name="eth%s" % interface_no,
                    type=nid.lnd_type,
                    inet4_address=nid.nid_address,
                    inet4_prefix=24,
                    state_up=True,
                )

            network_interface.save()

            nid_record = Nid.objects.create(
                lnet_configuration=lnet_configuration,
                network_interface=network_interface,
                lnd_network=nid.lnd_network,
                lnd_type=nid.lnd_type,
            )
            nid_record.save()

            interface_no += 1
    else:
        lnet_configuration.state = "lnet_unloaded"

    lnet_configuration.save()

    return lnet_configuration
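
A short usage sketch for the helper above; it assumes the same chroma test environment as the other helpers. With an empty NID list the configuration is left unloaded, per the else branch.

# Hedged usage sketch; assumes the chroma test helpers and database are available.
host = synthetic_host("lnet-test.tld")
lnet_configuration = synthetic_lnet_configuration(host, [])
# With no NIDs requested the helper leaves the configuration in 'lnet_unloaded'.
assert lnet_configuration.state == "lnet_unloaded"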
Example #18
    def get_steps(self):
        host = ObjectCache.get_one(
            ManagedHost, lambda mh: mh.id == self.lustre_client_mount.host_id)

        mountpoint = (self.lustre_client_mount.mountpoints[0]
                      if self.lustre_client_mount.mountpoints else
                      "/mnt/{}".format(self.lustre_client_mount.filesystem))
        filesystem = ObjectCache.get_one(
            ManagedFilesystem,
            lambda mf: mf.name == self.lustre_client_mount.filesystem)
        args = {
            "host": host,
            "filesystems": [(filesystem.mount_path(), mountpoint)]
        }
        return [(MountLustreFilesystemsStep, args)]
Example #19
    def get_steps(self):
        search = lambda cm: (cm.host == self.host and cm.state == "mounted")
        mounted = ObjectCache.get(LustreClientMount, search)
        args = {
            "host":
            self.host,
            "filesystems": [(
                ObjectCache.get_one(
                    ManagedFilesystem,
                    lambda mf, mtd=m: mf.name == mtd.filesystem).mount_path(),
                m.mountpoints,
            ) for m in mounted],
        }

        return [(UnmountLustreFilesystemsStep, args)]
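
A side note on the 'lambda mf, mtd=m: ...' idiom above: binding the loop variable as a default argument freezes its value at the point the lambda is defined, which is the safe pattern whenever such a lambda might be called after the loop has moved on. A tiny standalone illustration:

# Late binding: every lambda sees the final value of i.
late = [lambda: i for i in range(3)]
print([f() for f in late])          # [2, 2, 2]

# Default-argument binding: each lambda keeps its own value.
bound = [lambda i=i: i for i in range(3)]
print([f() for f in bound])         # [0, 1, 2]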
Example #20
    def get_steps(self):
        search = lambda cm: (cm.host == self.host and cm.state == 'mounted')
        mounted = ObjectCache.get(LustreClientMount, search)
        args = dict(host=self.host,
                    filesystems=[(m.filesystem.mount_path(), m.mountpoint)
                                 for m in mounted])
        return [(UnmountLustreFilesystemsStep, args)]
Example #21
    def get_deps(self, state=None):
        if not state:
            state = self.state

        client_mount = ObjectCache.get_one(LustreClientMount, lambda cm: cm.id == self.client_mount_id)

        deps = []
        if state == "started":
            # Depend on the client mount being mounted in order to
            # start or stay running.
            deps.append(DependOn(client_mount, "mounted", fix_state="stopped"))

        if state != "removed":
            # If the client mount is going to be removed, then the
            # copytool should also be removed.
            deps.append(
                DependOn(
                    client_mount,
                    "mounted",
                    acceptable_states=list(set(self.client_mount.states) - set(["removed"])),
                    fix_state="removed",
                )
            )

        return DependAll(deps)
Example #22
    def get_steps(self):
        steps = []

        mgs_target = ObjectCache.get_one(
            ManagedTarget, lambda t: t.id == self.filesystem.mgs_id)

        # Only try to purge filesystem from MGT if the MGT has made it past
        # being formatted (case where a filesystem was created but is being
        # removed before it or its MGT got off the ground)
        if mgs_target.state in ['unformatted', 'formatted']:
            return steps

        # Don't purge immutable filesystems. (Although how this gets called in that case is beyond me)
        if self.filesystem.immutable_state:
            return steps

        # MGS needs to be started
        if not mgs_target.active_mount:
            raise RuntimeError(
                "MGT needs to be running in order to remove the filesystem.")

        steps.append((PurgeFilesystemStep, {
            'filesystem':
            self.filesystem,
            'mgs_device_path':
            mgs_target.active_mount.volume_node.path,
            'mgs_device_type':
            mgs_target.active_mount.volume_node.volume.storage_resource.
            to_resource_class().device_type(),
            'host':
            mgs_target.active_mount.host
        }))

        return steps
Example #23
    def get_deps(self):
        return DependOn(
            ObjectCache.get_one(
                ManagedHost,
                lambda mh: mh.id == self.lustre_client_mount.host_id
            ).lnet_configuration,
            "lnet_up",
        )
Example #24
    def dehydrate_client_mounts(self, bundle):
        from chroma_core.lib.cache import ObjectCache
        from chroma_core.models import LustreClientMount
        search = lambda cm: cm.host == bundle.obj
        mounts = ObjectCache.get(LustreClientMount, search)
        return [{'filesystem_name': mount.filesystem.name,
                 'mountpoint': mount.mountpoint,
                 'state': mount.state} for mount in mounts]
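
The dehydrate method above hands plain dictionaries to the API layer; for a host with one mounted client the return value would look something like the following (values are hypothetical):

# Hypothetical return value for a host with a single mounted client:
[{'filesystem_name': 'testfs',
  'mountpoint': '/mnt/testfs',
  'state': 'mounted'}]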
Example #25
    def get_steps(self):
        from chroma_core.models import ServerProfile

        profiles = [(p.name, list(p.serverprofilevalidation_set.values())) for p in ObjectCache.get(ServerProfile)]

        return [
            (
                TestHostConnectionStep,
                {"address": self.address, "credentials_key": self.credentials_key, "profiles": profiles},
            )
        ]
Example #26
    def get_deps(self):
        deps = []

        mgs_target = ObjectCache.get_one(
            ManagedTarget, lambda t: t.id == self.filesystem.mgs_id)

        # Can't start a MGT that hasn't made it past formatting.
        if mgs_target.state not in ["unformatted", "formatted"]:
            deps.append(
                DependOn(mgs_target, "mounted", fix_state="unavailable"))
        return DependAll(deps)
Example #27
    def get_deps(self):
        if issubclass(self.target.downcast_class, ManagedMgs):
            ticket = self.target.downcast().get_ticket()
            if ticket:
                return DependAll(
                    DependOn(ticket, "granted", fix_state="unmounted"))

        if self.target.downcast_class in [ManagedMdt, ManagedOst]:
            from chroma_core.models import FilesystemTicket

            target = self.target.downcast()

            ticket = FilesystemTicket.objects.filter(
                filesystem=target.filesystem_id).first()

            if ticket:
                return DependAll(
                    DependOn(ticket.ticket, "granted", fix_state="unmounted"))

        deps = []

        # Depend on at least one targetmount having lnet up
        for host in self.target.hosts:
            from chroma_core.models import LNetConfiguration

            lnet_configuration = ObjectCache.get_one(
                LNetConfiguration, lambda l: l.host_id == host.id)
            deps.append(
                DependOn(lnet_configuration, "lnet_up", fix_state="unmounted"))

            try:
                pacemaker_configuration = ObjectCache.get_one(
                    PacemakerConfiguration, lambda pm: pm.host_id == host.id)
                deps.append(
                    DependOn(pacemaker_configuration,
                             "started",
                             fix_state="unmounted"))
            except PacemakerConfiguration.DoesNotExist:
                pass

        return DependAny(deps)
Example #28
    def test_two_concurrent_removes(self):
        """
        Test that we can concurrently remove two filesystems which depend on the same mgt
        """
        fs2 = ManagedFilesystem.objects.create(mgs=self.mgt, name="testfs2")
        ObjectCache.add(ManagedFilesystem, fs2)
        mdt2, mdt_tms = ManagedMdt.create_for_volume(self._test_lun(
            self.host).id,
                                                     filesystem=fs2)
        ost2, ost_tms = ManagedOst.create_for_volume(self._test_lun(
            self.host).id,
                                                     filesystem=fs2)
        for target in [mdt2, ost2]:
            ObjectCache.add(ManagedTarget, target.managedtarget_ptr)
        for tm in chain(mdt_tms, ost_tms):
            ObjectCache.add(ManagedTargetMount, tm)

        self.fs = self.set_and_assert_state(self.fs, "available")
        fs2 = self.set_and_assert_state(fs2, "available")

        self.set_state_delayed([(self.fs, "removed")])
        self.set_state_delayed([(fs2, "removed")])

        self.set_state_complete()

        with self.assertRaises(ManagedFilesystem.DoesNotExist):
            ManagedFilesystem.objects.get(pk=self.fs.pk)

        with self.assertRaises(ManagedFilesystem.DoesNotExist):
            ManagedFilesystem.objects.get(pk=fs2.pk)
Example #29
    def can_run(cls, host):
        if not host.is_worker:
            return False

        search = lambda cm: (cm.host == host and cm.state == 'mounted')
        mounted = ObjectCache.get(LustreClientMount, search)
        return (host.state not in ['removed', 'undeployed', 'unconfigured']
                and len(mounted) > 0
                and not AlertState.filter_by_item(host).filter(
                    active=True,
                    alert_type__in=[
                        HostOfflineAlert.__name__, HostContactAlert.__name__
                    ]).exists())
Example #30
    def get_deps(self):
        deps = []
        targets = ObjectCache.get_targets_by_filesystem(self.filesystem_id)
        targets = [
            t for t in targets if not issubclass(t.downcast_class, ManagedMgs)
        ]
        for t in targets:
            deps.append(
                DependOn(t,
                         "unmounted",
                         acceptable_states=t.not_state("mounted"),
                         fix_state="unavailable"))
        return DependAll(deps)