def learn_failed_lustre_tunefs_secondary_lvm_mounts(self):
    for host, host_data in self.all_hosts_data.items():
        devices = get_devices(host.fqdn, timeout=30)

        for vgname in devices["lvs"]:
            for lvname in devices["lvs"][vgname]:
                lv = devices["lvs"][vgname][lvname]
                targets = ManagedTarget.objects.filter(uuid=lv["uuid"])

                if not targets.count():
                    log.warning("Ignoring lv {}, no matching ManagedTarget".format(lv["uuid"]))
                    continue

                for target in targets:
                    try:
                        log.info("Target %s seen on %s" % (target, host))
                        volumenode = self._get_volume_node(host, self.ha_targets[lv["uuid"]]["paths"])
                        tm, created = ManagedTargetMount.objects.get_or_create(
                            target=target, host=host, volume_node=volumenode)

                        if created:
                            tm.save()
                            log.info("Learned association %d between %s and host %s" %
                                     (tm.id, lv["name"], host))
                            self._learn_event(host, tm)
                            ObjectCache.add(ManagedTargetMount, tm)
                    except Exception as e:
                        log.error("Could not create target %s on %s: %s" % (target, host, e))
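
# Overview of agent_session_start(): rebuild the storage resource graph for a host
# from the device report returned by get_devices() (falling back to the payload the
# agent sent directly, as with IML 4.0 agents). It creates or updates ScsiDevice,
# UnsharedDevice, LinuxDeviceNode, LVM, multipath, MD RAID, ZFS, local-mount and
# partition resources, removes anything no longer reported, and, when the device set
# changes outside the initial scan, asks HA peers and hosts sharing VolumeNodes to
# re-poll their storage plugins.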
def agent_session_start(self, host_id, data, initial_scan=True):
    with transaction.atomic():
        initiate_device_poll = False
        reported_device_node_paths = []

        fqdn = ManagedHost.objects.get(id=host_id).fqdn
        devices = get_devices(fqdn)

        # use info from IML 4.0
        if not devices and data:
            devices = data

        for expected_item in ["vgs", "lvs", "zfspools", "zfsdatasets",
                              "devs", "local_fs", "mds", "mpath"]:
            if expected_item not in devices.keys():
                devices[expected_item] = {}

        dev_json = json.dumps(devices["devs"], sort_keys=True)

        if dev_json == self.current_devices:
            return None

        log.debug("Linux.devices changed on {}: {}".format(
            fqdn,
            set(json.loads(self.current_devices).keys()) - set(devices["devs"].keys())))

        self.current_devices = dev_json

        lv_block_devices = set()
        for vg, lv_list in devices["lvs"].items():
            for lv_name, lv in lv_list.items():
                try:
                    lv_block_devices.add(lv["block_device"])
                except KeyError:
                    # An inactive LV has no block device
                    pass

        mpath_block_devices = set()
        for mp_name, mp in devices["mpath"].items():
            mpath_block_devices.add(mp["block_device"])

        special_block_devices = lv_block_devices | mpath_block_devices

        for uuid, md_info in devices["mds"].items():
            special_block_devices.add(md_info["block_device"])

        def add_zfs(zfs_info):
            # add attributes not specific to zfs instances
            bdid = zfs_info["block_device"]
            special_block_devices.add(bdid)
            dev = devices["devs"][bdid]
            dev["major_minor"] = bdid
            dev["parent"] = None
            dev["serial_80"] = None
            dev["serial_83"] = None
            dev["filesystem_type"] = "zfs" if bdid.startswith("zfsset") else None

        for uuid, zfs_info in merge(devices["zfspools"], devices["zfsdatasets"]).items():
            add_zfs(zfs_info)

        def preferred_serial(bdev):
            for attr in SERIAL_PREFERENCE:
                if bdev[attr]:
                    return bdev[attr]
            return None

        # Scrub dodgy QEMU SCSI IDs
        for bdev in devices["devs"].values():
            qemu_pattern = "QEMU HARDDISK"
            if bdev["serial_80"] and bdev["serial_80"].find(qemu_pattern) != -1:
                # Virtual environments can set an ID that trails QEMU HARDDISK, in which case
                # we should pick that up, or this might not be a real ID at all.
                # We have seen at least "SQEMU QEMU HARDDISK" and "SQEMU QEMU HARDDISK 0"
                # for devices without manually set IDs, so apply a general condition that the trailing
                # portion must be more than N characters for us to treat it like an ID
                trailing_id = bdev["serial_80"].split(qemu_pattern)[1].strip()
                if len(trailing_id) < 4:
                    bdev["serial_80"] = None
                else:
                    bdev["serial_80"] = trailing_id
            if bdev["serial_83"] and bdev["serial_83"].find(qemu_pattern) != -1:
                bdev["serial_83"] = None

        # Create ScsiDevices
        res_by_serial = {}
        scsi_device_identifiers = []
        for bdev in devices["devs"].values():
            serial = preferred_serial(bdev)
            if bdev["major_minor"] not in special_block_devices:
                if serial is not None and serial not in res_by_serial:
                    # NB it's okay to have multiple block devices with the same
                    # serial (multipath): we just store the serial+size once
                    node, created = self.update_or_create(
                        ScsiDevice,
                        serial=serial,
                        size=bdev["size"],
                        filesystem_type=bdev["filesystem_type"])
                    res_by_serial[serial] = node
                    scsi_device_identifiers.append(node.id_tuple())

        # Map major:minor string to LinuxDeviceNode
        self.major_minor_to_node_resource = {}

        # Create DeviceNodes for ScsiDevices and UnsharedDevices
        for bdev in devices["devs"].values():
            # Partitions: we will do these in a second pass once their
            # parents are in major_minor_to_node_resource
            if bdev["parent"] is not None:
                continue

            # Don't create ScsiDevices for devicemapper, mdraid
            if bdev["major_minor"] in special_block_devices:
                continue

            serial = preferred_serial(bdev)
            if serial is not None:
                # Serial is set, so look up the ScsiDevice
                lun_resource = res_by_serial[serial]
                node, created = self.update_or_create(
                    LinuxDeviceNode,
                    parents=[lun_resource],
                    logical_drive=lun_resource,
                    host_id=host_id,
                    path=bdev["path"],
                )
                self.major_minor_to_node_resource[bdev["major_minor"]] = node
                reported_device_node_paths.append(bdev["path"])
            else:
                # Serial is not set, so create an UnsharedDevice
                device, created = self.update_or_create(
                    UnsharedDevice,
                    path=bdev["path"],
                    size=bdev["size"],
                    filesystem_type=bdev["filesystem_type"])
                node, created = self.update_or_create(
                    LinuxDeviceNode,
                    parents=[device],
                    logical_drive=device,
                    host_id=host_id,
                    path=bdev["path"])
                self.major_minor_to_node_resource[bdev["major_minor"]] = node
                reported_device_node_paths.append(bdev["path"])

        # Okay, now we've got ScsiDeviceNodes, time to build the devicemapper ones
        # on top of them. These can come in any order and be nested to any depth.
        # So we have to build a graph and then traverse it to populate our resources.
        for bdev in devices["devs"].values():
            if bdev["major_minor"] in lv_block_devices:
                node, created = self.update_or_create(LinuxDeviceNode,
                                                      host_id=host_id,
                                                      path=bdev["path"])
            elif bdev["major_minor"] in mpath_block_devices:
                node, created = self.update_or_create(LinuxDeviceNode,
                                                      host_id=host_id,
                                                      path=bdev["path"])
            elif bdev["parent"]:
                node, created = self.update_or_create(LinuxDeviceNode,
                                                      host_id=host_id,
                                                      path=bdev["path"])
            else:
                continue

            self.major_minor_to_node_resource[bdev["major_minor"]] = node
            reported_device_node_paths.append(bdev["path"])

        # Finally remove any of the scsi devs that are no longer present.
        initiate_device_poll |= self.remove_missing_devices(
            host_id, ScsiDevice, scsi_device_identifiers)

        # Now all the LUNs and device nodes are in, create the links between
        # the DM block devices and their parent entities.
        vg_uuid_to_resource = {}
        for vg in devices["vgs"].values():
            # Create VG resource
            vg_resource, created = self.update_or_create(
                LvmGroup,
                uuid=vg["uuid"],
                name=vg["name"],
                size=vg["size"])
            vg_uuid_to_resource[vg["uuid"]] = vg_resource

            # Add PV block devices as parents of VG
            for pv_bdev in vg["pvs_major_minor"]:
                if pv_bdev in self.major_minor_to_node_resource:
                    vg_resource.add_parent(self.major_minor_to_node_resource[pv_bdev])

        for vg, lv_list in devices["lvs"].items():
            for lv_name, lv in lv_list.items():
                vg_info = devices["vgs"][vg]
                vg_resource = vg_uuid_to_resource[vg_info["uuid"]]

                # Make the LV a parent of its device node on this host
                lv_resource, created = self.update_or_create(
                    LvmVolume,
                    parents=[vg_resource],
                    uuid=lv["uuid"],
                    name=lv["name"],
                    vg=vg_resource,
                    size=lv["size"],
                    filesystem_type=devices["devs"][lv["block_device"]]["filesystem_type"],
                )

                try:
                    lv_node = self.major_minor_to_node_resource[lv["block_device"]]
                    lv_node.logical_drive = lv_resource
                    lv_node.add_parent(lv_resource)
                except KeyError:
                    # Inactive LVs have no block device
                    pass

        for mpath_alias, mpath in devices["mpath"].items():
            # Devices contributing to the multipath
            mpath_parents = [
                self.major_minor_to_node_resource[n["major_minor"]] for n in mpath["nodes"]
            ]
            # The multipath device node
            mpath_node = self.major_minor_to_node_resource[mpath["block_device"]]
            for p in mpath_parents:
                # All the mpath_parents should have the same logical_drive
                mpath_node.logical_drive = mpath_parents[0].logical_drive
                mpath_node.add_parent(p)

        self._map_drives_to_device_to_node(devices, host_id, "mds", MdRaid, [],
                                           reported_device_node_paths)

        initiate_device_poll = (self._map_drives_to_device_to_node(
            devices, host_id, "zfspools", ZfsPool, ["name"],
            reported_device_node_paths) or initiate_device_poll)

        initiate_device_poll = (self._map_drives_to_device_to_node(
            devices, host_id, "zfsdatasets", ZfsDataset, ["name"],
            reported_device_node_paths) or initiate_device_poll)

        for bdev, (mntpnt, fstype) in devices["local_fs"].items():
            if fstype != "lustre":
                bdev_resource = self.major_minor_to_node_resource[bdev]
                self.update_or_create(
                    LocalMount,
                    parents=[bdev_resource],
                    mount_point=mntpnt,
                    fstype=fstype)

        # Create Partitions (devices that have 'parent' set)
        partition_identifiers = []
        for bdev in [x for x in devices["devs"].values() if x["parent"]]:
            this_node = self.major_minor_to_node_resource[bdev["major_minor"]]
            parent_resource = self.major_minor_to_node_resource[bdev["parent"]]

            if not parent_resource.logical_drive:
                raise RuntimeError("Parent %s of %s has no logical drive" %
                                   (parent_resource, bdev))

            partition, created = self.update_or_create(
                # ZfsPartitions should be differentiated as they are not usable for lustre
                ZfsPartition if bdev.get("is_zfs_reserved")
                or bdev["filesystem_type"] == "zfs_member" else Partition,
                parents=[parent_resource],
                container=parent_resource.logical_drive,
                number=bdev["partition_number"],
                size=bdev["size"],
                filesystem_type=bdev["filesystem_type"],
            )

            this_node.add_parent(partition)
            partition_identifiers.append(partition.id_tuple())

        # Finally remove any of the partitions that are no longer present.
        initiate_device_poll |= self.remove_missing_devices(
            host_id, Partition, partition_identifiers)
        initiate_device_poll |= self.remove_missing_devices(
            host_id, ZfsPartition, partition_identifiers)

        initiate_device_poll |= self.remove_missing_devicenodes(reported_device_node_paths)

        # If we see a device change and the data was sent by the agent poll rather than the
        # initial start-up, then we need to cause all of the HA peer agents, and any other
        # hosts that we share VolumeNodes with, to re-poll themselves.
        # This 'set' is probably a good balance between polling every node and no poll at all.
        if (initial_scan is False) and (initiate_device_poll is True):
            ha_peers = set(HaCluster.host_peers(ManagedHost.objects.get(id=host_id)))

            hosts_volume_node_ids = [
                volume_node.volume_id
                for volume_node in VolumeNode.objects.filter(host_id=host_id)
            ]

            all_volume_nodes = list(
                VolumeNode.objects.filter(volume_id__in=hosts_volume_node_ids))

            all_volume_node_hosts = ManagedHost.objects.filter(
                id__in=set(volume_node.host_id for volume_node in all_volume_nodes))

            ha_peers |= set(all_volume_node_hosts)
            JobSchedulerClient.trigger_plugin_update(
                [peer.id for peer in ha_peers], [host_id], ["linux"])