def _get_warpdrive_card(self, block_device):
    device_name = os.path.basename(block_device.name)
    sys_block_path = '{0}/block/{1}'.format(self.sys_path, device_name)

    # NOTE(russell_h): Trying to map a block device name to an LSI card
    # gets a little weird. It seems like if we follow the
    # /sys/block/<name> symlink, we'll find something that looks like:
    #
    # /sys/devices/pci0000:00/0000:00:02.0/0000:02:00.0/host3/
    # target3:1:0/3:1:0:0/block/sdb
    #
    # This seems to correspond to the card that ddcli reports has PCI
    # address 00:02:00:00
    real_path = os.path.realpath(sys_block_path)

    # pull out a segment such as 0000:02:00.0 and trim it to 00:02:00
    pci_address = real_path.split('/')[5][2:-2]

    devices = self._list_lsi_devices()

    matching_devices = [device for device in devices
                        if device['pci_address'] == pci_address]

    if len(matching_devices) == 0:
        raise errors.CleaningError(
            'Unable to locate an LSI card with a PCI Address matching '
            '{0} for block device {1}'.format(pci_address,
                                              block_device.name))

    if len(matching_devices) > 1:
        raise errors.CleaningError(
            'Found multiple LSI cards with a PCI Address matching '
            '{0} for block device {1}'.format(pci_address,
                                              block_device.name))

    return matching_devices[0]
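# A minimal, self-contained illustration of the sysfs path slicing above.
# The path is the example from the NOTE; the helper name is hypothetical
# and exists only to demonstrate the parsing:
def _pci_address_from_sysfs_path(real_path):
    """Extract '00:02:00' from a resolved /sys/block/<name> symlink."""
    # Segment 5 of the resolved path is the PCI function, e.g.
    # '0000:02:00.0'; strip the '0000:' domain prefix ('[2:]' removes
    # only the leading '00' of it here) and the '.0' function suffix.
    return real_path.split('/')[5][2:-2]

# _pci_address_from_sysfs_path(
#     '/sys/devices/pci0000:00/0000:00:02.0/0000:02:00.0/host3/'
#     'target3:1:0/3:1:0:0/block/sdb')
# -> '00:02:00'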
def _get_node_switchports(self, node, ports):
    """Find the chassis and ports the node is attached to

    Return a set of tuples (chassis, port). Supports pulling them from
    node['extra'], with support for pulling chassis/port/interface from
    port['extra'] in the future.

    :param node: a dict representation of a Node object
    :param ports: a dict representation of Ports connected to the node
    :return: a Set of tuples (chassis, port)
    """
    if not node.get('extra'):
        return set()
    LOG.info('Matching against node ports: %s', node.get('extra'))
    try:
        return set([
            (node['extra']['hardware/interfaces/0/'
                           'switch_chassis_id'].lower(),
             node['extra']['hardware/interfaces/0/'
                           'switch_port_id'].lower()),
            (node['extra']['hardware/interfaces/1/'
                           'switch_chassis_id'].lower(),
             node['extra']['hardware/interfaces/1/'
                           'switch_port_id'].lower())
        ])
    except KeyError:
        raise errors.CleaningError(
            'Node has malformed extra data, could not find chassis'
            ' and port: %s' % node['extra'])
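# Example node['extra'] shape consumed above (values illustrative):
#
#     {'hardware/interfaces/0/switch_chassis_id': 'AA:BB:CC:DD:EE:FF',
#      'hardware/interfaces/0/switch_port_id': 'Eth1/1',
#      'hardware/interfaces/1/switch_chassis_id': 'AA:BB:CC:DD:EE:00',
#      'hardware/interfaces/1/switch_port_id': 'Eth1/2'}
#
# which yields {('aa:bb:cc:dd:ee:ff', 'eth1/1'),
#               ('aa:bb:cc:dd:ee:00', 'eth1/2')}.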
def _verify_blockdevice_count(self, block_devices, model, count):
    if len([d for d in block_devices if d.model == model]) != count:
        raise errors.CleaningError(
            'Could not find %(count)s block devices with model name '
            '"%(model)s"' % {'count': count, 'model': model})
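# Example clean step built on the helper above; the model string and
# expected count are illustrative:
#
#     block_devices = hardware.dispatch_to_managers('list_block_devices')
#     self._verify_blockdevice_count(block_devices, 'NWD-BLP4-1600', 2)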
def _get_flavor_from_node(self, node):
    ram = node['properties']['memory_mb']
    if ram == (1024 * 32):
        return 'onmetal-compute1'
    if ram == (1024 * 128):
        return 'onmetal-io1'
    if ram == (1024 * 512):
        return 'onmetal-memory1'
    raise errors.CleaningError(
        'Unknown flavor: no flavor matches %s MiB of RAM' % ram)
def _handle_bios_update(actual_bios_version, expected_bios_version):
    """Handle system BIOS update stub.

    Future managers may override this to support automatic updates.
    """
    raise errors.CleaningError(
        "A manual BIOS update is required. Expected version '{0}' but "
        "version '{1}' was installed".format(expected_bios_version,
                                             actual_bios_version))
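# A hypothetical override of the stub above for a manager that can flash
# the BIOS automatically. The vendor tool path and flags are assumptions,
# not a real interface:
def _handle_bios_update_auto(actual_bios_version, expected_bios_version):
    try:
        # Flash the expected image, then let the next cleaning run
        # re-verify the installed version.
        utils.execute('/opt/vendor/flash_bios --version {0}'.format(
            expected_bios_version), shell=True)
    except processutils.ProcessExecutionError as e:
        raise errors.CleaningError(
            "Automatic BIOS update from '{0}' to '{1}' failed: {2}".format(
                actual_bios_version, expected_bios_version, e))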
def _get_tlv(self, tlv_type, lldp_info):
    """Return all LLDP values that match a TLV type (an int) as a list."""
    # Use a list because TLV type 127 may be used multiple times in LLDP
    values = []
    for tlv in lldp_info:
        if len(tlv) != 2:
            raise errors.CleaningError('Malformed LLDP info %s' % lldp_info)
        if tlv[0] == tlv_type:
            values.append(tlv[1])
    return values
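# Example: with decoded LLDP TLVs as (type, value) pairs (sample values
# illustrative), type 127 (vendor-specific) can legitimately repeat:
#
#     lldp_info = [(LLDP_CHASSIS_TYPE, 'aa:bb:cc:dd:ee:ff'),
#                  (LLDP_PORT_TYPE, 'Ethernet1/20'),
#                  (127, 'vendor-tlv-a'), (127, 'vendor-tlv-b')]
#     self._get_tlv(127, lldp_info)  # -> ['vendor-tlv-a', 'vendor-tlv-b']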
def _get_expected_property(node, node_property):
    try:
        expected_property = node['extra']['system_vendor'][node_property]
    except KeyError as e:
        # NOTE: KeyError has no 'message' attribute on Python 3; args[0]
        # holds the missing key on both Python 2 and 3.
        raise errors.CleaningError(
            "Expected property '{0}' not found. For cleaning to proceed "
            "you must set the property 'system_vendor/{1}' in the node's "
            "extra field, for example: $ openstack baremetal node set "
            "$NODE_ID --extra system_vendor/{1}=$VALUE".format(
                e.args[0], node_property))
    return expected_property
def companyx_verify_device_lifecycle(self, node, ports):
    """Verify node is not beyond useful life of 3 years."""
    create_date = node.get('created_at')
    if create_date is not None:
        # created_at is assumed to be an ISO 8601 UTC timestamp such as
        # '2016-08-01T12:00:00+00:00'; time.strptime() with no format
        # string expects ctime()-style output and would fail to parse it.
        created = time.strptime(create_date, '%Y-%m-%dT%H:%M:%S+00:00')
        server_age = time.time() - time.mktime(created)
        if server_age > (60 * 60 * 24 * 365 * 3):
            raise errors.CleaningError(
                'Server is too old to pass cleaning!')
        else:
            LOG.info('Node is %s seconds old, younger than 3 years, '
                     'cleaning passes.', server_age)
def execute_clean_step(self, step, node, ports, clean_version=None,
                       **kwargs):
    """Execute a clean step.

    :param step: A clean step with 'step', 'priority' and 'interface'
        keys
    :param node: A dict representation of a node
    :param ports: A dict representation of ports attached to node
    :param clean_version: The clean version as returned by
        hardware.get_current_versions() at the beginning of
        cleaning/zapping
    :returns: a CommandResult object with command_result set to whatever
        the step returns.
    """
    # Ensure the agent is still the same version, or raise an exception
    LOG.debug('Executing clean step %s', step)
    hardware.cache_node(node)
    hardware.check_versions(clean_version)

    if 'step' not in step:
        msg = 'Malformed clean_step, no "step" key: %s' % step
        LOG.error(msg)
        raise ValueError(msg)

    try:
        result = hardware.dispatch_to_managers(step['step'], node, ports)
    except Exception as e:
        msg = ('Error performing clean step %(step)s: %(err)s' %
               {'step': step['step'], 'err': e})
        LOG.exception(msg)
        raise errors.CleaningError(msg)

    LOG.info('Clean step completed: %(step)s, result: %(result)s',
             {'step': step, 'result': result})

    # Cast result tuples (like output of utils.execute) as lists, or
    # API throws errors
    if isinstance(result, tuple):
        result = list(result)

    # Return the step that was executed so we can dispatch
    # to the appropriate Ironic interface
    return {'clean_result': result, 'clean_step': step}
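# Example invocation (shapes are illustrative; in practice the Ironic
# conductor sends the step over the agent command API):
#
#     step = {'step': 'erase_devices', 'priority': 10,
#             'interface': 'deploy'}
#     agent.execute_clean_step(
#         step, node, ports,
#         clean_version={'generic_hardware_manager': '1'})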
def check_ipmi_users(self, node, ports):
    """Check users having IPMI access with admin rights

    In the CERN environment there should be only 2 users having admin
    access to the IPMI interface. One of them is
    node.driver_info["ipmi_username"] and the other is admin/root.
    As the superadmin's username is not known beforehand, if we detect
    >2 users, cleaning should fail. In the future we may want to
    implement logic to automatically delete any unnecessary user from
    IPMI.
    """
    for channel in range(16):
        # Count number of enabled admin users
        out, e = utils.execute(
            "ipmitool user list {0!s} | awk '{{if ($3 == \"true\" && $6 == \"ADMINISTRATOR\") print $0;}}' | wc -l"
            .format(channel + 1), shell=True)

        # This error message indicates we started querying a
        # non-existent channel; check it before inspecting the count,
        # otherwise the empty output would fail the check below.
        if "Get User Access command failed" in e:
            break

        if int(out) != 1:
            raise errors.CleaningError(
                "Detected {} admin users for IPMI!".format(out.strip()))

        # There is exactly 1 ipmi admin user; check that its name matches
        # the one known by Ironic. Note: no 'wc -l' here, we compare the
        # printed user name, not a count, and strip the trailing newline.
        out, e = utils.execute(
            "ipmitool user list {0!s} | awk '{{if ($3 == \"true\" && $6 == \"ADMINISTRATOR\") print $2;}}'"
            .format(channel + 1), shell=True)
        if out.strip() != node.get('driver_info')['ipmi_username']:
            raise errors.CleaningError(
                "Detected illegal admin user \"{}\" for IPMI!".format(
                    out.strip()))
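# For reference, the awk programs above parse `ipmitool user list <n>`
# output; $2 is the user name, $3 the Callin flag and $6 the privilege
# level. A sample row (values illustrative):
#
#     ID  Name   Callin  Link Auth  IPMI Msg  Channel Priv Limit
#     2   root   true    true       true      ADMINISTRATOR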
def verify_ports(self, node, ports):
    """Given Port dicts, verify they match LLDP information

    :param node: a dict representation of a Node object
    :param ports: a dict representation of Ports connected to the node
    :raises CleaningError: if any of the steps determine the node
        does not match the given data
    """
    node_switchports = self._get_node_switchports(node, ports)
    if not node_switchports:
        # Fail gracefully if we cannot find node ports. If call is made
        # with only driver_info, don't fail.
        return

    interface_names = [x.name for x in self.list_network_interfaces()]
    lldp_info = netutils.get_lldp_info(interface_names)

    # Both should be a set of tuples: (chassis, port)
    lldp_ports = set()
    for lldp in lldp_info.values():
        lldp_ports.add(self._get_port_from_lldp(lldp))

    LOG.info('LLDP ports: %s', lldp_ports)
    LOG.info('Node ports: %s', node_switchports)
    # TODO(JoshNang) add check that ports, chassis *and* interface match
    # when port/chassis are stored on Port objects

    # Compare the ports
    if node_switchports != lldp_ports:
        LOG.error('Ports did not match, LLDP: %(lldp)s, Node: %(node)s',
                  {'lldp': lldp_ports, 'node': node_switchports})
        # TODO(supermari0) The old error here - VerificationError - seems
        # not to exist or have existed. It would be good if we had more
        # specific errors that subclass CleaningError.
        raise errors.CleaningError(
            'Detected port mismatches. LLDP detected_ports: %(lldp)s, '
            'Node ports: %(node)s.' %
            {'lldp': lldp_ports, 'node': node_switchports})

    # Return the LLDP info
    LOG.debug('Ports match, returning LLDP info: %s', lldp_info)
    # Ensure the return value is properly encoded, or JSON throws errors
    return unicode(lldp_info)
def _get_port_from_lldp(self, lldp_info):
    """Return a tuple (chassis, port) from the given LLDP info

    :param lldp_info: the return from netutils.get_lldp_info()
    :return: a tuple (chassis, port)
    """
    tlv_port = self._get_tlv(LLDP_PORT_TYPE, lldp_info)
    tlv_chassis = self._get_tlv(LLDP_CHASSIS_TYPE, lldp_info)
    if len(tlv_port) != 1 or len(tlv_chassis) != 1:
        raise errors.CleaningError(
            'Malformed LLDP info. Received port: %(port)s, '
            'chassis: %(chassis)s' %
            {'port': tlv_port, 'chassis': tlv_chassis})

    port_match = re.search(r'\d{1,2}/\d{1,2}', tlv_port[0])
    if port_match is None:
        # re.search() returns None on no match; calling .group() on it
        # would raise AttributeError instead of a useful error.
        raise errors.CleaningError(
            'Could not parse a port number out of LLDP port value '
            '%s' % tlv_port[0])
    lldp_port = 'eth' + port_match.group()

    return tlv_chassis[0].lower(), lldp_port.lower()
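# Example of the normalization above (values illustrative):
#
#     re.search(r'\d{1,2}/\d{1,2}', 'Ethernet1/20').group()  # -> '1/20'
#
# so TLVs ('AA:BB:CC:DD:EE:FF', 'Ethernet1/20') come back as
# ('aa:bb:cc:dd:ee:ff', 'eth1/20'), matching the node['extra'] format
# consumed by _get_node_switchports().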
def verify_bios_version(self, node, ports):
    """Verify the BIOS version.

    To avoid the case where two different products may have the same
    BIOS version, we also check that the product is as expected.
    """
    if _bios_verification_disabled(node):
        LOG.warning('BIOS version verification has been disabled.')
        return True

    vendor_info = hardware.dispatch_to_managers('get_system_vendor_info')
    expected_product_name = _get_expected_property(node, 'product_name')
    actual_product_name = vendor_info.product_name
    product_match = expected_product_name == actual_product_name

    expected_bios_version = _get_expected_property(node, 'bios_version')
    actual_bios_version = _get_bios()
    bios_version_match = expected_bios_version == actual_bios_version

    if product_match and bios_version_match:
        LOG.debug('Specified product and BIOS version match; '
                  'no update is required.')
        return True
    elif product_match:
        LOG.debug('BIOS version did not match; attempting an update.')
        try:
            _handle_bios_update(actual_bios_version, expected_bios_version)
        except Exception as e:
            # Log and pass through the exception so cleaning will fail
            LOG.exception(e)
            raise
        return True
    else:
        raise errors.CleaningError(
            "Product did not match. Expected product '{0}', but the "
            "actual product is '{1}'. Check that the product set in the "
            "node's extra field under 'system_vendor/product_name' "
            "matches the actual product.".format(expected_product_name,
                                                 actual_product_name))
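# verify_bios_version() relies on a _bios_verification_disabled() helper
# that is not shown in this section. A minimal sketch, assuming the
# switch lives in the node's extra field (the key name is hypothetical):
def _bios_verification_disabled(node):
    # Treat any truthy value as "verification disabled".
    return bool(node.get('extra', {}).get('disable_bios_verification'))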
def run(self):
    """Run the Ironic Python Agent."""
    # Get the UUID so we can heartbeat to Ironic. Raises LookupNodeError
    # if there is an issue (uncaught, restart agent)
    self.started_at = _time()

    # Cache hw managers at runtime, not load time. See bug 1490008.
    hardware.load_managers()
    # Operator-settable delay before hardware actually comes up.
    # Helps with slow RAID drivers - see bug 1582797.
    if self.hardware_initialization_delay > 0:
        LOG.info('Waiting %d seconds before proceeding',
                 self.hardware_initialization_delay)
        time.sleep(self.hardware_initialization_delay)

    if not self.standalone:
        # Inspection should be started before call to lookup, otherwise
        # lookup will fail due to unknown MAC.
        uuid = None
        if cfg.CONF.inspection_callback_url:
            uuid = inspector.inspect()

        if self.api_url:
            self._wait_for_interface()
            content = self.api_client.lookup_node(
                hardware_info=hardware.dispatch_to_managers(
                    'list_hardware_info'),
                timeout=self.lookup_timeout,
                starting_interval=self.lookup_interval,
                node_uuid=uuid)

            LOG.debug('Received lookup results: %s', content)
            self.node = content['node']
            LOG.info('Lookup succeeded, node UUID is %s',
                     self.node['uuid'])
            hardware.cache_node(self.node)
            self.heartbeat_timeout = content['config']['heartbeat_timeout']

            # Update config with values from Ironic. Iterate over the
            # nested 'metrics'/'metrics_statsd' dicts, not the whole
            # config, so unrelated keys are not set on the option groups.
            config = content.get('config', {})
            if config.get('metrics'):
                for opt, val in config['metrics'].items():
                    setattr(cfg.CONF.metrics, opt, val)
            if config.get('metrics_statsd'):
                for opt, val in config['metrics_statsd'].items():
                    setattr(cfg.CONF.metrics_statsd, opt, val)
        elif cfg.CONF.inspection_callback_url:
            LOG.info('No ipa-api-url configured, Heartbeat and lookup '
                     'skipped for inspector.')
        else:
            LOG.error('Neither ipa-api-url nor inspection_callback_url '
                      'found, please check your pxe append parameters.')

    if cfg.CONF.init_clean_steps:
        clean_steps = sorted(jsonutils.loads(cfg.CONF.init_clean_steps),
                             key=lambda k: k['priority'],
                             reverse=True)
        for step in clean_steps:
            LOG.debug('Executing clean step %s', step)
            if 'step' not in step:
                LOG.error('Malformed clean_step, no "step" key: %s', step)
                continue
            try:
                result = hardware.dispatch_to_managers(
                    step['step'], self.node, [])
                # Steps like RAID configuration need time for the system
                # to become aware of the new disks.
                time.sleep(cfg.CONF.init_clean_steps_interval)
            except Exception as e:
                msg = ('Error performing clean_step %(step)s: %(err)s' %
                       {'step': step['step'], 'err': e})
                LOG.exception(msg)
                raise errors.CleaningError(msg)
            LOG.info('Clean step completed: %(step)s, result: %(result)s',
                     {'step': step, 'result': result})

    if netutils.is_ipv6_enabled():
        # Listen on both IP versions, assuming IPV6_V6ONLY isn't enabled
        # (the default behaviour on Linux).
        simple_server.WSGIServer.address_family = socket.AF_INET6
    wsgi = simple_server.make_server(
        self.listen_address.hostname,
        self.listen_address.port,
        self.api,
        server_class=simple_server.WSGIServer)

    if not self.standalone and self.api_url:
        # Don't start heartbeating until the server is listening
        self.heartbeater.start()

    try:
        wsgi.serve_forever()
    except BaseException:
        LOG.exception('shutting down')

    if not self.standalone and self.api_url:
        self.heartbeater.stop()
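# Example value for the init_clean_steps option consumed above (the
# exact kernel parameter spelling depends on how the deployment maps
# config options onto the PXE append line; the steps are illustrative):
#
#     ipa-init-clean-steps=[{"step": "delete_configuration",
#                            "priority": 20},
#                           {"step": "create_configuration",
#                            "priority": 10}]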
def delete_configuration(self, node, ports):
    """Deletes RAID configuration on the bare metal.

    This method deletes all the RAID disks on the bare metal.

    :param node: A dictionary of the node object
    :param ports: A list of dictionaries containing information of ports
        for the node
    """
    LOG.info("Deleting RAID configurations")

    raid_devices, _ = utils.execute(
        "cat /proc/mdstat | grep 'active' | awk '{ print $1 }'",
        shell=True)

    for device in ['/dev/' + x for x in raid_devices.split()]:
        LOG.info(
            "Deleting RAID configuration for device {}".format(device))
        try:
            component_devices, err = utils.execute(
                "mdadm --detail {} | grep 'active sync' | "
                "awk '{{ print $7 }}'".format(device), shell=True)
            LOG.info("Component devices for {}: {}".format(
                device, component_devices))
            if err:
                raise processutils.ProcessExecutionError(err)
        except (processutils.ProcessExecutionError, OSError) as e:
            raise errors.CleaningError(
                "Error getting details of RAID device {}. {}".format(
                    device, e))

        # Wipe partition tables from the RAID device. Needed before
        # creating a new md device.
        try:
            LOG.info("Wiping device {}".format(device))
            utils.execute("wipefs -af {}".format(device), shell=True)
        except (processutils.ProcessExecutionError, OSError) as e:
            raise errors.CleaningError(
                "Error wiping RAID device {}. {}".format(device, e))

        try:
            LOG.info("Stopping device {}".format(device))
            utils.execute("mdadm --stop {}".format(device), shell=True)
        except (processutils.ProcessExecutionError, OSError) as e:
            raise errors.CleaningError(
                "Error stopping RAID device {}. {}".format(device, e))

        try:
            LOG.info("Removing device {}".format(device))
            utils.execute("mdadm --remove {}".format(device), shell=True)
        except processutils.ProcessExecutionError:
            # After successful stop this returns
            # "mdadm: error opening /dev/md3: No such file or directory"
            # with error code 1, which we can safely ignore
            pass
        LOG.info("Removed RAID device {}".format(device))

        for component_device in component_devices.split():
            try:
                _, err = utils.execute(
                    "mdadm --examine {}".format(component_device),
                    shell=True)
                if "No md superblock detected" in err:
                    continue
                _, err = utils.execute(
                    "mdadm --zero-superblock {}".format(component_device),
                    shell=True)
                if err:
                    raise processutils.ProcessExecutionError(err)
            except (processutils.ProcessExecutionError, OSError) as e:
                raise errors.CleaningError(
                    "Error erasing superblock for device {}. {}".format(
                        component_device, e))
            LOG.info(
                "Deleted md superblock on {}".format(component_device))

        LOG.info("Removed RAID configuration of {}".format(device))

    LOG.info("Finished deleting RAID configurations")
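# For reference, the /proc/mdstat parsing at the top of
# delete_configuration() expects lines of the form (illustrative):
#
#     md0 : active raid1 sdb1[1] sda1[0]
#
# from which `awk '{ print $1 }'` yields 'md0'; and in `mdadm --detail`
# output, column 7 of the 'active sync' rows is the component device,
# e.g. '/dev/sda1'.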