    def _get_warpdrive_card(self, block_device):
        device_name = os.path.basename(block_device.name)
        sys_block_path = '{0}/block/{1}'.format(self.sys_path, device_name)

        # NOTE(russell_h): Trying to map a block device name to an LSI card
        # gets a little weird. It seems like if we follow the
        # /sys/block/<name> symlink, we'll find something that looks like:
        #
        # /sys/devices/pci0000:00/0000:00:02.0/0000:02:00.0/host3/
        #      target3:1:0/3:1:0:0/block/sdb
        #
        # This seems to correspond to the card that ddcli reports has PCI
        # address 00:02:00:00
        real_path = os.path.realpath(sys_block_path)

        # pull out a segment such as 0000:02:00.0 and trim it to 00:02:00
        pci_address = real_path.split('/')[5][2:-2]
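        # Using the example path above:
        #   real_path.split('/') -> ['', 'sys', 'devices', 'pci0000:00',
        #                            '0000:00:02.0', '0000:02:00.0', ...]
        # so element 5 is '0000:02:00.0' and [2:-2] leaves '00:02:00'.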

        devices = self._list_lsi_devices()

        matching_devices = [device for device in devices if
                            device['pci_address'] == pci_address]

        if len(matching_devices) == 0:
            raise errors.CleaningError(('Unable to locate an LSI '
                'card with a PCI Address matching {0} for block device '
                '{1}').format(pci_address, block_device.name))

        if len(matching_devices) > 1:
            raise errors.CleaningError(('Found multiple LSI '
                'cards with a PCI Address matching {0} for block device '
                '{1}').format(pci_address, block_device.name))

        return matching_devices[0]
    def _get_node_switchports(self, node, ports):
        """Find the chassis and ports the node is attached to

        Return a set of tuples (chassis, port). Currently pulls them
        from node['extra']; support for pulling chassis/port/interface from
        port['extra'] may be added in the future.

        :param node: a dict representation of a Node object
        :param ports: a dict representation of Ports connected to the node
        :return: a Set of tuples (chassis, port)
        """
        if not node.get('extra'):
            return set()
        LOG.info('Matching against node ports: %s', node.get('extra'))
        try:
            return set([
                (node['extra']['hardware/interfaces/0/'
                               'switch_chassis_id'].lower(),
                 node['extra']['hardware/interfaces/0/'
                               'switch_port_id'].lower()),
                (node['extra']['hardware/interfaces/1/'
                               'switch_chassis_id'].lower(),
                 node['extra']['hardware/interfaces/1/'
                               'switch_port_id'].lower())
            ])
        except KeyError:
            raise errors.CleaningError(
                'Node has malformed extra data, could not find chassis'
                ' and port: %s' % node['extra'])
    def _verify_blockdevice_count(self, block_devices, model, count):
        if len([d for d in block_devices if d.model == model]) != count:
            raise errors.CleaningError('Could not find %(count)s block '
                                       'devices with model name "%(model)s"' %
                                       {
                                           'count': count,
                                           'model': model
                                       })
    def _get_flavor_from_node(self, node):
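        # Map the node's total RAM (properties['memory_mb']) to a flavor
        # name; for example, 1024 * 32 = 32768 MiB (32 GiB) maps to
        # 'onmetal-compute1'.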
        ram = node['properties']['memory_mb']
        if ram == (1024 * 32):
            return 'onmetal-compute1'
        if ram == (1024 * 128):
            return 'onmetal-io1'
        if ram == (1024 * 512):
            return 'onmetal-memory1'
        raise errors.CleaningError('unknown flavor')
def _handle_bios_update(actual_bios_version, expected_bios_version):
    """Handle system BIOS update stub.

    Future managers may override this to support automatic updates.
    """
    raise errors.CleaningError(
        "A manual BIOS update is required. Expected version '{0}' but "
        "version '{1}' was installed".format(expected_bios_version,
                                             actual_bios_version))
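# A minimal sketch (hypothetical helper names) of how a future hardware
# manager could override the stub above to perform an automatic update
# instead of failing the clean step:
#
#     def _handle_bios_update(actual_bios_version, expected_bios_version):
#         image = _fetch_bios_image(expected_bios_version)   # hypothetical
#         _flash_bios_image(image)                           # hypothetical
#         LOG.info('Updated BIOS from %s to %s', actual_bios_version,
#                  expected_bios_version)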
    def _get_tlv(self, tlv_type, lldp_info):
        """Return all LLDP values that match a TLV type (an int) as a list."""
        # Use a list because TLV type 127 may be used multiple times in LLDP
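        # For example (hypothetical TLVs): with
        #   lldp_info = [(1, chassis_tlv), (2, port_tlv),
        #                (127, vendor_a), (127, vendor_b)]
        # self._get_tlv(127, lldp_info) would return [vendor_a, vendor_b].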
        values = []
        for tlv in lldp_info:
            if len(tlv) != 2:
                raise errors.CleaningError('Malformed LLDP info %s'
                                           % lldp_info)
            if tlv[0] == tlv_type:
                values.append(tlv[1])
        return values
def _get_expected_property(node, node_property):
    try:
        expected_property = node['extra']['system_vendor'][node_property]
    except KeyError as e:
        raise errors.CleaningError(
            "Expected property '{0}' not found. For cleaning to proceed "
            "you must set the property 'system_vendor/{1}' in the node's "
            "extra field, for example: $ openstack baremetal node set "
            "$NODE_ID --extra system_vendor/{1}=$VALUE".format(
                e.message, node_property))
    return expected_property
    def companyx_verify_device_lifecycle(self, node, ports):
        """Verify node is not beyond useful life of 3 years."""
        create_date = node.get('created_at')
        if create_date is not None:
            server_age = time.time() - time.mktime(time.strptime(create_date))
            if server_age > (60 * 60 * 24 * 365 * 3):
                raise errors.CleaningError(
                    'Server is too old to pass cleaning!')
            else:
                LOG.info(
                    'Node is %s seconds old, younger than 3 years, '
                    'cleaning passes.', server_age)
    def execute_clean_step(self,
                           step,
                           node,
                           ports,
                           clean_version=None,
                           **kwargs):
        """Execute a clean step.

        :param step: A clean step with 'step', 'priority' and 'interface' keys
        :param node: A dict representation of a node
        :param ports: A dict representation of ports attached to node
        :param clean_version: The clean version as returned by
                              hardware.get_current_versions() at the beginning
                              of cleaning/zapping
        :returns: a CommandResult object with command_result set to whatever
            the step returns.
        """
        # Ensure the agent is still the same version, or raise an exception
        LOG.debug('Executing clean step %s', step)
        hardware.cache_node(node)
        hardware.check_versions(clean_version)

        if 'step' not in step:
            msg = 'Malformed clean_step, no "step" key: %s' % step
            LOG.error(msg)
            raise ValueError(msg)
        try:
            result = hardware.dispatch_to_managers(step['step'], node, ports)
        except Exception as e:
            msg = ('Error performing clean step %(step)s: %(err)s' % {
                'step': step['step'],
                'err': e
            })
            LOG.exception(msg)
            raise errors.CleaningError(msg)

        LOG.info('Clean step completed: %(step)s, result: %(result)s', {
            'step': step,
            'result': result
        })

        # Cast result tuples (like output of utils.execute) as lists, or
        # API throws errors
        if isinstance(result, tuple):
            result = list(result)

        # Return the step that was executed so we can dispatch
        # to the appropriate Ironic interface
        return {'clean_result': result, 'clean_step': step}
    def check_ipmi_users(self, node, ports):
        """Check users having IPMI access with admin rights

        In CERN environment there should be only 2 users having admin access
        to the IPMI interface. One of them is node.driver_info["ipmi_username"]
        and the other is admin/root.

        As the superadmin's username is not known beforehand, if we detect >2
        users, cleaning should fail. In future we may want to implement logic
        to automatically delete any unnecessary user from IPMI.
        """
        for channel in range(16):
            # Count the number of enabled admin users on this channel
            out, e = utils.execute(
                "ipmitool user list {0!s} | awk '{{if ($3 == \"true\" && $6 == \"ADMINISTRATOR\") print $0;}}' | wc -l"
                .format(channel + 1),
                shell=True)

            # This error message indicates we started querying a
            # non-existent channel, so there is nothing left to check
            if "Get User Access command failed" in e:
                break

            if int(out) != 1:
                raise errors.CleaningError(
                    "Detected {} admin users for IPMI!".format(out.strip()))

            # There is exactly one enabled admin user; check that its name
            # matches the one known by Ironic
            out, e = utils.execute(
                "ipmitool user list {0!s} | awk '{{if ($3 == \"true\" && $6 == \"ADMINISTRATOR\") print $2;}}'"
                .format(channel + 1),
                shell=True)
            if out.strip() != node.get('driver_info')['ipmi_username']:
                raise errors.CleaningError(
                    "Detected illegal admin user \"{}\" for IPMI!".format(
                        out.strip()))
    def verify_ports(self, node, ports):
        """Given Port dicts, verify they match LLDP information

        :param node: a dict representation of a Node object
        :param ports: a dict representation of Ports connected to the node
        :raises CleaningError: if any of the steps determine the node
                does not match the given data
        """
        node_switchports = self._get_node_switchports(node, ports)
        if not node_switchports:
            # Fail gracefully if we cannot find node ports. If call is made
            # with only driver_info, don't fail.
            return

        interface_names = [x.name for x in self.list_network_interfaces()]
        lldp_info = netutils.get_lldp_info(interface_names)

        # Both should be a set of tuples: (chassis, port)
        lldp_ports = set()

        for lldp in lldp_info.values():
            lldp_ports.add(self._get_port_from_lldp(lldp))
        LOG.info('LLDP ports: %s', lldp_ports)
        LOG.info('Node ports: %s', node_switchports)
        # TODO(JoshNang) add check that ports, chassis *and* interface match
        # when port/chassis are stored on Port objects

        # Compare the ports
        if node_switchports != lldp_ports:
            LOG.error('Ports did not match, LLDP: %(lldp)s, Node: %(node)s', {
                'lldp': lldp_ports,
                'node': node_switchports
            })
            # TODO(supermari0) The old error here - VerificationError - seems
            # not to exist or have existed. It would be good if we had more
            # specific errors that subclass CleaningError.
            raise errors.CleaningError(
                'Detected port mismatches. LLDP detected_ports: %(lldp)s, '
                'Node ports: %(node)s.' % {
                    'lldp': lldp_ports,
                    'node': node_switchports
                })

        # Return the LLDP info
        LOG.debug('Ports match, returning LLDP info: %s', lldp_info)
        # Ensure the return value is properly encoded or JSON throws errors
        return unicode(lldp_info)
    def _get_port_from_lldp(self, lldp_info):
        """Return a set of tuples (chassis, port) from the given LLDP info

        :param lldp_info: the return from netutils.get_lldp_info()
        :return: a Set of tuples (chassis, port)
        """

        tlv_port = self._get_tlv(LLDP_PORT_TYPE, lldp_info)
        tlv_chassis = self._get_tlv(LLDP_CHASSIS_TYPE, lldp_info)

        if len(tlv_port) != 1 or len(tlv_chassis) != 1:
            raise errors.CleaningError(
                'Malformed LLDP info. Received port: %(port)s, '
                'chassis: %(chassis)s' %
                {'port': tlv_port, 'chassis': tlv_chassis})

        port_number = re.search(r'\d{1,2}/\d{1,2}', tlv_port[0])
        if not port_number:
            raise errors.CleaningError(
                'Could not parse a port number from %s' % tlv_port[0])
        lldp_port = 'eth' + port_number.group()
        return tlv_chassis[0].lower(), lldp_port.lower()
    def verify_bios_version(self, node, ports):
        """Verify the BIOS version.

        To avoid the case where two different products may have the same BIOS
        version, we also check that the product is as expected.
        """
        if _bios_verification_disabled(node):
            LOG.warning('BIOS version verification has been disabled.')
            return True

        vendor_info = hardware.dispatch_to_managers('get_system_vendor_info')

        expected_product_name = _get_expected_property(node, 'product_name')
        actual_product_name = vendor_info.product_name
        product_match = expected_product_name == actual_product_name

        expected_bios_version = _get_expected_property(node, 'bios_version')
        actual_bios_version = _get_bios()
        bios_version_match = expected_bios_version == actual_bios_version

        if product_match and bios_version_match:
            LOG.debug('Specified product and BIOS version match; '
                      'no update is required.')
            return True
        elif product_match:
            LOG.debug('BIOS version did not match; attempting an update.')
            try:
                _handle_bios_update(actual_bios_version, expected_bios_version)
            except Exception as e:
                # Log and pass through the exception so cleaning will fail
                LOG.exception(e)
                raise
            return True
        else:
            raise errors.CleaningError(
                "Product did not match. Expected product '{0}', but the "
                "actual product is '{1}'. Check that the product set in the "
                "node's extra field under 'system_vendor/product_name' "
                "matches the actual product.".format(expected_product_name,
                                                     actual_product_name))
    def run(self):
        """Run the Ironic Python Agent."""
        # Get the UUID so we can heartbeat to Ironic. Raises LookupNodeError
        # if there is an issue (uncaught, restart agent)
        self.started_at = _time()

        # Cache hw managers at runtime, not load time. See bug 1490008.
        hardware.load_managers()
        # Operator-settable delay before hardware actually comes up.
        # Helps with slow RAID drivers - see bug 1582797.
        if self.hardware_initialization_delay > 0:
            LOG.info('Waiting %d seconds before proceeding',
                     self.hardware_initialization_delay)
            time.sleep(self.hardware_initialization_delay)

        if not self.standalone:
            # Inspection should be started before call to lookup, otherwise
            # lookup will fail due to unknown MAC.
            uuid = None
            if cfg.CONF.inspection_callback_url:
                uuid = inspector.inspect()

            if self.api_url:
                self._wait_for_interface()
                content = self.api_client.lookup_node(
                    hardware_info=hardware.dispatch_to_managers(
                        'list_hardware_info'),
                    timeout=self.lookup_timeout,
                    starting_interval=self.lookup_interval,
                    node_uuid=uuid)

                LOG.debug('Received lookup results: %s', content)
                self.node = content['node']
                LOG.info('Lookup succeeded, node UUID is %s',
                         self.node['uuid'])
                hardware.cache_node(self.node)
                self.heartbeat_timeout = content['config']['heartbeat_timeout']

                # Update config with values from Ironic
                config = content.get('config', {})
                if config.get('metrics'):
                    for opt, val in config.get('metrics').items():
                        setattr(cfg.CONF.metrics, opt, val)
                if config.get('metrics_statsd'):
                    for opt, val in config.get('metrics_statsd').items():
                        setattr(cfg.CONF.metrics_statsd, opt, val)
            elif cfg.CONF.inspection_callback_url:
                LOG.info('No ipa-api-url configured, Heartbeat and lookup '
                         'skipped for inspector.')
            else:
                LOG.error('Neither ipa-api-url nor inspection_callback_url '
                          'found, please check your PXE append parameters.')

        if cfg.CONF.init_clean_steps:
            clean_steps = sorted(jsonutils.loads(cfg.CONF.init_clean_steps),
                                 key=lambda k: k['priority'],
                                 reverse=True)
            for step in clean_steps:
                LOG.debug('Executing clean step %s', step)
                if 'step' not in step:
                    LOG.error('Malformed clean_step, no "step" key: %s', step)
                    continue

                try:
                    result = hardware.dispatch_to_managers(
                        step['step'], self.node, [])
                    # Steps like RAID config need time for the system to
                    # become aware of the new disks
                    time.sleep(cfg.CONF.init_clean_steps_interval)
                except Exception as e:
                    msg = ('Error performing clean_step %(step)s: %(err)s' % {
                        'step': step['step'],
                        'err': e
                    })
                    LOG.exception(msg)
                    raise errors.CleaningError(msg)
                LOG.info('Clean step completed: %(step)s, result: %(result)s',
                         {
                             'step': step,
                             'result': result
                         })

        if netutils.is_ipv6_enabled():
            # Listen on both IP versions, assuming IPV6_V6ONLY isn't enabled
            # (the default behaviour on Linux)
            simple_server.WSGIServer.address_family = socket.AF_INET6
        wsgi = simple_server.make_server(self.listen_address.hostname,
                                         self.listen_address.port,
                                         self.api,
                                         server_class=simple_server.WSGIServer)

        if not self.standalone and self.api_url:
            # Don't start heartbeating until the server is listening
            self.heartbeater.start()

        try:
            wsgi.serve_forever()
        except BaseException:
            LOG.exception('shutting down')

        if not self.standalone and self.api_url:
            self.heartbeater.stop()
    def delete_configuration(self, node, ports):
        """Deletes RAID configuration on the bare metal.

        This method deletes all the RAID disks on the bare metal.

        :param node: A dictionary of the node object
        :param ports: A list of dictionaries containing information of ports
                      for the node
        """
        LOG.info("Deleting RAID configurations")

        raid_devices, _ = utils.execute(
            "cat /proc/mdstat | grep 'active' | awk '{ print $1 }'",
            shell=True)
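        # /proc/mdstat lines for assembled arrays look roughly like:
        #   md0 : active raid1 sda1[1] sdb1[0]
        # so the pipeline above yields the md device names, e.g. 'md0'.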

        for device in ['/dev/' + x for x in raid_devices.split()]:
            LOG.info(
                "Deleting RAID configuration for device {}".format(device))
            try:
                component_devices, err = utils.execute(
                    "mdadm --detail {} | grep 'active sync' | awk '{{ print $7 }}'"
                    .format(device),
                    shell=True)
                LOG.info("Component devices for {}: {}".format(
                    device, component_devices))

                if err:
                    raise processutils.ProcessExecutionError(err)
            except (processutils.ProcessExecutionError, OSError) as e:
                raise errors.CleaningError(
                    "Error getting details of RAID device {}. {}".format(
                        device, e))

            # Wipe partition tables from the RAID device. Needed before
            # creating a new md device.
            try:
                LOG.info("Wiping device {}".format(device))
                utils.execute("wipefs -af {}".format(device), shell=True)
            except (processutils.ProcessExecutionError, OSError) as e:
                raise errors.CleaningError(
                    "Error wiping RAID device {}. {}".format(device, e))

            try:
                LOG.info("Stopping device {}".format(device))
                utils.execute("mdadm --stop {}".format(device), shell=True)
            except (processutils.ProcessExecutionError, OSError) as e:
                raise errors.CleaningError(
                    "Error stopping RAID device {}. {}".format(device, e))

            try:
                LOG.info("Removing device {}".format(device))
                utils.execute("mdadm --remove {}".format(device), shell=True)
            except processutils.ProcessExecutionError:
                # After successful stop this returns
                # "mdadm: error opening /dev/md3: No such file or directory"
                # with error code 1, which we can safely ignore
                pass
            LOG.info("Removed RAID device {}".format(device))

            for component_device in component_devices.split():
                try:
                    _, err = utils.execute(
                        "mdadm --examine {}".format(component_device),
                        shell=True)
                    if "No md superblock detected" in err:
                        continue

                    _, err = utils.execute(
                        "mdadm --zero-superblock {}".format(component_device),
                        shell=True)
                    if err:
                        raise processutils.ProcessExecutionError(err)
                except (processutils.ProcessExecutionError, OSError) as e:
                    raise errors.CleaningError(
                        "Error erasing superblock for device {}. {}".format(
                            component_device, e))
                LOG.info(
                    "Deleted md superblock on {}".format(component_device))

            LOG.info("Removed RAID configuration of {}".format(device))

        LOG.info("Finished deleting RAID configurations")