Esempio n. 1
0
def main(argv=None):
    """Run every node health check and print a per-node report.

    Args:
        argv: Full argument vector with the program name first. Defaults
            to ``sys.argv`` when ``None``.

    The ``--nodes`` option restricts the run to nodes with the given names;
    when it is omitted, every node returned by Ironic is checked.
    """
    if argv is None:
        argv = sys.argv

    parser = base_parser('Diagnose node(s) for error states.')
    parser.add_argument('--nodes', nargs="+", type=str, default=[])

    # Parse the argv handed to this function (not sys.argv directly) so that
    # callers and tests can supply their own argument vector.
    args = parser.parse_args(argv[1:])
    auth = osapi.Auth.from_env_or_args(args=args)

    # Copy each selected node and attach an empty "ailments" list.
    # NOTE(review): presumably each check below appends keys of
    # NODE_AILMENTS_MESSAGES to that list -- confirm against the checks.
    nodes = {
        nid: dict(n, ailments=[])
        for nid, n in osrest.ironic_nodes(auth, details=True).items()
        if n['name'] in args.nodes or not args.nodes
    }

    node_in_error_state(nodes)
    node_stuck_deleting(nodes)
    node_maintenance_state_error(auth, nodes)
    node_not_in_freepool(auth, nodes)
    node_undead_instance(auth, nodes)
    resource_provider_failure(auth, nodes)

    for node_id, node in nodes.items():
        print("Checking Node {name} (uuid: {uuid})".format(
            name=node['name'], uuid=node_id))

        ailments = node.get("ailments")
        if ailments:
            for ailment in ailments:
                print("\t{node_name}: {msg}".format(
                    node_name=node.get("name"),
                    msg=NODE_AILMENTS_MESSAGES[ailment]))
        else:
            print("\tNODE PASSED ALL TESTS. EVERYTHING SHOULD BE FINE.")
Esempio n. 2
0
def main(argv=None):
    """Report Ironic nodes in the "error" provision state.

    Nodes flagged as being in maintenance are skipped. The report is
    printed to stdout and, when a Slack configuration was supplied, also
    posted to Slack.

    Args:
        argv: Full argument vector with the program name first. Defaults
            to ``sys.argv`` when ``None``.

    Returns:
        ``-1`` when required OS variables are missing, otherwise ``None``.
    """
    argv = sys.argv if argv is None else argv

    parser = base_parser(
        'Strange things, as per someone\'s definition of "strange".')
    parser.add_argument('-v', '--verbose', action='store_true')
    args = parser.parse_args(argv[1:])

    slack = Slackbot(args.slack) if args.slack else None

    # Environment first, then let the rc file override it.
    os_vars = {
        name: value
        for name, value in os.environ.items()
        if name.startswith(OS_ENV_PREFIX)
    }
    if args.osrc:
        os_vars.update(osapi.load_osrc(args.osrc))

    absent = set(osapi.Auth.required_os_vars) - set(os_vars)
    if absent:
        print(
            'Missing required OS values in env/rcfile: {}'.format(
                ', '.join(absent)),
            file=sys.stderr)
        return -1

    auth = osapi.Auth(os_vars)
    all_nodes = osrest.ironic_nodes(auth, details=True)

    broken = []
    for node in all_nodes.values():
        if node['provision_state'] == 'error' and not node['maintenance']:
            broken.append(node)

    if not broken:
        if args.verbose:
            print('All good.')
        return

    lines = ['Ironic nodes in "error" provision state, not in maintenance']
    for node in broken:
        lines.append('• `{}`, last error: {}'.format(
            node['uuid'], node.get('last_error')))
    message = '\n'.join(lines)

    # The console gets plain asterisks; Slack keeps the bullet glyphs.
    print(message.replace('•', '*'))
    if slack:
        slack.post(SUBCOMMAND, message, color='xkcd:red')
Esempio n. 3
0
def main(argv=None):
    """Show or reset Ironic nodes that are in a known, correctable error state.

    In ``info`` mode the treatable nodes are only listed (and summarised to
    Slack when configured). In ``reset`` mode each treatable node is reset
    via ``NodeResetter``; nodes that raise ``TooManyResets`` are skipped and
    reported separately.

    Args:
        argv: Full argument vector with the program name first. Defaults
            to ``sys.argv`` when ``None``.

    Returns:
        ``-1`` when required OS variables are missing, otherwise ``None``.

    Raises:
        Exception: any error other than ``TooManyResets`` raised while
            resetting a node is reported to Slack (if configured) and
            re-raised.
    """
    if argv is None:
        argv = sys.argv

    parser = argparse.ArgumentParser(description='Kick Ironic nodes that '
        'are in an common/known error state')

    parser.add_argument('mode', choices=['info', 'reset'],
        help='Just display data on the stuck nodes or reset their states')
    parser.add_argument('--slack', type=str,
        help='JSON file with Slack webhook information to send a notification to')
    parser.add_argument('--osrc', type=str,
        help='Connection parameter file. Should include password. envars used '
        'if not provided by this file.')
    parser.add_argument('-v', '--verbose', action='store_true')
    parser.add_argument('--dry-run', action='store_true',
        help='Dry run, don\'t actually do anything')

    args = parser.parse_args(argv[1:])

    slack = Slackbot(args.slack) if args.slack else None

    # Environment variables first; values from the rc file override them.
    os_vars = {k: os.environ[k] for k in os.environ if k.startswith(OS_ENV_PREFIX)}
    if args.osrc:
        os_vars.update(load_osrc(args.osrc))
    missing_os_vars = set(Auth.required_os_vars) - set(os_vars)
    if missing_os_vars:
        print(
            'Missing required OS values in env/rcfile: {}'
            .format(', '.join(missing_os_vars)),
            file=sys.stderr
        )
        return -1

    auth = Auth(os_vars)

    nodes = osrest.ironic_nodes(auth, details=True)
    cureable = cureable_nodes(nodes)

    if args.mode == 'info':
        print('{} node(s) in a state that we can treat'.format(len(cureable)))
        for nid in cureable:
            print('-' * 40)
            print('\n'.join(
                '{:<25s} {}'.format(key, nodes[nid].get(key))
                for key
                in [
                    'uuid',
                    'provision_updated_at',
                    'provision_state',
                    'last_error',
                    'instance_uuid',
                    'extra',
                    'maintenance',
                ]
            ))
        if slack:
            if cureable:
                message = ('{} nodes in correctable error states (no action '
                           'taken)'.format(len(cureable)))
                color = 'xkcd:orange red'
            else:
                # Nothing we can treat; still report how many nodes are in
                # the error state outside of maintenance.
                error_nodes = sum(
                    1
                    for n in nodes.values()
                    if not n['maintenance'] and n['provision_state'] == 'error'
                )
                if error_nodes:
                    message = ('No nodes in correctable error states ({} other'
                               ' nodes in error state)').format(error_nodes)
                    color = 'xkcd:yellow'
                else:
                    message = 'No nodes in correctable error states'
                    color = 'xkcd:green'

            slack.post(SUBCOMMAND, message, color=color)
        return

    if not cureable:
        if args.verbose:
            message = 'Nothing to do.'
            print(message)
            if slack:
                slack.post(SUBCOMMAND, message, color='xkcd:light grey')
        return

    print('To correct: {}'.format(repr(cureable)))
    if slack:
        message = ['Ironic nodes in correctable error states']
        for nid in cureable:
            message.append(' • `{}`: "{}"'.format(nid, nodes[nid]['last_error']))
        message = '\n'.join(message)
        slack.post(SUBCOMMAND, message, color='xkcd:darkish red')

    reset_ok = []   # (node id, reset count) for every node we reset
    too_many = []   # node ids skipped because the resetter refused
    for nid in cureable:
        resetter = NodeResetter(auth, nid, dry_run=args.dry_run)
        try:
            resetter.reset()
        except TooManyResets:
            too_many.append(nid)
        except Exception as e:
            if slack:
                error = '{}; check logs'.format(str(e))
                slack.post(SUBCOMMAND, error, color='xkcd:red')
            raise
        else:
            reset_ok.append((nid, resetter.tracker.count()))

    print('Attempted to fix: {}'.format(repr(reset_ok)))
    print('Refused to fix:   {}'.format(repr(too_many)))
    if slack:
        message = []
        if reset_ok:
            message.append('Attempted reset of nodes')
            message.extend(' • `{}`: {} resets'.format(*r) for r in reset_ok)
        if too_many:
            message.append('\nAbstained (already at limit)')
            # List the nodes that were skipped, not the ones that were reset.
            message.extend(' • `{}`'.format(nid) for nid in too_many)

        color = 'xkcd:chartreuse'
        if args.dry_run:
            color = 'xkcd:yellow'
            message.append('dry run, no changes actually made.')
        if too_many:
            color = 'xkcd:orange'

        message = '\n'.join(message)
        slack.post(SUBCOMMAND, message, color=color)
Esempio n. 4
0
def find_conflicts(auth, ignore_subnets):
    """Find Neutron ports whose MAC belongs to an Ironic node with no instance.

    Args:
        auth: Credentials object passed through to the ``osrest`` calls.
        ignore_subnets: Container of subnet IDs; a Neutron port with any
            fixed IP on one of these subnets is left out of the comparison.

    Returns:
        A dict keyed by MAC address. Each value describes one conflict:
        the MAC, the Ironic node ID and instance, the Ironic port ID, and
        the Neutron port (both its ID and the full port record).

    MAC addresses shared by several Neutron ports are reported on stderr
    and excluded from the comparison.
    """
    nodes = osrest.ironic_nodes(auth)
    ports = osrest.ironic_ports(auth)
    neut_ports = osrest.neutron_ports(auth)

    # Keep only the Neutron ports with no fixed IP on an ignored subnet.
    serious_neut_ports = {}
    for pid, port in neut_ports.items():
        on_ignored_subnet = False
        for ip in port['fixed_ips']:
            if ip['subnet_id'] in ignore_subnets:
                on_ignored_subnet = True
                break
        if not on_ignored_subnet:
            serious_neut_ports[pid] = port

    # MAC -> identifier lookups.
    node_by_mac = {}
    ironic_port_by_mac = {}
    for pid, port in ports.items():
        node_by_mac[port['address']] = port['node_uuid']
        ironic_port_by_mac[port['address']] = pid

    neut_port_by_mac = {}
    for pid, port in serious_neut_ports.items():
        neut_port_by_mac[port['mac_address']] = pid

    neut_macs = set(neut_port_by_mac)

    # Fewer map entries than ports means at least two ports share a MAC.
    if len(neut_port_by_mac) != len(serious_neut_ports):
        tally = collections.Counter(
            port['mac_address'] for port in serious_neut_ports.values())
        report = []
        for mac, count in tally.items():
            if count <= 1:
                continue
            culprits = [
                pid for pid, port in serious_neut_ports.items()
                if port['mac_address'] == mac
            ]
            report.append('- mac {}, ports: {}'.format(
                mac, ', '.join(culprits)))
            # An ambiguous MAC cannot be attributed to a single port.
            neut_macs.remove(mac)
        message = ('conflict of mac addresses among neutron ports, ignoring '
                   'some mac addresses:\n{}'
                   .format('\n'.join(report)))
        print(message, file=sys.stderr)

    # MACs of Ironic ports that sit on nodes without an instance.
    idle_node_ids = {
        nid for nid, node in nodes.items()
        if node['instance_uuid'] is None
    }
    inactive_macs = {
        port['address'] for port in ports.values()
        if port['node_uuid'] in idle_node_ids
    }

    conflict_macs_info = {}
    for mac in neut_macs & inactive_macs:
        node = nodes[node_by_mac[mac]]
        neut_port = neut_ports[neut_port_by_mac[mac]]
        conflict_macs_info[mac] = {
            'mac': mac,
            'ironic_node_id': node['uuid'],
            'ironic_node_instance': node['instance_uuid'],
            'ironic_port': ironic_port_by_mac[mac],
            'neutron_port_id': neut_port['id'],
            'neutron_port': neut_port,
        }

    return conflict_macs_info
Esempio n. 5
0
def main(argv=None):
    """Show or reset Ironic nodes that are in a known, correctable error state.

    ``info`` mode only prints details of the treatable nodes; ``reset``
    mode resets each of them through ``NodeResetter`` and prints a summary,
    which is also sent to Slack unless this is a dry run.

    Args:
        argv: Full argument vector with the program name first. Defaults
            to ``sys.argv`` when ``None``.

    Returns:
        ``-1`` when required OS variables are missing, otherwise ``None``.

    Any exception raised after authentication is reported through
    ``slack.exception()`` (when Slack is configured) and then re-raised.
    """
    argv = sys.argv if argv is None else argv

    parser = base_parser(
        'Kick Ironic nodes that are in an common/known error state')
    parser.add_argument(
        'mode',
        choices=['info', 'reset'],
        help='Just display data on the stuck nodes or reset their states')
    parser.add_argument('-v', '--verbose', action='store_true')
    parser.add_argument(
        '--dry-run',
        action='store_true',
        help='Dry run, don\'t actually do anything')
    args = parser.parse_args(argv[1:])

    if args.slack:
        slack = Slackbot(args.slack, script_name='ironic-error-resetter')
    else:
        slack = None

    # Environment first, then let the rc file override it.
    os_vars = {
        name: value
        for name, value in os.environ.items()
        if name.startswith(OS_ENV_PREFIX)
    }
    if args.osrc:
        os_vars.update(load_osrc(args.osrc))

    absent = set(Auth.required_os_vars) - set(os_vars)
    if absent:
        print(
            'Missing required OS values in env/rcfile: {}'.format(
                ', '.join(absent)),
            file=sys.stderr)
        return -1

    auth = Auth(os_vars)

    try:
        nodes = osrest.ironic_nodes(auth, details=True)
        cureable = cureable_nodes(nodes)

        if args.mode == 'info':
            print('{} node(s) in a state that we can treat'.format(
                len(cureable)))
            report_keys = (
                'uuid',
                'provision_updated_at',
                'provision_state',
                'last_error',
                'instance_uuid',
                'extra',
                'maintenance',
            )
            for nid in cureable:
                print('-' * 40)
                details = nodes[nid]
                print('\n'.join(
                    '{:<25s} {}'.format(key, details.get(key))
                    for key in report_keys))
            return

        if len(cureable) == 0:
            if args.verbose:
                print('Nothing to do.')
            return

        print('To correct: {}'.format(repr(cureable)))

        reset_ok = []
        # NOTE(review): nothing in this function ever appends to too_many,
        # so the "Skipped" section below is currently unreachable.
        too_many = []
        for nid in cureable:
            resetter = NodeResetter(auth, nid, dry_run=args.dry_run)
            resetter.reset()
            reset_ok.append((nid, resetter.tracker.count()))

        summary = []
        if reset_ok:
            summary.append('Performed reset of nodes')
            for done_id, reset_count in reset_ok:
                summary.append(
                    ' • `{}`: {} resets'.format(done_id, reset_count))
        if too_many:
            summary.append('Skipped (already at limit)')
            for skipped_id in too_many:
                summary.append(' • `{}`'.format(skipped_id))
        if args.dry_run:
            summary.append('dry run, no changes actually made.')

        message = '\n'.join(summary)
        print(message)

        # Dry runs are never announced on Slack.
        if slack and not args.dry_run:
            slack.success(message)
    except BaseException:
        # Catch everything (including KeyboardInterrupt/SystemExit) so Slack
        # is told about the failure; the exception is always re-raised.
        if slack:
            slack.exception()
        raise
Esempio n. 6
0
def main(argv=None):
    """Show or clean Ironic nodes that reference a nonexistent Nova instance.

    ``info`` mode prints the affected nodes. ``delete`` mode first runs
    sanity checks (unless disabled with ``--force-sane``) and then either
    clears the instance data of nodes in the ``available`` provision state
    or sets the node's state to ``deleted``.

    Args:
        argv: Full argument vector with the program name first. Defaults
            to ``sys.argv`` when ``None``.

    Returns:
        ``-1`` when required OS variables are missing, otherwise ``None``.

    Any exception raised while cleaning nodes is reported through
    ``slack.exception()`` (when Slack is configured) and then re-raised.
    """
    if argv is None:
        argv = sys.argv

    parser = base_parser(
        'Kick Ironic nodes that refer to a deleted/nonexistant Nova instance')

    parser.add_argument(
        'mode',
        choices=['info', 'delete'],
        help='Just display data on the bound nodes or delete them')
    # NOTE(review): if base_parser already registers --slack/--osrc, the two
    # add_argument calls below would raise an argparse conflict error --
    # confirm against base_parser.
    parser.add_argument(
        '--slack',
        type=str,
        help=
        'JSON file with Slack webhook information to send a notification to')
    parser.add_argument(
        '--osrc',
        type=str,
        help='Connection parameter file. Should include password. envars used '
        'if not provided by this file.')
    parser.add_argument('-v', '--verbose', action='store_true')
    parser.add_argument(
        '--force-sane',
        action='store_true',
        help='Disable sanity checking (i.e. things really are that bad)')
    parser.add_argument('--force-insane',
                        action='store_true',
                        help=argparse.SUPPRESS)  # for testing

    args = parser.parse_args(argv[1:])

    slack = Slackbot(args.slack,
                     script_name='undead-instances') if args.slack else None

    # Environment variables first; values from the rc file override them.
    os_vars = {
        k: os.environ[k]
        for k in os.environ if k.startswith(OS_ENV_PREFIX)
    }
    if args.osrc:
        os_vars.update(load_osrc(args.osrc))
    missing_os_vars = set(Auth.required_os_vars) - set(os_vars)
    if missing_os_vars:
        print('Missing required OS values in env/rcfile: {}'.format(
            ', '.join(missing_os_vars)),
              file=sys.stderr)
        return -1

    auth = Auth(os_vars)

    nodes = osrest.ironic_nodes(auth)
    instances = osrest.nova_instances(auth)

    node_instance_map, unbound_instances = find_unbound_instances(
        auth, nodes, instances)

    if args.mode == 'info':
        # no-op
        if unbound_instances:
            print('ZOMBIE INSTANCES ON NODES')
        else:
            print('No zombies currently.')
        for inst_id in unbound_instances:
            node = node_instance_map[inst_id]

            assert inst_id not in instances, 'contradiction, this should be impossible'

            print('-----')
            print('Ironic Node\n' '  ID:       {}'.format(node['uuid']))
            print('  Instance: {}'.format(node['instance_uuid']))
            print('  State:    {}'.format(node['provision_state']))

    elif args.mode == 'delete':
        if not args.force_sane or args.force_insane:
            # sanity check(s) to avoid doing something stupid
            # Zero known Nova instances together with unbound ones suggests
            # the instance listing itself is broken. `instances` is the
            # collection fetched above; this function defines no separate
            # set of instance IDs.
            if len(instances) == 0 and len(unbound_instances) != 0:
                _thats_crazy('(in)sanity check: 0 running instances(?!)',
                             slack)

            # --force-insane drops the limit to -1 so the check always trips.
            ubi_limit = 20 if not args.force_insane else -1
            if len(unbound_instances) > ubi_limit:
                _thats_crazy(
                    '(in)sanity check: it thinks there are {} unbound instances'
                    .format(len(unbound_instances)),
                    slack,
                )

        try:
            for inst_id in unbound_instances:
                node = node_instance_map[inst_id]
                node_id = node['uuid']
                if node['provision_state'] == 'available':
                    clear_node_instance_data(auth, node_id)
                else:
                    osrest.ironic_node_set_state(auth, node_id, 'deleted')

            message = 'Fixed Ironic nodes with nonexistant instances:\n{}'.format(
                '\n'.join(' • node `{}` → instance `{}`'.format(
                    node_instance_map[i]['uuid'], node_instance_map[i]
                    ['instance_uuid']) for i in unbound_instances))

            print(message)

            if slack:
                slack.success(message)
        except BaseException:
            # Catch everything (including KeyboardInterrupt/SystemExit) so
            # Slack is told about the failure; the exception is re-raised.
            if slack:
                slack.exception()
            raise
Esempio n. 7
0
def main(argv=None):
    """Show or clean Ironic nodes that reference a nonexistent Nova instance.

    A node is affected when its ``instance_uuid`` is set but does not appear
    among the instances returned by Nova. ``info`` mode lists such nodes;
    ``delete`` mode runs sanity checks (unless ``--force-sane`` is given)
    and then clears or deletes each affected node, posting progress to
    Slack when configured.

    Args:
        argv: Full argument vector with the program name first. Defaults
            to ``sys.argv`` when ``None``.

    Returns:
        ``-1`` when required OS variables are missing, otherwise ``None``.

    Raises:
        Exception: any error raised while cleaning nodes is reported to
            Slack (if configured) and re-raised.
    """
    argv = sys.argv if argv is None else argv

    parser = argparse.ArgumentParser(
        description='Kick Ironic nodes that '
                    'refer to a deleted/nonexistant Nova instance')
    parser.add_argument(
        'mode', choices=['info', 'delete'],
        help='Just display data on the bound nodes or delete them')
    parser.add_argument(
        '--slack', type=str,
        help='JSON file with Slack webhook information to send a notification to')
    parser.add_argument(
        '--osrc', type=str,
        help='Connection parameter file. Should include password. envars used '
             'if not provided by this file.')
    parser.add_argument('-v', '--verbose', action='store_true')
    parser.add_argument(
        '--force-sane', action='store_true',
        help='Disable sanity checking (i.e. things really are that bad)')
    # Hidden switch, for testing.
    parser.add_argument(
        '--force-insane', action='store_true', help=argparse.SUPPRESS)
    args = parser.parse_args(argv[1:])

    slack = Slackbot(args.slack) if args.slack else None

    # Environment first, then let the rc file override it.
    os_vars = {
        name: value
        for name, value in os.environ.items()
        if name.startswith(OS_ENV_PREFIX)
    }
    if args.osrc:
        os_vars.update(load_osrc(args.osrc))

    absent = set(Auth.required_os_vars) - set(os_vars)
    if absent:
        print(
            'Missing required OS values in env/rcfile: {}'.format(
                ', '.join(absent)),
            file=sys.stderr)
        return -1

    auth = Auth(os_vars)

    nodes = osrest.ironic_nodes(auth)
    instances = osrest.nova_instances(auth)

    # instance uuid -> the node that claims to host it
    node_instance_map = {}
    for candidate in nodes.values():
        if candidate['instance_uuid'] is not None:
            node_instance_map[candidate['instance_uuid']] = candidate

    node_instance_ids = set(node_instance_map)
    instance_ids = set(instances)

    # Instances referenced by a node but not returned by Nova.
    unbound_instances = node_instance_ids - instance_ids

    if args.mode == 'info':
        # Read-only report; nothing is modified in this mode.
        print('ZOMBIE INSTANCES ON NODES' if unbound_instances
              else 'No zombies currently.')

        for inst_id in unbound_instances:
            node = node_instance_map[inst_id]

            assert inst_id not in instances, 'contradiction, this should be impossible'

            print('-----')
            print('Ironic Node\n  ID:       {}'.format(node['uuid']))
            print('  Instance: {}'.format(node['instance_uuid']))
            print('  State:    {}'.format(node['provision_state']))

        if slack:
            if unbound_instances:
                message = '{} nodes with dead instances (no action taken)'.format(
                    len(unbound_instances))
                color = 'xkcd:orange red'
            else:
                message = 'No nodes with dead instances.'
                color = 'xkcd:green'
            slack.post(SUBCOMMAND, message, color=color)

    elif args.mode == 'delete':
        if args.force_insane or not args.force_sane:
            # Sanity checks, to avoid acting on data that looks wrong.
            if len(instance_ids) == 0 and len(unbound_instances) != 0:
                _thats_crazy('(in)sanity check: 0 running instances(?!)', slack)

            # --force-insane drops the limit to -1 so the check always trips.
            ubi_limit = -1 if args.force_insane else 20
            if len(unbound_instances) > ubi_limit:
                _thats_crazy(
                    '(in)sanity check: it thinks there are {} unbound instances'
                    .format(len(unbound_instances)),
                    slack,
                )

        if slack:
            if unbound_instances:
                listing = '\n'.join(
                    ' • node `{}` → instance `{}`'.format(
                        node_instance_map[i]['uuid'],
                        node_instance_map[i]['instance_uuid'])
                    for i in unbound_instances
                )
                slack.post(
                    SUBCOMMAND,
                    'Possible Ironic nodes with nonexistant instances:\n{}'.format(
                        listing),
                    color='xkcd:darkish red')
            elif args.verbose:
                slack.post(
                    SUBCOMMAND,
                    'No Ironic nodes visibly clinging to dead instances',
                    color='xkcd:light grey')

        try:
            for inst_id in unbound_instances:
                node = node_instance_map[inst_id]
                node_id = node['uuid']
                if node['provision_state'] == 'available':
                    clear_node_instance_data(auth, node_id)
                else:
                    osrest.ironic_node_set_state(auth, node_id, 'deleted')
        except Exception as e:
            if slack:
                error = '{} while trying to clean instances; check logs for traceback'.format(str(e))
                slack.post(SUBCOMMAND, error, color='xkcd:red')
            raise

        # Reached only when every node was processed without an exception.
        if unbound_instances and slack:
            ok_message = 'Cleaned {} instance(s).'.format(
                len(unbound_instances))
            slack.post(SUBCOMMAND, ok_message, color='xkcd:chartreuse')