def main():
    """Upgrade all packages on a single host and reboot it.

    The sequence is: validate the hostname, set an Icinga downtime, disable
    Puppet, optionally depool, wait for the host to drain, run a
    non-interactive dist-upgrade, reboot, wait for the reboot, re-enable and
    run Puppet, wait for the Puppet run, optionally repool, and finally cancel
    the Icinga downtime.

    Returns:
        int: 0 on success; 1 if the hostname is invalid or if the depool,
        upgrade or repool command fails. Note that the early failure returns
        leave Puppet disabled and the Icinga downtime in place.
    """
    args = parse_args()
    user = lib.get_running_user()
    setup_logging(logger, user)

    if not lib.is_hostname_valid(args.host):
        logger.error("{} is not a valid hostname. Exiting.".format(args.host))
        return 1

    # Set Icinga downtime for the host to be upgraded
    # NOTE(review): 1200 is presumably the downtime duration in seconds - confirm
    icinga_downtime(args.host, "Software upgrade and reboot", 1200)

    # Disable puppet
    lib.disable_puppet([args.host], '{} --{}'.format(NAME, user))

    # Depool and wait a bit for the host to be drained
    if args.depool_cmd:
        if not run_cumin(args.host, [args.depool_cmd]):
            logger.error("Failed depooling {}. Exiting.".format(args.host))
            return 1
    else:
        # Use the script's configured logger, not the root `logging` module,
        # so the message ends up where setup_logging() sent the other ones.
        logger.info("Not performing any depool action as requested (empty --depool-cmd)")

    logger.info("Waiting for {} to be drained.".format(args.host))
    time.sleep(30)

    # Upgrade all packages, leave config files untouched, do not prompt
    upgrade_cmd = ("DEBIAN_FRONTEND=noninteractive apt-get -y -o Dpkg::Options::='--force-confdef' "
                   "-o Dpkg::Options::='--force-confold' dist-upgrade")
    if not run_cumin(args.host, [upgrade_cmd], timeout=300):
        logger.error("Failed upgrading {}. Exiting.".format(args.host))
        return 1

    # Timestamp taken before the reboot is issued; used as the reference
    # point when waiting for the host to come back.
    reboot_time = datetime.utcnow()

    lib.reboot_host(args.host)

    # Timestamp taken right after the reboot was issued; used as the
    # reference point when waiting for the Puppet run below.
    boot_time = datetime.utcnow()

    lib.wait_reboot(args.host, start=reboot_time)

    # Enable puppet
    lib.enable_puppet([args.host], '{} --{}'.format(NAME, user))

    # Run puppet
    lib.run_puppet([args.host])

    lib.wait_puppet_run(args.host, start=boot_time)

    # Repool
    if args.repool_cmd:
        if not run_cumin(args.host, [args.repool_cmd]):
            logger.error("Failed repooling {}. Exiting.".format(args.host))
            return 1
    else:
        logger.info("Not performing any repool action as requested (empty --repool-cmd)")

    # Cancel Icinga downtime
    icinga_cancel_downtime(args.host)
    return 0
Ejemplo n.º 2
0
def main():
    """Entry point for the automated reimaging of a list of hosts.

    Parses the arguments, sets up logging, tells the operator where the full
    log lives and then delegates the actual work to ``run()``.

    Returns:
        The value returned by ``run()``, or 2 if ``run()`` raised anything
        (``BaseException`` is caught, so this includes interrupts).
    """
    # Setup
    args = parse_args()
    lib.ensure_shell_mode()
    user = lib.get_running_user()
    log_path = setup_logging(user)
    if args.debug:
        logger.setLevel(logging.DEBUG)

    invocation = 'wmf-auto-reimage called with args: {args}'.format(args=args)
    logger.info(invocation)
    lib.print_line('START. To monitor the full log:')
    tail_hint = 'sudo tail -F {log}'.format(log=log_path)
    lib.print_line(tail_hint, skip_time=True)

    # Pessimistic default: only overwritten when run() returns normally.
    exit_code = 2
    try:
        exit_code = run(args, user, log_path)
    except BaseException as exc:
        failure = 'Unable to run wmf-auto-reimage'
        lib.print_line('{message}: {error}'.format(message=failure, error=exc))
        logger.exception(failure)
    finally:
        lib.print_line('END')

    return exit_code
Ejemplo n.º 3
0
def main():
    """Run the automated reimaging of a single host.

    Sets up logging, checks IPMI access to the management interface(s),
    optionally announces the start on a Phabricator task, and then calls
    ``run()`` with ``sys.stdout``/``sys.stderr`` redirected to a dedicated
    cumin output file. The outcome is reported on the Phabricator task when
    a task ID was given and a client could be created.

    Returns:
        int: 0 if ``run()`` completed without raising (its return value is
        not used), 2 if anything raised (``BaseException`` is caught, so
        this includes interrupts).
    """
    # Setup
    phab_client = None
    args = parse_args()
    lib.ensure_shell_mode()
    user = lib.get_running_user()
    log_path = setup_logging(user, args.host)
    cumin_output_path = log_path.replace('.log', '_cumin.out')
    if args.debug:
        logger.setLevel(logging.DEBUG)

    logger.info('wmf-auto-reimage-host called with args: {args}'.format(args=args))
    lib.print_line('REIMAGE START | To monitor the full log and cumin output:', host=args.host)
    lib.print_line('sudo tail -F {log}'.format(log=log_path), skip_time=True)
    lib.print_line('sudo tail -F {log}'.format(log=cumin_output_path), skip_time=True)

    try:
        lib.ensure_ipmi_password()
        lib.check_remote_ipmi(args.mgmt)
        if args.rename_mgmt:
            lib.check_remote_ipmi(args.rename_mgmt)

        if args.phab_task_id is not None:
            phab_client = lib.get_phabricator_client()
            lib.phabricator_task_update(
                phab_client, args.phab_task_id, lib.PHAB_COMMENT_PRE.format(
                    user=user, hostname=socket.getfqdn(), hosts=args.host, log=log_path))

        # Save the real streams *before* entering the try block, so the
        # finally clause can always restore them, even when opening the
        # output file fails (otherwise a NameError would mask the real error).
        stderr = sys.stderr
        stdout = sys.stdout
        try:
            # This is needed due to a bug in tqdm and a limitation in Cumin
            with open(cumin_output_path, 'w', 1) as cumin_output:
                sys.stderr = cumin_output
                sys.stdout = cumin_output
                run(args, user, log_path)
                retcode = 0
        finally:
            sys.stderr = stderr
            sys.stdout = stdout
    except BaseException as e:
        message = 'Unable to run wmf-auto-reimage-host'
        lib.print_line('{message}: {error}'.format(message=message, error=e), host=args.host)
        logger.exception(message)
        retcode = 2
    finally:
        lib.print_line('REIMAGE END | retcode={ret}'.format(ret=retcode), host=args.host)

    # Comment on the Phabricator task
    if args.phab_task_id is not None and phab_client is not None:
        phabricator_message = lib.get_phabricator_post_message({retcode: [args.host]})
        lib.phabricator_task_update(phab_client, args.phab_task_id, phabricator_message)

    return retcode
Ejemplo n.º 4
0
def main():
    """Remove a host from the infrastructure services and report on Phabricator.

    Removes the host from Puppet/PuppetDB, downtimes the host and its
    management interface on Icinga (only when the hostname is valid), removes
    it from DebMonitor and posts the list of performed actions to the
    Phabricator task given in the arguments.

    Returns:
        int: 1 if the hostname is invalid and ``--force`` was not given,
        0 otherwise.
    """
    script_name = os.path.basename(__file__)
    args = parse_args()
    user = lib.get_running_user()
    phab_client = lib.get_phabricator_client()
    is_valid_host = lib.is_hostname_valid(args.host)
    performed = []

    # An invalid hostname is only tolerated when explicitly forced.
    if not (is_valid_host or args.force):
        logger.error(
            "{host} is not a valid hostname. Aborting.".format(host=args.host))
        return 1

    # Remove from Puppet and PuppetDB
    lib.puppet_remove_host(args.host)
    performed.extend(['Revoked Puppet certificate', 'Removed from PuppetDB'])

    # Downtime on Icinga both the host and the mgmt host, they will be removed by Puppet
    if is_valid_host:
        try:
            lib.icinga_downtime(
                args.host, user, args.phab_task_id, title=script_name)
        except RuntimeError:
            performed.append(
                'Skipped downtime host on Icinga (likely already removed)')
        else:
            performed.append('Downtimed host on Icinga')

        mgmts = lib.get_mgmts([args.host])
        try:
            lib.icinga_downtime(
                mgmts[args.host], user, args.phab_task_id, title=script_name)
        except RuntimeError:
            performed.append(
                'Skipped downtime mgmt interface on Icinga (likely already removed)')
        else:
            performed.append('Downtimed mgmt interface on Icinga')

    # Remove from DebMonitor
    lib.debmonitor_remove_host(args.host)
    performed.append('Removed from DebMonitor')

    # Build the Phabricator comment: a header line followed by one bullet per action.
    template = (
        '{script} was executed by {user} for {host} and performed the following actions:\n'
        '- {actions}')
    bullet_list = '\n- '.join(performed)
    summary = template.format(
        script=script_name, user=user, host=args.host, actions=bullet_list)
    lib.phabricator_task_update(phab_client, args.phab_task_id, summary)

    return 0
Ejemplo n.º 5
0
def main():
    """Run ``apt full-upgrade`` on a single host and reboot it.

    The sequence is: validate the hostname, set an Icinga downtime,
    optionally depool, wait for the host to drain, upgrade, reboot, wait for
    the reboot and for a Puppet run, optionally repool, and finally cancel
    the Icinga downtime.

    Returns:
        int: 0 on success; 1 if the hostname is invalid or if the depool,
        upgrade or repool command fails. Note that the early failure returns
        leave the Icinga downtime in place.
    """
    args = parse_args()
    user = lib.get_running_user()
    setup_logging(logger, user)

    if not lib.is_hostname_valid(args.host):
        logger.error("{} is not a valid hostname. Exiting.".format(args.host))
        return 1

    # Set Icinga downtime for the host to be upgraded
    # NOTE(review): 1200 is presumably the downtime duration in seconds - confirm
    icinga_downtime(args.host, "Software upgrade and reboot", 1200)

    # Depool and wait a bit for the host to be drained
    if args.depool_cmd:
        if not run_cumin(args.host, [args.depool_cmd]):
            logger.error("Failed depooling {}. Exiting.".format(args.host))
            return 1
    else:
        # Use the script's configured logger, not the root `logging` module,
        # so the message ends up where setup_logging() sent the other ones.
        logger.info(
            "Not performing any depool action as requested (empty --depool-cmd)"
        )

    logger.info("Waiting for {} to be drained.".format(args.host))
    time.sleep(30)

    # Run apt full-upgrade
    if not run_cumin(args.host, ['apt -y full-upgrade'], timeout=300):
        logger.error("Failed upgrading {}. Exiting.".format(args.host))
        return 1

    # Timestamp taken before the reboot is issued; used as the reference
    # point when waiting for the host to come back.
    reboot_time = datetime.utcnow()

    lib.reboot_host(args.host)

    # Timestamp taken right after the reboot was issued; used as the
    # reference point when waiting for the Puppet run below.
    boot_time = datetime.utcnow()

    lib.wait_reboot(args.host, start=reboot_time)

    lib.wait_puppet_run(args.host, start=boot_time)

    # Repool
    if args.repool_cmd:
        if not run_cumin(args.host, [args.repool_cmd]):
            logger.error("Failed repooling {}. Exiting.".format(args.host))
            return 1
    else:
        logger.info(
            "Not performing any repool action as requested (empty --repool-cmd)"
        )

    # Cancel Icinga downtime
    icinga_cancel_downtime(args.host)
    return 0
Ejemplo n.º 6
0
def main():
    """Downtime a single host on Icinga.

    Optionally sleeps for ``args.sleep`` seconds, then runs Puppet on the
    host that ``lib.ICINGA_DOMAIN`` resolves to (CNAME lookup) and sets the
    downtime for ``args.host``. Unless ``--debug`` is given, ``sys.stdout``
    and ``sys.stderr`` are redirected to ``os.devnull`` while those two
    calls run.

    Returns:
        int: 0 on success, 2 if anything raised (``BaseException`` is
        caught, so this includes interrupts).
    """
    args = parse_args()
    user = lib.get_running_user()
    if args.debug:
        logger.setLevel(logging.DEBUG)

    if args.sleep:
        lib.print_line('Sleeping for {s} seconds'.format(s=args.sleep),
                       host=args.host)
        time.sleep(args.sleep)

    def run_puppet_and_downtime():
        """Run Puppet on the Icinga server, then downtime the target host."""
        lib.run_puppet([lib.resolve_dns(lib.ICINGA_DOMAIN, 'CNAME')],
                       no_raise=True)
        lib.icinga_downtime(args.host,
                            user,
                            args.phab_task_id,
                            title='wmf-downtime-host')

    lib.print_line('Running Puppet on the Icinga server', host=args.host)
    try:
        if args.debug:
            run_puppet_and_downtime()
        else:
            # This is needed due to a bug in tqdm and a limitation in Cumin
            # Save the real streams before opening the sink so they can
            # always be restored.
            stderr = sys.stderr
            stdout = sys.stdout
            with open(os.devnull, 'w', 1) as cumin_output:
                sys.stderr = cumin_output
                sys.stdout = cumin_output
                try:
                    run_puppet_and_downtime()
                finally:
                    # Restore while the sink is still open and *before* the
                    # outer except handler runs, so the error report below
                    # never targets an already-closed file object.
                    sys.stderr = stderr
                    sys.stdout = stdout
        retcode = 0
    except BaseException as e:
        message = 'Unable to run wmf-downtime-host'
        lib.print_line('{message}: {error}'.format(message=message, error=e),
                       host=args.host)
        logger.exception(message)
        retcode = 2

    return retcode
Ejemplo n.º 7
0
def main():
    """Upgrade (or downgrade, with ``--downgrade``) Varnish on a single host.

    The sequence is: validate the hostname, check the Puppet enabled/disabled
    state expected for the chosen mode, set an Icinga downtime, depool and
    wait for the host to drain, disable Puppet and wait for the operator to
    merge the hiera change (skipped with ``--hiera-merged``), run the
    pre-Puppet steps, enable and run Puppet, run the post-Puppet steps, check
    the HTTP responses, repool, and cancel the Icinga downtime.

    Returns:
        int: 0 on success; 1 if the hostname is invalid, a precondition on
        the Puppet state is not met, the operator does not confirm the merge,
        or any of the depool / disable-puppet / run-puppet / repool commands
        fails.
    """
    args = parse_args()
    user = lib.get_running_user()
    setup_logging(logger, user)

    if not lib.is_hostname_valid(args.host):
        logger.error("{} is not a valid hostname. Exiting.".format(args.host))
        return 1

    action = 'Upgrading'
    if args.downgrade:
        action = 'Downgrading'

    reason = "{} Varnish on {} --{}".format(action, args.host, user)

    logger.info(reason)

    if not args.hiera_merged:
        # Check that puppet is not already disabled. We skip this check if
        # invoked with --hiera-merged because in that case puppet must
        # necessarily be disabled already. If that were not the case, it would
        # fail because of the discrepancy between the hiera setting
        # profile::cache::base::varnish_version and the Varnish version
        # installed on the system.
        if not run_cumin(args.host, ['puppet-enabled']):
            logger.error("puppet is disabled on {}. Exiting.".format(
                args.host))
            return 1
    else:
        logger.info(
            "Not disabling puppet/waiting for puppet merge as requested (--hiera-merged)"
        )

        # On the contrary, if --hiera-merged is specified, make sure puppet
        # is disabled with the given message
        expected_output = "Puppet is disabled. {}".format(args.hiera_merged)
        if not check_cumin_output(args.host, ['puppet-enabled'],
                                  expected_output):
            logger.error(
                "puppet on {} must be disabled with commit message='{}'. Exiting."
                .format(args.host, args.hiera_merged))
            return 1

    # Set Icinga downtime for the host to be upgraded
    # NOTE(review): 1200 is presumably the downtime duration in seconds - confirm
    icinga_downtime(args.host, reason, 1200)

    # Depool and wait a bit for the host to be drained
    if not run_cumin(args.host, ['depool']):
        logger.error("Failed depooling {}. Exiting.".format(args.host))
        # Actually exit, as the message says: do not go on to upgrade a host
        # that may still be pooled.
        return 1

    # Use the script's configured logger, not the root `logging` module.
    logger.info("Waiting for {} to be drained.".format(args.host))
    time.sleep(30)

    if not args.hiera_merged:
        # Disable puppet
        if not run_cumin(
                args.host,
            ['disable-puppet "{message}"'.format(message=reason)]):
            logger.error("Failed to disable puppet on {}. Exiting.".format(
                args.host))
            return 1

        # Wait for admin to merge the puppet patch toggling hiera settings
        if not ask_confirmation(
                "Waiting for you to puppet-merge "
                "the change toggling {}'s hiera settings".format(args.host)):
            return 1

    # Remove old stuff
    pre_puppet(args.host, downgrading=args.downgrade)

    # Enable and run puppet
    if args.hiera_merged:
        # If invoked with --hiera-merged we need to use the reason passed to
        # --hiera-merged itself in order to re-enable puppet
        reason = args.hiera_merged

    cmd = 'run-puppet-agent --enable "{message}"'.format(message=reason)
    if not run_cumin(args.host, [cmd], timeout=300):
        logger.error("Failed to enable and run puppet on {}. Exiting.".format(
            args.host))
        return 1

    # Post upgrade
    post_puppet(args.host)

    # check HTTP response from backend/frontend
    if args.host != "cp1008.wikimedia.org":
        # Skip HTTP check if working on pinkunicorn. PU is firewalled and does
        # not allow us to establish TCP connections to varnish.
        check_http_responses(args.host)

    # Repool
    if not run_cumin(args.host, ['pool']):
        logger.error("Failed repooling {}. Exiting.".format(args.host))
        # Report the failure to the caller instead of returning success; the
        # Icinga downtime is deliberately left in place in this case.
        return 1

    # Cancel Icinga downtime
    icinga_cancel_downtime(args.host)
    return 0