Example #1
    def __init__(self, args, spicerack):
        """Initialize the runner"""
        if args.cluster is not None:
            self.query = 'A:{}'.format(args.cluster)
        else:
            self.query = args.query
        ensure_shell_is_durable()

        self.cassandra_nodes = spicerack.remote().query(self.query)
        self.icinga_hosts = spicerack.icinga_hosts(self.cassandra_nodes.hosts)
        self.reason = spicerack.admin_reason(args.reason)
        self.instance_sleep_seconds = args.instance_sleep_seconds
        self.batch_sleep_seconds = args.batch_sleep_seconds

        logger.info(
            'Checking that all Cassandra nodes are reported up by their systemd unit status.'
        )
        # perhaps we should create a c-foreach-status script?
        # See also https://phabricator.wikimedia.org/T229916
        status_cmd = """\
                STRING=''; \
                for i in $(c-ls) ; do STRING="${STRING} cassandra-${i}" ; done ; \
                systemctl status $STRING\
                """
        self.cassandra_nodes.run_sync(status_cmd)
Example #2
    def __init__(self, args, spicerack):
        """Change Hadoop distribution on all the clients of a given cluster"""
        if args.cluster == 'test':
            cumin_labels = HADOOP_TEST_CLIENT_CUMIN_ALIASES
        elif args.cluster == 'analytics':
            cumin_labels = HADOOP_CLIENT_CUMIN_ALIASES
        else:
            raise RuntimeError("Hadoop cluster {} not supported.".format(args.cluster))

        ensure_shell_is_durable()

        spicerack_remote = spicerack.remote()
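        # Optionally restrict the run to a single, known client Cumin alias.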
        if args.cumin_client_label:
            if args.cumin_client_label not in cumin_labels:
                raise RuntimeError(
                    "Cumin label {} not supported. Please use one of: {}"
                    .format(args.cumin_client_label, cumin_labels))
            cumin_labels = [args.cumin_client_label]

        self.hadoop_client_hosts = spicerack_remote.query(' or '.join(cumin_labels))
        self.icinga_hosts = spicerack.icinga_hosts(self.hadoop_client_hosts.hosts)
        self.admin_reason = spicerack.admin_reason('Change Hadoop distribution')
        self.rollback = args.rollback
        self.cluster = args.cluster

        ask_confirmation(
            "This cookbook assumes that the Hadoop cluster runs already the new distro, "
            "please do not proceed otherwise.")
Example #3
    def __init__(self, args, spicerack):
        """Initialize an Hadoop worker."""
        self.success_percent_cumin = args.success_percent / 100
        self.skip_disks = args.skip_disks
        self.disks_number = args.disks_number
        self.hostname_pattern = args.hostname_pattern
        self.partitions_basedir = args.partitions_basedir
        self.wipe_partitions = args.wipe_partitions
        self.hadoop_workers = spicerack.remote().query(self.hostname_pattern)

        letters = list(string.ascii_lowercase)
        if len(letters[self.skip_disks:]) < self.disks_number:
            raise RuntimeError(
                'The number of available letters is not enough to support {} disks, '
                'please check your parameters:\n{}'.format(
                    self.disks_number, letters[self.skip_disks:]))

        # The usable disk labels start right after the skipped disks
        # (e.g. skip_disks=2 -> labels start at 'c').
        self.available_disk_labels = letters[
            self.skip_disks:self.disks_number + self.skip_disks]

        ask_confirmation(
            'Please check that the hosts to initialize are the expected ones: {}'
            .format(self.hadoop_workers.hosts))

        ask_confirmation(
            'Please check that the disk labels to act on are the expected '
            'ones: {}'.format(str(self.available_disk_labels)))

        ensure_shell_is_durable()
Example #4
def test_ensure_shell_is_durable_interactive(mocked_isatty):
    """Should raise WmflibError if in an interactive shell."""
    mocked_isatty.return_value = True
    with pytest.raises(WmflibError, match='Must be run in non-interactive mode or inside a screen or tmux.'):
        interactive.ensure_shell_is_durable()

    assert mocked_isatty.called
Example #5
def run(args, spicerack):
    """Required by Spicerack API."""
    ensure_shell_is_durable()
    session = Session()
    session.verify = False
    return_code = 0
    current_password = get_secret('Current password')
    new_password = get_secret("New password", confirm=True)

    session.auth = (args.username, current_password)

    _pdus = pdus.get_pdu_ips(spicerack.netbox(), args.query)

    for pdu in _pdus:
        try:
            if not spicerack.dry_run:
                change_password(pdu, session, new_password)
            else:
                logger.info('%s: Dry run, not trying.', pdu)
            if args.check_default:
                if pdus.check_default(pdu, session):
                    # TODO: delete default user
                    return_code = 1
        except (pdus.VersionError, PasswordResetError) as error:
            logger.error(error)
            return_code = 1
    return return_code
Example #6
    def __init__(self, args, spicerack):
        """Restart Presto on a given cluster."""
        ensure_shell_is_durable()
        self.cluster = args.cluster
        self.presto_workers = spicerack.remote().query("A:presto-" + self.cluster)
        self.icinga_hosts = spicerack.icinga_hosts(self.presto_workers.hosts)
        self.admin_reason = spicerack.admin_reason('Roll restart of all Presto\'s jvm daemons.')
Example #7
    def __init__(self, args, spicerack):
        """Restart ORES daemons on a given cluster."""
        cluster_cumin_alias = "A:ores-" + args.cluster
        self.cluster = args.cluster
        self.ores_workers = spicerack.remote().query(cluster_cumin_alias)
        self.icinga_hosts = spicerack.icinga_hosts(self.ores_workers.hosts)
        self.admin_reason = spicerack.admin_reason('Roll restart of ORES\'s daemons.')
        self.daemons = args.daemons
        self.spicerack = spicerack
        self.confctl = spicerack.confctl('node')
        ensure_shell_is_durable()
Example #8
    def __init__(self, args, spicerack):
        """Initialize the runner."""
        ensure_shell_is_durable()
        self.cluster = args.cluster
        self.remote = spicerack.remote()
        self.confctl = spicerack.confctl('node')
        self.aqs_canary = self.remote.query('A:' + args.cluster + '-canary')
        self.aqs_workers = self.remote.query('A:' + args.cluster)
        self.icinga_hosts = spicerack.icinga_hosts(self.aqs_workers.hosts)
        self.admin_reason = spicerack.admin_reason(
            'Roll restart of all AQS\'s nodejs daemons.')
Example #9
    def __init__(self, args, spicerack):
        """Initialize the runner"""
        if args.cluster == 'test':
            self.cluster_cumin_alias = 'A:hadoop-worker-test'
            self.hdfs_jn_cumin_alias = 'A:hadoop-hdfs-journal-test'
        elif args.cluster == 'analytics':
            self.cluster_cumin_alias = 'A:hadoop-worker'
            self.hdfs_jn_cumin_alias = 'A:hadoop-hdfs-journal'
        else:
            raise RuntimeError("Hadoop cluster {} not supported.".format(
                args.cluster))

        ensure_shell_is_durable()

        self.cluster = args.cluster
        self.hadoop_workers = spicerack.remote().query(
            self.cluster_cumin_alias)
        self.hadoop_hdfs_journal_workers = spicerack.remote().query(
            self.hdfs_jn_cumin_alias)
        self.icinga_hosts = spicerack.icinga_hosts(self.hadoop_workers.hosts)
        self.admin_reason = spicerack.admin_reason(
            'Roll restart of jvm daemons for openjdk upgrade.')

        self.yarn_nm_batch_size = args.yarn_nm_batch_size
        self.yarn_nm_sleep = args.yarn_nm_sleep_seconds

        # Not configurable on purpose, too risky!
        self.hdfs_jn_batch_size = 1
        self.hdfs_jn_sleep = args.hdfs_jn_sleep_seconds

        self.hdfs_dn_batch_size = args.hdfs_dn_batch_size
        self.hdfs_dn_sleep = args.hdfs_dn_sleep_seconds

        # Safety checks
        if self.hdfs_dn_batch_size > 5:
            ask_confirmation(
                'The HDFS Datanode batch size is bigger than 5, are you sure?')
        if self.hdfs_dn_sleep < 20:
            ask_confirmation(
                'The HDFS Datanode sleep between each batch is less than 20s, are you sure?'
            )
        if self.hdfs_jn_sleep < 20:
            ask_confirmation(
                'The HDFS Journalnode sleep between each batch is less than 20s, are you sure?'
            )
        if self.yarn_nm_batch_size > 10:
            ask_confirmation(
                'The Yarn Nodemanager batch size is bigger than 10, are you sure?'
            )
        if self.yarn_nm_sleep < 20:
            ask_confirmation(
                'The Yarn Nodemanager sleep between each batch is less than 20s, are you sure?'
            )
Example #10
    def __init__(self, args, spicerack):
        """Restart druid daemons on a given cluster."""
        cluster_cumin_alias = "A:druid-" + args.cluster
        self.need_depool = False
        if args.cluster == 'public':
            self.need_depool = True
        self.cluster = args.cluster
        self.druid_workers = spicerack.remote().query(cluster_cumin_alias)
        self.icinga_hosts = spicerack.icinga_hosts(self.druid_workers.hosts)
        self.admin_reason = spicerack.admin_reason(
            'Roll restart of Druid jvm daemons.')
        self.daemons = args.daemons
        ensure_shell_is_durable()
Example #11
    def __init__(self, args, spicerack):
        """Reboot Presto on a given cluster."""
        ensure_shell_is_durable()

        self.icinga_hosts = spicerack.icinga_hosts
        self.puppet = spicerack.puppet
        self.admin_reason = spicerack.admin_reason('Reboot Presto nodes')
        self.remote = spicerack.remote()

        self.cluster = args.cluster

        cluster_cumin_alias = 'A:presto-' + self.cluster

        self.presto_workers = self.remote.query(cluster_cumin_alias)
Example #12
    def __init__(self, args, spicerack):
        """Change Hadoop distribution on a given cluster"""
        if args.cluster == 'test':
            suffix = '-test'
        elif args.cluster == 'analytics':
            suffix = ''
        else:
            raise RuntimeError("Hadoop cluster {} not supported.".format(args.cluster))

        ensure_shell_is_durable()

        spicerack_remote = spicerack.remote()

        self.hadoop_hosts = spicerack_remote.query(CLUSTER_CUMIN_ALIAS + suffix)
        self.hadoop_hdfs_journal_workers = spicerack_remote.query(HDFS_JOURNAL_CUMIN_ALIAS + suffix)
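        # If a more specific Cumin query was provided, narrow the journal node set to the
        # intersection with the cluster alias, re-querying via Cumin's direct backend (D{...}).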
        if args.journalnodes_cumin_query:
            hadoop_hdfs_journal_override = spicerack_remote.query(args.journalnodes_cumin_query)
            self.hadoop_hdfs_journal_workers = spicerack_remote.query(
                "D{{{}}}".format(
                    self.hadoop_hdfs_journal_workers.hosts.intersection(hadoop_hdfs_journal_override.hosts)))
            ask_confirmation(
                'The cookbook will run only on the following journal hosts ({}), please verify that '
                'the list looks correct: {}'
                .format(len(self.hadoop_hdfs_journal_workers), self.hadoop_hdfs_journal_workers))

        self.hadoop_workers = spicerack_remote.query(WORKERS_CUMIN_ALIAS + suffix)
        if args.workers_cumin_query:
            hadoop_workers_override = spicerack_remote.query(args.workers_cumin_query)
            self.hadoop_workers = spicerack_remote.query(
                "D{{{}}}".format(self.hadoop_workers.hosts.intersection(hadoop_workers_override.hosts)))
            ask_confirmation(
                'The cookbook will run only on the following worker hosts ({}), please verify that '
                'the list looks correct: {}'
                .format(len(self.hadoop_workers), self.hadoop_workers))

        self.hadoop_master = spicerack_remote.query(MASTER_CUMIN_ALIAS + suffix)
        self.hadoop_standby = spicerack_remote.query(STANDBY_CUMIN_ALIAS + suffix)

        self.icinga_hosts = spicerack.icinga_hosts(self.hadoop_hosts.hosts)
        self.admin_reason = spicerack.admin_reason('Change Hadoop distribution')

        self.rollback = args.rollback
        self.cluster = args.cluster

        self.apt_install_options = '-y -o Dpkg::Options::="--force-confdef" -o Dpkg::Options::="--force-confold"'

        # Workaround needed for https://issues.apache.org/jira/browse/YARN-8310
        self.yarn_metadata_cleanup_commands = [
            f'setAcl /yarn-rmstore/analytics{suffix}-hadoop/ZKRMStateRoot world:anyone:cdrwa',
            f'rmr /yarn-rmstore/analytics{suffix}-hadoop/ZKRMStateRoot']
Example #13
    def __init__(self, args, spicerack):
        """Reboot kafka on a given cluster."""
        ensure_shell_is_durable()

        self.icinga_hosts = spicerack.icinga_hosts
        self.admin_reason = spicerack.admin_reason('Reboot kafka nodes')
        self.puppet = spicerack.puppet
        self.remote = spicerack.remote()

        self.sleep_before_pref_replica_election = args.sleep_before_pref_replica_election
        self.batch_sleep_seconds = args.batch_sleep_seconds
        self.cluster = args.cluster

        cluster_cumin_alias = "A:kafka-" + args.cluster

        self.kafka_brokers = self.remote.query(cluster_cumin_alias)
Example #14
    def __init__(self, args, spicerack):
        """Decommission a host from all inventories."""
        ensure_shell_is_durable()
        self.remote = spicerack.remote()
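        # Resolve the hosts to decommission via PuppetDB; if the query fails or matches
        # nothing, fall back to plain hostname expansion and ask for confirmation.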
        try:
            self.decom_hosts = self.remote.query(args.query).hosts
        except RemoteError:
            logger.debug("Query '%s' did not match any host or failed",
                         args.query,
                         exc_info=True)
            decom_hosts = NodeSet(args.query)
            ask_confirmation(
                'ATTENTION: the query does not match any host in PuppetDB or failed\n'
                'Hostname expansion matches {n} hosts: {hosts}\n'
                'Do you want to proceed anyway?'.format(n=len(decom_hosts),
                                                        hosts=decom_hosts))
            self.decom_hosts = decom_hosts

        if len(self.decom_hosts) > 20:
            raise RuntimeError(
                'Matched {} hosts, aborting. (max 20 with --force, 5 without)'.
                format(len(self.decom_hosts)))

        if len(self.decom_hosts) > 5:
            if args.force:
                logger.info(
                    'Authorized decommissioning of %s hosts with --force',
                    len(self.decom_hosts))
            else:
                raise RuntimeError(
                    'Matched {} hosts and --force not set, aborting. (max 20 with --force, 5 without)'
                    .format(len(self.decom_hosts)))

        ask_confirmation(
            'ATTENTION: destructive action for {n} hosts: {hosts}\nAre you sure to proceed?'
            .format(n=len(self.decom_hosts), hosts=self.decom_hosts))

        self.spicerack = spicerack
        self.task_id = args.task_id
        self.puppet_master = self.remote.query(get_puppet_ca_hostname())
        self.kerberos_kadmin = self.remote.query(KERBEROS_KADMIN_CUMIN_ALIAS)
        self.dns = self.spicerack.dns()
        self.deployment_host = self.remote.query(
            self.dns.resolve_cname(DEPLOYMENT_HOST))
        self.patterns = get_grep_patterns(self.dns, self.decom_hosts)
        self.reason = self.spicerack.admin_reason('Host decommission',
                                                  task_id=self.task_id)
Example #15
    def __init__(self, args, spicerack):
        """Reboot Druid on a given cluster."""
        ensure_shell_is_durable()

        self.icinga_hosts = spicerack.icinga_hosts  # Store the method to be called on each host
        self.puppet = spicerack.puppet
        self.spicerack = spicerack
        self.admin_reason = spicerack.admin_reason('Reboot Druid nodes')
        self.remote = spicerack.remote()

        self.cluster = args.cluster

        cluster_cumin_alias = 'A:druid-' + self.cluster

        self.druid_workers = self.remote.query(cluster_cumin_alias)

        self.need_depool = self.cluster == 'public'
Example #16
    def __init__(self, args, spicerack):
        """Upgrade MySQL on a given set of hosts."""
        ensure_shell_is_durable()

        self.icinga_hosts = spicerack.icinga_hosts
        self.admin_reason = spicerack.admin_reason('MySQL upgrade')
        self.remote = spicerack.remote()
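        # Wrap the user-provided query in Cumin's PuppetDB backend (P{...}) and restrict it
        # to single-instance database hosts.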
        query = 'P{' + args.query + '} and A:db-all and not A:db-multiinstance'
        self.hosts = spicerack.remote().query(query)
        self.puppet = spicerack.puppet
        self.logger = logging.getLogger(__name__)
        if not self.hosts:
            raise RuntimeError('No hosts have been found, exiting')
        if len(self.hosts) <= 5:
            self.hosts_message = str(self.hosts)
        else:
            self.hosts_message = f'{len(self.hosts)} hosts'
Example #17
    def __init__(self, args, spicerack):
        """Initialize the runner"""
        ensure_shell_is_durable()

        self.cluster_cumin_alias = "A:zookeeper-" + args.cluster
        self.zookeeper = spicerack.remote().query(self.cluster_cumin_alias)
        self.icinga_hosts = spicerack.icinga_hosts(self.zookeeper.hosts)
        self.admin_reason = spicerack.admin_reason(
            'Roll restart of jvm daemons.')
        self.batch_sleep_seconds = args.batch_sleep_seconds

        # Safety checks
        self.zookeeper.run_sync('echo stats | nc -q 1 localhost 2181')

        logger.info('\n=========================================\n')
        ask_confirmation(
            'Please check the status of Zookeeper before proceeding. '
            'There must be only one leader and the rest must be followers.')
Example #18
    def __init__(self, args, spicerack):
        """Add a new node to a Ganeti cluster."""
        self.cluster, self.row, self.datacenter = get_locations()[
            args.location]
        ganeti = spicerack.ganeti()
        self.remote = spicerack.remote()
        self.master = self.remote.query(ganeti.rapi(self.cluster).master)
        self.remote_host = self.remote.query(args.fqdn)
        self.fqdn = args.fqdn
        self.group = args.group

        ensure_shell_is_durable()

        if len(self.remote_host) == 0:
            raise RuntimeError('Specified server not found, bailing out')

        if len(self.remote_host) != 1:
            raise RuntimeError('Only a single server can be added at a time')
Example #19
    def __init__(self, args, spicerack):
        """Initialize the runner."""
        if args.cluster == 'test':
            self.suffix = '-test'
            self.cluster = 'test'
        elif args.cluster == 'analytics':
            self.suffix = ''
            self.cluster = 'analytics'
        else:
            raise RuntimeError("Hadoop cluster {} not supported.".format(args.cluster))

        ensure_shell_is_durable()

        self.remote = spicerack.remote()
        self.hadoop_master = self.remote.query('A:hadoop-master' + self.suffix)
        self.hadoop_standby = self.remote.query('A:hadoop-standby' + self.suffix)
        self.icinga_hosts = spicerack.icinga_hosts(self.hadoop_master.hosts | self.hadoop_standby.hosts)
        self.admin_reason = spicerack.admin_reason('Restart of jvm daemons.')

        self.yarn_rm_sleep = args.yarn_rm_sleep_seconds
        self.hdfs_nn_sleep = args.hdfs_nn_sleep_seconds

        # Safety checks
        if self.hdfs_nn_sleep < 600:
            ask_confirmation('The HDFS Namenode restart sleep is less than 600s, are you sure?')
        if self.yarn_rm_sleep < 60:
            ask_confirmation('The Yarn Resourcemanager restart sleep is less than 60s, are you sure?')
        if len(self.hadoop_master) != 1:
            raise RuntimeError("Expecting exactly one Hadoop master server. Found: {}".format(self.hadoop_master))
        if len(self.hadoop_standby) != 1:
            raise RuntimeError("Expecting exactly one Hadoop standby server. Found: {}".format(self.hadoop_standby))

        # This is needed due to the format of the hostname in the command, for example:
        # sudo -u hdfs /usr/bin/hdfs haadmin -getServiceState an-master1001-eqiad-wmnet
        self.hadoop_master_service = self.hadoop_master.hosts[0].replace('.', '-')
        self.hadoop_standby_service = self.hadoop_standby.hosts[0].replace('.', '-')

        logger.info('Checking HDFS and Yarn daemon status. We expect active statuses on the Master node, '
                    'and standby statuses on the other. Please do not proceed otherwise.')

        print_hadoop_service_state(
            self.hadoop_master, self.hadoop_master_service, self.hadoop_standby_service)

        ask_confirmation('Please make sure that the active/standby nodes shown are correct.')
Example #20
    def __init__(self, args, spicerack):
        """Reboot all workers of a given Hadoop cluster."""
        if args.cluster == 'test':
            self.cluster_cumin_alias = 'A:hadoop-worker-test'
            self.hdfs_jn_cumin_alias = 'A:hadoop-hdfs-journal-test'
        elif args.cluster == 'analytics':
            self.cluster_cumin_alias = 'A:hadoop-worker'
            self.hdfs_jn_cumin_alias = 'A:hadoop-hdfs-journal'
        else:
            raise RuntimeError("Hadoop cluster {} not supported.".format(args.cluster))

        ensure_shell_is_durable()

        self.cluster = args.cluster
        self.spicerack_remote = spicerack.remote()
        self.spicerack = spicerack
        self.reboot_batch_size = args.batch_size
        self.yarn_nm_sleep_seconds = args.yarn_nm_sleep_seconds
        self.workers_cumin_query = args.workers_cumin_query
        self.reason = spicerack.admin_reason('Reboot.')
Example #21
    def __init__(self, args, spicerack):
        """Create a new Virtual Machine in Ganeti."""
        self.cluster, self.row, self.datacenter = get_locations()[args.location]
        self.hostname = args.hostname
        self.vcpus = args.vcpus
        self.memory = args.memory
        self.network = args.network
        self.disk = args.disk
        self.skip_v6 = args.skip_v6
        self.spicerack = spicerack
        self.netbox = self.spicerack.netbox(read_write=True)
        self.fqdn = make_fqdn(self.hostname, self.network, self.datacenter)
        self.allocated = []  # Store allocated IPs to rollback them on failure
        self.dns_propagated = False  # Whether to run the DNS cookbook on rollback
        self.need_netbox_sync = False  # Whether to sync the VM to Netbox on rollback

        print('Ready to create Ganeti VM {a.fqdn} in the {a.cluster} cluster on row {a.row} with {a.vcpus} vCPUs, '
              '{a.memory}GB of RAM, {a.disk}GB of disk in the {a.network} network.'.format(a=self))
        ask_confirmation('Is this correct?')

        ensure_shell_is_durable()
Example #22
    def __init__(self, args: Namespace, spicerack: Spicerack) -> None:
        """Initialize the runner."""
        ensure_shell_is_durable()
        if args.alias and args.alias not in self.allowed_aliases:
            raise ValueError(
                f"Alias ({args.alias}) does not match allowed aliases: " +
                ', '.join(self.allowed_aliases))
        self._args = args
        self.query = self._query()
        self.hosts = spicerack.remote().query(self.query)
        if not self.hosts:
            raise ValueError(f'Cumin query ({self.query}) matched zero hosts')

        self.number_of_batches = ceil(len(self.hosts.hosts) / args.batchsize)
        self.results = Results(action=args.action, hosts=self.hosts.hosts)

        reason = f'{args.action} {self.hosts.hosts}: {args.reason}'
        self.reason = spicerack.admin_reason(reason, args.task_id)
        self._spicerack = spicerack
        self.logger = getLogger('.'.join(
            (self.__module__, self.__class__.__name__)))
Example #23
    def __init__(self, args, spicerack):
        """Initialize the runner."""
        ensure_shell_is_durable()
        self.cluster_cumin_alias = "A:kafka-" + args.cluster
        self.kafka_brokers = spicerack.remote().query(self.cluster_cumin_alias)
        self.icinga_hosts = spicerack.icinga_hosts(self.kafka_brokers.hosts)
        self.admin_reason = spicerack.admin_reason(
            'Roll restart of jvm daemons for openjdk upgrade.')
        self.batch_sleep_seconds = args.batch_sleep_seconds
        self.sleep_before_pref_replica_election = args.sleep_before_pref_replica_election

        ask_confirmation(
            'Please check the Grafana dashboard of the cluster and make sure that '
            'topic partition leaders are well balanced and that all brokers are up and running.'
        )

        if args.sleep_before_pref_replica_election < 900:
            ask_confirmation(
                'The sleep time between a broker restart and kafka preferred-replica-election '
                'is less than 900 seconds. The broker needs some time to recover after a restart. '
                'Are you sure?')
Example #24
def run(args, spicerack):
    """Required by Spicerack API."""
    if spicerack.dry_run:
        logger.info('This cookbook does nothing with --dry-run')
        return 0
    ensure_shell_is_durable()
    session = Session()
    session.verify = False
    return_code = 0
    current_password = get_secret('Current password')
    session.auth = (args.username, current_password)

    _pdus = pdus.get_pdu_ips(spicerack.netbox(), args.query)

    for pdu in _pdus:
        try:
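            # When args.since is set, skip PDUs whose uptime is below it, as they
            # have already been rebooted recently.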
            if args.since:
                uptime = pdus.parse_uptime(pdus.get_uptime(pdu, session))
                if uptime < args.since:
                    logger.info('%s: Not rebooting, uptime is %d', pdu, uptime)
                    continue
            reboot_time = datetime.utcnow()
            version = pdus.get_version(pdu, session)
            pdus.reboot(pdu, version, session)
            # Reboots from experience take at least 60 seconds
            logger.info('%s: sleep while reboot', pdu)
            sleep(60)
            pdus.wait_reboot_since(pdu, reboot_time, session)
        except (pdus.VersionError, pdus.RebootError,
                pdus.UptimeError) as error:
            logger.error(error)
            return_code = 1
        if args.check_default:
            if pdus.check_default(pdu, session):
                # TODO: delete default user
                return_code = 1
    return return_code
Example #25
def run(args, spicerack):
    """Required by Spicerack API."""
    ensure_shell_is_durable()
    return_code = 0
    session = Session()
    session.verify = False
    password = get_secret('Enter login password')
    snmp_ro = get_secret('New SNMP RO String', confirm=True)

    session.auth = (args.username, password)

    _pdus = pdus.get_pdu_ips(spicerack.netbox(), args.query)

    for pdu in _pdus:
        snmp_rw = random_string() if args.reset_rw else None
        try:
            if not spicerack.dry_run:
                version = pdus.get_version(pdu, session)
                if change_snmp(pdu, version, session, snmp_ro, snmp_rw,
                               args.force):
                    reboot_time = datetime.utcnow()
                    pdus.reboot(pdu, version, session)
                    # Reboots from experience take at least 60 seconds
                    logger.info('%s: sleep while reboot', pdu)
                    sleep(60)
                    pdus.wait_reboot_since(pdu, reboot_time, session)
            else:
                logger.info('%s: Dry run, not trying.', pdu)
            if args.check_default:
                if pdus.check_default(pdu, session):
                    # TODO: delete default user
                    pass
        except (pdus.VersionError, SnmpResetError, pdus.RebootError) as error:
            logger.error(error)
            return_code = 1
    return return_code
Example #26
    def __init__(self, args, spicerack):
        """Initiliaze the provision runner."""
        ensure_shell_is_durable()
        self.args = args

        self.netbox = spicerack.netbox()
        self.netbox_server = spicerack.netbox_server(self.args.host)
        self.netbox_data = self.netbox_server.as_dict()
        self.fqdn = self.netbox_server.mgmt_fqdn
        self.ipmi = spicerack.ipmi(self.fqdn)
        self.remote = spicerack.remote()
        if self.netbox_server.virtual:
            raise RuntimeError(
                f'Host {self.args.host} is a virtual machine. VMs are not supported.'
            )

        if self.netbox_data['device_type']['manufacturer']['slug'] != 'dell':
            vendor = self.netbox_data['device_type']['manufacturer']['name']
            raise RuntimeError(
                f'Host {self.args.host} manufacturer is {vendor}. Only Dell is supported.'
            )

        if self.netbox_server.status == 'active' and (not self.args.no_dhcp or
                                                      not self.args.no_users):
            raise RuntimeError(
                f'Host {self.args.host} has active status in Netbox but --no-dhcp and --no-users were not set.'
            )

        # DHCP automation
        try:
            self.dhcp_hosts = self.remote.query(
                f'A:installserver-light and A:{self.netbox_data["site"]["slug"]}'
            )
        except RemoteError:  # Fallback to eqiad's install server if the above fails, i.e. for a new DC
            self.dhcp_hosts = self.remote.query(
                'A:installserver-light and A:eqiad')

        self.dhcp = spicerack.dhcp(self.dhcp_hosts)
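        # Fetch the management IP registered in Netbox for this host; it is reused below
        # for the DHCP configuration and the static iDRAC network settings.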
        address = self.netbox.api.ipam.ip_addresses.get(
            dns_name=self.fqdn).address
        self.interface = ipaddress.ip_interface(address)
        self.dhcp_config = DHCPConfMgmt(
            datacenter=self.netbox_data['site']['slug'],
            serial=self.netbox_data['serial'],
            fqdn=self.fqdn,
            ipv4=self.interface.ip,
        )
        if self.args.no_users:
            password = ''  # nosec
        else:
            password = DELL_DEFAULT

        if self.netbox_server.status in ('active', 'staged'):
            self.reboot_policy = DellSCPRebootPolicy.GRACEFUL
        else:
            self.reboot_policy = DellSCPRebootPolicy.FORCED

        self.redfish = spicerack.redfish(self.fqdn, 'root', password)
        self.mgmt_password = spicerack.management_password

        # Testing that the management password is correct connecting to the current cumin host
        localhost = gethostname()
        netbox_localhost = spicerack.netbox_server(localhost)
        try:
            spicerack.redfish(netbox_localhost.mgmt_fqdn,
                              'root').check_connection()
        except RedfishError:
            raise RuntimeError(
                f'The management password provided seems incorrect, it does not work on {localhost}.'
            ) from None

        self.config_changes = {
            'BIOS.Setup.1-1': {
                'BootMode': 'Bios',
                'CpuInterconnectBusLinkPower': 'Enabled',
                'EnergyPerformanceBias': 'BalancedPerformance',
                'InternalUsb': 'Off',
                'PcieAspmL1': 'Enabled',
                'ProcC1E': 'Enabled',
                'ProcCStates': 'Enabled',
                'ProcPwrPerf': 'OsDbpm',
                'ProcVirtualization': 'Enabled' if self.args.enable_virtualization else 'Disabled',
                'ProcX2Apic': 'Disabled',
                'SerialComm': 'OnConRedirCom2',
                'SerialPortAddress': 'Serial1Com1Serial2Com2',
                'SysProfile': 'PerfPerWattOptimizedOs',
                'UncoreFrequency': 'DynamicUFS',
                'UsbPorts': 'OnlyBackPortsOn',
            },
            'iDRAC.Embedded.1': {
                'IPMILan.1#Enable': 'Enabled',
                'IPv4.1#DHCPEnable': 'Disabled',
                'IPv4Static.1#Address': str(self.interface.ip),
                'IPv4Static.1#DNS1': DNS_ADDRESS,
                'IPv4Static.1#Gateway': str(next(self.interface.network.hosts())),
                'IPv4Static.1#Netmask': str(self.interface.netmask),
                'NICStatic.1#DNSDomainFromDHCP': 'Disabled',
            },
            'System.Embedded.1': {
                'ServerPwr.1#PSRapidOn': 'Disabled',
            }
        }

        netbox_host = self.netbox.api.dcim.devices.get(name=self.args.host)
        self.multi_gigabit = False
        if 'gbase-' in netbox_host.primary_ip.assigned_object.type.value:
            logger.info(
                'Detected multi-gigabit interface, will add specific settings.'
            )
            self.multi_gigabit = True

        ask_confirmation(
            f'Are you sure to proceed to apply BIOS/iDRAC settings {self.runtime_description}?'
        )
Example #27
def test_ensure_shell_is_durable_non_interactive(mocked_isatty):
    """Should raise WmflibError if in an interactive shell."""
    mocked_isatty.return_value = False
    interactive.ensure_shell_is_durable()
    assert mocked_isatty.called
Example #28
def run(args, spicerack):  # pylint: disable=too-many-return-statements
    """Required by Spicerack API."""
    ensure_shell_is_durable()

    logger.info('Get source image checksum')
    dns = spicerack.dns()
    image_server = dns.resolve_ptr(dns.resolve_ipv4('apt.wikimedia.org')[0])[0]
    remote = spicerack.remote()
    image_server = remote.query(image_server)
    cmd = "sha1sum /srv/junos/{} | cut -d' ' -f1".format(args.image)
    results = image_server.run_sync(cmd)
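    # run_sync() returns an iterator of (hosts, output) tuples; read the checksum
    # from the first result.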
    for _, output in results:
        src_checksum = output.message().decode()
        break
    if len(src_checksum) != 40:
        logger.info(src_checksum)
        logger.error('Can\'t checksum, is the file there and readable?')
        return 1
    device = remote.query('D{' + args.fqdn + '}')
    if len(device.hosts) > 1:
        logger.error('Only 1 target device please.')
        return 1

    logger.info('Cleanup device storage')
    results = device.run_sync(
        'request system storage cleanup no-confirm | display json')
    json_output = output_to_json(results)
    if not json_output:
        return 1
    if 'success' not in json_output['system-storage-cleanup-information'][0]:
        logger.info(json_output)
        logger.error('Command did not run successfully')
        return 1

    logger.info('Copy image to device')
    cmd = 'file copy "https://apt.wikimedia.org/junos/{}" /var/tmp/'.format(
        args.image)
    device.run_sync(cmd)

    logger.info('Compare checksums')
    cmd = 'file checksum sha1 /var/tmp/{} | display json'.format(args.image)

    results = device.run_sync(cmd)
    json_output = output_to_json(results)
    if not json_output:
        return 1
    try:
        dst_checksum = json_output['checksum-information'][0]['file-checksum'][
            0]['checksum'][0]['data']
    except KeyError:
        logger.info(json_output)
        logger.error(
            'Can\'t generate destination side checksum, did the file copy go well?'
        )
        return 1

    if src_checksum != dst_checksum:
        logger.error('Checksum mismatch, maybe partial file transfer?')
        return 1

    logger.info('Save rescue config')
    results = device.run_sync(
        'request system configuration rescue save | display json')
    json_output = output_to_json(results)
    if not json_output:
        return 1
    if 'success' not in json_output['rescue-management-results'][0][
            'routing-engine'][0]:
        logger.info(json_output)
        logger.error('Command did not run successfully.')
        return 1

    logger.info('Validate image')
    if 'vmhost' in args.image:
        logger.info('Introduced in Junos OS Release 18.4R1, good luck.')
    else:
        cmd = 'request system software validate /var/tmp/{}.tgz'.format(
            args.image)
        if not present_in_output(device.run_sync(cmd), 'Validation succeeded'):
            logger.error('Validation failed, try running it manually.')
            return 1
    logger.info('Ready for next cookbook')
    return 0
Example #29
def test_ensure_shell_is_durable_sty(mocked_isatty, env_name, env_value, monkeypatch):
    """Should not raise if in an interactive shell with STY set, TMUX set or a screen-line TERM."""
    mocked_isatty.return_value = True
    monkeypatch.setenv(env_name, env_value)
    interactive.ensure_shell_is_durable()
    assert mocked_isatty.called
Example #30
    def __init__(self, args, spicerack):
        """Initiliaze the reimage runner."""
        ensure_shell_is_durable()
        self.args = args
        self.host = self.args.host

        self.netbox = spicerack.netbox()
        self.netbox_server = spicerack.netbox_server(self.host,
                                                     read_write=True)
        self.netbox_data = self.netbox_server.as_dict()

        ask_confirmation(
            f'ATTENTION: destructive action for host: {self.host}\nAre you sure to proceed?'
        )

        # Shortcut variables
        self.fqdn = self.netbox_server.fqdn
        self.mgmt_fqdn = self.netbox_server.mgmt_fqdn
        self.output_filename = self._get_output_filename(spicerack.username)
        self.actions = spicerack.actions
        self.host_actions = self.actions[self.host]
        self.confctl_services = []

        if self.netbox_server.virtual:
            raise RuntimeError(
                f'Host {self.host} is a virtual machine. VMs are not yet supported.'
            )

        self.dns = spicerack.dns()
        self.icinga_host = spicerack.icinga_hosts([self.host])
        self.ipmi = spicerack.ipmi(self.mgmt_fqdn)
        self.reason = spicerack.admin_reason('Host reimage',
                                             task_id=self.args.task_id)
        self.puppet_master = spicerack.puppet_master()
        self.debmonitor = spicerack.debmonitor()
        self.confctl = spicerack.confctl('node')
        self.remote = spicerack.remote()
        self.spicerack = spicerack

        try:
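            # Check whether the host is already known to PuppetDB to validate --new;
            # brand-new hosts are queried via Cumin's direct backend instead.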
            self.remote_host = self.remote.query(self.fqdn)
            if self.args.new:
                ask_confirmation(
                    f'Host {self.fqdn} was found in PuppetDB but --new was set. Are you sure you want to '
                    'proceed? The --new option will be unset')
                self.args.new = False  # Unset --new
                logger.info('The option --new has been unset')
        except RemoteError as e:
            self.remote_host = self.remote.query(
                f'D{{{self.fqdn}}}')  # Use the Direct backend instead
            if not self.args.new:
                raise RuntimeError(
                    f'Host {self.fqdn} was not found in PuppetDB but --new was not set. Check that the '
                    'FQDN is correct. If the host is new or has disappeared from PuppetDB because down '
                    'for too long use --new.') from e

        if len(self.remote_host) != 1:
            raise RuntimeError(
                f'Expected 1 host for query {self.fqdn} but got {len(self.remote_host)}: {self.remote_host}'
            )

        # The same as self.remote_host but using the SSH key valid only during installation before the first Puppet run
        self.remote_installer = spicerack.remote(installer=True).query(
            self.fqdn)
        # Get a Puppet instance for the current cumin host to update the known hosts file
        remote_localhost = self.remote.query(f'{self.reason.hostname}.*')
        if len(remote_localhost) != 1:
            raise RuntimeError(
                f'Localhost matched the wrong number of hosts ({len(remote_localhost)}) for '
                f'query "{self.reason.hostname}.*": {remote_localhost}')
        self.puppet_localhost = spicerack.puppet(remote_localhost)
        self.puppet = spicerack.puppet(self.remote_host)
        # The same as self.puppet but using the SSH key valid only during installation before the first Puppet run
        self.puppet_installer = spicerack.puppet(self.remote_installer)

        # DHCP automation
        try:
            self.dhcp_hosts = self.remote.query(
                f'A:installserver-light and A:{self.netbox_data["site"]["slug"]}'
            )
        except RemoteError:  # Fallback to eqiad's install server if the above fails, i.e. for a new DC
            self.dhcp_hosts = self.remote.query(
                'A:installserver-light and A:eqiad')
        self.dhcp = spicerack.dhcp(self.dhcp_hosts)
        self.dhcp_config = self._get_dhcp_config()

        self._validate()

        # Keep track of some specific actions for the eventual rollback
        self.rollback_masks = False
        self.rollback_depool = False

        if self.args.task_id is not None:
            self.phabricator = spicerack.phabricator(
                PHABRICATOR_BOT_CONFIG_FILE)
        else:
            self.phabricator = None