Example #1
0
 def test_get_oscmd_with_valid_os_types(self):
     """
     Verify get_oscmd returns proper modules.

     For every listed OS type, the object returned by ``oscmd.get_oscmd``
     must report ``commissaire.oscmd.<os_type>`` as its ``__module__``.
     """
     for os_type in ('atomic', 'fedora', 'rhel'):
         # assertEqual is used because the assertEquals alias is deprecated.
         self.assertEqual(
             'commissaire.oscmd.{0}'.format(os_type),
             oscmd.get_oscmd(os_type).__module__)
Example #2
0
 def test_get_oscmd_with_valid_os_types(self):
     """
     Verify get_oscmd returns proper modules.

     Iterates over ``available_os_types`` and checks that the object
     returned for each one lives in ``commissaire.oscmd.<os_type>``.
     """
     for os_type in available_os_types:
         expected_module = 'commissaire.oscmd.{0}'.format(os_type)
         # assertEqual is used because the assertEquals alias is deprecated.
         self.assertEqual(
             expected_module, oscmd.get_oscmd(os_type).__module__)
Example #3
0
 def test_get_oscmd_with_valid_os_types(self):
     """
     Verify get_oscmd returns proper modules.

     The ``__module__`` of whatever ``oscmd.get_oscmd`` returns must match
     ``commissaire.oscmd.<os_type>`` for each entry in
     ``available_os_types``.
     """
     for os_type in available_os_types:
         returned = oscmd.get_oscmd(os_type)
         # The deprecated assertEquals alias is replaced by assertEqual.
         self.assertEqual(
             'commissaire.oscmd.{0}'.format(os_type),
             returned.__module__)
Example #4
0
 def test_get_oscmd_with_valid_os_types(self):
     """
     Verify get_oscmd returns proper modules.

     Checks the 'atomic', 'fedora' and 'rhel' OS types: the value returned
     by ``oscmd.get_oscmd`` must come from the matching
     ``commissaire.oscmd`` submodule.
     """
     for os_type in ('atomic', 'fedora', 'rhel'):
         expected_module = 'commissaire.oscmd.{0}'.format(os_type)
         actual_module = oscmd.get_oscmd(os_type).__module__
         # The deprecated assertEquals alias is replaced by assertEqual.
         self.assertEqual(expected_module, actual_module)
    def test_bootstrap(self):
        """
        Verify Transport().bootstrap works as expected.

        With TaskQueueManager patched, bootstrap is first run against a
        mocked OSCmdBase to check the result code and the install calls.
        A second Transport with a mocked ``_run`` is then used to inspect
        the 'commissaire_enable_pkg_repos' playbook variable produced for
        every entry in ``available_os_types``.
        """
        with patch(
                'commissaire.transport.ansibleapi.TaskQueueManager') as _tqm:
            _tqm().run.return_value = 0

            transport = ansibleapi.Transport()
            transport.variable_manager._fact_cache = {}
            oscmd = MagicMock(OSCmdBase)

            config = Config(
                etcd={
                    'uri': urlparse('http://127.0.0.1:2379'),
                },
                kubernetes={
                    'uri': urlparse('http://127.0.0.1:8080'),
                    'token': 'token',
                })

            result, facts = transport.bootstrap(
                '10.2.0.2', 'test/fake_key', config, oscmd)
            # We should have a successful response
            self.assertEqual(0, result)
            # We should see expected calls
            self.assertEqual(1, oscmd.install_docker.call_count)
            self.assertEqual(1, oscmd.install_kube.call_count)

            # Check 'commissaire_enable_pkg_repos' playbook variable
            # for various operating systems.
            transport = ansibleapi.Transport()
            transport._run = MagicMock()
            transport._run.return_value = (0, {})

            needs_enable_repos = ('redhat', 'rhel')

            for os_type in available_os_types:
                oscmd = get_oscmd(os_type)
                # Same address as above; the original literal carried a
                # stray trailing dot.
                result, facts = transport.bootstrap(
                    '10.2.0.2', 'test/fake_key', config, oscmd)
                # The play variables are the fifth positional argument
                # handed to the mocked _run.
                play_vars = transport._run.call_args[0][4]
                command = play_vars['commissaire_enable_pkg_repos']
                if os_type in needs_enable_repos:
                    self.assertIn('subscription-manager repos', command)
                else:
                    self.assertEqual('true', command)  # no-op command
    def test_bootstrap(self):
        """
        Verify Transport().bootstrap works as expected.

        Part one patches TaskQueueManager and bootstraps with a mocked
        OSCmdBase, asserting on the return code and on the docker/kube
        install calls. Part two mocks ``Transport._run`` and verifies the
        'commissaire_enable_pkg_repos' play variable for each OS type in
        ``available_os_types``.
        """
        with patch('commissaire.transport.ansibleapi.TaskQueueManager') as _tqm:
            _tqm().run.return_value = 0

            transport = ansibleapi.Transport()
            transport.variable_manager._fact_cache = {}
            oscmd = MagicMock(OSCmdBase)

            config = Config(
                etcd={
                    'uri': urlparse('http://127.0.0.1:2379'),
                },
                kubernetes={
                    'uri': urlparse('http://127.0.0.1:8080'),
                    'token': 'token',
                }
            )

            result, facts = transport.bootstrap(
                '10.2.0.2', 'test/fake_key', config, oscmd)
            # We should have a successful response
            self.assertEqual(0, result)
            # We should see expected calls
            self.assertEqual(1, oscmd.install_docker.call_count)
            self.assertEqual(1, oscmd.install_kube.call_count)

            # Check 'commissaire_enable_pkg_repos' playbook variable
            # for various operating systems.
            transport = ansibleapi.Transport()
            transport._run = MagicMock()
            transport._run.return_value = (0, {})

            needs_enable_repos = ('redhat', 'rhel')

            for os_type in available_os_types:
                oscmd = get_oscmd(os_type)
                # The address previously had a stray trailing dot.
                result, facts = transport.bootstrap(
                    '10.2.0.2', 'test/fake_key', config, oscmd)
                # Fifth positional argument of the mocked _run call holds
                # the play variables.
                play_vars = transport._run.call_args[0][4]
                command = play_vars['commissaire_enable_pkg_repos']
                if os_type in needs_enable_repos:
                    self.assertIn('subscription-manager repos', command)
                else:
                    self.assertEqual('true', command)  # no-op command
Example #7
0
def investigator(queue, config, run_once=False):
    """
    Investigates new hosts to retrieve and store facts.

    Every work item pulled from the queue goes through three stages:
    facts are gathered through the transport, the host is bootstrapped,
    and registration with the container manager is verified. The host
    record stored under ``/commissaire/hosts/<address>`` is saved after
    each stage with the status that stage produced.

    :param queue: Queue to pull work from.
    :type queue: Queue.Queue
    :param config: Configuration information.
    :type config: commissaire.config.Config
    :param run_once: If True, stop after the first work item is handled.
    :type run_once: bool
    """
    logger = logging.getLogger('investigator')
    logger.info('Investigator started')

    # How many times the container manager is polled before giving up.
    registration_attempts = 3

    while True:
        # Statuses follow:
        # http://commissaire.readthedocs.org/en/latest/enums.html#host-statuses
        transport = ansibleapi.Transport()
        to_investigate, ssh_priv_key = queue.get()
        address = to_investigate['address']
        logger.info('{0} is now in investigating.'.format(address))
        # NOTE(review): this writes the private key to the debug log.
        logger.debug('Investigation details: key={0}, data={1}'.format(
            ssh_priv_key, to_investigate))

        # The key is written to a persistent temporary file; it is removed
        # again through clean_up_key() on every exit path below.
        f = tempfile.NamedTemporaryFile(prefix='key', delete=False)
        key_file = f.name
        logger.debug('Using {0} as the temporary key location for {1}'.format(
            key_file, address))
        f.write(base64.decodestring(ssh_priv_key))
        logger.info('Wrote key for {0}'.format(address))
        f.close()

        key = '/commissaire/hosts/{0}'.format(address)
        etcd_resp, error = cherrypy.engine.publish('store-get', key)[0]
        if error:
            logger.warning(
                'Unable to continue for {0} due to '
                '{1}: {2}. Returning...'.format(address, type(error), error))
            clean_up_key(key_file)
            # Honor run_once here like every other failure path does.
            if run_once:
                break
            continue

        data = json.loads(etcd_resp.value)

        # Stage 1: gather facts about the host.
        try:
            result, facts = transport.get_info(address, key_file)
            data.update(facts)
            data['last_check'] = datetime.datetime.utcnow().isoformat()
            data['status'] = 'bootstrapping'
            logger.info('Facts for {0} retrieved'.format(address))
            logger.debug('Data: {0}'.format(data))
        except Exception as error:
            logger.warning('Getting info failed for {0}: {1}'.format(
                address, error))
            data['status'] = 'failed'
            cherrypy.engine.publish('store-save', key, json.dumps(data))[0]
            clean_up_key(key_file)
            if run_once:
                break
            continue

        cherrypy.engine.publish('store-save', key, json.dumps(data))[0]
        logger.info(
            'Finished and stored investigation data for {0}'.format(address))
        logger.debug('Finished investigation update for {0}: {1}'.format(
            address, data))

        # Stage 2: bootstrap the host.
        logger.info('{0} is now in bootstrapping'.format(address))
        oscmd = get_oscmd(data['os'])
        try:
            result, facts = transport.bootstrap(address, key_file, config,
                                                oscmd)
            data['status'] = 'inactive'
            cherrypy.engine.publish('store-save', key, json.dumps(data))[0]
        except Exception as error:
            logger.warning('Unable to start bootstraping for {0}: {1}'.format(
                address, error))
            data['status'] = 'disassociated'
            cherrypy.engine.publish('store-save', key, json.dumps(data))[0]
            clean_up_key(key_file)
            if run_once:
                break
            continue

        # Stage 3: verify association with the container manager
        try:
            container_mgr = KubeContainerManager(config)
            # Try 3 times waiting 5 seconds between tries before giving up
            for attempt in range(1, registration_attempts + 1):
                if container_mgr.node_registered(address):
                    logger.info(
                        '{0} has been registered with the container '
                        'manager.'.format(address))
                    data['status'] = 'active'
                    break
                # Give up on the last attempt instead of sleeping again.
                if attempt == registration_attempts:
                    msg = 'Could not register with the container manager'
                    logger.warning(msg)
                    raise Exception(msg)
                logger.debug(
                    '{0} has not been registered with the container manager. '
                    'Checking again in 5 seconds...'.format(address))
                sleep(5)
        except Exception as error:
            logger.warning(
                'Unable to finish bootstrap for {0} while associating with '
                'the container manager: {1}'.format(address, error))
            data['status'] = 'inactive'

        cherrypy.engine.publish('store-save', key, json.dumps(data))[0]
        logger.info('Finished bootstrapping for {0}'.format(address))
        logger.debug('Finished bootstrapping for {0}: {1}'.format(
            address, data))

        clean_up_key(key_file)
        if run_once:
            logger.info('Exiting due to run_once request.')
            break

    logger.info('Investigator stopping')
Example #8
0
    def test_bootstrap(self):
        """
        Verify Transport().bootstrap works as expected.

        Three things are checked with TaskQueueManager patched:

        * bootstrap with a mocked OSCmdBase succeeds and triggers the
          docker and kube install calls;
        * etcd store handler configuration is translated into the
          matching ``commissaire_etcd_*`` playbook variables;
        * the 'commissaire_enable_pkg_repos' playbook variable is set as
          expected for every entry in ``available_os_types``.
        """
        with patch(
                'commissaire.transport.ansibleapi.TaskQueueManager') as _tqm:
            _tqm().run.return_value = 0

            transport = ansibleapi.Transport()
            transport.variable_manager._fact_cache = {}
            oscmd = MagicMock(OSCmdBase)

            result, facts = transport.bootstrap(
                '10.2.0.2', Cluster.new().__dict__, 'test/fake_key',
                MagicMock(), oscmd)
            # We should have a successful response
            self.assertEqual(0, result)
            # We should see expected calls
            self.assertEqual(1, oscmd.install_docker.call_count)
            self.assertEqual(1, oscmd.install_kube.call_count)

            # Check user-config to playbook-variable translation.
            etcd_config = {
                'server_url': 'https://192.168.1.1:1234',
                'certificate_ca_path': '/path/to/etcd/ca/cert',
                'certificate_path': '/path/to/etcd/client/cert',
                'certificate_key_path': '/path/to/etcd/client/key'
            }
            kube_config = {
                'server_url': 'https://192.168.2.2:4567',
                'certificate_path': '/path/to/kube/client/cert',
                'certificate_key_path': '/path/to/kube/client/key'
            }
            store_manager = MagicMock(StoreHandlerManager)
            store_manager.list_store_handlers.return_value = [
                (EtcdStoreHandler, etcd_config, ()),
                (KubernetesStoreHandler, kube_config, ())
            ]

            store_manager.get.return_value = Network.new(
                name='default', type='flannel_etcd')

            cluster_data = Cluster.new(
                name='default', network='default').__dict__

            transport = ansibleapi.Transport()
            transport._run = MagicMock()
            transport._run.return_value = (0, {})
            result, facts = transport.bootstrap(
                '10.2.0.2', cluster_data, 'test/fake_key', store_manager,
                oscmd)
            # The play variables are the fifth positional argument handed
            # to the mocked _run.
            play_vars = transport._run.call_args[0][4]
            self.assertEqual(play_vars['commissaire_etcd_server_url'],
                             'https://192.168.1.1:1234')
            self.assertEqual(play_vars['commissaire_etcd_ca_path_local'],
                             '/path/to/etcd/ca/cert')
            self.assertEqual(
                play_vars['commissaire_etcd_client_cert_path_local'],
                '/path/to/etcd/client/cert')
            self.assertEqual(
                play_vars['commissaire_etcd_client_key_path_local'],
                '/path/to/etcd/client/key')

            # Check 'commissaire_enable_pkg_repos' playbook variable
            # for various operating systems.
            transport = ansibleapi.Transport()
            transport._run = MagicMock()
            transport._run.return_value = (0, {})

            needs_enable_repos = ('redhat', 'rhel')

            for os_type in available_os_types:
                oscmd = get_oscmd(os_type)
                # The address previously had a stray trailing dot.
                result, facts = transport.bootstrap(
                    '10.2.0.2', cluster_data, 'test/fake_key',
                    MagicMock(), oscmd)
                play_vars = transport._run.call_args[0][4]
                command = play_vars['commissaire_enable_pkg_repos']
                if os_type in needs_enable_repos:
                    self.assertIn('subscription-manager repos', command)
                else:
                    self.assertEqual('true', command)  # no-op command
Example #9
0
def investigator(queue, config, store_kwargs={}, run_once=False):
    """
    Investigates new hosts to retrieve and store facts.

    Every work item pulled from the queue goes through three stages:
    facts are gathered through the transport, the host is bootstrapped,
    and registration with the container manager is verified. The host
    record stored under ``/commissaire/hosts/<address>`` is written after
    each stage with the status that stage produced.

    :param queue: Queue to pull work from.
    :type queue: Queue.Queue
    :param config: Configuration information.
    :type config: commissaire.config.Config
    :param store_kwargs: Keyword arguments used to make the etcd client.
    :type store_kwargs: dict
    :param run_once: If True, stop after the first work item is handled.
    :type run_once: bool
    """
    logger = logging.getLogger('investigator')
    logger.info('Investigator started')

    # store_kwargs is only unpacked here, never mutated, so the shared
    # default dict is harmless.
    store = etcd.Client(**store_kwargs)

    # How many times the container manager is polled before giving up.
    registration_attempts = 3

    while True:
        # Statuses follow:
        # http://commissaire.readthedocs.org/en/latest/enums.html#host-statuses
        to_investigate, ssh_priv_key, remote_user = queue.get()
        address = to_investigate['address']
        logger.info('{0} is now in investigating.'.format(address))
        # NOTE(review): this writes the private key to the debug log.
        logger.debug(
            'Investigation details: key={0}, data={1}, remote_user={2}'.format(
                ssh_priv_key, to_investigate, remote_user))

        transport = ansibleapi.Transport(remote_user)

        # The key is written to a persistent temporary file; it is removed
        # again through clean_up_key() on every exit path below.
        f = tempfile.NamedTemporaryFile(prefix='key', delete=False)
        key_file = f.name
        logger.debug(
            'Using {0} as the temporary key location for {1}'.format(
                key_file, address))
        f.write(base64.decodestring(ssh_priv_key))
        logger.info('Wrote key for {0}'.format(address))
        f.close()

        key = '/commissaire/hosts/{0}'.format(address)
        try:
            etcd_resp = store.get(key)
        except Exception as error:
            logger.warning(
                'Unable to continue for {0} due to '
                '{1}: {2}. Returning...'.format(address, type(error), error))
            clean_up_key(key_file)
            # Honor run_once here like every other failure path does.
            if run_once:
                break
            continue

        data = json.loads(etcd_resp.value)

        # Stage 1: gather facts about the host.
        try:
            result, facts = transport.get_info(address, key_file)
            data.update(facts)
            data['last_check'] = datetime.datetime.utcnow().isoformat()
            data['status'] = 'bootstrapping'
            logger.info('Facts for {0} retrieved'.format(address))
            logger.debug('Data: {0}'.format(data))
        except Exception as error:
            logger.warning('Getting info failed for {0}: {1}'.format(
                address, error))
            data['status'] = 'failed'
            store.write(key, json.dumps(data))
            clean_up_key(key_file)
            if run_once:
                break
            continue

        store.write(key, json.dumps(data))
        logger.info(
            'Finished and stored investigation data for {0}'.format(address))
        logger.debug('Finished investigation update for {0}: {1}'.format(
            address, data))

        # Stage 2: bootstrap the host.
        logger.info('{0} is now in bootstrapping'.format(address))
        oscmd = get_oscmd(data['os'])
        try:
            result, facts = transport.bootstrap(
                address, key_file, config, oscmd)
            data['status'] = 'inactive'
            store.write(key, json.dumps(data))
        except Exception as error:
            logger.warning('Unable to start bootstraping for {0}: {1}'.format(
                address, error))
            data['status'] = 'disassociated'
            store.write(key, json.dumps(data))
            clean_up_key(key_file)
            if run_once:
                break
            continue

        # Stage 3: verify association with the container manager
        try:
            container_mgr = KubeContainerManager(config)
            # Try 3 times waiting 5 seconds between tries before giving up
            for attempt in range(1, registration_attempts + 1):
                if container_mgr.node_registered(address):
                    logger.info(
                        '{0} has been registered with the container '
                        'manager.'.format(address))
                    data['status'] = 'active'
                    break
                # Give up on the last attempt instead of sleeping again.
                if attempt == registration_attempts:
                    msg = 'Could not register with the container manager'
                    logger.warning(msg)
                    raise Exception(msg)
                logger.debug(
                    '{0} has not been registered with the container manager. '
                    'Checking again in 5 seconds...'.format(address))
                sleep(5)
        except Exception as error:
            logger.warning(
                'Unable to finish bootstrap for {0} while associating with '
                'the container manager: {1}'.format(address, error))
            data['status'] = 'inactive'

        store.write(key, json.dumps(data))
        logger.info(
            'Finished bootstrapping for {0}'.format(address))
        logger.debug('Finished bootstrapping for {0}: {1}'.format(
            address, data))

        clean_up_key(key_file)
        if run_once:
            logger.info('Exiting due to run_once request.')
            break

    logger.info('Investigator stopping')
Example #10
0
def clusterexec(cluster_name, command):
    """
    Remote executes a shell commands across a cluster.

    The progress of the command is tracked in the store under
    ``/commissaire/cluster/<cluster_name>/<command>``. Hosts are processed
    one at a time; the first host that fails stops the run and the final
    status becomes 'failed', otherwise it ends as 'finished'.

    :param cluster_name: Name of the cluster to execute the command on.
    :type cluster_name: str
    :param command: Command to execute: 'upgrade' or 'restart'.
    :type command: str
    """
    logger = logging.getLogger('clusterexec')

    # TODO: This is a hack and should really be done elsewhere
    # NOTE: only 'upgrade' and 'restart' are handled. Any other command
    #       leaves cluster_status unbound and fails below with a NameError.
    if command == 'upgrade':
        finished_hosts_key = 'upgraded'
        cluster_status = {
            "status": 'in_process',
            "upgrade_to": 'latest',
            "upgraded": [],
            "in_process": [],
            "started_at": datetime.datetime.utcnow().isoformat(),
            "finished_at": None,
        }
    elif command == 'restart':
        finished_hosts_key = 'restarted'
        cluster_status = {
            "status": 'in_process',
            "restarted": [],
            "in_process": [],
            "started_at": datetime.datetime.utcnow().isoformat(),
            "finished_at": None
        }

    end_status = 'finished'

    # Store key that holds the status of this command for the cluster.
    status_key = '/commissaire/cluster/{0}/{1}'.format(cluster_name, command)

    # Set the initial status in the store
    logger.info('Setting initial status.')
    logger.debug('Status={0}'.format(cluster_status))
    cherrypy.engine.publish(
        'store-save', status_key, json.dumps(cluster_status))

    # Collect all host addresses in the cluster
    etcd_resp, error = cherrypy.engine.publish(
        'store-get', '/commissaire/clusters/{0}'.format(cluster_name))[0]

    if error:
        logger.warning(
            'Unable to continue for {0} due to '
            '{1}: {2}. Returning...'.format(cluster_name, type(error), error))
        return

    cluster_hosts = set(json.loads(etcd_resp.value).get('hostset', []))
    if cluster_hosts:
        logger.debug(
            '{0} hosts in cluster {1}'.format(
                len(cluster_hosts), cluster_name))
    else:
        logger.warning('No hosts in cluster {0}'.format(cluster_name))

    # TODO: Find better way to do this
    a_hosts, error = cherrypy.engine.publish(
        'store-get', '/commissaire/hosts')[0]
    if error:
        logger.warning(
            'No hosts in the cluster. Error: {0}. Exiting clusterexec'.format(
                error))
        return
    for a_host_dict in a_hosts._children:
        a_host = json.loads(a_host_dict['value'])
        host_address = a_host['address']
        if host_address not in cluster_hosts:
            logger.debug(
                'Skipping {0} as it is not in this cluster.'.format(
                    host_address))
            continue  # Move on to the next one
        oscmd = get_oscmd(a_host['os'])

        command_list = getattr(oscmd, command)()  # Only used for logging
        logger.info('Executing {0} on {1}...'.format(
            command_list, host_address))

        cluster_status['in_process'].append(host_address)
        cherrypy.engine.publish(
            'store-save', status_key, json.dumps(cluster_status))

        # TODO: This is reused, make it reusable
        f = tempfile.NamedTemporaryFile(prefix='key', delete=False)
        key_file = f.name
        logger.debug(
            'Using {0} as the temporary key location for {1}'.format(
                key_file, host_address))
        f.write(base64.decodestring(a_host['ssh_priv_key']))
        logger.debug('Wrote key for {0}'.format(host_address))
        f.close()

        try:
            transport = ansibleapi.Transport()
            exe = getattr(transport, command)
            result, facts = exe(host_address, key_file, oscmd)
        # XXX: ansibleapi explicitly raises Exception()
        except Exception as error:
            # If there was a failure set the end_status and break out.
            # The host is left in 'in_process' on purpose.
            logger.warning('Executing {0} on {1} failed: {2}'.format(
                command, host_address, error))
            end_status = 'failed'
            break
        finally:
            # The temporary key is removed on success and on failure.
            try:
                f.unlink(key_file)
                logger.debug('Removed temporary key file {0}'.format(key_file))
            except Exception:
                logger.warning(
                    'Unable to remove the temporary key file: {0}'.format(
                        key_file))

        # Move the host from 'in_process' to the finished list.
        cluster_status[finished_hosts_key].append(host_address)
        try:
            idx = cluster_status['in_process'].index(host_address)
            cluster_status['in_process'].pop(idx)
        except ValueError:
            logger.warning('Host {0} was not in_process for {1} {2}'.format(
                host_address, command, cluster_name))

        cherrypy.engine.publish(
            'store-save', status_key, json.dumps(cluster_status))
        logger.info('Finished executing {0} for {1} in {2}'.format(
            command, host_address, cluster_name))

    # Final set of command result
    cluster_status['finished_at'] = datetime.datetime.utcnow().isoformat()
    cluster_status['status'] = end_status

    logger.debug('Cluster {0} final {1} status: {2}'.format(
        cluster_name, command, cluster_status))

    cherrypy.engine.publish(
        'store-save', status_key, json.dumps(cluster_status))

    logger.info('Clusterexec stopping')
Example #11
0
def investigator(queue, config, store, run_once=False):
    """
    Investigates new hosts to retrieve and store facts.

    Every work item pulled from the queue goes through three stages:
    facts are gathered through the transport, the host is bootstrapped,
    and registration with the container manager is verified. The host
    record stored under ``/commissaire/hosts/<address>`` is set after each
    stage with the status that stage produced.

    :param queue: Queue to pull work from.
    :type queue: gevent.queue.Queue
    :param config: Configuration information.
    :type config: commissaire.config.Config
    :param store: Data store to place results.
    :type store: etcd.Client
    :param run_once: If True, stop after the first work item is handled.
    :type run_once: bool
    """
    # TODO: Change this to be watch and etcd "queue" and kick off a function
    #       similar to clusterpoolexec
    logger = logging.getLogger("investigator")
    logger.info("Investigator started")

    # How many times the container manager is polled before giving up.
    registration_attempts = 3

    transport = ansibleapi.Transport()
    while True:
        # Statuses follow:
        # http://commissaire.readthedocs.org/en/latest/enums.html#host-statuses
        to_investigate, ssh_priv_key = queue.get()
        address = to_investigate["address"]
        logger.info("{0} is now in investigating.".format(address))
        # NOTE(review): this writes the private key to the debug log.
        logger.debug(
            "Investigation details: key={0}, data={1}".format(
                ssh_priv_key, to_investigate))

        # The key is written to a persistent temporary file; it is removed
        # again through clean_up_key() on the exit paths below.
        f = tempfile.NamedTemporaryFile(prefix="key", delete=False)
        key_file = f.name
        logger.debug(
            "Using {0} as the temporary key location for {1}".format(
                key_file, address))
        f.write(base64.decodestring(ssh_priv_key))
        logger.debug("Wrote key for {0}".format(address))
        f.close()

        key = "/commissaire/hosts/{0}".format(address)
        data = json.loads(store.get(key).value)

        # Stage 1: gather facts about the host.
        try:
            result, facts = transport.get_info(address, key_file)
            data.update(facts)
            data["last_check"] = datetime.datetime.utcnow().isoformat()
            data["status"] = "bootstrapping"
            logger.info("Facts for {0} retrieved".format(address))
        except Exception as error:
            logger.warning("Getting info failed for {0}".format(address))
            data["status"] = "failed"
            store.set(key, json.dumps(data))
            logger.debug("{0} Exception: {1}".format(address, error))
            clean_up_key(key_file)
            if run_once:
                break
            continue

        store.set(key, json.dumps(data))
        logger.info(
            "Finished and stored investigation data for {0}".format(address))
        logger.debug(
            "Finished investigation update for {0}: {1}".format(
                address, data))

        # Stage 2: bootstrap the host.
        logger.info("{0} is now in bootstrapping".format(address))
        oscmd = get_oscmd(data["os"])()
        try:
            result, facts = transport.bootstrap(
                address, key_file, config, oscmd)
            data["status"] = "inactive"
            store.set(key, json.dumps(data))
        except Exception as error:
            logger.warning("Unable to bootstrap {0}".format(address))
            logger.debug("{0} Exception: {1}".format(address, error))
            data["status"] = "disassociated"
            store.set(key, json.dumps(data))
            clean_up_key(key_file)
            if run_once:
                break
            continue

        # Stage 3: verify association with the container manager
        try:
            container_mgr = KubeContainerManager(config)
            # Try 3 times waiting 5 seconds between tries before giving up
            for attempt in range(1, registration_attempts + 1):
                if container_mgr.node_registered(address):
                    logger.info(
                        "{0} has been registered with the container "
                        "manager.".format(address))
                    data["status"] = "active"
                    break
                # Give up on the last attempt instead of sleeping again.
                if attempt == registration_attempts:
                    raise Exception(
                        "Could not register with the container manager")
                logger.debug(
                    "{0} has not been registered with the container manager. "
                    "Checking again in 5 seconds...".format(address)
                )
                gevent.sleep(5)
        except Exception as error:
            logger.warning("Unable to bootstrap {0}".format(address))
            # Log the exception itself, as the other handlers do.
            logger.debug("{0} Exception: {1}".format(address, error))
            data["status"] = "inactive"

        store.set(key, json.dumps(data))
        logger.info("Finished bootstrapping for {0}".format(address))
        logger.debug(
            "Finished bootstrapping for {0}: {1}".format(address, data))

        clean_up_key(key_file)
        if run_once:
            logger.info("Exiting due to run_once request.")
            break

    logger.info("Investigator stopping")
Example #12
0
def clusterexec(cluster_name, command, store):
    """
    Remote executes a shell commands across a cluster.

    The progress of the command is tracked in the store under
    ``/commissaire/cluster/<cluster_name>/<command>``. Hosts are processed
    one at a time; the first host whose command returns a non-zero result
    stops the run and the final status becomes 'failed', otherwise it
    ends as 'finished'.

    :param cluster_name: Name of the cluster to execute the command on.
    :type cluster_name: str
    :param command: Command to execute: 'upgrade' or 'restart'.
    :type command: str
    :param store: Data store to place results.
    :type store: etcd.Client
    """
    logger = logging.getLogger('clusterexec')

    # TODO: This is a hack and should really be done elsewhere
    # NOTE: only 'upgrade' and 'restart' are handled. Any other command
    #       leaves cluster_status unbound and fails below with a NameError.
    if command == 'upgrade':
        finished_hosts_key = 'upgraded'
        cluster_status = {
            "status": 'in_process',
            "upgrade_to": 'latest',
            "upgraded": [],
            "in_process": [],
            "started_at": datetime.datetime.utcnow().isoformat(),
            "finished_at": None,
        }
    elif command == 'restart':
        finished_hosts_key = 'restarted'
        cluster_status = {
            "status": 'in_process',
            "restarted": [],
            "in_process": [],
            "started_at": datetime.datetime.utcnow().isoformat(),
            "finished_at": None
        }

    end_status = 'finished'

    # Store key that holds the status of this command for the cluster.
    status_key = '/commissaire/cluster/{0}/{1}'.format(cluster_name, command)

    # Set the initial status in the store
    logger.info('Setting initial status.')
    logger.debug('Status={0}'.format(cluster_status))
    store.set(status_key, json.dumps(cluster_status))

    # Collect all host addresses in the cluster
    etcd_resp = store.get('/commissaire/clusters/{0}'.format(cluster_name))
    cluster_hosts = set(json.loads(etcd_resp.value).get('hostset', []))
    if cluster_hosts:
        logger.debug('{0} hosts in cluster {1}'.format(
            len(cluster_hosts), cluster_name))
    else:
        # The field index must be {0}: only one argument is supplied, so
        # {1} raised IndexError for every empty cluster.
        logger.warning('No hosts in cluster {0}'.format(cluster_name))

    # TODO: Find better way to do this
    for a_host_dict in store.get('/commissaire/hosts')._children:
        a_host = json.loads(a_host_dict['value'])
        host_address = a_host['address']
        if host_address not in cluster_hosts:
            logger.debug('Skipping {0} as it is not in this cluster.'.format(
                host_address))
            continue  # Move on to the next one
        oscmd = get_oscmd(a_host['os'])

        command_list = getattr(oscmd(), command)()  # Only used for logging
        logger.info('Executing {0} on {1}...'.format(
            command_list, host_address))

        cluster_status['in_process'].append(host_address)
        store.set(status_key, json.dumps(cluster_status))

        # TODO: This is reused, make it reusable
        f = tempfile.NamedTemporaryFile(prefix='key', delete=False)
        key_file = f.name
        logger.debug('Using {0} as the temporary key location for {1}'.format(
            key_file, host_address))
        f.write(base64.decodestring(a_host['ssh_priv_key']))
        logger.debug('Wrote key for {0}'.format(host_address))
        f.close()

        try:
            transport = ansibleapi.Transport()
            result, facts = getattr(transport, command)(
                host_address, key_file, oscmd())
        finally:
            # Remove the temporary key even when the transport call raises,
            # so the private key never stays behind on disk.
            try:
                f.unlink(key_file)
                logger.debug('Removed temporary key file {0}'.format(key_file))
            except Exception:
                logger.warning(
                    'Unable to remove the temporary key file: {0}'.format(
                        key_file))

        # If there was a failure set the end_status and break out.
        # The host is left in 'in_process' on purpose.
        if result != 0:
            end_status = 'failed'
            break

        # Move the host from 'in_process' to the finished list.
        cluster_status[finished_hosts_key].append(host_address)
        try:
            idx = cluster_status['in_process'].index(host_address)
            cluster_status['in_process'].pop(idx)
        except ValueError:
            logger.warning('Host {0} was not in_process for {1} {2}'.format(
                host_address, command, cluster_name))

        store.set(status_key, json.dumps(cluster_status))
        logger.info('Finished executing {0} for {1} in {2}'.format(
            command, host_address, cluster_name))

    # Final set of command result
    cluster_status['finished_at'] = datetime.datetime.utcnow().isoformat()
    cluster_status['status'] = end_status
    store.set(status_key, json.dumps(cluster_status))

    logger.info('Clusterexec stopping')
Example #13
0
def investigator(queue, config, run_once=False):
    """
    Investigates new hosts to retrieve and store facts.

    Work items are pulled from ``queue`` as
    ``(store_manager, to_investigate, ssh_priv_key, remote_user)`` tuples.
    For every item the host record is loaded through ``store_manager``,
    facts are gathered with ``transport.get_info``, the host is bootstrapped
    with ``transport.bootstrap`` and, when the host belongs to a Kubernetes
    cluster, its registration with the container manager is verified. The
    host is saved through ``store_manager`` after each stage.

    :param queue: Queue to pull work from.
    :type queue: Queue.Queue
    :param config: Configuration information.
    :type config: commissaire.config.Config
    :param run_once: When True, stop after a single work item has been
                     processed, whether it succeeded or failed.
    :type run_once: bool
    """
    logger = logging.getLogger('investigator')
    logger.info('Investigator started')

    # How many times registration with the container manager is checked
    max_checks = 3

    while True:
        # Statuses follow:
        # http://commissaire.readthedocs.org/en/latest/enums.html#host-statuses
        store_manager, to_investigate, ssh_priv_key, remote_user = queue.get()
        address = to_investigate['address']
        logger.info('{0} is now in investigating.'.format(address))
        # NOTE(review): this writes the private key to the debug log;
        # confirm that is acceptable for this deployment.
        logger.debug(
            'Investigation details: key={0}, data={1}, remote_user={2}'.format(
                ssh_priv_key, to_investigate, remote_user))

        transport = ansibleapi.Transport(remote_user)

        # Reset per work item so the failure path below never touches an
        # unbound name or the key that belongs to a previous host.
        key = None
        try:
            host = store_manager.get(
                Host(
                    address=address,
                    status='',
                    os='',
                    cpus=0,
                    memory=0,
                    space=0,
                    last_check='',
                    ssh_priv_key='',
                    remote_user=''))
            key = TemporarySSHKey(host, logger)
            key.create()
        except Exception as error:
            logger.warn(
                'Unable to continue for {0} due to '
                '{1}: {2}. Returning...'.format(address, type(error), error))
            # The key only exists if the host lookup itself succeeded
            if key is not None:
                key.remove()
            # Honour run_once here as well, otherwise the next queue.get()
            # would block forever.
            if run_once:
                break
            continue

        try:
            result, facts = transport.get_info(address, key.path)
            # recreate the host instance with new data
            data = json.loads(host.to_json(secure=True))
            data.update(facts)
            host = Host(**data)
            host.last_check = datetime.datetime.utcnow().isoformat()
            host.status = 'bootstrapping'
            logger.info('Facts for {0} retrieved'.format(address))
            logger.debug('Data: {0}'.format(host.to_json()))
        except Exception as error:
            logger.warn('Getting info failed for {0}: {1}'.format(
                address, error))
            host.status = 'failed'
            store_manager.save(host)
            key.remove()
            if run_once:
                break
            continue

        store_manager.save(host)
        logger.info(
            'Finished and stored investigation data for {0}'.format(address))
        logger.debug('Finished investigation update for {0}: {1}'.format(
            address, host.to_json()))

        logger.info('{0} is now in bootstrapping'.format(address))
        oscmd = get_oscmd(host.os)
        try:
            result, facts = transport.bootstrap(
                address, key.path, config, oscmd, store_manager)
            host.status = 'inactive'
            store_manager.save(host)
        except Exception as error:
            logger.warn('Unable to start bootstraping for {0}: {1}'.format(
                address, error))
            host.status = 'disassociated'
            store_manager.save(host)
            key.remove()
            if run_once:
                break
            continue

        # NOTE(review): this stores the cluster type constant in host.status
        # as well as in cluster_type; kept as-is, but confirm that the
        # status is really meant to hold a cluster type value.
        host.status = cluster_type = C.CLUSTER_TYPE_HOST
        try:
            cluster = util.cluster_for_host(address, store_manager)
            cluster_type = cluster.type
        except KeyError:
            # Not part of a cluster
            pass

        # Verify association with the container manager
        if cluster_type == C.CLUSTER_TYPE_KUBERNETES:
            try:
                container_mgr = KubeContainerManager(config)
                # Try 3 times waiting 5 seconds each time before giving up
                for attempt in range(1, max_checks + 1):
                    if container_mgr.node_registered(address):
                        logger.info(
                            '{0} has been registered with the '
                            'container manager.'.format(address))
                        host.status = 'active'
                        break
                    # Give up after the last check instead of sleeping again
                    if attempt == max_checks:
                        msg = 'Could not register with the container manager'
                        logger.warn(msg)
                        raise Exception(msg)
                    logger.debug(
                        '{0} has not been registered with the container '
                        ' manager. Checking again in 5 seconds...'.format(
                            address))
                    sleep(5)
            except Exception as error:
                logger.warn(
                    'Unable to finish bootstrap for {0} while associating '
                    'with the container manager: {1}'.format(
                        address, error))
                host.status = 'inactive'

        store_manager.save(host)
        logger.info(
            'Finished bootstrapping for {0}'.format(address))
        logger.debug('Finished bootstrapping for {0}: {1}'.format(
            address, host.to_json()))

        key.remove()
        if run_once:
            logger.info('Exiting due to run_once request.')
            break

    logger.info('Investigator stopping')
Example #14
0
def clusterexec(cluster_name, command, store):
    """
    Remote executes a shell commands across a cluster.

    The progress document is written to the store under
    ``/commissaire/cluster/<cluster_name>/<command>`` before the run, after
    each host starts, after each host finishes and once more at the end
    with the final status. Execution stops at the first host whose
    transport call returns a non-zero result.

    :param cluster_name: Name of the cluster to act on.
    :type cluster_name: str
    :param command: Command to execute; ``upgrade`` or ``restart``.
    :type command: str
    :param store: Data store to place results.
    :type store: etcd.Client
    :raises ValueError: If ``command`` is not a supported command.
    """
    # Local import: only needed here to delete the temporary key file
    import os

    logger = logging.getLogger('clusterexec')

    # TODO: This is a hack and should really be done elsewhere
    if command == 'upgrade':
        finished_hosts_key = 'upgraded'
        cluster_status = {
            "status": 'inprocess',
            "upgrade_to": 'latest',
            "upgraded": [],
            "in_process": [],
            "started_at": datetime.datetime.utcnow().isoformat(),
            "finished_at": None,
        }
    elif command == 'restart':
        finished_hosts_key = 'restarted'
        cluster_status = {
            "status": 'inprocess',
            "restarted": [],
            "in_process": [],
            "started_at": datetime.datetime.utcnow().isoformat(),
            "finished_at": None
        }
    else:
        # Fail with a clear message rather than an UnboundLocalError on the
        # first use of cluster_status.
        raise ValueError(
            'Unsupported clusterexec command: {0}'.format(command))

    end_status = 'finished'
    # Every status write for this run goes to the same store key
    status_key = '/commissaire/cluster/{0}/{1}'.format(cluster_name, command)

    # Set the initial status in the store
    logger.info('Setting initial status.')
    logger.debug('Status={0}'.format(cluster_status))
    store.set(status_key, json.dumps(cluster_status))

    # TODO: Find better way to do this
    for a_host_dict in store.get('/commissaire/hosts')._children:
        a_host = json.loads(a_host_dict['value'])
        if a_host['cluster'] != cluster_name:
            logger.debug('Skipping {0} as it is not in this cluster.'.format(
                a_host['address']))
            continue  # Move on to the next one
        oscmd = get_oscmd(a_host['os'])

        command_list = getattr(oscmd(), command)()  # Only used for logging
        logger.info('Executing {0} on {1}...'.format(
            command_list, a_host['address']))

        cluster_status['in_process'].append(a_host['address'])
        store.set(status_key, json.dumps(cluster_status))

        # TODO: This is reused, make it reusable
        f = tempfile.NamedTemporaryFile(prefix='key', delete=False)
        key_file = f.name
        try:
            try:
                logger.debug(
                    'Using {0} as the temporary key location for {1}'.format(
                        key_file, a_host['address']))
                f.write(base64.b64decode(a_host['ssh_priv_key']))
                logger.debug('Wrote key for {0}'.format(a_host['address']))
            finally:
                f.close()

            transport = ansibleapi.Transport()
            result, facts = getattr(transport, command)(
                a_host['address'], key_file, oscmd())
        finally:
            # Always delete the private key, even if the transport raised
            try:
                os.unlink(key_file)
                logger.debug('Removed temporary key file {0}'.format(key_file))
            except OSError:
                logger.warn(
                    'Unable to remove the temporary key file: {0}'.format(
                        key_file))

        # If there was a failure set the end_status and break out
        if result != 0:
            end_status = 'failed'
            break

        cluster_status[finished_hosts_key].append(a_host['address'])
        try:
            cluster_status['in_process'].remove(a_host['address'])
        except ValueError:
            logger.warn('Host {0} was not in_process for {1} {2}'.format(
                a_host['address'], command, cluster_name))

        store.set(status_key, json.dumps(cluster_status))
        logger.info('Finished executing {0} for {1} in {2}'.format(
            command, a_host['address'], cluster_name))

    # Final set of command result
    cluster_status['finished_at'] = datetime.datetime.utcnow().isoformat()
    cluster_status['status'] = end_status
    store.set(status_key, json.dumps(cluster_status))

    logger.info('Clusterexec stopping')
Example #15
0
def clusterexec(store_manager, cluster_name, command, kwargs=None):
    """
    Remote executes a shell commands across a cluster.

    A progress model (``ClusterUpgrade``, ``ClusterRestart`` or
    ``ClusterDeploy``) is saved through ``store_manager`` before the run,
    as each host starts and finishes, and once more at the end with the
    final status. Execution stops at the first host whose transport call
    raises. Store failures and unsupported commands are logged and end the
    run early; nothing is raised for them.

    :param store_manager: Proxy object for remote stores
    :type store_manager: commissaire.store.StoreHandlerManager
    :param cluster_name: Name of the cluster to act on
    :type cluster_name: str
    :param command: Top-level command to execute; ``upgrade``, ``restart``
                    or ``deploy``
    :type command: str
    :param kwargs: Keyword arguments for the command; ``deploy`` reads
                   ``version`` from it. Defaults to an empty dict.
    :type kwargs: dict or None
    """
    logger = logging.getLogger('clusterexec')

    # Use a fresh dict per call instead of a shared mutable default
    if kwargs is None:
        kwargs = {}

    # TODO: This is a hack and should really be done elsewhere
    command_args = ()
    if command == 'upgrade':
        finished_hosts_key = 'upgraded'
        model_instance = ClusterUpgrade.new(
            name=cluster_name,
            status='in_process',
            started_at=datetime.datetime.utcnow().isoformat(),
            upgraded=[],
            in_process=[],
        )
    elif command == 'restart':
        finished_hosts_key = 'restarted'
        model_instance = ClusterRestart.new(
            name=cluster_name,
            status='in_process',
            started_at=datetime.datetime.utcnow().isoformat(),
            restarted=[],
            in_process=[],
        )
    elif command == 'deploy':
        finished_hosts_key = 'deployed'
        version = kwargs.get('version', '')
        command_args = (version,)
        model_instance = ClusterDeploy.new(
            name=cluster_name,
            status='in_process',
            started_at=datetime.datetime.utcnow().isoformat(),
            version=version,
            deployed=[],
            in_process=[],
        )
    else:
        # Without a model there is nothing to track; report it explicitly
        # rather than tripping over an unbound model_instance below.
        logger.error(
            'Unsupported clusterexec command "{0}" for cluster "{1}". '
            'Returning...'.format(command, cluster_name))
        return

    end_status = 'finished'

    try:
        # Set the initial status in the store
        logger.info('Setting initial status.')
        logger.debug('Status={0}'.format(model_instance.to_json()))
        store_manager.save(model_instance)
    except Exception as error:
        logger.error(
            'Unable to save initial state for "{0}" clusterexec due to '
            '{1}: {2}'.format(cluster_name, type(error), error))
        return

    # Collect all host addresses in the cluster
    try:
        cluster = store_manager.get(Cluster.new(
            name=cluster_name, status='', hostset=[]))
    except Exception as error:
        logger.warn(
            'Unable to continue for cluster "{0}" due to '
            '{1}: {2}. Returning...'.format(cluster_name, type(error), error))
        return

    if cluster.hostset:
        logger.debug(
            '{0} hosts in cluster "{1}"'.format(
                len(cluster.hostset), cluster_name))
    else:
        logger.warn('No hosts in cluster "{0}"'.format(cluster_name))

    # TODO: Find better way to do this
    try:
        hosts = store_manager.list(Hosts(hosts=[]))
    except Exception as error:
        logger.warn(
            'No hosts in the cluster. Error: {0}. Exiting clusterexec'.format(
                error))
        return

    for host in hosts.hosts:
        if host.address not in cluster.hostset:
            logger.debug(
                'Skipping {0} as it is not in this cluster.'.format(
                    host.address))
            continue  # Move on to the next one
        oscmd = get_oscmd(host.os)

        # command_list is only used for logging
        command_list = getattr(oscmd, command)(*command_args)
        logger.info('Executing {0} on {1}...'.format(
            command_list, host.address))

        model_instance.in_process.append(host.address)
        try:
            store_manager.save(model_instance)
        except Exception as error:
            logger.error(
                'Unable to save in_process state for "{0}" clusterexec due to '
                '{1}: {2}'.format(cluster_name, type(error), error))
            return

        key = TemporarySSHKey(host, logger)
        key.create()

        try:
            transport = ansibleapi.Transport(host.remote_user)
            exe = getattr(transport, command)
            result, facts = exe(
                host.address, key.path, oscmd, kwargs)
        # XXX: ansibleapi explicitly raises Exception()
        except Exception as ex:
            # If there was a failure set the end_status and break out
            end_status = 'failed'
            logger.error('Clusterexec {0} for {1} failed: {2}: {3}'.format(
                command, host.address, type(ex), ex))
            break
        finally:
            # Runs on success and on failure so the key never lingers
            try:
                key.remove()
                logger.debug('Removed temporary key file {0}'.format(key.path))
            except Exception:
                logger.warn(
                    'Unable to remove the temporary key file: {0}'.format(
                        key.path))

        # Set the finished hosts
        new_finished_hosts = getattr(
            model_instance, finished_hosts_key) + [host.address]
        setattr(
            model_instance,
            finished_hosts_key,
            new_finished_hosts)
        try:
            idx = model_instance.in_process.index(host.address)
            model_instance.in_process.pop(idx)
        except ValueError:
            logger.warn('Host {0} was not in_process for {1} {2}'.format(
                host.address, command, cluster_name))
        try:
            store_manager.save(model_instance)
            logger.info('Finished executing {0} for {1} in {2}'.format(
                command, host.address, cluster_name))
        except Exception as error:
            logger.error(
                'Unable to save cluster state for "{0}" clusterexec due to '
                '{1}: {2}'.format(cluster_name, type(error), error))
            return

    # Final set of command result
    model_instance.finished_at = datetime.datetime.utcnow().isoformat()
    model_instance.status = end_status

    logger.info('Cluster {0} final {1} status: {2}'.format(
        cluster_name, command, model_instance.to_json()))

    try:
        store_manager.save(model_instance)
    except Exception as error:
        logger.error(
            'Unable to save final state for "{0}" clusterexec due to '
            '{1}: {2}'.format(cluster_name, type(error), error))

    logger.info('Clusterexec stopping')