Example #1
    def _run_playbook(self, cluster, playbook, extra_args):
        run_id = ('elasticluster.{name}.{date}.{pid}@{host}'.format(
            name=cluster.name,
            date=datetime.now().isoformat(),
            pid=os.getpid(),
            host=platform.node(),
        ))
        inventory_path = self._build_inventory(cluster)
        if inventory_path is None:
            # no inventory file has been created: this can only happen
            # if no nodes have been started nor can be reached
            raise ClusterSizeError()
        assert os.path.exists(inventory_path), (
            "inventory file `{inventory_path}` does not exist".format(
                inventory_path=inventory_path))

        # build list of directories to search for roles/include files
        ansible_roles_dirs = [
            # include Ansible default first ...
            '/etc/ansible/roles',
        ]
        for root_path in [
                # ... then ElastiCluster's built-in defaults
                resource_filename('elasticluster', 'share/playbooks'),
                # ... then wherever the playbook is
                os.path.dirname(playbook),
        ]:
            for path in [
                    root_path,
                    os.path.join(root_path, 'roles'),
            ]:
                if path not in ansible_roles_dirs and os.path.exists(path):
                    ansible_roles_dirs.append(path)

        # Use env vars to configure Ansible;
        # see all values in https://github.com/ansible/ansible/blob/devel/lib/ansible/constants.py
        #
        # Ansible does not merge keys in configuration files: rather
        # it uses the first configuration file found.  However,
        # environment variables can be used to selectively override
        # parts of the config; according to [1]: "they are mostly
        # considered to be a legacy system as compared to the config
        # file, but are equally valid."
        #
        # [1]: http://docs.ansible.com/ansible/intro_configuration.html#environmental-configuration
        #
        # Provide default values for important configuration variables...
        ansible_env = {
            'ANSIBLE_FORKS': ('%d' % (4 * get_num_processors())),
            'ANSIBLE_HOST_KEY_CHECKING': 'no',
            'ANSIBLE_RETRY_FILES_ENABLED': 'no',
            'ANSIBLE_ROLES_PATH': ':'.join(reversed(ansible_roles_dirs)),
            'ANSIBLE_SSH_PIPELINING': 'yes',
            'ANSIBLE_TIMEOUT': '120',
        }
        try:
            import ara
            ara_location = os.path.dirname(ara.__file__)
            ansible_env['ANSIBLE_CALLBACK_PLUGINS'] = (
                '{ara_location}/plugins/callbacks'.format(
                    ara_location=ara_location))
            ansible_env['ANSIBLE_ACTION_PLUGINS'] = (
                '{ara_location}/plugins/actions'.format(
                    ara_location=ara_location))
            ansible_env['ANSIBLE_LIBRARY'] = (
                '{ara_location}/plugins/modules'.format(
                    ara_location=ara_location))
            ara_dir = os.getcwd()
            ansible_env['ARA_DIR'] = ara_dir
            ansible_env['ARA_DATABASE'] = (
                'sqlite:///{ara_dir}/{run_id}.ara.sqlite'.format(
                    ara_dir=ara_dir, run_id=run_id))
            ansible_env['ARA_LOG_CONFIG'] = ('{run_id}.ara.yml'.format(
                ara_dir=ara_dir, run_id=run_id))
            ansible_env['ARA_LOG_FILE'] = ('{run_id}.ara.log'.format(
                ara_dir=ara_dir, run_id=run_id))
            ansible_env['ARA_LOG_LEVEL'] = 'DEBUG'
            ansible_env['ARA_PLAYBOOK_PER_PAGE'] = '0'
            ansible_env['ARA_RESULT_PER_PAGE'] = '0'
        except ImportError:
            elasticluster.log.info(
                "Could not import module `ara`:"
                " no detailed information about the playbook will be recorded."
            )
        # ...override them with key/values set in the config file(s)
        for k, v in self.extra_conf.items():
            if k.startswith('ansible_'):
                ansible_env[k.upper()] = str(v)
        # ...finally, allow the environment to have the final word
        ansible_env.update(os.environ)
        # however, this is needed for correct detection of success/failure...
        ansible_env['ANSIBLE_ANY_ERRORS_FATAL'] = 'yes'
        # ...and this might be needed to connect (see issue #567)
        if cluster.ssh_proxy_command:
            ansible_env['ANSIBLE_SSH_ARGS'] = (
                ansible_env.get('ANSIBLE_SSH_ARGS', '')
                + (" -o ProxyCommand='{proxy_command}'"
                   # NOTE: in contrast to `Node.connect()`, we must
                   # *not* expand %-escapes in the SSH proxy command:
                   # it will be done by the `ssh` client
                   .format(proxy_command=cluster.ssh_proxy_command)))

        # report on calling environment
        if __debug__:
            elasticluster.log.debug(
                "Calling `ansible-playbook` with the following environment:")
            for var, value in sorted(ansible_env.items()):
                # sanity check: do not print passwords or secrets
                if "password" in var.lower() or "secret" in var.lower():
                    elasticluster.log.debug("- %s=******", var)
                else:
                    elasticluster.log.debug("- %s=%r", var, value)

        elasticluster.log.debug("Using playbook file %s.", playbook)

        # build `ansible-playbook` command-line
        cmd = shlex.split(
            self.extra_conf.get('ansible_command', 'ansible-playbook'))
        cmd += [
            ('--private-key=' + cluster.user_key_private),
            os.path.realpath(playbook),
            ('--inventory=' + inventory_path),
        ]

        if self._sudo:
            cmd.extend([
                # force all plays to use `sudo` (even if not marked as such)
                '--become',
                # desired sudo-to user
                ('--become-user=' + self._sudo_user),
            ])

        # determine Ansible verbosity as a function of ElastiCluster's
        # log level (we cannot read `ElastiCluster().params.verbose`
        # here, still we can access the log configuration since it's
        # global).
        verbosity = int(
            (logging.WARNING - elasticluster.log.getEffectiveLevel()) / 10)
        if verbosity > 0:
            cmd.append('-' + ('v' * verbosity))  # e.g., `-vv`

        # append any additional arguments provided by users in config file
        ansible_extra_args = self.extra_conf.get('ansible_extra_args', None)
        if ansible_extra_args:
            cmd += shlex.split(ansible_extra_args)

        # finally, append any additional arguments provided on command-line
        for arg in extra_args:
            # XXX: since we are going to change working directory,
            # make sure that anything that looks like a path to an
            # existing file is made absolute before appending to
            # Ansible's command line.  (Yes, this is an ugly hack.)
            if os.path.exists(arg):
                arg = os.path.abspath(arg)
            cmd.append(arg)

        with temporary_dir():
            # adjust execution environment for the part that needs
            # the current directory path
            cmd += ['-e', ('@' + self._write_extra_vars(cluster))]
            # run it!
            cmdline = ' '.join(cmd)
            elasticluster.log.debug("Running Ansible command `%s` ...",
                                    cmdline)
            rc = call(cmd, env=ansible_env, bufsize=1, close_fds=True)
            # check outcome
            ok = False  # pessimistic default
            if rc != 0:
                elasticluster.log.error(
                    "Command `%s` failed with exit code %d.", cmdline, rc)
            else:
                # even if Ansible exited with return code 0, the
                # playbook might still have failed -- so explicitly
                # check for a "done" report showing that each node ran
                # the playbook until the very last task
                cluster_hosts = set(node.name
                                    for node in cluster.get_all_nodes())
                done_hosts = set()
                for node_name in cluster_hosts:
                    try:
                        with open(node_name + '.log') as stream:
                            status = stream.read().strip()
                        if status == 'done':
                            done_hosts.add(node_name)
                    except (OSError, IOError):
                        # no status file for host, do not add it to
                        # `done_hosts`
                        pass
                if done_hosts == cluster_hosts:
                    # success!
                    ok = True
                elif len(done_hosts) == 0:
                    # total failure
                    elasticluster.log.error(
                        "No host reported successfully running the setup playbook!"
                    )
                else:
                    # partial failure
                    elasticluster.log.error(
                        "The following nodes did not report"
                        " successful termination of the setup playbook:"
                        " %s", (', '.join(cluster_hosts - done_hosts)))
        if ok:
            elasticluster.log.info("Cluster correctly configured.")
            return True
        else:
            elasticluster.log.warning(
                "The cluster has likely *not* been configured correctly."
                " You may need to re-run `elasticluster setup`.")
            return False
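
The environment construction above layers three configuration sources in increasing order of precedence: built-in defaults, `ansible_*` keys from the ElastiCluster config file(s), and finally the caller's own process environment. A minimal standalone sketch of that merge order, using hypothetical values:

    import os

    # Merge order: defaults < config-file `ansible_*` keys < process env.
    defaults = {'ANSIBLE_TIMEOUT': '120', 'ANSIBLE_FORKS': '16'}
    config_overrides = {'ansible_timeout': '300'}  # hypothetical config entry

    ansible_env = dict(defaults)
    for k, v in config_overrides.items():
        if k.startswith('ansible_'):
            ansible_env[k.upper()] = str(v)  # config file beats defaults
    ansible_env.update(os.environ)           # the environment wins last

    # If the shell exported ANSIBLE_TIMEOUT=60, the final value is '60';
    # otherwise the config file's '300' applies.
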
    def start(self, min_nodes=None, max_concurrent_requests=0):
        """
        Starts up all the instances in the cloud.

        To speed things up, instances are started in separate
        threads. To make sure ElastiCluster is not interrupted while an
        instance is being created, the SIGINT handler is overridden;
        as soon as the last started instance has been saved to the
        repository, SIGINT handling resumes as usual.

        A VM instance is considered 'up and running' as soon as an SSH
        connection can be established. If the startup timeout is reached before
        all instances are started, ElastiCluster stops the cluster and
        terminates all VM instances.

        This method is blocking and might take some time, depending on
        the number of instances to start.

        :param min_nodes: minimum number of nodes to start in case the quota
                          is reached before all instances are up
        :type min_nodes: dict [node_kind] = number
        :param int max_concurrent_requests:
          Issue at most this number of requests to start
          VMs; if 1 or less, start nodes one at a time (sequentially).
          The special value ``0`` means run 4 threads for each available
          processor.
        """

        nodes = self.get_all_nodes()

        log.info("Starting cluster nodes ...")
        if max_concurrent_requests == 0:
            try:
                max_concurrent_requests = 4 * get_num_processors()
            except RuntimeError:
                log.warning("Cannot determine number of processors!"
                            " will start nodes sequentially...")
                max_concurrent_requests = 1
        if max_concurrent_requests > 1:
            nodes = self._start_nodes_parallel(nodes, max_concurrent_requests)
        else:
            nodes = self._start_nodes_sequentially(nodes)

        # checkpoint cluster state
        self.repository.save_or_update(self)

        not_started_nodes = self._check_starting_nodes(nodes,
                                                       self.startup_timeout)

        # now that all nodes are up, checkpoint cluster state again
        self.repository.save_or_update(self)

        # Try to connect to each node to gather IP addresses and SSH host keys
        log.info("Checking SSH connection to nodes ...")
        pending_nodes = nodes - not_started_nodes
        self._gather_node_ip_addresses(pending_nodes, self.startup_timeout)

        # It might be possible that the node.connect() call updated
        # the `preferred_ip` attribute, so, let's save the cluster
        # again.
        self.repository.save_or_update(self)

        # Many things can go wrong when starting the cluster. To ensure
        # the cluster matches the size the user asked for, check that
        # each node group has at least its requested minimum number of
        # nodes.
        min_nodes = self._compute_min_nodes(min_nodes)
        self._check_cluster_size(min_nodes)
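
For reference, here is a minimal sketch of the fan-out that `_start_nodes_parallel` performs, written against a thread pool and hypothetical `node.start()` / `node.name` members (the actual ElastiCluster implementation also swaps out the SIGINT handler while instances are being created, which is omitted here):

    from concurrent.futures import ThreadPoolExecutor, as_completed

    def start_nodes_parallel_sketch(nodes, max_workers):
        """Start each node in a worker thread; return the set that started."""
        started = set()
        with ThreadPoolExecutor(max_workers=max_workers) as pool:
            future_to_node = {pool.submit(node.start): node for node in nodes}
            for future in as_completed(future_to_node):
                node = future_to_node[future]
                try:
                    future.result()  # re-raises any exception from node.start()
                    started.add(node)
                except Exception as err:
                    # log and continue: other nodes may still come up
                    print("Could not start node %s: %s" % (node.name, err))
        return started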