Ejemplo n.º 1
0
Archivo: ops.py Proyecto: turu/sahara
def _provision_cluster(cluster_id):
    """Provision a cluster end to end and start its pending job executions.

    The cluster is moved through the "InfraUpdating", "Configuring",
    "Starting" and "Active" statuses. After each stage the function checks
    whether the cluster still exists; if it does not, the deletion message
    is logged and the function returns early.

    An exception raised by the plugin's configure or start stage is logged,
    the cluster is put into the "Error" status and the function returns
    without re-raising. Exceptions from ``plugin.update_infra`` and
    ``INFRA.create_cluster`` are not handled here.

    :param cluster_id: id of the cluster to provision
    """
    ctx, cluster, plugin = _prepare_provisioning(cluster_id)

    # A trust is created only when identity API v3 is enabled and the
    # cluster is transient.
    if CONF.use_identity_api_v3 and cluster.is_transient:
        trusts.create_trust_for_cluster(cluster)

    # updating cluster infra
    cluster = g.change_cluster_status(cluster, "InfraUpdating")
    plugin.update_infra(cluster)

    # creating instances and configuring them
    # (the cluster is re-read from the conductor before it is handed to
    # the infrastructure engine)
    cluster = conductor.cluster_get(ctx, cluster_id)
    INFRA.create_cluster(cluster)

    if not g.check_cluster_exists(cluster):
        LOG.info(g.format_cluster_deleted_message(cluster))
        return

    # configure cluster
    cluster = g.change_cluster_status(cluster, "Configuring")
    try:
        plugin.configure_cluster(cluster)
    except Exception as ex:
        # A failure on a cluster that no longer exists is only reported as
        # a deletion; otherwise the cluster is marked as "Error".
        if not g.check_cluster_exists(cluster):
            LOG.info(g.format_cluster_deleted_message(cluster))
            return
        LOG.exception(
            _LE("Can't configure cluster '%(name)s' (reason: %(reason)s)"),
            {'name': cluster.name, 'reason': ex})
        g.change_cluster_status(cluster, "Error")
        return

    if not g.check_cluster_exists(cluster):
        LOG.info(g.format_cluster_deleted_message(cluster))
        return

    # starting prepared and configured cluster
    cluster = g.change_cluster_status(cluster, "Starting")
    try:
        plugin.start_cluster(cluster)
    except Exception as ex:
        # Same handling as for the configure stage above.
        if not g.check_cluster_exists(cluster):
            LOG.info(g.format_cluster_deleted_message(cluster))
            return
        LOG.exception(
            _LE("Can't start services for cluster '%(name)s' (reason: "
                "%(reason)s)"), {'name': cluster.name, 'reason': ex})
        g.change_cluster_status(cluster, "Error")
        return

    if not g.check_cluster_exists(cluster):
        LOG.info(g.format_cluster_deleted_message(cluster))
        return

    # cluster is now up and ready
    cluster = g.change_cluster_status(cluster, "Active")

    # schedule execution pending job for cluster
    for je in conductor.job_execution_get_all(ctx, cluster_id=cluster.id):
        job_manager.run_job(je.id)
Ejemplo n.º 2
0
    def scale_cluster(self, cluster, node_group_id_map):
        """Scale the cluster's instances and return the resulting ids.

        The instances are scaled via ``_scale_cluster_instances``, empty
        node groups are cleaned from the cluster, and the method then waits
        for the instances to become active and for their networks, assigns
        floating IPs and attaches volumes. If the cluster disappears in the
        meantime an empty list is returned.

        On any exception the scaling is rolled back, empty node groups are
        cleaned again and the cluster status is set to "Active".

        :param cluster: cluster to scale
        :param node_group_id_map: passed through to
            ``_scale_cluster_instances`` (presumably node group id -> target
            count; confirm against that helper)
        :returns: the ids returned by ``_scale_cluster_instances``, or an
            empty list when the cluster no longer exists
        """
        ctx = context.ctx()

        instance_ids = []
        try:
            instance_ids = self._scale_cluster_instances(cluster,
                                                         node_group_id_map)

            cluster = conductor.cluster_get(ctx, cluster)
            g.clean_cluster_from_empty_ng(cluster)

            # re-read the cluster after the cleanup before collecting the
            # instance objects for the returned ids
            cluster = conductor.cluster_get(ctx, cluster)
            instances = g.get_instances(cluster, instance_ids)

            self._await_active(cluster, instances)

            if not g.check_cluster_exists(cluster):
                LOG.info(g.format_cluster_deleted_message(cluster))
                return []

            self._assign_floating_ips(instances)

            self._await_networks(cluster, instances)

            if not g.check_cluster_exists(cluster):
                LOG.info(g.format_cluster_deleted_message(cluster))
                return []

            cluster = conductor.cluster_get(ctx, cluster)

            volumes.attach_to_instances(
                g.get_instances(cluster, instance_ids))

        except Exception as ex:
            # NOTE(review): save_and_reraise_exception presumably re-raises
            # the saved exception when the ``with`` block exits, including
            # via ``return``; if so, the ``return []`` below and the code
            # after this handler are not reached on failure - confirm.
            with excutils.save_and_reraise_exception():
                if not g.check_cluster_exists(cluster):
                    LOG.info(g.format_cluster_deleted_message(cluster))
                    return []

                self._log_operation_exception(
                    "Can't scale cluster '%s' (reason: %s)", cluster, ex)

                cluster = conductor.cluster_get(ctx, cluster)
                self._rollback_cluster_scaling(
                    cluster, g.get_instances(cluster, instance_ids), ex)
                instance_ids = []

                cluster = conductor.cluster_get(ctx, cluster)
                g.clean_cluster_from_empty_ng(cluster)
                cluster = conductor.cluster_update(ctx, cluster,
                                                   {"status": "Active"})

                LOG.info(g.format_cluster_status(cluster))

        # we should be here with valid cluster: if instances creation
        # was not successful all extra-instances will be removed above
        if instance_ids:
            self._configure_instances(cluster)
        return instance_ids
Ejemplo n.º 3
0
    def create_cluster(self, cluster):
        """Create, wait for and prepare all instances of the cluster.

        The cluster is moved through the "Spawning", "Waiting" and
        "Preparing" statuses while instances are created, awaited, given
        floating IPs, have volumes attached and are configured. If the
        cluster disappears while waiting, the method logs the deletion and
        returns.

        On any exception the cluster is put into the "Error" status with the
        exception text as ``status_description`` and the creation is rolled
        back.

        :param cluster: cluster whose instances should be created
        """
        ctx = context.ctx()
        try:
            # create all instances
            # Keep the object returned by the update so that the status
            # logged below is the one just written, as done for every other
            # status change in this method.
            cluster = conductor.cluster_update(ctx, cluster,
                                               {"status": "Spawning"})
            LOG.info(g.format_cluster_status(cluster))
            self._create_instances(cluster)

            # wait for all instances are up and networks ready
            cluster = conductor.cluster_update(ctx, cluster,
                                               {"status": "Waiting"})
            LOG.info(g.format_cluster_status(cluster))

            instances = g.get_instances(cluster)

            self._await_active(cluster, instances)

            if not g.check_cluster_exists(cluster):
                LOG.info(g.format_cluster_deleted_message(cluster))
                return

            self._assign_floating_ips(instances)

            self._await_networks(cluster, instances)

            if not g.check_cluster_exists(cluster):
                LOG.info(g.format_cluster_deleted_message(cluster))
                return

            cluster = conductor.cluster_get(ctx, cluster)

            # attach volumes
            volumes.attach(cluster)

            # prepare all instances
            cluster = conductor.cluster_update(ctx, cluster,
                                               {"status": "Preparing"})
            LOG.info(g.format_cluster_status(cluster))

            self._configure_instances(cluster)
        except Exception as ex:
            # NOTE(review): save_and_reraise_exception presumably re-raises
            # the saved exception when the ``with`` block exits, so callers
            # should still expect the original exception - confirm.
            with excutils.save_and_reraise_exception():
                if not g.check_cluster_exists(cluster):
                    LOG.info(g.format_cluster_deleted_message(cluster))
                    return

                self._log_operation_exception(
                    "Can't start cluster '%s' (reason: %s)", cluster, ex)

                cluster = conductor.cluster_update(
                    ctx, cluster, {"status": "Error",
                                   "status_description": str(ex)})
                LOG.info(g.format_cluster_status(cluster))
                self._rollback_cluster_creation(cluster, ex)
Ejemplo n.º 4
0
Archivo: ops.py Proyecto: B-Rich/sahara
def _provision_scaled_cluster(cluster_id, node_group_id_map):
    """Scale an existing cluster to the requested node group sizes.

    The cluster is moved through the "Decommissioning", "Scaling" and, when
    the infrastructure engine reports instances, "Configuring" statuses, and
    finally to "Active". Each status change is logged.

    An exception raised while the plugin handles the new instances is
    logged and the cluster is put into the "Error" status; if the cluster no
    longer exists only the deletion message is logged.

    :param cluster_id: id of the cluster to scale
    :param node_group_id_map: mapping indexed by node group id that yields
        the new instance count of that node group
    """
    ctx, cluster, plugin = _prepare_provisioning(cluster_id)

    # Decommissioning surplus nodes with the plugin

    cluster = conductor.cluster_update(ctx, cluster,
                                       {"status": "Decommissioning"})
    LOG.info(g.format_cluster_status(cluster))

    instances_to_delete = []

    # For every node group that shrinks, the instances from index
    # ``new_count`` up to the current count are selected for removal.
    for node_group in cluster.node_groups:
        new_count = node_group_id_map[node_group.id]
        if new_count < node_group.count:
            instances_to_delete += node_group.instances[new_count:
                                                        node_group.count]

    if instances_to_delete:
        plugin.decommission_nodes(cluster, instances_to_delete)

    # Scaling infrastructure
    cluster = conductor.cluster_update(ctx, cluster, {"status": "Scaling"})
    LOG.info(g.format_cluster_status(cluster))

    instances = INFRA.scale_cluster(cluster, node_group_id_map)

    # Setting up new nodes with the plugin

    if instances:
        cluster = conductor.cluster_update(ctx, cluster,
                                           {"status": "Configuring"})
        LOG.info(g.format_cluster_status(cluster))
        try:
            # what INFRA.scale_cluster returned is resolved to instance
            # objects before being passed to the plugin
            instances = g.get_instances(cluster, instances)
            plugin.scale_cluster(cluster, instances)
        except Exception as ex:
            if not g.check_cluster_exists(cluster):
                LOG.info(g.format_cluster_deleted_message(cluster))
                return
            LOG.exception("Can't scale cluster '%s' (reason: %s)",
                          cluster.name, ex)
            cluster = conductor.cluster_update(ctx, cluster,
                                               {"status": "Error"})
            LOG.info(g.format_cluster_status(cluster))
            return

    if not g.check_cluster_exists(cluster):
        LOG.info(g.format_cluster_deleted_message(cluster))
        return

    cluster = conductor.cluster_update(ctx, cluster, {"status": "Active"})
    LOG.info(g.format_cluster_status(cluster))
Ejemplo n.º 5
0
    def create_cluster(self, cluster):
        """Launch the cluster's instances through a ``_CreateLauncher``.

        The current node group counts are remembered, then nullified, and
        the launcher is asked to bring the re-read cluster to the remembered
        counts. On any exception the cluster is put into the "Error" status
        (with the exception text as description) and the creation is rolled
        back; if the cluster no longer exists only the deletion is logged.

        :param cluster: cluster whose instances should be launched
        """
        current_ctx = context.ctx()
        instance_launcher = _CreateLauncher()

        try:
            desired_counts = self._get_ng_counts(cluster)
            self._nullify_ng_counts(cluster)

            cluster = conductor.cluster_get(current_ctx, cluster)
            instance_launcher.launch_instances(
                current_ctx, cluster, desired_counts)
        except Exception as failure:
            with excutils.save_and_reraise_exception():
                still_exists = g.check_cluster_exists(cluster)
                if not still_exists:
                    LOG.info(g.format_cluster_deleted_message(cluster))
                    return

                self._log_operation_exception(
                    "Can't start cluster '%s' (reason: %s)", cluster,
                    failure)

                error_values = {
                    "status": "Error",
                    "status_description": str(failure),
                }
                cluster = conductor.cluster_update(
                    current_ctx, cluster, error_values)
                LOG.info(g.format_cluster_status(cluster))
                self._rollback_cluster_creation(cluster)
Ejemplo n.º 6
0
    def create_cluster(self, cluster):
        """Launch the cluster's instances and then add its volumes.

        The current node group counts are remembered, then nullified, the
        launcher brings the re-read cluster to the remembered counts, and
        volumes are added to the cluster re-read once more. On any exception
        the cluster is put into the "Error" status (with the exception text
        as description) and the creation is rolled back; if the cluster no
        longer exists only the deletion is logged.

        :param cluster: cluster whose instances should be launched
        """
        current_ctx = context.ctx()
        instance_launcher = _CreateLauncher()

        try:
            desired_counts = self._get_ng_counts(cluster)
            self._nullify_ng_counts(cluster)

            cluster = conductor.cluster_get(current_ctx, cluster)
            instance_launcher.launch_instances(
                current_ctx, cluster, desired_counts)

            cluster = conductor.cluster_get(current_ctx, cluster)
            self._add_volumes(current_ctx, cluster)
        except Exception as failure:
            with excutils.save_and_reraise_exception():
                still_exists = g.check_cluster_exists(cluster)
                if not still_exists:
                    LOG.info(g.format_cluster_deleted_message(cluster))
                    return

                warning_template = _LW("Can't start cluster '%(cluster)s' "
                                       "(reason: %(reason)s)")
                self._log_operation_exception(
                    warning_template, cluster, failure)

                cluster = g.change_cluster_status(
                    cluster, "Error",
                    status_description=six.text_type(failure))
                self._rollback_cluster_creation(cluster)
Ejemplo n.º 7
0
    def launch_instances(self, ctx, cluster, target_count):
        """Instantiate the Heat stack for the cluster and prepare instances.

        The cluster status is set in turn to ``self.STAGES[0]``,
        ``self.STAGES[1]`` and ``self.STAGES[2]``. The ids produced by
        ``_populate_cluster`` are stored in ``self.inst_ids``. If the
        cluster no longer exists after the networks were awaited, the
        deletion is logged and the method returns before the last stage.

        :param ctx: context passed to the conductor and the helpers
        :param cluster: cluster to launch instances for
        :param target_count: passed through to ``_configure_template``
        """
        # stage 0: build the template and bring the stack up
        spawn_status = {"status": self.STAGES[0]}
        cluster = conductor.cluster_update(ctx, cluster, spawn_status)
        LOG.info(g.format_cluster_status(cluster))

        template = heat.ClusterTemplate(cluster)
        self._configure_template(ctx, template, cluster, target_count)

        heat_stack = template.instantiate(update_existing=self.UPDATE_STACK)
        heat_stack.wait_till_active()

        self.inst_ids = self._populate_cluster(ctx, cluster, heat_stack)

        # stage 1: wait until the networks of the instances are ready
        wait_status = {"status": self.STAGES[1]}
        cluster = conductor.cluster_update(ctx, cluster, wait_status)
        LOG.info(g.format_cluster_status(cluster))

        launched = g.get_instances(cluster, self.inst_ids)
        self._await_networks(cluster, launched)

        cluster_alive = g.check_cluster_exists(cluster)
        if not cluster_alive:
            LOG.info(g.format_cluster_deleted_message(cluster))
            return

        # stage 2: mount volumes and configure the instances
        prepare_status = {"status": self.STAGES[2]}
        cluster = conductor.cluster_update(ctx, cluster, prepare_status)
        LOG.info(g.format_cluster_status(cluster))

        launched = g.get_instances(cluster, self.inst_ids)
        volumes.mount_to_instances(launched)

        self._configure_instances(cluster)
Ejemplo n.º 8
0
    def _await_networks(self, cluster, instances):
        """Wait until every instance has its IPs and is accessible.

        Polls ``networks.init_instances_ips`` once per second until it
        succeeded for all instances, then spawns one
        ``_wait_until_accessible`` task per instance in a thread group.
        Returns early when there are no instances or when the cluster no
        longer exists.

        :param cluster: cluster the instances belong to
        :param instances: instances to wait for
        """
        if not instances:
            return

        with_ips = set()
        while len(with_ips) != len(instances):
            if not g.check_cluster_exists(cluster):
                return

            for candidate in instances:
                if candidate.id in with_ips:
                    continue
                if networks.init_instances_ips(candidate):
                    with_ips.add(candidate.id)

            context.sleep(1)

        LOG.info(
            _LI("Cluster '%s': all instances have IPs assigned"), cluster.id)

        # re-read the cluster and its instances before the accessibility
        # checks are started
        cluster = conductor.cluster_get(context.ctx(), cluster)
        refreshed = g.get_instances(cluster, with_ips)

        with context.ThreadGroup() as group:
            for node in refreshed:
                thread_name = "wait-for-ssh-%s" % node.instance_name
                group.spawn(thread_name, self._wait_until_accessible, node)

        LOG.info(_LI("Cluster '%s': all instances are accessible"), cluster.id)
Ejemplo n.º 9
0
Archivo: ops.py Proyecto: turu/sahara
def _provision_scaled_cluster(cluster_id, node_group_id_map):
    """Scale an existing cluster to the requested node group sizes.

    The cluster is moved through the "Decommissioning", "Scaling" and, when
    the infrastructure engine reports instances, "Configuring" statuses, and
    finally to "Active".

    An exception raised while the plugin handles the new instances is
    logged and the cluster is put into the "Error" status; if the cluster no
    longer exists only the deletion message is logged.

    :param cluster_id: id of the cluster to scale
    :param node_group_id_map: mapping indexed by node group id that yields
        the new instance count of that node group
    """
    ctx, cluster, plugin = _prepare_provisioning(cluster_id)

    # Decommissioning surplus nodes with the plugin
    cluster = g.change_cluster_status(cluster, "Decommissioning")

    instances_to_delete = []

    # For every node group that shrinks, the instances from index
    # ``new_count`` up to the current count are selected for removal.
    for node_group in cluster.node_groups:
        new_count = node_group_id_map[node_group.id]
        if new_count < node_group.count:
            instances_to_delete += node_group.instances[new_count:
                                                        node_group.count]

    if instances_to_delete:
        plugin.decommission_nodes(cluster, instances_to_delete)

    # Scaling infrastructure
    cluster = g.change_cluster_status(cluster, "Scaling")

    instances = INFRA.scale_cluster(cluster, node_group_id_map)

    # Setting up new nodes with the plugin

    if instances:
        cluster = g.change_cluster_status(cluster, "Configuring")
        try:
            # what INFRA.scale_cluster returned is resolved to instance
            # objects before being passed to the plugin
            instances = g.get_instances(cluster, instances)
            plugin.scale_cluster(cluster, instances)
        except Exception as ex:
            if not g.check_cluster_exists(cluster):
                LOG.info(g.format_cluster_deleted_message(cluster))
                return
            LOG.exception(
                _LE("Can't scale cluster '%(name)s' (reason: %(reason)s)"),
                {'name': cluster.name, 'reason': ex})

            g.change_cluster_status(cluster, "Error")
            return

    if not g.check_cluster_exists(cluster):
        LOG.info(g.format_cluster_deleted_message(cluster))
        return

    g.change_cluster_status(cluster, "Active")
Ejemplo n.º 10
0
 def _check_active(self, active_ids, cluster, instances):
     """Collect newly active instances and tell whether all are active.

     Returns True at once when the cluster no longer exists. Otherwise each
     instance not yet in ``active_ids`` is checked with
     ``_check_if_active``; on success its id is added to ``active_ids`` and
     a successful event is recorded. The result is whether ``active_ids``
     now holds as many ids as there are instances.
     """
     cluster_alive = g.check_cluster_exists(cluster)
     if not cluster_alive:
         return True

     for node in instances:
         if node.id in active_ids:
             continue
         if not self._check_if_active(node):
             continue
         active_ids.add(node.id)
         cpo.add_successful_event(node)

     return len(instances) == len(active_ids)
Ejemplo n.º 11
0
 def _ips_assign(self, ips_assigned, cluster, instances):
     """Initialise instance IPs and tell whether all instances have them.

     Returns True at once when the cluster no longer exists. Otherwise
     ``networks.init_instances_ips`` is tried for each instance not yet in
     ``ips_assigned``; on success its id is added to ``ips_assigned`` and a
     successful event is recorded. The result is whether ``ips_assigned``
     now holds as many ids as there are instances.
     """
     cluster_alive = g.check_cluster_exists(cluster)
     if not cluster_alive:
         return True

     for node in instances:
         if node.id in ips_assigned:
             continue
         if not networks.init_instances_ips(node):
             continue
         ips_assigned.add(node.id)
         cpo.add_successful_event(node)

     return len(ips_assigned) == len(instances)
Ejemplo n.º 12
0
 def _check_active(self, active_ids, cluster, instances):
     """Collect newly active instances and tell whether all are active.

     Returns True at once when the cluster no longer exists. Otherwise each
     instance not yet in ``active_ids`` is checked with
     ``_check_if_active``; on success its id is added to ``active_ids`` and
     a successful event is recorded. The result is whether ``active_ids``
     now holds as many ids as there are instances.
     """
     cluster_alive = g.check_cluster_exists(cluster)
     if not cluster_alive:
         return True

     for node in instances:
         if node.id in active_ids:
             continue
         if not self._check_if_active(node):
             continue
         active_ids.add(node.id)
         cpo.add_successful_event(node)

     return len(instances) == len(active_ids)
Ejemplo n.º 13
0
 def _ips_assign(self, ips_assigned, cluster, instances):
     """Initialise instance IPs and tell whether all instances have them.

     Returns True at once when the cluster no longer exists. Otherwise
     ``networks.init_instances_ips`` is tried for each instance not yet in
     ``ips_assigned``; on success its id is added to ``ips_assigned`` and a
     successful event is recorded. The result is whether ``ips_assigned``
     now holds as many ids as there are instances.
     """
     cluster_alive = g.check_cluster_exists(cluster)
     if not cluster_alive:
         return True

     for node in instances:
         if node.id in ips_assigned:
             continue
         if not networks.init_instances_ips(node):
             continue
         ips_assigned.add(node.id)
         cpo.add_successful_event(node)

     return len(ips_assigned) == len(instances)
Ejemplo n.º 14
0
    def scale_cluster(self, cluster, target_count):
        """Scale the cluster through a ``_ScaleLauncher``.

        The node group counts present before scaling are remembered so that
        a failed launch can be rolled back to them. After a successful
        rollback the cluster is set back to "Active"; if the rollback itself
        fails the cluster is put into the "Error" status and the rollback
        exception is raised. In every case empty node groups are cleaned
        from the re-read cluster before the method finishes.

        :param cluster: cluster to scale
        :param target_count: passed through to the launcher and to the
            rollback helper
        :returns: ``inst_ids`` of the launcher
        """
        ctx = context.ctx()

        rollback_count = self._get_ng_counts(cluster)

        launcher = _ScaleLauncher()

        try:
            launcher.launch_instances(ctx, cluster, target_count)
        except Exception as ex:
            # NOTE(review): save_and_reraise_exception presumably re-raises
            # the saved exception when the ``with`` block exits, including
            # via ``return`` - confirm.
            with excutils.save_and_reraise_exception():
                if not g.check_cluster_exists(cluster):
                    LOG.info(g.format_cluster_deleted_message(cluster))
                    return
                self._log_operation_exception(
                    "Can't scale cluster '%s' (reason: %s)", cluster, ex)

                cluster = conductor.cluster_get(ctx, cluster)

                try:
                    self._rollback_cluster_scaling(
                        ctx, cluster, rollback_count, target_count)
                except Exception:
                    if not g.check_cluster_exists(cluster):
                        LOG.info(g.format_cluster_deleted_message(cluster))
                        return
                    # if something fails during the rollback, we stop
                    # doing anything further
                    cluster = conductor.cluster_update(ctx, cluster,
                                                       {"status": "Error"})
                    LOG.info(g.format_cluster_status(cluster))
                    LOG.error("Unable to complete rollback, aborting")
                    raise

                cluster = conductor.cluster_update(ctx, cluster,
                                                   {"status": "Active"})
                LOG.info(g.format_cluster_status(cluster))
                # ``warn`` is a deprecated alias of ``warning``.
                LOG.warning(
                    "Rollback successful. Throwing off an initial exception.")
        finally:
            cluster = conductor.cluster_get(ctx, cluster)
            g.clean_cluster_from_empty_ng(cluster)

        return launcher.inst_ids
Ejemplo n.º 15
0
def add_provisioning_step(cluster_id, step_name, total):
    """Register a new provisioning step for the cluster.

    Nothing is done (and None is returned) when the event log is disabled
    or the cluster does not exist. Otherwise the existing steps are updated
    first and a step with zero completed events is added.

    :param cluster_id: id of the cluster the step belongs to
    :param step_name: value stored as 'step_name'
    :param total: value stored as 'total'
    :returns: whatever ``conductor.cluster_provision_step_add`` returns
    """
    if CONF.disable_event_log:
        return
    if not g.check_cluster_exists(cluster_id):
        return

    update_provisioning_steps(cluster_id)

    current_ctx = context.ctx()
    step_values = {
        'step_name': step_name,
        'completed': 0,
        'total': total,
        'started_at': timeutils.utcnow(),
    }
    return conductor.cluster_provision_step_add(
        current_ctx, cluster_id, step_values)
Ejemplo n.º 16
0
    def _check_deleted(self, deleted_ids, cluster, instances):
        """Collect newly deleted instances and tell whether all are gone.

        Returns True at once when the cluster no longer exists. Otherwise
        each instance not yet in ``deleted_ids`` is checked with
        ``_check_if_deleted``; on success the deletion is logged, the id is
        added to ``deleted_ids`` and a successful event is recorded.
        """
        cluster_alive = g.check_cluster_exists(cluster)
        if not cluster_alive:
            return True

        for node in instances:
            if node.id in deleted_ids:
                continue
            if not self._check_if_deleted(node):
                continue
            message = "Instance {instance} is deleted".format(
                instance=node.instance_name)
            LOG.debug(message)
            deleted_ids.add(node.id)
            cpo.add_successful_event(node)

        return len(deleted_ids) == len(instances)
Ejemplo n.º 17
0
    def _check_deleted(self, deleted_ids, cluster, instances):
        """Collect newly deleted instances and tell whether all are gone.

        Returns True at once when the cluster no longer exists. Otherwise
        each instance not yet in ``deleted_ids`` is checked with
        ``_check_if_deleted`` while its ``instance_id`` is set as the
        current instance id in the context; on success the deletion is
        logged, the id is added to ``deleted_ids`` and a successful event is
        recorded.
        """
        cluster_alive = g.check_cluster_exists(cluster)
        if not cluster_alive:
            return True

        for node in instances:
            if node.id in deleted_ids:
                continue
            with context.set_current_instance_id(node.instance_id):
                if not self._check_if_deleted(node):
                    continue
                LOG.debug("Instance is deleted")
                deleted_ids.add(node.id)
                cpo.add_successful_event(node)

        return len(deleted_ids) == len(instances)
Ejemplo n.º 18
0
    def _check_deleted(self, deleted_ids, cluster, instances):
        """Collect newly deleted instances and tell whether all are gone.

        Returns True at once when the cluster no longer exists. Otherwise
        each instance not yet in ``deleted_ids`` is checked with
        ``_check_if_deleted`` while its ``instance_id`` is set as the
        current instance id in the context; on success the deletion is
        logged, the id is added to ``deleted_ids`` and a successful event is
        recorded.
        """
        cluster_alive = g.check_cluster_exists(cluster)
        if not cluster_alive:
            return True

        for node in instances:
            if node.id in deleted_ids:
                continue
            with context.set_current_instance_id(node.instance_id):
                if not self._check_if_deleted(node):
                    continue
                LOG.debug("Instance is deleted")
                deleted_ids.add(node.id)
                cpo.add_successful_event(node)

        return len(deleted_ids) == len(instances)
Ejemplo n.º 19
0
    def _check_deleted(self, deleted_ids, cluster, instances):
        """Collect newly deleted instances and tell whether all are gone.

        Returns True at once when the cluster no longer exists. Otherwise
        each instance not yet in ``deleted_ids`` is checked with
        ``_check_if_deleted``; on success the deletion is logged, the id is
        added to ``deleted_ids`` and a successful event is recorded.
        """
        cluster_alive = g.check_cluster_exists(cluster)
        if not cluster_alive:
            return True

        for node in instances:
            if node.id in deleted_ids:
                continue
            if not self._check_if_deleted(node):
                continue
            message = "Instance {instance} is deleted".format(
                instance=node.instance_name)
            LOG.debug(message)
            deleted_ids.add(node.id)
            cpo.add_successful_event(node)

        return len(deleted_ids) == len(instances)
Ejemplo n.º 20
0
def get_current_provisioning_step(cluster_id):
    """Return the id of the first provisioning step without an outcome.

    None is returned when the event log is disabled, when the cluster does
    not exist, or when every step of the cluster already has a value in
    ``successful``. The provisioning steps are updated before the lookup.

    :param cluster_id: id of the cluster to inspect
    """
    if CONF.disable_event_log:
        return None
    if not g.check_cluster_exists(cluster_id):
        return None

    update_provisioning_steps(cluster_id)
    cluster = conductor.cluster_get(context.ctx(), cluster_id)

    # steps are scanned in order; the first one still undecided wins
    undecided_ids = (
        progress_step.id
        for progress_step in cluster.provision_progress
        if progress_step.successful is None
    )
    return next(undecided_ids, None)
Ejemplo n.º 21
0
def get_cluster_events(cluster_id, provision_step=None):
    """Return provisioning events of one step or of the whole cluster.

    An empty list is returned when the event log is disabled or the cluster
    does not exist. The provisioning steps are updated before the events
    are read.

    :param cluster_id: id of the cluster to inspect
    :param provision_step: when truthy, only the events of this step are
        fetched; otherwise the events of every step in the cluster's
        'provision_progress' are concatenated
    """
    if CONF.disable_event_log:
        return []
    if not g.check_cluster_exists(cluster_id):
        return []

    update_provisioning_steps(cluster_id)

    if provision_step:
        return conductor.cluster_provision_step_get_events(
            context.ctx(), provision_step)

    cluster = conductor.cluster_get(context.ctx(), cluster_id)
    collected = []
    for progress_step in cluster['provision_progress']:
        progress_step_id = progress_step['id']
        step_events = conductor.cluster_provision_step_get_events(
            context.ctx(), progress_step_id)
        collected.extend(step_events)
    return collected
Ejemplo n.º 22
0
def await_datanodes(cluster):
    """Block until the cluster's datanodes are up or the cluster is gone.

    Returns immediately when the cluster has no datanodes. Otherwise the
    datanode count is checked through the namenode's remote once per second
    until ``_check_datanodes_count`` succeeds or the cluster no longer
    exists.

    :param cluster: cluster whose datanodes are awaited
    """
    datanodes_count = len(vu.get_datanodes(cluster))
    if datanodes_count < 1:
        return

    # Log arguments are passed to the logger instead of being interpolated
    # up front, so formatting only happens when the record is emitted.
    LOG.info("Waiting %s datanodes to start up", datanodes_count)
    with vu.get_namenode(cluster).remote() as r:
        while True:
            if _check_datanodes_count(r, datanodes_count):
                LOG.info("Datanodes on cluster %s has been started",
                         cluster.name)
                return

            context.sleep(1)

            if not g.check_cluster_exists(cluster):
                LOG.info("Stop waiting datanodes on cluster %s since it has "
                         "been deleted", cluster.name)
                return
Ejemplo n.º 23
0
    def _await_deleted(self, cluster, instances):
        """Await all instances are deleted.

        Polls ``_check_if_deleted`` once per second for every instance not
        yet seen as deleted. Returns early when there are no instances or
        when the cluster no longer exists.

        :param cluster: cluster the instances belong to
        :param instances: instances whose deletion is awaited
        """
        if not instances:
            return

        deleted_ids = set()
        while len(deleted_ids) != len(instances):
            if not g.check_cluster_exists(cluster):
                return
            for instance in instances:
                if instance.id not in deleted_ids:
                    if self._check_if_deleted(instance):
                        # The name is passed as a logger argument so that it
                        # is only formatted when debug logging is enabled.
                        LOG.debug("Instance '%s' is deleted",
                                  instance.instance_name)
                        deleted_ids.add(instance.id)

            context.sleep(1)
Ejemplo n.º 24
0
    def _await_active(self, cluster, instances):
        """Wait until ``_check_if_active`` succeeded for every instance.

        The instances are polled once per second. The method returns early
        when there are no instances or when the cluster no longer exists;
        otherwise it logs that all instances are active.
        """
        if not instances:
            return

        ready_ids = set()
        while len(ready_ids) != len(instances):
            if not g.check_cluster_exists(cluster):
                return

            for node in instances:
                if node.id in ready_ids:
                    continue
                if self._check_if_active(node):
                    ready_ids.add(node.id)

            context.sleep(1)

        LOG.info(_LI("Cluster '%s': all instances are active"), cluster.id)
Ejemplo n.º 25
0
    def _await_deleted(self, cluster, instances):
        """Await all instances are deleted.

        Polls ``_check_if_deleted`` once per second for every instance not
        yet seen as deleted. Returns early when there are no instances or
        when the cluster no longer exists.

        :param cluster: cluster the instances belong to
        :param instances: instances whose deletion is awaited
        """
        if not instances:
            return

        deleted_ids = set()
        while len(deleted_ids) != len(instances):
            if not g.check_cluster_exists(cluster):
                return
            for instance in instances:
                if instance.id not in deleted_ids:
                    if self._check_if_deleted(instance):
                        # The name is passed as a logger argument so that it
                        # is only formatted when debug logging is enabled.
                        LOG.debug("Instance '%s' is deleted",
                                  instance.instance_name)
                        deleted_ids.add(instance.id)

            context.sleep(1)
Ejemplo n.º 26
0
    def _await_active(self, cluster, instances):
        """Wait until ``_check_if_active`` succeeded for every instance.

        The instances are polled once per second. The method returns early
        when there are no instances or when the cluster no longer exists;
        otherwise it logs that all instances are active.
        """
        if not instances:
            return

        ready_ids = set()
        while len(ready_ids) != len(instances):
            if not g.check_cluster_exists(cluster):
                return

            for node in instances:
                if node.id in ready_ids:
                    continue
                if self._check_if_active(node):
                    ready_ids.add(node.id)

            context.sleep(1)

        LOG.info(_LI("Cluster '%s': all instances are active"), cluster.id)
Ejemplo n.º 27
0
def add_provisioning_step(cluster_id, step_name, total):
    """Add a provisioning step and make it the context's current step.

    Nothing is done (and None is returned) when the event log is disabled
    or the cluster does not exist. If the cluster has a current step, it is
    passed to ``conductor.cluster_provision_step_update`` first. The new
    step takes its type from the context's ``current_instance_info``, and
    the value returned by the conductor is stored there as ``step_id``.

    :param cluster_id: id of the cluster the step belongs to
    :param step_name: value stored as 'step_name'
    :param total: value stored as 'total'
    :returns: whatever ``conductor.cluster_provision_step_add`` returns
    """
    if CONF.disable_event_log:
        return
    if not g.check_cluster_exists(cluster_id):
        return

    previous_step = get_current_provisioning_step(cluster_id)
    if previous_step:
        conductor.cluster_provision_step_update(
            context.ctx(), previous_step)

    kind = context.ctx().current_instance_info.step_type
    created_step = conductor.cluster_provision_step_add(
        context.ctx(),
        cluster_id,
        {
            'step_name': step_name,
            'step_type': kind,
            'total': total,
            'started_at': timeutils.utcnow(),
        })

    instance_info = context.current().current_instance_info
    instance_info.step_id = created_step
    return created_step
Ejemplo n.º 28
0
def add_provisioning_step(cluster_id, step_name, total):
    """Add a provisioning step and make it the context's current step.

    Nothing is done (and None is returned) when the event log is disabled
    or the cluster does not exist. If the cluster has a current step, it is
    passed to ``conductor.cluster_provision_step_update`` first. The new
    step takes its type from the context's ``current_instance_info``, and
    the value returned by the conductor is stored there as ``step_id``.

    :param cluster_id: id of the cluster the step belongs to
    :param step_name: value stored as 'step_name'
    :param total: value stored as 'total'
    :returns: whatever ``conductor.cluster_provision_step_add`` returns
    """
    if CONF.disable_event_log:
        return
    if not g.check_cluster_exists(cluster_id):
        return

    previous_step = get_current_provisioning_step(cluster_id)
    if previous_step:
        conductor.cluster_provision_step_update(
            context.ctx(), previous_step)

    kind = context.ctx().current_instance_info.step_type
    created_step = conductor.cluster_provision_step_add(
        context.ctx(),
        cluster_id,
        {
            'step_name': step_name,
            'step_type': kind,
            'total': total,
            'started_at': timeutils.utcnow(),
        })

    instance_info = context.current().current_instance_info
    instance_info.step_id = created_step
    return created_step
Ejemplo n.º 29
0
    def _is_accessible(self, instance):
        """Tell whether the instance can be reached through its remote.

        Returns True at once when the instance's cluster no longer exists.
        Otherwise ``ls .ssh/authorized_keys`` is run on the instance; the
        result is True only when the command exits with code 0. Any
        exception is logged at debug level and yields False.
        """
        if not g.check_cluster_exists(instance.cluster):
            return True

        accessible = False
        try:
            # ssh must be reachable and cloud-init must have finished
            # generating authorized_keys for the listing to succeed
            return_code, _output = instance.remote().execute_command(
                "ls .ssh/authorized_keys", raise_when_error=False)

            if return_code == 0:
                LOG.debug('Instance is accessible')
                accessible = True
        except Exception as error:
            LOG.debug("Can't login to node, IP: {mgmt_ip}, "
                      "reason {reason}".format(mgmt_ip=instance.management_ip,
                                               reason=error))

        return accessible
Ejemplo n.º 30
0
def await_datanodes(cluster):
    """Block until the cluster's datanodes are up or the cluster is gone.

    Returns immediately when the cluster has no datanodes. Otherwise the
    datanode count is checked through the namenode's remote once per second
    until ``_check_datanodes_count`` succeeds or the cluster no longer
    exists.

    :param cluster: cluster whose datanodes are awaited
    """
    datanodes_count = len(vu.get_datanodes(cluster))
    if datanodes_count < 1:
        return

    # Log arguments are passed to the logger instead of being interpolated
    # up front, so formatting only happens when the record is emitted.
    LOG.info("Waiting %s datanodes to start up", datanodes_count)
    with vu.get_namenode(cluster).remote() as r:
        while True:
            if _check_datanodes_count(r, datanodes_count):
                LOG.info('Datanodes on cluster %s has been started',
                         cluster.name)
                return

            context.sleep(1)

            if not g.check_cluster_exists(cluster):
                LOG.info('Stop waiting datanodes on cluster %s since it has '
                         'been deleted', cluster.name)
                return
Ejemplo n.º 31
0
    def _is_accessible(self, instance):
        """Tell whether the instance can be reached through its remote.

        Returns True at once when the instance's cluster no longer exists.
        Otherwise ``ls .ssh/authorized_keys`` is run on the instance; the
        result is True only when the command exits with code 0. Any
        exception is logged at debug level and yields False.
        """
        if not g.check_cluster_exists(instance.cluster):
            return True

        accessible = False
        try:
            # ssh must be reachable and cloud-init must have finished
            # generating authorized_keys for the listing to succeed
            return_code, _output = instance.remote().execute_command(
                "ls .ssh/authorized_keys", raise_when_error=False)

            if return_code == 0:
                LOG.debug('Instance is accessible')
                accessible = True
        except Exception as error:
            LOG.debug("Can't login to node, IP: {mgmt_ip}, "
                      "reason {reason}".format(mgmt_ip=instance.management_ip,
                                               reason=error))

        return accessible
Ejemplo n.º 32
0
    def _wait_until_accessible(self, instance):
        """Block until the instance answers over its remote.

        ``ls .ssh/authorized_keys`` is retried every five seconds until it
        exits with code 0. Exceptions raised by an attempt are logged at
        debug level and the attempt is repeated. The wait is abandoned when
        the instance's cluster no longer exists.

        :param instance: instance to wait for
        """
        while True:
            try:
                # check if ssh is accessible and cloud-init
                # script is finished generating authorized_keys
                exit_code, stdout = instance.remote().execute_command(
                    "ls .ssh/authorized_keys", raise_when_error=False)

                if exit_code == 0:
                    # Logger arguments are used here, as in the failure
                    # message below, so formatting is deferred to the logger.
                    LOG.debug('Instance %s is accessible',
                              instance.instance_name)
                    return
            except Exception as ex:
                LOG.debug("Can't login to node %s (%s), reason %s",
                          instance.instance_name, instance.management_ip, ex)

            context.sleep(5)

            if not g.check_cluster_exists(instance.node_group.cluster):
                return
Ejemplo n.º 33
0
    def _await_datanodes(self, cluster):
        """Block until the cluster's datanodes are up or the cluster is gone.

        Returns immediately when the cluster has no datanodes. Otherwise the
        datanode count is checked through a remote to the namenode once per
        second until ``run.check_datanodes_count`` succeeds or the cluster
        no longer exists.
        """
        expected = len(vu.get_datanodes(cluster))
        if expected < 1:
            return

        LOG.info(_LI("Waiting %s datanodes to start up"), expected)
        with remote.get_remote(vu.get_namenode(cluster)) as nn_remote:
            while not run.check_datanodes_count(nn_remote, expected):
                context.sleep(1)

                if not g.check_cluster_exists(cluster):
                    LOG.info(
                        _LI('Stop waiting datanodes on cluster %s since it has'
                            ' been deleted'), cluster.name)
                    return

            # the loop only ends normally once the count check succeeded
            LOG.info(_LI('Datanodes on cluster %s have been started'),
                     cluster.name)
Ejemplo n.º 34
0
    def _await_deleted(self, cluster, instances):
        """Await all instances are deleted.

        A provisioning step sized by the number of instances is added
        first. ``_check_if_deleted`` is then polled once per second for
        every instance not yet seen as deleted, and a successful event is
        recorded for each deletion. Returns early when there are no
        instances or when the cluster no longer exists.

        :param cluster: cluster the instances belong to
        :param instances: instances whose deletion is awaited
        """
        if not instances:
            return
        cpo.add_provisioning_step(
            cluster.id, _("Wait for instances to be deleted"), len(instances))

        deleted_ids = set()
        while len(deleted_ids) != len(instances):
            if not g.check_cluster_exists(cluster):
                return
            for instance in instances:
                if instance.id not in deleted_ids:
                    if self._check_if_deleted(instance):
                        # The name is passed as a logger argument so that it
                        # is only formatted when debug logging is enabled.
                        LOG.debug("Instance '%s' is deleted",
                                  instance.instance_name)
                        deleted_ids.add(instance.id)
                        cpo.add_successful_event(instance)

            context.sleep(1)
Ejemplo n.º 35
0
    def _await_datanodes(self, cluster):
        """Block until the cluster's datanodes are up or the cluster is gone.

        Returns immediately when the cluster has no datanodes. Otherwise the
        datanode count is checked through a remote to the namenode once per
        second until ``run.check_datanodes_count`` succeeds or the cluster
        no longer exists.
        """
        expected = len(vu.get_datanodes(cluster))
        if expected < 1:
            return

        LOG.info(_LI("Waiting %s datanodes to start up"), expected)
        with remote.get_remote(vu.get_namenode(cluster)) as nn_remote:
            while not run.check_datanodes_count(nn_remote, expected):
                context.sleep(1)

                if not g.check_cluster_exists(cluster):
                    LOG.info(
                        _LI('Stop waiting datanodes on cluster %s since it has'
                            ' been deleted'), cluster.name)
                    return

            # the loop only ends normally once the count check succeeded
            LOG.info(
                _LI('Datanodes on cluster %s has been started'),
                cluster.name)
Ejemplo n.º 36
0
def update_provisioning_steps(cluster_id):
    """Refresh the state of every still-undecided provisioning step.

    Does nothing when the event log is disabled or the cluster does not
    exist. For each step whose ``successful`` field is still ``None``, the
    step's events are counted: any unsuccessful event marks the step as
    failed, while a number of successful events equal to ``step.total``
    marks it as successful. The step record is then updated, and the events
    of a successful step are removed.
    """
    if CONF.disable_event_log:
        return
    if not g.check_cluster_exists(cluster_id):
        return

    ctx = context.ctx()
    cluster = conductor.cluster_get(ctx, cluster_id)

    for step in cluster.provision_progress:
        if step.successful is not None:
            # Outcome already decided for this step.
            continue

        done_count = 0
        any_failure = False
        for event in conductor.cluster_provision_step_get_events(
                ctx, step.id):
            if not event.successful:
                any_failure = True
                continue
            done_count += 1

        if any_failure:
            outcome = False
        elif done_count == step.total:
            outcome = True
        else:
            # Neither failed nor finished yet.
            outcome = None

        # Stamp a completion time only for a successful step that does not
        # already carry one.
        finished_at = (timeutils.utcnow()
                       if outcome and not step.completed_at else None)

        update_values = {
            'completed': done_count,
            'successful': outcome,
            'completed_at': finished_at,
        }
        conductor.cluster_provision_step_update(ctx, step.id, update_values)

        if outcome:
            conductor.cluster_provision_step_remove_events(ctx, step.id)
Ejemplo n.º 37
0
    def _await_active(self, cluster, instances):
        """Await all instances are in Active status and available."""
        if not instances:
            return

        cpo.add_provisioning_step(cluster.id,
                                  _("Wait for instances to become active"),
                                  len(instances))

        active_ids = set()
        while len(active_ids) != len(instances):
            if not g.check_cluster_exists(cluster):
                return
            for instance in instances:
                if instance.id not in active_ids:
                    if self._check_if_active(instance):
                        active_ids.add(instance.id)
                        cpo.add_successful_event(instance)

            context.sleep(1)

        LOG.info(_LI("Cluster '%s': all instances are active"), cluster.id)
Ejemplo n.º 38
0
        def handler(*args, **kwargs):
            # Wrapper around ``func`` that records provisioning events for
            # the instance found in the call arguments. When the event log
            # is disabled, or the instance's cluster does not exist, the
            # call is passed straight through with no bookkeeping.
            if CONF.disable_event_log:
                return func(*args, **kwargs)

            step_name = spec.get('step', None)
            target = _find_in_args(spec, *args, **kwargs)
            owner_cluster_id = target.cluster_id

            if not g.check_cluster_exists(owner_cluster_id):
                return func(*args, **kwargs)

            if step_name:
                # It's single process, let's add provisioning step here
                add_provisioning_step(owner_cluster_id, step_name, 1)

            try:
                result = func(*args, **kwargs)
            except Exception as exc:
                # Record the failure, then let the original exception
                # propagate to the caller.
                with excutils.save_and_reraise_exception():
                    add_fail_event(target, exc)
            else:
                if mark_successful_on_exit:
                    add_successful_event(target)

                return result
Ejemplo n.º 39
0
        def handler(*args, **kwargs):
            # Wrapper around ``func`` that records provisioning events for
            # the instance found in the call arguments. When the event log
            # is disabled, or the instance's cluster does not exist, the
            # call is passed straight through with no bookkeeping.
            if CONF.disable_event_log:
                return func(*args, **kwargs)

            step_name = spec.get('step', None)
            target = _find_in_args(spec, *args, **kwargs)
            owner_cluster_id = target.cluster_id

            if not g.check_cluster_exists(owner_cluster_id):
                return func(*args, **kwargs)

            if step_name:
                # It's single process, let's add provisioning step here
                add_provisioning_step(owner_cluster_id, step_name, 1)

            try:
                result = func(*args, **kwargs)
            except Exception as exc:
                # Record the failure, then let the original exception
                # propagate to the caller.
                with excutils.save_and_reraise_exception():
                    add_fail_event(target, exc)
            else:
                if mark_successful_on_exit:
                    add_successful_event(target)

                return result
Ejemplo n.º 40
0
 def _get(n_cluster, n_kwargs):
     """Return ``get_status(**n_kwargs)``, or True if the cluster is gone.

     ``general.check_cluster_exists(n_cluster)`` is consulted first; when it
     reports the cluster as missing, ``get_status`` is not called at all.
     """
     if general.check_cluster_exists(n_cluster):
         return get_status(**n_kwargs)
     return True
Ejemplo n.º 41
0
 def _get(n_cluster, n_kwargs):
     """Return ``get_status(**n_kwargs)``, or True if the cluster is gone.

     ``general.check_cluster_exists(n_cluster)`` is consulted first; when it
     reports the cluster as missing, ``get_status`` is not called at all.
     """
     if general.check_cluster_exists(n_cluster):
         return get_status(**n_kwargs)
     return True
Ejemplo n.º 42
0
def get_current_provisioning_step(cluster_id):
    """Return the step id stored on the current context's instance info.

    Returns ``None`` when the event log is disabled or when
    ``g.check_cluster_exists(cluster_id)`` reports the cluster as missing.
    """
    event_log_enabled = not CONF.disable_event_log
    if event_log_enabled and g.check_cluster_exists(cluster_id):
        return context.ctx().current_instance_info.step_id
    return None
Ejemplo n.º 43
0
def get_current_provisioning_step(cluster_id):
    """Return the step id stored on the current context's instance info.

    Returns ``None`` when the event log is disabled or when
    ``g.check_cluster_exists(cluster_id)`` reports the cluster as missing.
    """
    event_log_enabled = not CONF.disable_event_log
    if event_log_enabled and g.check_cluster_exists(cluster_id):
        return context.ctx().current_instance_info.step_id
    return None