Example #1
def _create_attach_volume(ctx,
                          instance,
                          size,
                          volume_type,
                          name=None,
                          availability_zone=None):
    if CONF.cinder.api_version == 1:
        kwargs = {'size': size, 'display_name': name}
    else:
        kwargs = {'size': size, 'name': name}

    kwargs['volume_type'] = volume_type
    if availability_zone is not None:
        kwargs['availability_zone'] = availability_zone

    volume = cinder.client().volumes.create(**kwargs)
    conductor.append_volume(ctx, instance, volume.id)

    while volume.status != 'available':
        volume = cinder.get_volume(volume.id)
        if volume.status == 'error':
            raise ex.SystemError(_("Volume %s has error status") % volume.id)

        context.sleep(1)

    resp = nova.client().volumes.create_server_volume(instance.instance_id,
                                                      volume.id, None)
    return resp.device
Example #2
    def wait_ambari_requests(self, requests, cluster_name):
        requests = set(requests)
        failed = []
        while len(requests) > 0:
            completed, not_completed = set(), set()
            for req_id in requests:
                request = self.get_request_info(cluster_name, req_id)
                status = request.get("request_status")
                if status == 'COMPLETED':
                    completed.add(req_id)
                elif status in ['IN_PROGRESS', 'PENDING']:
                    not_completed.add(req_id)
                else:
                    failed.append(request)
            if failed:
                msg = _("Some Ambari request(s) "
                        "not in COMPLETED state: %(description)s.")
                descrs = []
                for req in failed:
                    descr = _(
                        "request %(id)d: %(name)s - in status %(status)s")
                    descrs.append(descr %
                                  {'id': req.get("id"),
                                   'name': req.get("request_context"),
                                   'status': req.get("request_status")})
                raise p_exc.HadoopProvisionError(msg % {'description': descrs})
            requests = not_completed
            context.sleep(5)
            LOG.debug("Waiting for %d ambari request(s) to be completed",
                      len(not_completed))
        LOG.debug("All ambari requests have been completed")
Example #3
    def wait_till_active(self):
        while self.heat_stack.stack_status in ("CREATE_IN_PROGRESS", "UPDATE_IN_PROGRESS"):
            context.sleep(1)
            self.heat_stack.get()

        if self.heat_stack.stack_status not in ("CREATE_COMPLETE", "UPDATE_COMPLETE"):
            raise ex.HeatStackException(self.heat_stack.stack_status)
Example #4
def start_cluster(cluster):
    cl_tmpl = {
        "blueprint": cluster.name,
        "default_password": uuidutils.generate_uuid(),
        "host_groups": []
    }
    for ng in cluster.node_groups:
        for instance in ng.instances:
            cl_tmpl["host_groups"].append({
                "name": instance.instance_name,
                "hosts": [{"fqdn": instance.fqdn()}]
            })
    ambari = plugin_utils.get_instance(cluster, p_common.AMBARI_SERVER)
    password = cluster.extra["ambari_password"]
    with ambari_client.AmbariClient(ambari, password=password) as client:
        req_id = client.create_cluster(cluster.name, cl_tmpl)["id"]
        while True:
            status = client.check_request_status(cluster.name, req_id)
            LOG.debug("Task %s in %s state. Completed %.1f%%" % (
                status["request_context"], status["request_status"],
                status["progress_percent"]))
            if status["request_status"] == "COMPLETED":
                return
            if status["request_status"] in ["IN_PROGRESS", "PENDING"]:
                context.sleep(5)
            else:
                raise p_exc.HadoopProvisionError(
                    _("Ambari request in %s state") % status["request_status"])
Example #5
def _start_cloudera_manager(cluster):
    manager = pu.get_manager(cluster)
    with manager.remote() as r:
        cmd.start_cloudera_db(r)
        cmd.start_manager(r)

    timeout = 300
    LOG.debug("Waiting %(timeout)s seconds for Manager to start : " %
              {'timeout': timeout})
    s_time = timeutils.utcnow()
    while timeutils.delta_seconds(s_time, timeutils.utcnow()) < timeout:
        try:
            conn = telnetlib.Telnet(manager.management_ip, CM_API_PORT)
            conn.close()
            break
        except IOError:
            context.sleep(2)
    else:
        message = _("Cloudera Manager failed to start in %(timeout)s minutes "
                    "on node '%(node)s' of cluster '%(cluster)s'") % {
                        'timeout': timeout / 60,
                        'node': manager.management_ip,
                        'cluster': cluster.name
                    }
        raise ex.HadoopProvisionError(message)

    LOG.info(_LI("Cloudera Manager has been started"))
Example #6
    def _await_networks(self, cluster, instances):
        if not instances:
            return

        ips_assigned = set()
        while len(ips_assigned) != len(instances):
            if not g.check_cluster_exists(cluster):
                return
            for instance in instances:
                if instance.id not in ips_assigned:
                    if networks.init_instances_ips(instance):
                        ips_assigned.add(instance.id)

            context.sleep(1)

        LOG.info(
            _LI("Cluster '%s': all instances have IPs assigned"), cluster.id)

        cluster = conductor.cluster_get(context.ctx(), cluster)
        instances = g.get_instances(cluster, ips_assigned)

        with context.ThreadGroup() as tg:
            for instance in instances:
                tg.spawn("wait-for-ssh-%s" % instance.instance_name,
                         self._wait_until_accessible, instance)

        LOG.info(_LI("Cluster '%s': all instances are accessible"), cluster.id)
Example #7
def _start_cloudera_manager(cluster):
    manager = pu.get_manager(cluster)
    with manager.remote() as r:
        cmd.start_cloudera_db(r)
        cmd.start_manager(r)

    timeout = 300
    LOG.debug("Waiting %(timeout)s seconds for Manager to start : " % {
        'timeout': timeout})
    s_time = timeutils.utcnow()
    while timeutils.delta_seconds(s_time, timeutils.utcnow()) < timeout:
        try:
            conn = telnetlib.Telnet(manager.management_ip, CM_API_PORT)
            conn.close()
            break
        except IOError:
            context.sleep(2)
    else:
        message = _("Cloudera Manager failed to start in %(timeout)s minutes "
                    "on node '%(node)s' of cluster '%(cluster)s'") % {
                        'timeout': timeout / 60,
                        'node': manager.management_ip,
                        'cluster': cluster.name}
        raise ex.HadoopProvisionError(message)

    LOG.info(_LI("Cloudera Manager has been started"))
Example #8
def wait_stack_completion(stack):
    while stack.status == 'IN_PROGRESS':
        context.sleep(1)
        stack.get()

    if stack.status != 'COMPLETE':
        raise ex.HeatStackException(stack.stack_status)
Example #9
    def wait_for_host_registrations(self, num_hosts, ambari_info):
        LOG.info(
            _LI('Waiting for all Ambari agents to register with server ...'))

        url = 'http://{0}/api/v1/hosts'.format(ambari_info.get_address())
        result = None
        json_result = None

        # TODO(jspeidel): timeout
        while result is None or len(json_result['items']) < num_hosts:
            context.sleep(5)
            try:
                result = self._get(url, ambari_info)
                json_result = json.loads(result.text)

                LOG.info(_LI('Registered Hosts: %(current_number)s of '
                             '%(final_number)s'),
                         {'current_number': len(json_result['items']),
                          'final_number': num_hosts})
                for hosts in json_result['items']:
                    LOG.debug('Registered Host: {0}'.format(
                        hosts['Hosts']['host_name']))
            except Exception:
                # TODO(jspeidel): max wait time
                LOG.info(_LI('Waiting to connect to ambari server ...'))
Example #10
def execute_with_retries(method, *args, **kwargs):
    attempts = CONF.retries.retries_number + 1
    while attempts > 0:
        try:
            return method(*args, **kwargs)
        except Exception as e:
            error_code = getattr(e, 'http_status', None) or getattr(
                e, 'status_code', None) or getattr(e, 'code', None)
            if error_code in ERRORS_TO_RETRY:
                LOG.warning(_LW('Occasional error occurred during "{method}" '
                                'execution: {error_msg} ({error_code}). '
                                'Operation will be retried.').format(
                            method=method.__name__,
                            error_msg=e,
                            error_code=error_code))
                attempts -= 1
                retry_after = getattr(e, 'retry_after', 0)
                context.sleep(max(retry_after, CONF.retries.retry_after))
            else:
                LOG.debug('Permanent error occurred during "{method}" '
                          'execution: {error_msg}.'.format(
                              method=method.__name__, error_msg=e))
                raise e
    else:
        raise ex.MaxRetriesExceeded(attempts, method.__name__)
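Note: below is a standalone sketch of the retry pattern used in Example #10, with plain time.sleep in place of context.sleep and hard-coded constants in place of CONF and ERRORS_TO_RETRY; the status codes, attempt count, and delay are illustrative assumptions, not values taken from the example.

import time

RETRYABLE_CODES = {408, 429, 500, 502, 503, 504}  # illustrative, not from CONF


def call_with_retries(method, *args, attempts=6, delay=10, **kwargs):
    """Retry `method` on exceptions that carry a transient-looking code."""
    for attempt in range(attempts):
        try:
            return method(*args, **kwargs)
        except Exception as e:
            code = getattr(e, 'http_status', None) or getattr(e, 'code', None)
            if code not in RETRYABLE_CODES or attempt == attempts - 1:
                raise
            # Honor a server-provided retry hint when present.
            time.sleep(max(getattr(e, 'retry_after', 0), delay))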
Example #11
def _wait_all_processes_removed(cluster, instance):
    with _get_ambari_client(cluster) as client:
        while True:
            hdp_processes = client.list_host_processes(cluster.name, instance)
            if not hdp_processes:
                return
            context.sleep(5)
Example #12
def _detach_volume(instance, volume_id):
    volume = cinder.get_volume(volume_id)
    try:
        LOG.debug("Detaching volume %s from instance %s" % (
            volume_id, instance.instance_name))
        nova.client().volumes.delete_server_volume(instance.instance_id,
                                                   volume_id)
    except Exception:
        LOG.exception(_LE("Can't detach volume %s"), volume.id)

    detach_timeout = CONF.detach_volume_timeout
    LOG.debug("Waiting %d seconds to detach %s volume" % (detach_timeout,
                                                          volume_id))
    s_time = tu.utcnow()
    while tu.delta_seconds(s_time, tu.utcnow()) < detach_timeout:
        volume = cinder.get_volume(volume_id)
        if volume.status not in ['available', 'error']:
            context.sleep(2)
        else:
            LOG.debug("Volume %s has been detached" % volume_id)
            return
    else:
        LOG.warn(_LW("Can't detach volume %(volume)s. "
                     "Current status of volume: %(status)s"),
                 {'volume': volume_id, 'status': volume.status})
Example #13
    def wait(self, timeout=None):
        """Wait for command to finish

        :param timeout: (Optional) Max amount of time (in seconds) to wait.
                        Wait forever by default.
        :return: The final ApiCommand object, containing the last known state.
                 The command may still be running in case of timeout.
        """
        if self.id == ApiCommand.SYNCHRONOUS_COMMAND_ID:
            return self

        SLEEP_SEC = 5

        if timeout is None:
            deadline = None
        else:
            deadline = time.time() + timeout

        while True:
            cmd = self.fetch()
            if not cmd.active:
                return cmd

            if deadline is not None:
                now = time.time()
                if deadline < now:
                    return cmd
                else:
                    context.sleep(min(SLEEP_SEC, deadline - now))
            else:
                context.sleep(SLEEP_SEC)
Example #14
    def _await_cldb(self, cluster_context, instances=None, timeout=600):
        instances = instances or cluster_context.get_instances()
        cldb_node = cluster_context.get_instance(mfs.CLDB)
        start_time = timeutils.utcnow()
        retry_count = 0
        with cldb_node.remote() as r:
            LOG.debug("Waiting {count} seconds for CLDB initialization".format(
                count=timeout))
            while timeutils.delta_seconds(start_time,
                                          timeutils.utcnow()) < timeout:
                ec, out = r.execute_command(NODE_LIST_CMD,
                                            raise_when_error=False)
                resp = json.loads(out)
                status = resp['status']
                if str(status).lower() == 'ok':
                    ips = [n['ip'] for n in resp['data']]
                    retry_count += 1
                    for i in instances:
                        if (i.management_ip not in ips
                                and retry_count > DEFAULT_RETRY_COUNT):
                            raise ex.HadoopProvisionError(_(
                                "Node failed to connect to CLDB: %s") %
                                i.management_ip)
                    break
                else:
                    context.sleep(DELAY)
            else:
                raise ex.HadoopProvisionError(_("CLDB failed to start"))
Example #15
    def _get_job_status_from_remote(self, job_execution, retries=3):

        topology_name, inst_id = self._get_instance_if_running(job_execution)
        if topology_name is None or inst_id is None:
            return edp.JOB_STATUSES_TERMINATED

        topology_name = self._get_topology_name(job_execution)
        master = plugin_utils.get_instance(self.cluster, "nimbus")

        cmd = ("%(storm)s -c nimbus.host=%(host)s "
               "list | grep %(topology_name)s | awk '{print $2}'") % (
                   {
                       "storm": "/usr/local/storm/bin/storm",
                       "host": master.hostname(),
                       "topology_name": topology_name
                   })
        for i in range(retries):
            with remote.get_remote(master) as r:
                ret, stdout = r.execute_command("%s " % (cmd))
            # If the status is ACTIVE is there, it's still running
            if stdout.strip() == "ACTIVE":
                return {"status": edp.JOB_STATUS_RUNNING}
            else:
                if i == retries - 1:
                    return {"status": edp.JOB_STATUS_KILLED}
                context.sleep(10)
Example #16
    def get(self, relpath=None, params=None):
        """Invoke the GET method on a resource

        :param relpath: Optional. A relative path to this resource's path.
        :param params: Key-value data.

        :return: A dictionary of the JSON result.
        """
        for retry in six.moves.xrange(self.retries + 1):
            if retry:
                context.sleep(self.retry_sleep)
            try:
                return self.invoke("GET", relpath, params)
            except (socket.error, urllib.error.URLError) as e:
                if "timed out" in six.text_type(e).lower():
                    if retry < self.retries:
                        LOG.warning(_LW("Timeout issuing GET request for "
                                        "{path}. Will retry").format(
                                            path=self._join_uri(relpath)))
                    else:
                        LOG.warning(_LW("Timeout issuing GET request for "
                                        "{path}. No retries left").format(
                                            path=self._join_uri(relpath)))
                else:
                    raise e
        else:
            raise ex.CMApiException(_("Get retry max time reached."))
Example #17
    def _await_cldb(self, cluster_context, instances=None, timeout=600):
        instances = instances or cluster_context.get_instances()
        cldb_node = cluster_context.get_instance(mfs.CLDB)
        start_time = timeutils.utcnow()
        retry_count = 0
        with cldb_node.remote() as r:
            LOG.debug("Waiting {count} seconds for CLDB initialization".format(
                count=timeout))
            while timeutils.delta_seconds(start_time,
                                          timeutils.utcnow()) < timeout:
                ec, out = r.execute_command(NODE_LIST_CMD,
                                            raise_when_error=False)
                resp = json.loads(out)
                status = resp['status']
                if str(status).lower() == 'ok':
                    ips = [n['ip'] for n in resp['data']]
                    retry_count += 1
                    for i in instances:
                        if (i.internal_ip not in ips
                                and retry_count > DEFAULT_RETRY_COUNT):
                            raise ex.HadoopProvisionError(
                                _("Node failed to connect to CLDB: %s") %
                                i.internal_ip)
                    break
                else:
                    context.sleep(DELAY)
            else:
                raise ex.HadoopProvisionError(_("CLDB failed to start"))
Example #18
    def get(self, relpath=None, params=None):
        """Invoke the GET method on a resource

        :param relpath: Optional. A relative path to this resource's path.
        :param params: Key-value data.

        :return: A dictionary of the JSON result.
        """
        for retry in six.moves.xrange(self.retries + 1):
            if retry:
                context.sleep(self.retry_sleep)
            try:
                return self.invoke("GET", relpath, params)
            except (socket.error, urllib2.URLError) as e:
                if "timed out" in six.text_type(e).lower():
                    if retry < self.retries:
                        LOG.warning(
                            _LW("Timeout issuing GET request for "
                                "{path}. Will retry").format(
                                    path=self._join_uri(relpath)))
                    else:
                        LOG.warning(
                            _LW("Timeout issuing GET request for "
                                "{path}. No retries left").format(
                                    path=self._join_uri(relpath)))
                else:
                    raise e
        else:
            raise ex.CMApiException(_("Get retry max time reached."))
Example #19
    def _get_job_status_from_remote(self, job_execution, retries=3):

        topology_name, inst_id = self._get_instance_if_running(
            job_execution)
        if topology_name is None or inst_id is None:
            return edp.JOB_STATUSES_TERMINATED

        topology_name = self._get_topology_name(job_execution)
        master = plugin_utils.get_instance(self.cluster, "nimbus")

        cmd = (
            "%(storm)s -c nimbus.host=%(host)s "
            "list | grep %(topology_name)s | awk '{print $2}'") % (
            {
                "storm": "/usr/local/storm/bin/storm",
                "host": master.hostname(),
                "topology_name": topology_name
            })
        for i in range(retries):
            with remote.get_remote(master) as r:
                ret, stdout = r.execute_command("%s " % (cmd))
            # If the status is ACTIVE is there, it's still running
            if stdout.strip() == "ACTIVE":
                return {"status": edp.JOB_STATUS_RUNNING}
            else:
                if i == retries - 1:
                    return {"status": edp.JOB_STATUS_KILLED}
                context.sleep(10)
Example #20
def start_cluster(cluster):
    cl_tmpl = {
        "blueprint": cluster.name,
        "default_password": uuidutils.generate_uuid(),
        "host_groups": []
    }
    for ng in cluster.node_groups:
        for instance in ng.instances:
            cl_tmpl["host_groups"].append({
                "name": instance.instance_name,
                "hosts": [{
                    "fqdn": instance.fqdn()
                }]
            })
    ambari = plugin_utils.get_instance(cluster, p_common.AMBARI_SERVER)
    password = cluster.extra["ambari_password"]
    with ambari_client.AmbariClient(ambari, password=password) as client:
        req_id = client.create_cluster(cluster.name, cl_tmpl)["id"]
        while True:
            status = client.check_request_status(cluster.name, req_id)
            LOG.debug("Task %s in %s state. Completed %.1f%%" %
                      (status["request_context"], status["request_status"],
                       status["progress_percent"]))
            if status["request_status"] == "COMPLETED":
                return
            if status["request_status"] in ["IN_PROGRESS", "PENDING"]:
                context.sleep(5)
            else:
                raise p_exc.HadoopProvisionError(
                    _("Ambari request in %s state") % status["request_status"])
Example #21
    def wait(self, timeout=None):
        """Wait for command to finish

        :param timeout: (Optional) Max amount of time (in seconds) to wait.
                        Wait forever by default.
        :return: The final ApiCommand object, containing the last known state.
                 The command may still be running in case of timeout.
        """
        if self.id == ApiCommand.SYNCHRONOUS_COMMAND_ID:
            return self

        SLEEP_SEC = 5

        if timeout is None:
            deadline = None
        else:
            deadline = time.time() + timeout

        while True:
            cmd = self.fetch()
            if not cmd.active:
                return cmd

            if deadline is not None:
                now = time.time()
                if deadline < now:
                    return cmd
                else:
                    context.sleep(min(SLEEP_SEC, deadline - now))
            else:
                context.sleep(SLEEP_SEC)
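Note: below is a standalone sketch of the deadline-aware wait implemented by ApiCommand.wait() in Examples #13 and #21, using plain time.sleep; fetch_state is a placeholder for any callable that returns an object with an `active` attribute, and the names and defaults are illustrative.

import time


def wait_until_inactive(fetch_state, timeout=None, sleep_sec=5):
    """Poll fetch_state() until it reports the command is no longer active."""
    deadline = None if timeout is None else time.monotonic() + timeout
    while True:
        state = fetch_state()
        if not state.active:
            return state
        if deadline is not None:
            remaining = deadline - time.monotonic()
            if remaining <= 0:
                return state  # may still be running; caller checks .active
            time.sleep(min(sleep_sec, remaining))
        else:
            time.sleep(sleep_sec)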
Example #22
def _wait_all_processes_removed(cluster, instance):
    with _get_ambari_client(cluster) as client:
        while True:
            hdp_processes = client.list_host_processes(cluster.name, instance)
            if not hdp_processes:
                return
            context.sleep(5)
Example #23
def _detach_volume(instance, volume_id):
    volume = cinder.get_volume(volume_id)
    try:
        LOG.debug("Detaching volume %s from instance %s" %
                  (volume_id, instance.instance_name))
        nova.client().volumes.delete_server_volume(instance.instance_id,
                                                   volume_id)
    except Exception:
        LOG.exception(_LE("Can't detach volume %s"), volume.id)

    detach_timeout = CONF.detach_volume_timeout
    LOG.debug("Waiting %d seconds to detach %s volume" %
              (detach_timeout, volume_id))
    s_time = tu.utcnow()
    while tu.delta_seconds(s_time, tu.utcnow()) < detach_timeout:
        volume = cinder.get_volume(volume_id)
        if volume.status not in ['available', 'error']:
            context.sleep(2)
        else:
            LOG.debug("Volume %s has been detached" % volume_id)
            return
    else:
        LOG.warn(
            _LW("Can't detach volume %(volume)s. "
                "Current status of volume: %(status)s"), {
                    'volume': volume_id,
                    'status': volume.status
                })
Example #24
def decommission_nodes(cluster, instances):
    dec_hosts = [i.fqdn() for i in instances]
    dn_hosts = [dn.fqdn() for dn in u.get_datanodes(cluster)]
    nm_hosts = [nm.fqdn() for nm in u.get_nodemanagers(cluster)]

    client = c.IntelClient(u.get_instance(cluster, 'manager'), cluster.name)

    dec_dn_hosts = []
    for dec_host in dec_hosts:
        if dec_host in dn_hosts:
            dec_dn_hosts.append(dec_host)

    if dec_dn_hosts:
        client.services.hdfs.decommission_nodes(dec_dn_hosts)

        #TODO(alazarev) make timeout configurable (bug #1262897)
        timeout = 14400  # 4 hours
        cur_time = 0
        for host in dec_dn_hosts:
            while cur_time < timeout:
                if client.services.hdfs.get_datanode_status(
                        host) == 'Decomissioned':
                    break
                context.sleep(5)
                cur_time += 5
            else:
                LOG.warn("Failed to decomission node '%s' of cluster '%s' "
                         "in %s minutes" % (host, cluster.name, timeout / 60))

    client.nodes.stop(dec_hosts)

    # wait stop services
    #TODO(alazarev) make timeout configurable (bug #1262897)
    timeout = 600  # 10 minutes
    cur_time = 0
    for instance in instances:
        while cur_time < timeout:
            stopped = True

            if instance.fqdn() in dn_hosts:
                stopped = stopped and _is_hadoop_service_stopped(
                    instance, 'hadoop-hdfs-datanode')

            if instance.fqdn() in nm_hosts:
                stopped = stopped and _is_hadoop_service_stopped(
                    instance, 'hadoop-yarn-nodemanager')

            if stopped:
                break
            else:
                context.sleep(5)
                cur_time += 5
        else:
            LOG.warn("Failed to stop services on node '%s' of cluster '%s' "
                     "in %s minutes" % (instance, cluster.name, timeout / 60))

    for node in dec_hosts:
        LOG.info("Deleting node '%s' on cluster '%s'" % (node, cluster.name))
        client.nodes.delete(node)
Example #25
def wait_stack_completion(stack, is_update=False, last_updated_time=None):
    base.execute_with_retries(stack.get)
    while not _verify_completion(stack, is_update, last_updated_time):
        context.sleep(1)
        base.execute_with_retries(stack.get)

    if stack.status != 'COMPLETE':
        raise ex.HeatStackException(stack.stack_status_reason)
Example #26
    def _make_checks(self, instance_info, sleep=True):
        ctx = context.ctx()

        if sleep:
            context.sleep(2)

        current_instance_info = ctx.current_instance_info
        self.assertEqual(instance_info, current_instance_info)
Example #27
def poll(get_status,
         kwargs=None,
         args=None,
         operation_name=None,
         timeout_name=None,
         timeout=DEFAULT_TIMEOUT,
         sleep=DEFAULT_SLEEP_TIME,
         exception_strategy='raise'):
    """This util poll status of object obj during some timeout.

    :param get_status: function, which return current status of polling
    as Boolean
    :param kwargs: keyword arguments of function get_status
    :param operation_name: name of polling process
    :param timeout_name: name of timeout option
    :param timeout: value of timeout in seconds. By default, it equals to
    3 hours
    :param sleep: duration between two consecutive executions of
    get_status function
    :param exception_strategy: possible values ('raise', 'mark_as_true',
    'mark_as_false'). If exception_strategy is 'raise' exception would be
    raised. If exception_strategy is 'mark_as_true', return value of
    get_status would marked as True, and in case of 'mark_as_false' - False.
    By default it's 'raise'.
    """
    start_time = timeutils.utcnow()
    # We shouldn't raise TimeoutException if incorrect timeout specified and
    # status is ok now. In such way we should execute get_status at least once.
    at_least_once = True
    if not kwargs:
        kwargs = {}
    if not args:
        args = ()

    while at_least_once or _get_consumed(start_time) < timeout:
        at_least_once = False
        try:
            status = get_status(*args, **kwargs)
        except BaseException:
            if exception_strategy == 'raise':
                raise
            elif exception_strategy == 'mark_as_true':
                status = True
            else:
                status = False

        if status:
            operation = "Operation"
            if operation_name:
                operation = "Operation with name {op_name}".format(
                    op_name=operation_name)
            LOG.debug('{operation_desc} was executed successfully in timeout '
                      '{timeout}'.format(operation_desc=operation,
                                         timeout=timeout))
            return

        context.sleep(sleep)
    raise ex.TimeoutException(timeout, operation_name, timeout_name)
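Note: below is a standalone sketch of the polling pattern implemented by poll() in Example #27, using time.monotonic and time.sleep instead of Sahara's timeutils and context helpers; the function name and default values are illustrative assumptions.

import time


def poll_until(get_status, timeout=10800, sleep=5):
    """Call get_status() until it returns True or `timeout` seconds elapse."""
    deadline = time.monotonic() + timeout
    while True:
        # Evaluate the status at least once, even if the timeout is already
        # exceeded (mirrors the at_least_once flag in the example above).
        if get_status():
            return
        if time.monotonic() >= deadline:
            raise TimeoutError("status not reached in %s seconds" % timeout)
        time.sleep(sleep)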
Example #28
def wait_stack_completion(cluster, is_update=False, last_updated_time=None):
    stack_name = cluster.stack_name
    stack = get_stack(stack_name)
    while not _verify_completion(stack, is_update, last_updated_time):
        context.sleep(1)
        stack = get_stack(stack_name)

    if stack.status != 'COMPLETE':
        raise ex.HeatStackException(stack.stack_status_reason)
Example #29
    def wait_till_active(self):
        while self.heat_stack.stack_status in ('CREATE_IN_PROGRESS',
                                               'UPDATE_IN_PROGRESS'):
            context.sleep(1)
            self.heat_stack.get()

        if self.heat_stack.stack_status not in ('CREATE_COMPLETE',
                                                'UPDATE_COMPLETE'):
            raise ex.HeatStackException(self.heat_stack.stack_status)
Example #30
def wait_stack_completion(stack):
    # NOTE: expected empty status because status of stack
    # maybe is not set in heat database
    while stack.status in ['IN_PROGRESS', '']:
        context.sleep(1)
        stack.get()

    if stack.status != 'COMPLETE':
        raise ex.HeatStackException(stack.stack_status)
Example #31
    def update_configs(self, instances):
        # instances non-empty
        cpo.add_provisioning_step(instances[0].cluster_id, _("Update configs"),
                                  len(instances))
        with context.ThreadGroup() as tg:
            for instance in instances:
                tg.spawn("update-configs-%s" % instance.instance_name,
                         self._update_configs, instance)
                context.sleep(1)
Example #32
def wait_stack_completion(cluster, is_update=False, last_updated_time=None):
    stack_name = cluster.stack_name
    stack = get_stack(stack_name)
    while not _verify_completion(stack, is_update, last_updated_time):
        context.sleep(1)
        stack = get_stack(stack_name)

    if stack.status != 'COMPLETE':
        raise ex.HeatStackException(stack.stack_status_reason)
Example #33
    def _make_checks(self, instance, sleep=True):
        ctx = context.ctx()

        if sleep:
            context.sleep(2)

        current_instance_info = ctx.current_instance_info
        expected = [None, instance.id, instance.name, None]
        self.assertEqual(expected, current_instance_info)
Example #34
def wait_stack_completion(stack):
    # NOTE: expected empty status because status of stack
    # maybe is not set in heat database
    while stack.status in ['IN_PROGRESS', '']:
        context.sleep(1)
        stack.get()

    if stack.status != 'COMPLETE':
        raise ex.HeatStackException(stack.stack_status)
Example #35
def _wait_all_processes_removed(cluster, instance):
    ambari = plugin_utils.get_instance(cluster, p_common.AMBARI_SERVER)
    password = cluster.extra["ambari_password"]

    with ambari_client.AmbariClient(ambari, password=password) as client:
        while True:
            hdp_processes = client.list_host_processes(cluster.name, instance)
            if not hdp_processes:
                return
            context.sleep(5)
Example #36
def _wait_all_processes_removed(cluster, instance):
    ambari = plugin_utils.get_instance(cluster, p_common.AMBARI_SERVER)
    password = cluster.extra["ambari_password"]

    with ambari_client.AmbariClient(ambari, password=password) as client:
        while True:
            hdp_processes = client.list_host_processes(cluster.name, instance)
            if not hdp_processes:
                return
            context.sleep(5)
Example #37
def cancel_job(job_execution_id):
    ctx = context.ctx()
    job_execution = conductor.job_execution_get(ctx, job_execution_id)
    if job_execution.info['status'] in edp.JOB_STATUSES_TERMINATED:
        return job_execution
    cluster = conductor.cluster_get(ctx, job_execution.cluster_id)
    if cluster is None:
        return job_execution
    engine = _get_job_engine(cluster, job_execution)
    if engine is not None:
        job_execution = conductor.job_execution_update(
            ctx, job_execution_id,
            {'info': {
                'status': edp.JOB_STATUS_TOBEKILLED
            }})

        timeout = CONF.job_canceling_timeout
        s_time = timeutils.utcnow()
        while timeutils.delta_seconds(s_time, timeutils.utcnow()) < timeout:
            if job_execution.info['status'] not in edp.JOB_STATUSES_TERMINATED:
                try:
                    job_info = engine.cancel_job(job_execution)
                except Exception as ex:
                    job_info = None
                    LOG.exception(
                        _LE("Error during cancel of job execution %(job)s: "
                            "%(error)s"), {
                                'job': job_execution.id,
                                'error': ex
                            })
                if job_info is not None:
                    job_execution = _write_job_status(job_execution, job_info)
                    LOG.info(_LI("Job execution %s was canceled successfully"),
                             job_execution.id)
                    return job_execution
                context.sleep(3)
                job_execution = conductor.job_execution_get(
                    ctx, job_execution_id)
                if not job_execution:
                    LOG.info(
                        _LI("Job execution %(job_exec_id)s was deleted. "
                            "Canceling current operation."),
                        {'job_exec_id': job_execution_id})
                    return job_execution
            else:
                LOG.info(
                    _LI("Job execution status %(job)s: %(status)s"), {
                        'job': job_execution.id,
                        'status': job_execution.info['status']
                    })
                return job_execution
        else:
            raise e.CancelingFailed(
                _('Job execution %s was not canceled') % job_execution.id)
Example #38
    def start_cluster(self, cluster):
        start_helper.exec_configure_sh_on_cluster(
            cluster, self.get_configure_sh_string(cluster))
        start_helper.wait_for_mfs_unlock(cluster, self.get_waiting_script())
        start_helper.setup_maprfs_on_cluster(cluster,
                                             self.get_disk_setup_script())
        start_helper.start_zookeeper_nodes_on_cluster(cluster)
        start_helper.start_warden_on_cldb_nodes(cluster)
        context.sleep(SIXTY_SECONDS)
        start_helper.start_warden_on_other_nodes(cluster)
        start_helper.start_ecosystem(self.get_context(cluster))
Example #39
def poll(get_status, kwargs=None, args=None, operation_name=None,
         timeout_name=None, timeout=DEFAULT_TIMEOUT, sleep=DEFAULT_SLEEP_TIME,
         exception_strategy='raise'):
    """This util poll status of object obj during some timeout.

    :param get_status: function, which return current status of polling
    as Boolean
    :param kwargs: keyword arguments of function get_status
    :param operation_name: name of polling process
    :param timeout_name: name of timeout option
    :param timeout: value of timeout in seconds. By default, it equals to
    3 hours
    :param sleep: duration between two consecutive executions of
    get_status function
    :param exception_strategy: possible values ('raise', 'mark_as_true',
    'mark_as_false'). If exception_strategy is 'raise' exception would be
    raised. If exception_strategy is 'mark_as_true', return value of
    get_status would marked as True, and in case of 'mark_as_false' - False.
    By default it's 'raise'.
    """
    start_time = timeutils.utcnow()
    # We shouldn't raise TimeoutException if incorrect timeout specified and
    # status is ok now. In such way we should execute get_status at least once.
    at_least_once = True
    if not kwargs:
        kwargs = {}
    if not args:
        args = ()

    while at_least_once or _get_consumed(start_time) < timeout:
        at_least_once = False
        try:
            status = get_status(*args, **kwargs)
        except BaseException:
            if exception_strategy == 'raise':
                raise
            elif exception_strategy == 'mark_as_true':
                status = True
            else:
                status = False

        if status:
            operation = "Operation"
            if operation_name:
                operation = "Operation with name {op_name}".format(
                    op_name=operation_name)
            LOG.debug(
                '{operation_desc} was executed successfully in timeout '
                '{timeout}'
                .format(operation_desc=operation, timeout=timeout))
            return

        context.sleep(sleep)
    raise ex.TimeoutException(timeout, operation_name, timeout_name)
Example #40
def decommission_tt(jt, inst_to_be_deleted, survived_inst):
    with remote.get_remote(jt) as r:
        r.write_file_to('/etc/hadoop/tt.excl',
                        utils.generate_fqdn_host_names(
                            inst_to_be_deleted))
        run.refresh_nodes(remote.get_remote(jt), "mradmin")
        context.sleep(3)
        r.write_files_to({'/etc/hadoop/tt.incl':
                         utils.generate_fqdn_host_names(survived_inst),
                         '/etc/hadoop/tt.excl': "",
                          })
Example #41
    def start_cluster(self, cluster):
        start_helper.exec_configure_sh_on_cluster(
            cluster, self.get_configure_sh_string(cluster))
        start_helper.wait_for_mfs_unlock(cluster, self.get_waiting_script())
        start_helper.setup_maprfs_on_cluster(
            cluster, self.get_disk_setup_script())
        start_helper.start_zookeeper_nodes_on_cluster(cluster)
        start_helper.start_warden_on_cldb_nodes(cluster)
        context.sleep(SIXTY_SECONDS)
        start_helper.start_warden_on_other_nodes(cluster)
        start_helper.start_ecosystem(self.get_context(cluster))
Example #42
def _await_attach_volumes(instance, count_volumes):
    timeout = 10
    step = 2
    while timeout > 0:
        if len(_get_unmounted_devices(instance)) == count_volumes:
            return

        timeout -= step
        context.sleep(step)

    raise RuntimeError("Error attach volume to instance %s" %
                       instance.instance_name)
Example #43
def decommission_tt(jt, inst_to_be_deleted, survived_inst):
    with remote.get_remote(jt) as r:
        r.write_file_to('/etc/hadoop/tt.excl',
                        utils.generate_fqdn_host_names(inst_to_be_deleted))
        run.refresh_nodes(remote.get_remote(jt), "mradmin")
        context.sleep(3)
        r.write_files_to({
            '/etc/hadoop/tt.incl':
            utils.generate_fqdn_host_names(survived_inst),
            '/etc/hadoop/tt.excl':
            "",
        })
Example #44
def _await_attach_volumes(instance, devices):
    timeout = 10
    step = 2
    while timeout > 0:
        if _count_attached_devices(instance, devices) == len(devices):
            return

        timeout -= step
        context.sleep(step)

    raise ex.SystemError(_("Error attach volume to instance %s") %
                         instance.instance_name)
Example #45
def _await_attach_volumes(instance, count_volumes):
    timeout = 10
    step = 2
    while timeout > 0:
        if len(_get_unmounted_devices(instance)) == count_volumes:
            return

        timeout -= step
        context.sleep(step)

    raise RuntimeError("Error attach volume to instance %s" %
                       instance.instance_name)
Example #46
def _await_attach_volumes(instance, devices):
    timeout = 10
    step = 2
    while timeout > 0:
        if _count_attached_devices(instance, devices) == len(devices):
            return

        timeout -= step
        context.sleep(step)

    raise ex.SystemError(
        _("Error attach volume to instance %s") % instance.instance_name)
Example #47
    def wait_ambari_request(self, request_id, cluster_name):
        while True:
            status = self.check_request_status(cluster_name, request_id)
            LOG.debug("Task %s in %s state. Completed %.1f%%" % (
                status["request_context"], status["request_status"],
                status["progress_percent"]))
            if status["request_status"] == "COMPLETED":
                return
            if status["request_status"] in ["IN_PROGRESS", "PENDING"]:
                context.sleep(5)
            else:
                raise p_exc.HadoopProvisionError(
                    _("Ambari request in %s state") % status["request_status"])
Example #48
def cancel_job(job_execution_id):
    ctx = context.ctx()
    job_execution = conductor.job_execution_get(ctx, job_execution_id)
    if job_execution.info['status'] in edp.JOB_STATUSES_TERMINATED:
        LOG.info(
            _LI("Job execution is already finished and shouldn't be"
                " canceled"))
        return job_execution
    cluster = conductor.cluster_get(ctx, job_execution.cluster_id)
    if cluster is None:
        LOG.info(_LI("Can not cancel this job on a non-existant cluster."))
        return job_execution
    engine = get_job_engine(cluster, job_execution)
    if engine is not None:
        job_execution = conductor.job_execution_update(
            ctx, job_execution_id,
            {'info': {
                'status': edp.JOB_STATUS_TOBEKILLED
            }})

        timeout = CONF.job_canceling_timeout
        s_time = timeutils.utcnow()
        while timeutils.delta_seconds(s_time, timeutils.utcnow()) < timeout:
            if job_execution.info['status'] not in edp.JOB_STATUSES_TERMINATED:
                try:
                    job_info = engine.cancel_job(job_execution)
                except Exception as ex:
                    job_info = None
                    LOG.warning(
                        _LW("Error during cancel of job execution: "
                            "{error}").format(error=ex))
                if job_info is not None:
                    job_execution = _write_job_status(job_execution, job_info)
                    LOG.info(_LI("Job execution was canceled successfully"))
                    return job_execution
                context.sleep(3)
                job_execution = conductor.job_execution_get(
                    ctx, job_execution_id)
                if not job_execution:
                    LOG.info(
                        _LI("Job execution was deleted. "
                            "Canceling current operation."))
                    return job_execution
            else:
                LOG.info(
                    _LI("Job execution status: {status}").format(
                        status=job_execution.info['status']))
                return job_execution
        else:
            raise e.CancelingFailed(
                _('Job execution %s was not canceled') % job_execution.id)
Example #49
def _create_attach_volume(ctx, instance, size, display_name=None):
    volume = cinder.client().volumes.create(size=size,
                                            display_name=display_name)
    conductor.append_volume(ctx, instance, volume.id)

    while volume.status != 'available':
        volume = cinder.get_volume(volume.id)
        if volume.status == 'error':
            raise ex.SystemError("Volume %s has error status" % volume.id)

        context.sleep(1)

    resp = nova.client().volumes.create_server_volume(
        instance.instance_id, volume.id, None)
    return resp.device
Example #50
def decommission_nodes(cluster, instances, configure_sh_string):
    LOG.info(_LI('Start decommission. Cluster = %s'), cluster.name)
    move_node(cluster, instances)
    stop_services(cluster, instances)
    context.sleep(names.WAIT_NODE_ALARM_NO_HEARTBEAT)
    remove_node(cluster, instances)
    remove_services(cluster, instances)
    if check_for_cldb_or_zookeeper_service(instances):
        all_instances = gen.get_instances(cluster)
        current_cluster_instances = [
            x for x in all_instances if x not in instances]
        for inst in current_cluster_instances:
            start_helper.exec_configure_sh_on_instance(
                cluster, inst, configure_sh_string)
    LOG.info(_LI('End decommission. Cluster = %s'), cluster.name)
Example #51
def _create_attach_volume(ctx, instance, size, display_name=None):
    volume = cinder.client().volumes.create(size=size,
                                            display_name=display_name)
    conductor.append_volume(ctx, instance, volume.id)

    while volume.status != 'available':
        volume = cinder.get_volume(volume.id)
        if volume.status == 'error':
            raise ex.SystemError(_("Volume %s has error status") % volume.id)

        context.sleep(1)

    resp = nova.client().volumes.create_server_volume(instance.instance_id,
                                                      volume.id, None)
    return resp.device
Example #52
def cancel_job(job_execution_id):
    ctx = context.ctx()
    job_execution = conductor.job_execution_get(ctx, job_execution_id)
    if job_execution.info['status'] in edp.JOB_STATUSES_TERMINATED:
        return job_execution
    cluster = conductor.cluster_get(ctx, job_execution.cluster_id)
    if cluster is None:
        return job_execution
    engine = _get_job_engine(cluster, job_execution)
    if engine is not None:
        job_execution = conductor.job_execution_update(
            ctx, job_execution_id,
            {'info': {'status': edp.JOB_STATUS_TOBEKILLED}})

        timeout = CONF.job_canceling_timeout
        s_time = timeutils.utcnow()
        while timeutils.delta_seconds(s_time, timeutils.utcnow()) < timeout:
            if job_execution.info['status'] not in edp.JOB_STATUSES_TERMINATED:
                try:
                    job_info = engine.cancel_job(job_execution)
                except Exception as ex:
                    job_info = None
                    LOG.warning(
                        _LW("Error during cancel of job execution {job}: "
                            "{error}").format(job=job_execution.id,
                                              error=ex))
                if job_info is not None:
                    job_execution = _write_job_status(job_execution, job_info)
                    LOG.info(_LI("Job execution {job_id} was canceled "
                                 "successfully").format(
                                     job_id=job_execution.id))
                    return job_execution
                context.sleep(3)
                job_execution = conductor.job_execution_get(
                    ctx, job_execution_id)
                if not job_execution:
                    LOG.info(_LI("Job execution {job_exec_id} was deleted. "
                                 "Canceling current operation.").format(
                             job_exec_id=job_execution_id))
                    return job_execution
            else:
                LOG.info(_LI("Job execution status {job}: {status}").format(
                         job=job_execution.id,
                         status=job_execution.info['status']))
                return job_execution
        else:
            raise e.CancelingFailed(_('Job execution %s was not canceled')
                                    % job_execution.id)
Example #53
def delete_stack(cluster):
    stack_name = cluster.stack_name
    base.execute_with_retries(client().stacks.delete, stack_name)
    stack = get_stack(stack_name, raise_on_missing=False)
    while stack is not None:
        # Valid states: IN_PROGRESS, empty and COMPLETE
        if stack.status in ['IN_PROGRESS', '', 'COMPLETE']:
            context.sleep(5)
        else:
            raise ex.HeatStackException(
                message=_(
                    "Cannot delete heat stack {name}, reason: "
                    "stack status: {status}, status reason: {reason}").format(
                    name=stack_name, status=stack.status,
                    reason=stack.stack_status_reason))
        stack = get_stack(stack_name, raise_on_missing=False)
Example #54
def _check_decommission(cluster, instances, check_func, timeout):
    s_time = timeutils.utcnow()
    while timeutils.delta_seconds(s_time, timeutils.utcnow()) < timeout:
        statuses = check_func(cluster)
        dec_ok = True
        for instance in instances:
            if statuses[instance.fqdn()] != 'decommissioned':
                dec_ok = False

        if dec_ok:
            return
        else:
            context.sleep(5)
    else:
        raise ex.SaharaException("Cannot finish decommission in %d seconds" %
                                 timeout)