def _create_attach_volume(ctx, instance, size, volume_type, name=None,
                          availability_zone=None):
    if CONF.cinder.api_version == 1:
        kwargs = {'size': size, 'display_name': name}
    else:
        kwargs = {'size': size, 'name': name}
    kwargs['volume_type'] = volume_type
    if availability_zone is not None:
        kwargs['availability_zone'] = availability_zone

    volume = cinder.client().volumes.create(**kwargs)
    conductor.append_volume(ctx, instance, volume.id)

    while volume.status != 'available':
        volume = cinder.get_volume(volume.id)
        if volume.status == 'error':
            raise ex.SystemError(_("Volume %s has error status") % volume.id)
        context.sleep(1)

    resp = nova.client().volumes.create_server_volume(
        instance.instance_id, volume.id, None)
    return resp.device
def wait_ambari_requests(self, requests, cluster_name):
    requests = set(requests)
    failed = []
    while len(requests) > 0:
        completed, not_completed = set(), set()
        for req_id in requests:
            request = self.get_request_info(cluster_name, req_id)
            status = request.get("request_status")
            if status == 'COMPLETED':
                completed.add(req_id)
            elif status in ['IN_PROGRESS', 'PENDING']:
                not_completed.add(req_id)
            else:
                failed.append(request)
        if failed:
            msg = _("Some Ambari request(s) "
                    "not in COMPLETED state: %(description)s.")
            descrs = []
            for req in failed:
                descr = _(
                    "request %(id)d: %(name)s - in status %(status)s")
                descrs.append(descr % {'id': req.get("id"),
                                       'name': req.get("request_context"),
                                       'status': req.get("request_status")})
            raise p_exc.HadoopProvisionError(msg % {'description': descrs})
        requests = not_completed
        context.sleep(5)
        LOG.debug("Waiting for %d ambari request(s) to be completed",
                  len(not_completed))
    LOG.debug("All ambari requests have been completed")
def wait_till_active(self): while self.heat_stack.stack_status in ("CREATE_IN_PROGRESS", "UPDATE_IN_PROGRESS"): context.sleep(1) self.heat_stack.get() if self.heat_stack.stack_status not in ("CREATE_COMPLETE", "UPDATE_COMPLETE"): raise ex.HeatStackException(self.heat_stack.stack_status)
def start_cluster(cluster):
    cl_tmpl = {
        "blueprint": cluster.name,
        "default_password": uuidutils.generate_uuid(),
        "host_groups": []
    }
    for ng in cluster.node_groups:
        for instance in ng.instances:
            cl_tmpl["host_groups"].append({
                "name": instance.instance_name,
                "hosts": [{"fqdn": instance.fqdn()}]
            })
    ambari = plugin_utils.get_instance(cluster, p_common.AMBARI_SERVER)
    password = cluster.extra["ambari_password"]
    with ambari_client.AmbariClient(ambari, password=password) as client:
        req_id = client.create_cluster(cluster.name, cl_tmpl)["id"]
        while True:
            status = client.check_request_status(cluster.name, req_id)
            LOG.debug("Task %s in %s state. Completed %.1f%%" % (
                status["request_context"], status["request_status"],
                status["progress_percent"]))
            if status["request_status"] == "COMPLETED":
                return
            if status["request_status"] in ["IN_PROGRESS", "PENDING"]:
                context.sleep(5)
            else:
                raise p_exc.HadoopProvisionError(
                    _("Ambari request in %s state") %
                    status["request_status"])
def _start_cloudera_manager(cluster): manager = pu.get_manager(cluster) with manager.remote() as r: cmd.start_cloudera_db(r) cmd.start_manager(r) timeout = 300 LOG.debug("Waiting %(timeout)s seconds for Manager to start : " % {'timeout': timeout}) s_time = timeutils.utcnow() while timeutils.delta_seconds(s_time, timeutils.utcnow()) < timeout: try: conn = telnetlib.Telnet(manager.management_ip, CM_API_PORT) conn.close() break except IOError: context.sleep(2) else: message = _("Cloudera Manager failed to start in %(timeout)s minutes " "on node '%(node)s' of cluster '%(cluster)s'") % { 'timeout': timeout / 60, 'node': manager.management_ip, 'cluster': cluster.name } raise ex.HadoopProvisionError(message) LOG.info(_LI("Cloudera Manager has been started"))
def _await_networks(self, cluster, instances):
    if not instances:
        return

    ips_assigned = set()
    while len(ips_assigned) != len(instances):
        if not g.check_cluster_exists(cluster):
            return
        for instance in instances:
            if instance.id not in ips_assigned:
                if networks.init_instances_ips(instance):
                    ips_assigned.add(instance.id)

        context.sleep(1)

    LOG.info(
        _LI("Cluster '%s': all instances have IPs assigned"), cluster.id)

    cluster = conductor.cluster_get(context.ctx(), cluster)
    instances = g.get_instances(cluster, ips_assigned)

    with context.ThreadGroup() as tg:
        for instance in instances:
            tg.spawn("wait-for-ssh-%s" % instance.instance_name,
                     self._wait_until_accessible, instance)

    LOG.info(_LI("Cluster '%s': all instances are accessible"), cluster.id)
def _start_cloudera_manager(cluster): manager = pu.get_manager(cluster) with manager.remote() as r: cmd.start_cloudera_db(r) cmd.start_manager(r) timeout = 300 LOG.debug("Waiting %(timeout)s seconds for Manager to start : " % { 'timeout': timeout}) s_time = timeutils.utcnow() while timeutils.delta_seconds(s_time, timeutils.utcnow()) < timeout: try: conn = telnetlib.Telnet(manager.management_ip, CM_API_PORT) conn.close() break except IOError: context.sleep(2) else: message = _("Cloudera Manager failed to start in %(timeout)s minutes " "on node '%(node)s' of cluster '%(cluster)s'") % { 'timeout': timeout / 60, 'node': manager.management_ip, 'cluster': cluster.name} raise ex.HadoopProvisionError(message) LOG.info(_LI("Cloudera Manager has been started"))
def wait_stack_completion(stack):
    while stack.status == 'IN_PROGRESS':
        context.sleep(1)
        stack.get()

    if stack.status != 'COMPLETE':
        raise ex.HeatStackException(stack.stack_status)
def wait_for_host_registrations(self, num_hosts, ambari_info):
    LOG.info(
        _LI('Waiting for all Ambari agents to register with server ...'))

    url = 'http://{0}/api/v1/hosts'.format(ambari_info.get_address())
    result = None
    json_result = None

    # TODO(jspeidel): timeout
    while result is None or len(json_result['items']) < num_hosts:
        context.sleep(5)
        try:
            result = self._get(url, ambari_info)
            json_result = json.loads(result.text)

            LOG.info(_LI('Registered Hosts: %(current_number)s of '
                         '%(final_number)s'),
                     {'current_number': len(json_result['items']),
                      'final_number': num_hosts})
            for hosts in json_result['items']:
                LOG.debug('Registered Host: {0}'.format(
                    hosts['Hosts']['host_name']))
        except Exception:
            # TODO(jspeidel): max wait time
            LOG.info(_LI('Waiting to connect to ambari server ...'))
def execute_with_retries(method, *args, **kwargs):
    attempts = CONF.retries.retries_number + 1
    while attempts > 0:
        try:
            return method(*args, **kwargs)
        except Exception as e:
            error_code = getattr(e, 'http_status', None) or getattr(
                e, 'status_code', None) or getattr(e, 'code', None)
            if error_code in ERRORS_TO_RETRY:
                LOG.warning(_LW('Occasional error occurred during "{method}" '
                                'execution: {error_msg} ({error_code}). '
                                'Operation will be retried.').format(
                    method=method.__name__,
                    error_msg=e,
                    error_code=error_code))
                attempts -= 1
                retry_after = getattr(e, 'retry_after', 0)
                context.sleep(max(retry_after, CONF.retries.retry_after))
            else:
                LOG.debug('Permanent error occurred during "{method}" '
                          'execution: {error_msg}.'.format(
                              method=method.__name__, error_msg=e))
                raise e
    else:
        raise ex.MaxRetriesExceeded(attempts, method.__name__)
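A minimal usage sketch for execute_with_retries. The call target below is illustrative (the nova wrapper and instance object appear elsewhere in this section, but this exact call is an assumption, not taken from the source):

    # Hypothetical example: retry a transient-error-prone API call.
    # Any callable plus its positional/keyword arguments can be passed;
    # only errors whose code is listed in ERRORS_TO_RETRY are retried,
    # with CONF.retries.retry_after seconds between attempts.
    server = execute_with_retries(
        nova.client().servers.get, instance.instance_id)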
def _wait_all_processes_removed(cluster, instance):
    with _get_ambari_client(cluster) as client:
        while True:
            hdp_processes = client.list_host_processes(cluster.name, instance)
            if not hdp_processes:
                return
            context.sleep(5)
def _detach_volume(instance, volume_id): volume = cinder.get_volume(volume_id) try: LOG.debug("Detaching volume %s from instance %s" % ( volume_id, instance.instance_name)) nova.client().volumes.delete_server_volume(instance.instance_id, volume_id) except Exception: LOG.exception(_LE("Can't detach volume %s"), volume.id) detach_timeout = CONF.detach_volume_timeout LOG.debug("Waiting %d seconds to detach %s volume" % (detach_timeout, volume_id)) s_time = tu.utcnow() while tu.delta_seconds(s_time, tu.utcnow()) < detach_timeout: volume = cinder.get_volume(volume_id) if volume.status not in ['available', 'error']: context.sleep(2) else: LOG.debug("Volume %s has been detached" % volume_id) return else: LOG.warn(_LW("Can't detach volume %(volume)s. " "Current status of volume: %(status)s"), {'volume': volume_id, 'status': volume.status})
def wait(self, timeout=None):
    """Wait for command to finish

    :param timeout: (Optional) Max amount of time (in seconds) to wait.
                    Wait forever by default.
    :return: The final ApiCommand object, containing the last known state.
             The command may still be running in case of timeout.
    """
    if self.id == ApiCommand.SYNCHRONOUS_COMMAND_ID:
        return self

    SLEEP_SEC = 5

    if timeout is None:
        deadline = None
    else:
        deadline = time.time() + timeout

    while True:
        cmd = self.fetch()
        if not cmd.active:
            return cmd

        if deadline is not None:
            now = time.time()
            if deadline < now:
                return cmd
            else:
                context.sleep(min(SLEEP_SEC, deadline - now))
        else:
            context.sleep(SLEEP_SEC)
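A hedged usage sketch for ApiCommand.wait; the service handle and its restart() call are assumptions for illustration only, not taken from the source:

    # Hypothetical: block on a command object returned by the CM API wrapper.
    cmd = service.restart()        # assumed to return an ApiCommand
    cmd = cmd.wait(timeout=300)    # poll every SLEEP_SEC seconds, up to 5 min
    if cmd.active:
        # wait() returned because the deadline passed; the command may still
        # be running on the Cloudera Manager side.
        pass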
def _await_cldb(self, cluster_context, instances=None, timeout=600): instances = instances or cluster_context.get_instances() cldb_node = cluster_context.get_instance(mfs.CLDB) start_time = timeutils.utcnow() retry_count = 0 with cldb_node.remote() as r: LOG.debug("Waiting {count} seconds for CLDB initialization".format( count=timeout)) while timeutils.delta_seconds(start_time, timeutils.utcnow()) < timeout: ec, out = r.execute_command(NODE_LIST_CMD, raise_when_error=False) resp = json.loads(out) status = resp['status'] if str(status).lower() == 'ok': ips = [n['ip'] for n in resp['data']] retry_count += 1 for i in instances: if (i.management_ip not in ips and retry_count > DEFAULT_RETRY_COUNT): raise ex.HadoopProvisionError(_( "Node failed to connect to CLDB: %s") % i.management_ip) break else: context.sleep(DELAY) else: raise ex.HadoopProvisionError(_("CLDB failed to start"))
def _get_job_status_from_remote(self, job_execution, retries=3):
    topology_name, inst_id = self._get_instance_if_running(job_execution)
    if topology_name is None or inst_id is None:
        return edp.JOB_STATUSES_TERMINATED

    topology_name = self._get_topology_name(job_execution)
    master = plugin_utils.get_instance(self.cluster, "nimbus")

    cmd = ("%(storm)s -c nimbus.host=%(host)s "
           "list | grep %(topology_name)s | awk '{print $2}'") % (
        {
            "storm": "/usr/local/storm/bin/storm",
            "host": master.hostname(),
            "topology_name": topology_name
        })

    for i in range(retries):
        with remote.get_remote(master) as r:
            ret, stdout = r.execute_command("%s " % (cmd))

        # If the status ACTIVE is there, the topology is still running
        if stdout.strip() == "ACTIVE":
            return {"status": edp.JOB_STATUS_RUNNING}
        else:
            if i == retries - 1:
                return {"status": edp.JOB_STATUS_KILLED}
            context.sleep(10)
def get(self, relpath=None, params=None): """Invoke the GET method on a resource :param relpath: Optional. A relative path to this resource's path. :param params: Key-value data. :return: A dictionary of the JSON result. """ for retry in six.moves.xrange(self.retries + 1): if retry: context.sleep(self.retry_sleep) try: return self.invoke("GET", relpath, params) except (socket.error, urllib.error.URLError) as e: if "timed out" in six.text_type(e).lower(): if retry < self.retries: LOG.warning(_LW("Timeout issuing GET request for " "{path}. Will retry").format( path=self._join_uri(relpath))) else: LOG.warning(_LW("Timeout issuing GET request for " "{path}. No retries left").format( path=self._join_uri(relpath))) else: raise e else: raise ex.CMApiException(_("Get retry max time reached."))
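A minimal usage sketch for this resource get(); the resource object name and path below are illustrative assumptions, not taken from the source:

    # Hypothetical: fetch a sub-resource as parsed JSON. Socket timeouts are
    # retried up to self.retries times, sleeping self.retry_sleep between
    # attempts; any other error propagates immediately.
    clusters = api_resource.get('clusters')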
def _await_cldb(self, cluster_context, instances=None, timeout=600): instances = instances or cluster_context.get_instances() cldb_node = cluster_context.get_instance(mfs.CLDB) start_time = timeutils.utcnow() retry_count = 0 with cldb_node.remote() as r: LOG.debug("Waiting {count} seconds for CLDB initialization".format( count=timeout)) while timeutils.delta_seconds(start_time, timeutils.utcnow()) < timeout: ec, out = r.execute_command(NODE_LIST_CMD, raise_when_error=False) resp = json.loads(out) status = resp['status'] if str(status).lower() == 'ok': ips = [n['ip'] for n in resp['data']] retry_count += 1 for i in instances: if (i.internal_ip not in ips and retry_count > DEFAULT_RETRY_COUNT): raise ex.HadoopProvisionError( _("Node failed to connect to CLDB: %s") % i.internal_ip) break else: context.sleep(DELAY) else: raise ex.HadoopProvisionError(_("CLDB failed to start"))
def get(self, relpath=None, params=None): """Invoke the GET method on a resource :param relpath: Optional. A relative path to this resource's path. :param params: Key-value data. :return: A dictionary of the JSON result. """ for retry in six.moves.xrange(self.retries + 1): if retry: context.sleep(self.retry_sleep) try: return self.invoke("GET", relpath, params) except (socket.error, urllib2.URLError) as e: if "timed out" in six.text_type(e).lower(): if retry < self.retries: LOG.warning( _LW("Timeout issuing GET request for " "{path}. Will retry").format( path=self._join_uri(relpath))) else: LOG.warning( _LW("Timeout issuing GET request for " "{path}. No retries left").format( path=self._join_uri(relpath))) else: raise e else: raise ex.CMApiException(_("Get retry max time reached."))
def _get_job_status_from_remote(self, job_execution, retries=3):
    topology_name, inst_id = self._get_instance_if_running(
        job_execution)
    if topology_name is None or inst_id is None:
        return edp.JOB_STATUSES_TERMINATED

    topology_name = self._get_topology_name(job_execution)
    master = plugin_utils.get_instance(self.cluster, "nimbus")

    cmd = (
        "%(storm)s -c nimbus.host=%(host)s "
        "list | grep %(topology_name)s | awk '{print $2}'") % (
        {
            "storm": "/usr/local/storm/bin/storm",
            "host": master.hostname(),
            "topology_name": topology_name
        })

    for i in range(retries):
        with remote.get_remote(master) as r:
            ret, stdout = r.execute_command("%s " % (cmd))

        # If the status ACTIVE is there, the topology is still running
        if stdout.strip() == "ACTIVE":
            return {"status": edp.JOB_STATUS_RUNNING}
        else:
            if i == retries - 1:
                return {"status": edp.JOB_STATUS_KILLED}
            context.sleep(10)
def start_cluster(cluster): cl_tmpl = { "blueprint": cluster.name, "default_password": uuidutils.generate_uuid(), "host_groups": [] } for ng in cluster.node_groups: for instance in ng.instances: cl_tmpl["host_groups"].append({ "name": instance.instance_name, "hosts": [{ "fqdn": instance.fqdn() }] }) ambari = plugin_utils.get_instance(cluster, p_common.AMBARI_SERVER) password = cluster.extra["ambari_password"] with ambari_client.AmbariClient(ambari, password=password) as client: req_id = client.create_cluster(cluster.name, cl_tmpl)["id"] while True: status = client.check_request_status(cluster.name, req_id) LOG.debug("Task %s in %s state. Completed %.1f%%" % (status["request_context"], status["request_status"], status["progress_percent"])) if status["request_status"] == "COMPLETED": return if status["request_status"] in ["IN_PROGRESS", "PENDING"]: context.sleep(5) else: raise p_exc.HadoopProvisionError( _("Ambari request in %s state") % status["request_status"])
def _detach_volume(instance, volume_id): volume = cinder.get_volume(volume_id) try: LOG.debug("Detaching volume %s from instance %s" % (volume_id, instance.instance_name)) nova.client().volumes.delete_server_volume(instance.instance_id, volume_id) except Exception: LOG.exception(_LE("Can't detach volume %s"), volume.id) detach_timeout = CONF.detach_volume_timeout LOG.debug("Waiting %d seconds to detach %s volume" % (detach_timeout, volume_id)) s_time = tu.utcnow() while tu.delta_seconds(s_time, tu.utcnow()) < detach_timeout: volume = cinder.get_volume(volume_id) if volume.status not in ['available', 'error']: context.sleep(2) else: LOG.debug("Volume %s has been detached" % volume_id) return else: LOG.warn( _LW("Can't detach volume %(volume)s. " "Current status of volume: %(status)s"), { 'volume': volume_id, 'status': volume.status })
def decommission_nodes(cluster, instances):
    dec_hosts = [i.fqdn() for i in instances]
    dn_hosts = [dn.fqdn() for dn in u.get_datanodes(cluster)]
    nm_hosts = [nm.fqdn() for nm in u.get_nodemanagers(cluster)]

    client = c.IntelClient(u.get_instance(cluster, 'manager'), cluster.name)

    dec_dn_hosts = []
    for dec_host in dec_hosts:
        if dec_host in dn_hosts:
            dec_dn_hosts.append(dec_host)

    if dec_dn_hosts:
        client.services.hdfs.decommission_nodes(dec_dn_hosts)

        # TODO(alazarev) make timeout configurable (bug #1262897)
        timeout = 14400  # 4 hours
        cur_time = 0
        for host in dec_dn_hosts:
            while cur_time < timeout:
                if client.services.hdfs.get_datanode_status(
                        host) == 'Decomissioned':
                    break
                context.sleep(5)
                cur_time += 5
            else:
                LOG.warn("Failed to decomission node '%s' of cluster '%s' "
                         "in %s minutes" % (host, cluster.name, timeout / 60))

    client.nodes.stop(dec_hosts)

    # wait stop services
    # TODO(alazarev) make timeout configurable (bug #1262897)
    timeout = 600  # 10 minutes
    cur_time = 0
    for instance in instances:
        while cur_time < timeout:
            stopped = True
            if instance.fqdn() in dn_hosts:
                stopped = stopped and _is_hadoop_service_stopped(
                    instance, 'hadoop-hdfs-datanode')
            if instance.fqdn() in nm_hosts:
                stopped = stopped and _is_hadoop_service_stopped(
                    instance, 'hadoop-yarn-nodemanager')
            if stopped:
                break
            else:
                context.sleep(5)
                cur_time += 5
        else:
            LOG.warn("Failed to stop services on node '%s' of cluster '%s' "
                     "in %s minutes" % (instance, cluster.name, timeout / 60))

    for node in dec_hosts:
        LOG.info("Deleting node '%s' on cluster '%s'" % (node, cluster.name))
        client.nodes.delete(node)
def wait_stack_completion(stack, is_update=False, last_updated_time=None):
    base.execute_with_retries(stack.get)
    while not _verify_completion(stack, is_update, last_updated_time):
        context.sleep(1)
        base.execute_with_retries(stack.get)

    if stack.status != 'COMPLETE':
        raise ex.HeatStackException(stack.stack_status_reason)
def _make_checks(self, instance_info, sleep=True):
    ctx = context.ctx()
    if sleep:
        context.sleep(2)
    current_instance_info = ctx.current_instance_info
    self.assertEqual(instance_info, current_instance_info)
def poll(get_status, kwargs=None, args=None, operation_name=None, timeout_name=None, timeout=DEFAULT_TIMEOUT, sleep=DEFAULT_SLEEP_TIME, exception_strategy='raise'): """This util poll status of object obj during some timeout. :param get_status: function, which return current status of polling as Boolean :param kwargs: keyword arguments of function get_status :param operation_name: name of polling process :param timeout_name: name of timeout option :param timeout: value of timeout in seconds. By default, it equals to 3 hours :param sleep: duration between two consecutive executions of get_status function :param exception_strategy: possible values ('raise', 'mark_as_true', 'mark_as_false'). If exception_strategy is 'raise' exception would be raised. If exception_strategy is 'mark_as_true', return value of get_status would marked as True, and in case of 'mark_as_false' - False. By default it's 'raise'. """ start_time = timeutils.utcnow() # We shouldn't raise TimeoutException if incorrect timeout specified and # status is ok now. In such way we should execute get_status at least once. at_least_once = True if not kwargs: kwargs = {} if not args: args = () while at_least_once or _get_consumed(start_time) < timeout: at_least_once = False try: status = get_status(*args, **kwargs) except BaseException: if exception_strategy == 'raise': raise elif exception_strategy == 'mark_as_true': status = True else: status = False if status: operation = "Operation" if operation_name: operation = "Operation with name {op_name}".format( op_name=operation_name) LOG.debug('{operation_desc} was executed successfully in timeout ' '{timeout}'.format(operation_desc=operation, timeout=timeout)) return context.sleep(sleep) raise ex.TimeoutException(timeout, operation_name, timeout_name)
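A minimal usage sketch for poll. The predicate below is a hypothetical helper built on the cinder wrapper used elsewhere in this section; the operation name and timings are illustrative:

    # Hypothetical predicate: True once the volume reaches 'available'.
    def _volume_available(volume_id):
        return cinder.get_volume(volume_id).status == 'available'

    # Poll every 2 seconds for up to 5 minutes, raising TimeoutException
    # if the volume never becomes available.
    poll(_volume_available, args=(volume.id,),
         operation_name='await volume availability',
         timeout=300, sleep=2)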
def wait_stack_completion(cluster, is_update=False, last_updated_time=None):
    stack_name = cluster.stack_name
    stack = get_stack(stack_name)
    while not _verify_completion(stack, is_update, last_updated_time):
        context.sleep(1)
        stack = get_stack(stack_name)

    if stack.status != 'COMPLETE':
        raise ex.HeatStackException(stack.stack_status_reason)
def wait_till_active(self):
    while self.heat_stack.stack_status in ('CREATE_IN_PROGRESS',
                                           'UPDATE_IN_PROGRESS'):
        context.sleep(1)
        self.heat_stack.get()

    if self.heat_stack.stack_status not in ('CREATE_COMPLETE',
                                            'UPDATE_COMPLETE'):
        raise ex.HeatStackException(self.heat_stack.stack_status)
def wait_stack_completion(stack):
    # NOTE: an empty status is expected because the stack status may not
    # be set yet in the heat database
    while stack.status in ['IN_PROGRESS', '']:
        context.sleep(1)
        stack.get()

    if stack.status != 'COMPLETE':
        raise ex.HeatStackException(stack.stack_status)
def update_configs(self, instances):
    # instances non-empty
    cpo.add_provisioning_step(instances[0].cluster_id,
                              _("Update configs"), len(instances))

    with context.ThreadGroup() as tg:
        for instance in instances:
            tg.spawn("update-configs-%s" % instance.instance_name,
                     self._update_configs, instance)
            context.sleep(1)
def _make_checks(self, instance, sleep=True):
    ctx = context.ctx()
    if sleep:
        context.sleep(2)
    current_instance_info = ctx.current_instance_info
    expected = [None, instance.id, instance.name, None]
    self.assertEqual(expected, current_instance_info)
def _wait_all_processes_removed(cluster, instance):
    ambari = plugin_utils.get_instance(cluster, p_common.AMBARI_SERVER)
    password = cluster.extra["ambari_password"]
    with ambari_client.AmbariClient(ambari, password=password) as client:
        while True:
            hdp_processes = client.list_host_processes(cluster.name, instance)
            if not hdp_processes:
                return
            context.sleep(5)
def cancel_job(job_execution_id):
    ctx = context.ctx()
    job_execution = conductor.job_execution_get(ctx, job_execution_id)
    if job_execution.info['status'] in edp.JOB_STATUSES_TERMINATED:
        return job_execution
    cluster = conductor.cluster_get(ctx, job_execution.cluster_id)
    if cluster is None:
        return job_execution

    engine = _get_job_engine(cluster, job_execution)
    if engine is not None:
        job_execution = conductor.job_execution_update(
            ctx, job_execution_id,
            {'info': {'status': edp.JOB_STATUS_TOBEKILLED}})

        timeout = CONF.job_canceling_timeout
        s_time = timeutils.utcnow()
        while timeutils.delta_seconds(s_time, timeutils.utcnow()) < timeout:
            if job_execution.info['status'] not in edp.JOB_STATUSES_TERMINATED:
                try:
                    job_info = engine.cancel_job(job_execution)
                except Exception as ex:
                    job_info = None
                    LOG.exception(
                        _LE("Error during cancel of job execution %(job)s: "
                            "%(error)s"),
                        {'job': job_execution.id, 'error': ex})
                if job_info is not None:
                    job_execution = _write_job_status(job_execution, job_info)
                    LOG.info(_LI("Job execution %s was canceled "
                                 "successfully"), job_execution.id)
                    return job_execution
                context.sleep(3)
                job_execution = conductor.job_execution_get(
                    ctx, job_execution_id)
                if not job_execution:
                    LOG.info(
                        _LI("Job execution %(job_exec_id)s was deleted. "
                            "Canceling current operation."),
                        {'job_exec_id': job_execution_id})
                    return job_execution
            else:
                LOG.info(
                    _LI("Job execution status %(job)s: %(status)s"),
                    {'job': job_execution.id,
                     'status': job_execution.info['status']})
                return job_execution
        else:
            raise e.CancelingFailed(
                _('Job execution %s was not canceled') % job_execution.id)
def start_cluster(self, cluster):
    start_helper.exec_configure_sh_on_cluster(
        cluster, self.get_configure_sh_string(cluster))
    start_helper.wait_for_mfs_unlock(cluster, self.get_waiting_script())
    start_helper.setup_maprfs_on_cluster(cluster,
                                         self.get_disk_setup_script())
    start_helper.start_zookeeper_nodes_on_cluster(cluster)
    start_helper.start_warden_on_cldb_nodes(cluster)
    context.sleep(SIXTY_SECONDS)
    start_helper.start_warden_on_other_nodes(cluster)
    start_helper.start_ecosystem(self.get_context(cluster))
def poll(get_status, kwargs=None, args=None, operation_name=None, timeout_name=None, timeout=DEFAULT_TIMEOUT, sleep=DEFAULT_SLEEP_TIME, exception_strategy='raise'): """This util poll status of object obj during some timeout. :param get_status: function, which return current status of polling as Boolean :param kwargs: keyword arguments of function get_status :param operation_name: name of polling process :param timeout_name: name of timeout option :param timeout: value of timeout in seconds. By default, it equals to 3 hours :param sleep: duration between two consecutive executions of get_status function :param exception_strategy: possible values ('raise', 'mark_as_true', 'mark_as_false'). If exception_strategy is 'raise' exception would be raised. If exception_strategy is 'mark_as_true', return value of get_status would marked as True, and in case of 'mark_as_false' - False. By default it's 'raise'. """ start_time = timeutils.utcnow() # We shouldn't raise TimeoutException if incorrect timeout specified and # status is ok now. In such way we should execute get_status at least once. at_least_once = True if not kwargs: kwargs = {} if not args: args = () while at_least_once or _get_consumed(start_time) < timeout: at_least_once = False try: status = get_status(*args, **kwargs) except BaseException: if exception_strategy == 'raise': raise elif exception_strategy == 'mark_as_true': status = True else: status = False if status: operation = "Operation" if operation_name: operation = "Operation with name {op_name}".format( op_name=operation_name) LOG.debug( '{operation_desc} was executed successfully in timeout ' '{timeout}' .format(operation_desc=operation, timeout=timeout)) return context.sleep(sleep) raise ex.TimeoutException(timeout, operation_name, timeout_name)
def decommission_tt(jt, inst_to_be_deleted, survived_inst):
    with remote.get_remote(jt) as r:
        r.write_file_to('/etc/hadoop/tt.excl',
                        utils.generate_fqdn_host_names(
                            inst_to_be_deleted))
        run.refresh_nodes(remote.get_remote(jt), "mradmin")
        context.sleep(3)
        r.write_files_to({'/etc/hadoop/tt.incl':
                          utils.generate_fqdn_host_names(survived_inst),
                          '/etc/hadoop/tt.excl': "",
                          })
def start_cluster(self, cluster):
    start_helper.exec_configure_sh_on_cluster(
        cluster, self.get_configure_sh_string(cluster))
    start_helper.wait_for_mfs_unlock(cluster, self.get_waiting_script())
    start_helper.setup_maprfs_on_cluster(
        cluster, self.get_disk_setup_script())
    start_helper.start_zookeeper_nodes_on_cluster(cluster)
    start_helper.start_warden_on_cldb_nodes(cluster)
    context.sleep(SIXTY_SECONDS)
    start_helper.start_warden_on_other_nodes(cluster)
    start_helper.start_ecosystem(self.get_context(cluster))
def _await_attach_volumes(instance, count_volumes):
    timeout = 10
    step = 2
    while timeout > 0:
        if len(_get_unmounted_devices(instance)) == count_volumes:
            return

        timeout -= step
        context.sleep(step)

    raise RuntimeError("Error attaching volume to instance %s" %
                       instance.instance_name)
def decommission_tt(jt, inst_to_be_deleted, survived_inst):
    with remote.get_remote(jt) as r:
        r.write_file_to('/etc/hadoop/tt.excl',
                        utils.generate_fqdn_host_names(inst_to_be_deleted))
        run.refresh_nodes(remote.get_remote(jt), "mradmin")
        context.sleep(3)
        r.write_files_to({
            '/etc/hadoop/tt.incl':
                utils.generate_fqdn_host_names(survived_inst),
            '/etc/hadoop/tt.excl': "",
        })
def _await_attach_volumes(instance, devices): timeout = 10 step = 2 while timeout > 0: if _count_attached_devices(instance, devices) == len(devices): return timeout -= step context.sleep(step) raise ex.SystemError(_("Error attach volume to instance %s") % instance.instance_name)
def _await_attach_volumes(instance, devices):
    timeout = 10
    step = 2
    while timeout > 0:
        if _count_attached_devices(instance, devices) == len(devices):
            return

        timeout -= step
        context.sleep(step)

    raise ex.SystemError(
        _("Error attaching volume to instance %s") % instance.instance_name)
def wait_ambari_request(self, request_id, cluster_name):
    while True:
        status = self.check_request_status(cluster_name, request_id)
        LOG.debug("Task %s in %s state. Completed %.1f%%" % (
            status["request_context"], status["request_status"],
            status["progress_percent"]))
        if status["request_status"] == "COMPLETED":
            return
        if status["request_status"] in ["IN_PROGRESS", "PENDING"]:
            context.sleep(5)
        else:
            raise p_exc.HadoopProvisionError(
                _("Ambari request in %s state") % status["request_status"])
def cancel_job(job_execution_id):
    ctx = context.ctx()
    job_execution = conductor.job_execution_get(ctx, job_execution_id)
    if job_execution.info['status'] in edp.JOB_STATUSES_TERMINATED:
        LOG.info(_LI("Job execution is already finished and shouldn't be"
                     " canceled"))
        return job_execution
    cluster = conductor.cluster_get(ctx, job_execution.cluster_id)
    if cluster is None:
        LOG.info(_LI("Can not cancel this job on a non-existent cluster."))
        return job_execution

    engine = get_job_engine(cluster, job_execution)
    if engine is not None:
        job_execution = conductor.job_execution_update(
            ctx, job_execution_id,
            {'info': {'status': edp.JOB_STATUS_TOBEKILLED}})

        timeout = CONF.job_canceling_timeout
        s_time = timeutils.utcnow()
        while timeutils.delta_seconds(s_time, timeutils.utcnow()) < timeout:
            if job_execution.info['status'] not in edp.JOB_STATUSES_TERMINATED:
                try:
                    job_info = engine.cancel_job(job_execution)
                except Exception as ex:
                    job_info = None
                    LOG.warning(_LW("Error during cancel of job execution: "
                                    "{error}").format(error=ex))
                if job_info is not None:
                    job_execution = _write_job_status(job_execution, job_info)
                    LOG.info(_LI("Job execution was canceled successfully"))
                    return job_execution
                context.sleep(3)
                job_execution = conductor.job_execution_get(
                    ctx, job_execution_id)
                if not job_execution:
                    LOG.info(_LI("Job execution was deleted. "
                                 "Canceling current operation."))
                    return job_execution
            else:
                LOG.info(_LI("Job execution status: {status}").format(
                    status=job_execution.info['status']))
                return job_execution
        else:
            raise e.CancelingFailed(
                _('Job execution %s was not canceled') % job_execution.id)
def _create_attach_volume(ctx, instance, size, display_name=None): volume = cinder.client().volumes.create(size=size, display_name=display_name) conductor.append_volume(ctx, instance, volume.id) while volume.status != 'available': volume = cinder.get_volume(volume.id) if volume.status == 'error': raise ex.SystemError("Volume %s has error status" % volume.id) context.sleep(1) resp = nova.client().volumes.create_server_volume( instance.instance_id, volume.id, None) return resp.device
def decommission_nodes(cluster, instances, configure_sh_string):
    LOG.info(_LI('Start decommission. Cluster = %s'), cluster.name)
    move_node(cluster, instances)
    stop_services(cluster, instances)
    context.sleep(names.WAIT_NODE_ALARM_NO_HEARTBEAT)
    remove_node(cluster, instances)
    remove_services(cluster, instances)
    if check_for_cldb_or_zookeeper_service(instances):
        all_instances = gen.get_instances(cluster)
        current_cluster_instances = [
            x for x in all_instances if x not in instances]
        for inst in current_cluster_instances:
            start_helper.exec_configure_sh_on_instance(
                cluster, inst, configure_sh_string)
    LOG.info(_LI('End decommission. Cluster = %s'), cluster.name)
def _create_attach_volume(ctx, instance, size, display_name=None): volume = cinder.client().volumes.create(size=size, display_name=display_name) conductor.append_volume(ctx, instance, volume.id) while volume.status != 'available': volume = cinder.get_volume(volume.id) if volume.status == 'error': raise ex.SystemError(_("Volume %s has error status") % volume.id) context.sleep(1) resp = nova.client().volumes.create_server_volume(instance.instance_id, volume.id, None) return resp.device
def cancel_job(job_execution_id):
    ctx = context.ctx()
    job_execution = conductor.job_execution_get(ctx, job_execution_id)
    if job_execution.info['status'] in edp.JOB_STATUSES_TERMINATED:
        return job_execution
    cluster = conductor.cluster_get(ctx, job_execution.cluster_id)
    if cluster is None:
        return job_execution

    engine = _get_job_engine(cluster, job_execution)
    if engine is not None:
        job_execution = conductor.job_execution_update(
            ctx, job_execution_id,
            {'info': {'status': edp.JOB_STATUS_TOBEKILLED}})

        timeout = CONF.job_canceling_timeout
        s_time = timeutils.utcnow()
        while timeutils.delta_seconds(s_time, timeutils.utcnow()) < timeout:
            if job_execution.info['status'] not in edp.JOB_STATUSES_TERMINATED:
                try:
                    job_info = engine.cancel_job(job_execution)
                except Exception as ex:
                    job_info = None
                    LOG.warning(
                        _LW("Error during cancel of job execution {job}: "
                            "{error}").format(job=job_execution.id, error=ex))
                if job_info is not None:
                    job_execution = _write_job_status(job_execution, job_info)
                    LOG.info(_LI("Job execution {job_id} was canceled "
                                 "successfully").format(
                        job_id=job_execution.id))
                    return job_execution
                context.sleep(3)
                job_execution = conductor.job_execution_get(
                    ctx, job_execution_id)
                if not job_execution:
                    LOG.info(_LI("Job execution {job_exec_id} was deleted. "
                                 "Canceling current operation.").format(
                        job_exec_id=job_execution_id))
                    return job_execution
            else:
                LOG.info(_LI("Job execution status {job}: {status}").format(
                    job=job_execution.id,
                    status=job_execution.info['status']))
                return job_execution
        else:
            raise e.CancelingFailed(_('Job execution %s was not canceled') %
                                    job_execution.id)
def delete_stack(cluster):
    stack_name = cluster.stack_name
    base.execute_with_retries(client().stacks.delete, stack_name)
    stack = get_stack(stack_name, raise_on_missing=False)
    while stack is not None:
        # Valid states: IN_PROGRESS, empty and COMPLETE
        if stack.status in ['IN_PROGRESS', '', 'COMPLETE']:
            context.sleep(5)
        else:
            raise ex.HeatStackException(
                message=_(
                    "Cannot delete heat stack {name}, reason: "
                    "stack status: {status}, status reason: "
                    "{reason}").format(name=stack_name, status=stack.status,
                                       reason=stack.stack_status_reason))
        stack = get_stack(stack_name, raise_on_missing=False)
def _check_decommission(cluster, instances, check_func, timeout):
    s_time = timeutils.utcnow()
    while timeutils.delta_seconds(s_time, timeutils.utcnow()) < timeout:
        statuses = check_func(cluster)
        dec_ok = True
        for instance in instances:
            if statuses[instance.fqdn()] != 'decommissioned':
                dec_ok = False

        if dec_ok:
            return
        else:
            context.sleep(5)
    else:
        # The original created the exception without raising it; raise it so
        # that hitting the timeout actually fails the operation.
        raise ex.SaharaException("Cannot finish decommission in %d seconds" %
                                 timeout)