def _await_networks(self, cluster, instances):
    if not instances:
        return

    ips_assigned = set()
    while len(ips_assigned) != len(instances):
        if not g.check_cluster_exists(instances[0].node_group.cluster):
            return
        for instance in instances:
            if instance.id not in ips_assigned:
                if networks.init_instances_ips(instance):
                    ips_assigned.add(instance.id)

        context.sleep(1)

    LOG.info("Cluster '%s': all instances have IPs assigned" % cluster.id)

    ctx = context.ctx()
    cluster = conductor.cluster_get(ctx, instances[0].node_group.cluster)
    instances = g.get_instances(cluster, ips_assigned)

    with context.ThreadGroup() as tg:
        for instance in instances:
            tg.spawn("wait-for-ssh-%s" % instance.instance_name,
                     self._wait_until_accessible, instance)

    LOG.info("Cluster '%s': all instances are accessible" % cluster.id)
def _configure_instances(cluster):
    """Configure active instances.

    * generate /etc/hosts
    * setup passwordless login
    * etc.
    """
    hosts = _generate_etc_hosts(cluster)
    for node_group in cluster.node_groups:
        for instance in node_group.instances:
            with remote.get_remote(instance) as r:
                r.write_file_to('etc-hosts', hosts)
                r.execute_command('sudo mv etc-hosts /etc/hosts')

                # wait for the id_rsa key to be generated
                timeout = 10
                cur_time = 0
                while cur_time < timeout:
                    code, _ = r.execute_command('ls .ssh/id_rsa',
                                                raise_when_error=False)
                    if code:
                        cur_time += 1
                        context.sleep(1)
                    else:
                        break
                else:
                    raise RuntimeError("Error getting user private key")

                r.execute_command('sudo chown $USER:$USER .ssh/id_rsa')
                r.execute_command('chmod 400 .ssh/id_rsa')
def _wait_for_host_registrations(self, num_hosts, ambari_host):
    LOG.info('Waiting for all Ambari agents to register with server ...')

    url = 'http://{0}:8080/api/v1/hosts'.format(ambari_host.management_ip)
    result = None
    json_result = None

    #TODO(jspeidel): timeout
    while result is None or len(json_result['items']) < num_hosts:
        context.sleep(5)
        try:
            result = requests.get(url, auth=(self.ambari_user,
                                             self.ambari_password))
            json_result = json.loads(result.text)

            # TODO(jspeidel): just for debug
            LOG.info('Registered Hosts: {0} of {1}'.format(
                len(json_result['items']), num_hosts))
            for hosts in json_result['items']:
                LOG.debug('Registered Host: {0}'.format(
                    hosts['Hosts']['host_name']))
        except requests.ConnectionError:
            #TODO(jspeidel): max wait time
            LOG.info('Waiting to connect to ambari server ...')
def decommission_dn(nn, inst_to_be_deleted, survived_inst):
    with remote.get_remote(nn) as r:
        r.write_file_to('/etc/hadoop/dn.excl',
                        utils.generate_fqdn_host_names(inst_to_be_deleted))
        run.refresh_nodes(remote.get_remote(nn), "dfsadmin")
        context.sleep(3)

        att_amount = 10
        while att_amount:
            cmd = r.execute_command(
                "sudo su -c 'hadoop dfsadmin -report' hadoop")
            all_found = True
            datanodes_info = parse_dfs_report(cmd[1])
            for i in inst_to_be_deleted:
                for dn in datanodes_info:
                    if (dn["Name"].startswith(i.internal_ip)) and (
                            dn["Decommission Status"] != "Decommissioned"):
                        all_found = False
                        break

            if all_found:
                r.write_files_to({
                    '/etc/hadoop/dn.incl':
                        utils.generate_fqdn_host_names(survived_inst),
                    '/etc/hadoop/dn.excl': "",
                })
                break
            context.sleep(3)
            att_amount -= 1

        if not att_amount:
            raise Exception("Cannot finish decommission")
def _await_networks(cluster, instances):
    if not instances:
        return

    ips_assigned = set()
    while len(ips_assigned) != len(instances):
        if not g.check_cluster_exists(instances[0].node_group.cluster):
            return
        for instance in instances:
            if instance.id not in ips_assigned:
                if networks.init_instances_ips(instance):
                    ips_assigned.add(instance.id)

        context.sleep(1)

    LOG.info("Cluster '%s': all instances have IPs assigned" % cluster.id)

    ctx = context.ctx()
    cluster = conductor.cluster_get(ctx, instances[0].node_group.cluster)
    instances = _get_instances(cluster, ips_assigned)

    with context.ThreadGroup() as tg:
        for instance in instances:
            tg.spawn("wait-for-ssh-%s" % instance.instance_name,
                     _wait_until_accessible, instance)

    LOG.info("Cluster '%s': all instances are accessible" % cluster.id)
def _wait_for_host_registrations(self, num_hosts, ambari_info):
    LOG.info(
        'Waiting for all Ambari agents to register with server ...')

    url = 'http://{0}/api/v1/hosts'.format(ambari_info.get_address())
    result = None
    json_result = None

    #TODO(jspeidel): timeout
    while result is None or len(json_result['items']) < num_hosts:
        context.sleep(5)
        try:
            result = requests.get(url, auth=(ambari_info.user,
                                             ambari_info.password))
            json_result = json.loads(result.text)

            # TODO(jspeidel): just for debug
            LOG.info('Registered Hosts: {0} of {1}'.format(
                len(json_result['items']), num_hosts))
            for hosts in json_result['items']:
                LOG.debug('Registered Host: {0}'.format(
                    hosts['Hosts']['host_name']))
        except requests.ConnectionError:
            #TODO(jspeidel): max wait time
            LOG.info('Waiting to connect to ambari server ...')
def _await_networks(instances):
    if not instances:
        return

    ips_assigned = set()
    while len(ips_assigned) != len(instances):
        if not _check_cluster_exists(instances[0].node_group.cluster):
            return
        for instance in instances:
            if instance.id not in ips_assigned:
                if networks.init_instances_ips(instance):
                    ips_assigned.add(instance.id)

        context.sleep(1)

    ctx = context.ctx()
    cluster = conductor.cluster_get(ctx, instances[0].node_group.cluster)
    instances = get_instances(cluster, ips_assigned)

    accessible_instances = set()
    while len(accessible_instances) != len(instances):
        if not _check_cluster_exists(instances[0].node_group.cluster):
            return
        for instance in instances:
            if instance.id not in accessible_instances:
                if _check_if_accessible(instance):
                    accessible_instances.add(instance.id)

        context.sleep(1)
def decommission_dn(nn, inst_to_be_deleted, survived_inst):
    with remote.get_remote(nn) as r:
        r.write_file_to("/etc/hadoop/dn.excl",
                        utils.generate_fqdn_host_names(inst_to_be_deleted))
        run.refresh_nodes(remote.get_remote(nn), "dfsadmin")
        context.sleep(3)

        att_amount = 100
        while att_amount:
            cmd = r.execute_command(
                "sudo su -c 'hadoop dfsadmin -report' hadoop")
            all_found = True
            datanodes_info = parse_dfs_report(cmd[1])
            for i in inst_to_be_deleted:
                for dn in datanodes_info:
                    if (dn["Name"].startswith(i.internal_ip)) and (
                            dn["Decommission Status"] != "Decommissioned"):
                        all_found = False
                        break

            if all_found:
                r.write_files_to(
                    {"/etc/hadoop/dn.incl":
                        utils.generate_fqdn_host_names(survived_inst),
                     "/etc/hadoop/dn.excl": ""}
                )
                break
            context.sleep(3)
            att_amount -= 1

        if not att_amount:
            raise Exception("Cannot finish decommission")
def decommission_tt(jt, inst_to_be_deleted, survived_inst):
    with remote.get_remote(jt) as r:
        r.write_file_to("/etc/hadoop/tt.excl",
                        utils.generate_fqdn_host_names(inst_to_be_deleted))
        run.refresh_nodes(remote.get_remote(jt), "mradmin")
        context.sleep(3)
        r.write_files_to(
            {"/etc/hadoop/tt.incl":
                utils.generate_fqdn_host_names(survived_inst),
             "/etc/hadoop/tt.excl": ""}
        )
def wait_till_active(self):
    while self.heat_stack.stack_status in ('CREATE_IN_PROGRESS',
                                           'UPDATE_IN_PROGRESS'):
        context.sleep(1)
        self.heat_stack.get()

    if self.heat_stack.stack_status not in ('CREATE_COMPLETE',
                                            'UPDATE_COMPLETE'):
        raise ex.HeatStackException(self.heat_stack.stack_status)
def _await_instances(cluster):
    """Wait until all instances are in Active status and available."""
    all_up = False
    while not all_up:
        all_up = True
        for node_group in cluster.node_groups:
            for instance in node_group.instances:
                if not _check_if_up(instance):
                    all_up = False

        context.sleep(1)
def _await_attach_volume(instance, device_path):
    timeout = 10
    for _ in six.moves.xrange(timeout):
        device_paths = _get_device_paths(instance)
        if device_path in device_paths:
            return
        else:
            context.sleep(1)

    raise RuntimeError("Error attaching volume to instance %s" %
                       instance.instance_name)
def decommission_tt(jt, inst_to_be_deleted, survived_inst):
    with jt.remote as r:
        r.write_file_to('/etc/hadoop/tt.excl',
                        utils.generate_fqdn_host_names(inst_to_be_deleted))
        run.refresh_nodes(jt.remote, "mradmin")
        context.sleep(3)
        r.write_files_to({'/etc/hadoop/tt.incl':
                          utils.generate_fqdn_host_names(survived_inst),
                          '/etc/hadoop/tt.excl': "",
                          })
def _await_attach_volumes(instance, count_volumes):
    timeout = 10
    step = 2
    while timeout > 0:
        if len(_get_unmounted_devices(instance)) == count_volumes:
            return

        timeout -= step
        context.sleep(step)

    raise RuntimeError("Error attaching volumes to instance %s" %
                       instance.instance_name)
def decommission_tt(jt, inst_to_be_deleted, survived_inst):
    with remote.get_remote(jt) as r:
        r.write_file_to('/etc/hadoop/tt.excl',
                        utils.generate_fqdn_host_names(inst_to_be_deleted))
        run.refresh_nodes(remote.get_remote(jt), "mradmin")
        context.sleep(3)
        r.write_files_to({
            '/etc/hadoop/tt.incl':
                utils.generate_fqdn_host_names(survived_inst),
            '/etc/hadoop/tt.excl': "",
        })
def _await_active(instances):
    """Wait until all instances are in Active status and available."""
    if not instances:
        return

    active_ids = set()
    while len(active_ids) != len(instances):
        if not _check_cluster_exists(instances[0].node_group.cluster):
            return
        for instance in instances:
            if instance.id not in active_ids:
                if _check_if_active(instance):
                    active_ids.add(instance.id)

        context.sleep(1)
def _create_attach_volume(instance, size, device_path, display_name=None,
                          volume_type=None):
    volume = cinder.client().volumes.create(size=size,
                                            display_name=display_name,
                                            volume_type=volume_type)
    instance.volumes.append(volume.id)

    while volume.status != 'available':
        volume = cinder.get_volume(volume.id)
        if volume.status == 'error':
            raise RuntimeError("Volume %s has error status" % volume.id)

        context.sleep(1)

    nova.client().volumes.create_server_volume(instance.instance_id,
                                               volume.id, device_path)
def _create_attach_volume(ctx, instance, size, display_name=None,
                          volume_type=None):
    volume = cinder.client().volumes.create(size=size,
                                            display_name=display_name,
                                            volume_type=volume_type)
    conductor.append_volume(ctx, instance, volume.id)

    while volume.status != 'available':
        volume = cinder.get_volume(volume.id)
        if volume.status == 'error':
            raise RuntimeError("Volume %s has error status" % volume.id)

        context.sleep(1)

    nova.client().volumes.create_server_volume(instance.instance_id,
                                               volume.id, None)
def _wait_until_accessible(instance):
    while True:
        try:
            # check if ssh is accessible and cloud-init
            # script is finished generating authorized_keys
            exit_code, stdout = instance.remote().execute_command(
                "ls .ssh/authorized_keys", raise_when_error=False)

            if exit_code == 0:
                LOG.debug("Instance %s is accessible"
                          % instance.instance_name)
                return
        except Exception as ex:
            LOG.debug("Can't login to node %s (%s), reason %s",
                      instance.instance_name, instance.management_ip, ex)

        context.sleep(5)

        if not g.check_cluster_exists(instance.node_group.cluster):
            return
def _await_active(self, cluster, instances):
    """Wait until all instances are in Active status and available."""
    if not instances:
        return

    active_ids = set()
    while len(active_ids) != len(instances):
        if not g.check_cluster_exists(instances[0].node_group.cluster):
            return
        for instance in instances:
            if instance.id not in active_ids:
                if self._check_if_active(instance):
                    active_ids.add(instance.id)

        context.sleep(1)

    LOG.info("Cluster '%s': all instances are active" % cluster.id)
def run_in_subprocess(proc, func, args=(), kwargs={}):
    try:
        pickle.dump(func, proc.stdin)
        pickle.dump(args, proc.stdin)
        pickle.dump(kwargs, proc.stdin)
        proc.stdin.flush()

        result = pickle.load(proc.stdout)

        if 'exception' in result:
            raise SubprocessException(result['exception'])

        return result['output']
    finally:
        # NOTE(dmitryme): in openstack/common/processutils.py it is
        # suggested to sleep a little between calls to multiprocessing.
        # That should allow it to do some necessary cleanup.
        context.sleep(0)
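# A minimal sketch (not part of the source) of the worker side that
# run_in_subprocess() above appears to assume: read the pickled callable and
# its arguments from the parent process over stdin, run the call, and pickle
# back a dict carrying either an 'output' or an 'exception' key. The function
# name and the direct use of sys.stdin/sys.stdout (Python 2 style, matching
# the era of this code) are illustrative assumptions.
import pickle
import sys
import traceback


def _subprocess_worker(stdin=sys.stdin, stdout=sys.stdout):
    func = pickle.load(stdin)
    args = pickle.load(stdin)
    kwargs = pickle.load(stdin)
    try:
        result = {'output': func(*args, **kwargs)}
    except BaseException:
        result = {'exception': traceback.format_exc()}
    pickle.dump(result, stdout)
    stdout.flush()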
def _wait_for_async_request(self, request_url, ambari_info):
    started = False
    while not started:
        result = self._get(request_url, ambari_info)
        LOG.debug(
            'async request ' + request_url + ' response:\n' + result.text)
        json_result = json.loads(result.text)
        started = True
        for items in json_result['items']:
            status = items['Tasks']['status']
            if status == 'FAILED' or status == 'ABORTED':
                return False
            else:
                if status != 'COMPLETED':
                    started = False

        context.sleep(5)

    return started
def _wait_for_async_request(self, request_url, auth):
    started = False
    while not started:
        result = requests.get(request_url, auth=auth)
        LOG.debug(
            'async request ' + request_url + ' response:\n' + result.text)
        json_result = json.loads(result.text)
        started = True
        for items in json_result['items']:
            status = items['Tasks']['status']
            if status == 'FAILED' or status == 'ABORTED':
                return False
            else:
                if status != 'COMPLETED':
                    started = False

        context.sleep(5)

    return started
def start(self):
    url = ('/cluster/%s/services/%s/commands/start'
           % (self.cluster_name, self.service))
    self.rest.post(url)

    #TODO(alazarev) make timeout configurable (bug #1262897)
    timeout = 600
    cur_time = 0
    while cur_time < timeout:
        context.sleep(2)
        if self.status() == 'running':
            break
        else:
            cur_time += 2
    else:
        raise iex.IntelPluginException(
            "Service '%s' has failed to start in %s seconds"
            % (self.service, timeout))
def _wait_until_accessible(instance):
    while True:
        try:
            # check if ssh is accessible and cloud-init
            # script is finished generating id_rsa
            exit_code, stdout = instance.remote.execute_command(
                "ls .ssh/id_rsa", raise_when_error=False)

            if exit_code == 0:
                LOG.debug('Instance %s is accessible'
                          % instance.instance_name)
                return
        except Exception as ex:
            LOG.debug("Can't login to node %s (%s), reason %s",
                      instance.instance_name, instance.management_ip, ex)

        context.sleep(5)

        if not _check_cluster_exists(instance.node_group.cluster):
            return
def _wait_for_async_request(self, request_id, cluster_name, ambari_host):
    request_url = ("http://{0}:8080/api/v1/clusters/{1}/requests/"
                   "{2}/tasks?fields=Tasks/status").format(
        ambari_host.management_ip, cluster_name, request_id)
    started = False
    while not started:
        result = requests.get(request_url, auth=(self.ambari_user,
                                                 self.ambari_password))
        LOG.debug("async request " + request_url + " response:\n" +
                  result.text)
        json_result = json.loads(result.text)
        started = True
        for items in json_result["items"]:
            status = items["Tasks"]["status"]
            if status == "FAILED" or status == "ABORTED":
                return False
            else:
                if status != "COMPLETED":
                    started = False

        context.sleep(5)

    return started
def _await_datanodes(self, cluster):
    datanodes_count = len(utils.get_datanodes(cluster))
    if datanodes_count < 1:
        return

    LOG.info("Waiting for %s datanodes to start up" % datanodes_count)
    with remote.get_remote(utils.get_namenode(cluster)) as r:
        while True:
            if run.check_datanodes_count(r, datanodes_count):
                LOG.info('Datanodes on cluster %s have been started'
                         % cluster.name)
                return

            context.sleep(1)

            if not g.check_cluster_exists(cluster):
                LOG.info('Stopped waiting for datanodes on cluster %s since '
                         'it has been deleted' % cluster.name)
                return
def _await_datanodes(self, cluster):
    datanodes_count = len(utils.get_datanodes(cluster))
    if datanodes_count < 1:
        return

    LOG.info("Waiting for %s datanodes to start up" % datanodes_count)
    with remote.get_remote(utils.get_namenode(cluster)) as r:
        while True:
            if run.check_datanodes_count(r, datanodes_count):
                LOG.info('Datanodes on cluster %s have been started'
                         % cluster.name)
                return

            context.sleep(1)

            if not g.check_cluster_exists(cluster):
                LOG.info('Stopped waiting for datanodes on cluster %s since '
                         'it has been deleted' % cluster.name)
                return
def _wait_for_host_registrations(self, num_hosts, ambari_host):
    LOG.info("Waiting for all Ambari agents to register with server ...")

    url = "http://{0}:8080/api/v1/hosts".format(ambari_host.management_ip)
    result = None
    json_result = None

    # TODO(jspeidel): timeout
    while result is None or len(json_result["items"]) < num_hosts:
        context.sleep(5)
        try:
            result = requests.get(url, auth=(self.ambari_user,
                                             self.ambari_password))
            json_result = json.loads(result.text)

            # TODO(jspeidel): just for debug
            LOG.info("Registered Hosts: {0} of {1}".format(
                len(json_result["items"]), num_hosts))
            for hosts in json_result["items"]:
                LOG.debug("Registered Host: {0}".format(
                    hosts["Hosts"]["host_name"]))
        except requests.ConnectionError:
            # TODO(jspeidel): max wait time
            LOG.info("Waiting to connect to ambari server ...")
def _wait_for_async_request(self, request_id, cluster_name, ambari_host):
    request_url = ('http://{0}:8080/api/v1/clusters/{1}/requests/'
                   '{2}/tasks?fields=Tasks/status').format(
        ambari_host.management_ip, cluster_name, request_id)
    started = False
    while not started:
        result = requests.get(request_url, auth=('admin', 'admin'))
        LOG.debug(
            'async request ' + request_url + ' response:\n' + result.text)
        json_result = json.loads(result.text)
        started = True
        for items in json_result['items']:
            status = items['Tasks']['status']
            if status == 'FAILED' or status == 'ABORTED':
                return False
            else:
                if status != 'COMPLETED':
                    started = False

        context.sleep(5)

    return started
def wait(ctx, session_id):
    #TODO(lazarev) add check on savanna cluster state (exit on delete)
    #TODO(alazarev) make configurable (bug #1262897)
    timeout = 4 * 60 * 60  # 4 hours
    cur_time = 0
    while cur_time < timeout:
        info_items = get(ctx, session_id)['items']
        for item in info_items:
            progress = item['nodeprogress']
            if progress['info'].strip() == '_ALLFINISH':
                return
            else:
                context.sleep(10)
                cur_time += 10

            debug_msg = 'Hostname: %s\nInfo: %s'
            debug_msg = debug_msg % (progress['hostname'], progress['info'])
            LOG.debug(debug_msg)
    else:
        raise iex.IntelPluginException(
            "Cluster '%s' has failed to start in %s minutes"
            % (ctx.cluster_name, timeout / 60))
def wait_for_host_registrations(self, num_hosts, ambari_info):
    LOG.info('Waiting for all Ambari agents to register with server ...')

    url = 'http://{0}/api/v1/hosts'.format(ambari_info.get_address())
    result = None
    json_result = None

    #TODO(jspeidel): timeout
    while result is None or len(json_result['items']) < num_hosts:
        context.sleep(5)
        try:
            result = self._get(url, ambari_info)
            json_result = json.loads(result.text)

            LOG.info('Registered Hosts: {0} of {1}'.format(
                len(json_result['items']), num_hosts))
            for hosts in json_result['items']:
                LOG.debug('Registered Host: {0}'.format(
                    hosts['Hosts']['host_name']))
        except requests.ConnectionError:
            #TODO(jspeidel): max wait time
            LOG.info('Waiting to connect to ambari server ...')
def wait(ctx, session_id):
    #TODO(lazarev) add check on savanna cluster state (exit on delete)
    #TODO(alazarev) make configurable (bug #1262897)
    timeout = 4 * 60 * 60  # 4 hours
    cur_time = 0
    while cur_time < timeout:
        info_items = get(ctx, session_id)['items']
        for item in info_items:
            progress = item['nodeprogress']
            if progress['info'].strip() == '_ALLFINISH':
                return
            else:
                context.sleep(10)
                cur_time += 10

            debug_msg = 'Hostname: %s\nInfo: %s'
            debug_msg = debug_msg % (progress['hostname'], progress['info'])
            LOG.debug(debug_msg)
    else:
        raise iex.IntelPluginException(
            "Cluster '%s' has failed to start in %s minutes"
            % (ctx.cluster_name, timeout / 60))
def _await_instances(cluster):
    """Wait until all instances are in Active status and available."""
    ctx = context.ctx()
    all_up = False
    is_accessible = set()
    while not all_up:
        all_up = True

        for node_group in cluster.node_groups:
            for instance in node_group.instances:
                if not _check_if_up(instance):
                    all_up = False

        cluster = conductor.cluster_get(ctx, cluster)

        for node_group in cluster.node_groups:
            for instance in node_group.instances:
                if not _check_if_accessible(instance, is_accessible):
                    all_up = False

        context.sleep(1)

    return cluster
def _wait_for_async_request(self, request_id, cluster_name, ambari_host):
    request_url = ('http://{0}:8080/api/v1/clusters/{1}/requests/'
                   '{2}/tasks?fields=Tasks/status').format(
        ambari_host.management_ip, cluster_name, request_id)
    started = False
    while not started:
        result = requests.get(request_url, auth=(self.ambari_user,
                                                 self.ambari_password))
        LOG.debug('async request ' + request_url + ' response:\n' +
                  result.text)
        json_result = json.loads(result.text)
        started = True
        for items in json_result['items']:
            status = items['Tasks']['status']
            if status == 'FAILED' or status == 'ABORTED':
                return False
            else:
                if status != 'COMPLETED':
                    started = False

        context.sleep(5)

    return started
def decommission_nodes(cluster, instances):
    dec_hosts = [i.fqdn() for i in instances]
    dn_hosts = [dn.fqdn() for dn in u.get_datanodes(cluster)]
    tt_hosts = [tt.fqdn() for tt in u.get_tasktrackers(cluster)]

    mng_ip = u.get_instances(cluster, 'manager')[0].management_ip
    client = c.IntelClient(mng_ip, cluster.name)

    dec_dn_hosts = []
    for dec_host in dec_hosts:
        if dec_host in dn_hosts:
            dec_dn_hosts.append(dec_host)

    if dec_dn_hosts:
        client.services.hdfs.decommission_nodes(dec_dn_hosts)

        #TODO(alazarev) make timeout configurable (bug #1262897)
        timeout = 14400  # 4 hours
        cur_time = 0
        for host in dec_dn_hosts:
            while cur_time < timeout:
                if client.services.hdfs.get_datanode_status(
                        host) == 'Decomissioned':
                    break
                context.sleep(5)
                cur_time += 5
            else:
                LOG.warn("Failed to decommission node '%s' of cluster '%s' "
                         "in %s minutes" % (host, cluster.name, timeout / 60))

    client.nodes.stop(dec_hosts)

    # wait for the services to stop
    #TODO(alazarev) make timeout configurable (bug #1262897)
    timeout = 600  # 10 minutes
    cur_time = 0
    for instance in instances:
        while cur_time < timeout:
            stopped = True
            if instance.fqdn() in dn_hosts:
                code, out = instance.remote().execute_command(
                    'sudo /sbin/service hadoop-datanode status',
                    raise_when_error=False)
                if out.strip() != 'datanode is stopped':
                    stopped = False
                if out.strip() == 'datanode dead but pid file exists':
                    instance.remote().execute_command(
                        'sudo rm -f '
                        '/var/run/hadoop/hadoop-hadoop-datanode.pid')
            if instance.fqdn() in tt_hosts:
                code, out = instance.remote().execute_command(
                    'sudo /sbin/service hadoop-tasktracker status',
                    raise_when_error=False)
                if out.strip() != 'tasktracker is stopped':
                    stopped = False
            if stopped:
                break
            else:
                context.sleep(5)
                cur_time += 5
        else:
            LOG.warn("Failed to stop services on node '%s' of cluster '%s' "
                     "in %s minutes" % (instance, cluster.name, timeout / 60))

    for node in dec_hosts:
        LOG.info("Deleting node '%s' on cluster '%s'" % (node, cluster.name))
        client.nodes.delete(node)
def install_manager(cluster):
    LOG.info("Starting Install Manager Process")
    mng_instance = u.get_instance(cluster, 'manager')

    idh_tarball_path = c_helper.get_config_value(
        cluster.cluster_configs.get('general'), c_helper.IDH_TARBALL_URL)

    idh_tarball_filename = idh_tarball_path.rsplit('/', 1)[-1]
    idh_dir = idh_tarball_filename[:idh_tarball_filename.find('.tar.gz')]
    LOG.info("IDH tgz will be retrieved from: '%s'", idh_tarball_path)

    idh_repo = c_helper.get_config_value(
        cluster.cluster_configs.get('general'), c_helper.IDH_REPO_URL)

    os_repo = c_helper.get_config_value(
        cluster.cluster_configs.get('general'), c_helper.OS_REPO_URL)

    idh_install_cmd = 'sudo ./%s/install.sh --mode=silent 2>&1' % idh_dir

    with mng_instance.remote() as r:
        LOG.info("Download IDH manager")
        try:
            r.execute_command('curl -O %s 2>&1' % idh_tarball_path)
        except Exception as e:
            raise RuntimeError("Unable to download IDH manager from %s: %s"
                               % (idh_tarball_path, e))

        # unpack archive
        LOG.info("Unpack manager %s", idh_tarball_filename)
        try:
            r.execute_command('tar xzf %s 2>&1' % idh_tarball_filename)
        except Exception as e:
            raise RuntimeError("Unable to unpack tgz %s: %s"
                               % (idh_tarball_filename, e))

        # install idh
        LOG.debug("Install manager with %s : ", idh_install_cmd)
        inst_conf = _INST_CONF_TEMPLATE % (os_repo, idh_repo)
        r.write_file_to('%s/ui-installer/conf' % idh_dir, inst_conf)
        #TODO(alazarev) make timeout configurable (bug #1262897)
        r.execute_command(idh_install_cmd, timeout=3600)

        # fix nginx permissions bug
        r.execute_command('sudo chmod o+x /var/lib/nginx/ /var/lib/nginx/tmp '
                          '/var/lib/nginx/tmp/client_body')

    # wait for the IDH manager to start
    #TODO(alazarev) make timeout configurable (bug #1262897)
    timeout = 600
    timeout_left = timeout
    LOG.debug("Waiting %s seconds for Manager to start : ", timeout)
    while timeout_left:
        try:
            telnetlib.Telnet(mng_instance.management_ip, 9443)
            break
        except IOError:
            timeout_left -= 2
            context.sleep(2)
    else:
        message = ("IDH Manager failed to start in %s minutes on node '%s' "
                   "of cluster '%s'"
                   % (timeout / 60, mng_instance.management_ip, cluster.name))
        LOG.error(message)
        raise iex.IntelPluginException(message)
def install_manager(cluster):
    LOG.info("Starting Install Manager Process")
    mng_instance = u.get_instance(cluster, 'manager')

    idh_tarball_path = c_helper.get_config_value(
        cluster.cluster_configs.get('general'), c_helper.IDH_TARBALL_URL)

    idh_tarball_filename = idh_tarball_path.rsplit('/', 1)[-1]
    idh_dir = idh_tarball_filename[:idh_tarball_filename.find('.tar.gz')]
    LOG.info("IDH tgz will be retrieved from: '%s'", idh_tarball_path)

    idh_repo = c_helper.get_config_value(
        cluster.cluster_configs.get('general'), c_helper.IDH_REPO_URL)

    os_repo = c_helper.get_config_value(
        cluster.cluster_configs.get('general'), c_helper.OS_REPO_URL)

    idh_install_cmd = 'sudo ./%s/install.sh --mode=silent 2>&1' % idh_dir

    with mng_instance.remote() as r:
        LOG.info("Download IDH manager")
        try:
            r.execute_command('curl -O %s 2>&1' % idh_tarball_path)
        except Exception as e:
            raise RuntimeError("Unable to download IDH manager from %s: %s"
                               % (idh_tarball_path, e))

        # unpack archive
        LOG.info("Unpack manager %s", idh_tarball_filename)
        try:
            r.execute_command('tar xzf %s 2>&1' % idh_tarball_filename)
        except Exception as e:
            raise RuntimeError("Unable to unpack tgz %s: %s"
                               % (idh_tarball_filename, e))

        # install idh
        LOG.debug("Install manager with %s : ", idh_install_cmd)
        inst_conf = _INST_CONF_TEMPLATE % (os_repo, idh_repo)
        r.write_file_to('%s/ui-installer/conf' % idh_dir, inst_conf)
        #TODO(alazarev) make timeout configurable (bug #1262897)
        r.execute_command(idh_install_cmd, timeout=3600)

        # fix nginx permissions bug
        r.execute_command('sudo chmod o+x /var/lib/nginx/ /var/lib/nginx/tmp '
                          '/var/lib/nginx/tmp/client_body')

    # wait for the IDH manager to start
    #TODO(alazarev) make timeout configurable (bug #1262897)
    timeout = 600
    timeout_left = timeout
    LOG.debug("Waiting %s seconds for Manager to start : ", timeout)
    while timeout_left:
        try:
            telnetlib.Telnet(mng_instance.management_ip, 9443)
            break
        except IOError:
            timeout_left -= 2
            context.sleep(2)
    else:
        message = ("IDH Manager failed to start in %s minutes on node '%s' "
                   "of cluster '%s'"
                   % (timeout / 60, mng_instance.management_ip, cluster.name))
        LOG.error(message)
        raise iex.IntelPluginException(message)
def _add_element(self, lst, i):
    context.sleep(rnd.uniform(0, 0.1))
    lst.append(i)
def decommission_nodes(cluster, instances):
    dec_hosts = [i.fqdn() for i in instances]
    dn_hosts = [dn.fqdn() for dn in u.get_datanodes(cluster)]
    tt_hosts = [tt.fqdn() for tt in u.get_tasktrackers(cluster)]

    client = c.IntelClient(u.get_instance(cluster, 'manager'), cluster.name)

    dec_dn_hosts = []
    for dec_host in dec_hosts:
        if dec_host in dn_hosts:
            dec_dn_hosts.append(dec_host)

    if dec_dn_hosts:
        client.services.hdfs.decommission_nodes(dec_dn_hosts)

        #TODO(alazarev) make timeout configurable (bug #1262897)
        timeout = 14400  # 4 hours
        cur_time = 0
        for host in dec_dn_hosts:
            while cur_time < timeout:
                if client.services.hdfs.get_datanode_status(
                        host) == 'Decomissioned':
                    break
                context.sleep(5)
                cur_time += 5
            else:
                LOG.warn("Failed to decommission node '%s' of cluster '%s' "
                         "in %s minutes" % (host, cluster.name, timeout / 60))

    client.nodes.stop(dec_hosts)

    # wait for the services to stop
    #TODO(alazarev) make timeout configurable (bug #1262897)
    timeout = 600  # 10 minutes
    cur_time = 0
    for instance in instances:
        while cur_time < timeout:
            stopped = True
            if instance.fqdn() in dn_hosts:
                code, out = instance.remote().execute_command(
                    'sudo /sbin/service hadoop-datanode status',
                    raise_when_error=False)
                if out.strip() != 'datanode is stopped':
                    stopped = False
                if out.strip() == 'datanode dead but pid file exists':
                    instance.remote().execute_command(
                        'sudo rm -f '
                        '/var/run/hadoop/hadoop-hadoop-datanode.pid')
            if instance.fqdn() in tt_hosts:
                code, out = instance.remote().execute_command(
                    'sudo /sbin/service hadoop-tasktracker status',
                    raise_when_error=False)
                if out.strip() != 'tasktracker is stopped':
                    stopped = False
            if stopped:
                break
            else:
                context.sleep(5)
                cur_time += 5
        else:
            LOG.warn("Failed to stop services on node '%s' of cluster '%s' "
                     "in %s minutes" % (instance, cluster.name, timeout / 60))

    for node in dec_hosts:
        LOG.info("Deleting node '%s' on cluster '%s'" % (node, cluster.name))
        client.nodes.delete(node)
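# The call sites above share one busy-wait shape: evaluate a readiness check,
# context.sleep() between attempts, and give up once a timeout is exhausted.
# Below is a minimal sketch of that shared pattern, not code from the source;
# the helper name, signature and defaults are illustrative only.
def poll(predicate, timeout=600, sleep_time=5):
    cur_time = 0
    while cur_time < timeout:
        if predicate():
            return True
        context.sleep(sleep_time)
        cur_time += sleep_time
    return False


# Hypothetical usage, mirroring the service-start wait earlier in this list:
# if not poll(lambda: service.status() == 'running', sleep_time=2):
#     raise RuntimeError("Service has failed to start in 600 seconds")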