def prepare_upgrade_backup_namenode_dir():
  """
  During a NonRolling (aka Express Upgrade), preparing the NameNode requires
  backing up the NameNode Name Dirs.
  """
  import params

  i = 0
  failed_paths = []
  nn_name_dirs = params.dfs_name_dir.split(',')
  backup_destination_root_dir = "/tmp/upgrades/{0}".format(params.stack_version_unformatted)
  if len(nn_name_dirs) > 0:
    Logger.info("Backup the NameNode name directory's CURRENT folder.")
  for nn_dir in nn_name_dirs:
    i += 1
    namenode_current_image = os.path.join(nn_dir, "current")
    unique = get_unique_id_and_date() + "_" + str(i)
    # Note that /tmp may not be writeable.
    backup_current_folder = "{0}/namenode_{1}/".format(backup_destination_root_dir, unique)

    if os.path.isdir(namenode_current_image) and not os.path.isdir(backup_current_folder):
      try:
        os.makedirs(backup_current_folder)
        Execute(('cp', '-ar', namenode_current_image, backup_current_folder),
                sudo=True
        )
      except Exception, e:
        failed_paths.append(namenode_current_image)

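# Several of these snippets suffix paths with get_unique_id_and_date(). A minimal
# sketch of such a helper is shown below for illustration only; the real Ambari
# helper is not reproduced here, and combining a host-derived token with a date
# stamp is an assumption about its behavior.
def get_unique_id_and_date_sketch():
  import datetime
  import uuid
  host_token = uuid.getnode()  # MAC-derived integer, standing in for a host id
  date = datetime.datetime.now().strftime("%M%d%y")
  return "id{0}_date{1}".format(host_token, date)
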
def service_check(self, env):
  import params
  env.set_params(params)

  unique = get_unique_id_and_date()

  File("/tmp/wordCount.jar",
       content=StaticFile("wordCount.jar"),
       owner=params.storm_user
  )

  cmd = ""
  if params.nimbus_seeds_supported:
    # Because this command is guaranteed to run on one of the hosts with storm client, there is no need
    # to specify "-c nimbus.seeds={nimbus_seeds}"
    cmd = format("storm jar /tmp/wordCount.jar storm.starter.WordCountTopology WordCount{unique}")
  elif params.nimbus_host is not None:
    cmd = format("storm jar /tmp/wordCount.jar storm.starter.WordCountTopology WordCount{unique} -c nimbus.host={nimbus_host}")

  Execute(cmd,
          logoutput=True,
          path=params.storm_bin_dir,
          user=params.storm_user
  )

  Execute(format("storm kill WordCount{unique}"),
          path=params.storm_bin_dir,
          user=params.storm_user
  )

def hcat_service_check():
  import params

  unique = get_unique_id_and_date()
  output_file = format("{hive_apps_whs_dir}/hcatsmoke{unique}")
  test_cmd = format("fs -test -e {output_file}")

  if params.security_enabled:
    kinit_cmd = format("{kinit_path_local} -kt {smoke_user_keytab} {smokeuser_principal}; ")
  else:
    kinit_cmd = ""

  File(format("{tmp_dir}/hcatSmoke.sh"),
       content=StaticFile("hcatSmoke.sh"),
       mode=0755
  )

  prepare_cmd = format("{kinit_cmd}env JAVA_HOME={java64_home} {tmp_dir}/hcatSmoke.sh hcatsmoke{unique} prepare {purge_tables}")

  exec_path = params.execute_path
  if params.version and params.stack_name:
    upgrade_hive_bin = format("/usr/hdp/{version}/hive/bin")
    exec_path = os.environ['PATH'] + os.pathsep + params.hadoop_bin_dir + os.pathsep + upgrade_hive_bin

  Execute(prepare_cmd,
          tries=3,
          user=params.smokeuser,
          try_sleep=5,
          path=['/usr/sbin', '/usr/local/bin', '/bin', '/usr/bin', exec_path],
          logoutput=True)

  if params.security_enabled:
    ExecuteHadoop(test_cmd,
                  user=params.hdfs_user,
                  logoutput=True,
                  conf_dir=params.hadoop_conf_dir,
                  security_enabled=params.security_enabled,
                  kinit_path_local=params.kinit_path_local,
                  keytab=params.hdfs_user_keytab,
                  principal=params.hdfs_principal_name,
                  bin_dir=params.execute_path)
  else:
    ExecuteHadoop(test_cmd,
                  user=params.hdfs_user,
                  logoutput=True,
                  conf_dir=params.hadoop_conf_dir,
                  security_enabled=params.security_enabled,
                  kinit_path_local=params.kinit_path_local,
                  keytab=params.hdfs_user_keytab,
                  bin_dir=params.execute_path
    )

  cleanup_cmd = format("{kinit_cmd} {tmp_dir}/hcatSmoke.sh hcatsmoke{unique} cleanup {purge_tables}")

  Execute(cleanup_cmd,
          tries=3,
          user=params.smokeuser,
          try_sleep=5,
          path=['/usr/sbin', '/usr/local/bin', '/bin', '/usr/bin', exec_path],
          logoutput=True)

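# The format() helper used throughout these checks fills {placeholders} such as
# {tmp_dir} and {unique} from variables visible to the caller. The stand-in below
# is only a rough sketch built on the standard library (resolving names from the
# calling scope and from params is an assumption, not the verified behavior of
# the real helper); it is included to make the command strings above easier to read.
def format_sketch(template, **extra):
  import inspect
  values = dict(inspect.currentframe().f_back.f_locals)  # caller's local names
  values.update(extra)
  return template.format(**values)

# Example (hypothetical values):
#   format_sketch("{tmp_dir}/hcatsmoke{unique}", tmp_dir="/var/lib/ambari-agent/tmp", unique="id42_date010170")
#   -> "/var/lib/ambari-agent/tmp/hcatsmokeid42_date010170"
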
def hcat_service_check():
  import params

  unique = get_unique_id_and_date()
  output_file = format("{hive_apps_whs_dir}/hcatsmoke{unique}")
  test_cmd = format("fs -test -e {output_file}")

  if params.security_enabled:
    kinit_cmd = format("{kinit_path_local} -kt {smoke_user_keytab} {smokeuser}; ")
  else:
    kinit_cmd = ""

  File(format("{tmp_dir}/hcatSmoke.sh"),
       content=StaticFile("hcatSmoke.sh"),
       mode=0755
  )

  prepare_cmd = format("{kinit_cmd}env JAVA_HOME={java64_home} {tmp_dir}/hcatSmoke.sh hcatsmoke{unique} prepare")

  exec_path = params.execute_path
  if params.version and params.stack_name:
    upgrade_hive_bin = format("/usr/hdp/{version}/hive/bin")
    exec_path = os.environ['PATH'] + os.pathsep + params.hadoop_bin_dir + os.pathsep + upgrade_hive_bin

  Execute(prepare_cmd,
          tries=3,
          user=params.smokeuser,
          try_sleep=5,
          path=['/usr/sbin', '/usr/local/bin', '/bin', '/usr/bin', exec_path],
          logoutput=True)

  if params.security_enabled:
    ExecuteHadoop(test_cmd,
                  user=params.hdfs_user,
                  logoutput=True,
                  conf_dir=params.hadoop_conf_dir,
                  security_enabled=params.security_enabled,
                  kinit_path_local=params.kinit_path_local,
                  keytab=params.hdfs_user_keytab,
                  principal=params.hdfs_principal_name,
                  bin_dir=params.execute_path)
  else:
    ExecuteHadoop(test_cmd,
                  user=params.hdfs_user,
                  logoutput=True,
                  conf_dir=params.hadoop_conf_dir,
                  security_enabled=params.security_enabled,
                  kinit_path_local=params.kinit_path_local,
                  keytab=params.hdfs_user_keytab,
                  bin_dir=params.execute_path
    )

  cleanup_cmd = format("{kinit_cmd} {tmp_dir}/hcatSmoke.sh hcatsmoke{unique} cleanup")

  Execute(cleanup_cmd,
          tries=3,
          user=params.smokeuser,
          try_sleep=5,
          path=['/usr/sbin', '/usr/local/bin', '/bin', '/usr/bin', exec_path],
          logoutput=True)

def post_upgrade_restart(self, env, upgrade_type=None):
  if upgrade_type == "nonrolling":
    return

  Logger.info("Executing Stack Upgrade post-restart")
  import params
  env.set_params(params)

  zk_server_host = random.choice(params.zookeeper_hosts)
  cli_shell = format("{zk_cli_shell} -server {zk_server_host}:{client_port}")
  # Ensure that a quorum is still formed.
  unique = get_unique_id_and_date()
  create_command = format("echo 'create /{unique} mydata' | {cli_shell}")
  list_command = format("echo 'ls /' | {cli_shell}")
  delete_command = format("echo 'delete /{unique} ' | {cli_shell}")

  quorum_err_message = "Failed to establish zookeeper quorum"
  call_and_match_output(create_command, 'Created', quorum_err_message)
  call_and_match_output(list_command, r"\[.*?" + unique + ".*?\]", quorum_err_message)
  call(delete_command)

  if params.client_port:
    check_leader_command = format("echo stat | nc localhost {client_port} | grep Mode")
    code, out = call(check_leader_command, logoutput=False)
    if code == 0 and out:
      Logger.info(out)

def prepare_upgrade_backup_namenode_dir():
  """
  During a NonRolling (aka Express Upgrade), preparing the NameNode requires
  backing up the NameNode Name Dirs.
  """
  import params

  i = 0
  failed_paths = []
  nn_name_dirs = params.dfs_name_dir.split(',')
  backup_destination_root_dir = "/tmp/upgrades/{0}".format(params.stack_version_unformatted)
  if len(nn_name_dirs) > 0:
    Logger.info("Backup the NameNode name directory's CURRENT folder.")
  for nn_dir in nn_name_dirs:
    i += 1
    namenode_current_image = os.path.join(nn_dir, "current")
    unique = get_unique_id_and_date() + "_" + str(i)
    # Note that /tmp may not be writeable.
    backup_current_folder = "{0}/namenode_{1}/".format(backup_destination_root_dir, unique)

    if os.path.isdir(namenode_current_image) and not os.path.isdir(backup_current_folder):
      try:
        os.makedirs(backup_current_folder)
        Execute(('cp', '-ar', namenode_current_image, backup_current_folder),
                sudo=True)
      except Exception, e:
        failed_paths.append(namenode_current_image)

def post_upgrade_restart(self, env, upgrade_type=None):
  if upgrade_type == "nonrolling":
    return

  Logger.info("Executing Stack Upgrade post-restart")
  import params
  env.set_params(params)

  zk_server_host = random.choice(params.zookeeper_hosts)
  cli_shell = format("{zk_cli_shell} -server {zk_server_host}:{client_port}")
  # Ensure that a quorum is still formed.
  unique = get_unique_id_and_date()
  create_command = format("echo 'create /{unique} mydata' | {cli_shell}")
  list_command = format("echo 'ls /' | {cli_shell}")
  delete_command = format("echo 'delete /{unique} ' | {cli_shell}")

  quorum_err_message = "Failed to establish zookeeper quorum"
  call_and_match_output(create_command, 'Created', quorum_err_message, user=params.zk_user)
  call_and_match_output(list_command, r"\[.*?" + unique + ".*?\]", quorum_err_message, user=params.zk_user)
  shell.call(delete_command, user=params.zk_user)

  if params.client_port:
    check_leader_command = format("echo stat | nc localhost {client_port} | grep Mode")
    code, out = shell.call(check_leader_command, logoutput=False)
    if code == 0 and out:
      Logger.info(out)

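# call_and_match_output() above runs the ZooKeeper CLI pipeline and fails the
# step when the expected text is missing from the output. The helper below is a
# sketch of that behavior, not the Ambari implementation; the su-based user
# switch and the RuntimeError are assumptions made for illustration.
import pipes
import re
import subprocess

def call_and_match_output_sketch(command, regex, err_message, user=None):
  if user:
    command = "su - {0} -c {1}".format(user, pipes.quote(command))
  proc = subprocess.Popen(command, shell=True, stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
  out = proc.communicate()[0].decode("utf-8", "replace")
  if proc.returncode != 0 or not re.search(regex, out):
    raise RuntimeError("{0}. Output: {1}".format(err_message, out))
  return out
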
def service_check(self, env):
  import params
  env.set_params(params)

  unique = functions.get_unique_id_and_date()

  # Hadoop uses POSIX-style paths, separator is always /
  dir = params.hdfs_tmp_dir
  tmp_file = dir + '/' + unique

  # commands for execution
  hadoop_cmd = "cmd /C %s" % (os.path.join(params.hadoop_home, "bin", "hadoop.cmd"))
  create_dir_cmd = "%s fs -mkdir %s" % (hadoop_cmd, dir)
  own_dir = "%s fs -chmod 777 %s" % (hadoop_cmd, dir)
  test_dir_exists = "%s fs -test -e %s" % (hadoop_cmd, dir)
  cleanup_cmd = "%s fs -rm %s" % (hadoop_cmd, tmp_file)
  create_file_cmd = "%s fs -put %s %s" % (hadoop_cmd, os.path.join(params.hadoop_conf_dir, "core-site.xml"), tmp_file)
  test_cmd = "%s fs -test -e %s" % (hadoop_cmd, tmp_file)

  hdfs_cmd = "cmd /C %s" % (os.path.join(params.hadoop_home, "bin", "hdfs.cmd"))
  safemode_command = "%s dfsadmin -safemode get | %s OFF" % (hdfs_cmd, params.grep_exe)

  Execute(safemode_command, logoutput=True, try_sleep=3, tries=20)
  Execute(create_dir_cmd, user=params.hdfs_user, logoutput=True, ignore_failures=True)
  Execute(own_dir, user=params.hdfs_user, logoutput=True)
  Execute(test_dir_exists, user=params.hdfs_user, logoutput=True)
  Execute(create_file_cmd, user=params.hdfs_user, logoutput=True)
  Execute(test_cmd, user=params.hdfs_user, logoutput=True)
  Execute(cleanup_cmd, user=params.hdfs_user, logoutput=True)

def check_llap(self, env, kinit_cmd):
  import params
  env.set_params(params)

  File(format("{tmp_dir}/hiveLlapSmoke.sh"),
       content=StaticFile("hiveLlapSmoke.sh"),
       mode=0755)

  unique_id = get_unique_id_and_date()
  llap_cmd = format("{kinit_cmd}env JAVA_HOME={java64_home} {tmp_dir}/hiveLlapSmoke.sh {stack_root} llap_smoke_{unique_id} prepare")

  exec_path = params.execute_path
  if params.version and params.stack_root:
    upgrade_hive_bin = format("{stack_root}/{version}/hive2/bin")
    exec_path = os.environ['PATH'] + os.pathsep + params.hadoop_bin_dir + os.pathsep + upgrade_hive_bin

  Execute(llap_cmd,
          user=params.hive_user,
          path=['/usr/sbin', '/usr/local/bin', '/bin', '/usr/bin', exec_path],
          tries=1,
          try_sleep=5,
          wait_for_finish=True,
          stderr=subprocess.PIPE,
          logoutput=True)

def service_check(self, env):
  import params
  env.set_params(params)

  unique = functions.get_unique_id_and_date()

  # Hadoop uses POSIX-style paths, separator is always /
  dir = '/tmp'
  tmp_file = dir + '/' + unique

  # commands for execution
  hadoop_cmd = "cmd /C %s" % (os.path.join(params.hadoop_home, "bin", "hadoop.cmd"))
  create_dir_cmd = "%s fs -mkdir %s" % (hadoop_cmd, dir)
  own_dir = "%s fs -chmod 777 %s" % (hadoop_cmd, dir)
  test_dir_exists = "%s fs -test -e %s" % (hadoop_cmd, dir)
  cleanup_cmd = "%s fs -rm %s" % (hadoop_cmd, tmp_file)
  create_file_cmd = "%s fs -put %s %s" % (hadoop_cmd, os.path.join(params.hadoop_conf_dir, "core-site.xml"), tmp_file)
  test_cmd = "%s fs -test -e %s" % (hadoop_cmd, tmp_file)

  hdfs_cmd = "cmd /C %s" % (os.path.join(params.hadoop_home, "bin", "hdfs.cmd"))
  safemode_command = "%s dfsadmin -safemode get | %s OFF" % (hdfs_cmd, params.grep_exe)

  Execute(safemode_command, logoutput=True, try_sleep=3, tries=20)
  Execute(create_dir_cmd, user=params.hdfs_user, logoutput=True, ignore_failures=True)
  Execute(own_dir, user=params.hdfs_user, logoutput=True)
  Execute(test_dir_exists, user=params.hdfs_user, logoutput=True)
  Execute(create_file_cmd, user=params.hdfs_user, logoutput=True)
  Execute(test_cmd, user=params.hdfs_user, logoutput=True)
  Execute(cleanup_cmd, user=params.hdfs_user, logoutput=True)

def hcat_service_check():
  import params

  unique = get_unique_id_and_date()
  output_file = format("/apps/hive/warehouse/hcatsmoke{unique}")
  test_cmd = format("fs -test -e {output_file}")

  if params.security_enabled:
    kinit_cmd = format("{kinit_path_local} -kt {smoke_user_keytab} {smokeuser}; ")
  else:
    kinit_cmd = ""

  File(format("{tmp_dir}/hcatSmoke.sh"),
       content=StaticFile("hcatSmoke.sh"),
       mode=0755
  )

  prepare_cmd = format("{kinit_cmd}env JAVA_HOME={java64_home} {tmp_dir}/hcatSmoke.sh hcatsmoke{unique} prepare")

  Execute(prepare_cmd,
          tries=3,
          user=params.smokeuser,
          try_sleep=5,
          path=['/usr/sbin', '/usr/local/bin', '/bin', '/usr/bin'],
          environment={'PATH': params.execute_path},
          logoutput=True)

  if params.security_enabled:
    ExecuteHadoop(test_cmd,
                  user=params.hdfs_user,
                  logoutput=True,
                  conf_dir=params.hadoop_conf_dir,
                  security_enabled=params.security_enabled,
                  kinit_path_local=params.kinit_path_local,
                  keytab=params.hdfs_user_keytab,
                  principal=params.hdfs_principal_name,
                  bin_dir=params.hive_bin
    )
  else:
    ExecuteHadoop(test_cmd,
                  user=params.hdfs_user,
                  logoutput=True,
                  conf_dir=params.hadoop_conf_dir,
                  security_enabled=params.security_enabled,
                  kinit_path_local=params.kinit_path_local,
                  keytab=params.hdfs_user_keytab,
                  bin_dir=params.hive_bin
    )

  cleanup_cmd = format("{kinit_cmd} {tmp_dir}/hcatSmoke.sh hcatsmoke{unique} cleanup")

  Execute(cleanup_cmd,
          tries=3,
          user=params.smokeuser,
          environment={'PATH': params.execute_path},
          try_sleep=5,
          path=['/usr/sbin', '/usr/local/bin', '/bin', '/usr/bin'],
          logoutput=True
  )

def check_llap(self, env, kinit_cmd, address, port, key, hive_auth="NOSASL", transport_mode="binary", http_endpoint="cliservice"):
  import params
  env.set_params(params)

  unique_id = get_unique_id_and_date()

  beeline_url = ['jdbc:hive2://{address}:{port}/', "transportMode={transport_mode}"]

  # Currently, HSI is supported on a single node only. The address list should be of size 1,
  # thus picking the 1st node value.
  address = address[0]

  # append url according to used transport
  if transport_mode == "http":
    beeline_url.append('httpPath={http_endpoint}')

  # append url according to used auth
  if hive_auth == "NOSASL":
    beeline_url.append('auth=noSasl')

  # append url according to principal
  if kinit_cmd:
    beeline_url.append('principal={key}')

  exec_path = params.execute_path
  if params.version and params.stack_root:
    upgrade_hive_bin = format("{stack_root}/{version}/hive2/bin")
    exec_path = os.environ['PATH'] + os.pathsep + params.hadoop_bin_dir + os.pathsep + upgrade_hive_bin

  # beeline path
  llap_cmd = "! beeline -u '%s'" % format(";".join(beeline_url))
  # Append LLAP SQL script path
  llap_cmd += format(" --hiveconf \"hiveLlapServiceCheck={unique_id}\" -f {stack_root}/current/hive-server2-hive2/scripts/llap/sql/serviceCheckScript.sql")
  # Append grep patterns for detecting failure
  llap_cmd += " -e '' 2>&1| awk '{print}'|grep -i -e 'Invalid status\|Invalid URL\|command not found\|Connection refused'"

  Execute(llap_cmd,
          user=params.hive_user,
          path=['/usr/sbin', '/usr/local/bin', '/bin', '/usr/bin', exec_path],
          tries=1,
          wait_for_finish=True,
          stderr=subprocess.PIPE,
          logoutput=True)

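# For illustration, the beeline_url pieces above join with ";" into a JDBC URL.
# With the defaults (binary transport, NOSASL auth, no kinit) and a made-up
# host/port, the result would look like this:
example_parts = ["jdbc:hive2://llap-host.example.com:10500/", "transportMode=binary", "auth=noSasl"]
print(";".join(example_parts))
# -> jdbc:hive2://llap-host.example.com:10500/;transportMode=binary;auth=noSasl
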
def hcat_service_check():
  import params

  unique = get_unique_id_and_date()
  output_file = format("/apps/hive/warehouse/hcatsmoke{unique}")
  test_cmd = format("fs -test -e {output_file}")

  if params.security_enabled:
    kinit_cmd = format("{kinit_path_local} -kt {smoke_user_keytab} {smokeuser}; ")
  else:
    kinit_cmd = ""

  File(format("{tmp_dir}/hcatSmoke.sh"),
       content=StaticFile("hcatSmoke.sh"),
       mode=0755
  )

  prepare_cmd = format("{kinit_cmd}env JAVA_HOME={java64_home} {tmp_dir}/hcatSmoke.sh hcatsmoke{unique} prepare")

  Execute(prepare_cmd,
          tries=3,
          user=params.smokeuser,
          try_sleep=5,
          path=['/usr/sbin', '/usr/local/bin', '/bin', '/usr/bin', params.execute_path],
          logoutput=True)

  if params.security_enabled:
    ExecuteHadoop(test_cmd,
                  user=params.hdfs_user,
                  logoutput=True,
                  conf_dir=params.hadoop_conf_dir,
                  security_enabled=params.security_enabled,
                  kinit_path_local=params.kinit_path_local,
                  keytab=params.hdfs_user_keytab,
                  principal=params.hdfs_principal_name,
                  bin_dir=params.execute_path
    )
  else:
    ExecuteHadoop(test_cmd,
                  user=params.hdfs_user,
                  logoutput=True,
                  conf_dir=params.hadoop_conf_dir,
                  security_enabled=params.security_enabled,
                  kinit_path_local=params.kinit_path_local,
                  keytab=params.hdfs_user_keytab,
                  bin_dir=params.execute_path
    )

  cleanup_cmd = format("{kinit_cmd} {tmp_dir}/hcatSmoke.sh hcatsmoke{unique} cleanup")

  Execute(cleanup_cmd,
          tries=3,
          user=params.smokeuser,
          try_sleep=5,
          path=['/usr/sbin', '/usr/local/bin', '/bin', '/usr/bin', params.execute_path],
          logoutput=True
  )

def service_check(self, env):
  import params
  env.set_params(params)

  if params.security_enabled:
    kinit_cmd = format("{kinit_path_local} -kt {storm_keytab_path} {storm_jaas_principal}; ")
  else:
    kinit_cmd = ""

  unique = get_unique_id_and_date()

  File("/tmp/wordCount.jar",
       content=StaticFile("wordCount.jar"),
       owner=params.storm_user)

  cmd = ""
  if params.nimbus_seeds_supported:
    # Because this command is guaranteed to run on one of the hosts with storm client, there is no need
    # to specify "-c nimbus.seeds={nimbus_seeds}"
    cmd = format("storm jar /tmp/wordCount.jar storm.starter.WordCountTopology WordCount{unique}")
  elif params.nimbus_host is not None:
    cmd = format("storm jar /tmp/wordCount.jar storm.starter.WordCountTopology WordCount{unique} -c nimbus.host={nimbus_host}")

  # use client jaas for service check
  if params.security_enabled:
    storm_client_jaas_file = format("{conf_dir}/client_jaas.conf")
    cmd = format("{kinit_cmd}{cmd} -c java.security.auth.login.config={storm_client_jaas_file}")

  try_count = 1
  if params.nimbus_hosts and len(params.nimbus_hosts) > 1:
    try_count = 3
    print("Nimbus HA is enabled. The check may be retried up to %d times in order to wait for the Nimbus leader selection" % try_count)

  Execute(cmd,
          logoutput=True,
          path=params.storm_bin_dir,
          user=params.storm_user,
          try_sleep=30,
          tries=try_count)

  Execute(format("storm kill WordCount{unique}"),
          path=params.storm_bin_dir,
          user=params.storm_user)

def service_check(self, env):
  import params
  env.set_params(params)

  unique = get_unique_id_and_date()

  File("/tmp/wordCount.jar",
       content=StaticFile("wordCount.jar"))

  cmd = format("storm jar /tmp/wordCount.jar storm.starter.WordCountTopology WordCount{unique} -c nimbus.host={nimbus_host}")

  Execute(cmd,
          logoutput=True)

  Execute(format("storm kill WordCount{unique}"))

def service_check(self, env):
  import params
  env.set_params(params)

  unique = get_unique_id_and_date()

  File("/tmp/wordCount.jar",
       content=StaticFile("wordCount.jar"))

  cmd = format("env JAVA_HOME={java64_home} storm jar /tmp/wordCount.jar storm.starter.WordCountTopology WordCount{unique} -c nimbus.host={nimbus_host}")

  Execute(cmd,
          logoutput=True,
          path=params.storm_bin_dir)

  Execute(format("env JAVA_HOME={java64_home} storm kill WordCount{unique}"),
          path=params.storm_bin_dir)

def hcat_service_check():
  import params

  unique = get_unique_id_and_date()
  output_file = format("/apps/hive/warehouse/hcatsmoke{unique}")
  test_cmd = format("fs -test -e {output_file}")

  if params.security_enabled:
    kinit_cmd = format("{kinit_path_local} -kt {smoke_user_keytab} {smokeuser}; ")
  else:
    kinit_cmd = ""

  File('/tmp/hcatSmoke.sh',
       content=StaticFile("hcatSmoke.sh"),
       mode=0755)

  prepare_cmd = format("{kinit_cmd}sh /tmp/hcatSmoke.sh hcatsmoke{unique} prepare")

  Execute(prepare_cmd,
          tries=3,
          user=params.smokeuser,
          try_sleep=5,
          path=['/usr/sbin', '/usr/local/bin', '/bin', '/usr/bin'],
          logoutput=True)

  ExecuteHadoop(test_cmd,
                user=params.hdfs_user,
                logoutput=True,
                conf_dir=params.hadoop_conf_dir,
                security_enabled=params.security_enabled,
                kinit_path_local=params.kinit_path_local,
                keytab=params.hdfs_user_keytab)

  cleanup_cmd = format("{kinit_cmd}sh /tmp/hcatSmoke.sh hcatsmoke{unique} cleanup")

  Execute(cleanup_cmd,
          tries=3,
          user=params.smokeuser,
          try_sleep=5,
          path=['/usr/sbin', '/usr/local/bin', '/bin', '/usr/bin'],
          logoutput=True)

def pre_stop_backup_cores(self, env):
  """
  Backs up the Solr cores under Solr's home directory.
  cp -r /var/lib/solr/data/* /tmp/solr/cores
  """
  import params
  env.set_params(params)

  if compare_versions(format_stack_version(params.version), '4.2.0.0') >= 0:
    solr_home_dir = params.solr_data_dir
  else:
    # 4.1.0.0
    solr_home_dir = params.old_lib_dir + "/data"

  unique = get_unique_id_and_date()
  backup_solr_dir = "/tmp/upgrades/{0}/solr_{1}".format(params.version, unique)
  backup_solr_cores = "/tmp/solr/cores"

  if os.path.isdir(solr_home_dir) and not os.path.isdir(backup_solr_dir):
    os.makedirs(backup_solr_dir)
    Execute(('cp', '-r', solr_home_dir + "/.", backup_solr_dir),
            sudo=True
    )

  if params.upgrade_direction is not None and params.upgrade_direction == Direction.UPGRADE:
    Directory(backup_solr_cores,
              action="delete",
              create_parents=True)

    Directory(backup_solr_cores,
              mode=0755,
              cd_access='a',
              owner=params.solr_user,
              create_parents=True,
              group=params.user_group
    )

    Execute(('cp', '-r', solr_home_dir + "/.", backup_solr_cores),
            user=params.solr_user
    )

def service_check(self, env):
  import params
  env.set_params(params)

  unique = get_unique_id_and_date()

  File("/tmp/wordCount.jar",
       content=StaticFile("wordCount.jar"),
       owner=params.storm_user)

  cmd = ""
  if params.nimbus_seeds_supported:
    # Because this command is guaranteed to run on one of the hosts with storm client, there is no need
    # to specify "-c nimbus.seeds={nimbus_seeds}"
    cmd = format("storm jar /tmp/wordCount.jar storm.starter.WordCountTopology WordCount{unique}")
  elif params.nimbus_host is not None:
    cmd = format("storm jar /tmp/wordCount.jar storm.starter.WordCountTopology WordCount{unique} -c nimbus.host={nimbus_host}")

  Execute(cmd,
          logoutput=True,
          path=params.storm_bin_dir,
          user=params.storm_user)

  Execute(format("storm kill WordCount{unique}"),
          path=params.storm_bin_dir,
          user=params.storm_user)

metrics_report_interval = default("/configurations/ams-site/timeline.metrics.sink.report.interval", 60)
metrics_collection_period = default("/configurations/ams-site/timeline.metrics.sink.collection.period", 10)
host_in_memory_aggregation = default("/configurations/ams-site/timeline.metrics.host.inmemory.aggregation", True)
host_in_memory_aggregation_port = default("/configurations/ams-site/timeline.metrics.host.inmemory.aggregation.port", 61888)

# if hbase is selected the hbase_rs_hosts, should not be empty, but still default just in case
if 'slave_hosts' in config['clusterHostInfo']:
  rs_hosts = default('/clusterHostInfo/hbase_rs_hosts', '/clusterHostInfo/slave_hosts') #if hbase_rs_hosts not given it is assumed that region servers on same nodes as slaves
else:
  rs_hosts = default('/clusterHostInfo/hbase_rs_hosts', '/clusterHostInfo/all_hosts')

smoke_test_user = config['configurations']['cluster-env']['smokeuser']
smokeuser_principal = config['configurations']['cluster-env']['smokeuser_principal_name']
smokeuser_permissions = "RWXCA"
service_check_data = get_unique_id_and_date()
user_group = config['configurations']['cluster-env']["user_group"]

if security_enabled:
  _hostname_lowercase = config['hostname'].lower()
  master_jaas_princ = config['configurations']['hbase-site']['hbase.master.kerberos.principal'].replace('_HOST', _hostname_lowercase)
  master_keytab_path = config['configurations']['hbase-site']['hbase.master.keytab.file']
  regionserver_jaas_princ = config['configurations']['hbase-site']['hbase.regionserver.kerberos.principal'].replace('_HOST', _hostname_lowercase)
  _queryserver_jaas_princ = config['configurations']['hbase-site']['phoenix.queryserver.kerberos.principal']
  if not is_empty(_queryserver_jaas_princ):
    queryserver_jaas_princ = _queryserver_jaas_princ.replace('_HOST', _hostname_lowercase)
  regionserver_keytab_path = config['configurations']['hbase-site']['hbase.regionserver.keytab.file']
  queryserver_keytab_path = config['configurations']['hbase-site']['phoenix.queryserver.keytab.file']
  smoke_user_keytab = config['configurations']['cluster-env']['smokeuser_keytab']
  hbase_user_keytab = config['configurations']['hbase-env']['hbase_user_keytab']

phoenix_max_global_mem_percent = default('/configurations/ams-site/phoenix.query.maxGlobalMemoryPercentage', '20')
phoenix_client_spool_dir = default('/configurations/ams-site/phoenix.spool.directory', '/tmp')
phoenix_server_spool_dir = default('/configurations/ams-hbase-site/phoenix.spool.directory', '/tmp')
# Substitute vars if present
phoenix_client_spool_dir = substitute_vars(phoenix_client_spool_dir, config['configurations']['ams-hbase-site'])
phoenix_server_spool_dir = substitute_vars(phoenix_server_spool_dir, config['configurations']['ams-hbase-site'])

client_jaas_config_file = format("{hbase_conf_dir}/hbase_client_jaas.conf")
master_jaas_config_file = format("{hbase_conf_dir}/hbase_master_jaas.conf")
regionserver_jaas_config_file = format("{hbase_conf_dir}/hbase_regionserver_jaas.conf")

rs_hosts = ["localhost"]

smoke_test_user = config['configurations']['cluster-env']['smokeuser']
smokeuser_permissions = "RWXCA"
service_check_data = functions.get_unique_id_and_date()
user_group = config['configurations']['cluster-env']["user_group"]
hadoop_user = "******"

kinit_cmd = ""

if security_enabled:
  _hostname_lowercase = config['hostname'].lower()
  client_jaas_config_file = format("{hbase_conf_dir}/hbase_client_jaas.conf")
  smoke_user_keytab = config['configurations']['cluster-env']['smokeuser_keytab']
  hbase_user_keytab = config['configurations']['ams-hbase-env']['hbase_user_keytab']

  ams_collector_jaas_config_file = format("{hbase_conf_dir}/ams_collector_jaas.conf")
  ams_collector_keytab_path = config['configurations']['ams-hbase-security-site']['hbase.myclient.keytab']
  ams_collector_jaas_princ = config['configurations']['ams-hbase-security-site']['hbase.myclient.principal'].replace('_HOST', _hostname_lowercase)

else:
  metric_collector_port = '6188'
  pass

metrics_report_interval = default("/configurations/ams-site/timeline.metrics.sink.report.interval", 60)
metrics_collection_period = default("/configurations/ams-site/timeline.metrics.sink.collection.period", 60)

# if hbase is selected the hbase_rs_hosts, should not be empty, but still default just in case
if 'slave_hosts' in config['clusterHostInfo']:
  rs_hosts = default('/clusterHostInfo/hbase_rs_hosts', '/clusterHostInfo/slave_hosts') #if hbase_rs_hosts not given it is assumed that region servers on same nodes as slaves
else:
  rs_hosts = default('/clusterHostInfo/hbase_rs_hosts', '/clusterHostInfo/all_hosts')

smoke_test_user = config['configurations']['cluster-env']['smokeuser']
smokeuser_principal = config['configurations']['cluster-env']['smokeuser_principal_name']
smokeuser_permissions = "RWXCA"
service_check_data = get_unique_id_and_date()
user_group = config['configurations']['cluster-env']["user_group"]

if security_enabled:
  _hostname_lowercase = config['hostname'].lower()
  master_jaas_princ = config['configurations']['hbase-site']['hbase.master.kerberos.principal'].replace('_HOST', _hostname_lowercase)
  regionserver_jaas_princ = config['configurations']['hbase-site']['hbase.regionserver.kerberos.principal'].replace('_HOST', _hostname_lowercase)
  _queryserver_jaas_princ = config['configurations']['hbase-site']['phoenix.queryserver.kerberos.principal']
  if not is_empty(_queryserver_jaas_princ):
    queryserver_jaas_princ = _queryserver_jaas_princ.replace('_HOST', _hostname_lowercase)
  master_keytab_path = config['configurations']['hbase-site']['hbase.master.keytab.file']
  regionserver_keytab_path = config['configurations']['hbase-site']['hbase.regionserver.keytab.file']
  queryserver_keytab_path = config['configurations']['hbase-site']['phoenix.queryserver.keytab.file']
  smoke_user_keytab = config['configurations']['cluster-env']['smokeuser_keytab']
  hbase_user_keytab = config['configurations']['hbase-env']['hbase_user_keytab']

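# The default() helper used in these params modules reads a value from the nested
# command configuration and returns the given fallback when the key chain is
# absent. The dict-based stand-in below is only a sketch under that assumption,
# not the real helper; note that if the fallback is returned verbatim, the
# rs_hosts lines above would yield the literal path string when hbase_rs_hosts
# is missing from the command JSON.
def default_sketch(config, path, fallback):
  node = config
  for key in path.strip('/').split('/'):
    if isinstance(node, dict) and key in node:
      node = node[key]
    else:
      return fallback
  return node

# Example:
#   default_sketch({"configurations": {"ams-site": {}}},
#                  "/configurations/ams-site/timeline.metrics.sink.collection.period", 60) -> 60
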
def service_check(self, env):
  import params
  env.set_params(params)

  unique = functions.get_unique_id_and_date()
  dir = params.hdfs_tmp_dir
  tmp_file = format("{dir}/{unique}")

  safemode_command = format("dfsadmin -fs {namenode_address} -safemode get | grep OFF")

  if params.security_enabled:
    Execute(format("{kinit_path_local} -kt {hdfs_user_keytab} {hdfs_principal_name}"),
            user=params.hdfs_user
    )
  ExecuteHadoop(safemode_command,
                user=params.hdfs_user,
                logoutput=True,
                conf_dir=params.hadoop_conf_dir,
                try_sleep=3,
                tries=20,
                bin_dir=params.hadoop_bin_dir
  )
  params.HdfsResource(dir,
                      type="directory",
                      action="create_on_execute",
                      mode=0777
  )
  params.HdfsResource(tmp_file,
                      type="file",
                      action="delete_on_execute",
  )
  params.HdfsResource(tmp_file,
                      type="file",
                      source="/etc/passwd",
                      action="create_on_execute"
  )
  params.HdfsResource(None, action="execute")

  if params.has_journalnode_hosts:
    if params.security_enabled:
      for host in params.journalnode_hosts:
        if params.https_only:
          uri = format("https://{host}:{journalnode_port}")
        else:
          uri = format("http://{host}:{journalnode_port}")
        response, errmsg, time_millis = curl_krb_request(params.tmp_dir, params.smoke_user_keytab,
                                                         params.smokeuser_principal, uri, "jn_service_check",
                                                         params.kinit_path_local, False, None, params.smoke_user)
        if not response:
          Logger.error("Cannot access WEB UI on: {0}. Error : {1}", uri, errmsg)
          return 1
    else:
      journalnode_port = params.journalnode_port
      checkWebUIFileName = "checkWebUI.py"
      checkWebUIFilePath = format("{tmp_dir}/{checkWebUIFileName}")
      comma_sep_jn_hosts = ",".join(params.journalnode_hosts)

      checkWebUICmd = format("ambari-python-wrap {checkWebUIFilePath} -m {comma_sep_jn_hosts} -p {journalnode_port} -s {https_only}")
      File(checkWebUIFilePath,
           content=StaticFile(checkWebUIFileName),
           mode=0775)

      Execute(checkWebUICmd,
              logoutput=True,
              try_sleep=3,
              tries=5,
              user=params.smoke_user)

  if params.is_namenode_master:
    if params.has_zkfc_hosts:
      pid_dir = format("{hadoop_pid_dir_prefix}/{hdfs_user}")
      pid_file = format("{pid_dir}/hadoop-{hdfs_user}-zkfc.pid")
      check_zkfc_process_cmd = as_user(format(
        "ls {pid_file} >/dev/null 2>&1 && ps -p `cat {pid_file}` >/dev/null 2>&1"), user=params.hdfs_user)
      Execute(check_zkfc_process_cmd,
              logoutput=True,
              try_sleep=3,
              tries=5
      )

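# Note on the HdfsResource pattern above: calls with action="create_on_execute" or
# action="delete_on_execute" only queue filesystem work, and the trailing
# HdfsResource(None, action="execute") flushes the whole batch against HDFS in one
# pass. The lines below just restate that usage shape with hypothetical paths;
# they assume a params.HdfsResource defined as in the service check.
#
#   params.HdfsResource("/tmp/smoke_dir", type="directory", action="create_on_execute", mode=0777)
#   params.HdfsResource("/tmp/smoke_dir/probe", type="file", source="/etc/passwd", action="create_on_execute")
#   params.HdfsResource(None, action="execute")  # nothing touches HDFS until this call
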
def service_check(self, env):
  import params
  env.set_params(params)

  unique = get_unique_id_and_date()
  storm_user = params.storm_user

def service_check(self, env):
  import params
  env.set_params(params)

  unique = functions.get_unique_id_and_date()
  dir = params.hdfs_tmp_dir
  tmp_file = format("{dir}/{unique}")

  """
  Ignore checking safemode, because this command is unable to get safemode state
  when 1 namenode is down in an HA setup (see more in HDFS-8277). Directly test
  HDFS availability by file system operations is consistent in both HA and non-HA environment.
  """
  # safemode_command = format("dfsadmin -fs {namenode_address} -safemode get | grep OFF")

  if params.security_enabled:
    Execute(format("{kinit_path_local} -kt {hdfs_user_keytab} {hdfs_principal_name}"),
            user=params.hdfs_user
    )
  #ExecuteHadoop(safemode_command,
  #              user=params.hdfs_user,
  #              logoutput=True,
  #              conf_dir=params.hadoop_conf_dir,
  #              try_sleep=3,
  #              tries=20,
  #              bin_dir=params.hadoop_bin_dir
  #)
  params.HdfsResource(dir,
                      type="directory",
                      action="create_on_execute",
                      mode=0777
  )
  params.HdfsResource(tmp_file,
                      type="file",
                      action="delete_on_execute",
  )

  test_file = params.hdfs_service_check_test_file
  if not os.path.isfile(test_file):
    try:
      Execute(format("dd if=/dev/urandom of={test_file} count=1 bs=1024"))
    except:
      try:
        Execute(format("rm {test_file}")) #clean up
      except:
        pass
      test_file = "/etc/passwd"

  params.HdfsResource(tmp_file,
                      type="file",
                      source=test_file,
                      action="create_on_execute"
  )
  params.HdfsResource(None, action="execute")

  if params.has_journalnode_hosts:
    if params.security_enabled:
      for host in params.journalnode_hosts:
        if params.https_only:
          uri = format("https://{host}:{journalnode_port}")
        else:
          uri = format("http://{host}:{journalnode_port}")
        response, errmsg, time_millis = curl_krb_request(params.tmp_dir, params.smoke_user_keytab,
                                                         params.smokeuser_principal, uri, "jn_service_check",
                                                         params.kinit_path_local, False, None, params.smoke_user)
        if not response:
          Logger.error("Cannot access WEB UI on: {0}. Error : {1}", uri, errmsg)
          return 1
    else:
      journalnode_port = params.journalnode_port
      checkWebUIFileName = "checkWebUI.py"
      checkWebUIFilePath = format("{tmp_dir}/{checkWebUIFileName}")
      comma_sep_jn_hosts = ",".join(params.journalnode_hosts)

      checkWebUICmd = format("ambari-python-wrap {checkWebUIFilePath} -m {comma_sep_jn_hosts} -p {journalnode_port} -s {https_only}")
      File(checkWebUIFilePath,
           content=StaticFile(checkWebUIFileName),
           mode=0775)

      Execute(checkWebUICmd,
              logoutput=True,
              try_sleep=3,
              tries=5,
              user=params.smoke_user)

  if params.is_namenode_master:
    if params.has_zkfc_hosts:
      pid_dir = format("{hadoop_pid_dir_prefix}/{hdfs_user}")
      pid_file = format("{pid_dir}/hadoop-{hdfs_user}-zkfc.pid")
      check_zkfc_process_cmd = as_user(format(
        "ls {pid_file} >/dev/null 2>&1 && ps -p `cat {pid_file}` >/dev/null 2>&1"), user=params.hdfs_user)
      Execute(check_zkfc_process_cmd,
              logoutput=True,
              try_sleep=3,
              tries=5)