def _check_datanode_shutdown(hdfs_binary):
  """
  Checks that a DataNode is down by running "hdfs dfsadmin -getDatanodeInfo"
  several times, pausing in between runs. Once the DataNode stops responding
  this method will return, otherwise it will raise a Fail(...) and retry
  automatically.
  The stack defaults for retrying for HDFS are also way too slow for this
  command; they are set to wait about 45 seconds between client retries. As a
  result, a single execution of dfsadmin will take 45 seconds to retry and the
  DataNode may be marked as dead, causing problems with HBase.
  https://issues.apache.org/jira/browse/HDFS-8510 tracks reducing the times for
  ipc.client.connect.retry.interval. In the meantime, override them here, but
  only for RU.
  :param hdfs_binary: name/path of the HDFS binary to use
  :return:
  """
  import params

  # override stock retry timeouts since after 30 seconds, the datanode is
  # marked as dead and can affect HBase during RU
  dfsadmin_base_command = get_dfsadmin_base_command(hdfs_binary)
  command = format('{dfsadmin_base_command} -D ipc.client.connect.max.retries=5 -D ipc.client.connect.retry.interval=1000 -getDatanodeInfo {dfs_dn_ipc_address}')

  try:
    Execute(command, user=params.hdfs_user, tries=1)
  except:
    Logger.info("DataNode has successfully shutdown for upgrade.")
    return

  Logger.info("DataNode has not shutdown.")
  raise Fail('DataNode has not shutdown.')
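# Illustrative sketch, not part of the original scripts: the docstring above says a
# raised Fail(...) is "retried automatically". In Ambari-style service scripts that
# behaviour normally comes from a retry decorator applied to the check. The decorator
# below is a hypothetical stand-in written only to show the pattern; its name, retry
# count and sleep time are assumptions, not the project's actual helper.
def _illustrative_retry(times=24, sleep_time=5, err_class=Exception):
  import time
  def decorator(function):
    def wrapper(*args, **kwargs):
      for attempt in range(times):
        try:
          return function(*args, **kwargs)
        except err_class:
          # re-raise on the final attempt, otherwise pause and try again
          if attempt == times - 1:
            raise
          time.sleep(sleep_time)
    return wrapper
  return decorator
# Usage sketch: placing @_illustrative_retry(times=24, sleep_time=5, err_class=Fail)
# directly above _check_datanode_shutdown would re-run the check until the DataNode
# stops responding or the attempts are exhausted.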
def prepare_upgrade_enter_safe_mode(hdfs_binary):
  """
  During a NonRolling (aka Express Upgrade), preparing the NameNode requires first entering Safemode.
  :param hdfs_binary: name/path of the HDFS binary to use
  """
  import params

  dfsadmin_base_command = get_dfsadmin_base_command(hdfs_binary)
  safe_mode_enter_cmd = dfsadmin_base_command + " -safemode enter"
  try:
    # Safe to call if already in Safe Mode
    desired_state = SafeMode.ON
    safemode_transition_successful, original_state = reach_safemode_state(params.hdfs_user, desired_state, params.dfs_ha_enabled, hdfs_binary)
    Logger.info("Transition successful: {0}, original state: {1}".format(str(safemode_transition_successful), str(original_state)))
    if not safemode_transition_successful:
      raise Fail("Could not transition to safemode state %s. Please check logs to make sure namenode is up." % str(desired_state))
  except Exception, e:
    message = "Could not enter safemode. Error: {0}. As the HDFS user, call this command: {1}".format(str(e), safe_mode_enter_cmd)
    Logger.error(message)
    raise Fail(message)
def _check_datanode_startup(hdfs_binary):
  """
  Checks that a DataNode is reported as being alive via the
  "hdfs dfsadmin -fs {namenode_address} -report -live" command. Once the DataNode is found to be
  alive this method will return, otherwise it will raise a Fail(...) and retry automatically.
  :param hdfs_binary: name/path of the HDFS binary to use
  :return:
  """
  import params
  import socket

  try:
    dfsadmin_base_command = get_dfsadmin_base_command(hdfs_binary)
    command = dfsadmin_base_command + ' -report -live'
    return_code, hdfs_output = shell.call(command, user=params.hdfs_user)
  except:
    raise Fail('Unable to determine if the DataNode has started after upgrade.')

  if return_code == 0:
    hostname = params.hostname.lower()
    hostname_ip = socket.gethostbyname(params.hostname.lower())
    if hostname in hdfs_output.lower() or hostname_ip in hdfs_output.lower():
      Logger.info("DataNode {0} reports that it has rejoined the cluster.".format(params.hostname))
      return
    else:
      raise Fail("DataNode {0} was not found in the list of live DataNodes".format(params.hostname))

  # return_code is not 0, fail
  raise Fail("Unable to determine if the DataNode has started after upgrade (result code {0})".format(str(return_code)))
def _check_datanode_startup(hdfs_binary):
  """
  Checks that the DataNode process is running and that the DataNode is reported as being alive
  via the "hdfs dfsadmin -fs {namenode_address} -report -live" command. Once the DataNode is
  found to be alive this method will return, otherwise it will raise a Fail(...) and retry
  automatically.
  :param hdfs_binary: name/path of the HDFS binary to use
  :return:
  """

  if not is_datanode_process_running():
    Logger.info("DataNode process is not running")
    raise Fail("DataNode process is not running")

  import params
  import socket

  try:
    dfsadmin_base_command = get_dfsadmin_base_command(hdfs_binary)
    command = dfsadmin_base_command + ' -report -live'
    return_code, hdfs_output = shell.call(command, user=params.hdfs_user)
  except:
    raise Fail('Unable to determine if the DataNode has started after upgrade.')

  if return_code == 0:
    hostname = params.hostname.lower()
    hostname_ip = socket.gethostbyname(params.hostname.lower())
    if hostname in hdfs_output.lower() or hostname_ip in hdfs_output.lower():
      Logger.info("DataNode {0} reports that it has rejoined the cluster.".format(params.hostname))
      return
    else:
      raise Fail("DataNode {0} was not found in the list of live DataNodes".format(params.hostname))

  # return_code is not 0, fail
  raise Fail("Unable to determine if the DataNode has started after upgrade (result code {0})".format(str(return_code)))
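# Illustrative sketch, not part of the original scripts: the membership test above is a
# plain substring match against the full "-report -live" output. The sample report text
# and host names below are assumptions made up for the example, not output captured from
# a real cluster.
def _example_datanode_in_live_report():
  sample_output = ("Live datanodes (1):\n\n"
                   "Name: 192.168.1.10:50010 (dn1.example.com)\n"
                   "Hostname: dn1.example.com")
  hostname = "dn1.example.com"
  hostname_ip = "192.168.1.10"
  # mirrors the check above: either the hostname or its IP must appear in the report
  return hostname.lower() in sample_output.lower() or hostname_ip in sample_output.lower()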
def finalize_upgrade(upgrade_type, hdfs_binary):
  """
  Finalize the Namenode upgrade, at which point it cannot be downgraded.
  :param upgrade_type: rolling or nonrolling
  :param hdfs_binary: name/path of the HDFS binary to use
  """
  Logger.info("Executing Rolling Upgrade finalize")
  import params

  if params.security_enabled:
    kinit_command = format("{params.kinit_path_local} -kt {params.hdfs_user_keytab} {params.hdfs_principal_name}")
    Execute(kinit_command, user=params.hdfs_user, logoutput=True)

  dfsadmin_base_command = get_dfsadmin_base_command(hdfs_binary)
  finalize_cmd = dfsadmin_base_command + " -rollingUpgrade finalize"
  query_cmd = dfsadmin_base_command + " -rollingUpgrade query"

  summary = upgrade_summary.get_upgrade_summary()
  if summary is not None and not summary.is_downgrade_allowed:
    finalize_cmd = dfsadmin_base_command + " -finalizeUpgrade"
    query_cmd = dfsadmin_base_command + " -upgrade query"

  if summary is not None and summary.is_switch_bits:
    Logger.info("The {0} switches the binaries only. No need to call finalization.".format(summary.direction))
  else:
    Execute(query_cmd, user=params.hdfs_user, logoutput=True)
    Execute(finalize_cmd, user=params.hdfs_user, logoutput=True)
    Execute(query_cmd, user=params.hdfs_user, logoutput=True)

  # upgrade is finalized; remove the upgrade marker
  delete_upgrade_marker()
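# Illustrative note, derived from the branches above and assuming a base command of
# "hdfs dfsadmin": the two cases reduce to the following command pairs (query is run
# before and after finalize):
#   downgrade still allowed : hdfs dfsadmin -rollingUpgrade query / -rollingUpgrade finalize
#   downgrade not allowed   : hdfs dfsadmin -upgrade query        / -finalizeUpgrade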
def finalize_upgrade(upgrade_type, hdfs_binary):
  """
  Finalize the Namenode upgrade, at which point it cannot be downgraded.
  :param upgrade_type: rolling or nonrolling
  :param hdfs_binary: name/path of the HDFS binary to use
  """
  Logger.info("Executing Rolling Upgrade finalize")
  import params

  if params.security_enabled:
    kinit_command = format("{params.kinit_path_local} -kt {params.hdfs_user_keytab} {params.hdfs_principal_name}")
    Execute(kinit_command, user=params.hdfs_user, logoutput=True)

  dfsadmin_base_command = get_dfsadmin_base_command(hdfs_binary)
  finalize_cmd = dfsadmin_base_command + " -rollingUpgrade finalize"
  query_cmd = dfsadmin_base_command + " -rollingUpgrade query"

  Execute(query_cmd, user=params.hdfs_user, logoutput=True)
  Execute(finalize_cmd, user=params.hdfs_user, logoutput=True)
  Execute(query_cmd, user=params.hdfs_user, logoutput=True)
def _check_datanode_shutdown(hdfs_binary):
  """
  Checks that a DataNode is down by running "hdfs dfsadmin -getDatanodeInfo"
  several times, pausing in between runs. Once the DataNode stops responding
  this method will return, otherwise it will raise a Fail(...) and retry
  automatically.
  The stack defaults for retrying for HDFS are also way too slow for this
  command; they are set to wait about 45 seconds between client retries. As a
  result, a single execution of dfsadmin will take 45 seconds to retry and the
  DataNode may be marked as dead, causing problems with HBase.
  https://issues.apache.org/jira/browse/HDFS-8510 tracks reducing the times for
  ipc.client.connect.retry.interval. In the meantime, override them here, but
  only for RU.
  :param hdfs_binary: name/path of the HDFS binary to use
  :return:
  """
  import params

  # override stock retry timeouts since after 30 seconds, the datanode is
  # marked as dead and can affect HBase during RU
  dfsadmin_base_command = get_dfsadmin_base_command(hdfs_binary)
  command = format('{dfsadmin_base_command} -D ipc.client.connect.max.retries=5 -D ipc.client.connect.retry.interval=1000 -getDatanodeInfo {dfs_dn_ipc_address}')

  try:
    Execute(command, user=params.hdfs_user, tries=1)
  except:
    Logger.info("DataNode has successfully shutdown for upgrade.")
    return

  Logger.info("DataNode has not shutdown.")
  raise Fail('DataNode has not shutdown.')
def wait_for_safemode_off(afterwait_sleep=0, execute_kinit=False):
  """
  During NonRolling (aka Express Upgrade), after starting NameNode, which is still in safemode,
  and then starting all of the DataNodes, we need the NameNode to receive all of the block reports
  and leave safemode.
  If HA is present, then this command will run individually on each NameNode, which checks for its own address.
  """
  import params

  Logger.info("Waiting to leave safemode since we must transition from ON to OFF.")

  if params.security_enabled and execute_kinit:
    kinit_command = format("{params.kinit_path_local} -kt {params.hdfs_user_keytab} {params.hdfs_principal_name}")
    Execute(kinit_command, user=params.hdfs_user, logoutput=True)

  try:
    # Note, this fails if namenode_address isn't prefixed with "params."
    dfsadmin_base_command = get_dfsadmin_base_command('hdfs', use_specific_namenode=True)
    is_namenode_safe_mode_off = dfsadmin_base_command + " -safemode get | grep 'Safe mode is OFF'"

    # Wait up to ~20 minutes (115 tries x 10 seconds)
    Execute(is_namenode_safe_mode_off,
            tries=115,
            try_sleep=10,
            user=params.hdfs_user,
            logoutput=True)

    # Wait a bit more since YARN still depends on block reports coming in.
    # Also saw intermittent errors with HBASE service check if it was done too soon.
    time.sleep(afterwait_sleep)
  except Fail:
    Logger.error("NameNode is still in safemode, please be careful with commands that need safemode OFF.")
def wait_for_safemode_off(hdfs_binary, afterwait_sleep=0, execute_kinit=False):
  """
  During NonRolling (aka Express Upgrade), after starting NameNode, which is still in safemode,
  and then starting all of the DataNodes, we need the NameNode to receive all of the block reports
  and leave safemode.
  If HA is present, then this command will run individually on each NameNode, which checks for its own address.
  """
  import params

  Logger.info("Waiting to leave safemode since we must transition from ON to OFF.")

  if params.security_enabled and execute_kinit:
    kinit_command = format("{params.kinit_path_local} -kt {params.hdfs_user_keytab} {params.hdfs_principal_name}")
    Execute(kinit_command, user=params.hdfs_user, logoutput=True)

  try:
    # Note, this fails if namenode_address isn't prefixed with "params."
    dfsadmin_base_command = get_dfsadmin_base_command(hdfs_binary, use_specific_namenode=True)
    is_namenode_safe_mode_off = dfsadmin_base_command + " -safemode get | grep 'Safe mode is OFF'"

    # Wait up to ~20 minutes (115 tries x 10 seconds)
    Execute(is_namenode_safe_mode_off,
            tries=115,
            try_sleep=10,
            user=params.hdfs_user,
            logoutput=True)

    # Wait a bit more since YARN still depends on block reports coming in.
    # Also saw intermittent errors with HBASE service check if it was done too soon.
    time.sleep(afterwait_sleep)
  except Fail:
    Logger.error("NameNode is still in safemode, please be careful with commands that need safemode OFF.")
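# Illustrative sketch, not part of the original scripts: rough shape of the check
# composed above, assuming a dfsadmin base of "hdfs dfsadmin -fs hdfs://nn-host:8020"
# (a made-up example address). The retry budget is roughly tries * try_sleep, i.e.
# 115 * 10 = 1150 seconds (about 19 minutes of waiting), plus the time each grep takes.
_example_safemode_check = "hdfs dfsadmin -fs hdfs://nn-host:8020 -safemode get | grep 'Safe mode is OFF'"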
def pre_rolling_upgrade_shutdown(hdfs_binary):
  """
  Runs the "shutdownDatanode {ipc_address} upgrade" command to shutdown the
  DataNode in preparation for an upgrade. This will then periodically check
  "getDatanodeInfo" to ensure the DataNode has shutdown correctly.
  This function will obtain the Kerberos ticket if security is enabled.
  :param hdfs_binary: name/path of the HDFS binary to use
  :return: Return True if ran ok (even with errors), and False if need to stop the datanode forcefully.
  """
  import params

  Logger.info('DataNode executing "shutdownDatanode" command in preparation for upgrade...')
  if params.security_enabled:
    Execute(params.dn_kinit_cmd, user=params.hdfs_user)

  dfsadmin_base_command = get_dfsadmin_base_command(hdfs_binary)
  command = format('{dfsadmin_base_command} -shutdownDatanode {dfs_dn_ipc_address} upgrade')

  code, output = shell.call(command, user=params.hdfs_user)
  if code == 0:
    # verify that the datanode is down
    _check_datanode_shutdown(hdfs_binary)
  else:
    # Due to bug HDFS-7533, DataNode may not always shutdown during stack upgrade, and it is necessary to kill it.
    if output is not None and re.search("Shutdown already in progress", output):
      Logger.error("Due to a known issue in DataNode, the command {0} did not work, so will need to shutdown the datanode forcefully.".format(command))
      return False

  return True
def pre_rolling_upgrade_shutdown(hdfs_binary):
  """
  Runs the "shutdownDatanode {ipc_address} upgrade" command to shutdown the
  DataNode in preparation for an upgrade. This will then periodically check
  "getDatanodeInfo" to ensure the DataNode has shutdown correctly.
  This function will obtain the Kerberos ticket if security is enabled.
  :param hdfs_binary: name/path of the HDFS binary to use
  :return: Return True if ran ok (even with errors), and False if need to stop the datanode forcefully.
  """
  import params

  Logger.info('DataNode executing "shutdownDatanode" command in preparation for upgrade...')
  if params.security_enabled:
    Execute(params.dn_kinit_cmd, user=params.hdfs_user)

  dfsadmin_base_command = get_dfsadmin_base_command(hdfs_binary)
  command = format('{dfsadmin_base_command} -shutdownDatanode {dfs_dn_ipc_address} upgrade')

  code, output = shell.call(command, user=params.hdfs_user)
  if code == 0:
    # verify that the datanode is down
    _check_datanode_shutdown(hdfs_binary)
  else:
    # Due to bug HDFS-7533, DataNode may not always shutdown during stack upgrade, and it is necessary to kill it.
    if output is not None and re.search("Shutdown already in progress", output):
      Logger.error("Due to a known issue in DataNode, the command {0} did not work, so will need to shutdown the datanode forcefully.".format(command))
      return False

  return True
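# Illustrative sketch, not part of the original scripts: a hypothetical caller showing
# how the True/False contract above is intended to be used. The fallback branch is a
# placeholder, not the project's actual DataNode stop handler.
def _example_stop_datanode_for_upgrade(hdfs_binary):
  graceful = pre_rolling_upgrade_shutdown(hdfs_binary)
  if not graceful:
    # HDFS-7533: "shutdownDatanode ... upgrade" did not take effect, so a real caller
    # would fall back to a regular stop that kills the DataNode process.
    pass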
def prepare_upgrade_finalize_previous_upgrades(hdfs_binary):
  """
  During a NonRolling (aka Express Upgrade), preparing the NameNode requires finalizing any upgrades that are in progress.
  :param hdfs_binary: name/path of the HDFS binary to use
  """
  import params

  dfsadmin_base_command = get_dfsadmin_base_command(hdfs_binary)
  finalize_command = dfsadmin_base_command + " -rollingUpgrade finalize"
  try:
    Logger.info("Attempt to Finalize if there are any in-progress upgrades. "
                "This will return 255 if no upgrades are in progress.")
    code, out = shell.checked_call(finalize_command, logoutput=True, user=params.hdfs_user)
    if out:
      expected_substring = "there is no rolling upgrade in progress"
      if expected_substring not in out.lower():
        Logger.warning('Finalize command did not contain substring: %s' % expected_substring)
    else:
      Logger.warning("Finalize command did not return any output.")
  except Exception, e:
    Logger.warning("Ensure no upgrades are in progress.")
def finalize_upgrade(upgrade_type, hdfs_binary):
  """
  Finalize the Namenode upgrade, at which point it cannot be downgraded.
  :param upgrade_type: rolling or nonrolling
  :param hdfs_binary: name/path of the HDFS binary to use
  """
  Logger.info("Executing Rolling Upgrade finalize")
  import params

  if params.security_enabled:
    kinit_command = format("{params.kinit_path_local} -kt {params.hdfs_user_keytab} {params.hdfs_principal_name}")
    Execute(kinit_command, user=params.hdfs_user, logoutput=True)

  dfsadmin_base_command = get_dfsadmin_base_command(hdfs_binary)
  finalize_cmd = dfsadmin_base_command + " -rollingUpgrade finalize"
  query_cmd = dfsadmin_base_command + " -rollingUpgrade query"

  Execute(query_cmd, user=params.hdfs_user, logoutput=True)
  Execute(finalize_cmd, user=params.hdfs_user, logoutput=True)
  Execute(query_cmd, user=params.hdfs_user, logoutput=True)

  # upgrade is finalized; remove the upgrade marker
  delete_upgrade_marker()
def post_upgrade_restart(self, env, upgrade_type=None):
  Logger.info("Executing Stack Upgrade post-restart")
  import params
  env.set_params(params)

  dfsadmin_base_command = get_dfsadmin_base_command('hdfs')
  dfsadmin_cmd = dfsadmin_base_command + " -report -live"
  Execute(dfsadmin_cmd, user=params.hdfs_user, tries=60, try_sleep=10)
def post_upgrade_restart(self, env):
  Logger.info("Executing Stack Upgrade post-restart")
  import params
  env.set_params(params)

  hdfs_binary = self.get_hdfs_binary()
  dfsadmin_base_command = get_dfsadmin_base_command(hdfs_binary)
  dfsadmin_cmd = dfsadmin_base_command + " -report -live"
  Execute(dfsadmin_cmd, user=params.hdfs_user, tries=60, try_sleep=10)
def hdfs_roll_edits():
  """
  HDFS_CLIENT needs to be a dependency of JOURNALNODE
  Roll the logs so that Namenode will be able to connect to the Journalnode.
  Must kinit before calling this command.
  """
  import params

  # TODO, this will need to be documented since existing HDP 2.2 clusters will need HDFS_CLIENT on all JOURNALNODE hosts
  dfsadmin_base_command = get_dfsadmin_base_command('hdfs')
  command = dfsadmin_base_command + ' -rollEdits'
  Execute(command, user=params.hdfs_user, tries=1)
def hdfs_roll_edits():
  """
  HDFS_CLIENT needs to be a dependency of JOURNALNODE
  Roll the logs so that Namenode will be able to connect to the Journalnode.
  Must kinit before calling this command.
  """
  import params

  # TODO, this will need to be doc'ed since existing clusters will need HDFS_CLIENT on all JOURNALNODE hosts
  dfsadmin_base_command = get_dfsadmin_base_command('hdfs')
  command = dfsadmin_base_command + ' -rollEdits'
  Execute(command, user=params.hdfs_user, tries=1)
def post_upgrade_restart(self, env, upgrade_type=None):
  Logger.info("Executing Stack Upgrade post-restart")
  import params
  env.set_params(params)

  hdfs_binary = self.get_hdfs_binary()
  dfsadmin_base_command = get_dfsadmin_base_command(hdfs_binary)
  dfsadmin_cmd = dfsadmin_base_command + " -report -live"
  Execute(dfsadmin_cmd, user=params.hdfs_user, tries=60, try_sleep=10)
def prepare_rolling_upgrade(hdfs_binary):
  """
  This can be called during either Rolling Upgrade or Express Upgrade (aka nonrolling)

  Rolling Upgrade for HDFS Namenode requires the following.
  0. Namenode must be up
  1. If HA: leave safemode if the safemode status is not OFF
  2. Execute a rolling upgrade "prepare"
  3. Execute a rolling upgrade "query"
  :param hdfs_binary: name/path of the HDFS binary to use
  """
  import params

  if not params.upgrade_direction or params.upgrade_direction not in [Direction.UPGRADE, Direction.DOWNGRADE]:
    raise Fail("Could not retrieve upgrade direction: %s" % str(params.upgrade_direction))
  Logger.info(format("Performing a(n) {params.upgrade_direction} of HDFS"))

  if params.security_enabled:
    kinit_command = format("{params.kinit_path_local} -kt {params.hdfs_user_keytab} {params.hdfs_principal_name}")
    Execute(kinit_command, user=params.hdfs_user, logoutput=True)

  if params.upgrade_direction == Direction.UPGRADE:
    if params.dfs_ha_enabled:
      Logger.info('High Availability is enabled, must leave safemode before calling "-rollingUpgrade prepare"')
      desired_state = SafeMode.OFF
      safemode_transition_successful, original_state = reach_safemode_state(params.hdfs_user, desired_state, True, hdfs_binary)
      if not safemode_transition_successful:
        raise Fail("Could not transition to safemode state %s. Please check logs to make sure namenode is up." % str(desired_state))

    summary = upgrade_summary.get_upgrade_summary()
    if summary is not None and summary.is_switch_bits:
      Logger.info("The {0} switches the binaries only. No need to call prepare.".format(summary.direction))
    else:
      dfsadmin_base_command = get_dfsadmin_base_command(hdfs_binary)
      prepare = dfsadmin_base_command + " -rollingUpgrade prepare"
      query = dfsadmin_base_command + " -rollingUpgrade query"
      Execute(prepare, user=params.hdfs_user, logoutput=True)
      Execute(query, user=params.hdfs_user, logoutput=True)
def reach_safemode_state(user, safemode_state, in_ha, hdfs_binary):
  """
  Enter or leave safemode for the Namenode.
  :param user: user to perform action as
  :param safemode_state: Desired state of ON or OFF
  :param in_ha: bool indicating if Namenode High Availability is enabled
  :param hdfs_binary: name/path of the HDFS binary to use
  :return: Returns a tuple of (transition success, original state). If no change is needed,
  the indicator of success will be True
  """
  Logger.info("Prepare to transition into safemode state %s" % safemode_state)
  import params

  original_state = SafeMode.UNKNOWN

  dfsadmin_base_command = get_dfsadmin_base_command(hdfs_binary)
  safemode_base_command = dfsadmin_base_command + " -safemode "
  safemode_check_cmd = safemode_base_command + " get"

  grep_pattern = format("Safe mode is {safemode_state}")
  safemode_check_with_grep = format("{safemode_check_cmd} | grep '{grep_pattern}'")

  code, out = shell.call(safemode_check_cmd, user=user, logoutput=True)
  Logger.info("Command: %s\nCode: %d." % (safemode_check_cmd, code))
  if code == 0 and out is not None:
    Logger.info(out)
    re_pattern = r"Safe mode is (\S*)"
    Logger.info("Pattern to search: {0}".format(re_pattern))
    m = re.search(re_pattern, out, re.IGNORECASE)
    if m and len(m.groups()) >= 1:
      original_state = m.group(1).upper()

      if original_state == safemode_state:
        return (True, original_state)
      else:
        # Make a transition
        command = safemode_base_command + safemode_to_instruction[safemode_state]
        Execute(command, user=user, logoutput=True, path=[params.hadoop_bin_dir])

        code, out = shell.call(safemode_check_with_grep, user=user)
        Logger.info("Command: %s\nCode: %d. Out: %s" % (safemode_check_with_grep, code, out))
        if code == 0:
          return (True, original_state)
  return (False, original_state)
def prepare_upgrade_save_namespace():
  """
  During a NonRolling (aka Express Upgrade), preparing the NameNode requires saving the namespace.
  """
  import params

  dfsadmin_base_command = get_dfsadmin_base_command('hdfs')
  save_namespace_cmd = dfsadmin_base_command + " -saveNamespace"
  try:
    Logger.info("Checkpoint the current namespace.")
    as_user(save_namespace_cmd, params.hdfs_user, env={'PATH': params.hadoop_bin_dir})
  except Exception, e:
    message = format("Could not save the NameSpace. As the HDFS user, call this command: {save_namespace_cmd}")
    Logger.error(message)
    raise Fail(message)
def prepare_upgrade_save_namespace(hdfs_binary):
  """
  During a NonRolling (aka Express Upgrade), preparing the NameNode requires saving the namespace.
  :param hdfs_binary: name/path of the HDFS binary to use
  """
  import params

  dfsadmin_base_command = get_dfsadmin_base_command(hdfs_binary)
  save_namespace_cmd = dfsadmin_base_command + " -saveNamespace"
  try:
    Logger.info("Checkpoint the current namespace.")
    as_user(save_namespace_cmd, params.hdfs_user, env={'PATH': params.hadoop_bin_dir})
  except Exception, e:
    message = format("Could not save the NameSpace. As the HDFS user, call this command: {save_namespace_cmd}")
    Logger.error(message)
    raise Fail(message)
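# Illustrative note, assuming a base command of "hdfs dfsadmin": the call above runs
#   hdfs dfsadmin -saveNamespace
# The NameNode must already be in safemode for saveNamespace to succeed, which is why
# prepare_upgrade_enter_safe_mode is expected to run before this step.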
def reach_safemode_state(user, safemode_state, in_ha, hdfs_binary):
  """
  Enter or leave safemode for the Namenode.
  :param user: user to perform action as
  :param safemode_state: Desired state of ON or OFF
  :param in_ha: bool indicating if Namenode High Availability is enabled
  :param hdfs_binary: name/path of the HDFS binary to use
  :return: Returns a tuple of (transition success, original state). If no change is needed,
  the indicator of success will be True
  """
  Logger.info("Prepare to transition into safemode state %s" % safemode_state)
  import params

  original_state = SafeMode.UNKNOWN

  dfsadmin_base_command = get_dfsadmin_base_command(hdfs_binary)
  safemode_base_command = dfsadmin_base_command + " -safemode "
  safemode_check_cmd = safemode_base_command + " get"

  grep_pattern = format("Safe mode is {safemode_state}")
  safemode_check_with_grep = format("{safemode_check_cmd} | grep '{grep_pattern}'")

  code, out = shell.call(safemode_check_cmd, user=user, logoutput=True)
  Logger.info("Command: %s\nCode: %d." % (safemode_check_cmd, code))
  if code == 0 and out is not None:
    Logger.info(out)
    re_pattern = r"Safe mode is (\S*)"
    Logger.info("Pattern to search: {0}".format(re_pattern))
    m = re.search(re_pattern, out, re.IGNORECASE)
    if m and len(m.groups()) >= 1:
      original_state = m.group(1).upper()

      if original_state == safemode_state:
        return (True, original_state)
      else:
        # Make a transition
        command = safemode_base_command + safemode_to_instruction[safemode_state]
        Execute(command, user=user, logoutput=True, path=[params.hadoop_bin_dir])

        code, out = shell.call(safemode_check_with_grep, user=user)
        Logger.info("Command: %s\nCode: %d. Out: %s" % (safemode_check_with_grep, code, out))
        if code == 0:
          return (True, original_state)
  return (False, original_state)
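# Illustrative sketch, not part of the original scripts: how the regular expression
# above extracts the current state from dfsadmin output. The sample string is an
# assumption for the example; real output may carry extra text after ON/OFF.
def _example_parse_safemode_state():
  import re
  sample = "Safe mode is ON"
  m = re.search(r"Safe mode is (\S*)", sample, re.IGNORECASE)
  return m.group(1).upper() if m else "UNKNOWN"  # -> "ON"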
def prepare_upgrade_enter_safe_mode(hdfs_binary):
  """
  During a NonRolling (aka Express Upgrade), preparing the NameNode requires first entering Safemode.
  :param hdfs_binary: name/path of the HDFS binary to use
  """
  import params

  dfsadmin_base_command = get_dfsadmin_base_command(hdfs_binary)
  safe_mode_enter_cmd = dfsadmin_base_command + " -safemode enter"
  try:
    # Safe to call if already in Safe Mode
    desired_state = SafeMode.ON
    safemode_transition_successful, original_state = reach_safemode_state(params.hdfs_user, desired_state, params.dfs_ha_enabled, hdfs_binary)
    Logger.info("Transition successful: {0}, original state: {1}".format(str(safemode_transition_successful), str(original_state)))
    if not safemode_transition_successful:
      raise Fail("Could not transition to safemode state %s. Please check logs to make sure namenode is up." % str(desired_state))
  except Exception, e:
    message = "Could not enter safemode. Error: {0}. As the HDFS user, call this command: {1}".format(str(e), safe_mode_enter_cmd)
    Logger.error(message)
    raise Fail(message)
def prepare_upgrade_finalize_previous_upgrades(hdfs_binary):
  """
  During a NonRolling (aka Express Upgrade), preparing the NameNode requires finalizing any upgrades that are in progress.
  :param hdfs_binary: name/path of the HDFS binary to use
  """
  import params

  dfsadmin_base_command = get_dfsadmin_base_command(hdfs_binary)
  finalize_command = dfsadmin_base_command + " -rollingUpgrade finalize"
  try:
    Logger.info("Attempt to Finalize if there are any in-progress upgrades. "
                "This will return 255 if no upgrades are in progress.")
    code, out = shell.checked_call(finalize_command, logoutput=True, user=params.hdfs_user)
    if out:
      expected_substring = "there is no rolling upgrade in progress"
      if expected_substring not in out.lower():
        Logger.warning('Finalize command did not contain substring: %s' % expected_substring)
    else:
      Logger.warning("Finalize command did not return any output.")
  except Exception, e:
    Logger.warning("Ensure no upgrades are in progress.")
def prepare_rolling_upgrade(hdfs_binary):
  """
  This can be called during either Rolling Upgrade or Express Upgrade (aka nonrolling)

  Rolling Upgrade for HDFS Namenode requires the following.
  0. Namenode must be up
  1. If HA: leave safemode if the safemode status is not OFF
  2. Execute a rolling upgrade "prepare"
  3. Execute a rolling upgrade "query"
  :param hdfs_binary: name/path of the HDFS binary to use
  """
  import params

  if not params.upgrade_direction or params.upgrade_direction not in [Direction.UPGRADE, Direction.DOWNGRADE]:
    raise Fail("Could not retrieve upgrade direction: %s" % str(params.upgrade_direction))
  Logger.info(format("Performing a(n) {params.upgrade_direction} of HDFS"))

  if params.security_enabled:
    kinit_command = format("{params.kinit_path_local} -kt {params.hdfs_user_keytab} {params.hdfs_principal_name}")
    Execute(kinit_command, user=params.hdfs_user, logoutput=True)

  if params.upgrade_direction == Direction.UPGRADE:
    if params.dfs_ha_enabled:
      Logger.info('High Availability is enabled, must leave safemode before calling "-rollingUpgrade prepare"')
      desired_state = SafeMode.OFF
      safemode_transition_successful, original_state = reach_safemode_state(params.hdfs_user, desired_state, True, hdfs_binary)
      if not safemode_transition_successful:
        raise Fail("Could not transition to safemode state %s. Please check logs to make sure namenode is up." % str(desired_state))

    dfsadmin_base_command = get_dfsadmin_base_command(hdfs_binary)
    prepare = dfsadmin_base_command + " -rollingUpgrade prepare"
    query = dfsadmin_base_command + " -rollingUpgrade query"
    Execute(prepare, user=params.hdfs_user, logoutput=True)
    Execute(query, user=params.hdfs_user, logoutput=True)
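# Illustrative note, assuming a base command of "hdfs dfsadmin": the two Execute calls
# above expand to
#   hdfs dfsadmin -rollingUpgrade prepare
#   hdfs dfsadmin -rollingUpgrade query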
def namenode(action=None, hdfs_binary=None, do_format=True, upgrade_type=None, env=None):
  if action is None:
    raise Fail('"action" parameter is required for function namenode().')

  if action in ["start", "stop"] and hdfs_binary is None:
    raise Fail('"hdfs_binary" parameter is required for function namenode().')

  if action == "configure":
    import params
    # we need this directory to be present before any action (HA manual steps for
    # additional namenode)
    create_name_dirs(params.dfs_name_dir)
  elif action == "start":
    Logger.info("Called service {0} with upgrade_type: {1}".format(action, str(upgrade_type)))
    setup_ranger_hdfs(upgrade_type=upgrade_type)
    import params
    if do_format:
      format_namenode()
      pass

    File(params.exclude_file_path,
         content=Template("exclude_hosts_list.j2"),
         owner=params.hdfs_user,
         group=params.user_group)

    Directory(params.hadoop_pid_dir_prefix,
              mode=0755,
              owner=params.hdfs_user,
              group=params.user_group)

    if params.dfs_ha_enabled and \
        params.dfs_ha_namenode_standby is not None and \
        params.hostname == params.dfs_ha_namenode_standby:
      # if the current host is the standby NameNode in an HA deployment
      # run the bootstrap command, to start the NameNode in standby mode
      # this requires that the active NameNode is already up and running,
      # so this execute should be re-tried upon failure, up to a timeout
      success = bootstrap_standby_namenode(params)
      if not success:
        raise Fail("Could not bootstrap standby namenode")

    if upgrade_type == "rolling" and params.dfs_ha_enabled:
      # Most likely, ZKFC is up since RU will initiate the failover command. However, if that failed, it would have tried
      # to kill ZKFC manually, so we need to start it if not already running.
      safe_zkfc_op(action, env)

    #options = "-rollingUpgrade started" if rolling_restart else ""
    options = ""
    if upgrade_type == "rolling":
      if params.upgrade_direction == Direction.UPGRADE:
        options = "-rollingUpgrade started"
      elif params.upgrade_direction == Direction.DOWNGRADE:
        options = "-rollingUpgrade downgrade"
    elif upgrade_type == "nonrolling":
      is_previous_image_dir = is_previous_fs_image()
      Logger.info(format("Previous file system image dir present is {is_previous_image_dir}"))

      if params.upgrade_direction == Direction.UPGRADE:
        options = "-rollingUpgrade started"
      elif params.upgrade_direction == Direction.DOWNGRADE:
        options = "-rollingUpgrade downgrade"

    Logger.info(format("Option for start command: {options}"))

    service(
      action="start",
      name="namenode",
      user=params.hdfs_user,
      options=options,
      create_pid_dir=True,
      create_log_dir=True
    )

    if params.security_enabled:
      Execute(format("{kinit_path_local} -kt {hdfs_user_keytab} {hdfs_principal_name}"),
              user=params.hdfs_user)

    dfsadmin_base_command = get_dfsadmin_base_command(hdfs_binary, use_specific_namenode=True)
    is_namenode_safe_mode_off = dfsadmin_base_command + " -safemode get | grep 'Safe mode is OFF'"
    if params.dfs_ha_enabled:
      is_active_namenode_cmd = as_user(format("{hdfs_binary} --config {hadoop_conf_dir} haadmin -getServiceState {namenode_id} | grep active"),
                                       params.hdfs_user, env={'PATH': params.hadoop_bin_dir})
    else:
      is_active_namenode_cmd = True

    # During NonRolling Upgrade, both NameNodes are initially down,
    # so no point in checking if this is the active or standby.
    if upgrade_type == "nonrolling":
      is_active_namenode_cmd = False

    # ___Scenario____________|_Expected safemode state__|_Wait for safemode OFF____|
    # no-HA                  | ON -> OFF                | Yes                      |
    # HA and active          | ON -> OFF                | Yes                      |
    # HA and standby         | no change                | no check                 |
    # RU with HA on active   | ON -> OFF                | Yes                      |
    # RU with HA on standby  | ON -> OFF                | Yes                      |
    # EU with HA on active   | no change                | no check                 |
    # EU with HA on standby  | no change                | no check                 |
    # EU non-HA              | no change                | no check                 |

    check_for_safemode_off = False
    msg = ""
    if params.dfs_ha_enabled:
      if upgrade_type is not None:
        check_for_safemode_off = True
        msg = "Must wait to leave safemode since High Availability is enabled during a Stack Upgrade"
      else:
        Logger.info("Wait for NameNode to become active.")
        if is_active_namenode(hdfs_binary): # active
          check_for_safemode_off = True
          msg = "Must wait to leave safemode since High Availability is enabled and this is the Active NameNode."
        else:
          msg = "Will remain in the current safemode state."
    else:
      msg = "Must wait to leave safemode since High Availability is not enabled."
      check_for_safemode_off = True

    Logger.info(msg)

    # During a NonRolling (aka Express Upgrade), stay in safemode since the DataNodes are down.
    stay_in_safe_mode = False
    if upgrade_type == "nonrolling":
      stay_in_safe_mode = True

    if check_for_safemode_off:
      Logger.info("Stay in safe mode: {0}".format(stay_in_safe_mode))
      if not stay_in_safe_mode:
        Logger.info("Waiting to leave safemode since we must transition from ON to OFF.")
        try:
          # Wait up to 30 mins
          Execute(is_namenode_safe_mode_off,
                  tries=180,
                  try_sleep=10,
                  user=params.hdfs_user,
                  logoutput=True)
        except Fail:
          Logger.error("NameNode is still in safemode, please be careful with commands that need safemode OFF.")

    # Always run this on non-HA, or active NameNode during HA.
    create_hdfs_directories(is_active_namenode_cmd)
    create_ranger_audit_hdfs_directories(is_active_namenode_cmd)

  elif action == "stop":
    import params
    service(
      action="stop",
      name="namenode",
      user=params.hdfs_user
    )
  elif action == "status":
    import status_params
    check_process_status(status_params.namenode_pid_file)
  elif action == "decommission":
    decommission()