def is_this_namenode_active(name_service):
    """
    Report whether this NameNode (params.namenode_id) is currently Active.

    The HA state of the NameNodes in the given name service is queried and the
    answer is derived from where this NameNode shows up:

    - listed as Active  -> returns True
    - listed as Standby -> returns False
    - listed as neither -> raises Fail

    Refusing to answer until this NameNode is either Active or Standby guards
    against the case where the other NameNode is seen as Active before this one
    has finished booting. If that other NameNode were then restarted while this
    one had not yet entered Standby, service could be lost.

    NOTE(review): the raised Fail is meant to trigger a retry of this function;
    the retry mechanism itself is not visible in this block - confirm it is
    applied where the function is defined or called.
    """
    import params

    # The result is a tuple of lists of (namenode_id, address) pairs, e.g.
    #   ([('nn1', 'c6401.ambari.apache.org:50070')], [('nn2', 'c6402.ambari.apache.org:50070')], [])
    # Element 0 lists the active NameNodes, element 1 the standby ones;
    # element 2 is not used here.
    ha_states = namenode_ha_utils.get_namenode_states(
        params.hdfs_site,
        params.security_enabled,
        params.hdfs_user,
        times=5,
        sleep_time=5,
        backoff_factor=2,
        name_service=name_service)

    active_entries = ha_states[0] if len(ha_states[0]) > 0 else []
    standby_entries = ha_states[1] if len(ha_states[1]) > 0 else []

    # Is this NameNode among the active ones?
    if any(params.namenode_id in entry for entry in active_entries):
        return True

    # Not active: it has to be registered as standby for a definite "no".
    if any(params.namenode_id in entry for entry in standby_entries):
        return False

    # Neither active nor standby yet - no verdict can be given, so fail and let
    # the caller's retry handling try again later.
    raise Fail(
        format(
            "The NameNode {namenode_id} is not listed as Active or Standby, waiting..."
        ))
# Example #2
# 0
def is_there_any_active_nn(name_service):
    """
    Tell whether the given name service currently reports an Active NameNode.

    :param name_service: name service passed through to get_namenode_states
    :return: True if the list of active NameNodes is non-empty, False otherwise
    """
    import params

    ha_states = namenode_ha_utils.get_namenode_states(
        params.hdfs_site,
        params.security_enabled,
        params.hdfs_user,
        times=3,
        sleep_time=3,
        backoff_factor=2,
        name_service=name_service)

    # Element 0 holds the active NameNodes; element 1 (the standby ones) is
    # irrelevant for this check.
    return len(ha_states[0]) > 0
def is_there_any_active_nn(name_service):
    """
    Check whether at least one NameNode of the name service is in Active state.

    :param name_service: name service passed through to get_namenode_states
    :return: False when no active NameNode is listed, True otherwise
    """
    import params

    # Only the first element of the result is needed: the list of active
    # NameNodes, e.g. [('nn1', 'c6401.ambari.apache.org:50070')]. The second
    # element would be the standby NameNodes.
    active_entries = namenode_ha_utils.get_namenode_states(
        params.hdfs_site,
        params.security_enabled,
        params.hdfs_user,
        times=3,
        sleep_time=3,
        backoff_factor=2,
        name_service=name_service)[0]

    if len(active_entries) < 1:
        return False
    return True
# Example #4
# 0
def initiate_safe_zkfc_failover():
    """
    Fail over away from this NameNode if it is active, then wait for standby.

    Steps performed:

    1. When security is enabled, kinit as the HDFS user before any HDFS command.
    2. Query the NameNode HA states and log the first active id, the first
       standby id and every NameNode whose state is unknown.
    3. Only if this NameNode is the active one and the other NameNode is the
       standby one, run "hdfs haadmin -failover". Otherwise just log that the
       failover is skipped.
    4. If the failover command fails, kill ZKFC on this host to force a
       failover and probe the service state once. In that case ZKFC will also
       have to be started manually during the Restart.
    5. Wait (retrying the standby check) until this NameNode reports standby,
       when either the failover command succeeded or ZKFC was actually killed
       and the probe did not report the NameNode as already down.
    """
    import params

    # Must kinit before running the HDFS command
    if params.security_enabled:
        kinit_command = format(
            "{kinit_path_local} -kt {hdfs_user_keytab} {hdfs_principal_name}")
        Execute(kinit_command, user=params.hdfs_user)

    active_namenodes, standby_namenodes, unknown_namenodes = get_namenode_states(
        params.hdfs_site, params.security_enabled, params.hdfs_user)

    # Each entry is indexed as entry[0] == NameNode id; only the first entry of
    # each list is considered. These two names are referenced by the format()
    # templates below, so they must stay locals of this function.
    active_namenode_id = active_namenodes[0][0] if active_namenodes else None
    standby_namenode_id = standby_namenodes[0][0] if standby_namenodes else None

    if active_namenode_id:
        Logger.info(format("Active NameNode id: {active_namenode_id}"))
    if standby_namenode_id:
        Logger.info(format("Standby NameNode id: {standby_namenode_id}"))
    for unknown_namenode in unknown_namenodes or ():
        Logger.info("NameNode HA state for {0} is unknown".format(
            unknown_namenode[0]))

    # Nothing to do unless this NameNode is active AND the other one is up and
    # in standby (i.e. ready to become active on failover).
    if params.namenode_id != active_namenode_id or params.other_namenode_id != standby_namenode_id:
        Logger.info(
            "Rolling Upgrade - Skipping ZKFC failover on NameNode host {0}.".format(
                params.hostname))
        return

    Logger.info(
        format(
            "NameNode {namenode_id} is active and NameNode {other_namenode_id} is in standby"
        ))

    failover_command = format(
        "hdfs haadmin -ns {dfs_ha_nameservices} -failover {namenode_id} {other_namenode_id}"
    )
    check_standby_cmd = format(
        "hdfs haadmin -ns {dfs_ha_nameservices} -getServiceState {namenode_id} | grep standby"
    )

    Logger.info(
        "Rolling Upgrade - Initiating a ZKFC failover on active NameNode host {0}.".format(
            params.hostname))
    code, out = shell.call(failover_command,
                           user=params.hdfs_user,
                           logoutput=True)
    Logger.info(
        format("Rolling Upgrade - failover command returned {code}"))

    # A clean failover means we simply wait for this NameNode to turn standby.
    wait_for_standby = (code == 0)

    if not wait_for_standby:
        # Try to kill ZKFC manually
        was_zkfc_killed = kill_zkfc(params.hdfs_user)
        code, out = shell.call(check_standby_cmd,
                               user=params.hdfs_user,
                               logoutput=True)
        Logger.info(
            format("Rolling Upgrade - check for standby returned {code}"))
        if code == 255 and out:
            Logger.info("Rolling Upgrade - NameNode is already down.")
        elif was_zkfc_killed:
            # Only mandate that this be the standby namenode if ZKFC was
            # indeed killed to initiate a failover.
            wait_for_standby = True

    if wait_for_standby:
        Logger.info("Waiting for this NameNode to become the standby one.")
        Execute(check_standby_cmd,
                user=params.hdfs_user,
                tries=50,
                try_sleep=6,
                logoutput=True)
# Example #5
# 0
def initiate_safe_zkfc_failover():
  """
  Hand the active role over to the other NameNode, if this NameNode holds it.

  After an optional kinit (security enabled) the NameNode HA states are read
  and logged. The failover is attempted only when this NameNode is the active
  one and the other NameNode is the standby one; in every other case the
  function logs that the failover is skipped and returns.

  When the "hdfs haadmin -failover" command fails, ZKFC is killed on this host
  to force a failover (so ZKFC will also have to be started manually during the
  Restart) and the service state is probed once. The function finally waits for
  this NameNode to report standby if the failover command succeeded, or if ZKFC
  was really killed and the probe did not show the NameNode as already down.
  """
  import params

  # Must kinit before running the HDFS command
  if params.security_enabled:
    kinit_command = format("{kinit_path_local} -kt {hdfs_user_keytab} {hdfs_principal_name}")
    Execute(kinit_command, user = params.hdfs_user)

  active_namenodes, standby_namenodes, unknown_namenodes = get_namenode_states(
    params.hdfs_site, params.security_enabled, params.hdfs_user)

  # The first entry of each list supplies the id (entry[0]). Both names are
  # looked up by the format() templates below and must remain local variables.
  active_namenode_id = active_namenodes[0][0] if active_namenodes else None
  standby_namenode_id = standby_namenodes[0][0] if standby_namenodes else None

  if active_namenode_id:
    Logger.info(format("Active NameNode id: {active_namenode_id}"))
  if standby_namenode_id:
    Logger.info(format("Standby NameNode id: {standby_namenode_id}"))
  for unknown_namenode in unknown_namenodes or ():
    Logger.info("NameNode HA state for {0} is unknown".format(unknown_namenode[0]))

  # Failover only if this NameNode is active and the other NameNode is up and
  # in standby (i.e. ready to become active on failover); otherwise bail out.
  if params.namenode_id != active_namenode_id or params.other_namenode_id != standby_namenode_id:
    Logger.info("Rolling Upgrade - Skipping ZKFC failover on NameNode host {0}.".format(params.hostname))
    return

  Logger.info(format("NameNode {namenode_id} is active and NameNode {other_namenode_id} is in standby"))

  failover_command = format("hdfs haadmin -failover {namenode_id} {other_namenode_id}")
  check_standby_cmd = format("hdfs haadmin -getServiceState {namenode_id} | grep standby")

  Logger.info("Rolling Upgrade - Initiating a ZKFC failover on active NameNode host {0}.".format(params.hostname))
  code, out = shell.call(failover_command, user=params.hdfs_user, logoutput=True)
  Logger.info(format("Rolling Upgrade - failover command returned {code}"))

  # Successful failover command: just wait for the standby state below.
  wait_for_standby = (code == 0)

  if not wait_for_standby:
    # Try to kill ZKFC manually
    was_zkfc_killed = kill_zkfc(params.hdfs_user)
    code, out = shell.call(check_standby_cmd, user=params.hdfs_user, logoutput=True)
    Logger.info(format("Rolling Upgrade - check for standby returned {code}"))
    if code == 255 and out:
      Logger.info("Rolling Upgrade - NameNode is already down.")
    elif was_zkfc_killed:
      # Only mandate that this be the standby namenode if ZKFC was indeed killed to initiate a failover.
      wait_for_standby = True

  if wait_for_standby:
    Logger.info("Waiting for this NameNode to become the standby one.")
    Execute(check_standby_cmd,
            user=params.hdfs_user,
            tries=50,
            try_sleep=6,
            logoutput=True)