Ejemplo n.º 1
0
def namenode(action=None,
             hdfs_binary=None,
             do_format=True,
             upgrade_type=None,
             upgrade_suspended=False,
             env=None):

    if action is None:
        raise Fail('"action" parameter is required for function namenode().')

    if action in ["start", "stop"] and hdfs_binary is None:
        raise Fail(
            '"hdfs_binary" parameter is required for function namenode().')

    if action == "configure":
        import params
        #we need this directory to be present before any action(HA manual steps for
        #additional namenode)
        create_name_dirs(params.dfs_name_dir)

        # set up failover /  secure zookeper ACLs, this feature is supported from HDP 2.6 ownwards
        set_up_zkfc_security(params)
    elif action == "start":
        Logger.info("Called service {0} with upgrade_type: {1}".format(
            action, str(upgrade_type)))
        setup_ranger_hdfs(upgrade_type=upgrade_type)
        import params

        File(params.exclude_file_path,
             content=Template("exclude_hosts_list.j2"),
             owner=params.hdfs_user,
             group=params.user_group)

        if do_format and not params.hdfs_namenode_format_disabled:
            format_namenode()
            pass

        if params.dfs_ha_enabled and \
          params.dfs_ha_namenode_standby is not None and \
          params.hostname == params.dfs_ha_namenode_standby:
            # if the current host is the standby NameNode in an HA deployment
            # run the bootstrap command, to start the NameNode in standby mode
            # this requires that the active NameNode is already up and running,
            # so this execute should be re-tried upon failure, up to a timeout
            success = bootstrap_standby_namenode(params)
            if not success:
                raise Fail("Could not bootstrap standby namenode")

        if upgrade_type == constants.UPGRADE_TYPE_ROLLING and params.dfs_ha_enabled:
            # Most likely, ZKFC is up since RU will initiate the failover command. However, if that failed, it would have tried
            # to kill ZKFC manually, so we need to start it if not already running.
            safe_zkfc_op(action, env)

        options = ""
        if upgrade_type == constants.UPGRADE_TYPE_ROLLING:
            if params.upgrade_direction == Direction.UPGRADE:
                options = "-rollingUpgrade started"
            elif params.upgrade_direction == Direction.DOWNGRADE:
                options = "-rollingUpgrade downgrade"
        elif upgrade_type == constants.UPGRADE_TYPE_NON_ROLLING:
            is_previous_image_dir = is_previous_fs_image()
            Logger.info("Previous file system image dir present is {0}".format(
                str(is_previous_image_dir)))

            if params.upgrade_direction == Direction.UPGRADE:
                options = "-rollingUpgrade started"
            elif params.upgrade_direction == Direction.DOWNGRADE:
                options = "-rollingUpgrade downgrade"
        elif upgrade_type == constants.UPGRADE_TYPE_HOST_ORDERED:
            # nothing special to do for HOU - should be very close to a normal restart
            pass
        elif upgrade_type is None and upgrade_suspended is True:
            # the rollingUpgrade flag must be passed in during a suspended upgrade when starting NN
            if os.path.exists(
                    namenode_upgrade.get_upgrade_in_progress_marker()):
                options = "-rollingUpgrade started"
            else:
                Logger.info(
                    "The NameNode upgrade marker file {0} does not exist, yet an upgrade is currently suspended. "
                    "Assuming that the upgrade of NameNode has not occurred yet."
                    .format(namenode_upgrade.get_upgrade_in_progress_marker()))

        Logger.info("Options for start command are: {0}".format(options))

        service(action="start",
                name="namenode",
                user=params.hdfs_user,
                options=options,
                create_pid_dir=True,
                create_log_dir=True)

        if params.security_enabled:
            Execute(format(
                "{kinit_path_local} -kt {hdfs_user_keytab} {hdfs_principal_name}"
            ),
                    user=params.hdfs_user)

        # ___Scenario___________|_Expected safemode state__|_Wait for safemode OFF____|
        # no-HA                 | ON -> OFF                | Yes                      |
        # HA and active         | ON -> OFF                | Yes                      |
        # HA and standby        | no change                | No                       |
        # RU with HA on active  | ON -> OFF                | Yes                      |
        # RU with HA on standby | ON -> OFF                | Yes                      |
        # EU with HA on active  | ON -> OFF                | No                       |
        # EU with HA on standby | ON -> OFF                | No                       |
        # EU non-HA             | ON -> OFF                | No                       |

        # because we do things like create directories after starting NN,
        # the vast majority of the time this should be True - it should only
        # be False if this is HA and we are the Standby NN
        ensure_safemode_off = True

        # True if this is the only NameNode (non-HA) or if its the Active one in HA
        is_active_namenode = True

        if params.dfs_ha_enabled:
            Logger.info(
                "Waiting for the NameNode to broadcast whether it is Active or Standby..."
            )

            if is_this_namenode_active() is False:
                # we are the STANDBY NN
                is_active_namenode = False

                # we are the STANDBY NN and this restart is not part of an upgrade
                if upgrade_type is None:
                    ensure_safemode_off = False

        # During an Express Upgrade, NameNode will not leave SafeMode until the DataNodes are started,
        # so always disable the Safemode check
        if upgrade_type == constants.UPGRADE_TYPE_NON_ROLLING:
            ensure_safemode_off = False

        # some informative logging separate from the above logic to keep things a little cleaner
        if ensure_safemode_off:
            Logger.info(
                "Waiting for this NameNode to leave Safemode due to the following conditions: HA: {0}, isActive: {1}, upgradeType: {2}"
                .format(params.dfs_ha_enabled, is_active_namenode,
                        upgrade_type))
        else:
            Logger.info(
                "Skipping Safemode check due to the following conditions: HA: {0}, isActive: {1}, upgradeType: {2}"
                .format(params.dfs_ha_enabled, is_active_namenode,
                        upgrade_type))

        # wait for Safemode to end
        if ensure_safemode_off:
            if params.rolling_restart and params.rolling_restart_safemode_exit_timeout:
                calculated_retries = int(
                    params.rolling_restart_safemode_exit_timeout) / 30
                wait_for_safemode_off(hdfs_binary,
                                      afterwait_sleep=30,
                                      retries=calculated_retries,
                                      sleep_seconds=30)
            else:
                wait_for_safemode_off(hdfs_binary)

        # Always run this on the "Active" NN unless Safemode has been ignored
        # in the case where safemode was ignored (like during an express upgrade), then
        # NN will be in SafeMode and cannot have directories created
        if is_active_namenode and ensure_safemode_off:
            create_hdfs_directories()
            create_ranger_audit_hdfs_directories()
        else:
            Logger.info(
                "Skipping creation of HDFS directories since this is either not the Active NameNode or we did not wait for Safemode to finish."
            )

    elif action == "stop":
        import params
        service(action="stop", name="namenode", user=params.hdfs_user)
    elif action == "status":
        import status_params
        check_process_status(status_params.namenode_pid_file)
    elif action == "decommission":
        decommission()
Ejemplo n.º 2
0
def namenode(action=None, hdfs_binary=None, do_format=True, upgrade_type=None, env=None):
  if action is None:
    raise Fail('"action" parameter is required for function namenode().')

  if action in ["start", "stop"] and hdfs_binary is None:
    raise Fail('"hdfs_binary" parameter is required for function namenode().')

  if action == "configure":
    import params
    #we need this directory to be present before any action(HA manual steps for
    #additional namenode)
    create_name_dirs(params.dfs_name_dir)
  elif action == "start":
    Logger.info("Called service {0} with upgrade_type: {1}".format(action, str(upgrade_type)))
    setup_ranger_hdfs(upgrade_type=upgrade_type)
    import params
    if do_format and not params.hdfs_namenode_format_disabled:
      format_namenode()
      pass

    File(params.exclude_file_path,
         content=Template("exclude_hosts_list.j2"),
         owner=params.hdfs_user,
         group=params.user_group
    )

    if params.dfs_ha_enabled and \
      params.dfs_ha_namenode_standby is not None and \
      params.hostname == params.dfs_ha_namenode_standby:
        # if the current host is the standby NameNode in an HA deployment
        # run the bootstrap command, to start the NameNode in standby mode
        # this requires that the active NameNode is already up and running,
        # so this execute should be re-tried upon failure, up to a timeout
        success = bootstrap_standby_namenode(params)
        if not success:
          raise Fail("Could not bootstrap standby namenode")

    if upgrade_type == "rolling" and params.dfs_ha_enabled:
      # Most likely, ZKFC is up since RU will initiate the failover command. However, if that failed, it would have tried
      # to kill ZKFC manually, so we need to start it if not already running.
      safe_zkfc_op(action, env)

    options = ""
    if upgrade_type == "rolling":
      if params.upgrade_direction == Direction.UPGRADE:
        options = "-rollingUpgrade started"
      elif params.upgrade_direction == Direction.DOWNGRADE:
        options = "-rollingUpgrade downgrade"
        
    elif upgrade_type == "nonrolling":
      is_previous_image_dir = is_previous_fs_image()
      Logger.info(format("Previous file system image dir present is {is_previous_image_dir}"))

      if params.upgrade_direction == Direction.UPGRADE:
        options = "-rollingUpgrade started"
      elif params.upgrade_direction == Direction.DOWNGRADE:
        options = "-rollingUpgrade downgrade"

    Logger.info(format("Option for start command: {options}"))

    service(
      action="start",
      name="namenode",
      user=params.hdfs_user,
      options=options,
      create_pid_dir=True,
      create_log_dir=True
    )

    if params.security_enabled:
      Execute(format("{kinit_path_local} -kt {hdfs_user_keytab} {hdfs_principal_name}"),
              user = params.hdfs_user)

    if params.dfs_ha_enabled:
      is_active_namenode_cmd = as_user(format("{hdfs_binary} --config {hadoop_conf_dir} haadmin -getServiceState {namenode_id} | grep active"), params.hdfs_user, env={'PATH':params.hadoop_bin_dir})
    else:
      is_active_namenode_cmd = True
    
    # During NonRolling Upgrade, both NameNodes are initially down,
    # so no point in checking if this is the active or standby.
    if upgrade_type == "nonrolling":
      is_active_namenode_cmd = False

    # ___Scenario___________|_Expected safemode state__|_Wait for safemode OFF____|
    # no-HA                 | ON -> OFF                | Yes                      |
    # HA and active         | ON -> OFF                | Yes                      |
    # HA and standby        | no change                | no check                 |
    # RU with HA on active  | ON -> OFF                | Yes                      |
    # RU with HA on standby | ON -> OFF                | Yes                      |
    # EU with HA on active  | no change                | no check                 |
    # EU with HA on standby | no change                | no check                 |
    # EU non-HA             | no change                | no check                 |

    check_for_safemode_off = False
    msg = ""
    if params.dfs_ha_enabled:
      if upgrade_type is not None:
        check_for_safemode_off = True
        msg = "Must wait to leave safemode since High Availability is enabled during a Stack Upgrade"
      else:
        Logger.info("Wait for NameNode to become active.")
        if is_active_namenode(hdfs_binary): # active
          check_for_safemode_off = True
          msg = "Must wait to leave safemode since High Availability is enabled and this is the Active NameNode."
        else:
          msg = "Will remain in the current safemode state."
    else:
      msg = "Must wait to leave safemode since High Availability is not enabled."
      check_for_safemode_off = True

    Logger.info(msg)

    # During a NonRolling (aka Express Upgrade), stay in safemode since the DataNodes are down.
    stay_in_safe_mode = False
    if upgrade_type == "nonrolling":
      stay_in_safe_mode = True

    if check_for_safemode_off:
      Logger.info("Stay in safe mode: {0}".format(stay_in_safe_mode))
      if not stay_in_safe_mode:
        wait_for_safemode_off(hdfs_binary)

    # Always run this on non-HA, or active NameNode during HA.
    create_hdfs_directories(is_active_namenode_cmd)
    create_ranger_audit_hdfs_directories(is_active_namenode_cmd)

  elif action == "stop":
    import params
    service(
      action="stop", name="namenode", 
      user=params.hdfs_user
    )
  elif action == "status":
    import status_params
    check_process_status(status_params.namenode_pid_file)
  elif action == "decommission":
    decommission()
Ejemplo n.º 3
0
def namenode(action=None, hdfs_binary=None, do_format=True, upgrade_type=None, env=None):
  if action is None:
    raise Fail('"action" parameter is required for function namenode().')

  if action in ["start", "stop"] and hdfs_binary is None:
    raise Fail('"hdfs_binary" parameter is required for function namenode().')

  if action == "configure":
    import params
    #we need this directory to be present before any action(HA manual steps for
    #additional namenode)
    create_name_dirs(params.dfs_name_dir)
  elif action == "start":
    Logger.info("Called service {0} with upgrade_type: {1}".format(action, str(upgrade_type)))
    setup_ranger_hdfs(upgrade_type=upgrade_type)
    import params
    if do_format:
      format_namenode()
      pass

    File(params.exclude_file_path,
         content=Template("exclude_hosts_list.j2"),
         owner=params.hdfs_user,
         group=params.user_group
    )

    Directory(params.hadoop_pid_dir_prefix,
              mode=0755,
              owner=params.hdfs_user,
              group=params.user_group
    )

    if params.dfs_ha_enabled and \
      params.dfs_ha_namenode_standby is not None and \
      params.hostname == params.dfs_ha_namenode_standby:
        # if the current host is the standby NameNode in an HA deployment
        # run the bootstrap command, to start the NameNode in standby mode
        # this requires that the active NameNode is already up and running,
        # so this execute should be re-tried upon failure, up to a timeout
        success = bootstrap_standby_namenode(params)
        if not success:
          raise Fail("Could not bootstrap standby namenode")

    if upgrade_type == "rolling" and params.dfs_ha_enabled:
      # Most likely, ZKFC is up since RU will initiate the failover command. However, if that failed, it would have tried
      # to kill ZKFC manually, so we need to start it if not already running.
      safe_zkfc_op(action, env)

    #options = "-rollingUpgrade started" if rolling_restart else ""
    options = ""
    if upgrade_type == "rolling":
      if params.upgrade_direction == Direction.UPGRADE:
        options = "-rollingUpgrade started"
      elif params.upgrade_direction == Direction.DOWNGRADE:
        options = "-rollingUpgrade downgrade"
    elif upgrade_type == "nonrolling":
      is_previous_image_dir = is_previous_fs_image()
      Logger.info(format("Previous file system image dir present is {is_previous_image_dir}"))

      if params.upgrade_direction == Direction.UPGRADE:
        options = "-rollingUpgrade started"
      elif params.upgrade_direction == Direction.DOWNGRADE:
        options = "-rollingUpgrade downgrade"

    Logger.info(format("Option for start command: {options}"))

    service(
      action="start",
      name="namenode",
      user=params.hdfs_user,
      options=options,
      create_pid_dir=True,
      create_log_dir=True
    )


    if params.security_enabled:
      Execute(format("{kinit_path_local} -kt {hdfs_user_keytab} {hdfs_principal_name}"),
              user = params.hdfs_user)
    dfsadmin_base_command = get_dfsadmin_base_command(hdfs_binary, use_specific_namenode=True)
    is_namenode_safe_mode_off = dfsadmin_base_command + " -safemode get | grep 'Safe mode is OFF'"
    if params.dfs_ha_enabled:
      is_active_namenode_cmd = as_user(format("{hdfs_binary} --config {hadoop_conf_dir} haadmin -getServiceState {namenode_id} | grep active"), params.hdfs_user, env={'PATH':params.hadoop_bin_dir})
    else:
      is_active_namenode_cmd = True
    
    # During NonRolling Upgrade, both NameNodes are initially down,
    # so no point in checking if this is the active or standby.
    if upgrade_type == "nonrolling":
      is_active_namenode_cmd = False

    # ___Scenario___________|_Expected safemode state__|_Wait for safemode OFF____|
    # no-HA                 | ON -> OFF                | Yes                      |
    # HA and active         | ON -> OFF                | Yes                      |
    # HA and standby        | no change                | no check                 |
    # RU with HA on active  | ON -> OFF                | Yes                      |
    # RU with HA on standby | ON -> OFF                | Yes                      |
    # EU with HA on active  | no change                | no check                 |
    # EU with HA on standby | no change                | no check                 |
    # EU non-HA             | no change                | no check                 |

    check_for_safemode_off = False
    msg = ""
    if params.dfs_ha_enabled:
      if upgrade_type is not None:
        check_for_safemode_off = True
        msg = "Must wait to leave safemode since High Availability is enabled during a Stack Upgrade"
      else:
        Logger.info("Wait for NameNode to become active.")
        if is_active_namenode(hdfs_binary): # active
          check_for_safemode_off = True
          msg = "Must wait to leave safemode since High Availability is enabled and this is the Active NameNode."
        else:
          msg = "Will remain in the current safemode state."
    else:
      msg = "Must wait to leave safemode since High Availability is not enabled."
      check_for_safemode_off = True

    Logger.info(msg)

    # During a NonRolling (aka Express Upgrade), stay in safemode since the DataNodes are down.
    stay_in_safe_mode = False
    if upgrade_type == "nonrolling":
      stay_in_safe_mode = True

    if check_for_safemode_off:
      Logger.info("Stay in safe mode: {0}".format(stay_in_safe_mode))
      if not stay_in_safe_mode:
        Logger.info("Wait to leafe safemode since must transition from ON to OFF.")
        try:
          # Wait up to 30 mins
          Execute(is_namenode_safe_mode_off,
                  tries=180,
                  try_sleep=10,
                  user=params.hdfs_user,
                  logoutput=True
          )
        except Fail:
          Logger.error("NameNode is still in safemode, please be careful with commands that need safemode OFF.")

    # Always run this on non-HA, or active NameNode during HA.
    create_hdfs_directories(is_active_namenode_cmd)
    create_ranger_audit_hdfs_directories(is_active_namenode_cmd)

  elif action == "stop":
    import params
    service(
      action="stop", name="namenode", 
      user=params.hdfs_user
    )
  elif action == "status":
    import status_params
    check_process_status(status_params.namenode_pid_file)
  elif action == "decommission":
    decommission()
Ejemplo n.º 4
0
def namenode(action=None, hdfs_binary=None, do_format=True, upgrade_type=None,
    upgrade_suspended=False, env=None):

  if action is None:
    raise Fail('"action" parameter is required for function namenode().')

  if action in ["start", "stop"] and hdfs_binary is None:
    raise Fail('"hdfs_binary" parameter is required for function namenode().')

  if action == "configure":
    import params
    #we need this directory to be present before any action(HA manual steps for
    #additional namenode)
    create_name_dirs(params.dfs_name_dir)
  elif action == "start":
    Logger.info("Called service {0} with upgrade_type: {1}".format(action, str(upgrade_type)))
    setup_ranger_hdfs(upgrade_type=upgrade_type)
    import params
    if do_format and not params.hdfs_namenode_format_disabled:
      format_namenode()
      pass

    File(params.exclude_file_path,
         content=Template("exclude_hosts_list.j2"),
         owner=params.hdfs_user,
         group=params.user_group
    )

    if params.dfs_ha_enabled and \
      params.dfs_ha_namenode_standby is not None and \
      params.hostname == params.dfs_ha_namenode_standby:
        # if the current host is the standby NameNode in an HA deployment
        # run the bootstrap command, to start the NameNode in standby mode
        # this requires that the active NameNode is already up and running,
        # so this execute should be re-tried upon failure, up to a timeout
        success = bootstrap_standby_namenode(params)
        if not success:
          raise Fail("Could not bootstrap standby namenode")

    if upgrade_type == "rolling" and params.dfs_ha_enabled:
      # Most likely, ZKFC is up since RU will initiate the failover command. However, if that failed, it would have tried
      # to kill ZKFC manually, so we need to start it if not already running.
      safe_zkfc_op(action, env)

    options = ""
    if upgrade_type == "rolling":
      if params.upgrade_direction == Direction.UPGRADE:
        options = "-rollingUpgrade started"
      elif params.upgrade_direction == Direction.DOWNGRADE:
        options = "-rollingUpgrade downgrade"
    elif upgrade_type == "nonrolling":
      is_previous_image_dir = is_previous_fs_image()
      Logger.info("Previous file system image dir present is {0}".format(str(is_previous_image_dir)))

      if params.upgrade_direction == Direction.UPGRADE:
        options = "-rollingUpgrade started"
      elif params.upgrade_direction == Direction.DOWNGRADE:
        options = "-rollingUpgrade downgrade"
    elif upgrade_type is None and upgrade_suspended is True:
      # the rollingUpgrade flag must be passed in during a suspended upgrade when starting NN
      if os.path.exists(namenode_upgrade.get_upgrade_in_progress_marker()):
        options = "-rollingUpgrade started"
      else:
        Logger.info("The NameNode upgrade marker file {0} does not exist, yet an upgrade is currently suspended. "
                    "Assuming that the upgrade of NameNode has not occurred yet.".format(namenode_upgrade.get_upgrade_in_progress_marker()))

    Logger.info("Options for start command are: {0}".format(options))

    service(
      action="start",
      name="namenode",
      user=params.hdfs_user,
      options=options,
      create_pid_dir=True,
      create_log_dir=True
    )

    if params.security_enabled:
      Execute(format("{kinit_path_local} -kt {hdfs_user_keytab} {hdfs_principal_name}"),
              user = params.hdfs_user)

    if params.dfs_ha_enabled:
      is_active_namenode_cmd = as_user(format("{hdfs_binary} --config {hadoop_conf_dir} haadmin -ns {dfs_ha_nameservices} -getServiceState {namenode_id} | grep active"), params.hdfs_user, env={'PATH':params.hadoop_bin_dir})
    else:
      is_active_namenode_cmd = True
    
    # During NonRolling Upgrade, both NameNodes are initially down,
    # so no point in checking if this is the active or standby.
    if upgrade_type == "nonrolling":
      is_active_namenode_cmd = False

    # ___Scenario___________|_Expected safemode state__|_Wait for safemode OFF____|
    # no-HA                 | ON -> OFF                | Yes                      |
    # HA and active         | ON -> OFF                | Yes                      |
    # HA and standby        | no change                | no check                 |
    # RU with HA on active  | ON -> OFF                | Yes                      |
    # RU with HA on standby | ON -> OFF                | Yes                      |
    # EU with HA on active  | no change                | no check                 |
    # EU with HA on standby | no change                | no check                 |
    # EU non-HA             | no change                | no check                 |

    check_for_safemode_off = False
    is_active_namenode = False
    msg = ""
    if params.dfs_ha_enabled:
      if upgrade_type is not None:
        check_for_safemode_off = True
        msg = "Must wait to leave safemode since High Availability is enabled during a Stack Upgrade"
      else:
        Logger.info("Wait for NameNode to become active.")
        if check_is_active_namenode(hdfs_binary): # active
          check_for_safemode_off = True
          is_active_namenode = True
          msg = "Must wait to leave safemode since High Availability is enabled and this is the Active NameNode."
        else:
          msg = "Will remain in the current safemode state."
    else:
      msg = "Must wait to leave safemode since High Availability is not enabled."
      check_for_safemode_off = True
      is_active_namenode = True

    Logger.info(msg)

    # During a NonRolling (aka Express Upgrade), stay in safemode since the DataNodes are down.
    stay_in_safe_mode = False
    if upgrade_type == "nonrolling":
      stay_in_safe_mode = True

    if check_for_safemode_off:
      Logger.info("Stay in safe mode: {0}".format(stay_in_safe_mode))
      if not stay_in_safe_mode:
        wait_for_safemode_off(hdfs_binary)

    # Always run this on non-HA, or active NameNode during HA.
    if is_active_namenode:
      create_hdfs_directories()
      create_ranger_audit_hdfs_directories()
    else:
      Logger.info("Skipping creating hdfs directories as is not active NN.")

  elif action == "stop":
    import params
    service(
      action="stop", name="namenode", 
      user=params.hdfs_user
    )
  elif action == "status":
    import status_params
    check_process_status(status_params.namenode_pid_file)
  elif action == "decommission":
    decommission()