Example #1
def bootstrap_standby(cluster):
    """ Bootstraps a standby NameNode """
    install_dir = cluster.get_hadoop_install_dir()
    get_logger().info("Bootstrapping standby NameNode: {}".format(
        env.host_string))
    cmd = '{}/bin/hdfs namenode -bootstrapstandby'.format(install_dir)
    return sudo(cmd, user=constants.HDFS_USER).succeeded
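These examples appear to use the Fabric 1.x API (sudo, env, hide, settings, execute, local and put all come from fabric.api). A minimal, hypothetical driver for the task above, following the execute() pattern the later examples use:

from fabric.api import execute

# Hypothetical invocation; the standby host would come from the cluster
# config, and execute() returns a dict of host -> task return value.
standby_host = cluster.get_hdfs_master_config().get_nn_hosts()[1]
results = execute(bootstrap_standby, hosts=[standby_host], cluster=cluster)
if not all(results.values()):
    get_logger().error('Bootstrapping the standby NameNode failed.')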
Example #2
def stop_ozone(cluster):
    """ Stops HDFS and Ozone"""
    get_logger().info("Stopping Ozone services ...")
    install_dir = cluster.get_hadoop_install_dir()
    stop_ozone_cmd = '{}/sbin/stop-ozone.sh'.format(install_dir)
    sudo(stop_ozone_cmd, user=constants.HDFS_USER)
    return True
Example #3
def start_dfs(cluster):
    """Starts the dfs cluster. """
    get_logger().info("Starting HDFS services ... this can take some time.")
    install_dir = cluster.get_hadoop_install_dir()
    start_dfs_cmd = '{}/sbin/start-dfs.sh'.format(install_dir)
    sudo(start_dfs_cmd, pty=False)
    return True
Example #4
def passwd(username=None, password=None):
    """Changes password. e.g. fab passwd:username=hdfs,password=password"""
    get_logger().debug('changing password for user {}'.format(username))
    with hide('commands', 'output', 'running', 'warnings', 'debug',
              'status'), settings(warn_only=True):
        result = sudo("echo {} | passwd --stdin {}".format(password, username))
    return result
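Note that passwd --stdin is a RHEL-family extension and is not available on Debian derivatives, where chpasswd is the usual substitute. A hedged sketch of fanning the task out with execute(); get_all_hosts() follows the other examples here:

from fabric.api import execute

# Hypothetical fan-out; the output hiding inside passwd() also keeps the
# password out of Fabric's logged command lines.
execute(passwd, hosts=cluster.get_all_hosts(),
        username='hdfs', password='changeme')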
Example #5
def stop_yarn(cluster):
    """Stops the yarn services."""
    get_logger().info("Stopping YARN services ...")
    install_dir = cluster.get_hadoop_install_dir()
    stop_yarn_cmd = '{}/sbin/stop-yarn.sh'.format(install_dir)
    sudo(stop_yarn_cmd, user=constants.YARN_USER)
    return True
Example #6
def handle_prepare_cluster(command, cluster):
    env.output_prefix = False
    # Convert the force-wipe flag from string to boolean.
    prepare_cluster(cluster=cluster,
                    force=is_true(
                        cluster.config[constants.KEY_FORCE_WIPE]))
    get_logger().info("Done preparing the cluster.")
Example #7
def killall_services():
    """
    Kills all services related to hadoop.
    :return:
    """
    # TODO: add the full list of Hadoop processes here.
    # with settings(warn_only=True):
    #     if is_namenode_running():
    #         kill_namenode()
    #     if is_datanode_running():
    #         kill_datanode()
    #     if is_scsi_server_running():
    #         kill_scsi_server()
    #     if is_scm_running():
    #         kill_scm()
    #     if is_cblock_running():
    #         kill_cblock_server()
    # return True

    # Cheat for now and kill all Java processes.
    get_logger().debug("Killing all services on host {}".format(env.host))
    with settings(warn_only=True):
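        # The [j]ava bracket pattern keeps grep from matching its own
        # process, and xargs -r skips kill entirely when nothing matches.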
        sudo('ps aux | grep -i [j]ava | awk \'{print $2}\' | xargs -r kill -9')
    get_logger().debug("Killed all services on host {}".format(env.host))
    return True
Example #8
def handle_start(command, cluster):
    service = command.split()[1:2]
    nodes = command.split()[2:]
    cmds = []
    if not service or service[0].lower() == "all":
        run_hdfs(cluster)
        do_sleep(20)
        if cluster.is_yarn_enabled():
            run_yarn(cluster)
    elif service[0].lower() in {'dfs', 'hdfs'}:
        run_hdfs(cluster)
    elif service[0].lower() == 'yarn':
        run_yarn(cluster)
    elif service[0].lower() == 'ozone':
        run_ozone()
    elif service[0].lower() == 'datanodes':
        start_stop_datanodes(action='start', nodes=nodes, cluster=cluster)
    elif service[0].lower() == 'namenodes':
        start_stop_namenodes(action='start', nodes=nodes, cluster=cluster)
    elif service[0].lower() == 'journalnodes':
        start_stop_journalnodes(action='start',
                                nodes=nodes,
                                cluster=cluster)
    else:
        get_logger().error("Unrecognized service {}\n".format(service[0]))
        return
    with hide('running'):
        for cmd in cmds:
            local(cmd)
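For reference, the dispatch above accepts command strings of the following shapes (host names are hypothetical):

handle_start("start", cluster)                    # HDFS, plus YARN if enabled
handle_start("start yarn", cluster)               # YARN only
handle_start("start datanodes dn1 dn2", cluster)  # named DataNodes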
Example #9
def install_container_executor(cluster=None):
    """
    Install the YARN Linux container executor if it is not already present
    on the node. This uses a bundled binary that should work on most Linux
    distributions.

    It is a fallback to allow enabling Kerberos security with
    an HDP distribution that was compiled without Linux native support.

    The container-executor binary should be setuid.

    :param cluster:
    :return:
    """
    local_ce_file = resource_filename('bman.resources.bin',
                                      'container-executor')
    remote_path = os.path.join(cluster.get_hadoop_install_dir(),
                               'bin/container-executor')
    if not exists(path=remote_path):
        get_logger().debug(
            " >> Copying container executor from {} to {}".format(
                local_ce_file, remote_path))
        put(local_path=local_ce_file, remote_path=remote_path)
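    # Mode 6050 (---Sr-s---) sets the setuid/setgid bits with group-only
    # read/execute: the binary runs as root, but only members of the
    # hadoop group can invoke it, as the container-executor requires.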
    sudo('chown root.{0} {1} && chmod 6050 {1}'.format(HADOOP_GROUP,
                                                       remote_path))
Example #10
def generate_hadoop_env(cluster):
    """ Generate hadoop-env.sh."""
    get_logger().debug("Generating hadoop-env.sh from template")
    template_str = resource_string('bman.resources.conf',
                                   'hadoop-env.sh.template').decode('utf-8')
    env_str = Template(template_str)

    log_dirs = {}
    # Set the log directories for Hadoop service users.
    for user in cluster.get_service_users():
        log_dirs['{}_log_dir_config'.format(user.name)] = os.path.join(
            cluster.get_hadoop_install_dir(), "logs", user.name)

    env_str = env_str.safe_substitute(
        hadoop_home_config=cluster.get_hadoop_install_dir(),
        java_home=cluster.get_config(constants.KEY_JAVA_HOME),
        hdfs_datanode_secure_user=(constants.HDFS_USER
                                   if cluster.is_kerberized() else ''),
        hdfs_datanode_user=('root' if cluster.is_kerberized() else
                            constants.HDFS_USER),
        hdfs_user=constants.HDFS_USER,
        yarn_user=constants.YARN_USER,
        jsvc_home=constants.JSVC_HOME,
        **log_dirs)

    if cluster.is_tez_enabled():
        env_str = env_str + hadoop_env_tez_settings(cluster)

    with open(
            os.path.join(cluster.get_generated_hadoop_conf_tmp_dir(),
                         "hadoop-env.sh"), "w") as hadoop_env:
        hadoop_env.write(env_str)
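The substitution above uses the standard library's string.Template. safe_substitute() fills the placeholders it has values for and leaves the rest untouched instead of raising, which is what lets the template carry optional settings. A standalone illustration with made-up values:

from string import Template

t = Template('export JAVA_HOME=${java_home}\nexport HDFS_USER=${hdfs_user}\n')
print(t.safe_substitute(java_home='/usr/lib/jvm/java-8-openjdk'))
# ${hdfs_user} is printed as-is because no value was supplied for it.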
Example #11
def stop_dfs(cluster):
    """Stops the dfs cluster."""
    get_logger().info("Stopping HDFS services ...")
    install_dir = cluster.get_hadoop_install_dir()
    stop_dfs_cmd = '{}/sbin/stop-dfs.sh'.format(install_dir)
    sudo(stop_dfs_cmd, user=constants.HDFS_USER)
    return True
Example #12
def start_yarn(cluster):
    """Starts the yarn services. """
    get_logger().info("Starting YARN services ... this can take some time.")
    install_dir = cluster.get_hadoop_install_dir()
    start_yarn_cmd = '{}/sbin/start-yarn.sh'.format(install_dir)
    sudo(start_yarn_cmd, pty=False, user=constants.YARN_USER)
    return True
Example #13
def stop_scm(cluster):
    """Stops the storage container manager"""
    get_logger().info("Stopping the SCM ...")
    if cluster.get_config(constants.KEY_OZONE_ENABLED):
        install_dir = cluster.get_hadoop_install_dir()
        stop_scm_cmd = '{}/bin/hdfs --daemon stop scm'.format(install_dir)
        sudo(stop_scm_cmd, user=constants.HDFS_USER)
    return True
Example #14
def stop_jscsi_server(cluster):
    """Stops the JSCSI Server"""
    get_logger().info("Stopping the JSCSI kadmin_server ...")
    if cluster.get_config(constants.KEY_OZONE_ENABLED):
        install_dir = cluster.get_hadoop_install_dir()
        stop_jscsi_cmd = '{}/bin/hdfs --daemon stop jscsi'.format(install_dir)
        sudo(stop_jscsi_cmd, user=constants.HDFS_USER)
    return True
Example #15
def make_install_dir(cluster):
    with hide('status', 'warnings', 'running', 'stdout', 'stderr', 'user',
              'commands'):
        if not execute(make_base_install_dir,
                       hosts=cluster.get_all_hosts(),
                       cluster=cluster):
            get_logger().error('Making install directory failed.')
            return False
Example #16
def make_hadoop_log_dirs(cluster=None):
    logging_root = os.path.join(cluster.get_hadoop_install_dir(),
                                constants.HADOOP_LOG_DIR_NAME)
    get_logger().debug("Creating log output dir {} on host {}".format(
        logging_root, env.host))
    sudo('mkdir -p {}'.format(logging_root))
    sudo('chgrp {} {}'.format(constants.HADOOP_GROUP, logging_root))
    sudo('chmod 775 {}'.format(logging_root))
Example #17
def format_namenode(cluster, cluster_id):
    """ formats a namenode using the given cluster_id.
    """
    # The format command may prompt for confirmation; stdout is hidden to
    # keep the log clean.
    get_logger().info('Formatting NameNode {}'.format(env.host_string))
    with hide("stdout"):
        return sudo('{}/bin/hdfs namenode -format -clusterid {}'.format(
            cluster.get_hadoop_install_dir(), cluster_id),
                    user=constants.HDFS_USER).succeeded
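A hedged sketch of driving the format, again via execute(); generating the cluster ID with uuid is an assumption here, not necessarily what bman does:

import uuid
from fabric.api import execute

cluster_id = 'CID-{}'.format(uuid.uuid4())  # hypothetical ID scheme
nn_hosts = cluster.get_hdfs_master_config().get_nn_hosts()
# Format only the first NameNode; standbys are bootstrapped from it
# (see Example #1).
execute(format_namenode, hosts=nn_hosts[0:1],
        cluster=cluster, cluster_id=cluster_id)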
Example #18
def start_cblock_server(cluster):
    """Starts the cBlockServer"""
    get_logger().info("Starting the cBlockServer ...")
    if cluster.get_config(constants.KEY_OZONE_ENABLED):
        install_dir = cluster.get_hadoop_install_dir()
        start_cblock_cmd = '{}/bin/hdfs --daemon start cblockserver'.format(
            install_dir)
        sudo(start_cblock_cmd, user=constants.HDFS_USER)
    return True
Example #19
def sshkey_gen(cluster=None, user=None):
    """Generates a an ssh"""
    keyname = get_keyname_for_user(user=user)
    ssh_keys_dir = cluster.get_ssh_keys_tmp_dir()
    get_logger().debug("Generating a private key for user {}.".format(
        user.name))
    os.makedirs(ssh_keys_dir, exist_ok=True)
    local("rm -f {}/{}*".format(ssh_keys_dir, keyname))
    local('ssh-keygen -b 2048 -t rsa -f {}/{}  -q -N ""'.format(
        ssh_keys_dir, keyname))
Example #20
def fail_if_fabricrc_exists():
    """
    Fail if ~/.fabricrc exists. Credentials defined in .fabricrc can
    conflict with credentials generated by bman.
    :return:
    """
    fabricconf = os.path.join(os.path.expanduser("~"), ".fabricrc")
    if os.path.isfile(fabricconf):
        get_logger().error("Please remove the {} file.".format(fabricconf))
        sys.exit(-1)
Example #21
def clean_root_dir(path):
    """
    Recursively removes all files under the given path.
    :param path:
    :return:
    """
    get_logger().debug("Cleaning root directory on host {}".format(env.host))
    with settings(warn_only=True):
        sudo("rm -rf %s" % path)
    return True
Example #22
def generate_configs(cluster=None):
    if cluster is None:
        cluster = load_config()

    get_logger().info("Generating Hadoop configuration files")
    try:
        # We create configuration files in the generated directory. Once that
        # is done, we process the specific files that need template processing
        # and overwrite them. In other words, the plain copy of all files
        # must happen first.
        check_for_generated_dirs(cluster)
        copy_all_configs(cluster)
        update_hdfs_configs(cluster)
        generate_site_config(
            cluster,
            filename='core-site.xml',
            settings_key=constants.KEY_CORE_SITE_SETTINGS,
            output_dir=cluster.get_generated_hadoop_conf_tmp_dir())
        generate_site_config(
            cluster,
            filename='hdfs-site.xml',
            settings_key=constants.KEY_HDFS_SITE_SETTINGS,
            output_dir=cluster.get_generated_hadoop_conf_tmp_dir())

        if cluster.is_yarn_enabled():
            update_mapred_configs(cluster)
            update_yarn_configs(cluster)
            generate_site_config(
                cluster,
                filename='yarn-site.xml',
                settings_key=constants.KEY_YARN_SITE_SETTINGS,
                output_dir=cluster.get_generated_hadoop_conf_tmp_dir())
            generate_site_config(
                cluster,
                filename='mapred-site.xml',
                settings_key=constants.KEY_MAPRED_SITE_SETTINGS,
                output_dir=cluster.get_generated_hadoop_conf_tmp_dir())

        if cluster.is_tez_enabled():
            update_tez_configs(cluster)
            generate_site_config(
                cluster,
                filename='tez-site.xml',
                settings_key=constants.KEY_TEZ_SITE_SETTINGS,
                output_dir=cluster.get_generated_tez_conf_tmp_dir())

        if cluster.get_config(constants.KEY_OZONE_ENABLED):
            generate_ozone_site(cluster)
        generate_workers_file(cluster)
        generate_hadoop_env(cluster)
        generate_logging_properties(cluster)

    except Exception as e:
        get_logger().exception(e)
Example #23
def read_config_value_with_altkey(self, values, key, altkey):
    value = None
    if key in values:
        self.config[key] = value = values[key]
    if not value and altkey in values:
        get_logger().warn(
            "{} has been deprecated by {}. Please update {}".format(
                altkey, key, self.get_config_file()))
        self.config[key] = value = values[altkey]
    if not value:
        raise ValueError("Required key {} is missing in YAML.".format(key))
Example #24
def do_kerberos_install(cluster=None):

    get_logger().info(
        "Installing jsvc and Linux container executor on all cluster hosts")
    copy_jce_policy_files(cluster)
    execute(install_jsvc, hosts=cluster.get_all_hosts())
    execute(install_container_executor,
            hosts=cluster.get_all_hosts(),
            cluster=cluster)
    make_headless_principals(cluster)
    generate_hdfs_principals_and_keytabs(cluster=cluster)
Example #25
def do_active_transitions(cluster):
    for ns in cluster.get_hdfs_master_config().get_nameservices():
        if len(ns.get_nn_configs()) > 1:
            active_nn_id = ns.choose_active_nn()[0].nn_id
            get_logger().info("Transitioning {}.{} to active".format(
                ns.nsid, active_nn_id))
            cmd = '{}/bin/hdfs haadmin -ns {} -transitionToActive {}'.format(
                cluster.get_hadoop_install_dir(), ns.nsid, active_nn_id)
            targets = cluster.get_hdfs_master_config().get_nn_hosts()[0:1]
            execute(run_dfs_command, hosts=targets, cluster=cluster, cmd=cmd)
Example #26
def setup_passwordless_ssh(cluster, targets):
    # Set up password-less ssh for all service users.
    get_logger().info("Installing ssh keys for users [{}] on {} hosts.".format(
        ", ".join(cluster.get_service_user_names()), len(targets)))
    for user in cluster.get_service_users():
        sshkey_gen(cluster=cluster, user=user)
        for hostname in targets:
            sshkey_install(hostname=hostname, user=user, cluster=cluster)
        if not execute(
                copy_private_key, hosts=targets, user=user, cluster=cluster):
            get_logger().error('Putting private key failed.')
            return False
Example #27
def copy_all_configs(cluster=None):
    """ Copy the remaining files as-is, removing the .template suffix """
    conf_generated_dir = cluster.get_generated_hadoop_conf_tmp_dir()
    get_logger().debug("Listing conf resources")
    for f in resource_listdir('bman.resources.conf', ''):
        if f.endswith('.template'):
            get_logger().debug("Got resource {}".format(f))
            resource_contents = resource_string('bman.resources.conf',
                                                f).decode('utf-8')
            filename = re.sub(".template$", "", f)
            with open(os.path.join(conf_generated_dir, filename),
                      "w") as output_file:
                output_file.write(resource_contents)
Example #28
def check_user_exists(username=None):
    """
    Checks if the user exists on the remote machine
    """
    get_logger().debug(
        "executing check_user_exists for user {} on host {}".format(
            username, env.host))
    with hide('status', 'aborts', 'warnings', 'running', 'stdout', 'stderr',
              'user', 'commands', 'output'), settings(warn_only=True):
        get_logger().debug("user is {} running id {}".format(
            env.user, username))
        result = sudo('id {}'.format(username), pty=True)
    return result.succeeded
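A hedged usage sketch: fan the check out with execute() and collect the hosts where the user is missing (the 'hdfs' username is just an example):

from fabric.api import execute

results = execute(check_user_exists, hosts=cluster.get_all_hosts(),
                  username='hdfs')
missing = [host for host, ok in results.items() if not ok]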
Example #29
def add_user(cluster=None, new_user=None):
    """
    Creates an unprivileged user e.g. to run HDFS commands and submit jobs.
    """
    targets = cluster.get_all_hosts()
    with hide('status', 'warnings', 'running', 'stdout', 'stderr', 'user',
              'commands'):
        if not execute(add_user_task, hosts=targets, new_user=new_user):
            get_logger().error('Failed to create user {}.'.format(
                new_user.name))
            return False
        if cluster.is_kerberized():
            make_headless_principal(cluster, kadmin_util=None, user=new_user)
Example #30
def run_hdfs(cluster):
    targets = cluster.get_hdfs_master_config().get_nn_hosts()
    execute(start_dfs, hosts=[targets[0]], cluster=cluster)

    do_active_transitions(cluster)

    # CBlock needs OZONE -- so we start everything.
    if cluster.get_config(constants.KEY_CBLOCK_CACHE):
        # We start SCM and Cblock on the Namenode machine for now.
        # TODO : Fix this so that if other machines are specified
        # we are able to get to it.
        if not execute(start_scm, hosts=targets, cluster=cluster):
            get_logger().error('Start SCM failed.')
            return False

        if not execute(start_cblock_server, hosts=targets, cluster=cluster):
            get_logger().error('Failed to start cBlock Server.')
            return False

        # Now read the DataNode machine list and run the remaining commands there.
        targets = cluster.get_worker_nodes()

        if not execute(start_jscsi_server, hosts=targets, cluster=cluster):
            get_logger().error('Unable to start the jSCSI servers.')
            return False
    else:
        # Just start SCM for Ozone; everything else is already running.
        if cluster.get_config(constants.KEY_OZONE_ENABLED):
            if not execute(start_scm, hosts=targets, cluster=cluster):
                get_logger().error('Start SCM failed.')
                return False

    return True