def bootstrap_standby(cluster):
    """ Bootstraps a standby NameNode. """
    install_dir = cluster.get_hadoop_install_dir()
    get_logger().info("Bootstrapping standby NameNode: {}".format(
        env.host_string))
    cmd = '{}/bin/hdfs namenode -bootstrapstandby'.format(install_dir)
    return sudo(cmd, user=constants.HDFS_USER).succeeded
def stop_ozone(cluster):
    """ Stops HDFS and Ozone. """
    get_logger().info("Stopping Ozone services ...")
    install_dir = cluster.get_hadoop_install_dir()
    stop_ozone_cmd = '{}/sbin/stop-ozone.sh'.format(install_dir)
    sudo(stop_ozone_cmd, user=constants.HDFS_USER)
    return True
def start_dfs(cluster):
    """ Starts the dfs cluster. """
    get_logger().info("Starting HDFS services ... this can take some time.")
    install_dir = cluster.get_hadoop_install_dir()
    start_dfs_cmd = '{}/sbin/start-dfs.sh'.format(install_dir)
    sudo(start_dfs_cmd, pty=False)
    return True
def passwd(username=None, password=None):
    """ Changes a password. e.g. fab passwd:username=hdfs,password=password """
    get_logger().debug('changing password for user {}'.format(username))
    with hide('commands', 'output', 'running', 'warnings', 'debug',
              'status'), settings(warn_only=True):
        result = sudo("echo {} | passwd --stdin {}".format(password, username))
    return result
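# Note: "passwd --stdin" as used above is a Red Hat / CentOS extension and is
# not available on every distribution. On Debian-based hosts a rough
# equivalent (a hedged sketch, not part of the original tool) would be:
#
#   sudo("echo '{}:{}' | chpasswd".format(username, password))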
def stop_yarn(cluster):
    """ Stops the yarn services. """
    get_logger().info("Stopping YARN services ...")
    install_dir = cluster.get_hadoop_install_dir()
    stop_yarn_cmd = '{}/sbin/stop-yarn.sh'.format(install_dir)
    sudo(stop_yarn_cmd, user=constants.YARN_USER)
    return True
def handle_prepare_cluster(command, cluster):
    env.output_prefix = False
    # Convert the config value from a string to a boolean.
    prepare_cluster(cluster=cluster,
                    force=is_true(cluster.config[constants.KEY_FORCE_WIPE]))
    get_logger().info("Done preparing the cluster.")
def killall_services():
    """
    Kills all services related to hadoop.
    :return:
    """
    # Need to add the full list of Hadoop processes here.
    # with settings(warn_only=True):
    #     if is_namenode_running():
    #         kill_namenode()
    #     if is_datanode_running():
    #         kill_datanode()
    #     if is_scsi_server_running():
    #         kill_scsi_server()
    #     if is_scm_running():
    #         kill_scm()
    #     if is_cblock_running():
    #         kill_cblock_server()
    #     return True

    # Cheat for now and kill all Java processes. The [j]ava pattern keeps the
    # grep command itself from matching; xargs -r skips the kill when no
    # processes are found.
    get_logger().debug("Killing all services on host {}".format(env.host))
    with settings(warn_only=True):
        sudo('ps aux | grep -i [j]ava | awk \'{print $2}\' | xargs -r kill -9')
    get_logger().debug("Killed all services on host {}".format(env.host))
    return True
def handle_start(command, cluster):
    service = command.split()[1:2]
    nodes = command.split()[2:]
    cmds = []
    if not service or service[0].lower() == "all":
        run_hdfs(cluster)
        do_sleep(20)
        if cluster.is_yarn_enabled():
            run_yarn(cluster)
    elif service[0].lower() in {'dfs', 'hdfs'}:
        run_hdfs(cluster)
    elif service[0].lower() == 'yarn':
        run_yarn(cluster)
    elif service[0].lower() == 'ozone':
        run_ozone()
    elif service[0].lower() == 'datanodes':
        start_stop_datanodes(action='start', nodes=nodes, cluster=cluster)
    elif service[0].lower() == 'namenodes':
        start_stop_namenodes(action='start', nodes=nodes, cluster=cluster)
    elif service[0].lower() == 'journalnodes':
        start_stop_journalnodes(action='start', nodes=nodes, cluster=cluster)
    else:
        get_logger().error("Unrecognized service {}\n".format(service[0]))
        return
    with hide('running'):
        for cmd in cmds:
            local(cmd)
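# A hedged usage sketch (illustrative command strings and host names, not
# taken from the original module) showing how handle_start parses its input:
#
#   handle_start("start", cluster)                         # HDFS, then YARN if enabled
#   handle_start("start hdfs", cluster)                    # HDFS only
#   handle_start("start yarn", cluster)                    # YARN only
#   handle_start("start datanodes host1 host2", cluster)   # DataNodes on the named hosts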
def install_container_executor(cluster=None):
    """
    Install the YARN Linux container executor if it is not already present
    on the node. This uses a bundled binary that should work on most Linux
    distributions. It is a fallback to allow enabling Kerberos security with
    an HDP distribution that was compiled without Linux native support.

    The container-executor binary should be setuid.
    :param cluster:
    :return:
    """
    local_ce_file = resource_filename('bman.resources.bin',
                                      'container-executor')
    remote_path = os.path.join(cluster.get_hadoop_install_dir(),
                               'bin/container-executor')
    if not exists(path=remote_path):
        get_logger().debug(
            " >> Copying container executor from {} to {}".format(
                local_ce_file, remote_path))
        put(local_path=local_ce_file, remote_path=remote_path)
        # Mode 6050 (setuid + setgid, group read/execute only) restricts the
        # binary to root and the Hadoop group, as expected for a setuid
        # container-executor.
        sudo('chown root.{0} {1} && chmod 6050 {1}'.format(HADOOP_GROUP,
                                                           remote_path))
def generate_hadoop_env(cluster):
    """ Generate hadoop-env.sh. """
    get_logger().debug("Generating hadoop-env.sh from template")
    template_str = resource_string('bman.resources.conf',
                                   'hadoop-env.sh.template').decode('utf-8')
    env_str = Template(template_str)

    log_dirs = {}
    # Set the log directories for Hadoop service users.
    for user in cluster.get_service_users():
        log_dirs['{}_log_dir_config'.format(user.name)] = os.path.join(
            cluster.get_hadoop_install_dir(), "logs", user.name)

    env_str = env_str.safe_substitute(
        hadoop_home_config=cluster.get_hadoop_install_dir(),
        java_home=cluster.get_config(constants.KEY_JAVA_HOME),
        hdfs_datanode_secure_user=(constants.HDFS_USER
                                   if cluster.is_kerberized() else ''),
        hdfs_datanode_user=('root' if cluster.is_kerberized()
                            else constants.HDFS_USER),
        hdfs_user=constants.HDFS_USER,
        yarn_user=constants.YARN_USER,
        jsvc_home=constants.JSVC_HOME,
        **log_dirs)

    if cluster.is_tez_enabled():
        env_str = env_str + hadoop_env_tez_settings(cluster)

    with open(os.path.join(cluster.get_generated_hadoop_conf_tmp_dir(),
                           "hadoop-env.sh"), "w") as hadoop_env:
        hadoop_env.write(env_str)
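# A minimal, self-contained sketch of the string.Template substitution that
# generate_hadoop_env relies on. The placeholder names and paths below are
# illustrative; they are not the actual contents of hadoop-env.sh.template.
def _hadoop_env_template_example():
    from string import Template
    template = Template('export JAVA_HOME=${java_home}\n'
                        'export HADOOP_HOME=${hadoop_home_config}\n')
    # safe_substitute() leaves unknown ${...} placeholders untouched instead
    # of raising KeyError, which is why extra or unused template variables
    # are harmless.
    return template.safe_substitute(java_home='/usr/lib/jvm/java',
                                    hadoop_home_config='/opt/hadoop')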
def stop_dfs(cluster):
    """ Stops the dfs cluster. """
    get_logger().info("Stopping HDFS services ...")
    install_dir = cluster.get_hadoop_install_dir()
    stop_dfs_cmd = '{}/sbin/stop-dfs.sh'.format(install_dir)
    sudo(stop_dfs_cmd, user=constants.HDFS_USER)
    return True
def start_yarn(cluster):
    """ Starts the yarn services. """
    get_logger().info("Starting YARN services ... this can take some time.")
    install_dir = cluster.get_hadoop_install_dir()
    start_yarn_cmd = '{}/sbin/start-yarn.sh'.format(install_dir)
    sudo(start_yarn_cmd, pty=False, user=constants.YARN_USER)
    return True
def stop_scm(cluster):
    """ Stops the storage container manager. """
    get_logger().info("Stopping the SCM ...")
    if cluster.get_config(constants.KEY_OZONE_ENABLED):
        install_dir = cluster.get_hadoop_install_dir()
        stop_scm_cmd = '{}/bin/hdfs --daemon stop scm'.format(install_dir)
        sudo(stop_scm_cmd, user=constants.HDFS_USER)
    return True
def stop_jscsi_server(cluster):
    """ Stops the jSCSI server. """
    get_logger().info("Stopping the jSCSI server ...")
    if cluster.get_config(constants.KEY_OZONE_ENABLED):
        install_dir = cluster.get_hadoop_install_dir()
        stop_jscsi_cmd = '{}/bin/hdfs --daemon stop jscsi'.format(install_dir)
        sudo(stop_jscsi_cmd, user=constants.HDFS_USER)
    return True
def make_install_dir(cluster):
    with hide('status', 'warnings', 'running', 'stdout', 'stderr', 'user',
              'commands'):
        if not execute(make_base_install_dir, hosts=cluster.get_all_hosts(),
                       cluster=cluster):
            get_logger().error('Making install directory failed.')
            return False
def make_hadoop_log_dirs(cluster=None):
    logging_root = os.path.join(cluster.get_hadoop_install_dir(),
                                constants.HADOOP_LOG_DIR_NAME)
    get_logger().debug("Creating log output dir {} on host {}".format(
        logging_root, env.host))
    sudo('mkdir -p {}'.format(logging_root))
    sudo('chgrp {} {}'.format(constants.HADOOP_GROUP, logging_root))
    sudo('chmod 775 {}'.format(logging_root))
def format_namenode(cluster, cluster_id):
    """ Formats a NameNode using the given cluster_id. """
    # This command will prompt the user, so we are skipping the prompt.
    get_logger().info('Formatting NameNode {}'.format(env.host_string))
    with hide("stdout"):
        return sudo('{}/bin/hdfs namenode -format -clusterid {}'.format(
            cluster.get_hadoop_install_dir(), cluster_id),
            user=constants.HDFS_USER).succeeded
def start_cblock_server(cluster):
    """ Starts the cBlockServer. """
    get_logger().info("Starting the cBlockServer ...")
    if cluster.get_config(constants.KEY_OZONE_ENABLED):
        install_dir = cluster.get_hadoop_install_dir()
        start_cblock_cmd = '{}/bin/hdfs --daemon start cblockserver'.format(
            install_dir)
        sudo(start_cblock_cmd, user=constants.HDFS_USER)
    return True
def sshkey_gen(cluster=None, user=None):
    """ Generates an ssh key pair for the given user. """
    keyname = get_keyname_for_user(user=user)
    ssh_keys_dir = cluster.get_ssh_keys_tmp_dir()
    get_logger().debug("Generating a private key for user {}.".format(
        user.name))
    os.makedirs(ssh_keys_dir, exist_ok=True)
    local("rm -f {}/{}*".format(ssh_keys_dir, keyname))
    local('ssh-keygen -b 2048 -t rsa -f {}/{} -q -N ""'.format(
        ssh_keys_dir, keyname))
def fail_if_fabricrc_exists():
    """
    Fail if ~/.fabricrc exists. Credentials defined in .fabricrc can conflict
    with credentials generated by bman.
    :return:
    """
    fabricconf = os.path.join(os.path.expanduser("~"), ".fabricrc")
    if os.path.isfile(fabricconf):
        get_logger().error("Please remove the {} file.".format(fabricconf))
        sys.exit(-1)
def clean_root_dir(path):
    """
    Removes all files from the path.
    :param path:
    :return:
    """
    get_logger().debug("Cleaning root directory on host {}".format(env.host))
    with settings(warn_only=True):
        sudo("rm -rf %s" % path)
    return True
def generate_configs(cluster=None):
    if cluster is None:
        cluster = load_config()
    get_logger().info("Generating Hadoop configuration files")
    try:
        # We first copy all configuration files into the generated directory.
        # Once that is done, we process the files that need template
        # substitution and overwrite them. In other words, the plain copy of
        # all files must happen first.
        check_for_generated_dirs(cluster)
        copy_all_configs(cluster)
        update_hdfs_configs(cluster)
        generate_site_config(
            cluster, filename='core-site.xml',
            settings_key=constants.KEY_CORE_SITE_SETTINGS,
            output_dir=cluster.get_generated_hadoop_conf_tmp_dir())
        generate_site_config(
            cluster, filename='hdfs-site.xml',
            settings_key=constants.KEY_HDFS_SITE_SETTINGS,
            output_dir=cluster.get_generated_hadoop_conf_tmp_dir())
        if cluster.is_yarn_enabled():
            update_mapred_configs(cluster)
            update_yarn_configs(cluster)
            generate_site_config(
                cluster, filename='yarn-site.xml',
                settings_key=constants.KEY_YARN_SITE_SETTINGS,
                output_dir=cluster.get_generated_hadoop_conf_tmp_dir())
            generate_site_config(
                cluster, filename='mapred-site.xml',
                settings_key=constants.KEY_MAPRED_SITE_SETTINGS,
                output_dir=cluster.get_generated_hadoop_conf_tmp_dir())
        if cluster.is_tez_enabled():
            update_tez_configs(cluster)
            generate_site_config(
                cluster, filename='tez-site.xml',
                settings_key=constants.KEY_TEZ_SITE_SETTINGS,
                output_dir=cluster.get_generated_tez_conf_tmp_dir())
        if cluster.get_config(constants.KEY_OZONE_ENABLED):
            generate_ozone_site(cluster)
        generate_workers_file(cluster)
        generate_hadoop_env(cluster)
        generate_logging_properties(cluster)
    except Exception as e:
        get_logger().exception(e)
def read_config_value_with_altkey(self, values, key, altkey):
    value = None
    if key in values:
        self.config[key] = value = values[key]
    if not value and altkey in values:
        get_logger().warn(
            "{} has been deprecated by {}. Please update {}".format(
                altkey, key, self.get_config_file()))
        # Record the value read via the deprecated key so the required-key
        # check below does not fail.
        self.config[key] = value = values[altkey]
    if not value:
        raise ValueError("Required key {} is missing in YAML.".format(key))
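# A hedged illustration (hypothetical key names) of the deprecated-key
# fallback above:
#
#   values = {'hdfs.install.dir': '/opt/hadoop'}    # only the old key present
#   self.read_config_value_with_altkey(values,
#                                      key='hadoop.install.dir',
#                                      altkey='hdfs.install.dir')
#   # -> logs a deprecation warning and stores values['hdfs.install.dir']
#   #    under self.config['hadoop.install.dir']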
def do_kerberos_install(cluster=None):
    get_logger().info(
        "Installing jsvc and Linux container executor on all cluster hosts")
    copy_jce_policy_files(cluster)
    execute(install_jsvc, hosts=cluster.get_all_hosts())
    execute(install_container_executor, hosts=cluster.get_all_hosts(),
            cluster=cluster)
    make_headless_principals(cluster)
    generate_hdfs_principals_and_keytabs(cluster=cluster)
def do_active_transitions(cluster):
    for ns in cluster.get_hdfs_master_config().get_nameservices():
        if len(ns.get_nn_configs()) > 1:
            active_nn_id = ns.choose_active_nn()[0].nn_id
            get_logger().info("Transitioning {}.{} to active".format(
                ns.nsid, active_nn_id))
            cmd = '{}/bin/hdfs haadmin -ns {} -transitionToActive {}'.format(
                cluster.get_hadoop_install_dir(), ns.nsid, active_nn_id)
            targets = cluster.get_hdfs_master_config().get_nn_hosts()[0:1]
            execute(run_dfs_command, hosts=targets, cluster=cluster, cmd=cmd)
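# For reference, the HA transition command built above has this shape; the
# nameservice and NameNode IDs below are illustrative:
#
#   <install_dir>/bin/hdfs haadmin -ns ns1 -transitionToActive nn1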
def setup_passwordless_ssh(cluster, targets):
    # Setup password-less ssh for all service users.
    get_logger().info("Installing ssh keys for users [{}] on {} hosts.".format(
        ", ".join(cluster.get_service_user_names()), len(targets)))
    for user in cluster.get_service_users():
        sshkey_gen(cluster=cluster, user=user)
        for hostname in targets:
            sshkey_install(hostname=hostname, user=user, cluster=cluster)
        if not execute(copy_private_key, hosts=targets, user=user,
                       cluster=cluster):
            get_logger().error('Putting private key failed.')
            return False
def copy_all_configs(cluster=None):
    """ Copy the remaining files as-is, removing the .template suffix. """
    conf_generated_dir = cluster.get_generated_hadoop_conf_tmp_dir()
    get_logger().debug("Listing conf resources")
    for f in resource_listdir('bman.resources.conf', ''):
        if f.endswith('.template'):
            get_logger().debug("Got resource {}".format(f))
            resource_contents = resource_string('bman.resources.conf',
                                                f).decode('utf-8')
            filename = re.sub(".template$", "", f)
            with open(os.path.join(conf_generated_dir, filename),
                      "w") as output_file:
                output_file.write(resource_contents)
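# For example, re.sub(".template$", "", "core-site.xml.template") yields
# "core-site.xml"; resources without the .template suffix are skipped by the
# endswith() check above.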
def check_user_exists(username=None):
    """ Checks if the user exists on the remote machine. """
    get_logger().debug(
        "executing check_user_exists for user {} on host {}".format(
            username, env.host))
    with hide('status', 'aborts', 'warnings', 'running', 'stdout', 'stderr',
              'user', 'commands', 'output'), settings(warn_only=True):
        get_logger().debug("user is {} running id {}".format(
            env.user, username))
        result = sudo('id {}'.format(username), pty=True)
        return result.succeeded
def add_user(cluster=None, new_user=None):
    """ Creates an unprivileged user e.g. to run HDFS commands and submit jobs. """
    targets = cluster.get_all_hosts()
    with hide('status', 'warnings', 'running', 'stdout', 'stderr', 'user',
              'commands'):
        if not execute(add_user_task, hosts=targets, new_user=new_user):
            get_logger().error('Failed to create user {}.'.format(
                new_user.name))
            return False
    if cluster.is_kerberized():
        make_headless_principal(cluster, kadmin_util=None, user=new_user)
def run_hdfs(cluster):
    targets = cluster.get_hdfs_master_config().get_nn_hosts()
    execute(start_dfs, hosts=[targets[0]], cluster=cluster)
    do_active_transitions(cluster)

    # CBlock needs Ozone -- so we start everything.
    if cluster.get_config(constants.KEY_CBLOCK_CACHE):
        # We start SCM and cBlock on the NameNode machine for now.
        # TODO: Fix this so that if other machines are specified
        # we are able to get to them.
        if not execute(start_scm, hosts=targets, cluster=cluster):
            get_logger().error('Start SCM failed.')
            return False
        if not execute(start_cblock_server, hosts=targets, cluster=cluster):
            get_logger().error('Failed to start cBlock Server.')
            return False

        # Read the DataNode machine list now and execute the rest of the
        # commands.
        targets = cluster.get_worker_nodes()
        if not execute(start_jscsi_server, hosts=targets, cluster=cluster):
            get_logger().error('Unable to start datanodes.')
            return False
    else:
        # Just start SCM for Ozone; everything else is already running.
        if cluster.get_config(constants.KEY_OZONE_ENABLED):
            if not execute(start_scm, hosts=targets, cluster=cluster):
                get_logger().error('Start SCM failed.')
                return False
    return True