def action(self):
    logger.info('--> common.format_file_system <--')
    ssh_option = '-o StrictHostKeyChecking=no -o ConnectTimeout=5'
    cluster_binary_dir = self.getClusterBinaryDir()

    #
    # clear hdfs files
    #
    """ folders for namenode """
    name_nodes = self.getHosts(roles=['namen', ])
    namefiles = os.path.join(self.getClusterHdfsDir(
        subdir=self.ys['roles']['namen']['dir']), '*')
    namesfiles = os.path.join(self.getClusterHdfsDir(
        subdir=self.ys['roles']['namen']['sdir']), '*')
    instructions = list()
    for host in name_nodes:
        ins = "ssh {0} {2}@{1} -tt 'rm -rf {3} {4}' ".format(
            ssh_option, host['ip'], host['usr'], namefiles, namesfiles)
        instructions.append(ins)
    ret = Command.parallel(instructions)
    if not ret:
        return ret

    """ folders for datanodes """
    data_nodes = self.getHosts(roles=['datan', ])
    datafiles = os.path.join(self.getClusterHdfsDir(
        subdir=self.ys['roles']['datan']['dir']), '*')
    instructions = list()
    for host in data_nodes:
        ins = "ssh {0} {2}@{1} -tt 'rm -rf {3}' ".format(
            ssh_option, host['ip'], host['usr'], datafiles)
        instructions.append(ins)
    ret = Command.parallel(instructions)
    if not ret:
        return ret

    #
    # format
    #
    remote_ins = "{0} namenode -format -force".format(
        os.path.join(cluster_binary_dir, 'bin/hdfs'))
    ins = "ssh {0} {2}@{1} -tt '{3}' ".format(
        ssh_option, self.ys['roles']['namen']['hosts'][0],
        self.ys['roles']['namen']['usr'], remote_ins)
    return 0 == Command.do(ins)
def action(self):
    logger.info('--> controlp.distribute_binary_package <--')
    ssh_option = '-o StrictHostKeyChecking=no -o ConnectTimeout=5'
    host_list = self.getHosts()
    controlp_binary_dir = self.getControlPBinaryDir()
    cluster_binary_dir = self.getClusterBinaryDir()

    """ chmod """
    instructions = list()
    for host in host_list:
        ins = "ssh {0} {2}@{1} -tt 'sudo -S chmod -R 777 {3}' ".format(
            ssh_option, host['ip'], host['usr'], cluster_binary_dir)
        instructions.append((ins, host['pwd']))
    ret = Command.parallel(instructions)
    if not ret:
        return ret

    """ sync binary files """
    params = self.getParams()
    instructions = list()
    if len(params) > 0:
        """ # with params """
        candidates = list()
        for p in params:
            candidates.append(self.__parse(p))
        for can in candidates:
            for host in host_list:
                ins = ("scp -r {0} {3} {2}@{1}:{4}").format(
                    ssh_option, host['ip'], host['usr'], can[0], can[1])
                instructions.append(ins)
    else:
        """ # without params """
        controlp_binary_files = os.path.join(controlp_binary_dir, '*')
        cluster_binary_files = os.path.join(cluster_binary_dir, '*')
        for host in host_list:
            ins = ("ssh {0} {2}@{1} -tt 'mkdir -p {4} && rm -rf {5}'"
                   " && scp -r {0} {3} {2}@{1}:{4}").format(
                       ssh_option, host['ip'], host['usr'],
                       controlp_binary_files, cluster_binary_dir,
                       cluster_binary_files)
            instructions.append(ins)
    return Command.parallel(instructions)
def action(self):
    logger.info('--> common.stop <--')
    ssh_option = '-o StrictHostKeyChecking=no -o ConnectTimeout=600'
    host_list = self.getHosts()
    rm_list = self.getHosts(roles=['resourcem', ])

    # -- step1
    params = self.getParams()
    if len(params) == 0:
        params.append('hdfs')
        params.append('yarn')
        params.append('jobhistory')
        params.append('timelineserver')

    # -- step2
    EACH_HOST_INS = []  # e.g. 'nodemanager' and 'datanode'
    instructions = list()
    for p in params:
        tlist = None
        if p in EACH_HOST_INS:
            tlist = host_list
        else:
            tlist = rm_list
        for host in tlist:
            if self.__parse(p) is None:
                continue
            #!!! do not use the -tt option
            ins = "ssh {0} {2}@{1} -T '{3}' ".format(
                ssh_option, host['ip'], host['usr'], self.__parse(p))
            instructions.append(ins)
    ret = Command.parallel(instructions)
    if not ret:
        return ret

    # -- step3 : remove 'process information unavailable'
    if len(self.getParams()) == 0:
        instructions = list()
        for host in host_list:
            ins = "ssh {0} {2}@{1} -tt 'sudo -S rm -rf /tmp/hsperfdata*'".format(
                ssh_option, host['ip'], host['usr'])
            instructions.append((ins, host['pwd']))
        ret = Command.parallel(instructions)
    return ret
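# Note: the actions above and below delegate all shell execution to a Command
# helper. The class below is only an illustrative sketch of the interface they
# appear to assume, not the project's actual implementation: parallel() takes a
# list of shell strings or (command, sudo_password) tuples, runs them
# concurrently, and returns True only when every command exits with 0.
import subprocess
from concurrent.futures import ThreadPoolExecutor


class Command(object):

    @staticmethod
    def do(ins):
        """Run a single shell command and return its exit code."""
        return subprocess.call(ins, shell=True)

    @staticmethod
    def sudo(ins, pwd):
        """Run a 'sudo -S' command, feeding the password via stdin."""
        proc = subprocess.Popen(ins, shell=True, stdin=subprocess.PIPE)
        proc.communicate(input=('%s\n' % pwd).encode())
        return proc.returncode

    @staticmethod
    def parallel(instructions):
        """Run commands (strings or (command, password) tuples) concurrently."""
        def run(item):
            if isinstance(item, tuple):
                return Command.sudo(item[0], item[1])
            return Command.do(item)

        with ThreadPoolExecutor(max_workers=8) as pool:
            return all(code == 0 for code in pool.map(run, instructions))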
def action(self):
    logger.info('--> controlp.install_compilation_prerequisites <--')

    ins = './utilities/setup_aliyun_maven_mirror.sh'
    retcode = Command.do(ins)
    if retcode != 0:
        return False

    ins = 'sudo -S ./utilities/install_compilation_prerequisites.sh'
    retcode = Command.sudo(ins, self.ys['roles']['controlp']['pwd'])
    if retcode != 0:
        return False

    return True
def action(self):
    logger.info('--> common.clean <--')
    ssh_option = '-o StrictHostKeyChecking=no -o ConnectTimeout=600'
    host_list = self.getHosts()
    cluster_script_dir = self.getClusterScriptDir()

    params = self.getParams()
    if len(params) == 0:
        params.append('log')
        params.append('tmp')

    instructions = list()
    for p in params:
        remote_ins = self.__parse(p)
        for host in host_list:
            #!!! do not use the -tt option
            ins = "ssh {0} {2}@{1} -T '{3}' ".format(
                ssh_option, host['ip'], host['usr'], remote_ins)
            instructions.append(ins)
    return Command.parallel(instructions)
def action(self):
    logger.info('--> common.configure_ganglia_monitor <--')
    ssh_option = '-o StrictHostKeyChecking=no -o ConnectTimeout=5'
    host_list = self.getHosts()
    gmetad_list = self.getHosts(roles=['gmetad', ])

    instructions = list()
    for host in host_list:
        ins = "ssh {0} {2}@{1} -tt 'sudo -S apt-get install -y collectd-core ganglia-modules-linux ganglia-monitor ganglia-monitor-python libganglia1-dev libgmetric4j-java libjmxetric-java'".format(
            ssh_option, host['ip'], host['usr'])
        instructions.append((ins, host['pwd']))
    for host in gmetad_list:
        ins = "ssh {0} {2}@{1} -tt 'sudo -S apt-get install -y gmetad ganglia-webfrontend rrdtool'".format(
            ssh_option, host['ip'], host['usr'])
        instructions.append((ins, host['pwd']))
    ret = Command.parallel(instructions)
    if not ret:
        return ret
def action(self):
    logger.info('--> controlp.setup_passphraseless <--')
    host_list = self.getHosts()

    instructions = list()
    for host in host_list:
        # setup passphraseless
        ins = "./utilities/setup_passphraseless.sh '%s@%s' '%s'" % (
            host['usr'], host['ip'], host['pwd'])
        instructions.append(ins)
    return Command.parallel(instructions)
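# For context, a minimal Python equivalent of what setup_passphraseless.sh is
# assumed to do (the actual script may differ): create an RSA key pair if one
# is missing, then push the public key to the target host using the given
# password via sshpass + ssh-copy-id (assumes both tools are installed).
import os
import subprocess


def setup_passphraseless(target, password, key='~/.ssh/id_rsa'):
    key = os.path.expanduser(key)
    if not os.path.exists(key):
        # -N '' creates the key without a passphrase
        subprocess.check_call(
            ['ssh-keygen', '-t', 'rsa', '-N', '', '-f', key])
    # sshpass supplies the remote password non-interactively
    subprocess.check_call([
        'sshpass', '-p', password, 'ssh-copy-id',
        '-o', 'StrictHostKeyChecking=no', '-i', key + '.pub', target
    ])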
def action(self):
    logger.info('--> controlp.init_compile_src_code <--')
    controlp_source_dir = self.getControlPSourceDir()
    controlp_source_maven_plugins_dir = self.getControlPSourceDir(
        subdir='hadoop-maven-plugins')

    ins = " && ".join([
        "free",
        "cd %s" % (controlp_source_maven_plugins_dir),
        "mvn install",
        "cd %s" % (controlp_source_dir),
        "mvn clean",
        "mvn eclipse:eclipse -DdownloadSources=true -DdownloadJavadocs=true -DskipTests",
        # "mvn dependency-check:aggregate",  # TODO, fix hanging
        # "mvn package -Pdist,native,docs,src -DskipTests -Dtar"
        # -Pdocs enforces strict checks on the docs format, which triggers some mvn errors.
        "mvn clean install -Pdist,native -DskipTests -Dmaven.javadoc.skip=true -Dtar"
    ])
    retcode = Command.do(ins)
    if retcode != 0:
        Command.do("mvn package -DskipTests")
        return False
    return True
def action(self):
    logger.info('--> common.submit <--')
    ssh_option = '-o StrictHostKeyChecking=no -o ConnectTimeout=600'
    slaves_list = self.getSlaveHosts()

    params = self.getParams()
    instructions = list()
    for p in params:
        host = choice(slaves_list)
        #!!! do not use the -tt option
        ins = "ssh {0} {2}@{1} -T 'cd {3} && {4}' ".format(
            ssh_option, host['ip'], host['usr'],
            self.getClusterBinaryDir(), p)
        instructions.append(ins)
    return Command.parallel(instructions)
def action(self):
    logger.info('--> common.change_binarycode_mode_own <--')
    ssh_option = '-o StrictHostKeyChecking=no -o ConnectTimeout=5'
    host_list = self.getHosts()
    cluster_binary_dir = self.getClusterBinaryDir()
    cluster_script_dir = self.getClusterScriptDir()

    remote_ins = "sudo -S %s %s %s %s" % (
        os.path.join(cluster_script_dir, 'change_binarycode_mode_own.sh'),
        self.ys['opt']['group'], self.ys['opt']['user'], cluster_binary_dir)
    instructions = list()
    for host in host_list:
        ins = "ssh {0} {2}@{1} -tt '{3}' ".format(
            ssh_option, host['ip'], host['usr'], remote_ins)
        instructions.append((ins, host['pwd']))
    return Command.parallel(instructions)
def action(self):
    logger.info('--> controlp.download_bin_code <--')
    controlp_binary_dir = self.getControlPBinaryDir()

    if not os.path.exists(controlp_binary_dir):
        os.makedirs(controlp_binary_dir)
    if not os.path.isdir(controlp_binary_dir):
        logger.error(
            '\'binary code\' does not indicate a folder in the setting file.')
        return False

    link_address = "http://www-eu.apache.org/dist/hadoop/common/hadoop-{0}/hadoop-{0}.tar.gz".format(
        self.ys['version'])
    ins = "curl -sSL {0} | tar -C {1} -xzv".format(
        link_address,
        os.path.join(controlp_binary_dir, '../'))  # TODO: exclude unneeded files
    retcode = Command.do(ins)
    if retcode != 0:
        return False
    return True
def action(self):
    logger.info('--> common.install_runtime_prerequisties <--')
    ssh_option = '-o StrictHostKeyChecking=no -o ConnectTimeout=5'
    host_list = self.getHosts()
    cluster_script_dir = self.getClusterScriptDir()

    #
    # build master and slaves environment
    #
    remote_ins = os.path.join(cluster_script_dir,
                              'install_runtime_prerequisites.sh')
    instructions = list()
    for host in host_list:
        ins = "ssh {0} {2}@{1} -tt 'sudo -S {3}'".format(
            ssh_option, host['ip'], host['usr'], remote_ins)
        instructions.append((ins, host['pwd']))
    return Command.parallel(instructions)
def action(self):
    logger.info('--> controlp.compile_src_code <--')
    controlp_source_dir = self.getControlPSourceDir()

    params = self.getParams()
    candidates = list()
    for p in params:
        candidates.append(self.__parse(p))
    if len(candidates) == 0:
        candidates.append(controlp_source_dir)

    instructions = list()
    for can in candidates:
        ins = " && ".join([
            "cd %s" % (can),
            "mvn clean install -Pdist,native -DskipTests -Dmaven.javadoc.skip=true -Dtar"
        ])
        instructions.append(ins)
    return Command.parallel(instructions)
def action(self):
    logger.info('--> controlp.distribute_binary_package <--')
    ssh_option = '-o StrictHostKeyChecking=no -o ConnectTimeout=5'
    host_list = self.getHosts()
    sourcecode = self.ys['sourcecode']
    binarycode = self.ys['binarycode']

    #
    # add permissions
    #
    for host in host_list:
        """ create folders """
        ins = "ssh {0} {2}@{1} -tt 'sudo -S mkdir -p {3}' ".format(
            ssh_option, host['ip'], host['usr'], binarycode)
        retcode = cmd.sudo(ins, host['pwd'])
        logger.info("ins: %s; retcode: %d." % (ins, retcode))
        if retcode != 0:
            logger.error(ins)
            return False

        """ chown """
        ins = "ssh {0} {2}@{1} -tt 'sudo -S chown -R {2} {3}' ".format(
            ssh_option, host['ip'], host['usr'], binarycode)
        retcode = cmd.sudo(ins, host['pwd'])
        logger.info("ins: %s; retcode: %d." % (ins, retcode))
        if retcode != 0:
            logger.error(ins)
            return False

        """ chmod """
        ins = "ssh {0} {2}@{1} -tt 'sudo -S chmod -R 777 {3}' ".format(
            ssh_option, host['ip'], host['usr'], binarycode)
        retcode = cmd.sudo(ins, host['pwd'])
        logger.info("ins: %s; retcode: %d." % (ins, retcode))
        if retcode != 0:
            logger.error(ins)
            return False

    #
    # create hdfs folders
    #
    """ folders for namenode """
    name_nodes = self.getHosts(roles=['namen', ])
    namedir = os.path.join(binarycode, self.ys['roles']['namen']['dir'])
    namesdir = os.path.join(binarycode, self.ys['roles']['namen']['sdir'])
    for host in name_nodes:
        ins = "ssh {0} {2}@{1} -tt 'mkdir -p {3} {4}' ".format(
            ssh_option, host['ip'], host['usr'], namedir, namesdir)
        retcode = cmd.do(ins)
        logger.info("ins: %s; retcode: %d." % (ins, retcode))
        if retcode != 0:
            logger.error(ins)
            return False

    """ folders for datanodes """
    data_nodes = self.getHosts(roles=['datan', ])
    datadir = os.path.join(binarycode, self.ys['roles']['datan']['dir'])
    for host in data_nodes:
        ins = "ssh {0} {2}@{1} -tt 'mkdir -p {3}' ".format(
            ssh_option, host['ip'], host['usr'], datadir)
        retcode = cmd.do(ins)
        logger.info("ins: %s; retcode: %d." % (ins, retcode))
        if retcode != 0:
            logger.error(ins)
            return False

    #
    # binary code
    #
    sour_folder = os.path.join(self.getControlPBinaryFolder(), '*')
    dest_folder = os.path.join(binarycode, 'rose-on-yarn/')
    for host in host_list:
        ins = ("ssh {0} {2}@{1} -tt 'mkdir -p {4} && rm -rf {4}/*' "
               "&& scp -r {0} {3} {2}@{1}:{4}").format(
                   ssh_option, host['ip'], host['usr'], sour_folder,
                   dest_folder)
        retcode = cmd.do(ins)
        logger.info("ins: %s; retcode: %d." % (ins, retcode))
        if retcode != 0:
            logger.error(ins)
            return False

    #
    # scripts about building env
    #
    controlp_scripts = './utilities/*'
    dest_scripts_folder = os.path.join(binarycode, 'scripts/')
    for host in host_list:
        ins = ("ssh {0} {2}@{1} -tt 'mkdir -p {4}' "
               "&& scp -r {0} {3} {2}@{1}:{4} ").format(
                   ssh_option, host['ip'], host['usr'], controlp_scripts,
                   dest_scripts_folder)
        retcode = cmd.do(ins)
        logger.info("ins: %s; retcode: %d." % (ins, retcode))
        if retcode != 0:
            logger.error(ins)
            return False

    #
    # config setup_passphraseless from master to slaves
    #
    setup_passphraseless = os.path.join(dest_scripts_folder,
                                        'setup_passphraseless.sh')
    # hdfs
    namenode = self.getHosts(roles=['namen', ])
    datanodes = self.getHosts(roles=['datan', ])
    datanodes_hostname = list()
    for host in datanodes:
        datanodes_hostname.append("%s@%s" % (host['usr'], host['ip']))
    for host in namenode:
        ins = "ssh {0} {2}@{1} -tt '{3} \'{4}\' \'{5}\'' ".format(
            ssh_option, host['ip'], host['usr'], setup_passphraseless,
            ",".join(datanodes_hostname), self.ys['roles']['datan']['pwd'])
        retcode = cmd.do(ins)
        logger.info("ins: %s; retcode: %d." % (ins, retcode))
        if retcode != 0:
            logger.error(ins)
            return False

    # yarn
    resourcemanager = self.getHosts(roles=['resourcem', ])
    nodemanagers = self.getHosts(roles=['nodem', ])
    nodemanagers_hostname = list()
    for host in nodemanagers:
        nodemanagers_hostname.append("%s@%s" % (host['usr'], host['ip']))
    for host in resourcemanager:
        ins = "ssh {0} {2}@{1} -tt '{3} \'{4}\' \'{5}\''".format(
            ssh_option, host['ip'], host['usr'], setup_passphraseless,
            ",".join(nodemanagers_hostname),
            self.ys['roles']['nodem']['pwd'])
        retcode = cmd.do(ins)
        logger.info("ins: %s; retcode: %d." % (ins, retcode))
        if retcode != 0:
            logger.error(ins)
            return False

    #
    # configs
    #
    controlp_configs = './configs/*.xml ./configs/workers'
    dest_configs_folder = os.path.join(binarycode, 'rose-on-yarn/etc/hadoop/')
    for host in host_list:
        ins = "scp {0} {2}@{1}:{3} ".format(controlp_configs, host['ip'],
                                            host['usr'], dest_configs_folder)
        retcode = cmd.do(ins)
        logger.info("ins: %s; retcode: %d." % (ins, retcode))
        if retcode != 0:
            logger.error(ins)
            return False

    #
    # wait to end
    #
    ins = 'wait'
    retcode = cmd.do(ins)
    if retcode != 0:
        logger.error(ins)
        return False

    return True
def action(self):
    logger.info('--> controlp.distribute_binary_package_prep <--')
    ssh_option = '-o StrictHostKeyChecking=no -o ConnectTimeout=5'
    host_list = self.getHosts()
    cluster_script_dir = self.getClusterScriptDir()
    cluster_binary_dir = self.getClusterBinaryDir()
    cluster_hdfs_dir = self.getClusterHdfsDir()
    cluster_base_dir = self.getClusterBaseDir()

    #
    # clear cluster base dir
    # ------------------------------------------------------- #
    ret = True
    instructions = list()
    for host in host_list:
        ins = "ssh {0} {2}@{1} -tt 'sudo -S rm -rf `ls {3}/* | egrep -v {4}` ' ".format(
            ssh_option, host['ip'], host['usr'], cluster_base_dir,
            cluster_hdfs_dir)
        instructions.append((ins, host['pwd']))
    Command.parallel(instructions)

    #
    # add permissions
    # ------------------------------------------------------- #
    """ create folders """
    instructions = list()
    for host in host_list:
        ins = "ssh {0} {2}@{1} -tt 'sudo -S mkdir -p {3}' ".format(
            ssh_option, host['ip'], host['usr'], cluster_binary_dir)
        instructions.append((ins, host['pwd']))
    Command.parallel(instructions)

    """ chown """
    instructions = list()
    for host in host_list:
        ins = "ssh {0} {2}@{1} -tt 'sudo -S chown -R {2} {3}' ".format(
            ssh_option, host['ip'], host['usr'], cluster_base_dir)
        instructions.append((ins, host['pwd']))
    Command.parallel(instructions)

    """ chmod """
    instructions = list()
    for host in host_list:
        ins = "ssh {0} {2}@{1} -tt 'sudo -S chmod -R 777 {3}' ".format(
            ssh_option, host['ip'], host['usr'], cluster_base_dir)
        instructions.append((ins, host['pwd']))
    Command.parallel(instructions)

    #
    # create hdfs folders
    # ------------------------------------------------------- #
    instructions = list()

    """ folders for namenode """
    name_nodes = self.getHosts(roles=['namen', ])
    namedir = self.getClusterHdfsDir(
        subdir=self.ys['roles']['namen']['dir'])
    namesdir = self.getClusterHdfsDir(
        subdir=self.ys['roles']['namen']['sdir'])
    for host in name_nodes:
        ins = "ssh {0} {2}@{1} -tt 'mkdir -p {3} {4}' ".format(
            ssh_option, host['ip'], host['usr'], namedir, namesdir)
        instructions.append(ins)

    """ folders for datanodes """
    data_nodes = self.getHosts(roles=['datan', ])
    datadir = self.getClusterHdfsDir(
        subdir=self.ys['roles']['datan']['dir'])
    for host in data_nodes:
        ins = "ssh {0} {2}@{1} -tt 'mkdir -p {3}' ".format(
            ssh_option, host['ip'], host['usr'], datadir)
        instructions.append(ins)
    ret = Command.parallel(instructions)
    if not ret:
        return ret

    #
    # scripts about building env
    # ------------------------------------------------------- #
    instructions = list()
    hbe_utilities = './utilities/*'
    for host in host_list:
        ins = ("ssh {0} {2}@{1} -tt 'mkdir -p {4}' "
               "&& scp -r {0} {3} {2}@{1}:{4} ").format(
                   ssh_option, host['ip'], host['usr'], hbe_utilities,
                   cluster_script_dir)
        instructions.append(ins)
    ret = Command.parallel(instructions)
    if not ret:
        return ret

    #
    # config setup_passphraseless from master to slaves
    # ------------------------------------------------------- #
    instructions = list()
    setup_passphraseless = os.path.join(cluster_script_dir,
                                        'setup_passphraseless.sh')

    """ hdfs """
    namenode = self.getHosts(roles=['namen', ])
    datanodes = self.getHosts(roles=['datan', ])
    datanodes_hostname = list()
    for host in datanodes:
        datanodes_hostname.append("%s@%s" % (host['usr'], host['ip']))
    for host in namenode:
        ins = "ssh {0} {2}@{1} -tt '{3} \'{4}\' \'{5}\'' ".format(
            ssh_option, host['ip'], host['usr'], setup_passphraseless,
            ",".join(datanodes_hostname), self.ys['roles']['datan']['pwd'])
        instructions.append(ins)

    """ yarn """
    resourcemanager = self.getHosts(roles=['resourcem', ])
    nodemanagers = self.getHosts(roles=['nodem', ])
    nodemanagers_hostname = list()
    for host in nodemanagers:
        nodemanagers_hostname.append("%s@%s" % (host['usr'], host['ip']))
    for host in resourcemanager:
        ins = "ssh {0} {2}@{1} -tt '{3} \'{4}\' \'{5}\''".format(
            ssh_option, host['ip'], host['usr'], setup_passphraseless,
            ",".join(nodemanagers_hostname),
            self.ys['roles']['nodem']['pwd'])
        instructions.append(ins)
    return Command.parallel(instructions)
def action(self):
    logger.info('--> common.configure_site <--')
    ssh_option = '-o StrictHostKeyChecking=no -o ConnectTimeout=5'
    cluster_hadoop_lib_native = self.getClusterHadoopLibNativeDir()
    cluster_hadoop_conf_dir = self.getClusterHadoopConfDir()
    cluster_binary_dir = self.getClusterBinaryDir()
    cluster_hdfs_dir = self.getClusterHdfsDir()
    cluster_log_dir = self.getClusterLogDir()

    #
    # write slaves' IPs into the workers file
    #
    slaves_list = self.getSlaveHosts()
    workers = open('./configs/workers', 'w')
    for host in slaves_list:
        workers.write("%s \n" % (host['ip']))
    workers.close()

    #
    # configure *-site.xml
    #
    shutil.copy2('./configs/default/hadoop-core.xml',
                 './configs/core-site.xml')
    shutil.copy2('./configs/default/hadoop-hdfs.xml',
                 './configs/hdfs-site.xml')
    shutil.copy2('./configs/default/hadoop-yarn.xml',
                 './configs/yarn-site.xml')
    shutil.copy2('./configs/default/hadoop-mapred.xml',
                 './configs/mapred-site.xml')

    # log-level
    putconfig(file='./configs/mapred-site.xml',
              name='mapreduce.map.log.level',
              value='DEBUG')
    putconfig(file='./configs/mapred-site.xml',
              name='mapreduce.reduce.log.level',
              value='DEBUG')
    putconfig(file='./configs/mapred-site.xml',
              name='yarn.app.mapreduce.am.log.level',
              value='DEBUG')

    # hdfs
    putconfig(file='./configs/core-site.xml',
              name='fs.defaultFS',
              value="hdfs://%s:9000" % self.ys['roles']['namen']['hosts'][0])
    putconfig(file='./configs/hdfs-site.xml',
              name='dfs.replication',
              value='3')
    putconfig(file='./configs/hdfs-site.xml',
              name='dfs.namenode.name.dir',
              value=os.path.join('file:', cluster_hdfs_dir,
                                 self.ys['roles']['namen']['dir']))
    putconfig(file='./configs/hdfs-site.xml',
              name='dfs.namenode.checkpoint.dir',
              value=os.path.join('file:', cluster_hdfs_dir,
                                 self.ys['roles']['namen']['sdir']))
    putconfig(file='./configs/hdfs-site.xml',
              name='dfs.namenode.checkpoint.edits.dir',
              value=os.path.join('file:', cluster_hdfs_dir,
                                 self.ys['roles']['namen']['sdir']))
    putconfig(file='./configs/hdfs-site.xml',
              name='dfs.datanode.data.dir',
              value=os.path.join('file:', cluster_hdfs_dir,
                                 self.ys['roles']['datan']['dir']))

    # mapreduce
    putconfig(file='./configs/mapred-site.xml',
              name='mapreduce.task.timeout',
              value='300000')
    putconfig(file='./configs/mapred-site.xml',
              name='mapreduce.map.memory.mb',
              value='1536')
    putconfig(file='./configs/mapred-site.xml',
              name='mapreduce.map.cpu.vcores',
              value='1')
    putconfig(file='./configs/mapred-site.xml',
              name='mapreduce.reduce.memory.mb',
              value='2048')
    putconfig(file='./configs/mapred-site.xml',
              name='mapreduce.reduce.cpu.vcores',
              value='1')
    putconfig(file='./configs/mapred-site.xml',
              name='mapreduce.framework.name',
              value='yarn')
    putconfig(file='./configs/yarn-site.xml',
              name='yarn.nodemanager.aux-services',
              value='mapreduce_shuffle')
    putconfig(
        file='./configs/yarn-site.xml',
        name='yarn.nodemanager.env-whitelist',
        value='JAVA_HOME,HADOOP_COMMON_HOME,HADOOP_HDFS_HOME,HADOOP_CONF_DIR,CLASSPATH_PREPEND_DISTCACHE,HADOOP_YARN_HOME,HADOOP_MAPRED_HOME'
    )

    # yarn
    putconfig(file='./configs/yarn-site.xml',
              name='yarn.resourcemanager.hostname',
              value=self.ys['roles']['resourcem']['hosts'][0])
    putconfig(file='./configs/mapred-site.xml',
              name='yarn.app.mapreduce.am.scheduler.heartbeat.interval-ms',
              value='3000')
    putconfig(
        file='./configs/yarn-site.xml',
        name='yarn.resourcemanager.nodemanagers.heartbeat-interval-ms',
        value='3000')
    putconfig(file='./configs/yarn-site.xml',
              name='yarn.webapp.ui2.enable',
              value='false')
    putconfig(
        file='./configs/yarn-site.xml',
        name='yarn.nodemanager.resource.detect-hardware-capabilities',
        value='true')
    putconfig(file='./configs/yarn-site.xml',
              name='yarn.scheduler.minimum-allocation-mb',
              value='512')
    putconfig(file='./configs/yarn-site.xml',
              name='yarn.nodemanager.recovery.enabled',
              value='true')
    putconfig(file='./configs/yarn-site.xml',
              name='yarn.nodemanager.address',
              value='${yarn.nodemanager.hostname}:45678')
    putconfig(file='./configs/yarn-site.xml',
              name='yarn.nodemanager.recovery.supervised',
              value='true')

    # ROSE: yarn->webapp
    putconfig(file='./configs/yarn-site.xml',
              name='yarn.resourcemanager.webapp.rrds.dir.cluster',
              value=self.ys['gmetad']['rrds']['dir'])

    # -- logs and tmp
    putconfig(file='./configs/core-site.xml',
              name='hadoop.tmp.dir',
              value=self.getClusterTmpDir())
    putconfig(file='./configs/yarn-site.xml',
              name='yarn.nodemanager.delete.debug-delay-sec',
              value='86400')  # 86400 sec = 1 day
    putconfig(file='./configs/yarn-site.xml',
              name='yarn.nodemanager.log.retain-seconds',
              value='86400')  # 86400 sec = 1 day
    putconfig(file='./configs/yarn-site.xml',
              name='yarn.log-aggregation-enable',
              value='true')
    putconfig(file='./configs/yarn-site.xml',
              name='yarn.nodemanager.remote-app-log-dir',
              value=self.getClusterLogDir(subdir='remote-app-logs'))

    # jobhistory
    putconfig(file='./configs/mapred-site.xml',
              name='mapreduce.jobhistory.address',
              value="%s:10020" % self.ys['roles']['resourcem']['hosts'][0])
    putconfig(file='./configs/mapred-site.xml',
              name='mapreduce.jobhistory.webapp.address',
              value="%s:19888" % self.ys['roles']['resourcem']['hosts'][0])
    putconfig(file='./configs/mapred-site.xml',
              name='mapreduce.jobhistory.webapp.https.address',
              value="%s:19890" % self.ys['roles']['resourcem']['hosts'][0])
    putconfig(file='./configs/mapred-site.xml',
              name='mapreduce.jobhistory.admin.address',
              value="%s:10033" % self.ys['roles']['resourcem']['hosts'][0])

    # -- timeline service
    putconfig(file='./configs/yarn-site.xml',
              name='yarn.timeline-service.enabled',
              value='true')
    # TODO: configure the timeline service
    putconfig(file='./configs/yarn-site.xml',
              name='yarn.timeline-service.version',
              value='1.0f')  # 1.0f or 1.5f
    putconfig(file='./configs/yarn-site.xml',
              name='yarn.system-metrics-publisher.enabled',
              value='true')
    putconfig(
        file='./configs/yarn-site.xml',
        name='yarn.timeline-service.generic-application-history.enabled',
        value='true')
    putconfig(
        file='./configs/yarn-site.xml',
        name='yarn.timeline-service.leveldb-timeline-store.ttl-interval-ms',
        value='60000')  # ms
    putconfig(file='./configs/yarn-site.xml',
              name='yarn.timeline-service.hostname',
              value='${yarn.resourcemanager.hostname}')
    putconfig(file='./configs/yarn-site.xml',
              name='yarn.timeline-service.recovery.enabled',
              value='true')
    putconfig(file='./configs/yarn-site.xml',
              name='yarn.timeline-service.ttl-enable',
              value='true')
    putconfig(file='./configs/yarn-site.xml',
              name='yarn.timeline-service.ttl-ms',
              value='86400000')  # 86400000 ms = 1 day

    # yarn-support opportunistic container scheduler
    putconfig(
        file='./configs/yarn-site.xml',
        name='yarn.resourcemanager.opportunistic-container-allocation.enabled',
        value='true')
    putconfig(
        file='./configs/yarn-site.xml',
        name='yarn.nodemanager.opportunistic-containers-max-queue-length',
        value='20')
    putconfig(file='./configs/yarn-site.xml',
              name='yarn.nodemanager.container-monitor.interval-ms',
              value='3000')
    putconfig(file='./configs/yarn-site.xml',
              name='yarn.nodemanager.health-checker.interval-ms',
              value='60000')

    # yarn-support distributed scheduler
    putconfig(file='./configs/yarn-site.xml',
              name='yarn.nodemanager.distributed-scheduling.enabled',
              value='true')
    putconfig(file='./configs/yarn-site.xml',
              name='yarn.nodemanager.amrmproxy.enabled',
              value='true')
    putconfig(file='./configs/yarn-site.xml',
              name='yarn.nodemanager.amrmproxy.address',
              value='0.0.0.0:8049')
    putconfig(file='./configs/yarn-site.xml',
              name='yarn.nodemanager.amrmproxy.client.thread-count',
              value='3')
    putconfig(
        file='./configs/yarn-site.xml',
        name='yarn.resourcemanager.scheduler.address',
        # value="%s:8030" % self.ys['roles']['resourcem']['hosts'][0])
        value='0.0.0.0:8049')  # on the RM, this must be changed to rm-ip:8030
    putconfig(
        file='./configs/yarn-site.xml',
        name='yarn.nodemanager.amrmproxy.realrm.scheduler.address',
        value="%s:8030" % self.ys['roles']['resourcem']['hosts'][0],
        description="SUNXY-ROSE: targets to help AMRMProxy find real RM scheduler address"
    )

    # ROSE
    putconfig(
        file='./configs/yarn-site.xml',
        name='yarn.rose.enabled',
        value='true',
        description="SUNXY-ROSE: targets to manage opportunistic containers as an overselling method"
    )

    # ROSE: support distributed scheduler: on the RM, the scheduler address
    # must be changed to rm-ip:8030
    shutil.copy2('./configs/yarn-site.xml', './configs/yarn-rm-site.xml')
    putconfig(file='./configs/yarn-rm-site.xml',
              name='yarn.resourcemanager.scheduler.address',
              value="%s:8030" % self.ys['roles']['resourcem']['hosts'][0])

    files = [
        './configs/core-site.xml', './configs/hdfs-site.xml',
        './configs/mapred-site.xml', './configs/yarn-site.xml',
        './configs/yarn-rm-site.xml'
    ]
    ins = " && ".join(map(lambda x: "format_file %s" % x, files))
    retcode = Command.do(ins)
    logger.info("ins: %s; retcode: %d." % (ins, retcode))
    if retcode != 0:
        logger.error(ins)
        return False

    #
    # configure ./etc/hadoop/*.sh
    #
    shutil.copy2('./configs/default/hadoop-env.sh',
                 './configs/hadoop-env.sh')
    hadoop_env_file = './configs/hadoop-env.sh'
    envlist = [
        ['PDSH_RCMD_TYPE', 'ssh'],
        ['JAVA_HOME', '/usr/lib/jvm/java-8-openjdk-amd64/'],
        ['HADOOP_HOME', cluster_binary_dir],
        ['HADOOP_YARN_HOME', cluster_binary_dir],
        ['HADOOP_HDFS_HOME', cluster_binary_dir],
        ['HADOOP_MAPRED_HOME', cluster_binary_dir],
        ['HADOOP_COMMON_HOME', cluster_binary_dir],
        ['HADOOP_COMMON_LIB_NATIVE_DIR', cluster_hadoop_lib_native],
        [
            'HADOOP_OPTS',
            "'\"${HADOOP_OPTS} -Djava.library.path=%s\"'" %
            (cluster_hadoop_lib_native)
        ],
        ['HADOOP_CONF_DIR', cluster_hadoop_conf_dir],
        ['HADOOP_LOG_DIR', cluster_log_dir],  # custom
        ['HADOOP_ROOT_LOGGER', 'DEBUG,console,RFA'],  # DEBUG mode, custom
        ['HADOOP_DAEMON_ROOT_LOGGER', 'DEBUG,console,RFA'],  # DEBUG mode, custom
        ['HADOOP_SECURITY_LOGGER', 'DEBUG,console,RFA'],  # DEBUG mode, custom
        # ['YARN_CONF_DIR', cluster_hadoop_conf_dir],  # Deprecated
        # ['YARN_ROOT_LOGGER', 'DEBUG,console,RFA'],  # Deprecated
    ]
    ins = " && ".join(
        map(
            lambda x: "put_config_line --file %s --property %s --value %s --prefix 'export' "
            % (hadoop_env_file, x[0], x[1]), envlist))
    retcode = Command.do(ins)
    logger.info("ins: %s; retcode: %d." % (ins, retcode))
    if retcode != 0:
        logger.error(ins)
        return False

    #
    # configure ./etc/hadoop/hadoop-metrics2.properties
    #
    shutil.copy2('./configs/default/hadoop-metrics2.properties',
                 './configs/hadoop-metrics2.properties')
    hadoop_metrics_file = './configs/hadoop-metrics2.properties'
    gmond_host = self.ys['gmond']['host']
    envlist = [
        # ['jobhistoryserver.sink.ganglia.servers', gmond_host],
        # ['mrappmaster.sink.ganglia.servers', gmond_host],
        ['nodemanager.sink.ganglia.servers', gmond_host],
        ['resourcemanager.sink.ganglia.servers', gmond_host],
        # ['datanode.sink.ganglia.servers', gmond_host],
        # ['namenode.sink.ganglia.servers', gmond_host],
        # ['datanode.sink.file.filename', 'datanode-metrics.out'],
        ['resourcemanager.sink.file.filename', 'resourcemanager-metrics.out'],
        ['nodemanager.sink.file.filename', 'nodemanager-metrics.out'],
        # ['mrappmaster.sink.file.filename', 'mrappmaster-metrics.out'],
        # ['jobhistoryserver.sink.file.filename', 'jobhistoryserver-metrics.out'],
        ['nodemanager.sink.file_jvm.class',
         'org.apache.hadoop.metrics2.sink.FileSink'],
        ['nodemanager.sink.file_jvm.context', 'jvm'],
        ['nodemanager.sink.file_jvm.filename', 'nodemanager-jvm-metrics.out'],
        ['nodemanager.sink.file_mapred.class',
         'org.apache.hadoop.metrics2.sink.FileSink'],
        ['nodemanager.sink.file_mapred.context', 'mapred'],
        ['nodemanager.sink.file_mapred.filename',
         'nodemanager-mapred-metrics.out'],
        ['*.sink.ganglia.class',
         'org.apache.hadoop.metrics2.sink.ganglia.GangliaSink31'],
        ['*.sink.ganglia.period', '10'],
        ['*.sink.ganglia.supportsparse', 'true'],
        ['*.sink.ganglia.slope',
         'jvm.metrics.gcCount=zero,jvm.metrics.memHeapUsedM=both'],
        ['*.sink.ganglia.dmax',
         'jvm.metrics.threadsBlocked=70,jvm.metrics.memHeapUsedM=40'],
    ]
    ins = " && ".join(
        map(
            lambda x: "put_config_line --file %s --property %s --value %s "
            % (hadoop_metrics_file, x[0], x[1]), envlist))
    retcode = Command.do(ins)
    logger.info("ins: %s; retcode: %d." % (ins, retcode))
    if retcode != 0:
        logger.error(ins)
        return False

    #
    # sync configures
    #
    host_list = self.getHosts()
    rm_list = self.getHosts(roles=['resourcem', ])

    """ chmod """
    instructions = list()
    for host in host_list:
        ins = "ssh {0} {2}@{1} -tt 'sudo -S chmod -R 777 {3}' ".format(
            ssh_option, host['ip'], host['usr'], cluster_hadoop_conf_dir)
        instructions.append((ins, host['pwd']))
    ret = Command.parallel(instructions)
    if not ret:
        return ret

    """ sync files to all nodes """
    hbe_configs = './configs/hdfs-site.xml ./configs/mapred-site.xml \
        ./configs/yarn-site.xml ./configs/core-site.xml \
        ./configs/workers ./configs/hadoop-env.sh \
        ./configs/hadoop-metrics2.properties'
    instructions = list()
    for host in host_list:
        ins = "ssh {1}@{0} -tt 'mkdir -p {3}' && scp {2} {1}@{0}:{3} ".format(
            host['ip'], host['usr'], hbe_configs, cluster_hadoop_conf_dir)
        instructions.append(ins)
    ret = Command.parallel(instructions)
    if not ret:
        return ret

    """ sync files to RMs """
    instructions = list()
    for host in rm_list:
        ins = "ssh {1}@{0} -tt 'mkdir -p {3}' && scp ./configs/yarn-rm-site.xml {1}@{0}:{2}".format(
            host['ip'], host['usr'],
            os.path.join(cluster_hadoop_conf_dir, 'yarn-site.xml'),
            cluster_hadoop_conf_dir)
        instructions.append(ins)
    return Command.parallel(instructions)
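# configure_site() above relies on a putconfig() helper. The sketch below shows
# the behaviour it is assumed to have (insert or update a <property> element in
# a Hadoop *-site.xml); the project's real helper may differ, e.g. in how the
# file is pretty-printed afterwards (see the 'format_file' step above).
import xml.etree.ElementTree as ET


def putconfig(file, name, value, description=None):
    tree = ET.parse(file)
    root = tree.getroot()  # expected to be <configuration>
    for prop in root.findall('property'):
        if prop.findtext('name') == name:
            # property already present: overwrite its value
            value_node = prop.find('value')
            if value_node is None:
                value_node = ET.SubElement(prop, 'value')
            value_node.text = value
            break
    else:
        # property missing: append a new <property> block
        prop = ET.SubElement(root, 'property')
        ET.SubElement(prop, 'name').text = name
        ET.SubElement(prop, 'value').text = value
        if description is not None:
            ET.SubElement(prop, 'description').text = description
    tree.write(file)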