def action(self):
    """Clear the HDFS storage directories over ssh, then format the namenode.

    Steps, in order:
      1. remove everything under the namenode ``dir`` and ``sdir`` folders
         on each host returned for the ``namen`` role;
      2. remove everything under the datanode ``dir`` folder on each host
         returned for the ``datan`` role;
      3. run ``hdfs namenode -format -force`` on the first configured
         namenode host.

    Returns:
        The falsy result of ``Command.parallel`` if one of the clean-up
        batches fails; otherwise ``True`` when the format command exits
        with status 0 and ``False`` when it does not.
    """
    logger.info('--> common.format_file_system <--')

    ssh_option = '-o StrictHostKeyChecking=no -o ConnectTimeout=5'
    binary_root = self.getClusterBinaryDir()

    #
    # clear hdfs files
    #

    # folders for namenode
    namenode_hosts = self.getHosts(roles=['namen', ])
    name_glob = os.path.join(
        self.getClusterHdfsDir(subdir=self.ys['roles']['namen']['dir']), '*')
    checkpoint_glob = os.path.join(
        self.getClusterHdfsDir(subdir=self.ys['roles']['namen']['sdir']), '*')
    wipe_namenodes = [
        "ssh {0} {2}@{1} -tt 'rm -rf {3} {4}' ".format(
            ssh_option, node['ip'], node['usr'], name_glob, checkpoint_glob)
        for node in namenode_hosts
    ]
    outcome = Command.parallel(wipe_namenodes)
    if not outcome:
        return outcome

    # folders for datanodes
    datanode_hosts = self.getHosts(roles=['datan', ])
    data_glob = os.path.join(
        self.getClusterHdfsDir(subdir=self.ys['roles']['datan']['dir']), '*')
    wipe_datanodes = [
        "ssh {0} {2}@{1} -tt 'rm -rf {3}' ".format(
            ssh_option, node['ip'], node['usr'], data_glob)
        for node in datanode_hosts
    ]
    outcome = Command.parallel(wipe_datanodes)
    if not outcome:
        return outcome

    #
    # format
    #
    hdfs_binary = os.path.join(binary_root, 'bin/hdfs')
    format_cmd = "{0} namenode -format -force".format(hdfs_binary)
    namenode_cfg = self.ys['roles']['namen']
    remote_call = "ssh {0} {2}@{1} -tt '{3}' ".format(
        ssh_option, namenode_cfg['hosts'][0], namenode_cfg['usr'], format_cmd)
    return 0 == Command.do(remote_call)
def action(self):
    """Run the two local scripts that prepare the build environment.

    First runs ``./utilities/setup_aliyun_maven_mirror.sh`` through
    ``Command.do``; if that succeeds, runs
    ``./utilities/install_compilation_prerequisites.sh`` through
    ``Command.sudo`` with the ``controlp`` role password.

    Returns:
        True when both commands exit with status 0, False otherwise.
    """
    logger.info('--> controlp.install_compilation_prerequisites <--')

    # Step 1: maven mirror setup (no elevated privileges).
    if Command.do('./utilities/setup_aliyun_maven_mirror.sh') != 0:
        return False

    # Step 2: prerequisite installation; `sudo -S` reads the password that
    # Command.sudo is given.
    install_cmd = 'sudo -S ./utilities/install_compilation_prerequisites.sh'
    status = Command.sudo(install_cmd, self.ys['roles']['controlp']['pwd'])
    return status == 0
def action(self):
    """Build the source tree with Maven as one chained shell command.

    The chain installs ``hadoop-maven-plugins`` first, then cleans,
    generates Eclipse metadata and finally runs the ``dist,native`` install
    build in the top-level source directory.

    Returns:
        True when the chained command exits with status 0. Otherwise a
        plain ``mvn package -DskipTests`` is attempted (its result is
        ignored) and False is returned.
    """
    logger.info('--> controlp.init_compile_src_code <--')

    source_root = self.getControlPSourceDir()
    maven_plugins_dir = self.getControlPSourceDir(
        subdir='hadoop-maven-plugins')

    build_steps = [
        "free",
        "cd %s" % (maven_plugins_dir),
        "mvn install",
        "cd %s" % (source_root),
        "mvn clean",
        "mvn eclipse:eclipse -DdownloadSources=true -DdownloadJavadocs=true -DskipTests",
        # "mvn dependency-check:aggregate", # TODO, fix hanging
        # "mvn package -Pdist,native,docs,src -DskipTests -Dtar"
        # -Pdocs enforces a format check of the docs, which triggers some
        # mvn errors, so it is left out of the profile list below.
        "mvn clean install -Pdist,native -DskipTests -Dmaven.javadoc.skip=true -Dtar",
    ]

    if Command.do(" && ".join(build_steps)) == 0:
        return True

    # NOTE(review): this fallback is a separate Command.do call, so it may
    # not run inside the source directory entered by the chain above --
    # confirm against Command.do.
    Command.do("mvn package -DskipTests")
    return False
def action(self):
    """Download the Hadoop binary tarball and unpack it next to the binary dir.

    Creates the control-plane binary directory when it does not exist yet,
    then pipes ``curl`` into ``tar`` so that the archive for
    ``self.ys['version']`` is extracted into the parent of that directory.

    Returns:
        False when the configured path exists but is not a directory, or
        when the download/extract pipeline exits non-zero; True otherwise.
    """
    logger.info('--> controlp.download_bin_code <--')

    target_dir = self.getControlPBinaryDir()
    if not os.path.exists(target_dir):
        os.makedirs(target_dir)

    if not os.path.isdir(target_dir):
        logger.error(
            "'binary code' does not indicate a folder in setting file.")
        return False

    tarball_url = "http://www-eu.apache.org/dist/hadoop/common/hadoop-{0}/hadoop-{0}.tar.gz".format(
        self.ys['version'])
    extract_root = os.path.join(target_dir, '../')

    # TODO, only exclude files
    pipeline = "curl -sSL {0} | tar -C {1} -xzv".format(
        tarball_url, extract_root)
    return Command.do(pipeline) == 0
def action(self):
    """Push binaries, helper scripts and configs from the control node to all hosts.

    Phases, executed sequentially and aborted at the first failing command:
      1. create / chown / chmod the ``binarycode`` folder on every host
         (via ``cmd.sudo`` with the host password);
      2. create the HDFS folders on namenode and datanode hosts;
      3. copy the binary package into ``<binarycode>/rose-on-yarn/``;
      4. copy ``./utilities/*`` into ``<binarycode>/scripts/``;
      5. run ``setup_passphraseless.sh`` on the namenode hosts (towards
         the datanodes) and on the resourcemanager hosts (towards the
         nodemanagers);
      6. copy ``./configs/*.xml`` and ``./configs/workers`` into the
         Hadoop configuration folder of every host;
      7. issue a final ``wait``.

    Returns:
        True when every command exits with status 0; False as soon as one
        does not.
    """
    logger.info('--> controlp.distribute_binary_package<--')

    def succeeded(command, status):
        # Log the outcome of one command; report whether it exited with 0.
        logger.info("ins: %s; retcode: %d." % (command, status))
        if status != 0:
            logger.error(command)
            return False
        return True

    ssh_option = '-o StrictHostKeyChecking=no -o ConnectTimeout=5'
    all_hosts = self.getHosts()
    # Read but never used below; kept so a missing 'sourcecode' setting
    # still fails here exactly as before.
    _sourcecode = self.ys['sourcecode']
    binarycode = self.ys['binarycode']

    #
    # add permissions
    #
    privileged_templates = (
        # create folders
        "ssh {0} {2}@{1} -tt 'sudo -S mkdir -p {3}' ",
        # chown
        "ssh {0} {2}@{1} -tt 'sudo -S chown -R {2} {3}' ",
        # chmod
        "ssh {0} {2}@{1} -tt 'sudo -S chmod -R 777 {3}' ",
    )
    for node in all_hosts:
        for template in privileged_templates:
            command = template.format(
                ssh_option, node['ip'], node['usr'], binarycode)
            if not succeeded(command, cmd.sudo(command, node['pwd'])):
                return False

    #
    # create hdfs folders
    #

    # folders for namenode
    namenode_hosts = self.getHosts(roles=['namen', ])
    name_dir = os.path.join(binarycode, self.ys['roles']['namen']['dir'])
    checkpoint_dir = os.path.join(
        binarycode, self.ys['roles']['namen']['sdir'])
    for node in namenode_hosts:
        command = "ssh {0} {2}@{1} -tt 'mkdir -p {3} {4}' ".format(
            ssh_option, node['ip'], node['usr'], name_dir, checkpoint_dir)
        if not succeeded(command, cmd.do(command)):
            return False

    # folders for datanodes
    datanode_hosts = self.getHosts(roles=['datan', ])
    data_dir = os.path.join(binarycode, self.ys['roles']['datan']['dir'])
    for node in datanode_hosts:
        command = "ssh {0} {2}@{1} -tt 'mkdir -p {3}' ".format(
            ssh_option, node['ip'], node['usr'], data_dir)
        if not succeeded(command, cmd.do(command)):
            return False

    #
    # binary code
    #
    package_glob = os.path.join(self.getControlPBinaryFolder(), '*')
    package_dest = os.path.join(binarycode, 'rose-on-yarn/')
    for node in all_hosts:
        command = "ssh {0} {2}@{1} -tt 'mkdir -p {4} && rm -rf {4}/*' && scp -r {0} {3} {2}@{1}:{4}".format(
            ssh_option, node['ip'], node['usr'], package_glob, package_dest)
        if not succeeded(command, cmd.do(command)):
            return False

    #
    # scripts about building env
    #
    scripts_glob = './utilities/*'
    scripts_dest = os.path.join(binarycode, 'scripts/')
    for node in all_hosts:
        command = "ssh {0} {2}@{1} -tt 'mkdir -p {4}' && scp -r {0} {3} {2}@{1}:{4} ".format(
            ssh_option, node['ip'], node['usr'], scripts_glob, scripts_dest)
        if not succeeded(command, cmd.do(command)):
            return False

    #
    # config setup_passphraseless from master to slaves
    #
    # NOTE(review): the inner quotes in the two templates below are plain
    # single quotes nested inside a single-quoted shell string, and the
    # worker password ends up in the logged command line -- confirm both
    # are acceptable.
    passphraseless_script = os.path.join(
        scripts_dest, 'setup_passphraseless.sh')
    master_worker_pairs = (
        # hdfs
        ('namen', 'datan',
         "ssh {0} {2}@{1} -tt '{3} \'{4}\' \'{5}\'' "),
        # yarn
        ('resourcem', 'nodem',
         "ssh {0} {2}@{1} -tt '{3} \'{4}\' \'{5}\''"),
    )
    for master_role, worker_role, template in master_worker_pairs:
        masters = self.getHosts(roles=[master_role, ])
        workers = self.getHosts(roles=[worker_role, ])
        worker_logins = [
            "%s@%s" % (worker['usr'], worker['ip']) for worker in workers
        ]
        for node in masters:
            command = template.format(
                ssh_option, node['ip'], node['usr'], passphraseless_script,
                ",".join(worker_logins),
                self.ys['roles'][worker_role]['pwd'])
            if not succeeded(command, cmd.do(command)):
                return False

    #
    # configs
    #
    config_sources = './configs/*.xml ./configs/workers'
    config_dest = os.path.join(binarycode, 'rose-on-yarn/etc/hadoop/')
    for node in all_hosts:
        command = "scp {0} {2}@{1}:{3} ".format(
            config_sources, node['ip'], node['usr'], config_dest)
        if not succeeded(command, cmd.do(command)):
            return False

    #
    # wait to end
    #
    command = 'wait'
    if cmd.do(command) != 0:
        logger.error(command)
        return False

    return True
def action(self):
    """Generate the Hadoop configuration files locally and sync them to the cluster.

    Steps, in order:
      1. write the slave IPs into ``./configs/workers``;
      2. copy the default ``*-site.xml`` templates and fill them in through
         ``putconfig``;
      3. derive ``yarn-rm-site.xml`` (the resourcemanager variant of
         ``yarn-site.xml``) and run ``format_file`` on all XML files;
      4. fill ``hadoop-env.sh`` and ``hadoop-metrics2.properties`` through
         ``put_config_line``;
      5. chmod the remote configuration folder, copy the generated files
         to every host, and copy ``yarn-rm-site.xml`` over
         ``yarn-site.xml`` on the resourcemanager hosts.

    Returns:
        False when one of the local shell commands exits non-zero; the
        falsy result of ``Command.parallel`` when a remote batch fails;
        otherwise the result of the final ``Command.parallel`` call.
    """
    logger.info('--> common.configure_site <--')

    ssh_option = '-o StrictHostKeyChecking=no -o ConnectTimeout=5'

    cluster_hadoop_lib_native = self.getClusterHadoopLibNativeDir()
    cluster_hadoop_conf_dir = self.getClusterHadoopConfDir()
    cluster_binary_dir = self.getClusterBinaryDir()
    cluster_hdfs_dir = self.getClusterHdfsDir()
    cluster_log_dir = self.getClusterLogDir()

    # Locally generated files; named once so every step refers to the
    # same path.
    core_site = './configs/core-site.xml'
    hdfs_site = './configs/hdfs-site.xml'
    yarn_site = './configs/yarn-site.xml'
    mapred_site = './configs/mapred-site.xml'
    yarn_rm_site = './configs/yarn-rm-site.xml'

    #
    # write slaves' ip into workers
    #
    slaves_list = self.getSlaveHosts()
    # `with` guarantees the file is closed even if a write raises.
    with open('./configs/workers', 'w') as workers:
        for host in slaves_list:
            workers.write("%s \n" % (host['ip']))

    #
    # configure *-site.xml
    #
    shutil.copy2('./configs/default/hadoop-core.xml', core_site)
    shutil.copy2('./configs/default/hadoop-hdfs.xml', hdfs_site)
    shutil.copy2('./configs/default/hadoop-yarn.xml', yarn_site)
    shutil.copy2('./configs/default/hadoop-mapred.xml', mapred_site)

    # log-level
    for log_property in ('mapreduce.map.log.level',
                         'mapreduce.reduce.log.level',
                         'yarn.app.mapreduce.am.log.level'):
        putconfig(file=mapred_site, name=log_property, value='DEBUG')

    # hdfs
    putconfig(file=core_site,
              name='fs.defaultFS',
              value="hdfs://%s:9000" % self.ys['roles']['namen']['hosts'][0])
    putconfig(file=hdfs_site, name='dfs.replication', value='3')
    # NOTE(review): os.path.join discards 'file:' when a later component is
    # an absolute path, so these values only carry the 'file:' prefix if
    # cluster_hdfs_dir is relative -- confirm which form is intended.
    putconfig(file=hdfs_site,
              name='dfs.namenode.name.dir',
              value=os.path.join('file:', cluster_hdfs_dir,
                                 self.ys['roles']['namen']['dir']))
    putconfig(file=hdfs_site,
              name='dfs.namenode.checkpoint.dir',
              value=os.path.join('file:', cluster_hdfs_dir,
                                 self.ys['roles']['namen']['sdir']))
    putconfig(file=hdfs_site,
              name='dfs.namenode.checkpoint.edits.dir',
              value=os.path.join('file:', cluster_hdfs_dir,
                                 self.ys['roles']['namen']['sdir']))
    putconfig(file=hdfs_site,
              name='dfs.datanode.data.dir',
              value=os.path.join('file:', cluster_hdfs_dir,
                                 self.ys['roles']['datan']['dir']))

    # mapreduce
    putconfig(file=mapred_site,
              name='mapreduce.task.timeout',
              value='300000')
    putconfig(file=mapred_site,
              name='mapreduce.map.memory.mb',
              value='1536')
    putconfig(file=mapred_site,
              name='mapreduce.map.cpu.vcores',
              value='1')
    putconfig(file=mapred_site,
              name='mapreduce.reduce.memory.mb',
              value='2048')
    putconfig(file=mapred_site,
              name='mapreduce.reduce.cpu.vcores',
              value='1')
    putconfig(file=mapred_site,
              name='mapreduce.framework.name',
              value='yarn')
    putconfig(file=yarn_site,
              name='yarn.nodemanager.aux-services',
              value='mapreduce_shuffle')
    putconfig(
        file=yarn_site,
        name='yarn.nodemanager.env-whitelist',
        value='JAVA_HOME,HADOOP_COMMON_HOME,HADOOP_HDFS_HOME,HADOOP_CONF_DIR,CLASSPATH_PREPEND_DISTCACHE,HADOOP_YARN_HOME,HADOOP_MAPRED_HOME'
    )

    # yarn
    putconfig(file=yarn_site,
              name='yarn.resourcemanager.hostname',
              value=self.ys['roles']['resourcem']['hosts'][0])
    putconfig(file=mapred_site,
              name='yarn.app.mapreduce.am.scheduler.heartbeat.interval-ms',
              value='3000')
    putconfig(file=yarn_site,
              name='yarn.resourcemanager.nodemanagers.heartbeat-interval-ms',
              value='3000')
    putconfig(file=yarn_site,
              name='yarn.webapp.ui2.enable',
              value='false')
    putconfig(file=yarn_site,
              name='yarn.nodemanager.resource.detect-hardware-capabilities',
              value='true')
    putconfig(file=yarn_site,
              name='yarn.scheduler.minimum-allocation-mb',
              value='512')
    putconfig(file=yarn_site,
              name='yarn.nodemanager.recovery.enabled',
              value='true')
    putconfig(file=yarn_site,
              name='yarn.nodemanager.address',
              value='${yarn.nodemanager.hostname}:45678')
    putconfig(file=yarn_site,
              name='yarn.nodemanager.recovery.supervised',
              value='true')

    # ROSE: yarn->webapp
    putconfig(file=yarn_site,
              name='yarn.resourcemanager.webapp.rrds.dir.cluster',
              value=self.ys['gmetad']['rrds']['dir'])

    # -- logs and tmp
    putconfig(file=core_site,
              name='hadoop.tmp.dir',
              value=self.getClusterTmpDir())
    putconfig(file=yarn_site,
              name='yarn.nodemanager.delete.debug-delay-sec',
              value='86400')  # 86400sec = 1day
    putconfig(file=yarn_site,
              name='yarn.nodemanager.log.retain-seconds',
              value='86400')  # 86400sec = 1day
    putconfig(file=yarn_site,
              name='yarn.log-aggregation-enable',
              value='true')
    putconfig(file=yarn_site,
              name='yarn.nodemanager.remote-app-log-dir',
              value=self.getClusterLogDir(subdir='remote-app-logs'))

    # jobhistory
    putconfig(file=mapred_site,
              name='mapreduce.jobhistory.address',
              value="%s:10020" % self.ys['roles']['resourcem']['hosts'][0])
    putconfig(file=mapred_site,
              name='mapreduce.jobhistory.webapp.address',
              value="%s:19888" % self.ys['roles']['resourcem']['hosts'][0])
    putconfig(file=mapred_site,
              name='mapreduce.jobhistory.webapp.https.address',
              value="%s:19890" % self.ys['roles']['resourcem']['hosts'][0])
    putconfig(file=mapred_site,
              name='mapreduce.jobhistory.admin.address',
              value="%s:10033" % self.ys['roles']['resourcem']['hosts'][0])

    # -- timeline service
    putconfig(file=yarn_site,
              name='yarn.timeline-service.enabled',
              value='true')  # todo. configure timeline
    putconfig(file=yarn_site,
              name='yarn.timeline-service.version',
              value='1.0f')  # 1.0f 1.5f
    putconfig(file=yarn_site,
              name='yarn.system-metrics-publisher.enabled',
              value='true')
    putconfig(
        file=yarn_site,
        name='yarn.timeline-service.generic-application-history.enabled',
        value='true')
    # The property name previously carried a trailing space, which does not
    # match the real Hadoop key; it is written without it now.
    putconfig(
        file=yarn_site,
        name='yarn.timeline-service.leveldb-timeline-store.ttl-interval-ms',
        value='60000')  # ms
    putconfig(file=yarn_site,
              name='yarn.timeline-service.hostname',
              value='${yarn.resourcemanager.hostname}')
    putconfig(file=yarn_site,
              name='yarn.timeline-service.recovery.enabled',
              value='true')
    putconfig(file=yarn_site,
              name='yarn.timeline-service.ttl-enable',
              value='true')
    putconfig(file=yarn_site,
              name='yarn.timeline-service.ttl-ms',
              value='86400000')  # 86400000ms = 1day

    # yarn-support opportunistic container scheduler
    putconfig(
        file=yarn_site,
        name='yarn.resourcemanager.opportunistic-container-allocation.enabled',
        value='true')
    putconfig(
        file=yarn_site,
        name='yarn.nodemanager.opportunistic-containers-max-queue-length',
        value='20')
    putconfig(file=yarn_site,
              name='yarn.nodemanager.container-monitor.interval-ms',
              value='3000')
    putconfig(file=yarn_site,
              name='yarn.nodemanager.health-checker.interval-ms',
              value='60000')

    # yarn-support distributed scheduler
    putconfig(file=yarn_site,
              name='yarn.nodemanager.distributed-scheduling.enabled',
              value='true')
    putconfig(file=yarn_site,
              name='yarn.nodemanager.amrmproxy.enabled',
              value='true')
    putconfig(file=yarn_site,
              name='yarn.nodemanager.amrmproxy.address',
              value='0.0.0.0:8049')
    putconfig(file=yarn_site,
              name='yarn.nodemanager.amrmproxy.client.thread-count',
              value='3')
    putconfig(
        file=yarn_site,
        name='yarn.resourcemanager.scheduler.address',
        # value="%s:8030" % self.ys['roles']['resourcem']['hosts'][0])
        value='0.0.0.0:8049')  # on RM, must change it into rm-ip:8030
    putconfig(
        file=yarn_site,
        name='yarn.nodemanager.amrmproxy.realrm.scheduler.address',
        value="%s:8030" % self.ys['roles']['resourcem']['hosts'][0],
        description="SUNXY-ROSE: targets to help AMRMProxy find real RM scheduler address"
    )

    # ROSE
    putconfig(
        file=yarn_site,
        name='yarn.rose.enabled',
        value='true',
        description="SUNXY-ROSE: targets to manage opportunistic containers as an overselling method"
    )

    # ROSE: support distributed scheduler: on RM, must change it into rm-ip:8030
    shutil.copy2(yarn_site, yarn_rm_site)
    putconfig(file=yarn_rm_site,
              name='yarn.resourcemanager.scheduler.address',
              value="%s:8030" % self.ys['roles']['resourcem']['hosts'][0])

    files = [core_site, hdfs_site, mapred_site, yarn_site, yarn_rm_site]
    ins = " && ".join("format_file %s" % path for path in files)
    retcode = Command.do(ins)
    logger.info("ins: %s; retcode: %d." % (ins, retcode))
    if retcode != 0:
        logger.error(ins)
        return False

    #
    # configure ./etc/hadoop/*.sh
    #
    shutil.copy2('./configs/default/hadoop-env.sh', './configs/hadoop-env.sh')
    hadoop_env_file = './configs/hadoop-env.sh'
    envlist = [
        ['PDSH_RCMD_TYPE', 'ssh'],
        ['JAVA_HOME', '/usr/lib/jvm/java-8-openjdk-amd64/'],
        ['HADOOP_HOME', cluster_binary_dir],
        ['HADOOP_YARN_HOME', cluster_binary_dir],
        ['HADOOP_HDFS_HOME', cluster_binary_dir],
        ['HADOOP_MAPRED_HOME', cluster_binary_dir],
        ['HADOOP_COMMON_HOME', cluster_binary_dir],
        ['HADOOP_COMMON_LIB_NATIVE_DIR', cluster_hadoop_lib_native],
        [
            'HADOOP_OPTS',
            "'\"${HADOOP_OPTS} -Djava.library.path=%s\"'" %
            (cluster_hadoop_lib_native)
        ],
        ['HADOOP_CONF_DIR', cluster_hadoop_conf_dir],
        ['HADOOP_LOG_DIR', cluster_log_dir],  # custom
        ['HADOOP_ROOT_LOGGER', 'DEBUG,console,RFA'],  # DEBUG mode custom
        ['HADOOP_DAEMON_ROOT_LOGGER', 'DEBUG,console,RFA'],  # DEBUG mode custom
        ['HADOOP_SECURITY_LOGGER', 'DEBUG,console,RFA'],  # DEBUG mode custom
        # ['YARN_CONF_DIR', cluster_hadoop_conf_dir], # Deprecated
        # ['YARN_ROOT_LOGGER', 'DEBUG,console,RFA'], # Deprecated
    ]
    ins = " && ".join(
        "put_config_line --file %s --property %s --value %s --prefix 'export' "
        % (hadoop_env_file, entry[0], entry[1]) for entry in envlist)
    retcode = Command.do(ins)
    logger.info("ins: %s; retcode: %d." % (ins, retcode))
    if retcode != 0:
        logger.error(ins)
        return False

    #
    # configure ./etc/hadoop/hadoop-metrics2.properties
    #
    shutil.copy2('./configs/default/hadoop-metrics2.properties',
                 './configs/hadoop-metrics2.properties')
    hadoop_metrics_file = './configs/hadoop-metrics2.properties'
    gmond_host = self.ys['gmond']['host']
    envlist = [
        # ['jobhistoryserver.sink.ganglia.servers', gmond_host],
        # ['mrappmaster.sink.ganglia.servers', gmond_host],
        ['nodemanager.sink.ganglia.servers', gmond_host],
        ['resourcemanager.sink.ganglia.servers', gmond_host],
        # ['datanode.sink.ganglia.servers', gmond_host],
        # ['namenode.sink.ganglia.servers', gmond_host],
        # ['datanode.sink.file.filename', 'datanode-metrics.out'],
        [
            'resourcemanager.sink.file.filename',
            'resourcemanager-metrics.out'
        ],
        ['nodemanager.sink.file.filename', 'nodemanager-metrics.out'],
        # ['mrappmaster.sink.file.filename', 'mrappmaster-metrics.out'],
        # ['jobhistoryserver.sink.file.filename', 'jobhistoryserver-metrics.out'],
        [
            'nodemanager.sink.file_jvm.class',
            'org.apache.hadoop.metrics2.sink.FileSink'
        ],
        ['nodemanager.sink.file_jvm.context', 'jvm'],
        [
            'nodemanager.sink.file_jvm.filename',
            'nodemanager-jvm-metrics.out'
        ],
        [
            'nodemanager.sink.file_mapred.class',
            'org.apache.hadoop.metrics2.sink.FileSink'
        ],
        ['nodemanager.sink.file_mapred.context', 'mapred'],
        [
            'nodemanager.sink.file_mapred.filename',
            'nodemanager-mapred-metrics.out'
        ],
        [
            '*.sink.ganglia.class',
            'org.apache.hadoop.metrics2.sink.ganglia.GangliaSink31'
        ],
        ['*.sink.ganglia.period', '10'],
        ['*.sink.ganglia.supportsparse', 'true'],
        [
            '*.sink.ganglia.slope',
            'jvm.metrics.gcCount=zero,jvm.metrics.memHeapUsedM=both'
        ],
        [
            '*.sink.ganglia.dmax',
            'jvm.metrics.threadsBlocked=70,jvm.metrics.memHeapUsedM=40'
        ],
    ]
    ins = " && ".join(
        "put_config_line --file %s --property %s --value %s " %
        (hadoop_metrics_file, entry[0], entry[1]) for entry in envlist)
    retcode = Command.do(ins)
    logger.info("ins: %s; retcode: %d." % (ins, retcode))
    if retcode != 0:
        logger.error(ins)
        return False

    #
    # sync configures
    #
    host_list = self.getHosts()
    rm_list = self.getHosts(roles=['resourcem', ])

    # chmod: each entry pairs the command with the host password it is
    # submitted with.
    instructions = [
        ("ssh {0} {2}@{1} -tt 'sudo -S chmod -R 777 {3}' ".format(
            ssh_option, host['ip'], host['usr'], cluster_hadoop_conf_dir),
         host['pwd'])
        for host in host_list
    ]
    ret = Command.parallel(instructions)
    if not ret:
        return ret

    # sync files to all nodes
    hbe_configs = './configs/hdfs-site.xml ./configs/mapred-site.xml \
        ./configs/yarn-site.xml ./configs/core-site.xml \
        ./configs/workers ./configs/hadoop-env.sh \
        ./configs/hadoop-metrics2.properties'
    instructions = [
        "ssh {1}@{0} -tt 'mkdir -p {3}' && scp {2} {1}@{0}:{3} ".format(
            host['ip'], host['usr'], hbe_configs, cluster_hadoop_conf_dir)
        for host in host_list
    ]
    ret = Command.parallel(instructions)
    if not ret:
        return ret

    # sync files to RMs: the resourcemanager variant replaces yarn-site.xml
    rm_yarn_site = os.path.join(cluster_hadoop_conf_dir, 'yarn-site.xml')
    instructions = [
        "ssh {1}@{0} -tt 'mkdir -p {3}' && scp ./configs/yarn-rm-site.xml {1}@{0}:{2}".format(
            host['ip'], host['usr'], rm_yarn_site, cluster_hadoop_conf_dir)
        for host in rm_list
    ]
    return Command.parallel(instructions)