def _copy_base_conf(self):
    """Copy base configuration files to tmp dir."""

    self.temp_conf_dir = tempfile.mkdtemp("", "hadoop-", "/tmp")
    if os.path.exists(self.local_base_conf_dir):
        base_conf_files = [os.path.join(self.local_base_conf_dir, f)
                           for f in os.listdir(self.local_base_conf_dir)]
        for f in base_conf_files:
            shutil.copy(f, self.temp_conf_dir)
    else:
        logger.warning(
            "Local conf dir does not exist. Using default configuration")
        base_conf_files = []

    mandatory_files = [CORE_CONF_FILE, HDFS_CONF_FILE,
                       MR_CONF_FILE, YARN_CONF_FILE]

    missing_conf_files = mandatory_files
    for f in base_conf_files:
        f_base_name = os.path.basename(f)
        if f_base_name in missing_conf_files:
            missing_conf_files.remove(f_base_name)

    logger.info("Copying missing conf files from master: " +
                str(missing_conf_files))

    remote_missing_files = [os.path.join(self.conf_dir, f)
                            for f in missing_conf_files]

    action = Get([self.master], remote_missing_files, self.temp_conf_dir)
    action.run()
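# For context, a minimal sketch of the execo `Get` pattern used above:
# Get(hosts, remote_files, local_location) builds a parallel copy action
# whose per-host results can be checked after run(). The host name and
# paths below are hypothetical placeholders.
from execo import Get

action = Get(["node-1.example.org"],
             ["/etc/hadoop/core-site.xml"],
             "/tmp/conf-backup")
action.run()
if not action.ok:
    # Inspect the individual transfer processes to see which host failed
    for p in action.processes:
        if not p.ok:
            print("transfer failed on", p.host)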
def copy_history(self, dest, job_ids=None):
    """Copy history logs from dfs.

    Args:
      dest (str):
        The path of the local dir where the logs will be copied.
      job_ids (list of str, optional):
        A list with the ids of the jobs for which the history should be
        copied. If nothing is passed, the history of all jobs is copied.
    """

    if not os.path.exists(dest):
        logger.warning("Destination directory " + dest +
                       " does not exist. It will be created")
        os.makedirs(dest)

    # Dirs used
    user_login = getpass.getuser()
    hist_dfs_dir = "/tmp/hadoop-yarn/staging/history/done_intermediate/" + \
                   user_login
    hist_tmp_dir = "/tmp/hadoop_hist"

    # Remove file in tmp dir if exists
    proc = SshProcess("rm -rf " + hist_tmp_dir, self.master)
    proc.run()

    # Get files in master
    if job_ids:
        proc = SshProcess("mkdir " + hist_tmp_dir, self.master)
        proc.run()
        for jid in job_ids:
            self.execute("fs -get " + hist_dfs_dir + "/" + jid + "* " +
                         hist_tmp_dir, verbose=False)
    else:
        self.execute("fs -get " + hist_dfs_dir + " " + hist_tmp_dir,
                     verbose=False)

    # Copy files from master
    action = Get([self.master], [hist_tmp_dir], dest)
    action.run()
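# A hypothetical call site for `copy_history`, assuming `cluster` is the
# object exposing the method above (e.g. a hadoop_g5k-style cluster
# wrapper); the job ids are illustrative only.
cluster.copy_history("/tmp/hist-logs",
                     job_ids=["job_201501051400_0001",
                              "job_201501051400_0002"])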
def _initialize_conf(self):
    """Merge locally-specified configuration files with default files
    from the distribution."""

    if os.path.exists(self.local_base_conf_dir):
        base_conf_files = [os.path.join(self.local_base_conf_dir, f)
                           for f in os.listdir(self.local_base_conf_dir)]
        for f in base_conf_files:
            shutil.copy(f, self.init_conf_dir)
    else:
        logger.warning(
            "Local conf dir does not exist. Using default configuration")
        base_conf_files = []

    # Work on a copy: removing items from self.conf_mandatory_files
    # directly would mutate the attribute and break later calls
    missing_conf_files = list(self.conf_mandatory_files)
    for f in base_conf_files:
        f_base_name = os.path.basename(f)
        if f_base_name in missing_conf_files:
            missing_conf_files.remove(f_base_name)

    logger.info("Copying missing conf files from master: " +
                str(missing_conf_files))

    remote_missing_files = [os.path.join(self.conf_dir, f)
                            for f in missing_conf_files]

    action = Get([self.hosts[0]], remote_missing_files, self.init_conf_dir)
    action.run()
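# Why the defensive copy above matters, in a self-contained sketch:
# binding a list attribute to a new name only aliases it, so in-place
# removals would empty the attribute for every later call.
mandatory = ["core-site.xml", "hdfs-site.xml"]
alias = mandatory          # same list object, not a copy
alias.remove("core-site.xml")
print(mandatory)           # ['hdfs-site.xml'] -- the original was mutated
safe = list(mandatory)     # independent copy; safe to mutate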
def workflow(self, comb, host, comb_dir):
    """Run one parameter combination on the given host and retrieve its
    result files into comb_dir."""

    comb_ok = False
    thread_name = style.Thread(host.split('.')[0]) + ': '
    logger.info(thread_name + 'Starting combination ' + slugify(comb))

    try:
        logger.info(thread_name + 'Generate conf file')
        param_str = self.create_string(comb)

        Remote("python /home/Work/sgcbntier/paasage_demo/xml_gen_execo.py "
               "--cb " + param_str, [host]).run()

        logger.info(thread_name + 'Run code')
        Remote("cd /home/Work/sgcbntier/paasage_demo/ ; "
               "python run_all_execo.py --cb %s" % param_str, [host]).run()

        logger.info(thread_name + 'Get results')
        traceFile = "ntier_" + param_str
        get_results = Get([host],
                          ["/home/Work/sgcbntier/paasage_demo/csv/REQTASK_" +
                           traceFile + ".csv"],
                          local_location=comb_dir).run()
        for p in get_results.processes:
            if not p.ok:
                logger.error(
                    host + ': Unable to retrieve the files for combination %s',
                    slugify(comb))
                exit()

        comb_ok = True
    finally:
        if comb_ok:
            self.sweeper.done(comb)
            logger.info(thread_name + ': ' + slugify(comb) + ' has been done')
        else:
            self.sweeper.cancel(comb)
            logger.warning(thread_name + ': ' + slugify(comb) +
                           ' has been canceled')
        logger.info(style.step('%s Remaining'),
                    len(self.sweeper.get_remaining()))
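# A sketch of the execo_engine ParamSweeper setup that typically drives a
# workflow like the one above; the parameter names and values here are
# hypothetical.
from execo_engine import ParamSweeper, sweep

parameters = {'n_clients': [10, 50], 'think_time': [1, 5]}
sweeper = ParamSweeper("sweeps",              # persistence directory
                       sweeps=sweep(parameters))
comb = sweeper.get_next()                     # marked in-progress
# ... run the combination, then record the outcome:
sweeper.done(comb)    # success; or sweeper.cancel(comb) to retry it later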
def run_xp(self):
    """Iterate over the parameters and execute the bench."""
    master = self.cluster[0]

    while len(self.sweeper.get_remaining()) > 0:
        # Take sweeper
        comb = self.sweeper.get_next()
        logger.info('Processing new combination %s' % (comb,))
        try:
            # Metrics from the linux sar tool, bounded by wall-clock times
            def takeMetric(path, startTime, endTime,
                           metric=['cpu', 'mem', 'disk', 'swap', 'network']):
                opt = ''
                cmd_template_sar = ("sar -f /var/log/sysstat/sa* -{opt} "
                                    "-s {startTime} -e {endTime}")
                for met in metric:
                    if met == 'cpu':
                        opt = 'u'
                    elif met == 'mem':
                        opt = 'r'
                    elif met == 'disk':
                        opt = 'dp'
                    elif met == 'swap':
                        opt = 'S'
                    elif met == 'network':
                        opt = 'n DEV'
                    cmd = cmd_template_sar.format(opt=opt,
                                                  startTime=startTime,
                                                  endTime=endTime)
                    for host in self.cluster:
                        hE = SshProcess(cmd, host,
                                        connection_params={'user': '******'})
                        hE.run()
                        stdMetric = host + '-' + met + '.txt'
                        with open(os.path.join(path, stdMetric), "w") as sout:
                            sout.write(hE.stdout)

            # Set CPU freq and policy according to the current combination
            cmd_template_Freq_Policy = "cpufreq-set -r -g {policy}"
            cmd_template_Freq = "cpufreq-set -r -f {freq}"
            if comb['Freq'] == 'OnDemand':
                cmd_freq_policy = cmd_template_Freq_Policy.format(
                    policy='ondemand')
                Remote(cmd_freq_policy, master,
                       connection_params={'user': '******'}).run()
            elif comb['Freq'] == 'conservative':
                cmd_freq_policy = cmd_template_Freq_Policy.format(
                    policy='conservative')
                Remote(cmd_freq_policy, master,
                       connection_params={'user': '******'}).run()
            else:
                # A fixed frequency requires the userspace governor
                cmd_freq_policy = cmd_template_Freq_Policy.format(
                    policy='userspace')
                Remote(cmd_freq_policy, master,
                       connection_params={'user': '******'}).run()
                cmd_freq = cmd_template_Freq.format(freq=comb['Freq'])
                Remote(cmd_freq, master,
                       connection_params={'user': '******'}).run()

            # Build the benchmark command
            src = 'source /opt/intel-performance-snapshoot/apsvars.sh'
            cmd_mpirun_template = (
                "mpirun {opt} -f /root/cluster.txt -np {pr1} aps -r "
                "'/tmp/log/' /tmp/NPB/npb-mpi/bin/{typeNPB}.{NPBclass}.{pr2}")
            cmd_mpirun = cmd_mpirun_template.format(
                opt='',
                pr1=comb['n_core'],
                typeNPB=comb['Benchmark'],
                NPBclass=comb['NPBclass'],
                pr2=comb['n_core'])
            cmd = "{}; /tmp/NPB/bin/runMPI.sh '{}' '{}'".format(
                src, cmd_mpirun, slugify(comb))
            curPath = self.result_dir + slugify(comb)

            # Run MPI on the master node through an execo SshProcess
            def runMpi(cmd):
                act = SshProcess(cmd, master,
                                 connection_params={'user': '******'},
                                 shell=True)
                act.run()
                if not os.path.exists(curPath):
                    os.makedirs(curPath)
                with open(os.path.join(curPath, "stdout.txt"), "a+") as sout, \
                        open(os.path.join(curPath, "stderr.txt"), "w") as serr:
                    sout.write(act.stdout)
                    serr.write(act.stderr)
                return act.ok

            # Start the clock and execute the command on the master node
            time.sleep(5)
            startUnix = int(time.time())
            start24Hour = datetime.datetime.fromtimestamp(
                startUnix).strftime('%H:%M:%S')
            task1 = runMpi(cmd)
            endUnix = int(time.time())
            end24Hour = datetime.datetime.fromtimestamp(
                endUnix).strftime('%H:%M:%S')
            time.sleep(5)

            with open(os.path.join(curPath, "executionTime.txt"), "w") as sout:
                sout.write('ExecTime:{}\nStartDate:{}\nEndDate:{}\n'.format(
                    str(endUnix - startUnix), start24Hour, end24Hour))

            takeMetric(curPath, start24Hour, end24Hour,
                       ['cpu', 'mem', 'disk', 'swap', 'network'])

            # Collect power from kwapi, a Grid'5000 infrastructure tool
            for hostname in self.cluster:
                powerOut = '{}_power'.format(hostname)
                collect_metric(startUnix, endUnix, 'power', curPath,
                               self.site, powerOut, hostname)

            st = '/tmp/out/' + slugify(comb)
            intelAppPerf = str(st + '.html')
            # Get the data from ['Application Performance Snapshot',
            # 'Storage Performance Snapshot']
            # https://software.intel.com/en-us/performance-snapshot
            Get([master], [intelAppPerf], curPath,
                connection_params={'user': '******'}).run()

            if task1:
                logger.info("comb ok: %s" % (comb,))
                self.sweeper.done(comb)
                continue
        except OSError as err:
            print("OS error: {0}".format(err))
        except ValueError:
            print("Could not convert data to an integer.")
        except:
            print("Unexpected error:", sys.exc_info()[0])
            raise

        logger.info("comb NOT ok: %s" % (comb,))
        self.sweeper.cancel(comb)
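# A standalone sketch of the sar time-window query used by takeMetric:
# -f points at the recorded activity files, -s/-e bound the window with
# HH:MM:SS timestamps, and the flag selects the metric (-u CPU, -r memory,
# -dp disks, -S swap, -n DEV network interfaces). Paths are illustrative.
import datetime
import time

start = datetime.datetime.fromtimestamp(int(time.time()) - 600)
end = datetime.datetime.fromtimestamp(int(time.time()))
cmd = "sar -f /var/log/sysstat/sa* -u -s {} -e {}".format(
    start.strftime('%H:%M:%S'), end.strftime('%H:%M:%S'))
print(cmd)  # run over SSH on each node, as run_xp does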