def reportDash(self, jobReport):
    ''' dashboard report dictionary '''
    event_report = self.n_of_events(jobReport)
    storage_report, throughput_report = self.storageStat(jobReport)
    dashboard_report = {}
    # for k,v in event_report.iteritems() : dashboard_report[k]=v
    # extract information to be sent to DashBoard
    # per protocol and for action=read, calculate MBPS
    # dashboard key is io_action
    dashboard_report['MonitorID'] = self.MonitorID
    dashboard_report['MonitorJobID'] = self.MonitorJobID
    for protocol in storage_report.keys():
        for action in storage_report[protocol].keys():
            try:
                size = float(storage_report[protocol][action][2])
            except (IndexError, TypeError, ValueError):
                size = 'NULL'
            try:
                time = float(storage_report[protocol][action][3]) / 1000
            except (IndexError, TypeError, ValueError):
                time = 'NULL'
            dashboard_report['io_' + protocol + '_' + action] = str(size) + '_' + str(time)
    if self.debug:
        ordered = dashboard_report.keys()
        ordered.sort()
        for key in ordered:
            print key, '=', dashboard_report[key]
    # IO throughput information
    dashboard_report['io_read_throughput'] = throughput_report['readThr']
    dashboard_report['io_write_throughput'] = throughput_report['writeThr']
    dashboard_report['io_netAvg_throughput'] = throughput_report['avgNetThr']
    # send to DashBoard
    apmonSend(self.MonitorID, self.MonitorJobID, dashboard_report)
    apmonFree()
    if self.debug == 1:
        print dashboard_report
    return
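# The io_<protocol>_<action> values above pack size and time into one
# underscore-joined string. A minimal standalone sketch of that encoding,
# assuming a hypothetical storage_report entry (the protocol 'rfio' and the
# numbers are invented; the real values come from storageStat):
storage_report = {'rfio': {'read': [None, None, '250.0', '5000']}}

dashboard_report = {}
for protocol in storage_report:
    for action in storage_report[protocol]:
        size = float(storage_report[protocol][action][2])          # MB
        time = float(storage_report[protocol][action][3]) / 1000   # ms -> s
        dashboard_report['io_%s_%s' % (protocol, action)] = '%s_%s' % (size, time)

print(dashboard_report)   # {'io_rfio_read': '250.0_5.0'}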
def free(self):
    apmonFree()
syncid = os.environ.get('Dashboard_syncid')

# Replace the MetaID placeholder by the value of Dashboard_Id
_jobid = str(os.environ.get('Dashboard_Id'))
monitorid = monitorid.replace('MetaID', _jobid)
syncid = syncid.replace('MetaID', _jobid)

# Start Dashboard Report
hostname = str(socket.gethostname())
parameters = {
    'ExeStart': executable,
    'SyncCE': str(ce),
    'SyncGridJobId': syncid,
    'WNHostName': hostname
}
apmonSend(taskid, monitorid, parameters)
apmonFree()

###############
# Execute job
###############

# Add PWD to the PATH environment variable
myenv = os.environ
myenv['PATH'] += ':{0}'.format(os.environ.get('PWD'))

t0 = os.times()
p = subprocess.Popen(executable, stdout=subprocess.PIPE, stderr=subprocess.PIPE,
                     shell=True, env=myenv)
    elif first == default and line[21:24] == "1st":
        first = int(datetime.strptime(line[-29:-9], "%d-%b-%Y %X").strftime('%s'))
    return (finit, fopen, first)

(config, data) = sys.argv[1:]

with open(data, 'rb') as f:
    (args, files, lumis, stageout, server, taskid,
     monitorid, syncid, want_summary) = pickle.load(f)

apmonSend(taskid, monitorid, {
    'ExeStart': 'cmsRun',
    'SyncCE': 'ndcms.crc.nd.edu',
    'SyncGridJobId': syncid,
    'WNHostName': os.environ.get('HOSTNAME', '')
})
apmonFree()

configfile = config.replace(".py", "_mod.py")
shutil.copy2(config, configfile)

env = os.environ
env['X509_USER_PROXY'] = 'proxy'

edit_process_source(configfile, files, lumis, want_summary)

# exit_code = subprocess.call('python "{0}" {1}'.format(configfile, ' '.join(map(repr, args))), shell=True, env=env)
exit_code = subprocess.call(
    'cmsRun -j report.xml "{0}" {1} > cmssw.log 2>&1'.format(configfile, ' '.join(map(repr, args))),
    shell=True, env=env)

apmonSend(taskid, monitorid, {'ExeEnd': 'cmsRun'})

try:
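# line[-29:-9] above slices a fixed-width "%d-%b-%Y %X" timestamp (20
# characters) out of a CMSSW log line; strftime('%s') then converts it to
# epoch seconds, which relies on a glibc extension. A portable sketch of the
# same conversion, with an invented timestamp value:
import time
from datetime import datetime

stamp = '01-Jan-2015 12:00:00'   # what line[-29:-9] would yield (invented)

# time.mktime is the portable equivalent of the non-standard strftime('%s');
# %X resolves to %H:%M:%S in the C locale
first = int(time.mktime(datetime.strptime(stamp, '%d-%b-%Y %H:%M:%S').timetuple()))
print(first)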
def dashboard_postexecute(proc_ad, db):
    # Called in postexecute for each job.
    # Existence of ad attributes is not guaranteed by collect_history -> check each one
    if 'DESIRED_Sites' not in proc_ad:
        return

    try:
        if proc_ad['JobUniverse'] != 5: # 5: vanilla universe
            return
    except KeyError:
        return

    try:
        cluster_id = proc_ad['ClusterId']
    except KeyError:
        return

    try:
        taskid = _taskids[cluster_id]
    except KeyError:
        result = db.query(
            'SELECT `task_id` FROM `cms_tasks` WHERE `instance` = %s AND `cluster_id` = %s',
            HistoryDB.CONDOR_INSTANCE, cluster_id)

        if len(result) == 0:
            # cluster not known to history DB for some reason
            taskid = None
        else:
            taskid = result[0]

        _taskids[cluster_id] = taskid

    LOG.debug('Postexecute for CMS Global Pool: taskid for cluster id %d is %s' % (cluster_id, taskid))

    if taskid is None:
        return

    report_remote_host(proc_ad, taskid)

    try:
        cmssite = proc_ad['MATCH_GLIDEIN_CMSSite']
    except KeyError:
        cmssite = 'Unknown'

    if proc_ad['JobStatus'] != 4 or cmssite == 'Unknown':
        # JobStatus != 4 -> job did not complete (4 = Completed in HTCondor)
        # CMSSite unknown -> didn't run on a CMS resource
        # In both cases, report as Aborted
        LOG.debug('Reporting cluster id %d process id %d as Aborted' % (cluster_id, proc_ad['ProcId']))
        report_task_status(proc_ad, taskid, 'Aborted')
    else:
        LOG.debug('Reporting cluster id %d process id %d as Done' % (cluster_id, proc_ad['ProcId']))
        report_task_status(proc_ad, taskid, 'Done')
        report_exit_code(proc_ad, taskid)

    apmonFree()
def dashboard_postsubmit(proc_ads):
    # Called in postsubmit.
    # Report to Dashboard about the cluster. Individual job reports are sent by history_update.py
    if len(proc_ads) == 0:
        return

    if 'DESIRED_Sites' not in proc_ads[0]:
        # This cluster will not match any CMS resource
        return

    try:
        if proc_ads[0]['JobUniverse'] != 5: # 5: vanilla universe
            return
    except KeyError:
        return

    LOG.debug('Postsubmit for CMS Global Pool')

    db = HistoryDB()

    for proc_ad in proc_ads:
        cluster_id = proc_ad['ClusterId']

        try:
            taskid = _taskids[cluster_id]
            LOG.debug('Using task id %s for cluster id %d' % (taskid, cluster_id))
        except KeyError:
            result = db.query(
                'SELECT `task_id` FROM `cms_tasks` WHERE `instance` = %s AND `cluster_id` = %s',
                HistoryDB.CONDOR_INSTANCE, cluster_id)

            if len(result) == 0:
                # First time encountering this cluster id -> report to dashboard and record in DB
                taskid = report_master_submission(proc_ad)
                LOG.debug('New task id %s for cluster id %d' % (taskid, cluster_id))
                sql = 'INSERT INTO `cms_tasks` (`instance`, `cluster_id`, `task_id`) VALUES (%s, %s, %s)'
                db.query(sql, HistoryDB.CONDOR_INSTANCE, cluster_id, taskid)
            else:
                taskid = result[0]
                LOG.debug('Found task id %s for cluster id %d in DB' % (taskid, cluster_id))

            _taskids[cluster_id] = taskid

        # In CMSConnect, each dashboard task is allowed to have multiple clusters (Dashboard_Id is
        # the full serial ID of the jobs). Here we simplify by making a 1:1 correspondence between
        # clusters and tasks.
        proc_ad['Dashboard_TaskId'] = taskid
        proc_ad['Dashboard_Id'] = proc_ad['ProcId']

        report_task_submission(proc_ad, taskid)
        LOG.debug('Reported task submission to Dashboard')

    for proc_ad in proc_ads:
        taskid = _taskids[proc_ad['ClusterId']]
        report_task_status(proc_ad, taskid, 'Pending')

    apmonFree()
    LOG.debug('Reported all tasks as Pending')
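# report_task_status, report_master_submission etc. are defined elsewhere in
# this module. As a rough illustration only: a helper in the same spirit
# could forward a single status value through ApMon using the taskid and a
# per-job monitor ID, as sketched below. The 'StatusValue' parameter name and
# the monitor-ID construction are assumptions, not this module's actual code:
def report_task_status_sketch(proc_ad, taskid, status):
    # derive a per-job monitor ID from the ad (format is an assumption)
    monitorid = '%s_%s' % (proc_ad['Dashboard_Id'], taskid)
    apmonSend(taskid, monitorid, {'StatusValue': status})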
def popularityInfos(self, jobReport):
    report_dict = {}
    inputList = []
    inputParentList = []
    report_dict['inputBlocks'] = ''

    if os.path.exists(self.inputInfos):
        # avoid shadowing the builtin 'file'
        infoFile = open(self.inputInfos, 'r')
        lines = infoFile.readlines()
        for line in lines:
            if line.find("inputBlocks") >= 0:
                report_dict['inputBlocks'] = line.split("=")[1].strip()
            if line.find("inputFiles") >= 0:
                inputList = line.split("=")[1].strip().split(";")
            if line.find("parentFiles") >= 0:
                inputParentList = line.split("=")[1].strip().split(";")
        infoFile.close()

    if len(inputList) == 1 and inputList[0] == '':
        inputList = []
    if len(inputParentList) == 1 and inputParentList[0] == '':
        inputParentList = []

    basename = ''
    if len(inputList) > 1:
        basename = os.path.commonprefix(inputList)
    elif len(inputList) == 1:
        basename = "%s/" % os.path.dirname(inputList[0])

    basenameParent = ''
    if len(inputParentList) > 1:
        basenameParent = os.path.commonprefix(inputParentList)
    elif len(inputParentList) == 1:
        basenameParent = "%s/" % os.path.dirname(inputParentList[0])

    readFile = {}
    readFileParent = {}
    for inputFile in jobReport.inputFiles:
        fileAccess = 'Local'
        if inputFile.get("PFN").find('xrootd') >= 0:
            fileAccess = 'Remote'
        if inputFile['LFN'].find(basename) >= 0:
            fileAttr = (inputFile.get("FileType"), fileAccess, inputFile.get("Runs"))
            readFile[inputFile.get("LFN").split(basename)[1]] = fileAttr
        else:
            fileParentAttr = (inputFile.get("FileType"), fileAccess, inputFile.get("Runs"))
            readFileParent[inputFile.get("LFN").split(basenameParent)[1]] = fileParentAttr

    cleanedinputList = []
    for name in inputList:
        cleanedinputList.append(name.split(basename)[1])
    cleanedParentList = []
    for name in inputParentList:
        cleanedParentList.append(name.split(basenameParent)[1])

    inputString = ''
    LumisString = ''
    countFile = 1
    for f, t in readFile.items():
        cleanedinputList.remove(f)
        inputString += '%s::%d::%s::%s::%d;' % (f, 1, t[0], t[1], countFile)
        LumisString += '%s::%s::%d;' % (t[2].keys()[0], self.makeRanges(t[2].values()[0]), countFile)
        countFile += 1

    inputParentString = ''
    LumisParentString = ''
    countParentFile = 1
    for fp, tp in readFileParent.items():
        cleanedParentList.remove(fp)
        inputParentString += '%s::%d::%s::%s::%d;' % (fp, 1, tp[0], tp[1], countParentFile)
        LumisParentString += '%s::%s::%d;' % (tp[2].keys()[0], self.makeRanges(tp[2].values()[0]), countParentFile)
        countParentFile += 1

    # Files listed in the input info but never read: status 0 if they appear
    # in the reported error, status 2 (unknown) otherwise
    for unreadFile in cleanedinputList:
        if len(jobReport.errors) and jobReport.errors[0]["Description"].find(unreadFile) >= 0:
            fileAccess = 'Local'
            if jobReport.errors[0]["Description"].find('xrootd') >= 0:
                fileAccess = 'Remote'
            inputString += '%s::%d::%s::%s::%s;' % (unreadFile, 0, 'Unknown', fileAccess, 'Unknown')
        else:
            inputString += '%s::%d::%s::%s::%s;' % (unreadFile, 2, 'Unknown', 'Unknown', 'Unknown')

    for unreadFile in cleanedParentList:
        if len(jobReport.errors) and jobReport.errors[0]["Description"].find(unreadFile) >= 0:
            inputParentString += '%s::%d::%s::%s::%s;' % (unreadFile, 0, 'Unknown', 'Local', 'Unknown')
        else:
            inputParentString += '%s::%d::%s::%s::%s;' % (unreadFile, 2, 'Unknown', 'Unknown', 'Unknown')

    report_dict['inputFiles'] = inputString
    report_dict['parentFiles'] = inputParentString
    report_dict['lumisRange'] = LumisString
    report_dict['lumisParentRange'] = LumisParentString
    report_dict['Basename'] = basename
    report_dict['BasenameParent'] = basenameParent

    # send to DashBoard
    apmonSend(self.MonitorID, self.MonitorJobID, report_dict)
    apmonFree()

    if self.debug == 1:
        print "Popularity start"
        for k, v in report_dict.items():
            print "%s : %s" % (k, v)
        print "Popularity stop"
    return
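# Each entry in the inputFiles/parentFiles strings above is a ';'-separated
# list of name::status::type::access::index records, where status 1 = file
# was read, 0 = file appears in the reported error, 2 = unknown. A small
# sketch that decodes the format, using an invented sample value:
inputFiles = 'file_A.root::1::EDM::Local::1;file_B.root::2::Unknown::Unknown::Unknown;'

for record in inputFiles.strip(';').split(';'):
    name, status, ftype, access, index = record.split('::')
    print('%s -> status=%s type=%s access=%s index=%s' % (name, status, ftype, access, index))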