def parseWL( MainDir, XMLhandler ): DictionaryOfApps = XMLhandler.getDictionaryOfApplications() ListOfApps = AIStorageUtils.dict_sortbyvalue_dict( DictionaryOfApps, 'runTime', AIStorageUtils.SORT_TYPE_FLOAT, AIStorageUtils.SORT_ASCENDING ) NTotalJobs = len( ListOfApps ) print "Found", NTotalJobs, "apps. Sorting...done." RunDir = os.path.join( MainDir, "run" ) JDFsDir = os.path.join( MainDir, "jdfs" ) Timers = {} Timers['TurnAround'] = ItemTimer() Timers['SuccessfulTurnAround'] = ItemTimer() Timers['SuccessfulRun'] = ItemTimer() Timers['SuccessfulRunMPI'] = ItemTimer() AllowedFirstChar = ['2','3'] DataList = [] for (id, App) in ListOfApps: #-- generate item DataItem = {} DataItem['id'] = id if DataItem['id'][0] not in AllowedFirstChar: continue DataItem['name'] = App['name'] DataItem['jdf'] = App['jdf'] DataItem['SubmitStdOut'] = os.path.join( RunDir, "%s.out" % id ).replace(':', '-') DataItem['SubmitStdErr'] = os.path.join( RunDir, "%s.out" % id ).replace(':', '-') TurnAroundTime, RunnerJobID, JobResult, GlobusMessages = parseKOALASubmissionFile( DataItem['SubmitStdOut'] ) #print RunnerJobID, JobResult, TurnAroundTime if RunnerJobID: DataItem['RunnerJobID'] = RunnerJobID DataItem['JobResult'] = JobResult DataItem['TurnAroundTime'] = TurnAroundTime DataItem['GlobusMessages'] = GlobusMessages DataList.append( DataItem ) if TurnAroundTime: Timers['TurnAround'].addValue(TurnAroundTime) if JobResult == 'SUCCESS': Timers['SuccessfulTurnAround'].addValue(TurnAroundTime) if os.path.exists(DataItem['jdf']): OutputFilesList = getOutputFiles(DataItem['jdf']) for FileName in OutputFilesList: DirFileName = os.path.join(JDFsDir, FileName) RunTime, RunTimes = parseSMPIOutputFile( DirFileName ) if RunTime and RunTimes['Overall']['Max']: Timers['SuccessfulRun'].addValue(RunTime) Timers['SuccessfulRunMPI'].addValue(RunTimes['Overall']['Max']) ##else: ## print "RunTime", RunTime, "RunTimes", RunTimes print "All TurnAround time [s]: avg=%8.3f | min=%8d | max=%8d | #=%8d" % 
Timers['TurnAround'].getInfo() print "SUCCESS TurnAround time [s]: avg=%8.3f | min=%8d | max=%8d | #=%8d" % Timers['SuccessfulTurnAround'].getInfo() print "SUCCESS Run time [s]: avg=%8.3f | min=%8d | max=%8d | #=%8d" % Timers['SuccessfulRun'].getInfo() print "SUCCESS Run time MPI [s]: avg=%8.3f | min=%8d | max=%8d | #=%8d" % Timers['SuccessfulRunMPI'].getInfo()
def usage(progname): global Defaults ReplaceDic = {} VarsDic = vars(); for Key in VarsDic.keys(): ReplaceDic[Key] = VarsDic[Key] ListOfTestsData = AIStorageUtils.dict_sortbykey( TestData, AIStorageUtils.SORT_ASCENDING ) ListOfSortedNames = [] for Name, Dummy in ListOfTestsData: ListOfSortedNames.append(Name) ReplaceDic['AvailableTests'] = ','.join(ListOfSortedNames) print __doc__ % ReplaceDic
def main(argv): try: opts, args = getopt.getopt(argv, "h", ["help", "version"]) except getopt.GetoptError: print "Error while converting options: unknown option(s) encountered.\n\n" usage(os.path.basename(sys.argv[0])) sys.exit(2) for opt, arg in opts: if opt in ("-h", "--help"): usage(os.path.basename(sys.argv[0])) sys.exit() elif opt in ["--version"]: version() sys.exit() else: print "Unknown parameter", opt if len(args) < 1: #print "Error: No sites file given.\n\n" #usage(os.path.basename(sys.argv[0])) #sys.exit(3) SitesFileName = "grid-sites.xml" else: SitesFileName = args[0]; if not os.path.isfile( SitesFileName ): if os.path.exists( SitesFileName ): print "\n\n****\nError: %s is not a file!\n****\n" % SitesFileName else: print "\n\n****\nError: %s does not exist!\n****\n" % SitesFileName usage(os.path.basename(sys.argv[0])) sys.exit(1) #---Read a sites file #print "%s Parsing sites file %s" % \ # ( time.strftime('%Y-%m-%d %H:%M:%S', time.gmtime(time.time())), SitesFileName ) handlerXML = WLDocHandlers.readSiteFile(SitesFileName) #print "%s Sites file processed, proceeding to submission" % \ # time.strftime('%Y-%m-%d %H:%M:%S', time.gmtime(time.time())) DictionaryOfSites = handlerXML.getDictionaryOfSites() ListOfSites = AIStorageUtils.dict_sortbykey( DictionaryOfSites, AIStorageUtils.SORT_DESCENDING ) for (id, Site) in ListOfSites: print id, Site['location'], Site['machines'] print
def printWorkload( XMLhandler ): DictionaryOfApps = XMLhandler.getDictionaryOfApplications() ListOfApps = AIStorageUtils.dict_sortbyvalue_dict( DictionaryOfApps, 'jdf', AIStorageUtils.SORT_TYPE_STRING, AIStorageUtils.SORT_ASCENDING ) NTotalJobs = len( ListOfApps ) print "Found", NTotalJobs, "apps. Sorting...done." for (id, App) in ListOfApps: print id, App['jdf'], if 'dependsOn' in App: print 'dependsOn=', App['dependsOn'] else: print print "--Full description----------" for (id, App) in ListOfApps: print id, App
def runWL( OutputDir, XMLhandler, NPoolThreads, NoSubmit=0, NoBackground=0, OneFile=0, testid=0, projectid = "default", testerid=0, timediff=0.0 ):
    # Submit every application in the workload through a pool of
    # NPoolThreads worker threads, honouring each job's relative start
    # time ('runTime', presumably in milliseconds -- see the /1000.0 below;
    # TODO confirm against the workload generator).
    #
    # OutputDir    -- directory receiving per-job (or combined) stdout/stderr
    # XMLhandler   -- workload handler exposing getDictionaryOfApplications()
    # NoSubmit     -- when nonzero, submitJob only generates commands (v.0.12)
    # NoBackground -- when 0 the command is run in the foreground (both
    #                 branches currently store the same command line)
    # OneFile      -- when nonzero all jobs append to onefile.out/onefile.err
    #
    # Returns (StartSubmissionTime, EndSubmissionTime, NTotalJobs,
    # NTotalJobsInQueue).
    DictionaryOfApps = XMLhandler.getDictionaryOfApplications()
    # sort jobs ascending by scheduled start time
    ListOfApps = AIStorageUtils.dict_sortbyvalue_dict( DictionaryOfApps, 'runTime', AIStorageUtils.SORT_TYPE_FLOAT, AIStorageUtils.SORT_ASCENDING )
#    ListOfApps = AIStorageUtils.dict_sortbykey( DictionaryOfApps, AIStorageUtils.SORT_DESCENDING )
    for (id, App) in ListOfApps:
        print "Found ", App['id'], "due to start at", App['runTime']
    NTotalJobs = len( ListOfApps )
    #print "### Found", NTotalJobs, "apps. Sorting...done."
    # Normalize start times so the earliest job starts at t=0 and no job
    # has a negative start time.
    startTime = float(ListOfApps[0][1]['runTime'])
    if startTime < 0.0: startTime = 0.0
    for (id, App) in ListOfApps:
        App['runTime'] = float(App['runTime']) - startTime
        #print "### runTime:", App['runTime'], "\n"
        if App['runTime'] < 0.0: App['runTime'] = 0.0
        #print "ID", id, "starts in %.3fs." % float(App['runTime']/1000.0)
    #-- generate all work units
    try:
        os.mkdir( OutputDir )
    except:
        # best-effort: the directory may already exist
        pass
    FirstSubmission = time.time()
    CommandLinesList = []
    for (id, App) in ListOfApps:
        #-- generate item
        CommandLineItem = {}
        CommandLineItem['id'] = id
        CommandLineItem['firstSubmission'] = FirstSubmission
        # convert to seconds (runTime presumably in ms -- TODO confirm)
        CommandLineItem['startTime'] = float(App['runTime']/1000.0)
        #CommandLineItem['commandLine'] = "drunner -g -e -o -f %s 1> %s.out 2> %s.err &" % (App['jdf'], id, id)
        if OneFile == 0:
            # one .out/.err pair per job
            StdOutFile = os.path.join( OutputDir, "%s.out" % id )
            StdErrFile = os.path.join( OutputDir, "%s.err" % id )
            ActualCommand = "%s 1> %s 2> %s" % ( App['submitCommand'], StdOutFile, StdErrFile )
            #ActualCommand = "%s 2>%s" % ( App['submitCommand'], StdErrFile )
        else:
            # all jobs append to the shared onefile.out/onefile.err
            StdOutFile = os.path.join( OutputDir, "onefile.out" )
            StdErrFile = os.path.join( OutputDir, "onefile.err" )
            ActualCommand = "%s 1>> %s 2>> %s" % ( App['submitCommand'], StdOutFile, StdErrFile )
            #ActualCommand = "%s" % ( App['submitCommand'] )
        # NOTE(review): both branches store the same command; the ' &'
        # suffix for background execution is commented out.
        if NoBackground == 0:
            CommandLineItem['commandLine'] = ActualCommand #+ ' &'
        else:
            CommandLineItem['commandLine'] = ActualCommand
        CommandLineItem['stdout'] = StdOutFile
        CommandLineItem['stderr'] = StdErrFile
        CommandLineItem['onefile'] = OneFile
        #-- amod v.0.12: just generate commands
        CommandLineItem['NoSubmit'] = NoSubmit
        CommandLineItem['testid'] = testid
        CommandLineItem['projectid'] = projectid
        CommandLineItem['testerid'] = testerid
        CommandLineItem['timediff'] = timediff
        #-- append item
        #if os.path.exists(App['jdf']):
        CommandLinesList.append(CommandLineItem)
        #else:
        #    print "Could not locate JDF", App['jdf'], "... skipping job"
    #-- build a WorkRequest object for each work unit
    requests = ASPNThreadPool.makeRequests(submitJob, CommandLinesList, printSubmitJobResults)
    #-- create a pool of NPoolThreads worker threads
    StdOutLock.acquire()
    print "[wl-submit.py] Starting a thread pool with", NPoolThreads, "threads"
    StdOutLock.release()
    submitThreadPool = ASPNThreadPool.ThreadPool(NPoolThreads, StdOutLock)
    StartSubmissionTime = time.time()
    #-- add all work units into the thread pool
    # NOTE: We expect the thread pool to be based on Queues,
    #       because our applications need to be run at specified times
    #       and the submit job waits until the current work unit is done
    #       -> if we are NOT using Queues, it may happen that a work unit
    #          that needs to be submitted at time T will get submitted much
    #          later, due to other jobs starting the submission before it,
    #          but waiting for their later start time
    for req in requests:
        submitThreadPool.putRequest(req)
        #DEBUG:print req.args
        StdOutLock.acquire()
        print "[Pool] Work request #%s added (id=%s, start time=%.3f)." % \
              (req.requestID, req.args[0]['id'], req.args[0]['startTime'])
        StdOutLock.release()
    #-- wait for all submissions to be completed
    submitThreadPool.wait()
    # keep polling until all results are in or the user interrupts
    while 1:
        try:
            submitThreadPool.poll()
            EndSubmissionTime = time.time()
            #print "Main thread working..."
            time.sleep(0.5)
        except (KeyboardInterrupt, ASPNThreadPool.NoResultsPending):
            break
    EndSubmissionTime = time.time()
    NTotalJobsInQueue = len(submitThreadPool.workRequests)
    # should send to the database the 'onefile.out' and 'onefile.err' (not tested)
    if OneFile != 0:
        StdOutFile = os.path.join(OutputDir, "onefile.out")
        StdErrFile = os.path.join(OutputDir, "onefile.err")
        # Re-emit each non-trivial line of the combined stdout log, prefixed
        # with the \1-separated test/project/tester identifiers.
        try:
            fin = open(StdOutFile)
            lines = (fin.read()).split("\n")
            for line in lines:
                if (len(line) > 1):
                    sLine = "\n" + LOGFILE_PREFIX + str(testid) + "\1" + str(projectid) + "\1" + str(testerid) + "\1" + "0" + "\1" + line + "\n"
                    StdOutLock.acquire()
                    sys.stdout.write(sLine)
                    StdOutLock.release()
            fin.close()
        except:
            # NOTE(review): bare except silently ignores missing/unreadable log
            pass
        # same treatment for the combined stderr log
        try:
            fin = open(StdErrFile)
            lines = (fin.read()).split("\n")
            for line in lines:
                if (len(line) > 1):
                    sLine = "\n" + LOGFILE_PREFIX + str(testid) + "\1" + str(projectid) + "\1" + str(testerid) + "\1" + "0" + "\1" + line + "\n"
                    StdOutLock.acquire()
                    sys.stdout.write(sLine)
                    StdOutLock.release()
            fin.close()
        except:
            # NOTE(review): bare except silently ignores missing/unreadable log
            pass
    return StartSubmissionTime, EndSubmissionTime, NTotalJobs, NTotalJobsInQueue
def runWL(OutputDir, XMLhandler, NPoolThreads, NoSubmit=0, Background=0, OneFile=0): # --- get applications DictionaryOfApps = XMLhandler.getDictionaryOfApplications() # --- create composite structure manager TheCompositeApplicationData = CompositeApplicationData() ## ListOfApps = AIStorageUtils.dict_sortbykey( DictionaryOfApps, AIStorageUtils.SORT_DESCENDING ) ## for (id, App) in ListOfApps: ## print "Found ", App['id'], "due to start at", App['runTime'] ## # -- sort jobs ListOfApps = AIStorageUtils.dict_sortbyvalue_dict( DictionaryOfApps, "runTime", AIStorageUtils.SORT_TYPE_FLOAT, AIStorageUtils.SORT_ASCENDING ) NTotalJobs = len(ListOfApps) print "Found", NTotalJobs, "apps. Sorting...done." # Modification - C.S.: make all the tasks have the start time 0 # startTime = float(ListOfApps[0][1]['runTime']) # if startTime < 0.0: startTime = 0.0 startTime = 0.0 # -- correct start times and add all applications to the composite structure manager for (id, App) in ListOfApps: App["runTime"] = 0 # App['runTime'] = float(App['runTime']) - startTime # if App['runTime'] < 0.0: App['runTime'] = 0.0 print "ID", id, "starts in %.3fs." 
% float(App["runTime"] / 1000.0) # add the 'dependsOn' key if missing if "dependsOn" not in App: App["dependsOn"] = [] TheCompositeApplicationData.addJob(id, App) # -- create all 'enables' relations TheCompositeApplicationData.buildEnablesRelations() # -- mark all the starting jobs as 'can run' for id in TheCompositeApplicationData.JobsWithDeps: TheCompositeApplicationData.triggerCanRunCheck(id) # --- generate all work units try: os.mkdir(OutputDir) except: pass # --- build a WorkRequest object for each work unit FirstSubmission = time.time() CommandLinesList = [] for (id, App) in ListOfApps: # -- generate item CommandLineItem = {} CommandLineItem[".CompositeApplicationData"] = TheCompositeApplicationData CommandLineItem["id"] = id CommandLineItem["firstSubmission"] = FirstSubmission CommandLineItem["startTime"] = float(App["runTime"] / 1000.0) # CommandLineItem['commandLine'] = "drunner -g -e -o -f %s 1> %s.out 2> %s.err &" % (App['jdf'], id, id) if OneFile == 0: StdOutFile = os.path.join(OutputDir, "%s.out" % id) StdErrFile = os.path.join(OutputDir, "%s.err" % id) ActualCommand = "%s 1> %s 2> %s" % (App["submitCommand"], StdOutFile, StdErrFile) else: StdOutFile = os.path.join(OutputDir, "onefile.out") StdErrFile = os.path.join(OutputDir, "onefile.err") ActualCommand = "%s 1>> %s 2>> %s" % (App["submitCommand"], StdOutFile, StdErrFile) if Background == 1: CommandLineItem["commandLine"] = ActualCommand + " &" else: CommandLineItem["commandLine"] = ActualCommand # -- amod v.0.12: just generate commands CommandLineItem["NoSubmit"] = NoSubmit # -- append item if os.path.exists(App["jdf"]): CommandLinesList.append(CommandLineItem) else: print "Could not locate JDF", App["jdf"], "... 
skipping job" requests = ASPNThreadPool.makeRequests(runJob, CommandLinesList, printJobResults) # --- create a pool of NPoolThreads worker threads print "[wl-exec-dagman.py] Starting a thread pool with", NPoolThreads, "threads" submitThreadPool = ASPNThreadPool.ThreadPool(NPoolThreads, StdOutLock) StartSubmissionTime = time.time() # --- add all work units into the thread pool # NOTE: We expect the thread pool to be based on Queues, # beacause our applications need to be run at specified times # and the submit job waits until the current work unit is done # -> if we are NOT using Queues, it may happen that a work unit # that needs to be submitted at time T will get submitted much # later, due to other jobs starting the submission before it, # but waiting for their later start time # Modification - corina: the requests are put in the thread pool only when # their dependencies are satisfied requestsBkp = requests[:] for req in requestsBkp: reqId = req.args[0]["id"] # take only the runnable jobs if TheCompositeApplicationData.isRunnable(reqId): submitThreadPool.putRequest(req) # remove the request from the list if it was submitted to the pool requests.remove(req) # DEBUG:print req.args print "[Pool] Work request #%s added (id=%s, start time=%.3f)." % ( req.requestID, req.args[0]["id"], req.args[0]["startTime"], ) # --- wait for all submissions to be completed # submitThreadPool.wait() while 1: try: submitThreadPool.poll() EndSubmissionTime = time.time() time.sleep(0.5) ## if TheCompositeApplicationData.isCompositeApplicationFinished(): ## #submitThreadPool.wait() ## EndSubmissionTime = time.time() ## break ## time.sleep(1) ## #print "Main thread working..." 
except ASPNThreadPool.NoResultsPending: # -- check that all jobs have actually finished or failed if TheCompositeApplicationData.isCompositeApplicationFinished(): EndSubmissionTime = time.time() break else: # see if we have some more runnable jobs and add them to the pool requestsBkp2 = requests[:] for req in requestsBkp2: reqId = req.args[0]["id"] if TheCompositeApplicationData.isRunnable(reqId): submitThreadPool.putRequest(req) requests.remove(req) # DEBUG:print req.args print "[Pool] Work request #%s added (id=%s, start time=%.3f)." % ( req.requestID, req.args[0]["id"], req.args[0]["startTime"], ) print "[wl-exec-dagman] Got ASPNThreadPool.NoResultsPending" print " All:", TheCompositeApplicationData.TotalJobs, "Done:", TheCompositeApplicationData.TotalSuccessful, "Failed:", TheCompositeApplicationData.TotalFailed time.sleep(2) except KeyboardInterrupt: break except: print ">>>" + traceback.print_exc() raise Exception, "aaaaaaaaaaaaaaaaaaaaaaa" NTotalJobsInQueue = len(submitThreadPool.workRequests) ## print ">>>", "NTotalJobsInQueue:", NTotalJobsInQueue ## ## #-- mark all the starting jobs as 'can run' ## for id in TheCompositeApplicationData.JobsWithDeps: ## print "ID", id, "isFailed:", TheCompositeApplicationData.isFailed(id), "isSuccessful:", TheCompositeApplicationData.isSuccessful(id) return StartSubmissionTime, EndSubmissionTime, NTotalJobs, NTotalJobsInQueue