def watch_jobs(jobs):
    """
    Monitors submitted jobs until none of them is left in the queue.

    jobs -- iterable of job objects. Jobs whose status is 'Opt Submitted' or
            'TS Submitted' are watched as opt jobs, 'Freq Submitted' as freq
            jobs; any other status is recorded as a failed submission.

    The active queue is polled every 10 minutes. A watched job that is no
    longer in the queue is handed to check_opt / check_freq, and the returned
    status string decides whether it is watched again under its (possibly
    new) jobid, or recorded as crashed, stuck or completed. write_stats is
    called for completed and stuck jobs. A summary is logged at the end.

    Calls exit() if there is nothing to watch.
    """
    orunning = list()
    frunning = list()
    crashed = list()
    completed = list()
    failed_submit = list()
    stuck = list()
    jobdict = dict()
    starttime = time()

    for job in jobs:
        if job.status == 'Opt Submitted' or job.status == 'TS Submitted':
            orunning.append(job.jobid)
            jobdict[job.jobid] = job
        elif job.status == 'Freq Submitted':
            frunning.append(job.jobid)
            jobdict[job.jobid] = job
        else:
            failed_submit.append(job.name + ' - ' + job.status)

    logging.info('There are {} jobs being watched.'.format(
        len(jobdict)
    ))
    if len(failed_submit) > 0:
        logging.warning('There are {} jobs that failed to launch:\n{}'.format(
            len(failed_submit), turbogo_helpers.list_str(failed_submit)
        ))
    if len(jobdict) == 0:
        exit()

    loopcount = 0
    change = False
    #delay to ensure all jobs are in queue, and catch first moment fails
    sleep(60)

    while True:
        alljobs = turbogo_helpers.get_all_active_jobs()
        if len(alljobs) == 0 and (len(orunning) > 0 or len(frunning) > 0):
            #possible fail at getting jobs from queue
            sleep(60)
            alljobs = turbogo_helpers.get_all_active_jobs()
            if len(alljobs) == 0:
                #One more try
                sleep(300)
                alljobs = turbogo_helpers.get_all_active_jobs()

        #Whatever is left in these copies after striking out the active jobs
        #is no longer in the queue.
        checkojobs = list(orunning)
        checkfjobs = list(frunning)
        for job in alljobs:
            if job in checkojobs:
                checkojobs.remove(job)
            elif job in checkfjobs:
                checkfjobs.remove(job)

        #Some opt jobs not running
        for ojob in checkojobs:
            job = jobdict[ojob]
            del jobdict[ojob]
            orunning.remove(job.jobid)
            #find out what happened to the job & deal with it
            status = check_opt(job)
            if status == 'freq':
                #job.jobid is re-read here, after check_opt has run
                frunning.append(job.jobid)
                jobdict[job.jobid] = job
                logging.debug(
                    "Job {} submitted for freq with jobid {}.".format(
                        job.name, job.jobid
                    ))
            elif status == 'fcrashed':
                crashed.append(job.name)
                logging.debug("Job {} crashed starting freq.".format(
                    job.name
                ))
            elif status == 'ocrashed':
                crashed.append(job.name)
                logging.debug("Job {} crashed in opt.".format(
                    job.name
                ))
            else:
                completed.append(job.name)
                write_stats(job)
                logging.debug("Job {} completed opt.".format(
                    job.name
                ))
            change = True

        #some freq not running
        for fjob in checkfjobs:
            job = jobdict[fjob]
            del jobdict[fjob]
            frunning.remove(job.jobid)
            #find out what happened to the job and deal with it
            status = check_freq(job)
            if status == 'opt':
                #job was resubmitted with new geometry to avoid saddle point
                orunning.append(job.jobid)
                jobdict[job.jobid] = job
                logging.debug(
                    "Job {} resubmitted for opt with jobid {}.".format(
                        job.name, job.jobid
                    ))
            elif status == 'fcrashed':
                crashed.append(job.name)
                logging.debug("Job {} crashed starting freq.".format(
                    job.name
                ))
            elif status == 'ocrashed':
                crashed.append(job.name)
                logging.debug("Job {} crashed restarting opt.".format(
                    job.name
                ))
            elif status == 'same' or status == 'imaginary':
                stuck.append(job.name)
                write_stats(job)
                logging.info(
                    "Job {} stuck on transition state with freq {}.".format(
                        job.name, job.firstfreq))
            elif status == 'ts':
                write_stats(job)
                completed.append(job.name)
                logging.debug("Job {} completed ts.".format(
                    job.name
                ))
            else:
                write_stats(job)
                completed.append(job.name)
                logging.debug("Job {} completed freq.".format(
                    job.name
                ))
            change = True

        if len(orunning) == 0 and len(frunning) == 0:
            #all jobs finished or crashed
            break

        if loopcount % (3*6) == 0 and change:
            #3-Hourly status update if a change happened
            #(18 polls of 10 minutes each)
            logstring = "\n----------------------------------------------" \
                        "------\n"
            logstring += "At {}:\n".format(strftime("%d/%m/%y %H:%M:%S"))
            logstring += _job_sections([
                ("There are {} running opt jobs:\n{}\n", orunning),
                ("There are {} running freq jobs:\n{}\n", frunning),
                ("There are {} crashed jobs:\n{}\n", crashed),
                ("There are {} stuck jobs:\n{}\n", stuck),
                ("There are {} completed jobs:\n{}\n", completed),
            ])
            logstring += "-----------------------------------------------" \
                         "-----"
            logging.info(logstring)
            change = False
        loopcount += 1
        sleep(10*60)

    #after job finished/crashed logging
    elapsed = turbogo_helpers.time_readable(time()-starttime)
    #Report the jobs actually recorded as completed; the list counted here
    #before was never filled, so the message always said 0.
    logging.warning("{} jobs completed. {} jobs crashed.".format(
        len(completed), len(crashed)))
    logstring = "\n----------------------------------------------------\n"
    logstring += "Completed at {} after {}:\n".format(
        strftime("%d/%m/%y %H:%M:%S"), elapsed)
    logstring += _job_sections([
        ("There are {} completed jobs:\n{}\n", completed),
        ("There are {} stuck jobs:\n{}\n", stuck),
        ("There are {} crashed jobs:\n{}\n", crashed),
        ("There are {} jobs that failed to start:\n{}\n", failed_submit),
    ])
    logstring += "----------------------------------------------------"
    logging.info(logstring)


def _job_sections(sections):
    """
    Builds report text from (template, names) pairs, skipping empty lists.
    Each template is formatted with the number of names and their listing
    from turbogo_helpers.list_str.
    """
    return "".join(
        template.format(len(names), turbogo_helpers.list_str(names))
        for template, names in sections if len(names) > 0)
def write_stats(job):
    """
    Appends one line describing a job to 'stats.txt' in the current
    directory. If turbogo_helpers.check_files_exist reports the file as
    missing, it is first created with a header line.

    job -- object providing name, indir, infile, otime, ftime and firstfreq.

    The opt step count and energy are cut from fixed columns of the
    second-to-last line of the 'energy' file in job.indir. If that file
    cannot be read, or has fewer than two lines, both are written as '?'.
    File I/O errors are logged as warnings instead of being raised.
    """
    if not turbogo_helpers.check_files_exist(['stats.txt']):
        #write the header to the file
        try:
            with open('stats.txt', 'w') as f:
                f.write("{name:^16}{directory:^20}{optsteps:^10}{opttime:^12}" \
                        "{freqtime:^12}{tottime:^12}{firstfreq:^16}{energy:^16}"
                        .format(
                            name='Name',
                            directory='Directory',
                            optsteps='Opt Steps',
                            opttime='Opt Time',
                            freqtime='Freq Time',
                            tottime='Total Time',
                            firstfreq='1st Frequency',
                            energy='Energy',
                        ))
                f.write('\n')
        except IOError as e:
            logging.warning("Error preparing stats file: {}".format(e))
        except Exception as e:
            logging.warning("Unknown error {}".format(e))

    name = job.name
    directory = os.path.join(job.indir, job.infile)

    #Placeholders so every failure path below still yields a writable row.
    optsteps = '?'
    energy = '?'
    try:
        with open(os.path.join(job.indir, 'energy')) as f:
            lines = f.readlines()
    except IOError as e:
        logging.warning("Error reading energy file for stats: {}".format(e))
    except Exception as e:
        logging.warning("Unknown error {}.".format(e))
    else:
        if len(lines) >= 2:
            #step count is in the first 6 columns, energy in the next 16
            optsteps = lines[-2][:6].strip()
            energy = lines[-2][6:22].strip()
        else:
            logging.warning(
                "Energy file for stats has fewer than two lines.")

    opttime = turbogo_helpers.time_readable(job.otime)
    freqtime = turbogo_helpers.time_readable(job.ftime)
    tottime = turbogo_helpers.time_readable(job.otime + job.ftime)
    #NOTE(review): the '.16' precision below is rejected for int and None
    #values - confirm job.firstfreq is always a str or float here.
    firstfreq = job.firstfreq

    try:
        with open('stats.txt', 'a') as f:
            f.write("{name:^16.16}{directory:^20.20}{optsteps:^10.10}" \
                    "{opttime:^12.12}{freqtime:^12.12}{tottime:^12.12}" \
                    "{firstfreq:^16.16}{energy:^16.16}"
                    .format(
                        name=name,
                        directory=directory,
                        optsteps=optsteps,
                        opttime=opttime,
                        freqtime=freqtime,
                        tottime=tottime,
                        firstfreq=firstfreq,
                        energy=energy,
                    ))
            f.write('\n')
    except (OSError, IOError) as e:
        logging.warning("Error writing stats file: {}".format(e))