def test_pmlogextract(self, pmlogextracnfn, adjustjobfn): pmlogextracnfn.return_value = -10 summarizejob(self.mockjob, self.mockconf, {}, [], [], self.mockoutput, self.mocklog, self.options) self.verify_errors(ProcessingError.PMLOGEXTRACT_ERROR, 'skipped_pmlogextract_error')
def test_jobtoolong(self, summaryclass, extract): extract.return_value = 0 self.mockjob.configure_mock(walltime = 99999999) summarizejob(self.mockjob, self.mockconf, {}, [], [], self.mockoutput, self.mocklog, self.options) self.verify_errors(ProcessingError.TIME_TOO_LONG, 'skipped_too_long')
def test_invlid_nodecount(self, summaryclass, extract): extract.return_value = 0 self.mockjob.configure_mock(nodecount = 0) summarizejob(self.mockjob, self.mockconf, {}, [], [], self.mockoutput, self.mocklog, self.options) self.verify_errors(ProcessingError.INVALID_NODECOUNT, 'skipped_invalid_nodecount')
def test_parallel_too_short(self, summaryclass, extract): extract.return_value = 0 self.mockjob.configure_mock(walltime = 599, nodecount = 10) self.options['min_parallel_duration'] = 600 summarizejob(self.mockjob, self.mockconf, {}, [], [], self.mockoutput, self.mocklog, self.options) self.verify_errors(ProcessingError.PARALLEL_TOO_SHORT, 'skipped_parallel_too_short')
def test_job_too_short(self, summaryclass, extract): extract.return_value = 0 self.mockjob.configure_mock(walltime = 128) self.options['min_duration'] = 129 summarizejob(self.mockjob, self.mockconf, {}, [], [], self.mockoutput, self.mocklog, self.options) self.verify_errors(ProcessingError.TIME_TOO_SHORT, 'skipped_too_short')
def test_jobtoonodehours(self, summaryclass, extract): """ test the too many nodehours error """ extract.return_value = 0 self.mockjob.configure_mock(walltime=1000, nodecount=500) self.options['max_nodetime'] = 499999 summarizejob(self.mockjob, self.mockconf, {}, [], [], self.mockoutput, self.mocklog, self.options) self.verify_errors(ProcessingError.JOB_TOO_MANY_NODEHOURS, 'skipped_job_nodehours')
def test_pmlogextractfail2(self, popen, getextractcmdline, adjustjobfn): popensettings = { 'communicate.return_value': ("","__pmLogPutResult2: write failed: returns 491416 expecting 1246712: Cannot allocate memory"), 'returncode': 1 } popen.return_value = Mock(**popensettings) configres = {'getsection.return_value': {'subdir_out_format': '%j', 'archive_out_dir': tempfile.mkdtemp()}} self.mockconf.configure_mock(**configres) summarizejob(self.mockjob, self.mockconf, self.mockresconf, [], [], self.mockoutput, self.mocklog, self.options) self.verify_errors(ProcessingError.PMLOGEXTRACT_ERROR, 'skipped_pmlogextract_error')
def test_pmlogextractfail1(self, popen, getextractcmdline, adjustjobfn): popensettings = { 'communicate.return_value': ("","pmlogextract: Warning: no qualifying records found."), 'returncode': 1 } popen.return_value = Mock(**popensettings) configres = {'getsection.return_value': {'subdir_out_format': '%j', 'archive_out_dir': tempfile.mkdtemp()}} self.mockconf.configure_mock(**configres) summarizejob(self.mockjob, self.mockconf, self.mockresconf, [], [], self.mockoutput, self.mocklog, self.options) self.verify_errors(ProcessingError.PMLOGEXTRACT_ERROR, 'skipped_pmlogextract_error')
def test_pmlogextractfail0(self, popen, getextractcmdline, adjustjobfn): popensettings = { 'communicate.return_value': ("", "__pmLogPutResult2: write failed: returns 804876 expecting 954704: No space left on device" ), 'returncode': 1 } popen.return_value = Mock(**popensettings) configres = { 'getsection.return_value': { 'subdir_out_format': '%j', 'archive_out_dir': tempfile.mkdtemp() } } self.mockconf.configure_mock(**configres) _, mdata, _, error = summarizejob(self.mockjob, self.mockconf, self.mockresconf, [], [], self.options) self.verify_errors(ProcessingError.PMLOGEXTRACT_ERROR, 'skipped_pmlogextract_error', error, mdata)
def process_resource(resconf, preprocs, plugins, config, opts): with outputter.factory(config, resconf, dry_run=opts["dry_run"]) as m: if resconf['batch_system'] == "XDMoD": dbif = XDMoDAcct(resconf['resource_id'], config) else: dbif = DbAcct(resconf['resource_id'], config) for job in get_jobs(opts, dbif): try: summarize_start = time.time() res = summarizejob(job, config, resconf, plugins, preprocs, opts) if res is None: continue # Extract-only mode s, mdata, success, s_err = res summarize_time = time.time() - summarize_start summary_dict = s.get() except Exception as e: logging.error( "Failure for summarization of job %s %s. Error: %s %s", job.job_id, job.jobdir, str(e), traceback.format_exc()) clean_jobdir(opts, job) if opts["fail_fast"]: raise else: continue process_summary(m, dbif, opts, job, summarize_time, (summary_dict, mdata, success, s_err)) clean_jobdir(opts, job)
def process_job(config, dbif, job, m, opts, plugins, preprocs, resconf): try: summarize_start = time.time() summary, mdata, success, summarize_error = summarizejob(job, config, resconf, plugins, preprocs, opts) summarize_time = time.time() - summarize_start # TODO: change behavior so markasdone only happens if this is successful m.process(summary, mdata) if not opts['dry_run']: dbif.markasdone(job, success, summarize_time, summarize_error) except Exception as e: logging.error("Failure for job %s %s. Error: %s %s", job.job_id, job.jobdir, str(e), traceback.format_exc()) finally: if opts['dodelete'] and job.jobdir is not None and os.path.exists(job.jobdir): # Clean up shutil.rmtree(job.jobdir)
def do_summarize(args): """ used in a separate process """ job, config, resconf, plugins, preprocs, opts = args try: summarize_start = time.time() res = summarizejob(job, config, resconf, plugins, preprocs, opts) if res is None: return job, None, None # Extract-only mode s, mdata, success, s_err = res summarize_time = time.time() - summarize_start # Ensure Summarize.get() is called on worker process since it is cpu-intensive summary_dict = s.get() except Exception as e: logging.error("Failure for summarization of job %s %s. Error: %s %s", job.job_id, job.jobdir, str(e), traceback.format_exc()) if opts["fail_fast"]: raise return job, None, None return job, (summary_dict, mdata, success, s_err), summarize_time
def processjobs(config, opts, procid): """ main function that does the work. One run of this function per process """ allpreprocs = loadpreprocessors() logging.debug("Loaded %s preprocessors", len(allpreprocs)) allplugins = loadplugins() logging.debug("Loaded %s plugins", len(allplugins)) for r, resconf in config.resourceconfigs(): if opts['resource'] == None or opts['resource'] == r or opts[ 'resource'] == str(resconf['resource_id']): logging.info("Processing resource %s", r) else: continue resconf = override_defaults(resconf, opts) preprocs, plugins = filter_plugins(resconf, allpreprocs, allplugins) logging.debug("Using %s preprocessors", len(preprocs)) logging.debug("Using %s plugins", len(plugins)) with outputter.factory(config, resconf) as m: if resconf['batch_system'] == "XDMoD": dbif = XDMoDAcct(resconf['resource_id'], config, opts['threads'], procid) else: dbif = DbAcct(resconf['resource_id'], config, opts['threads'], procid) if opts['mode'] == "single": for job in dbif.getbylocaljobid(opts['local_job_id']): summarizejob(job, config, resconf, plugins, preprocs, m, dbif, opts) elif opts['mode'] == "timerange": for job in dbif.getbytimerange(opts['start'], opts['end'], opts): summarizejob(job, config, resconf, plugins, preprocs, m, dbif, opts) else: for job in dbif.get(None, None): summarizejob(job, config, resconf, plugins, preprocs, m, dbif, opts)
def processjobs(config, opts, procid, comm): """ main function that does the work. One run of this function per process """ allpreprocs = loadpreprocessors() logging.debug("Loaded %s preprocessors", len(allpreprocs)) allplugins = loadplugins() logging.debug("Loaded %s plugins", len(allplugins)) for r, resconf in config.resourceconfigs(): if opts['resource'] == None or opts['resource'] == r or opts['resource'] == str(resconf['resource_id']): logging.info("Processing resource %s", r) else: continue resconf = override_defaults(resconf, opts) preprocs, plugins = filter_plugins(resconf, allpreprocs, allplugins) logging.debug("Using %s preprocessors", len(preprocs)) logging.debug("Using %s plugins", len(plugins)) with outputter.factory(config, resconf) as m: if resconf['batch_system'] == "XDMoD": dbif = XDMoDAcct(resconf['resource_id'], config, None, None) else: dbif = DbAcct(resconf['resource_id'], config) list_procs = 0 # Master if procid == 0: getjobs = {} if opts['mode'] == "single": getjobs['cmd'] = dbif.getbylocaljobid getjobs['opts'] = [opts['local_job_id'],] elif opts['mode'] == "timerange": getjobs['cmd'] = dbif.getbytimerange getjobs['opts'] = [opts['start'], opts['end'], opts] else: getjobs['cmd'] = dbif.get getjobs['opts'] = [None, None] logging.debug("MASTER STARTING") numworkers = opts['threads']-1 numsent = 0 numreceived = 0 for job in getjobs['cmd'](*(getjobs['opts'])): if numsent >= numworkers: list_procs += 1 if opts['dump_proclist'] and (list_procs == 1 or list_procs == 1000): # Once all ranks are going, dump the process list for debugging logging.info("Dumping process list") allpinfo = {} for proc in psutil.process_iter(): try: pinfo = proc.as_dict() except psutil.NoSuchProcess: pass else: allpinfo[pinfo['pid']] = pinfo with open("rank-{}_{}.proclist".format(procid, list_procs), 'w') as outfile: json.dump(allpinfo, outfile, indent=2) # Wait for a worker to be done and then send more work process = comm.recv(source=MPI.ANY_SOURCE, tag=1) numreceived += 1 comm.send(job, dest=process, tag=1) numsent += 1 logging.debug("Sent new job: %d sent, %d received", numsent, numreceived) else: # Initial batch comm.send(job, dest=numsent+1, tag=1) numsent += 1 logging.debug("Initial Batch: %d sent, %d received", numsent, numreceived) logging.info("After all jobs sent: %d sent, %d received", numsent, numreceived) # Get leftover results while numsent > numreceived: comm.recv(source=MPI.ANY_SOURCE, tag=1) numreceived += 1 logging.debug("Getting leftovers. %d sent, %d received", numsent, numreceived) # Shut them down for worker in xrange(numworkers): logging.debug("Shutting down: %d", worker+1) comm.send(None, dest=worker+1, tag=1) # Worker else: sendtime = time.time() midtime = time.time() recvtime = time.time() logging.debug("WORKER %d STARTING", procid) while True: recvtries = 0 while not comm.Iprobe(source=0, tag=1): if recvtries < 1000: recvtries += 1 continue # Sleep so we can instrument how efficient we are # Otherwise, workers spin on exit at the hidden mpi_finalize call. # If you care about maximum performance and don't care about wasted cycles, remove the Iprobe/sleep loop # Empirically, a tight loop with time.sleep(0.001) uses ~1% CPU time.sleep(0.001) job = comm.recv(source=0, tag=1) recvtime = time.time() mpisendtime = midtime-sendtime mpirecvtime = recvtime-midtime if (mpisendtime+mpirecvtime) > 2: logging.warning("MPI send/recv took %s/%s", mpisendtime, mpirecvtime) if job != None: logging.debug("Rank: %s, Starting: %s", procid, job.job_id) summarizejob(job, config, resconf, plugins, preprocs, m, dbif, opts) logging.debug("Rank: %s, Finished: %s", procid, job.job_id) sendtime = time.time() comm.send(procid, dest=0, tag=1) midtime = time.time() list_procs += 1 if opts['dump_proclist'] and (list_procs == 1 or list_procs == 10): # Once all ranks are going, dump the process list for debugging logging.info("Dumping process list") allpinfo = {} for proc in psutil.process_iter(): try: pinfo = proc.as_dict() except psutil.NoSuchProcess: pass else: allpinfo[pinfo['pid']] = pinfo with open("rank-{}_{}.proclist".format(procid, list_procs), 'w') as outfile: json.dump(allpinfo, outfile, indent=2) else: # Got shutdown message break