def test_wait(self):
    """pbs.qwait should wait for a PBS job to finish running."""
    if os.path.exists(self.temp_output_filename):
        os.remove(self.temp_output_filename)
    pbs_id = pbs.qsub(self.pbs_script_filename)
    pbs.qwait(pbs_id)
    # This triggers the panfs file system to make the file appear.
    os.system('ls > /dev/null')
    assert os.path.exists(self.temp_output_filename), \
        "pbs.qwait returned, but the expected output does not yet exist."
def submit_files_until_done(filenames, wait_for_all=False, delay_check=0.5,
                            sleep_seconds=60 * 5, quiet=False,
                            fail_when_max=False, retry_on_failure=True):
    global max_submissions
    submitted_ids = []
    num_to_submit = len(filenames)
    while filenames:
        num_submitted = len(pbs.qstat(user=os.environ['USER']))
        if num_submitted < max_submissions:
            if os.path.exists(filenames[0]):
                try:
                    job_id = pbs.qsub(filenames[0], verbose=not quiet)
                    if delay_check:
                        time.sleep(delay_check)
                    # If this doesn't throw, then it was submitted successfully.
                    pbs.qstat(job_id=job_id)
                    if not quiet:
                        print 'Submitted %s as "%s" at %s (%s/%s left to submit)' % (
                            filenames[0], job_id, time.asctime(),
                            len(filenames[1:]), num_to_submit)
                    filenames = filenames[1:]
                    submitted_ids.append(job_id)
                    num_submitted += 1
                    if not quiet:
                        print 'I think submitted %d/%d' % (num_submitted, max_submissions)
                    sys.stderr.flush()
                    sys.stdout.flush()
                except pbs.PBSUtilError:
                    traceback.print_exc()
                    if not quiet:
                        print 'Failed to submit %s at %s (%s left to submit)' % (
                            filenames[0], time.asctime(), len(filenames[1:]))
                    sys.stderr.flush()
                    sys.stdout.flush()
                    if not retry_on_failure:
                        raise QSubFailure()
                    # Maybe we saturated the queue; back off before retrying.
                    time.sleep(max(int(round(sleep_seconds / 2)), 1))
            else:
                if not quiet:
                    print 'ERROR: Cannot submit %s because it does not exist.' % filenames[0]
                sys.stderr.flush()
                sys.stdout.flush()
                filenames = filenames[1:]
        else:
            if fail_when_max:
                raise ReachedMax()
            sys.stdout.write('Queue is currently full.')
            sys.stdout.flush()
            time.sleep(sleep_seconds)
    if wait_for_all:
        for job_id in submitted_ids:
            pbs.qwait(job_id)
    return submitted_ids
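
# Usage sketch (an assumption, not part of the original module): drives
# submit_files_until_done() with a hypothetical list of PBS scripts and a
# hypothetical value for the module-level max_submissions cap.
def example_submit_batch():
    global max_submissions
    max_submissions = 20  # hypothetical per-user queue limit
    # Hypothetical script names; in practice these are pre-generated PBS scripts.
    scripts = ['job_%03d.sh' % i for i in range(10)]
    # Block until every submitted job has left the queue.
    job_ids = submit_files_until_done(scripts, wait_for_all=True)
    print 'All %d jobs completed.' % len(job_ids)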
def startjob(self, jobnr):
    print "starting job"
    runfile = "test.sh"
    if not self.testrun:
        runfile = self.runLine
    scriptname = "run.sh"
    path = self.gettemppath(jobnr)
    self.startedcalcs += 1
    # Zero-pad the run counter to the width of the total calculation count.
    decimalcount = int(math.floor(math.log10(self.calculations)))
    fullname = self.name + ("_%" + str(decimalcount + 1) + "d/%d") % (
        self.startedcalcs, self.calculations)
    fullname = fullname.replace(" ", "0")
    print "Name: " + fullname + " in folder: " + path
    script = pbs.makescript(path, runfile, self.nodes, self.ppn, self.queue,
                            fullname, self.email, self.outpath,
                            self.memorymultiplikator, self.hours,
                            self.pbsparameters)
    scriptfile = path + scriptname
    files.writefile(scriptfile, script)
    jobid = pbs.qsub(scriptfile)
    open(self.jobidfile, "a").write(jobid + "\n")
    return jobid
def run(student, db, datasets, hosts, nproc=1, nice=0, output_path='.',
        setup=None, student_args=None, use_qsub=False, qsub_queue='medium',
        qsub_name_suffix=None, dry_run=False, separate_student_output=False,
        warnings_as_errors=False, **kwargs):
    if not kwargs:
        args = ''
    else:
        args = ' '.join(['--%s %s' % (key, value)
                         for key, value in kwargs.items()
                         if value is not None]) + ' '
    if qsub_name_suffix is None:
        qsub_name_suffix = ''
    elif not qsub_name_suffix.startswith('_'):
        qsub_name_suffix = '_' + qsub_name_suffix
    database = Database(db)
    output_path = os.path.normpath(output_path)
    if separate_student_output and os.path.basename(output_path) != student:
        output_path = os.path.join(output_path, os.path.splitext(student)[0])
    if not os.path.exists(output_path):
        if dry_run:
            print "mkdir -p %s" % output_path
        else:
            mkdir_p(output_path)
    python_flags = ''
    if warnings_as_errors:
        python_flags = '-W error'
    CMD = "python %s run --output-path %s -s %s -n %%d --db %s --nice %d %s%%s" % (
        python_flags, output_path, student, db, nice, args)
    if setup is not None:
        CMD = "%s && %s" % (setup, CMD)
    CWD = os.getcwd()
    datasets = datasets[:]
    proc_cmds = []
    while len(datasets) > 0:
        ds = datasets.pop(0)
        output_name = os.path.splitext(student)[0] + '.' + ds
        suffix = kwargs.get('suffix', None)
        if suffix:
            output_name += '_%s' % suffix
        output_name += '.root'
        output_name = os.path.join(output_path, output_name)
        if os.path.exists(output_name):
            print "Output %s already exists. Please delete it and resubmit." % (
                output_name)
            continue
        try:
            files = database[ds].files
        except KeyError:
            print "dataset %s not in database" % ds
            continue
        # determine actual number of required CPU cores
        nproc_actual = min(nproc, len(files))
        cmd = CMD % (nproc_actual, ds)
        if student_args:
            cmd = '%s %s' % (cmd, ' '.join(student_args))
        cmd = "cd %s && %s" % (CWD, cmd)
        if use_qsub:
            # use the batch system
            qsub(cmd,
                 queue=qsub_queue,
                 ppn=nproc_actual,
                 name=os.path.splitext(student)[0] + '.' + ds + qsub_name_suffix,
                 stderr_path=output_path,
                 stdout_path=output_path,
                 dry_run=dry_run)
        else:
            # use simple ssh
            print cmd
            if not dry_run:
                proc_cmds.append(cmd)
    if not use_qsub and not dry_run:
        # use simple ssh with basic load balancing
        hosts = [Host(host) for host in hosts]
        procs = []
        while True:
            active = mp.active_children()
            while len(active) < (2 * len(hosts)) and len(proc_cmds) > 0:
                # Dispatch to the least-loaded host first.
                hosts.sort()
                host = hosts[0]
                cmd = "ssh %s '%s'" % (host.name, proc_cmds.pop(0))
                proc = mp.Process(target=run_helper, args=(cmd,))
                proc.start()
                procs.append(proc)
                host.njobs += 1
                # active_children() joins finished procs
                active = mp.active_children()
            if len(proc_cmds) == 0 and len(active) == 0:
                break
            time.sleep(10)
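
# Usage sketch (an assumption): a dry run of run() over two hypothetical
# datasets, printing the qsub commands instead of submitting anything. The
# script, database, dataset, and host names are hypothetical.
def example_dry_run():
    run(student='analysis.py',        # hypothetical student script
        db='datasets.yml',            # hypothetical dataset database
        datasets=['data_A', 'data_B'],
        hosts=['node01', 'node02'],   # only used for the ssh fallback
        nproc=4,
        use_qsub=True,
        qsub_queue='medium',
        dry_run=True)                 # print commands; submit nothing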
namehack = NameHack()
for job in joblist:
    jobdir = JobDir(joinpath(conf.rootdir, job.name))
    if depend:
        cptdir = JobDir(joinpath(conf.rootdir, job.checkpoint.name))
        cptjob = cptdir.readval('.pbs_jobid')
    if not onlyecho:
        jobdir.create()
    print 'Job name: %s' % job.name
    print 'Job directory: %s' % jobdir
    qsub = pbs.qsub()
    qsub.pbshost = 'simpool.eecs.umich.edu'
    qsub.stdout = jobdir.file('jobout')
    qsub.name = job.name[:15]
    qsub.join = True
    qsub.node_type = node_type
    qsub.env['ROOTDIR'] = conf.rootdir
    qsub.env['JOBNAME'] = job.name
    if depend:
        qsub.afterok = cptjob
    if queue:
        qsub.queue = queue
    qsub.build(joinpath(progpath, 'job.py'))
    if verbose:
        print 'PBS Command: %s' % qsub.command
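
# Usage sketch (an assumption): the minimal set of pbs.qsub() attributes from
# the loop above, submitting a single script without the job-directory
# machinery. Only attributes that appear above are used; the job name, queue,
# and script path are hypothetical.
def example_single_qsub():
    qsub = pbs.qsub()
    qsub.pbshost = 'simpool.eecs.umich.edu'
    qsub.name = 'example-job'[:15]   # PBS truncates job names to 15 characters
    qsub.join = True                 # merge stderr into stdout
    qsub.queue = 'batch'             # hypothetical queue name
    qsub.build('job.py')             # hypothetical script path
    print 'PBS Command: %s' % qsub.command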
def test_qsub_submits(self):
    """Check that qsub successfully submits a script."""
    pbs_id = pbs.qsub(self.pbs_script_filename)
    assert pbs.qstat(job_id=pbs_id), \
        "failed to find stats for %s which was just submitted." % pbs_id
def test_qsub(self):
    """pbs.qsub runs without error."""
    pbs.qsub(self.pbs_script_filename)
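
# Usage sketch (an assumption): the end-to-end pattern the tests above
# exercise, using only calls that appear in this section: qsub, qstat, qwait.
def example_submit_and_wait(script_filename):
    job_id = pbs.qsub(script_filename)   # submit the PBS script
    assert pbs.qstat(job_id=job_id)      # verify it is visible in the queue
    pbs.qwait(job_id)                    # block until the job finishes
    return job_id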