Example #1
0
 def test_wait(self):
     """After pbs.qwait returns, the job's output file must be present."""
     output_file = self.temp_output_filename

     # Start from a clean slate so a stale output file cannot satisfy the check.
     if os.path.exists(output_file):
         os.remove(output_file)

     job_id = pbs.qsub(self.pbs_script_filename)
     pbs.qwait(job_id)

     # This triggers the panfs file system to make the file appear.
     os.system('ls > /dev/null')

     output_present = os.path.exists(output_file)
     assert output_present, "pbs.qwait returned, but the expected output does not yet exist."
Example #2
0
def submit_files_until_done(filenames, wait_for_all=False, delay_check=0.5, sleep_seconds=60 * 5,
                            quiet=False,
                            fail_when_max=False,
                            retry_on_failure=True):
    """Submit each script in `filenames` via pbs.qsub, throttled by the queue size.

    Before every attempt the number of jobs reported by pbs.qstat for
    $USER is compared with the module-level `max_submissions`.

    Args:
        filenames: sequence of script paths; it is consumed front to back
            by re-slicing, so the caller's object is not mutated.
        wait_for_all: when true, pbs.qwait is called on every submitted
            job id before returning.
        delay_check: seconds to sleep between qsub and the confirming
            qstat; a falsy value skips the sleep.
        sleep_seconds: seconds to sleep when the queue is full; the
            expression in the failure branch sleeps about half of this
            (at least 1 second) after a failed submission.
        quiet: suppress the progress/error prints and qsub verbosity.
        fail_when_max: raise ReachedMax instead of sleeping when the
            queue is full.
        retry_on_failure: when false, raise QSubFailure on the first
            pbs.PBSUtilError instead of retrying the same file.

    Returns:
        List of job ids, in submission order.

    Raises:
        ReachedMax: queue is full and `fail_when_max` is set.
        QSubFailure: a submission failed and `retry_on_failure` is unset.
    """
    submitted_ids = []
    num_to_submit = len(filenames)

    while filenames:
        in_queue = len(pbs.qstat(user=os.environ['USER']))

        # Queue saturated: either give up or back off and look again.
        if in_queue >= max_submissions:
            if fail_when_max:
                raise ReachedMax()
            sys.stdout.write('Queue is currently full.')
            sys.stdout.flush()
            time.sleep(sleep_seconds)
            continue

        script = filenames[0]
        remaining = filenames[1:]

        # A missing script is reported (unless quiet) and dropped.
        if not os.path.exists(script):
            if not quiet:
                print('ERROR: Cannot submit %s because it does not exist.' % script)
            sys.stderr.flush()
            sys.stdout.flush()
            filenames = remaining
            continue

        try:
            job_id = pbs.qsub(script, verbose=not quiet)
            if delay_check:
                time.sleep(delay_check)
            # If this doesn't throw, then it was submitted successfully
            pbs.qstat(job_id=job_id)
        except pbs.PBSUtilError:
            traceback.print_exc()
            if not quiet:
                print('Failed to submit %s at %s  (%s left to submit)' % (
                    script, time.asctime(), len(remaining)))
            sys.stderr.flush()
            sys.stdout.flush()

            if not retry_on_failure:
                raise QSubFailure()

            # Maybe we saturated the queue; the same file is retried afterwards.
            time.sleep(max(int(round(sleep_seconds/2)), 1))
        else:
            if not quiet:
                print('Submitted %s as "%s" at %s  (%s/%s left to submit)' % (
                    script, job_id, time.asctime(), len(remaining), num_to_submit))
            filenames = remaining
            submitted_ids.append(job_id)
            if not quiet:
                print('I think submitted %d/%d' % (in_queue + 1, max_submissions))
            sys.stderr.flush()
            sys.stdout.flush()

    if wait_for_all:
        for finished_id in submitted_ids:
            pbs.qwait(finished_id)
    return submitted_ids
Example #3
0
 def startjob(self, jobnr):
     """Write a PBS run script for job `jobnr`, submit it and log its id.

     The script is generated by pbs.makescript, written to
     `<temp path>run.sh`, and submitted with pbs.qsub. The resulting job
     id is appended, one per line, to `self.jobidfile`.

     Side effects: increments `self.startedcalcs`.

     Returns:
         The job id returned by pbs.qsub.
     """
     print("starte job")

     # In a test run a fixed script is executed instead of the real run line.
     runfile = "test.sh"
     if not self.testrun:
         runfile = self.runLine
     scriptname = "run.sh"
     path = self.gettemppath(jobnr)
     self.startedcalcs += 1

     # Build "<name>_<started>/<total>", with the running counter padded to
     # the digit count of the total. Padding is produced by a width-only
     # format followed by a space -> "0" replacement on the whole string,
     # so spaces inside self.name are replaced as well.
     decimalcount = int(math.floor(math.log10(self.calculations)))
     fullname = self.name + ("_%" + str(decimalcount + 1) + "d/%d") % (
         self.startedcalcs, self.calculations)
     fullname = fullname.replace(" ", "0")
     print("Name: " + fullname + " in folder: " + path)

     script = pbs.makescript(path, runfile, self.nodes, self.ppn, self.queue,
                             fullname, self.email, self.outpath,
                             self.memorymultiplikator, self.hours,
                             self.pbsparameters)
     # path is concatenated directly, so it is expected to end with a separator.
     scriptfile = path + scriptname
     files.writefile(scriptfile, script)
     jobid = pbs.qsub(scriptfile)

     # Use a context manager so the log file is flushed and closed
     # deterministically instead of leaking the handle.
     with open(self.jobidfile, "a") as jobid_log:
         jobid_log.write(jobid + "\n")
     return jobid
Example #4
0
def run(student,
        db,
        datasets,
        hosts,
        nproc=1,
        nice=0,
        output_path='.',
        setup=None,
        student_args=None,
        use_qsub=False,
        qsub_queue='medium',
        qsub_name_suffix=None,
        dry_run=False,
        separate_student_output=False,
        warnings_as_errors=False,
        **kwargs):
    """Launch one `python ... run` command per dataset, via qsub or ssh.

    For every dataset a shell command is built from the arguments. A
    dataset is skipped (with a message) when its expected output file
    `<student-without-ext>.<dataset>[_<suffix>].root` already exists in
    the output directory, or when it is missing from the database.

    Args:
        student: student file name; passed to `-s` and, without its
            extension, used for output and job names.
        db: database identifier; passed to `Database(...)` and `--db`.
        datasets: dataset names to process (not mutated).
        hosts: host names used for the ssh mode.
        nproc: upper bound for `-n`; capped at the dataset's file count.
        nice: value passed to `--nice`.
        output_path: output directory (created if missing, unless dry_run).
        setup: optional shell snippet run before the command (`setup && cmd`).
        student_args: optional list of extra arguments appended to the command.
        use_qsub: submit through `qsub(...)` instead of ssh.
        qsub_queue: queue name forwarded to `qsub`.
        qsub_name_suffix: optional job-name suffix; a leading `_` is added
            if absent.
        dry_run: print what would be done; nothing is created or started
            locally (qsub receives `dry_run` and decides itself).
        separate_student_output: append the student name (without
            extension) to `output_path`.
        warnings_as_errors: run python with `-W error`.
        **kwargs: forwarded as `--key value` options; entries whose value
            is None are omitted. `suffix` also affects the output name.
    """
    if not kwargs:
        args = ''
    else:
        args = ' '.join([
            '--%s %s' % (key, value)
            for key, value in kwargs.items() if value is not None
        ]) + ' '

    if qsub_name_suffix is None:
        qsub_name_suffix = ''
    elif not qsub_name_suffix.startswith('_'):
        qsub_name_suffix = '_' + qsub_name_suffix

    database = Database(db)
    print(database)

    # Student name without its extension. str.strip('.py') must not be used
    # for this: it removes any of the characters '.', 'p', 'y' from both
    # ends (e.g. 'python_study.py' -> 'thon_stud').
    student_name = os.path.splitext(student)[0]

    output_path = os.path.normpath(output_path)
    if separate_student_output and os.path.basename(output_path) != student:
        output_path = os.path.join(output_path, student_name)
    if not os.path.exists(output_path):
        if dry_run:
            print("mkdir -p %s" % output_path)
        else:
            mkdir_p(output_path)

    python_flags = ''
    if warnings_as_errors:
        python_flags = '-W error'

    # %%d / %%s survive this formatting pass and are filled per dataset
    # with the process count and the dataset name.
    CMD = "python %s run --output-path %s -s %s -n %%d --db %s --nice %d %s%%s" % (
        python_flags, output_path, student, db, nice, args)
    if setup is not None:
        CMD = "%s && %s" % (setup, CMD)
    CWD = os.getcwd()

    suffix = kwargs.get('suffix', None)

    proc_cmds = []
    # Iterate over a copy so the caller's list is left untouched.
    for ds in list(datasets):
        output_name = student_name + '.' + ds
        if suffix:
            output_name += '_%s' % suffix
        output_name += '.root'
        output_name = os.path.join(output_path, output_name)
        if os.path.exists(output_name):
            print("Output %s already exists. Please delete it and resubmit." % (
                output_name))
            continue

        try:
            files = database[ds].files
        except KeyError:
            print("dataset %s not in database" % ds)
            continue

        # determine actual number of required CPU cores
        nproc_actual = min(nproc, len(files))
        cmd = CMD % (nproc_actual, ds)
        if student_args:
            cmd = '%s %s' % (cmd, ' '.join(student_args))
        cmd = "cd %s && %s" % (CWD, cmd)

        if use_qsub:  # use the batch system
            qsub(cmd,
                 queue=qsub_queue,
                 ppn=nproc_actual,
                 name=student_name + '.' + ds + qsub_name_suffix,
                 stderr_path=output_path,
                 stdout_path=output_path,
                 dry_run=dry_run)

        else:  # use simple ssh
            print(cmd)
            if not dry_run:
                proc_cmds.append(cmd)

    if not use_qsub and not dry_run:
        # use simple ssh with basic load balancing
        hosts = [Host(host) for host in hosts]
        procs = []  # handles of every process started here
        while True:
            active = mp.active_children()
            # Keep at most 2 * len(hosts) child processes alive at once.
            while len(active) < (2 * len(hosts)) and len(proc_cmds) > 0:
                # After sorting, hosts[0] is the host Host's ordering ranks
                # first. NOTE(review): njobs is only ever incremented in
                # this function - confirm whether Host updates it itself.
                hosts.sort()
                host = hosts[0]
                cmd = "ssh %s '%s'" % (host.name, proc_cmds.pop(0))
                proc = mp.Process(target=run_helper, args=(cmd, ))
                proc.start()
                procs.append(proc)
                host.njobs += 1
                # active_children() joins finished procs
                active = mp.active_children()
            if len(proc_cmds) == 0 and len(active) == 0:
                break
            # Poll every 10 seconds for free slots / completion.
            time.sleep(10)
Example #5
0
def run(student,
        db,
        datasets,
        hosts,
        nproc=1,
        nice=0,
        output_path='.',
        setup=None,
        student_args=None,
        use_qsub=False,
        qsub_queue='medium',
        qsub_name_suffix=None,
        dry_run=False,
        separate_student_output=False,
        warnings_as_errors=False,
        **kwargs):
    """Launch one `python ... run` command per dataset, via qsub or ssh.

    For every dataset a shell command is built from the arguments. A
    dataset is skipped (with a message) when its expected output file
    `<student-without-ext>.<dataset>[_<suffix>].root` already exists in
    the output directory, or when it is missing from the database.

    Args:
        student: student file name; passed to `-s` and, without its
            extension, used for output and job names.
        db: database identifier; passed to `Database(...)` and `--db`.
        datasets: dataset names to process (not mutated).
        hosts: host names used for the ssh mode.
        nproc: upper bound for `-n`; capped at the dataset's file count.
        nice: value passed to `--nice`.
        output_path: output directory (created if missing, unless dry_run).
        setup: optional shell snippet run before the command (`setup && cmd`).
        student_args: optional list of extra arguments appended to the command.
        use_qsub: submit through `qsub(...)` instead of ssh.
        qsub_queue: queue name forwarded to `qsub`.
        qsub_name_suffix: optional job-name suffix; a leading `_` is added
            if absent.
        dry_run: print what would be done; nothing is created or started
            locally (qsub receives `dry_run` and decides itself).
        separate_student_output: append the student name (without
            extension) to `output_path`.
        warnings_as_errors: run python with `-W error`.
        **kwargs: forwarded as `--key value` options; entries whose value
            is None are omitted. `suffix` also affects the output name.
    """
    if not kwargs:
        args = ''
    else:
        args = ' '.join(['--%s %s' % (key, value)
            for key, value in kwargs.items() if value is not None]) + ' '

    if qsub_name_suffix is None:
        qsub_name_suffix = ''
    elif not qsub_name_suffix.startswith('_'):
        qsub_name_suffix = '_' + qsub_name_suffix

    database = Database(db)

    # Student name without its extension. str.strip('.py') must not be used
    # for this: it removes any of the characters '.', 'p', 'y' from both
    # ends (e.g. 'python_study.py' -> 'thon_stud').
    student_name = os.path.splitext(student)[0]

    output_path = os.path.normpath(output_path)
    if separate_student_output and os.path.basename(output_path) != student:
        output_path = os.path.join(output_path, student_name)
    if not os.path.exists(output_path):
        if dry_run:
            print("mkdir -p %s" % output_path)
        else:
            mkdir_p(output_path)

    python_flags = ''
    if warnings_as_errors:
        python_flags = '-W error'

    # %%d / %%s survive this formatting pass and are filled per dataset
    # with the process count and the dataset name.
    CMD = "python %s run --output-path %s -s %s -n %%d --db %s --nice %d %s%%s" % (
           python_flags, output_path, student, db, nice, args)
    if setup is not None:
        CMD = "%s && %s" % (setup, CMD)
    CWD = os.getcwd()

    suffix = kwargs.get('suffix', None)

    proc_cmds = []
    # Iterate over a copy so the caller's list is left untouched.
    for ds in list(datasets):
        output_name = student_name + '.' + ds
        if suffix:
            output_name += '_%s' % suffix
        output_name += '.root'
        output_name = os.path.join(output_path, output_name)
        if os.path.exists(output_name):
            print("Output %s already exists. Please delete it and resubmit." % (
                output_name))
            continue

        try:
            files = database[ds].files
        except KeyError:
            print("dataset %s not in database" % ds)
            continue

        # determine actual number of required CPU cores
        nproc_actual = min(nproc, len(files))
        cmd = CMD % (nproc_actual, ds)
        if student_args:
            cmd = '%s %s' % (cmd, ' '.join(student_args))
        cmd = "cd %s && %s" % (CWD, cmd)

        if use_qsub: # use the batch system
            qsub(cmd,
                 queue=qsub_queue,
                 ppn=nproc_actual,
                 name=student_name + '.' + ds + qsub_name_suffix,
                 stderr_path=output_path,
                 stdout_path=output_path,
                 dry_run=dry_run)

        else: # use simple ssh
            print(cmd)
            if not dry_run:
                proc_cmds.append(cmd)

    if not use_qsub and not dry_run:
        # use simple ssh with basic load balancing
        hosts = [Host(host) for host in hosts]
        procs = []  # handles of every process started here
        while True:
            active = mp.active_children()
            # Keep at most 2 * len(hosts) child processes alive at once.
            while len(active) < (2 * len(hosts)) and len(proc_cmds) > 0:
                # After sorting, hosts[0] is the host Host's ordering ranks
                # first. NOTE(review): njobs is only ever incremented in
                # this function - confirm whether Host updates it itself.
                hosts.sort()
                host = hosts[0]
                cmd = "ssh %s '%s'" % (host.name, proc_cmds.pop(0))
                proc = mp.Process(target=run_helper, args=(cmd,))
                proc.start()
                procs.append(proc)
                host.njobs += 1
                # active_children() joins finished procs
                active = mp.active_children()
            if len(proc_cmds) == 0 and len(active) == 0:
                break
            # Poll every 10 seconds for free slots / completion.
            time.sleep(10)
Example #6
0
File: send.py Project: AMDmi3/gem5
# NameHack is defined outside this fragment; the instance is not referenced
# again in the visible code.
namehack = NameHack()

# Build one PBS qsub command per job in joblist.
for job in joblist:
    # Per-job directory: <conf.rootdir>/<job.name>.
    jobdir = JobDir(joinpath(conf.rootdir, job.name))
    if depend:
        # Read the value stored under '.pbs_jobid' in the checkpoint job's
        # directory; it is used below as this job's `afterok` dependency.
        cptdir = JobDir(joinpath(conf.rootdir, job.checkpoint.name))
        cptjob = cptdir.readval('.pbs_jobid')

    # With onlyecho set, the job directory is not created.
    if not onlyecho:
        jobdir.create()

    print 'Job name:       %s' % job.name
    print 'Job directory:  %s' % jobdir

    # Configure the qsub object attribute by attribute, then build the command.
    qsub = pbs.qsub()
    qsub.pbshost = 'simpool.eecs.umich.edu'
    qsub.stdout = jobdir.file('jobout')
    # Only the first 15 characters of the job name are used
    # (presumably a PBS job-name length limit - TODO confirm).
    qsub.name = job.name[:15]
    qsub.join = True  # presumably merges stderr into stdout - TODO confirm
    qsub.node_type = node_type
    # Variables handed to the job through qsub.env.
    qsub.env['ROOTDIR'] = conf.rootdir
    qsub.env['JOBNAME'] = job.name
    if depend:
        # cptjob was set in the `if depend:` branch at the top of this iteration.
        qsub.afterok = cptjob
    if queue:
        qsub.queue = queue
    qsub.build(joinpath(progpath, 'job.py'))

    if verbose:
        print 'PBS Command:    %s' % qsub.command
Example #7
0
# NameHack is defined outside this fragment; the instance is not referenced
# again in the visible code.
namehack = NameHack()

# Build one PBS qsub command per job in joblist.
for job in joblist:
    # Per-job directory: <conf.rootdir>/<job.name>.
    jobdir = JobDir(joinpath(conf.rootdir, job.name))
    if depend:
        # Read the value stored under '.pbs_jobid' in the checkpoint job's
        # directory; it is used below as this job's `afterok` dependency.
        cptdir = JobDir(joinpath(conf.rootdir, job.checkpoint.name))
        cptjob = cptdir.readval('.pbs_jobid')

    # With onlyecho set, the job directory is not created.
    if not onlyecho:
        jobdir.create()

    print 'Job name:       %s' % job.name
    print 'Job directory:  %s' % jobdir

    # Configure the qsub object attribute by attribute, then build the command.
    qsub = pbs.qsub()
    qsub.pbshost = 'simpool.eecs.umich.edu'
    qsub.stdout = jobdir.file('jobout')
    # Only the first 15 characters of the job name are used
    # (presumably a PBS job-name length limit - TODO confirm).
    qsub.name = job.name[:15]
    qsub.join = True  # presumably merges stderr into stdout - TODO confirm
    qsub.node_type = node_type
    # Variables handed to the job through qsub.env.
    qsub.env['ROOTDIR'] = conf.rootdir
    qsub.env['JOBNAME'] = job.name
    if depend:
        # cptjob was set in the `if depend:` branch at the top of this iteration.
        qsub.afterok = cptjob
    if queue:
        qsub.queue = queue
    qsub.build(joinpath(progpath, 'job.py'))

    if verbose:
        print 'PBS Command:    %s' % qsub.command
Example #8
0
 def test_qsub_submits(self):
     """A script submitted with pbs.qsub must afterwards be reported by pbs.qstat."""
     submitted_id = pbs.qsub(self.pbs_script_filename)
     # A falsy qstat result means the job just submitted could not be found.
     job_stats = pbs.qstat(job_id=submitted_id)
     assert job_stats, "failed to find stats for %s which was just submitted." % submitted_id
Example #9
0
 def test_qsub(self):
     """Smoke test: submitting the fixture script through pbs.qsub must not raise."""
     script_path = self.pbs_script_filename
     pbs.qsub(script_path)