コード例 #1
0
ファイル: torque.py プロジェクト: wsfreund/python-torque
    def getstatusoutput(self, cmd):
        '''Run cmd and hand back its (status, output) pair.

        When self.host is None the command is executed on this machine
        through commands.getstatusoutput; otherwise it is passed to ssh()
        together with self.host.
        '''
        if self.host is not None:
            result = ssh(cmd, self.host)
        else:
            result = commands.getstatusoutput(cmd)

        # unpack so that anything other than a two-item result fails here
        status, output = result
        return status, output
コード例 #2
0
ファイル: torque.py プロジェクト: jkitchin/python-torque
    def getstatusoutput(self, cmd):
        '''Execute cmd and return a (status, output) tuple.

        The command runs locally via commands.getstatusoutput when
        self.host is None, and through ssh() on self.host otherwise.
        '''
        run_locally = self.host is None
        if run_locally:
            exit_status, text = commands.getstatusoutput(cmd)
            return exit_status, text

        # a host is configured: delegate to the ssh helper
        exit_status, text = ssh(cmd, self.host)
        return exit_status, text
コード例 #3
0
ファイル: torque.py プロジェクト: wsfreund/python-torque
    def qsub(self, jobfiles, QSUB=None, remotedir=None):
        '''
        jobfiles is an iterable that contains all the files needed for the job.
        The job script is jobfiles[0]

        remotedir is the directory to run the job in on the remote server.
        this should be a unique directory for this job because the src files
        will be copied into it,
        and allthe files in it copied back. AND the remotedir will be deleted
        after the job is done

        server is assumed to be the same place as the PBS server host

        QSUB is the command to use to submit the job, including
        options. If it is None, the command is constructed.

        this function is designed to raise Exceptions unless the job is
        finished without
        PBS errors. Then it returns True!
        '''

        jobfile = jobfiles[0]
        rcfile = jobfile + '.rc'

        if QSUB is None:
            QSUB = 'qsub -j oe -l cput=24:00:00,mem=499mb'

        server = self.host

        jobdonefile = jobfile + '.done'
        if os.path.exists(jobdonefile):
            raise JobDone, 'That job is done. delete %s to resubmit it.' % jobdonefile

        remotedirfile = jobfile + '.remotedir'

        if remotedir is None and os.path.exists(remotedirfile):
            remotedir = open(remotedirfile, 'r').readline().strip()
        elif remotedir is None:
            import tempfile
            i, tmpdirname = tempfile.mkstemp(dir='.')
            path, tmpdirname = os.path.split(tmpdirname)

            remotedir = 'tmp/%s' % tmpdirname

            # mkstemp actually makes the file, so we delete it
            os.close(i)
            os.unlink(tmpdirname)
        else:
            remotedir = remotedir  # arg passed into function

        f = open(remotedirfile, 'w')
        f.write(remotedir)
        f.close()

        pushbackfile = jobfile + '.pushback'
        pullbackfile = jobfile + '.pullback'
        qsubfile = jobfile + '.qsubcmd'

        # see if job has been submitted before
        jobid_file = jobfile + '.jobid'
        if os.path.exists(jobid_file):
            f = open(jobid_file, 'r')
            jobid = f.readline().strip()
            f.close()

            self.fastpoll()
            # see if job is in the queue still
            for job in self:
                if job['Job Id'] == jobid:

                    if job['job_state'] == 'Q':
                        if self.verbosity > 1:
                            print '%s still in the queue' % jobid
                        raise JobInQueue, '%s still in the queue' % jobid
                    elif job['job_state'] == 'R':
                        if self.verbosity > 1:
                            print '%s is running' % jobid
                        raise JobRunning, '%s is running' % jobid
                    elif job['job_state'] == 'H':
                        raise JobHold, '%s is in Hold status' % jobid
                    elif job['job_state'] == 'E':
                        raise JobErrorStatus, '%s is in Error status' % jobid
                    elif job['job_state'] == 'C':
                        print '%s is done' % jobid
                    else:
                        raise UnknownJobStatus, '%s is in unknown state: %s' % (
                            jobid, job['job_state'])

            if self.verbosity > 1:
                print '%s is not in the queue anymore' % jobid

            # if you get here, it was not in the queue anymore
            # now we need to copy the results back
            src = '%s:%s/' % (server, remotedir)

            if self.verbosity > 1:
                print 'copying back remote results: ', src

            status, output = rsync(src, '.')

            # we have made it this far, we should now remove the remote
            # directory.
            if self.verbosity > 1:
                print 'removing remote directory: ', remotedir
            cmd = 'rm -fr  %s' % (remotedir)
            status, output = ssh(cmd, server)
            if self.verbosity > 1:
                print 'removing remote directory status: ', status

            # now remove some files we don't need anymore
            os.unlink(jobid_file)

            nodefile = 'pbs.%s.nodes' % jobid
            if os.path.exists(nodefile):
                os.unlink(nodefile)

            if os.path.exists(pushbackfile):
                os.unlink(pushbackfile)

            if os.path.exists(pullbackfile):
                os.unlink(pullbackfile)

            if os.path.exists(remotedirfile):
                os.unlink(remotedirfile)

            if os.path.exists(qsubfile):
                os.unlink(qsubfile)

            # now lets try to check for batch errors like memory
            # exceeded or cput exceeded
            jobnumber, host = jobid.split('.')
            joboutputfile = jobfile[0:15] + '.o%s' % jobnumber

            # this may not exist if user killed job before it started
            # I also assume here that output and error have been joined
            if os.path.exists(joboutputfile):
                f = open(joboutputfile, 'r')

                #now lets hunt for errors in the output file
                for line in f:

                    if '=>> PBS: job killed: mem' in line:
                        raise PBS_MemoryExceeded, line

                    elif '=>> PBS: job killed: cput' in line:
                        raise PBS_CputExceeded, line

                    elif 'Terminated' in line:
                        raise PBS_Terminated, line

                    elif '=>> PBS:' in line:
                        raise PBS_UknownError, line

                    elif 'ERROR: LAM/MPI' in line:
                        for line2 in f:
                            if 'ERROR' in line2:
                                print line2
                            if 'ssh' in line2:
                                print line2
                        raise LAMMPI_Error, line

                    elif 'forrtl' in line or 'SIGSEGV' in line:
                        raise FORTRAN_Error, line

                f.close()

            return True

        if os.environ.get('PBS_DRYRUN', None) is not None:
            print 'Dry run detected. exiting'
            return

        # this job needs to be submitted if you get here
        if self.verbosity > 1:
            print 'Submitting job:'
        destination = '%s:%s' % (server, remotedir)

        #make sure destination directory exists
        status, output = ssh('mkdir -p %s' % remotedir, server)

        #1 copy files to remote system
        status, output = rsync(jobfiles, destination)

        #2 submit job
        cmds = ['cd %s' % remotedir, '%s %s' % (QSUB, jobfile)]

        cmd = string.join(cmds, '; ')

        status, output = ssh(cmd, server)
        if status is not 0:
            print '==================================='
            print output
            print '==================================='
            raise PBS_UnknownError

        # we should save the jobid
        f = open(jobid_file, 'w')
        f.write(output)
        f.close()

        # copy jobid file to remotedir so we can tell on that end
        # what this temp dir is for.
        rsync(jobid_file, destination)

        # get user and hostname to copy results back to
        import platform
        uname = platform.uname()
        hostname = uname[1]
        status, user = commands.getstatusoutput('whoami')

        f = open(qsubfile, 'w')
        f.write('%s %s\n' % (QSUB, jobfile))
        f.close()

        f = open(pullbackfile, 'w')
        f.write('#!/bin/tcsh -x\n')
        f.write('rsync -avz %s:%s/ .\n' % (self.host, remotedir))
        f.write('ssh %s@%s rm -fr %s\n' % (user, self.host, remotedir))
        f.write('#end')
        f.close()
        os.chmod(pullbackfile, 0755)

        f = open(pushbackfile, 'w')
        f.write('#!/bin/tcsh -x\n')
        f.write('rsync -avz . %s@%s:%s\n' % (user, hostname, os.getcwd()))
        f.close()
        os.chmod(pushbackfile, 0755)
        rsync(pushbackfile, destination)

        raise JobSubmitted, output
コード例 #4
0
ファイル: torque.py プロジェクト: jkitchin/python-torque
    def qsub(self,
             jobfiles,
             QSUB=None,
             remotedir=None):

        '''
        jobfiles is an iterable that contains all the files needed for the job.
        The job script is jobfiles[0]

        remotedir is the directory to run the job in on the remote server.
        this should be a unique directory for this job because the src files
        will be copied into it,
        and allthe files in it copied back. AND the remotedir will be deleted
        after the job is done

        server is assumed to be the same place as the PBS server host

        QSUB is the command to use to submit the job, including
        options. If it is None, the command is constructed.

        this function is designed to raise Exceptions unless the job is
        finished without
        PBS errors. Then it returns True!
        '''

        jobfile = jobfiles[0]
        rcfile = jobfile + '.rc'

        if QSUB is None:
            QSUB = 'qsub -j oe -l cput=24:00:00,mem=499mb'

        server = self.host

        jobdonefile = jobfile + '.done'
        if os.path.exists(jobdonefile):
            raise JobDone, 'That job is done. delete %s to resubmit it.' % jobdonefile

        remotedirfile = jobfile + '.remotedir'

        if remotedir is None and os.path.exists(remotedirfile):
            remotedir = open(remotedirfile, 'r').readline().strip()
        elif remotedir is None:
            import tempfile
            i, tmpdirname = tempfile.mkstemp(dir='.')
            path, tmpdirname = os.path.split(tmpdirname)

            remotedir = 'tmp/%s' % tmpdirname

            # mkstemp actually makes the file, so we delete it
            os.close(i)
            os.unlink(tmpdirname)
        else:
            remotedir = remotedir  # arg passed into function

        f = open(remotedirfile, 'w')
        f.write(remotedir)
        f.close()

        pushbackfile = jobfile + '.pushback'
        pullbackfile = jobfile + '.pullback'
        qsubfile = jobfile + '.qsubcmd'

        # see if job has been submitted before
        jobid_file = jobfile + '.jobid'
        if os.path.exists(jobid_file):
            f = open(jobid_file, 'r')
            jobid = f.readline().strip()
            f.close()

            self.fastpoll()
            # see if job is in the queue still
            for job in self:
                if job['Job Id'] == jobid:

                    if job['job_state'] == 'Q':
                        if self.verbosity > 1:
                            print '%s still in the queue' % jobid
                        raise JobInQueue, '%s still in the queue' % jobid
                    elif job['job_state'] == 'R':
                        if self.verbosity > 1:
                            print '%s is running' % jobid
                        raise JobRunning, '%s is running' % jobid
                    elif job['job_state'] == 'H':
                        raise JobHold, '%s is in Hold status' % jobid
                    elif job['job_state'] == 'E':
                        raise JobErrorStatus, '%s is in Error status' % jobid
                    elif job['job_state'] == 'C':
                        print '%s is done' % jobid
                    else:
                        raise UnknownJobStatus, '%s is in unknown state: %s'% (jobid,
                                                                               job['job_state'])

            if self.verbosity > 1:
                print '%s is not in the queue anymore' % jobid

            # if you get here, it was not in the queue anymore
            # now we need to copy the results back
            src = '%s:%s/' % (server, remotedir)

            if self.verbosity > 1:
                print 'copying back remote results: ', src

            status,output = rsync(src, '.')

            # we have made it this far, we should now remove the remote
            # directory.
            if self.verbosity > 1:
                print 'removing remote directory: ', remotedir
            cmd = 'rm -fr  %s' % (remotedir)
            status, output = ssh(cmd, server)
            if self.verbosity > 1:
                print 'removing remote directory status: ', status

            # now remove some files we don't need anymore
            os.unlink(jobid_file)

            nodefile = 'pbs.%s.nodes' % jobid
            if os.path.exists(nodefile):
                os.unlink(nodefile)

            if os.path.exists(pushbackfile):
                os.unlink(pushbackfile)

            if os.path.exists(pullbackfile):
                os.unlink(pullbackfile)

            if os.path.exists(remotedirfile):
                os.unlink(remotedirfile)

            if os.path.exists(qsubfile):
                os.unlink(qsubfile)

            # now lets try to check for batch errors like memory
            # exceeded or cput exceeded
            jobnumber, host = jobid.split('.')
            joboutputfile = jobfile[0:15] + '.o%s' % jobnumber

            # this may not exist if user killed job before it started
            # I also assume here that output and error have been joined
            if os.path.exists(joboutputfile):
                f = open(joboutputfile, 'r')

                #now lets hunt for errors in the output file
                for line in f:

                    if '=>> PBS: job killed: mem' in line:
                        raise PBS_MemoryExceeded, line

                    elif '=>> PBS: job killed: cput' in line:
                        raise PBS_CputExceeded,line

                    elif 'Terminated' in line:
                        raise PBS_Terminated,line

                    elif '=>> PBS:' in line:
                        raise PBS_UknownError, line

                    elif 'ERROR: LAM/MPI' in line:
                        for line2 in f:
                            if 'ERROR' in line2:
                                print line2
                            if 'ssh' in line2:
                                print line2
                        raise LAMMPI_Error, line

                    elif 'forrtl' in line or 'SIGSEGV' in line:
                        raise FORTRAN_Error, line

                f.close()

            return True

        if os.environ.get('PBS_DRYRUN', None) is not None:
            print 'Dry run detected. exiting'
            return

        # this job needs to be submitted if you get here
        if self.verbosity > 1:
            print 'Submitting job:'
        destination = '%s:%s' % (server, remotedir)

        #make sure destination directory exists
        status,output = ssh('mkdir -p %s' % remotedir, server)

        #1 copy files to remote system
        status,output = rsync(jobfiles, destination)

        #2 submit job
        cmds = ['cd %s' % remotedir,
                '%s %s' % (QSUB, jobfile)]

        cmd = string.join(cmds, '; ')

        status, output = ssh(cmd, server)
        if status is not 0:
            print '==================================='
            print output
            print '==================================='
            raise PBS_UnknownError

        # we should save the jobid
        f = open(jobid_file, 'w')
        f.write(output)
        f.close()

        # copy jobid file to remotedir so we can tell on that end
        # what this temp dir is for.
        rsync(jobid_file, destination)

        # get user and hostname to copy results back to
        import platform
        uname = platform.uname()
        hostname = uname[1]
        status, user = commands.getstatusoutput('whoami')

        f = open(qsubfile, 'w')
        f.write('%s %s\n' % (QSUB, jobfile))
        f.close()


        f = open(pullbackfile, 'w')
        f.write('#!/bin/tcsh -x\n')
        f.write('rsync -avz %s:%s/ .\n' % (self.host, remotedir))
        f.write('ssh %s@%s rm -fr %s\n' % (user, self.host, remotedir))
        f.write('#end')
        f.close()
        os.chmod(pullbackfile, 0755)


        f = open(pushbackfile, 'w')
        f.write('#!/bin/tcsh -x\n')
        f.write('rsync -avz . %s@%s:%s\n' % (user, hostname, os.getcwd()))
        f.close()
        os.chmod(pushbackfile, 0755)
        rsync(pushbackfile, destination)

        raise JobSubmitted, output