Ejemplo n.º 1
0
Archivo: job.py Proyecto: ebreitmo/bolt
    def getWallTime(self, resource):
        """Get the maximum runtime of the job in the correct format for the
           specified resource, either 'hms', 'hours', or 'seconds'. The time
           is stored internally as hours.

           Arguments:
             Resource  resource The resource to use for the time format
                                   option.

           Returns:
             str       time     The time in the correct format for the 
                                   specified resource.
        """
        timeFormat = ""
        time = ""
        if self.isParallel:
            timeFormat = resource.parallelTimeFormat
        else:
            timeFormat = resource.serialTimeFormat

        if timeFormat == 'hms':
            t = float(self.__wallTime)
            h = int(t)
            secs = (t - float(h))*3600
            m = int(secs/60)
            s = int(secs - m*60)
            time = "{0}:{1}:{2}".format(h,m,s)
        elif timeFormat == 'hours':
            time = str(int(math.ceil(float(self.__wallTime))))
        elif timeFormat == 'seconds':
            time = str(int(float(self.__wallTime)*3600))
        else:
            error.handleError("Unknown time format specified ({0}) for resource {1}.\n".format(timeFormat, resource.name), 1)
        return time
Ejemplo n.º 2
0
Archivo: job.py Proyecto: ebreitmo/bolt
    def writeSerialJob(self, batch, resource, code, scriptFile):
        """This function writes out a serial job script for the specified
           resource. If errors are encountered then an error message is
           printed and the program exits.

           Arguments:
              Batch    batch      Batch system to use
              Resource resource   Resource to use
              Code     code       Code to use
              str      scriptFile The name of the script file to write
        """

        # Useful variables
        batchPre = batch.optionID

        # Does the specified resource allow serial jobs?
        if not resource.serialJobs:
            error.handleError("Resource: {0} does not support serial jobs.".format(resource.name))

        # The shell line
        text = resource.shell + "\n"
        scriptFile.write(text)

        scriptFile.write("#\n# Serial script produced by bolt\n")
        scriptFile.write("#        Resource: {0} ({1})\n".format(resource.name, resource.arch))
        scriptFile.write("#    Batch system: {0}\n#\n".format(batch.name))
        scriptFile.write("# bolt is written by EPCC (http://www.epcc.ed.ac.uk)\n#\n")

        # Get the batch options
        text = batch.getOptionLines(False, self.name, self.queueName, \
                                    self.getWallTime(resource), self.accountID)
        scriptFile.write(text)

        # Get any further options from resource configuration
        scriptFile.write(resource.jobOptions)

        # Script preambles: resource -> batch -> code -> job
        if resource.parallelScriptPreamble != ("" or None):
            scriptFile.write(resource.parallelScriptPreamble + "\n")
        if batch.parallelScriptPreamble != ("" or None):
            scriptFile.write(batch.parallelScriptPreamble + "\n")
        if code is not None:
            if code.preamble is not None: scriptFile.write(code.postamble + "\n")
        if self.scriptPreamble != ("" or None):
            scriptFile.write(self.scriptPreamble + "\n")

        # Serial run line
        scriptFile.write("# Run the serial program\n")
        scriptFile.write(self.jobCommand + "\n")

        # Script postambles: job -> code -> batch -> resource
        if self.scriptPostamble != ("" or None):
            scriptFile.write(self.scriptPostamble + "\n")
        if code is not None:
            if code.postamble is not None: scriptFile.write(code.postamble + "\n")
        if batch.parallelScriptPostamble != ("" or None):
            scriptFile.write(batch.parallelScriptPostamble + "\n")
        if resource.parallelScriptPostamble != ("" or None):
            scriptFile.write(resource.parallelScriptPostamble + "\n")
Ejemplo n.º 3
0
Archivo: job.py Proyecto: ebreitmo/bolt
    def setThreads(self, threads):
        """Set the number of shared-memory threads per parallel tasks.
        Checks that an integer number of threads per task are requested
        and exits with an error if not.

        Arguments:
           int threads  The number of threads per parallel task.
        """
        # Check we have an integer number of tasks
        if re.search("^[0-9]+$", str(threads)) is not None:
            self.__threads = int(threads)
        else:
            # Something elsethrow an error
           error.handleError("Non-numeric number of threads per task specified ({0}).\n".format(threads))
Ejemplo n.º 4
0
Archivo: job.py Proyecto: ebreitmo/bolt
 def setTasksPerNode(self, tasks):
     """Set the number of parallel tasks per node. Checks that an integer
        number of tasks per node are requested and exits with an error if
        not.
      
        Arguments:
          str tasks   The number of parallel tasks per node
     """
     # Check we have an integer number of tasks
     if re.search("^[0-9]+$", str(tasks)) is not None:
         self.__pTasksPerNode = int(tasks)
     else:
         # Something elsethrow an error
        error.handleError("Non-numeric number of tasks per node specified ({0}).\n".format(tasks))
Ejemplo n.º 5
0
Archivo: job.py Proyecto: ebreitmo/bolt
    def checkTime(self, resource):
        """Check that the time requested is consistent with the selected
           resource. If an error is found then a message is printed and
           the program exits.

           Arguments:
             Resource  resource The selected resource for the
                                   time-consistency check
        """

        # Number of nodes needed for this job
        nodesUsed = self.pTasks / self.pTasksPerNode
        if (self.pTasks % self.pTasksPerNode) > 0:
            nodesUsed += 1

        # Check of we have requested a consistent job length
        if self.wallTime > float(resource.maxJobTimeByNodes(nodesUsed)):
           error.handleError("Requested walltime ({0} hours) longer than maximum allowed on resource {1} for this number of nodes ({2} hours).".format(self.wallTime, resource.name, resource.maxJobTimeByNodes(nodesUsed)))
Ejemplo n.º 6
0
Archivo: job.py Proyecto: ebreitmo/bolt
    def setWallTime(self, time):
        """Set the maximum runtime. Checks that the walltime is specified
           either as hh:mm:ss or integer number of hours. If not the
           correct format then print an error and exit.

           Arguments:
             str timeThe maximum walltime (hh:mm:ss or hours)
        """
        # Check we have a compatible time format
        if re.search("^[0-9]+:[0-9]+:[0-9]+$", str(time)) is not None:
            # hh:mm:ss, convert to hours
            hms = time.split(":")
            self.__wallTime = float(hms[0]) + float(hms[1])/60 + \
                              float(hms[2])/3600
        elif re.search("^[0-9]+\.[0-9]+$", str(time)) is not None:
            # Just hours
            self.__wallTime = float(time)
        elif re.search("^[0-9]+$", str(time)) is not None:
            # Just hours
            self.__wallTime = float(time)
        else:
            # Something elsethrow an error
           error.handleError("Time specified ({0}) is not in format hh:mm:ss.\n".format(time), 1)
Ejemplo n.º 7
0
Archivo: job.py Proyecto: ebreitmo/bolt
    def checkTasks(self, resource, code):
        """Check that the tasks requested are consistent with the selected
           resource. If errors are found then an error is printed and the
           program exits.

           Arguments:
             Resource  resource The selected resource to use for the
                                consistency check
             Code      code     The code specified (None if no code
                                specified)
        """

        # Check parallel jobs are supported on this resource
        if not resource.parallelJobs:
            error.handleError("Resource {0} does not support parallel jobs.".format(resource.name))

        # Check we do not have more tasks per node than tasks
        if self.pTasksPerNode > self.pTasks:
            tpn = min(self.pTasks, resource.numCoresPerNode())
            error.printWarning("Number of specified parallel tasks per node ({0}) is greater than the number of specified parallel tasks ({1}). Reducing tasks per node to {2}.".format(self.pTasksPerNode, self.pTasks, tpn))
            self.setTasksPerNode(tpn)

        # Check the number of tasks per node
        if self.pTasksPerNode > resource.numCoresPerNode():
            tpn = resource.numCoresPerNode()
            error.printWarning("Number of specified parallel tasks per node ({0}) is greater than number available for resource {1} ({2}). Reducing tasks per node to {3}.".format(self.pTasksPerNode, resource.name, resource.numCoresPerNode(), tpn))
            self.setTasksPerNode(resource.numCoresPerNode())

        # Check that we support hybrid jobs if it has been requested
        if (self.threads > 1) and (self.pTasks > 1) and (not resource.hybridJobs):
            error.handleError("Resource {0} does not support hybrid distributed-/shared-memory jobs please only use 1 threads per task.".format(resource.name))

        # Check that the number of shared-memory threads requested 
        # is consistent
        # Do we have enough cores on a node
        coresPerNodeRequired = self.pTasksPerNode * self.threads
        if coresPerNodeRequired > resource.numCoresPerNode():
            error.handleError("Number of cores per node required ({0}) is greater than number available for resource {1} ({2}). Reduce number of threads per task or tasks per node".format(coresPerNodeRequired, resource.name, resource.numCoresPerNode()))
     

        # Check the total number of tasks
        # Number of nodes needed for this job
        nodesUsed = self.pTasks / self.pTasksPerNode
        if (self.pTasks % self.pTasksPerNode) > 0:
            nodesUsed += 1
        pUnits = nodesUsed * resource.numCoresPerNode()
        if pUnits > resource.maxTasks:
            error.handleError("Resources required ({0} cores) is greater than number available for resource {1} ({2}).".format(pUnits, resource.name, resource.maxTasks))
        if pUnits < resource.minTasks:
            error.handleError("Resources required ({0} cores) is less than minimum required for resource {1} ({2}).".format(pUnits, resource.name, resource.minTasks))

        # Check against tasks for code
        if code is not None:
            # Test the maximum tasks
            if (code.maxTasks > 0) and (pUnits > code.maxTasks):
                error.handleError("Resources required ({0} cores) is greater than number allowed for code {1} ({2}).".format(pUnits, code.name, code.maxTasks))
            # Test the mimimum tasks
            if (code.minTasks > 0) and (pUnits < code.minTasks):
                error.handleError("Resources required ({0} cores) is less than minimum required for code {1} ({2}).".format(pUnits, code.name, code.minTasks))
Ejemplo n.º 8
0
Archivo: job.py Proyecto: ebreitmo/bolt
    def setParallelDistribution(self, resource, batch):
        """This method distributes the tasks optimally for the specified
           resource. If any errors are encountered then an error message
           is printed and the program stops.

           Tasks can either be ditributed using options to a parallel job
           launcher (e.g. mpiexec), by the options passed to the batch
           system, or by using both methods.

           Arguments:
              Resource resource   The resource to use for the task
                                  distribution
              Batch    batch      The batch system to use for the task
                                  distribution
        """

        # Make sure the job run line is empty
        runLine = ""

        # First compute all the values we might need
        # Number of nodes needed
        nodesUsed = self.pTasks / self.pTasksPerNode
        if (self.pTasks % self.pTasksPerNode) > 0:
            nodesUsed += 1
        # Number of cores used per die
        coresPerDieUsed = min(self.pTasksPerNode, resource.coresPerDie)
        if (self.pTasksPerNode % (resource.diesPerSocket*resource.socketsPerNode)) == 0:
            coresPerDieUsed = self.pTasksPerNode / (resource.socketsPerNode*resource.diesPerSocket)
        else:
            # If we cannot divide this up then we just need to ignore this option
            coresPerDieUsed = 0
        # Task stride - if we have enough spare cores use the preferred stride
        # Also depends if we have specified threads or not - if we have specified 
        # the number of threads then this should be the stride
        strideUsed = 1
        if (self.threads > 1):
            strideUsed = self.threads
            if "csh" in resource.shell:
                runLine = "setenv OMP_NUM_THREADS " + str(self.threads) + "\n"
            else:
                runLine = "export OMP_NUM_THREADS=" + str(self.threads) + "\n"
        elif coresPerDieUsed == 0:
            # This is if we need to ignore the tasks per die option
            if (resource.numCoresPerNode() / self.pTasksPerNode) > resource.preferredStride:
                strideUsed = min(self.pTasksPerNode, resource.preferredStride)
        elif (resource.coresPerDie / coresPerDieUsed) >= resource.preferredStride:
            strideUsed = min(coresPerDieUsed, resource.preferredStride)
            

        # Test to see if we have a parallel run command
        runCommand = resource.parallelJobLauncher
        useRunCommand = True
        if (runCommand == None) or (runCommand == ""):
            useRunCommand = False
            # No job launcher command, are we using batch options instead?
            if not resource.useBatchParallelOpts:
                error.handleError("No parallel run command or batch options to use.\n", 1)

        pBatchOptions = ""

        #-------------------------------------------------------------------------------------------
        # Settings for using parallel job launcher
        if useRunCommand:

            # Most basic is just the parallel command and number of tasks
            option = resource.parallelTaskOption
            if (option is None) or (option == ""):
                error.handleError("The job launcher parallel task option is not set.\n", 1)
            elif self.pTasks == 0:
                error.handleError("The number of parallel tasks has not been set.\n", 1)
            runline = "{0}{1} {2} {3}".format(runLine, resource.parallelJobLauncher, option, self.pTasks)

            # Can we control the nodes used?
            option = resource.nodesOption
            if ((option != "") and (option is not None)) and (self.pTasksPerNode > 0):
                runline = "{0} {1} {2}".format(runline, option, nodesUsed)

            # Can we control the number of tasks per node?
            option = resource.taskPerNodeOption
            if ((option != "") and (option is not None)) and (self.pTasksPerNode > 0):
                runline = "{0} {1} {2}".format(runline, option, self.pTasksPerNode)

            # Can we control the number of tasks per die?
            option = resource.taskPerDieOption
            if ((option != "") and (option is not None)) and (self.pTasksPerNode > 1) and (coresPerDieUsed > 0):
                runline = "{0} {1} {2}".format(runline, option, coresPerDieUsed)
                
            # Can we control the stride
            option = resource.taskStrideOption
            if (option is not None) and (option != ""):
                runline = "{0} {1} {2}".format(runline, option, strideUsed) 
            
            self.__runLine = runline


        #-------------------------------------------------------------------------------------------
        # Settings for using parallel batch options
        # Most basic is just the parallel option and number of tasks/nodes. All jobs use this.
        option = batch.parallelOption
        if (option == None) or (option == ""):
            error.handleError("The batch parallel task option is not set.\n", 1)
        elif self.pTasks == 0:
            error.handleError("The number of parallel tasks has not been set.\n", 1)
        pUnits = self.pTasks
        # How are parallel resources allocated on this resource?
        if resource.parallelBatchUnit == "tasks":
            # Job allocated by tasks
            # Do we have exclusive node access or not
            if resource.nodeExclusive:
                # Yes, we need number of tasks corresponding to full nodes
                pUnits = nodesUsed * resource.numCoresPerNode()
            else:
                # No, we just need number of parallel tasks
                pUnits = self.pTasks
        elif resource.parallelBatchUnit == "nodes":
            # Job allocated by nodes
            pUnits = nodesUsed
        else:
            error.handleError("Unit of resource: {0} is not defined (use 'tasks' or 'nodes') in resource configuration file for resource: {1}.\n".format(resource.parallelBatchUnit, resource.name))
        # Set the option
        pBatchOptions = "{0} {1}{2}\n".format(batch.optionID, option, pUnits)

        # Additional options if we need them
        if resource.useBatchParallelOpts:
            # Can we control the number of tasks per node?
            option = batch.taskPerNodeOption
            if (option != "") and (option is not None) and (self.pTasksPerNode > 0):
                pBatchOptions = "{0}{1} {2}{3}\n".format(pBatchOptions, batch.optionID, option, self.pTasksPerNode)

            # Can we control the number of tasks per die?
            option = batch.taskPerDieOption
            if not ((option == "") or (option is None)) and (self.pTasksPerNode > 1) and (coresPerDieUsed > 0):
                pBatchOptions = "{0}{1} {2}{3}".format(pBatchOptions, batch.optionID, option, coresPerDieUsed)
                
            # Can we control the stride
            option = batch.taskStrideOption
            if (option is not None) and (option != ""):
                pBatchOptions = "{0}{1} {2}{3}".format(pBatchOptions, batch.optionID, option, strideUsed) 


        self.__pBatchOptions = pBatchOptions