def _process(self, batch_list):
    '''Execute the changes on the batch server.'''
    if ARGS_VERBOSE:
        _print('class:SaraNodes func:_process input:%s' % str(batch_list), file=sys.stderr)

    ## Always get the pbs_server name, even in dry-run mode
    pbs_server = pbs.pbs_default()
    if not pbs_server:
        _print('Could not locate a pbs server', file=sys.stderr)
        sys.exit(1)

    if ARGS_VERBOSE:
        _print('class:SaraNodes func:_process pbs_server:%s' % pbs_server, file=sys.stderr)

    ## If dry-run is not specified, create a connection (reused for every node below)
    if not ARGS_DRYRUN:
        pbs_connection = pbs.pbs_connect(pbs_server)

    ## Execute the changes
    for node in batch_list:
        if not ARGS_DRYRUN:
            rcode = pbs.pbs_manager(pbs_connection, pbs.MGR_CMD_SET, pbs.MGR_OBJ_NODE,
                                    node[0], node[1], 'NULL')
            if rcode > 0:
                errno, text = pbs.error()
                _print('PBS error for node \'%s\': %s (%s)' % (node[0], text, errno), file=sys.stderr)
        else:
            _print("pbs.pbs_manager(pbs_connection, pbs.MGR_CMD_SET, pbs.MGR_OBJ_NODE, %s, %s, 'NULL')"
                   % (node[0], str(node[1])))

    ## Close the connection with the batch system
    if not ARGS_DRYRUN:
        pbs.pbs_disconnect(pbs_connection)
def update_all_nodes(batchserver_name):
    """ Update info about all nodes of the given batchserver. """
    server, created = getBatchServer(batchserver_name)
    if not pbs_data_nodes.has_key(batchserver_name):
        pbs_data_nodes[batchserver_name] = {'last_update': None, 'nodes': {}}

    if pbs_data_nodes[batchserver_name]['last_update'] and (
            datetime.datetime.now() - pbs_data_nodes[batchserver_name]['last_update']
            ).total_seconds() < GlobalConfiguration.objects.get(pk=1).max_lastupdate:
        logging.debug("Nodes info is new enough for server: %s" % batchserver_name)
        print "not updated"
        return pbs_data_nodes

    print "updated"
    conn = pbs.pbs_connect(batchserver_name.encode('iso-8859-1', 'replace'))
    if conn == -1:
        logging.error("Cannot connect to %s - live data will be missing" % server.name)
        return

    statnodes = pbs.pbs_statnode(conn, "", [], "")
    pbs.pbs_disconnect(conn)

    for sn in statnodes:
        node, created = getNode(sn.name, server)
        attr_dict = dict([(x.name, x.value) for x in sn.attribs])
        pbs_data_nodes[batchserver_name]['nodes'][node] = update_one_node_from_pbs_data(node, attr_dict)

    pbs_data_nodes[batchserver_name]['last_update'] = datetime.datetime.now()
    return pbs_data_nodes
def submit_get_subfamilies_job(job):
    # This is how we are passing the fasta and job id to the script
    server_name = pbs.pbs_default()
    c = pbs.pbs_connect(server_name)

    attropl = pbs.new_attropl(5)
    attropl[0].name = pbs.ATTR_N
    attropl[0].value = "FAT-CAT Get Sub-Families: %s" % job.id
    attropl[1].name = pbs.ATTR_l
    attropl[1].resource = 'nodes'
    attropl[1].value = '1:ppn=1'
    attropl[2].name = pbs.ATTR_o
    attropl[2].value = JOB_LOG_FILE
    attropl[3].name = pbs.ATTR_e
    attropl[3].value = JOB_LOG_FILE
    attropl[4].name = pbs.ATTR_v
    attropl[4].value = "job_id=%s" % (job.id)

    job.status_id = 5
    job.save()

    job_id = pbs.pbs_submit(c, attropl,
                            "/clusterfs/ohana/software/fatcat/scripts/get_best_nodes.py",
                            'web', 'NULL')
    logger.info("Submitting %s to the grid to get best nodes with id %s" % (job.id, job_id))

    if job_id:
        job.get_best_nodes_pbs_job_id = job_id
        job.save()

    pbs.pbs_disconnect(c)
    return job_id
def check_all_jobs( self ):
    """
    Returns a list of servers that failed to be contacted and a dict
    of "job_id : status" pairs (where status is a bunchified version
    of the API's structure).
    """
    servers = []
    failures = []
    statuses = {}
    for pbs_job_state in self.watched:
        pbs_server_name = self.__get_pbs_server(pbs_job_state.job_destination.params)
        if pbs_server_name not in servers:
            servers.append( pbs_server_name )
        pbs_job_state.check_count += 1
    for pbs_server_name in servers:
        c = pbs.pbs_connect( util.smart_str( pbs_server_name ) )
        if c <= 0:
            log.debug("connection to PBS server %s for state check failed" % pbs_server_name )
            failures.append( pbs_server_name )
            continue
        stat_attrl = pbs.new_attrl(3)
        stat_attrl[0].name = pbs.ATTR_state
        stat_attrl[1].name = pbs.ATTR_used
        stat_attrl[2].name = pbs.ATTR_exitstat
        jobs = pbs.pbs_statjob( c, None, stat_attrl, None )
        pbs.pbs_disconnect( c )
        statuses.update( self.convert_statjob_to_bunches( jobs ) )
    return( ( failures, statuses ) )
def submit_with_retry(pbs_attrs, script_path, queue, pbs_server=None):
    # connect to pbs server
    connection = _connect_to_server(pbs_server)

    # submit job
    retry = 0
    job_id = pbs.pbs_submit(connection, pbs_attrs, script_path, queue, None)

    # if pbs.pbs_submit failed, try again
    while not job_id and retry < _MAX_RETRY:
        retry += 1
        print("Retrying connection...", file=sys.stderr)
        time.sleep(retry**2)
        job_id = pbs.pbs_submit(connection, pbs_attrs, script_path, queue, None)

    pbs.pbs_disconnect(connection)

    # check to see if the job was submitted successfully
    if not job_id:
        e, e_msg = pbs.error()
        # the batch system returned an error, throw exception
        raise Exception("Error submitting job. "
                        "Torque error {0}: '{1}'".format(e, torque_strerror(e)))

    return job_id
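# --- Hypothetical usage sketch (not from the original source) ---
# Shows how submit_with_retry() above might be called. The job name, walltime,
# script path and queue name are illustrative placeholders; _connect_to_server,
# _MAX_RETRY and torque_strerror are assumed to be defined in the same module
# as submit_with_retry, and passing pbs_server=None is assumed to fall back to
# the default server.
import pbs

def example_submit():
    pbs_attrs = pbs.new_attropl(2)
    pbs_attrs[0].name = pbs.ATTR_N
    pbs_attrs[0].value = 'example_job'
    pbs_attrs[1].name = pbs.ATTR_l
    pbs_attrs[1].resource = 'walltime'
    pbs_attrs[1].value = '00:10:00'
    return submit_with_retry(pbs_attrs, '/tmp/example_job.sh', 'batch')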
def disconnect_from_server(conn):
    """Disconnect a given connection."""
    if pbs_import_failed:
        _log.error(pbs_import_failed)
        return None
    pbs.pbs_disconnect(conn)
def stop_job(self, job):
    """Attempts to delete a job from the PBS queue"""
    job_id = job.get_job_runner_external_id().encode('utf-8')
    job_tag = "(%s/%s)" % (job.get_id_tag(), job_id)
    log.debug("%s Stopping PBS job" % job_tag)

    # Declare the connection handle c so that it can be cleaned up:
    c = None

    try:
        pbs_server_name = self.__get_pbs_server(job.destination_params)
        if pbs_server_name is None:
            log.debug("(%s) Job queued but no destination stored in job params, cannot delete" % job_tag)
            return
        c = pbs.pbs_connect(util.smart_str(pbs_server_name))
        if c <= 0:
            log.debug("(%s) Connection to PBS server for job delete failed" % job_tag)
            return
        pbs.pbs_deljob(c, job_id, '')
        log.debug("%s Removed from PBS queue before job completion" % job_tag)
    except:
        e = traceback.format_exc()
        log.debug("%s Unable to stop job: %s" % (job_tag, e))
    finally:
        # Cleanup: disconnect from the server.
        if None is not c:
            pbs.pbs_disconnect(c)
def queues_page():
    conn = pbs.pbs_connect(pbsserver)
    queues = get_queues(conn)
    pbs.pbs_disconnect(conn)
    queues = queue_attributes_reformat(queues)
    now = datetime.datetime.now().strftime('%Y.%m.%d at %I:%M:%S %P')
    return {'now': now, 'queues': queues}


def jobs_page():
    conn = pbs.pbs_connect(pbsserver)
    jobs = get_jobs(conn)
    pbs.pbs_disconnect(conn)
    jobs = job_attributes_reformat(jobs)
    now = datetime.datetime.now().strftime('%Y.%m.%d at %I:%M:%S %P')
    return {'now': now, 'jobs': jobs}
def update_all_queues(batchserver_name):
    """ Update info about all queues for the given batchserver. """
    server, created = getBatchServer(batchserver_name)
    if server.queues_lastupdate and (
            datetime.datetime.now() - server.queues_lastupdate
            ).total_seconds() < GlobalConfiguration.objects.get(pk=1).max_lastupdate:
        logging.debug("Queue info is new enough for server: %s" % batchserver_name)
        return

    conn = pbs.pbs_connect(batchserver_name.encode('iso-8859-1', 'replace'))
    if conn == -1:
        logging.error("Cannot connect to %s - live data will be missing" % server.name)
        return

    statqueues = pbs.pbs_statque(conn, "", [], "")
    pbs.pbs_disconnect(conn)

    for sq in statqueues:
        queue, created = getQueue(sq.name, server)
        attr_dict = dict([(x.name, x.value) for x in sq.attribs])
        update_one_queue_from_pbs_data(queue, attr_dict)
        queue.save()

    server.queues_lastupdate = datetime.datetime.now()
    server.save()
def update_all_jobs(batchserver_name):
    """ Update info about all jobs of the given batchserver. """
    server, created = getBatchServer(batchserver_name)
    if not pbs_data_jobs.has_key(batchserver_name):
        pbs_data_jobs[batchserver_name] = {'last_update': None, 'jobs': {}}

    if pbs_data_jobs[batchserver_name]['last_update'] and (
            datetime.datetime.now() - pbs_data_jobs[batchserver_name]['last_update']
            ).total_seconds() < GlobalConfiguration.objects.get(pk=1).max_lastupdate:
        logging.debug("jobs info is new enough for server: %s" % batchserver_name)
        print "not updated"
        return pbs_data_jobs

    print "updated"
    conn = pbs.pbs_connect(batchserver_name.encode('iso-8859-1', 'replace'))
    if conn == -1:
        logging.error("Cannot connect to %s - live data will be missing" % server.name)
        return

    statjobs = pbs.pbs_statjob(conn, "", [], "")
    pbs.pbs_disconnect(conn)

    for sj in statjobs:
        jobid = sj.name
        # build the attribute dict, qualifying resource attributes with their resource name
        attr_dict = {}
        for x in sj.attribs:
            if x.resource:
                attr_dict[x.name + "_" + x.resource] = x.value
            else:
                attr_dict[x.name] = x.value
        pbs_data_jobs[batchserver_name]['jobs'][jobid] = update_one_job_from_pbs_data(jobid, attr_dict)

    pbs_data_jobs[batchserver_name]['last_update'] = datetime.datetime.now()
    return pbs_data_jobs
def nodes_page():
    conn = pbs.pbs_connect(pbsserver)
    nodes = get_nodes(conn)
    pbs.pbs_disconnect(conn)
    nodes = node_attributes_reformat(nodes)
    node_totals = get_node_totals(nodes)
    now = datetime.datetime.now().strftime('%Y.%m.%d at %I:%M:%S %P')
    return {'now': now, 'nodes': nodes, 'node_totals': node_totals}
def release_job(self, job_id):
    """
    Release a user hold on a job

    :param job_id: job to release
    """
    connection = _connect_to_server(self.pbs_server)
    # release the hold using the freshly opened connection
    rval = pbs.pbs_rlsjob(connection, job_id, 'u', '')
    pbs.pbs_disconnect(connection)
    return rval
def delete_job(self, job_id):
    """
    Sends job delete request to pbs_server for job

    :param job_id: job id to delete
    :return: pbs_deljob return value (0 on success)
    """
    connection = _connect_to_server(self.pbs_server)
    rval = pbs.pbs_deljob(connection, job_id, '')
    pbs.pbs_disconnect(connection)
    return rval


def release_all(self):
    """
    Release all jobs in self.held_jobs list reusing connections.
    """
    # copy the list of held jobs to iterate over because release_job mutates
    # self.held_jobs
    jobs = list(self.held_jobs)
    connection = _connect_to_server(self._server)
    for id in jobs:
        self.release_job(id, connection)
    pbs.pbs_disconnect(connection)
def submit_fxn_site_prediction_job(job):
    # This is how we are passing the fasta and job id to the script
    server_name = pbs.pbs_default()
    c = pbs.pbs_connect(server_name)
    print server_name
    print c

    attropl = pbs.new_attropl(7)
    attropl[0].name = pbs.ATTR_N
    attropl[0].value = "Functional Site Prediction Job: %s" % job.id
    attropl[1].name = pbs.ATTR_l
    attropl[1].resource = 'nodes'
    attropl[1].value = '1:ppn=1'
    attropl[2].name = pbs.ATTR_o
    attropl[2].value = JOB_LOG_FILE
    attropl[3].name = pbs.ATTR_e
    attropl[3].value = JOB_LOG_FILE
    attropl[4].name = pbs.ATTR_v
    attropl[4].value = "job_id=%s" % (job.id)
    attropl[5].name = pbs.ATTR_r
    attropl[5].value = 'y'
    attropl[6].name = pbs.ATTR_l
    attropl[6].resource = 'walltime'
    attropl[6].value = '1000'

    job.status_id = 2
    job.save()

    job_id = pbs.pbs_submit(c, attropl,
                            "/home/cyrus_afrasiabi/ohana_repository/bpg/fxn_site_prediction.py",
                            'web', 'NULL')
    logger.info("Submitting %s to the grid to get functional site predictions with id %s"
                % (job.id, job_id))

    if job_id:
        job.pbs_job_id = job_id
        job.save()

    pbs.pbs_disconnect(c)
    return job_id
def info(self, types=None):
    """
    Return jobinfo
    """
    if not self.jobid:
        self.log.debug("no jobid, job is not submitted yet?")
        return None

    # convert single type into list
    if type(types) is str:
        types = [types]

    self.log.debug("Return info types %s" % types)

    # create attribute list to query pbs with
    if types is None:
        jobattr = NULL
    else:
        jobattr = pbs.new_attrl(len(types))
        for idx, attr in enumerate(types):
            jobattr[idx].name = attr

    # get a new connection (otherwise this seems to fail)
    if self.clean_conn:
        pbs.pbs_disconnect(self.pbsconn)
        self.pbsconn = pbs.pbs_connect(self.pbs_server)

    jobs = pbs.pbs_statjob(self.pbsconn, self.jobid, jobattr, NULL)
    if len(jobs) == 0:
        # no job found, return None info
        res = None
        self.log.debug("No job found. Wrong id %s or job finished? Returning %s" % (self.jobid, res))
        return res
    elif len(jobs) == 1:
        self.log.debug("Request for jobid %s returned one result %s" % (self.jobid, jobs))
    else:
        self.log.error("Request for jobid %s returned more than one result %s" % (self.jobid, jobs))

    # only expect to have a list with one element
    j = jobs[0]
    # convert attribs into usable dict
    job_details = dict([(attrib.name, attrib.value) for attrib in j.attribs])
    # manually set 'id' attribute
    job_details['id'] = j.name

    self.log.debug("Found jobinfo %s" % job_details)
    return job_details
def check_single_job( self, pbs_server_name, job_id ):
    """
    Returns the state of a single job, used to make sure a job is
    really dead.
    """
    c = pbs.pbs_connect( util.smart_str( pbs_server_name ) )
    if c <= 0:
        log.debug("connection to PBS server %s for state check failed" % pbs_server_name )
        return None
    stat_attrl = pbs.new_attrl(1)
    stat_attrl[0].name = pbs.ATTR_state
    jobs = pbs.pbs_statjob( c, job_id, stat_attrl, None )
    pbs.pbs_disconnect( c )
    return jobs[0].attribs[0].value
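# --- Hypothetical helper (not from the original source) ---
# check_single_job() above returns the raw single-character job state.
# A small mapping like this can turn it into a readable label; the letters
# are the standard TORQUE/PBS job states.
_PBS_JOB_STATES = {
    'C': 'completed',
    'E': 'exiting',
    'H': 'held',
    'Q': 'queued',
    'R': 'running',
    'T': 'being moved',
    'W': 'waiting',
    'S': 'suspended',
}

def describe_job_state(state_char):
    """Return a human-readable label for a PBS job state letter."""
    return _PBS_JOB_STATES.get(state_char, 'unknown state %r' % state_char)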
def delete_all_jobs(self, ids):
    """
    delete all jobs in a list of jobs

    :param ids: list of all jobs
    :return: zero on success, otherwise return value of failed pbs_deljob
    """
    for job_id in ids:
        connection = _connect_to_server(self.pbs_server)
        rval = pbs.pbs_deljob(connection, job_id, '')
        pbs.pbs_disconnect(connection)
        # ignore "unknown job id" and "invalid state" errors; anything else is fatal
        if rval and rval not in (self.E_UNKNOWN, self.E_STATE):
            return rval
    return 0
def submit_intrepid_job(job):
    # This is how we are passing the fasta and job id to the script
    server_name = pbs.pbs_default()
    c = pbs.pbs_connect(server_name)

    attropl = pbs.new_attropl(6)
    attropl[0].name = pbs.ATTR_N
    attropl[0].value = "INTREPID Job: %s" % job.id
    attropl[1].name = pbs.ATTR_l
    attropl[1].resource = 'nodes'
    attropl[1].value = '1:ppn=8'
    attropl[2].name = pbs.ATTR_o
    attropl[2].value = JOB_LOG_FILE
    attropl[3].name = pbs.ATTR_e
    attropl[3].value = JOB_LOG_FILE
    attropl[4].name = pbs.ATTR_v
    attropl[4].value = "job_id=%s" % (job.id)
    attropl[5].name = pbs.ATTR_l
    attropl[5].resource = 'walltime'
    attropl[5].value = '48:00:00'

    if job.development_job:
        job_id = pbs.pbs_submit(c, attropl,
                                "/clusterfs/ohana/software/intrepid/scripts/intrepid_development_pipeline.py",
                                'web', 'NULL')
    else:
        job_id = pbs.pbs_submit(c, attropl,
                                "/clusterfs/ohana/software/intrepid/scripts/intrepid_pipeline.py",
                                'web', 'NULL')

    logger.info("Submitting %s to the grid with id %s" % (job.id, job_id))

    if job_id:
        job.pbs_job_id = job_id
        job.status_id = JOB_SUBMITTED
        job.save()

    pbs.pbs_disconnect(c)
    return job_id
def release_all(self):
    """
    Release all jobs in self.held_jobs list reusing connections.
    """
    # if we are 'faking' pipeline submission with civet_run -n, then there
    # is nothing to do
    if not self.submit:
        return

    # copy the list of held jobs to iterate over because release_job mutates
    # self.held_jobs
    jobs = list(self.held_jobs)
    connection = _connect_to_server(self._server)
    for id in jobs:
        self.release_job(id, connection)
    pbs.pbs_disconnect(connection)
def update_one_queue(queue):
    """ Update live info about the given queue """
    conn = pbs.pbs_connect(queue.server.name.encode('iso-8859-1', 'replace'))
    if conn == -1:
        logging.error("Cannot connect to %s - live data will be missing" % queue.server.name)
        return

    statqueues = pbs.pbs_statque(conn, queue.name.encode('iso-8859-1', 'replace'), [], "")
    pbs.pbs_disconnect(conn)

    if len(statqueues) == 0:
        logging.error("pbs_statque failed for queue: %s" % queue.name)
        return
    if len(statqueues) > 1:
        logging.warning("pbs_statque returned more than one record for queue: %s" % queue.name)

    attr_dict = dict([(x.name, x.value) for x in statqueues[0].attribs])
    update_one_queue_from_pbs_data(queue, attr_dict)
    queue.save()
def release_job(self, job_id, connection=None):
    """
    Release a user hold from a held batch job.

    :param job_id: job id to release (short form not allowed)
    :param connection: optional connection to a pbs_server; if not passed,
        release_job will establish a new connection
    """
    c = connection if connection else _connect_to_server(self._server)
    rval = pbs.pbs_rlsjob(c, job_id, 'u', '')

    if not connection:
        pbs.pbs_disconnect(c)

    if rval == 0:
        self.held_jobs.remove(job_id)
    return rval
def cleanup(self):
    """Cleanup: disconnect from server."""
    if self.clean_conn:
        self.log.debug("Disconnecting from server.")
        pbs.pbs_disconnect(self.pbsconn)


def end(self):
    if self._connection_id:
        pbs.pbs_disconnect(self._connection_id)
        self._connection_id = None


def disconnect_from_server(self):
    """Disconnect current connection."""
    pbs.pbs_disconnect(self.conn)
    self.conn = None


def pbs_disconn(self, conn):
    pbs.pbs_disconnect(conn)
def queue_job(self, batch_job):
    """
    queue a BatchJob.

    :param batch_job: description of the job to queue
    """
    # batch job names should be unique for civet pipelines because the
    # job name is used to name log files and other output
    # Civet generates unique names for each step, so this is just checking
    # for a programming error
    assert batch_job.name not in self._job_names

    if self.execution_log_dir:
        log_dir = self.execution_log_dir
    else:
        log_dir = self.log_dir

    # set batch_job.stderr_path and batch_job.stdout_path if they aren't already set
    if not batch_job.stdout_path:
        batch_job.stdout_path = os.path.join(log_dir, batch_job.name + ".o")
    if not batch_job.stderr_path:
        batch_job.stderr_path = os.path.join(log_dir, batch_job.name + ".e")

    # write batch script
    filename = self.write_script(batch_job)

    if self.submit:
        # build up our torque job attributes and resources
        job_attributes = {}
        job_resources = {
            'nodes': "{0}:ppn={1}".format(batch_job.nodes, batch_job.ppn),
            'walltime': batch_job.walltime,
            'epilogue': self.epilogue_filename
        }

        if batch_job.mem:
            job_resources['mem'] = batch_job.mem

        job_attributes[pbs.ATTR_v] = self.generate_env(batch_job.workdir)

        if batch_job.name:
            job_attributes[pbs.ATTR_N] = batch_job.name

        job_attributes[pbs.ATTR_o] = batch_job.stdout_path
        #XXX workaround for a TORQUE bug where local copies of stderr &
        # stdout files to /dev/null don't work correctly but remote
        # copies (to submit host) do
        if job_attributes[pbs.ATTR_o] == "/dev/null":
            job_attributes[pbs.ATTR_o] = socket.gethostname() + ":/dev/null"

        job_attributes[pbs.ATTR_e] = batch_job.stderr_path
        #XXX workaround for a TORQUE bug where local copies of stderr &
        # stdout files to /dev/null don't work correctly but remote
        # copies (to submit host) do
        if job_attributes[pbs.ATTR_e] == "/dev/null":
            job_attributes[pbs.ATTR_e] = socket.gethostname() + ":/dev/null"

        if batch_job.depends_on:
            job_attributes[pbs.ATTR_depend] = self._dependency_string(batch_job)
        elif self.submit_with_hold:
            job_attributes[pbs.ATTR_h] = 'u'

        if batch_job.mail_option:
            job_attributes[pbs.ATTR_m] = batch_job.mail_option
        if batch_job.email_list:
            job_attributes[pbs.ATTR_M] = batch_job.email_list

        if batch_job.date_time:
            job_attributes[pbs.ATTR_a] = str(int(time.mktime(batch_job.date_time.timetuple())))

        pbs_attrs = pbs.new_attropl(len(job_attributes) + len(job_resources))

        # populate pbs_attrs
        attr_idx = 0
        for resource, val in job_resources.iteritems():
            pbs_attrs[attr_idx].name = pbs.ATTR_l
            pbs_attrs[attr_idx].resource = resource
            pbs_attrs[attr_idx].value = val
            attr_idx += 1

        for attribute, val in job_attributes.iteritems():
            pbs_attrs[attr_idx].name = attribute
            pbs_attrs[attr_idx].value = val
            attr_idx += 1

        # we've initialized pbs_attrs with all the attributes we need to set
        # now we can connect to the server and submit the job
        connection = _connect_to_server(self._server)

        # connected to pbs_server

        # submit job
        retry = 0
        job_id = pbs.pbs_submit(connection, pbs_attrs, filename, self.queue, None)

        # if pbs.pbs_submit failed, try again
        while not job_id and retry < _MAX_RETRY:
            retry += 1
            print("Retrying connection...", file=sys.stderr)
            time.sleep(retry**2)
            job_id = pbs.pbs_submit(connection, pbs_attrs, filename, self.queue, None)

        pbs.pbs_disconnect(connection)

        # check to see if the job was submitted successfully.
        if not job_id:
            e, e_msg = pbs.error()
            # the batch system returned an error, throw exception
            raise Exception("Error submitting job. "
                            "Torque error {0}: '{1}'".format(e, torque_strerror(e)))

        if self.submit_with_hold and not batch_job.depends_on:
            self.held_jobs.append(job_id)

    else:
        # self.submit is False, fake a job ID
        job_id = "{0}.civet".format(self._id_seq)
        self._id_seq += 1

    self._job_names.append(batch_job.name)

    self._id_log.write(job_id + '\t' + batch_job.name + '\t' +
                       str(self._printable_dependencies(batch_job.depends_on)) + '\n')
    self._id_log.flush()

    return job_id
def disconnect(ID):
    return pbs.pbs_disconnect(ID)
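# --- Minimal connect/use/disconnect sketch (not from the original source) ---
# Illustrates the pattern the wrappers in this listing encapsulate:
# pbs_connect() returns a handle (<= 0 on failure), every pbs_* call takes
# that handle, and each successful connection should be closed with
# pbs_disconnect(). The function name is illustrative only.
import pbs

def list_nodes_on_default_server():
    server = pbs.pbs_default()
    conn = pbs.pbs_connect(server)
    if conn <= 0:
        errno, text = pbs.error()
        raise RuntimeError("pbs_connect to %s failed: %s (%s)" % (server, text, errno))
    try:
        return pbs.pbs_statnode(conn, "", [], "")
    finally:
        pbs.pbs_disconnect(conn)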
def disconnect_from_server(conn):
    """Disconnect a given connection."""
    if pbs_import_failed:
        raise EasyBuildError(pbs_import_failed)
    pbs.pbs_disconnect(conn)


def disconnect(self):
    if hasattr(self, 'c'):
        pbs.pbs_disconnect(self.c)
def queue_job( self, job_wrapper ):
    """Create PBS script for a job and submit it to the PBS queue"""
    # prepare the job
    if not self.prepare_job( job_wrapper, include_metadata=not( self.app.config.pbs_stage_path ) ):
        return

    job_destination = job_wrapper.job_destination

    # Determine the job's PBS destination (server/queue) and options from the job destination definition
    pbs_queue_name = None
    pbs_server_name = self.default_pbs_server
    pbs_options = []
    if '-q' in job_destination.params and 'destination' not in job_destination.params:
        job_destination.params['destination'] = job_destination.params.pop('-q')
    if 'destination' in job_destination.params:
        if '@' in job_destination.params['destination']:
            # Destination includes a server
            pbs_queue_name, pbs_server_name = job_destination.params['destination'].split('@')
            if pbs_queue_name == '':
                # e.g. `qsub -q @server`
                pbs_queue_name = None
        else:
            # Destination is just a queue
            pbs_queue_name = job_destination.params['destination']
        job_destination.params.pop('destination')

    # Parse PBS params
    pbs_options = self.parse_destination_params(job_destination.params)

    # Explicitly set the determined PBS destination in the persisted job destination for recovery
    job_destination.params['destination'] = '%s@%s' % (pbs_queue_name or '', pbs_server_name)

    c = pbs.pbs_connect( util.smart_str( pbs_server_name ) )
    if c <= 0:
        errno, text = pbs.error()
        job_wrapper.fail( "Unable to queue job for execution. Resubmitting the job may succeed." )
        log.error( "Connection to PBS server for submit failed: %s: %s" % ( errno, text ) )
        return

    # define job attributes
    ofile = "%s/%s.o" % (self.app.config.cluster_files_directory, job_wrapper.job_id)
    efile = "%s/%s.e" % (self.app.config.cluster_files_directory, job_wrapper.job_id)
    ecfile = "%s/%s.ec" % (self.app.config.cluster_files_directory, job_wrapper.job_id)

    output_fnames = job_wrapper.get_output_fnames()

    # If an application server is set, we're staging
    if self.app.config.pbs_application_server:
        pbs_ofile = self.app.config.pbs_application_server + ':' + ofile
        pbs_efile = self.app.config.pbs_application_server + ':' + efile
        output_files = [ str( o ) for o in output_fnames ]
        output_files.append(ecfile)
        stagein = self.get_stage_in_out( job_wrapper.get_input_fnames() + output_files, symlink=True )
        stageout = self.get_stage_in_out( output_files )
        attrs = [
            dict( name=pbs.ATTR_o, value=pbs_ofile ),
            dict( name=pbs.ATTR_e, value=pbs_efile ),
            dict( name=pbs.ATTR_stagein, value=stagein ),
            dict( name=pbs.ATTR_stageout, value=stageout ),
        ]
    # If not, we're using NFS
    else:
        attrs = [
            dict( name=pbs.ATTR_o, value=ofile ),
            dict( name=pbs.ATTR_e, value=efile ),
        ]

    # define PBS job options
    attrs.append( dict( name=pbs.ATTR_N, value=str( "%s_%s_%s" % ( job_wrapper.job_id, job_wrapper.tool.id, job_wrapper.user ) ) ) )
    job_attrs = pbs.new_attropl( len( attrs ) + len( pbs_options ) )
    for i, attr in enumerate( attrs + pbs_options ):
        job_attrs[i].name = attr['name']
        job_attrs[i].value = attr['value']
        if 'resource' in attr:
            job_attrs[i].resource = attr['resource']

    exec_dir = os.path.abspath( job_wrapper.working_directory )

    # write the job script
    if self.app.config.pbs_stage_path != '':
        # touch the ecfile so that it gets staged
        with open(ecfile, 'a'):
            os.utime(ecfile, None)

        stage_commands = pbs_symlink_template % (
            " ".join( job_wrapper.get_input_fnames() + output_files ),
            self.app.config.pbs_stage_path,
            exec_dir,
        )
    else:
        stage_commands = ''

    env_setup_commands = [ stage_commands ]
    script = self.get_job_file(job_wrapper, exit_code_path=ecfile, env_setup_commands=env_setup_commands)
    job_file = "%s/%s.sh" % (self.app.config.cluster_files_directory, job_wrapper.job_id)
    self.write_executable_script( job_file, script )

    # job was deleted while we were preparing it
    if job_wrapper.get_state() == model.Job.states.DELETED:
        log.debug( "Job %s deleted by user before it entered the PBS queue" % job_wrapper.job_id )
        pbs.pbs_disconnect(c)
        if job_wrapper.cleanup_job in ( "always", "onsuccess" ):
            self.cleanup( ( ofile, efile, ecfile, job_file ) )
            job_wrapper.cleanup()
        return

    # submit
    # The job tag includes the job and the task identifier
    # (if a TaskWrapper was passed in):
    galaxy_job_id = job_wrapper.get_id_tag()
    log.debug("(%s) submitting file %s" % ( galaxy_job_id, job_file ) )

    tries = 0
    while tries < 5:
        job_id = pbs.pbs_submit(c, job_attrs, job_file, pbs_queue_name, None)
        tries += 1
        if job_id:
            pbs.pbs_disconnect(c)
            break
        errno, text = pbs.error()
        log.warning( "(%s) pbs_submit failed (try %d/5), PBS error %d: %s" % (galaxy_job_id, tries, errno, text) )
        time.sleep(2)
    else:
        log.error( "(%s) All attempts to submit job failed" % galaxy_job_id )
        job_wrapper.fail( "Unable to run this job due to a cluster error, please retry it later" )
        return

    if pbs_queue_name is None:
        log.debug("(%s) queued in default queue as %s" % (galaxy_job_id, job_id) )
    else:
        log.debug("(%s) queued in %s queue as %s" % (galaxy_job_id, pbs_queue_name, job_id) )

    # persist destination
    job_wrapper.set_job_destination( job_destination, job_id )

    # Store PBS related state information for job
    job_state = AsynchronousJobState()
    job_state.job_wrapper = job_wrapper
    job_state.job_id = job_id
    job_state.job_file = job_file
    job_state.output_file = ofile
    job_state.error_file = efile
    job_state.exit_code_file = ecfile
    job_state.old_state = 'N'
    job_state.running = False
    job_state.job_destination = job_destination

    # Add to our 'queue' of jobs to monitor
    self.monitor_queue.put( job_state )
def queue_job(self, job_wrapper):
    """Create PBS script for a job and submit it to the PBS queue"""
    # prepare the job
    if not self.prepare_job(job_wrapper, include_metadata=not (self.app.config.pbs_stage_path)):
        return

    job_destination = job_wrapper.job_destination

    # Determine the job's PBS destination (server/queue) and options from the job destination definition
    pbs_queue_name = None
    pbs_server_name = self.default_pbs_server
    pbs_options = []
    if '-q' in job_destination.params and 'destination' not in job_destination.params:
        job_destination.params['destination'] = job_destination.params.pop('-q')
    if 'destination' in job_destination.params:
        if '@' in job_destination.params['destination']:
            # Destination includes a server
            pbs_queue_name, pbs_server_name = job_destination.params['destination'].split('@')
            if pbs_queue_name == '':
                # e.g. `qsub -q @server`
                pbs_queue_name = None
        else:
            # Destination is just a queue
            pbs_queue_name = job_destination.params['destination']
        job_destination.params.pop('destination')

    # Parse PBS params
    pbs_options = self.parse_destination_params(job_destination.params)

    # Explicitly set the determined PBS destination in the persisted job destination for recovery
    job_destination.params['destination'] = '%s@%s' % (pbs_queue_name or '', pbs_server_name)

    c = pbs.pbs_connect(util.smart_str(pbs_server_name))
    if c <= 0:
        errno, text = pbs.error()
        job_wrapper.fail("Unable to queue job for execution. Resubmitting the job may succeed.")
        log.error("Connection to PBS server for submit failed: %s: %s" % (errno, text))
        return

    # define job attributes
    ofile = "%s/%s.o" % (self.app.config.cluster_files_directory, job_wrapper.job_id)
    efile = "%s/%s.e" % (self.app.config.cluster_files_directory, job_wrapper.job_id)
    ecfile = "%s/%s.ec" % (self.app.config.cluster_files_directory, job_wrapper.job_id)

    output_fnames = job_wrapper.get_output_fnames()

    # If an application server is set, we're staging
    if self.app.config.pbs_application_server:
        pbs_ofile = self.app.config.pbs_application_server + ':' + ofile
        pbs_efile = self.app.config.pbs_application_server + ':' + efile
        output_files = [str(o) for o in output_fnames]
        output_files.append(ecfile)
        stagein = self.get_stage_in_out(job_wrapper.get_input_fnames() + output_files, symlink=True)
        stageout = self.get_stage_in_out(output_files)
        attrs = [
            dict(name=pbs.ATTR_o, value=pbs_ofile),
            dict(name=pbs.ATTR_e, value=pbs_efile),
            dict(name=pbs.ATTR_stagein, value=stagein),
            dict(name=pbs.ATTR_stageout, value=stageout),
        ]
    # If not, we're using NFS
    else:
        attrs = [
            dict(name=pbs.ATTR_o, value=ofile),
            dict(name=pbs.ATTR_e, value=efile),
        ]

    # define PBS job options
    attrs.append(dict(name=pbs.ATTR_N, value=str("%s_%s_%s" % (job_wrapper.job_id, job_wrapper.tool.id, job_wrapper.user))))
    job_attrs = pbs.new_attropl(len(attrs) + len(pbs_options))
    for i, attr in enumerate(attrs + pbs_options):
        job_attrs[i].name = attr['name']
        job_attrs[i].value = attr['value']
        if 'resource' in attr:
            job_attrs[i].resource = attr['resource']

    exec_dir = os.path.abspath(job_wrapper.working_directory)

    # write the job script
    if self.app.config.pbs_stage_path != '':
        # touch the ecfile so that it gets staged
        with file(ecfile, 'a'):
            os.utime(ecfile, None)

        stage_commands = pbs_symlink_template % (
            " ".join(job_wrapper.get_input_fnames() + output_files),
            self.app.config.pbs_stage_path,
            exec_dir,
        )
    else:
        stage_commands = ''

    env_setup_commands = [stage_commands]
    script = self.get_job_file(job_wrapper, exit_code_path=ecfile, env_setup_commands=env_setup_commands)
    job_file = "%s/%s.sh" % (self.app.config.cluster_files_directory, job_wrapper.job_id)
    self.write_executable_script(job_file, script)

    # job was deleted while we were preparing it
    if job_wrapper.get_state() == model.Job.states.DELETED:
        log.debug("Job %s deleted by user before it entered the PBS queue" % job_wrapper.job_id)
        pbs.pbs_disconnect(c)
        if self.app.config.cleanup_job in ("always", "onsuccess"):
            self.cleanup((ofile, efile, ecfile, job_file))
            job_wrapper.cleanup()
        return

    # submit
    # The job tag includes the job and the task identifier
    # (if a TaskWrapper was passed in):
    galaxy_job_id = job_wrapper.get_id_tag()
    log.debug("(%s) submitting file %s" % (galaxy_job_id, job_file))

    tries = 0
    while tries < 5:
        job_id = pbs.pbs_submit(c, job_attrs, job_file, pbs_queue_name, None)
        tries += 1
        if job_id:
            pbs.pbs_disconnect(c)
            break
        errno, text = pbs.error()
        log.warning("(%s) pbs_submit failed (try %d/5), PBS error %d: %s" % (galaxy_job_id, tries, errno, text))
        time.sleep(2)
    else:
        log.error("(%s) All attempts to submit job failed" % galaxy_job_id)
        job_wrapper.fail("Unable to run this job due to a cluster error, please retry it later")
        return

    if pbs_queue_name is None:
        log.debug("(%s) queued in default queue as %s" % (galaxy_job_id, job_id))
    else:
        log.debug("(%s) queued in %s queue as %s" % (galaxy_job_id, pbs_queue_name, job_id))

    # persist destination
    job_wrapper.set_job_destination(job_destination, job_id)

    # Store PBS related state information for job
    job_state = AsynchronousJobState()
    job_state.job_wrapper = job_wrapper
    job_state.job_id = job_id
    job_state.job_file = job_file
    job_state.output_file = ofile
    job_state.error_file = efile
    job_state.exit_code_file = ecfile
    job_state.old_state = 'N'
    job_state.running = False
    job_state.job_destination = job_destination

    # Add to our 'queue' of jobs to monitor
    self.monitor_queue.put(job_state)
def _disconnect(self):
    """Close the PBS/Torque connection"""
    pbs.pbs_disconnect(self.con)
    self.attribs = 'NULL'
def submit(self):
    attropl = pbs.new_attropl(self.attribute_count + 1)
    attropl_idx = 0

    attropl[attropl_idx].name = pbs.ATTR_v
    attropl[attropl_idx].value = self.generate_env()
    attropl_idx += 1

    if self.name:
        attropl[attropl_idx].name = pbs.ATTR_N
        attropl[attropl_idx].value = self.name
        attropl_idx += 1

    if self.walltime:
        attropl[attropl_idx].name = pbs.ATTR_l
        attropl[attropl_idx].resource = 'walltime'
        attropl[attropl_idx].value = self.walltime
        attropl_idx += 1

    if self.nodes:
        attropl[attropl_idx].name = pbs.ATTR_l
        attropl[attropl_idx].resource = 'nodes'
        attropl[attropl_idx].value = self.nodes
        attropl_idx += 1

    if self.stdout_path:
        attropl[attropl_idx].name = pbs.ATTR_o
        attropl[attropl_idx].value = self.stdout_path
        attropl_idx += 1

    if self.stderr_path:
        # route stderr to its own path
        attropl[attropl_idx].name = pbs.ATTR_e
        attropl[attropl_idx].value = self.stderr_path
        attropl_idx += 1

    if self.dependency_list:
        attropl[attropl_idx].name = pbs.ATTR_depend
        attropl[attropl_idx].value = self.dependency_list
        attropl_idx += 1

    if self.mail_options:
        attropl[attropl_idx].name = pbs.ATTR_m
        attropl[attropl_idx].value = self.mail_options
        attropl_idx += 1

    if self.mem:
        attropl[attropl_idx].name = pbs.ATTR_l
        attropl[attropl_idx].resource = 'mem'
        attropl[attropl_idx].value = self.mem
        attropl_idx += 1

    if self.vmem:
        attropl[attropl_idx].name = pbs.ATTR_l
        attropl[attropl_idx].resource = 'vmem'
        attropl[attropl_idx].value = self.vmem
        attropl_idx += 1

    connection = pbs.pbs_connect(pbs.pbs_default())
    self.job_id = pbs.pbs_submit(connection, attropl, self.job_script, None, None)
    pbs.pbs_disconnect(connection)

    e, e_msg = pbs.error()
    # the batch system returned an error, throw exception
    if e:
        message = "%d: %s" % (e, e_msg)
        raise Exception(message)

    return self.job_id