Example #1
    def _process(self, batch_list):
        '''This function executes the changes on the batch server'''

        if ARGS_VERBOSE: 
            _print('class:SaraNodes func:_process input:%s' % str(batch_list), file=sys.stderr)

        ## Always get the pbs_server name, even in dry-run mode
        pbs_server = pbs.pbs_default()
        if not pbs_server:
            _print('Could not locate a pbs server', file=sys.stderr)
            sys.exit(1)

        if ARGS_VERBOSE:
            _print('class:SaraNodes func:_process pbs_server:%s' % pbs_server, file=sys.stderr)

        ## If dry-run is not specified create a connection
        if not ARGS_DRYRUN:
            pbs_connection = pbs.pbs_connect(pbs_server)

        ## Execute the changes
        for node in batch_list:
            if not ARGS_DRYRUN:
                rcode = pbs.pbs_manager(pbs_connection, pbs.MGR_CMD_SET, pbs.MGR_OBJ_NODE, node[0], node[1], 'NULL')
                if rcode > 0:
                    errno, text = pbs.error()
                    _print('PBS error for node \'%s\': %s (%s)' % (node[0], text, errno), file=sys.stderr)
            else:
                _print("pbs.pbs_manager(pbs_connection, pbs.MGR_CMD_SET, pbs.MGR_OBJ_NODE, %s, %s, 'NULL')" % (node[0], str(node[1])))

        ## Close the connection with the batch system
        if not ARGS_DRYRUN:
            pbs.pbs_disconnect(pbs_connection)
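The example above follows the pattern every snippet on this page shares: open a handle with pbs_connect, do the work, then release it with pbs_disconnect. A minimal sketch of that pattern for a node modification, assuming the same pbs_python bindings used throughout these examples (the node name 'node001' and the single 'state' attribute are illustrative, not from the source project):

import sys
import pbs

server = pbs.pbs_default()           # name of the default batch server
conn = pbs.pbs_connect(server)       # handle is > 0 on success
if conn <= 0:
    sys.exit('could not connect to %s' % server)
try:
    attrs = pbs.new_attropl(1)       # one attribute to set
    attrs[0].name = 'state'
    attrs[0].value = 'offline'
    rcode = pbs.pbs_manager(conn, pbs.MGR_CMD_SET, pbs.MGR_OBJ_NODE,
                            'node001', attrs, 'NULL')
    if rcode > 0:
        errno_, text = pbs.error()   # fetch the last server-side error
        print('PBS error: %s (%s)' % (text, errno_), file=sys.stderr)
finally:
    pbs.pbs_disconnect(conn)         # always release the handle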
Example #2
def update_all_nodes(batchserver_name):
    """ Update info about all nodes of the given batchserver.
    """
    server, created = getBatchServer(batchserver_name)
    if batchserver_name not in pbs_data_nodes:
        pbs_data_nodes[batchserver_name] = {'last_update': None, 'nodes': {}}

    max_age = GlobalConfiguration.objects.get(pk=1).max_lastupdate
    last_update = pbs_data_nodes[batchserver_name]['last_update']
    if last_update and (datetime.datetime.now() - last_update).total_seconds() < max_age:
        logging.debug("Nodes info is new enough for server: %s" % batchserver_name)
        print("not updated")
        return pbs_data_nodes

    print("updated")

    conn = pbs.pbs_connect(batchserver_name.encode('iso-8859-1', 'replace'))
    if conn == -1:
        logging.error("Cannot connect to %s - live data will be missing" % server.name)
        return
    statnodes = pbs.pbs_statnode(conn, "", [], "")
    pbs.pbs_disconnect(conn)

    for sn in statnodes:
        node, created = getNode(sn.name, server)
        attr_dict = dict((x.name, x.value) for x in sn.attribs)
        pbs_data_nodes[batchserver_name]['nodes'][node] = update_one_node_from_pbs_data(node, attr_dict)
    pbs_data_nodes[batchserver_name]['last_update'] = datetime.datetime.now()

    return pbs_data_nodes
Example #3
def submit_get_subfamilies_job(job):
    # This is how we are passing the fasta and job id to the script
    server_name = pbs.pbs_default()
    c = pbs.pbs_connect(server_name)
    attropl = pbs.new_attropl(5)

    attropl[0].name  = pbs.ATTR_N
    attropl[0].value = "FAT-CAT Get Sub-Families: %s" % job.id

    attropl[1].name  = pbs.ATTR_l
    attropl[1].resource = 'nodes'
    attropl[1].value = '1:ppn=1'

    attropl[2].name  = pbs.ATTR_o
    attropl[2].value = JOB_LOG_FILE

    attropl[3].name  = pbs.ATTR_e
    attropl[3].value = JOB_LOG_FILE

    attropl[4].name  = pbs.ATTR_v
    attropl[4].value = "job_id=%s" % (job.id)

    job.status_id = 5
    job.save()

    job_id = pbs.pbs_submit(c, attropl, "/clusterfs/ohana/software/fatcat/scripts/get_best_nodes.py", 'web', 'NULL')
    logger.info("Submitting %s to the grid to get best nodes with id %s" % (job.id, job_id))

    if job_id: 
        job.get_best_nodes_pbs_job_id = job_id
        job.save()

    pbs.pbs_disconnect(c)

    return job_id
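Submission examples like the one above build a pbs.new_attropl list, fill in name/resource/value slots, and hand it to pbs_submit together with a script path and queue name. A stripped-down sketch of the same idiom, assuming the same bindings (the script path and the 'batch' queue are placeholders):

import pbs

conn = pbs.pbs_connect(pbs.pbs_default())

attropl = pbs.new_attropl(2)              # room for two attributes
attropl[0].name = pbs.ATTR_N              # job name
attropl[0].value = 'demo-job'
attropl[1].name = pbs.ATTR_l              # resource requests also set .resource
attropl[1].resource = 'walltime'
attropl[1].value = '00:10:00'

# pbs_submit returns the new job id on success and a false value on failure
job_id = pbs.pbs_submit(conn, attropl, '/path/to/script.sh', 'batch', 'NULL')
if not job_id:
    errno_, text = pbs.error()
    print('submit failed: %s (%s)' % (text, errno_))
pbs.pbs_disconnect(conn)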
Example #4
 def check_all_jobs( self ):
     """
     Returns a list of servers that failed to be contacted and a dict
     of "job_id : status" pairs (where status is a bunchified version
      of the API's structure).
     """
     servers = []
     failures = []
     statuses = {}
     for pbs_job_state in self.watched:
         pbs_server_name = self.__get_pbs_server(pbs_job_state.job_destination.params)
         if pbs_server_name not in servers:
             servers.append( pbs_server_name )
         pbs_job_state.check_count += 1
     for pbs_server_name in servers:
         c = pbs.pbs_connect( util.smart_str( pbs_server_name ) )
         if c <= 0:
             log.debug("connection to PBS server %s for state check failed" % pbs_server_name )
             failures.append( pbs_server_name )
             continue
         stat_attrl = pbs.new_attrl(3)
         stat_attrl[0].name = pbs.ATTR_state
         stat_attrl[1].name = pbs.ATTR_used
         stat_attrl[2].name = pbs.ATTR_exitstat
         jobs = pbs.pbs_statjob( c, None, stat_attrl, None )
         pbs.pbs_disconnect( c )
         statuses.update( self.convert_statjob_to_bunches( jobs ) )
     return( ( failures, statuses ) )
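Status queries use pbs.new_attrl (plain attribute lists, no values) to tell the server which fields to return. A hedged sketch of the same idiom outside a class, assuming, as the example above does, that a None job id makes pbs_statjob return all jobs:

import pbs

conn = pbs.pbs_connect(pbs.pbs_default())
attrl = pbs.new_attrl(1)                          # ask for a single attribute
attrl[0].name = pbs.ATTR_state
jobs = pbs.pbs_statjob(conn, None, attrl, None)   # None job id = all jobs
pbs.pbs_disconnect(conn)
for job in jobs:
    print(job.name, [(a.name, a.value) for a in job.attribs])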
Example #5
 def check_all_jobs(self):
     """
     Returns a list of servers that failed to be contacted and a dict
     of "job_id : status" pairs (where status is a bunchified version
      of the API's structure).
     """
     servers = []
     failures = []
     statuses = {}
     for pbs_job_state in self.watched:
         pbs_server_name = self.__get_pbs_server(
             pbs_job_state.job_destination.params)
         if pbs_server_name not in servers:
             servers.append(pbs_server_name)
         pbs_job_state.check_count += 1
     for pbs_server_name in servers:
         c = pbs.pbs_connect(util.smart_str(pbs_server_name))
         if c <= 0:
             log.debug(
                 "connection to PBS server %s for state check failed" %
                 pbs_server_name)
             failures.append(pbs_server_name)
             continue
         stat_attrl = pbs.new_attrl(3)
         stat_attrl[0].name = pbs.ATTR_state
         stat_attrl[1].name = pbs.ATTR_used
         stat_attrl[2].name = pbs.ATTR_exitstat
         jobs = pbs.pbs_statjob(c, None, stat_attrl, None)
         pbs.pbs_disconnect(c)
         statuses.update(self.convert_statjob_to_bunches(jobs))
     return ((failures, statuses))
Example #6
    def submit_with_retry(pbs_attrs, script_path, queue, pbs_server=None):
        # connect to pbs server
        connection = _connect_to_server(pbs_server)

        # submit job
        retry = 0
        job_id = pbs.pbs_submit(connection, pbs_attrs, script_path, queue,
                                None)

        # if pbs.pbs_submit failed, try again
        while not job_id and retry < _MAX_RETRY:
            retry += 1
            print("Retrying connection...", file=sys.stderr)
            time.sleep(retry**2)
            job_id = pbs.pbs_submit(connection, pbs_attrs, script_path, queue,
                                    None)

        pbs.pbs_disconnect(connection)

        #check to see if the job was submitted successfully.
        if not job_id:
            e, e_msg = pbs.error()
            # the batch system returned an error, throw exception
            raise Exception("Error submitting job.  "
                            "Torque error {0}: '{1}'".format(
                                e, torque_strerror(e)))

        return job_id
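The retry loop above sleeps retry**2 seconds between attempts, a simple quadratic backoff. The same idea as a small helper, a sketch under the assumption that a falsy job id always means a retryable failure:

import time
import pbs

_MAX_RETRY = 3   # same cap the example uses

def submit_with_backoff(conn, attrs, script, queue):
    """Retry pbs_submit with quadratic backoff; return the job id or None."""
    job_id = pbs.pbs_submit(conn, attrs, script, queue, None)
    retry = 0
    while not job_id and retry < _MAX_RETRY:
        retry += 1
        time.sleep(retry ** 2)           # 1s, 4s, 9s, ...
        job_id = pbs.pbs_submit(conn, attrs, script, queue, None)
    return job_id or None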
Example #7
def disconnect_from_server(conn):
    """Disconnect a given connection."""
    if pbs_import_failed:
        _log.error(pbs_import_failed)
        return None

    pbs.pbs_disconnect(conn)
Example #8
def disconnect_from_server(conn):
    """Disconnect a given connection."""
    if pbs_import_failed:
        _log.error(pbs_import_failed)
        return None

    pbs.pbs_disconnect(conn)
Example #9
    def stop_job(self, job):
        """Attempts to delete a job from the PBS queue"""
        job_id = job.get_job_runner_external_id().encode('utf-8')
        job_tag = "(%s/%s)" % (job.get_id_tag(), job_id)
        log.debug("%s Stopping PBS job" % job_tag)

        # Declare the connection handle c so that it can be cleaned up:
        c = None

        try:
            pbs_server_name = self.__get_pbs_server(job.destination_params)
            if pbs_server_name is None:
                log.debug(
                    "(%s) Job queued but no destination stored in job params, cannot delete"
                    % job_tag)
                return
            c = pbs.pbs_connect(util.smart_str(pbs_server_name))
            if c <= 0:
                log.debug(
                    "(%s) Connection to PBS server for job delete failed" %
                    job_tag)
                return
            pbs.pbs_deljob(c, job_id, '')
            log.debug("%s Removed from PBS queue before job completion" %
                      job_tag)
        except Exception:
            e = traceback.format_exc()
            log.debug("%s Unable to stop job: %s" % (job_tag, e))
        finally:
            # Cleanup: disconnect from the server.
            if c is not None:
                pbs.pbs_disconnect(c)
Example #10
def queues_page():
    conn = pbs.pbs_connect(pbsserver)
    queues = get_queues(conn)
    pbs.pbs_disconnect(conn)
    queues = queue_attributes_reformat(queues)
    now = datetime.datetime.now().strftime('%Y.%m.%d at %I:%M:%S %p')
    return {'now': now, 'queues': queues}
Example #11
def jobs_page():
    conn = pbs.pbs_connect(pbsserver)
    jobs = get_jobs(conn)
    pbs.pbs_disconnect(conn)
    jobs = job_attributes_reformat(jobs)
    now = datetime.datetime.now().strftime('%Y.%m.%d at %I:%M:%S %p')
    return {'now': now, 'jobs': jobs}
Example #12
    def stop_job( self, job ):
        """Attempts to delete a job from the PBS queue"""
        job_id = job.get_job_runner_external_id().encode('utf-8')
        job_tag = "(%s/%s)" % ( job.get_id_tag(), job_id )
        log.debug( "%s Stopping PBS job" % job_tag )

        # Declare the connection handle c so that it can be cleaned up:
        c = None

        try:
            pbs_server_name = self.__get_pbs_server( job.destination_params )
            if pbs_server_name is None:
                log.debug("(%s) Job queued but no destination stored in job params, cannot delete"
                          % job_tag )
                return
            c = pbs.pbs_connect( util.smart_str( pbs_server_name ) )
            if c <= 0:
                log.debug("(%s) Connection to PBS server for job delete failed"
                          % job_tag )
                return
            pbs.pbs_deljob( c, job_id, '' )
            log.debug( "%s Removed from PBS queue before job completion"
                       % job_tag )
        except Exception:
            e = traceback.format_exc()
            log.debug( "%s Unable to stop job: %s" % ( job_tag, e ) )
        finally:
            # Cleanup: disconnect from the server.
            if c is not None:
                pbs.pbs_disconnect( c )
Example #13
def update_all_queues(batchserver_name):
    """ Update info about all queues for give batchserver.
    """
    server, created = getBatchServer(batchserver_name)
    max_age = GlobalConfiguration.objects.get(pk=1).max_lastupdate
    if server.queues_lastupdate and (datetime.datetime.now() - server.queues_lastupdate).total_seconds() < max_age:
        logging.debug("Queue info is new enough for server: %s" % batchserver_name)
        return

    conn = pbs.pbs_connect(batchserver_name.encode('iso-8859-1', 'replace'))
    if conn == -1:
        logging.error("Cannot connect to %s - live data will be missing" % server.name)
        return
    statqueues = pbs.pbs_statque(conn, "", [], "")
    pbs.pbs_disconnect(conn)

    for sq in statqueues:
        queue, created = getQueue(sq.name, server)
        attr_dict = dict((x.name, x.value) for x in sq.attribs)
        update_one_queue_from_pbs_data(queue, attr_dict)
        queue.save()
    server.queues_lastupdate = datetime.datetime.now()
    server.save()
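Both queue examples flatten the attribs list with dict((x.name, x.value) for x in ...). Job attributes need a little more care because resource requests share a name (for example several entries named Resource_List with different .resource fields); the next example qualifies the key with the resource. A sketch of that as a reusable helper (the helper name is ours, not from the source projects):

def attribs_to_dict(attribs):
    """Flatten a pbs_stat* attribs list, qualifying resource entries."""
    out = {}
    for a in attribs:
        if getattr(a, 'resource', None):
            out['%s_%s' % (a.name, a.resource)] = a.value   # e.g. Resource_List_walltime
        else:
            out[a.name] = a.value
    return out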
Example #14
def update_all_jobs(batchserver_name):
    """ Update info about all jobs of the given batchserver.
    """
    server, created = getBatchServer(batchserver_name)
    if batchserver_name not in pbs_data_jobs:
        pbs_data_jobs[batchserver_name] = {'last_update': None, 'jobs': {}}

    max_age = GlobalConfiguration.objects.get(pk=1).max_lastupdate
    last_update = pbs_data_jobs[batchserver_name]['last_update']
    if last_update and (datetime.datetime.now() - last_update).total_seconds() < max_age:
        logging.debug("jobs info is new enough for server: %s" % batchserver_name)
        print("not updated")
        return pbs_data_jobs
    print("updated")

    conn = pbs.pbs_connect(batchserver_name.encode('iso-8859-1', 'replace'))
    if conn == -1:
        logging.error("Cannot connect to %s - live data will be missing" % server.name)
        return
    statjobs = pbs.pbs_statjob(conn, "", [], "")
    pbs.pbs_disconnect(conn)

    for sj in statjobs:
        jobid = sj.name
        attr_dict = {}
        for x in sj.attribs:
            if x.resource:
                attr_dict[x.name+"_"+x.resource] = x.value
            else:
                attr_dict[x.name] = x.value

        pbs_data_jobs[batchserver_name]['jobs'][jobid] = update_one_job_from_pbs_data(jobid, attr_dict)
    pbs_data_jobs[batchserver_name]['last_update'] = datetime.datetime.now()

    return pbs_data_jobs
Example #15
def nodes_page():
    conn = pbs.pbs_connect(pbsserver)
    nodes = get_nodes(conn)
    pbs.pbs_disconnect(conn)
    nodes = node_attributes_reformat(nodes)
    node_totals = get_node_totals(nodes)
    now = datetime.datetime.now().strftime('%Y.%m.%d at %I:%M:%S %p')
    return {'now': now, 'nodes': nodes, 'node_totals': node_totals}
Example #16
 def release_job(self, job_id):
     """
     Release a user hold on a job
     :param job_id: job to release
     """
     connection = _connect_to_server(self.pbs_server)
      rval = pbs.pbs_rlsjob(connection, job_id, 'u', '')
     pbs.pbs_disconnect(connection)
     return rval
Example #17
    def delete_job(self, job_id):
        """
           Sends job delete request to pbs_server for job

           :param job_id: job id to delete
           :return:  pbs_deljob return value (0 on success)
        """
        connection = _connect_to_server(self.pbs_server)
        rval = pbs.pbs_deljob(connection, job_id, '')
        pbs.pbs_disconnect(connection)
        return rval
Example #18
 def release_all(self):
     """
          Release all jobs in the self.held_jobs list, reusing a single connection.
     """
     # copy the list of held jobs to iterate over because release_job mutates
     # self.held_jobs
     jobs = list(self.held_jobs)
     connection = _connect_to_server(self._server)
      for job_id in jobs:
          self.release_job(job_id, connection)
     pbs.pbs_disconnect(connection)
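release_all above opens one connection and reuses it for every release, which avoids per-job connect/disconnect overhead. A sketch of the underlying call, assuming the pbs_python signature shown in the release_job examples ('u' names the user hold; the job ids are hypothetical):

import pbs

conn = pbs.pbs_connect(pbs.pbs_default())
for job_id in ('123.server', '124.server'):        # hypothetical ids
    rval = pbs.pbs_rlsjob(conn, job_id, 'u', '')   # 0 means released
    if rval != 0:
        print('failed to release %s' % job_id)
pbs.pbs_disconnect(conn)                           # one disconnect for the batch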
Example #19
def submit_fxn_site_prediction_job(job):
    # This is how we are passing the fasta and job id to the script
    server_name = pbs.pbs_default()
    c = pbs.pbs_connect(server_name)

    print(server_name)
    print(c)

    attropl = pbs.new_attropl(7)

    attropl[0].name = pbs.ATTR_N
    attropl[0].value = "Functional Site Prediction Job: %s" % job.id

    attropl[1].name = pbs.ATTR_l
    attropl[1].resource = 'nodes'
    attropl[1].value = '1:ppn=1'

    attropl[2].name = pbs.ATTR_o
    attropl[2].value = JOB_LOG_FILE

    attropl[3].name = pbs.ATTR_e
    attropl[3].value = JOB_LOG_FILE

    attropl[4].name = pbs.ATTR_v
    attropl[4].value = "job_id=%s" % (job.id)

    attropl[5].name = pbs.ATTR_r
    attropl[5].value = 'y'

    attropl[6].name = pbs.ATTR_l
    attropl[6].resource = 'walltime'
    attropl[6].value = '1000'

    job.status_id = 2
    job.save()

    job_id = pbs.pbs_submit(
        c, attropl,
        "/home/cyrus_afrasiabi/ohana_repository/bpg/fxn_site_prediction.py",
        'web', 'NULL')
    logger.info(
        "Submitting %s to the grid to get functional site predictions with id %s"
        % (job.id, job_id))

    if job_id:
        job.pbs_job_id = job_id
        job.save()

    pbs.pbs_disconnect(c)

    return job_id
Example #20
    def info(self, types=None):
        """
        Return jobinfo
        """
        if not self.jobid:
            self.log.debug("no jobid, job is not submitted yet?")
            return None

        # convert single type into list
        if isinstance(types, str):
            types = [types]

        self.log.debug("Return info types %s" % types)

        # create attribute list to query pbs with
        if types is None:
            jobattr = NULL
        else:
            jobattr = pbs.new_attrl(len(types))
            for idx, attr in enumerate(types):
                jobattr[idx].name = attr

        # get a new connection (otherwise this seems to fail)
        if self.clean_conn:
            pbs.pbs_disconnect(self.pbsconn)
            self.pbsconn = pbs.pbs_connect(self.pbs_server)
        jobs = pbs.pbs_statjob(self.pbsconn, self.jobid, jobattr, NULL)
        if len(jobs) == 0:
            # no job found, return None info
            res = None
            self.log.debug(
                "No job found. Wrong id %s or job finished? Returning %s" %
                (self.jobid, res))
            return res
        elif len(jobs) == 1:
            self.log.debug("Request for jobid %s returned one result %s" %
                           (self.jobid, jobs))
        else:
            self.log.error(
                "Request for jobid %s returned more then one result %s" %
                (self.jobid, jobs))

        # only expect to have a list with one element
        j = jobs[0]
        # convert attribs into usable dict
        job_details = dict([(attrib.name, attrib.value)
                            for attrib in j.attribs])
        # manually set 'id' attribute
        job_details['id'] = j.name
        self.log.debug("Found jobinfo %s" % job_details)
        return job_details
Example #21
 def check_single_job( self, pbs_server_name, job_id ):
     """
     Returns the state of a single job, used to make sure a job is
     really dead.
     """
     c = pbs.pbs_connect( util.smart_str( pbs_server_name ) )
     if c <= 0:
         log.debug("connection to PBS server %s for state check failed" % pbs_server_name )
         return None
     stat_attrl = pbs.new_attrl(1)
     stat_attrl[0].name = pbs.ATTR_state
     jobs = pbs.pbs_statjob( c, job_id, stat_attrl, None )
     pbs.pbs_disconnect( c )
      return jobs[0].attribs[0].value if jobs else None
Example #22
    def delete_all_jobs(self, ids):
        """

        delete all jobs in a list of jobs

        :param ids: list of all jobs
        :return: zero on success, otherwise return value of failed pbs_deljob
        """
        for job_id in ids:
            connection = _connect_to_server(self.pbs_server)
            rval = pbs.pbs_deljob(connection, job_id, '')
            pbs.pbs_disconnect(connection)
            if rval and rval not in (self.E_UNKNOWN, self.E_STATE):
                return rval
        return 0
Example #23
def submit_intrepid_job(job):
    # This is how we are passing the fasta and job id to the script
    server_name = pbs.pbs_default()
    c = pbs.pbs_connect(server_name)
    attropl = pbs.new_attropl(6)

    attropl[0].name = pbs.ATTR_N
    attropl[0].value = "INTREPID Job: %s" % job.id

    attropl[1].name = pbs.ATTR_l
    attropl[1].resource = 'nodes'
    attropl[1].value = '1:ppn=8'

    attropl[2].name = pbs.ATTR_o
    attropl[2].value = JOB_LOG_FILE

    attropl[3].name = pbs.ATTR_e
    attropl[3].value = JOB_LOG_FILE

    attropl[4].name = pbs.ATTR_v
    attropl[4].value = "job_id=%s" % (job.id)

    attropl[5].name = pbs.ATTR_l
    attropl[5].resource = 'walltime'
    attropl[5].value = '48:00:00'

    if job.development_job:
        job_id = pbs.pbs_submit(
            c, attropl,
            "/clusterfs/ohana/software/intrepid/scripts/intrepid_development_pipeline.py",
            'web', 'NULL')
    else:
        job_id = pbs.pbs_submit(
            c, attropl,
            "/clusterfs/ohana/software/intrepid/scripts/intrepid_pipeline.py",
            'web', 'NULL')
    logger.info("Submitting %s to the grid with id %s" % (job.id, job_id))

    if job_id:
        job.pbs_job_id = job_id
        job.status_id = JOB_SUBMITTED
        job.save()

    pbs.pbs_disconnect(c)

    return job_id
Example #24
    def info(self, types=None):
        """
        Return jobinfo
        """
        if not self.jobid:
            self.log.debug("no jobid, job is not submitted yet?")
            return None

        # convert single type into list
        if isinstance(types, str):
            types = [types]

        self.log.debug("Return info types %s" % types)

        # create attribute list to query pbs with
        if types is None:
            jobattr = NULL
        else:
            jobattr = pbs.new_attrl(len(types))
            for idx, attr in enumerate(types):
                jobattr[idx].name = attr


        # get a new connection (otherwise this seems to fail)
        if self.clean_conn:
            pbs.pbs_disconnect(self.pbsconn)
            self.pbsconn = pbs.pbs_connect(self.pbs_server)
        jobs = pbs.pbs_statjob(self.pbsconn, self.jobid, jobattr, NULL)
        if len(jobs) == 0:
            # no job found, return None info
            res = None
            self.log.debug("No job found. Wrong id %s or job finished? Returning %s" % (self.jobid, res))
            return res
        elif len(jobs) == 1:
            self.log.debug("Request for jobid %s returned one result %s" % (self.jobid, jobs))
        else:
            self.log.error("Request for jobid %s returned more then one result %s" % (self.jobid, jobs))

        # only expect to have a list with one element
        j = jobs[0]
        # convert attribs into usable dict
        job_details = dict([ (attrib.name, attrib.value) for attrib in j.attribs ])
        # manually set 'id' attribute
        job_details['id'] = j.name
        self.log.debug("Found jobinfo %s" % job_details)
        return job_details
Example #25
    def release_all(self):
        """
            Release all jobs in the self.held_jobs list, reusing a single connection.
        """

        # if we are 'faking' pipeline submission with civet_run -n, then there
        # is nothing to do
        if not self.submit:
            return

        # copy the list of held jobs to iterate over because release_job mutates
        # self.held_jobs
        jobs = list(self.held_jobs)
        connection = _connect_to_server(self._server)
        for job_id in jobs:
            self.release_job(job_id, connection)
        pbs.pbs_disconnect(connection)
Example #26
def update_one_queue(queue):
    """ Update live info about the given queue 
    """
    conn = pbs.pbs_connect(queue.server.name.encode('iso-8859-1', 'replace'))
    if conn == -1:
        logging.error("Cannot connect to %s - live data will be missing" % queue.server.name)
        return
    statqueues = pbs.pbs_statque(conn, queue.name.encode('iso-8859-1', 'replace'), [], "")
    pbs.pbs_disconnect(conn)
    if len(statqueues) == 0:
        logging.error("pbs_statque failed for queue: %s" % queue.name)
        return
    if len(statqueues) > 1:
        logging.warning("pbs_statque returned more than one record for queue: %s" % queue.name)

    attr_dict = dict((x.name, x.value) for x in statqueues[0].attribs)
    update_one_queue_from_pbs_data(queue, attr_dict)
    queue.save()
Example #27
    def release_job(self, job_id, connection=None):
        """
            Release a user hold from a held batch job.

            :param job_id: job id to release (short form not allowed)
            :param connection: optional connection to a pbs_server; if not
                  passed, release_job will establish a new connection
        """
        c = connection if connection else _connect_to_server(self._server)

        rval = pbs.pbs_rlsjob(c, job_id, 'u', '')

        if not connection:
            pbs.pbs_disconnect(c)

        if rval == 0:
            self.held_jobs.remove(job_id)
        return rval
Example #28
    def _process(self, batch_list):
        '''This function executes the changes on the batch server'''

        if ARGS_VERBOSE:
            _print('class:SaraNodes func:_process input:%s' % str(batch_list),
                   file=sys.stderr)

        ## Always get the pbs_server name, even in dry-run mode
        pbs_server = pbs.pbs_default()
        if not pbs_server:
            _print('Could not locate a pbs server', file=sys.stderr)
            sys.exit(1)

        if ARGS_VERBOSE:
            _print('class:SaraNodes func:_process pbs_server:%s' % pbs_server,
                   file=sys.stderr)

        ## If dry-run is not specified create a connection
        if not ARGS_DRYRUN:
            pbs_connection = pbs.pbs_connect(pbs_server)

        ## Execute the changes
        for node in batch_list:
            if not ARGS_DRYRUN:
                rcode = pbs.pbs_manager(pbs_connection, pbs.MGR_CMD_SET,
                                        pbs.MGR_OBJ_NODE, node[0], node[1],
                                        'NULL')
                if rcode > 0:
                    errno, text = pbs.error()
                    _print('PBS error for node \'%s\': %s (%s)' %
                           (node[0], text, errno),
                           file=sys.stderr)
            else:
                _print(
                    "pbs.pbs_manager(pbs_connection, pbs.MGR_CMD_SET, pbs.MGR_OBJ_NODE, %s, %s, 'NULL')"
                    % (node[0], str(node[1])))

        ## Close the connection with the batch system
        if not ARGS_DRYRUN:
            pbs.pbs_disconnect(pbs_connection)
Example #29
 def cleanup(self):
     """Cleanup: disconnect from server."""
     if self.clean_conn:
         self.log.debug("Disconnecting from server.")
         pbs.pbs_disconnect(self.pbsconn)
Example #30
 def end(self):
     if self._connection_id:
       pbs.pbs_disconnect(self._connection_id)
     self._connection_id = None
Example #31
 def disconnect_from_server(self):
     """Disconnect current connection."""
     pbs.pbs_disconnect(self.conn)
     self.conn = None
Example #32
 def disconnect_from_server(self):
     """Disconnect current connection."""
     pbs.pbs_disconnect(self.conn)
     self.conn = None
Example #33
 def pbs_disconn(self, conn):
     pbs.pbs_disconnect(conn)
Example #34
    def queue_job(self, batch_job):
        """
          queue a BatchJob.
          
          :param batch_job: description of the job to queue
        """

        # batch job names should be unique for civet pipelines because the
        # job name is used to name log files and other output
        # Civet generates unique names for each step, so this is just checking
        # for a programming error
        assert batch_job.name not in self._job_names

        if self.execution_log_dir:
            log_dir = self.execution_log_dir
        else:
            log_dir = self.log_dir

        # set batch_job.stderr_path and batch_job.stdout_path if they aren't already set
        if not batch_job.stdout_path:
            batch_job.stdout_path = os.path.join(log_dir,
                                                 batch_job.name + ".o")
        if not batch_job.stderr_path:
            batch_job.stderr_path = os.path.join(log_dir,
                                                 batch_job.name + ".e")

        # write batch script
        filename = self.write_script(batch_job)

        if self.submit:
            # build up our torque job attributes and resources
            job_attributes = {}
            job_resources = {
                'nodes': "{0}:ppn={1}".format(batch_job.nodes, batch_job.ppn),
                'walltime': batch_job.walltime,
                'epilogue': self.epilogue_filename
            }

            if batch_job.mem:
                job_resources['mem'] = batch_job.mem

            job_attributes[pbs.ATTR_v] = self.generate_env(batch_job.workdir)

            if batch_job.name:
                job_attributes[pbs.ATTR_N] = batch_job.name

            job_attributes[pbs.ATTR_o] = batch_job.stdout_path

            #XXX workaround for a TORQUE bug where local copies of stderr &
            # stdout files to /dev/null don't work correctly but remote
            # copies (to submit host) do
            if job_attributes[pbs.ATTR_o] == "/dev/null":
                job_attributes[
                    pbs.ATTR_o] = socket.gethostname() + ":/dev/null"

            job_attributes[pbs.ATTR_e] = batch_job.stderr_path

            #XXX workaround for a TORQUE bug where local copies of stderr &
            # stdout files to /dev/null don't work correctly but remote
            # copies (to submit host) do
            if job_attributes[pbs.ATTR_e] == "/dev/null":
                job_attributes[
                    pbs.ATTR_e] = socket.gethostname() + ":/dev/null"

            if batch_job.depends_on:
                job_attributes[pbs.ATTR_depend] = self._dependency_string(
                    batch_job)
            elif self.submit_with_hold:
                job_attributes[pbs.ATTR_h] = 'u'

            if batch_job.mail_option:
                job_attributes[pbs.ATTR_m] = batch_job.mail_option

            if batch_job.email_list:
                job_attributes[pbs.ATTR_M] = batch_job.email_list

            if batch_job.date_time:
                job_attributes[pbs.ATTR_a] = str(
                    int(time.mktime(batch_job.date_time.timetuple())))

            pbs_attrs = pbs.new_attropl(
                len(job_attributes) + len(job_resources))

            # populate pbs_attrs
            attr_idx = 0
            for resource, val in job_resources.items():
                pbs_attrs[attr_idx].name = pbs.ATTR_l
                pbs_attrs[attr_idx].resource = resource
                pbs_attrs[attr_idx].value = val
                attr_idx += 1

            for attribute, val in job_attributes.items():
                pbs_attrs[attr_idx].name = attribute
                pbs_attrs[attr_idx].value = val
                attr_idx += 1

            # we've initialized pbs_attrs with all the attributes we need to set
            # now we can connect to the server and submit the job
            connection = _connect_to_server(self._server)

            # connected to pbs_server

            # submit job
            retry = 0
            job_id = pbs.pbs_submit(connection, pbs_attrs, filename,
                                    self.queue, None)

            # if pbs.pbs_submit failed, try again
            while not job_id and retry < _MAX_RETRY:
                retry += 1
                print("Retrying connection...", file=sys.stderr)
                time.sleep(retry**2)
                job_id = pbs.pbs_submit(connection, pbs_attrs, filename,
                                        self.queue, None)

            pbs.pbs_disconnect(connection)

            # check to see if the job was submitted successfully.
            if not job_id:
                e, e_msg = pbs.error()
                # the batch system returned an error, throw exception
                raise Exception("Error submitting job.  "
                                "Torque error {0}: '{1}'".format(
                                    e, torque_strerror(e)))

            if self.submit_with_hold and not batch_job.depends_on:
                self.held_jobs.append(job_id)

        else:
            #self.submit is False, fake a job ID
            job_id = "{0}.civet".format(self._id_seq)
            self._id_seq += 1

        self._job_names.append(batch_job.name)

        self._id_log.write(
            job_id + '\t' + batch_job.name + '\t' +
            str(self._printable_dependencies(batch_job.depends_on)) + '\n')
        self._id_log.flush()
        return job_id
Example #35
def disconnect(ID):
	return pbs.pbs_disconnect(ID)
Example #36
 def cleanup(self):
     """Cleanup: disconnect from server."""
     if self.clean_conn:
         self.log.debug("Disconnecting from server.")
         pbs.pbs_disconnect(self.pbsconn)
Example #37
def disconnect_from_server(conn):
    """Disconnect a given connection."""
    if pbs_import_failed:
        raise EasyBuildError(pbs_import_failed)

    pbs.pbs_disconnect(conn)
Example #38
 def disconnect(self):
     if hasattr(self, 'c'):
         pbs.pbs_disconnect(self.c)
Example #39
    def queue_job( self, job_wrapper ):
        """Create PBS script for a job and submit it to the PBS queue"""
        # prepare the job
        if not self.prepare_job( job_wrapper, include_metadata=not( self.app.config.pbs_stage_path ) ):
            return

        job_destination = job_wrapper.job_destination

        # Determine the job's PBS destination (server/queue) and options from the job destination definition
        pbs_queue_name = None
        pbs_server_name = self.default_pbs_server
        pbs_options = []
        if '-q' in job_destination.params and 'destination' not in job_destination.params:
            job_destination.params['destination'] = job_destination.params.pop('-q')
        if 'destination' in job_destination.params:
            if '@' in job_destination.params['destination']:
                # Destination includes a server
                pbs_queue_name, pbs_server_name = job_destination.params['destination'].split('@')
                if pbs_queue_name == '':
                    # e.g. `qsub -q @server`
                    pbs_queue_name = None
            else:
                # Destination is just a queue
                pbs_queue_name = job_destination.params['destination']
            job_destination.params.pop('destination')

        # Parse PBS params
        pbs_options = self.parse_destination_params(job_destination.params)

        # Explicitly set the determined PBS destination in the persisted job destination for recovery
        job_destination.params['destination'] = '%s@%s' % (pbs_queue_name or '', pbs_server_name)

        c = pbs.pbs_connect( util.smart_str( pbs_server_name ) )
        if c <= 0:
            errno, text = pbs.error()
            job_wrapper.fail( "Unable to queue job for execution.  Resubmitting the job may succeed." )
            log.error( "Connection to PBS server for submit failed: %s: %s" % ( errno, text ) )
            return

        # define job attributes
        ofile = "%s/%s.o" % (self.app.config.cluster_files_directory, job_wrapper.job_id)
        efile = "%s/%s.e" % (self.app.config.cluster_files_directory, job_wrapper.job_id)
        ecfile = "%s/%s.ec" % (self.app.config.cluster_files_directory, job_wrapper.job_id)

        output_fnames = job_wrapper.get_output_fnames()

        # If an application server is set, we're staging
        if self.app.config.pbs_application_server:
            pbs_ofile = self.app.config.pbs_application_server + ':' + ofile
            pbs_efile = self.app.config.pbs_application_server + ':' + efile
            output_files = [ str( o ) for o in output_fnames ]
            output_files.append(ecfile)
            stagein = self.get_stage_in_out( job_wrapper.get_input_fnames() + output_files, symlink=True )
            stageout = self.get_stage_in_out( output_files )
            attrs = [
                dict( name=pbs.ATTR_o, value=pbs_ofile ),
                dict( name=pbs.ATTR_e, value=pbs_efile ),
                dict( name=pbs.ATTR_stagein, value=stagein ),
                dict( name=pbs.ATTR_stageout, value=stageout ),
            ]
        # If not, we're using NFS
        else:
            attrs = [
                dict( name=pbs.ATTR_o, value=ofile ),
                dict( name=pbs.ATTR_e, value=efile ),
            ]

        # define PBS job options
        attrs.append( dict( name=pbs.ATTR_N, value=str( "%s_%s_%s" % ( job_wrapper.job_id, job_wrapper.tool.id, job_wrapper.user ) ) ) )
        job_attrs = pbs.new_attropl( len( attrs ) + len( pbs_options ) )
        for i, attr in enumerate( attrs + pbs_options ):
            job_attrs[i].name = attr['name']
            job_attrs[i].value = attr['value']
            if 'resource' in attr:
                job_attrs[i].resource = attr['resource']
        exec_dir = os.path.abspath( job_wrapper.working_directory )

        # write the job script
        if self.app.config.pbs_stage_path != '':
            # touch the ecfile so that it gets staged
            with open(ecfile, 'a'):
                os.utime(ecfile, None)

            stage_commands = pbs_symlink_template % (
                " ".join( job_wrapper.get_input_fnames() + output_files ),
                self.app.config.pbs_stage_path,
                exec_dir,
            )
        else:
            stage_commands = ''

        env_setup_commands = [ stage_commands ]
        script = self.get_job_file(job_wrapper, exit_code_path=ecfile, env_setup_commands=env_setup_commands)
        job_file = "%s/%s.sh" % (self.app.config.cluster_files_directory, job_wrapper.job_id)
        self.write_executable_script( job_file, script )
        # job was deleted while we were preparing it
        if job_wrapper.get_state() == model.Job.states.DELETED:
            log.debug( "Job %s deleted by user before it entered the PBS queue" % job_wrapper.job_id )
            pbs.pbs_disconnect(c)
            if job_wrapper.cleanup_job in ( "always", "onsuccess" ):
                self.cleanup( ( ofile, efile, ecfile, job_file ) )
                job_wrapper.cleanup()
            return

        # submit
        # The job tag includes the job and the task identifier
        # (if a TaskWrapper was passed in):
        galaxy_job_id = job_wrapper.get_id_tag()
        log.debug("(%s) submitting file %s" % ( galaxy_job_id, job_file ) )

        tries = 0
        while tries < 5:
            job_id = pbs.pbs_submit(c, job_attrs, job_file, pbs_queue_name, None)
            tries += 1
            if job_id:
                pbs.pbs_disconnect(c)
                break
            errno, text = pbs.error()
            log.warning( "(%s) pbs_submit failed (try %d/5), PBS error %d: %s" % (galaxy_job_id, tries, errno, text) )
            time.sleep(2)
        else:
            log.error( "(%s) All attempts to submit job failed" % galaxy_job_id )
            job_wrapper.fail( "Unable to run this job due to a cluster error, please retry it later" )
            return

        if pbs_queue_name is None:
            log.debug("(%s) queued in default queue as %s" % (galaxy_job_id, job_id) )
        else:
            log.debug("(%s) queued in %s queue as %s" % (galaxy_job_id, pbs_queue_name, job_id) )

        # persist destination
        job_wrapper.set_job_destination( job_destination, job_id )

        # Store PBS related state information for job
        job_state = AsynchronousJobState()
        job_state.job_wrapper = job_wrapper
        job_state.job_id = job_id
        job_state.job_file = job_file
        job_state.output_file = ofile
        job_state.error_file = efile
        job_state.exit_code_file = ecfile
        job_state.old_state = 'N'
        job_state.running = False
        job_state.job_destination = job_destination

        # Add to our 'queue' of jobs to monitor
        self.monitor_queue.put( job_state )
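The submit loop above leans on Python's while/else: the else branch runs only when the loop ends without a break, which is how it tells "submitted (break)" apart from "five failures". A tiny self-contained illustration of just that control flow:

attempts = []

tries = 0
while tries < 5:
    tries += 1
    attempts.append(tries)
    if tries == 3:          # pretend the third try succeeds
        break
else:
    print('all attempts failed')   # skipped here, because we broke out

print(attempts)   # [1, 2, 3]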
Example #40
def disconnect_from_server(conn):
    """Disconnect a given connection."""
    if pbs_import_failed:
        raise EasyBuildError(pbs_import_failed)

    pbs.pbs_disconnect(conn)
Example #41
    def queue_job(self, job_wrapper):
        """Create PBS script for a job and submit it to the PBS queue"""
        # prepare the job
        if not self.prepare_job(
                job_wrapper,
                include_metadata=not (self.app.config.pbs_stage_path)):
            return

        job_destination = job_wrapper.job_destination

        # Determine the job's PBS destination (server/queue) and options from the job destination definition
        pbs_queue_name = None
        pbs_server_name = self.default_pbs_server
        pbs_options = []
        if '-q' in job_destination.params and 'destination' not in job_destination.params:
            job_destination.params['destination'] = job_destination.params.pop(
                '-q')
        if 'destination' in job_destination.params:
            if '@' in job_destination.params['destination']:
                # Destination includes a server
                pbs_queue_name, pbs_server_name = job_destination.params[
                    'destination'].split('@')
                if pbs_queue_name == '':
                    # e.g. `qsub -q @server`
                    pbs_queue_name = None
            else:
                # Destination is just a queue
                pbs_queue_name = job_destination.params['destination']
            job_destination.params.pop('destination')

        # Parse PBS params
        pbs_options = self.parse_destination_params(job_destination.params)

        # Explicitly set the determined PBS destination in the persisted job destination for recovery
        job_destination.params['destination'] = '%s@%s' % (pbs_queue_name or
                                                           '', pbs_server_name)

        c = pbs.pbs_connect(util.smart_str(pbs_server_name))
        if c <= 0:
            errno, text = pbs.error()
            job_wrapper.fail(
                "Unable to queue job for execution.  Resubmitting the job may succeed."
            )
            log.error("Connection to PBS server for submit failed: %s: %s" %
                      (errno, text))
            return

        # define job attributes
        ofile = "%s/%s.o" % (self.app.config.cluster_files_directory,
                             job_wrapper.job_id)
        efile = "%s/%s.e" % (self.app.config.cluster_files_directory,
                             job_wrapper.job_id)
        ecfile = "%s/%s.ec" % (self.app.config.cluster_files_directory,
                               job_wrapper.job_id)

        output_fnames = job_wrapper.get_output_fnames()

        # If an application server is set, we're staging
        if self.app.config.pbs_application_server:
            pbs_ofile = self.app.config.pbs_application_server + ':' + ofile
            pbs_efile = self.app.config.pbs_application_server + ':' + efile
            output_files = [str(o) for o in output_fnames]
            output_files.append(ecfile)
            stagein = self.get_stage_in_out(job_wrapper.get_input_fnames() +
                                            output_files,
                                            symlink=True)
            stageout = self.get_stage_in_out(output_files)
            attrs = [
                dict(name=pbs.ATTR_o, value=pbs_ofile),
                dict(name=pbs.ATTR_e, value=pbs_efile),
                dict(name=pbs.ATTR_stagein, value=stagein),
                dict(name=pbs.ATTR_stageout, value=stageout),
            ]
        # If not, we're using NFS
        else:
            attrs = [
                dict(name=pbs.ATTR_o, value=ofile),
                dict(name=pbs.ATTR_e, value=efile),
            ]

        # define PBS job options
        attrs.append(
            dict(name=pbs.ATTR_N,
                 value=str("%s_%s_%s" %
                           (job_wrapper.job_id, job_wrapper.tool.id,
                            job_wrapper.user))))
        job_attrs = pbs.new_attropl(len(attrs) + len(pbs_options))
        for i, attr in enumerate(attrs + pbs_options):
            job_attrs[i].name = attr['name']
            job_attrs[i].value = attr['value']
            if 'resource' in attr:
                job_attrs[i].resource = attr['resource']
        exec_dir = os.path.abspath(job_wrapper.working_directory)

        # write the job script
        if self.app.config.pbs_stage_path != '':
            # touch the ecfile so that it gets staged
            with open(ecfile, 'a'):
                os.utime(ecfile, None)

            stage_commands = pbs_symlink_template % (
                " ".join(job_wrapper.get_input_fnames() + output_files),
                self.app.config.pbs_stage_path,
                exec_dir,
            )
        else:
            stage_commands = ''

        env_setup_commands = [stage_commands]
        script = self.get_job_file(job_wrapper,
                                   exit_code_path=ecfile,
                                   env_setup_commands=env_setup_commands)
        job_file = "%s/%s.sh" % (self.app.config.cluster_files_directory,
                                 job_wrapper.job_id)
        self.write_executable_script(job_file, script)
        # job was deleted while we were preparing it
        if job_wrapper.get_state() == model.Job.states.DELETED:
            log.debug(
                "Job %s deleted by user before it entered the PBS queue" %
                job_wrapper.job_id)
            pbs.pbs_disconnect(c)
            if self.app.config.cleanup_job in ("always", "onsuccess"):
                self.cleanup((ofile, efile, ecfile, job_file))
                job_wrapper.cleanup()
            return

        # submit
        # The job tag includes the job and the task identifier
        # (if a TaskWrapper was passed in):
        galaxy_job_id = job_wrapper.get_id_tag()
        log.debug("(%s) submitting file %s" % (galaxy_job_id, job_file))

        tries = 0
        while tries < 5:
            job_id = pbs.pbs_submit(c, job_attrs, job_file, pbs_queue_name,
                                    None)
            tries += 1
            if job_id:
                pbs.pbs_disconnect(c)
                break
            errno, text = pbs.error()
            log.warning("(%s) pbs_submit failed (try %d/5), PBS error %d: %s" %
                        (galaxy_job_id, tries, errno, text))
            time.sleep(2)
        else:
            log.error("(%s) All attempts to submit job failed" % galaxy_job_id)
            job_wrapper.fail(
                "Unable to run this job due to a cluster error, please retry it later"
            )
            return

        if pbs_queue_name is None:
            log.debug("(%s) queued in default queue as %s" %
                      (galaxy_job_id, job_id))
        else:
            log.debug("(%s) queued in %s queue as %s" %
                      (galaxy_job_id, pbs_queue_name, job_id))

        # persist destination
        job_wrapper.set_job_destination(job_destination, job_id)

        # Store PBS related state information for job
        job_state = AsynchronousJobState()
        job_state.job_wrapper = job_wrapper
        job_state.job_id = job_id
        job_state.job_file = job_file
        job_state.output_file = ofile
        job_state.error_file = efile
        job_state.exit_code_file = ecfile
        job_state.old_state = 'N'
        job_state.running = False
        job_state.job_destination = job_destination

        # Add to our 'queue' of jobs to monitor
        self.monitor_queue.put(job_state)
Example #42
	def _disconnect(self):
		"""Close the PBS/Torque connection"""
		pbs.pbs_disconnect(self.con)
		self.attribs = 'NULL'
Example #43
    def submit(self):
        attropl = pbs.new_attropl(self.attribute_count + 1)
        attropl_idx = 0

        attropl[attropl_idx].name  = pbs.ATTR_v
        attropl[attropl_idx].value = self.generate_env()
        attropl_idx += 1
 
        if self.name:
            attropl[attropl_idx].name   = pbs.ATTR_N
            attropl[attropl_idx].value  = self.name
            attropl_idx += 1
           
        if self.walltime:
            attropl[attropl_idx].name     = pbs.ATTR_l
            attropl[attropl_idx].resource = 'walltime'
            attropl[attropl_idx].value    = self.walltime
            attropl_idx += 1
        
        if self.nodes:
            attropl[attropl_idx].name     = pbs.ATTR_l
            attropl[attropl_idx].resource = 'nodes'
            attropl[attropl_idx].value    = self.nodes
            attropl_idx += 1
           
        if self.stdout_path:
            attropl[attropl_idx].name  = pbs.ATTR_o
            attropl[attropl_idx].value = self.stdout_path
            attropl_idx += 1

        if self.stderr_path:
            attropl[attropl_idx].name  = pbs.ATTR_e
            attropl[attropl_idx].value = self.stderr_path
            attropl_idx += 1
           
        if self.dependency_list:
            attropl[attropl_idx].name = pbs.ATTR_depend
            attropl[attropl_idx].value = self.dependency_list
            attropl_idx += 1
           
        if self.mail_options:
            attropl[attropl_idx].name = pbs.ATTR_m
            attropl[attropl_idx].value = self.mail_options
            attropl_idx += 1
           
        if self.mem:
            attropl[attropl_idx].name     = pbs.ATTR_l
            attropl[attropl_idx].resource = 'mem'
            attropl[attropl_idx].value    = self.mem
            attropl_idx += 1
            
        if self.vmem:
            attropl[attropl_idx].name     = pbs.ATTR_l
            attropl[attropl_idx].resource = 'vmem'
            attropl[attropl_idx].value    = self.vmem
            attropl_idx += 1
            
        connection = pbs.pbs_connect(pbs.pbs_default())
        
        self.job_id = pbs.pbs_submit(connection, attropl, self.job_script, None, None)
       
        pbs.pbs_disconnect(connection)
        
        e, e_msg = pbs.error()
        
        # the batch system returned an error, throw exception 
        if e:
            message = "%d: %s" % (e, e_msg)
            raise Exception(message)
            
        return self.job_id
Example #44
 def _disconnect(self):
     """Close the PBS/Torque connection"""
     pbs.pbs_disconnect(self.con)
     self.attribs = 'NULL'
Example #45
 def pbs_disconn(self, conn):
     pbs.pbs_disconnect(conn)