Example #1
def findLostJobs():
    """ Find jobs that are in active state (running) but
    the torque server does not know about them (mark them as Lost).
    """
    for dead_server in BatchServer.objects.filter(isactive=False):
        for job in Job.objects.filter(job_state=getJobState('R'), server=dead_server):
            job.job_state = getJobState('L')
            job.save()
            log(LOG_DEBUG, "Running job on inactive server marked Lost: %s@%s" % (job.jobid,job.server.name))
    
    for live_server in BatchServer.objects.filter(isactive=True):
        p = subprocess.Popen(["qstat", "-fx", "@%s" % live_server.name], stdout=subprocess.PIPE)
        (out,err) = p.communicate()
        log(LOG_DEBUG, "Qstat output from live server %s obtained" % (live_server.name))
        try:
            log(LOG_DEBUG, "before parseString()")
            jobsxml = parseString(out)
            log(LOG_DEBUG, "after parseString()")
            starttime = time.time()
            log(LOG_DEBUG, "before feedJobsXML()")
            feedJobsXML(jobsxml, True)
            log(LOG_DEBUG, "after feedJobsXML()")
            endtime = time.time()
            log(LOG_INFO, "feedJobsXML() took %f seconds" % (endtime-starttime))
        except ExpatError:
            log(LOG_ERROR, "Cannot parse line: %s" % (out))
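feedJobsXML() is defined elsewhere in this module; as a rough illustration of the data it receives, here is a minimal sketch of walking the qstat -fx document with xml.dom.minidom (assuming Torque's usual <Data><Job>...</Job></Data> layout; element names can differ between versions):

from xml.dom.minidom import parseString

def list_job_states(out):
    # Sketch only: walk the qstat -fx XML and print jobid/state pairs.
    # Assumes each <Job> element carries <Job_Id> and <job_state> children.
    doc = parseString(out)
    for jobnode in doc.getElementsByTagName("Job"):
        jobid = jobnode.getElementsByTagName("Job_Id")[0].firstChild.data
        state = jobnode.getElementsByTagName("job_state")[0].firstChild.data
        print "%s: %s" % (jobid, state)
    doc.unlink()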
Example #2
def findDeletedJobs():
    """ Find deleted jobs (in accounting events table) and mark them as deleted in job table.
    Many jobs have a Delete request in the AccEvnt table but are not really deleted (they
    finish OK, or get aborted). This function should filter those out.
    """
    # TODO: check that checkEventsRunningJobs is still right
    # Evaluating all the jobs at once takes a lot of memory,
    # so we fetch the jobs one by one

    maxjobid = Job.objects.filter(job_state__shortname='C').aggregate(Max("id"))['id__max']
    for n in range(1,maxjobid+1):
        # clean SQL debug memory once in a while (see http://docs.djangoproject.com/en/1.2/faq/models/#why-is-django-leaking-memory)
        if settings.DEBUG == True and n%1000==0:
            db.reset_queries()
        try:
            j = Job.objects.get(pk=n)
        except Job.DoesNotExist:
            log(LOG_ERROR, "job with pk=%d not found" % (n))
            continue
        aes = AccountingEvent.objects.filter(job=j).order_by("-timestamp")
        ae = aes[0]
        if ae.type=='D':
            j.job_state = getJobState('D')
            j.save()
            log(LOG_DEBUG, "job %s changed to Deleted state" % (j))
        else:
            log(LOG_DEBUG, "job %s unchanged" % (j))
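A note on the memory comment above: fetching one Job per primary key keeps Django's queryset result cache small. A roughly equivalent sketch using QuerySet.iterator(), which also avoids caching results (the pk upper bound from the example is omitted here for brevity):

def findDeletedJobs_streaming():
    # Sketch only: stream jobs instead of fetching them by primary key one at
    # a time; iterator() bypasses the queryset result cache.
    for j in Job.objects.all().iterator():
        aes = list(AccountingEvent.objects.filter(job=j).order_by("-timestamp")[:1])
        if aes and aes[0].type == 'D':
            j.job_state = getJobState('D')
            j.save()
            log(LOG_DEBUG, "job %s changed to Deleted state" % (j))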
Example #3
def checkEventsRunningJobs():
    """ Check that Running jobs are running according to the event log
    """
    for rj in Job.objects.filter(job_state__shortname='R'):
        log(LOG_INFO, "Checking job id: %d" % (rj.jobid))
        aes = AccountingEvent.objects.filter(job=rj, type__in=['E','D','A']).count()
        if aes!=0:
            log(LOG_ERROR, "job id: %d, db id: %d is in Running state but its accounting records say it finished - fixing it." % (rj.jobid, rj.id))
            rj.job_state = getJobState('C')
            rj.save()
Example #4
def updatePBSNodes():
    """
    Get pbsnodes info from torque server once in a while and update the info
    """
    global last_updatePBSNodes
    updatePBSNodes_interval = 600
    now = int(time.time())
    log(LOG_INFO, "pbsnodes data check, last: %d, now: %d" % (last_updatePBSNodes, now))
    if now - last_updatePBSNodes > updatePBSNodes_interval:
        log(LOG_INFO, "pbsnodes data outdated - getting new")
        for ts in BatchServer.objects.all():
            log(LOG_INFO, "pbsnodes data outdated - getting new for %s" % ts.name)
            # TODO: timeout after 1min
            p = subprocess.Popen(["pbsnodes", "-ax", "-s", ts.name], stdout=subprocess.PIPE)
            (out,err) = p.communicate()
            try:
                starttime = time.time()
                nodesxml = parseString(out)
                feedNodesXML(nodesxml, ts.name)
                nodesxml.unlink()
                endtime = time.time()
                log(LOG_INFO, "feedNodesXML() took %f seconds" % (endtime-starttime))
            except ExpatError:
                log(LOG_ERROR, "Cannot parse line: %s" % (out))
            
# TODO: this should be done much less often just to find the Lost jobs
# it is not that useful normally
#            p = subprocess.Popen(["qstat", "-fx", "@%s" % ts.name], stdout=subprocess.PIPE)
#            (out,err) = p.communicate()
#            try:
#                jobsxml = parseString(out)
#                starttime = time.time()
#                feedJobsXML(jobsxml, True)
#                jobsxml.unlink()
#                endtime = time.time()
#                log(LOG_INFO, "feedJobsXML() took %f seconds" % (endtime-starttime))
#            except ExpatError:
#                log(LOG_ERROR, "Cannot parse line: %s" % (out))
            run_qstat = getRunningCountQstat(ts.name)
            run_db = Job.objects.filter(job_state=getJobState('R')).count()
            log(LOG_INFO, "Running jobs:: according to qstat: %d, according to database: %d" % (run_qstat, run_db))
        last_updatePBSNodes = now
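getRunningCountQstat() is defined elsewhere in the module; one plausible sketch, assuming the default Torque qstat table output in which the job state column ('S') is the second-to-last field of every job line:

def getRunningCountQstat_sketch(servername):
    # Sketch only: count jobs reported as Running by plain qstat for one server.
    p = subprocess.Popen(["qstat", "@%s" % servername], stdout=subprocess.PIPE)
    (out, err) = p.communicate()
    count = 0
    for line in out.splitlines():
        fields = line.split()
        # skip the header and separator lines; job lines have at least 6 fields
        if len(fields) >= 6 and fields[-2] == 'R':
            count += 1
    return count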
Example #5
def parseOneLogLine(line,lineno):
    """
    Parse one line from accounting log and insert the data into DB.
    """
    cursor = connection.cursor()
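    # A torque accounting record looks roughly like (illustrative values, not real data):
    #   03/21/2011 10:32:16;E;1234.torque.example.org;user=joe group=staff queue=batch Exit_status=0 resources_used.walltime=01:02:03
    # i.e. "date;event;fulljobid;attrs", where attrs is a space-separated list of key=value pairs.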
    try:
        date,event,fulljobid,attrs = line.split(';')
    except ValueError:
        log(LOG_WARNING, "skipping invalid line %d: '%s'" % (lineno,line))
        return
        
    log(LOG_DEBUG, "processing accounting line: %s:%s:%s ..." %(date, event, fulljobid))
    # We ignore PBSPro Licensing lines (they are not job related)
    if event=='L':
        log(LOG_DEBUG, "ignored licensing line")
        return

    attrdir = {}
    try:
        for key,val in map(lambda x: x.split('=',1), attrs.split()): 
            attrdir[key] = val
    except ValueError:
        log(LOG_WARNING, "skipping line with invalid attributes %d: '%s'" % (lineno,attrs))
        return

    jobid_name, server_name = JOBID_REGEX.search(fulljobid).groups()
    server,created = getBatchServer(server_name)
    if created:
        log(LOG_INFO, "new server will be created: %s" % server_name)

    #job,created = Job.objects.get_or_create(jobid=jobid_name, server=server)
    job = SQLJob()
    job.jobid = jobid_name
    job.server_id = server.id

    job.refresh_id_jobstate_id()


    if attrdir.has_key('owner'):
        shname = attrdir['owner'].split('@')[1]
        submithost,created = getSubmitHost(shname)
        if created:
            log(LOG_INFO, "new submit host will be created: %s" % shname)
        job.submithost_id = submithost.pk

    if attrdir.has_key('requestor'):
        shname = attrdir['requestor'].split('@')[1]
        submithost,created = getSubmitHost(shname)
        if created:
            log(LOG_INFO, "new submit host will be created: %s" % shname)
        job.submithost_id = submithost.id

    if attrdir.has_key('group'):
        group,created = getGroup(attrdir['group'], server)
        if created:
            log(LOG_INFO, "new group will be created: %s" % attrdir['group'])

    if attrdir.has_key('user'):
        user,created = getUser(attrdir['user'], server, group)
        if created:
            log(LOG_INFO, "new user will be created: %s" % attrdir['user'])
        job.job_owner_id = user.id
        # TODO: convert this to SQL as well
        user.group = group

    if attrdir.has_key('resources_used.cput'):
        h,m,s = attrdir['resources_used.cput'].split(":")
        job.cput = (int(h)*60+int(m))*60+int(s)
    if attrdir.has_key('resources_used.walltime'):
        h,m,s = attrdir['resources_used.walltime'].split(":")
        job.walltime = (int(h)*60+int(m))*60+int(s)
    if attrdir.has_key('resources_used.cput') and attrdir.has_key('resources_used.walltime'):
        if job.walltime!=0:
            job.efficiency = 100*job.cput/job.walltime
        else:
            job.efficiency = 0

    if attrdir.has_key('Exit_status'):
        job.exit_status = int(attrdir['Exit_status'])

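    # Map accounting record markers to job states. In torque/PBSPro accounting logs
    # Q=queued, S=started, R=rerun, C=checkpointed, T=restarted from checkpoint,
    # E=exited, D=deleted, A=aborted; the 'G' marker is mapped to Deleted here.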
    if event=='Q':
        new_state = getJobState('Q')
    elif event=='S' or event=='R' or event=='C' or event=='T':
        new_state = getJobState('R')
    elif event=='E':
        new_state = getJobState('C')
    elif event=='D':
        new_state = getJobState('D')
    elif event=='A':
        new_state = getJobState('A')
    elif event=='G':
        new_state = getJobState('D')
    else:
        log(LOG_ERROR, "Unknown event type in accounting log file: %s" % line)
        return
    if job.job_state_id != getJobState('C').id:
#        if new_state == getJobState('R') and job.job_state != getJobState('R'):
#            RunningJob.objects.get_or_create(mainjob=job)
#        elif new_state != getJobState('R') and job.job_state == getJobState('R'):
#            try:
#                rj = RunningJob.objects.get(mainjob=job)
#                rj.delete()
#            except RunningJob.DoesNotExist:
#                pass

        job.job_state_id = new_state.id
    else:
        log(LOG_INFO, "Job %s.%s is already finished, not changing the state." % (job.jobid,server.name))
    # running job cache update
        

    if attrdir.has_key('queue'):
        queue,created = getQueue(attrdir['queue'], server)
        if created:
            log(LOG_INFO, "new queue will be created: %s" % attrdir['queue'])
        job.queue_id = queue.id
    if attrdir.has_key('ctime'):
        job.ctime = datetime.datetime.fromtimestamp(int(attrdir['ctime']))
    if attrdir.has_key('mtime'):
        job.mtime = datetime.datetime.fromtimestamp(int(attrdir['mtime']))
    if attrdir.has_key('qtime'):
        job.qtime = datetime.datetime.fromtimestamp(int(attrdir['qtime']))
    if attrdir.has_key('etime'):
        job.etime = datetime.datetime.fromtimestamp(int(attrdir['etime']))
    if attrdir.has_key('start'):
        job.start_time = datetime.datetime.fromtimestamp(int(attrdir['start']))
    if attrdir.has_key('end'):
        job.comp_time = datetime.datetime.fromtimestamp(int(attrdir['end']))
    if attrdir.has_key('exec_host'):
        exec_host_names_slots = attrdir['exec_host'].split('+')
        job.jobslots = []

        # convert PBSPro records like 'node1/0*2' to more generic 'node1/0+node1/1'
        exec_host_names_slots_new = []
        for exec_host_name_slot in exec_host_names_slots:
            if exec_host_name_slot.find('*')>=0:
                exec_host_slot0, numslots = exec_host_name_slot.split('*')
                exec_host_name = exec_host_slot0.split('/')[0]
                exec_host_name_slot_new=[ "%s/%d" % (exec_host_name, i) for i in range(0,int(numslots)) ]
                log(LOG_DEBUG, "Exec_host %s converted to %s" % (exec_host_name_slot,exec_host_name_slot_new))
                exec_host_names_slots_new.extend(exec_host_name_slot_new)
            else:
                exec_host_names_slots_new.append(exec_host_name_slot)
        exec_host_names_slots = exec_host_names_slots_new

        for exec_host_name_slot in exec_host_names_slots:
                
            name,slotstr = exec_host_name_slot.split('/')
            slot = int(slotstr)
            node,created = getNode(name, server)
            if created:
                log(LOG_INFO, "new node will be created: node name: %s" % (name))
                node.save()
            js,created = getJobSlot(slot=slot,node=node)
            if created:
                log(LOG_INFO, "new jobslot will be created: slot: %d, node name: %s" % (slot,name))
                js.save()
            job.jobslots.append(js.id)
    job.save()


    if job.id == -1:
        job.refresh_id_jobstate_id()
    d,t = date.split(' ')
    m,d,y = d.split('/')
#    ae,created = AccountingEvent.objects.get_or_create(timestamp='%s-%s-%s %s' % (y,m,d,t), type=event, job=job)
    timestamp='%s-%s-%s %s' % (y,m,d,t)
    cursor.execute("INSERT IGNORE INTO trqacc_accountingevent (timestamp, type, job_id) VALUES (%s,%s,%s)", [timestamp, event, job.id])
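parseOneLogLine() is driven from elsewhere in the module; a minimal sketch of feeding it an accounting log file line by line (the helper name and file handling are illustrative only):

def parseAccountingFile(path):
    # Sketch only: number lines from 1 so the warnings above point at the
    # right place, and hand each line to parseOneLogLine().
    f = open(path)
    try:
        lineno = 0
        for line in f:
            lineno += 1
            parseOneLogLine(line.strip(), lineno)
    finally:
        f.close()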