Example #1
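# Eraser thread: erase _dis datasets through Rucio and mark them 'deleted' in ATLAS_PANDA.Datasets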
 def run(self):
     self.lock.acquire()
     try:
         # loop over all datasets
         for vuid,name,modDate in self.datasets:
             # only dis datasets
             if re.search(r'_dis\d+$',name) is None:
                 _logger.error("Eraser : non disDS %s" % name)
                 continue
             # delete
             _logger.debug("Eraser %s dis %s %s" % (self.operationType,modDate,name))
             # delete or shorten
             endStatus = 'deleted'
             status,out = rucioAPI.eraseDataset(name)
             if not status:
                 _logger.error(out)
                 continue
             _logger.debug('OK with %s' % name)
             # update
             self.proxyLock.acquire()
             varMap = {}
             varMap[':vuid'] = vuid
             varMap[':status'] = endStatus
             taskBuffer.querySQLS("UPDATE ATLAS_PANDA.Datasets SET status=:status,modificationdate=CURRENT_DATE WHERE vuid=:vuid",
                                  varMap)
             self.proxyLock.release()
     except Exception:
         errStr = traceback.format_exc()
         _logger.error(errStr)
     self.pool.remove(self)
     self.lock.release()
Example #2
def main(tbuf=None, **kwargs):
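    """Refresh the MyProxy proxy cache for every user whose GRIDPREF matches the '%p%' pattern."""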
    # logger
    tmpLog = LogWrapper(_logger)

    tmpLog.debug("================= start ==================")
    # instantiate TB
    if tbuf is None:
        from pandaserver.taskbuffer.TaskBuffer import taskBuffer
        taskBuffer.init(panda_config.dbhost,panda_config.dbpasswd,nDBConnection=1)
    else:
        taskBuffer = tbuf

    # instantiate MyProxy I/F
    my_proxy_interface_instance = panda_proxy_cache.MyProxyInterface()

    # roles
    if hasattr(panda_config,'proxy_cache_roles'):
        roles = panda_config.proxy_cache_roles.split(',')
    else:
        roles = ['atlas','atlas:/atlas/Role=production','atlas:/atlas/Role=pilot']
    # get users
    sql = 'select distinct DN FROM ATLAS_PANDAMETA.users WHERE GRIDPREF LIKE :patt'
    varMap = {}
    varMap[':patt'] = '%p%'
    tmpStat,tmpRes = taskBuffer.querySQLS(sql,varMap)
    for realDN, in tmpRes:
        if realDN is None:
            continue
        realDN = CoreUtils.get_bare_dn(realDN, keep_digits=False)
        name = taskBuffer.cleanUserID(realDN)
        # check proxy
        tmpLog.debug("check proxy cache for {}".format(name))
        for role in roles:
            my_proxy_interface_instance.checkProxy(realDN, role=role, name=name)
    tmpLog.debug("done")
Example #3
def get_task_attribute_map(task_id_list):
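    """Return two dicts mapping each JEDI task ID to its prodsourcelabel and gshare, read from ATLAS_PANDA.jedi_tasks."""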
    var_map = {}
    for i, task_id in enumerate(task_id_list):
        var_map[':task_id{0}'.format(i)] = task_id
    task_id_bindings = ','.join(':task_id{0}'.format(i)
                                for i in range(len(task_id_list)))

    sql = """
          SELECT jeditaskid, prodsourcelabel, gshare FROM ATLAS_PANDA.jedi_tasks 
          WHERE jeditaskid IN({0})
          """.format(task_id_bindings)

    _logger.debug('sql: {0}'.format(sql))
    _logger.debug('task_id_bindings: {0}'.format(task_id_bindings))
    _logger.debug('var_map: {0}'.format(var_map))

    status, ret_sel = taskBuffer.querySQLS(sql, var_map)
    task_pslabel_map = {}
    task_gshare_map = {}
    if ret_sel:
        _logger.debug('ret_sel: {0}'.format(ret_sel))
        for jeditaskid, prodsourcelabel, gshare in ret_sel:
            task_pslabel_map[jeditaskid] = prodsourcelabel
            task_gshare_map[jeditaskid] = gshare
    return task_pslabel_map, task_gshare_map
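
A minimal usage sketch for get_task_attribute_map, assuming taskBuffer and _logger are already initialized as in the other examples (the JEDI task IDs are hypothetical):

pslabel_map, gshare_map = get_task_attribute_map([12345, 12346])  # hypothetical task IDs
for task_id, pslabel in pslabel_map.items():
    _logger.debug('task {0}: prodsourcelabel={1} gshare={2}'.format(task_id, pslabel, gshare_map.get(task_id)))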
Example #4
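# Fragment: kill leftover processes, then purge activated/waiting/failed/cancelled jobs below the current max PandaID from ATLAS_PANDA.jobsDefined4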
            tmpLog.debug("old process : %s %s" % (pid, startTime))
            tmpLog.debug(line)
            commands_get_status_output('kill -9 %s' % pid)
except Exception:
    type, value, traceBack = sys.exc_info()
    tmpLog.error("kill process : %s %s" % (type, value))

# instantiate TB
taskBuffer.init(panda_config.dbhost, panda_config.dbpasswd, nDBConnection=1)

# instantiate sitemapper
aSiteMapper = SiteMapper(taskBuffer)

# delete
tmpLog.debug("Del session")
status, retSel = taskBuffer.querySQLS(
    "SELECT MAX(PandaID) FROM ATLAS_PANDA.jobsDefined4", {})
if retSel is not None:
    try:
        maxID = retSel[0][0]
        tmpLog.debug("maxID : %s" % maxID)
        if maxID is not None:
            varMap = {}
            varMap[':maxID'] = maxID
            varMap[':jobStatus1'] = 'activated'
            varMap[':jobStatus2'] = 'waiting'
            varMap[':jobStatus3'] = 'failed'
            varMap[':jobStatus4'] = 'cancelled'
            status, retDel = taskBuffer.querySQLS(
                "DELETE FROM ATLAS_PANDA.jobsDefined4 WHERE PandaID<:maxID AND jobStatus IN (:jobStatus1,:jobStatus2,:jobStatus3,:jobStatus4)",
                varMap)
    except Exception:
Example #5
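# Build per-user (and per-site) breakdowns of analysis jobs by working group, site and status from jobsActive4 and the last hour of jobsArchived4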
usageBreakDownPerUser = {}
usageBreakDownPerSite = {}
workingGroupList = []
for table in ['ATLAS_PANDA.jobsActive4', 'ATLAS_PANDA.jobsArchived4']:
    varMap = {}
    varMap[':prodSourceLabel'] = 'user'
    varMap[':pmerge'] = 'pmerge'
    if table == 'ATLAS_PANDA.jobsActive4':
        sql = "SELECT COUNT(*),prodUserName,jobStatus,workingGroup,computingSite FROM %s WHERE prodSourceLabel=:prodSourceLabel AND processingType<>:pmerge GROUP BY prodUserName,jobStatus,workingGroup,computingSite" % table
    else:
        # with time range for archived table
        varMap[':modificationTime'] = datetime.datetime.utcnow() - datetime.timedelta(minutes=60)
        sql = "SELECT COUNT(*),prodUserName,jobStatus,workingGroup,computingSite FROM %s WHERE prodSourceLabel=:prodSourceLabel AND processingType<>:pmerge AND modificationTime>:modificationTime GROUP BY prodUserName,jobStatus,workingGroup,computingSite" % table
    # exec
    status, res = taskBuffer.querySQLS(sql, varMap, arraySize=10000)
    if res is None:
        tmpLog.debug("total %s " % res)
    else:
        tmpLog.debug("total %s " % len(res))
        # make map
        for cnt, prodUserName, jobStatus, workingGroup, computingSite in res:
            # append to PerUser map
            usageBreakDownPerUser.setdefault(prodUserName, {})
            usageBreakDownPerUser[prodUserName].setdefault(workingGroup, {})
            usageBreakDownPerUser[prodUserName][workingGroup].setdefault(
                computingSite, {
                    'rundone': 0,
                    'activated': 0,
                    'running': 0
                })
Example #6
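# Freezer thread: once every file of a sub dataset is in a final state, close the dataset in Rucio, mark it 'completed', flag its dis datasets for deletion, and erase it if it contains no files; panda.um. datasets are handed to the JEDI Closer instead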
 def run(self):
     self.lock.acquire()
     try:
         for vuid,name,modDate in self.datasets:
             _logger.debug("Freezer start %s %s" % (modDate,name))
             self.proxyLock.acquire()
             retF,resF = taskBuffer.querySQLS("SELECT /*+ index(tab FILESTABLE4_DESTDBLOCK_IDX) */ PandaID,status FROM ATLAS_PANDA.filesTable4 tab WHERE destinationDBlock=:destinationDBlock ",
                                          {':destinationDBlock':name})
             self.proxyLock.release()
             if retF < 0:
                 _logger.error("SQL error")
             else:
                 allFinished = True
                 onePandaID = None
                 for tmpPandaID,tmpFileStatus in resF:
                     onePandaID = tmpPandaID
                     if tmpFileStatus not in ['ready', 'failed', 'skipped', 'merging', 'finished']:
                         allFinished = False
                         break
                 # check sub datasets in the jobset for event service job
                 if allFinished:
                     self.proxyLock.acquire()
                     tmpJobs = taskBuffer.getFullJobStatus([onePandaID])
                     self.proxyLock.release()
                     if len(tmpJobs) > 0 and tmpJobs[0] is not None:
                         if EventServiceUtils.isEventServiceMerge(tmpJobs[0]):
                             self.proxyLock.acquire()
                             cThr = Closer(taskBuffer, [], tmpJobs[0])
                             allFinished = cThr.checkSubDatasetsInJobset()
                             self.proxyLock.release()
                             _logger.debug("closer checked sub datasets in the jobset for %s : %s" % (name, allFinished))
                 # no files in filesTable
                 if allFinished:
                     _logger.debug("freeze %s " % name)
                     dsExists = True
                     if name.startswith('pandaddm_') or name.startswith('user.') or name.startswith('group.') \
                             or name.startswith('hc_test.') or name.startswith('panda.um.'):
                         dsExists = False
                     if name.startswith('panda.um.'):
                         self.proxyLock.acquire()
                         retMer,resMer = taskBuffer.querySQLS("SELECT /*+ index(tab FILESTABLE4_DESTDBLOCK_IDX) */ PandaID FROM ATLAS_PANDA.filesTable4 tab WHERE destinationDBlock=:destinationDBlock AND status IN (:statusM,:statusF) ",
                                                              {':destinationDBlock':name,
                                                               ':statusM':'merging',
                                                               ':statusF':'failed'})
                         self.proxyLock.release()
                         if resMer is not None and len(resMer)>0:
                             mergeID = resMer[0][0]
                             # get merging jobs
                             self.proxyLock.acquire()
                             mergingJobs = taskBuffer.peekJobs([mergeID],fromDefined=False,fromArchived=False,fromWaiting=False)
                             self.proxyLock.release()    
                             mergeJob = mergingJobs[0]
                             if mergeJob is not None:
                                 tmpDestDBlocks = []
                                 # get destDBlock
                                 for tmpFile in mergeJob.Files:
                                     if tmpFile.type in ['output','log']:
                                         if tmpFile.destinationDBlock not in tmpDestDBlocks:
                                             tmpDestDBlocks.append(tmpFile.destinationDBlock)
                                 # run
                                 _logger.debug("start JEDI closer for %s " % name)
                                 self.proxyLock.acquire()
                                 cThr = Closer(taskBuffer,tmpDestDBlocks,mergeJob)
                                 cThr.start()
                                 cThr.join()
                                 self.proxyLock.release()
                                 _logger.debug("end JEDI closer for %s " % name)
                                 continue
                             else:
                                 _logger.debug("failed to get merging job for %s " % name)
                         else:
                             _logger.debug("failed to get merging file for %s " % name)
                         status,out = True,''
                     elif dsExists:
                         # check if dataset exists
                         status,out = rucioAPI.getMetaData(name)
                         if status == True:
                             if out is not None:
                                 try:
                                     rucioAPI.closeDataset(name)
                                     status = True
                                 except Exception:
                                     errtype,errvalue = sys.exc_info()[:2]
                                     out = 'failed to freeze : {0} {1}'.format(errtype,errvalue)
                                     status = False
                             else:
                                 # dataset not exist
                                 status,out = True,''
                                 dsExists = False
                     else:
                         status,out = True,''
                     if not status:
                         _logger.error('{0} failed to freeze with {1}'.format(name,out))
                     else:
                         self.proxyLock.acquire()
                         varMap = {}
                         varMap[':vuid'] = vuid
                         varMap[':status'] = 'completed' 
                         taskBuffer.querySQLS("UPDATE ATLAS_PANDA.Datasets SET status=:status,modificationdate=CURRENT_DATE WHERE vuid=:vuid",
                                          varMap)
                         self.proxyLock.release()                            
                         if name.startswith('pandaddm_') or name.startswith('panda.um.') or not dsExists:
                             continue
                         # set tobedeleted to dis
                         setTobeDeletedToDis(name)
                         # count # of files
                         status,out = rucioAPI.getNumberOfFiles(name)
                         if status is not True:
                             if status is False:
                                 _logger.error(out)
                         else:
                             _logger.debug(out)                                            
                             try:
                                 nFile = int(out)
                                 _logger.debug(nFile)
                                 if nFile == 0:
                                     # erase dataset
                                     _logger.debug('erase %s' % name)                                
                                     status,out = rucioAPI.eraseDataset(name)
                                     _logger.debug('OK with %s' % name)
                             except Exception:
                                 pass
                 else:
                     _logger.debug("wait %s " % name)
                     self.proxyLock.acquire()                        
                     taskBuffer.querySQLS("UPDATE ATLAS_PANDA.Datasets SET modificationdate=CURRENT_DATE WHERE vuid=:vuid", {':vuid':vuid})
                     self.proxyLock.release()                                                    
             _logger.debug("end %s " % name)
     except Exception:
         errStr = traceback.format_exc()
         _logger.error(errStr)
     self.pool.remove(self)
     self.lock.release()
Example #7
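# Closer thread: close datasets in Rucio, move them from 'tobeclosed' to 'completed', flag their dis datasets for deletion, and erase datasets that contain no files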
 def run(self):
     self.lock.acquire()
     try:
         # loop over all datasets
         for vuid,name,modDate in self.datasets:
             _logger.debug("Close %s %s" % (modDate,name))
             dsExists = True
             if name.startswith('pandaddm_') or name.startswith('user.') or name.startswith('group.') \
                     or name.startswith('hc_test.') or name.startswith('panda.um.'):
                 dsExists = False
             if dsExists:
                 # check if dataset exists
                 status,out = rucioAPI.getMetaData(name)
                 if status == True:
                     if out is not None:
                         try:
                             rucioAPI.closeDataset(name)
                             status = True
                         except Exception:
                             errtype,errvalue = sys.exc_info()[:2]
                             out = 'failed to close : {0} {1}'.format(errtype,errvalue)
                             status = False
                     else:
                         # dataset not exist
                         status,out = True,''
                         dsExists = False
             else:
                 status,out = True,''
             if not status:
                 _logger.error('{0} failed to close with {1}'.format(name,out))
             else:
                 self.proxyLock.acquire()
                 varMap = {}
                 varMap[':vuid'] = vuid
                 varMap[':newstatus'] = 'completed'
                 varMap[':oldstatus'] = 'tobeclosed'
                 taskBuffer.querySQLS("UPDATE ATLAS_PANDA.Datasets SET status=:newstatus,modificationdate=CURRENT_DATE WHERE vuid=:vuid AND status=:oldstatus",
                                  varMap)
                 self.proxyLock.release()                    
                 # set tobedeleted to dis
                 setTobeDeletedToDis(name)
                 # skip if dataset is not real
                 if not dsExists:
                     continue
                 # count # of files
                 status,out = rucioAPI.getNumberOfFiles(name)
                 if status is not True:
                     if status is False:
                         _logger.error(out)
                 else:
                     _logger.debug(out)                                            
                     try:
                         nFile = int(out)
                         if nFile == 0:
                             # erase dataset
                             _logger.debug('erase %s' % name)
                             status,out = rucioAPI.eraseDataset(name)
                             _logger.debug('OK with %s' % name)
                     except Exception:
                         pass
     except Exception:
         pass
     self.pool.remove(self)
     self.lock.release()
Example #8
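# Deleter thread: erase _sub datasets whose jobs have all reached a final state and mark the datasets 'deleted' in ATLAS_PANDA.Datasets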
 def run(self):
     self.lock.acquire()
     try:
         for vuid,name,modDate in self.datasets:
             # check just in case
             if re.search(r'_sub\d+$',name) is None:
                 _logger.debug("skip non sub %s" % name)
                 continue
             _logger.debug("delete sub %s" % name)
             if name.startswith('pandaddm_') or name.startswith('user.') or name.startswith('group.') \
                     or name.startswith('hc_test.') or name.startswith('panda.um.'):
                 dsExists = False
             else:
                 dsExists = True
                 # get PandaIDs
                 self.proxyLock.acquire()
                 retF,resF = taskBuffer.querySQLS("SELECT /*+ index(tab FILESTABLE4_DESTDBLOCK_IDX) */ DISTINCT PandaID FROM ATLAS_PANDA.filesTable4 tab WHERE destinationDBlock=:destinationDBlock ",
                                                  {':destinationDBlock':name})
                 self.proxyLock.release()
                 if retF is None:
                     _logger.error("SQL error for sub {0}".format(name))
                     continue
                 else:
                     _logger.debug("sub {0} has {1} jobs".format(name,len(resF)))
                     self.proxyLock.acquire()
                     # check jobs
                     sqlP  = "SELECT jobStatus FROM ATLAS_PANDA.jobsArchived4 WHERE PandaID=:PandaID "
                     sqlP += "UNION "
                     sqlP += "SELECT jobStatus FROM ATLAS_PANDAARCH.jobsArchived WHERE PandaID=:PandaID AND modificationTime>CURRENT_DATE-30 "
                     allDone = True
                     for pandaID, in resF:
                         retP,resP = taskBuffer.querySQLS(sqlP, {':PandaID':pandaID})
                         if len(resP) == 0:
                             _logger.debug("skip delete sub {0} PandaID={1} not found".format(name,pandaID))
                             allDone = False
                             break
                         jobStatus = resP[0][0]
                         if jobStatus not in ['finished','failed','cancelled','closed']:
                             _logger.debug("skip delete sub {0} PandaID={1} is active {2}".format(name,pandaID,jobStatus))
                             allDone = False
                             break
                     self.proxyLock.release()
                     if allDone:
                         _logger.debug("deleting sub %s" % name)
                         try:
                             rucioAPI.eraseDataset(name, grace_period=4)
                             status = True
                         except Exception:
                             errtype,errvalue = sys.exc_info()[:2]
                             out = '{0} {1}'.format(errtype,errvalue)
                             _logger.error('{0} failed to erase with {1}'.format(name,out))
                     else:
                         _logger.debug("wait sub %s" % name)
                         continue
             # update dataset
             self.proxyLock.acquire()
             varMap = {}
             varMap[':vuid'] = vuid
             varMap[':ost1'] = 'completed' 
             varMap[':ost2'] = 'cleanup' 
             varMap[':newStatus'] = 'deleted' 
             taskBuffer.querySQLS("UPDATE ATLAS_PANDA.Datasets SET status=:newStatus,modificationdate=CURRENT_DATE WHERE vuid=:vuid AND status IN (:ost1,:ost2) ",
                                  varMap)
             self.proxyLock.release()                            
             _logger.debug("end %s " % name)
     except Exception:
         errStr = traceback.format_exc()
         _logger.error(errStr)
     self.pool.remove(self)
     self.lock.release()
Example #9
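# Before freezing, reset sub datasets stuck in 'doing' back to 'running' (with a back-dated modification time) so the freezer loop below picks them up again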
# freeze dataset
_logger.debug("==== freeze datasets ====")
timeLimitRU = datetime.datetime.utcnow() - datetime.timedelta(hours=3)
timeLimitRL = datetime.datetime.utcnow() - datetime.timedelta(hours=12)
timeLimitU = datetime.datetime.utcnow() - datetime.timedelta(hours=6)
timeLimitL = datetime.datetime.utcnow() - datetime.timedelta(days=14)
# reset doing so that Closer can update unmerged datasets
sql  = "SELECT name FROM ATLAS_PANDA.Datasets "
sql += "WHERE type=:type AND (modificationdate BETWEEN :modificationdateRL AND :modificationdateRU) AND subType=:subType AND status=:oldStatus "
varMap = {}
varMap[':modificationdateRU'] = timeLimitRU
varMap[':modificationdateRL'] = timeLimitRL
varMap[':type'] = 'output'
varMap[':subType'] = 'sub'
varMap[':oldStatus'] = 'doing'
retReset,resReset = taskBuffer.querySQLS(sql,varMap)
sql = "UPDATE ATLAS_PANDA.Datasets SET status=:newStatus,modificationdate=:modificationdateU WHERE name=:name AND status=:oldStatus "
if resReset is not None:
    for name, in resReset:
        varMap = {}
        varMap[':name'] = name
        varMap[':oldStatus'] = 'doing'
        varMap[':newStatus'] = 'running'
        varMap[':modificationdateU'] = timeLimitU
        _logger.debug("reset {0} to freeze".format(name))
        taskBuffer.querySQLS(sql,varMap)
# loop for freezer
freezeLock = threading.Semaphore(5)
freezeProxyLock = threading.Lock()
freezeThreadPool = ThreadPool()
maxRows = 100000
Example #10
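# Fragment: tail of the argparse setup plus --resurrectDS handling, which resurrects already-deleted output/log datasets of a JEDI task in Rucio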
    action='store_const',
    const=True,
    dest='resurrectDS',
    default=False,
    help='resurrect output and log datasets if they were already deleted')

options = parser.parse_args()

jediTaskID = int(options.tid)

if True:
    if options.resurrectDS:
        sd, so = taskBuffer.querySQLS(
            'SELECT datasetName FROM ATLAS_PANDA.JEDI_Datasets WHERE jediTaskID=:id AND type IN (:t1,:t2)',
            {
                ':id': jediTaskID,
                ':t1': 'output',
                ':t2': 'log'
            })
        rc = RucioClient()
        for datasetName, in so:
            for i in range(3):
                try:
                    scope, name = rucioAPI.extract_scope(datasetName)
                    rc.get_did(scope, name)
                    break
                except DataIdentifierNotFound:
                    print('resurrect {0}'.format(datasetName))
                    rc.resurrect([{'scope': scope, 'name': name}])
                    try:
                        rc.set_metadata(scope, name, 'lifetime', None)
Example #11
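# Determine lost files by comparing Rucio contents with files marked finished in JEDI, then reset their status with resetFileStatusInJEDI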
if options.files is not None:
    files = options.files.split(',')
else:
    # get files from rucio
    rc = RucioClient()
    scope, name = rucioAPI.extract_scope(options.ds)
    files_rucio = set()
    for i in rc.list_files(scope, name):
        files_rucio.add(i['name'])
    # get files from panda
    dsName = options.ds.split(':')[-1]
    fd, fo = taskBuffer.querySQLS(
        'SELECT c.lfn FROM ATLAS_PANDA.JEDI_Datasets d,ATLAS_PANDA.JEDI_Dataset_Contents c WHERE d.jediTaskID=c.jediTaskID AND d.datasetID=c.datasetID AND d.type IN (:t1,:t2) AND c.status=:s AND d.datasetName=:name ',
        {
            ':s': 'finished',
            ':t1': 'output',
            ':t2': 'log',
            ':name': dsName
        })
    files = []
    for tmpLFN, in fo:
        if tmpLFN not in files_rucio:
            files.append(tmpLFN)
    print('')
    print('found {0} lost files -> {1}'.format(len(files), ','.join(files)))

s, jediTaskID = taskBuffer.resetFileStatusInJEDI('', True, options.ds, files,
                                                 [], options.dryRun)
if options.dryRun:
    sys.exit(0)
if s:
Example #12
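# boostUser: read a user name from stdin and raise the priority of that user's user/panda jobs to 4000 in jobsactive4 and jobsdefined4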
from pandaserver.taskbuffer.TaskBuffer import taskBuffer
from pandacommon.pandalogger.PandaLogger import PandaLogger


# initialize cx_Oracle using dummy connection
initializer.init()

# logger
_logger = PandaLogger().getLogger('boostUser')
_logger.debug("================= start ==================")

# instantiate TB
taskBuffer.init(panda_config.dbhost,panda_config.dbpasswd,nDBConnection=1)

user = sys.stdin.read()
user = user[:-1]

sql = "UPDATE atlas_panda.%s set currentPriority=:prio where prodUserName=:uname and prodSourceLabel IN (:label1,:label2) and currentPriority<:prio"
varMap = {}
varMap[':prio'] = 4000
varMap[':uname'] = user
varMap[':label1'] = 'user'
varMap[':label2'] = 'panda'
for table in ('jobsactive4','jobsdefined4'):
    _logger.debug((sql % table) + str(varMap))
    ret = taskBuffer.querySQLS(sql % table,varMap)
    _logger.debug('ret -> %s' % str(ret))

_logger.debug("================= end ==================")
Example #13
def main(taskBuffer=None, exec_options=None, log_stream=None, args_list=None):
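    """Recover lost files for a dataset or task: find files missing from Rucio, reset their status in JEDI, optionally resurrect output/log datasets, and retry the task."""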
    # options
    parser = argparse.ArgumentParser()
    if taskBuffer:
        parser.add_argument('--ds',action='store',dest='ds',default=None,
                            help='dataset name')
    else:
        parser.add_argument('--ds',action='store',dest='ds',default=None,required=True,
                            help='dataset name')
    parser.add_argument('--files',action='store',dest='files',default=None,
                        help='comma-separated list of lost file names. The list is deduced if this option is omitted')
    parser.add_argument('--noChildRetry',action='store_const',const=True,dest='noChildRetry',default=False,
                        help='do not retry child tasks')
    parser.add_argument('--resurrectDS',action='store_const',const=True,dest='resurrectDS',default=False,
                        help='resurrect output and log datasets if they were already deleted')
    parser.add_argument('--dryRun',action='store_const',const=True,dest='dryRun',default=False,
                        help='dry run')
    parser.add_argument('--force', action='store_const', const=True, dest='force', default=False,
                        help='force retry even if no lost files')
    parser.add_argument('--reproduceParent', action='store_const', const=True, dest='reproduceParent',
                        default=False, help='reproduce the input files from which the lost files were produced. '
                        'Typically useful to recover merged files when unmerged files were already deleted')
    # parse options
    if taskBuffer:
        if args_list:
            options = parser.parse_args(args_list)
        else:
            options, unknown = parser.parse_known_args()
    else:
        if args_list:
            options = parser.parse_args(args_list)
        else:
            options = parser.parse_args()

    # executed via command-line
    givenTaskID = None
    dn = None
    if taskBuffer is None:
        # instantiate TB
        from pandaserver.taskbuffer.TaskBuffer import taskBuffer
        taskBuffer.init(panda_config.dbhost,panda_config.dbpasswd,nDBConnection=1)

    else:
        # set options from dict
        if exec_options is None:
            exec_options = {}
        keys = set(vars(options).keys())
        for k in exec_options:
            if k in keys:
                setattr(options, k, exec_options[k])
        if 'jediTaskID' in exec_options:
            givenTaskID = exec_options['jediTaskID']
        if 'userName' in exec_options:
            dn = exec_options['userName']

    ds_files = {}
    if options.files is not None:
        files = options.files.split(',')
        ds_files[options.ds] = files
    else:
        # look for lost files
        if not givenTaskID:
            # get files from rucio
            st, files_rucio = get_files_from_rucio(options.ds, log_stream)
            if st is not True:
                return st, files_rucio
            # get files from panda
            dsName = options.ds.split(':')[-1]
            fd, fo = taskBuffer.querySQLS(
                'SELECT c.lfn FROM ATLAS_PANDA.JEDI_Datasets d,ATLAS_PANDA.JEDI_Dataset_Contents c '
                'WHERE c.jediTaskID=d.jediTaskID AND c.datasetID=d.datasetID AND '
                'd.type IN (:t1,:t2) AND c.status=:s AND d.datasetName=:name ',
                {':s': 'finished', ':t1': 'output', ':t2': 'log', ':name': dsName})
            for tmpLFN, in fo:
                if tmpLFN not in files_rucio:
                    ds_files.setdefault(options.ds, [])
                    ds_files[options.ds].append(tmpLFN)
            # get taskID
            td, to = taskBuffer.querySQLS(
                        'SELECT jediTaskID FROM ATLAS_PANDA.JEDI_Datasets '
                        'WHERE datasetName=:datasetName AND type IN (:t1,:t2) ',
                        {':t1': 'output', ':t2': 'log', ':datasetName': dsName})
            jediTaskID, = to[0]
        else:
            # get dataset names
            dd, do = taskBuffer.querySQLS(
                'SELECT datasetName FROM ATLAS_PANDA.JEDI_Datasets '
                'WHERE jediTaskID=:jediTaskID AND type IN (:t1,:t2) ',
                {':t1': 'output', ':t2': 'log', ':jediTaskID': givenTaskID})
            # get files from rucio
            files_rucio = set()
            for tmpDS, in do:
                st, tmp_files_rucio = get_files_from_rucio(tmpDS, log_stream)
                if st is None:
                    return st, tmp_files_rucio
                # ignore unknown dataset
                if st:
                    files_rucio = files_rucio.union(tmp_files_rucio)
            # get files from panda
            fd, fo = taskBuffer.querySQLS(
                'SELECT d.datasetName,c.lfn FROM ATLAS_PANDA.JEDI_Datasets d,ATLAS_PANDA.JEDI_Dataset_Contents c '
                'WHERE d.jediTaskID=:jediTaskID AND c.jediTaskID=d.jediTaskID AND c.datasetID=d.datasetID AND '
                'd.type IN (:t1,:t2) AND c.status=:s ',
                {':s': 'finished', ':t1': 'output', ':t2': 'log', ':jediTaskID': givenTaskID})
            for tmpDS, tmpLFN in fo:
                if tmpLFN not in files_rucio:
                    ds_files.setdefault(tmpDS, [])
                    ds_files[tmpDS].append(tmpLFN)
        for tmpDS in ds_files:
            files = ds_files[tmpDS]
            msgStr = '{} has {} lost files -> {}'.format(tmpDS, len(files), ','.join(files))
            if log_stream:
                log_stream.info(msgStr)
            else:
                print(msgStr)

    # no lost files
    if not ds_files and not options.force:
        return True, "No lost files. Use --force to ignore this check"

    # reset file status
    s = False
    for tmpDS in ds_files:
        files = ds_files[tmpDS]
        if dn:
            ts, jediTaskID, lostInputFiles = taskBuffer.resetFileStatusInJEDI(dn, False, tmpDS,
                                                                              files, options.reproduceParent,
                                                                              options.dryRun)
        else:
            ts, jediTaskID, lostInputFiles = taskBuffer.resetFileStatusInJEDI('', True, tmpDS,
                                                                              files, options.reproduceParent,
                                                                              options.dryRun)
        msgStr = 'reset file status for {} in the DB: done with {} for jediTaskID={}'.format(tmpDS, ts, jediTaskID)
        if log_stream:
            log_stream.info(msgStr)
        else:
            print(msgStr)
        s |= ts
        # recover parent
        if options.reproduceParent:
            # reproduce input
            for lostDS in lostInputFiles:
                com_args = ['--ds', lostDS, '--noChildRetry', '--resurrectDS']
                if options.dryRun:
                    com_args.append('--dryRun')
                com_args += ['--files', ','.join(lostInputFiles[lostDS])]
            main(taskBuffer=taskBuffer, log_stream=log_stream, args_list=com_args)

    # go ahead
    if options.dryRun:
        return True, 'Done in the dry-run mode with {}'.format(s)
    if s or options.force:
        if options.resurrectDS:
            sd,so = taskBuffer.querySQLS(
                'SELECT datasetName FROM ATLAS_PANDA.JEDI_Datasets WHERE jediTaskID=:id AND type IN (:t1,:t2)',
                {':id': jediTaskID, ':t1': 'output', ':t2': 'log'})
            rc = RucioClient()
            for datasetName, in so:
                for i in range(3):
                    try:
                        scope, name = rucioAPI.extract_scope(datasetName)
                        rc.get_did(scope, name)
                        break
                    except DataIdentifierNotFound:
                        print('resurrect {0}'.format(datasetName))
                        rc.resurrect([{'scope': scope, 'name': name}])
                        try:
                            rc.set_metadata(scope, name, 'lifetime', None)
                        except Exception:
                            pass
        if not options.reproduceParent:
            msgStr = Client.retryTask(jediTaskID, noChildRetry=options.noChildRetry)[-1][-1]
        else:
            msgStr = Client.reloadInput(jediTaskID)[-1][-1]
        if log_stream:
            log_stream.info("Retried task with {}".format(msgStr))
            log_stream.info("Done")
        else:
            print("Retried task: done with {}".format(msgStr))
        return True, msgStr
    else:
        msgStr = 'failed'
        if log_stream:
            log_stream.error(msgStr)
        else:
            print(msgStr)
        return False, msgStr
Example #14
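# Collect low-priority (currentPriority<200) running event-service jobs per site, separating jobs already flagged 'tobekilled' from old running jobs that are candidates to kill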
# get low priority ES jobs per site
sqlEsJobs = "SELECT PandaID,computingSite,commandToPilot,startTime "
sqlEsJobs += "FROM {0}.jobsActive4 ".format(panda_config.schemaPANDA)
sqlEsJobs += "WHERE prodSourceLabel IN (:label1,:label2) AND eventService=:es "
sqlEsJobs += "AND currentPriority<:prio AND jobStatus=:jobStat "
sqlEsJobs += "ORDER BY currentPriority,PandaID "

varMap = {}
varMap[':label1'] = 'managed'
varMap[':label2'] = 'test'
varMap[':es'] = 1
varMap[':prio'] = 200
varMap[':jobStat'] = 'running'
# exec
status, res = taskBuffer.querySQLS(sqlEsJobs, varMap, arraySize=100000)
if res is None:
    tmpLog.debug("total %s " % res)
else:
    tmpLog.debug("total %s " % len(res))
    # get number of jobs per site
    siteJobsMap = {}
    for pandaID, siteName, commandToPilot, startTime in res:
        if siteName not in siteJobsMap:
            siteJobsMap[siteName] = {'running': [], 'killing': []}
        if commandToPilot == 'tobekilled':
            siteJobsMap[siteName]['killing'].append(pandaID)
        else:
            # kill only old jobs
            if startTime < timeLimit:
                siteJobsMap[siteName]['running'].append(pandaID)
Example #15
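# Housekeeping entry point: purge inactive jobsDefined4 entries, aggregate pilot getJob/updateJob counts from the dispatcher log, update site data and nRunning, finish or kill co-jumbo jobs, and fork Setupper processes for pending set.* files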
def main(argv=tuple(), tbuf=None, **kwargs):

    try:
        long
    except NameError:
        long = int

    tmpLog = LogWrapper(_logger, None)

    tmpLog.debug("===================== start =====================")

    # current minute
    currentMinute = datetime.datetime.utcnow().minute

    # instantiate TB
    if tbuf is None:
        from pandaserver.taskbuffer.TaskBuffer import taskBuffer
        taskBuffer.init(panda_config.dbhost,
                        panda_config.dbpasswd,
                        nDBConnection=1)
    else:
        taskBuffer = tbuf

    # instantiate sitemapper
    aSiteMapper = SiteMapper(taskBuffer)

    # delete
    tmpLog.debug("Del session")
    status, retSel = taskBuffer.querySQLS(
        "SELECT MAX(PandaID) FROM ATLAS_PANDA.jobsDefined4", {})
    if retSel is not None:
        try:
            maxID = retSel[0][0]
            tmpLog.debug("maxID : %s" % maxID)
            if maxID is not None:
                varMap = {}
                varMap[':maxID'] = maxID
                varMap[':jobStatus1'] = 'activated'
                varMap[':jobStatus2'] = 'waiting'
                varMap[':jobStatus3'] = 'failed'
                varMap[':jobStatus4'] = 'cancelled'
                status, retDel = taskBuffer.querySQLS(
                    "DELETE FROM ATLAS_PANDA.jobsDefined4 WHERE PandaID<:maxID AND jobStatus IN (:jobStatus1,:jobStatus2,:jobStatus3,:jobStatus4)",
                    varMap)
        except Exception:
            pass

    # count # of getJob/updateJob in dispatcher's log
    try:
        # don't update when logrotate is running
        timeNow = datetime.datetime.utcnow()
        logRotateTime = timeNow.replace(hour=3,
                                        minute=2,
                                        second=0,
                                        microsecond=0)
        if (timeNow > logRotateTime and (timeNow-logRotateTime) < datetime.timedelta(minutes=5)) or \
               (logRotateTime > timeNow and (logRotateTime-timeNow) < datetime.timedelta(minutes=5)):
            tmpLog.debug("skip pilotCounts session for logrotate")
        else:
            # log filename
            dispLogName = '%s/panda-PilotRequests.log' % panda_config.logdir
            # time limit
            timeLimit = datetime.datetime.utcnow() - datetime.timedelta(
                hours=3)
            timeLimitS = datetime.datetime.utcnow() - datetime.timedelta(
                hours=1)
            # check if tgz is required
            com = 'head -1 %s' % dispLogName
            lostat, loout = commands_get_status_output(com)
            useLogTgz = True
            if lostat == 0:
                match = re.search(r'^\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2}',
                                  loout)
                if match is not None:
                    startTime = datetime.datetime(*time.strptime(
                        match.group(0), '%Y-%m-%d %H:%M:%S')[:6])
                    # current log contains all info
                    if startTime < timeLimit:
                        useLogTgz = False
            # log files
            dispLogNameList = [dispLogName]
            if useLogTgz:
                today = datetime.date.today()
                dispLogNameList.append('{0}-{1}.gz'.format(
                    dispLogName, today.strftime('%Y%m%d')))
            # delete tmp
            commands_get_status_output('rm -f %s.tmp-*' % dispLogName)
            # tmp name
            tmpLogName = '%s.tmp-%s' % (
                dispLogName,
                datetime.datetime.utcnow().strftime('%Y-%m-%d-%H-%M-%S'))
            # loop over all files
            pilotCounts = {}
            pilotCountsS = {}
            for tmpDispLogName in dispLogNameList:
                # expand or copy
                if tmpDispLogName.endswith('.gz'):
                    com = 'gunzip -c %s > %s' % (tmpDispLogName, tmpLogName)
                else:
                    com = 'cp %s %s' % (tmpDispLogName, tmpLogName)
                lostat, loout = commands_get_status_output(com)
                if lostat != 0:
                    errMsg = 'failed to expand/copy %s with : %s' % (
                        tmpDispLogName, loout)
                    raise RuntimeError(errMsg)
                # search string
                sStr = r'^(\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2}).*'
                sStr += 'method=(.+),site=(.+),node=(.+),type=(.+)'
                # read
                logFH = open(tmpLogName)
                for line in logFH:
                    # check format
                    match = re.search(sStr, line)
                    if match is not None:
                        # check timerange
                        timeStamp = datetime.datetime(*time.strptime(
                            match.group(1), '%Y-%m-%d %H:%M:%S')[:6])
                        if timeStamp < timeLimit:
                            continue
                        tmpMethod = match.group(2)
                        tmpSite = match.group(3)
                        tmpNode = match.group(4)
                        tmpType = match.group(5)

                        # protection against corrupted entries from pilot,
                        # e.g. pilot reading site json from cvmfs while it was being updated
                        if tmpSite not in aSiteMapper.siteSpecList:
                            continue
                        # sum
                        pilotCounts.setdefault(tmpSite, {})
                        pilotCounts[tmpSite].setdefault(tmpMethod, {})
                        pilotCounts[tmpSite][tmpMethod].setdefault(tmpNode, 0)
                        pilotCounts[tmpSite][tmpMethod][tmpNode] += 1
                        # short
                        if timeStamp > timeLimitS:
                            if tmpSite not in pilotCountsS:
                                pilotCountsS[tmpSite] = dict()
                            if tmpMethod not in pilotCountsS[tmpSite]:
                                pilotCountsS[tmpSite][tmpMethod] = dict()
                            if tmpNode not in pilotCountsS[tmpSite][tmpMethod]:
                                pilotCountsS[tmpSite][tmpMethod][tmpNode] = 0
                            pilotCountsS[tmpSite][tmpMethod][tmpNode] += 1
                # close
                logFH.close()
            # delete tmp
            commands_get_status_output('rm %s' % tmpLogName)
            # update
            hostID = panda_config.pserverhost.split('.')[0]
            tmpLog.debug("pilotCounts session")
            retPC = taskBuffer.updateSiteData(hostID, pilotCounts, interval=3)
            tmpLog.debug(retPC)
            retPC = taskBuffer.updateSiteData(hostID, pilotCountsS, interval=1)
            tmpLog.debug(retPC)
    except Exception:
        errType, errValue = sys.exc_info()[:2]
        tmpLog.error("updateJob/getJob : %s %s" % (errType, errValue))

    # nRunning
    tmpLog.debug("nRunning session")
    try:
        if (currentMinute // panda_config.nrun_interval) % panda_config.nrun_hosts == panda_config.nrun_snum:
            retNR = taskBuffer.insertnRunningInSiteData()
            tmpLog.debug(retNR)
    except Exception:
        errType, errValue = sys.exc_info()[:2]
        tmpLog.error("nRunning : %s %s" % (errType, errValue))

    # session for co-jumbo jobs
    tmpLog.debug("co-jumbo session")
    try:
        ret = taskBuffer.getCoJumboJobsToBeFinished(30, 0, 1000)
        if ret is None:
            tmpLog.debug("failed to get co-jumbo jobs to finish")
        else:
            coJumboA, coJumboD, coJumboW, coJumboTokill = ret
            tmpLog.debug("finish {0} co-jumbo jobs in Active".format(
                len(coJumboA)))
            if len(coJumboA) > 0:
                jobSpecs = taskBuffer.peekJobs(coJumboA,
                                               fromDefined=False,
                                               fromActive=True,
                                               fromArchived=False,
                                               fromWaiting=False)
                for jobSpec in jobSpecs:
                    fileCheckInJEDI = taskBuffer.checkInputFileStatusInJEDI(
                        jobSpec)
                    if not fileCheckInJEDI:
                        jobSpec.jobStatus = 'closed'
                        jobSpec.jobSubStatus = 'cojumbo_wrong'
                        jobSpec.taskBufferErrorCode = pandaserver.taskbuffer.ErrorCode.EC_EventServiceInconsistentIn
                    taskBuffer.archiveJobs([jobSpec], False)
            tmpLog.debug("finish {0} co-jumbo jobs in Defined".format(
                len(coJumboD)))
            if len(coJumboD) > 0:
                jobSpecs = taskBuffer.peekJobs(coJumboD,
                                               fromDefined=True,
                                               fromActive=False,
                                               fromArchived=False,
                                               fromWaiting=False)
                for jobSpec in jobSpecs:
                    fileCheckInJEDI = taskBuffer.checkInputFileStatusInJEDI(
                        jobSpec)
                    if not fileCheckInJEDI:
                        jobSpec.jobStatus = 'closed'
                        jobSpec.jobSubStatus = 'cojumbo_wrong'
                        jobSpec.taskBufferErrorCode = pandaserver.taskbuffer.ErrorCode.EC_EventServiceInconsistentIn
                    taskBuffer.archiveJobs([jobSpec], True)
            tmpLog.debug("finish {0} co-jumbo jobs in Waiting".format(
                len(coJumboW)))
            if len(coJumboW) > 0:
                jobSpecs = taskBuffer.peekJobs(coJumboW,
                                               fromDefined=False,
                                               fromActive=False,
                                               fromArchived=False,
                                               fromWaiting=True)
                for jobSpec in jobSpecs:
                    fileCheckInJEDI = taskBuffer.checkInputFileStatusInJEDI(
                        jobSpec)
                    if not fileCheckInJEDI:
                        jobSpec.jobStatus = 'closed'
                        jobSpec.jobSubStatus = 'cojumbo_wrong'
                        jobSpec.taskBufferErrorCode = pandaserver.taskbuffer.ErrorCode.EC_EventServiceInconsistentIn
                    taskBuffer.archiveJobs([jobSpec], False, True)
            tmpLog.debug("kill {0} co-jumbo jobs in Waiting".format(
                len(coJumboTokill)))
            if len(coJumboTokill) > 0:
                jediJobs = list(coJumboTokill)
                nJob = 100
                iJob = 0
                while iJob < len(jediJobs):
                    tmpLog.debug(' killing %s' %
                                 str(jediJobs[iJob:iJob + nJob]))
                    Client.killJobs(jediJobs[iJob:iJob + nJob],
                                    51,
                                    keepUnmerged=True)
                    iJob += nJob
    except Exception:
        errStr = traceback.format_exc()
        tmpLog.error(errStr)

    tmpLog.debug("Fork session")

    # thread for fork
    class ForkThr(threading.Thread):
        def __init__(self, fileName):
            threading.Thread.__init__(self)
            self.fileName = fileName

        def run(self):
            if 'VIRTUAL_ENV' in os.environ:
                prefix = os.environ['VIRTUAL_ENV']
            else:
                prefix = ''
            setupStr = 'source {0}/etc/sysconfig/panda_server; '.format(prefix)
            runStr = '%s/python -Wignore ' % panda_config.native_python
            runStr += panda_config.pandaPython_dir + '/dataservice/forkSetupper.py -i '
            runStr += self.fileName
            if self.fileName.split('/')[-1].startswith('set.NULL.'):
                runStr += ' -t'
            comStr = setupStr + runStr
            tmpLog.debug(comStr)
            commands_get_status_output(comStr)

    # get set.* files
    filePatt = panda_config.logdir + '/' + 'set.*'
    fileList = glob.glob(filePatt)

    # the max number of threads
    maxThr = 10
    nThr = 0

    # loop over all files
    forkThrList = []
    timeNow = datetime.datetime.utcnow()
    for tmpName in fileList:
        if not os.path.exists(tmpName):
            continue
        try:
            # takes care of only recent files
            modTime = datetime.datetime(
                *(time.gmtime(os.path.getmtime(tmpName))[:6]))
            if (timeNow - modTime) > datetime.timedelta(minutes=1) and \
                    (timeNow - modTime) < datetime.timedelta(hours=1):
                cSt, cOut = commands_get_status_output(
                    'ps aux | grep fork | grep -v PYTH')
                # if no process is running for the file
                if cSt == 0 and tmpName not in cOut:
                    nThr += 1
                    thr = ForkThr(tmpName)
                    thr.start()
                    forkThrList.append(thr)
                    if nThr > maxThr:
                        break
        except Exception:
            errType, errValue = sys.exc_info()[:2]
            tmpLog.error("%s %s" % (errType, errValue))

    # join fork threads
    for thr in forkThrList:
        thr.join()

    # terminate TaskBuffer IF
    # taskBufferIF.terminate()

    tmpLog.debug("===================== end =====================")
Example #16
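# Another proxy-cache refresher that strips '/CN=limited proxy', '/CN=proxy' and trailing numeric CN components from the DN before calling checkProxy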
    taskBuffer.init(panda_config.dbhost,
                    panda_config.dbpasswd,
                    nDBConnection=1)

    # instantiate MyProxy I/F
    my_proxy_interface_instance = panda_proxy_cache.MyProxyInterface()

    # roles
    if hasattr(panda_config, 'proxy_cache_roles'):
        roles = panda_config.proxy_cache_roles.split(',')
    else:
        roles = [
            'atlas', 'atlas:/atlas/Role=production', 'atlas:/atlas/Role=pilot'
        ]
    # get users
    sql = 'select distinct DN FROM ATLAS_PANDAMETA.users WHERE GRIDPREF LIKE :patt'
    varMap = {}
    varMap[':patt'] = '%p%'
    tmpStat, tmpRes = taskBuffer.querySQLS(sql, varMap)
    for realDN, in tmpRes:
        if realDN is None:
            continue
        realDN = re.sub('/CN=limited proxy', '', realDN)
        realDN = re.sub('(/CN=proxy)+', '', realDN)
        realDN = re.sub(r'(/CN=\d+)+$', '', realDN)
        # check proxy
        tmpLog.debug("check proxy cache for DN={0}".format(realDN))
        for role in roles:
            my_proxy_interface_instance.checkProxy(realDN, role=role)
    tmpLog.debug("done")