def createFileFromDataStructsFile(self, file, jobID):
    """
    _createFileFromDataStructsFile_

    Create a WMBS File object from a DataStructs file dictionary.

    The resulting file carries a single storage-element location (the first
    one found in file["locations"]), the owning job id under 'jid', and is
    queued on self.wmbsFilesToBuild for a later bulk insert.

    :param file: DataStructs file (dict-like) with a "locations" entry that
                 may be a set, a list, or a single SE name
    :param jobID: id of the job this file belongs to
    :return: the newly built WMBS File
    """
    wmbsFile = File()
    wmbsFile.update(file)

    # "locations" may arrive as a set, a list, or a bare SE name.
    # Only a single location is kept; warn if we had to drop some.
    if isinstance(file["locations"], set):
        seName = list(file["locations"])[0]
    elif isinstance(file["locations"], list):
        if len(file['locations']) > 1:
            logging.error("Have more than one location for a file in job %i", jobID)
            logging.error("Choosing location %s", file['locations'][0])
        seName = file["locations"][0]
    else:
        seName = file["locations"]

    # Reset locations so only the chosen SE ends up attached.
    wmbsFile["locations"] = set()
    if seName is not None:
        wmbsFile.setLocation(se=seName, immediateSave=False)
    wmbsFile['jid'] = jobID
    self.wmbsFilesToBuild.append(wmbsFile)
    return wmbsFile
def createFileFromDataStructsFile(self, file, jobID):
    """
    _createFileFromDataStructsFile_

    Create a WMBS File object from a DataStructs file dictionary.

    The resulting file carries a single PNN location (the first one found in
    file["locations"]) and the owning job id under 'jid'.

    :param file: DataStructs file (dict-like) with a "locations" entry that
                 may be a set, a list, or a single PNN
    :param jobID: id of the job this file belongs to
    :return: the newly built WMBS File
    """
    wmbsFile = File()
    wmbsFile.update(file)

    # "locations" may arrive as a set, a list, or a bare PNN.
    # Only a single location is kept; warn if we had to drop some.
    if isinstance(file["locations"], set):
        pnn = list(file["locations"])[0]
    elif isinstance(file["locations"], list):
        if len(file['locations']) > 1:
            logging.error("Have more than one location for a file in job %i", jobID)
            logging.error("Choosing location %s", file['locations'][0])
        pnn = file["locations"][0]
    else:
        pnn = file["locations"]

    # Reset locations so only the chosen PNN ends up attached.
    wmbsFile["locations"] = set()
    if pnn is not None:
        wmbsFile.setLocation(pnn=pnn, immediateSave=False)
    wmbsFile['jid'] = jobID
    return wmbsFile
def loadFiles(self, size=10):
    """
    _loadFiles_

    Grab up to `size` files from the first result proxy in self.proxies.

    A proxy that returns fewer rows than requested is assumed exhausted and
    is dropped from self.proxies; subsequent calls move on to the next one.

    :param size: maximum number of files to load in this call
    :return: a set of WMBSFile objects (empty when no files remain)
    """
    if len(self.proxies) < 1:
        # No proxies left: we ran out of files on a previous call.
        logging.info("No additional files found; Ending.")
        return set()

    resultProxy = self.proxies[0]
    rawResults = []

    # resultProxy.keys may be a plain list attribute or a method returning
    # the keys, depending on the SQLAlchemy version -- handle both.
    if isinstance(resultProxy.keys, list):
        keys = resultProxy.keys
    else:
        keys = resultProxy.keys()
    if isinstance(keys, set):
        # If it's a set, handle it
        keys = list(keys)

    files = set()
    while len(rawResults) < size and len(self.proxies) > 0:
        length = size - len(rawResults)
        newResults = resultProxy.fetchmany(size=length)
        rawResults.extend(newResults)
        if len(newResults) < length:
            # Assume this proxy is exhausted; eliminate it and stop.
            # The break prevents a second remove() of the same proxy on the
            # next iteration, which would raise ValueError when additional
            # proxies are still queued.
            self.proxies.remove(resultProxy)
            break

    if not rawResults:
        # Nothing to do
        return set()

    fileList = self.formatDict(results=rawResults, keys=keys)
    fileIDs = list(set([x["fileid"] for x in fileList]))

    myThread = threading.currentThread()
    fileInfoAct = self.daoFactory(classname="Files.GetForJobSplittingByID")
    fileInfoDict = fileInfoAct.execute(file=fileIDs,
                                       conn=myThread.transaction.conn,
                                       transaction=True)
    getLocAction = self.daoFactory(classname="Files.GetLocationBulk")
    getLocDict = getLocAction.execute(files=fileIDs,
                                      conn=myThread.transaction.conn,
                                      transaction=True)

    for fID in fileIDs:
        fl = WMBSFile(id=fID)
        fl.update(fileInfoDict[fID])
        # A file may legitimately have no known location.
        for loc in getLocDict.get(fID, []):
            fl.setLocation(loc, immediateSave=False)
        files.add(fl)

    return files
def filesOfStatus(self, status, limit=0, loadChecksums=True, doingJobSplitting=False):
    """
    _filesOfStatus_

    Return a set of File objects that have the given status with respect
    to this subscription.

    :param status: fileset status name (e.g. "available"); title-cased to
                   select the matching Subscriptions.Get<Status>Files DAO
    :param limit: when > 0, cap the number of files via the ByLimit DAO
    :param loadChecksums: also load each file's checksum
    :param doingJobSplitting: use the job-splitting file-info DAO instead
                              of the plain GetByID one
    :return: set of File objects
    """
    existingTransaction = self.beginTransaction()

    status = status.title()
    files = set()
    if limit > 0:
        action = self.daofactory(
            classname="Subscriptions.Get%sFilesByLimit" % status)
        fileList = action.execute(self["id"], limit, conn=self.getDBConn(),
                                  transaction=self.existingTransaction())
    else:
        action = self.daofactory(classname="Subscriptions.Get%sFiles" % status)
        fileList = action.execute(self["id"], conn=self.getDBConn(),
                                  transaction=self.existingTransaction())

    if doingJobSplitting:
        fileInfoAct = self.daofactory(
            classname="Files.GetForJobSplittingByID")
    else:
        fileInfoAct = self.daofactory(classname="Files.GetByID")

    fileInfoDict = fileInfoAct.execute(
        file=[x["file"] for x in fileList],
        conn=self.getDBConn(),
        transaction=self.existingTransaction())

    # Run through all files, attaching info (and locations if present)
    for f in fileList:
        fl = File(id=f['file'])
        if loadChecksums:
            fl.loadChecksum()
        fl.update(fileInfoDict[f['file']])
        if 'locations' in f:
            fl.setLocation(f['locations'], immediateSave=False)
        files.add(fl)

    self.commitTransaction(existingTransaction)
    return files
def filesOfStatus(self, status, limit = 0, loadChecksums = True, doingJobSplitting = False):
    """
    _filesOfStatus_

    Return a set of File objects that have the given status with respect
    to this subscription.

    :param status: fileset status name (e.g. "available"); title-cased to
                   select the matching Subscriptions.Get<Status>Files DAO
    :param limit: when > 0, cap the number of files via the ByLimit DAO
    :param loadChecksums: also load each file's checksum
    :param doingJobSplitting: use the job-splitting file-info DAO instead
                              of the plain GetByID one
    :return: set of File objects
    """
    existingTransaction = self.beginTransaction()

    status = status.title()
    files = set()
    if limit > 0:
        action = self.daofactory(classname = "Subscriptions.Get%sFilesByLimit" % status)
        fileList = action.execute(self["id"], limit, conn = self.getDBConn(),
                                  transaction = self.existingTransaction())
    else:
        action = self.daofactory(classname = "Subscriptions.Get%sFiles" % status)
        fileList = action.execute(self["id"], conn = self.getDBConn(),
                                  transaction = self.existingTransaction())

    if doingJobSplitting:
        fileInfoAct = self.daofactory(classname = "Files.GetForJobSplittingByID")
    else:
        fileInfoAct = self.daofactory(classname = "Files.GetByID")

    fileInfoDict = fileInfoAct.execute(file = [x["file"] for x in fileList],
                                       conn = self.getDBConn(),
                                       transaction = self.existingTransaction())

    # Run through all files, attaching info (and locations if present)
    for f in fileList:
        fl = File(id = f['file'])
        if loadChecksums:
            fl.loadChecksum()
        fl.update(fileInfoDict[f['file']])
        if 'locations' in f:
            fl.setLocation(f['locations'], immediateSave = False)
        files.add(fl)

    self.commitTransaction(existingTransaction)
    return files
def loadFiles(self, size=10):
    """
    _loadFiles_

    Grab up to `size` files from the first result proxy in self.proxies.

    A proxy that returns fewer rows than requested is assumed exhausted and
    is dropped from self.proxies; subsequent calls move on to the next one.

    :param size: maximum number of files to load in this call
    :return: a set of WMBSFile objects (empty when no files remain)
    """
    if len(self.proxies) < 1:
        # No proxies left: we ran out of files on a previous call.
        logging.info("No additional files found; Ending.")
        return set()

    resultProxy = self.proxies[0]
    rawResults = []

    # resultProxy.keys may be a plain list attribute or a method returning
    # the keys, depending on the SQLAlchemy version -- handle both.
    if isinstance(resultProxy.keys, list):
        keys = resultProxy.keys
    else:
        keys = resultProxy.keys()
    if isinstance(keys, set):
        # If it's a set, handle it
        keys = list(keys)

    files = set()
    while len(rawResults) < size and len(self.proxies) > 0:
        length = size - len(rawResults)
        newResults = resultProxy.fetchmany(size=length)
        rawResults.extend(newResults)
        if len(newResults) < length:
            # Assume this proxy is exhausted; eliminate it and stop.
            # The break prevents a second remove() of the same proxy on the
            # next iteration, which would raise ValueError when additional
            # proxies are still queued.
            self.proxies.remove(resultProxy)
            break

    if not rawResults:
        # Nothing to do
        return set()

    fileList = self.formatDict(results=rawResults, keys=keys)
    fileIDs = list(set([x['fileid'] for x in fileList]))

    myThread = threading.currentThread()
    fileInfoAct = self.daoFactory(classname="Files.GetForJobSplittingByID")
    fileInfoDict = fileInfoAct.execute(file=fileIDs,
                                       conn=myThread.transaction.conn,
                                       transaction=True)
    getLocAction = self.daoFactory(classname="Files.GetLocationBulk")
    getLocDict = getLocAction.execute(files=fileIDs,
                                      conn=myThread.transaction.conn,
                                      transaction=True)

    for fID in fileIDs:
        fl = WMBSFile(id=fID)
        fl.update(fileInfoDict[fID])
        # A file may legitimately have no known location.
        for loc in getLocDict.get(fID, []):
            fl.setLocation(loc, immediateSave=False)
        files.add(fl)

    return files
def execute(self, jobID, fileSelection = None, conn = None, transaction = False):
    """
    _execute_

    Execute the SQL for the given job ID(s), attach input files (with
    locations, parents, and run/lumi info) to each job, and return the
    formatted job list.

    :param jobID: a single job id or a list of bind dicts
    :param fileSelection: optional dict keyed by job id mapping to the list
                          of lfns to keep for that job
    :param conn: optional DB connection
    :param transaction: run inside an existing transaction
    :return: list of formatted jobs, each with 'input_files' populated
    """
    if isinstance(jobID, list):
        if len(jobID) < 1:
            # Nothing to do
            return []
        binds = jobID
    else:
        binds = {"jobid": jobID}

    result = self.dbi.processData(self.sql, binds, conn = conn,
                                  transaction = transaction)
    jobList = self.formatJobs(result)

    filesResult = self.dbi.processData(self.fileSQL, binds, conn = conn,
                                       transaction = transaction)
    fileList = self.formatDict(filesResult)

    if fileSelection:
        # Use a list comprehension, NOT filter(): in Python 3 filter()
        # returns a one-shot iterator, and fileList is iterated twice below
        # (the second pass would see nothing).
        fileList = [x for x in fileList if x['lfn'] in fileSelection[x['jobid']]]

    fileBinds = []
    for x in fileList:
        # Add new runs
        x['newRuns'] = []
        # Assemble unique list of binds
        if {'fileid': x['id']} not in fileBinds:
            fileBinds.append({'fileid': x['id']})

    parentList = []
    if len(fileBinds) > 0:
        parentResult = self.dbi.processData(self.parentSQL, fileBinds,
                                            conn = conn, transaction = transaction)
        parentList = self.formatDict(parentResult)

        lumiResult = self.dbi.processData(self.runLumiSQL, fileBinds,
                                          conn = conn, transaction = transaction)
        lumiList = self.formatDict(lumiResult)

        # Group the run/lumi rows by file id
        lumiDict = {}
        for l in lumiList:
            lumiDict.setdefault(l['fileid'], []).append(l)

        # Build Run objects per file, grouping lumis by run number
        for f in fileList:
            fileRuns = {}
            for l in lumiDict.get(f['id'], []):
                fileRuns.setdefault(l['run'], []).append(l['lumi'])
            for r, lumis in fileRuns.items():
                newRun = Run(runNumber = r)
                newRun.lumis = lumis
                f['newRuns'].append(newRun)

    filesForJobs = {}
    for f in fileList:
        jobid = f['jobid']
        filesForJobs.setdefault(jobid, {})
        if f['id'] not in filesForJobs[jobid]:
            wmbsFile = File(id = f['id'])
            wmbsFile.update(f)
            wmbsFile['locations'].add(f['se_name'])
            for r in wmbsFile['newRuns']:
                wmbsFile.addRun(r)
            for entry in parentList:
                if entry['id'] == f['id']:
                    wmbsFile['parents'].add(entry['lfn'])
            filesForJobs[jobid][f['id']] = wmbsFile
        else:
            # If the file is there, just add the location
            filesForJobs[jobid][f['id']]['locations'].add(f['se_name'])

    for j in jobList:
        if j['id'] in filesForJobs:
            j['input_files'] = filesForJobs[j['id']].values()

    return jobList
def execute(self, jobID, conn=None, transaction=False):
    """
    _execute_

    Execute the SQL for the given job ID(s), attach input files (with
    locations, parents, and run/lumi info) to each job, and return the
    formatted job list.

    :param jobID: a single job id or a list of bind dicts
    :param conn: optional DB connection
    :param transaction: run inside an existing transaction
    :return: list of formatted jobs, each with 'input_files' populated
    """
    if isinstance(jobID, list):
        if len(jobID) < 1:
            # Nothing to do
            return []
        binds = jobID
    else:
        binds = {"jobid": jobID}

    result = self.dbi.processData(self.sql, binds, conn=conn,
                                  transaction=transaction)
    jobList = self.formatJobs(result)

    filesResult = self.dbi.processData(self.fileSQL, binds, conn=conn,
                                       transaction=transaction)
    fileList = self.formatDict(filesResult)

    fileBinds = []
    for x in fileList:
        # Add new runs
        x['newRuns'] = []
        # Assemble unique list of binds
        if {'fileid': x['id']} not in fileBinds:
            fileBinds.append({'fileid': x['id']})

    parentList = []
    if len(fileBinds) > 0:
        parentResult = self.dbi.processData(self.parentSQL, fileBinds,
                                            conn=conn, transaction=transaction)
        parentList = self.formatDict(parentResult)

        lumiResult = self.dbi.processData(self.runLumiSQL, fileBinds,
                                          conn=conn, transaction=transaction)
        lumiList = self.formatDict(lumiResult)

        # Group the run/lumi rows by file id
        lumiDict = {}
        for l in lumiList:
            lumiDict.setdefault(l['fileid'], []).append(l)

        # Build Run objects per file, grouping lumis by run number
        for f in fileList:
            fileRuns = {}
            for l in lumiDict.get(f['id'], []):
                fileRuns.setdefault(l['run'], []).append(l['lumi'])
            for r, lumis in fileRuns.items():
                newRun = Run(runNumber=r)
                newRun.lumis = lumis
                f['newRuns'].append(newRun)

    filesForJobs = {}
    for f in fileList:
        jobid = f['jobid']
        filesForJobs.setdefault(jobid, {})
        if f['id'] not in filesForJobs[jobid]:
            wmbsFile = File(id=f['id'])
            wmbsFile.update(f)
            wmbsFile['locations'].add(f['se_name'])
            for r in wmbsFile['newRuns']:
                wmbsFile.addRun(r)
            for entry in parentList:
                if entry['id'] == f['id']:
                    wmbsFile['parents'].add(entry['lfn'])
            filesForJobs[jobid][f['id']] = wmbsFile
        else:
            # If the file is there, just add the location
            filesForJobs[jobid][f['id']]['locations'].add(f['se_name'])

    for j in jobList:
        if j['id'] in filesForJobs:
            j['input_files'] = filesForJobs[j['id']].values()

    return jobList
def execute(self, jobID, fileSelection=None, conn=None, transaction=False):
    """
    _execute_

    Execute the SQL for the given job ID(s), attach input files (with
    locations, parents, and run/lumi info) to each job, and return the
    formatted job list.

    :param jobID: a single job id or a list of bind dicts
    :param fileSelection: optional dict keyed by job id mapping to the list
                          of lfns to keep for that job
    :param conn: optional DB connection
    :param transaction: run inside an existing transaction
    :return: list of formatted jobs, each with 'input_files' populated
    """
    if isinstance(jobID, list) and not len(jobID):
        return []
    elif isinstance(jobID, list):
        binds = jobID
    else:
        binds = [{"jobid": jobID}]

    result = self.dbi.processData(self.sql, binds, conn=conn,
                                  transaction=transaction)
    jobList = self.formatDict(result)
    for entry in jobList:
        entry.setdefault('input_files', [])

    filesResult = self.dbi.processData(self.fileSQL, binds, conn=conn,
                                       transaction=transaction)
    fileList = self.formatDict(filesResult)

    noDuplicateFiles = {}
    fileBinds = []
    if fileSelection:
        fileList = [x for x in fileList if x['lfn'] in fileSelection[x['jobid']]]
    for x in fileList:
        # Assemble unique list of binds
        if {'fileid': x['id']} not in fileBinds:
            fileBinds.append({'fileid': x['id']})
            noDuplicateFiles[x['id']] = x

    parentList = []
    if len(fileBinds) > 0:
        parentResult = self.dbi.processData(self.parentSQL, fileBinds,
                                            conn=conn, transaction=transaction)
        parentList = self.formatDict(parentResult)
        # only load run/lumis onto the de-duplicated files to prevent
        # excessive memory use
        self.getRunLumis(fileBinds, noDuplicateFiles.values(), conn, transaction)

    filesForJobs = {}
    for f in fileList:
        jobid = f['jobid']
        filesForJobs.setdefault(jobid, {})
        if f['id'] not in filesForJobs[jobid]:
            wmbsFile = File(id=f['id'])
            # update from noDuplicateFiles since that copy carries the
            # run/lumi information attached by getRunLumis above
            wmbsFile.update(noDuplicateFiles[f["id"]])
            if f['pnn']:  # file might not have a valid location, or be Null
                wmbsFile['locations'].add(f['pnn'])
            for r in wmbsFile.pop('newRuns'):
                wmbsFile.addRun(r)
            for entry in parentList:
                if entry['id'] == f['id']:
                    wmbsFile['parents'].add(entry['lfn'])
            wmbsFile.pop('pnn', None)  # not needed for anything, just remove it
            filesForJobs[jobid][f['id']] = wmbsFile
        elif f['pnn']:
            # If the file is there and it has a location, just add it
            filesForJobs[jobid][f['id']]['locations'].add(f['pnn'])

    for j in jobList:
        if j['id'] in filesForJobs:
            j['input_files'] = filesForJobs[j['id']].values()

    return jobList
def execute(self, jobID, fileSelection=None, conn=None, transaction=False):
    """
    _execute_

    Execute the SQL for the given job ID(s), attach input files (with
    locations, parents, and run/lumi info) to each job, and return the
    formatted job list.

    :param jobID: a single job id or a list of bind dicts
    :param fileSelection: optional dict keyed by job id mapping to the list
                          of lfns to keep for that job
    :param conn: optional DB connection
    :param transaction: run inside an existing transaction
    :return: list of formatted jobs, each with 'input_files' populated
    """
    if isinstance(jobID, list) and not len(jobID):
        return []
    elif isinstance(jobID, list):
        binds = jobID
    else:
        binds = [{"jobid": jobID}]

    result = self.dbi.processData(self.sql, binds, conn=conn,
                                  transaction=transaction)
    jobList = self.formatDict(result)
    for entry in jobList:
        entry.setdefault('input_files', [])

    filesResult = self.dbi.processData(self.fileSQL, binds, conn=conn,
                                       transaction=transaction)
    fileList = self.formatDict(filesResult)

    noDuplicateFiles = {}
    fileBinds = []
    if fileSelection:
        fileList = [x for x in fileList if x['lfn'] in fileSelection[x['jobid']]]
    for x in fileList:
        # Assemble unique list of binds
        if {'fileid': x['id']} not in fileBinds:
            fileBinds.append({'fileid': x['id']})
            noDuplicateFiles[x['id']] = x

    parentList = []
    if len(fileBinds) > 0:
        parentResult = self.dbi.processData(self.parentSQL, fileBinds,
                                            conn=conn, transaction=transaction)
        parentList = self.formatDict(parentResult)
        # only load run/lumis onto the de-duplicated files to prevent
        # excessive memory use
        self.getRunLumis(fileBinds, noDuplicateFiles.values(), conn, transaction)

    filesForJobs = {}
    for f in fileList:
        jobid = f['jobid']
        filesForJobs.setdefault(jobid, {})
        if f['id'] not in filesForJobs[jobid]:
            wmbsFile = File(id=f['id'])
            # update from noDuplicateFiles since that copy carries the
            # run/lumi information attached by getRunLumis above
            wmbsFile.update(noDuplicateFiles[f["id"]])
            if 'pnn' in f:  # file might not have a valid location
                wmbsFile['locations'].add(f['pnn'])
            for r in wmbsFile.pop('newRuns'):
                wmbsFile.addRun(r)
            for entry in parentList:
                if entry['id'] == f['id']:
                    wmbsFile['parents'].add(entry['lfn'])
            wmbsFile.pop('pnn', None)  # not needed for anything
            filesForJobs[jobid][f['id']] = wmbsFile
        elif 'pnn' in f:
            # If the file is there and it has a location, just add it
            filesForJobs[jobid][f['id']]['locations'].add(f['pnn'])

    for j in jobList:
        if j['id'] in filesForJobs:
            j['input_files'] = filesForJobs[j['id']].values()

    return jobList
def execute(self, jobID, conn = None, transaction = False):
    """
    _execute_

    Execute the SQL for the given job ID(s), build Job objects with their
    input files (including run/lumi information), load each job's mask,
    and return the list of jobs.

    :param jobID: a single job id or a list of job ids
    :param conn: optional DB connection
    :param transaction: run inside an existing transaction
    :return: list of Job objects with 'input_files' and masks populated
    """
    if not isinstance(jobID, list):
        jobID = [jobID]

    binds = [{"jobid": x} for x in jobID]
    if not binds:
        return []

    # First load full file information with run/lumis
    filesResult = self.dbi.processData(self.fileSQL, binds, conn = conn,
                                       transaction = transaction)
    fileList = self.formatDict(filesResult)

    # Clear duplicates: collect the distinct file ids for the lumi query
    uniqueFileIDs = set()
    for result in fileList:
        uniqueFileIDs.add(result['id'])
        result['newRuns'] = []
    fileBinds = [{'fileid' : x} for x in uniqueFileIDs]

    # Load run/lumi information
    if len(fileBinds):
        lumiResult = self.dbi.processData(self.runLumiSQL, fileBinds,
                                          conn = conn, transaction = transaction)
        lumiList = self.formatDict(lumiResult)

        # Group the run/lumi rows by file id
        lumiDict = {}
        for l in lumiList:
            lumiDict.setdefault(l['fileid'], []).append(l)

        # Build Run objects per file, grouping lumis by run number
        for f in fileList:
            fileRuns = {}
            for l in lumiDict.get(f['id'], []):
                fileRuns.setdefault(l['run'], []).append(l['lumi'])
            for r, lumis in fileRuns.items():
                newRun = Run(runNumber = r)
                newRun.lumis = lumis
                f['newRuns'].append(newRun)

    filesForJobs = {}
    for f in fileList:
        jobid = f['jobid']
        filesForJobs.setdefault(jobid, {})
        if f['id'] not in filesForJobs[jobid]:
            wmbsFile = File(id = f['id'])
            wmbsFile.update(f)
            for r in wmbsFile['newRuns']:
                wmbsFile.addRun(r)
            filesForJobs[jobid][f['id']] = wmbsFile

    # Add the file information to job objects and load the masks
    jobList = [Job(id = x) for x in jobID]
    for j in jobList:
        if j['id'] in filesForJobs:
            j['input_files'] = filesForJobs[j['id']].values()
        j['mask'].load(j['id'])

    return jobList
def execute(self, jobID, conn=None, transaction=False):
    """
    _execute_

    Execute the SQL for the given job ID(s), build Job objects with their
    input files (including run/lumi information), load each job's mask,
    and return the list of jobs.

    :param jobID: a single job id or a list of job ids
    :param conn: optional DB connection
    :param transaction: run inside an existing transaction
    :return: list of Job objects with 'input_files' and masks populated
    """
    if not isinstance(jobID, list):
        jobID = [jobID]

    binds = [{"jobid": x} for x in jobID]
    if not binds:
        return []

    # First load full file information with run/lumis
    filesResult = self.dbi.processData(self.fileSQL, binds, conn=conn,
                                       transaction=transaction)
    fileList = self.formatDict(filesResult)

    # Clear duplicates: collect the distinct file ids for the lumi query
    uniqueFileIDs = set()
    for result in fileList:
        uniqueFileIDs.add(result['id'])
        result['newRuns'] = []
    fileBinds = [{'fileid': x} for x in uniqueFileIDs]

    # Load run/lumi information
    if len(fileBinds):
        lumiResult = self.dbi.processData(self.runLumiSQL, fileBinds,
                                          conn=conn, transaction=transaction)
        lumiList = self.formatDict(lumiResult)

        # Group the run/lumi rows by file id
        lumiDict = {}
        for l in lumiList:
            lumiDict.setdefault(l['fileid'], []).append(l)

        # Build Run objects per file, grouping lumis by run number
        for f in fileList:
            fileRuns = {}
            for l in lumiDict.get(f['id'], []):
                fileRuns.setdefault(l['run'], []).append(l['lumi'])
            for r, lumis in fileRuns.items():
                newRun = Run(runNumber=r)
                newRun.lumis = lumis
                f['newRuns'].append(newRun)

    filesForJobs = {}
    for f in fileList:
        jobid = f['jobid']
        filesForJobs.setdefault(jobid, {})
        if f['id'] not in filesForJobs[jobid]:
            wmbsFile = File(id=f['id'])
            wmbsFile.update(f)
            for r in wmbsFile['newRuns']:
                wmbsFile.addRun(r)
            filesForJobs[jobid][f['id']] = wmbsFile

    # Add the file information to job objects and load the masks
    jobList = [Job(id=x) for x in jobID]
    for j in jobList:
        if j['id'] in filesForJobs:
            j['input_files'] = filesForJobs[j['id']].values()
        j['mask'].load(j['id'])

    return jobList