def loadCatalog(catalog, book, dataset):
    # load the unique file ids of the existing catalog for existence checks (careful)
    mitcfg = book.split("/")[0]
    version = book.split("/")[1]
    catalogedIds = fileIds.fileIds()

    # first make sure the catalog is compact
    rc = 0
    cmd = "grep root " + catalog + '/' + book + '/' + dataset + '/RawFiles.00'
    list = cmd.split(" ")
    p = subprocess.Popen(list, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    (out, err) = p.communicate()
    rc = p.returncode
    if rc != 0:
        print " ERROR -- %d" % (rc)
        print out
        print err
        #sys.exit(1)

    for line in out.split("\n"):
        f = line.split(" ")
        if len(f) > 2:
            name = f[0]
            nEvents = int(f[1])
            catalogedId = fileIds.fileId(name, nEvents)
            catalogedIds.addFileId(catalogedId)

    return catalogedIds

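# A minimal, self-contained sketch of the run-a-command-and-parse pattern that
# loadCatalog, loadFilesFromDisk and loadLfns all follow: run a shell command with
# subprocess, check the return code, and keep the whitespace-separated fields of each
# useful output line.  The helper name run_and_parse and the example path are
# hypothetical and only illustrate the pattern; the real functions wrap the parsed
# fields into fileIds.fileId objects instead of returning raw rows.
def run_and_parse(cmd):
    import subprocess
    p = subprocess.Popen(cmd.split(" "), stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    (out, err) = p.communicate()
    if p.returncode != 0:
        print(" ERROR -- %d" % p.returncode)
        return []
    rows = []
    for line in out.decode().split("\n"):
        f = line.split()
        if len(f) > 1:               # keep only lines with at least two fields
            rows.append(f)
    return rows
# example (hypothetical path): rows = run_and_parse("cat /tmp/RawFiles.00")
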
def loadFilesFromDisk(book, dataset):
    fileOnDiskIds = fileIds.fileIds()

    # list all files from the given directory
    cmd = 'list ' + DIR + "/" + book + "/" + dataset
    if DEBUG > 0:
        print " CMD (loadFilesFromDisk): " + cmd
    rc = 0
    list = cmd.split(" ")
    p = subprocess.Popen(list, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    (out, err) = p.communicate()
    rc = p.returncode
    if rc != 0:
        print " ERROR -- %d" % (rc)
        print out
        print err
        #sys.exit(1)

    for line in out.split("\n"):
        if '.root' in line:
            f = line.split(" ")
            if len(f) > 1:
                name = f[1]
                nEvents = -1
                fileOnDiskId = fileIds.fileId(name, nEvents)
                fileOnDiskIds.addFileId(fileOnDiskId)

    return fileOnDiskIds

def loadLfns(dataset):
    lfnIds = fileIds.fileIds()

    # find the correct file
    lfnFile = "/home/cmsprod/cms/jobs/lfns/" + dataset + ".lfns"
    if DEBUG > 0:
        print " LFN file: " + lfnFile
    rc = 0
    cmd = "cat " + lfnFile
    list = cmd.split(" ")
    p = subprocess.Popen(list, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    (out, err) = p.communicate()
    rc = p.returncode
    if rc != 0:
        print " ERROR -- %d" % (rc)
        print out
        print err
        #sys.exit(1)

    for line in out.split("\n"):
        f = line.split(" ")
        if len(f) > 2:
            name = f[1]
            nEvents = int(f[2])
            lfnId = fileIds.fileId(name, nEvents)
            lfnIds.addFileId(lfnId)

    return lfnIds

def makeCatalog(dir, nFilesPerSet):
    cmd = 'cat ' + dir + '/RawFiles.?? | grep root | sort -u'
    init = False
    iFileset = 0
    fileset = None
    filesetsOut = open(dir + '/Filesets', 'w')
    with open(dir + '/Files', 'w') as filesOut:
        for line in os.popen(cmd).readlines():   # run command
            line = line[:-1]
            line = " ".join(str(line).split()).strip()
            f = line.split(' ')
            g = f[0].split('/')
            dataDir = '/'.join(g[:-1])
            if not init:
                fileset = fileIds.fileIdSet('0000', dataDir)
                init = True
            fileName = g[-1]
            if len(f) != 7:
                print ' Length is not seven: %d' % len(f)
                sys.exit(1)
            file = fileIds.fileId(fileName, int(f[1]), int(f[2]), int(f[3]),
                                  int(f[4]), int(f[5]), int(f[6]))
            if fileset.nFiles() < nFilesPerSet:
                fileset.addFile(file)
            else:
                fileset.showShort(filesetsOut, DEBUG)
                fileset.showShortFiles(filesOut, DEBUG)
                iFileset = iFileset + 1
                name = '%04d' % iFileset
                fileset.reset(name, dataDir)
                fileset.addFile(file)

        # complete what was started (will never be empty if init is set)
        if init:
            fileset.showShort(filesetsOut, DEBUG)
            fileset.showShortFiles(filesOut, DEBUG)
            iFileset = iFileset + 1

    filesetsOut.close()

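# Self-contained sketch of the grouping logic makeCatalog relies on: files are
# collected into filesets of at most nFilesPerSet entries, and each fileset gets a
# zero-padded sequential name ('0000', '0001', ...).  chunk_files is a hypothetical
# helper used only to illustrate the bookkeeping; the real code keeps the state in a
# fileIds.fileIdSet and writes it out with showShort/showShortFiles.
def chunk_files(fileNames, nFilesPerSet):
    filesets = {}
    current = []
    iFileset = 0
    for name in fileNames:
        if len(current) >= nFilesPerSet:      # current fileset is full: close it
            filesets['%04d' % iFileset] = current
            iFileset += 1
            current = []
        current.append(name)
    if current:                               # flush the last, possibly partial fileset
        filesets['%04d' % iFileset] = current
    return filesets
# chunk_files(['a.root', 'b.root', 'c.root'], 2)
#   -> {'0000': ['a.root', 'b.root'], '0001': ['c.root']}
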
def getFiles(book, dataset):
    # get all corresponding files
    version = book.split('/')[1]
    mitcfg = book.split('/')[0]

    # decode the dataset
    f = dataset.split('+')
    process = f[0]
    setup = f[1]
    tier = f[2]

    sql = "select FileName, NEvents from Files inner join Requests on " \
        + " Files.RequestId = Requests.RequestId inner join Datasets on " \
        + " Requests.DatasetId = Datasets.DatasetId where " \
        + " DatasetProcess = '%s' and DatasetSetup='%s' and DatasetTier='%s'"%(process,setup,tier) \
        + " and RequestConfig = '%s' and RequestVersion = '%s'"%(mitcfg,version)
    results = []
    try:
        # Execute the SQL command
        if DEBUG > 0:
            print ' SQL: %s' % (sql)
        Cursor.execute(sql)
        if DEBUG > 0:
            print ' SQL: fetch results'
        results = Cursor.fetchall()
        if DEBUG > 0:
            print ' SQL: DONE'
    except:
        print 'ERROR(%s) - could not find request id.' % (sql)

    # found the request id
    catalogedIds = fileIds.fileIds()
    for row in results:
        fileId = row[0]
        nEvents = int(row[1])
        catalogedId = fileIds.fileId(fileId, nEvents)
        catalogedIds.addFileId(catalogedId)

    return catalogedIds

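# Sketch of the same Files/Requests/Datasets join issued with DB-API parameter
# binding instead of string interpolation, assuming the module-level Cursor follows
# PEP 249 with the 'format' paramstyle (as MySQLdb cursors do).  Table and column
# names are taken from the query above; the function name is hypothetical and the
# sketch is illustrative only, not a drop-in replacement for getFiles.
def getFilesParameterized(book, dataset):
    mitcfg, version = book.split('/')[0], book.split('/')[1]
    f = dataset.split('+')
    process, setup, tier = f[0], f[1], f[2]
    sql = ("select FileName, NEvents from Files"
           " inner join Requests on Files.RequestId = Requests.RequestId"
           " inner join Datasets on Requests.DatasetId = Datasets.DatasetId"
           " where DatasetProcess = %s and DatasetSetup = %s and DatasetTier = %s"
           " and RequestConfig = %s and RequestVersion = %s")
    Cursor.execute(sql, (process, setup, tier, mitcfg, version))
    return Cursor.fetchall()
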
def findDatasetProperties(dataset, dbsInst, debug=0):
    # test whether this is a legitimate dataset by asking DAS and determine size and number of files
    if "=" in dataset:
        # find config, version and original dataset name
        f = dataset.split("=")
        conf = (f[0])[1:]
        vers = f[1]
        dset = f[2].replace("/", "+")
        sizeGb = 10      # does not matter
        nFiles = 0
        lfns = {}
        cmd = 'cat %s/%s/%s/%s/Filesets' % (CATALOG_INPUT, conf, vers, dset)
        myRex = rex.Rex()
        (rc, out, err) = myRex.executeLocalAction(cmd)
        for line in out.split("\n"):
            line = ' '.join(line.split())
            f = line.split(" ")
            if len(f) > 1:
                nFiles += 1
                id = f[0]
                path = re.sub(r'root://.*/(/store/.*)', r'\1', f[1])
                lfn = "%s/%s.root" % (path, id)
                nEvents = int(f[2])
                fId = fileIds.fileId(id + ".root", nEvents)
                lfn = fileIds.lfn(fId, id, path)
                lfns[fId.getName()] = lfn
                if debug > 1:
                    print " Adding: %s, %s" % (id, lfn)
        return (sizeGb, nFiles, lfns)

    ## dealing with a standard dataset first test
    #if not isDatasetValid(dataset,dbsInst,debug):
    #    return (-1,-1,-1)

    proxy = getProxy()
    url = 'curl -s --cert %s -k -H "Accept: application/json"'%proxy \
        + ' "https://cmsweb.cern.ch/dbs/prod/global/DBSReader/' \
        + 'files?dataset=%s&detail=true"'%(dataset)
    if debug > 1:
        print ' CURL: ' + url
    myRex = rex.Rex()
    (rc, out, err) = myRex.executeLocalAction(url)
    if rc != 0:
        print ' ERROR occurred in %s' % (url)
        sys.exit(1)

    data = json.loads(out)
    units = 'GB'
    nFiles = 0
    totalSize = 0
    lfns = {}
    blocks = []
    for entry in data:
        valid = int(entry["is_file_valid"])
        fileName = entry["logical_file_name"]
        path = "/".join(fileName.split("/")[:-1])
        size = int(entry["file_size"])
        block = entry["block_name"].split("#")[1]
        nEvents = int(entry["event_count"])
        if valid == 1:
            nFiles += 1
            totalSize += size
            #print '%s: %d %d %f'%(fileName,nFiles,nEvents,totalSize/1000./1000./1000.)
            fId = fileIds.fileId(fileName, nEvents)
            lfn = fileIds.lfn(fId, block, path)
            lfns[fId.getName()] = lfn

    try:
        sizeGb = convertSizeToGb(str(totalSize))
    except:
        print '\n Error - could not convert size and number of files (%s %s / %s).'\
            %(totalSize,units,nFiles)
        sys.exit(1)

    if debug > 1:
        for lfn in lfns:
            lfns[lfn].show()
    print '\n DBS - %s --> %.1f %s (nFiles: %d)\n' % (dataset, sizeGb, units, nFiles)

    return (sizeGb, nFiles, lfns)

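# Minimal sketch of the path extraction used above: re.sub strips the xrootd
# redirector prefix (root://host/...) and keeps only the /store/... part that makes
# up the logical file name.  The helper name and the example URL are made up for
# illustration; findDatasetProperties applies the same substitution inline.
def extract_store_path(url):
    import re
    return re.sub(r'root://.*/(/store/.*)', r'\1', url)
# extract_store_path('root://xrootd.example.org//store/user/data/file')
#   -> '/store/user/data/file'
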
def findDatasetProperties(dataset, dbsInst, debug=0):
    # test whether this is a legitimate dataset by asking DAS and determine size and number of files
    if "=" in dataset:
        # this is a dataset produced with Kraken
        # find config, version and original dataset name
        f = dataset.split("=")
        conf = (f[0])[1:]
        vers = f[1]
        dset = f[2].replace("/", "+")
        sizeGb = 10      # does not matter
        nFiles = 0
        lfns = {}
        cmd = 'cat %s/%s/%s/%s/Filesets' % (CATALOG_INPUT, conf, vers, dset)
        myRex = rex.Rex()
        (rc, out, err) = myRex.executeLocalAction(cmd)
        for line in out.split("\n"):
            line = ' '.join(line.split())
            f = line.split(" ")
            if len(f) > 1:
                nFiles += 1
                id = f[0]
                path = re.sub(r'root://.*/(/store/.*)', r'\1', f[1])
                lfn = "%s/%s.root" % (path, id)
                nEvents = int(f[2])
                fId = fileIds.fileId(id + ".root", nEvents)
                lfn = fileIds.lfn(fId, id, path)
                lfns[fId.getName()] = lfn
                if debug > -1:
                    print(" Adding: %s, %s" % (id, fId.getName()))
        return (sizeGb, nFiles, lfns)

    # dealing with a private dataset
    if dbsInst == 'private':
        print(" Private dataset detected.")
        sizeGb = 10      # does not matter
        nFiles = 0
        lfns = {}
        f = dataset.split("/")
        trunc = f[1]
        conf = f[2]
        vers = f[3]
        dset = f[4]
        cmd = 'cat %s/%s/%s/%s/%s/RawFiles.00' % (CATALOG_INPUT, trunc, conf, vers, dset)
        print(" CMD: %s" % cmd)
        myRex = rex.Rex()
        (rc, out, err) = myRex.executeLocalAction(cmd)
        for line in out.split("\n"):
            #print(" LINE - >%s<"%(line))
            line = ' '.join(line.split())
            f = line.split(" ")
            if len(f) > 1:
                nFiles += 1
                id = (f[0].split('/')[-1]).replace('.root', '')
                block = id[0:20]
                path = "/".join(f[0].split('/')[0:-1])
                path = re.sub(r'root://.*/(/store/.*)', r'\1', path)
                lfn = "%s/%s.root" % (path, id)
                #print(" ID: %s\nPATH %s\nLFN: %s"%(id,path,lfn))
                nEvents = int(f[2])
                fId = fileIds.fileId(id + ".root", nEvents)
                lfn = fileIds.lfn(fId, block, path)
                #lfn.show()
                lfns[fId.getName()] = lfn
                if debug > -1:
                    print(" Adding: %s, %s" % (id, path))
            else:
                pass
                #print(" LINE invalid")
        return (sizeGb, nFiles, lfns)

    # dealing with a standard dataset first test
    if not isDatasetValid(dataset, dbsInst, debug):
        print(' WARNING - dataset was not found to be valid.')
        print(' - continue and see whether it is in production.')
        print(' - to get all data this call has to be repeated')
        print(' - once the dataset is completed.')
        #return (-1,-1,-1)
    else:
        print(' INFO - dataset is valid.')

    proxy = getProxy()
    url = 'curl -s --cert %s -k -H "Accept: application/json"'%proxy \
        + ' "https://cmsweb.cern.ch/dbs/prod/global/DBSReader/' \
        + 'files?dataset=%s&detail=true"'%(dataset)
    if debug > 1:
        print(' CURL: ' + url)
    myRex = rex.Rex()
    (rc, out, err) = myRex.executeLocalAction(url)
    if rc != 0:
        print(' ERROR occurred in %s' % (url))
        sys.exit(1)

    data = json.loads(out)
    units = 'GB'
    nFiles = 0
    totalSize = 0
    lfns = {}
    blocks = []
    for entry in data:
        valid = int(entry["is_file_valid"])
        fileName = entry["logical_file_name"]
        path = "/".join(fileName.split("/")[:-1])
        size = int(entry["file_size"])
        block = entry["block_name"].split("#")[1]
        nEvents = int(entry["event_count"])
        if valid == 1:
            nFiles += 1
            totalSize += size
            #print('%s: %d %d %f'%(fileName,nFiles,nEvents,totalSize/1000./1000./1000.))
            fId = fileIds.fileId(fileName, nEvents)
            lfn = fileIds.lfn(fId, block, path)
            lfns[fId.getName()] = lfn

    try:
        sizeGb = convertSizeToGb(str(totalSize))
    except:
        print('\n Error - could not convert size and number of files (%s %s / %s).'\
            %(totalSize, units, nFiles))
        sys.exit(1)

    if debug > 1:
        for lfn in lfns:
            lfns[lfn].show()
    print('\n DBS - %s --> %.1f %s (nFiles: %d)\n' % (dataset, sizeGb, units, nFiles))

    return (sizeGb, nFiles, lfns)

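# Self-contained sketch of how the DBSReader 'files' JSON is reduced above: each
# entry carries 'is_file_valid', 'logical_file_name', 'file_size' and 'event_count',
# and only valid files contribute to the file count and total size.  The helper name
# and the sample entries below are made up for illustration.
def summarize_dbs_files(entries):
    nFiles, totalSize = 0, 0
    for entry in entries:
        if int(entry["is_file_valid"]) == 1:
            nFiles += 1
            totalSize += int(entry["file_size"])
    return (nFiles, totalSize)
# summarize_dbs_files([
#     {"is_file_valid": 1, "logical_file_name": "/store/a.root", "file_size": 1000, "event_count": 10},
#     {"is_file_valid": 0, "logical_file_name": "/store/b.root", "file_size": 2000, "event_count": 20},
# ]) -> (1, 1000)
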