def split_by_lumi(config, dataset_info, task_list):
    """Split a dataset into tasks of a fixed number of luminosity sections.

    Files are consumed in the order of ``dataset_info.files``; their lumis are
    queued and cut into slices of ``config['lumis per task']`` entries.  Each
    task is recorded as ``(input files, lumi ranges)`` and the complete list
    is written to ``task_list`` as JSON.  The lumis left over once all files
    have been consumed form one final, smaller task.

    Args:
        config: mapping holding 'lumis per task' and, optionally, 'lumi mask'
            (a file name handed to ``LumiList(filename=...)``).
        dataset_info: object exposing ``files``, ``lumis`` (indexed by file)
            and ``total_lumis``.  When a lumi mask is configured, ``lumis``
            is filtered in place and ``total_lumis`` is recomputed.
        task_list: path of the JSON file to write.

    Returns:
        The number of tasks written.

    Raises:
        ValueError: if 'lumis per task' is not a positive number (the slicing
            loop below could never terminate).
    """
    if 'lumi mask' in config:
        lumi_mask = LumiList(filename=config['lumi mask'])
        dataset_info.total_lumis = 0
        for name in dataset_info.files:
            dataset_info.lumis[name] = lumi_mask.filterLumis(dataset_info.lumis[name])
            dataset_info.total_lumis += len(dataset_info.lumis[name])

    lumis_per_task = config['lumis per task']
    if lumis_per_task <= 0:
        raise ValueError("'lumis per task' must be positive, got %r" % (lumis_per_task,))

    tasks = []
    lumis_processed = 0
    files = iter(dataset_info.files)

    try:
        first = next(files)
    except StopIteration:
        # Empty dataset: nothing to split, fall through and write an empty list.
        input_files_this_task = []
        task_lumis_remaining = []
    else:
        input_files_this_task = [first]
        # Copy, so that extending the queue never alters dataset_info.lumis.
        task_lumis_remaining = list(dataset_info.lumis[first])

        while lumis_processed < dataset_info.total_lumis:
            # Keep only the files that still contribute lumis to the queue.
            # A new list is built instead of removing while iterating.
            pending = set(task_lumis_remaining)
            input_files_this_task = [
                name for name in input_files_this_task
                if pending.intersection(dataset_info.lumis[name])
            ]

            while lumis_per_task <= len(task_lumis_remaining):
                task_lumis = LumiList(lumis=task_lumis_remaining[:lumis_per_task])
                task_lumis_remaining = task_lumis_remaining[lumis_per_task:]
                # Snapshot the file list: it keeps changing for later tasks.
                tasks.append((list(input_files_this_task),
                              task_lumis.getVLuminosityBlockRange()))
                lumis_processed += lumis_per_task

            try:
                name = next(files)
            except StopIteration:
                if not task_lumis_remaining:
                    # Nothing queued and nothing left to read: stop even if
                    # total_lumis disagrees with what the files provided.
                    break
                # Out of files: emit whatever is left as one last task.
                lumis_per_task = len(task_lumis_remaining)
            else:
                input_files_this_task.append(name)
                task_lumis_remaining.extend(dataset_info.lumis[name])

    with open(task_list, 'w') as json_file:
        json.dump(tasks, json_file)

    return len(tasks)
def fetchDBSInfo(self):
    """
    Contact DBS (DBS3 only) and cache the bookkeeping of self.datasetPath.

    Reads the CMSSW.* options from self.cfg_params, queries DBS3 through
    self.queryDbs3() and fills self.files, self.parent, self.lumis,
    self.eventsPerBlock, self.eventsPerFile, self.blocksinfo,
    self.maxEvents and self.maxLumis.

    Raises CrabException when:
      - split_by_run is requested without a usable runselection,
      - the unsupported CMSSW.fileblocks_file option is set,
      - a 'data' dataset is neither split by run, by lumi nor by event,
      - no (new) file block is found for the dataset.
    """
    # DBS2 is gone: verify_dbs_url() still resolves the user settings, but
    # only the DBS3 end-point it returns is used.
    (_useDBS2, _useDBS3, _dbs2_url, dbs3_url) = verify_dbs_url(self)
    dbs_url = dbs3_url
    common.logger.info("Accessing DBS at: %s" % dbs_url)

    ## check if runs are selected
    runselection = []
    if 'CMSSW.runselection' in self.cfg_params:
        runselection = parseRange2(self.cfg_params['CMSSW.runselection'])
        if len(runselection) > 1000000:
            common.logger.info("ERROR: runselection range has more then 1M numbers")
            common.logger.info("ERROR: Too large. runselection is ignored")
            runselection = []

    ## check if various lumi parameters are set
    self.lumiMask = self.cfg_params.get('CMSSW.lumi_mask', None)
    self.lumiParams = (self.cfg_params.get('CMSSW.total_number_of_lumis', None) or
                       self.cfg_params.get('CMSSW.lumis_per_job', None))

    lumiList = None
    if self.lumiMask:
        lumiList = LumiList(filename=self.lumiMask)
    if runselection:
        # runList is only bound here; it is only read under the same condition
        runList = LumiList(runs=runselection)

    self.splitByRun = int(self.cfg_params.get('CMSSW.split_by_run', 0))
    self.splitDataByEvent = int(self.cfg_params.get('CMSSW.split_by_event', 0))
    common.logger.log(10-1, "runselection is: %s" % runselection)

    if not self.splitByRun:
        self.splitByLumi = self.lumiMask or self.lumiParams or self.ads

    if self.splitByRun and not runselection:
        msg = "Error: split_by_run must be combined with a runselection"
        raise CrabException(msg)

    ## check if has been requested to use the parent info
    useparent = int(self.cfg_params.get('CMSSW.use_parent', 0))

    ## the list of already analyzed blocks always lives in the share dir;
    ## a user supplied file is deliberately rejected
    defaultName = common.work_space.shareDir() + 'AnalyzedBlocks.txt'
    if self.cfg_params.get('CMSSW.fileblocks_file'):
        msg = "CMSSW.fileblocks_file option non supported"
        raise CrabException(msg)
    fileBlocks_FileName = os.path.abspath(defaultName)

    ## service API
    from dbs.apis.dbsClient import DbsApi
    start_time = time.time()
    api3 = DbsApi(dbs3_url)
    self.files = self.queryDbs3(api3, path=self.datasetPath,
                                runselection=runselection, useParent=useparent)
    elapsed = time.time() - start_time
    common.logger.info("DBS3 lookup took %5.2f sec" % elapsed)

    # Check to see what the dataset is
    dataType = api3.listDataTypes(dataset=self.datasetPath)[0]['data_type']
    common.logger.info("Datatype is %s" % dataType)
    if dataType == 'data' and not \
       (self.splitByRun or self.splitByLumi or self.splitDataByEvent):
        msg = 'Data must be split by lumi or by run. ' \
              'Please see crab -help for the correct settings'
        raise CrabException(msg)

    anFileBlocks = []
    if self.skipBlocks:
        anFileBlocks = readTXTfile(self, fileBlocks_FileName)

    # parse files and fill arrays
    for fileInfo in self.files:
        parList = []
        fileLumis = []  # List of tuples
        # skip already analyzed blocks
        fileblock = fileInfo['Block']['Name']
        if fileblock not in anFileBlocks:
            filename = fileInfo['LogicalFileName']
            # asked retry the list of parent for the given child
            if useparent == 1:
                parList = [x['LogicalFileName'] for x in fileInfo['ParentList']]
            if self.splitByLumi:
                fileLumis = [(x['RunNumber'], x['LumiSectionNumber'])
                             for x in fileInfo['LumiList']]
            self.parent[filename] = parList
            # For LumiMask, intersection of two lists.
            if self.lumiMask and runselection:
                self.lumis[filename] = runList.filterLumis(lumiList.filterLumis(fileLumis))
            elif runselection:
                self.lumis[filename] = runList.filterLumis(fileLumis)
            elif self.lumiMask:
                self.lumis[filename] = lumiList.filterLumis(fileLumis)
            else:
                self.lumis[filename] = fileLumis
            # files whose name contains '.dat' are not counted at all
            if filename.find('.dat') < 0:
                events = fileInfo['NumberOfEvents']
                # Count number of events and lumis per block
                if fileblock in self.eventsPerBlock:
                    self.eventsPerBlock[fileblock] += events
                else:
                    self.eventsPerBlock[fileblock] = events
                # Number of events per file
                self.eventsPerFile[filename] = events

                # List of files per block
                if fileblock in self.blocksinfo:
                    self.blocksinfo[fileblock].append(filename)
                else:
                    self.blocksinfo[fileblock] = [filename]

                # total number of events
                self.maxEvents += events
                self.maxLumis += len(self.lumis[filename])

    if self.skipBlocks and len(self.eventsPerBlock) == 0:
        msg = "No new fileblocks available for dataset: " + str(self.datasetPath)
        raise CrabException(msg)

    if len(self.eventsPerBlock) <= 0:
        msg = "No data for %s in DBS\n Check datasetpath parameter in crab.cfg" % self.datasetPath
        raise CrabException(msg)
def fetchDBSInfo(self):
    """
    Contact DBS and cache the bookkeeping of self.datasetPath.

    The end-point is CMSSW.dbs_url (default: the global DBS servlet).  The
    files returned by self.queryDbs() are stored in self.files and digested
    into self.parent, self.lumis, self.eventsPerBlock, self.eventsPerFile,
    self.blocksinfo, self.maxEvents and self.maxLumis.

    Raises CrabException for an inconsistent splitting configuration or
    when no new file block is left, and NotExistingDatasetError when DBS
    returned nothing usable for the dataset.
    """
    params = self.cfg_params

    ## get DBS URL
    global_url = "http://cmsdbsprod.cern.ch/cms_dbs_prod_global/servlet/DBSServlet"
    dbs_url = params.get('CMSSW.dbs_url', global_url)
    common.logger.info("Accessing DBS at: " + dbs_url)

    ## check if runs are selected
    runselection = []
    if params.has_key('CMSSW.runselection'):
        runselection = parseRange2(params['CMSSW.runselection'])
    if len(runselection) > 1000000:
        # an oversized selection is dropped rather than expanded
        common.logger.info("ERROR: runselection range has more then 1M numbers")
        common.logger.info("ERROR: Too large. runselection is ignored")
        runselection = []

    ## check if various lumi parameters are set
    self.lumiMask = params.get('CMSSW.lumi_mask', None)
    totalLumis = params.get('CMSSW.total_number_of_lumis', None)
    self.lumiParams = totalLumis or params.get('CMSSW.lumis_per_job', None)

    lumiList = None
    if self.lumiMask:
        lumiList = LumiList(filename=self.lumiMask)
    if runselection:
        runList = LumiList(runs=runselection)

    self.splitByRun = int(params.get('CMSSW.split_by_run', 0))
    self.splitDataByEvent = int(params.get('CMSSW.split_by_event', 0))
    common.logger.log(10 - 1, "runselection is: %s" % runselection)

    if not self.splitByRun:
        self.splitByLumi = self.lumiMask or self.lumiParams or self.ads
    elif not runselection:
        # splitByRun is set here, so this is "split by run without runs"
        raise CrabException("Error: split_by_run must be combined with a runselection")

    ## service API
    args = {'url': dbs_url, 'level': 'CRITICAL'}

    ## check if has been requested to use the parent info
    useparent = int(params.get('CMSSW.use_parent', 0))

    ## check if has been asked for a non default file to store/read analyzed fileBlocks
    defaultName = common.work_space.shareDir() + 'AnalyzedBlocks.txt'
    fileBlocks_FileName = os.path.abspath(
        params.get('CMSSW.fileblocks_file', defaultName))

    api = DBSAPI.dbsApi.DbsApi(args)
    self.files = self.queryDbs(api, path=self.datasetPath,
                               runselection=runselection, useParent=useparent)

    # Check to see what the dataset is
    pdsName = self.datasetPath.split("/")[1]
    primDSs = api.listPrimaryDatasets(pdsName)
    dataType = primDSs[0]['Type']
    common.logger.debug("Datatype is %s" % dataType)
    if dataType == 'data':
        if not (self.splitByRun or self.splitByLumi or self.splitDataByEvent):
            raise CrabException('Data must be split by lumi or by run. '
                                'Please see crab -help for the correct settings')

    anFileBlocks = []
    if self.skipBlocks:
        anFileBlocks = readTXTfile(self, fileBlocks_FileName)

    # parse files and fill arrays
    for file in self.files:
        # skip already analyzed blocks
        fileblock = file['Block']['Name']
        if fileblock in anFileBlocks:
            continue

        filename = file['LogicalFileName']

        # asked retry the list of parent for the given child
        parList = []
        if useparent == 1:
            parList = [parent['LogicalFileName'] for parent in file['ParentList']]

        fileLumis = []  # List of tuples
        if self.splitByLumi:
            fileLumis = [(entry['RunNumber'], entry['LumiSectionNumber'])
                         for entry in file['LumiList']]
        self.parent[filename] = parList

        # For LumiMask, intersection of two lists: mask first, then runs.
        selected = fileLumis
        if self.lumiMask:
            selected = lumiList.filterLumis(selected)
        if runselection:
            selected = runList.filterLumis(selected)
        self.lumis[filename] = selected

        # files whose name contains '.dat' are left out of the counters
        if filename.find('.dat') >= 0:
            continue

        events = file['NumberOfEvents']
        # Count number of events and lumis per block
        if fileblock in self.eventsPerBlock:
            self.eventsPerBlock[fileblock] += events
        else:
            self.eventsPerBlock[fileblock] = events
        # Number of events per file
        self.eventsPerFile[filename] = events

        # List of files per block
        if fileblock in self.blocksinfo:
            self.blocksinfo[fileblock].append(filename)
        else:
            self.blocksinfo[fileblock] = [filename]

        # total number of events
        self.maxEvents += events
        self.maxLumis += len(self.lumis[filename])

    if len(self.eventsPerBlock) == 0:
        if self.skipBlocks:
            raise CrabException(
                "No new fileblocks available for dataset: " + str(self.datasetPath))
        raise NotExistingDatasetError(
            ("\nNo data for %s in DBS\nPlease check" +
             " dataset path variables in crab.cfg") % self.datasetPath)
def fetchDBSInfo(self):
    """
    Contact DBS: look up the files of self.datasetPath and index them.

    Uses the end-point given by CMSSW.dbs_url (global DBS servlet when
    unset).  On return self.files holds the raw DBS answer, while
    self.parent, self.lumis, self.eventsPerBlock, self.eventsPerFile,
    self.blocksinfo, self.maxEvents and self.maxLumis hold the per-file and
    per-block summaries.

    Raises CrabException (bad splitting setup, no new file blocks) or
    NotExistingDatasetError (nothing found for the dataset).
    """
    ## get DBS URL
    global_url = "http://cmsdbsprod.cern.ch/cms_dbs_prod_global/servlet/DBSServlet"
    dbs_url = self.cfg_params.get('CMSSW.dbs_url', global_url)
    common.logger.info("Accessing DBS at: " + dbs_url)

    ## check if runs are selected
    if self.cfg_params.has_key('CMSSW.runselection'):
        runselection = parseRange2(self.cfg_params['CMSSW.runselection'])
    else:
        runselection = []

    ## check if various lumi parameters are set
    self.lumiMask = self.cfg_params.get('CMSSW.lumi_mask', None)
    self.lumiParams = (self.cfg_params.get('CMSSW.total_number_of_lumis', None)
                       or self.cfg_params.get('CMSSW.lumis_per_job', None))

    lumiList = None
    if self.lumiMask:
        lumiList = LumiList(filename=self.lumiMask)
    if runselection:
        runList = LumiList(runs=runselection)

    self.splitByRun = int(self.cfg_params.get('CMSSW.split_by_run', 0))
    self.splitDataByEvent = int(self.cfg_params.get('CMSSW.split_by_event', 0))
    common.logger.log(10-1, "runselection is: %s" % runselection)

    if not self.splitByRun:
        self.splitByLumi = self.lumiMask or self.lumiParams or self.ads

    if self.splitByRun and not runselection:
        raise CrabException("Error: split_by_run must be combined with a runselection")

    ## service API
    args = dict(url=dbs_url, level='CRITICAL')

    ## check if has been requested to use the parent info
    useparent = int(self.cfg_params.get('CMSSW.use_parent', 0))

    ## check if has been asked for a non default file to store/read analyzed fileBlocks
    defaultName = common.work_space.shareDir() + 'AnalyzedBlocks.txt'
    requestedName = self.cfg_params.get('CMSSW.fileblocks_file', defaultName)
    fileBlocks_FileName = os.path.abspath(requestedName)

    api = DBSAPI.dbsApi.DbsApi(args)
    self.files = self.queryDbs(api, path=self.datasetPath,
                               runselection=runselection, useParent=useparent)

    # Check to see what the dataset is
    pdsName = self.datasetPath.split("/")[1]
    dataType = api.listPrimaryDatasets(pdsName)[0]['Type']
    common.logger.debug("Datatype is %s" % dataType)
    anySplitting = self.splitByRun or self.splitByLumi or self.splitDataByEvent
    if dataType == 'data' and not anySplitting:
        msg = 'Data must be split by lumi or by run. ' \
              'Please see crab -help for the correct settings'
        raise CrabException(msg)

    if self.skipBlocks:
        anFileBlocks = readTXTfile(self, fileBlocks_FileName)
    else:
        anFileBlocks = []

    # parse files and fill arrays
    for file in self.files:
        fileblock = file['Block']['Name']
        # skip already analyzed blocks
        if fileblock not in anFileBlocks:
            filename = file['LogicalFileName']

            # asked retry the list of parent for the given child
            if useparent == 1:
                parList = [p['LogicalFileName'] for p in file['ParentList']]
            else:
                parList = []

            # List of (run, lumi) tuples, only collected when splitting by lumi
            if self.splitByLumi:
                fileLumis = [(l['RunNumber'], l['LumiSectionNumber'])
                             for l in file['LumiList']]
            else:
                fileLumis = []

            self.parent[filename] = parList

            # For LumiMask, intersection of two lists.
            keptLumis = fileLumis
            if self.lumiMask:
                keptLumis = lumiList.filterLumis(keptLumis)
            if runselection:
                keptLumis = runList.filterLumis(keptLumis)
            self.lumis[filename] = keptLumis

            # names containing '.dat' are excluded from every counter below
            if filename.find('.dat') < 0:
                events = file['NumberOfEvents']

                # Count number of events and lumis per block
                if fileblock in self.eventsPerBlock:
                    self.eventsPerBlock[fileblock] += events
                else:
                    self.eventsPerBlock[fileblock] = events

                # Number of events per file
                self.eventsPerFile[filename] = events

                # List of files per block
                if fileblock in self.blocksinfo:
                    self.blocksinfo[fileblock].append(filename)
                else:
                    self.blocksinfo[fileblock] = [filename]

                # total number of events
                self.maxEvents += events
                self.maxLumis += len(self.lumis[filename])

    nothingFound = len(self.eventsPerBlock) == 0

    if self.skipBlocks and nothingFound:
        msg = "No new fileblocks available for dataset: " + str(self.datasetPath)
        raise CrabException(msg)

    if nothingFound:
        template = "\nNo data for %s in DBS\nPlease check" + \
                   " dataset path variables in crab.cfg"
        raise NotExistingDatasetError(template % self.datasetPath)
def fetchDBSInfo(self):
    """
    Contact DBS (DBS2 and/or DBS3) and cache the bookkeeping of self.datasetPath.

    The end-point comes from CMSSW.dbs_url (a full URL or one of the short
    instance names handled below); CMSSW.use_dbs3, CMSSW.verify_dbs23 and
    CMSSW.use_das select which service(s) are queried.  The file list ends
    up in self.files and is digested into self.parent, self.lumis,
    self.eventsPerBlock, self.eventsPerFile, self.blocksinfo,
    self.maxEvents and self.maxLumis.

    Raises CrabException when:
      - split_by_run is requested without a usable runselection,
      - the unsupported CMSSW.fileblocks_file option is set,
      - a 'data' dataset is neither split by run, by lumi nor by event,
      - no (new) file block is found for the dataset.
    """
    # make assumption that same host won't be used for both
    # this check should catch most deployed servers
    DBS2HOST = 'cmsdbsprod.cern.ch'
    DBS3HOST = 'cmsweb.cern.ch'
    useDBS2 = False
    useDBS3 = False
    verifyDBS23 = False
    useDAS = False

    # known DBS end-points
    global_dbs2 = "http://cmsdbsprod.cern.ch/cms_dbs_prod_global/servlet/DBSServlet"
    global_dbs3 = "https://cmsweb.cern.ch/dbs/prod/global/DBSReader"
    caf_dbs2_01 = "http://cmsdbsprod.cern.ch/cms_dbs_caf_analysis_01/servlet/DBSServlet"
    local_dbs2_01 = "http://cmsdbsprod.cern.ch/cms_dbs_ph_analysis_01/servlet/DBSServlet"
    local_dbs2_02 = "http://cmsdbsprod.cern.ch/cms_dbs_ph_analysis_02/servlet/DBSServlet"
    caf_dbs3_01 = "https://cmsweb.cern.ch/dbs/prod/caf01/DBSReader"
    local_dbs3_01 = "https://cmsweb.cern.ch/dbs/prod/phys01/DBSReader"
    local_dbs3_02 = "https://cmsweb.cern.ch/dbs/prod/phys02/DBSReader"
    local_dbs3_03 = "https://cmsweb.cern.ch/dbs/prod/phys03/DBSReader"
    # every instance listed exactly once (phys02 used to be missing)
    known_dbs_urls = [
        global_dbs2, caf_dbs2_01, local_dbs2_01, local_dbs2_02,
        global_dbs3, caf_dbs3_01, local_dbs3_01, local_dbs3_02, local_dbs3_03,
    ]

    ## correspondence maps of DBS2/3 instances
    # Only genuine DBS2 -> DBS3 pairs are stored, so that the reverse map is
    # unambiguous; phys03 has no DBS2 counterpart and appears in neither.
    dbs2to3 = {
        global_dbs2: global_dbs3,
        caf_dbs2_01: caf_dbs3_01,
        local_dbs2_01: local_dbs3_01,
        local_dbs2_02: local_dbs3_02,
    }
    # reverse map:
    dbs3to2 = dict((url3, url2) for (url2, url3) in dbs2to3.items())

    ## get DBS URL specified by user (default to global DBS2)
    dbs_url = self.cfg_params.get('CMSSW.dbs_url', global_dbs2)

    if self.cfg_params.get('CMSSW.use_dbs3'):
        useDBS3 = int(self.cfg_params.get('CMSSW.use_dbs3')) == 1

    if self.cfg_params.get('CMSSW.verify_dbs23'):
        verifyDBS23 = int(self.cfg_params.get('CMSSW.verify_dbs23')) == 1

    # NOTE(review): this test runs before the shortcuts are expanded, so a
    # shortcut name always disables the verification -- confirm this is wanted.
    if verifyDBS23 and dbs_url not in known_dbs_urls:
        common.logger.info("automatic verification DBS2/3 not possible for non standard dbs_url=%s" % dbs_url)
        verifyDBS23 = False

    # support shortcuts for local scope DBS's
    shortcuts = {
        "dbs2_caf_01": caf_dbs2_01,
        "analysis_01": local_dbs2_01,
        "analysis_02": local_dbs2_02,
        "caf01": caf_dbs3_01,
        "phys01": local_dbs3_01,
        "phys02": local_dbs3_02,
        "phys03": local_dbs3_03,
    }
    dbs_url = shortcuts.get(dbs_url, dbs_url)

    # if user asked for DBS3, remap DBS url if needed
    # and possible, i.e. using a known URL
    # (a URL that already is a DBS3 one is left untouched)
    if useDBS3 and dbs_url in dbs2to3:
        dbs_url = dbs2to3[dbs_url]

    common.logger.info("Accessing DBS at: " + dbs_url)

    endpoint_components = urlparse.urlsplit(dbs_url)

    # None means "no such end-point known for this instance"
    dbs_url_2 = None
    dbs_url_3 = None
    if endpoint_components.hostname == DBS3HOST or useDBS3:
        useDBS3 = True
        dbs_url_3 = dbs_url
        dbs_url_2 = dbs3to2.get(dbs_url)
    elif endpoint_components.hostname == DBS2HOST:
        useDBS2 = True
        dbs_url_2 = dbs_url
        dbs_url_3 = dbs2to3.get(dbs_url)
    else:
        # if we do not know this URL, better be a DBS3 test instance
        useDBS3 = True
        dbs_url_3 = dbs_url

    if useDBS2 and useDBS3:
        msg = "trying to use DBS2 and DBS3 at same time ?"
        raise CrabException(msg)

    if verifyDBS23 and (dbs_url_2 is None or dbs_url_3 is None):
        # the cross-check needs both end-points of the same instance
        common.logger.info("automatic verification DBS2/3 not possible: no DBS2/DBS3 counterpart for dbs_url=%s" % dbs_url)
        verifyDBS23 = False

    if self.cfg_params.get('CMSSW.use_das'):
        useDAS = int(self.cfg_params.get('CMSSW.use_das')) == 1

    if useDBS2:
        common.logger.info("Will do Data Discovery using DBS2")
    if useDBS3:
        common.logger.info("Will do Data Discovery using DBS3")
    if useDAS:
        common.logger.info("will use DAS to talk to DBS")
    if verifyDBS23:
        common.logger.info("Will verify that DBS2 and DBS3 return same information")

    ## check if runs are selected
    runselection = []
    if 'CMSSW.runselection' in self.cfg_params:
        runselection = parseRange2(self.cfg_params['CMSSW.runselection'])
        if len(runselection) > 1000000:
            common.logger.info("ERROR: runselection range has more then 1M numbers")
            common.logger.info("ERROR: Too large. runselection is ignored")
            runselection = []

    ## check if various lumi parameters are set
    self.lumiMask = self.cfg_params.get('CMSSW.lumi_mask', None)
    self.lumiParams = (self.cfg_params.get('CMSSW.total_number_of_lumis', None) or
                       self.cfg_params.get('CMSSW.lumis_per_job', None))

    lumiList = None
    if self.lumiMask:
        lumiList = LumiList(filename=self.lumiMask)
    if runselection:
        # runList is only bound here; it is only read under the same condition
        runList = LumiList(runs=runselection)

    self.splitByRun = int(self.cfg_params.get('CMSSW.split_by_run', 0))
    self.splitDataByEvent = int(self.cfg_params.get('CMSSW.split_by_event', 0))
    common.logger.log(10-1, "runselection is: %s" % runselection)

    if not self.splitByRun:
        self.splitByLumi = self.lumiMask or self.lumiParams or self.ads

    if self.splitByRun and not runselection:
        msg = "Error: split_by_run must be combined with a runselection"
        raise CrabException(msg)

    ## check if has been requested to use the parent info
    useparent = int(self.cfg_params.get('CMSSW.use_parent', 0))

    ## the list of already analyzed blocks always lives in the share dir;
    ## a user supplied file is deliberately rejected
    defaultName = common.work_space.shareDir() + 'AnalyzedBlocks.txt'
    if self.cfg_params.get('CMSSW.fileblocks_file'):
        msg = "CMSSW.fileblocks_file option non supported"
        raise CrabException(msg)
    fileBlocks_FileName = os.path.abspath(defaultName)

    if useDBS2 or verifyDBS23:
        import DBSAPI.dbsApi
        import DBSAPI.dbsApiException
        ## service API
        args = {'url': dbs_url_2, 'level': 'CRITICAL'}
        start_time = time.time()
        api2 = DBSAPI.dbsApi.DbsApi(args)
        files2 = self.queryDbs(api2, path=self.datasetPath,
                               runselection=runselection, useParent=useparent)
        elapsed = time.time() - start_time
        common.logger.info("DBS2 lookup took %5.2f sec" % elapsed)
        if useDBS2:
            self.files = files2

    if useDBS3 or verifyDBS23:
        from dbs.apis.dbsClient import DbsApi
        start_time = time.time()
        api3 = DbsApi(dbs_url_3)
        files3 = self.queryDbs3(api3, path=self.datasetPath,
                                runselection=runselection, useParent=useparent)
        elapsed = time.time() - start_time
        common.logger.info("DBS3 lookup took %5.2f sec" % elapsed)
        if useDBS3:
            self.files = files3

    if useDAS:
        # the DAS answer replaces whatever the direct DBS lookup returned
        self.files = self.queryDas(path=self.datasetPath,
                                   runselection=runselection, useParent=useparent)

    if verifyDBS23:
        if not self.compareFilesStructure(files2, files3):
            common.logger.info("ERROR: DBS2 - DB3 comparsion failed, please run crab -uploadLog and report to crabFeedback")

    # Check to see what the dataset is
    pdsName = self.datasetPath.split("/")[1]
    if useDBS2:
        primDSs = api2.listPrimaryDatasets(pdsName)
        dataType = primDSs[0]['Type']
    elif useDBS3:
        dataType = api3.listDataTypes(dataset=self.datasetPath)[0]['data_type']

    common.logger.info("Datatype is %s" % dataType)
    if dataType == 'data' and not \
       (self.splitByRun or self.splitByLumi or self.splitDataByEvent):
        msg = 'Data must be split by lumi or by run. ' \
              'Please see crab -help for the correct settings'
        raise CrabException(msg)

    anFileBlocks = []
    if self.skipBlocks:
        anFileBlocks = readTXTfile(self, fileBlocks_FileName)

    # parse files and fill arrays
    for fileInfo in self.files:
        parList = []
        fileLumis = []  # List of tuples
        # skip already analyzed blocks
        fileblock = fileInfo['Block']['Name']
        if fileblock not in anFileBlocks:
            filename = fileInfo['LogicalFileName']
            # asked retry the list of parent for the given child
            if useparent == 1:
                parList = [x['LogicalFileName'] for x in fileInfo['ParentList']]
            if self.splitByLumi:
                fileLumis = [(x['RunNumber'], x['LumiSectionNumber'])
                             for x in fileInfo['LumiList']]
            self.parent[filename] = parList
            # For LumiMask, intersection of two lists.
            if self.lumiMask and runselection:
                self.lumis[filename] = runList.filterLumis(lumiList.filterLumis(fileLumis))
            elif runselection:
                self.lumis[filename] = runList.filterLumis(fileLumis)
            elif self.lumiMask:
                self.lumis[filename] = lumiList.filterLumis(fileLumis)
            else:
                self.lumis[filename] = fileLumis
            # files whose name contains '.dat' are not counted at all
            if filename.find('.dat') < 0:
                events = fileInfo['NumberOfEvents']
                # Count number of events and lumis per block
                if fileblock in self.eventsPerBlock:
                    self.eventsPerBlock[fileblock] += events
                else:
                    self.eventsPerBlock[fileblock] = events
                # Number of events per file
                self.eventsPerFile[filename] = events

                # List of files per block
                if fileblock in self.blocksinfo:
                    self.blocksinfo[fileblock].append(filename)
                else:
                    self.blocksinfo[fileblock] = [filename]

                # total number of events
                self.maxEvents += events
                self.maxLumis += len(self.lumis[filename])

    if self.skipBlocks and len(self.eventsPerBlock) == 0:
        msg = "No new fileblocks available for dataset: " + str(self.datasetPath)
        raise CrabException(msg)

    if len(self.eventsPerBlock) <= 0:
        msg = "No data for %s in DBS\n Check datasetpath parameter in crab.cfg" % self.datasetPath
        raise CrabException(msg)
def fetchDBSInfo(self):
    """
    Contact DBS

    Only DBS3 is queried: the DBS2 code paths were unreachable (their
    switches were hard-wired to False) and have been dropped.

    Reads the CMSSW.* options from self.cfg_params, stores the answer of
    self.queryDbs3() in self.files and fills self.parent, self.lumis,
    self.eventsPerBlock, self.eventsPerFile, self.blocksinfo,
    self.maxEvents and self.maxLumis from it.

    Raises CrabException when:
      - split_by_run is requested without a usable runselection,
      - the unsupported CMSSW.fileblocks_file option is set,
      - a 'data' dataset is neither split by run, by lumi nor by event,
      - no (new) file block is found for the dataset.
    """
    # verify_dbs_url() returns (useDBS2, useDBS3, dbs2_url, dbs3_url);
    # DBS2 is gone, so only the last item matters here.
    (_ignoredUseDBS2, _ignoredUseDBS3, _ignoredDbs2Url,
     dbs3_url) = verify_dbs_url(self)
    common.logger.info("Accessing DBS at: %s" % dbs3_url)

    ## check if runs are selected
    runselection = []
    if 'CMSSW.runselection' in self.cfg_params:
        runselection = parseRange2(self.cfg_params['CMSSW.runselection'])
    if len(runselection) > 1000000:
        common.logger.info(
            "ERROR: runselection range has more then 1M numbers")
        common.logger.info("ERROR: Too large. runselection is ignored")
        runselection = []

    ## check if various lumi parameters are set
    self.lumiMask = self.cfg_params.get('CMSSW.lumi_mask', None)
    self.lumiParams = self.cfg_params.get('CMSSW.total_number_of_lumis', None) or \
                      self.cfg_params.get('CMSSW.lumis_per_job', None)

    lumiList = None
    if self.lumiMask:
        lumiList = LumiList(filename=self.lumiMask)
    if runselection:
        # runList exists only when runs were selected; every use below is
        # guarded by the same test
        runList = LumiList(runs=runselection)

    self.splitByRun = int(self.cfg_params.get('CMSSW.split_by_run', 0))
    self.splitDataByEvent = int(
        self.cfg_params.get('CMSSW.split_by_event', 0))
    common.logger.log(10 - 1, "runselection is: %s" % runselection)

    if not self.splitByRun:
        self.splitByLumi = self.lumiMask or self.lumiParams or self.ads

    if self.splitByRun and not runselection:
        msg = "Error: split_by_run must be combined with a runselection"
        raise CrabException(msg)

    ## check if has been requested to use the parent info
    useparent = int(self.cfg_params.get('CMSSW.use_parent', 0))

    ## analyzed fileBlocks are always read from the default file; asking
    ## for another one is an error
    defaultName = common.work_space.shareDir() + 'AnalyzedBlocks.txt'
    if self.cfg_params.get('CMSSW.fileblocks_file'):
        msg = "CMSSW.fileblocks_file option non supported"
        raise CrabException(msg)
    fileBlocks_FileName = os.path.abspath(defaultName)

    ## service API
    from dbs.apis.dbsClient import DbsApi
    start_time = time.time()
    api3 = DbsApi(dbs3_url)
    self.files = self.queryDbs3(api3,
                                path=self.datasetPath,
                                runselection=runselection,
                                useParent=useparent)
    elapsed = time.time() - start_time
    common.logger.info("DBS3 lookup took %5.2f sec" % elapsed)

    # Check to see what the dataset is
    dataType = api3.listDataTypes(
        dataset=self.datasetPath)[0]['data_type']
    common.logger.info("Datatype is %s" % dataType)
    if dataType == 'data' and not \
       (self.splitByRun or self.splitByLumi or self.splitDataByEvent):
        msg = 'Data must be split by lumi or by run. ' \
              'Please see crab -help for the correct settings'
        raise CrabException(msg)

    anFileBlocks = []
    if self.skipBlocks:
        anFileBlocks = readTXTfile(self, fileBlocks_FileName)

    # parse files and fill arrays
    for entry in self.files:
        # skip already analyzed blocks
        fileblock = entry['Block']['Name']
        if fileblock in anFileBlocks:
            continue

        filename = entry['LogicalFileName']

        # asked retry the list of parent for the given child
        parList = []
        if useparent == 1:
            parList = [x['LogicalFileName'] for x in entry['ParentList']]

        fileLumis = []  # List of tuples
        if self.splitByLumi:
            fileLumis = [(x['RunNumber'], x['LumiSectionNumber'])
                         for x in entry['LumiList']]
        self.parent[filename] = parList

        # For LumiMask, intersection of two lists.
        if self.lumiMask and runselection:
            self.lumis[filename] = runList.filterLumis(
                lumiList.filterLumis(fileLumis))
        elif runselection:
            self.lumis[filename] = runList.filterLumis(fileLumis)
        elif self.lumiMask:
            self.lumis[filename] = lumiList.filterLumis(fileLumis)
        else:
            self.lumis[filename] = fileLumis

        # names containing '.dat' are kept out of all the counters below
        if filename.find('.dat') >= 0:
            continue

        events = entry['NumberOfEvents']
        # Count number of events and lumis per block
        # (direct dict membership: no per-file copy of the key list)
        if fileblock in self.eventsPerBlock:
            self.eventsPerBlock[fileblock] += events
        else:
            self.eventsPerBlock[fileblock] = events
        # Number of events per file
        self.eventsPerFile[filename] = events

        # List of files per block
        if fileblock in self.blocksinfo:
            self.blocksinfo[fileblock].append(filename)
        else:
            self.blocksinfo[fileblock] = [filename]

        # total number of events
        self.maxEvents += events
        self.maxLumis += len(self.lumis[filename])

    if self.skipBlocks and len(self.eventsPerBlock) == 0:
        msg = "No new fileblocks available for dataset: " + str(
            self.datasetPath)
        raise CrabException(msg)

    if len(self.eventsPerBlock) <= 0:
        msg = "No data for %s in DBS\n Check datasetpath parameter in crab.cfg" % self.datasetPath
        raise CrabException(msg)