def main():
    options = get_options()

    import platform
    if 'ingrid' in platform.node():
        storagePrefix = "/storage/data/cms"
    else:
        storagePrefix = "root://cms-xrd-global.cern.ch/"

    print "##### Get information out of the crab config file (work area, dataset, pset)"
    module = load_file(options.CrabConfig)
    workArea = module.config.General.workArea
    requestName = module.config.General.requestName
    psetName = module.config.JobType.psetName
    inputDataset = unicode(module.config.Data.inputDataset)
    print "done"
    print("")

    print "##### Check if the dataset exists in the database"
    # if yes then grab its ID
    # if not then run das_import.py to add it
    # print inputDataset
    values = get_dataset(inputDataset)
    # print values
    if len(values) == 0:
        tmp_sysargv = sys.argv
        sys.argv = ["das_import.py", inputDataset]
        print "calling das_import"
        das_import.main()
        print "done"
        sys.argv = tmp_sysargv
        values = get_dataset(inputDataset)
    # if there is more than one sample then we're in trouble, crash here
    assert len(values) == 1
    dataset_name, dataset_id, dataset_nevents = values[0]
    print "done"
    print("")

    print "##### Get info from crab (outputs, report)"
    # Since the API outputs AND prints the same data, hide whatever is printed on screen
    saved_stdout, saved_stderr = sys.stdout, sys.stderr
    if not options.debug:
        sys.stdout = sys.stderr = open(os.devnull, "w")
    taskdir = os.path.join(workArea, 'crab_' + requestName)
    # list output
    output_files = crabCommand('getoutput', '--dump', dir=taskdir)
    # get crab report
    report = crabCommand('report', dir=taskdir)
    # restore print to stdout
    if not options.debug:
        sys.stdout, sys.stderr = saved_stdout, saved_stderr
    # print "output_files=", output_files
    # print "report=", report
    print "done"
    print("")

    print "##### Get information from the output files"
    files = []
    for i, lfn in enumerate(output_files['lfn']):
        pfn = output_files['pfn'][i]
        files.append({'lfn': lfn, 'pfn': pfn})

    folder = os.path.dirname(output_files['lfn'][0])
    folder = storagePrefix + folder

    db_files = []
    dataset_sumw = 0
    dataset_nselected = 0
    for f in files:
        (sumw, entries) = get_file_data(storagePrefix + f['lfn'])
        if not sumw:
            print("Warning: failed to retrieve sum of event weight for %r" % f['lfn'])
        dataset_sumw += sumw
        if not entries:
            print("Warning: failed to retrieve number of entries for %r" % f['lfn'])
        dataset_nselected += entries
        db_files.append(File(unicode(f['lfn']), unicode(f['pfn']), sumw, entries))

    print "∑w = %.4f" % dataset_sumw
    print "Number of selected events: %d" % dataset_nselected
    print("")

    print "##### Check if the job processed the whole sample"
    has_job_processed_everything = (dataset_nevents == report['eventsRead'])
    is_data = (module.config.Data.splitting == 'LumiBased')
    if has_job_processed_everything:
        print "done"
    else:
        if is_data:
            # This is data, it is expected to not run on everything given we use a lumiMask
            print "done"
        else:
            # Warn; use a float division so the percentage is not truncated to 0 under python 2
            print "Warning: You are about to add in the DB a sample which has not been completely processed (%d events out of %d, %.2f%%)" % (report['eventsRead'], dataset_nevents, float(report['eventsRead']) / dataset_nevents * 100)
            print "If you want to update this sample later on with more statistics, simply re-execute this script with the same arguments."
    print("")

    processed_lumi = None
    if is_data:
        processed_lumi = report['analyzedLumis']

    print "##### Figure out the code(s) version"
    # first the version of the framework
    FWHash, FWRepo, FWUrl = getGitTagRepoUrl(os.path.join(CMSSW_BASE, 'src/cp3_llbb/Framework'))
    print "FWUrl=", FWUrl
    # then the version of the analyzer
    AnaHash, AnaRepo, AnaUrl = getGitTagRepoUrl(os.path.dirname(psetName))
    print "AnaUrl=", AnaUrl
    print("")

    print "##### Put it all together: write this sample into the database"
    # all the info we have gathered is:
    #   workArea, requestName, psetName, inputDataset, dataset_id
    #   report['eventsRead'] (not necessarily equal to dataset_nevents)
    #   output_files, report
    #   FWHash, FWRepo, FWUrl
    #   AnaHash, AnaRepo, AnaUrl
    #   dataset_sumw, dataset_nselected, folder
    NAME = requestName + '_' + FWHash + '_' + AnaRepo + '_' + AnaHash
    add_sample(NAME, folder, "NTUPLES", report['eventsRead'], dataset_nselected,
               AnaUrl, FWUrl, dataset_id, dataset_sumw, has_job_processed_everything,
               dataset_nevents, db_files, processed_lumi)
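# -----------------------------------------------------------------------------
# For reference, a minimal sketch of what a helper like get_file_data() could
# look like for the version above (it is called there but not defined in this
# section). It is an illustration only: the histogram name "event_weight_sum"
# and the tree name "t" are assumptions, not names taken from this script.
def get_file_data_sketch(path):
    import ROOT

    f = ROOT.TFile.Open(path)
    if not f or f.IsZombie():
        # File is missing or unreadable on the storage
        return None, None

    # Sum of generator event weights, assumed to be stored in a one-bin histogram
    h = f.Get("event_weight_sum")
    sumw = h.GetBinContent(1) if h else None

    # Number of selected events, assumed to be the number of entries of the ntuple tree
    tree = f.Get("t")
    entries = tree.GetEntries() if tree else None

    f.Close()
    return sumw, entries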
def main():
    options = get_options()

    import platform
    if "ingrid" in platform.node():
        storagePrefix = "/storage/data/cms"
    else:
        storagePrefix = "root://cms-xrd-global.cern.ch/"

    print "##### Get information out of the crab config file (work area, dataset, pset)"
    module = load_file(options.CrabConfig)
    workArea = module.config.General.workArea
    requestName = module.config.General.requestName
    psetName = module.config.JobType.psetName
    inputDataset = unicode(module.config.Data.inputDataset)
    print "done"
    print("")

    print "##### Check if the dataset exists in the database"
    # if yes then grab its ID
    # if not then run das_import.py to add it
    # print inputDataset
    values = get_dataset(inputDataset)
    # print values
    if len(values) == 0:
        tmp_sysargv = sys.argv
        sys.argv = ["das_import.py", inputDataset]
        print "calling das_import"
        das_import.main()
        print "done"
        sys.argv = tmp_sysargv
        values = get_dataset(inputDataset)
    # if there is more than one sample then we're in trouble, crash here
    assert len(values) == 1
    dataset_name, dataset_id, dataset_nevents = values[0]
    print "done"
    print("")

    print "##### Get info from crab (outputs, report)"
    # Since the API outputs AND prints the same data, hide whatever is printed on screen
    saved_stdout, saved_stderr = sys.stdout, sys.stderr
    if not options.debug:
        sys.stdout = sys.stderr = open(os.devnull, "w")
    taskdir = os.path.join(workArea, "crab_" + requestName)
    # list output
    output_files = crabCommand("getoutput", "--dump", dir=taskdir)
    # get crab report
    report = crabCommand("report", dir=taskdir)
    # restore print to stdout
    if not options.debug:
        sys.stdout, sys.stderr = saved_stdout, saved_stderr
    # print "output_files=", output_files
    # print "report=", report
    print "done"
    print("")

    print "##### Get information from the output files"
    files = []
    for i, lfn in enumerate(output_files["lfn"]):
        pfn = output_files["pfn"][i]
        files.append({"lfn": lfn, "pfn": pfn})
    # DEBUG
    # files.append({'lfn': '/store/user/sbrochet/TTTo2L2Nu_13TeV-powheg/TTTo2L2Nu_13TeV-powheg_MiniAODv2/160210_181039/0000/output_mc_1.root', 'pfn': 'srm://ingrid-se02.cism.ucl.ac.be:8444/srm/managerv2?SFN=/storage/data/cms/store/user/sbrochet/TTTo2L2Nu_13TeV-powheg/TTTo2L2Nu_13TeV-powheg_MiniAODv2/160210_181039/0000/output_mc_1.root'})

    folder = os.path.dirname(output_files["lfn"][0])
    folder = storagePrefix + folder

    db_files = []
    dataset_sumw = 0
    dataset_extras_sumw = {}
    dataset_nselected = 0
    file_missing = False
    for f in files:
        (sumw, extras_sumw, entries) = get_file_data(storagePrefix + f["lfn"])
        if not sumw:
            # The file is listed by crab but could not be read from the storage: skip it
            print("Warning: failed to retrieve sum of event weight for %r" % f["lfn"])
            file_missing = True
            continue
        dataset_sumw += sumw
        dataset_extras_sumw = sum_dicts(dataset_extras_sumw, extras_sumw)
        if not entries:
            print("Warning: failed to retrieve number of entries for %r" % f["lfn"])
        dataset_nselected += entries
        # Convert python dict to json
        extras_sumw_json = unicode(json.dumps(extras_sumw, separators=(",", ":")))
        db_files.append(File(unicode(f["lfn"]), unicode(f["pfn"]), sumw, extras_sumw_json, entries))

    print "∑w = %.4f" % dataset_sumw
    print "Number of selected events: %d" % dataset_nselected
    print "Number of output files (crab / really on the storage): %d / %d" % (len(files), len(db_files))
    print("")

    print "##### Check if the job processed the whole sample"
    has_job_processed_everything = (dataset_nevents == report["eventsRead"]) and not file_missing
    is_data = (module.config.Data.splitting == "LumiBased")
    if has_job_processed_everything:
        print "done"
    else:
        if is_data:
            # This is data, it is expected to not run on everything given we use a lumiMask
            print "done"
        else:
            # Warn; use a float division so the percentage is not truncated to 0 under python 2
            print "Warning: You are about to add in the DB a sample which has not been completely processed (%d events out of %d, %.2f%%)" % (report["eventsRead"], dataset_nevents, float(report["eventsRead"]) / dataset_nevents * 100)
            print "If you want to update this sample later on with more statistics, simply re-execute this script with the same arguments."
    print("")

    processed_lumi = None
    if is_data:
        processed_lumi = report["analyzedLumis"]

    print "##### Figure out the code(s) version"
    # first the version of the framework
    FWHash, FWRepo, FWUrl = getGitTagRepoUrl(os.path.join(CMSSW_BASE, "src/cp3_llbb/Framework"))
    print "FWUrl=", FWUrl
    # then the version of the analyzer
    AnaHash, AnaRepo, AnaUrl = getGitTagRepoUrl(os.path.dirname(psetName))
    print "AnaUrl=", AnaUrl
    print("")

    print "##### Put it all together: write this sample into the database"
    # all the info we have gathered is:
    #   workArea, requestName, psetName, inputDataset, dataset_id
    #   report["eventsRead"] (not necessarily equal to dataset_nevents)
    #   output_files, report
    #   FWHash, FWRepo, FWUrl
    #   AnaHash, AnaRepo, AnaUrl
    #   dataset_sumw, dataset_extras_sumw, dataset_nselected, folder
    NAME = requestName + "_" + FWHash + "_" + AnaRepo + "_" + AnaHash
    add_sample(NAME, folder, "NTUPLES", report["eventsRead"], dataset_nselected,
               AnaUrl, FWUrl, dataset_id, dataset_sumw, dataset_extras_sumw,
               has_job_processed_everything, dataset_nevents, db_files, processed_lumi)
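# -----------------------------------------------------------------------------
# sum_dicts() is used above but not defined in this section. A hypothetical
# implementation (an assumption, not the actual helper) would simply merge two
# {name: sum of weights} dictionaries key by key:
def sum_dicts_sketch(d1, d2):
    merged = dict(d1)
    for key, value in d2.items():
        merged[key] = merged.get(key, 0) + value
    return merged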
def main():
    options = get_options()

    ingridStoragePrefix = "/storage/data/cms"

    print "##### Get information out of the crab config file (work area, dataset, pset)"
    module = load_file(options.CrabConfig)
    workArea = module.config.General.workArea
    requestName = module.config.General.requestName
    psetName = module.config.JobType.psetName
    inputDataset = unicode(module.config.Data.inputDataset)

    print "##### Check if the dataset exists in the database"
    # if yes then grab its ID
    # if not then run das_import.py to add it
    print inputDataset
    values = get_dataset(inputDataset)
    print values
    if len(values) == 0:
        tmp_sysargv = sys.argv
        sys.argv = ["das_import.py", inputDataset]
        print "calling das_import"
        das_import.main()
        print "done"
        sys.argv = tmp_sysargv
        values = get_dataset(inputDataset)
    # if there is more than one sample then we're in trouble, crash here
    assert len(values) == 1
    dataset_name, dataset_id, dataset_nevents = values[0]

    print "##### Get info from crab (logs, outputs, report)"
    taskdir = os.path.join(workArea, 'crab_' + requestName)
    # list logs
    log_files = crabCommand('getlog', '--dump', dir=taskdir)
    # list output
    output_files = crabCommand('getoutput', '--dump', dir=taskdir)
    # get crab report
    report = crabCommand('report', dir=taskdir)
    # print report

    print "##### Check if the job processed the whole sample"
    # FIXME: should it rather be just a warning? Will likely be problematic at some point for big samples
    # FIXME: if changing to a warning, be careful what you actually insert in the DB
    # assert dataset_nevents == report['eventsRead']

    print "##### Figure out the code(s) version"
    # first the version of the framework
    FWHash, FWBranch, FWUrl = getGitTagBranchUrl(os.path.join(CMSSW_BASE, 'src/cp3_llbb/Framework'))
    # then the version of the analyzer
    AnaHash, AnaBranch, AnaUrl = getGitTagBranchUrl(os.path.dirname(psetName))

    print "##### Figure out the number of selected events"
    # Need to get this from the output log of the jobs, and sum them all
    # log_files = {'lfn': ['/store/user/obondu/GluGluToRadionToHHTo2B2VTo2L2Nu_M-260_narrow_13TeV-madgraph/GluGluToRadionToHHTo2B2VTo2L2Nu_M-260_narrow_Asympt25ns/150728_092137/0000/log/cmsRun_1.log.tar.gz'], 'pfn': ['srm://ingrid-se02.cism.ucl.ac.be:8444/srm/managerv2?SFN=/storage/data/cms/store/user/obondu/GluGluToRadionToHHTo2B2VTo2L2Nu_M-260_narrow_13TeV-madgraph/GluGluToRadionToHHTo2B2VTo2L2Nu_M-260_narrow_Asympt25ns/150728_092137/0000/log/cmsRun_1.log.tar.gz']}
    nselected = 0
    for lfn in log_files['lfn']:
        tarLog = ingridStoragePrefix + lfn
        with tarfile.open(tarLog) as tar:
            for tarFile in tar.getmembers():
                if 'stdout' not in tarFile.name:
                    continue
                # For some reason, even though we are using python 2.7, the with statement here seems broken...
                # Using contextlib to handle the file opening / reading cleanly
                with contextlib.closing(tar.extractfile(tarFile)) as file:
                    for line in file:
                        if 'processed' not in line and 'selected' not in line:
                            continue
                        nselected += int(line.split()[3])
    # print nselected

    print "##### For the path, check that the files there do correspond EXACTLY to the list of output files from crab"
    # (crab being crab, we're never too careful!)
    # output_files = {'lfn': ['/store/user/obondu/GluGluToRadionToHHTo2B2VTo2L2Nu_M-260_narrow_13TeV-madgraph/GluGluToRadionToHHTo2B2VTo2L2Nu_M-260_narrow_Asympt25ns/150728_092137/0000/output_mc_1.root'], 'pfn': ['srm://ingrid-se02.cism.ucl.ac.be:8444/srm/managerv2?SFN=/storage/data/cms/store/user/obondu/GluGluToRadionToHHTo2B2VTo2L2Nu_M-260_narrow_13TeV-madgraph/GluGluToRadionToHHTo2B2VTo2L2Nu_M-260_narrow_Asympt25ns/150728_092137/0000/output_mc_1.root']}
    # first check what we do have locally
    p = os.path.dirname(output_files['lfn'][0])
    localpath = ingridStoragePrefix + p
    localfiles = [os.path.join(p, f) for f in os.listdir(localpath)
                  if os.path.isfile(os.path.join(localpath, f)) and 'root' in f]
    # unordered comparison: the two lists should be equal
    if set(localfiles) != set(output_files['lfn']):
        print "ERROR: the content of the path and the list of crab outputs are different, abort now!"
        print "localfiles (what's here locally)= ", localfiles
        print "outputfiles (what crab is saying)= ", output_files['lfn']
        raise AssertionError("CRAB3 and local list of files do not match")
    # print localpath

    print "##### Put it all together: write this sample into the database"
    # all the info we have gathered is:
    #   workArea, requestName, psetName, inputDataset, dataset_id
    #   dataset_nevents (no longer asserted to be equal to report['eventsRead'], see the FIXME above)
    #   log_files, output_files, report
    #   FWHash, FWBranch, FWUrl
    #   AnaHash, AnaBranch, AnaUrl
    #   nselected, localpath
    NAME = requestName + '_' + FWHash + '_' + AnaHash
    add_sample(NAME, localpath, "NTUPLES", dataset_nevents, nselected, AnaUrl, FWUrl, dataset_id)
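# -----------------------------------------------------------------------------
# getGitTagBranchUrl() / getGitTagRepoUrl() are used above but not defined in
# this section. As an illustration only (not the actual implementation), the
# same kind of information can be obtained from plain git commands; 'git -C'
# requires git >= 1.8.5.
import subprocess

def getGitTagBranchUrl_sketch(path):
    def git(*args):
        # Run a git command inside 'path' and return its stripped output
        return subprocess.check_output(('git', '-C', path) + args).strip()

    sha = git('rev-parse', 'HEAD')
    branch = git('rev-parse', '--abbrev-ref', 'HEAD')
    url = git('config', '--get', 'remote.origin.url')
    return sha, branch, url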