import os
import re

# The helpers used below (castor_source, crabdir_source, crabdir_source_stdout,
# clean_by_crab_id, make_harvest_scripts, ...) are assumed to be provided by the
# package's harvesting utilities; the exact module path may differ in your
# checkout. buildMakefile is assumed to be defined elsewhere in this module.
import harvest_tools
from harvest_tools import castor_source, clean_by_crab_id, make_harvest_scripts

def harvestAnalysisResults(channel=None, samples=None, inputFilePath=None,
                           outputFilePath=None, jobId=None, tmpFilePath=None,
                           # Pre-scale and factorize samples
                           ana_defs=None, plot_defs=None, plotters=None,
                           use_job_report=False, useCastor=True):
    # Check that the channel, samples, inputFilePath, outputFilePath,
    # tmpFilePath and jobId parameters are defined and non-empty.
    if channel is None:
        raise ValueError("Undefined channel Parameter !!")
    if samples is None:
        raise ValueError("Undefined samples Parameter !!")
    if inputFilePath is None:
        raise ValueError("Undefined inputFilePath Parameter !!")
    if outputFilePath is None:
        raise ValueError("Undefined outputFilePath Parameter !!")
    if tmpFilePath is None:
        raise ValueError("Undefined tmpFilePath Parameter !!")
    if jobId is None:
        raise ValueError("Undefined jobId Parameter !!")

    if not os.path.exists(tmpFilePath):
        os.mkdir(tmpFilePath)
    if not os.path.exists(outputFilePath):
        os.mkdir(outputFilePath)

    files_and_times = []
    #if not ana_defs or not use_job_report:
    if not use_job_report:
        # Use CASTOR to find the files to merge.
        print "Finding CASTOR files"
        inputFilePath = '/castor/cern.ch' + '/' + inputFilePath
        inputFilePath = inputFilePath.replace('//', '/')
        inputFilePath = inputFilePath.replace(
            '/castor/cern.ch/castor/cern.ch', '/castor/cern.ch')
        print " inputFilePath = " + inputFilePath
        files_and_times = [
            (file['time'], file['path'])
            for file in harvest_tools.clean_by_crab_id(
                file for file in harvest_tools.castor_source(inputFilePath)
                if '_%s_' % jobId in file['path'])
        ]
    else:
        print "Using job reports to find output files"
        for sample in samples['SAMPLES_TO_ANALYZE']:
            crab_dir = os.path.join(
                'crab', 'crabdir_run%s_%s_%s' % (channel, sample, jobId))
            print "Getting output files from:", crab_dir
            if useCastor:
                files_and_times.extend(
                    (None, file['path'])
                    for file in harvest_tools.crabdir_source(crab_dir))
            else:
                if not os.path.exists(crab_dir):
                    continue
                files_and_times.extend(
                    (None, file)
                    for file in harvest_tools.crabdir_source_stdout(crab_dir))

    #print files_and_times

    plot_harvest_jobs = []
    skim_harvest_jobs = []
    ntuple_harvest_jobs = []
    for sample in samples['SAMPLES_TO_ANALYZE']:
        print "Finding input files for", sample
        # Files on CASTOR are accessed through the rfio protocol.
        file_prefix = 'rfio:' if useCastor else ''

        # Histogram (plot) files that need to be merged
        output_file = "harvested_%s_%s_%s.root" % (channel, sample, jobId)
        output_path = os.path.join(outputFilePath, output_file)
        files_to_merge = [
            file_prefix + file for time, file in files_and_times
            if 'plots_%s_%s_%s_' % (channel, sample, jobId) in file]
        plot_harvest_jobs.append((sample, output_path, files_to_merge))

        # Final event skims that need to be merged
        event_files_to_merge = [
            file_prefix + file for time, file in files_and_times
            if 'final_events_%s_%s_%s_' % (channel, sample, jobId) in file]
        skim_output_path = os.path.join(
            outputFilePath, "skim_%s_%s_%s.root" % (channel, sample, jobId))
        skim_harvest_jobs.append(
            (sample, skim_output_path, event_files_to_merge))

        # Ntuple files that need to be merged (always accessed directly,
        # without the rfio prefix)
        ntuple_files_to_merge = [
            file for time, file in files_and_times
            if 'diTauNtuple_%s_%s_%s_' % (channel, sample, jobId) in file]
        ntuple_output_path = os.path.join(
            outputFilePath, "ntuple_%s_%s_%s.root" % (channel, sample, jobId))
        ntuple_harvest_jobs.append(
            (sample, ntuple_output_path, ntuple_files_to_merge))

    print "Creating Makefile for histogram files"
    MakefileName = 'Makefile.harvest_%s_%s' % (channel, jobId)
    buildMakefile(plot_harvest_jobs, tmpFilePath, MakefileName,
                  merge_per_job=7, ana_defs=ana_defs, plot_defs=plot_defs,
                  plotters=plotters)

    print "Creating Makefile for skimmed event files"
    skim_MakefileName = "Makefile.mergeSkims_%s_%s" % (channel, jobId)
    # Make merge_per_job absurdly high, so it doesn't create unnecessary layers.
    buildMakefile(skim_harvest_jobs, tmpFilePath, skim_MakefileName,
                  merge_per_job=1e9, harvest_tool='genericSkimMerger.py')

    print "Creating Makefile for ntuple files"
    ntuple_MakefileName = "Makefile.mergeNtuples_%s_%s" % (channel, jobId)
    # Make merge_per_job absurdly high, so it doesn't create unnecessary layers.
    buildMakefile(ntuple_harvest_jobs, tmpFilePath, ntuple_MakefileName,
                  merge_per_job=1e9, harvest_tool='genericSkimMerger.py')

    print "Makefiles built. To start harvesting, execute 'make -f %s -j 8 -k'" % MakefileName
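# ---------------------------------------------------------------------------
# Example usage: a minimal, illustrative sketch only. The channel name,
# sample list, paths and job id below are hypothetical placeholders, not
# values defined by this module.
def _example_harvest():
    example_samples = {
        'SAMPLES_TO_ANALYZE': ['Ztautau', 'WplusJets'],  # hypothetical samples
    }
    harvestAnalysisResults(
        channel='ZtoMuTau',                      # hypothetical channel name
        samples=example_samples,
        inputFilePath='/user/example/analysis',  # gets prefixed with /castor/cern.ch
        outputFilePath='/tmp/example/harvested',
        jobId='Run01',                           # hypothetical job identifier
        tmpFilePath='/tmp/example/harvest_tmp',
        use_job_report=False,
        useCastor=True)
# ---------------------------------------------------------------------------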
# NOTE: the block below appears to come from a separate, script-level
# harvesting path; channel, jobId, tmpFilePath, harvestingFilePath and
# analysisFilePath are assumed to be defined in the enclosing scope.
print tmpFilePath

plot_regex = r"plots_%s_(?P<sample>\w+?)_%s_(?P<gridJob>\d*)(_(?P<gridTry>\d*))*_(?P<hash>[a-zA-Z0-9]*).root" % (channel, jobId)
skim_regex = r"final_events_%s_(?P<sample>\w+?)_%s_(?P<gridJob>\d*)(_(?P<gridTry>\d*))*_(?P<hash>[a-zA-Z0-9]*).root" % (channel, jobId)

def matches_either(files):
    # Check if the file matches either of the regexes we are interested in.
    # We do this to skip extra files in the directories before we pass them
    # to clean_by_crab_id.
    plot_matcher = re.compile(plot_regex)
    skim_matcher = re.compile(skim_regex)
    for file in files:
        if plot_matcher.match(file['file']) or skim_matcher.match(file['file']):
            yield file

def local_copy_mapper(sample):
    " Define where we want to copy the final output locally "
    return os.path.join(
        harvestingFilePath,
        "_".join(['harvested', channel, sample, jobId]) + ".root")

make_harvest_scripts(
    plot_regex,
    skim_regex,
    channel,
    clean_by_crab_id(matches_either(castor_source(analysisFilePath))),
    tmpFilePath,
    local_copy_mapper=local_copy_mapper,
    chunk_size=2.e+9,  # 2 GB
)
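# ---------------------------------------------------------------------------
# Regex sanity check: an illustrative sketch only. The channel, job id and
# file name are made-up examples following the
# plots_<channel>_<sample>_<jobId>_<gridJob>_<gridTry>_<hash>.root naming
# convention assumed by plot_regex above.
def _example_regex_check():
    example_regex = (r"plots_%s_(?P<sample>\w+?)_%s_(?P<gridJob>\d*)"
                     r"(_(?P<gridTry>\d*))*_(?P<hash>[a-zA-Z0-9]*).root"
                     % ('ZtoMuTau', 'Run01'))
    match = re.match(example_regex,
                     "plots_ZtoMuTau_Ztautau_Run01_17_2_a1b2c3.root")
    if match:
        # Prints: Ztautau 17 a1b2c3
        print match.group('sample'), match.group('gridJob'), match.group('hash')
# ---------------------------------------------------------------------------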