import array
import collections
import logging
import os
import re
import sys

import ROOT

# NB! The helpers join_rle(), run_cmd() and raw_linecount(), as well as the samples
# dictionary and the REGEX_RLE pattern, are assumed to be provided elsewhere in
# this module/package.

def skim(in_filename, out_filename, entry_list, tree_name = "Events"):
    '''Copies the selected entries of a TTree into a new ROOT file

    Args:
      in_filename:  string, Path to the input ROOT file
      out_filename: string, Path to the output ROOT file
      entry_list:   int array, List of entries (not RLE numbers!) which are used to select the events
      tree_name:    string, TTree name (default: Events)

    Returns:
      True, if none of the entry numbers exceeds the actual number of entries in the ROOT file
      False, otherwise
    '''
    logging.debug("Processing file {in_filename}".format(in_filename = in_filename))
    f_in = ROOT.TFile(in_filename, "read")
    t_in = f_in.Get(tree_name)
    nof_entries = t_in.GetEntries()
    # Entry numbers are 0-based, hence the largest valid entry ID is (nof_entries - 1)
    if max(entry_list) >= nof_entries:
        logging.error("Max entry ID exceeds the number of entries in {root_filename}: {max_entry} >= {nof_entries}".format(
            root_filename = in_filename,
            max_entry     = max(entry_list),
            nof_entries   = nof_entries,
        ))
        return False

    f_out = ROOT.TFile(out_filename, "recreate")
    t_out = t_in.CloneTree(0)
    for i in entry_list:
        t_in.GetEntry(i)
        t_out.Fill()
    t_out.AutoSave()
    f_out.Close()
    f_in.Close()
    logging.debug("Saved events to {out_filename}".format(out_filename = out_filename))
    return True
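# A minimal usage sketch for skim(); the file names and entry numbers below are
# hypothetical placeholders, not taken from this module.
def _example_skim():
    # Keep only the 1st, 3rd and 10th entries (0-based) of the Events tree
    selected_entries = [0, 2, 9]
    if skim('nano_in.root', 'nano_skim.root', selected_entries):
        logging.info('Skimming succeeded')
    else:
        logging.error('Skimming failed: entry list out of range')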
def get_rle(in_filename, tree_name = "tree"):
    '''Fetches all RLE numbers in a given file

    Args:
      in_filename: string, Path to the ROOT file from which the RLE numbers are fetched
      tree_name:   string, TTree name (default: tree)

    Returns:
      string array, List of RLE numbers (in the format returned by join_rle())

    The function expects the input file to exist in the file system.
    '''
    rle_list = []
    f_in = ROOT.TFile(in_filename, "read")
    t_in = f_in.Get(tree_name)

    run  = array.array('I', [0])
    lumi = array.array('I', [0])
    evt  = array.array('L', [0])

    t_in.SetBranchAddress("run", run)
    t_in.SetBranchAddress("luminosityBlock", lumi)
    t_in.SetBranchAddress("event", evt)

    nof_entries = t_in.GetEntries()
    for i in range(nof_entries):
        t_in.GetEntry(i)
        rle_list.append(join_rle(run, lumi, evt))

    f_in.Close()
    return rle_list
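# A usage sketch for get_rle(); the file names are hypothetical. It shows how the
# returned RLE strings can be used to find events common to two ROOT files.
def _example_get_rle():
    rles_first  = set(get_rle('first.root'))
    rles_second = set(get_rle('second.root'))
    common = rles_first & rles_second
    logging.info('Found {} common events'.format(len(common)))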
def check_that_histogram_is_not_zombie(input_file):
    print("<check_that_histogram_is_not_zombie>: input file = '%s'" % input_file)
    root_tfile = ROOT.TFile(input_file, "read")
    if root_tfile.IsZombie(): # MP: THIS IS NOT WORKING :(
        print("ERROR: Input file '%s' is zombie !!" % input_file)
        sys.exit(1)
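# A more defensive variant of the zombie check above, as a hedged sketch: besides
# IsZombie(), it verifies that TFile.Open() returned an object at all and that the
# file was actually opened. This is an assumption about why the check above "is
# not working", not a confirmed diagnosis; the function name is hypothetical.
def check_that_file_is_readable(input_file):
    root_tfile = ROOT.TFile.Open(input_file, "read")
    # TFile.Open() returns a null pointer (None in PyROOT) on failure, unlike the
    # TFile constructor, which returns a zombie object
    if not root_tfile or root_tfile.IsZombie() or not root_tfile.IsOpen():
        print("ERROR: Input file '%s' is not readable !!" % input_file)
        sys.exit(1)
    root_tfile.Close()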
def is_file_ok(output_file_name, validate_outputs = True, min_file_size = 20000):
    if not (output_file_name and os.path.exists(output_file_name)):
        return False

    logging.info("Output file %s already exists" % output_file_name)

    # Non-ROOT files are accepted as-is
    if not output_file_name.lower().endswith('.root'):
        return True

    command = "rm %s" % output_file_name
    ret_value = False
    if min_file_size > 0:
        output_file_size = os.stat(output_file_name).st_size
        if output_file_size > min_file_size:
            if not validate_outputs:
                ret_value = True
        else:
            logging.info(
                "Deleting output file and resubmitting job because it has size smaller than %d bytes" % min_file_size
            )
            # NB! when validate_outputs is True, the ROOT-based checks below decide
            # the fate of an undersized file

    if validate_outputs:
        root_tfile = ROOT.TFile(output_file_name, "read")
        if not root_tfile:
            logging.info("Not a valid ROOT file, deleting it")
        else:
            if root_tfile.IsZombie():
                logging.info("Output file is corrupted, deleting file and resubmitting job")
            else:
                # Let's open the file via bash as well to see if ROOT tries to recover the file
                open_cmd = "root -b -l -q %s 2>&1 > /dev/null | grep 'trying to recover' | wc -l" % output_file_name
                open_out = run_cmd(open_cmd)
                if open_out.rstrip('\n') != '0':
                    logging.info("Output file is probably corrupted, deleting file and resubmitting job")
                else:
                    ret_value = True
            root_tfile.Close()

    if not ret_value:
        run_cmd(command)

    return ret_value
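# A usage sketch for is_file_ok() in a resubmission loop; job_outputs and
# resubmit_job() are hypothetical names, not part of this module.
def _example_resubmission(job_outputs):
    # job_outputs: dict mapping a job ID to its expected output file
    for job_id, output_file in job_outputs.items():
        if not is_file_ok(output_file):
            logging.info('Resubmitting job {}'.format(job_id))
            # resubmit_job(job_id)  # hypothetical resubmission hook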
def validate(output_dir, verbose = False):
    '''Validates the job execution carried out by dump_rle_parallel()

    Args:
      output_dir: string, The directory where all RLE files are stored
      verbose:    bool, Enable verbose output

    Returns:
      None

    The validation is quite basic: the program loops over the subdirectories of output_dir,
    matches them against the dictionary entries specified by the samples variable and counts
    the number of lines in each RLE file. If the number of lines doesn't match the number of
    entries in the corresponding ROOT file, the user is notified about such discrepancies.
    In principle, the script could also print the relevant commands to fix the issues (and
    dump them to an easily executable file), but let's leave that for another time.
    '''
    if verbose:
        logging.getLogger().setLevel(logging.DEBUG)

    root_file_regex = re.compile(r'^tree_(\d+)\.root$')
    file_dict = {k: [] for k in ['excess', 'missing', 'corrupted']}

    try:
        for s_key, s_value in samples.iteritems():
            sample_name = s_value['process_name_specific']
            sample_dir = os.path.join(output_dir, sample_name)
            if os.path.isdir(sample_dir):
                logging.debug("Found sample directory {sample_dir}".format(sample_dir = sample_dir))

                # NB! assume that there are no secondary paths in the dictionary (hence index 0!)
                sample_path_dict = s_value['local_paths'][0]
                sample_path = sample_path_dict['path']
                blacklist   = sample_path_dict['blacklist']

                for sample_subdir in os.listdir(sample_path):
                    sample_subpath_idx = -1
                    try:
                        sample_subpath_idx = int(sample_subdir)
                    except ValueError:
                        continue
                    if sample_subpath_idx < 0:
                        raise ValueError("Internal error")
                    sample_subpath = os.path.join(sample_path, sample_subdir)
                    logging.debug("Processing sample subdirectory {sample_subpath}".format(sample_subpath = sample_subpath))

                    for sample_file in os.listdir(sample_subpath):
                        sample_file_fullpath = os.path.join(sample_subpath, sample_file)
                        if not sample_file.endswith('.root') or not os.path.isfile(sample_file_fullpath):
                            continue

                        root_file_regex_match = root_file_regex.search(sample_file)
                        if not root_file_regex_match:
                            continue

                        root_file_idx = int(root_file_regex_match.group(1))
                        expected_rle_file_basename = '{root_file_idx}.txt'.format(root_file_idx = root_file_idx)
                        expected_rle_file = os.path.join(sample_dir, expected_rle_file_basename)
                        file_dict_entry = (expected_rle_file, sample_file_fullpath)

                        if root_file_idx in blacklist:
                            if os.path.isfile(expected_rle_file):
                                logging.warning('Found RLE file {rle_file} (corresponding to blacklisted {root_file}) '
                                                'which you ought to delete'.format(
                                    rle_file  = expected_rle_file,
                                    root_file = sample_file_fullpath,
                                ))
                                file_dict['excess'].append(file_dict_entry)
                            continue

                        if not os.path.isfile(expected_rle_file):
                            logging.warning('Missing RLE file {rle_file} (corresponding to {root_file})'.format(
                                rle_file  = expected_rle_file,
                                root_file = sample_file_fullpath,
                            ))
                            file_dict['missing'].append(file_dict_entry)
                            continue

                        nof_rle_events = raw_linecount(expected_rle_file)
                        if nof_rle_events == 1 and os.path.getsize(expected_rle_file) == 1:
                            # the RLE file contains only a newline, hence no events
                            nof_rle_events = 0

                        root_file = ROOT.TFile(sample_file_fullpath, 'read')
                        root_tree = root_file.Get('tree')
                        nof_entries = root_tree.GetEntries()
                        root_file.Close()

                        nof_events_diff = nof_rle_events - nof_entries
                        if nof_events_diff < 0:
                            logging.error('Missing {nof_events} events in {rle_filename} (corresponding to {sample_file}): '
                                          'expected {expected}, got {actual}'.format(
                                nof_events   = abs(nof_events_diff),
                                rle_filename = expected_rle_file,
                                sample_file  = sample_file_fullpath,
                                expected     = nof_entries,
                                actual       = nof_rle_events,
                            ))
                            file_dict['corrupted'].append(file_dict_entry)
                        elif nof_events_diff > 0:
                            logging.error('Got {nof_events} more events than expected in {rle_filename} (corresponding '
                                          'to {sample_file}): expected {expected}, got {actual}'.format(
                                nof_events   = nof_events_diff,
                                rle_filename = expected_rle_file,
                                sample_file  = sample_file_fullpath,
                                expected     = nof_entries,
                                actual       = nof_rle_events,
                            ))
                            file_dict['corrupted'].append(file_dict_entry)
                        else:
                            logging.debug('File {rle_filename} (corresponding to {sample_file}) looks OK'.format(
                                rle_filename = expected_rle_file,
                                sample_file  = sample_file_fullpath,
                            ))
    except KeyboardInterrupt:
        pass

    if any(map(bool, file_dict.values())):
        logging.info('Validation finished with errors')
        for key in file_dict.keys():
            if file_dict[key]:
                logging.info('Number of {key} RLE files: {nof_key}'.format(key = key, nof_key = len(file_dict[key])))
                for entry in file_dict[key]:
                    logging.info('{rle_file} <=> {sample_file}'.format(rle_file = entry[0], sample_file = entry[1]))
    else:
        logging.info('Validation finished successfully')
    return
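# A usage sketch for validate(); the path is a hypothetical placeholder. Per the
# logic above, the expected layout is <output_dir>/<process_name_specific>/<idx>.txt
# mirroring <sample_path>/<subdir>/tree_<idx>.root on the sample side.
def _example_validate():
    validate('/home/user/rle_output', verbose = True)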
def build_rle_file(rles, output):
    rles_aggregated = collections.OrderedDict()
    region_plus_sys_arr = collections.OrderedDict()
    for channel in rles:
        rles_aggregated[channel] = collections.OrderedDict()
        region_plus_sys_arr[channel] = []
        for region in rles[channel]:
            for sample_name in rles[channel][region]:
                if sample_name not in rles_aggregated[channel]:
                    rles_aggregated[channel][sample_name] = collections.OrderedDict()
                for central_or_shift in rles[channel][region][sample_name]:
                    region_plus_sys = region + '_' + central_or_shift
                    # NB! membership must be tested against the per-channel list,
                    # not against the dict of channels
                    if region_plus_sys not in region_plus_sys_arr[channel]:
                        region_plus_sys_arr[channel].append(region_plus_sys)
                    for rle in rles[channel][region][sample_name][central_or_shift]:
                        assert(REGEX_RLE.match(rle))
                        if rle not in rles_aggregated[channel][sample_name]:
                            rles_aggregated[channel][sample_name][rle] = []
                        rles_aggregated[channel][sample_name][rle].append(region_plus_sys)

    output_file = ROOT.TFile(output, 'recreate')
    for channel in rles_aggregated:
        channel_dir = output_file.mkdir(channel)
        channel_dir.cd()
        event_count = 0
        for sample_name in rles_aggregated[channel]:
            tree = ROOT.TTree(sample_name, sample_name)
            run             = array.array('I', [0])
            luminosityBlock = array.array('I', [0])
            event           = array.array('L', [0])
            region_plus_sys_brs = collections.OrderedDict()
            for region_plus_sys in region_plus_sys_arr[channel]:
                region_plus_sys_brs[region_plus_sys] = array.array('I', [0])
            tree.Branch('run', run, 'run/i')
            tree.Branch('luminosityBlock', luminosityBlock, 'luminosityBlock/i')
            tree.Branch('event', event, 'event/l')
            for region_plus_sys in region_plus_sys_brs:
                tree.Branch(region_plus_sys, region_plus_sys_brs[region_plus_sys], '%s/i' % region_plus_sys)

            event_count_sample = len(rles_aggregated[channel][sample_name])
            logging.info('Found a total of {} events in sample {} and channel {}'.format(
                event_count_sample, sample_name, channel
            ))
            event_count += event_count_sample

            for rle in rles_aggregated[channel][sample_name]:
                rle_split = rle.split(':')
                run[0]             = int(rle_split[0])
                luminosityBlock[0] = int(rle_split[1])
                event[0]           = int(rle_split[2])
                # flag each region + systematics combination this event belongs to
                for region_plus_sys in region_plus_sys_brs:
                    region_plus_sys_brs[region_plus_sys][0] = int(
                        region_plus_sys in rles_aggregated[channel][sample_name][rle]
                    )
                tree.Fill()
        output_file.Write()
        logging.info('Found a total of {} events in channel {}'.format(event_count, channel))
    output_file.Close()
    logging.info('Wrote file {}'.format(output))
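# A usage sketch for build_rle_file(): the nested dict below illustrates the
# expected input layout (channel -> region -> sample -> central_or_shift -> RLE
# list). All keys, the RLE string and the output file name are hypothetical, and
# the RLE format is assumed to be 'run:lumi:event' as accepted by REGEX_RLE.
def _example_build_rle_file():
    rles = {
        '2lss_1tau': {
            'SR': {
                'TTZ': {
                    'central': ['1:1000:123456789'],
                },
            },
        },
    }
    build_rle_file(rles, 'rle_events.root')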