def has_rles(input_filename, rles):
    if not hdfs.isfile(input_filename):
        raise RuntimeError("No such file: %s" % input_filename)
    input_file = ROOT.TFile.Open(input_filename, 'read')
    assert(input_file)
    events_tree = input_file.Get('Events')
    assert(events_tree)
    run_branch = array.array('I', [0])
    ls_branch = array.array('I', [0])
    events_branch = array.array('L', [0])
    events_tree.SetBranchAddress('run', run_branch)
    events_tree.SetBranchAddress('luminosityBlock', ls_branch)
    events_tree.SetBranchAddress('event', events_branch)
    rle_matches = []
    nof_events = events_tree.GetEntries()
    for idx in range(nof_events):
        events_tree.GetEntry(idx)
        rle = ':'.join(map(lambda x: str(x[0]), [ run_branch, ls_branch, events_branch ]))
        if rle in rles:
            rle_matches.append(rle)
            if len(rle_matches) == len(rles):
                break
    input_file.Close()
    return rle_matches
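# A minimal usage sketch (hypothetical file path and RLE strings; assumes an
# 'Events' tree with the run/luminosityBlock/event branches read above):
#
#   rles = { '1:1234:567890', '1:1234:567891' }
#   matches = has_rles('/hdfs/local/example/0000/tree_1.root', rles)
#   print('%d of %d RLEs found' % (len(matches), len(rles)))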
def get_evt_subdir_names(fn):
    assert(hdfs.isfile(fn))
    fptr = ROOT.TFile.Open(fn, 'read')
    assert(fptr)
    logging.info('Opened file {}'.format(fptr.GetName()))
    results = {}
    root_keys = [ k.GetName() for k in fptr.GetListOfKeys() ]
    for root_key in root_keys:
        dir_cand = fptr.Get(root_key)
        if type(dir_cand) != ROOT.TDirectoryFile:
            continue
        evt_dir_name = os.path.join(root_key, 'sel', 'evt')
        evt_dir = fptr.Get(evt_dir_name)
        if not evt_dir:
            continue
        evt_dir_keys = sorted(
            os.path.join(evt_dir_name, k.GetName())
            for k in evt_dir.GetListOfKeys() if k.GetName().startswith('htxs_')
        )
        results[root_key] = evt_dir_keys
    logging.info('Closing file: {}'.format(fptr.GetName()))
    fptr.Close()
    return results
def project(input_file, output_file, binnings):
    if not hdfs.isfile(input_file):
        raise RuntimeError('No such file: %s' % input_file)
    root_file = ROOT.TFile.Open(input_file, 'read')
    if not root_file:
        print('Unable to read file %s' % input_file)
        return False
    events = root_file.Get('Events')
    assert(events)
    histograms = []
    for branch_name, binning_array in binnings.items():
        binning = array.array('f', binning_array)
        histogram = ROOT.TH1F(branch_name, branch_name, len(binning) - 1, binning)
        assert(histogram)
        events.Project(branch_name, branch_name)
        histograms.append(histogram)
    out_file = ROOT.TFile.Open(output_file, 'recreate')
    out_file.cd()
    for histogram in histograms:
        histogram.Write()
    out_file.Close()
    root_file.Close()
    return True
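# A minimal usage sketch (hypothetical file names and binning; the keys of the
# 'binnings' dict must be branch names of the 'Events' tree):
#
#   project(
#       input_file  = '/hdfs/local/example/0000/tree_1.root',
#       output_file = 'projections.root',
#       binnings    = { 'event' : [ 0., 1.e6, 2.e6 ] },
#   )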
def find_hadd_stage_files(input_path, regions, find_hadd_stage1):
    path_split = [ subpath for subpath in input_path.split(os.path.sep) if subpath != '' ]
    nof_levels = len(path_split)
    if not (5 < nof_levels < 11):
        raise ValueError("Invalid path: %s" % input_path)
    current_paths = [ input_path ]
    if nof_levels == 6:
        assert(len(current_paths) == 1)
        current_path = os.path.join(current_paths.pop(), 'histograms')
        if not hdfs.isdir(current_path):
            return []
        current_paths = [ current_path ]
        nof_levels += 1
    if nof_levels == 7:
        assert(len(current_paths) == 1)
        current_path = current_paths.pop()
        current_paths = hdfs.listdir(current_path)
        nof_levels += 1
    if nof_levels == 8:
        next_paths = []
        for current_path in current_paths:
            region_paths = hdfs.listdir(current_path)
            for region_path in region_paths:
                if os.path.basename(region_path).startswith(
                        tuple(ANALYSIS_REGIONS[region] for region in regions)):
                    next_paths.append(region_path)
        current_paths = next_paths
        nof_levels += 1
    if nof_levels == 9:
        next_paths = []
        for current_path in current_paths:
            for next_path in hdfs.listdir(current_path):
                next_path_basename = os.path.basename(next_path)
                # equivalent to: find_hadd_stage1 == (next_path_basename != 'hadd'),
                # i.e. stage1 files live outside the 'hadd' subdirectory, stage2 files inside it
                if not (find_hadd_stage1 != (next_path_basename != 'hadd')):
                    next_paths.append(next_path)
        current_paths = next_paths
        nof_levels += 1
    if nof_levels == 10:
        next_paths = []
        for current_path in current_paths:
            candidate_files = []
            metadata = extract_metadata(current_path)
            if metadata['region_key'] not in regions:
                continue
            for candidate_file in hdfs.listdir(current_path):
                if not hdfs.isfile(candidate_file):
                    continue
                if is_hadd_stage_file(candidate_file, find_hadd_stage1, metadata):
                    candidate_files.append(candidate_file)
            if candidate_files:
                assert(len(candidate_files) == 1)
                next_paths.append(candidate_files[0])
        current_paths = next_paths
    return current_paths
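# Directory layout assumed by the level-walking above (reconstructed from the
# code; leading components elided, counts refer to non-empty path components):
#    6 levels: .../<version>              -> descend into 'histograms'
#    7 levels: .../<version>/histograms   -> list the channel directories
#    8 levels: .../histograms/<channel>   -> keep regions matching ANALYSIS_REGIONS
#    9 levels: .../<channel>/<region>     -> stage1: non-'hadd' subdirs; stage2: the 'hadd' subdir
#   10 levels: .../<region>/<subdir>      -> pick the single matching hadd stage file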
def skim_debug(out_filename, rle_list, tree_name = "tree"):
    '''Checks if skimming was successful by comparing the RLE numbers in the output file
       to the given list of RLE numbers
    Args:
      out_filename: string, Path to the file whose RLE numbers are compared against the RLE array
      rle_list:     string array, List of RLE numbers as strings
      tree_name:    string, TTree name (default: tree)

    Returns:
      True, if the RLE numbers in the file match the given input list of RLE numbers exactly
      False, otherwise
    '''
    logging.debug("Checking if {out_filename} contains exactly the same events as provided by the RLE file".format(
        out_filename = out_filename,
    ))
    if not hdfs.isfile(out_filename):
        return False
    out_rle_list = get_rle(out_filename, tree_name)
    missing_from_file = list(set(rle_list) - set(out_rle_list))
    excess_in_file = list(set(out_rle_list) - set(rle_list))
    ret_val = True
    if missing_from_file:
        logging.error("There are {nof_missing} events missing from {out_filename}: {list_of_missing_events}".format(
            nof_missing = len(missing_from_file),
            out_filename = out_filename,
            list_of_missing_events = ', '.join(missing_from_file),
        ))
        ret_val = False
    if excess_in_file:
        logging.error("There are {nof_excess} events in excess in the file {out_filename}: {list_of_excess_events}".format(
            nof_excess = len(excess_in_file),
            out_filename = out_filename,
            list_of_excess_events = ', '.join(excess_in_file),
        ))
        ret_val = False
    return ret_val
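# Illustration of the set comparison above (hypothetical RLE strings): for
# rle_list = ['1:1:1', '1:1:2'] and a file containing ['1:1:2', '1:1:3'],
# missing_from_file == ['1:1:1'] and excess_in_file == ['1:1:3'], so both
# discrepancies are logged and the function returns False.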
def exists(filename):
    if filename.startswith('/eos'):
        try:
            logging.debug('Trying eos on %s' % filename)
            result = cmd_execute('eos stat %s' % filename)
            return result.split()[1].replace('`', '').replace("'", '') == filename
        except Exception as err:
            raise ValueError('Cannot access file %s on eos because: %s' % (filename, err))
    elif filename.startswith('root://eoscms.cern.ch/'):
        try:
            logging.debug('Trying XRD on %s' % filename)
            filename_noprefix = filename.replace('root://eoscms.cern.ch/', '')
            result = cmd_execute('xrdfs root://eoscms.cern.ch stat %s' % filename_noprefix)
            return result.split()[1] == filename_noprefix
        except Exception as err:
            raise ValueError('Cannot access file %s on eos because: %s' % (filename, err))
    else:
        logging.debug('Trying local file system on %s' % filename)
        return hdfs.isfile(filename)
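# Dispatch summary for exists() (hypothetical paths): '/eos/...' is checked via
# 'eos stat', 'root://eoscms.cern.ch//...' via 'xrdfs ... stat', and any other
# path falls through to hdfs.isfile(); both remote branches raise ValueError
# when the underlying command fails.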
def testIsNotFile(self):
    # The function returns False even if there is no such path on the file system
    self.assertFalse(hdfs.isfile(self.nonExistingHDFSfile))
    self.assertFalse(hdfs.isfile(self.nonExistingHomeFile))
def testIsFile(self):
    self.assertTrue(hdfs.isfile(self.userHDFSfile))
    self.assertTrue(hdfs.isfile(self.userHomeFile))
def find_parents(input_file, input_rles):
    if input_file.startswith('/store'):
        return []
    if not hdfs.isfile(input_file):
        raise RuntimeError("No such file: %s" % input_file)
    if not all(RLE_REGEX.match(rle) for rle in input_rles):
        raise RuntimeError("Not all input run:lumi:event numbers conform to the expected format")
    input_file_basename = os.path.basename(input_file)
    tree_match = TREE_REGEX.match(input_file_basename)
    if not tree_match:
        raise RuntimeError("Not a valid Ntuple: %s" % input_file)
    tree_idx = int(tree_match.group('idx'))
    assert(tree_idx > 0)
    parent_candidates = []
    if input_file.startswith('/hdfs/local'):
        input_file_split = input_file.split(os.path.sep)
        assert(len(input_file_split) == 11)
        process_name = input_file_split[-3]
        version = input_file_split[-5]
        era = input_file_split[-6]
        modes = [ mode for mode in ALLOWED_MODES.keys() if version.endswith(mode) ]
        if len(modes) != 1:
            raise RuntimeError("Unable to deduce mode from input path: %s" % input_file)
        mode = modes[0]
        version_no_mode = version[:-len(mode) - 1]
        nom_signifier = version_no_mode.split('_')[-1]
        version_no_mode_nom = version_no_mode[:-len(nom_signifier) - 1]
        presel_signifier = version_no_mode_nom.split('_')[-1]
        sample_base = ALLOWED_MODES[mode]['base']
        sample_suffix = ALLOWED_MODES[mode]['suffix']
        if presel_signifier == 'wPresel':
            sample_suffix = 'preselected_{}'.format(sample_suffix) if mode == 'all' else \
                            '{}_preselected'.format(sample_suffix)
        samples = load_samples(era, True, base = sample_base, suffix = sample_suffix)
        dbs_key = ''
        for sample_key, sample_info in samples.items():
            if sample_key == 'sum_events':
                continue
            if sample_info['process_name_specific'] == process_name:
                dbs_key = sample_key
                break
        if not dbs_key:
            raise RuntimeError("Unable to find an entry from sample dictionary that corresponds to file: %s" % input_file)
        sample_nfiles = samples[dbs_key]['nof_files']
        if sample_nfiles < tree_idx:
            raise RuntimeError(
                "Tree index found from input path %s larger than expected number of Ntuples: %d" %
                (input_file, sample_nfiles)
            )
        if presel_signifier == 'wPresel':
            parent_samples = load_samples(
                era, True, base = sample_base,
                suffix = sample_suffix.replace('preselected_', '').replace('_preselected', '')
            )
            parent_sample = parent_samples[dbs_key]
        elif presel_signifier == 'woPresel':
            parent_samples = load_samples(era, False, base = sample_base)
            parent_sample = parent_samples[dbs_key]
        else:
            raise RuntimeError("Invalid preselection signifier found from input file %s: %s" %
                               (input_file, presel_signifier))
        parent_sample_nfiles = parent_sample['nof_files']
        parent_sample_path = parent_sample['local_paths'][0]['path']
        parent_sample_blacklist = parent_sample['local_paths'][0]['blacklist']
        assert(parent_sample_nfiles >= sample_nfiles)
        whitelisted_indices = [
            idx for idx in range(1, parent_sample_nfiles + 1) if idx not in parent_sample_blacklist
        ]
        len_whitelisted_indices = len(whitelisted_indices)
        if len_whitelisted_indices == sample_nfiles:
            # it's 1-1 correspondence
            parent_candidate = os.path.join(parent_sample_path, "%04d" % (tree_idx // 1000), 'tree_%d.root' % tree_idx)
            rle_matches = has_rles(parent_candidate, input_rles)
            if len(rle_matches) == len(input_rles):
                parent_candidates.append((parent_candidate, rle_matches))
            else:
                raise RuntimeError("Unable to find parent for: %s" % input_file)
        elif len_whitelisted_indices > sample_nfiles:
            # partition
            chunk_len = int(math.ceil(float(len_whitelisted_indices) / sample_nfiles))
            chunks = [
                whitelisted_indices[idx:idx + chunk_len]
                for idx in range(0, len_whitelisted_indices, chunk_len)
            ]
            assert(len(chunks) == sample_nfiles)
            parent_chunk = chunks[tree_idx - 1]
            for parent_idx in parent_chunk:
                parent_candidate = os.path.join(parent_sample_path, "%04d" % (parent_idx // 1000), 'tree_%d.root' % parent_idx)
                rle_matches = has_rles(parent_candidate, input_rles)
                if rle_matches:
                    parent_candidates.append((parent_candidate, rle_matches))
        else:
            raise RuntimeError("Fewer parent Ntuples than sibling Ntuples for the Ntuple: %s" % input_file)
    elif input_file.startswith('/hdfs/cms/store/user'):
        input_file_dirname = os.path.dirname(input_file)
        log_file = os.path.join(input_file_dirname, 'log', 'cmsRun_{}.log.tar.gz'.format(tree_idx))
        if hdfs.isfile(log_file):
            tar = tarfile.open(log_file, 'r:gz')
            tar_contents = tar.getnames()
            xml_filename = 'FrameworkJobReport-{}.xml'.format(tree_idx)
            if xml_filename in tar_contents:
                xml_tarmember = tar.getmember(xml_filename)
                xml_file = tar.extractfile(xml_tarmember)
                xml_contents = xml_file.read()
                xml_tree = ET.ElementTree(ET.fromstring(xml_contents))
                last_lfn = ''
                matched_ls = []
                expected_ls = { int(rle.split(':')[1]) : rle for rle in input_rles }
                for elem in xml_tree.iter():
                    if elem.tag == 'Inputs' or len(expected_ls) == len(matched_ls):
                        break
                    if elem.tag == 'LFN':
                        if last_lfn and matched_ls:
                            parent_candidates.append((last_lfn, matched_ls))
                        last_lfn = elem.text
                        matched_ls = []
                    elif elem.tag == 'LumiSection':
                        ls = int(elem.attrib['ID'])
                        if ls in expected_ls:
                            matched_ls.append(expected_ls[ls])
                if last_lfn and matched_ls:
                    parent_candidates.append((last_lfn, matched_ls))
            tar.close()
    else:
        raise RuntimeError("Invalid path: %s" % input_file)
    return parent_candidates
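# Worked example of the partitioning branch above (hypothetical counts, empty
# blacklist): with parent_sample_nfiles = 10 and sample_nfiles = 4,
# chunk_len = ceil(10 / 4) = 3 and chunks = [[1, 2, 3], [4, 5, 6], [7, 8, 9], [10]],
# so the sibling Ntuple tree_2.root is matched against parent Ntuples 4..6.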
create_output_dir(output_file)
if plot_files:
    for plot_file in plot_files:
        if not plot_file.endswith(ACCEPTED_PLOT_EXTS_TUPLE):
            raise RuntimeError(
                "Expected extensions %s, instead of whatever this is: %s" %
                (', '.join(ACCEPTED_PLOT_EXTS), plot_file))
        create_output_dir(plot_file)

input_files = []
with open(input_txt_file, 'r') as input_file_ptr:
    for line in input_file_ptr:
        input_file_cand = line.rstrip()
        if not hdfs.isfile(input_file_cand):
            raise RuntimeError("No such file: %s" % input_file_cand)
        input_files.append(input_file_cand)
logging.info("Found {} input files".format(len(input_files)))

weights_map = {}

def record_weights(file_name):
    fptr = ROOT.TFile.Open(file_name, 'read')
    tree = fptr.Get('Events')
    genWeight = array.array('f', [0.])
    tree.SetBranchAddress(GENWEIGHT_NAME, genWeight)
    tree.SetBranchStatus("*", 0)
parser.add_argument(
    '-v', '--verbose', dest='verbose', action='store_true', default=False,
    help='R|Verbose output',
)
args = parser.parse_args()

input_file_names = args.input
output_dir = os.path.abspath(args.output)
rles = args.rle

logging.getLogger().setLevel(logging.DEBUG if args.verbose else logging.INFO)

for input_file_name in input_file_names:
    if not hdfs.isfile(input_file_name):
        raise ValueError("No such file: %s" % input_file_name)
for rle in rles:
    assert(re.match(r'^\d+:\d+:\d+$', rle))

mtable = MassTable()
for input_file_name in input_file_names:
    graph_map = get_graph(input_file_name, rles, mtable)
    for rle in graph_map:
        output_file_filename = '{}-{}.png'.format(
            os.path.splitext(os.path.basename(input_file_name))[0],
            rle.replace(':', '-'))
        output_file_name = os.path.join(output_dir, output_file_filename)
        save_graph(graph_map[rle], output_file_name, args.keep)
def waitForJobs(self):
    """Waits for all sbatch jobs submitted by this instance of sbatchManager to finish processing
    """
    text_line = '-' * 120

    # Set a delimiter, which distinguishes entries b/w different jobs
    delimiter = ','
    # Explanation (the maximum pool ID length = 256 is configurable via self.max_pool_id_length):
    # 1) squeue -h -u {{user}} -o '%i %256k'
    #    Collects the list of running jobs
    #      a) -h omits the header
    #      b) -u {{user}} looks only for jobs submitted by {{user}}
    #      c) -o '%i %256k' specifies the output format
    #         i)  %i    -- job ID (1st column)
    #         ii) %256k -- comment with a width of 256 characters (2nd column)
    #             If the job has no comment, the entry simply reads (null)
    # 2) grep {{comment}}
    #    Filter the jobs by the comment, which must be unique per sbatchManager instance at all times
    # 3) awk '{print $1}'
    #    Filter only the job IDs out
    # 4) sed ':a;N;$!ba;s/\\n/{{delimiter}}/g'
    #    Place all job IDs on one line, delimited by {{delimiter}} (otherwise the logs are hard to read)
    command_template = "squeue -h -u {{user}} -o '%i %{{ pool_id_length }}k' | grep {{comment}} | awk '{print $1}' | " \
                       "sed ':a;N;$!ba;s/\\n/{{delimiter}}/g'"
    command = jinja2.Template(command_template).render(
        user=self.user,
        pool_id_length=self.max_pool_id_length,
        comment=self.pool_id,
        delimiter=delimiter)

    # Initially, all jobs are marked as submitted so we have to go through all jobs and check their exit codes
    # even if some of them have already finished
    jobIds_set = set([
        job_id for job_id in self.submittedJobs
        if self.submittedJobs[job_id]['status'] == Status.submitted
    ])
    nofJobs_left = len(jobIds_set) + len(self.queuedJobs)
    while nofJobs_left > 0:
        # Get the list of jobs submitted to batch system and convert their jobIds to a set
        poll_result, poll_result_err = '', ''
        while True:
            poll_result, poll_result_err = run_cmd(command, do_not_log=False, return_stderr=True)
            if not poll_result and poll_result_err:
                logging.warning('squeue caught an error: {squeue_error}'.format(
                    squeue_error=poll_result_err))
            else:
                break
            # sleep a minute and then try again
            # in principle we could limit the number of retries, but hopefully that's not necessary
            logging.debug("sleeping for %i seconds." % 60)
            time.sleep(60)
        polled_ids = set()
        if poll_result != '':
            polled_ids = set(poll_result.split(delimiter))

        # Check if the number of jobs submitted to the batch system is below maxSubmittedJobs;
        # if it is, take jobs from the queuedJobs list and submit them,
        # until a total of maxSubmittedJobs is submitted to the batch system
        nofJobs_toSubmit = min(len(self.queuedJobs), self.maxSubmittedJobs - len(polled_ids))
        if nofJobs_toSubmit > 0:
            logging.debug(
                "Jobs: submitted = {}, in queue = {} --> submitting the next {} jobs.".format(
                    len(polled_ids), len(self.queuedJobs), nofJobs_toSubmit))
        else:
            logging.debug(
                "Jobs: submitted = {}, in queue = {} --> waiting for submitted jobs to finish processing.".format(
                    len(polled_ids), len(self.queuedJobs)))
        for i in range(0, nofJobs_toSubmit):
            # randomly submit a job from the queue
            two_pow_sixteen = 65536
            random.seed((abs(hash(uuid.uuid4()))) % two_pow_sixteen)
            max_idx = len(self.queuedJobs) - 1
            random_idx = random.randint(0, max_idx)
            job = self.queuedJobs.pop(random_idx)
            job['status'] = Status.submitted
            job_id = self.submit(job['sbatch_command'])
            self.submittedJobs[job_id] = job

        # Now check the status of jobs submitted to the batch system:
        # Subtract the list of running jobs from the list of all submitted jobs -- the result is a list of
        # jobs that have finished already
        finished_ids = list(jobIds_set - polled_ids)

        # Do not poll anything if currently there are no finished jobs
        if finished_ids:
            # Based on the job's exit code, check if the job has failed or completed successfully
            # However, the sacct/scontrol commands yield too much output if too many jobs have been submitted here
            # Therefore, we want to restrict the output by grepping specific job IDs
            # There's another problem with that: the length of a bash command is limited by the ARG_MAX kernel
            # variable, which is of order 2e6
            # This means that we have to split the job IDs into chunks each of which we have to check separately
            finished_ids_chunks = [
                finished_ids[i:i + self.max_nof_greps]
                for i in range(0, len(finished_ids), self.max_nof_greps)
            ]
            for finished_ids_chunk in finished_ids_chunks:
                completion = self.check_job_completion(finished_ids_chunk)
                completed_jobs, running_jobs, failed_jobs = [], [], []
                for job_id, details in completion.items():
                    if details.status == Status.completed:
                        completed_jobs.append(job_id)
                    elif details.status == Status.running:
                        running_jobs.append(job_id)
                    else:
                        failed_jobs.append(job_id)
                # If there are any failed jobs, throw
                if failed_jobs:
                    failed_jobs_str = ','.join(failed_jobs)
                    errors = [ completion[job_id].status for job_id in failed_jobs ]
                    logging.error(
                        "Job(s) w/ ID(s) {jobIds} finished with errors: {reasons}".format(
                            jobIds=failed_jobs_str,
                            reasons=', '.join(map(Status.toString, errors)),
                        ))

                    # Let's print a table where the first column corresponds to the job ID
                    # and the second column lists the exit code, the derived exit code, the status
                    # and the classification of the failed job
                    logging.error("Error table:")
                    for job_id in failed_jobs:
                        sys.stderr.write(
                            "{jobId} {exitCode} {derivedExitCode} {state} {status}\n".format(
                                jobId=job_id,
                                exitCode=completion[job_id].exit_code,
                                derivedExitCode=completion[job_id].derived_exit_code,
                                state=completion[job_id].state,
                                status=Status.toString(completion[job_id].status),
                            ))
                    sys.stderr.write('%s\n' % text_line)
                    for failed_job in failed_jobs:
                        for log in zip(['wrapper', 'executable'], ['log_wrap', 'log_exec']):
                            logfile = self.submittedJobs[failed_job][log[1]]
                            if hdfs.isfile(logfile):
                                logfile_contents = open(logfile, 'r').read()
                            else:
                                logfile_contents = '<file is missing>'
                            sys.stderr.write(
                                'Job ID {id} {description} log ({path}):\n{line}\n{log}\n{line}\n'.format(
                                    id=failed_job,
                                    description=log[0],
                                    path=logfile,
                                    log=logfile_contents,
                                    line=text_line,
                                ))

                        if self.submittedJobs[failed_job]['nof_submissions'] < self.max_resubmissions and \
                           completion[failed_job].status == Status.io_error:
                            # The job is eligible for resubmission if the job hasn't been resubmitted more
                            # than a preset limit of resubmissions AND if the job failed due to I/O errors
                            logging.warning(
                                "Job w/ ID {id} and arguments {args} FAILED because: {reason} "
                                "-> resubmission attempt #{attempt}".format(
                                    id=failed_job,
                                    args=self.submittedJobs[failed_job]['args'],
                                    reason=Status.toString(completion[failed_job].status),
                                    attempt=self.submittedJobs[failed_job]['nof_submissions'],
                                ))
                            self.submitJob(*self.submittedJobs[failed_job]['args'])
                            # The old ID must be deleted, b/c otherwise it would be used to compare against
                            # squeue output and we would resubmit the failed job ad infinitum
                            del self.submittedJobs[failed_job]
                        else:
                            # We've exceeded the maximum number of resubmissions -> fail the workflow
                            raise Status.raiseError(completion[failed_job].status)
                else:
                    logging.debug(
                        "Job(s) w/ ID(s) {completedIds} finished successfully {runningInfo}".format(
                            completedIds=','.join(completed_jobs),
                            runningInfo='(%s still running)' % ','.join(running_jobs) if running_jobs else '',
                        ))

                # Mark successfully finished jobs as completed so that we won't request their status code again
                # Otherwise they would still be in the 'submitted' state
                for job_id in completed_jobs:
                    if not all(map(
                            lambda outputFile: is_file_ok(
                                outputFile, validate_outputs=True, min_file_size=self.min_file_size),
                            self.submittedJobs[job_id]['outputFiles'])):
                        if self.submittedJobs[job_id]['nof_submissions'] < self.max_resubmissions:
                            logging.warning(
                                "Job w/ ID {id} and arguments {args} FAILED to produce a valid output file "
                                "-> resubmission attempt #{attempt}".format(
                                    id=job_id,
                                    args=self.submittedJobs[job_id]['args'],
                                    attempt=self.submittedJobs[job_id]['nof_submissions'],
                                ))
                            self.submitJob(*self.submittedJobs[job_id]['args'])
                            del self.submittedJobs[job_id]
                        else:
                            raise ValueError(
                                "Job w/ ID {id} FAILED because it repeatedly produces bogus output "
                                "file {output} yet the job still exits w/o any errors".format(
                                    id=job_id,
                                    output=', '.join(self.submittedJobs[job_id]['outputFiles']),
                                ))
                    else:
                        # Job completed just fine
                        self.submittedJobs[job_id]['status'] = Status.completed

        jobIds_set = set([
            job_id for job_id in self.submittedJobs
            if self.submittedJobs[job_id]['status'] == Status.submitted
        ])
        nofJobs_left = len(jobIds_set) + len(self.queuedJobs)
        if nofJobs_left > 0:
            two_pow_sixteen = 65536
            random.seed((abs(hash(uuid.uuid4()))) % two_pow_sixteen)
            max_delay = 300
            random_delay = random.randint(0, max_delay)
            logging.debug("sleeping for %i seconds." % random_delay)
            time.sleep(self.poll_interval + random_delay)
        else:
            break
        logging.info("Waiting for sbatch to finish (%d job(s) still left) ..." % nofJobs_left)
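# A standalone sketch of what the polling command above renders to (assumed
# values for user and pool_id; pool_id_length = 256 as noted in the comments):
#
#   import jinja2
#   print(jinja2.Template(
#       "squeue -h -u {{user}} -o '%i %{{ pool_id_length }}k' | grep {{comment}} | awk '{print $1}' | "
#       "sed ':a;N;$!ba;s/\\n/{{delimiter}}/g'"
#   ).render(user = 'jdoe', pool_id_length = 256, comment = 'pool_1234', delimiter = ','))
#   # -> squeue -h -u jdoe -o '%i %256k' | grep pool_1234 | awk '{print $1}' | sed ':a;N;$!ba;s/\n/,/g'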
def get_hadd_stage2(input_paths):
    results = {}
    for input_path in input_paths:
        path_split = [ subpath for subpath in input_path.split(os.path.sep) if subpath != '' ]
        nof_levels = len(path_split)
        if not (5 < nof_levels < 11):
            raise ValueError("Invalid path: %s" % input_path)
        current_paths = [ input_path ]
        if nof_levels == 6:
            assert(len(current_paths) == 1)
            current_path = os.path.join(current_paths.pop(), 'histograms')
            if not hdfs.isdir(current_path):
                return []
            current_paths = [ current_path ]
            nof_levels += 1
        if nof_levels == 7:
            assert(len(current_paths) == 1)
            current_path = current_paths.pop()
            current_paths = hdfs.listdir(current_path)
            nof_levels += 1
        if nof_levels == 8:
            next_paths = []
            for current_path in current_paths:
                region_paths = hdfs.listdir(current_path)
                for region_path in region_paths:
                    next_paths.append(region_path)
            current_paths = next_paths
            nof_levels += 1
        if nof_levels == 9:
            next_paths = []
            for current_path in current_paths:
                for next_path in hdfs.listdir(current_path):
                    next_path_basename = os.path.basename(next_path)
                    if next_path_basename == 'hadd':
                        next_paths.append(next_path)
            current_paths = next_paths
            nof_levels += 1
        if nof_levels == 10:
            next_paths = []
            for current_path in current_paths:
                candidate_files = []
                for candidate_file in hdfs.listdir(current_path):
                    if not hdfs.isfile(candidate_file):
                        continue
                    candidate_file_basename = os.path.basename(candidate_file)
                    if candidate_file_basename.startswith('hadd_stage2') and \
                       not HADD_STAGE2_RE.match(candidate_file_basename.split('.')[0]):
                        candidate_files.append(candidate_file)
                if candidate_files:
                    assert(len(candidate_files) == 1)
                    next_paths.append(candidate_files[0])
            current_paths = next_paths
        for current_path in current_paths:
            current_path_split = [ subpath for subpath in current_path.split(os.path.sep) if subpath != '' ]
            channel = current_path_split[7]
            region = current_path_split[8]
            channel_region = '{}_{}'.format(channel, region)
            if channel_region in results:
                raise RuntimeError(
                    "Found two paths corresponding to the same channel (%s) and region (%s): %s and %s" % \
                    (channel, region, current_path, results[channel_region])
                )
            results[channel_region] = current_path
            logging.debug(
                'Found hadd stage2 file corresponding to channel {} and region {}: {}'.format(
                    channel, region, current_path))
    return [ results[k] for k in sorted(results.keys()) ]
        input_files.append(infile)
    else:
        with open(infile, 'r') as f:
            for line in f:
                line_stripped = line.rstrip('\n')
                if not line_stripped:
                    # empty line
                    continue
                if not line_stripped.endswith('.root'):
                    logging.warning('File %s does not appear to be a ROOT file' % line_stripped)
                    continue
                line_path = '/hdfs%s' % line_stripped \
                            if line_stripped.startswith(('/local', '/cms')) else line_stripped
                if not hdfs.isfile(line_path):
                    logging.error('File %s does not exist, skipping' % line_path)
                    continue
                if line_path not in input_files:
                    # require the input files to be unique
                    input_files.append(line_path)
                logging.debug('Preparing job for file: %s' % line_path)

# check if the script directory exists, and if not, create it
if not os.path.isdir(script_dir):
    logging.info('Directory %s does not exist, attempting to create it' % script_dir)
    try:
        os.makedirs(script_dir)
    except IOError as err:
allowed_systematics = args.systematics
searchable_regions = args.regions
show_by_nodes = args.node
show_by_sample = args.sample
show_htxs = args.htxs

if len(allowed_decay_modes) > 1 and '' in allowed_decay_modes:
    raise ValueError("Conflicting values to 'decay_modes' parameter")

input_file_names_hadd_stage1 = []
input_file_names_hadd_stage2 = []
for input_file_path in input_file_paths:
    if not input_file_path.startswith('/hdfs/local'):
        raise ValueError("Invalid path: %s" % input_file_path)
    if hdfs.isfile(input_file_path):
        input_file_abs_path = os.path.abspath(input_file_path)
        if is_hadd_stage_file(input_file_abs_path, True):
            input_file_names_hadd_stage1.append(input_file_abs_path)
        elif is_hadd_stage_file(input_file_abs_path, False):
            input_file_names_hadd_stage2.append(input_file_abs_path)
        else:
            raise ValueError("Not a valid hadd stage 1 or stage 2 file: %s" % input_file_path)
    else:
        input_file_names_hadd_stage1.extend(
            find_hadd_stage_files(input_file_path, searchable_regions, True))
        input_file_names_hadd_stage2.extend(
            find_hadd_stage_files(input_file_path, searchable_regions, False))
def get_evt_yields(input_file_name, results=None):
    if not results:
        results = collections.OrderedDict()
    metadata = extract_metadata(input_file_name)
    if not input_file_name:
        return results, metadata
    assert('sample' in metadata)
    sample = metadata['sample']
    assert(hdfs.isfile(input_file_name))
    input_file = ROOT.TFile.Open(input_file_name, 'read')
    subdirectories = get_keys(
        input_file,
        exclude = lambda key: key in [ 'analyzedEntries', 'selectedEntries' ] or \
                              key.endswith(('_fake', '_nonfake'))
    )
    for whitelist in ['1l_1tau_Fakeable_wFakeRateWeights', '1l_1tau_Tight']:
        if whitelist in subdirectories:
            subdirectories = [ whitelist ]
    is_single_subcategory = len(subdirectories) == 1
    for subdirectory in subdirectories:
        evt_directory_path = os.path.join(subdirectory, 'sel', 'evt')
        evt_directory_ptr = get_dir(input_file, evt_directory_path)
        subcategory_name = '' if is_single_subcategory else subdirectory
        processes = get_keys(
            evt_directory_ptr,
            exclude = lambda key: key.startswith(('tHq', 'tHW', 'HH')) and 'kt_' in key
        )
        for process in processes:
            process_path = os.path.join(evt_directory_path, process)
            process_dir_ptr = get_dir(input_file, process_path)
            process_name, decay_mode, gen_match, htxs = parse_process(process)
            histogram_names = get_keys(
                process_dir_ptr,
                include = lambda key: EVENTCOUNTER in key or \
                                      OUTPUT_NN_RE_CAT.match(key) or \
                                      OUTPUT_NN_RE.match(key)
            )
            for histogram_name in histogram_names:
                histogram_path = os.path.join(process_path, histogram_name)
                histogram = input_file.Get(histogram_path)
                if type(histogram) != ROOT.TH1D:
                    raise RuntimeError(
                        "Object at '%s' in file %s is not an instance of ROOT.TH1D" %
                        (histogram_path, input_file_name))
                systematics = get_systematics(histogram_name)
                node, category = get_node_subcategory(histogram_name)
                event_count = int(histogram.GetEntries())
                event_yield = histogram.Integral()
                if systematics.startswith('FR'):
                    continue
                if subcategory_name not in results:
                    results[subcategory_name] = collections.OrderedDict()
                if process_name not in results[subcategory_name]:
                    results[subcategory_name][process_name] = collections.OrderedDict()
                if sample not in results[subcategory_name][process_name]:
                    results[subcategory_name][process_name][sample] = collections.OrderedDict()
                if decay_mode not in results[subcategory_name][process_name][sample]:
                    results[subcategory_name][process_name][sample][decay_mode] = collections.OrderedDict()
                if gen_match not in results[subcategory_name][process_name][sample][decay_mode]:
                    results[subcategory_name][process_name][sample][decay_mode][gen_match] = \
                        collections.OrderedDict()
                if htxs not in results[subcategory_name][process_name][sample][decay_mode][gen_match]:
                    results[subcategory_name][process_name][sample][decay_mode][gen_match][htxs] = \
                        collections.OrderedDict()
                if systematics not in results[subcategory_name][process_name][sample][decay_mode][gen_match][htxs]:
                    results[subcategory_name][process_name][sample][decay_mode][gen_match][htxs][systematics] = \
                        collections.OrderedDict()
                if node not in results[subcategory_name][process_name][sample][decay_mode][gen_match][htxs][systematics]:
                    results[subcategory_name][process_name][sample][decay_mode][gen_match][htxs][systematics][node] = \
                        collections.OrderedDict()
                if category in results[subcategory_name][process_name][sample][decay_mode][gen_match][htxs][systematics][node]:
                    raise RuntimeError(
                        "Possible double-counting the event counts and yields by reading histogram '%s' in file %s: "
                        "subcategory name = %s, process name = %s, sample = %s, decay mode = %s, gen matching = %s, "
                        "htxs = %s, systematics = %s, node = %s, category = %s" % (
                            histogram_path, input_file_name, subcategory_name, process_name, sample,
                            decay_mode, gen_match, htxs, systematics, node, category,
                        ))
                results[subcategory_name][process_name][sample][decay_mode][gen_match][htxs][systematics][node][category] = {
                    'count' : event_count,
                    'yield' : event_yield,
                }
                del histogram
            del process_dir_ptr
        del evt_directory_ptr
    input_file.Close()
    return results, metadata
sample_name = args.sample_name
output_file = args.output
grep_directory = args.directory
grep_individually = args.all

try:
    sample_name_re = re.compile(sample_name)
except re.error:
    logging.error("Argument {arg} not a valid regex".format(arg=sample_name))
    sys.exit(1)

if grep_individually and not grep_directory:
    logging.warning('Option -a/--all has no effect unless you specify -d/--directory')

if not hdfs.isfile(rle_file):
    logging.error("No such file: '{rle_filename}'".format(
        rle_filename=rle_file,
    ))
    sys.exit(1)

if output_file and not hdfs.isdir(os.path.dirname(output_file)):
    logging.error("Parent directory of '{output_file}' doesn't exist".format(
        output_file=output_file,
    ))
    sys.exit(1)

if grep_directory and not hdfs.isdir(grep_directory):
    logging.error("Grep directory '{grep_directory}' doesn't exist".format(
        grep_directory=grep_directory,
    ))
    sys.exit(1)
def plot(input_files, output_files, title, expected_neff, mode):
    histogram_dict = {}
    for sample_name, sample_entry in input_files.items():
        if not hdfs.isfile(sample_entry['input']):
            logging.error('Could not find file {}'.format(sample_entry['input']))
            continue
        root_file = ROOT.TFile.Open(sample_entry['input'], 'read')
        logging.debug('Opened file {}'.format(sample_entry['input']))
        # note: PyROOT returns null object proxies for missing directories; they compare
        # (un)equal to None without being None by identity, hence '!=' rather than 'is not'
        root_directories = list(filter(
            lambda root_dir: root_dir != None, [
                root_file.Get(os.path.join(key.GetName(), mode, 'genEvt')) \
                for key in root_file.GetListOfKeys() if key.GetClassName() == 'TDirectoryFile'
            ]
        ))
        if len(root_directories) != 1:
            raise RuntimeError('Expected single directory in %s' % sample_entry['input'])
        root_dir = root_directories[0]
        histogram_dirs = [
            root_dir.Get(key.GetName()) \
            for key in root_dir.GetListOfKeys() if key.GetClassName() == 'TDirectoryFile'
        ]
        if len(histogram_dirs) != 1:
            raise RuntimeError(
                'Expected single directory containing lumiScale histograms in %s' % sample_entry['input']
            )
        histogram_dir = histogram_dirs[0]
        histograms = [
            key.GetName() for key in histogram_dir.GetListOfKeys() \
            if key.GetClassName().startswith('TH1') and 'lumiScale' in key.GetName()
        ]
        for histogram_name_actual in histograms:
            histogram_name = histogram_name_actual.replace('_lumiScale', '').replace('CMS_ttHl_', '') \
                             if histogram_name_actual != 'lumiScale' else 'central'
            histogram = histogram_dir.Get(histogram_name_actual).Clone()
            histogram.SetDirectory(0)
            if histogram.GetEntries() != sample_entry['nentries'] and mode == 'unbiased':
                raise RuntimeError('Expected {} entries from {} in file {}, but got {} entries'.format(
                    sample_entry['nentries'], histogram_name, sample_entry['input'], histogram.GetEntries(),
                ))
            if histogram_name not in histogram_dict:
                histogram_dict[histogram_name] = {
                    'histogram' : histogram,
                    'nentries'  : histogram.GetEntries(),
                    'nfiles'    : 1,
                }
            else:
                histogram_dict[histogram_name]['histogram'].Add(histogram)
                histogram_dict[histogram_name]['nentries'] += histogram.GetEntries()
                histogram_dict[histogram_name]['nfiles'] += 1
        root_file.Close()
    if not histogram_dict:
        logging.error('Could not find histograms for samples {}'.format(', '.join(list(input_files.keys()))))
        return
    if len(set(histogram_dict[histogram_name]['nfiles'] for histogram_name in histogram_dict)) != 1:
        raise RuntimeError(
            'Inconsistent number of files found for samples %s' % ', '.join(list(input_files.keys()))
        )
    if len(set(histogram_dict[histogram_name]['nentries'] for histogram_name in histogram_dict)) != 1:
        raise RuntimeError(
            'Inconsistent number of entries found in samples %s' % ', '.join(list(input_files.keys()))
        )
    min_y = -1
    max_y = -1
    nentries = -1
    for histograms in histogram_dict.values():
        histogram = histograms['histogram']
        y_content = histogram.GetBinContent(1)
        y_error = histogram.GetBinError(1)
        y_down = y_content - y_error
        y_up = y_content + y_error
        if min_y < 0:
            min_y = y_down
        if max_y < 0:
            max_y = y_up
        if y_down < min_y:
            min_y = y_down
        if y_up > max_y:
            max_y = y_up
        if nentries < 0:
            nentries = histograms['nentries']
        else:
            assert(nentries == histograms['nentries'])
        if not (y_down < expected_neff < y_up) and mode == 'unbiased':
            logging.warning(
                "Effective event count {} not within {} +- {}".format(expected_neff, y_content, y_error)
            )
    if mode == 'unbiased':
        min_y = min(min_y, expected_neff)
        max_y = max(max_y, expected_neff)
    diff = 0.2 * (max_y - min_y)
    min_y -= diff
    max_y += diff
    canvas = ROOT.TCanvas('c', 'c', 1200, 900)
    canvas.SetGrid()
    ROOT.gStyle.SetOptStat(0)
    legend = ROOT.TLegend(0.1, 0.7, 0.48, 0.9)
    legend.SetHeader('N_{eff} (%d entries)' % nentries)
    expected_histogram = None
    line_width = 3
    marker_style = 20
    fill_style = 4000
    lines = []
    for idx, histogram_name in enumerate(sorted(histogram_dict.keys())):
        histogram = histogram_dict[histogram_name]['histogram']
        color = 2 + idx
        histogram.SetTitle(title)
        histogram.SetAxisRange(min_y, max_y, "Y")
        histogram.SetLineColor(color)
        histogram.SetMarkerColor(color)
        histogram.SetLineWidth(line_width)
        histogram.SetMarkerStyle(marker_style)
        histogram.SetFillStyle(fill_style)
        histogram.Draw("l e1%s" % (" same" if idx > 0 else ""))
        y_content = histogram.GetBinContent(1)
        y_error = histogram.GetBinError(1)
        y_up = y_content + y_error
        y_down = y_content - y_error
        bin_width = histogram.GetBinWidth(1)
        bin_center = histogram.GetBinCenter(1)
        line_min_x = bin_center - bin_width / 4
        line_max_x = bin_center + bin_width / 4
        line_down = ROOT.TLine(line_min_x, y_down, line_max_x, y_down)
        line_down.SetLineColor(color)
        line_down.SetLineWidth(line_width)
        line_down.Draw()
        lines.append(line_down)
        line_up = ROOT.TLine(line_min_x, y_up, line_max_x, y_up)
        line_up.SetLineColor(color)
        line_up.SetLineWidth(line_width)
        line_up.Draw()
        lines.append(line_up)
        sig_digits = max(8 - int(math.ceil(math.log10(y_content))), 1) if y_content > 0. else 1
        leg_pattern = '%s (%.{}f #pm %.{}f)'.format(sig_digits, sig_digits)
        leg_name = leg_pattern % (histogram_name, y_content, y_error)
        legend.AddEntry(histogram, leg_name)
        logging.debug(
            'Effective event count for the sys unc option {} is {} +- {}'.format(
                histogram_name, y_content, y_error
            )
        )
        if not expected_histogram and mode == 'unbiased':
            expected_histogram = histogram.Clone()
            expected_histogram.Reset()
            expected_histogram.SetBinContent(1, expected_neff)
            expected_histogram.SetBinError(1, 0)
            expected_histogram.SetLineColor(ROOT.kBlack)
            expected_histogram.SetMarkerColor(ROOT.kBlack)
            expected_histogram.SetLineWidth(line_width)
            expected_histogram.SetMarkerStyle(marker_style)
            expected_histogram.SetLineStyle(9)
            expected_histogram.SetFillStyle(fill_style)
    if expected_histogram:
        logging.debug('Expecting {} events'.format(expected_neff))
        expected_histogram.Draw("e2 same")
        legend.AddEntry(expected_histogram, 'expected (%.1f)' % expected_neff)
    legend.Draw()
    for output_file in output_files:
        canvas.SaveAs(output_file)
    canvas.Close()
    legend.Delete()
    if expected_histogram:
        expected_histogram.Delete()
    for histogram_name in histogram_dict:
        histogram_dict[histogram_name]['histogram'].Delete()
    for line in lines:
        line.Delete()
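# A minimal usage sketch (hypothetical sample dict and file names; 'nentries'
# is the expected entry count per sample and 'unbiased' is the mode checked above):
#
#   plot(
#       input_files   = { 'sample' : { 'input' : '/hdfs/local/example/hist.root', 'nentries' : 100000 } },
#       output_files  = [ 'neff.png', 'neff.pdf' ],
#       title         = 'sample',
#       expected_neff = 95000.,
#       mode          = 'unbiased',
#   )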
    help = 'R|Enable verbose printout')
args = parser.parse_args()

if args.verbose:
    logging.getLogger().setLevel(logging.DEBUG)

rle_filename = args.input
out_filename = os.path.abspath(args.output)
grep_dir = args.directory
sample_name = args.sample_name
force = args.force
debug_output = args.debug
nof_files = args.nof_files

# check if input RLE file exists
if not hdfs.isfile(rle_filename):
    logging.error("File {rle_filename} does not exist or is not a file!".format(rle_filename = rle_filename))
    sys.exit(1)

# check if the directory into which we have to write the output ROOT file already exists
out_parent_dir = os.path.dirname(out_filename)
if not hdfs.isdir(out_parent_dir):
    if not force:
        logging.error("Parent directory of the output file {out_filename} does not exist".format(
            out_filename = out_filename),
        )
        sys.exit(1)
    else:
        logging.debug("Output directory {out_parent_dir} does not exist, attempting to create one".format(
            out_parent_dir = out_parent_dir,
        ))